IF statement in kernel returning incorrect results.

Hello All,

As a general rule I try to implement something in a very basic kernel before parallelizing it, just to be sure everything outputs correctly and I have a good function to build on (i.e., the results match the current implementation). I am attempting to port a set of if statements from C into a kernel, but they start producing wrong values at index 682 after one run. I've confirmed that every array and value transferred to the kernel reads back correctly, yet the computed results are still incorrect. The C code is listed below, followed by the host-side setup and the kernel code.


  int i, id;
  double e[9][2] = { {0, 0}, {1, 0}, {0, 1}, {-1, 0}, {0, -1},
                     {1, 1}, {-1, 1}, {-1, -1}, {1, -1} };
  double t_1;
  double uf, vf;

  t_1 = 1.0 / 9.0;

  /* inflow boundary: uf = vf = 0 */
  uf = 0.;
  vf = 0.;
  for (i = 0; i < Ninflow; ++i) {
    id = mapI[i];
    fluxQ1[id] = 0.;
    if ((e[1][0]*nx[id] + e[1][1]*ny[id]) < 0)
      fluxQ1[id] = (Q1[id] - Q3[id] - 2.*t_1*1.*(e[1][0]*uf + e[1][0]*vf)*3.)
                   * (e[1][0]*nx[id] + e[1][1]*ny[id]);
  }

  /* far-field boundary: uf = 0.01 */
  uf = 0.01;
  vf = 0.;
  for (i = 0; i < Nfar; ++i) {
    id = mapF[i];
    fluxQ1[id] = 0.;
    if ((e[1][0]*nx[id] + e[1][1]*ny[id]) < 0)
      fluxQ1[id] = (Q1[id] - Q3[id] - 2.*t_1*1.*(e[1][0]*uf + e[1][0]*vf)*3.)
                   * (e[1][0]*nx[id] + e[1][1]*ny[id]);
  }
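For reference, since e[1] = {1, 0}, the dot product with (nx, ny) is just nx[id], so (assuming I've reduced the constants correctly, with 2*t_1*3 = 2/3) the two loops amount to:

  /* inflow    (uf = vf = 0):  fluxQ1[id] = (Q1[id] - Q3[id]) * nx[id]                 when nx[id] < 0 */
  /* far field (uf = 0.01):    fluxQ1[id] = (Q1[id] - Q3[id] - (2./3.)*0.01) * nx[id]  when nx[id] < 0 */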

The host-side setup:


    err = clSetKernelArg(kernel[8], 0, sizeof(cl_int), (void*)&Ninflowcl);
    err = clSetKernelArg(kernel[8], 1, sizeof(cl_int), (void*)&Nfarcl);
    err = clSetKernelArg(kernel[8], 2, sizeof(cl_mem), (void*)&mapIcl);
    err = clSetKernelArg(kernel[8], 3, sizeof(cl_mem), (void*)&nxcl);
    err = clSetKernelArg(kernel[8], 4, sizeof(cl_mem), (void*)&nycl);
    err = clSetKernelArg(kernel[8], 5, sizeof(cl_mem), (void*)&mapFcl);
    err = clSetKernelArg(kernel[8], 6, sizeof(cl_mem), (void*)&Q1cl);
    err = clSetKernelArg(kernel[8], 7, sizeof(cl_mem), (void*)&Q3cl);
    err = clSetKernelArg(kernel[8], 8, sizeof(cl_mem), (void*)&fluxQ1cl);
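(None of those err values are checked above; while debugging I'd probably add a check after each call, something like the following, which needs <stdio.h> and <stdlib.h>:)

    err = clSetKernelArg(kernel[8], 0, sizeof(cl_int), (void*)&Ninflowcl);
    if (err != CL_SUCCESS) {
        /* CL_SUCCESS is 0; any other value means the argument was rejected */
        fprintf(stderr, "clSetKernelArg(arg 0) failed with error %d\n", err);
        exit(EXIT_FAILURE);
    }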

The kernel that I am trying to implement:


__kernel void umBC(int Ninflow,
                   int Nfar,
                   __global int* mapI,
                   __global double* nx,
                   __global double* ny,
                   __global int* mapF,
                   __global double* Q1,
                   __global double* Q3,
                   __global double* fluxQ1)
{

    int i, j;
    int id;
    double e[9][2] = { {0.0, 0.0}, {1.0, 0.0}, {0.0, 1.0}, {-1.0, 0.0}, {0.0, -1.0},
                       {1.0, 1.0}, {-1.0, 1.0}, {-1.0, -1.0}, {1.0, -1.0} };
    double t_1 = 1.0 / 9.0;
    double uf = 0.0;
    double vf = 0.0;

    /* ((...) < 0) evaluates to 1 or 0, standing in for the if statement */
    for (i = 0; i < Ninflow; ++i) {
        id = mapI[i];
        fluxQ1[id] = ((e[1][0]*nx[id] + e[1][1]*ny[id]) < 0)
                     * ((Q1[id] - Q3[id] - 2.*t_1*1.*(e[1][0]*uf + e[1][0]*vf)*3.)
                        * (e[1][0]*nx[id] + e[1][1]*ny[id]));
    }

    uf = 0.01;
    vf = 0.0;
    for (j = 0; j < Nfar; ++j) {
        id = mapF[j];
        fluxQ1[id] = ((e[1][0]*nx[id] + e[1][1]*ny[id]) < 0)
                     * ((Q1[id] - Q3[id] - 2.*t_1*1.*(e[1][0]*uf + e[1][0]*vf)*3.)
                        * (e[1][0]*nx[id] + e[1][1]*ny[id]));
    }
}
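For what it's worth, plain if statements are legal inside a kernel too; since a scalar comparison in OpenCL C evaluates to 0 or 1 just like in C, the multiply form above should be equivalent to writing the inflow loop exactly as in the host code:

    for (i = 0; i < Ninflow; ++i) {
        id = mapI[i];
        fluxQ1[id] = 0.;
        if ((e[1][0]*nx[id] + e[1][1]*ny[id]) < 0)
            fluxQ1[id] = (Q1[id] - Q3[id] - 2.*t_1*1.*(e[1][0]*uf + e[1][0]*vf)*3.)
                         * (e[1][0]*nx[id] + e[1][1]*ny[id]);
    }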

I'm at a complete loss as to why this returns everything as expected except for the actual calculation. Any help would be greatly appreciated!

I don't see you using the get_global_id function. Do you dispatch your kernel with a single work-item? Otherwise, multiple threads may be doing reads and writes simultaneously, and then anything can happen.

As far as I'm aware, this kernel should act as a single-threaded instruction stream, because the calculation is wrapped inside a for loop, similar to how a basic matrix multiplication can be implemented in a kernel with three for loops. That said, this has given me an idea to test, so I'll report back on that.
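Specifically, the test I have in mind is forcing a single work-item at enqueue time, so only one instance of the kernel runs, roughly (using whatever index umBC actually has in my kernel array):

size_t one = 1;
err = clEnqueueNDRangeKernel(queue[0], kernel[7], 1, NULL, &one, NULL, 0, NULL, NULL);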
Also, I just realized how absolutely butchered the original title of this thread was, whoops.

[QUOTE=Stumpae;37673]As far as I'm aware, this kernel should act as a single-threaded instruction stream, because the calculation is wrapped inside a for loop, similar to how a basic matrix multiplication can be implemented in a kernel with three for loops. That said, this has given me an idea to test, so I'll report back on that.
[/QUOTE]

I gave some rough global parallelization a try using the code below, but it still fails on the statements dependent on mapI. mapI is an int array of size 135 that passes to and returns from the kernel correctly (I checked), and this set of statements fails on the 60th index of the second pull (so after doing this computation 195 times). This is the same failure point as in the non-parallelized code.


globalWorkSize[0] = Ninflow;
globalWorkSize[1] = Nfar;

// kernel[7] is the umBC kernel below
err = clEnqueueNDRangeKernel(queue[0], kernel[7], 2, NULL,
                             globalWorkSize, NULL, 0, NULL, NULL);
err = clEnqueueReadBuffer(queue[0], fluxQ1cl, CL_TRUE, 0,
                          sizeof(double)*um->Nel*um->Nfaces*um->Nfq,
                          *fluxQ1check, 0, NULL, NULL);

__kernel void umBC(int Ninflow,
                   int Nfar,
                   __global int* mapI,
                   __global double* nx,
                   __global double* ny,
                   __global int* mapF,
                   __global double* Q1,
                   __global double* Q3,
                   __global double* fluxQ1)
{

    int id, idF;
    double e[9][2] = { {0.0, 0.0}, {1.0, 0.0}, {0.0, 1.0}, {-1.0, 0.0}, {0.0, -1.0},
                       {1.0, 1.0}, {-1.0, 1.0}, {-1.0, -1.0}, {1.0, -1.0} };
    double t_1 = 1.0 / 9.0;
    double uf = 0.0;
    double vf = 0.0;

    int globalx = get_global_id(0);
    int globaly = get_global_id(1);

    /* inflow: uf = vf = 0 */
    id = mapI[globalx];
    fluxQ1[id] = ((e[1][0]*nx[id] + e[1][1]*ny[id]) < 0.0)
                 * ((Q1[id] - Q3[id] - 2.*t_1*1.*(e[1][0]*uf + e[1][0]*vf)*3.)
                    * (e[1][0]*nx[id] + e[1][1]*ny[id]));

    /* far field: uf = 0.01 */
    uf = 0.01;
    vf = 0.0;

    idF = mapF[globaly];
    fluxQ1[idF] = ((e[1][0]*nx[idF] + e[1][1]*ny[idF]) < 0.0)
                  * ((Q1[idF] - Q3[idF] - 2.*t_1*1.*(e[1][0]*uf + e[1][0]*vf)*3.)
                     * (e[1][0]*nx[idF] + e[1][1]*ny[idF]));
}
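One thing I want to rule out: with globalWorkSize = {Ninflow, Nfar}, all Ninflow*Nfar work-items execute both statements, so every inflow entry gets written Nfar times (and every far-field entry Ninflow times), and if any id ever appears in both mapI and mapF, the two writes race with different uf values. A way to test that would be splitting the work into two 1D kernels, each enqueued with its own global size. A rough sketch of the inflow half (umBC_inflow is just a placeholder name; the far-field half would be identical with mapF and uf = 0.01):

__kernel void umBC_inflow(int Ninflow,
                          __global int* mapI,
                          __global double* nx,
                          __global double* ny,
                          __global double* Q1,
                          __global double* Q3,
                          __global double* fluxQ1)
{
    double t_1 = 1.0 / 9.0;
    double uf = 0.0;
    double vf = 0.0;

    int i = get_global_id(0);
    if (i >= Ninflow) return;   /* guard in case the global size gets rounded up */

    int id = mapI[i];
    double en = nx[id];         /* e[1] = {1, 0}, so the dot product is just nx */

    fluxQ1[id] = 0.0;
    if (en < 0.0)
        fluxQ1[id] = (Q1[id] - Q3[id] - 2.*t_1*(uf + vf)*3.) * en;
}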


I still have no idea why this is failing. Any wonderful insight out there?