Mild data corruption

Hi,

I’ve written code that passes an array to the GPU. Then one kernel does an operation on the data creating another array. Then a second kernel takes the data from the new array and does an operation creating another array which is read back to the CPU.\

My problem is that the operations done result in a very slight inaccuracy. I’ve debugged the process and found that if I just have the original array’s data transferred to the second array by the first kernel and then that array’s data transferred to the final array by the second kernel. When its read back at the CPU there is a slight difference in the number. It becomes innacurrate after a few decimal points.

So is there a way to assure that significant digits aren’t being altered in the process?

Here are the kernels:


__kernel void Check_for_intersection(
                                     __global const float4 *rays,
                                     __global const float4 *atoms,
                                     __global float *ray_scores
                                     )
{
  const int large_dist = 999.0;
  int rayID = get_global_id(0);
  register unsigned int atomID;
  register float dirX, dirY, dirZ;
  register float distance, smallest;
  register float4 ray;
  //quadratic setup starts for ray
  ray = rays[rayID];
  dirX = large_dist*sin(ray.x)*cos(ray.y);
  dirY = large_dist*sin(ray.x)*sin(ray.y);
  dirZ = large_dist*cos(ray.x);
  // setup our quadratic equation
  float a = (dirX*dirX) + (dirY*dirY) + (dirZ*dirZ);
  for(int particleID = 0; particleID < 200; particleID++){
    int ray_score_ID = (particleID*100000+rayID);
    distance = 9999.;
    smallest = 9999.;
    int atom_ID_start = particleID*200;
    int atom_ID_stop = (particleID+1)*30;
    for(atomID = atom_ID_start; atomID < atom_ID_stop; atomID ++) {
      float4 atom = atoms[atomID];
      float b = 2.0 * ( (dirX*(-atom.x)) + (dirY*(-atom.y)) + (dirZ*(-atom.z)) );
      if(atomID==(atom_ID_start+1)) atm = dirX;
      float c = atom.x*atom.x + atom.y*atom.y + atom.z*atom.z - (atom.w * atom.w);
      float inside_sq = ( b * b ) - ( 4.0 * a * c );
      if (inside_sq > 0) {
        float inside = sqrt(inside_sq);
        float mu1 = -(b-inside) / ( 2.0 * a);
        float x1 =  mu1 * dirX;
        float y1 =  mu1 * dirY;
        float z1 =  mu1 * dirZ;
        float dist1_sq = x1*x1 + y1*y1 + z1*z1;
        float mu2 = -(b+inside) / ( 2.0 * a);
        float x2 = mu2 * dirX;
        float y2 = mu2 * dirY;
        float z2 = mu2 * dirZ;
        float dist2_sq = x2*x2 + y2*y2 + z2*z2;
        if(dist1_sq < distance) distance = sqrt(dist1_sq);
        if(dist2_sq < distance) distance = sqrt(dist2_sq);
      }
      if(distance<smallest) smallest = distance;
    }
    ray_scores[ray_score_ID] = 0.0;
    if ( (ray.z > 0.001) && (smallest > 9998.0) ) {
      ray_scores[ray_score_ID] = 20t;
    }
    if ( (ray.z < 0.001) && (smallest < 9999.0) ) {
      ray_scores[ray_score_ID] = 20;
    }
    if ( (ray.z > 0.001) && (smallest < 9999.0) ) {
      float dist_deviation = smallest - ray.z;
      if (dist_deviation < 0){
        dist_deviation = (ray.z - smallest)*5;
      }
      ray_scores[ray_score_ID] = dist_deviation;
    }
  }
}
__kernel void Get_scores(
                         __global float *ray_scores,
                         __global float *particle_scores
                         )
{
  int particleID = get_global_id(0);
  particle_scores[particleID].x = 0;
  particle_scores[particleID].y = 0;
  int start = particleID*100000;
  int stop = start +40000;
  int ray_score_ID;
  float score = 0;
  int num = 0;
  for(ray_score_ID=start; ray_score_ID<stop; ray_score_ID++){
    if(ray_scores[ray_score_ID]>0){
      score = score + ray_scores[ray_score_ID];
      num = num + 1;
    }
    if(ray_score_ID==start+1000) atm = ray_scores[ray_score_ID];
  }
  particle_scores[particleID] = score/num;
}

This doesn’t sound like ‘data corruption’, it just sounds like the algorithm or your expectations rely on precision beyond that implemented in your hardware.

The hardware is different from your cpu, and there will be differences when using floating point types. Check the opencl specifications for the minimum accuracy required by all implementations, and the vendor documentation for the accuracy of their implementation (nvidia supplies it at least).

If that is insufficient (e.g. sin/cos or sqrt, divide), your option is to implement your own with better accuracy (e.g. see the cephes library), use a higher precision type, or tweak the algorithm to account for the errors.