The following is the pseudo code when I use only one OpenCL device.
I have to consider how to abstract the the pseudo code for multi device.
Sorry for the indent!
//this is the pseudo C++ code
int Target_from, Target_to;
int nPieceSize = 10000; //for example
int err;
for(Target_from=0; Target_from<nTerrainPoints; Target_from+=nPieceSize) //piece by piece, nTerrainPoints: total number of terrain points
{
//the last piece maybe smaller
Target_to = Target_from + nPieceSize;
if(Target_to > nTerrainPoints) Target_to = nTerrainPoints;
double *pTargetCoords = pTerrainCoords + Target_from*3; //pTerrainCoords: All of coordinates(x,y,z) of the terrain
double *pTargetValues = pTerrainValues + Target_from; //pTerrainValues: anomaly results for the whole terrain
int nTargetPoints = Target_to - Target_from;
size_t global_size[1];
global_size[0] = nTargetPoints;
// Create CL buffers
cl_mem targetCoord_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(double)*3*nTargetPoints, pTargetCoords, &err);
if(err < 0)...
cl_mem targetValue_buff = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
sizeof(double)*nTargetPoints, pTargetValues, &err);
if(err < 0)...
// Set kernel arguments
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &targetCoord_buff);
if(err < 0)...
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &targetValue_buff);
if(err < 0)...
for(int i=0; i<nObjects; i++) //nObjects: number of objects underground
{
Object *pObject = objects[i]; //for example
int nObjectPoints = pObject->GetNumberOfPoints(); //for example(function name is not exact)
double *pObjectCoords = pObject->GetCoordPointer(); //for example
int nCells = pObject->GetNumberOfCells(); //for example
int *pCells = pObject->GetCellPointer(); //for example
// object parameter
double fRho = pObject->fRho;
// Create CL buffers
cl_mem objectCoord_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(double)*3*nObjectPoints, pObjectCoords, &err);
if(err < 0)...
cl_mem cell_buff = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(int)*4*nCells, pCells, &err);
if(err < 0)...
// Set kernel arguments
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &objectCoord_buff);
if(err < 0)...
err = clSetKernelArg(kernel, 3, sizeof(cl_mem), &Cell_buff);
if(err < 0)...
err = clSetKernelArg(kernel, 4, sizeof(double), &fRho);
if(err < 0)...
int cell_from, cell_to;
int nCellStep = 20; //for example
for(cell_from=0; cell_from<nCells; cell_from+=nCellStep)
{
cell_to = cell_from + nCellStep;
if(cell_to > nCells) cell_to = nCells;
// Set kernel arguments
err = clSetKernelArg(kernel, 5, sizeof(int), &cell_from);
if(err < 0)...
err = clSetKernelArg(kernel, 6, sizeof(int), &cell_to);
if(err < 0)...
//run the kernel
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_size, NULL, 0, NULL, NULL);
if(err < 0)...
}
clReleaseMemObject(meshCoord_buff);
clReleaseMemObject(Cell_buff);
}
// Read the result
err = clEnqueueReadBuffer(queue, targetValue_buff, CL_TRUE, 0,
sizeof(double)*nTargetPoints, pTargetValues, 0, NULL, NULL);
if(err < 0)...
clReleaseMemObject(targetCoord_buff);
clReleaseMemObject(targetValue_buff);
}
//this is part pseudo kernel code
__kernel void Anomaly(__global double* target_coords, __global double* target_values, //target parameters
__global double* object_coords, __global int4* mesh_cells, double fRho, //object parameters
int cell_from, int cell_to) //loop parameters
{
…
int k, iTargetPoint; //var of loop, the index of the target point
iTargetPoint = get_global_id(0);
dg = 0.0;
for(k=cell_from; k<cell_to; k++) //loop of triangles
{
...
...
ig = (p1.x*sin1-p1.y*cos1)*log((p1.x*cos1+p1.y*sin1+R1)/(p2.x*cos1+p2.y*sin1+R2));
ig+= (p2.x*sin2-p2.y*cos2)*log((p2.x*cos2+p2.y*sin2+R2)/(p3.x*cos2+p3.y*sin2+R3));
ig+= p3.y*log((p1.x+R1)/(p3.x+R3));
ig+= 2*p1.z*atan((p1.y*sin1+(1+cos1)*(p1.x+R1))/(p1.z*sin1));
ig-= 2*p1.z*atan((p2.y*sin1+(1+cos1)*(p2.x+R2))/(p1.z*sin1));
ig+= 2*p1.z*atan((p2.y*sin2+(1+cos2)*(p2.x+R2))/(p1.z*sin2));
ig-= 2*p1.z*atan((p3.y*sin2+(1+cos2)*(p3.x+R3))/(p1.z*sin2));
//adds to the total anomaly
dg += (-n.z*ig); //-n: outward normal
}
target_values[iTargetPoint] += fRho*dg;
}