what implementation of OpenCL are you using?
I didn’t understand this question?
My implementation is developing a CNN algorithm for image processing applications.
The code is as follows in the execution phase after the build program, creating the kernels and creating the buffers is completed:
// Set the Argument values
//shrLog("clSetKernelArg 0 - 7...
");
ciErrNum = clSetKernelArg(ckKernelConv, 0, sizeof(cl_mem), (void*)&cmDevSrc);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernelConv, 1, sizeof(cl_mem), (void*)&cmDevtempB);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernelConv, 2, sizeof(cl_mem), (void*)&cmDevTB);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernelConv, 3, sizeof(cl_int), (void*)&iWidthExtended);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernelConv, 4, sizeof(cl_int), (void*)&TemplateWidth);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
szGlobalWorkSize2D[0] = iWidth;
szGlobalWorkSize2D[1] = iHeight;
// Launch kernel
//shrLog("clEnqueueNDRangeKernel (Convolution)...
");
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernelConv, 2, NULL, szGlobalWorkSize2D, NULL, 0, NULL, &NDrangeevent);
oclCheckErrorEX(ciErrNum,CL_SUCCESS, pCleanup);
// ******** Computation of the entire CNN *************
for(int i = 0; i < CNNiterations;i++)
{
printf("Iteration %d
",i);
// Set the Argument values
//shrLog("clSetKernelArg 0 - 7...
");
ciErrNum = clSetKernelArg(ckKernelConv, 0, sizeof(cl_mem), (void*)&cmDevSrc);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernelConv, 1, sizeof(cl_mem), (void*)&cmDevtempA);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernelConv, 2, sizeof(cl_mem), (void*)&cmDevTA);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernelConv, 3, sizeof(cl_int), (void*)&iWidthExtended);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernelConv, 4, sizeof(cl_int), (void*)&TemplateWidth);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
szGlobalWorkSize2D[0] = iWidth;
szGlobalWorkSize2D[1] = iHeight;
// Launch kernel
//shrLog("clEnqueueNDRangeKernel (Convolution)...
");
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernelConv, 2, NULL, szGlobalWorkSize2D, NULL, 0, NULL, &NDrangeevent);
oclCheckErrorEX(ciErrNum,CL_SUCCESS, pCleanup);
clFinish(cqCommandQueue);
// ************* Computation of the accumulation task ***************
// Set the Argument values
//shrLog("clSetKernelArg 0 - 7...
");
ciErrNum = clSetKernelArg(ckKernelAcc, 0, sizeof(cl_mem), (void*)&cmDevTB);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernelAcc, 1, sizeof(cl_mem), (void*)&cmDevTA);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernelAcc, 2, sizeof(cl_mem), (void*)&cmDevAcc);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernelAcc, 3, sizeof(cl_float), (void*)&bias);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
//szGlobalWorkSize[0] = iWidth;
//szGlobalWorkSize[1] = iHeight;
szGlobalWorkSize1D = iNumElements;
// Launch kernel
//shrLog("clEnqueueNDRangeKernel (Convolution)...
");
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernelAcc, 1, NULL, &szGlobalWorkSize1D, NULL, 0, NULL, &NDrangeevent);
oclCheckErrorEX(ciErrNum,CL_SUCCESS, pCleanup);
clFinish(cqCommandQueue);
// ************* Computaion of the sigmoid funtion *****************
ciErrNum = clSetKernelArg(ckKernelSig, 0, sizeof(cl_mem), (void*)&cmDevAcc);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernelSig, 1, sizeof(cl_mem), (void*)&cmDevDst);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
//szGlobalWorkSize[0] = iWidth;
//szGlobalWorkSize[1] = iHeight;
szGlobalWorkSize1D = iNumElements;
// Launch kernel
//shrLog("clEnqueueNDRangeKernel (Convolution)...
");
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernelSig, 1, NULL, &szGlobalWorkSize1D, NULL, 0, NULL, &NDrangeevent);
oclCheckErrorEX(ciErrNum,CL_SUCCESS, pCleanup);
clFinish(cqCommandQueue);
ciErrNum = clEnqueueCopyBuffer(cqCommandQueue, cmDevDst, cmDevResult, 0, 0, mem_sizeImage, NULL, 0, NULL);
oclCheckErrorEX(ciErrNum,CL_SUCCESS, pCleanup);
if(i < CNNiterations - 2)
{
// ************* Making the output of sigmoid image to exetended image size *****************
ciErrNum = clSetKernelArg(ckKernelImgext, 0, sizeof(cl_mem), (void*)&cmDevDst);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernelImgext, 1, sizeof(cl_mem), (void*)&cmDevSrc);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernelImgext, 2, sizeof(cl_int), (void*)&iWidth);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
//szGlobalWorkSize[0] = iWidth;
//szGlobalWorkSize[1] = iHeight;
szGlobalWorkSize1D = iNumElements;
// Launch kernel
//shrLog("clEnqueueNDRangeKernel (Convolution)...
");
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernelImgext, 1, NULL, &szGlobalWorkSize1D, NULL, 0, NULL, &NDrangeevent);
oclCheckErrorEX(ciErrNum,CL_SUCCESS, pCleanup);
clFinish(cqCommandQueue);
}
}
shrLog("clEnqueueReadBuffer (GPU Output)...
");
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevResult, CL_TRUE, 0, mem_sizeImage, (void *) result, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
clFinish(cqCommandQueue);
Here ckKernelConv is a kernel variable for Convolution, ckKernelAcc is for accumulation fucntion, ckKernelSig is for sigmoid operation and ckKernelimgext is for increasing the size of the image. The kernels are as follows:
Convolution Kernel:
__kernel void Convolution(const __global float * pInput,
__constant float * pFilter,
__global float * pOutput,
const int nInWidth,
const int nFilterWidth)
{
const int nWidth = get_global_size(0);
const int xOut = get_global_id(0);
const int yOut = get_global_id(1);
const int xInTopLeft = xOut;
const int yInTopLeft = yOut;
float sum = 0;
for (int r = 0; r < nFilterWidth; r++)
{
const int idxFtmp = r * nFilterWidth;
const int yIn = yInTopLeft + r;
const int idxIntmp = yIn * nInWidth + xInTopLeft;
for (int c = 0; c < nFilterWidth; c++)
{
const int idxF = idxFtmp + c;
const int idxIn = idxIntmp + c;
sum += pFilter[idxF]*pInput[idxIn];
}
}
const int idxOut = yOut * nWidth + xOut;
pOutput[idxOut] = sum;
}
Sigmoid Kernel:
__kernel void Sigmoid(const __global float * Input,
__global float * Output)
{
const int ind = get_global_id(0);
Output[ind] = 0.5 * ((fabs(Input[ind] + 1)) - (fabs(Input[ind] - 1)));
}
Accumulation Kernel
__kernel void Accumulation(const __global float * TBimg,
__global float * TAimg,
__global float * SUMimg,
const float bias)
{
const int index = get_global_id(0);
SUMimg[index] = TBimg[index] + TAimg[index] + bias;
}
Image extension Kernel
__kernel void Imageext(__global float * pInput,
__global float * pOutput,
const int nInWidth)
{
const int idx = get_global_size(0);
int n = 0;
if(idx % nInWidth == 0)
if(idx != 0 && idx > nInWidth)
n = n + 1;
const int count = (n * 2) + idx + nInWidth + 2;
pOutput[count] = pInput[idx];
}
The memory elements used are:
cmDevSrc – CL_MEM_READ_WRITE
cmDevtempA – CL_MEM_READ_ONLY
cmDevtempB – CL_MEM_READ_ONLY
cmDevTA – CL_MEM_READ_WRITE
cmDevTB – CL_MEM_READ_WRITE
cmDevDst – CL_MEM_READ_WRITE
cmDevResult – CL_MEM_WRITE_ONLY