Hi all,
I am working on using OpenCL for convolution of images.
I cant able to know how to initialize the local memories and global memories.
Can u help me???
//This is the kernel I am using for convolution
__kernel void Convolution(const __global float * pInput,
__constant float * pFilter,
__global float * pOutput,
const int nInWidth,
const int nFilterWidth)
{
const int nWidth = get_global_size(0);
const int xOut = get_global_id(0);
const int yOut = get_global_id(1);
const int xInTopLeft = xOut;
const int yInTopLeft = yOut;
float sum = 0;
for (int r = 0; r < nFilterWidth; r++)
{
const int idxFtmp = r * nFilterWidth;
const int yIn = yInTopLeft + r;
const int idxIntmp = yIn * nInWidth + xInTopLeft;
for (int c = 0; c < nFilterWidth; c++)
{
const int idxF = idxFtmp + c;
const int idxIn = idxIntmp + c;
sum += pFilter[idxF]*pInput[idxIn];
}
}
const int idxOut = yOut * nWidth + xOut;
pOutput[idxOut] = sum;
}
//This is my main program
// I am getting stuck in the red marked place of the code…
int main(int argc, char **argv)
{
printf(“The program execution has started and entered in to MAIN”);
cvWaitKey(0);
IplImage * img1 = NULL;
img1 = cvLoadImage(“road.jpg”, CV_LOAD_IMAGE_COLOR );
cvNamedWindow( “RGB image”, CV_WINDOW_AUTOSIZE );
cvShowImage( “RGB image”, img1 ); // display image
IplImage * img = NULL;
img = cvCreateImage( cvGetSize(img1),img1->depth,1 );
//Converting it to gray scale
cvCvtColor( img1, img, CV_RGB2GRAY );
cvNamedWindow( “Gray image”, CV_WINDOW_AUTOSIZE );
cvShowImage( “Gray image”, img ); // display image
cvWaitKey(0);
//IplImage *img =(IplImage *)img2;
iWidth = img->width;
iHeight = img->height;
iNumElements = iWidth * iHeight;
//cvDestroyWindow( “image” );
//cvReleaseImage( &img ); // release memory
float filter[] = {0,1,0,1,-4,1,0,1,0};
//float b[] = {1,2,3,4,5,6,7,8,9,10};
//float c[10],d[10];
// get command line arg for quick test, if provided
bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, “noprompt”);
// start logs
shrSetLogFileName (“oclConvolution.txt”);
shrLog("%s Starting…
of float elements per Array = %u
", argv[0], iNumElements);
// set and log Global and Local work size dimensions
//szLocalWorkSize = (sizeof(cl_float4)* 1);
//szGlobalWorkSize = shrRoundUp((cl_float)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize
//shrLog("Global Work Size = %u
Local Work Size = %u
of Work Groups = %u
",
// szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));
szLocalWorkSize[0] = iHeight;
szLocalWorkSize[1] = iWidth;
szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], iFilterWidth);
szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], iWidth * iHeight);
// Allocate and initialize host arrays
shrLog( "Allocate and Init Host Mem…
");
//srcA = malloc(sizeof(cl_float4) * szGlobalWorkSize);
//srcB = malloc(sizeof(cl_float4) * szGlobalWorkSize);
GPUoutput = (float *)malloc(iWidth * iHeight * sizeof(cl_float4));
CPUoutput = (float *)malloc(iWidth * iHeight * sizeof(cl_float4));
srcA = (cl_float *)img;
srcB = filter;
// Get the NVIDIA platform
shrLog("Get the NVIDIA platform…
");
ciErrNum = oclGetPlatformID(&cpPlatform);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Get a GPU device
shrLog("Getting the GPU device…
");
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Create the context
shrLog("clCreateContext…
");
cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Create a command-queue
shrLog("clCreateCommandQueue…
");
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
shrLog("clCreateBuffer (SrcA, SrcB and GPUoutput in Device GMEM)…
");
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * iWidth * iHeight, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * iFilterWidth * iFilterWidth, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * iWidth * iHeight, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Read the OpenCL kernel in from source file
shrLog("oclLoadProgSource (%s)…
", cSourceFile);
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
cSourceCL = oclLoadProgSource(cPathAndName, “”, &szKernelLength);
oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
// Create the program
shrLog("clCreateProgramWithSource…
");
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
// Build the program with ‘mad’ Optimization option
#ifdef MAC
char* flags = “-cl-fast-relaxed-math -DMAC”;
#else
char* flags = “-cl-fast-relaxed-math”;
#endif
shrLog("clBuildProgram…
");
ciErrNum = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then cleanup and exit
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), “oclConvolution.ptx”);
Cleanup(EXIT_FAILURE);
}
// Create the kernel
shrLog("clCreateKernel (Convolution)…
");
ckKernel = clCreateKernel(cpProgram, “Convolution”, &ciErrNum);
// Set the Argument values
shrLog("clSetKernelArg 0 - 4…
");
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iWidth);
ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_int), (void*)&iFilterWidth);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // ERROR Check
// --------------------------------------------------------
// Core sequence… copy input data to GPU, compute, copy results back
// Asynchronous write of data to GPU device
shrLog("clEnqueueWriteBuffer (SrcA and SrcB)…
");
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * iWidth * iHeight, srcA, 0, NULL, NULL);
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * iFilterWidth * iFilterWidth, srcB, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Launch kernel
shrLog("clEnqueueNDRangeKernel (Convolution)…
");
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // ERROR Check
// Read back results and check accumulated errors
shrLog("clEnqueueReadBuffer (GPU Output)…
");
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * iWidth * iHeight, GPUoutput, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Compute and compare results for CPU-host and report errors and pass/fail
shrLog("Comparing against Host/C++ computation…
“);
ConvolutionHost ((const float*)srcA, (float*)srcB, (float*)CPUoutput);
shrBOOL bMatch = shrComparefet((const float*)CPUoutput, (const float*)GPUoutput, (unsigned int)iNumElements, 0.0f, 0);
shrLog(”%s
", (bMatch == shrTRUE) ? “PASSED” : “FAILED”);
IplImage *GPUresultImage = (IplImage *)GPUoutput;
IplImage *CPUresultImage = (IplImage *)CPUoutput;
cvNamedWindow( “image”, CV_WINDOW_AUTOSIZE );
// Diaplaying the result images from GPU and CPU
cvShowImage( “GPU Output”, GPUresultImage );
cvShowImage( “CPU Output”, CPUresultImage );
cvWaitKey(0);
cvDestroyWindow( “image” );
cvDestroyWindow( “GPU Output” );
cvDestroyWindow( “CPU Output” );
//Releasing the image memories
cvReleaseImage( &img );
cvReleaseImage( &GPUresultImage);
cvReleaseImage( &CPUresultImage);
// Diaplaying the result images from GPU and CPU
} // End of Main
Can any one give me some suggestions???
Thanks in advance