Problem with using local memory and global memory

GPUworker · November 2, 2010, 11:46am

Hi all,

I am working on using OpenCL for convolution of images.
I cant able to know how to initialize the local memories and global memories.
Can u help me???

//This is the kernel I am using for convolution
__kernel void Convolution(const __global float * pInput,
__constant float * pFilter,
__global float * pOutput,
const int nInWidth,
const int nFilterWidth)
{
const int nWidth = get_global_size(0);

const int xOut = get_global_id(0);
const int yOut = get_global_id(1);

const int xInTopLeft = xOut;
const int yInTopLeft = yOut;

float sum = 0;
for (int r = 0; r &lt; nFilterWidth; r++)
{
    const int idxFtmp = r * nFilterWidth;

    const int yIn = yInTopLeft + r;
    const int idxIntmp = yIn * nInWidth + xInTopLeft;

    for (int c = 0; c &lt; nFilterWidth; c++)
    {
        const int idxF  = idxFtmp  + c;
        const int idxIn = idxIntmp + c;
        sum += pFilter[idxF]*pInput[idxIn];
    }
}
const int idxOut = yOut * nWidth + xOut;
pOutput[idxOut] = sum;

}

//This is my main program

// I am getting stuck in the red marked place of the code…

int main(int argc, char **argv)
{
printf(“The program execution has started and entered in to MAIN”);
cvWaitKey(0);

IplImage * img1 = NULL;
img1 = cvLoadImage(“road.jpg”, CV_LOAD_IMAGE_COLOR );
cvNamedWindow( “RGB image”, CV_WINDOW_AUTOSIZE );
cvShowImage( “RGB image”, img1 ); // display image

IplImage * img = NULL;

img = cvCreateImage( cvGetSize(img1),img1->depth,1 );

//Converting it to gray scale
cvCvtColor( img1, img, CV_RGB2GRAY );

cvNamedWindow( “Gray image”, CV_WINDOW_AUTOSIZE );
cvShowImage( “Gray image”, img ); // display image

cvWaitKey(0);
//IplImage *img =(IplImage *)img2;
iWidth = img->width;
iHeight = img->height;
iNumElements = iWidth * iHeight;
//cvDestroyWindow( “image” );
//cvReleaseImage( &img ); // release memory

float filter[] = {0,1,0,1,-4,1,0,1,0};

//float b[] = {1,2,3,4,5,6,7,8,9,10};
//float c[10],d[10];

// get command line arg for quick test, if provided
bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, “noprompt”);

// start logs
shrSetLogFileName (“oclConvolution.txt”);
shrLog("%s Starting…

of float elements per Array = %u

", argv[0], iNumElements);

// set and log Global and Local work size dimensions
//szLocalWorkSize = (sizeof(cl_float4)* 1);
//szGlobalWorkSize = shrRoundUp((cl_float)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize
//shrLog("Global Work Size = %u
Local Work Size = %u

of Work Groups = %u

",
// szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));

szLocalWorkSize[0] = iHeight;
szLocalWorkSize[1] = iWidth;
szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], iFilterWidth);
szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], iWidth * iHeight);

// Allocate and initialize host arrays
shrLog( "Allocate and Init Host Mem…
");
//srcA = malloc(sizeof(cl_float4) * szGlobalWorkSize);
//srcB = malloc(sizeof(cl_float4) * szGlobalWorkSize);
GPUoutput = (float *)malloc(iWidth * iHeight * sizeof(cl_float4));
CPUoutput = (float *)malloc(iWidth * iHeight * sizeof(cl_float4));

srcA = (cl_float *)img;
srcB = filter;

// Get the NVIDIA platform
shrLog("Get the NVIDIA platform…
");
ciErrNum = oclGetPlatformID(&cpPlatform);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

// Get a GPU device
shrLog("Getting the GPU device…
");
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

// Create the context
shrLog("clCreateContext…
");
cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

// Create a command-queue
shrLog("clCreateCommandQueue…
");
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
shrLog("clCreateBuffer (SrcA, SrcB and GPUoutput in Device GMEM)…
");
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * iWidth * iHeight, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * iFilterWidth * iFilterWidth, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * iWidth * iHeight, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

// Read the OpenCL kernel in from source file
shrLog("oclLoadProgSource (%s)…
", cSourceFile);
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
cSourceCL = oclLoadProgSource(cPathAndName, “”, &szKernelLength);
oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);

// Create the program
shrLog("clCreateProgramWithSource…
");
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);

// Build the program with ‘mad’ Optimization option
#ifdef MAC
char* flags = “-cl-fast-relaxed-math -DMAC”;
#else
char* flags = “-cl-fast-relaxed-math”;
#endif
shrLog("clBuildProgram…
");
ciErrNum = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then cleanup and exit
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), “oclConvolution.ptx”);
Cleanup(EXIT_FAILURE);
}

// Create the kernel
shrLog("clCreateKernel (Convolution)…
");
ckKernel = clCreateKernel(cpProgram, “Convolution”, &ciErrNum);

// Set the Argument values
shrLog("clSetKernelArg 0 - 4…

");
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iWidth);
ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_int), (void*)&iFilterWidth);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // ERROR Check

// --------------------------------------------------------
// Core sequence… copy input data to GPU, compute, copy results back

// Asynchronous write of data to GPU device
shrLog("clEnqueueWriteBuffer (SrcA and SrcB)…
");
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * iWidth * iHeight, srcA, 0, NULL, NULL);
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * iFilterWidth * iFilterWidth, srcB, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

// Launch kernel
shrLog("clEnqueueNDRangeKernel (Convolution)…
");
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // ERROR Check

// Read back results and check accumulated errors
shrLog("clEnqueueReadBuffer (GPU Output)…

");
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * iWidth * iHeight, GPUoutput, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

// Compute and compare results for CPU-host and report errors and pass/fail
shrLog("Comparing against Host/C++ computation…

“);
ConvolutionHost ((const float*)srcA, (float*)srcB, (float*)CPUoutput);
shrBOOL bMatch = shrComparefet((const float*)CPUoutput, (const float*)GPUoutput, (unsigned int)iNumElements, 0.0f, 0);
shrLog(”%s

", (bMatch == shrTRUE) ? “PASSED” : “FAILED”);

IplImage *GPUresultImage = (IplImage *)GPUoutput;
IplImage *CPUresultImage = (IplImage *)CPUoutput;
cvNamedWindow( “image”, CV_WINDOW_AUTOSIZE );

// Diaplaying the result images from GPU and CPU
cvShowImage( “GPU Output”, GPUresultImage );
cvShowImage( “CPU Output”, CPUresultImage );

cvWaitKey(0);
cvDestroyWindow( “image” );
cvDestroyWindow( “GPU Output” );
cvDestroyWindow( “CPU Output” );

//Releasing the image memories
cvReleaseImage( &img );
cvReleaseImage( &GPUresultImage);
cvReleaseImage( &CPUresultImage);
// Diaplaying the result images from GPU and CPU

} // End of Main

Can any one give me some suggestions???

Thanks in advance

david.garcia · November 2, 2010, 3:44pm

There are two very different things. On one hand, there’s local and global memory. On the other hand there’s the arguments passed to clEnqueueNDRangeKernel. Your problem has to do with clEnqueueNDRangeKernel, not with local or global memory.

This is what you are doing today:

szLocalWorkSize[0] = iHeight;
szLocalWorkSize[1] = iWidth;
szGlobalWorkSize[0] = shrRoundUp((int)szLocalWorkSize[0], iFilterWidth);
szGlobalWorkSize[1] = shrRoundUp((int)szLocalWorkSize[1], iWidth * iHeight);
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);

This is probably what you are trying to do:


szGlobalWorkSize[0] = iWidth;
szGlobalWorkSize[1] = iHeight;
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL, szGlobalWorkSize, NULL, 0, NULL, NULL);

In other words, you only need to select an appropriate global size, which controls how many work-items (samples) will be run. Notice that I’m passing NULL as the local-size argument. This tells OpenCL to select the best local size automatically.

david.garcia · November 2, 2010, 3:48pm

There some other things that don’t look right. For example, you malloc iWidth * iHeight * sizeof(cl_float4) bytes but then when you create the OpenCL buffers you only allocate sizeof(cl_float) * iWidth * iHeight.

GPUworker · November 3, 2010, 3:13am

Thanks for your reply…
I will try the changes

GPUworker · November 7, 2010, 10:14am

I am still facing the problem with the code…

I think I have problem in clEnqueueNDRangeKernel which is used here to launch the kernel

Can any one help in giving the suggestions.

Thanks in advance.

HolyGeneralK · November 8, 2010, 11:36am

Is your kernel launch dependent upon the entire set of memory being passed to the device? I would add a clEnqueueBarrier(cqCommandQueue) to ensure that the data is fully copied to the device before enqueuing your kernel for launch, especially since you are using a non-blocking write.

What is the error code you are getting when you enqueue the kernel? That would be extremely helpful in narrowing down your problem.

david.garcia · November 8, 2010, 4:12pm

I would add a clEnqueueBarrier(cqCommandQueue) to ensure that the data is fully copied to the device before enqueuing your kernel for launch, especially since you are using a non-blocking write.

That shouldn’t be necessary because the queue is in-order.

GPUWorker, can you describe what kind of problem do you have now? Are you sure no OpenCL calls are returning an error code?

GPUworker · November 10, 2010, 3:53am

The error I am getting is :

Debug Assertion Failed!

Expression: _CrtIsValidHeapPointer(pUserData)

Its showing the path of .exe file which I am runinng and also
File:\dd\vctools\crt_bld\self_x86\crt\src\dbgheap.c
Line:1317

It is also saying
For more information on how your program can cause an assertion failure, see the Visual C++ documentation on asserts.

david.garcia · November 10, 2010, 4:28am

_CrtIsValidHeapPointer(pUserData)

That’s MSVC telling you that the pointer you’ve passed does not belong to the heap. The most likely cause is that you are freeing a pointer that was never initialized, or maybe you are freeing the same pointer twice. I don’t remember whether a corrupted heap can produce the same error.

Can you look at the call stack and find out what line of code in your application is generating that error? It’s most likely a call to free() or delete.

GPUworker · November 15, 2010, 2:15am

But it not showing the error on the line where we are freeing the pointers, it is showing at line where we are setting the kernel arguments with commandsetKernelArg

There I used an error check and it is showing the error at that point for me. But while debugging step by step I am getting error at launching the kernel.