Problem with clEnqueueReadBuffer

Dear all,

I am using the concept of multiple kernels in my work. I created some cl_mem devices which can be used in these kernels. There is the transfer of these memory elements from one kernel to another. THis kernels are executed several times in a loop.

But at last while I am trying to retrieve the data back form GPU to CPU at the end of the process, I am getting an error as CL_MEM_OBJECT_ALLOCATION_FAILURE.

How can I solve this problem? Why this problem occurs?

Thanks in advance.

There is the transfer of these memory elements from one kernel to another.

I don’t recommend thinking in those terms. Actual physical data transfers happen between devices, not between kernels.

But at last while I am trying to retrieve the data back form GPU to CPU at the end of the process, I am getting an error as CL_MEM_OBJECT_ALLOCATION_FAILURE.

Can you show us how you are doing this? Is it possible that you are attempting to read a huge amount of data?

Also, what implementation of OpenCL are you using?

what implementation of OpenCL are you using?

I didn’t understand this question?

My implementation is developing a CNN algorithm for image processing applications.

The code is as follows in the execution phase after the build program, creating the kernels and creating the buffers is completed:

// Set the Argument values
	//shrLog("clSetKernelArg 0 - 7...

"); 
	ciErrNum = clSetKernelArg(ckKernelConv, 0, sizeof(cl_mem), (void*)&cmDevSrc);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	ciErrNum = clSetKernelArg(ckKernelConv, 1, sizeof(cl_mem), (void*)&cmDevtempB);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	ciErrNum = clSetKernelArg(ckKernelConv, 2, sizeof(cl_mem), (void*)&cmDevTB);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	ciErrNum = clSetKernelArg(ckKernelConv, 3, sizeof(cl_int), (void*)&iWidthExtended);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	ciErrNum = clSetKernelArg(ckKernelConv, 4, sizeof(cl_int), (void*)&TemplateWidth);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	
	szGlobalWorkSize2D[0] = iWidth;
	szGlobalWorkSize2D[1] = iHeight;
	
	// Launch kernel
	//shrLog("clEnqueueNDRangeKernel (Convolution)...
"); 
	ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernelConv, 2, NULL, szGlobalWorkSize2D, NULL, 0, NULL, &NDrangeevent);
	oclCheckErrorEX(ciErrNum,CL_SUCCESS, pCleanup);	

// ******** Computation of the entire CNN *************
	for(int i = 0; i < CNNiterations;i++)
	{
		printf("Iteration %d
",i);
		// Set the Argument values
		//shrLog("clSetKernelArg 0 - 7...

"); 
		ciErrNum = clSetKernelArg(ckKernelConv, 0, sizeof(cl_mem), (void*)&cmDevSrc);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
		ciErrNum = clSetKernelArg(ckKernelConv, 1, sizeof(cl_mem), (void*)&cmDevtempA);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
		ciErrNum = clSetKernelArg(ckKernelConv, 2, sizeof(cl_mem), (void*)&cmDevTA);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
		ciErrNum = clSetKernelArg(ckKernelConv, 3, sizeof(cl_int), (void*)&iWidthExtended);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
		ciErrNum = clSetKernelArg(ckKernelConv, 4, sizeof(cl_int), (void*)&TemplateWidth);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

		szGlobalWorkSize2D[0] = iWidth;
		szGlobalWorkSize2D[1] = iHeight;

		// Launch kernel
		//shrLog("clEnqueueNDRangeKernel (Convolution)...
"); 
		ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernelConv, 2, NULL, szGlobalWorkSize2D, NULL, 0, NULL, &NDrangeevent);
		oclCheckErrorEX(ciErrNum,CL_SUCCESS, pCleanup);	
		clFinish(cqCommandQueue);

		// ************* Computation of the accumulation task ***************
		// Set the Argument values
		//shrLog("clSetKernelArg 0 - 7...

"); 
		ciErrNum = clSetKernelArg(ckKernelAcc, 0, sizeof(cl_mem), (void*)&cmDevTB);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
		ciErrNum = clSetKernelArg(ckKernelAcc, 1, sizeof(cl_mem), (void*)&cmDevTA);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
		ciErrNum = clSetKernelArg(ckKernelAcc, 2, sizeof(cl_mem), (void*)&cmDevAcc);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
		ciErrNum = clSetKernelArg(ckKernelAcc, 3, sizeof(cl_float), (void*)&bias);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
		
		//szGlobalWorkSize[0] = iWidth;
		//szGlobalWorkSize[1] = iHeight;
		szGlobalWorkSize1D = iNumElements;

		// Launch kernel
		//shrLog("clEnqueueNDRangeKernel (Convolution)...
"); 
		ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernelAcc, 1, NULL, &szGlobalWorkSize1D, NULL, 0, NULL, &NDrangeevent);
		oclCheckErrorEX(ciErrNum,CL_SUCCESS, pCleanup);	
		clFinish(cqCommandQueue);

		// ************* Computaion of the sigmoid funtion *****************
		ciErrNum = clSetKernelArg(ckKernelSig, 0, sizeof(cl_mem), (void*)&cmDevAcc);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
		ciErrNum = clSetKernelArg(ckKernelSig, 1, sizeof(cl_mem), (void*)&cmDevDst);
		oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

		//szGlobalWorkSize[0] = iWidth;
		//szGlobalWorkSize[1] = iHeight;
		szGlobalWorkSize1D = iNumElements;

		// Launch kernel
		//shrLog("clEnqueueNDRangeKernel (Convolution)...
"); 
		ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernelSig, 1, NULL, &szGlobalWorkSize1D, NULL, 0, NULL, &NDrangeevent);
		oclCheckErrorEX(ciErrNum,CL_SUCCESS, pCleanup);
		clFinish(cqCommandQueue);

		ciErrNum = clEnqueueCopyBuffer(cqCommandQueue, cmDevDst, cmDevResult, 0, 0, mem_sizeImage, NULL, 0, NULL);
		oclCheckErrorEX(ciErrNum,CL_SUCCESS, pCleanup);

		if(i < CNNiterations - 2)
		{
			// ************* Making the output of sigmoid image to exetended image size *****************
			ciErrNum = clSetKernelArg(ckKernelImgext, 0, sizeof(cl_mem), (void*)&cmDevDst);
			oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
			ciErrNum = clSetKernelArg(ckKernelImgext, 1, sizeof(cl_mem), (void*)&cmDevSrc);
			oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
			ciErrNum = clSetKernelArg(ckKernelImgext, 2, sizeof(cl_int), (void*)&iWidth);
			oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

			//szGlobalWorkSize[0] = iWidth;
			//szGlobalWorkSize[1] = iHeight;
			szGlobalWorkSize1D = iNumElements;
			// Launch kernel
			//shrLog("clEnqueueNDRangeKernel (Convolution)...
"); 
			ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernelImgext, 1, NULL, &szGlobalWorkSize1D, NULL, 0, NULL, &NDrangeevent);
			oclCheckErrorEX(ciErrNum,CL_SUCCESS, pCleanup);
			clFinish(cqCommandQueue);
		}
	}

	shrLog("clEnqueueReadBuffer (GPU Output)...

"); 
	
	ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevResult, CL_TRUE, 0, mem_sizeImage, (void *) result, 0, NULL, NULL);

	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	clFinish(cqCommandQueue);

Here ckKernelConv is a kernel variable for Convolution, ckKernelAcc is for accumulation fucntion, ckKernelSig is for sigmoid operation and ckKernelimgext is for increasing the size of the image. The kernels are as follows:

Convolution Kernel:

__kernel void Convolution(const __global  float * pInput,
                        __constant float * pFilter,
                        __global  float * pOutput,
                        const int nInWidth,
						const int nFilterWidth)
{

    const int nWidth = get_global_size(0);

    const int xOut = get_global_id(0);
    const int yOut = get_global_id(1);

    const int xInTopLeft = xOut;
    const int yInTopLeft = yOut;

    float sum = 0;
    for (int r = 0; r < nFilterWidth; r++)
    {
        const int idxFtmp = r * nFilterWidth;

        const int yIn = yInTopLeft + r;
        const int idxIntmp = yIn * nInWidth + xInTopLeft;

        for (int c = 0; c < nFilterWidth; c++)
        {
            const int idxF  = idxFtmp  + c;
            const int idxIn = idxIntmp + c;
            sum += pFilter[idxF]*pInput[idxIn];
        }
    } 
    const int idxOut = yOut * nWidth + xOut;
    pOutput[idxOut] = sum;
	
}

Sigmoid Kernel:

__kernel void Sigmoid(const __global  float * Input,
						    __global float * Output)
{

	const int ind = get_global_id(0);

	Output[ind] = 0.5 * ((fabs(Input[ind] + 1)) - (fabs(Input[ind] - 1)));

}

Accumulation Kernel

__kernel void Accumulation(const __global  float * TBimg,
						      __global float * TAimg,
							  __global float * SUMimg,
							  const float bias)
{
	const int index = get_global_id(0);

	SUMimg[index] = TBimg[index] + TAimg[index] + bias;

}

Image extension Kernel

__kernel void Imageext(__global  float * pInput,
                        __global  float * pOutput,
                        const int nInWidth)
{
	const int idx = get_global_size(0);

	int n = 0;

	if(idx % nInWidth == 0)
		if(idx != 0 && idx > nInWidth)
			n = n + 1;

	const int count = (n * 2) + idx + nInWidth + 2;

	pOutput[count] = pInput[idx]; 
	
}

The memory elements used are:

cmDevSrc – CL_MEM_READ_WRITE
cmDevtempA – CL_MEM_READ_ONLY
cmDevtempB – CL_MEM_READ_ONLY
cmDevTA – CL_MEM_READ_WRITE
cmDevTB – CL_MEM_READ_WRITE
cmDevDst – CL_MEM_READ_WRITE
cmDevResult – CL_MEM_WRITE_ONLY

By “implementation of OpenCL” I mean AMD, NVidia, etc.

What is the value of mem_sizeImage? CL_MEM_OBJECT_ALLOCATION_FAILURE means “we don’t have enough device memory to perform this operation”.

I don’t understand what your program is doing, by the way. This is what I see:


for(int i = 0; i < CNNiterations;i++)
{
    Do some stuff here without using cmDevResult;
    Copy the contents of cmDevDst into cmDevResult;
}

Read the contents of cmDevResult;

Why write into cmDevResult in every iteration of the loop if you only read after the loop is finished? Why is cmDevResult needed at all? It looks like you could simply read from cmDevDst directly.

I am using the NVIDIA Geforce 9500 GT.

The size of mem_sizeImage = sizeof(float) * 160000.

Even I tried to copy the data from cmDevDst directly at the initial stage. But then also I am getting the same error CL_MEM_OBJECT_ALLOCATION_FAILURE. So I thought there is some mistake and tried to use a new buffer variable cmDevResult. But, the same thing repeats.

This looks so strange. It’s not a very big buffer! How do you create cmDevResult? Can you show us the call to clCreateBuffer()?

// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
	shrLog("clCreateBuffer process has started....
"); 
	cmDevSrc = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, mem_sizeextImage, NULL, &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	cmDevtempA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_sizetemplate, NULL, &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	cmDevtempB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_sizetemplate, NULL, &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	cmDevTA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, mem_sizeImage, NULL, &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	cmDevTB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, mem_sizeImage, NULL, &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, mem_sizeImage, NULL, &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	//cmDevextsrc = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, mem_sizeextImage, NULL, &ciErrNum);
	//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_only, mem_sizeImage, NULL, &ciErrNum);
	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
	shrLog("End of creating buffers...
");

I had a new observations. I though there may be a memory problem and I changed the input image from 400x400 to 200x200 and even low to 100x100.

Then while using clEnqueueReadBuffer it is showing an error CL_OUT_OF_RESOURCES. Why the error is changing when we reduced the image size?

MY graphic card memory is 512 MB.
If we consider the clCreateBuffer = sizeof(float) * 400 * 400. Then it is more than 640 MB.

Is this the problem because of which the error occurs?

CL_OUT_OF_RESOURCES means almost the same as CL_MEM_OBJECT_ALLOCATION_FAILURE.

I recommend to reduce the number of API calls as much as possible and if you can still reproduce the problem then send a bug report to NVidia. This might be a driver bug.

I thought I would have another thorough look onto the code before I conclude the behaviour. It was because of a invalid memory access by a few threads.

I would consider this thread solved. :slight_smile:

Thank you

/Bharath

Apologies. The previous reply was supposed to be on my own thread. I must find some sleep before I do something else.

/Bharath