error:CL_OUT_OF_RESOURCES when work dimention is 2

I am again stuck with another problem. The function clEnqueueReadBuffer is returning an error value of -5 i.e CL_OUT_OF_RESOURCES when work dimention is 2, {height, width}.

__kernel void hypothesis_opencl(__global uchar *Edges,
                                  __global uchar *Depth,
                                  __global float *Filtered,
                                  __global float *weights_table,
                                  __global float *dist_table,
                                  int height,
                                  int width,
                                  int fltr_radius,
                                  float search_limit)

{
        int2 pos;
        pos.x = (int)get_global_id(0);
        pos.y = (int)get_global_id(1);

        int index = (pos.y+(pos.x*width));

        Filtered[index] = (float)Depth[index];
}
...
...
size_t workSize[2] = {h, w}; // two dimensional Range

retVal = clEnqueueNDRangeKernel(GPUCommandQueue,
                                openCLKernel,
                                2,
                                NULL,
                                workSize,
                                NULL,
                                0,
                                NULL,
                                NULL);
if(retVal != CL_SUCCESS)
{
    std::cout << "Error::clEnqueueNDRangeKernel" <<std::endl;
}

// Copy the output in GPU memory back to CPU memory
retVal = clEnqueueReadBuffer( GPUCommandQueue,
                                buffGPU.frameFilteredDepth,
                              CL_FALSE,
                              0,
                              (sizeof(float) * (w * h)),
                              buffLoc.frameFilteredDepth,
                              0,
                              NULL,
                              NULL);
...
...

Now when when work dimention is 1 i.e height * width it runs well

__kernel void hypothesis_opencl(__global uchar *Edges,
                                  __global uchar *Depth,
                                  __global float *Filtered,
                                  __global float *weights_table,
                                  __global float *dist_table,
                                  int height,
                                  int width,
                                  int fltr_radius,
                                  float search_limit)

{
        index = (int)get_global_id(0);

        Filtered[index] = (float)Depth[index];
}

Do you have any idea what might be causing the error?

Are your buffers the same size when you execute in 1D and 2D? I know that certain implementations of CL allocate buffers lazily, which means that you won’t get an out-of-resources error by the time you allocate the buffer. Instead, you will get the error the first time you try to do anything useful with it, such as launching an NDRange.

Infact in both cases the i use a 1D buffer.

In the function clEnqueueReadBuffer, the GPU buffer “buffGPU.frameFilteredDepth” is allocated using

buff->frameFilteredDepth = clCreateBuffer(GPUContext,
CL_MEM_WRITE_ONLY,
sizeof(float)*HW,
NULL,
&errVal);

and host memory(CPU) buffer “buffLoc.frameFilteredDepth” is created using

buff->frameFilteredDepth = new float[HW];

Can there be any issue with this kind of allocation.

The following code may be the cause of your problem

size_t workSize[2] = {h, w}; // two dimensional Range

Modify this code you described in the beginning of this discussion thread to

size_t workSize[2] = {w, h}; // two dimensional Range

I believe this should get things working.

Nop, it did not work out.
Actually inside the kernel i am using ‘pos.x’ as height and ‘pos.y’ as width. That is why i used size_t workSize[2] = {h, w}

I think i might have done something wrong somewhere else, i am attaching the whole host code below. All the functions for allocation of the buffer are below the main function in the code

#include "yuv_reader.h"
#include "hypothesis_opencl.h"
#include "pugixml.hpp"
#include "config.h"


// Main function
// *********************************************************************
int main(int argc, char **argv)
{
	cl_int retVal; //Return value
	cl_int errNum;

	// Check the command line params
	if(argc != 2)
	{
		std::cout << "USAGE: OpenCLMatch <config_filename.xml>" << std::endl;
		return 0;
	}

	// load config
	pugi::xml_document confXml;
	if(confXml.load_file(argv[1]) != pugi::status_ok)
	{
		std::cout << "ERROR: cannot open XML config file " << argv[1] << std::endl;
		//return -2;
	}

	config conf(confXml);
	if(!conf.initConfig())
	{
		std::cout << "ERROR: XML config file is not complete" << argv[1] << std::endl;
		return -2;
	}

	//**************** Yuv file read *******************
	int frameno = conf.start_frame;
	const int h = conf.height,
			  w = conf.width,
			  format = 420,
			  radius = (conf.radius > RADIUS) ?  MIN(conf.radius, MAXRADIUS) : RADIUS;

	const float sigmaColor = (conf.sigma_color > 0) ? conf.sigma_color : SIGMA_COLOR,
			    sigmaDistance = (conf.sigma_distance > 0) ? conf.sigma_distance : SIGMA_DISTANCE,
			    searchLimit = (conf.search_limit > 0) ? conf.search_limit : SEARCH_LIMIT;

	string filenameCL = conf.openCL_filename;

	CIYuv CIYuv_col(h, w, format);
	CIYuv CIYuv_col444(h, w, 444);
	CIYuv CIYuv_depth(h, w, 400);
	CIYuv CIYuv_filteredDepth(h, w, 400);

	FILE *pf_read_col, *pf_read_depth, *pf_write, *errLog;

	if((errLog = fopen("Debug/errLog.txt", "w")) == NULL)
	{
		std::cout << "Error::Failed to open errLog";
	};


	string fileName = conf.color_filename;
	if((pf_read_col = fopen(fileName.c_str(), "rb")) == NULL)
	{
		std::cout << "main::fopen:: opening YUV color file failed
";
	}

	fileName = conf.depth_filename;
	if((pf_read_depth = fopen(fileName.c_str(), "rb")) == NULL)
	{
		std::cout << "main::fopen:: opening YUV depth file failed
";
	}

	fileName = conf.filtered_filename;
	if((pf_write = fopen(fileName.c_str(), "wb")) == NULL)
	{
		std::cout << "main::fopen:: opening YUV output depth file failed
";
	}

	// Get platforms
	cl_uint nPlatform = 0;
	retVal = clGetPlatformIDs(0, NULL, &nPlatform);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clGetPlatformIDs" << std::endl;
	}

	cl_platform_id* plInfos = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
	retVal = clGetPlatformIDs(nPlatform, plInfos, NULL);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clGetPlatformIDs" << std::endl;
	}

	// Get device
	cl_context GPUContext;
	for (cl_uint i = 0; i < nPlatform; i++)
	{	
		cl_uint nDev = 0;
		retVal = clGetDeviceIDs(plInfos[i], CL_DEVICE_TYPE_GPU, 0, 0, &nDev);
		if(retVal != CL_SUCCESS)
		{
			std::cout << "Error::clGetDeviceIDs" << std::endl;
		}

		cl_device_id* GPUDevices = (cl_device_id*)malloc(nDev * sizeof(cl_device_id));
		retVal = clGetDeviceIDs(plInfos[i], CL_DEVICE_TYPE_GPU, nDev, GPUDevices, 0);
		if(retVal != CL_SUCCESS)
		{
			std::cout << "Error::clGetDeviceIDs" << std::endl;
		}
		
		// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
		GPUContext = clCreateContext(NULL, nDev, GPUDevices, 0, 0, &errNum);
		if(errNum != CL_SUCCESS)
		{
			std::cout << "ERROR: OpenCL::clCreateContext" << std::endl;
			return -1;
		}
	}


	// Get the list of GPU devices associated with this context
	size_t ParmDataBytes;
	retVal = clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clGetContextInfo" << std::endl;
	}

	cl_device_id* GPUDevices = (cl_device_id*)malloc(ParmDataBytes);

	retVal = clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clGetContextInfo" << std::endl;
	}

	// Create a command-queue on the first GPU device
	cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext,	GPUDevices[0], 0, &errNum);
	if(errNum != CL_SUCCESS)
	{
		std::cout << "ERROR: OpenCL::clCreateCommandQueue" << std::endl;
		return -1;
	}

	// Read OpenCL code from file
	size_t sourceLength = 0;
	char* sourceCL = readTextFile((char*)filenameCL.c_str(), &sourceLength);

	if(sourceCL == NULL || sourceLength<10)
	{
		std::cout << "ERROR::main::readTextFile: Cannot open " << filenameCL << " file" << std::endl;
		return -2;
	}

	// Create OpenCL program with source code
	cl_program OpenCLProgram = clCreateProgramWithSource( GPUContext, 
														  1, 
														  (const char **)&sourceCL, 
														  &sourceLength, 
														  &errNum);
		delete[] sourceCL;
	
	if(errNum != CL_SUCCESS)
	{
		std::cout << "ERROR::main::clCreateProgramWithSource: Unable to create opencl program" << std::endl;
		return -1;
	}

	// Build the program (OpenCL JIT compilation)
	retVal = clBuildProgram(OpenCLProgram, 1, GPUDevices, NULL, NULL, NULL);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "ERROR::main::clBuildProgram: " << retVal << std::endl;
		return -1;
	}

	size_t paramValueSize = 1024 * 1024,
			param_value_size_ret;
	char *paramValue;
	paramValue = (char*)calloc(paramValueSize, sizeof(char));
	retVal = clGetProgramBuildInfo( OpenCLProgram,
									GPUDevices[0],
									CL_PROGRAM_BUILD_LOG,
									paramValueSize,
									paramValue,
									&param_value_size_ret);

	fprintf(errLog, paramValue);
	fclose(errLog);

	// Create a handle to the compiled OpenCL function (Kernel)
	cl_kernel openCLKernel = clCreateKernel( OpenCLProgram,
											 "hypothesis_opencl",
											 &errNum);
	if(errNum != CL_SUCCESS)
	{
		std::cout << "ERROR::main::clCreateKernel: " << errNum << std::endl;
		return -1;
	}

	// Set kernel arguments
	LocalBuffer buffLoc;
	GPUBuffer buffGPU;
	initLocalBuffer(&buffLoc, w, h);

	retVal = initGPUBuffer(&buffGPU, GPUContext, w, h, searchLimit, radius);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::initGPUBuffer" <<std::endl;
	}

	// Fill local buffer with appropriate values
	// pre-calculation of color & distance weights
	init_color_SAD(buffLoc.weightTable, sigmaColor, MAXDIFF);    
	init_distance(buffLoc.distTable, sigmaDistance, radius);


	// Loop till the end of video file
	while((CIYuv_depth.readOneFrame(pf_read_depth, frameno)) == true)
	{
		CIYuv_col.readOneFrame(pf_read_col, frameno);

		std::cout << "Processing frmne no. " << frameno << "
";

		// Set the YUV image format to 444
		CIYuv_col444.setData444_inIYUV(&CIYuv_col);

		int i = 0,
			j = h * w,
			k = h * w * 2;

		// Initialize with some interesting data
		// Copy Y U V data to the local buffer
		for(int r = 0; r < h; r++)
		{
			for(int c = 0; c < w; c++)
			{
				buffLoc.frameCol[i] = CIYuv_col444.Y[r][c];
				buffLoc.frameCol[j++] = CIYuv_col444.U[r][c];
				buffLoc.frameCol[k++] = CIYuv_col444.V[r][c];

				buffLoc.frameDepth[i++] = CIYuv_depth.Y[r][c];
			}
		}

		// Copy the output in CPU memory TO GPU memory 
		retVal = fillGPUBuffer( GPUCommandQueue, 
					   &CIYuv_col,
					   &CIYuv_depth,
					   &buffGPU,
					   &buffLoc );
		if(retVal != CL_SUCCESS)
		{
			std::cout << "Error::fillGPUBuffer" <<std::endl;
		}

		retVal = initKernel( openCLKernel,
					&CIYuv_col,
					&CIYuv_depth,
					&buffGPU);
		if(retVal != CL_SUCCESS)
		{
			std::cout << "Error::initKernel" <<std::endl;
		}

		// Launch the Kernel on the GPU
		size_t workSize[2] = {h, w}; // two dimensional Range

		retVal = clEnqueueNDRangeKernel(	GPUCommandQueue,
								openCLKernel, 
								2, 
								NULL,
								workSize, 
								NULL, 
								0, 
								NULL, 
								NULL);
		if(retVal != CL_SUCCESS)
		{
			std::cout << "Error::clEnqueueNDRangeKernel" <<std::endl;
		}

		// Copy the output in GPU memory back to CPU memory
		retVal = clEnqueueReadBuffer( GPUCommandQueue,
							 buffGPU.frameFilteredDepth, 
							 CL_FALSE, 
							 0,
							 (sizeof(float) * (w * h)), 
							 buffLoc.frameFilteredDepth, 
							 0, 
							 NULL, 
							 NULL);
		if(retVal != CL_SUCCESS)
		{
			std::cout << "Error::clEnqueueReadBuffer" <<std::endl;
		}

		// Putting the output vector back to the image format
		// Copy output vector back to Y
		i = 0;
		for(int r = 0; r < CIYuv_col.getHeight(); r++)
		{
			for(int c = 0; c < CIYuv_col.getWidth(); c++)
			{
				CIYuv_filteredDepth.Y[r][c] = (uchar)buffLoc.frameFilteredDepth[i++];
			}
		}


		// Write to a YUV file
		if((CIYuv_filteredDepth.writeOneFrame(pf_write)) != true)
		{
			std::cout << "ERROR:CIYuv_filteredDepth.writeOneFrame Write YUV file failed
";
		}

		// Increment the frame index
  		frameno ++;

	}

	// Cleanup
	free(GPUDevices);
	clReleaseKernel(openCLKernel);
	clReleaseProgram(OpenCLProgram);
	clReleaseCommandQueue(GPUCommandQueue);
	clReleaseContext(GPUContext);

	releaseGPUBuffer(&buffGPU);
	releaseLocalBuffer(&buffLoc);


		
	// Close all opened files
	fclose(pf_read_col);
	fclose(pf_read_depth);
	fclose(pf_write);

	_getch();
	return 0;
}

int initGPUBuffer(GPUBuffer* buff, cl_context GPUContext, int width, int height, float searchLimit, int radius)
{
	const int HW = height*width;
	const int HW3 = HW*3;
	cl_int errVal;

	// Allocate GPU memory for source vectors AND initialize from CPU memory
	buff->frameCol = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY , sizeof(uchar)*HW3, NULL, &errVal);
	if(errVal != NULL)
	{
		std::cout << "Error::clCreateBuffer" << std::endl;
		return -1;
	}

	buff->frameDepth = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY, sizeof(uchar)*HW, NULL, &errVal);
	if(errVal != NULL)
	{
		std::cout << "Error::clCreateBuffer" << std::endl;
		return -1;
	}

	buff->distTable = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY, sizeof(float)*MAXWND, NULL, &errVal);
	if(errVal != NULL)
	{
		std::cout << "Error::clCreateBuffer" << std::endl;
		return -1;
	}

	buff->weightTable = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY, sizeof(float)*MAXDIFF, NULL, &errVal);
	if(errVal != NULL)
	{
		std::cout << "Error::clCreateBuffer" << std::endl;
		return -1;
	}

	// Allocate output memory on GPU
	buff->frameFilteredDepth = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(float)*HW, NULL, &errVal);
	if(errVal != NULL)
	{
		std::cout << "Error::clCreateBuffer" << std::endl;
		return -1;
	}

	//Set constant scalar paremeters
	buff->width = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(int), &width, &errVal);
	if(errVal != NULL)
	{
		std::cout << "Error::clCreateBuffer" << std::endl;
		return -1;
	}

	buff->height = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(int), &height, &errVal);
	if(errVal != NULL)
	{
		std::cout << "Error::clCreateBuffer" << std::endl;
		return -1;
	}

	buff->searchLimit = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(float), &searchLimit, &errVal);
	if(errVal != NULL)
	{
		std::cout << "Error::clCreateBuffer" << std::endl;
		return -1;
	}

	buff->filterRadius = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(int), &radius, &errVal);
	if(errVal != NULL)
	{
		std::cout << "Error::clCreateBuffer" << std::endl;
		return -1;
	}

	return CL_SUCCESS;

}

void releaseGPUBuffer(GPUBuffer* buff)
{
	clReleaseMemObject(buff->frameCol);
	clReleaseMemObject(buff->frameDepth);
	clReleaseMemObject(buff->distTable);
	clReleaseMemObject(buff->weightTable);
	clReleaseMemObject(buff->frameFilteredDepth);

	clReleaseMemObject(buff->width);
	clReleaseMemObject(buff->height);
	clReleaseMemObject(buff->searchLimit);
	clReleaseMemObject(buff->filterRadius);
}


void initLocalBuffer(LocalBuffer* buff, int width, int height)
{
	const int HW = height*width;
	const int HW3 = HW*3;

	buff->frameCol = new uchar[HW3];
	buff->frameDepth = new uchar[HW];
	buff->frameFilteredDepth = new float[HW];
	buff->distTable = new float[MAXWND];
	buff->weightTable = new float[MAXDIFF];
}

void releaseLocalBuffer(LocalBuffer* buff)
{
	delete[] buff->frameCol;
	delete[] buff->frameDepth;
	delete[] buff->frameFilteredDepth;
	delete[] buff->distTable;
	delete[] buff->weightTable;
}

int fillGPUBuffer(cl_command_queue GPUCommandQueue, CIYuv* CIYuv_col, CIYuv* CIYuv_depth, GPUBuffer* buffGPU, LocalBuffer* buffLoc)
{
	cl_int retVal;

	retVal = clEnqueueWriteBuffer(GPUCommandQueue, buffGPU->frameCol, CL_TRUE, 0,
		sizeof(uchar) * CIYuv_col->getSizeInByte(), buffLoc->frameCol, 0, NULL, NULL);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clEnqueueWriteBuffer" << std::endl;
		return -1;
	}

	retVal = clEnqueueWriteBuffer(GPUCommandQueue, buffGPU->frameDepth, CL_TRUE, 0,
		sizeof(uchar) * (CIYuv_depth->getHeight() * CIYuv_depth->getWidth()), buffLoc->frameDepth, 0, NULL, NULL);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clEnqueueWriteBuffer" << std::endl;
		return -1;
	}

	retVal = clEnqueueWriteBuffer(GPUCommandQueue, buffGPU->frameFilteredDepth, CL_TRUE, 0,
		sizeof(float) * (CIYuv_depth->getHeight() * CIYuv_depth->getWidth()), buffLoc->frameFilteredDepth, 0, NULL, NULL);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clEnqueueWriteBuffer" << std::endl;
		return -1;
	}

	retVal = clEnqueueWriteBuffer(GPUCommandQueue, buffGPU->weightTable, CL_TRUE, 0,
		sizeof(float) * MAXDIFF, buffLoc->weightTable, 0, NULL, NULL);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clEnqueueWriteBuffer" << std::endl;
		return -1;
	}

	retVal = clEnqueueWriteBuffer(GPUCommandQueue, buffGPU->distTable, CL_TRUE, 0,
		sizeof(float) * MAXWND, buffLoc->distTable, 0, NULL, NULL);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clEnqueueWriteBuffer" << std::endl;
		return -1;
	}

	return CL_SUCCESS;
}

int initKernel(cl_kernel kernel, CIYuv* CIYuv_col, CIYuv* CIYuv_depth, GPUBuffer* buffGPU)
{
	cl_int retVal;

	retVal = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffGPU->frameCol);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clSetKernelArg" << std::endl;
		return -1;
	}

	retVal = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffGPU->frameDepth);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clSetKernelArg" << std::endl;
		return -1;
	}

	retVal = clSetKernelArg(kernel, 2, sizeof(cl_mem), &buffGPU->frameFilteredDepth);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clSetKernelArg" << std::endl;
		return -1;
	}

	retVal = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buffGPU->weightTable);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clSetKernelArg" << std::endl;
		return -1;
	}

	retVal = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buffGPU->distTable);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clSetKernelArg" << std::endl;
		return -1;
	}

	retVal = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buffGPU->height);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clSetKernelArg" << std::endl;
		return -1;
	}

	retVal = clSetKernelArg(kernel, 6, sizeof(cl_mem), &buffGPU->width);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clSetKernelArg" << std::endl;
		return -1;
	}

	retVal = clSetKernelArg(kernel, 7, sizeof(cl_mem), &buffGPU->filterRadius);
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clSetKernelArg" << std::endl;
		return -1;
	}

	retVal = clSetKernelArg(kernel, 8, sizeof(cl_mem), &buffGPU->searchLimit);	
	if(retVal != CL_SUCCESS)
	{
		std::cout << "Error::clSetKernelArg" << std::endl;
		return -1;
	}

	return CL_SUCCESS;
}

char* readTextFile(char* filename, size_t* fileLength)
{
	char* cSourceCL = NULL;

	ifstream progin(filename);
	if(progin.is_open())
	{
		progin.seekg(0, ios::end);
		int srcLength = progin.tellg();
		progin.seekg(0, ios::beg);		
		cSourceCL = new char[srcLength+1];		
		progin.read(cSourceCL, srcLength);
		progin.close();

		
		for(int idx = srcLength; idx > 0; idx--)
		{
			std::cout << cSourceCL[idx] <<"  ";
			if(cSourceCL[idx] == '}')
			{
				*fileLength = idx + 1;
				cSourceCL[*fileLength] = '\0';
				break;
			}
		}
	}

	return cSourceCL;
}

inline int int_SAD(int a, int b)
{
    int sad = a - b;
    return sad > 0 ? sad : -sad;
}

void init_color_SAD(float *weights_table, float sigmaColor, int max_cost)
{
    for(int i=0; i<max_cost; i++)
    {
        weights_table[i] = exp(-((float)i)/sigmaColor);
    }
}

void init_distance(float *distance, float sigmaDistance, int radius)
{
    // pre-calculation of distance weights
	for(int i=-radius, index = 0; i<=radius; i++)
	{
		for(int j=-radius; j<=radius; j++, index++)
		{
			distance[index] = exp(-sqrt((float)i*i+j*j)/sigmaDistance);
		}
	}
}




Now i am even getting CL_OUT_OF_RESOURCES even for the function clEnqueueWriteBuffer.

Is there some problem in which i am allocating the GPU buffers?

I think it’s worth insisting a little more: have you tried reducing (a lot) the size of the buffers and see if the problem goes away? The buffers could be too big for your GPU. There’s no way for us to know how large they are.

Second, try removing parts of your app and see if the problem is still there. Once you have the smallest sequence of OpenCL calls that still reproduce the issue, show us what it looks like.