Problem with Memory not getting freed up

Hi all,
I am interfacing my OpenCL code with Matlab (using mex files) and I am running into some problems doing that. The same code works on the ATI Radeon 5450 card, but it fails on NVIDIA’s GTX 260 because physical memory usage starts shooting up (checked using Task Manager) and ultimately clCreateContext fails with error code -6 (CL_OUT_OF_HOST_MEMORY).
Now, the thing is that in the OpenCL code all the memory is allocated on the GPU, and the mex file has only one output variable, so what could be the reason for this behavior?
The only thing different is the graphics card so this may be more suitable for NVIDIA OpenCL forum but I am posting it here just to see if someone has better suggestions.

Also, the way I am copying data to the GPU is like this:
clCreateBuffer
clEnqueueMapBuffer
memcpy /* destination is the pointer returned by the command above */
clEnqueueUnmapMemObject
Is this procedure correct?
I am pasting the mex file code…

Any help would be highly appreciated.
With regards,
richeek

#include <string.h>
#include"cl_resources.h"
#ifdef _CHAR16T
#define CHAR16_T
#endif
#include "mex.h"


void mexFunction (int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
  cl_int status;  
  /* Initialize OpenCL resoruces */
  status = initializeCL();
  if(status != CL_SUCCESS)
	return;

  // input variables
  float *R_re, *R_im, *dist_ZF,*s_re, *s_im ,  *symbol_alphabet_re, *symbol_alphabet_im;
  bool *bittable;
  int kk, nR, nT, nSym, *M, *symbols_ZF_i, total_bits,Sum_M,max_2_M;

  size_t size;
  cl_event events[2], ev;
  // output variables
  float *LLR;
 
  
  if (nrhs!=7)
    mexErrMsgTxt("7 input arguments required 

"
				 "[LLR] = soft_sd(R,s,dist_ZF,symbols_ZF,symbol_alphabet,bittable) ... soft Sphere Decoder

"
				 "  R ... upper triangular matrix obtained from the QR decomposition of the channel H (complex)
"
				 "  s ... received symbol vector, s=Q^H*y (nR x nSym) (complex)
"
                 "  dist_ZF ... Distance of the zero forcing solution (real)
"
                 "  symbols_ZF_i ... indices to symbols of the ZF solution (nT x nSym) (real integer)
"
                 "  M ... number of bits in the corresponding layer (1 x nR) (real)
"
				 "  symbol_alphabet ... for the demapping (2^M_max x nT) (complex)
"
				 "  bittable ... matrix containing the bits according to the symbol_alphabet (M x 2^M) (logical)
"
				 "  LLR  ... max-log-MAP approximation of the LLR values (M*nR) (real)

");
  if (nlhs>1) 
    mexErrMsgTxt("One output lefthand argument required 
");

  // check input variables
  if ( ! mxIsComplex(prhs[0]) )
    mexErrMsgTxt("1st argument 'R' must be a complex-valued (nR x nT) matrix");
  if ( ! mxIsComplex(prhs[1]) )
    mexErrMsgTxt("2nd argument 's' must be a complex-valued (nR x nSym) matrix");
  if (   mxIsComplex(prhs[2]) )
    mexErrMsgTxt("3rd argument 'dist_ZF' must be a real-valued (1 x nSym) matrix");
  if (   mxIsComplex(prhs[3]) )
    mexErrMsgTxt("4th argument 'symbols_ZF_i' must be a real-valued (nT x nSym) integer matrix");
  if (   mxIsComplex(prhs[4]) )
    mexErrMsgTxt("5th argument 'M' must be a real-valued (1 x nT) integer matrix");
  if ( ! mxIsComplex(prhs[5]) )
    mexErrMsgTxt("6th argument 'symbol_alphabet' must be a complex-valued (2^M_max x nT) matrix");
  if ( ! mxIsLogical(prhs[6]) )
    mexErrMsgTxt("7th argument 'bittable' must be a logical (M x 2^M) matrix");
  
  
    nR      = mxGetM(prhs[0]);    //  number of receive antennas
    nT      = mxGetN(prhs[0]);    //  number of transmit antennas
    nSym    = mxGetN(prhs[1]);    //  Block size (number of transmitted symbol vectors)
	Sum_M   = mxGetM(prhs[6]);	  //  sum of the number of bits of M	
	max_2_M = mxGetN(prhs[6]);    //  Maximum value of 2^M

    
    // fetch input variables
    R_re = (float *)(mxGetPr(prhs[0]));                    // fetch pointer to real part of R
	R_im = (float *)(mxGetPi(prhs[0]));                    // fetch pointer to imag part of R
    s_re = (float *)(mxGetPr(prhs[1]));                    // fetch pointer to real part of s
	s_im = (float *)(mxGetPi(prhs[1]));                    // fetch pointer to imag part of s
    dist_ZF = (float *)(mxGetPr(prhs[2]));                 // fetch ZF distance
	symbols_ZF_i = (int *)(mxGetPr(prhs[3]));               // fetch pointer to imag part of ZF solution indices
	M = (int *)(mxGetPr(prhs[4]));                          // fetch pointer to number of bits vector
    symbol_alphabet_re = (float *)(mxGetPr(prhs[5]));      // fetch pointer to real part of symbol alphabet
	symbol_alphabet_im = (float *)(mxGetPi(prhs[5]));      // fetch pointer to imag part of symbol alphabet
    bittable = (bool *)(mxGetPr(prhs[6]));                  // fetch pointer to real part of bit mapping table

    // allocate memory for output variables
    total_bits = 0;
    for(kk=0; kk<nT; kk++)
        total_bits += M[kk]; 
    
	size_t total_size = 0;
	// Allocate Variables on the Device Global Memory of GPU and Load the input variables to the device
	//llr_d = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) *block_length*no_blocks, NULL, &status);
	cl_mem R_re_d;
	size = nR * nT * sizeof(float);
	total_size += size;
	R_re_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
//	R_re_d = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size, R_re, &status);
	if(status != CL_SUCCESS) 
    { 
        mexPrintf("Error: Setting kernel argument. 
");
        return ;
    }
	
	float* R_re_p;
	R_re_p = (float *)clEnqueueMapBuffer(commandQueue,R_re_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: clEnqueueMapBuffer 
");
		return;
	}
	memcpy(R_re_p, R_re, size);
		/* Load the data back on the GPU */
	status = clEnqueueUnmapMemObject(commandQueue,R_re_d,(void *)R_re_p,0,NULL,&ev);
    if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() failed
");
        return;
	}

	status = clWaitForEvents(1, &ev);
	if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() Release failed R_re_d
");
        return;
	}
	cl_mem s_re_d;
	size = nT * nSym * sizeof(float);
	total_size += size;

	s_re_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
//	s_re_d = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size, s_re, &status);
	if(status != CL_SUCCESS) 
    { 
        mexPrintf("Error: Setting kernel argument. 
");
        return;
    }

	float *s_re_p;
	s_re_p = (float *)clEnqueueMapBuffer(commandQueue,s_re_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: clEnqueueMapBuffer 
");
		return;
	}
	memcpy(s_re_p, s_re, size);
		/* Load the data back on the GPU */
	status = clEnqueueUnmapMemObject(commandQueue,s_re_d,(void*)s_re_p,0,NULL,&ev);
    if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() failed
");
        return;
	}

	status = clWaitForEvents(1, &ev);
	if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() Release failed s_re_d
");
        return;
	}
	cl_mem s_im_d;
	size = nT * nSym * sizeof(float);
	total_size += size;
	s_im_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
//	s_im_d = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size, s_im, &status);
	if(status != CL_SUCCESS) 
    { 
        mexPrintf("Error: Setting kernel argument. 
");
        return;
    }

	float *s_im_p;
	s_im_p = (float *)clEnqueueMapBuffer(commandQueue,s_im_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: clEnqueueMapBuffer 
");
		return;
	}
	memcpy(s_im_p, s_im, size);
		/* Load the data back on the GPU */
	status = clEnqueueUnmapMemObject(commandQueue,s_im_d,(void *)s_im_p,0,NULL,&ev);
    if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() s_im failed
");
        return;
	}
	status = clWaitForEvents(1, &ev);
	if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() Release failed s_im_d
");
        return;
	}
	cl_mem R_im_d;
	size = nR * nT * sizeof(float);
	total_size += size;
	R_im_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
//	R_im_d = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size, R_im, &status);
	if(status != CL_SUCCESS) 
    { 
        mexPrintf("Error: Setting kernel argument. 
");
        return ;
    }

	float *R_im_p;
	R_im_p = (float *)clEnqueueMapBuffer(commandQueue,R_im_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: clEnqueueMapBuffer 
");
		return;
	}
	memcpy(R_im_p, R_im, size);
		/* Load the data back on the GPU */
	status = clEnqueueUnmapMemObject(commandQueue,R_im_d,(void *)R_im_p,0,NULL,&ev);
    if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() failed R_im with status %d
", status);
        return;
	}
	status = clWaitForEvents(1, &ev);
	if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() Release failed R_im_d
");
        return;
	}
	cl_mem dist_ZF_d;
	size = 1 * nSym * sizeof(float);
	dist_ZF_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
	total_size += size;
//	dist_ZF_d = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size, dist_ZF, &status);
	if(status != CL_SUCCESS) 
    { 
        mexPrintf("Error: Setting kernel argument. 
");
        return;
    }
	//status = clEnqueueWriteBuffer(commandQueue,dist_ZF_d,1,0,size,dist_ZF,0,0,0);
	float *dist_ZF_p;
	dist_ZF_p = (float *)clEnqueueMapBuffer(commandQueue,dist_ZF_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: clEnqueueMapBuffer 
");
		return;
	}
	memcpy(dist_ZF_p, dist_ZF, size);
		/* Load the data back on the GPU */
	status = clEnqueueUnmapMemObject(commandQueue,dist_ZF_d,(void *)dist_ZF_p,0,NULL,&ev);
    if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() failed
");
        return;
	}
	status = clWaitForEvents(1, &ev);
	if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() Release failed dist_ZF_d
");
        return;
	}

	cl_mem symbols_ZF_index_d;
	size = nT * nSym * sizeof(int);
	symbols_ZF_index_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
	total_size += size;
//	symbols_ZF_index_d = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size, symbols_ZF_i, &status);
	if(status != CL_SUCCESS) 
    { 
        mexPrintf("Error: Setting kernel argument. 
");
        return;
    }

	int *symbols_ZF_index_p;
	symbols_ZF_index_p = (int*)clEnqueueMapBuffer(commandQueue,symbols_ZF_index_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: clEnqueueMapBuffer 
");
		return;
	}
	memcpy(symbols_ZF_index_p, symbols_ZF_i, size);
		/* Load the data back on the GPU */
	status = clEnqueueUnmapMemObject(commandQueue,symbols_ZF_index_d,(void*)symbols_ZF_index_p,0,NULL,&ev);
    if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() failed
");
        return;
	}
	status = clWaitForEvents(1, &ev);
	if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() Release failed symbols_ZF_index_p
");
        return;
	}

	cl_mem M_d;
	size = nT * 1 * sizeof(int);
	M_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
	total_size += size;
//	M_d = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size, M, &status);
	if(status != CL_SUCCESS) 
    { 
        mexPrintf("Error: Setting kernel argument. 
");
        return;
    }

	int *M_p;
	M_p = (int*)clEnqueueMapBuffer(commandQueue,M_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: clEnqueueMapBuffer 
");
		return;
	}
	memcpy(M_p, M, size);
		/* Load the data back on the GPU */
	status = clEnqueueUnmapMemObject(commandQueue, M_d,(void *)M_p,0,NULL,&ev);
    if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() failed
");
        return;
	}
	status = clWaitForEvents(1, &ev);
	if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() Release failed M_d
");
        return;
	}

	cl_mem symbol_alphabet_re_d;
	size = nT * max_2_M * sizeof(float);
	total_size += size;
	symbol_alphabet_re_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
//	symbol_alphabet_re_d = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size, symbol_alphabet_re, &status);
	if(status != CL_SUCCESS) 
    { 
        mexPrintf("Error: Setting kernel argument. 
");
        return;
    }

	float *symbol_alphabet_re_p;
	symbol_alphabet_re_p = (float *)clEnqueueMapBuffer(commandQueue,symbol_alphabet_re_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: clEnqueueMapBuffer 
");
		return;
	}
	memcpy(symbol_alphabet_re_p, symbol_alphabet_re, size);
		/* Load the data back on the GPU */
	status = clEnqueueUnmapMemObject(commandQueue,symbol_alphabet_re_d,(void *)symbol_alphabet_re_p,0,NULL,&ev);
    if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() failed
");
        return;
	}
	status = clWaitForEvents(1, &ev);
	if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() Release failed symbol_alphabet_re_d
");
        return;
	}

	cl_mem symbol_alphabet_im_d;
	size = nT * max_2_M * sizeof(float);
	symbol_alphabet_im_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
	total_size += size;
//	symbol_alphabet_im_d = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR , size, symbol_alphabet_im, &status);
	if(status != CL_SUCCESS) 
    { 
        mexPrintf("Error: Setting kernel argument. 
");
        return;
    }

	float *symbol_alphabet_im_p;
	symbol_alphabet_im_p = (float *)clEnqueueMapBuffer(commandQueue,symbol_alphabet_im_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: clEnqueueMapBuffer 
");
		return;
	}
	memcpy(symbol_alphabet_im_p, symbol_alphabet_im, size);
		/* Load the data back on the GPU */
	status = clEnqueueUnmapMemObject(commandQueue,symbol_alphabet_im_d,(void *)symbol_alphabet_im_p,0,NULL,&ev);
    if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() failed
");
        return;
	}
	status = clWaitForEvents(1, &ev);
	if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() Release failed symbol_alphabet_im_d
");
        return;
	}

		cl_mem bittable_d;
//	size = Sum_M * max_2_M * sizeof(bool);
	size = Sum_M * max_2_M * sizeof(char);
	bittable_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
	total_size += size;
	if(status != CL_SUCCESS) 
    { 
        mexPrintf("Error: Setting kernel argument. 
");
       // return;
    }

//    bool *bittable_p;
    char *bittable_p;
//	bittable_p = (bool*)clEnqueueMapBuffer(commandQueue,bittable_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status);
	bittable_p = (char*)clEnqueueMapBuffer(commandQueue,bittable_d,CL_TRUE,CL_MAP_WRITE,0,size,0,NULL,NULL,&status);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: clEnqueueMapBuffer 
");
		return;
	}
	char *bittable_copy;
	bittable_copy = (char*)malloc(Sum_M*max_2_M*sizeof(char));
	for(int i=0;i<Sum_M*max_2_M; ++i)
	{
		if(bittable[i] == true)
			bittable_copy[i] = 1;
		else
			bittable_copy[i] = 0;
	}
//	memcpy(bittable_p, bittable, size);
	memcpy(bittable_p, bittable_copy, size);
		/* Load the data back on the GPU */
	status = clEnqueueUnmapMemObject(commandQueue,bittable_d,(void*)bittable_p,0,NULL,&ev);
    if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() failed bittable
");
        return;
	}
	status = clWaitForEvents(1, &ev);
	if(status != CL_SUCCESS)
	{
        mexPrintf("clEnqueueUnmapMemObject() Release failed bittable_d
");
        return;
	}

	/* This is the output */
	cl_mem LLR_d;
	size = Sum_M *nSym * sizeof(float);
	total_size += size;
	LLR_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
	if(status != CL_SUCCESS) 
    { 
        mexPrintf("Error: Setting kernel argument. 
");
        return;
    }
	//mexPrintf("Total allocated memory is %d
", total_size);
	/* Set kernel Arguments */
	
    /*** Set appropriate arguments to the kernel ***/

    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&R_re_d);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument.
");
		return;
	}
	status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&R_im_d);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument
");
		return;
	}
	status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&s_re_d);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument
");
		return;
	}
	status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&s_im_d);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument
");
		return;
	}
	status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&dist_ZF_d);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument
");
		return;
	}
	status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&symbols_ZF_index_d);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument
");
		return;
	}
	status = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&symbol_alphabet_re_d);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument
");
		return;
	}
	status = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&symbol_alphabet_im_d);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument
");
		return;
	}
	status = clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *)&bittable_d);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument
");
		return;
	}
	status = clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *)&LLR_d);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument
");
		return;
	}
	status = clSetKernelArg(kernel, 10, sizeof(int), (void *)&nSym);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument
");
		return;
	}
	status = clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *)&M_d);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument
");
		return;
	}
	status = clSetKernelArg(kernel, 12, sizeof(int), (void *)&nT);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument
");
		return;
	}
	status = clSetKernelArg(kernel, 13, sizeof(int), (void *)&nR);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument
");
		return;
	}
	status = clSetKernelArg(kernel, 14, sizeof(int), (void *)&total_bits);
	if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Setting kernel argument
");
		return;
	}

    size_t localThreads[2] = {128, 1};
	size_t globalThreads[2] = {nSym, 1};
	
	status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &events[0]);

    if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)
");
		return;
	}


    /* wait for the kernel call to finish execution */
    status = clWaitForEvents(1, &events[0]);
    if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Waiting for kernel run to finish.(clWaitForEvents)
");
		return;
	}

	// output variable with Sphere Decoder solution of single precision type
    plhs [0] = mxCreateNumericMatrix(total_bits,nSym,mxSINGLE_CLASS,mxREAL);
    

    if(plhs[0] == NULL)
      mexErrMsgTxt("mxCreateNumericMatrix failed(1)
");

    LLR  = (float *) mxGetPr(plhs[0]);                    // fetch pointer for output variable
    if(LLR == NULL ) 
      mexErrMsgTxt("mxCreateNumericMatrix failed(2)
");

	//Copy the data back from the GPU 
	size = Sum_M *nSym * sizeof(float);
	status = clEnqueueReadBuffer(commandQueue,LLR_d,CL_TRUE,0,size,LLR,0,0,&ev);
	if(status != CL_SUCCESS)
	{
		mexPrintf("Error in reading LLR buffer Status is: %d size is(bytes) %d
", status, size);
	}
	status = clWaitForEvents(1, &ev);
    if(status != CL_SUCCESS) 
	{ 
		mexPrintf("Error: Waiting for LLR read to finish.(clWaitForEvents) status is 
", status);
		return;
	}

	free(bittable_copy);
	// Free device memory
	status = 0;
	
    status += clReleaseMemObject(R_re_d);
	status += clReleaseMemObject(R_im_d);
	status += clReleaseMemObject(s_re_d);
	status += clReleaseMemObject(s_im_d);
	status += clReleaseMemObject(dist_ZF_d);
	status += clReleaseMemObject(symbols_ZF_index_d);
	status += clReleaseMemObject(M_d);
	status += clReleaseMemObject(symbol_alphabet_re_d);
	status += clReleaseMemObject(symbol_alphabet_im_d);
	status += clReleaseMemObject(bittable_d);
	status += clReleaseMemObject(LLR_d);
	status += clReleaseKernel(kernel);
    status += clReleaseProgram(program);
	status += clReleaseCommandQueue(commandQueue);
    status += clReleaseContext(context);

	if(status != 0)
		mexPrintf("Error in freeing up the memory");

} 

You may have found a memory leak in NVidia’s OpenCL implementation. Double-check that you are releasing all the resources you allocated with clCreateXXX() functions.

I think you are right, since the same code works on the ATI Radeon but fails on NVIDIA. I posted the same question in the NVIDIA OpenCL forum too, but nobody replies there…