CL_INVALID_COMMAND_QUEUE when calling clFinish

Hello All,

I am pretty new to GPU computing with OpenCL. Recently my program started returning error code -36 (CL_INVALID_COMMAND_QUEUE) when calling clFinish.
The C++ code is really long, so I have pasted only the major parts below.

Any comments would be appreciated!


    // Step 01: Ask the runtime for (at most) one platform
        OpenCLobj.err = clGetPlatformIDs( 1,
                                          &OpenCLobj.platform_id,
                                          &OpenCLobj.ret_num_platforms );
        err_check( OpenCLobj.err, "clGetPlatformIDs" );

    // Step 02: Ask that platform for one device of its default type
        OpenCLobj.err = clGetDeviceIDs( OpenCLobj.platform_id,
                                        CL_DEVICE_TYPE_DEFAULT,
                                        1,
                                        &OpenCLobj.device_id,
                                        &OpenCLobj.ret_num_devices );
        err_check( OpenCLobj.err, "clGetDeviceIDs" );

    // Step 03: Build a context around that single device
    //          (no properties, no error-notification callback)
        OpenCLobj.context = clCreateContext( NULL,
                                             1,
                                             &OpenCLobj.device_id,
                                             NULL,
                                             NULL,
                                             &OpenCLobj.err );
        err_check( OpenCLobj.err, "clCreateContext" );

    // Step 04: Open a command queue on the device with default properties (0)
        OpenCLobj.command_queue = clCreateCommandQueue( OpenCLobj.context,
                                                        OpenCLobj.device_id,
                                                        0,
                                                        &OpenCLobj.err );
        err_check( OpenCLobj.err, "clCreateCommandQueue" );

    // Step 06: Read kernel file
        // Load the whole of kernel.cl into `prog`. `prog` owns the text, so it
        // must stay alive for as long as source_str is used.
        ifstream file("kernel.cl");
        if (!file) {
            // Without this hint an unreadable file is silently passed on as an
            // empty program and only fails later, at kernel creation.
            cout << "Cannot open kernel.cl\n";
        }
        string prog( istreambuf_iterator<char>( file ), ( istreambuf_iterator<char>() ) );
        const char *source_str = prog.c_str();

    // Step 07: Create Kernel program from the read in source
        // One NUL-terminated source string, hence lengths == NULL.
        OpenCLobj.program = clCreateProgramWithSource( OpenCLobj.context, 1, &source_str, NULL, &OpenCLobj.err );
        err_check( OpenCLobj.err, "clCreateProgramWithSource" );

    // Step 08: Build Kernel Program
        OpenCLobj.err = clBuildProgram( OpenCLobj.program, 1, &OpenCLobj.device_id, NULL, NULL, NULL );
        if (OpenCLobj.err == CL_BUILD_PROGRAM_FAILURE) {
            cout << "CL_BUILD_PROGRAM_FAILURE" ;

            // Query the log length first: a fixed-size buffer makes the call fail
            // for longer logs, and then nothing useful is printed.
            size_t build_log_size = 0;
            cl_int log_err = clGetProgramBuildInfo( OpenCLobj.program, OpenCLobj.device_id,
                                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size );
            if (log_err == CL_SUCCESS && build_log_size > 0) {
                // std::string releases its storage automatically (the previous
                // new[] buffer was never freed).
                string build_log( build_log_size, '\0' );
                log_err = clGetProgramBuildInfo( OpenCLobj.program, OpenCLobj.device_id,
                                                 CL_PROGRAM_BUILD_LOG, build_log_size, &build_log[0], NULL );
                if (log_err == CL_SUCCESS) {
                    // c_str() stops at the terminating NUL written by OpenCL
                    cout << build_log.c_str();
                }
            }
        }
        // Report any build status (including the failure logged above)
        err_check( OpenCLobj.err, "clBuildProgram" );

    // Step 09: Create OpenCL Kernel
    		cl_kernel kernel = NULL;
    		kernel = clCreateKernel( OpenCLobj.program, "padding_center", &OpenCLobj.err ); err_check3( OpenCLobj.err, "clCreateKernel" );
    		

    // Step 05: Create memory objects and tranfer the data to memory buffer
    		cl_mem mobj_diffmatrix = NULL;
    		mobj_diffmatrix = clCreateBuffer(OpenCLobj.context, CL_MEM_READ_WRITE, sizeof(float)*AD.Xdim*AD.Ydim, NULL, &OpenCLobj.err); err_check3( OpenCLobj.err, "clCreateBuffer" );
    		OpenCLobj.err = clEnqueueWriteBuffer( OpenCLobj.command_queue, mobj_diffmatrix, CL_TRUE, 0, sizeof(float)*AD.Xdim*AD.Ydim, diffmatrix, 0, NULL, NULL ); err_check3( OpenCLobj.err, "clEnqueueWriteBuffer" );

    // Step 10: Set OpenCL kernel argument
    		OpenCLobj.err = clSetKernelArg( kernel, 0, sizeof( cl_mem ), (void *) &mobj_diffmatrix );	err_check3( OpenCLobj.err, "clSetKernelArg" );
    		OpenCLobj.err = clSetKernelArg( kernel, 1, sizeof( cl_mem ), (void *) &mobj_tmpdiff );	err_check3( OpenCLobj.err, "clSetKernelArg" );
    		
    // Step 11: Execute OpenCL kernel in data parallel
    		size_t work = 1000;
    		size_t localwork = 1000;
    		cl_event clEvent;
    		for (int i = 0; i < AD.nIter; i++)
    		{
    			clEnqueueNDRangeKernel( OpenCLobj.command_queue, kernel, 1, NULL, &work, &localwork, 0, NULL, &clEvent ); err_check3( OpenCLobj.err, "clEnqueueNDRangeKernel" );
    		}


    		OpenCLobj.err = clFlush( OpenCLobj.command_queue );		err_check3( OpenCLobj.err, "clFlush" );
    		OpenCLobj.err = clFinish( OpenCLobj.command_queue );	err_check3( OpenCLobj.err, "clFinish" );

First:

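Check the output of `clinfo` and read the device's maximum work-group size (CL_DEVICE_MAX_WORK_GROUP_SIZE); the local work size you pass must not exceed it.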


    // Step 11: Execute OpenCL kernel in data parallel
            size_t work = 1000;
            // 1000 work-items per group may be more than the device supports
            // (see CL_DEVICE_MAX_WORK_GROUP_SIZE).
            size_t localwork = 1000;
            cl_event clEvent;
            for (int i = 0; i < AD.nIter; i++)
            {
                // Keep the returned status; otherwise err_check3 tests a stale value
                // and a rejected launch is only noticed later, at clFinish.
                OpenCLobj.err = clEnqueueNDRangeKernel( OpenCLobj.command_queue, kernel, 1, NULL,
                                                        &work, &localwork, 0, NULL, &clEvent );
                err_check3( OpenCLobj.err, "clEnqueueNDRangeKernel" );
            }
            // Submit the queued launches, then wait for all of them to complete.
            OpenCLobj.err = clFlush( OpenCLobj.command_queue );
            err_check3( OpenCLobj.err, "clFlush" );
            OpenCLobj.err = clFinish( OpenCLobj.command_queue );
            err_check3( OpenCLobj.err, "clFinish" );

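Try:

- localwork = 64 (in OpenCL 1.x the global size must be a multiple of the local size, so `work` then has to be rounded up as well, e.g. to 1024, with a bounds check in the kernel)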