Command queue goes invalid after kernel execution

Hello everyone,

I am currently trying to write my first OpenCL program, although it’s been a while I am stuck at one point. My program reads data from a file, then sends it to a kernel along with its size.
Here is the code for the creation of the kernel to hold the data read from the file (which is the pointer ‘data’) and another buffer to hold the output of the kernel:

mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_uchar)*readSize, data, &ciErr1);

mem2 = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_short)*readSize, out, &ciErr1);

readSize (as you can guess… :slight_smile: ) is the size of the read data, which is a 500x10 matrix of unsigned chars.

Then, the program sets the argument values:

clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&mem1);
clSetKernelArg(ckKernel, 1, sizeof(cl_uint), (void*)&blocksize);
clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&mem2);
clSetKernelArg(ckKernel, 3, sizeof(cl_char), (void*)&cols);
clSetKernelArg(ckKernel, 4, sizeof(cl_char), (void*)&rows);

Afterwards, the kernel is launched:

clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &glob_wsize, NULL, 0, NULL, NULL);

with global worksize = 5000.

I have been struggling to find out why after this last instruction I am getting an invalid command queue.

I hope somebody can help me!!
Thanks a lot.

caseClosed

Well, I’m not sure about your command queue error, you might need to post more of the code
that is causing the problem. One thing I am curious about though is why you do this

mem2 = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_short)*readSize, out, &ciErr1);

if mem2 is write_only and is for the return data from your kernel, what are you copying into
it initially and why?


jason

Yeah, you’re right, I’m not actually copying anything in that buffer, but since my kernel was returning no data I thought that maybe setting the flag to CL_MEM_COPY_HOST_PTR would have also meant to copy the results of the computation in the buffer after the kernel completed its execution. I know it sounds stupid since the OpenCL specification does not say that, but I thought I could try… Anyway, it didn’t work either way :smiley:

I’m posting more of the code:

clGetPlatformIDs(1, &cpPlatform, NULL);

clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);

ctx = clCreateContext(0, 1, &cdDevice, printErr, NULL, &ciErr1);	
			
cqCommandQueue = clCreateCommandQueue(ctx, cdDevice, 0, &ciErr1);

mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_uchar)*readSize, data, &ciErr1);
mem2 = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, sizeof(cl_short)*readSize, out, &ciErr1);

cSourceCL = oclLoadProgSource(cSourceFile, "", &szKernelLength);

cpProgram = clCreateProgramWithSource(ctx, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1);

clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL);

ckKernel = clCreateKernel(cpProgram, "Bucketization", &ciErr1);

clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&mem1);
clSetKernelArg(ckKernel, 1, sizeof(cl_uint), (void*)&blocksize);
clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&mem2);
clSetKernelArg(ckKernel, 3, sizeof(cl_char), (void*)&cols);
clSetKernelArg(ckKernel, 4, sizeof(cl_char), (void*)&rows);

clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &glob_wsize, NULL, 0, NULL, NULL);

clFinish(cqCommandQueue);

And here is the code of the kernel… it’s a reduction kernel, so I’m just posting some of the lines from the beginning (since the rest it’s just repeated loops) otherwise you’ll be overwhelmed!

__kernel void Bucketization(__global unsigned char* G, unsigned int py0, __global short* sanity, char mul_x, char mul_y)
{
	int bx=(get_group_id(0)+get_num_groups(0)*mul_x);
	int by=(get_group_id(1)+get_num_groups(1)*mul_y);
	int tx = get_global_id(0);
	int idx, idy;
	int addr;
	char val, val2, valpy0;

	__local short bucket[864];
	__local unsigned short tp[32];
	__local unsigned short fn[32];
	__local unsigned short tn[32];
	__local unsigned short fp[32];

	int blk_idx = get_group_id(0);

	if (py0 >= bx)
    {
        idx = 999 - bx;
        py0 = 999 - py0;
    }
    else
        idx = bx;

    if (bx >= by)
    {
       idx = 999 - bx;
       idy = 999 - by;
    }
     else
        idy = by;
		
    for (addr=0; addr<27; addr++)
    {
        bucket[addr + 27*tx] = 0;
    }

[...]

I’m using a 1-D NDRange.

Hope you can help me!

Thanks a lot,
caseClosed

Can somebody help me understand what is wrong, please?

I really need a hand here :frowning:

Thanks a lot,
caseClosed

How big is global_wsize? If it is greater than 32 then isn’t this going to run off the end of the array

bucket[addr + 27*tx] = 0;

since
__local short bucket[864];

and
int tx = get_global_id(0);

If global_wsize = 32 then this means that the index to bucket ranges from 0 to (31*27)+26)
which is 0 to 863 which is just in range. Anything larger and you are running off the end of the
allocated space?


jason

Assuming none of the other commands failed, then invalid command queue usually means your code has crashed on the device.

Without the complete source, it’s anyone’s guess as to why.

I tried to set the global worksize to 32, but I still get the same error.

Here you have the entire code:

__kernel void Bucketization(__global unsigned char* G, unsigned int py0, __global short* sanity, char mul_x, char mul_y)
{
   int bx=(get_group_id(0)+get_num_groups(0)*mul_x);
   int by=(get_group_id(1)+get_num_groups(1)*mul_y);
   int tx = get_global_id(0);
   int idx, idy;
   int addr;
   char val, val2, valpy0;

   __local short bucket[864];
   __local unsigned short tp[32];
   __local unsigned short fn[32];
   __local unsigned short tn[32];
   __local unsigned short fp[32];

   int blk_idx = get_group_id(0);

   if (py0 >= bx)
    {
        idx = 999 - bx;
        py0 = 999 - py0;
    }
    else
        idx = bx;

    if (bx >= by)
    {
       idx = 999 - bx;
       idy = 999 - by;
    }
     else
        idy = by;
      
    for (addr=0; addr<27; addr++)
    {
        bucket[addr + 27*tx] = 0;
    }
    
    barrier(CLK_LOCAL_MEM_FENCE);
    for (addr = tx; addr < 1600; addr+=get_num_groups(0))
    {   
	val=G[idx * 1600 + addr];
	val2=G[idy * 1600 + addr];
	valpy0=G[py0 * 1000 + addr];

	if (P[addr])
	{
		bucket[val + 3*val2 + 9*valpy0 + 27*tx] += 1;
	}
	else
	{
		bucket[val + 3*val2 + 9*valpy0 + 27*tx] -= 1;
	}
	barrier(CLK_LOCAL_MEM_FENCE);
    }
	

    if (tx < 32)
   { 
	if (32 >=  64)
	{
		bucket[27*tx+0] += bucket[27*(tx+32)+0];
		bucket[27*tx+1] += bucket[27*(tx+32)+1];
		bucket[27*tx+2] += bucket[27*(tx+32)+2];
		bucket[27*tx+3] += bucket[27*(tx+32)+3];
		bucket[27*tx+4] += bucket[27*(tx+32)+4];
		bucket[27*tx+5] += bucket[27*(tx+32)+5];
		bucket[27*tx+6] += bucket[27*(tx+32)+6];
		bucket[27*tx+7] += bucket[27*(tx+32)+7];
		bucket[27*tx+8] += bucket[27*(tx+32)+8];
		bucket[27*tx+9] += bucket[27*(tx+32)+9];
		bucket[27*tx+10] += bucket[27*(tx+32)+10];
		bucket[27*tx+11] += bucket[27*(tx+32)+11];
		bucket[27*tx+12] += bucket[27*(tx+32)+12];
		bucket[27*tx+13] += bucket[27*(tx+32)+13];
		bucket[27*tx+14] += bucket[27*(tx+32)+14];
		bucket[27*tx+15] += bucket[27*(tx+32)+15];
		bucket[27*tx+16] += bucket[27*(tx+32)+16];
		bucket[27*tx+17] += bucket[27*(tx+32)+17];
		bucket[27*tx+18] += bucket[27*(tx+32)+18];
		bucket[27*tx+19] += bucket[27*(tx+32)+19];
		bucket[27*tx+20] += bucket[27*(tx+32)+20];
		bucket[27*tx+21] += bucket[27*(tx+32)+21];
		bucket[27*tx+22] += bucket[27*(tx+32)+22];
		bucket[27*tx+23] += bucket[27*(tx+32)+23];
		bucket[27*tx+24] += bucket[27*(tx+32)+24];
		bucket[27*tx+25] += bucket[27*(tx+32)+25];
		bucket[27*tx+26] += bucket[27*(tx+32)+26];
	}
	if (32 >=  32)
	{
		bucket[27*tx+0] += bucket[27*(tx+16)+0];
		bucket[27*tx+1] += bucket[27*(tx+16)+1];
		bucket[27*tx+2] += bucket[27*(tx+16)+2];
		bucket[27*tx+3] += bucket[27*(tx+16)+3];
		bucket[27*tx+4] += bucket[27*(tx+16)+4];
		bucket[27*tx+5] += bucket[27*(tx+16)+5];
		bucket[27*tx+6] += bucket[27*(tx+16)+6];
		bucket[27*tx+7] += bucket[27*(tx+16)+7];
		bucket[27*tx+8] += bucket[27*(tx+16)+8];
		bucket[27*tx+9] += bucket[27*(tx+16)+9];
		bucket[27*tx+10] += bucket[27*(tx+16)+10];
		bucket[27*tx+11] += bucket[27*(tx+16)+11];
		bucket[27*tx+12] += bucket[27*(tx+16)+12];
		bucket[27*tx+13] += bucket[27*(tx+16)+13];
		bucket[27*tx+14] += bucket[27*(tx+16)+14];
		bucket[27*tx+15] += bucket[27*(tx+16)+15];
		bucket[27*tx+16] += bucket[27*(tx+16)+16];
		bucket[27*tx+17] += bucket[27*(tx+16)+17];
		bucket[27*tx+18] += bucket[27*(tx+16)+18];
		bucket[27*tx+19] += bucket[27*(tx+16)+19];
		bucket[27*tx+20] += bucket[27*(tx+16)+20];
		bucket[27*tx+21] += bucket[27*(tx+16)+21];
		bucket[27*tx+22] += bucket[27*(tx+16)+22];
		bucket[27*tx+23] += bucket[27*(tx+16)+23];
		bucket[27*tx+24] += bucket[27*(tx+16)+24];
		bucket[27*tx+25] += bucket[27*(tx+16)+25];
		bucket[27*tx+26] += bucket[27*(tx+16)+26];
	}
	if (32 >=  16)
	{
		bucket[27*tx+0] += bucket[27*(tx+8)+0];
		bucket[27*tx+1] += bucket[27*(tx+8)+1];
		bucket[27*tx+2] += bucket[27*(tx+8)+2];
		bucket[27*tx+3] += bucket[27*(tx+8)+3];
		bucket[27*tx+4] += bucket[27*(tx+8)+4];
		bucket[27*tx+5] += bucket[27*(tx+8)+5];
		bucket[27*tx+6] += bucket[27*(tx+8)+6];
		bucket[27*tx+7] += bucket[27*(tx+8)+7];
		bucket[27*tx+8] += bucket[27*(tx+8)+8];
		bucket[27*tx+9] += bucket[27*(tx+8)+9];
		bucket[27*tx+10] += bucket[27*(tx+8)+10];
		bucket[27*tx+11] += bucket[27*(tx+8)+11];
		bucket[27*tx+12] += bucket[27*(tx+8)+12];
		bucket[27*tx+13] += bucket[27*(tx+8)+13];
		bucket[27*tx+14] += bucket[27*(tx+8)+14];
		bucket[27*tx+15] += bucket[27*(tx+8)+15];
		bucket[27*tx+16] += bucket[27*(tx+8)+16];
		bucket[27*tx+17] += bucket[27*(tx+8)+17];
		bucket[27*tx+18] += bucket[27*(tx+8)+18];
		bucket[27*tx+19] += bucket[27*(tx+8)+19];
		bucket[27*tx+20] += bucket[27*(tx+8)+20];
		bucket[27*tx+21] += bucket[27*(tx+8)+21];
		bucket[27*tx+22] += bucket[27*(tx+8)+22];
		bucket[27*tx+23] += bucket[27*(tx+8)+23];
		bucket[27*tx+24] += bucket[27*(tx+8)+24];
		bucket[27*tx+25] += bucket[27*(tx+8)+25];
		bucket[27*tx+26] += bucket[27*(tx+8)+26];
	}
	if (32 >=   8)
	{
		bucket[27*tx+0] += bucket[27*(tx+4)+0];
		bucket[27*tx+1] += bucket[27*(tx+4)+1];
		bucket[27*tx+2] += bucket[27*(tx+4)+2];
		bucket[27*tx+3] += bucket[27*(tx+4)+3];
		bucket[27*tx+4] += bucket[27*(tx+4)+4];
		bucket[27*tx+5] += bucket[27*(tx+4)+5];
		bucket[27*tx+6] += bucket[27*(tx+4)+6];
		bucket[27*tx+7] += bucket[27*(tx+4)+7];
		bucket[27*tx+8] += bucket[27*(tx+4)+8];
		bucket[27*tx+9] += bucket[27*(tx+4)+9];
		bucket[27*tx+10] += bucket[27*(tx+4)+10];
		bucket[27*tx+11] += bucket[27*(tx+4)+11];
		bucket[27*tx+12] += bucket[27*(tx+4)+12];
		bucket[27*tx+13] += bucket[27*(tx+4)+13];
		bucket[27*tx+14] += bucket[27*(tx+4)+14];
		bucket[27*tx+15] += bucket[27*(tx+4)+15];
		bucket[27*tx+16] += bucket[27*(tx+4)+16];
		bucket[27*tx+17] += bucket[27*(tx+4)+17];
		bucket[27*tx+18] += bucket[27*(tx+4)+18];
		bucket[27*tx+19] += bucket[27*(tx+4)+19];
		bucket[27*tx+20] += bucket[27*(tx+4)+20];
		bucket[27*tx+21] += bucket[27*(tx+4)+21];
		bucket[27*tx+22] += bucket[27*(tx+4)+22];
		bucket[27*tx+23] += bucket[27*(tx+4)+23];
		bucket[27*tx+24] += bucket[27*(tx+4)+24];
		bucket[27*tx+25] += bucket[27*(tx+4)+25];
		bucket[27*tx+26] += bucket[27*(tx+4)+26];
	}
	if (32 >=   4)
	{
		bucket[27*tx+0] += bucket[27*(tx+2)+0];
		bucket[27*tx+1] += bucket[27*(tx+2)+1];
		bucket[27*tx+2] += bucket[27*(tx+2)+2];
		bucket[27*tx+3] += bucket[27*(tx+2)+3];
		bucket[27*tx+4] += bucket[27*(tx+2)+4];
		bucket[27*tx+5] += bucket[27*(tx+2)+5];
		bucket[27*tx+6] += bucket[27*(tx+2)+6];
		bucket[27*tx+7] += bucket[27*(tx+2)+7];
		bucket[27*tx+8] += bucket[27*(tx+2)+8];
		bucket[27*tx+9] += bucket[27*(tx+2)+9];
		bucket[27*tx+10] += bucket[27*(tx+2)+10];
		bucket[27*tx+11] += bucket[27*(tx+2)+11];
		bucket[27*tx+12] += bucket[27*(tx+2)+12];
		bucket[27*tx+13] += bucket[27*(tx+2)+13];
		bucket[27*tx+14] += bucket[27*(tx+2)+14];
		bucket[27*tx+15] += bucket[27*(tx+2)+15];
		bucket[27*tx+16] += bucket[27*(tx+2)+16];
		bucket[27*tx+17] += bucket[27*(tx+2)+17];
		bucket[27*tx+18] += bucket[27*(tx+2)+18];
		bucket[27*tx+19] += bucket[27*(tx+2)+19];
		bucket[27*tx+20] += bucket[27*(tx+2)+20];
		bucket[27*tx+21] += bucket[27*(tx+2)+21];
		bucket[27*tx+22] += bucket[27*(tx+2)+22];
		bucket[27*tx+23] += bucket[27*(tx+2)+23];
		bucket[27*tx+24] += bucket[27*(tx+2)+24];
		bucket[27*tx+25] += bucket[27*(tx+2)+25];
		bucket[27*tx+26] += bucket[27*(tx+2)+26];
	}
	if (32 >=   2)
	{
		bucket[27*tx+0] += bucket[27*(tx+1)+0];
		bucket[27*tx+1] += bucket[27*(tx+1)+1];
		bucket[27*tx+2] += bucket[27*(tx+1)+2];
		bucket[27*tx+3] += bucket[27*(tx+1)+3];
		bucket[27*tx+4] += bucket[27*(tx+1)+4];
		bucket[27*tx+5] += bucket[27*(tx+1)+5];
		bucket[27*tx+6] += bucket[27*(tx+1)+6];
		bucket[27*tx+7] += bucket[27*(tx+1)+7];
		bucket[27*tx+8] += bucket[27*(tx+1)+8];
		bucket[27*tx+9] += bucket[27*(tx+1)+9];
		bucket[27*tx+10] += bucket[27*(tx+1)+10];
		bucket[27*tx+11] += bucket[27*(tx+1)+11];
		bucket[27*tx+12] += bucket[27*(tx+1)+12];
		bucket[27*tx+13] += bucket[27*(tx+1)+13];
		bucket[27*tx+14] += bucket[27*(tx+1)+14];
		bucket[27*tx+15] += bucket[27*(tx+1)+15];
		bucket[27*tx+16] += bucket[27*(tx+1)+16];
		bucket[27*tx+17] += bucket[27*(tx+1)+17];
		bucket[27*tx+18] += bucket[27*(tx+1)+18];
		bucket[27*tx+19] += bucket[27*(tx+1)+19];
		bucket[27*tx+20] += bucket[27*(tx+1)+20];
		bucket[27*tx+21] += bucket[27*(tx+1)+21];
		bucket[27*tx+22] += bucket[27*(tx+1)+22];
		bucket[27*tx+23] += bucket[27*(tx+1)+23];
		bucket[27*tx+24] += bucket[27*(tx+1)+24];
		bucket[27*tx+25] += bucket[27*(tx+1)+25];
		bucket[27*tx+26] += bucket[27*(tx+1)+26];
	} 
} //End if tx>32

// Start of bucket accuracy loop
	tp[tx] = 0;
	fn[tx] = 0;
	tn[tx] = 0;
	fp[tx] = 0;
	barrier(CLK_LOCAL_MEM_FENCE); 

	for (addr = tx; addr < 1600; addr+=get_num_groups(0))
	{
		val=G[idx * 1600+addr];
		val2=G[idy * 1600+addr];
		valpy0=G[py0 * 1000 + addr];

		if (P[addr])
		{
			if (*(bucket + val + 3*val2 + 9*valpy0) > 0)
				tp[tx]+=1;
			else
				fn[tx]+=1;
		}
		else
		{
			if (*(bucket + val + 3*val2 + 9*valpy0) < 0)
				tn[tx]+=1;
			else
				fp[tx]+=1;
		}

	   barrier(CLK_LOCAL_MEM_FENCE);
	}
	if (tx < 32)
	{ 
		if (32 >=  64)
		{
			tp[tx] += tp[tx + 32];
			fn[tx] += fn[tx + 32];
			fp[tx] += fp[tx + 32];
			tn[tx] += tn[tx + 32];
		}
		if (32 >=  32)
		{
			tp[tx] += tp[tx + 16];
			fn[tx] += fn[tx + 16];
			fp[tx] += fp[tx + 16];
			tn[tx] += tn[tx + 16];
		}
		if (32 >=  16)
		{
			tp[tx] += tp[tx + 8];
			fn[tx] += fn[tx + 8];
			fp[tx] += fp[tx + 8];
			tn[tx] += tn[tx + 8];
		}
		if (32 >=   8)
		{
			tp[tx] += tp[tx + 4];
			fn[tx] += fn[tx + 4];
			fp[tx] += fp[tx + 4];
			tn[tx] += tn[tx + 4];
		}
		if (32 >=   4)
		{
			tp[tx] += tp[tx + 2];
			fn[tx] += fn[tx + 2];
			fp[tx] += fp[tx + 2];
			tn[tx] += tn[tx + 2];
		}
		if (32 >=   2)
		{
			tp[tx] += tp[tx + 1];
			fn[tx] += fn[tx + 1];
			fp[tx] += fp[tx + 1];
			tn[tx] += tn[tx + 1];
		} 
	} 

	barrier(CLK_LOCAL_MEM_FENCE);

	if (tx == 0)
	{
		if (idx == idy || py0 == idx || py0 == idy) 
		{
			sanity[0 + 4*blk_idx] = 0.0;
		}
		else
		{
			sanity[0 + 4*blk_idx] = convert_int_rte((tp[0]/(float)(tp[0] + fn[0]) + tn[0]/(float)(tn[0] + fp[0]))*5000);
		}
		sanity[1 + 4*blk_idx] = idx;
		sanity[2 + 4*blk_idx] = idy;
		sanity[3+4*blk_idx] = py0;

	}
}

I’m guessing instructions like

bucket[27*(tx+32)+0]

are running off the end of the array as well… ouch.
Any advice?

Thank you,
caseClosed

…invalid command queue usually means your code has crashed on the device. / Without the complete source, it’s anyone’s guess as to why.

Even with the complete source, for my ~2000 lines of code I ultimately used the “méthode tedieuse” – comment out huge chunks, verify that it runs. Uncomment out something, try again, till “command queue invalid” occurs. Of that chunk you’ve just uncommented, subdivide it and comment out bits of that. Find culprit and fix. On to next section. Tedious as I have many helper subroutines, and the strangest things seemed to set it off.
Of course, this was my first attempt at opencl; I had a project I’d worked on for decades that was a good candidate and I just dove into converting it. Made it work first, optimized later.

(op on advice re: possibly running off the end of an array)

Make sure that it doesn’t! One of those was the final gotcha for me; kernel would run fine for a random number of seconds, then command queue invalid. Was running off the end of an array only occasionally.