Opencl returns a truncated array depending on global work size?

I have encountered a frustrating problem that I am unsure how to solve.
I am trying to pass a fixed-length array to my kernel of which I only partially fill (essentially a dynamic array) with an int returned giving me its size.
However the length of the returned array is actually seemingly fixed to the “global_work_size” variable in clEnqueueNDRangeKernel.

onst size_t dimSize = 2;
size_t global_item_size[dimSize];
global_item_size[0] = sizeWidgets;
global_item_size[1] = sizeWidgets;

/* Execute OpenCL kernel as data parallel */
ret = clEnqueueNDRangeKernel(command_queue, kernel, dimSize, NULL, global_item_size, NULL, 0, NULL, NULL);
//ret = clEnqueueTask(command_queue, kernel, NULL, NULL, NULL);
if (ret < 0)
    return false;

/* Copy results from the memory buffer */
/* Transfer result to host */
ret = clEnqueueReadBuffer(command_queue, memobj_widgets, CL_TRUE, 0, sizeWidgets * sizeof(widget), _widgets, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, memobj_contacts, CL_TRUE, 0, (sizeWidgets * sizeWidgets) * sizeof(manifold_cl), _contacts, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, memobj_contacts_size, CL_TRUE, 0, 1 * sizeof(int), &sizeContacts, 0, NULL, NULL);
if (ret < 0)
    return false;

Suppose sizeWidgets is 5, so _contacts is supposed to be 25 elements in size, the size of _contacts when it is copied back from memory is resized to sizeWidgets elements!

My stripped down kernel to debug this:

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
__kernel void broadphase_kernel(__global widget *_widgets, __global manifold_cl *_contacts, int _WidgetSize, __global int* _contactsSize)
{

    int i = get_global_id(0);
    int j = get_global_id(1);
    __local int index;
    index = (_WidgetSize * get_global_id(1)) + get_global_id(0);

    manifold_cl _m;
    bVec3 temp;
    temp.x = 0;
    temp.y = 0;
    temp.z = 0;
    _m.normal_cl = temp;
    _m.penetration_cl = 0;
    _m.A_cl = i;
    _m.B_cl = j;
    _m.isColliding_cl = false;

    _m.index = -1;
    _m.gid = index;

    //widget temp_widget_a_cl;
    //temp_widget_a_cl = _widgets[i];

    //widget temp_widget_b_cl;
    //temp_widget_b_cl = _widgets[j];

    _contacts[index] = _m;
    atom_inc(&_contactsSize[0]);
}

I have sizeWidgets by sizeWidgets computations to perform and the results I am sticking in a 1D array of that size; how do I solve this problem?

Why have you declared the index variable as __local? This means that the variable will be shared across all work-items in the same work-group, and therefore the work-items will be overwriting each other for your assignment to index, and will subsequently write to the same locations in your _contacts array.

Desperation as I’m basically trying to figure this out via trial and error; removing __local doesn’t change the problem though, that the array of ‘_contacts’ is returned at a truncated size.

In Visual studio, when I look at _contacts[some n] I only get valid values for the range of 0 to sizeWidgets-1; for _contacts[sizewidgets] all the values become invalid (large negative uninitialized values).

However _contacts when originally passed to it, is supposed to be sizeWidgets^2 in size and I verified that all entries had valid initialized values.

Based on the kernel code, I do not believe the computations in the kernel are causing it.

I posted more code here: Stripped down to relevant bits · GitHub

edit:

Okay, I think I got it, but I’m starving so I can’t confirm yet, but:

ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj_contacts);

to

ret = clSetKernelArg(kernel, 1, sizeof(memobj_contacts), (void *)&memobj_contacts);

Seems like it may have fixed it*. Does this seem right to you and why do you think? Most examples I find online with pointer arrays show cl_mem working just fine?
*So far looking in my debugger in visual studio _contacts[~n > sizeWidgets] gives me values that make sense now, I think (I set “id” to -1). I’ll investigate more when I return.

Ok, so by changing SetKernelArg, I think that now the proper array is passed to my kernel.

However, either something is going wrong in the kernel and it stops doing work at global id 6, but shouldn’t be the case as my atom_inc verifies that (for n=6) that there are indeed 36 kernels being launched and doing work but:

When the array is copied back to the host, only the first six elements are returned (in that the “default” values I set when I initialized contacts are returned to me for all all elements beyond n=sizeWidgets-1, all elements for n<sizeWidgets are changed kernel side).

So now I’ve tried it with a n*n sized array of just ints and I get the same problem, it’s constrained by the size as far as I can tell, by the global work size value.
If I change: global_item_size[0] = sizeWidgets;
to
global_item_size[0] = sizeWidgets+n;
Then the amount of elements that I do get back grows by n.

Alright so to better show the problem I made a simple program.

I create an int* array with 10 elements. I create a second one with 100 elements, in the kernel for each element of the second array, I assign it the value of each element it’s index value.

However I only get an array with valid elements for the first 10 elements for the 100 element sized array.


bool bPhysics::OpenCLArrayTest()
{
	cl_context context = NULL;				// OpenCL Context
	cl_command_queue command_queue = NULL;	// OpenCL Command Queue
	cl_mem memobj_a = NULL;
	cl_mem memobj_a_size = NULL;
	cl_mem memobj_b = NULL;
	cl_mem memobj_b_size = NULL;
	cl_program program = NULL;
	cl_kernel kernel = NULL;
	cl_device_id *cdDevices = NULL;     // OpenCL device list
	cl_platform_id platform_id = NULL;		// OpenCL Platform
	cl_uint ret_num_devices;
	cl_uint ret_num_platforms;
	cl_int ret;
	cl_uint uiNumComputeUnits;
	cl_uint uiTargetDevice = 0;	        // OpenCL Device to compute on

	FILE *fp;
	char *fileName;
	char *source_str;
	size_t source_size;

	int _size = 10;
	int d_b_max_size = _size * _size;

	// d_a = device_array
	int *d_a = (int*)malloc(_size * sizeof(int));
	// d_a_size = device_array, the number of elements we pass in.
	int d_a_size = _size;

	// init values for d_a
	for (int i = 0; i < _size; i++)
	{
		d_a[i] = i;
	}

	int *d_b = (int*)malloc(d_b_max_size * sizeof(int));
	int *d_b_size = (int*)malloc(sizeof(int));

	// init values for d_b
	for (int i = 0; i < _size; i++)
	{
		d_b[i] = -1;
	}
	d_b_size[0] = 0;

	/* Get Platform and Device Info */
	ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
	if (ret < 0)
		return false;

	ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 0, NULL, &ret_num_devices);
	//ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
	if (ret < 0)
		return false;

	std::cout << " # of devices = " << ret_num_devices << std::endl;
	cdDevices = (cl_device_id*)malloc(ret_num_devices * sizeof(cl_device_id));
	ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, ret_num_devices, cdDevices, NULL);
	if (ret < 0)
		return false;
	uiTargetDevice = glm::clamp((int)uiTargetDevice, (int)0, (int)(ret_num_devices - 1));

	std::cout << "Using device #: " << uiTargetDevice << std::endl;
	clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
	std::cout << " # of Compute Units = " << uiNumComputeUnits << std::endl;


	/* Create OpenCL context */
	context = clCreateContext(NULL, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ret);
	if (ret < 0)
		return false;

	/* Create Command Queue */
	command_queue = clCreateCommandQueue(context, cdDevices[uiTargetDevice], 0, &ret);
	if (ret < 0)
		return false;

	char string[MEM_SIZE];
	fileName = "broadphase.cl";
	/* Load the source code containing the kernel*/
	fopen_s(&fp, fileName, "r");
	if (!fp) {
		fprintf(stderr, "Failed to load kernel.
");
		exit(1);
	}
	source_str = (char*)malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);


	/* Create Kernel Program from the source */
	program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
		(const size_t *)&source_size, &ret);

	if (ret < 0)
		return false;

	/* Build Kernel Program */
	ret = clBuildProgram(program, 1, &cdDevices[uiTargetDevice], NULL, NULL, NULL);
	// First call to know the proper size
	// build failed
	if (ret != CL_SUCCESS) {

		// check build error and build status first
		clGetProgramBuildInfo(program, cdDevices[uiTargetDevice], CL_PROGRAM_BUILD_STATUS,
			sizeof(cl_build_status), &status, NULL);

		// check build log
		clGetProgramBuildInfo(program, cdDevices[uiTargetDevice],
			CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
		programLog = (char*)calloc(logSize + 1, sizeof(char));
		clGetProgramBuildInfo(program, cdDevices[uiTargetDevice],
			CL_PROGRAM_BUILD_LOG, logSize + 1, programLog, NULL);
		printf("Build failed; error=%d, status=%d, programLog:nn%s",
			ret, status, programLog);
		free(programLog);
		std::cout << "Press ENTER to continue...";
		std::cin.ignore(std::numeric_limits<std::streamsize>::max(), '
');
	}
	if (ret < 0)
		return false;


	/* Create OpenCL Kernel */
	kernel = clCreateKernel(program, "test_kernel", &ret);
	if (ret < 0)
		return false;

	/* Create Memory Buffer */
	memobj_a = clCreateBuffer(context, CL_MEM_READ_WRITE, _size * sizeof(int), NULL, &ret);
	if (ret < 0)
		return false;

	memobj_b = clCreateBuffer(context, CL_MEM_READ_WRITE, d_b_max_size * sizeof(int), NULL, &ret);
	if (ret < 0)
		return false;

	memobj_b_size = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &ret);
	if (ret < 0)
		return false;
	memobj_a_size = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &ret);
	if (ret < 0)
		return false;

	/* Copy input data to the memory buffer */
	ret = clEnqueueWriteBuffer(command_queue, memobj_a, CL_TRUE, 0, _size * sizeof(int), d_a, 0, NULL, NULL);
	if (ret < 0)
		return false;
	ret = clEnqueueWriteBuffer(command_queue, memobj_b, CL_TRUE, 0, d_b_max_size * sizeof(int), d_b, 0, NULL, NULL);
	if (ret < 0)
		return false;
	ret = clEnqueueWriteBuffer(command_queue, memobj_a_size, CL_TRUE, 0, sizeof(int), &d_a_size, 0, NULL, NULL);
	if (ret < 0)
		return false;
	ret = clEnqueueWriteBuffer(command_queue, memobj_b_size, CL_TRUE, 0, sizeof(int), d_b_size, 0, NULL, NULL);
	if (ret < 0)
		return false;

	/* Set OpenCL Kernel Parameters */
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj_a);
	if (ret < 0)
		return false;

	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj_b);
	if (ret < 0)
		return false;
	ret = clSetKernelArg(kernel, 2, sizeof(int), &memobj_a_size);
	if (ret < 0)
		return false;
	ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&memobj_b_size);
	if (ret < 0)
		return false;

	/* Execute OpenCL Kernel */
	const size_t dimSize = 2;
	size_t global_item_size[dimSize];
	global_item_size[0] = _size;
	global_item_size[1] = _size;
	//size_t local_item_size = sizeWidgets;

	/* Execute OpenCL kernel as data parallel */
	ret = clEnqueueNDRangeKernel(command_queue, kernel, dimSize, NULL, global_item_size, NULL, 0, NULL, NULL);
	//ret = clEnqueueTask(command_queue, kernel, NULL, NULL, NULL);
	if (ret < 0)
		return false;

	/* Copy results from the memory buffer */
	/* Transfer result to host */
	ret = clEnqueueReadBuffer(command_queue, memobj_a, CL_TRUE, 0, _size * sizeof(int), d_a, 0, NULL, NULL);
	ret = clEnqueueReadBuffer(command_queue, memobj_b, CL_TRUE, 0, d_b_max_size * sizeof(int), d_b, 0, NULL, NULL);
	ret = clEnqueueReadBuffer(command_queue, memobj_b_size, CL_TRUE, 0, sizeof(int), d_b_size, 0, NULL, NULL);
	if (ret < 0)
		return false;

	ret = clFinish(command_queue);
	if (ret < 0)
		return false;
	/* Display Result */

	for (int i = 0; i < d_b_max_size; i++)
	{
		std::cout << d_b[i] << std::endl;

	}

	std::cout << "Press ENTER to continue...";
	std::cin.ignore(std::numeric_limits<std::streamsize>::max(), '
');
}

My kernel.


__kernel void test_kernel(__global int *device_a_array, __global int *device_b_array, int _aSize, __global int *_bFinalSize)
{
	int i = get_global_id(0);
	int j = get_global_id(1);

	// map a 2D array index to a 1D array
	int index = (_aSize * j) + i;

	// increment "final size" to get count of all kernels that did work
	atom_inc(&_bFinalSize[0]);

	// pass the computer index value to device_b_array at position index
	device_b_array[index] = index;

}

Am I doing something wrong or is there a limitation I’m missing?

e: Fixed a copy paste error but same result.

Okay I solved it!

I had:

ret = clSetKernelArg(kernel, 3, sizeof(int), (void *)&memobj_a_size);

And had to change it to:

ret = clSetKernelArg(kernel, 2, sizeof(int), &_size);

Where _size is an int.

I have no idea why this is, as the problem was in returning an array from the memobj_b buffer? But here we go.

This fixed the problem as it was present in the simple version of my host code above and I applied the fix to the similar issue to my original host code and it works.