Alright so to better show the problem I made a simple program.
I create an int* array with 10 elements and a second one with 100 elements. In the kernel, I assign each element of the second array its own index value.
However, only the first 10 elements of the 100-element array end up with valid values.
bool bPhysics::OpenCLArrayTest()
{
cl_context context = NULL; // OpenCL Context
cl_command_queue command_queue = NULL; // OpenCL Command Queue
cl_mem memobj_a = NULL;
cl_mem memobj_a_size = NULL;
cl_mem memobj_b = NULL;
cl_mem memobj_b_size = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_device_id *cdDevices = NULL; // OpenCL device list
cl_platform_id platform_id = NULL; // OpenCL Platform
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
cl_uint uiNumComputeUnits;
cl_uint uiTargetDevice = 0; // OpenCL Device to compute on
FILE *fp;
char *fileName;
char *source_str;
size_t source_size;
int _size = 10;
int d_b_max_size = _size * _size;
// d_a = device_array
int *d_a = (int*)malloc(_size * sizeof(int));
// d_a_size = device_array, the number of elements we pass in.
int d_a_size = _size;
// init values for d_a
for (int i = 0; i < _size; i++)
{
d_a[i] = i;
}
int *d_b = (int*)malloc(d_b_max_size * sizeof(int));
int *d_b_size = (int*)malloc(sizeof(int));
// init values for d_b
for (int i = 0; i < _size; i++)
{
d_b[i] = -1;
}
d_b_size[0] = 0;
/* Get Platform and Device Info */
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
if (ret < 0)
return false;
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 0, NULL, &ret_num_devices);
//ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
if (ret < 0)
return false;
std::cout << " # of devices = " << ret_num_devices << std::endl;
cdDevices = (cl_device_id*)malloc(ret_num_devices * sizeof(cl_device_id));
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, ret_num_devices, cdDevices, NULL);
if (ret < 0)
return false;
uiTargetDevice = glm::clamp((int)uiTargetDevice, (int)0, (int)(ret_num_devices - 1));
std::cout << "Using device #: " << uiTargetDevice << std::endl;
clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
std::cout << " # of Compute Units = " << uiNumComputeUnits << std::endl;
/* Create OpenCL context */
context = clCreateContext(NULL, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ret);
if (ret < 0)
return false;
/* Create Command Queue */
command_queue = clCreateCommandQueue(context, cdDevices[uiTargetDevice], 0, &ret);
if (ret < 0)
return false;
char string[MEM_SIZE];
fileName = "broadphase.cl";
/* Load the source code containing the kernel*/
fopen_s(&fp, fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.
");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
/* Create Kernel Program from the source */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
(const size_t *)&source_size, &ret);
if (ret < 0)
return false;
/* Build Kernel Program */
ret = clBuildProgram(program, 1, &cdDevices[uiTargetDevice], NULL, NULL, NULL);
// First call to know the proper size
// build failed
if (ret != CL_SUCCESS) {
// check build error and build status first
clGetProgramBuildInfo(program, cdDevices[uiTargetDevice], CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status), &status, NULL);
// check build log
clGetProgramBuildInfo(program, cdDevices[uiTargetDevice],
CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
programLog = (char*)calloc(logSize + 1, sizeof(char));
clGetProgramBuildInfo(program, cdDevices[uiTargetDevice],
CL_PROGRAM_BUILD_LOG, logSize + 1, programLog, NULL);
printf("Build failed; error=%d, status=%d, programLog:nn%s",
ret, status, programLog);
free(programLog);
std::cout << "Press ENTER to continue...";
std::cin.ignore(std::numeric_limits<std::streamsize>::max(), '
');
}
if (ret < 0)
return false;
/* Create OpenCL Kernel */
kernel = clCreateKernel(program, "test_kernel", &ret);
if (ret < 0)
return false;
/* Create Memory Buffer */
memobj_a = clCreateBuffer(context, CL_MEM_READ_WRITE, _size * sizeof(int), NULL, &ret);
if (ret < 0)
return false;
memobj_b = clCreateBuffer(context, CL_MEM_READ_WRITE, d_b_max_size * sizeof(int), NULL, &ret);
if (ret < 0)
return false;
memobj_b_size = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &ret);
if (ret < 0)
return false;
memobj_a_size = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int), NULL, &ret);
if (ret < 0)
return false;
/* Copy input data to the memory buffer */
ret = clEnqueueWriteBuffer(command_queue, memobj_a, CL_TRUE, 0, _size * sizeof(int), d_a, 0, NULL, NULL);
if (ret < 0)
return false;
ret = clEnqueueWriteBuffer(command_queue, memobj_b, CL_TRUE, 0, d_b_max_size * sizeof(int), d_b, 0, NULL, NULL);
if (ret < 0)
return false;
ret = clEnqueueWriteBuffer(command_queue, memobj_a_size, CL_TRUE, 0, sizeof(int), &d_a_size, 0, NULL, NULL);
if (ret < 0)
return false;
ret = clEnqueueWriteBuffer(command_queue, memobj_b_size, CL_TRUE, 0, sizeof(int), d_b_size, 0, NULL, NULL);
if (ret < 0)
return false;
/* Set OpenCL Kernel Parameters */
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj_a);
if (ret < 0)
return false;
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj_b);
if (ret < 0)
return false;
ret = clSetKernelArg(kernel, 2, sizeof(int), &memobj_a_size);
if (ret < 0)
return false;
ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&memobj_b_size);
if (ret < 0)
return false;
/* Execute OpenCL Kernel */
const size_t dimSize = 2;
size_t global_item_size[dimSize];
global_item_size[0] = _size;
global_item_size[1] = _size;
//size_t local_item_size = sizeWidgets;
/* Execute OpenCL kernel as data parallel */
ret = clEnqueueNDRangeKernel(command_queue, kernel, dimSize, NULL, global_item_size, NULL, 0, NULL, NULL);
//ret = clEnqueueTask(command_queue, kernel, NULL, NULL, NULL);
if (ret < 0)
return false;
/* Copy results from the memory buffer */
/* Transfer result to host */
ret = clEnqueueReadBuffer(command_queue, memobj_a, CL_TRUE, 0, _size * sizeof(int), d_a, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, memobj_b, CL_TRUE, 0, d_b_max_size * sizeof(int), d_b, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, memobj_b_size, CL_TRUE, 0, sizeof(int), d_b_size, 0, NULL, NULL);
if (ret < 0)
return false;
ret = clFinish(command_queue);
if (ret < 0)
return false;
/* Display Result */
for (int i = 0; i < d_b_max_size; i++)
{
std::cout << d_b[i] << std::endl;
}
std::cout << "Press ENTER to continue...";
std::cin.ignore(std::numeric_limits<std::streamsize>::max(), '
');
}
My kernel.
/*
 * Each work-item of a 2D launch writes its own flattened index into
 * device_b_array and bumps the shared counter in _bFinalSize[0] once.
 *
 * device_a_array : not accessed by this kernel.
 * device_b_array : output; element [row * _aSize + col] receives that index.
 * _aSize         : row stride used to flatten the 2D work-item id.
 * _bFinalSize    : _bFinalSize[0] is atomically incremented per work-item.
 */
__kernel void test_kernel(__global int *device_a_array, __global int *device_b_array, int _aSize, __global int *_bFinalSize)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    /* Count this work-item as having run. */
    atom_inc(_bFinalSize);

    /* Flatten (col, row) into a 1D offset and store that offset at itself. */
    const int flat = (row * _aSize) + col;
    device_b_array[flat] = flat;
}
Am I doing something wrong or is there a limitation I’m missing?
Edit: Fixed a copy-paste error, but I still get the same result.