I have the following kernel:
__kernel void voxelizer(int global_size,
float h_voxel_size,
__global float* h_min_corner_grid,
__global int *h_dim_grid,
__global float *coords,
__global unsigned int *density) {
//printf("local size is: %d
", get_num_groups(0));
int i = get_global_id(0);
if (i < global_size) {
__private float voxel_size = h_voxel_size;
float3 min_corner_grid = (float3)(
h_min_corner_grid[0],
h_min_corner_grid[1],
h_min_corner_grid[2]
);
int3 dim_grid = (int3)(
h_dim_grid[0],
h_dim_grid[1],
h_dim_grid[2]
);
/*Triangle vertices*/
__private float3 v0 = (float3)(coords[9 * i], coords[9 * i + 1], coords[9 * i + 2]);
__private float3 v1 = (float3)(coords[9 * i + 3], coords[9 * i + 4], coords[9 * i + 5]);
__private float3 v2 = (float3)(coords[9 * i + 6], coords[9 * i + 7], coords[9 * i + 8]);
float3 min_corner_AABB = (float3)(
fmin(v0.x, fmin(v1.x, v2.x)),
fmin(v0.y, fmin(v1.y, v2.y)),
fmin(v0.z, fmin(v1.z, v2.z))
);
float3 max_corner_AABB = (float3)(
fmax(v0.x, fmax(v1.x, v2.x)),
fmax(v0.y, fmax(v1.y, v2.y)),
fmax(v0.z, fmax(v1.z, v2.z))
);
int3 min_corner_ID = (int3)(
floor((min_corner_AABB.x - min_corner_grid.x) / voxel_size),
floor((min_corner_AABB.y - min_corner_grid.y) / voxel_size),
floor((min_corner_AABB.z - min_corner_grid.z) / voxel_size)
);
int3 max_corner_ID = (int3)(
floor((max_corner_AABB.x - min_corner_grid.x) / voxel_size),
floor((max_corner_AABB.y - min_corner_grid.y) / voxel_size),
floor((max_corner_AABB.z - min_corner_grid.z) / voxel_size)
);
for (int j = min_corner_ID.z; j <= max_corner_ID.z; j++) {
for (int k = min_corner_ID.y; k <= max_corner_ID.y; k++) {
for (int l = min_corner_ID.x; l <= max_corner_ID.x; l++) {
__private float3 center = (float3)(
voxel_size / 2 + l * voxel_size + min_corner_grid.x,
voxel_size / 2 + k * voxel_size + min_corner_grid.y,
voxel_size / 2 + j * voxel_size + min_corner_grid.z
);
if (triBoxOverlap(voxel_size, center, v0, v1, v2)) {
unsigned long voxel_index = l + k * dim_grid.x + j * dim_grid.x * dim_grid.y;
__private unsigned int array_index = voxel_index / 32;
__private unsigned int bit_pos = voxel_index % 32;
__private unsigned int mask = 1 << bit_pos;
atomic_or(&density[array_index], mask);
}
}
}
}
}
}
with the following piece of host code:
//Initialization
cl_device_id device = create_device();
print_device_info(device);
cl_int err;
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a context
", err);
exit(1);
}
cl_program program = build_program(context, device, KERNEL_FILE);
cl_command_queue queue = clCreateCommandQueue(context, device,
CL_QUEUE_PROFILING_ENABLE, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a command queue
", err);
exit(1);
};
cl_ulong voxel_counts = dim_grid[0] * dim_grid[1] * dim_grid[2];
cl_uint array_length = ceil((cl_double)voxel_counts / 32.0);
cl_uint *density = (cl_uint*)malloc(sizeof(cl_uint)*array_length);
for (int i = 0; i < array_length; i++) {
density[i] = 0;
}
size_t voxelization_local_size, voxelization_global_size, max_local_size, voxelization_revised_global_size;
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, NULL);
voxelization_global_size = triangles_count;
voxelization_local_size = max_local_size;
if (voxelization_global_size % voxelization_local_size != 0) {
voxelization_revised_global_size = (voxelization_global_size / voxelization_local_size + 1) * voxelization_local_size;
}
else {
voxelization_revised_global_size = voxelization_global_size;
}
cl_mem min_corner_grid_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float) * 3, min_corner_grid, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create the h_minBoundsGrid_buffer
", err);
exit(1);
};
cl_mem dim_grid_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(int) * 3, dim_grid, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create the dimGrid_buffer
", err);
exit(1);
};
cl_mem coords_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float)* triangles_count * 9, coords, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create the coords_buffer
", err);
exit(1);
};
cl_mem density_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * array_length, density, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create the density_buffer
", err);
exit(1);
};
cl_kernel voxelization_kernel;
voxelization_kernel = clCreateKernel(program, "voxelizer", NULL);
if (err < 0) {
printf("Error code: %d. Couldn't create voxelization_kernel
", err);
exit(1);
};
err = clSetKernelArg(voxelization_kernel, 0, sizeof(cl_uint), &voxelization_global_size);
float voxel_size = VOXEL_SIZE;
err |= clSetKernelArg(voxelization_kernel, 1, sizeof(float), &voxel_size);
err |= clSetKernelArg(voxelization_kernel, 2, sizeof(cl_mem), &min_corner_grid_buffer);
err |= clSetKernelArg(voxelization_kernel, 3, sizeof(cl_mem), &dim_grid_buffer);
err |= clSetKernelArg(voxelization_kernel, 4, sizeof(cl_mem), &coords_buffer);
err |= clSetKernelArg(voxelization_kernel, 5, sizeof(cl_mem), &density_buffer);
if (err < 0) {
printf("Error code: %d. Couldn't create an argument for voxelization_kernel
", err);
exit(1);
}
cl_ulong private_mem_size;
err = clGetKernelWorkGroupInfo(voxelization_kernel, device,
CL_KERNEL_PRIVATE_MEM_SIZE, sizeof(size_t), &private_mem_size, NULL);
printf("CL_KERNEL_PRIVATE_MEM_SIZE = %d", private_mem_size)
cl_event voxelization_kernel_event;
err = clEnqueueNDRangeKernel(queue, voxelization_kernel, 1, NULL, &voxelization_revised_global_size,
&voxelization_local_size, 0, NULL, &voxelization_kernel_event);
if (err < 0) {
printf("Error code: %d. Couldn't enqueue the voxelization_kernel
", err);
exit(1);
}
err = clWaitForEvents(1, &voxelization_kernel_event);
if (err < 0) {
printf("Error code: %d, clWaitForEvent
", err);
exit(1);
}
Upon enqueuig the kernel, clWaitForEvents throws CL_OUT_OF_RESOURCES error. It is working fine for global size of 16384 with local size set to 1024. However, for greater global sizes I have the error. I am totally lost in debugging this code. I tried to use clGetKernelWorkGroupInfo to find out if I am exceeding the amount of local and private memory available on my device. However, it gives me 0 private memory and 1 KB local memory which looks strange. I would really appreciate it if you guys can help me out. I am using a GF970 GTX.
BTW, commenting the aotmic OR operation avoids the error. However, I really need that to get correct answer out of the kernel. I suspect seg-fault. In the inner-most part of the kernel, there is an atomic operation. When I comment that, the kernel runs without any error. I think whatever the problem is, it has to do with accessing to density array, which is stored ni global memory. To access the elements of this aray, I use a variable “array_index” which is an unsigned integer. However, it is derived from an unsigned long variable called “voxel_index”. Am I converting this two type of variables correctly?