clWaitForEvents throws CL_OUT_OF_RESOURCES for high number of global size

I have the following kernel:

__kernel void voxelizer(int global_size,
    	float h_voxel_size,
    	__global float* h_min_corner_grid,
    	__global int *h_dim_grid,
    	__global float *coords,
    	__global unsigned int *density) {
    
    	//printf("local size is: %d
", get_num_groups(0));
    	int i = get_global_id(0);
    	if (i < global_size) {
    
    		__private float voxel_size = h_voxel_size;
    		float3 min_corner_grid = (float3)(
    			h_min_corner_grid[0],
    			h_min_corner_grid[1],
    			h_min_corner_grid[2]
    			);
    		int3 dim_grid = (int3)(
    			h_dim_grid[0],
    			h_dim_grid[1],
    			h_dim_grid[2]
    			);
    
    		/*Triangle vertices*/
    		__private float3 v0 = (float3)(coords[9 * i], coords[9 * i + 1], coords[9 * i + 2]);
    		__private float3 v1 = (float3)(coords[9 * i + 3], coords[9 * i + 4], coords[9 * i + 5]);
    		__private float3 v2 = (float3)(coords[9 * i + 6], coords[9 * i + 7], coords[9 * i + 8]);
    
    		float3 min_corner_AABB = (float3)(
    			fmin(v0.x, fmin(v1.x, v2.x)),
    			fmin(v0.y, fmin(v1.y, v2.y)),
    			fmin(v0.z, fmin(v1.z, v2.z))
    			);
    		float3 max_corner_AABB = (float3)(
    			fmax(v0.x, fmax(v1.x, v2.x)),
    			fmax(v0.y, fmax(v1.y, v2.y)),
    			fmax(v0.z, fmax(v1.z, v2.z))
    			);
    		int3 min_corner_ID = (int3)(
    			floor((min_corner_AABB.x - min_corner_grid.x) / voxel_size),
    			floor((min_corner_AABB.y - min_corner_grid.y) / voxel_size),
    			floor((min_corner_AABB.z - min_corner_grid.z) / voxel_size)
    			);
    		int3 max_corner_ID = (int3)(
    			floor((max_corner_AABB.x - min_corner_grid.x) / voxel_size),
    			floor((max_corner_AABB.y - min_corner_grid.y) / voxel_size),
    			floor((max_corner_AABB.z - min_corner_grid.z) / voxel_size)
    			);
    		for (int j = min_corner_ID.z; j <= max_corner_ID.z; j++) {
    			for (int k = min_corner_ID.y; k <= max_corner_ID.y; k++) {
    				for (int l = min_corner_ID.x; l <= max_corner_ID.x; l++) {
    
    					__private float3 center = (float3)(
    						voxel_size / 2 + l * voxel_size + min_corner_grid.x,
    						voxel_size / 2 + k * voxel_size + min_corner_grid.y,
    						voxel_size / 2 + j * voxel_size + min_corner_grid.z
    						);
    					if (triBoxOverlap(voxel_size, center, v0, v1, v2)) {
    						unsigned long voxel_index = l + k * dim_grid.x + j * dim_grid.x * dim_grid.y;
    						__private unsigned int array_index = voxel_index / 32;
    						__private unsigned int bit_pos = voxel_index % 32;
    						__private unsigned int mask = 1 << bit_pos;
    						atomic_or(&density[array_index], mask);
    					}
    				}
    			}
    		}
    	}
    }

with the following piece of host code:

	//Initialization
    	cl_device_id device = create_device();
    	print_device_info(device);
    	cl_int err;
    	cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    	if (err < 0) {
    		printf("Error code: %d. Couldn't create a context
", err);
    		exit(1);
    	}
    	cl_program program = build_program(context, device, KERNEL_FILE);
    	cl_command_queue queue = clCreateCommandQueue(context, device,
    		CL_QUEUE_PROFILING_ENABLE, &err);
    	if (err < 0) {
    		printf("Error code: %d. Couldn't create a command queue
", err);
    		exit(1);
    	};
        cl_ulong voxel_counts = dim_grid[0] * dim_grid[1] * dim_grid[2];
        	cl_uint array_length = ceil((cl_double)voxel_counts / 32.0); 
        	cl_uint *density = (cl_uint*)malloc(sizeof(cl_uint)*array_length);
        	for (int i = 0; i < array_length; i++) {
        		density[i] = 0;
        	}
    size_t voxelization_local_size, voxelization_global_size, max_local_size, voxelization_revised_global_size;
    	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, NULL);
    	voxelization_global_size = triangles_count;
    	voxelization_local_size = max_local_size;
    	if (voxelization_global_size % voxelization_local_size != 0) {
    		voxelization_revised_global_size = (voxelization_global_size / voxelization_local_size + 1) * voxelization_local_size;
    	}
    	else {
    		voxelization_revised_global_size = voxelization_global_size;
    	}
    	cl_mem min_corner_grid_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
    		CL_MEM_COPY_HOST_PTR, sizeof(float) * 3, min_corner_grid, &err);
    	if (err < 0) {
    		printf("Error code: %d. Couldn't create the h_minBoundsGrid_buffer
", err);
    		exit(1);
    	};
    	cl_mem dim_grid_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
    		CL_MEM_COPY_HOST_PTR, sizeof(int) * 3, dim_grid, &err);
    	if (err < 0) {
    		printf("Error code: %d. Couldn't create the dimGrid_buffer
", err);
    		exit(1);
    	};
    	cl_mem coords_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
    		CL_MEM_COPY_HOST_PTR, sizeof(float)* triangles_count * 9, coords, &err);
    	if (err < 0) {
    		printf("Error code: %d. Couldn't create the coords_buffer
", err);
    		exit(1);
    	};
    	cl_mem density_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
    		CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * array_length, density, &err);
    	if (err < 0) {
    		printf("Error code: %d. Couldn't create the density_buffer
", err);
    		exit(1);
    	};
    	cl_kernel voxelization_kernel;
    	voxelization_kernel = clCreateKernel(program, "voxelizer", NULL);
    	if (err < 0) {
    		printf("Error code: %d. Couldn't create voxelization_kernel
", err);
    		exit(1);
    	};
    	err = clSetKernelArg(voxelization_kernel, 0, sizeof(cl_uint), &voxelization_global_size);
    	float voxel_size = VOXEL_SIZE;
    	err |= clSetKernelArg(voxelization_kernel, 1, sizeof(float), &voxel_size);
    	err |= clSetKernelArg(voxelization_kernel, 2, sizeof(cl_mem), &min_corner_grid_buffer);
    	err |= clSetKernelArg(voxelization_kernel, 3, sizeof(cl_mem), &dim_grid_buffer);
    	err |= clSetKernelArg(voxelization_kernel, 4, sizeof(cl_mem), &coords_buffer);
    	err |= clSetKernelArg(voxelization_kernel, 5, sizeof(cl_mem), &density_buffer);
    	if (err < 0) {
    		printf("Error code: %d. Couldn't create an argument for voxelization_kernel
", err);
    		exit(1);
    	}
    	
	cl_ulong private_mem_size;
	err = clGetKernelWorkGroupInfo(voxelization_kernel, device,
		CL_KERNEL_PRIVATE_MEM_SIZE, sizeof(size_t), &private_mem_size, NULL);
	printf("CL_KERNEL_PRIVATE_MEM_SIZE = %d", private_mem_size)
    	
    	cl_event voxelization_kernel_event;
    	err = clEnqueueNDRangeKernel(queue, voxelization_kernel, 1, NULL, &voxelization_revised_global_size,
    		&voxelization_local_size, 0, NULL, &voxelization_kernel_event);
    	if (err < 0) {
    		printf("Error code: %d. Couldn't enqueue the voxelization_kernel
", err);
    		exit(1);
    	}
    	err = clWaitForEvents(1, &voxelization_kernel_event);
    	if (err < 0) {
    		printf("Error code: %d, clWaitForEvent
", err);
    		exit(1);
    	}

Upon enqueuig the kernel, clWaitForEvents throws CL_OUT_OF_RESOURCES error. It is working fine for global size of 16384 with local size set to 1024. However, for greater global sizes I have the error. I am totally lost in debugging this code. I tried to use clGetKernelWorkGroupInfo to find out if I am exceeding the amount of local and private memory available on my device. However, it gives me 0 private memory and 1 KB local memory which looks strange. I would really appreciate it if you guys can help me out. I am using a GF970 GTX.

BTW, commenting the aotmic OR operation avoids the error. However, I really need that to get correct answer out of the kernel. I suspect seg-fault. In the inner-most part of the kernel, there is an atomic operation. When I comment that, the kernel runs without any error. I think whatever the problem is, it has to do with accessing to density array, which is stored ni global memory. To access the elements of this aray, I use a variable “array_index” which is an unsigned integer. However, it is derived from an unsigned long variable called “voxel_index”. Am I converting this two type of variables correctly?

This is a common problem with NVIDIA’s OpenCL. You can only create a certain number of workitems (like in CUDA) and you should workaround it using a for loop, i.e. “for (int i = get_global_id(0); i < actual_work_size; i += global_size(0))”. There is a parameter you can query to determine current device’s block size for portability, but I can’t tell you anything exact. That atomic part, though: no clue.

Thanks for the reply. I think I found the cause of this issue. In the host code, when I allocate the density array, cl_uint can not hold the large number required for malloc. So, I was getting wrong array size and thus seg-fault.