Error propagation from devices to application level

Hi,

We are using the NVIDIA’s OpenCL conformant 1.0 SDK to develop an application. We are compiling the OpenCL code and getting the “.ptx” files as binaries ( different devices have different formats ). When we try to run the code we get an “CL_OUT_OF_RESOURCES (-5)” error. Therefore there is a problem with device capabilities. How can we learn which property had caused the error ? Is there a possibility to propagate device specific error to the application level ?

Ahmet,

I think somebody from GPU team posted a reply to similar question. It is likely due to too many threads being spawned. Make sure you do not exceed device limitations. I use the following code to find out what my device (CPU) can do. May need some tweaking for GPU device.



	/*Display OpenCL system info */
	void printCLInfo()
	{
		size_t p_size;
		size_t arr_tsize[3];
		size_t ret_size;
		char param[100];
		cl_uint entries;
		cl_ulong long_entries;
		cl_bool bool_entries;
		cl_device_id devices[MAX_DEVICES];
		cl_uint num_devices;
		cl_device_local_mem_type mem_type;
		cl_device_type dev_type;
		cl_device_fp_config fp_conf;
		cl_device_exec_capabilities exec_cap;

	

		clGetPlatformInfo(CL_PLATFORM_PROFILE,sizeof(param),param,&ret_size);
		printf("
Platform Profile:	%s
",param);
		clGetPlatformInfo(CL_PLATFORM_VERSION,sizeof(param),param,&ret_size);
		printf("Platform Version:	%s
",param);

		clGetDeviceIDs(CL_DEVICE_TYPE_ALL,MAX_DEVICES,devices,&num_devices);
		printf("Found Devices:		%d
",num_devices);

		for(int i=0; i<num_devices; i++)
		{
			printf("
Device: %d

",i);
	
			clGetDeviceInfo(devices[i],CL_DEVICE_TYPE,sizeof(dev_type),&dev_type,&ret_size);
			printf("	Device Type:		");
			if(dev_type & CL_DEVICE_TYPE_GPU)
				printf("CL_DEVICE_TYPE_GPU ");
			if(dev_type & CL_DEVICE_TYPE_CPU)
				printf("CL_DEVICE_TYPE_CPU ");
			if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
				printf("CL_DEVICE_TYPE_ACCELERATOR ");
			if(dev_type & CL_DEVICE_TYPE_DEFAULT)
				printf("CL_DEVICE_TYPE_DEFAULT ");
			printf("
");


			clGetDeviceInfo(devices[i],CL_DEVICE_NAME,sizeof(param),param,&ret_size);
			printf("	Name: 			%s
",param);

			clGetDeviceInfo(devices[i],CL_DEVICE_VENDOR,sizeof(param),param,&ret_size);
			printf("	Vendor: 		%s
",param);

			clGetDeviceInfo(devices[i],CL_DEVICE_VENDOR_ID,sizeof(cl_uint),&entries,&ret_size);
			printf("	Vendor ID:		%d
",entries);

			clGetDeviceInfo(devices[i],CL_DEVICE_VERSION,sizeof(param),param,&ret_size);
			printf("	Version:		%s
",param);
		
			clGetDeviceInfo(devices[i],CL_DEVICE_PROFILE,sizeof(param),param,&ret_size);
			printf("	Profile:		%s
",param);
	
			clGetDeviceInfo(devices[i],CL_DRIVER_VERSION,sizeof(param),param,&ret_size);
			printf("	Driver: 		%s
",param);

			clGetDeviceInfo(devices[i],CL_DEVICE_EXTENSIONS,sizeof(param),param,&ret_size);
			printf("	Extensions:		%s
",param);

			clGetDeviceInfo(devices[i],CL_DEVICE_MAX_WORK_ITEM_SIZES,3*sizeof(size_t),arr_tsize,&ret_size);
			printf("	Max Work-Item Sizes:	(%d,%d,%d)
",arr_tsize[0],arr_tsize[1],arr_tsize[2]);

			clGetDeviceInfo(devices[i],CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t),&p_size,&ret_size);
			printf("	Max Work Group Size:	%d
",p_size);
		
			clGetDeviceInfo(devices[i],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&entries,&ret_size);
			printf("	Max Compute Units:	%d
",entries);

			clGetDeviceInfo(devices[i],CL_DEVICE_MAX_CLOCK_FREQUENCY,sizeof(cl_uint),&entries,&ret_size);
			printf("	Max Frequency (Mhz):	%d
",entries);

			clGetDeviceInfo(devices[i],CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE,sizeof(cl_uint),&entries,&ret_size);
			printf("	Cache Line (bytes):	%d
",entries);

			clGetDeviceInfo(devices[i],CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,&ret_size);
			printf("	Global Memory (MB):	%llu
",long_entries/1024/1024);

			clGetDeviceInfo(devices[i],CL_DEVICE_LOCAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,&ret_size);
			printf("	Local Memory (MB):	%llu
",long_entries/1024/1024);

			clGetDeviceInfo(devices[i],CL_DEVICE_LOCAL_MEM_TYPE,sizeof(cl_device_local_mem_type),&mem_type,&ret_size);
			if(mem_type & CL_LOCAL)
				printf("	Local Memory Type:	CL_LOCAL
");
			else if(mem_type & CL_GLOBAL)
				printf("	Local Memory Type:	CL_GLOBAL
");
			else
				printf("	Local Memory Type:	UNKNOWN
");


			clGetDeviceInfo(devices[i],CL_DEVICE_MAX_MEM_ALLOC_SIZE,sizeof(cl_ulong),&long_entries,&ret_size);
			printf("	Max Mem Alloc (MB):	%llu
",long_entries/1024/1024);

			clGetDeviceInfo(devices[i],CL_DEVICE_MAX_PARAMETER_SIZE,sizeof(size_t),&p_size,&ret_size);
			printf("	Max Param Size (MB):	%d
",p_size);

			clGetDeviceInfo(devices[i],CL_DEVICE_MEM_BASE_ADDR_ALIGN,sizeof(cl_uint),&entries,&ret_size);
			printf("	Base Mem Align (bits):	%d
",entries);

			clGetDeviceInfo(devices[i],CL_DEVICE_ADDRESS_BITS,sizeof(cl_uint),&entries,&ret_size);
			printf("	Address Space (bits):	%d
",entries);

			clGetDeviceInfo(devices[i],CL_DEVICE_IMAGE_SUPPORT,sizeof(cl_bool),&bool_entries,&ret_size);
			printf("	Image Support:		%d
",bool_entries);

			clGetDeviceInfo(devices[i],CL_DEVICE_TYPE,sizeof(fp_conf),&fp_conf,&ret_size);
			printf("	Float Functionality:	");
			if(fp_conf & CL_FP_DENORM)
				printf("DENORM support ");
			if(fp_conf & CL_FP_ROUND_TO_NEAREST)
				printf("Round to nearest support ");
			if(fp_conf & CL_FP_ROUND_TO_ZERO)
				printf("Round to zero support ");
			if(fp_conf & CL_FP_ROUND_TO_INF)
				printf("Round to +ve/-ve infinity support ");
			if(fp_conf & CL_FP_FMA)
				printf("IEEE754 fused-multiply-add support ");
			if(fp_conf & CL_FP_INF_NAN)
				printf("INF and NaN support ");
			printf("
");


			clGetDeviceInfo(devices[i],CL_DEVICE_ERROR_CORRECTION_SUPPORT,sizeof(cl_bool),&bool_entries,&ret_size);
			printf("	ECC Support:		%d
",bool_entries);

			clGetDeviceInfo(devices[i],CL_DEVICE_EXECUTION_CAPABILITIES,sizeof(cl_device_exec_capabilities),&exec_cap,&ret_size);
			printf("	Exec Functionality:	");
			if(exec_cap & CL_EXEC_KERNEL)
				printf("CL_EXEC_KERNEL ");
			if(exec_cap & CL_EXEC_NATIVE_KERNEL)
				printf("CL_EXEC_NATIVE_KERNEL ");
			printf("
");

			clGetDeviceInfo(devices[i],CL_DEVICE_ENDIAN_LITTLE,sizeof(cl_bool),&bool_entries,&ret_size);
			printf("	Little Endian Device:	%d
",bool_entries);

			clGetDeviceInfo(devices[i],CL_DEVICE_PROFILING_TIMER_RESOLUTION,sizeof(size_t),&p_size,&ret_size);
			printf("	Profiling Res (ns):	%d
",p_size);

			clGetDeviceInfo(devices[i],CL_DEVICE_AVAILABLE,sizeof(cl_bool),&bool_entries,&ret_size);
			printf("	Device Available:	%d
",bool_entries);

		}
	}


The reliable way to determine the max. size of the work-group i.e. the number of work-items that can be specified in local_work_size argument to clEnqueueNDRangeKernel is to call clGetKernelWorkGroupInfo for specific kernel, device with param_name = CL_KERNEL_WORK_GROUP_SIZE. This will return the work group size that can be used. Note that the answer can vary from kernel to kernel so using just the device max values is not sufficient.

The problem as far as we can understand is that the kernel uses too many registers for a single thread. Thus, when we try to set the block size as even a small value such as 8 x 8, the kernel fails. I guess clGetKernelWorkGroupInfo is what we are looking for. Thanks for pointing it out.

Keep in mind that if clGetKernelWorkGroupInfo returns a maximum size of, say, 128, you may be limited in how you can use it. Use clGetDeviceInfo to query the maximum size you are allowed in each dimension. For example, one device may allow a maximum workgroup size of 8 in the 3rd dimension, so you would have to obey that when enqueuing your kernel.