I think somebody from GPU team posted a reply to similar question. It is likely due to too many threads being spawned. Make sure you do not exceed device limitations. I use the following code to find out what my device (CPU) can do. May need some tweaking for GPU device.
/*Display OpenCL system info */
void printCLInfo()
{
size_t p_size;
size_t arr_tsize[3];
size_t ret_size;
char param[100];
cl_uint entries;
cl_ulong long_entries;
cl_bool bool_entries;
cl_device_id devices[MAX_DEVICES];
cl_uint num_devices;
cl_device_local_mem_type mem_type;
cl_device_type dev_type;
cl_device_fp_config fp_conf;
cl_device_exec_capabilities exec_cap;
clGetPlatformInfo(CL_PLATFORM_PROFILE,sizeof(param),param,&ret_size);
printf("
Platform Profile: %s
",param);
clGetPlatformInfo(CL_PLATFORM_VERSION,sizeof(param),param,&ret_size);
printf("Platform Version: %s
",param);
clGetDeviceIDs(CL_DEVICE_TYPE_ALL,MAX_DEVICES,devices,&num_devices);
printf("Found Devices: %d
",num_devices);
for(int i=0; i<num_devices; i++)
{
printf("
Device: %d
",i);
clGetDeviceInfo(devices[i],CL_DEVICE_TYPE,sizeof(dev_type),&dev_type,&ret_size);
printf(" Device Type: ");
if(dev_type & CL_DEVICE_TYPE_GPU)
printf("CL_DEVICE_TYPE_GPU ");
if(dev_type & CL_DEVICE_TYPE_CPU)
printf("CL_DEVICE_TYPE_CPU ");
if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
printf("CL_DEVICE_TYPE_ACCELERATOR ");
if(dev_type & CL_DEVICE_TYPE_DEFAULT)
printf("CL_DEVICE_TYPE_DEFAULT ");
printf("
");
clGetDeviceInfo(devices[i],CL_DEVICE_NAME,sizeof(param),param,&ret_size);
printf(" Name: %s
",param);
clGetDeviceInfo(devices[i],CL_DEVICE_VENDOR,sizeof(param),param,&ret_size);
printf(" Vendor: %s
",param);
clGetDeviceInfo(devices[i],CL_DEVICE_VENDOR_ID,sizeof(cl_uint),&entries,&ret_size);
printf(" Vendor ID: %d
",entries);
clGetDeviceInfo(devices[i],CL_DEVICE_VERSION,sizeof(param),param,&ret_size);
printf(" Version: %s
",param);
clGetDeviceInfo(devices[i],CL_DEVICE_PROFILE,sizeof(param),param,&ret_size);
printf(" Profile: %s
",param);
clGetDeviceInfo(devices[i],CL_DRIVER_VERSION,sizeof(param),param,&ret_size);
printf(" Driver: %s
",param);
clGetDeviceInfo(devices[i],CL_DEVICE_EXTENSIONS,sizeof(param),param,&ret_size);
printf(" Extensions: %s
",param);
clGetDeviceInfo(devices[i],CL_DEVICE_MAX_WORK_ITEM_SIZES,3*sizeof(size_t),arr_tsize,&ret_size);
printf(" Max Work-Item Sizes: (%d,%d,%d)
",arr_tsize[0],arr_tsize[1],arr_tsize[2]);
clGetDeviceInfo(devices[i],CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t),&p_size,&ret_size);
printf(" Max Work Group Size: %d
",p_size);
clGetDeviceInfo(devices[i],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&entries,&ret_size);
printf(" Max Compute Units: %d
",entries);
clGetDeviceInfo(devices[i],CL_DEVICE_MAX_CLOCK_FREQUENCY,sizeof(cl_uint),&entries,&ret_size);
printf(" Max Frequency (Mhz): %d
",entries);
clGetDeviceInfo(devices[i],CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE,sizeof(cl_uint),&entries,&ret_size);
printf(" Cache Line (bytes): %d
",entries);
clGetDeviceInfo(devices[i],CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,&ret_size);
printf(" Global Memory (MB): %llu
",long_entries/1024/1024);
clGetDeviceInfo(devices[i],CL_DEVICE_LOCAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,&ret_size);
printf(" Local Memory (MB): %llu
",long_entries/1024/1024);
clGetDeviceInfo(devices[i],CL_DEVICE_LOCAL_MEM_TYPE,sizeof(cl_device_local_mem_type),&mem_type,&ret_size);
if(mem_type & CL_LOCAL)
printf(" Local Memory Type: CL_LOCAL
");
else if(mem_type & CL_GLOBAL)
printf(" Local Memory Type: CL_GLOBAL
");
else
printf(" Local Memory Type: UNKNOWN
");
clGetDeviceInfo(devices[i],CL_DEVICE_MAX_MEM_ALLOC_SIZE,sizeof(cl_ulong),&long_entries,&ret_size);
printf(" Max Mem Alloc (MB): %llu
",long_entries/1024/1024);
clGetDeviceInfo(devices[i],CL_DEVICE_MAX_PARAMETER_SIZE,sizeof(size_t),&p_size,&ret_size);
printf(" Max Param Size (MB): %d
",p_size);
clGetDeviceInfo(devices[i],CL_DEVICE_MEM_BASE_ADDR_ALIGN,sizeof(cl_uint),&entries,&ret_size);
printf(" Base Mem Align (bits): %d
",entries);
clGetDeviceInfo(devices[i],CL_DEVICE_ADDRESS_BITS,sizeof(cl_uint),&entries,&ret_size);
printf(" Address Space (bits): %d
",entries);
clGetDeviceInfo(devices[i],CL_DEVICE_IMAGE_SUPPORT,sizeof(cl_bool),&bool_entries,&ret_size);
printf(" Image Support: %d
",bool_entries);
clGetDeviceInfo(devices[i],CL_DEVICE_TYPE,sizeof(fp_conf),&fp_conf,&ret_size);
printf(" Float Functionality: ");
if(fp_conf & CL_FP_DENORM)
printf("DENORM support ");
if(fp_conf & CL_FP_ROUND_TO_NEAREST)
printf("Round to nearest support ");
if(fp_conf & CL_FP_ROUND_TO_ZERO)
printf("Round to zero support ");
if(fp_conf & CL_FP_ROUND_TO_INF)
printf("Round to +ve/-ve infinity support ");
if(fp_conf & CL_FP_FMA)
printf("IEEE754 fused-multiply-add support ");
if(fp_conf & CL_FP_INF_NAN)
printf("INF and NaN support ");
printf("
");
clGetDeviceInfo(devices[i],CL_DEVICE_ERROR_CORRECTION_SUPPORT,sizeof(cl_bool),&bool_entries,&ret_size);
printf(" ECC Support: %d
",bool_entries);
clGetDeviceInfo(devices[i],CL_DEVICE_EXECUTION_CAPABILITIES,sizeof(cl_device_exec_capabilities),&exec_cap,&ret_size);
printf(" Exec Functionality: ");
if(exec_cap & CL_EXEC_KERNEL)
printf("CL_EXEC_KERNEL ");
if(exec_cap & CL_EXEC_NATIVE_KERNEL)
printf("CL_EXEC_NATIVE_KERNEL ");
printf("
");
clGetDeviceInfo(devices[i],CL_DEVICE_ENDIAN_LITTLE,sizeof(cl_bool),&bool_entries,&ret_size);
printf(" Little Endian Device: %d
",bool_entries);
clGetDeviceInfo(devices[i],CL_DEVICE_PROFILING_TIMER_RESOLUTION,sizeof(size_t),&p_size,&ret_size);
printf(" Profiling Res (ns): %d
",p_size);
clGetDeviceInfo(devices[i],CL_DEVICE_AVAILABLE,sizeof(cl_bool),&bool_entries,&ret_size);
printf(" Device Available: %d
",bool_entries);
}
}