I have two ATI Radeon 5970 graphics card with Intel Core i7 processor running 64-bit CentOS 5.3 on my system. I have installed ATI Catalyst 12.2(64-bit) drivers and AMD APP SDK 2.4 as well. Output given by CLInfo sample program of SDK is
Number of platforms: 1
Platform Profile: FULL_PROFILE
Platform Version: OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
Platform Name: AMD Accelerated Parallel Processing
Platform Vendor: Advanced Micro Devices, Inc.
Platform Extensions: cl_khr_icd cl_amd_event_callback cl_amd_offline_devices
Platform Name: AMD Accelerated Parallel Processing
Number of devices: 3
Device Type: CL_DEVICE_TYPE_GPU
Device ID: 4098
Max compute units: 20
Max work items dimensions: 3
Max work items[0]: 256
Max work items[1]: 256
Max work items[2]: 256
Max work group size: 256
Preferred vector width char: 16
Preferred vector width short: 8
Preferred vector width int: 4
Preferred vector width long: 2
Preferred vector width float: 4
Preferred vector width double: 0
Max clock frequency: 725Mhz
Address bits: 32
Max memory allocation: 134217728
Image support: Yes
Max number of images read arguments: 128
Max number of images write arguments: 8
Max image 2D width: 8192
Max image 2D height: 8192
Max image 3D width: 2048
Max image 3D height: 2048
Max image 3D depth: 2048
Max samplers within kernel: 16
Max size of kernel argument: 1024
Alignment (bits) of base address: 32768
Minimum alignment (bytes) for any datatype: 128
Single precision floating point capability
Denorms: No
Quiet NaNs: Yes
Round to nearest even: Yes
Round to zero: Yes
Round to +ve and infinity: Yes
IEEE754-2008 fused multiply-add: Yes
Cache type: None
Cache line size: 0
Cache size: 0
Global memory size: 536870912
Constant buffer size: 65536
Max number of constant args: 8
Local memory type: Scratchpad
Local memory size: 32768
Profiling timer resolution: 1
Device endianess: Little
Available: Yes
Compiler available: Yes
Execution capabilities:
Execute OpenCL kernels: Yes
Execute native function: No
Queue properties:
Out-of-Order: No
Profiling : Yes
Platform ID: 0x2b1e1f8c3800
Name: Cypress
Vendor: Advanced Micro Devices, Inc.
Driver version: CAL 1.4.1703
Profile: FULL_PROFILE
Version: OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
Extensions: cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_amd_device_attribute_query cl_amd_printf cl_amd_media_ops cl_amd_popcnt
Device Type: CL_DEVICE_TYPE_GPU
Device ID: 4098
Max compute units: 20
Max work items dimensions: 3
Max work items[0]: 256
Max work items[1]: 256
Max work items[2]: 256
Max work group size: 256
Preferred vector width char: 16
Preferred vector width short: 8
Preferred vector width int: 4
Preferred vector width long: 2
Preferred vector width float: 4
Preferred vector width double: 0
Max clock frequency: 725Mhz
Address bits: 32
Max memory allocation: 134217728
Image support: Yes
Max number of images read arguments: 128
Max number of images write arguments: 8
Max image 2D width: 8192
Max image 2D height: 8192
Max image 3D width: 2048
Max image 3D height: 2048
Max image 3D depth: 2048
Max samplers within kernel: 16
Max size of kernel argument: 1024
Alignment (bits) of base address: 32768
Minimum alignment (bytes) for any datatype: 128
Single precision floating point capability
Denorms: No
Quiet NaNs: Yes
Round to nearest even: Yes
Round to zero: Yes
Round to +ve and infinity: Yes
IEEE754-2008 fused multiply-add: Yes
Cache type: None
Cache line size: 0
Cache size: 0
Global memory size: 536870912
Constant buffer size: 65536
Max number of constant args: 8
Local memory type: Scratchpad
Local memory size: 32768
Profiling timer resolution: 1
Device endianess: Little
Available: Yes
Compiler available: Yes
Execution capabilities:
Execute OpenCL kernels: Yes
Execute native function: No
Queue properties:
Out-of-Order: No
Profiling : Yes
Platform ID: 0x2b1e1f8c3800
Name: Cypress
Vendor: Advanced Micro Devices, Inc.
Driver version: CAL 1.4.1703
Profile: FULL_PROFILE
Version: OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
Extensions: cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_amd_device_attribute_query cl_amd_printf cl_amd_media_ops cl_amd_popcnt
Device Type: CL_DEVICE_TYPE_CPU
Device ID: 4098
Max compute units: 12
Max work items dimensions: 3
Max work items[0]: 1024
Max work items[1]: 1024
Max work items[2]: 1024
Max work group size: 1024
Preferred vector width char: 16
Preferred vector width short: 8
Preferred vector width int: 4
Preferred vector width long: 2
Preferred vector width float: 4
Preferred vector width double: 0
Max clock frequency: 1596Mhz
Address bits: 64
Max memory allocation: 3145740288
Image support: Yes
Max number of images read arguments: 128
Max number of images write arguments: 8
Max image 2D width: 8192
Max image 2D height: 8192
Max image 3D width: 2048
Max image 3D height: 2048
Max image 3D depth: 2048
Max samplers within kernel: 16
Max size of kernel argument: 4096
Alignment (bits) of base address: 1024
Minimum alignment (bytes) for any datatype: 128
Single precision floating point capability
Denorms: Yes
Quiet NaNs: Yes
Round to nearest even: Yes
Round to zero: Yes
Round to +ve and infinity: Yes
IEEE754-2008 fused multiply-add: No
Cache type: Read/Write
Cache line size: 0
Cache size: 0
Global memory size: 12582961152
Constant buffer size: 65536
Max number of constant args: 8
Local memory type: Global
Local memory size: 32768
Profiling timer resolution: 999848
Device endianess: Little
Available: Yes
Compiler available: Yes
Execution capabilities:
Execute OpenCL kernels: Yes
Execute native function: Yes
Queue properties:
Out-of-Order: No
Profiling : Yes
Platform ID: 0x2b1e1f8c3800
Name: Intel(R) Core(TM) i7 CPU X 980 @ 3.33GHz
Vendor: GenuineIntel
Driver version: 2.0
Profile: FULL_PROFILE
Version: OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
Extensions: cl_khr_fp64 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_byte_addressable_store cl_khr_gl_sharing cl_ext_device_fission cl_amd_device_attribute_query cl_amd_vec3 cl_amd_media_ops cl_amd_popcnt cl_amd_printf
Passed!
My system gives segmentation fault when I create a command queue on a GPU device but It works fine on a CPU device for all programs(including sample programs of SDK). For example, the following program(chk_mod.c) works fine with CPU but gives Segmentation fault with GPU
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <time.h>
#define MAX_SOURCE_SIZE (0x10000000)
#define RNGE (100000)
int main()
{
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem Amobj = NULL;
cl_mem Bmobj = NULL;
cl_mem Cmobj = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
clock_t time_i, time_f;
time_i = clock();
int i;
int j;
int *A;
int *B;
A = (int *) malloc( RNGE * sizeof(int) );
B = (int *) malloc( RNGE * sizeof(int) );
FILE *fp;
const char fileName[] = "chk_mod.cl";
size_t source_size;
char *source_str;
fp = fopen(fileName, "r");
if ( ! fp ) {
fprintf(stderr, "Failed to load kernel.
");
exit(1);
}
source_str = (char *) malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
for ( i=0; i < RNGE; i++ ) {
A[ i ] = i;
B[ i ] = A[i];
}
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
Amobj = clCreateBuffer(context, CL_MEM_READ_ONLY, RNGE*sizeof(int), NULL, &ret);
Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, RNGE*sizeof(int), NULL, &ret);
ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, RNGE*sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, RNGE*sizeof(int), B, 0, NULL, NULL);
program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
kernel = clCreateKernel(program, "data_parallel", &ret);
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &Amobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &Bmobj);
size_t global_item_size = RNGE;
size_t local_item_size = 1;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, 0, &global_item_size, &local_item_size, 0, 0, 0 );
ret = clEnqueueReadBuffer(command_queue, Bmobj, CL_TRUE, 0, RNGE*sizeof(int), B, 0, NULL, NULL);
printf("Result: ");
for ( i=2; i < RNGE; i++ ) {
if ( B[i] ) {
printf( "%d ", B[i] );
}
}
printf("
");
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(Amobj);
ret = clReleaseMemObject(Bmobj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(source_str);
free(A);
free(B);
time_f = clock();
printf("Time elapsed = %7.3fs
", (float) (time_f - time_i)/CLOCKS_PER_SEC);
return 0;
}
Kernel file(chk_mod.cl) is:
__kernel void data_parallel( __global int *A, __global int *B )
{
int t;
int i = get_global_id(0);
for ( t = 2; t < i; t++ ) {
if ( i % t == 0 ) {
B[ i ] = 0;
}
}
}
Can someone please help me with the issue as soon as possible?