Segmentation fault while creating command queue

whitepearl · March 22, 2012, 8:41am

My system has two ATI Radeon 5970 graphics cards and an Intel Core i7 processor, running 64-bit CentOS 5.3. I have installed the ATI Catalyst 12.2 (64-bit) drivers and AMD APP SDK 2.4 as well. The output of the SDK's CLInfo sample program is:

Number of platforms:				 1
  Platform Profile:				 FULL_PROFILE
  Platform Version:				 OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
  Platform Name:					 AMD Accelerated Parallel Processing
  Platform Vendor:				 Advanced Micro Devices, Inc.
  Platform Extensions:			 cl_khr_icd cl_amd_event_callback cl_amd_offline_devices


  Platform Name:					 AMD Accelerated Parallel Processing
Number of devices:				 3
  Device Type:					 CL_DEVICE_TYPE_GPU
  Device ID:					 4098
  Max compute units:				 20
  Max work items dimensions:			 3
    Max work items[0]:				 256
    Max work items[1]:				 256
    Max work items[2]:				 256
  Max work group size:				 256
  Preferred vector width char:			 16
  Preferred vector width short:			 8
  Preferred vector width int:			 4
  Preferred vector width long:			 2
  Preferred vector width float:			 4
  Preferred vector width double:		 0
  Max clock frequency:				 725Mhz
  Address bits:					 32
  Max memory allocation:			 134217728
  Image support:				 Yes
  Max number of images read arguments:	 128
  Max number of images write arguments:	 8
  Max image 2D width:			 8192
  Max image 2D height:			 8192
  Max image 3D width:			 2048
  Max image 3D height:	 2048
  Max image 3D depth:			 2048
  Max samplers within kernel:		 16
  Max size of kernel argument:			 1024
  Alignment (bits) of base address:		 32768
  Minimum alignment (bytes) for any datatype:	 128
  Single precision floating point capability
    Denorms:					 No
    Quiet NaNs:					 Yes
    Round to nearest even:			 Yes
    Round to zero:				 Yes
    Round to +ve and infinity:			 Yes
    IEEE754-2008 fused multiply-add:		 Yes
  Cache type:					 None
  Cache line size:				 0
  Cache size:					 0
  Global memory size:				 536870912
  Constant buffer size:				 65536
  Max number of constant args:			 8
  Local memory type:				 Scratchpad
  Local memory size:				 32768
  Profiling timer resolution:			 1
  Device endianess:				 Little
  Available:					 Yes
  Compiler available:				 Yes
  Execution capabilities:				 
    Execute OpenCL kernels:			 Yes
    Execute native function:			 No
  Queue properties:				 
    Out-of-Order:				 No
    Profiling :					 Yes
  Platform ID:					 0x2b1e1f8c3800
  Name:						 Cypress
  Vendor:					 Advanced Micro Devices, Inc.
  Driver version:				 CAL 1.4.1703
  Profile:					 FULL_PROFILE
  Version:					 OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
  Extensions:					 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_amd_device_attribute_query cl_amd_printf cl_amd_media_ops cl_amd_popcnt 
  Device Type:					 CL_DEVICE_TYPE_GPU
  Device ID:					 4098
  Max compute units:				 20
  Max work items dimensions:			 3
    Max work items[0]:				 256
    Max work items[1]:				 256
    Max work items[2]:				 256
  Max work group size:				 256
  Preferred vector width char:			 16
  Preferred vector width short:			 8
  Preferred vector width int:			 4
  Preferred vector width long:			 2
  Preferred vector width float:			 4
  Preferred vector width double:		 0
  Max clock frequency:				 725Mhz
  Address bits:					 32
  Max memory allocation:			 134217728
  Image support:				 Yes
  Max number of images read arguments:	 128
  Max number of images write arguments:	 8
  Max image 2D width:			 8192
  Max image 2D height:			 8192
  Max image 3D width:			 2048
  Max image 3D height:	 2048
  Max image 3D depth:			 2048
  Max samplers within kernel:		 16
  Max size of kernel argument:			 1024
  Alignment (bits) of base address:		 32768
  Minimum alignment (bytes) for any datatype:	 128
  Single precision floating point capability
    Denorms:					 No
    Quiet NaNs:					 Yes
    Round to nearest even:			 Yes
    Round to zero:				 Yes
    Round to +ve and infinity:			 Yes
    IEEE754-2008 fused multiply-add:		 Yes
  Cache type:					 None
  Cache line size:				 0
  Cache size:					 0
  Global memory size:				 536870912
  Constant buffer size:				 65536
  Max number of constant args:			 8
  Local memory type:				 Scratchpad
  Local memory size:				 32768
  Profiling timer resolution:			 1
  Device endianess:				 Little
  Available:					 Yes
  Compiler available:				 Yes
  Execution capabilities:				 
    Execute OpenCL kernels:			 Yes
    Execute native function:			 No
  Queue properties:				 
    Out-of-Order:				 No
    Profiling :					 Yes
  Platform ID:					 0x2b1e1f8c3800
  Name:						 Cypress
  Vendor:					 Advanced Micro Devices, Inc.
  Driver version:				 CAL 1.4.1703
  Profile:					 FULL_PROFILE
  Version:					 OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
  Extensions:					 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_amd_device_attribute_query cl_amd_printf cl_amd_media_ops cl_amd_popcnt 
  Device Type:					 CL_DEVICE_TYPE_CPU
  Device ID:					 4098
  Max compute units:				 12
  Max work items dimensions:			 3
    Max work items[0]:				 1024
    Max work items[1]:				 1024
    Max work items[2]:				 1024
  Max work group size:				 1024
  Preferred vector width char:			 16
  Preferred vector width short:			 8
  Preferred vector width int:			 4
  Preferred vector width long:			 2
  Preferred vector width float:			 4
  Preferred vector width double:		 0
  Max clock frequency:				 1596Mhz
  Address bits:					 64
  Max memory allocation:			 3145740288
  Image support:				 Yes
  Max number of images read arguments:	 128
  Max number of images write arguments:	 8
  Max image 2D width:			 8192
  Max image 2D height:			 8192
  Max image 3D width:			 2048
  Max image 3D height:	 2048
  Max image 3D depth:			 2048
  Max samplers within kernel:		 16
  Max size of kernel argument:			 4096
  Alignment (bits) of base address:		 1024
  Minimum alignment (bytes) for any datatype:	 128
  Single precision floating point capability
    Denorms:					 Yes
    Quiet NaNs:					 Yes
    Round to nearest even:			 Yes
    Round to zero:				 Yes
    Round to +ve and infinity:			 Yes
    IEEE754-2008 fused multiply-add:		 No
  Cache type:					 Read/Write
  Cache line size:				 0
  Cache size:					 0
  Global memory size:				 12582961152
  Constant buffer size:				 65536
  Max number of constant args:			 8
  Local memory type:				 Global
  Local memory size:				 32768
  Profiling timer resolution:			 999848
  Device endianess:				 Little
  Available:					 Yes
  Compiler available:				 Yes
  Execution capabilities:				 
    Execute OpenCL kernels:			 Yes
    Execute native function:			 Yes
  Queue properties:				 
    Out-of-Order:				 No
    Profiling :					 Yes
  Platform ID:					 0x2b1e1f8c3800
  Name:						 Intel(R) Core(TM) i7 CPU       X 980  @ 3.33GHz
  Vendor:					 GenuineIntel
  Driver version:				 2.0
  Profile:					 FULL_PROFILE
  Version:					 OpenCL 1.1 AMD-APP-SDK-v2.4 (595.10)
  Extensions:					 cl_khr_fp64 cl_amd_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_byte_addressable_store cl_khr_gl_sharing cl_ext_device_fission cl_amd_device_attribute_query cl_amd_vec3 cl_amd_media_ops cl_amd_popcnt cl_amd_printf 


Passed!

My system gives a segmentation fault when I create a command queue on a GPU device, but it works fine on a CPU device, for all programs (including the SDK's sample programs). For example, the following program (chk_mod.c) works fine with the CPU but gives a segmentation fault with the GPU:

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <time.h>

#define MAX_SOURCE_SIZE (0x10000000)
#define RNGE (100000)

int main()
{
	cl_platform_id platform_id = NULL;
	cl_device_id device_id = NULL;
	cl_context context = NULL;
	cl_command_queue command_queue = NULL;
	cl_mem Amobj = NULL;
	cl_mem Bmobj = NULL;
	cl_mem Cmobj = NULL;
	cl_program program = NULL;
	cl_kernel kernel = NULL;
	cl_uint ret_num_devices;
	cl_uint ret_num_platforms;
	cl_int ret;

	clock_t time_i, time_f;

	time_i = clock();

	int i;
	int j;
	int *A;
	int *B;

	A = (int *) malloc( RNGE * sizeof(int) );
	B = (int *) malloc( RNGE * sizeof(int) );

	FILE *fp;
	const char fileName[] = "chk_mod.cl";
	size_t source_size;
	char *source_str;

	fp = fopen(fileName, "r");
	if ( ! fp ) {
		fprintf(stderr, "Failed to load kernel.
");
		exit(1);
	}
	source_str = (char *) malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);

	for ( i=0; i < RNGE; i++ ) {
			A[ i ] = i;
			B[ i ] = A[i];
	}

	ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
	ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);

	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

	Amobj = clCreateBuffer(context, CL_MEM_READ_ONLY, RNGE*sizeof(int), NULL, &ret);
	Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, RNGE*sizeof(int), NULL, &ret);

	ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, RNGE*sizeof(int), A, 0, NULL, NULL);
	ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, RNGE*sizeof(int), B, 0, NULL, NULL);

	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
	ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

	kernel = clCreateKernel(program, "data_parallel", &ret);

	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &Amobj);
	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &Bmobj);

	size_t global_item_size = RNGE;
	size_t local_item_size = 1;

	ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, 0, &global_item_size, &local_item_size, 0, 0, 0 );

	ret = clEnqueueReadBuffer(command_queue, Bmobj, CL_TRUE, 0, RNGE*sizeof(int), B, 0, NULL, NULL);

	printf("Result: ");
	for ( i=2; i < RNGE; i++ ) {
		if ( B[i] ) {
			printf( "%d ", B[i] );
		}
	}
	printf("
");

	ret = clFlush(command_queue);
	ret = clFinish(command_queue);
	ret = clReleaseKernel(kernel);
	ret = clReleaseProgram(program);
	ret = clReleaseMemObject(Amobj);
	ret = clReleaseMemObject(Bmobj);
	ret = clReleaseCommandQueue(command_queue);
	ret = clReleaseContext(context);

	free(source_str);

	free(A);
	free(B);

	time_f = clock();
	printf("Time elapsed = %7.3fs
", (float) (time_f - time_i)/CLOCKS_PER_SEC);

	return 0;
}

Kernel file(chk_mod.cl) is:

/*
 * Trial-division test, one work-item per candidate.
 *
 * Work-item i (its global id in dimension 0) clears B[i] to 0 when i has
 * a divisor t with 2 <= t < i; otherwise B[i] is left untouched.  For
 * i < 2 the loop body never runs, so B[0] and B[1] are never written.
 *
 * A is accepted to keep the kernel signature unchanged but is not read.
 */
__kernel void data_parallel( __global int *A, __global int *B )
{
	int t;
	int i = get_global_id(0);

	/*
	 * A composite i always has a divisor no larger than sqrt(i), so it is
	 * enough to scan while t <= i / t (written as a division so that t * t
	 * cannot overflow).  The first divisor found settles the answer, so
	 * stop there instead of storing 0 repeatedly.
	 */
	for ( t = 2; t <= i / t; t++ ) {
		if ( i % t == 0 ) {
			B[ i ] = 0;
			break;
		}
	}
}

Can someone please help me with the issue as soon as possible?

notzed · March 22, 2012, 4:09pm

Works here, although it hard-locked my entire system for 60 seconds while it ran.
(HD 5770)

My system gives segmentation fault when I create a command queue on a GPU device but It works fine on a CPU device for all programs(including sample programs of SDK). For example, the following program(chk_mod.c) works fine with CPU but gives Segmentation fault with GPU

Sigh, not code based on that poor example again; add checking to every call to help identify what is going wrong.

whitepearl · March 23, 2012, 1:13am

I have added checks, but the problem is not with the code. As soon as I run the code, it aborts with a segfault. Even after installing compatible drivers, the system segfaults for any OpenCL program running on the GPU (even the sample programs of the AMD SDK).

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <time.h>

#define MAX_SOURCE_SIZE (0x10000000)
#define RNGE (100000)

int main()
{
        cl_platform_id platform_id = NULL;
        cl_device_id device_id = NULL;
        cl_context context = NULL;
        cl_command_queue command_queue = NULL;
        cl_mem Amobj = NULL;
        cl_mem Bmobj = NULL;
        cl_mem Cmobj = NULL;
        cl_program program = NULL;
        cl_kernel kernel = NULL;
        cl_uint ret_num_devices;
        cl_uint ret_num_platforms;
        cl_int ret;

        clock_t time_i, time_f;

        time_i = clock();

        int i;
        int j;
        int *A;
        int *B;

        A = (int *) malloc( RNGE * sizeof(int) );
        B = (int *) malloc( RNGE * sizeof(int) );

        FILE *fp;
        const char fileName[] = "chk_mod.cl";
        size_t source_size;
        char *source_str;

        fp = fopen(fileName, "r");
        if ( ! fp ) {
                fprintf(stderr, "Failed to load kernel.
");
                exit(1);
        }
        source_str = (char *) malloc(MAX_SOURCE_SIZE);
        source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
        fclose(fp);

        for ( i=0; i < RNGE; i++ ) {
                        A[ i ] = i;
                        B[ i ] = A[i];
        }

        ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot get platform id.
" );
                return 1;
        }


        ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot get device id.
" );
                return 1;
        }


        context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot create context
" );
                return 1;
        }

        command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot create command queue.
" );
                return 1;
        }

        Amobj = clCreateBuffer(context, CL_MEM_READ_ONLY, RNGE*sizeof(int), NULL, &ret);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot create buffer A
" );
                return 1;
        }

        Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, RNGE*sizeof(int), NULL, &ret);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot create buffer B
" );
                return 1;
        }


        ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, RNGE*sizeof(int), A, 0, NULL, NULL);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot enqueue in write buffer.
" );
                return 1;
        }

        ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, RNGE*sizeof(int), B, 0, NULL, NULL);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot enqueue in write buffer.
" );
                return 1;
        }


        program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot create program with source
" );
                return 1;
        }

        ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

        kernel = clCreateKernel(program, "data_parallel", &ret);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot create kernel
" );
                return 1;
        }

 ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &Amobj);
        ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &Bmobj);

        size_t global_item_size = RNGE;
        size_t local_item_size = 1;

        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, 0, &global_item_size, &local_item_size, 0, 0, 0 );

        ret = clEnqueueReadBuffer(command_queue, Bmobj, CL_TRUE, 0, RNGE*sizeof(int), B, 0, NULL, NULL);

        printf("Result: ");
        for ( i=2; i < RNGE; i++ ) {
                if ( B[i] ) {
                        printf( "%d ", B[i] );
                }
        }
        printf("
");

        ret = clFlush(command_queue);
        ret = clFinish(command_queue);
        ret = clReleaseKernel(kernel);
        ret = clReleaseProgram(program);
        ret = clReleaseMemObject(Amobj);
        ret = clReleaseMemObject(Bmobj);
        ret = clReleaseCommandQueue(command_queue);
        ret = clReleaseContext(context);

        free(source_str);

        free(A);
        free(B);

        time_f = clock();
        printf("Time elapsed = %7.3fs
", (float) (time_f - time_i)/CLOCKS_PER_SEC);

        return 0;
}

I am not able to find where the real problem lies. CLInfo returns complete information about all 3 devices (2 GPUs + 1 CPU), yet running any code results in a segmentation fault.

Somebody please review the problem.