User time of execution almost the same on CPU and GPU?

Running the code below to find prime numbers gave the following benchmarks:

Device: CPU
Real time: approx. 3 sec
User time: approx. 32 sec

Device: GPU
Real time: approx. 37 sec
User time: approx. 32 sec

Why is the user time on the GPU not less than on the CPU, and why does no speedup occur?

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <time.h>

#define MAX_SOURCE_SIZE (0x10000000)
#define RNGE (100000)

int main()
{
        cl_platform_id platform_id = NULL;
        cl_device_id device_id = NULL;
        cl_context context = NULL;
        cl_command_queue command_queue = NULL;
        cl_mem Amobj = NULL;
        cl_mem Bmobj = NULL;
        cl_mem Cmobj = NULL;
        cl_program program = NULL;
        cl_kernel kernel = NULL;
        cl_uint ret_num_devices;
        cl_uint ret_num_platforms;
        cl_int ret;

        clock_t time_i, time_f;

        time_i = clock();

        int i;
        int j;
        int *A;
        int *B;

        A = (int *) malloc( RNGE * sizeof(int) );
        B = (int *) malloc( RNGE * sizeof(int) );

        FILE *fp;
        const char fileName[] = "chk_mod.cl";
        size_t source_size;
        char *source_str;

        fp = fopen(fileName, "r");
        if ( ! fp ) {
                fprintf(stderr, "Failed to load kernel.
");
                exit(1);
        }
        source_str = (char *) malloc(MAX_SOURCE_SIZE);
        source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
        fclose(fp);

        for ( i=0; i < RNGE; i++ ) {
                        A[ i ] = i;
                        B[ i ] = A[i];
        }

        ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot get platform id.
" );
                return 1;
        }


        ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot get device id.
" );
                return 1;
        }


        context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot create context
" );
                return 1;
        }

        command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot create command queue.
" );
                return 1;
        }

        Amobj = clCreateBuffer(context, CL_MEM_READ_ONLY, RNGE*sizeof(int), NULL, &ret);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot create buffer A
" );
                return 1;
        }

        Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, RNGE*sizeof(int), NULL, &ret);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot create buffer B
" );
                return 1;
        }


        ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, RNGE*sizeof(int), A, 0, NULL, NULL);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot enqueue in write buffer.
" );
                return 1;
        }

        ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, RNGE*sizeof(int), B, 0, NULL, NULL);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot enqueue in write buffer.
" );
                return 1;
        }


        program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot create program with source
" );
                return 1;
        }

        ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
        if( ret != CL_SUCCESS ) {
                printf( "Error: Cannot build program.\n" );
                return 1;
        }

        kernel = clCreateKernel(program, "data_parallel", &ret);
        if( ret != CL_SUCCESS ) {
                printf( "Error : Cannot create kernel
" );
                return 1;
        }

        ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &Amobj);
        ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &Bmobj);

        size_t global_item_size = RNGE;
        size_t local_item_size = 1;

        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);

        ret = clEnqueueReadBuffer(command_queue, Bmobj, CL_TRUE, 0, RNGE*sizeof(int), B, 0, NULL, NULL);

        printf("Result: ");
        for ( i=2; i < RNGE; i++ ) {
                if ( B[i] ) {
                        printf( "%d ", B[i] );
                }
        }
        printf("
");

        ret = clFlush(command_queue);
        ret = clFinish(command_queue);
        ret = clReleaseKernel(kernel);
        ret = clReleaseProgram(program);
        ret = clReleaseMemObject(Amobj);
        ret = clReleaseMemObject(Bmobj);
        ret = clReleaseCommandQueue(command_queue);
        ret = clReleaseContext(context);

        free(source_str);

        free(A);
        free(B);

        time_f = clock();
        printf("Time elapsed = %7.3fs
", (float) (time_f - time_i)/CLOCKS_PER_SEC);

        return 0;
} 


The kernel file (chk_mod.cl) is:
__kernel void data_parallel( __global int *A, __global int *B )
{
   int t;
   int i = get_global_id(0);   /* each work-item tests one candidate number */

   /* trial division: clear B[i] if any t in [2, i) divides i, so composites
      are zeroed and primes keep the value the host preloaded (A is unused) */
   for ( t = 2; t < i; t++ ) {
      if ( i % t == 0 ) {
         B[ i ] = 0;
      }
   }
}

System specifications: 64-bit CentOS 5.3 with two ATI Radeon 5970 graphics cards and an Intel Core i7 processor (12 cores).

Ah, so that's what that loop does. So presumably you worked out your earlier issue (installation problems)?

Short answer: because your OpenCL code just isn't very good.

Longer answer: OpenCL only provides the means to run code on different devices; it is up to you to optimise for each device's specific characteristics, or even to schedule work across multiple devices. You can't expect a direct translation of a CPU algorithm to give peak performance. Individual GPU processors are slow(ish), but there are hundreds of them, so performance is gained through the right kind of parallelism.
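
To give one concrete example from the posted code (my guess at a major culprit, not something I have measured): local_item_size is set to 1, which forces work-groups of a single work-item and leaves most of each SIMD unit idle. Passing NULL for the local work size lets the runtime pick a group size that suits the device:

size_t global_item_size = RNGE;

/* NULL local size: let the OpenCL runtime choose a work-group size
   for the device, instead of forcing groups of one work-item */
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                             &global_item_size, NULL, 0, NULL, NULL);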

And at the end of the day, not every algorithm can be made to match their topology, although this attempt still has plenty of room for improvement.

I might also add that integer division is as slow as a wet week on all existing GPU hardware, much, much slower than floating point (orders of magnitude). Always use floats for arithmetic if you possibly can.
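
For instance, here is one safe way to cash in on floats, a sketch of a slightly different technique than replacing the division itself: trial division only needs to test divisors up to sqrt(i), and that bound can be computed once with the float sqrt() built-in. That cuts the number of slow % operations from roughly i down to roughly sqrt(i):

int i = get_global_id(0);

/* bound the search at sqrt(i), computed once in float;
   the +1 absorbs the few-ulp error OpenCL permits for sqrt(),
   and t < i stops a candidate from being divided by itself */
int bound = (int) sqrt((float) i) + 1;
for (int t = 2; t <= bound && t < i; t++) {
   if (i % t == 0) {
      B[i] = 0;
   }
}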

Yes, the installation issues are worked out.

Finding prime numbers is a set of small, independent tasks, so why is the GPU not able to outperform the CPU?

Could you please help me with the issue?

Same answer: your code just isn't very good; it has to be tuned to the hardware.
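
One way to see where the time is actually going (clock() on the host is a poor guide here: it measures the process's CPU time, which can include the driver busy-waiting while the GPU works) is OpenCL's built-in event profiling. A sketch using the standard API, dropped into your host code:

/* create the queue with profiling enabled */
command_queue = clCreateCommandQueue(context, device_id,
                                     CL_QUEUE_PROFILING_ENABLE, &ret);

cl_event ev;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                             &global_item_size, NULL, 0, NULL, &ev);
clWaitForEvents(1, &ev);

cl_ulong start, end;   /* timestamps in nanoseconds */
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
                        sizeof(start), &start, NULL);
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
                        sizeof(end), &end, NULL);
printf("Kernel time: %.3f ms\n", (end - start) * 1e-6);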

Well, I already suggested using floats, as integer division is more than 10x slower than float division on GPUs (I'm not sure whether you can here, since there may be numerical stability issues). Apart from that, the loop itself does much more work than necessary (hint: once you have a result, you can stop searching); see the sketch below.
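
A minimal sketch of that hint, keeping the posted kernel's arguments: break out of the loop as soon as the first divisor turns up, so composites with small factors (the vast majority of numbers) finish after a handful of iterations instead of i of them:

__kernel void data_parallel( __global int *A, __global int *B )
{
   int i = get_global_id(0);

   for (int t = 2; t < i; t++) {
      if (i % t == 0) {
         B[i] = 0;
         break;   /* first divisor found: i is composite, stop searching */
      }
   }
}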

It is unclear why you're trying this particular problem, as it seems quite pointless; is it for coursework? If so, it is your task to work this out.

Look at the SDK examples and read the programming guides from the various vendors (AMD, NVIDIA, etc.); various blogs on the internet are also only a search away. Depending on your level, an introductory book may be more useful; again, search the internet or these forums for recommendations.