CPU faster than GPU at vector addition

Hello,

I have recently started working with OpenCL, but I ran into some problems with my first program.

This is the code:


#include <oclUtils.h>
#include "Timer.h"
#define NUM 512

int main()
{
   cl_platform_id platform;
   size_t arrDimension = NUM;   // global work size must be passed as size_t*
   float *arr1 = new float[NUM];
   float *arr2 = new float[NUM];
   float *arr3 = new float[NUM];
   Timer t1;
   cl_int err = oclGetPlatformID(&platform);
   if(err != CL_SUCCESS)
       printf("O eroare la citirea platformei= %s
",oclErrorString(err));

   cl_device_id gpuDevice;
   err = clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU,1,&gpuDevice,NULL);
   if(err != CL_SUCCESS)
        printf("O eroare la conectarea la dispozitivul de calcul= %s
",oclErrorString(err));
   
   cl_context gpuContext;
   gpuContext = clCreateContext(0,1,&gpuDevice,NULL,NULL,&err);
   if(err != CL_SUCCESS)
        printf("O eroare la crearea contextului= %s
",oclErrorString(err));
   
   cl_command_queue gpuCommandQueue;
   gpuCommandQueue = clCreateCommandQueue(gpuContext,gpuDevice,0,&err);
   if(err != CL_SUCCESS)
        printf("O eroare la crearea cozii de comenzi= %s
",oclErrorString(err));

   cl_program program;
   const char *kernelStr = "__kernel void add(__global float* a, __global float* b, __global float* c)\
                     {\
                         unsigned int i = get_global_id(0);\
                         c[i] = a[i] + b[i];\
                     }";
   size_t kernelLength = strlen(kernelStr);
   program = clCreateProgramWithSource(gpuContext,1,&kernelStr,&kernelLength,&err);
   err = clBuildProgram(program,0,NULL,NULL,NULL,NULL);
   if(err != CL_SUCCESS)
        printf("O eroare la compilarea kernelului= %s
",oclErrorString(err));

   cl_kernel kernel = clCreateKernel(program,"add",&err);
   for(int i=0;i<NUM;i++)
   {
       arr1[i] = i;
       arr2[i] = i;
   }
 
   cl_mem buf_a = clCreateBuffer(gpuContext,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(cl_float)*NUM, arr1,&err);
   cl_mem buf_b = clCreateBuffer(gpuContext,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(cl_float)*NUM, arr2,&err);
   cl_mem buf_c = clCreateBuffer(gpuContext,CL_MEM_WRITE_ONLY,sizeof(cl_float)*NUM,NULL,&err); // host_ptr must be NULL without CL_MEM_COPY/USE_HOST_PTR

   err = clSetKernelArg(kernel,0,sizeof(cl_mem),(void *)&buf_a);
   err = clSetKernelArg(kernel,1,sizeof(cl_mem),(void *)&buf_b);
   err = clSetKernelArg(kernel,2,sizeof(cl_mem),(void *)&buf_c);
   clFinish(gpuCommandQueue);
   t1.start();
   
   err = clEnqueueNDRangeKernel(gpuCommandQueue,kernel,1,NULL,&arrDimension,0,0,0,0);
   clEnqueueReadBuffer(gpuCommandQueue,buf_c,CL_TRUE,0, NUM*sizeof(cl_float),arr3,0,0,0);
	//for(int i=0;i<NUM;i++)
	//{
	//arr3[i]=arr1[i] +arr2[i];
	//}

printf("Time: %f
",t1.getElapsedTimeInMicroSec());
   clReleaseMemObject(buf_a);
   clReleaseMemObject(buf_b);
   clReleaseMemObject(buf_c);


  // for(int i=0;i<NUM;i++)
  // {
  //printf("arr3[%d]=%.0f
",i,arr3[i]);
  // }
}

The thing is that the CPU processing time is less than the GPU processing time. What am I doing wrong?

Help would be greatly appreciated.

You’re not doing anything wrong: your expectations are.

Array addition is completely memory bound. All you're doing is timing a local array read/write (and it's so small it sits entirely in cache) versus a read from host memory, a copy across the PCIe bus, an array read/write in device memory, then a copy back across the PCIe bus and a write to host memory again.

(Not to mention queuing up a job and launching a task on the GPU - that alone probably takes longer than adding 512 elements.)

Of course the GPU will be slower at this - you have far more memory movement on top of such a tiny bit of work to do.
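
If you want to see where the time actually goes, split the measurement: create the command queue with profiling enabled and read the kernel's own start/end timestamps. This is only a rough sketch, reusing the names from your listing (gpuContext, gpuDevice, kernel, arrDimension, err); profQueue, kernelEvent, tStart and tEnd are names I made up for the example.

   // recreate the queue with profiling enabled so device-side timestamps are recorded
   cl_command_queue profQueue = clCreateCommandQueue(gpuContext, gpuDevice,
                                                     CL_QUEUE_PROFILING_ENABLE, &err);

   cl_event kernelEvent;
   err = clEnqueueNDRangeKernel(profQueue, kernel, 1, NULL, &arrDimension, NULL,
                                0, NULL, &kernelEvent);
   clWaitForEvents(1, &kernelEvent);

   // device timestamps in nanoseconds: this is the kernel alone, no PCIe transfers
   cl_ulong tStart = 0, tEnd = 0;
   clGetEventProfilingInfo(kernelEvent, CL_PROFILING_COMMAND_START,
                           sizeof(cl_ulong), &tStart, NULL);
   clGetEventProfilingInfo(kernelEvent, CL_PROFILING_COMMAND_END,
                           sizeof(cl_ulong), &tEnd, NULL);
   printf("Kernel only: %f ms\n", (tEnd - tStart) * 1e-6);
   clReleaseEvent(kernelEvent);

Time the clEnqueueReadBuffer the same way (it also returns an event) and you will see the transfer dwarfing the addition.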

OK, I understand what you are saying, so I fed more data to the GPU. The result is the same: the CPU outruns the GPU by an order of magnitude.


#include <oclUtils.h>
#include "Timer.h"
#define NUM  1144477

int main()
{
   cl_platform_id platform;
   cl_uint arrDimension = NUM;
   

   size_t szGlobalWorkSize;
   size_t szLocalWorkSize=32;
   szGlobalWorkSize = shrRoundUp ((int) szLocalWorkSize, NUM); 
   printf("Global Work Size = %d 
",(int)szGlobalWorkSize);
   
   float *arr1 = (float*) malloc (sizeof (cl_float) * szGlobalWorkSize);
   float *arr2 = (float*) malloc (sizeof (cl_float) * szGlobalWorkSize);
   float *arr3 = (float*) malloc (sizeof (cl_float) * szGlobalWorkSize);
   Timer t1;
   cl_int err = oclGetPlatformID(&platform);
   if(err != CL_SUCCESS)
       printf("O eroare la citirea platformei= %s
",oclErrorString(err));

   cl_device_id gpuDevice;
   err = clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU,1,&gpuDevice,NULL);
   if(err != CL_SUCCESS)
        printf("O eroare la conectarea la dispozitivul de calcul= %s
",oclErrorString(err));
   // CL_DEVICE_MAX_WORK_ITEM_SIZES returns an array of size_t, not a string
   size_t maxWorkItemSizes[3];
   err = clGetDeviceInfo(gpuDevice,CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(maxWorkItemSizes),maxWorkItemSizes,NULL);
   if(err != CL_SUCCESS)
        printf("Error querying the compute device: %s\n", oclErrorString(err));
   printf("Max work-item sizes: %d x %d x %d\n",
          (int)maxWorkItemSizes[0], (int)maxWorkItemSizes[1], (int)maxWorkItemSizes[2]);

   cl_context gpuContext;
   gpuContext = clCreateContext(0,1,&gpuDevice,NULL,NULL,&err);
   if(err != CL_SUCCESS)
        printf("O eroare la crearea contextului= %s
",oclErrorString(err));
   
   cl_command_queue gpuCommandQueue;
   gpuCommandQueue = clCreateCommandQueue(gpuContext,gpuDevice,0,&err);
   if(err != CL_SUCCESS)
        printf("O eroare la crearea cozii de comenzi= %s
",oclErrorString(err));

   cl_program program;
   const char *kernelStr = "__kernel void add(__global float* a, __global float* b, __global float* c)\
                     {\
                         unsigned int i = get_global_id(0);\
                         c[i] = a[i] - b[i];\
                     }";
   size_t kernelLength = strlen(kernelStr);
   program = clCreateProgramWithSource(gpuContext,1,&kernelStr,&kernelLength,&err);
   err = clBuildProgram(program,0,NULL,NULL,NULL,NULL);
   if(err != CL_SUCCESS)
        printf("O eroare la compilarea kernelului= %s
",oclErrorString(err));

   cl_kernel kernel = clCreateKernel(program,"add",&err);
   for(int i=0;i<NUM;i++)
   {
       arr1[i] = i;
       arr2[i] = NUM-i;
   }
 
   cl_mem buf_a = clCreateBuffer(gpuContext,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(cl_float)*szGlobalWorkSize, arr1,&err);
   cl_mem buf_b = clCreateBuffer(gpuContext,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(cl_float)*szGlobalWorkSize, arr2,&err);
   cl_mem buf_c = clCreateBuffer(gpuContext,CL_MEM_WRITE_ONLY,sizeof(cl_float)*szGlobalWorkSize,NULL,&err); // host_ptr must be NULL without CL_MEM_COPY/USE_HOST_PTR

   err = clSetKernelArg(kernel,0,sizeof(cl_mem),(void *)&buf_a);
   err = clSetKernelArg(kernel,1,sizeof(cl_mem),(void *)&buf_b);
   err = clSetKernelArg(kernel,2,sizeof(cl_mem),(void *)&buf_c);
   clFinish(gpuCommandQueue);
   t1.start();
   
   err = clEnqueueNDRangeKernel(gpuCommandQueue,kernel,1,NULL,&szGlobalWorkSize,&szLocalWorkSize,0,0,0);
   clEnqueueReadBuffer(gpuCommandQueue,buf_c,CL_TRUE,0, szGlobalWorkSize*sizeof(cl_float),arr3,0,0,0);
	//for(int i=0;i<NUM;i++)
	//{
	//arr3[i]=arr1[i] - arr2[i];
	//}

   printf("Time: %f
",t1.getElapsedTimeInMilliSec());
   clReleaseMemObject(buf_a);
   clReleaseMemObject(buf_b);
   clReleaseMemObject(buf_c);


}

Is this right? And if so, where can I expect greater performance from the GPU? The goal of my OpenCL study is to implement a faster DES encryption algorithm that runs on the GPU. Is this possible?

Unfortunately, no, you didn't. A bigger problem just means more memory to copy; it's still the memory-copy time you're measuring, and copying more data just means there's more of it to time. You do reduce the kernel-launch overhead relative to the cost of each item, but the memory copy is the real problem here.

More work means more work per memory item, not more memory items with no work involved.
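
To make that concrete, here is a hedged sketch of a kernel with more arithmetic per element (the name add_heavy and the 256-iteration loop are made up for illustration, nothing to do with DES): it reads and writes exactly the same three floats per work-item as your kernel, but does a few hundred multiply-adds on them instead of one operation.

__kernel void add_heavy(__global const float* a, __global const float* b, __global float* c)
{
    unsigned int i = get_global_id(0);
    float x = a[i];
    float y = b[i];
    float acc = 0.0f;
    /* 256 multiply-adds per element: same memory traffic, ~256x the ALU work */
    for (int k = 0; k < 256; k++)
        acc = acc * x + y;
    c[i] = acc;
}

That is the shape of workload where the GPU's ALUs get a chance to hide the memory traffic; a single add never will.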

If you want to compare GPU time, just time the kernel call without the memory copies (bracket it with clFinish()). That won't leave the result on the CPU, but a kernel this simple would normally feed its result to another kernel anyway rather than come back to the CPU, so ignoring the copies from host to device and back is entirely valid.
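
In your second listing that bracketing looks roughly like this (a sketch using the same Timer, queue and buffers as your code, with the read-back moved outside the timed region):

   clFinish(gpuCommandQueue);          // make sure setup and host-to-device copies are done
   t1.start();
   err = clEnqueueNDRangeKernel(gpuCommandQueue, kernel, 1, NULL,
                                &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
   clFinish(gpuCommandQueue);          // wait for the kernel itself, nothing else
   printf("Kernel only: %f ms\n", t1.getElapsedTimeInMilliSec());

   // result copied back outside the timed region
   clEnqueueReadBuffer(gpuCommandQueue, buf_c, CL_TRUE, 0,
                       szGlobalWorkSize*sizeof(cl_float), arr3, 0, NULL, NULL);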

Is this right? And if so, where can I expect greater performance from the GPU? The goal of my OpenCL study is to implement a faster DES encryption algorithm that runs on the GPU. Is this possible?

You can expect a faster result if you're doing more work. GPUs have hundreds of times the ALU performance of a CPU of the same generation, but if all you're doing is a single addition you're hardly exercising that.

Good info.

Thank you for your replies.