Hello,
I have recently started working with OpenCL, but I ran into some problems with my first program.
This is the code:
#include <oclUtils.h>
#include "Timer.h"
#define NUM 512
int main()
{
cl_platform_id platform;
cl_uint arrDimension = NUM;
float *arr1 = new float[NUM];
float *arr2 = new float[NUM];
float *arr3 = new float[NUM];
Timer t1;
cl_int err = oclGetPlatformID(&platform);
if(err != CL_SUCCESS)
printf("O eroare la citirea platformei= %s
",oclErrorString(err));
cl_device_id gpuDevice;
err = clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU,1,&gpuDevice,NULL);
if(err != CL_SUCCESS)
printf("O eroare la conectarea la dispozitivul de calcul= %s
",oclErrorString(err));
cl_context gpuContext;
gpuContext = clCreateContext(0,1,&gpuDevice,NULL,NULL,&err);
if(err != CL_SUCCESS)
printf("O eroare la crearea contextului= %s
",oclErrorString(err));
cl_command_queue gpuCommandQueue;
gpuCommandQueue = clCreateCommandQueue(gpuContext,gpuDevice,0,&err);
if(err != CL_SUCCESS)
printf("O eroare la crearea cozii de comenzi= %s
",oclErrorString(err));
cl_program program;
const char *kernelStr = "__kernel void add(__global float* a, __global float* b, __global float* c)\
{\
unsigned int i = get_global_id(0);\
c[i] = a[i] + b[i];\
}";
size_t kernelLength = strlen(kernelStr);
program = clCreateProgramWithSource(gpuContext,1,&kernelStr,&kernelLength,&err);
err = clBuildProgram(program,0,NULL,NULL,NULL,NULL);
if(err != CL_SUCCESS)
printf("O eroare la compilarea kernelului= %s
",oclErrorString(err));
cl_kernel kernel = clCreateKernel(program,"add",&err);
for(int i=0;i<NUM;i++)
{
arr1[i] = i;
arr2[i] = i;
}
cl_mem buf_a = clCreateBuffer(gpuContext,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(cl_float)*NUM, arr1,&err);
cl_mem buf_b = clCreateBuffer(gpuContext,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(cl_float)*NUM, arr2,&err);
cl_mem buf_c = clCreateBuffer(gpuContext,CL_MEM_WRITE_ONLY,sizeof(cl_float)*NUM,arr3,&err);
err = clSetKernelArg(kernel,0,sizeof(cl_mem),(void *)&buf_a);
err = clSetKernelArg(kernel,1,sizeof(cl_mem),(void *)&buf_b);
err = clSetKernelArg(kernel,2,sizeof(cl_mem),(void *)&buf_c);
clFinish(gpuCommandQueue);
t1.start();
err = clEnqueueNDRangeKernel(gpuCommandQueue,kernel,1,NULL,&arrDimension,0,0,0,0);
clEnqueueReadBuffer(gpuCommandQueue,buf_c,CL_TRUE,0, NUM*sizeof(cl_float),arr3,0,0,0);
//for(int i=0;i<NUM;i++)
//{
//arr3[i]=arr1[i] +arr2[i];
//}
printf("Time: %f
",t1.getElapsedTimeInMicroSec());
clReleaseMemObject(buf_a);
clReleaseMemObject(buf_b);
// for(int i=0;i<NUM;i++)
// {
//printf("arr3[%d]=%.0f
",i,arr3[i]);
// }
}
The thing is that the CPU processing time is less than the GPU processing time. What am I doing wrong?
Help would be greatly appreciated.