I am getting wrong output from the code below. I suspect the problem is with clEnqueueMapBuffer, but I cannot figure out where I am going wrong…
int main (int argc, char **argv)
{
cl_platform_id platform;
cl_device_id device;
cl_program program;
cl_context context;
cl_mem srcimg, dstimg;
cl_command_queue cmd;
cl_kernel kernel;
cl_int error;
cl_image_format image;
cl_uint num_dev;
// input data
float idata[] = {10, 20, 30, 40, 10, 20, 30, 40, 10, 20, 30, 40, 10, 20, 30,40};
// output data buffer
float *odata = (float*)malloc(sizeof(float) * 16);
// transfer target co-ordinate. For a 2D image 3rd component must be zero
size_t origin[] = {0, 0, 0};
// size of 2D object
size_t region[] = {4, 4, 1};
// size of the kernel source
size_t src_size;
// get the kernel source file and load the kernel source into scr_string
const char* filepath = shrFindFilePath("ImageCopyKernel.cl", NULL);
const char * src_string = oclLoadProgSource(filepath, "", &src_size);
error = oclGetPlatformID(&platform);
error = clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU, 1, &device, &num_dev);
context = clCreateContext(0, num_dev, &device, NULL, NULL, &error);
cmd = clCreateCommandQueue(context, device, 0, &error);
// set the image data type being used and the order
image.image_channel_data_type = CL_FLOAT;
image.image_channel_order = CL_RGBA;
// Create the 2D image and the destination buffer.
srcimg = clCreateImage2D(context,CL_MEM_READ_ONLY, &image, 4, 4, sizeof(cl_float4)*4, idata, &error);
dstimg = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float4)*4*4, odata, &error);
cout<<"dstimg --- ok"<<endl;
// Create program source
program = clCreateProgramWithSource(context, 1, (const char **)&src_string, (const size_t*)&src_size, &error);
// build program
clBuildProgram(program, 1, &device, "", NULL, NULL);
// Create kernel
kernel = clCreateKernel(program, "ImageCopyKernel", &error);
int num = sizeof(idata) / sizeof(float);
//Set kernel arguements
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&srcimg);
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&dstimg);
clSetKernelArg(kernel, 2, sizeof(int), &num);
cin>>x;
size_t local_ws = 4;
size_t global_ws[2] = {4,4};//shrRoundUp(local_ws,
// start the kernel execution
clEnqueueNDRangeKernel(cmd, kernel, 2, 0, global_ws, &local_ws, 0, NULL, NULL);
void * pointer = clEnqueueMapBuffer(cmd, dstimg, CL_TRUE,CL_MAP_READ,sizeof(cl_float4)*4*4, 0, 0, NULL, NULL, &error);
for(int j = 0; j < 16; j++)
cout<<odata[j]<<" ";
clFinish(cmd);
clEnqueueUnmapMemObject(cmd, dstimg, pointer, 0, NULL, NULL);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseMemObject(dstimg);
}
The kernel is defined as follows…
/*
 * Copies the texels of a 2D image into a linear float buffer.
 *
 * Launch with a 2D global range covering the image; each work-item handles
 * the texel at (get_global_id(0), get_global_id(1)) and stores its four
 * channels at dst[4 * (y * width + x)] onwards, i.e. in row-major order.
 *
 * src - image to read; it has to be an image2d_t because the host binds an
 *       image object to this argument, not a buffer
 * dst - destination buffer, four floats per texel
 * num - number of floats dst can hold; texels that would not fit are skipped
 */
__kernel void ImageCopyKernel(__read_only image2d_t src, __global float* dst, const int num)
{
    // Integer, unnormalized coordinates with nearest filtering return the
    // stored texel values unchanged.
    const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE |
                          CLK_ADDRESS_CLAMP_TO_EDGE |
                          CLK_FILTER_NEAREST;

    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int width = get_image_width(src);
    const int height = get_image_height(src);

    // Work-items outside the image have nothing to copy.
    if (x >= width || y >= height)
        return;

    const int texel = y * width + x;

    // Never write past the end of dst.
    if (4 * (texel + 1) > num)
        return;

    vstore4(read_imagef(src, smp, (int2)(x, y)), texel, dst);
}