Error with clEnqueueMapBuffer: getting wrong results

I am getting wrong output from the code below. I suspect the problem is in my call to clEnqueueMapBuffer, but I can't figure out where I am going wrong.


int main (int argc, char **argv)
{
	cl_platform_id platform;
	cl_device_id device;
	cl_program program;
	cl_context context;
	cl_mem srcimg, dstimg;
	cl_command_queue cmd;
	cl_kernel kernel;
	cl_int error;
	cl_image_format image;
	cl_uint num_dev;

	// input data
	float idata[] = {10, 20, 30, 40, 10, 20, 30, 40, 10, 20, 30, 40, 10, 20, 30,40};
	// output data buffer
	float *odata = (float*)malloc(sizeof(float) * 16);
	// transfer target co-ordinate. For a 2D image 3rd component must be zero
	size_t origin[] = {0, 0, 0};
	// size of 2D object 
	size_t region[] = {4, 4, 1};

	// size of the kernel source
	size_t src_size;
	
	// get the kernel source file and load the kernel source into scr_string
	const char* filepath = shrFindFilePath("ImageCopyKernel.cl", NULL);
	const char * src_string = oclLoadProgSource(filepath, "", &src_size);
	
	error = oclGetPlatformID(&platform);

	error = clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU, 1, &device, &num_dev);
	context = clCreateContext(0, num_dev, &device, NULL, NULL, &error);
	cmd = clCreateCommandQueue(context, device, 0, &error);
	// set the image data type being used and the order
	image.image_channel_data_type = CL_FLOAT;
	image.image_channel_order = CL_RGBA;
	// Create the 2D image and the destination buffer. 
	srcimg = clCreateImage2D(context,CL_MEM_READ_ONLY, &image, 4, 4, sizeof(cl_float4)*4, idata, &error);
	
	dstimg = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_float4)*4*4, odata, &error);
	cout<<"dstimg --- ok"<<endl;
	// Create program source
	program = clCreateProgramWithSource(context, 1, (const char **)&src_string, (const size_t*)&src_size, &error); 
		// build program
    clBuildProgram(program, 1, &device, "", NULL, NULL);
	// Create kernel 
	kernel = clCreateKernel(program, "ImageCopyKernel", &error);
	int num = sizeof(idata) / sizeof(float);
	//Set kernel arguements
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&srcimg);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&dstimg);
	clSetKernelArg(kernel, 2, sizeof(int), &num);
	cin>>x;
	size_t local_ws = 4;
	size_t global_ws[2] = {4,4};//shrRoundUp(local_ws, 
	
	// start the kernel execution
	clEnqueueNDRangeKernel(cmd, kernel, 2, 0, global_ws, &local_ws, 0, NULL, NULL);

	void * pointer = clEnqueueMapBuffer(cmd, dstimg, CL_TRUE,CL_MAP_READ,sizeof(cl_float4)*4*4, 0, 0, NULL, NULL, &error);
	for(int j = 0; j < 16; j++)
		cout<<odata[j]<<"  ";
	clFinish(cmd);
	clEnqueueUnmapMemObject(cmd, dstimg, pointer, 0, NULL, NULL);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseMemObject(dstimg);
}

The kernel is defined as follows…


/*
 * Copies the first `num` floats of `src` into `dst`.
 *
 * src: input buffer, at least `num` floats.
 * dst: output buffer, at least `num` floats.
 * num: number of elements to copy.
 *
 * Each work-item starts at its global id in dimension 0 and advances by the
 * global size of dimension 0, so every element below `num` is copied no
 * matter how many work-items are launched. Work-items whose id is >= num
 * write nothing. Only dimension 0 is consulted; extra dimensions in the
 * NDRange just repeat the same copies.
 */
__kernel void ImageCopyKernel(__global const float *src, __global float* dst, const int num)
{
	const int first = get_global_id(0);
	const int stride = get_global_size(0);
	int i;
	/* The loop counter is the element index, starting at this item's id. */
	for (i = first; i < num; i += stride)
		dst[i] = src[i];
}