Hello,
I am a beginner with OpenCL, and I am trying to run a basic kernel, in the aim of processing image analysing function on GPU. For now I just want to try to execute the following kernel, on CPU Intel Sandy Bridge. I am working on visual studio 2010.
The problem is that my kernel is not executed on the whole data. However, trying to debug, I understood that leaving a printf into the kernel make it working perfectly x). I assume it is because it slows down the execution…
Here is the kernel:
__kernel void ker_FLADIntra_sum_c(
const int stride,
__global int* pix,
__global int* p_flad_sum)
{
int i, j;
j = get_global_id(0);
i= get_global_id(1);
/*printf which make it working*/
//printf("j=%d i=%d sum=%d
", j, i, *p_flad_sum);
*p_flad_sum+= abs(pix[j*stride + i] - pix[(j+1)*stride + i]);
}
The host program is mainly based on OpenCL SDK example, and Intel tutorial. Here is how I manage memory and run the kernel:
//create OpenCL buffer using input array memory
g_inputBuffer = clCreateBuffer(g_context, CL_MEM_READ_ONLY, sizeof(cl_int) * arraySize, NULL, NULL);
g_flad_sum = clCreateBuffer(g_context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, NULL);
if (g_inputBuffer == (cl_mem)0)
{
printf("ERROR: Failed to create input data Buffer
");
return false;
}
err = clEnqueueWriteBuffer( g_cmd_queue, g_inputBuffer, CL_TRUE, 0, sizeof(cl_int) * arraySize, inputArray, 0, NULL, NULL );
err = clEnqueueWriteBuffer( g_cmd_queue, g_flad_sum, CL_TRUE, 0, sizeof(cl_int), p_flad_sum, 0, NULL, NULL );
err = clSetKernelArg(g_kernel, 0, sizeof(cl_int), (void *) &stride);
err |= clSetKernelArg(g_kernel, 1, sizeof(cl_mem), (void *) &g_inputBuffer);
err |= clSetKernelArg(g_kernel, 2, sizeof(cl_mem), (void *) &g_flad_sum);
if (err != CL_SUCCESS)
{
printf("ERROR: Failed to set input kernel arguments
");
return false;
}
// set work-item dimensions
size_t global_work_size[2];
global_work_size[0] = (size_t) stride-1; //number of quad items in input array
global_work_size[1] = (size_t) stride; //skip the last row not to go out of allocation
nd=2; // execute kernel (2D)
if (CL_SUCCESS != clEnqueueNDRangeKernel(g_cmd_queue, g_kernel, nd, NULL, global_work_size, NULL, 0, NULL, NULL))
{
printf("ERROR: Failed to execute sorting kernel
");
return false;
}
err = clFinish(g_cmd_queue);
err = clEnqueueReadBuffer( g_cmd_queue, g_flad_sum, CL_TRUE, 0, sizeof(cl_int), p_flad_sum, 0, NULL, NULL );
I also ran VTune to check the timeline threads behaviour with and without this printf:
With
http://img109.imageshack.us/img109/8823/oclfladvtune2.png
I first thought about the blocking or non-blocking memory read, but it is already on CL_TRUE.
What am I missing?
Thanks.