Hi,
I am pretty new to OpenCL.
I am trying to parallelize the following snippet:
for ( ; i < ncols - radius ; i++) {
ppp = ptrrow + i - radius;
sum = 0.0;
//for (k = kernel.width-1 ; k >= 0 ; k--)
//sum += *ppp++ * kernel.data[k];
sum = ppp[0] * kernel.data[kernel.width-1] + ppp[1] * kernel.data[kernel.width-2]
+ ppp[2] * kernel.data[kernel.width-3] + ppp[3] * kernel.data[kernel.width-4]
+ ppp[4] * kernel.data[kernel.width-5] + ppp[5] * kernel.data[kernel.width-6]
+ ppp[6] * kernel.data[kernel.width-7];
*ptrout++ = sum;
}
I have written the following kernel for the same:
__kernel void convolveImageHoriz(__global float *ppp, __global float *ptrrow, __global float *ptrout, __global float *Kernel, int nrows, int width, int ncols)
{
int i = get_global_id(0);
int radius = width/2;
float sum;
if (i >= radius && i < nrows-radius)
{
ppp = ptrrow + i - radius;
sum = 0.0;
sum = ppp[0] * Kernel[width-1] + ppp[1] * Kernel[width-2]
+ ppp[2] * Kernel[kernel.width-3] + ppp[3] * Kernel[width-4]
+ ppp[4] * Kernel[width-5] + ppp[5] * Kernel[width-6]
+ ppp[6] * Kernel[width-7];
ptrout[i] = sum;
}
}
I know for a fact that there is some issue in the Kernel as the code runs fine when I run some other Kernel.
Could someone help?