Hi,
I have a kernel that I am trying to optimize, and I am seeing something strange. My kernel takes one image as input. For each pixel (c,r), it reads 12 values from the image in the neighborhood of (c,r) (not the direct neighborhood). From these values, I compute two floats, “L_xx” and “L_yy”. So far so good. Then I write the result of “L_xx * L_yy” to the output buffer (coalesced write).
When I do this, the computation takes about 1200 microseconds. When I write “0.f” instead of “L_xx * L_yy”, the computation takes 150 microseconds! Why does “L_xx * L_yy” account for about 90% of the kernel's run time when I already do a lot of floating-point operations? I really don’t get it.
I have pasted the code below. See the last two lines. Thanks!
/*
 * Samples the image at the four corners of a box around (col, row) and
 * returns lowerRight - upperRight - lowerLeft + upperLeft, using the .x
 * channel of each sample.
 *
 * The box corners are:
 *   left   = col - halfWidth  - 1      right  = left + 2 * halfWidth  + 1
 *   top    = row - halfHeight - 1      bottom = top  + 2 * halfHeight + 1
 *
 * NOTE(review): the corner formula looks like a box sum over a summed-area
 * table (integral image) - confirm that `input` actually holds one.
 *
 * input      image that is sampled
 * width      not used by this function
 * height     not used by this function
 * col, row   centre of the box
 * halfWidth  horizontal half-extent of the box
 * halfHeight vertical half-extent of the box
 * sampler    sampler handed to read_imagef; how coordinates outside the
 *            image are treated depends on it
 */
float integrate_surface(__read_only image2d_t input,
                        const uint width,
                        const uint height,
                        const int col,
                        const int row,
                        const int halfWidth,
                        const int halfHeight,
                        const sampler_t sampler){
    // Box corners; the far edges are derived from the near ones.
    const int x0 = col - halfWidth - 1;
    const int y0 = row - halfHeight - 1;
    const int x1 = x0 + 2 * halfWidth + 1;
    const int y1 = y0 + 2 * halfHeight + 1;

    const float upperLeft  = read_imagef(input, sampler, (int2)(x0, y0)).x;
    const float upperRight = read_imagef(input, sampler, (int2)(x1, y0)).x;
    const float lowerLeft  = read_imagef(input, sampler, (int2)(x0, y1)).x;
    const float lowerRight = read_imagef(input, sampler, (int2)(x1, y1)).x;

    // Accumulate in a fixed order: lowerRight - upperRight - lowerLeft + upperLeft.
    float sum = lowerRight - upperRight;
    sum -= lowerLeft;
    sum += upperLeft;
    return sum;
}
/*
 * For the work-item at (c, r) = (get_global_id(0), get_global_id(1)),
 * combines four integrate_surface() results into L_xx and L_yy and stores
 * their product at output[r * width + c]. Work-items outside the
 * width x height range return without writing.
 *
 * input    image passed through to integrate_surface()
 * output   destination buffer, indexed as r * width + c
 * width    number of columns processed; also the row stride of output
 * height   number of rows processed
 * h        base size from which the box extents are derived (l = 2 * h + 1)
 * offset   not used by this kernel
 * sampler  sampler passed through to integrate_surface()
 */
__kernel void compute(__read_only image2d_t input,
                      __global float * output,
                      const uint width,
                      const uint height,
                      const uint h,
                      const int offset,
                      const sampler_t sampler){
    int c = get_global_id(0);
    int r = get_global_id(1);

    // Use '>=' instead of '> width-1': width and height are unsigned, so
    // 'width-1' wraps to UINT_MAX when width is 0 and the guard would never
    // fire, letting the store below run out of bounds.
    if (c >= width || r >= height)
        return;

    int l = 2 * h + 1;

    // Compute Lxx: wide box minus three times the narrow box (same height).
    float white_xx = integrate_surface(input, width, height, c, r, h+l, l-1, sampler);
    float black_xx = integrate_surface(input, width, height, c, r, h, l-1, sampler);
    float L_xx = white_xx - 3.f * black_xx;

    // Compute Lyy: same as Lxx with the two extents swapped.
    float white_yy = integrate_surface(input, width, height, c, r, l-1, h+l, sampler);
    float black_yy = integrate_surface(input, width, height, c, r, l-1, h, sampler);
    float L_yy = white_yy - 3.f * black_yy;

    // Compute result.
    // NOTE(review): with the constant store on the commented-out line, nothing
    // written depends on the image reads above, so the compiler is presumably
    // free to drop them entirely - the likely reason that variant is much faster.
    output[r*width+c] = L_xx * L_yy;
    // output[r*width+c] = 0.f;
}