Different output on GPU vs CPU

I’m in the process of writing a 2D (non-separable) convolution routine. I’m trying a couple of approaches to optimizing the routine, one of which is to use images/textures. I’ve written a test kernel (it only handles filter widths that are a multiple of 8 at the moment, but I’m just trying to get baseline measurements), and it runs great on the CPU device. Changing to the GPU, however, yields a totally different result (and literally all I’m changing is the C to a G in the context creation call). Here is the code for the kernel:


const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                          CLK_ADDRESS_CLAMP_TO_EDGE |
                          CLK_FILTER_NEAREST;
                          
kernel void filter(read_only image2d_t image,
                   constant float *filter,
                   int filt_width, int filt_height,
                   write_only image2d_t output) {
    int2 ic = (int2)(get_global_id(0), get_global_id(1));
    int2 fc;
    
    int filt_indx = 0;
    filt_width /= 4;    // filter width arrives in floats; convert to a count of float4 elements
    
    float4 sum = 0;
    
    // Filter widths that are a multiple of 8 floats (an even number of float4s)
    if (filt_width%2 == 0) {
        int half_filt_width = filt_width/2;
        int half_filt_height = filt_height/2;
        
        float4 imval1, imval2, fval;
        
        for (fc.y = -half_filt_height; fc.y < half_filt_height; fc.y++) {
            fc.x = -half_filt_width;
            
            // Slide a two-read window (imval1, imval2) across the row; each
            // filter tap vector is applied to four overlapping one-float
            // shifts of the window, accumulating four adjacent outputs in sum.
            imval2 = read_imagef(image, sampler, ic + fc);
            for (fc.x++; fc.x <= half_filt_width; fc.x++) {
                imval1 = imval2;
                imval2 = read_imagef(image, sampler, ic + fc);
                fval = vload4(filt_indx++, filter);
                
                sum.x += fval.x*imval1.x;
                sum.x += fval.y*imval1.y;
                sum.x += fval.z*imval1.z;
                sum.x += fval.w*imval1.w;
                
                sum.y += fval.x*imval1.y;
                sum.y += fval.y*imval1.z;
                sum.y += fval.z*imval1.w;
                sum.y += fval.w*imval2.x;
                
                sum.z += fval.x*imval1.z;
                sum.z += fval.y*imval1.w;
                sum.z += fval.z*imval2.x;
                sum.z += fval.w*imval2.y;
                
                sum.w += fval.x*imval1.w;
                sum.w += fval.y*imval2.x;
                sum.w += fval.z*imval2.y;
                sum.w += fval.w*imval2.z;
            }
        }
    }
    else {
        // Filter widths that are an odd number of float4s (i.e. not a
        // multiple of 8 floats) are not handled yet.
    }
    
    write_imagef(output, ic, sum);
}

The code I’m running it with is pretty straightforward (I load a JPEG file, construct the convolution filter, set up the image for the output, enqueue the kernel, read the output back out). For instance, here is how the images are constructed (I’m using the C++ bindings):


    cl::Image2D im1Buf(context, CL_MEM_COPY_HOST_PTR,
                       cl::ImageFormat(CL_RGBA, CL_FLOAT),
                       image1.width()/nelems, image1.height(),
                       image1.width()*sizeof(float),
                       &image1.data()[0]);
    
    // row_pitch is omitted (defaults to 0); it must be 0 when no host pointer is supplied
    cl::Image2D output(context, CL_MEM_READ_WRITE,
                       cl::ImageFormat(CL_RGBA, CL_FLOAT),
                       image1.width()/nelems, image1.height());
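
The rest of the host code is the usual pattern and looks roughly like this (just a sketch continuing the snippet above; program is assumed to already be built from the kernel source, and names like filt, filt_width, filt_height and result are placeholders for my actual variables):


    // Filter coefficients go into an ordinary buffer (copied from a flat float array)
    cl::Buffer filterBuf(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                         filt_width*filt_height*sizeof(float), &filt[0]);
    
    // filt_width and filt_height are ints, matching the kernel signature
    cl::Kernel kern(program, "filter");
    kern.setArg(0, im1Buf);
    kern.setArg(1, filterBuf);
    kern.setArg(2, filt_width);
    kern.setArg(3, filt_height);
    kern.setArg(4, output);
    
    cl::CommandQueue queue(context, context.getInfo<CL_CONTEXT_DEVICES>()[0]);
    
    // One work-item per float4 pixel of the output image
    queue.enqueueNDRangeKernel(kern, cl::NullRange,
                               cl::NDRange(image1.width()/nelems, image1.height()),
                               cl::NullRange);
    
    // Read the filtered image back out
    std::vector<float> result(image1.width()*image1.height());
    cl::size_t<3> origin, region;
    origin[0] = origin[1] = origin[2] = 0;
    region[0] = image1.width()/nelems;
    region[1] = image1.height();
    region[2] = 1;
    queue.enqueueReadImage(output, CL_TRUE, origin, region, 0, 0, &result[0]);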

As I said, this works great on the CPU, and flies like the wind on the GPU, but doesn’t get the correct result. Any ideas what might be going on?

It would be easier to help you if you could simplify your example as much as possible and post the simplified version.

Thanks for the reply… I figured out what was going on a while back but forgot to post a follow-up note. It turns out there’s a bug in Apple’s OpenCL implementation involving reads from constant memory inside a doubly nested for loop (all such reads return 0). The following simplified code illustrates the bug:


#include <stdlib.h>
#include <stdio.h>

#include <OpenCL/cl.h>
#include <OpenCL/cl_ext.h>

const char *kernelSrc =
"kernel void filter(constant float *filter," // Making filter global instead of constant works
"                   global float *output) {"
"    size_t filt_indx = 0;"
"    float4 test = 0;"
"    float4 fval;"
"    "
"    for (int j = 0; j < 1; j++) {" // Changing this line or the one below
"        for (int i = 0; i < 1; i++) {" // to "if (true) {" also works
"            fval = vload4(filt_indx++, filter);" // Replacing filt_indx++ with 0 works, though avoiding vload4 doesn't
"            test = fval;"
"        }"
"    }"
"    "
"    vstore4(test, 0, output);"
"}"
;

int main (int argc, char *const argv[]) {
    size_t cb;
    size_t n = 4;
    float filter[n];
    float output[n];
    
    for (int i = 0; i < n; i++)
        filter[i] = i;
    
    for (int i = 0; i < 2; i++) {
        cl_context context;
        if (i == 0) context = clCreateContextFromType(NULL, CL_DEVICE_TYPE_CPU, clLogMessagesToStdoutAPPLE, NULL, NULL);
        else context = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, clLogMessagesToStdoutAPPLE, NULL, NULL);
        clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb);
        cl_device_id *devices = (cl_device_id *)malloc(cb*sizeof(cl_device_id));
        clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL);
        cl_command_queue cmdQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
        
        cl_mem filterBuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY, n*sizeof(float), filter, NULL);
        cl_mem outputBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, n*sizeof(float), NULL, NULL);
        
        cl_program program = clCreateProgramWithSource(context, 1, &kernelSrc, NULL, NULL);
        clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
        
        cl_kernel kernel = clCreateKernel(program, "filter", NULL);
        clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&filterBuf);
        clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuf);
        
        clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, &n, NULL, 0, NULL, NULL);
        clEnqueueReadBuffer(cmdQueue, outputBuf, CL_TRUE, 0, n*sizeof(cl_float), output, 0, NULL, NULL);
        
        if (i == 0)
            printf("Running on CPU, ");
        else
            printf("Running on GPU, ");
        printf("output should be 0 1 2 3:
");
        printf("%.0f %.0f %.0f %.0f
", output[0], output[1], output[2], output[3]);
        
        clReleaseKernel(kernel);
        clReleaseProgram(program);
        clReleaseMemObject(outputBuf);
        clReleaseMemObject(filterBuf);
        clReleaseCommandQueue(cmdQueue);
        clReleaseContext(context);
        free(devices);
    }
}
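
In case it’s useful to anyone who hits this before Apple fixes it, the first workaround noted in the comments above (declaring the filter argument global instead of constant) is just a one-word change to the kernel signature; against the reduced kernel it would look like this:


kernel void filter(global float *filter, // global instead of constant sidesteps the bug
                   global float *output) {
    size_t filt_indx = 0;
    float4 test = (float4)(0.0f);
    float4 fval;
    
    for (int j = 0; j < 1; j++) {
        for (int i = 0; i < 1; i++) {
            fval = vload4(filt_indx++, filter);
            test = fval;
        }
    }
    
    vstore4(test, 0, output);
}

The obvious downside is losing whatever benefit the constant address space would otherwise provide.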

There are other threads complaining about this same issue (see e.g. http://www.khronos.org/message_boards/viewtopic.php?f=28&t=2727). I’ve filed a bug report with Apple (though several weeks later it remains marked as “Open”), and I encourage anyone else who has run into this issue to do the same. Hopefully it gets fixed soon, since it’s a reasonably major and annoying bug. Note that it doesn’t occur on NVIDIA’s platform under Windows (though they have their own bugs there involving logical operations on vector types; hooray for immature OpenCL drivers).
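
(By “logical operations on vector types” I mean component-wise constructs like the following; this is purely an illustration, not the exact code that tripped NVIDIA’s compiler.)


kernel void vec_logic(global const float4 *a, global const float4 *b,
                      global int4 *mask) {
    size_t i = get_global_id(0);
    // Vector comparisons return -1 (true) / 0 (false) per component,
    // and && combines them component-wise as well.
    mask[i] = (a[i] > (float4)(0.0f)) && (b[i] < (float4)(1.0f));
}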

Thanks for the follow-up. It’s good to know.