Confusion on output

I am very new to OpenCL and am trying to learn how to use it to improve the performance of a Python-based project of mine. The performance bottleneck in my project is related to inverse distance trees used to calculate nearest-neighbors, so that’s what I’m trying to do in OpenCL.

The algorithm I’m using as a first pass has been cobbled together from various sources on the net like StackOverflow and “Adventures in OpenCL”. When run as the heart of a C++ application, it works fine. When run in OpenCL, it does not. I’m confused!

Here is the source to the OpenCL portion:


// 2-D integer coordinate; the kernel reads arrays of these through its
// coords and base pointers.
// NOTE(review): the host buffers bound to those pointers must use the same
// element layout as these two ints. The Python arrays shown for this version
// are created with dtype=numpy.uint8 -- confirm the layouts agree.
struct point {
  int x;  // first component of a coordinate pair
  int z;  // second component of a coordinate pair
};

// For the base point selected by this work-item's dimension-0 global id, scan
// every entry of coords and store into output the value belonging to the entry
// with the smallest squared distance (dx*dx + dz*dz). Ties keep the earliest
// entry because the comparison is strict.
__kernel void nearest(__global struct point *coords, __global float *values, __global struct point *base, __global float *output, const unsigned int lencoords) {
  // index of the base point (and output slot) handled by this work-item
  int gid = get_global_id(0);

  // smallest squared distance accepted so far; -1 means "nothing accepted yet"
  int best = -1;
  int idx = 0;

  while (idx < lencoords) {
    int dx = coords[idx].x - base[gid].x;
    int dz = coords[idx].z - base[gid].z;
    int dist2 = dx * dx + dz * dz;

    // accept the first candidate unconditionally, later ones only if closer
    if (best == -1 || dist2 < best) {
      output[gid] = values[idx];
      best = dist2;
    }

    idx++;
  }
}

And here is the Python portion:


import pyopencl as cl
import numpy
from itertools import product
from random import randint

# Small PyOpenCL driver: builds a program from a source file, creates random
# test data, runs the "nearest" kernel over it and prints the result.
class CL:
    # Create an OpenCL context and a command queue on it.
    def __init__(self):
        self.ctx = cl.create_some_context()
        self.queue = cl.CommandQueue(self.ctx)

    # Read the OpenCL source from `filename` and build it into self.program.
    def loadProgram(self, filename):
        #read in the OpenCL source file as a string
        # NOTE(review): the file object is never closed explicitly.
        f = open(filename, 'r')
        fstr = "".join(f.readlines())
        #print fstr
        #create the program
        self.program = cl.Program(self.ctx, fstr).build()

    # Build the host-side arrays and the matching device buffers.
    def popCorn(self):
        mf = cl.mem_flags

        #initialize client side (CPU) arrays
        self.xsize = 4
        self.zsize = 4
        self.lencoords = 4
        # lencoords random coordinate pairs.
        # NOTE(review): both components are drawn from 0..xsize-1; zsize is not used here.
        # NOTE(review): dtype is uint8 (one byte per component) while the kernel's
        # struct point declares `int x; int z;` -- the element sizes differ, which
        # looks like the cause of the scrambled output; confirm.
        self.coords = numpy.array([(randint(0, self.xsize-1),randint(0, self.xsize-1)) for elem in xrange(self.lencoords)], dtype=numpy.uint8)
        # one float32 value per coordinate pair: 4, 5, 6, ...
        self.values = numpy.array([elem+4 for elem in xrange(self.lencoords)], dtype=numpy.float32)
        # every (z, x) grid position, zsize*xsize pairs in total; also uint8 (see note above)
        self.base = numpy.array([(z, x) for (z, x) in product(xrange(self.zsize),xrange(self.xsize))], dtype=numpy.uint8)
        # one float32 result slot per grid position
        self.output = numpy.zeros((self.zsize*self.xsize), dtype=numpy.float32)

        #create OpenCL buffers
        # the three input buffers are initialised from the host arrays (COPY_HOST_PTR)
        self.coords_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.coords)
        self.values_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.values)
        self.base_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.base)
        # the output buffer is only sized here; it is not initialised from self.output
        self.output_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, size=self.output.nbytes)

    # Run the kernel and copy the device output into self.results.
    def execute(self):
        # NOTE(review): (self.base.shape) is just self.base.shape, i.e. the full
        # two-dimensional shape of the pair array, so the launch is 2-D while the
        # kernel only reads get_global_id(0) -- confirm this is intended.
        self.program.nearest(self.queue, (self.base.shape), None, self.coords_buf, self.values_buf, self.base_buf, self.output_buf, numpy.int32(self.lencoords))
        self.results = numpy.empty_like(self.output)
        # wait for the read to finish before self.results is used
        cl.enqueue_read_buffer(self.queue, self.output_buf, self.results).wait()

    # Print the (coordinate, value) inputs and the (grid position, result) outputs.
    def dumpit(self):
        print zip(self.coords, self.values)
        print zip(self.base, self.results)

if __name__ == "__main__":
    # Script entry point: build the kernel from nearest.cl, stage the test
    # data, run the kernel and print the inputs alongside the results.
    runner = CL()
    runner.loadProgram("nearest.cl")
    runner.popCorn()
    runner.execute()
    runner.dumpit()

The output as found in self.results is a mishmash of values from self.values, not ordered correctly at all. I’m afraid I don’t even know enough to know how to put this into a debugger or something. Help! Thank you very much in advance.

Jack.

I was able to solve my own problem. My main trouble was argument agreement – don’t send an integer when a float is expected, et cetera. The next step is to figure out how to handle arbitrarily large arrays, but that will be a different thread if necessary.

Here is the resulting code:

First the OpenCL code:

// 2-D integer coordinate; the kernel reads arrays of these through its
// coords and base pointers. The host code for this version fills both
// buffers from numpy.int32 arrays of (first, second) pairs.
struct point {
  int x;  // first component of a coordinate pair
  int z;  // second component of a coordinate pair
};

// Nearest-neighbour lookup by exhaustive search.
//
// Each work-item handles the base point selected by its dimension-0 global id:
// it scans all lencoords entries of coords, finds the one with the smallest
// squared distance (dx*dx + dz*dz) to that base point, and stores the matching
// entry of values into output at the same index. Ties keep the earliest entry.
//
//   coords    - sample positions, lencoords entries
//   values    - one value per sample position
//   base      - query positions, one per work-item
//   output    - receives the nearest sample's value for each query position
//   lencoords - number of entries in coords / values
//
// When lencoords is 0 there is nothing to search and output is left untouched.
__kernel void nearest(__global struct point *coords, __global int *values, __global struct point *base, __global int *output, const unsigned int lencoords) {
  // base index
  const int g_dataset_id = get_global_id(0);

  if (lencoords == 0) {
    return;
  }

  // Read this work-item's query point once instead of on every iteration.
  const struct point origin = base[g_dataset_id];

  // Seed the search with the first sample so no sentinel value is needed.
  int dx = coords[0].x - origin.x;
  int dz = coords[0].z - origin.z;
  int dmin = dx * dx + dz * dz;
  int nearest_value = values[0];

  // Unsigned index: lencoords is unsigned, so avoid a mixed-sign comparison.
  for (unsigned int i = 1; i < lencoords; i++) {
    dx = coords[i].x - origin.x;
    dz = coords[i].z - origin.z;
    const int d = dx * dx + dz * dz;

    // Strict comparison: on equal distance the earlier sample wins.
    if (d < dmin) {
      dmin = d;
      nearest_value = values[i];
    }
  }

  // Single store to global memory rather than one per improvement.
  output[g_dataset_id] = nearest_value;
}

Now the Python code:

import pyopencl as cl
import numpy
from itertools import product
from random import randint, uniform

class CL:
    """Small PyOpenCL driver for the ``nearest`` kernel.

    Typical use: ``loadProgram`` builds the kernel source, ``popCorn`` creates
    random test data and device buffers, ``execute`` runs the kernel and
    fetches the result, and ``dumpit`` prints the result grid.
    """

    def __init__(self):
        """Create an OpenCL context and a command queue on it."""
        self.ctx = cl.create_some_context()
        self.queue = cl.CommandQueue(self.ctx)

    def loadProgram(self, filename):
        """Read the OpenCL source in *filename* and build it as self.program."""
        # The context manager closes the file even if reading fails.
        with open(filename, 'r') as source_file:
            fstr = source_file.read()
        self.program = cl.Program(self.ctx, fstr).build()

    def popCorn(self):
        """Create the host-side test arrays and the matching device buffers.

        Sets xsize, zsize, lenbase, lencoords, the coords / values / base /
        output arrays and one ``*_buf`` device buffer for each of them.
        """
        mf = cl.mem_flags

        # Grid dimensions and number of sample points (5% of the grid cells,
        # but never fewer than one).
        self.xsize = 256
        self.zsize = 256
        self.lenbase = self.xsize * self.zsize
        self.lencoords = max(1, int(self.lenbase * 0.05))

        # Random (x, z) sample positions; each component is bounded by its own
        # grid dimension. int32 matches the kernel's `struct point {int, int}`.
        self.coords = numpy.array(
            [(randint(0, self.xsize - 1), randint(0, self.zsize - 1))
             for _ in range(self.lencoords)],
            dtype=numpy.int32)
        # One value per sample; the random floats are converted to integers
        # when stored as int32.
        self.values = numpy.array(
            [uniform(1, 5) for _ in range(self.lencoords)],
            dtype=numpy.int32)
        # Every grid cell as an (x, z) pair, so cell (x, z) sits at flat
        # index x * zsize + z.
        self.base = numpy.array(
            [divmod(index, self.zsize) for index in range(self.lenbase)],
            dtype=numpy.int32)
        self.output = numpy.zeros((self.lenbase), dtype=numpy.int32)

        # Input buffers are initialised from the host arrays; the output
        # buffer is only sized.
        self.coords_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.coords)
        self.values_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.values)
        self.base_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.base)
        self.output_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, size=self.output.nbytes)

    def execute(self):
        """Run the kernel and store the result grid in self.results.

        self.results ends up with shape (xsize, zsize), indexed as [x, z].
        """
        # One work-item per base point. Passing base.shape would launch a
        # (lenbase, 2) range and compute every point twice.
        global_size = (self.lenbase,)
        self.program.nearest(
            self.queue, global_size, None,
            self.coords_buf, self.values_buf, self.base_buf, self.output_buf,
            # the kernel declares this argument as `unsigned int`
            numpy.uint32(self.lencoords))

        self.results = numpy.empty_like(self.output)
        # enqueue_copy is the current PyOpenCL call for device-to-host reads.
        cl.enqueue_copy(self.queue, self.results, self.output_buf).wait()
        # base is laid out x-major (index = x * zsize + z), so rows are x.
        self.results = self.results.reshape((self.xsize, self.zsize))

    def dumpit(self):
        """Print the result grid, one line of concatenated integers per x row."""
        for xind in range(self.xsize):
            print("".join(str(int(self.results[xind, zind]))
                          for zind in range(self.zsize)))

            

if __name__ == "__main__":
    # Script entry point: build the kernel from nearest.cl, generate the test
    # data, run the nearest-neighbour kernel and print the resulting grid.
    runner = CL()
    runner.loadProgram("nearest.cl")
    runner.popCorn()
    runner.execute()
    runner.dumpit()