I am very new to OpenCL and am trying to learn how to use it to improve the performance of a Python-based project of mine. The performance bottleneck in my project is related to inverse distance trees used to calculate nearest-neighbors, so that’s what I’m trying to do in OpenCL.
The algorithm I’m using as a first pass has been cobbled together from various sources on the net like StackOverflow and “Adventures in OpenCL”. When run as the heart of a C++ application, it works fine. When run in OpenCL, it does not. I’m confused!
Here is the source to the OpenCL portion:
struct point {
int x;
int z;
};
__kernel void nearest(__global struct point *coords, __global float *values, __global struct point *base, __global float *output, const unsigned int lencoords) {
// base index
int g_dataset_id = get_global_id(0);
int dmin = -1;
int d;
for (int i=0; i<lencoords; i++) {
d = (coords[i].x-base[g_dataset_id].x)*(coords[i].x-base[g_dataset_id].x)+(coords[i].z-base[g_dataset_id].z)*(coords[i].z-base[g_dataset_id].z);
if (dmin == -1 || d < dmin) {
output[g_dataset_id] = values[i];
dmin = d;
}
}
}
And here is the Python portion:
import pyopencl as cl
import numpy
from itertools import product
from random import randint
class CL:
def __init__(self):
self.ctx = cl.create_some_context()
self.queue = cl.CommandQueue(self.ctx)
def loadProgram(self, filename):
#read in the OpenCL source file as a string
f = open(filename, 'r')
fstr = "".join(f.readlines())
#print fstr
#create the program
self.program = cl.Program(self.ctx, fstr).build()
def popCorn(self):
mf = cl.mem_flags
#initialize client side (CPU) arrays
self.xsize = 4
self.zsize = 4
self.lencoords = 4
self.coords = numpy.array([(randint(0, self.xsize-1),randint(0, self.xsize-1)) for elem in xrange(self.lencoords)], dtype=numpy.uint8)
self.values = numpy.array([elem+4 for elem in xrange(self.lencoords)], dtype=numpy.float32)
self.base = numpy.array([(z, x) for (z, x) in product(xrange(self.zsize),xrange(self.xsize))], dtype=numpy.uint8)
self.output = numpy.zeros((self.zsize*self.xsize), dtype=numpy.float32)
#create OpenCL buffers
self.coords_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.coords)
self.values_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.values)
self.base_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.base)
self.output_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, size=self.output.nbytes)
def execute(self):
self.program.nearest(self.queue, (self.base.shape), None, self.coords_buf, self.values_buf, self.base_buf, self.output_buf, numpy.int32(self.lencoords))
self.results = numpy.empty_like(self.output)
cl.enqueue_read_buffer(self.queue, self.output_buf, self.results).wait()
def dumpit(self):
print zip(self.coords, self.values)
print zip(self.base, self.results)
if __name__ == "__main__":
example = CL()
example.loadProgram("nearest.cl")
example.popCorn()
example.execute()
example.dumpit()
The output as found in self.results is a mishmash of values from self.values, not ordered correctly at all. I’m afraid I don’t know even know enough to know how to put this into a debugger or something. Help! Thank you very much in advance.
Jack.