OpenCl kernel code behaves differenty (incorrectly) when wrapped in a function.

Hello,

I’m using PyOpenCl and my OpenCl kernel code behaves differently (wrongly) when I put it in a function than when it’s part of the main kernel program. It works correctly on one of my Dell laptops with an Nvidia GPU, but not on my other one with a Dell GPU. Both are using Ubuntu 16.04.

I’m writing a very crude image filtering function to smoothly offset pixels by a non-integer value, eg. translating the image by (.25, 0) pixels will have this effect: newPixelHere = oldPixelHere * .75 + oldPixelToTheLeft * .25.

I’ve attached a redux version of the code, where a simple 3d array is used as a standin for a colour image. As I’ve noted in the code comments, the function filterImg() contains identical code to the commented out code in the main body, but the main body works, whereas the function does not - it just sets “ret” to a constant value of 49. Moreover, the function behaves as expected if I change “i<4” to “i<3” in the main loop, or if I avoid using integer / and %.

I’m just starting with OpenCl programming and am admittedly out of my depth, but I’ve read some things about using loops with caution in OpenCl. Again, this works on my Nvidia GPU, but not on my Dell one.

Finally, although OpenCl had been working properly on my Dell GPU up until this problem, I have been receiving the following warning when I run OpenCl:

beignet-opencl-icd: no supported GPU found, this is probably the wrong opencl-icd package for this hardware
(If you have multiple ICDs installed and OpenCL works, you can ignore this message)

I hadn’t worried much about this since I thought I might have multiple ICDs installed, since there are multiple entries in /etc/OpenCL/vendors/:

~:ls -1 /etc/OpenCL/vendors/
intel-beignet-x86_64-linux-gnu.icd
mesa.icd
~:

… but again, I don’t really know what I’m doing. Any help appreciated, thanks very much in advance.

Jeremy.

python code:

#!/usr/bin/python
import pyopencl as cl
import numpy as np

def printLs3d(ls3d):
	# Utility to print "red" channel of 3d array.	
	for y in range(len(ls3d[0])):
		print
		for x in range(len(ls3d)):
			if ls3d[x][y][0] == 0:
				print "...",
			else:
				print "%03d" % ls3d[x][y][0],

def shadeImg(lsIn):
	printLs3d(lsIn)

	cntxt = cl.create_some_context()
	queue = cl.CommandQueue(cntxt)
	res = (len(lsIn)-1, len(lsIn[0])-1)
	print

	# Inputs
	srcImgAr_buf =  cl.Buffer(cntxt, cl.mem_flags.READ_ONLY |
		#cl.mem_flags.COPY_HOST_PTR,hostbuf=np.array(list(pygame.surfarray.array3d(srcImg))))
		cl.mem_flags.COPY_HOST_PTR,hostbuf=np.array(lsIn))

	# Outputs
	shadedImg = np.zeros((len(lsIn), len(lsIn[0]), len(lsIn[0][0])), dtype=np.uint8)
	shadedImg_buf = cl.Buffer(cntxt, cl.mem_flags.WRITE_ONLY |
		cl.mem_flags.COPY_HOST_PTR,hostbuf=shadedImg)

	kernelPath = "/home/jeremy/dev/warp/testOpenClLoops/testOpenClLoops.c"
	with open(kernelPath) as f:
		kernel = "".join(f.readlines())
	
	bld = cl.Program(cntxt, kernel).build()
	launch = bld.krShadeImg(
			queue,
			#srcImgAr.shape,
			res,
			None,
			np.int32(res[0]),
			np.int32(res[1]),
			srcImgAr_buf,
			shadedImg_buf)
	launch.wait()
	

	cl.enqueue_read_buffer(queue, shadedImg_buf, shadedImg).wait()
	printLs3d(shadedImg)




testIn = [
	[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], 
	[[0, 0, 0], [100, 100, 100], [100, 100, 100], [100, 100, 100], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], 
	[[0, 0, 0], [100, 100, 100], [100, 100, 100], [100, 100, 100], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], 
	[[0, 0, 0], [100, 100, 100], [100, 100, 100], [100, 100, 100], [0, 0, 0], [100, 100, 100], [100, 100, 100], [0, 0, 0]], 
	[[0, 0, 0], [100, 100, 100], [100, 100, 100], [100, 100, 100], [0, 0, 0], [100, 100, 100], [100, 100, 100], [0, 0, 0]], 
	[[0, 0, 0], [100, 100, 100], [100, 100, 100], [100, 100, 100], [0, 0, 0], [100, 100, 100], [100, 100, 100], [0, 0, 0]], 
	[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [100, 100, 100], [100, 100, 100], [0, 0, 0]], 
	[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], 
	[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]], 
	[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]]


testAr = np.array(testIn, dtype=np.uint8)


shadeImg(testAr)


C++ kernel code contained in “/home/jeremy/dev/warp/testOpenClLoops/testOpenClLoops.c”:

void setArrayCell(int x, int y, int xres, int yres,
  uchar* val,
  __global uchar* ret)
{
	if (x >= 0 && x < xres && y >= 0 && y < yres) {
		int i = (x * yres + y) * 3;
		ret[i] = val[0];
		ret[i+1] = val[1];
		ret[i+2] = val[2];
	}
}


void filterImg  (unsigned int x, unsigned int y, int xres, int yres,
	__global uchar* img,
	uchar* ret) {
	// This function contains identical code to the commented-out
	// block in the main body, except some comments.
	// The main body code works; this function doesn't.  Why???

	// Offset xy lookup to weigh influence of each neighbour.
	// If xOfs == yOfs == 0, pixel (x,y) gets full weight.
	// If xOfs == yOfs == 1, pixel (x+1,y+1) gets full weight.
	float xOfs = .75f;
	float yOfs = .5f;

	for (int i=0; i<4; i++) { // ***WORKS IF YOU CHANGE 4 TO 3!
		// Sample 2x2 grid of neighbouring pixels in this order:
		// (x,y), (x+1,y), (x,y+1), (x+1,y+1)
		int dx = i%2; // ***WORKS IF YOU SET dx AND dy TO CONSTANT 0 OR 1
		int dy = i/2;
		int xx = x + dx;
		int yy = y + dy;

		// Calculate weight of this pixel (not 100% sure this is correct)
		float wx = dx == 0 ? xOfs : 1.0f-xOfs;
		float wy = dy == 0 ? yOfs : 1.0f-yOfs;
		float k = wy*wx;


		int address = (xx * yres + yy) * 3;
		for (int j=0; j<3; j++) {
			ret[j] += img[address+j]*k;
		}

	}

}

__kernel void krShadeImg(
			int xres,
			int yres,
			__global uchar* img,
			__global uchar* shadedImg)
{
	unsigned int x = get_global_id(0);
	unsigned int y = get_global_id(1);


	if (x < xres-1 && y < yres-1) {
		uchar ret[3] = {0, 0, 0};

		filterImg(x, y, xres, yres, img, ret);
		// *** TO MAKE THE CODE WORK, comment out the above
		// *** line and uncomment the following block.
		/*
		// Offset xy lookup to weigh influence of each neighbour.
		// If xOfs == yOfs == 0, pixel (x,y) gets full weight.
		// If xOfs == yOfs == 1, pixel (x+1,y+1) gets full weight.
		float xOfs = .75f;
		float yOfs = .5f;

		for (int i=0; i<4; i++) {
			// Sample 2x2 grid of neighbouring pixels in this order:
			// (x,y), (x+1,y), (x,y+1), (x+1,y+1)
			int dx = i%2;
			int dy = i/2;
			int xx = x + dx;
			int yy = y + dy;

			// Calculate weight of this pixel (not 100% sure this is correct)
			float wx = dx == 0 ? xOfs : 1.0f-xOfs;
			float wy = dy == 0 ? yOfs : 1.0f-yOfs;
			float k = wy*wx;


			int address = (xx * yres + yy) * 3;
			for (int j=0; j<3; j++) {
				ret[j] += img[address+j]*k;
			}

		}
		*/
		setArrayCell(x, y, xres, yres, ret, shadedImg);
	}
}


Output when using filterImg function (broken):


Before OpenCl process:

... ... ... ... ... ... ... ... ... ...
... 100 100 100 100 100 ... ... ... ...
... 100 100 100 100 100 ... ... ... ...
... 100 100 100 100 100 ... ... ... ...
... ... ... ... ... ... ... ... ... ...
... ... ... 100 100 100 100 ... ... ...
... ... ... 100 100 100 100 ... ... ...
... ... ... ... ... ... ... ... ... ...

After OpenCl process:

049 049 049 049 049 049 049 049 049 ...
049 049 049 049 049 049 049 049 049 ...
049 049 049 049 049 049 049 049 049 ...
049 049 049 049 049 049 049 049 049 ...
049 049 049 049 049 049 049 049 049 ...
049 049 049 049 049 049 049 049 049 ...
049 049 049 049 049 049 049 049 049 ...
... ... ... ... ... ... ... ... ... ...

Output when using main body code (correct):

Before OpenCl process:

... ... ... ... ... ... ... ... ... ...
... 100 100 100 100 100 ... ... ... ...
... 100 100 100 100 100 ... ... ... ...
... 100 100 100 100 100 ... ... ... ...
... ... ... ... ... ... ... ... ... ...
... ... ... 100 100 100 100 ... ... ...
... ... ... 100 100 100 100 ... ... ...
... ... ... ... ... ... ... ... ... ...

After OpenCl process:

012 049 049 049 049 037 ... ... ... ...
024 098 098 098 098 074 ... ... ... ...
024 098 098 098 098 074 ... ... ... ...
012 049 049 049 049 037 ... ... ... ...
... ... 012 049 049 049 037 ... ... ...
... ... 024 098 098 098 074 ... ... ...
... ... 012 049 049 049 037 ... ... ...
... ... ... ... ... ... ... ... ... ...