OpenCL image2d_t writing mostly zeros

I am trying to use OpenCL and image2d_t objects to speed up image convolution. When I noticed that the output was a blank image of all zeros, I simplified the OpenCL kernel to a basic read from the input and write to the output (shown below). With a little bit of tweaking, I got it to write a few scattered pixels of the image into the output image.

I have verified that the image is intact up until the call to read_imageui() in the OpenCL kernel. I wrote the image to GPU memory with CommandQueue::enqueueWriteImage() and immediately read it back into a brand new buffer in CPU memory with CommandQueue::enqueueReadImage(). The result of this call matched the original input image. However, when I retrieve the pixels with read_imageui() in the kernel, the vast majority of the pixels are set to 0.

C++ source:

int height = 112;
	int width = 9216;
	unsigned int numPixels = height * width;
	unsigned int numInputBytes = numPixels * sizeof(uint16_t);
	unsigned int numDuplicatedInputBytes = numInputBytes * 4;
	unsigned int numOutputBytes = numPixels * sizeof(int32_t);
	
	cl::size_t<3> origin;
	origin.push_back(0);
	origin.push_back(0);
	origin.push_back(0);
	cl::size_t<3> region;
	region.push_back(width);
	region.push_back(height);
	region.push_back(1);

	std::ifstream imageFile("hri_vis_scan.dat", std::ifstream::binary);
	checkErr(imageFile.is_open() ? CL_SUCCESS : -1, "hri_vis_scan.dat");
	uint16_t *image = new uint16_t[numPixels];
	imageFile.read((char *) image, numInputBytes);
	imageFile.close();

	// duplicate our single channel image into all 4 channels for Image2D
	cl_ushort4 *imageDuplicated = new cl_ushort4[numPixels];
	for (int i = 0; i < numPixels; i++)
		for (int j = 0; j < 4; j++)
			imageDuplicated[i].s[j] = image[i];

	cl::Buffer imageBufferOut(context, CL_MEM_WRITE_ONLY, numOutputBytes, NULL, &err);
	checkErr(err, "Buffer::Buffer()");

	cl::ImageFormat inFormat;
	inFormat.image_channel_data_type = CL_UNSIGNED_INT16;
	inFormat.image_channel_order = CL_RGBA;
	cl::Image2D bufferIn(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, inFormat, width, height, 0, imageDuplicated, &err);
	checkErr(err, "Image2D::Image2D()");

	cl::ImageFormat outFormat;
	outFormat.image_channel_data_type = CL_UNSIGNED_INT16;
	outFormat.image_channel_order = CL_RGBA;
	cl::Image2D bufferOut(context, CL_MEM_WRITE_ONLY, outFormat, width, height, 0, NULL, &err);
	checkErr(err, "Image2D::Image2D()");

	int32_t *imageResult = new int32_t[numPixels];
	memset(imageResult, 0, numOutputBytes);

	cl_int4 *imageResultDuplicated = new cl_int4[numPixels];
	for (int i = 0; i < numPixels; i++)
		for (int j = 0; j < 4; j++)
			imageResultDuplicated[i].s[j] = 0;

	std::ifstream kernelFile("convolutionKernel.cl");
	checkErr(kernelFile.is_open() ? CL_SUCCESS : -1, "convolutionKernel.cl");
	std::string imageProg(std::istreambuf_iterator<char>(kernelFile), (std::istreambuf_iterator<char>()));
	cl::Program::Sources imageSource(1, std::make_pair(imageProg.c_str(), imageProg.length() + 1));
	cl::Program imageProgram(context, imageSource);
	err = imageProgram.build(devices, "");
	checkErr(err, "Program::build()");

	cl::Kernel basic(imageProgram, "basic", &err);
	checkErr(err, "Kernel::Kernel()");

	basic.setArg(0, bufferIn);
	basic.setArg(1, bufferOut);
	basic.setArg(2, imageBufferOut);

	queue.finish();

	cl_ushort4 *imageDuplicatedTest = new cl_ushort4[numPixels];
	for (int i = 0; i < numPixels; i++)
	{
		imageDuplicatedTest[i].s[0] = 0;
		imageDuplicatedTest[i].s[1] = 0;
		imageDuplicatedTest[i].s[2] = 0;
		imageDuplicatedTest[i].s[3] = 0;
	}
	double gpuTimer = clock();

	err = queue.enqueueReadImage(bufferIn, CL_FALSE, origin, region, 0, 0, imageDuplicatedTest, NULL, NULL);
	checkErr(err, "CommandQueue::enqueueReadImage()");

// Output from above matches input image

	err = queue.enqueueNDRangeKernel(basic, cl::NullRange, cl::NDRange(height, width), cl::NDRange(1, 1), NULL, NULL);
	checkErr(err, "CommandQueue::enqueueNDRangeKernel()");

	queue.flush();

	err = queue.enqueueReadImage(bufferOut, CL_TRUE, origin, region, 0, 0, imageResultDuplicated, NULL, NULL);
	checkErr(err, "CommandQueue::enqueueReadImage()");

	queue.flush();

	err = queue.enqueueReadBuffer(imageBufferOut, CL_TRUE, 0, numOutputBytes, imageResult, NULL, NULL);
	checkErr(err, "CommandQueue::enqueueReadBuffer()");

	queue.finish();

OpenCL kernel:

    __kernel void basic(__read_only image2d_t input, __write_only image2d_t output, __global int *result)
{
	const sampler_t smp = CLK_NORMALIZED_COORDS_TRUE | //Natural coordinates
         CLK_ADDRESS_NONE | //Clamp to zeros
         CLK_FILTER_NEAREST; //Don't interpolate

	int2 coord = (get_global_id(1), get_global_id(0));

	uint4 pixel = read_imageui(input, smp, coord);
	result[coord.s0 + coord.s1 * 9216] = pixel.s0;
	write_imageui(output, coord, pixel);
}

The coordinates in the kernel are currently mapped to (x, y) = (width, height).

The input image is a single channel greyscale image with 16 bits per pixel, which is why I had to duplicate the channels to fit into OpenCL’s Image2D. The output after convolution will be 32 bits per pixel, which is why numOutputBytes is set to that. Also, although the width and height appear weird, the input image’s dimensions are 9216x7824, so I’m only taking a portion of it to test the code first, so it doesn’t take forever.

I added in a write to global memory after reading from the image in the kernel to see if the issue was reading the image or writing the image. After the kernel executes, this section of global memory also contains mostly zeros.

I’m working with an nVidia GTX 550 Ti GPU and am running Windows 7 32-bit with Visual Studio 2010.

Any help would be greatly appreciated!

Also, just realized that I forgot to switch the sampler to CLK_NORMALIZED_COORDS_FALSE after trying both true and false. It gave the same result in either case.