One Code - Three different output images

Hello,

I’ve just started with OpenCL and a problem I can’t figure out how to solve:

I want to implement a Gauss-Filter in the kernel. I use Image2D-objects for the source- and destination-image.

My problem is, when I repeatedly execute the same exact code (of course execution is after compiling) I get three different output images:

  1. the source image
  2. the blurred image (that’s what always should be the output)
  3. a fully black image

I would really appreciate your help. Thank you very much.

Here comes the code:
device_gauss.cl:


__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE
		| CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

__kernel void gauss(__read_only image2d_t src, __write_only image2d_t dst, __global float *weighter,
		int offset, int rows, int cols) {

	int i,j;
	float sum = 0.0f;
	float norm = 0.0f;

	//Get coordinates
	int gid0 = get_global_id(0);
	int gid1 = get_global_id(1);
	float4 pixel;
	float4 pixeltemp;
	int dist;
	pixel=read_imagef(src,sampler,(int2)(gid0,gid1));
	for (i = -offset; i <= offset; i++) {
		for (j = -offset; j <= offset; j++) {
			// sum up the color values of the neighbour pixels

				pixeltemp = read_imagef(src,sampler,(int2)(gid0+i,gid1 +j));
				dist = (int) (sqrt((i*i)+(j*j))+0.5);
				sum += weighter[dist] * pixeltemp.x;

				norm += weighter[dist];

		}
	}
	pixel.x = (sum)/(norm);

	if(norm>0.0f)
		write_imagef (dst,(int2)(gid0, gid1),pixel);


}


host_gauss.c


#include <CL/cl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <math.h>
#include "load_write_pgm.h"

#define DIMENSION 2
#define WEIGHTER_SIZE 13
#define MILLION 1000000.0

void check(cl_int*);
void copyimg_to_linear(float *src, float *dst, struct imageMatrix* img);
void save_1dimage(float *src, struct imageMatrix * img, char* filename);
void do2Dto1D(struct imageMatrix* source, float* destination);
void do1Dto2D(float* source, struct imageMatrix* destination);
void init_weight_array(size_t size, float* weighter);

int main() {
	cl_int err;
	cl_event event;
	cl_uint amount;
	int i;

	//Get platforms
	clGetPlatformIDs(NULL, NULL, &amount);
	cl_platform_id *platform = (cl_platform_id*) malloc(amount
			* sizeof(cl_platform_id));
	err = clGetPlatformIDs(amount, platform, NULL);
	check(&err);

	//Get devices of first platform
	clGetDeviceIDs(*platform, CL_DEVICE_TYPE_GPU, NULL, NULL, &amount);
	cl_device_id *devices = (cl_device_id*) malloc(amount
			* sizeof(cl_device_id));
	err = clGetDeviceIDs(*platform, CL_DEVICE_TYPE_GPU, amount, devices, NULL);
	check(&err);

	//Get MaxWorkGroupSize
	size_t workgroup = 0;
	err = clGetDeviceInfo(*devices, CL_DEVICE_MAX_WORK_GROUP_SIZE,
			sizeof(workgroup), &workgroup, NULL);
	check(&err);
	int workgroupsize = (int) workgroup;

	//Create a context
	cl_context ctx;
	cl_context_properties props[] = { CL_CONTEXT_PLATFORM,
			(cl_context_properties) *platform, NULL };
	ctx = clCreateContext(props, amount, devices, NULL, NULL, &err);
	check(&err);

	//Create a command queue for all devices associated with the first platform
	cl_command_queue *queues = (cl_command_queue*) malloc(amount * sizeof(cl_command_queue));
	for (i = 0; i < amount; i++) {
		queues[i] = clCreateCommandQueue(ctx, devices[i],
				CL_QUEUE_PROFILING_ENABLE, &err);
	}

	//Read source from file
	FILE *f;
	f = fopen("kernels/device_gauss.cl", "r");
	struct stat finfo;
	if (f == NULL) {
		printf("ERROR: Could not load file. 
");
		exit(EXIT_FAILURE);
	}
	stat("kernels/device_gauss.cl", &finfo);
	char *buffer = (char*) malloc(finfo.st_size + 1);
	char c;
	i = 0;
	while ((c = getc(f)) != EOF) {
		buffer[i] = c;
		i++;
	}
	buffer[i] = '\0';

	//Create program by source
	cl_program program;
	program = clCreateProgramWithSource(ctx, 1, &buffer, NULL, &err);
	check(&err);

	//Build program
	err = clBuildProgram(program, NULL, NULL, NULL, NULL, NULL);
	check(&err);
	//Create kernel object
	cl_kernel gauss_kernel;
	gauss_kernel = clCreateKernel(program, "gauss", &err);
	check(&err);

	//Start measuring
	cl_ulong start;
	clEnqueueMarker(queues[0], &event);
	clFinish(queues[0]);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
			sizeof(cl_ulong), &start, NULL);

	//Create buffers
	cl_mem d_weighter;
	float* h_src, *h_dst, *weighter;
	struct imageMatrix img;
	int dim[2];
	char* filename = "pictures/pgm.pgm";
	int offset = WEIGHTER_SIZE / 2;
	//method to load .pgm-file
	load_pgm_image(&img, filename);
	dim[0] = img.i_rows;
	dim[1] = img.i_cols;
	//Host-Source-Image
	h_src = (float*) malloc(dim[0] * dim[1] * sizeof(float));
	//Host-Destination-Image
	h_dst = (float*) malloc(dim[0] * dim[1] * sizeof(float));
	//Copy 2D-Image to a 1D-Array
	copyimg_to_linear(h_src, h_dst, &img);
	//fill destination-image with zeros
	h_dst = (float*) calloc(dim[0] * dim[1], sizeof(float));

	//********Create IMG Objects**********
	cl_mem d_src_2d;
	cl_mem d_dst_2d;

	cl_image_format format;
	format.image_channel_order = CL_R;
	format.image_channel_data_type = CL_FLOAT;
	printf("%i
", (int) sizeof(CL_UNSIGNED_INT8));
	int num_channels_per_pixel = 1;
	int channel_size = (int) sizeof(CL_UNSIGNED_INT8);
	int pixel_size = num_channels_per_pixel * channel_size;

	//Create Image2D-Objects für source and destination
	d_src_2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &format, dim[1], dim[0],
			0, NULL, &err);
	check(&err);
	d_dst_2d = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &format, dim[1], dim[0],
			0, NULL, &err);
	check(&err);

	//create weight for neighbor-pixel
	weighter = (float *) malloc(WEIGHTER_SIZE * sizeof(float));
	init_weight_array(WEIGHTER_SIZE, weighter);
	d_weighter = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR
			| CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY, WEIGHTER_SIZE
			* sizeof(float), weighter, &err);
	check(&err);

	//Set kernel parameter
	err = clSetKernelArg(gauss_kernel, 0, sizeof(cl_mem), (void*) &d_src_2d);
	check(&err);
	err = clSetKernelArg(gauss_kernel, 1, sizeof(cl_mem), (void*) &d_dst_2d);
	check(&err);
	err = clSetKernelArg(gauss_kernel, 2, sizeof(cl_mem), &d_weighter);
	check(&err);
	err = clSetKernelArg(gauss_kernel, 3, sizeof(int), &offset);
	check(&err);
	err = clSetKernelArg(gauss_kernel, 4, sizeof(int), &dim[1]);
	check(&err);
	err = clSetKernelArg(gauss_kernel, 5, sizeof(int), &dim[0]);
	check(&err);

	//Copy image to device
	size_t origin[] = { 0, 0, 0 };
	size_t region[] = { dim[1], dim[0], 1 };
	err = clEnqueueWriteImage(queues[0], d_src_2d, CL_TRUE, origin, region, 0,
			0, h_src, 0, NULL, &event);
	check(&err);

	err = clEnqueueWriteImage(queues[0], d_dst_2d, CL_TRUE, origin, region, 0,
			0, h_dst, 0, NULL, &event);
	check(&err);

	//Enqueue kernel execution command in command queue
	int remainder1 = dim[1] % workgroupsize;
	int remainder2 = dim[0] % workgroupsize;

	size_t global_size[] = { dim[1] + workgroup - remainder1, dim[0]
			+ workgroup - remainder2 };
	err = clEnqueueNDRangeKernel(queues[0], gauss_kernel, 2, NULL, global_size,
			NULL, NULL, NULL, &event);
	check(&err);

	//Wait for execution of the gauss algorithm
	err = clWaitForEvents(1, &event);
	check(&err);

	//Download result from device memory
	err = clEnqueueReadImage(queues[0], d_dst_2d, CL_TRUE, origin, region, 0,
			0, h_dst, NULL, NULL, NULL);
	check(&err);

	//Measure elapsed time
	cl_ulong end;
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
			&end, NULL);
	clFinish(queues[0]);
	float elapsed = (end - start) / MILLION;
	printf("Elapsed time: 	%f ms
", elapsed);

	//Write result image to disk

	save_1dimage(h_dst, &img, "pictures/result.pgm");

	clReleaseMemObject(d_dst_2d);
	clReleaseMemObject(d_src_2d);
	free(h_src);
	free(h_dst);
	free(weighter);
	destroyImageMatrix(&img);

	return 0;
}

void copyimg_to_linear(float *src, float *dst, struct imageMatrix* img) {
	do2Dto1D(img, src);
	do2Dto1D(img, dst);
}

void save_1dimage(float *src, struct imageMatrix* img, char* filename) {
	do1Dto2D(src, img);
	write_pgm_image(img, filename);
}

/*Transform a 2D array in a 1D Array*/
void do2Dto1D(struct imageMatrix* source, float* destination) {
	int i, j;
	for (i = 0; i < source->i_rows; i++) {
		for (j = 0; j < source->i_cols; j++) {
			destination[j + (i * source->i_cols)] = source->imageMatrix[i][j];
		}
	}
}

/**
 *Transform a 1D array in a 2D Array
 */
void do1Dto2D(float* source, struct imageMatrix* destination) {
	int i, j;
	for (i = 0; i < destination->i_rows; i++) {
		for (j = 0; j < destination->i_cols; j++) {
			destination->imageMatrix[i][j] = source[j + (i
					* destination->i_cols)];
		}
	}
}

/**
 *
 */
void init_weight_array(size_t size, float* weighter) {
	int offset = size / 2;
	int fwhm = 5;
	/*
	 * Given as parameter
	 * FWHM = 2 sqrt(2 ln2) sigma ~ 2.35 sigma
	 */
	float a = (fwhm / 2.354);
	int i;

	/* set up kernel to weight the pixels */
	/* (KERNEL_SIZE - offset -1) is the CORRECT version */
	for (i = -offset; i <= (size - offset - 1); i++) {
		weighter[i + offset] = exp(-i * i / (2 * a * a));
	}
}

void check(cl_int *err) {
	switch (*err) {
	case CL_SUCCESS:
		return;
		break;
	case CL_DEVICE_NOT_FOUND:
		printf("Device not found.");
		break;
	case CL_DEVICE_NOT_AVAILABLE:
		printf("Device not available");
		break;
	case CL_COMPILER_NOT_AVAILABLE:
		printf("Compiler not available");
		break;
	case CL_MEM_OBJECT_ALLOCATION_FAILURE:
		printf("Memory object allocation failure");
		break;
	case CL_OUT_OF_RESOURCES:
		printf("Out of resources");
		break;
	case CL_OUT_OF_HOST_MEMORY:
		printf("Out of host memory");
		break;
	case CL_PROFILING_INFO_NOT_AVAILABLE:
		printf("Profiling information not available");
		break;
	case CL_MEM_COPY_OVERLAP:
		printf("Memory copy overlap");
		break;
	case CL_IMAGE_FORMAT_MISMATCH:
		printf("Image format mismatch");
		break;
	case CL_IMAGE_FORMAT_NOT_SUPPORTED:
		printf("Image format not supported");
		break;
	case CL_BUILD_PROGRAM_FAILURE:
		printf("Program build failure");
		break;
	case CL_MAP_FAILURE:
		printf("Map failure");
		break;
	case CL_INVALID_VALUE:
		printf("Invalid value");
		break;
	case CL_INVALID_DEVICE_TYPE:
		printf("Invalid device type");
		break;
	case CL_INVALID_PLATFORM:
		printf("Invalid platform");
		break;
	case CL_INVALID_DEVICE:
		printf("Invalid device");
		break;
	case CL_INVALID_CONTEXT:
		printf("Invalid context");
		break;
	case CL_INVALID_QUEUE_PROPERTIES:
		printf("Invalid queue properties");
		break;
	case CL_INVALID_COMMAND_QUEUE:
		printf("Invalid command queue");
		break;
	case CL_INVALID_HOST_PTR:
		printf("Invalid host pointer");
		break;
	case CL_INVALID_MEM_OBJECT:
		printf("Invalid memory object");
		break;
	case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
		printf("Invalid image format descriptor");
		break;
	case CL_INVALID_IMAGE_SIZE:
		printf("Invalid image size");
		break;
	case CL_INVALID_SAMPLER:
		printf("Invalid sampler");
		break;
	case CL_INVALID_BINARY:
		printf("Invalid binary");
		break;
	case CL_INVALID_BUILD_OPTIONS:
		printf("Invalid build options");
		break;
	case CL_INVALID_PROGRAM:
		printf("Invalid program");
		break;
	case CL_INVALID_PROGRAM_EXECUTABLE:
		printf("Invalid program executable");
		break;
	case CL_INVALID_KERNEL_NAME:
		printf("Invalid kernel name");
		break;
	case CL_INVALID_KERNEL_DEFINITION:
		printf("Invalid kernel definition");
		break;
	case CL_INVALID_KERNEL:
		printf("Invalid kernel");
		break;
	case CL_INVALID_ARG_INDEX:
		printf("Invalid argument index");
		break;
	case CL_INVALID_ARG_VALUE:
		printf("Invalid argument value");
		break;
	case CL_INVALID_ARG_SIZE:
		printf("Invalid argument size");
		break;
	case CL_INVALID_KERNEL_ARGS:
		printf("Invalid kernel arguments");
		break;
	case CL_INVALID_WORK_DIMENSION:
		printf("Invalid work dimension");
		break;
	case CL_INVALID_WORK_GROUP_SIZE:
		printf("Invalid work group size");
		break;
	case CL_INVALID_WORK_ITEM_SIZE:
		printf("Invalid work item size");
		break;
	case CL_INVALID_GLOBAL_OFFSET:
		printf("Invalid global offset");
		break;
	case CL_INVALID_EVENT_WAIT_LIST:
		printf("Invalid event wait list");
		break;
	case CL_INVALID_EVENT:
		printf("Invalid event");
		break;
	case CL_INVALID_OPERATION:
		printf("Invalid operation");
		break;
	case CL_INVALID_GL_OBJECT:
		printf("Invalid OpenGL object");
		break;
	case CL_INVALID_BUFFER_SIZE:
		printf("Invalid buffer size");
		break;
	case CL_INVALID_MIP_LEVEL:
		printf("Invalid mip-map level");
		break;
	default:
		printf("Unknown");
		break;
	}
	printf("
");
}





int channel_size = (int) sizeof(CL_UNSIGNED_INT8);

This is not doing what you think. CL_UNSIGNED_INT8 is simply an arbitrary number. It’s a code to represent that you want 8 bits per channel.

If the channel format is CL_UNSIGNED_INT8 then the channel size is 1 (byte). If you want to use sizeof, then do this:

int channel_size = sizeof(cl_uchar);

There are other things that look strange:

int remainder1 = dim[1] % workgroupsize;
   int remainder2 = dim[0] % workgroupsize;

   size_t global_size[] = { dim[1] + workgroup - remainder1, dim[0]
         + workgroup - remainder2 };

First, the value stored in workgroupsize is the maximum work-group size supported by the device, not the work-group size that will be used to execute this particular kernel.

Second, you don’t need to know what work-group size will be used to execute this kernel. It’s not useful information for you in this case.

Third, do not try to tweak the global size based on what the work-group size. Do not remove the remainder (why would you do that?). This is what you should do:

size_t global_size[2] = { dim[1], dim[0]};

This above could be the reason you are seeing different outputs at different times.

This code is also not needed:

//Wait for execution of the gauss algorithm
   err = clWaitForEvents(1, &event);
   check(&err);

You don’t need to wait for the kernel to finish because you are using an in-order command queue. That means that the queue will make sure that a command does not start executing a command until all the previously enqueued commands are already finished.

Hey david.garcia,

thanks for your answer.

I implemented the changes you suggested but still I get the three different output images.

It sometimes appears to me, that my kernel isn’t even executed (mostly when the source image is simply “copied” to the destination image). But I can’t see a reason for that…

Do I maybe have a problem with the global_id leaving the image bounds? I actually can’t imagine, especially since I now changed the global_size to the one david.garcia suggested…

I think I need some more hints…