Kernel execution problem

Hi !
I’m trying to add 2 vectors but the kernel is not executed. I can’t see where the problem is.

I’m running the code on OS X Snow Leopard with the following Makefile:

# Builds the OpenCL vector-addition example against Apple's OpenCL framework.
CC = gcc
SRC = addition.c
DEST = addition
# Warnings catch printf format/argument mismatches at compile time.
CFLAGS = -Wall -Wextra
LDFLAGS = -framework OpenCL

.PHONY: all clean

all: $(DEST)

# Relink only when the source file is newer than the binary.
$(DEST): $(SRC)
	$(CC) $(CFLAGS) -o $(DEST) $(SRC) $(LDFLAGS)

clean:
	rm -f $(DEST)

Here is the code:


/* OpenCL core library */
#include <OpenCL/opencl.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * GPU kernel call: runs the "addition" OpenCL kernel on the n-element
 * vectors a and b, with c as the output vector.
 * Returns -1 when an OpenCL setup step fails.
 */
int opencl_call(long* a, long* b, long* c, size_t n);

/*
 * OpenCL C source of the "addition" kernel: work-item i computes
 * c[i] = a[i] + b[i].
 *
 * The inputs live in the __global address space (read-only via const)
 * rather than __constant, because __constant arguments are capped by
 * CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE and would limit the vector length.
 * Note that "long" in OpenCL C is always a 64-bit integer, whatever the
 * width of "long" is on the host.
 *
 * The source is assembled from adjacent string literals so that every
 * kernel line ends in a real newline and compiler diagnostics carry
 * meaningful line numbers.
 */
static const char* opencl_code[1] = {
	"__kernel void addition(__global const long* a,\n"
	"                       __global const long* b,\n"
	"                       __global long* c)\n"
	"{\n"
	"    size_t i = get_global_id(0);\n"
	"    c[i] = a[i] + b[i];\n"
	"}\n"
};

/*
 * Adds two fixed 5-element vectors with opencl_call() and prints the
 * resulting vector, one element per line.
 *
 * Returns EXIT_FAILURE when opencl_call() reports an error (-1),
 * otherwise 0.
 */
int main(int argc, char** argv)
{
	int i = 0;
	long a[5] = {1, 2, 3, 4, 5};
	long b[5] = {7, 5, 1, 9, 42};
	long c[5] = {0, 0, 0, 0, 0};

	(void)argc;
	(void)argv;

	if(opencl_call(a, b, c, 5) == -1)
	{
		/* Do not print a result vector that was never computed. */
		fprintf(stderr, "Error !\n");
		return EXIT_FAILURE;
	}

	/* i is an int, so it must be printed with %d; only c[i] is a long. */
	for(i = 0; i < 5; i++)
		printf("c[%d] = %ld\n", i, c[i]);

	return 0;
}

int opencl_call(long* a, long* b, long* c, size_t n)
{
	/* Return Status */
	cl_int status = CL_SUCCESS;
    
	size_t device_list_size;
	cl_device_id* devices = NULL;
	cl_command_queue queue;
	cl_command_queue_properties prop = 0;
	cl_mem a_buffer, b_buffer, c_buffer;
	cl_program program;
	cl_kernel kernel;
	size_t global_work_size[1], local_work_size[1];


	/* Create OpenCL context */
	cl_context context = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);

	/* Check whether we really have a GPU */
	if(status != CL_SUCCESS)
	{
		printf("Sorry, your GPU is not supported, using CPU instead.
");
		context = clCreateContextFromType(0, CL_DEVICE_TYPE_CPU, NULL, NULL, &status);
	}
	if(status != CL_SUCCESS)
		return -1;

	/* Check how many GPU we have */
	status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &device_list_size);
	if(status != CL_SUCCESS)
		return -1;
	printf("We have %d devices.
", device_list_size);

	/* Get the device list */
	devices = (cl_device_id*)malloc(device_list_size);

	status = clGetContextInfo(context, CL_CONTEXT_DEVICES, device_list_size, devices, NULL);
	if(status != CL_SUCCESS)
		return -1;

	/* Create command queue */
	queue = clCreateCommandQueue(context, devices[0], prop, &status);
	if(status != CL_SUCCESS)
		return -1;


	/* Allocate memory buffers */
	a_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, n*sizeof(long), a, &status);
	b_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, n*sizeof(long), b, &status);
	if(status != CL_SUCCESS)
		return -1;

	/* Output buffer */
	c_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, n*sizeof(long), c, &status);
	if(status != CL_SUCCESS)
		return -1;

	/* Create a CL program using the kernel source */
	program = clCreateProgramWithSource(context, 1, opencl_code, NULL, &status);
	if(status != CL_SUCCESS)
		return -1;

	/* Build OpenCL program. */
	status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
	if(status != CL_SUCCESS)
		return -1;

	/* Create OpenCL kernel */
	kernel = clCreateKernel(program, "addition", &status);

	/* Set kernel arguments */
	status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &a_buffer);
	status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_buffer);
	status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &c_buffer);
	if(status != CL_SUCCESS)
		return -1;

	/* Kernel execution */
	global_work_size[0] = n;
	local_work_size[0] = 1;
	status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);

	status = clEnqueueReadBuffer(queue, c_buffer, CL_TRUE, 0, n*sizeof(long), c, 0, NULL, NULL);

	/* Free memory */
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(queue);
	clReleaseContext(context);
}

Thanks.

What do you mean by "the kernel is not executed"? Do you get an error? Do you get the wrong values back? Does it crash?

Try running with CL_LOG_ERRORS=stdout in your environment and see if you get an error printed out.

I get wrong values back:
c[0] = 0
c[1] = 42949672960
c[2] = 8589934594
c[3] = 8589934602
c[4] = 8589934594

No error printed out with CL_LOG_ERRORS=stdout

Try changing your long to cl_long to make sure you’re not having any 64/32 bit issues. Then try reading back your buffers to verify they have what you expect. (E.g., malloc a buffer then read back and verify that it’s what you’ve written.) I’m not coming up with anything else after a quick glance at the code.