Beginner question

Hi,

I have created a dummy program to try some things for my first OpenCL program. The idea is to be able to work on 3D points with an OpenCL kernel. This program does a simple dummy task, but it is a first try.

This is my main on the host


#define PROGRAM_FILE "add_points.cl"
#define KERNEL_FUNC "add_pts"

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <iostream>

#include <CL/cl.h>

/*
 * Select an OpenCL device from the first available platform.
 *
 * Requests one GPU device from the first platform; when the platform answers
 * CL_DEVICE_NOT_FOUND, the request is retried for a CPU device.
 *
 * Returns the selected device id. On failure the OpenCL status code is
 * printed to stderr and the process exits with status 1.
 */
cl_device_id create_device()
{
	cl_platform_id platform;
	cl_device_id dev;
	cl_int err;

	/* Identify a platform */
	err = clGetPlatformIDs(1, &platform, NULL);
	if (err != CL_SUCCESS) {
		/* OpenCL reports failures through its return code, not errno, so
		 * perror() would print an unrelated reason here. */
		fprintf(stderr, "Couldn't identify a platform (OpenCL error %d)\n", (int)err);
		exit(1);
	}

	/* Access a device: prefer a GPU, fall back to a CPU */
	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
	if (err == CL_DEVICE_NOT_FOUND) {
		err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
	}
	if (err != CL_SUCCESS) {
		fprintf(stderr, "Couldn't access any devices (OpenCL error %d)\n", (int)err);
		exit(1);
	}

	return dev;
}

/*
 * Create an OpenCL program from a source file and build it.
 *
 * ctx      - context the program is created in
 * dev      - device whose build log is queried when the build fails
 * filename - path of the kernel source file
 *
 * Returns the built program. On any failure (file access, allocation,
 * program creation, build) a message is printed and the process exits with
 * status 1; on a build failure the build log is printed first.
 */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename)
{

	cl_program program;
	FILE *program_handle;
	char *program_buffer, *program_log;
	size_t program_size, log_size = 0;
	long file_size;
	cl_int err;

	/* Read program file and place content into buffer */
	program_handle = fopen(filename, "r");
	if (program_handle == NULL) {
		perror("Couldn't find the program file");
		exit(1);
	}
	if (fseek(program_handle, 0, SEEK_END) != 0) {
		perror("Couldn't seek in the program file");
		exit(1);
	}
	file_size = ftell(program_handle);
	if (file_size < 0) {
		perror("Couldn't determine the program file size");
		exit(1);
	}
	rewind(program_handle);

	program_buffer = (char*)malloc((size_t)file_size + 1);
	if (program_buffer == NULL) {
		perror("Couldn't allocate the program buffer");
		exit(1);
	}

	/* A text-mode read may deliver fewer bytes than ftell() reported (newline
	 * translation), so the source length is what fread() actually returned. */
	program_size = fread(program_buffer, sizeof(char), (size_t)file_size, program_handle);
	if (ferror(program_handle)) {
		perror("Couldn't read the program file");
		exit(1);
	}
	program_buffer[program_size] = '\0';
	fclose(program_handle);

	/* Create program from file */
	program = clCreateProgramWithSource(ctx, 1,
		(const char**)&program_buffer, &program_size, &err);
	if (err != CL_SUCCESS) {
		/* OpenCL does not set errno, so report its status code directly */
		fprintf(stderr, "Couldn't create the program (OpenCL error %d)\n", (int)err);
		exit(1);
	}
	free(program_buffer);

	/* Build program */
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS) {

		/* Find size of log and print to std output */
		if (clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
				0, NULL, &log_size) != CL_SUCCESS) {
			log_size = 0;
		}
		program_log = (char*)malloc(log_size + 1);
		if (program_log == NULL) {
			fprintf(stderr, "Couldn't build the program (OpenCL error %d)\n", (int)err);
			exit(1);
		}
		program_log[0] = '\0';
		program_log[log_size] = '\0';
		if (log_size > 0) {
			clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
				log_size, program_log, NULL);
		}
		printf("%s\n", program_log);
		free(program_log);
		exit(1);
	}

	return program;
}

/*
 * Host-side 3D point made of three single-precision coordinates.
 * An array of these is uploaded as the kernel's input buffer, so the member
 * order has to stay in step with the PointXYZ declared in the kernel source.
 */
struct PointXYZ
{
	float x, y, z;
};

int main() {

	/* OpenCL structures */
	cl_device_id device;
	cl_context context;
	cl_program program;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_int err;
//	cl_int i, j, err;
	
	size_t local_size, global_size;

	size_t nbPoints = 10;

	PointXYZ *pts = new PointXYZ[nbPoints];
	for (size_t i = 0; i < nbPoints; ++i)
	{
		pts[i].x = i + 1;
		pts[i].y = i + 1;
		pts[i].z = i + 1;
	}

	float *results = new float[nbPoints];
	for (size_t i = 0; i < nbPoints; ++i)
		results[i] = 0;
	
	cl_mem pts_buffer;
	cl_mem results_buffer;

	cl_int num_groups;

	/* Create device and context */
	device = create_device();
	context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
	if (err < 0) 
	{
		perror("Couldn't create a context");
		exit(1);
	}

	/* Build program */
	program = build_program(context, device, PROGRAM_FILE);

	/* Create data buffer */
	global_size = nbPoints;
	local_size = 1;
	num_groups = global_size / local_size;

	pts_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,  nbPoints* sizeof(PointXYZ), pts, &err);
	results_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, nbPoints * sizeof(float), results, &err);

	if (err < 0)
	{
		perror("Couldn't create a buffer");
		exit(1);
	};

	/* Create a command queue */
	queue = clCreateCommandQueue(context, device, 0, &err);
	if (err < 0) {
		perror("Couldn't create a command queue");
		exit(1);
	};

	/* Create a kernel */
	kernel = clCreateKernel(program, KERNEL_FUNC, &err);
	if (err < 0) {
		perror("Couldn't create a kernel");
		exit(1);
	};

	/* Create kernel arguments */
	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &pts_buffer);
	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &results_buffer);
	if (err < 0) 
	{
		perror("Couldn't create a kernel argument");
		exit(1);
	}

	/* Enqueue kernel */
	err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
	if (err < 0) 
	{
		perror("Couldn't enqueue the kernel");
		exit(1);
	}

	// Wait for the command queue to get serviced before reading back results
	clFinish(queue);


	/* Read the kernel's output */
	err = clEnqueueReadBuffer( queue, results_buffer, CL_TRUE, 0, sizeof(results), results, 0, NULL, NULL);
	if (err < 0) 
	{
		perror("Couldn't read the buffer");
		exit(1);
	}

	// output
	for (size_t i = 0; i < nbPoints; ++i)
		std::cout << results[i] << std::endl;


	/* Deallocate resources */
	clReleaseKernel(kernel);
	clReleaseMemObject(pts_buffer);
	clReleaseMemObject(results_buffer);
	clReleaseCommandQueue(queue);
	clReleaseProgram(program);
	clReleaseContext(context);

	return 0;
}

This is my kernel



/*
 * Kernel-side 3D point: three single-precision coordinates.
 * Usable both as `struct PointXYZ` and through the `PointXYZ` typedef.
 */
typedef struct PointXYZ
{
	float x, y, z;
} PointXYZ;

/*
 * Sum the three coordinates of one point per work-item.
 *
 * pts     - input points, indexed by the work-item's global id in dimension 0
 * results - output sums, written at the same index
 *
 * There is no bounds check: both buffers are indexed directly by the global
 * id, so each must hold at least as many elements as the global work size.
 */
__kernel void add_pts(__global struct PointXYZ *pts, __global float* results) 
{
   /* get_global_id returns size_t; keep that type for indexing */
   size_t global_addr = get_global_id(0);

   PointXYZ pTmp = pts[global_addr];
   float sum = pTmp.x + pTmp.y + pTmp.z;

   /* Debug trace; the id is unsigned, hence %u */
   printf("point : %f, %f, %f --> sum : %f :  ID %u\n", pTmp.x, pTmp.y, pTmp.z, sum, (uint)global_addr);

   results[global_addr] = sum;
}

This is my output in the console

point : 1.000000, 1.000000, 1.000000 --> sum : 3.000000 : ID 0
point : 3.000000, 3.000000, 3.000000 --> sum : 9.000000 : ID 2
point : 7.000000, 7.000000, 7.000000 --> sum : 21.000000 : ID 6
point : 4.000000, 4.000000, 4.000000 --> sum : 12.000000 : ID 3
point : 5.000000, 5.000000, 5.000000 --> sum : 15.000000 : ID 4
point : 8.000000, 8.000000, 8.000000 --> sum : 24.000000 : ID 7
point : 9.000000, 9.000000, 9.000000 --> sum : 27.000000 : ID 8
point : 2.000000, 2.000000, 2.000000 --> sum : 6.000000 : ID 1
point : 6.000000, 6.000000, 6.000000 --> sum : 18.000000 : ID 5
point : 10.000000, 10.000000, 10.000000 --> sum : 30.000000 : ID 9
3
6
0
0
0
0
0
0
0
0

I was expecting to get the sum of x, y and z of each point at each position of the result vector. It seems correct for the first 2 positions, but there is nothing for the others. I have looked at a lot of examples and it seems this is the way to do things. Probably something is missing.

Thank you

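/* Read the kernel's output: the size must be nbPoints * sizeof(float), because sizeof(results) is only the size of the pointer */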
err = clEnqueueReadBuffer( queue, results_buffer, CL_TRUE, 0, nbPoints * sizeof(float), results, 0, NULL, NULL);