Hello, for my class project I need to implement a sum reduction over an array of 1k or more floats. I would like to compute the final sum by calling the kernel "X" times, but I can't understand how to do it correctly. If I have 512 work-groups, how many times do I have to call the kernel, and why? Which functions should I use to call it again? How can I take the returned values and send them back in as the next input?
Part of main code:
error = clGetPlatformIDs(2, cp_Platform, NULL); //array with two devices
error = clGetDeviceIDs(cp_Platform[1], CL_DEVICE_TYPE_GPU, 1, &Device_ID, NULL); // cp_platform[1] = Nvidia GPU
context = clCreateContext(NULL, 1, &Device_ID, NULL, NULL, &error); // creating openCL context ----> error 2
queue = clCreateCommandQueue(context, Device_ID, 0, &error); // creating command queue, executing openCL context on device cp_Platform[1] ****
program = clCreateProgramWithSource(context, 1, (const char **)& kernelSource, (const size_t *)&source_size, &error); //this function creates a program object for this specific openCL
error = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); //compiles and links a program executable from the program source
kernel = clCreateKernel(program, "GPUfunction", &error); //creating kernel object
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, n * sizeof(float), NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float), NULL, NULL);
error = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, n * sizeof(float), h_a, 0, NULL, NULL); //Enqueue commands to write to a buffer object from host memory.
error |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, sizeof(float), h_s, 0, NULL, NULL); //Enqueue commands to write to a buffer object from host memory.
error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); //Used to set the argument value for a specific argument of a kernel.
error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
error |= clSetKernelArg(kernel, 2, sizeof(int), &n);
error |= clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); // Enqueues a command to execute a kernel on a device.
clFinish(queue);
clEnqueueReadBuffer(queue, d_b, CL_TRUE, 0, sizeof(float), h_s, 0, NULL, NULL); ////writing data from the device (d_b) to host(h_b)
Kernel code:
/*
 * One pass of a parallel sum reduction.
 *
 * Every work-group loads up to get_local_size(0) consecutive elements of `vec`
 * into local memory, adds them together, and writes the group's partial sum to
 * outPutSum[get_group_id(0)].
 *
 * vec        input values; only indices below n are read
 * outPutSum  receives one partial sum per work-group, so it must hold at
 *            least get_num_groups(0) floats
 * n          number of valid elements in vec
 *
 * The work-group size must not exceed 512 (the size of the local scratch
 * array). It does not have to be a power of two.
 */
__kernel void GPUfunction(__global float *vec, __global float *outPutSum, int n)
{
    __local float tempSum[512];

    const int globalID = get_global_id(0);
    const int tid = get_local_id(0);
    int span = get_local_size(0); /* number of live partial sums in tempSum */

    /* Work-items past the end of the input contribute the additive identity. */
    tempSum[tid] = (globalID < n) ? vec[globalID] : 0.0f;
    barrier(CLK_LOCAL_MEM_FENCE);

    /*
     * Fold the upper part of the live range onto the lower part. Rounding the
     * stride up keeps the middle element of an odd-sized range in place, so no
     * value is dropped when the work-group size is not a power of two. The
     * loop bounds are identical for every work-item, so all of them reach
     * each barrier.
     */
    while (span > 1) {
        const int stride = (span + 1) / 2;

        if (tid < span / 2) {
            tempSum[tid] += tempSum[tid + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        span = stride;
    }

    /* A single work-item publishes the group's result. */
    if (tid == 0) {
        outPutSum[get_group_id(0)] = tempSum[0];
    }
}