Matrix Addition - what is wrong?

Hello,

I am kinda new to OpenCL and I tried to write this small test program for matrix addition (don’t get confused by the kernel function name :slight_smile: )

The code runs successfully, however the output matrix is full of zeros instead of the proper result 3.

Could You please help me find the mistake in my code?

Thanks

#include <stdlib.h>
#include <stdio.h>

#include <CL/cl.h>

/*
 * OpenCL C source of the kernel, split into fragments that are handed to
 * clCreateProgramWithSource as an array of strings.
 *
 * The kernel is called "MatMul" but performs element-wise addition: each
 * work-item reads its index from get_global_id(0) and stores
 * matAA[i] + matBB[i] into matCC[i].
 *
 * The __kernel qualifier is required; without it the function is an ordinary
 * device function and clCreateKernel(program, "MatMul", ...) cannot find it.
 */
const char *OpenCLSource[] = {
    "__kernel void MatMul(__global int* matAA, __global int* matBB, __global int* matCC)",
    "{",
    " unsigned int i = get_global_id(0);",
    " matCC[i] = matAA[i] + matBB[i];",
    "}"
};

// Main function
// *********************************************************************
int main(int argc, char *argv[])
{
int clerror = CL_SUCCESS;

cl_mem matAA;
cl_mem matBB;
cl_mem matCC;

const int dsize = 16 * 16;

int matA[dsize];
int matB[dsize];
int matC[dsize];

for(int i = 0; i &lt; dsize; i++)
{
	//matA[i] = matB[i] = rand();
	matA[i] = 1;
	matB[i] = 2;
}

// Query platform ID
cl_platform_id platform;
clGetPlatformIDs (1, &platform, NULL);

// Setup context properties
cl_context_properties props[3];
props[0] = (cl_context_properties)CL_CONTEXT_PLATFORM;
props[1] = (cl_context_properties)platform;
props[2] = (cl_context_properties)0;

// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
cl_context GPUContext = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU,NULL, NULL, NULL);

// Get the list of GPU devices associated with this context
size_t ParmDataBytes;
clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
cl_device_id* GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);

// Create a command-queue on the first GPU device
cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], 0, NULL);

// Allocate GPU memory for source vectors AND initialize from CPU memory
matAA = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * dsize, matA, NULL);
matBB = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * dsize, matB, NULL);
matCC = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(int) * dsize, NULL, NULL);

// Create OpenCL program with source code
cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 18, OpenCLSource, NULL, NULL);

// Build the program (OpenCL JIT compilation)
clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);

// Create a handle to the compiled OpenCL function (Kernel)
cl_kernel matMulKernel = clCreateKernel(OpenCLProgram, "MatMul", NULL);

size_t global_work_size[1];
size_t local_work_size[1];

global_work_size[0] = dsize;
local_work_size[0] = dsize;

// In the next step we associate the GPU memory with the Kernel arguments
clSetKernelArg(matMulKernel, 0, sizeof(cl_mem), (void*)&matA);
clSetKernelArg(matMulKernel, 1, sizeof(cl_mem), (void*)&matB);
clSetKernelArg(matMulKernel, 2, sizeof(cl_mem), (void*)&matC);

// Launch the Kernel on the GPU
clEnqueueNDRangeKernel(GPUCommandQueue, matMulKernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);

// Copy the output in GPU memory back to CPU memory
clEnqueueReadBuffer(GPUCommandQueue, matCC, CL_TRUE, 0, global_work_size[0], matC, 0, NULL, NULL);

// Print out the results
for (int i = 0; i &lt; 10; i++)
{
	printf("%d

", matC[i]);
}

// Cleanup
free(GPUDevices);
clReleaseKernel(matMulKernel);
clReleaseProgram(OpenCLProgram);
clReleaseCommandQueue(GPUCommandQueue);
clReleaseContext(GPUContext);
clReleaseMemObject(matAA);
clReleaseMemObject(matBB);
clReleaseMemObject(matCC);

getchar();

return 0;

}

I’m almost surprised that the code didn’t crash.

This is always the first step you should follow: check the error code returned by all functions. Currently the code is not checking for any errors.

Second, the issue appears to be here:

clSetKernelArg(matMulKernel, 0, sizeof(cl_mem), (void*)&matA);
clSetKernelArg(matMulKernel, 1, sizeof(cl_mem), (void*)&matB);
clSetKernelArg(matMulKernel, 2, sizeof(cl_mem), (void*)&matC);

Do you see it now? matA, matB and matC are your local arrays, not the CL buffers. What you were trying to do is this:

clSetKernelArg(matMulKernel, 0, sizeof(cl_mem), (void*)&matAA);
clSetKernelArg(matMulKernel, 1, sizeof(cl_mem), (void*)&matBB);
clSetKernelArg(matMulKernel, 2, sizeof(cl_mem), (void*)&matCC);

I recommend using clearly different names for local variables and for CL objects to avoid this error.

Thank You very much for the reply. In the end it crashed as You said :wink:

However there was a mistake a bit earlier. In the definition of the kernel the name of the function was not preceded by the keyword __kernel so the clCreateKernel function did not create the right kernel and everything afterwards just passed without computing anything.

Problema solved!!! :mrgreen: