Hello,
I am fairly new to OpenCL, and I tried to write this small test program that adds two matrices (don't be confused by the kernel function name).
The code runs without reporting an error; however, the output matrix is full of zeros instead of the expected result, 3.
Could you please help me find the mistake in my code?
Thanks
#include <stdlib.h>
#include <stdio.h>
#include <CL/cl.h>
/*
 * OpenCL C source of the element-wise addition kernel.
 *
 * The whole program is ONE NUL-terminated string built from adjacent string
 * literals (no commas between them), so it must be handed to
 * clCreateProgramWithSource() as a single source string (count == 1).
 *
 * Kernel contract: work-item i writes matCC[i] = matAA[i] + matBB[i].
 * The function must carry the __kernel qualifier, otherwise clCreateKernel()
 * cannot find "MatMul", and every buffer argument must be a __global pointer.
 */
const char OpenCLSource[] =
    "__kernel void MatMul(__global const int *matAA,\n"
    "                     __global const int *matBB,\n"
    "                     __global int *matCC)\n"
    "{\n"
    "    unsigned int i = get_global_id(0);\n"
    "    matCC[i] = matAA[i] + matBB[i];\n"
    "}\n";
// Main function
// *********************************************************************
int main(int argc, char *argv[])
{
int clerror = CL_SUCCESS;
cl_mem matAA;
cl_mem matBB;
cl_mem matCC;
const int dsize = 16 * 16;
int matA[dsize];
int matB[dsize];
int matC[dsize];
for(int i = 0; i < dsize; i++)
{
//matA[i] = matB[i] = rand();
matA[i] = 1;
matB[i] = 2;
}
// Query platform ID
cl_platform_id platform;
clGetPlatformIDs (1, &platform, NULL);
// Setup context properties
cl_context_properties props[3];
props[0] = (cl_context_properties)CL_CONTEXT_PLATFORM;
props[1] = (cl_context_properties)platform;
props[2] = (cl_context_properties)0;
// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
cl_context GPUContext = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU,NULL, NULL, NULL);
// Get the list of GPU devices associated with this context
size_t ParmDataBytes;
clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
cl_device_id* GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);
// Create a command-queue on the first GPU device
cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], 0, NULL);
// Allocate GPU memory for source vectors AND initialize from CPU memory
matAA = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * dsize, matA, NULL);
matBB = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * dsize, matB, NULL);
matCC = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(int) * dsize, NULL, NULL);
// Create OpenCL program with source code
cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 18, OpenCLSource, NULL, NULL);
// Build the program (OpenCL JIT compilation)
clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);
// Create a handle to the compiled OpenCL function (Kernel)
cl_kernel matMulKernel = clCreateKernel(OpenCLProgram, "MatMul", NULL);
size_t global_work_size[1];
size_t local_work_size[1];
global_work_size[0] = dsize;
local_work_size[0] = dsize;
// In the next step we associate the GPU memory with the Kernel arguments
clSetKernelArg(matMulKernel, 0, sizeof(cl_mem), (void*)&matA);
clSetKernelArg(matMulKernel, 1, sizeof(cl_mem), (void*)&matB);
clSetKernelArg(matMulKernel, 2, sizeof(cl_mem), (void*)&matC);
// Launch the Kernel on the GPU
clEnqueueNDRangeKernel(GPUCommandQueue, matMulKernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
// Copy the output in GPU memory back to CPU memory
clEnqueueReadBuffer(GPUCommandQueue, matCC, CL_TRUE, 0, global_work_size[0], matC, 0, NULL, NULL);
// Print out the results
for (int i = 0; i < 10; i++)
{
printf("%d
", matC[i]);
}
// Cleanup
free(GPUDevices);
clReleaseKernel(matMulKernel);
clReleaseProgram(OpenCLProgram);
clReleaseCommandQueue(GPUCommandQueue);
clReleaseContext(GPUContext);
clReleaseMemObject(matAA);
clReleaseMemObject(matBB);
clReleaseMemObject(matCC);
getchar();
return 0;
}