If the problem was that the device doesn’t have enough local memory, it would return an error code when you call clEnqueueNDRangeKernel().
OK.
That should not be necessary. The tile size is mostly independent of the global size. As long as the tile size divides the global size evenly it should work.
That’s what I also read
Have you compared your code with the examples of matrix multiplication from Apple, NVidia or AMD?
Yes, I do. I trying to do like in Nvidia Best Practice Guide, section 3.2.2.2 where there is kernel only. And like I said, it’s not square matrix, it’s row/column matrix with tile size row/column.
There must be something pretty trivial going wrong.
I think so. But can’t find with my beginner knowledge. (please, cf. code at the end of the post.
Is it possible that you are passing the wrong value to the last kernel argument?
I don’t think so, but it’s possible. It works well with global memory. So I would not put the full code for doing by my self, but now I think it’s necessary :
#include <iostream>
#include <iomanip> //for setw()
#include <stdio.h>
using namespace std;
#include "opencl.h"
#define N 8 // matrix dimension
#define TILE_DIM 4
cl_int a[N*N];
cl_int b[N*N];
cl_int c[N*N];
unsigned int n = N;
const char* program_source[] =
{
// "#define N 4
",
"#define TILE_DIM 4
",
"__kernel void MatMult(__global const int* a, __global const int* b, __global int* c, const unsigned int n)", // __local int aTile[TILE_DIM][TILE_DIM]
"{",
"int row = get_global_id(1);",
"int col = get_global_id(0);",
"int Cres = 0;",
"int x = get_local_id(0);",
"int y = get_local_id(1);",
"__local int aTile[TILE_DIM][TILE_DIM];",
"__local int bTile[TILE_DIM][TILE_DIM];",
"aTile[y][x] = a[row*n+ x];",
"bTile[y][x] = b[y*n + col];",
"barrier(CLK_LOCAL_MEM_FENCE);",
"for(int i = 0;i< n ; i++)",
// "{Cres += a[row*n + i ] * b[i*n + col];}",
// "{Cres += aTile[y][i] * b[i*n + col];}",
// "{Cres += a[row*n + i ] * bTile[i][x];}",
"{Cres += aTile[y][i] * bTile[i][x];}",
"c[row*n+col]= Cres;",
"}",
};
int main(int argc, char **argv)
{
cl_int errcode;
//FOR PLATFORM AND DEVICES INFORMATIONS
char dname[500];
cl_uint entries;
cl_ulong long_entries;
int d;
size_t p_size;
const cl_uint num_entries = 10;
cl_platform_id platforms[num_entries];
cl_uint num_platforms;
errcode = clGetPlatformIDs(num_entries, platforms, &num_platforms);
// cout << "Error Code: " << errcode << endl;
cl_device_id devices[num_entries];
cl_uint num_devices;
errcode = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_entries, devices, &num_devices);
// cout << "Error Code: " << errcode << endl;
// PLATFORM AND DEVICES INFORMATIONS
/* obtain information about platform */
/*
clGetPlatformInfo(platforms[0],CL_PLATFORM_NAME,500,dname,NULL);
printf("CL_PLATFORM_NAME = %s
", dname);
clGetPlatformInfo(platforms[0],CL_PLATFORM_VERSION,500,dname,NULL);
printf("CL_PLATFORM_VERSION = %s
", dname);
for (d = 0; d < num_devices; ++d) {
clGetDeviceInfo(devices[d], CL_DEVICE_NAME, 500, dname,NULL);
printf("Device #%d name = %s
", d, dname);
clGetDeviceInfo(devices[d],CL_DRIVER_VERSION, 500, dname,NULL);
printf(" Driver version = %s
", dname);
clGetDeviceInfo(devices[d],CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
printf(" Global Memory (MB): %llu
",long_entries/1024/1024);
clGetDeviceInfo(devices[d],CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,sizeof(cl_ulong),&long_entries,NULL);
printf(" Global Memory Cache (MB): %llu
",long_entries/1024/1024);
clGetDeviceInfo(devices[d],CL_DEVICE_LOCAL_MEM_SIZE,sizeof(cl_ulong),&long_entries,NULL);
printf(" Local Memory (KB): %llu
",long_entries/1024);
clGetDeviceInfo(devices[d],CL_DEVICE_MAX_CLOCK_FREQUENCY,sizeof(cl_ulong),&long_entries,NULL);
printf(" Max clock (MHz) : %llu
",long_entries);
clGetDeviceInfo(devices[d],CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t),&p_size,NULL);
printf(" Max Work Group Size: %d
",p_size);
clGetDeviceInfo(devices[d],CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&entries,NULL);
printf(" Number of parallel compute cores: %d
",entries);
}
//*/
cl_context_properties properties[] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)platforms[0],
0
};
cl_int error;
cl_context context = clCreateContext(properties,
num_devices, devices,
NULL, NULL, &error);
if(error != CL_SUCCESS)
{
cerr << "Erreur clCreateContext" << endl;
return 1;
}
cl_command_queue command_queue = clCreateCommandQueue (context,
devices[0],
NULL, &error);
if(error != CL_SUCCESS)
{
cerr << "Erreur clCreateCommandQueue" << endl;
clReleaseContext(context);
return 1;
}
int NN = N*N;
for(int i=0;i<NN; i++)
{
a[i] = i%4;
b[i] = i%4;
c[i] = -1;
}
cl_mem buffera = clCreateBuffer (context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(cl_int)*NN,
&a,
&error);
if(error != CL_SUCCESS)
{
cerr << "Erreur clCreateBuffer pour a" << endl;
return 1;
}
cl_mem bufferb = clCreateBuffer (context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(cl_int)*NN,
&b,
&error);
if(error != CL_SUCCESS)
{
cerr << "Erreur clCreateBuffer pour b" << endl;
return 1;
}
cl_mem bufferc = clCreateBuffer (context,
CL_MEM_READ_WRITE,
sizeof(cl_int)*NN,
NULL,
&error);
// Buffer pour la donnée aTile
// cl_mem bufferaTile = clCreateBuffer (context,
// CL_MEM_READ_WRITE,
// sizeof(cl_int)*TILE_DIM*TILE_DIM,
// NULL,
// &error);
cl_program program = clCreateProgramWithSource(
context,
sizeof(program_source)/sizeof(char*),
program_source,
NULL,
&error);
if(error != CL_SUCCESS)
{
cerr << "Erreur clCreateBuffer pour c" << endl;
return 1;
}
cl_int result = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel(program, "MatMult", NULL);
result = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffera);
result = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferb);
result = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferc);
result = clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
// result = clSetKernelArg(kernel, 4, sizeof(cl_int)*TILE_DIM*N, NULL);
const size_t global_work_size[2] = {N,N};
const size_t local_work_size[2] = {TILE_DIM,TILE_DIM};
result = clEnqueueNDRangeKernel (command_queue,
kernel,
2,
NULL,
&global_work_size[0],
&local_work_size[0],
// NULL,
0, NULL, NULL);
clFinish(command_queue);
result = clEnqueueReadBuffer(command_queue,
bufferc,
CL_TRUE,
0,
sizeof(cl_int)*NN,
&c,
0, NULL, NULL);
clFinish(command_queue);
//*
cout<< "## A MATRIX ##"<<endl;
for(int j=0;j<NN;j++)
{
cout <<setw(5)<< a[j];
if((j+1)%N==0){cout<<endl;}
}
cout<<endl;
cout<< "## B MATRIX ##"<<endl;
for(int j=0;j<NN;j++)
{
cout <<setw(5)<< b[j];
if((j+1)%N==0){cout<<endl;}
}
cout<<endl;
//*/
cout<< "## C MATRIX ##"<<endl;
for(int j=0;j<NN;j++)
{
cout <<setw(5)<< c[j];
if((j+1)%N==0){cout<<endl;}
}
cout<<endl;
//*/
cout<<"c["<<NN-1<<"] ="<< setw(5)<< c[NN-1]<<endl;
// cout<< "resultat = " << result <<endl;
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseMemObject(bufferc);
clReleaseMemObject(bufferb);
clReleaseMemObject(buffera);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
}
compiling with : g++ DemoMatMult.cpp -o DemoMatMult -lOpenCL
Thanks for helping me.