I have a question about the maximum number of work-items that can be set using enqueueNDRangeKernel. According to the getInfo functions, my graphics card, the meager AMD Radeon HD 6450, has the following specifications:
CL_DEVICE_MAX_WORK_GROUP_SIZE: 256
CL_DEVICE_MAX_WORK_ITEM_SIZES()[0]: 256
But when I set the cl::NDRange global(dim), with dim > 256, it still executes and calculates values.
Here is the sample code; OpenCLObject is my own class, which takes care of storing kernels, creating the program and platforms, etc.
float matrixMultOpenCL(int dim,OpenCLObject &CLObj)
{
if(dim % 4)
{
cout << "dims must be mult of 4";
exit(1);
}
float* A = new float[dim*dim];
float* B = new float[dim*dim];
float* result = new float[dim*dim];
for (int i = 0; i < dim*dim; i++)
{
A[i] = (float)(rand() % 10);
B[i] = (float)(rand() % 10);
}
for (int i = 0; i < dim*dim; i++)
result[i] = 0.0f;
cl_ulong start, finish;
cl::NDRange global(dim);
cl:: NDRange local(1);
cl::NDRange offset(NULL);
//std::cout << CLObj.get_queues()[1].getInfo<CL_QUEUE_DEVICE>().getInfo<CL_DEVICE_PROFILING_TIMER_RESOLUTION>();
cl::Event profiling;
int matMultIndx = CLObj.findKernelIndex("matrix_mult");
void* modelOutputMappedMem;
cl::Buffer bufferR = cl::Buffer(CLObj.get_context(), CL_MEM_WRITE_ONLY, sizeof(float)*dim*dim);
cl::Buffer bufferA = cl::Buffer(CLObj.get_context(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*dim*dim,A);
cl::Buffer bufferB = cl::Buffer(CLObj.get_context(), CL_MEM_READ_ONLY| CL_MEM_COPY_HOST_PTR, sizeof(float)*dim*dim,B);
CLObj.get_kernels()[matMultIndx].setArg(0, bufferA);
CLObj.get_kernels()[matMultIndx].setArg(1, bufferB);
CLObj.get_kernels()[matMultIndx].setArg(2, bufferR);
CLObj.get_queues()[0].enqueueNDRangeKernel(CLObj.get_kernels()[matMultIndx], cl::NullRange, global,local,NULL,&profiling);
modelOutputMappedMem = CLObj.get_queues()[0].enqueueMapBuffer(bufferR,
CL_TRUE,CL_MAP_READ, 0,sizeof(float)*dim*dim);//,&waitList);
memcpy(result,modelOutputMappedMem,sizeof(float)*dim*dim);
CLObj.get_queues()[0].enqueueUnmapMemObject(bufferR,modelOutputMappedMem);
start = profiling.getProfilingInfo<CL_PROFILING_COMMAND_START>();
finish = profiling.getProfilingInfo<CL_PROFILING_COMMAND_END>();
for(int i = 0; i <dim; i++)
{
cout <<"last el: " <<result[dim*dim-1-i]<<endl;
}
delete A;
delete B;
delete result;
//cout<<"time executing kernel: " << float(finish-start)/1000.0f <<"uS";
return float(finish-start)/1000.0f;
}
Shouldn’t this produce an error, or at least return erroneous values for the matrix-multiplication?
Kernel-code:
/* Each work-item produces one row of c_mat.
 *
 * The global size in dimension 0 is used as the matrix dimension, so the
 * kernel is launched with one work-item per row. a_mat and b_mat are read as
 * float4 vectors (dimension / 4 vectors per row); c_mat is written as scalars.
 * Output element i of this work-item's row is the dot product of the
 * work-item's row of a_mat with row i of b_mat.
 */
kernel void matrix_mult(__global float4 *a_mat,
__global float4 *b_mat, __global float *c_mat)
{
    const int dimension = get_global_size(0);
    const int vecs_in_row = dimension / 4;
    const int row_base = get_global_id(0) * vecs_in_row;

    /* Row of A read by this work-item, and the matching output row of C
       (C is indexed in floats, hence the factor of 4). */
    __global float4 *a_row = a_mat + row_base;
    __global float *c_row = c_mat + row_base * 4;

    for (int out = 0; out < dimension; out++)
    {
        __global float4 *b_row = b_mat + out * vecs_in_row;
        float acc = 0.0f;
        for (int v = 0; v < vecs_in_row; v++)
        {
            acc += dot(a_row[v], b_row[v]);
        }
        c_row[out] = acc;
    }
}