I have a question about the maximum number of work-items that can be set using enqueueNDRangeKernel. According to the getInfo functions, my graphics card, the meager AMD Radeon HD 6450, has the following specifications:

CL_DEVICE_MAX_WORK_GROUP_SIZE: 256

CL_DEVICE_MAX_WORK_ITEM_SIZES()[0]: 256

But when I set cl::NDRange global(dim) with dim > 256, the kernel still executes and calculates values.

Here is the sample code. OpenCLObject is my own class, which takes care of storing kernels, creating the program and platforms, etc.

Code :float matrixMultOpenCL(int dim,OpenCLObject &CLObj) { if(dim % 4) { cout << "dims must be mult of 4"; exit(1); } float* A = new float[dim*dim]; float* B = new float[dim*dim]; float* result = new float[dim*dim]; for (int i = 0; i < dim*dim; i++) { A[i] = (float)(rand() % 10); B[i] = (float)(rand() % 10); } for (int i = 0; i < dim*dim; i++) result[i] = 0.0f; cl_ulong start, finish; cl::NDRange global(dim); cl:: NDRange local(1); cl::NDRange offset(NULL); //std::cout << CLObj.get_queues()[1].getInfo<CL_QUEUE_DEVICE>().getInfo<CL_DEVICE_PROFILING_TIMER_RESOLUTION>(); cl::Event profiling; int matMultIndx = CLObj.findKernelIndex("matrix_mult"); void* modelOutputMappedMem; cl::Buffer bufferR = cl::Buffer(CLObj.get_context(), CL_MEM_WRITE_ONLY, sizeof(float)*dim*dim); cl::Buffer bufferA = cl::Buffer(CLObj.get_context(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*dim*dim,A); cl::Buffer bufferB = cl::Buffer(CLObj.get_context(), CL_MEM_READ_ONLY| CL_MEM_COPY_HOST_PTR, sizeof(float)*dim*dim,B); CLObj.get_kernels()[matMultIndx].setArg(0, bufferA); CLObj.get_kernels()[matMultIndx].setArg(1, bufferB); CLObj.get_kernels()[matMultIndx].setArg(2, bufferR); CLObj.get_queues()[0].enqueueNDRangeKernel(CLObj.get_kernels()[matMultIndx], cl::NullRange, global,local,NULL,&profiling); modelOutputMappedMem = CLObj.get_queues()[0].enqueueMapBuffer(bufferR, CL_TRUE,CL_MAP_READ, 0,sizeof(float)*dim*dim);//,&waitList); memcpy(result,modelOutputMappedMem,sizeof(float)*dim*dim); CLObj.get_queues()[0].enqueueUnmapMemObject(bufferR,modelOutputMappedMem); start = profiling.getProfilingInfo<CL_PROFILING_COMMAND_START>(); finish = profiling.getProfilingInfo<CL_PROFILING_COMMAND_END>(); for(int i = 0; i <dim; i++) { cout <<"last el: " <<result[dim*dim-1-i]<<endl; } delete A; delete B; delete result; //cout<<"time executing kernel: " << float(finish-start)/1000.0f <<"uS"; return float(finish-start)/1000.0f; }

Shouldn't this produce an error, or at least return erroneous values for the matrix-multiplication?

Kernel-code:

// Each work-item handles one row of the output: for global id `g`, it writes
// c_mat[g * num_rows + col] = dot(row g of a_mat, row col of b_mat) for every
// col in [0, num_rows). The matrix dimension is taken from the global size and
// rows are read as float4 vectors, so it is expected to be a multiple of 4.
// NOTE(review): because rows of b_mat are used (not columns), this yields
// a * transpose(b) for row-major inputs -- confirm the caller passes b
// pre-transposed if a plain a * b product is intended.
kernel void matrix_mult(__global float4 *a_mat,
                        __global float4 *b_mat,
                        __global float *c_mat)
{
    int num_rows = get_global_size(0);
    int vectors_per_row = num_rows / 4;
    int row_offset = get_global_id(0) * vectors_per_row;

    // Row of A read by this work-item and the matching row of the output.
    __global float4 *a_row = a_mat + row_offset;
    __global float *c_row = c_mat + row_offset * 4;

    for (int col = 0; col < num_rows; col++)
    {
        float acc = 0.0f;
        for (int k = 0; k < vectors_per_row; k++)
        {
            acc += dot(a_row[k], b_mat[col * vectors_per_row + k]);
        }
        c_row[col] = acc;
    }
}