Unable to run kernel with matrix of size > 1024²

I get the following error when trying to run with a size of 2048²: ERROR: clEnqueueReadBuffer(-5), i.e. CL_OUT_OF_RESOURCES. It works fine with 1024². Are there buffer limits I should be aware of? I am running on a crappy NVIDIA Quadro FX 570.
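
In case it helps, I haven't dumped the device limits yet; something like the following (a minimal sketch, using the same cl.hpp bindings and the devices vector from the host code below) should print what the driver reports for allocations:

// Sketch: query the memory limits reported for devices[0].
cl_ulong maxAlloc  = devices[0].getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>();
cl_ulong globalMem = devices[0].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>();
std::cout << "Max single allocation: " << maxAlloc  << " bytes" << std::endl;
std::cout << "Global memory size:    " << globalMem << " bytes" << std::endl;

For what it's worth, each 2048² float buffer is 2048 * 2048 * 4 bytes = 16 MB, and I allocate three of them.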

Relevant host code:


unsigned int matrixSize = 2048;
const unsigned int matrixSizeSq = matrixSize * matrixSize;
try {

// (...)

// Create kernel data
float* a = new float[matrixSizeSq];
float* b = new float[matrixSizeSq];
float* c = new float[matrixSizeSq];
   
for (unsigned int i = 0; i < matrixSizeSq; ++i) {
  float val = static_cast<float>(i);
  a[i] = val;
  b[i] = val + 1.0f;
  c[i] = 0.0f;
}
    
// Set kernel arguments
cl::Buffer aBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                   matrixSizeSq * sizeof(float), a, &err);
cl::Buffer bBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                   matrixSizeSq * sizeof(float), b, &err);
cl::Buffer cBuffer(context, CL_MEM_WRITE_ONLY,
                   matrixSizeSq * sizeof(float), 0, &err);

// Create kernel
cl::Kernel kernel(program, "matrixMult", &err);

kernel.setArg(0, aBuffer);
kernel.setArg(1, bBuffer);
kernel.setArg(2, cBuffer);

cl::CommandQueue queue(context, devices[0], 0, &err);

std::vector<cl::Event> events;
cl::Event matMultEvent;
queue.enqueueNDRangeKernel(kernel, 
                           cl::NullRange, 
                           cl::NDRange(matrixSize, matrixSize),
                           cl::NDRange(16, 16),
                           NULL,
                           &matMultEvent);

events.push_back(matMultEvent);

queue.enqueueReadBuffer(cBuffer, CL_FALSE, 0, matrixSizeSq*sizeof(float), c,
                        &events);
queue.finish();

// (...)

}
catch (const cl::Error& err) {
    std::cerr 
      << "ERROR: "
      << err.what()
      << "("
      << err.err()
      << ")"
      << std::endl;
}


Kernel code:


__kernel void matrixMult(__global const float* a,
                         __global const float* b,
                         __global float* c)
{
  // Work-item indices: each work-item computes one element of c
  int globalIdx = get_global_id(0);
  int globalIdy = get_global_id(1);
  int n = get_global_size(0);

  // Accumulate the dot product for this output element
  float sum = 0.0f;
  for (int k = 0; k < n; ++k)
    sum += a[globalIdx + (k * n)] * b[k + (globalIdy * n)];
  c[globalIdx + (globalIdy * n)] = sum;
}

I believe there is a bug in NVIDIA's beta drivers that limits the maximum execution size to around 65k. Try running your code on a Mac (which has no such limit) and see if it works. If it does, you can file a bug against NVIDIA.
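
To see at least part of what the driver reports, you could also dump the work-group limits for your device (a quick sketch, assuming the same cl.hpp bindings and devices vector as in your host code; note it won't reveal any internal cap on the total execution size):

// Sketch: print the work-group limits reported for devices[0].
std::vector<size_t> maxItemSizes = devices[0].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
size_t maxGroupSize = devices[0].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
for (size_t d = 0; d < maxItemSizes.size(); ++d)
  std::cout << "Max work-items in dim " << d << ": " << maxItemSizes[d] << std::endl;
std::cout << "Max work-group size: " << maxGroupSize << std::endl;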

It's more of a limitation than a bug, since NVIDIA explicitly documents it in their OpenCL guide. :cry: