The following Exceptions have occurred : clEnqueueNDRangeKernel(-54)

I get the exception clEnqueueNDRangeKernel(-54) when the command trapeziodal(cl::EnqueueArgs(queue, cl::NDRange(subIntervals), cl::NDRange(local_trapeziodal)), d_x, d_fx, d_intervals, subIntervals); is executed on a CPU (not tried on GPU so can’t verify if the problem exists there). I assume it is some memory issue, but not sure exactly what argument is causing the exception to be thrown. It’s probably something obvious that I’ve missed, as I am new to OpenCL programming.

Below a snippet of the code.

       
        const int subIntervals = 20;
        std::vector<double> h_x(subIntervals);
        std::vector<double> h_fx(subIntervals);	
        std::vector<double> h_intervals(subIntervals); 


	cl::Buffer d_x;
	cl::Buffer d_fx;
	cl::Buffer d_intervals;

    	std::vector<cl::Device> devices;     
    	cl::Device device;

 	
    	for(int i = 0; i < subIntervals; i++)
    	{
       		h_x[i]  = double(i);
        	h_fx[i]  = (double(i) * double(i)) / 4; // function x^2/4
        	h_intervals[i] = 0.0;
    	}

    	try 
    	{
            std::vector<cl::Platform> platforms;
       	    cl::Platform::get(&platforms);
            
            cl::Context context;
        	
	    for (cl::Platform p : platforms)
            {
               try
               {
                  p.getDevices(CL_DEVICE_TYPE_GPU, &devices);
               }
		catch (cl::Error err) {}
		if (devices.size() > 0)
                {
		    std::cout << "GPU found " << std::endl;
		    std::cout << "number of devices = " << devices.size() << std::endl;
                    device = devices[0];
                    cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)p(), 0};
                    context = cl::Context(device, properties);
                    break;
                }
           }
           if (!context())
           {
            std::cerr << "Error: Cannot secure a GPU device. Trying CPU" << std::endl;;
            
	       for (cl::Platform p : platforms)
		{
		    try
		    {
		        p.getDevices(CL_DEVICE_TYPE_CPU, &devices);
		    } 
		    catch (cl::Error err) {}
		    if (devices.size() > 0)
		    {
			 std::cout << "CPU found" << std::endl;
			 std::cout << "number of devices = " << devices.size() << std::endl;
		         device = devices[0];
		         cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)p(), 0};
		         context = cl::Context(device, properties);
		         break;
		   }
		}

		if( !context() ) 
		{
		    std::cerr << "Error: Cannot secure a GPU or CPU device. 
 
 Exiting!!!" << std::endl;;
		    exit(1);
		}
          }

        
        cl::Program programTrapeziodal(context, util::loadProgram("Trapeziodal_kernel.cl"), true);

        cl::CommandQueue queue(context, device);

        auto trapeziodal = cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer, const uint>(programTrapeziodal, "trapeziodal");

        d_x  = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * subIntervals, &h_x[0]);
        d_fx  = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * subIntervals, &h_fx[0]);
	
        d_intervals  = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(double) * subIntervals);
	
        cl::Kernel ko_trapeziodal(programTrapeziodal, "trapeziodal");
        ::size_t local_trapeziodal = ko_trapeziodal.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device);

// Error occurs here
        trapeziodal(cl::EnqueueArgs(queue, cl::NDRange(subIntervals), cl::NDRange(local_trapeziodal)), d_x, d_fx, d_intervals, subIntervals);
	
        queue.enqueueReadBuffer(d_intervals, CL_TRUE, 0, sizeof(float) * subIntervals, &h_intervals[0]);

    }
    catch(cl::Error err) 
    {
        std::cerr << "The following Exceptions have occurred : " << err.what() << "(" << err.err() << ")" << std::endl;
    }
 

That error (-54 is CL_INVALID_WORK_GROUP_SIZE) typically occurs when the global work size is not evenly divisible by the local work size. Your global work size looks like it is 20, which is pretty small. But if you are querying CL_KERNEL_WORK_GROUP_SIZE for your kernel, that value is the device's maximum work-group size (often 128 or 256), and I'd bet it doesn't evenly divide 20. Either pass cl::NullRange for the local size and let the runtime choose one, or pick a local size that divides 20 exactly.