OpenCL: clEnqueueNDRangeKernel Failed: -54

Hi,

i wrote my first OpenCL Program with VS 2008 CLR Forms.
I always get a: “clEnqueueNDRangeKernel Failed: -54” Error and i dont know why.
Can someone take a look at my code?

Its a Visual Studio 2008 Project:

http://free.doublebackslash.net/Studium … OpenCL.rar

Click first on the button “Devices suchen + Infos lesen” and then on the button “Context + CommandQueues erstellen”

For all others without Visual Studio, here is the Code: (OpenCL.lib is linked!)



#include <utility>
#define __NO_STD_VECTOR
#define __NO_STD_STRING

#include <CL/cl.h>

#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <iterator>
#include <math.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>

	static char vendor_cpu[65536];
	static char vendor_gpu[65536];
	static std::size_t size;
	static cl_uint num_devices_returned;
	static cl_device_id devices[2];
	static cl_ulong platform;

	static cl_ulong frequ, max_par_units, addr_bits;
	static cl_int err = CL_SUCCESS;
	static cl_uint num_platforms;
    static cl_platform_id * platforms;
	static bool cpu = false;
	static bool gpu = false;
	static cl_kernel kernel;
	static cl_context context;
	static size_t local;
	const unsigned int cnBlockSize= 512;
    const unsigned int cnBlocks =3;
    static size_t cnDimension = cnBlocks * cnBlockSize;

    
	const char * kernelcode  = "__kernel void vectorAdd(              
"
							 "__global const float * a,             
"
							 "__global const float * b,             
"
							 "__global   float * c)                 
"
						 	 "{                                     
"
						     "   // Vector element index            
"
							 "   int nIndex = get_global_id(0);     
"
							 "   c[nIndex] = a[nIndex] + b[nIndex]; 
"
							 "}                                     
";

private: System::Void bt_getdeviceinfos_Click(System::Object^  sender, System::EventArgs^  e)
{
	this->rtb_log->AppendText("##Get Platform ID##
");
    err = clGetPlatformIDs(0, NULL, &num_platforms);
	platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
    clGetPlatformIDs(num_platforms, platforms, NULL);        // Get device IDs
    cl_platform_id platform_id = platforms[0];
	platform = (cl_ulong)platform_id;
	this->l_num_platform->Text = "Platform ID: " + platform;


	this->rtb_log->AppendText("##Get CPU Devices##
");
	err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_CPU, 1, &devices[0], &num_devices_returned);
	this->l_num_cpu->Text = "Anzahl der CPU-Devices: " + num_devices_returned.ToString();
	this->rtb_cpu->AppendText(num_devices_returned.ToString() + " CPU Device gefunden
");
	//cout << "Fehlercode: " << err << endl;
	if(err == 0) cpu = true;

	this->rtb_log->AppendText("##Get GPU Devices##
");
	err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 1, &devices[1], &num_devices_returned);
	this->l_num_gpu->Text = "Anzahl der GPU-Devices: " + num_devices_returned.ToString();
	this->rtb_gpu->AppendText(num_devices_returned.ToString() + " GPU Device gefunden
");
	if(err == 0) gpu = true;

	if(cpu == true)
	{
		this->rtb_log->AppendText("##Get CPU Device Infos##
");
		clGetDeviceInfo(devices[0], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(frequ), &frequ, &size);
		this->rtb_cpu->AppendText("Maximum clock frequency of the device in MHz: " + frequ.ToString() + "
");
		clGetDeviceInfo(devices[0], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_par_units), &max_par_units, &size);
		this->rtb_cpu->AppendText("The number of parallel compute cores on the OpenCL device: " + max_par_units.ToString() + "
");
		clGetDeviceInfo(devices[0], CL_DEVICE_NAME, sizeof(vendor_cpu), vendor_cpu, &size);
		this->rtb_cpu->AppendText("Vendor name: " + gcnew String(vendor_cpu) + "
");
		clGetDeviceInfo(devices[0], CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, &size);
		this->rtb_cpu->AppendText("CPU ADDRESS BITS: " + addr_bits.ToString() + "
");
	}

	if(gpu == true)
	{
		this->rtb_gpu->AppendText("##Get GPU Device Infos##
");
		clGetDeviceInfo(devices[1], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(frequ), &frequ, &size);
		this->rtb_gpu->AppendText("Maximum clock frequency of the device in MHz: " + frequ.ToString() + "
");
		clGetDeviceInfo(devices[1], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_par_units), &max_par_units, &size);
		this->rtb_gpu->AppendText("The number of parallel compute cores on the OpenCL device: " + max_par_units.ToString() + "
");
		clGetDeviceInfo(devices[1], CL_DEVICE_NAME, sizeof(vendor_gpu), vendor_gpu, &size);
		this->rtb_gpu->AppendText("Vendor name: " + gcnew String(vendor_gpu) + "
");
		clGetDeviceInfo(devices[1], CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, &size);
		this->rtb_gpu->AppendText("GPU ADDRESS BITS: " + addr_bits.ToString() + "
");
	}
}
private: System::Void bt_contextcq_Click(System::Object^  sender, System::EventArgs^  e)
{
	this->rtb_log->AppendText("##Create Context##
");
	
	if(cpu == true && gpu == true)
		context = clCreateContext(0, 2, devices, NULL, NULL, &err);
	if(cpu == true && gpu == false)
		context = clCreateContext(0, 1, devices, NULL, NULL, &err);
	if(gpu == true && cpu == false)
		context = clCreateContext(0, 1, devices, NULL, NULL, &err);
	
	this->rtb_log->AppendText("##Create CommandQueue's##
");
	cl_command_queue queue_cpu, queue_gpu;
	if(cpu == true)
		queue_cpu = clCreateCommandQueue(context, devices[0], 0, &err);
	if(gpu == true)
		queue_gpu = clCreateCommandQueue(context, devices[1], 0, &err);

	this->rtb_log->AppendText("##Create Program Codes for OpenCL##
");
	cl_program program;
	size_t kernelsize = strlen(kernelcode);
	program = clCreateProgramWithSource(context, 1, (const char**)&kernelcode, &kernelsize, &err);
	err = clBuildProgram(program, num_devices_returned, NULL, NULL, NULL, NULL);
	if(err != 0)
	{
		char log[1024] = "";
		err = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(log), log, NULL);
		MessageBox::Show(gcnew String(log));
	}

	this->rtb_log->AppendText("##Create Kernel Codes for OpenCL##
");
	kernel = clCreateKernel(program, "vectorAdd", &err);
	if(err != 0)
		MessageBox::Show("clCreateKernel: " + err.ToString());
	
   float * pA = new float[cnDimension];
   float * pB = new float[cnDimension];
   float * pC = new float[cnDimension];
   float * pC1 = new float[cnDimension];
   
   memset(pC, 0, cnDimension * sizeof(float));
   memset(pC1, 0, cnDimension * sizeof(float));

   // initialize host memory
   int i;
   for(i=0; i < cnDimension; i++)
   {
      pA[i] = pC[i] = pC1[i] = 0;
      pB[i] = i;
   }

   // allocate device memory
   cl_mem hDeviceMemA, hDeviceMemB, hDeviceMemC;
   hDeviceMemA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0);
   hDeviceMemB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pB, 0);
   hDeviceMemC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, cnDimension * sizeof(cl_float), 0, 0);

   // setup parameter values
   err = 0;
   err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&hDeviceMemA);
   err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&hDeviceMemB);
   err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&hDeviceMemC);
   if (err != CL_SUCCESS)
   {
	   MessageBox::Show("Error: Failed to set kernel args: " + err.ToString() + "
");
   }

   // Get the maximum work-group size for executing the kernel on the device
   err = clGetKernelWorkGroupInfo(kernel, devices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
   if (err != CL_SUCCESS)
   {
	   MessageBox::Show("Error: clGetKernelWorkGroupInfo Failed: " + err.ToString() + "
");
   }
   
   // execute kernel
   err = clEnqueueNDRangeKernel(queue_cpu, kernel, 1, NULL, (size_t*)(&cnDimension), &local, 0, NULL, NULL);
   if (err != CL_SUCCESS)
   {
	   MessageBox::Show("Error: clEnqueueNDRangeKernel Failed: " + err.ToString() + "
");
   }

   // copy results from device back to host
   clEnqueueReadBuffer(queue_cpu, hDeviceMemC, CL_TRUE, 0, cnDimension * sizeof(cl_float),
                  pC, 0, NULL, NULL);

   // wait for command queue
   clFinish(queue_cpu);
   for(i=0; i < cnDimension; i++)
   {
	   this->rtb_log->AppendText(pC[i].ToString() + "
");
   }

}
private: System::Void Form1_Load(System::Object^  sender, System::EventArgs^  e)
{

}
};
}



ps: i try to run it on the cpu, ATI stream drivers are installed.

i dont have a opencl ready graphics card, so i have to use the cpu.
is it possible that this problem only accours on the cpu?

i found the error:

err = clEnqueueNDRangeKernel(queue_cpu, kernel, 1, NULL, (size_t*)(&cnDimension), &local, 0, NULL, NULL);

i has to be:

err = clEnqueueNDRangeKernel(queue_cpu, kernel, 1, NULL, (size_t*)(&cnDimension), NULL, 0, NULL, NULL);

dont know why, but i works :smiley: