How to properly create Read and Write Buffers:

i am currently doing a straight implementation of openCL in c++, meaning that I am not using the SDKs. I am experiencing a problem with the writebuffer and readbuffer that I cannot debug, and it has something to do with the amount of data that I am sending to the buffers.

Here is my code:

err = clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, sizeof(data)*dataSize, data, 0, NULL, NULL);
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &dataSize, NULL, 0, NULL, NULL);
err = clEnqueueCopyBuffer(queue, output, input, 0, 0, sizeof(data)*dataSize, 0, NULL, NULL);
err = clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(data)*dataSize, dataOut, 0, NULL, NULL );

As dataSize gets bigger (up into 3000), this code fails and I cannot get into NVCuda.dll to figure out why. Does anyone have any suggestions as to how to fix this code or as to how i can go about doing algorithms on up to a million data sets?

As dataSize gets bigger (up into 3000), this code fails

How does it fail? Do you get an error code? Wrong results? A crash? What does the kernel source code look like?

err = clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, sizeof(data)*dataSize, data, 0, NULL, NULL);

This line looks odd. Are you sure you meant to write “sizeof(data)”? Could you show us the declaration of variables “data” and “dataSize”?

Also, can you show us how you create the buffers “input” and “output”, and any calls to clSetKernelArg()?

The error that I get is Access Violation reading location blahblahblah and when I trace the callstack it sends me to nvcuda.dll.

I am thinking that I have gone wrong somewhere in my buffers, but maybe it is somewhere else.

Here is my code: Full source


#include <fcntl.h>
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <limits.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <CL/opencl.h>


const char *kernel_source =
"__kernel void simple(                                                   
"
"   global read_only int* input,                                         
"
"   global write_only int* output)									     
"
"{                                                                       
"
"   int index = get_global_id(0);                                        
"
"   output[index] = index;						         			     
"
"}                                                                       
";


// Storage for the arrays.
static cl_mem input;
static cl_mem output;
// OpenCL state
static cl_command_queue queue;
static cl_kernel kernel;
static cl_device_id device_ids;
static cl_context context;

static cl_platform_id platform_id;
static const size_t dataSize = 3500;

int main(int argc, char** argv)
{

	int* data = new int[dataSize];
	int* dataOut = new int[dataSize];

	for(int i = dataSize-1; i >= 0; --i)
	{
		data[i] = 0;
		dataOut[i] = 0;
	}

	cl_uint numPlatforms;
	cl_int err = CL_SUCCESS;
	size_t device_list_size;

	//set up the platforms
	err = clGetPlatformIDs(0, NULL, &numPlatforms);
	if(numPlatforms > 0)
	{
		//we have at least one
		cl_platform_id* platforms = new cl_platform_id[numPlatforms];
		err = clGetPlatformIDs(numPlatforms, platforms, NULL);
		platform_id = platforms[0];
		delete[] platforms;
	}

	cl_context_properties cps[3] = 
	{
		CL_CONTEXT_PLATFORM, 
		(cl_context_properties)platform_id, 
		0
    };

	context = clCreateContextFromType(
                  cps,
                  CL_DEVICE_TYPE_ALL,
                  NULL,
                  NULL,
                  &err);

	data[4] = 4;
	dataOut[4] = 5;
	err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_ids, NULL);

	queue = clCreateCommandQueue(context, device_ids, 0, &err);

	//now we set up our buffers
	input = clCreateBuffer(context,  CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,  sizeof(data)*dataSize, &data, &err);
	output = clCreateBuffer(context,  CL_MEM_WRITE_ONLY,  sizeof(data)*dataSize, NULL, &err);

	cl_program program = clCreateProgramWithSource(context, 1, &kernel_source , NULL, &err);
	//OPTIMIZATION OPTIONS FOUND AT http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clBuildProgram.html
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); // build and compile the OpenCL
	
	cl_kernel kernel = clCreateKernel(program, "simple", &err);
	
	clReleaseProgram(program); // no longer needed

	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);

	err = clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, sizeof(data)*dataSize, data, 0, NULL, NULL);

	err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &dataSize, NULL, 0, NULL, NULL);


	err = clEnqueueCopyBuffer(queue, output, input, 0, 0, sizeof(data)*dataSize, 0, NULL, NULL);

	err = clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(data)*dataSize, dataOut, 0, NULL, NULL );

	for(int i = dataSize - 1; i >= 0; --i)
	{
		std::cout << dataOut[i] << std::endl;
	}

	clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    return 0;
}

There are some errors in the code:


const char *kernel_source =
"__kernel void simple(                                                   
"
"   global read_only int* input,                                         
"
"   global write_only int* output)                                
"
"{                                                                       
"
"   int index = get_global_id(0);                                        
"
"   output[index] = index;                                         
"
"}                                                                       
";

read_only and write_only are qualifiers for images, not for pointers. This kernel should not compile successfully.

input = clCreateBuffer(context,  CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,  sizeof(data)*dataSize, &data, &err);

“sizeof(data)” means “sizeof(int*)”. That’s not what you want. What you want is “sizeof(cl_int)” since the type of the argument “input” in kernel “simple()” is “int”. Notice that in the host side you should always use data types with the prefix “cl_”, such as “cl_int” to ensure that they have the same bit representation as the kernel data types.

Also, you are passing “&data” as the host pointer, when you actually want to pass “data”. This is probably the reason your program is crashing.

Finally, notice that since you have create this buffer with the flag CL_MEM_USE_HOST_PTR, the implementation will attempt to reuse the memory allocated in “data” instead of allocating new memory. What that means is that since the kernel will take the buffer “input” as an array of CL int values, the variable “data” should be declared as having type “cl_int*” instead. That ensures that the size of the buffer will match on the host and the device.

output = clCreateBuffer(context,  CL_MEM_WRITE_ONLY,  sizeof(data)*dataSize, NULL, &err);

Same as before: it should be “sizeof(cl_int)”.

err = clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, sizeof(data)*dataSize, data, 0, NULL, NULL);

Please replace “sizeof(data)” with “sizeof(cl_int)”.

More importantly, this code is doing something very strange: it’s copying the contents of “data” into the buffer “input”. Why is this strange? Because “data” and “input” reference exactly the same memory. That’s what the CL_MEM_USE_HOST_PTR flag means.

err = clEnqueueCopyBuffer(queue, output, input, 0, 0, sizeof(data)*dataSize, 0, NULL, NULL);

I don’t understand the purpose of this. The kernel that was just executed wrote some values into “output” and now you are overwriting them with the values that were stored in “input” (i.e. zeroes). Is this what you intended?

Replace sizeof(data) with sizeof(cl_int).

err = clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(data)*dataSize, dataOut, 0, NULL, NULL );

Same drill: replace sizeof.

OK so I fixed the code samples that you gave me. Thank you for your help btw. But there are still problems.
I changed the kernel to use const instead of read_only and nothing for the write_only qualifier.
I changed all of my sizeof(data) to be sizeof(cl_int) and got rid of the *datasize;

I got rid of the copybuffer and the writebuffer lines, and kept the CL_MEM_USE_HOST_PTR. Now I am getting an error with the program? (-11, CL_BUILD_PROGRAM_FAILURE)


#include <fcntl.h>
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <limits.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <CL/opencl.h>


const char *kernel_source =
"__kernel void simple(                                                   
"
"   global int* input,                                                   
"
"   global int* output)													 
"
"{                                                                       
"
"   int index = get_global_id(0);                                        
"
"   output[index] = index;						         			     
"
"}                                                                       
";


// Storage for the arrays.
static cl_mem input;
static cl_mem output;
// OpenCL state
static cl_command_queue queue;
static cl_kernel kernel;
static cl_device_id device_ids;
static cl_context context;

static cl_platform_id platform_id;
static const size_t dataSize = 100;

int main(int argc, char** argv)
{

	cl_int* data = new cl_int[dataSize];
	cl_int* dataOut = new cl_int[dataSize];

	for(int i = dataSize-1; i >= 0; --i)
	{
		data[i] = 0;
		dataOut[i] = 0;
	}

	cl_uint numPlatforms;
	cl_int err = CL_SUCCESS;
	size_t device_list_size;

	//set up the platforms
	err = clGetPlatformIDs(0, NULL, &numPlatforms);
	if(numPlatforms > 0)
	{
		//we have at least one
		cl_platform_id* platforms = new cl_platform_id[numPlatforms];
		err = clGetPlatformIDs(numPlatforms, platforms, NULL);
		platform_id = platforms[0];
		delete[] platforms;
	}

	cl_context_properties cps[3] = 
	{
		CL_CONTEXT_PLATFORM, 
		(cl_context_properties)platform_id, 
		0
    };

	context = clCreateContextFromType(
                  cps,
                  CL_DEVICE_TYPE_ALL,
                  NULL,
                  NULL,
                  &err);

	data[4] = 4;
	dataOut[4] = 5;
	err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_ids, NULL);

	queue = clCreateCommandQueue(context, device_ids, 0, &err);

	//now we set up our buffers
	input = clCreateBuffer(context,  CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,  sizeof(cl_int), data, &err);
	output = clCreateBuffer(context,  CL_MEM_WRITE_ONLY,  sizeof(cl_int), NULL, &err);

	cl_program program = clCreateProgramWithSource(context, 1, &kernel_source , NULL, &err);
	//OPTIMIZATION OPTIONS FOUND AT http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clBuildProgram.html
	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); // build and compile the OpenCL

	cl_kernel kernel = clCreateKernel(program, "simple", &err);
	
	clReleaseProgram(program); // no longer needed

	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);

	err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &dataSize, NULL, 0, NULL,  NULL);
	
	err = clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_int), dataOut, 0, NULL,  NULL );

	for(int i = 0; i < 40; i++)
	{
		std::cout << dataOut[i] << std::endl;
	}

	clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    return 0;
}
 

and got rid of the *datasize

I certainly did not intend to suggest that. It is necessary.

Now I am getting an error with the program? (-11, CL_BUILD_PROGRAM_FAILURE)

You can query the kernel compilation errors with clGetProgramBuildInfo(…, CL_PROGRAM_BUILD_LOG, …).

Thank you for your help. I went back through the code and the changes that you provided, and apparently I changed too much last night. I recommitted and the changes work. Thank you so much for your help.