Hi,
I am trying out OpenCL on my NVIDIA GTX 660. It looks great, but I am having an issue with CL_MEM_USE_HOST_PTR buffers. I use this flag on all my buffers: it works for the input buffers, but the output data written by my kernel never shows up in host RAM.
I tried calling clEnqueueReadBuffer() after the kernel execution and that works, so what’s going on? Does this mean that the buffer actually lives in GPU RAM? If so, why?
Here are my relevant functions.
// Loads an OpenCL program from `file`, builds it, and creates the kernel
// whose name is the same string as `file`.
//
// file: passed both to LoadSource() as the source location and to
//       clCreateKernel() as the kernel function name.
// size: stored in currentSize.
//
// Also resets currentArgCount to 0 so that following AddParam() calls start
// binding at kernel argument 0. Every OpenCL status is handed to CHK().
void OCLInterface::LoadKernel(string file, size_t size)
{
    size_t sourceLength;
    char* sourceText = LoadSource(file.c_str(), &sourceLength);

    // clCreateProgramWithSource takes an array of strings; we pass exactly one.
    const char* sourceList[] = { sourceText };
    program = clCreateProgramWithSource(context, 1, sourceList, &sourceLength, &err);

    // The source text is released before the status is checked, so it is
    // freed regardless of what CHK() does.
    free(sourceText);
    CHK();

    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    CHK();

    kernel = clCreateKernel(program, file.c_str(), &err);
    CHK();

    currentArgCount = 0;
    currentSize = size;
}
void OCLInterface::AddParam(float* data, bool bInput)
{
cl_mem buffer = clCreateBuffer(
context,
(bInput ? CL_MEM_READ_ONLY:CL_MEM_WRITE_ONLY) | CL_MEM_USE_HOST_PTR,
VSIZE * sizeof(float),
data,
&err
);
CHK();
err = clSetKernelArg(kernel, currentArgCount, sizeof(cl_mem), (void*)&buffer);
CHK();
buffers.push_back(buffer);
currentArgCount++;
}
void OCLInterface::Exec(int size)
{
size_t localsize = size;
size_t globalsize = ceil(VSIZE / (float)localsize) * localsize;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalsize, &localsize, 0, NULL, NULL);
CHK();
err = clFinish(queue);
CHK();
}
// Calling code: allocate three host arrays of VSIZE floats each
// (A and B are the inputs, C receives the result).
float* A = (float*)malloc(VSIZE * sizeof(float));
float* B = (float*)malloc(VSIZE * sizeof(float));
float* C = (float*)malloc(VSIZE * sizeof(float));
// snip : array init (a loop)
// "add" is used both as the source name given to LoadSource() and as the
// kernel function name; this call also resets the argument counter to 0.
openCL.LoadKernel("add", VSIZE);
// Kernel arguments 0 and 1: read-only buffers wrapping A and B in place
// (CL_MEM_USE_HOST_PTR).
openCL.AddParam(A, true);
openCL.AddParam(B, true);
// Kernel argument 2: write-only buffer wrapping C in place.
openCL.AddParam(C, false);
// Launch with a work-group size of 128; Exec() returns after clFinish().
openCL.Exec(128);
At this point, C[] still holds its initialization value (0 in my case, but that depends on the array init code).
And here is my kernel, which should perform a simple vector addition.
// Element-wise vector addition: output[i] = input1[i] + input2[i], where i is
// this work-item's global id in dimension 0.
//
// NOTE: there is no bounds check, so every global id that is launched must be
// a valid index into all three buffers.
__kernel void add(
    __global const float* input1,
    __global const float* input2,
    __global float* output)
{
    const unsigned int gid = get_global_id(0);
    const float sum = input1[gid] + input2[gid];
    output[gid] = sum;
}
Thanks!