Expected result?

Hi all, I’m testing the following code on my Nvidia GPU (GTX 750 Ti with the latest CUDA 8.0 / OpenCL driver and library) and don’t quite understand why it’s not working as expected.
The same code seems to work fine with OpenCL on a Mac CPU. Any comments? Thanks in advance.


#include <iostream>
#include <CL/cl.h>
#include <assert.h>
#include <string.h>

// OpenCL platform/device discovery results.
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices = 0;
cl_uint ret_num_platforms = 0;
cl_device_id device_id = NULL;

// Execution state created in main().
cl_context context = NULL;
cl_command_queue command_queue = NULL;

// Buffer created with CL_MEM_ALLOC_HOST_PTR and the host pointer obtained by mapping it.
cl_mem gpu_cache_ptr = NULL;
void *cpu_ptr = NULL;

// Type-erased alias of the second buffer handle created in main().
void *gpu_ptr = NULL;

int main() {
    cl_int ret;

    // setup
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    assert(ret == CL_SUCCESS);

    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
    assert(ret == CL_SUCCESS);

    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    assert(ret == CL_SUCCESS);

    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
    assert(ret == CL_SUCCESS);

    // create mapped buffer
    size_t size = 10;
    gpu_cache_ptr = clCreateBuffer(context, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR, size, NULL, &ret);
    assert(ret == CL_SUCCESS);

    cpu_ptr = clEnqueueMapBuffer(command_queue, gpu_cache_ptr, CL_TRUE, CL_MAP_READ|CL_MAP_WRITE, 0, size, 0, NULL, NULL, &ret);
    assert(ret == CL_SUCCESS);
    memset(cpu_ptr, 0, size);
    
    // fill data
    memset(cpu_ptr, 'h', 1);
    memset(cpu_ptr+1, 'e', 1);
    memset(cpu_ptr+2, 'l', 1);
    memset(cpu_ptr+3, 'l', 1);
    memset(cpu_ptr+4, 'o', 1);
    memset(cpu_ptr+5, '\0', 1);
    std::cout << "cpu_ptr=" << (char*)cpu_ptr << std::endl;
    
    // create another gpu mem
    cl_mem gpu_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &ret);
    assert(ret == CL_SUCCESS);
    gpu_ptr = (void *) gpu_mem;

    // copy data to this gpu mem
    ret = clEnqueueCopyBuffer(command_queue, (cl_mem) gpu_cache_ptr, (cl_mem) gpu_ptr, 0, 0, size, 0, NULL, NULL);
    assert(ret == CL_SUCCESS);

    clFinish(command_queue);

    char* recovered_value = new char[size];
    
    // copy back to cpu buffer
    ret = clEnqueueReadBuffer(command_queue, (cl_mem) gpu_ptr, CL_TRUE, 0, size, recovered_value, 0, NULL, NULL); 
    assert(ret == CL_SUCCESS);

    std::cout << "recovered_value=" << recovered_value << std::endl;

    delete[] recovered_value;
}


Since you’re copying from one buffer to another, you have to “signal” to the OpenCL runtime that the content of the source buffer has changed. This is done by unmapping the source buffer (with clEnqueueUnmapMemObject) before copying.

If, as I suspect, you instead want to use pinned memory, you have to use clEnqueueWriteBuffer(command_queue, gpu_mem, CL_TRUE, 0, size, cpu_ptr, 0, NULL, NULL) instead of clEnqueueCopyBuffer. Then there is no need for unmapping the source buffer.