Wrong output values from GPU

Hello all!

I wrote a simple kernel to try OpenCL out, but the output values it produces look wrong to me. What did I do wrong?

Kernel code:


// OpenCL C source of the kernel: each work-item copies one float from
// `input` to `output` at the index given by its global id.
// Every source line ends with an explicit "\n" escape; adjacent literals are
// concatenated by the compiler into a single string.
const std::string sSrcKernel =
    "__kernel void kernelMain(__global float * input, __global float * output)\n"
    "{\n"
    "    int i = get_global_id(0);\n"
    "    output[i] = input[i];\n"
    "}\n";

Host code:


// Logs the name of an OpenCL call together with the status code it returned.
// This only reports the code to std::cout; it does not abort or branch on failure.
// The do/while(0) wrapper makes the expansion a single statement, so
// `CHECK_CL_ERROR(...);` is safe inside an unbraced if/else.
#define CHECK_CL_ERROR(funcName, error)                              \
    do {                                                             \
        std::cout << "Function name: " << (funcName) << "\n"         \
                  << "error: " << (error) << std::endl;              \
    } while (0)
// Forward declarations of the two stages that main() runs in order.
void init();
void calc();

// Global OpenCL state. init() assigns these; calc() reads sContext,
// sDeviceId and sMaxWorkGroupSize.
cl_platform_id  sPlatformId;   // first platform returned by clGetPlatformIDs
cl_context      sContext;      // GPU context created on sPlatformId
cl_device_id    sDeviceId;     // first device listed in sContext
size_t sMaxWorkGroupSize;      // CL_DEVICE_MAX_WORK_GROUP_SIZE as queried in init(); calc() uses it as the global work size

void init()
{
    cl_int error;
    cl_uint numPlatform = 0;
    clGetPlatformIDs(0, nullptr, &numPlatform);
    if(0 < numPlatform)
    {
        std::vector<cl_platform_id> platformsId;
        platformsId.resize(numPlatform);
        clGetPlatformIDs(numPlatform, &platformsId[0], nullptr);
        sPlatformId = platformsId[0];
        std::string buf;
        buf.resize(10240);
        std::string platformName;
        platformName.resize(10240);
        error = clGetPlatformInfo(sPlatformId, CL_PLATFORM_NAME, platformName.size(), &platformName[0], nullptr );
        std::cout << "Platform name: " << platformName << std::endl;
        cl_context_properties properties[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)sPlatformId, 0};
        cl_context context = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, nullptr, nullptr, &error);
        size_t numDevices = 0;
        clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, nullptr, &numDevices);
        size_t amount = numDevices / sizeof(cl_device_id);
        if(0 < amount)
        {
            std::vector<cl_device_id> allDeviceId;
            allDeviceId.resize(amount);
            clGetContextInfo(context, CL_CONTEXT_DEVICES, numDevices, &allDeviceId[0], nullptr);
            sDeviceId = allDeviceId[0];
            sContext = context;

            for(int i=0; i<allDeviceId.size(); i++)
            {
                clGetDeviceInfo(allDeviceId[i], CL_DEVICE_NAME, buf.size(), &buf[0], nullptr );
                std::cout << "Device name: " << buf << std::endl;
                clGetDeviceInfo(allDeviceId[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &sMaxWorkGroupSize, nullptr);
                std::cout << "max work group size: " << sMaxWorkGroupSize << std::endl;
            }
        }
    }
}

void calc()
{
    cl_int error = CL_SUCCESS;

    const char * srcChar = sSrcKernel.c_str();
    size_t srcSize = sSrcKernel.size();
    cl_program program = clCreateProgramWithSource(sContext, 1,
                                        (const char **)&srcChar,
                                        (const size_t *)&srcSize,
                                         &error);
    CHECK_CL_ERROR("clCreateProgramWithSource", error);

    size_t globalWorkSize = sMaxWorkGroupSize;
    //globalWorkSize = 256;

    // create input/output args
    std::vector<float> data(globalWorkSize);

    for(int i=0; i<data.size(); i++)
        data[i] = i+1;

    error = clBuildProgram(program, 1, &sDeviceId, nullptr, nullptr, nullptr);
    CHECK_CL_ERROR("clBuildProgram", error);

    cl_kernel kernel = clCreateKernel(program, "kernelMain", &error);
    CHECK_CL_ERROR("clCreateKernel", error);

    cl_mem input = clCreateBuffer(sContext, CL_MEM_READ_ONLY,  sizeof(float) * globalWorkSize, nullptr, nullptr);
    cl_mem output = clCreateBuffer(sContext, CL_MEM_WRITE_ONLY, sizeof(float) * globalWorkSize, nullptr, nullptr);

    cl_command_queue commandQueue = clCreateCommandQueue(sContext, sDeviceId, 0, &error);
    CHECK_CL_ERROR("clCreateCommandQueue", error);
    error = clEnqueueWriteBuffer(commandQueue, input, CL_TRUE, 0, sizeof(float) * globalWorkSize,(void*)&data[0], 0, nullptr, nullptr);
    CHECK_CL_ERROR("clEnqueueWriteBuffer", error);

    error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    CHECK_CL_ERROR("clSetKernelArg", error);
    error = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    CHECK_CL_ERROR("clSetKernelArg", error);

    size_t group;
    error = clGetKernelWorkGroupInfo(kernel, sDeviceId, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &group, nullptr);
    CHECK_CL_ERROR("clGetKernelWorkGroupInfo", error);
    std::cout << "Group: " << group << std::endl;

    size_t globalWork[1] = { globalWorkSize };
    error = clEnqueueNDRangeKernel(commandQueue, kernel, 1, nullptr, globalWork, nullptr, 0, nullptr, nullptr);
    CHECK_CL_ERROR("clEnqueueNDRangeKernel", error);

    clFinish(commandQueue);

    std::vector<float> result(globalWorkSize);
    clEnqueueReadBuffer(commandQueue, output, CL_TRUE, 0, result.size(), &result[0], 0, nullptr, nullptr);

    clReleaseKernel(kernel);
    clReleaseCommandQueue(commandQueue);
    clReleaseMemObject(input);
    clReleaseMemObject(output);

    clReleaseProgram(program);

    std::cout << "In data: " << std::endl;
    for(auto value : data)
    {
        std::cout << value << " ";
    }
    std::cout << std::endl;

    std::cout << "Out data:" << std::endl;
    for(auto value : result)
    {
        std::cout << value << " ";
    }
    std::cout << std::endl;

}

/// Program entry point: runs device selection, then the kernel test.
/// The command-line arguments are not used.
int main(int argc, char** argv)
{
    static_cast<void>(argc);
    static_cast<void>(argv);

    init();   // pick platform / device and fill the globals
    calc();   // build, run and print the copy kernel

    return 0;
}

Output:


Platform name: AMD Accelerated Parallel Processing
Device name: Cedar
max work group size: 128
Function name: clCreateProgramWithSource
error: 0
Function name: clBuildProgram
error: 0
Function name: clCreateKernel
error: 0
Function name: clCreateCommandQueue
error: 0
Function name: clEnqueueWriteBuffer
error: 0
Function name: clSetKernelArg
error: 0
Function name: clSetKernelArg
error: 0
Function name: clGetKernelWorkGroupInfo
error: 0
Function name: clEnqueueNDRangeKernel
error: 0
In data: 
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 
Out data:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 


I think the problem is in how the index is selected in the kernel, but I cannot understand why it happens.

Thank you in advance

Correct answer:
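Instead of


    std::vector<float> result(globalWorkSize);
    clEnqueueReadBuffer(commandQueue, output, CL_TRUE, 0, result.size(), &result[0], 0, nullptr, nullptr);


you need to write


    std::vector<float> result(globalWorkSize);
    clEnqueueReadBuffer(commandQueue, output, CL_TRUE, 0, sizeof(float)*result.size(), &result[0], 0, nullptr, nullptr);

The size argument of clEnqueueReadBuffer is a number of bytes, not a number of elements. Passing result.size() (128) copied back only 128 bytes, i.e. the first 32 floats, which is why the rest of the output stayed zero.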

That was my mistake ((( Thanks to everyone :slight_smile: