Possible Memory leak in nVidia driver

Hi,

I think I found a memory leak in nvidia driver (tested on 310.90 windows X64) with OpenCL 1.1.

The memory leak appears when you call the function clEnqueueReadBuffer with asynchronous parameters.

Here is a usage:


err = clEnqueueReadBuffer(cq, d_c, CL_FALSE, 0, NB_DATA*sizeof(float), c, 0, NULL, &evt);

and here is a code sample to illustrate the problem:


#include "CL/cl.h"

#include <sstream>
#include <iostream>
#include <vector>

#define TEST_SUCCESS(err) do{ if(err != CL_SUCCESS){ std::cout << "Line : " << __LINE__ << " - Error : " << err << std::endl; system("PAUSE"); return -1; } } while(0)

#define DEVICE_INDEX 1
#define NB_DATA 1024

int main(void)
{
  cl_int err;

  // Getting devices
  cl_uint nbPlatforms;

  err = clGetPlatformIDs(0, NULL, &nbPlatforms);
  TEST_SUCCESS(err);

  if (nbPlatforms == 0)
  {
    std::cout << "No platforms" << std::endl;
    return 0;
  }

  cl_platform_id* platformId = new cl_platform_id[nbPlatforms];
  err = clGetPlatformIDs(nbPlatforms, platformId, NULL);
  TEST_SUCCESS(err);

  typedef std::pair<cl_platform_id, cl_device_id> DevicePairType;
  std::vector<DevicePairType> deviceId;
  for (cl_uint i=0; i<nbPlatforms; i++)
  {
    cl_uint nbDevices;
    err = clGetDeviceIDs(platformId[i], CL_DEVICE_TYPE_ALL, 0, NULL, &nbDevices);
    TEST_SUCCESS(err);

    if(nbDevices == 0)
    {
      std::cout << "No Device in  platform : " << i << std::endl;
    }
    else
    {
      cl_device_id* did = new cl_device_id[nbDevices];
      err = clGetDeviceIDs(platformId[i], CL_DEVICE_TYPE_ALL, nbDevices, did, NULL);
      TEST_SUCCESS(err);

      for (cl_uint j = 0; j<nbDevices; j++)
      {
        cl_platform_id pl = platformId[i];
        cl_device_id de = did[j];

        DevicePairType tmp(pl, de);
        deviceId.push_back(tmp);
      }

      delete[] did;
    }
  }

  // Selected device 
  cl_platform_id platform = deviceId[DEVICE_INDEX].first;
  cl_device_id device = deviceId[DEVICE_INDEX].second;

  size_t nameSize;
  err = clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &nameSize);
  TEST_SUCCESS(err);

  char* name = new char[nameSize];
  err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(char)*nameSize, (void*)name, NULL);
  TEST_SUCCESS(err);

  std::cout << "Device name : " << name << std::endl;

  // Creating context etc...
  cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
  cl_context context = clCreateContext(props, 1, &device, NULL, NULL, &err);
  TEST_SUCCESS(err);

  cl_command_queue cq = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
  TEST_SUCCESS(err);

  // Creating kernel
  std::ostringstream source;
  source << " __kernel void sum( __global const float* a, __global const float* b, __global float* c)" << std::endl;
  source << "{" << std::endl;
  source << "  unsigned int tid = get_global_id(0);" << std::endl;
  source << "  c[tid] = a[tid] + b[tid];" << std::endl;
  source << "}" << std::endl;

  std::string str = source.str();

  const char* src = str.c_str();

  size_t srcSize = str.size();

  std::cout << src << std::endl;
  cl_program program = clCreateProgramWithSource(context, 1, &(src), &srcSize, &err);
  TEST_SUCCESS(err);

  err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
  if( err != CL_SUCCESS)
  {
    std::string blog;

    // Get size of data to get
    size_t logSize;
    err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    TEST_SUCCESS(err);

    char* lblog = new char[logSize];

    // Get build status
    err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(char)*logSize, (void*)lblog, NULL);
    TEST_SUCCESS(err);

    blog = std::string(lblog);

    std::cout << "Log : " << std::endl << blog << std::endl;

    delete[] lblog;
    
    system("PAUSE");

    return -1;
  }


  cl_kernel zeKernel = clCreateKernel(program, "sum", &err);
  TEST_SUCCESS(err);

  // Memory
  float* a = new float[NB_DATA];
  float* b = new float[NB_DATA];
  float* c = new float[NB_DATA];

  cl_mem d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, NB_DATA*sizeof(float), (void*)a, &err);
  TEST_SUCCESS(err);
  
  cl_mem d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, NB_DATA*sizeof(float), (void*)b, &err);
  TEST_SUCCESS(err);

  cl_mem d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, NB_DATA*sizeof(float), NULL, &err);
  TEST_SUCCESS(err);

  // Running kernel
  size_t lws[1] = {128};
  size_t gws[1] = {NB_DATA};

  cl_event evt;

  for (unsigned int i=0; i<10000; i++)
  {
    err = clSetKernelArg(zeKernel, 0, sizeof(cl_mem), &d_a);
    TEST_SUCCESS(err);
    err = clSetKernelArg(zeKernel, 1, sizeof(cl_mem), &d_b);
    TEST_SUCCESS(err);
    err = clSetKernelArg(zeKernel, 2, sizeof(cl_mem), &d_c);
    TEST_SUCCESS(err);

    err = clEnqueueNDRangeKernel(cq, zeKernel, 1, NULL, gws, lws, 0, NULL, NULL);
    TEST_SUCCESS(err);

    err = clEnqueueReadBuffer(cq, d_c, CL_FALSE, 0, NB_DATA*sizeof(float), c, 0, NULL, &evt);
    TEST_SUCCESS(err);

    clWaitForEvents(1, &evt);
  }

  // Clean
  clReleaseMemObject(d_a);
  clReleaseMemObject(d_b);
  clReleaseMemObject(d_c);

  delete[] a;
  delete[] b;
  delete[] c;

  clReleaseKernel(zeKernel);
  clReleaseProgram(program);
  clFinish(cq);
  clReleaseCommandQueue(cq);
  clReleaseContext(context);

  delete[] platformId;
  delete[] name;
}

When you run this program, the process allocate more and more memory. The problem doesn’t appear with Intel and AMD (GPU) drivers.

Am I right? How can I tell it to NVidia, because their forum doesn’t work!

Thanks.

I forgot to tell that if you replace the line:

err = clEnqueueReadBuffer(cq, d_c, CL_FALSE, 0, NB_DATA*sizeof(float), c, 0, NULL, &evt);

by the line

err = clEnqueueReadBuffer(cq, d_c, CL_TRUE, 0, NB_DATA*sizeof(float), c, 0, NULL, NULL);

The problem no longer appears even on with nvidia driver.

Be careful with the device choice, please select the right “DEVICE_INDEX value”.

Ok, I found a new thing. If I add a clReleaseEvent just after the clWaitForEvents. The memory leak no longer appears.


    err = clEnqueueNDRangeKernel(cq, zeKernel, 1, NULL, gws, lws, 0, NULL, NULL);
    TEST_SUCCESS(err);

    err = clEnqueueReadBuffer(cq, d_c, CL_FALSE, 0, NB_DATA*sizeof(float), c, 0, NULL, &evt);
    TEST_SUCCESS(err);

    err = clWaitForEvents(1, &evt);
    TEST_SUCCESS(err);

    err = clReleaseEvent(evt);
    TEST_SUCCESS(err);

But I don’t understand. Why can’t we use the same cl_event without releasing it?

Ok I think the problem is solved.

Thanks.

Would be great if you can tell us where the problem was.

I think nVidia has choosen to implement each event usage without controlling if the event is allready allocated.

So when you call clEnqueueReadBuffer( […], &evt);

I believe the driver allocates the event systematically even if it has already been allocated.

While Intel and AMD control that the event is not NULL before its allocation and use the same pointer if it already exists.

So maybe it is a problem or maybe it’s a choice. I don’t know if OpenCL specifications constrain this point.

I submited a bug report to nvidia, I’m waiting for the answer.

Coincidentally, I stumbled upon similar “pseudo leak” problems. However, I don’t think you can blame NVIDIA on that in this case.

What you pass on to clEnqueueNDRangeKernel() is not an event object but merely a location. For example

cl_event event = NULL;

clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &size, NULL, 0, NULL, &event);
printf ("event = %p
", event);
clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &size, NULL, 0, NULL, &event);
printf ("event = %p
", event);

will return two distinct pointers. And the standard is also very clear on that: “event Returns an event object that identifies this particular kernel execution instance.”. It would be wrong for clEnqueueNDRangeKernel() to release an existing event that is passed for two reasons: 1) semantically, the function is used to enqueue kernels not release events and 2) it would be disastrous if I released an event a second time that has been released already.

I agree with you, and this is what I was told NVidia:

Our developer identifies the observed behavior is valid as per OpenCL 1.1 spec, revision 44, section 5.9:
“The function cl_int clRetainEvent (cl_event event) increments the event reference count. The OpenCL commands that return an event perform an implicit retain.”
Developers are responsible for releasing event objects returned by asynchronous enqueue calls when the events are no longer needed. So adding a call to clReleaseEvent in the loop is the correct solution.

So does it mean only nvidia correctly implements the standard?

Thanks.

I just did a test on an AMD machine and I get different event pointers each time. However, they also re-use the memory (seeing the same memory location) if I release the first event.