Hi,
I think I have found a memory leak in the NVIDIA driver (tested with version 310.90 on Windows x64) with OpenCL 1.1.
The memory leak appears when you call the function clEnqueueReadBuffer asynchronously, i.e. as a non-blocking read (CL_FALSE) that returns an event.
Here is a usage:
err = clEnqueueReadBuffer(cq, d_c, CL_FALSE, 0, NB_DATA*sizeof(float), c, 0, NULL, &evt);
and here is a code sample to illustrate the problem:
#include "CL/cl.h"
#include <sstream>
#include <iostream>
#include <vector>
// Checks an OpenCL status code. If it differs from CL_SUCCESS, prints the line
// number of the call site together with the error code, runs system("PAUSE")
// (presumably to keep a Windows console window open - confirm on other
// platforms), and makes the ENCLOSING function return -1. Because the expansion
// contains a `return -1;`, it can only be used inside functions returning an
// integer. The argument is evaluated twice (once in the comparison, once when
// printed), so pass a plain variable rather than a function call.
#define TEST_SUCCESS(err) do{ if(err != CL_SUCCESS){ std::cout << "Line : " << __LINE__ << " - Error : " << err << std::endl; system("PAUSE"); return -1; } } while(0)
// Zero-based index of the device to use, counted over the flattened list of
// (platform, device) pairs that main() collects from all platforms.
#define DEVICE_INDEX 1
// Number of float elements in each host/device buffer; also used as the
// global work size when the kernel is enqueued.
#define NB_DATA 1024
int main(void)
{
cl_int err;
// Getting devices
cl_uint nbPlatforms;
err = clGetPlatformIDs(0, NULL, &nbPlatforms);
TEST_SUCCESS(err);
if (nbPlatforms == 0)
{
std::cout << "No platforms" << std::endl;
return 0;
}
cl_platform_id* platformId = new cl_platform_id[nbPlatforms];
err = clGetPlatformIDs(nbPlatforms, platformId, NULL);
TEST_SUCCESS(err);
typedef std::pair<cl_platform_id, cl_device_id> DevicePairType;
std::vector<DevicePairType> deviceId;
for (cl_uint i=0; i<nbPlatforms; i++)
{
cl_uint nbDevices;
err = clGetDeviceIDs(platformId[i], CL_DEVICE_TYPE_ALL, 0, NULL, &nbDevices);
TEST_SUCCESS(err);
if(nbDevices == 0)
{
std::cout << "No Device in platform : " << i << std::endl;
}
else
{
cl_device_id* did = new cl_device_id[nbDevices];
err = clGetDeviceIDs(platformId[i], CL_DEVICE_TYPE_ALL, nbDevices, did, NULL);
TEST_SUCCESS(err);
for (cl_uint j = 0; j<nbDevices; j++)
{
cl_platform_id pl = platformId[i];
cl_device_id de = did[j];
DevicePairType tmp(pl, de);
deviceId.push_back(tmp);
}
delete[] did;
}
}
// Selected device
cl_platform_id platform = deviceId[DEVICE_INDEX].first;
cl_device_id device = deviceId[DEVICE_INDEX].second;
size_t nameSize;
err = clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &nameSize);
TEST_SUCCESS(err);
char* name = new char[nameSize];
err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(char)*nameSize, (void*)name, NULL);
TEST_SUCCESS(err);
std::cout << "Device name : " << name << std::endl;
// Creating context etc...
cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
cl_context context = clCreateContext(props, 1, &device, NULL, NULL, &err);
TEST_SUCCESS(err);
cl_command_queue cq = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
TEST_SUCCESS(err);
// Creating kernel
std::ostringstream source;
source << " __kernel void sum( __global const float* a, __global const float* b, __global float* c)" << std::endl;
source << "{" << std::endl;
source << " unsigned int tid = get_global_id(0);" << std::endl;
source << " c[tid] = a[tid] + b[tid];" << std::endl;
source << "}" << std::endl;
std::string str = source.str();
const char* src = str.c_str();
size_t srcSize = str.size();
std::cout << src << std::endl;
cl_program program = clCreateProgramWithSource(context, 1, &(src), &srcSize, &err);
TEST_SUCCESS(err);
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
if( err != CL_SUCCESS)
{
std::string blog;
// Get size of data to get
size_t logSize;
err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
TEST_SUCCESS(err);
char* lblog = new char[logSize];
// Get build status
err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(char)*logSize, (void*)lblog, NULL);
TEST_SUCCESS(err);
blog = std::string(lblog);
std::cout << "Log : " << std::endl << blog << std::endl;
delete[] lblog;
system("PAUSE");
return -1;
}
cl_kernel zeKernel = clCreateKernel(program, "sum", &err);
TEST_SUCCESS(err);
// Memory
float* a = new float[NB_DATA];
float* b = new float[NB_DATA];
float* c = new float[NB_DATA];
cl_mem d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, NB_DATA*sizeof(float), (void*)a, &err);
TEST_SUCCESS(err);
cl_mem d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, NB_DATA*sizeof(float), (void*)b, &err);
TEST_SUCCESS(err);
cl_mem d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, NB_DATA*sizeof(float), NULL, &err);
TEST_SUCCESS(err);
// Running kernel
size_t lws[1] = {128};
size_t gws[1] = {NB_DATA};
cl_event evt;
for (unsigned int i=0; i<10000; i++)
{
err = clSetKernelArg(zeKernel, 0, sizeof(cl_mem), &d_a);
TEST_SUCCESS(err);
err = clSetKernelArg(zeKernel, 1, sizeof(cl_mem), &d_b);
TEST_SUCCESS(err);
err = clSetKernelArg(zeKernel, 2, sizeof(cl_mem), &d_c);
TEST_SUCCESS(err);
err = clEnqueueNDRangeKernel(cq, zeKernel, 1, NULL, gws, lws, 0, NULL, NULL);
TEST_SUCCESS(err);
err = clEnqueueReadBuffer(cq, d_c, CL_FALSE, 0, NB_DATA*sizeof(float), c, 0, NULL, &evt);
TEST_SUCCESS(err);
clWaitForEvents(1, &evt);
}
// Clean
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
delete[] a;
delete[] b;
delete[] c;
clReleaseKernel(zeKernel);
clReleaseProgram(program);
clFinish(cq);
clReleaseCommandQueue(cq);
clReleaseContext(context);
delete[] platformId;
delete[] name;
}
When you run this program, the process allocates more and more memory. The problem doesn’t appear with the Intel and AMD (GPU) drivers.
Am I right? How can I report this to NVIDIA, given that their forum doesn’t work?
Thanks.