OpenCL doesn't work with multiple host threads?

I wrote an OpenCL program that runs on CPUs, so the host and the device are essentially the same. When I run my program using one CPU thread, everything works fine, but when I use more than one thread to call into the device (one thread creates the context, another builds the program), it dies at clBuildProgram.

I use AMD SDK v2.5, which provides OpenCL 1.1. Khronos claims that OpenCL 1.1 has host-thread safety, enabling OpenCL commands to be enqueued from multiple host threads. What can I do to make this work? Please help.

I use CUDA OpenCL on another machine with a GPU device, and it works fine with multiple host threads.

The following is my code.

Single-thread version (works):


/* Program-wide OpenCL context; main() assigns it from clCreateContext().
 * Despite the "GPU" in the name, main() requests a CL_DEVICE_TYPE_CPU device. */
cl_context cxGPUContext;

/*
 * OpenCL C source passed to clCreateProgramWithSource().
 *
 * It defines one kernel, kernel_main: each work-item whose two global ids
 * fall inside [dim_d0_begin, dim_d0_end) x [dim_d1_begin, dim_d1_end) copies
 * one float from _region_x to _region__pb_rv.
 *
 * Every source line ends in an explicit "\n" escape. The newlines are
 * required: the leading "//" comment, the #pragma and the #define must each
 * sit on their own line, otherwise they would swallow or merge with the
 * code that follows. A raw line break inside a string literal is not valid.
 */
const char *ProgramSource =
    "// :::: main\n"
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
    "__kernel void kernel_main( __global float* _region__pb_rv, __global float* _region_x, int h, int w, int dim_d0_begin, int dim_d0_end, int dim_d1_begin, int dim_d1_end, int dim__pb_rv_d0, int dim_x_d0 ) {\n"
    "unsigned int _r3_x = get_global_id( 0 );\n"
    "unsigned int _r3_y = get_global_id( 1 );\n"
    "int i = _r3_x;\n"
    "int j = _r3_y;\n"
    "if( _r3_x >= dim_d0_begin && _r3_x < dim_d0_end && _r3_y >= dim_d1_begin && _r3_y < dim_d1_end ) {\n"
    "unsigned int idx__pb_rv = (_r3_x+(dim__pb_rv_d0*(_r3_y+0)));\n"
    "unsigned int idx_x = (_r3_x+(dim_x_d0*(_r3_y+0)));\n"
    "#define RETURN(x) _region__pb_rv[idx__pb_rv] = x; return\n"
    "RETURN ( _region_x [ idx_x ] );\n"
    "}\n"
    "}\n"
    "";

int main(int argc, const char** argv)
{
    cl_platform_id cpPlatform = NULL;
    cl_uint ciDeviceCount = 0;
    cl_device_id device_id;
    cl_int ciErrNum = CL_SUCCESS;
    cl_uint numPlatforms;

    ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(ciErrNum != CL_SUCCESS)
    {
        printf("Error: Failed to num platforms!
");
        return ciErrNum;
    }

    if (0 < numPlatforms) 
    {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if(ciErrNum != CL_SUCCESS)
        {
          printf("Error: Failed to get platform id!
");
          return ciErrNum;
        }
            for (unsigned i = 0; i < numPlatforms; ++i) 
            {
                char pbuf[100];
                ciErrNum = clGetPlatformInfo(platforms[i],
                                           CL_PLATFORM_VENDOR,
                                           sizeof(pbuf),
                                           pbuf,
                                           NULL);

                if(ciErrNum != CL_SUCCESS)
                {
                  printf("Error: Failed to get platform info!
");
                  return ciErrNum;
                }

                cpPlatform = platforms[i];
                if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) 
                {
                    printf("platform %d
", i);
                    break;
                }
            }
        delete[] platforms;
    }

    if(NULL == cpPlatform)
    {
        printf("NULL platform found so Exiting Application.
");
        return 0;
    }

    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 0, NULL, &ciDeviceCount);    
    if (ciErrNum != CL_SUCCESS)
    {
        printf("Error: Failed to get num devices!
");
        return ciErrNum;
    }

    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        printf("Error: Failed to get device id!
");
        return ciErrNum;
    }

    cxGPUContext = clCreateContext(0, 1, &device_id, NULL, NULL, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        printf("Error: Failed to create OpenCL context!
");
        return ciErrNum;
    }


    cl_program cpProgram = clCreateProgramWithSource(cxGPUContext,1, (const char **) &ProgramSource, NULL, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        printf("Error: Failed to create program
");
    }
    printf("clCreateProgramWithSource_main = %d success!
", cpProgram);

    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        printf("Error: Failed to build program
");
    }
    printf("clBuildProgram_main = %d success!
", cpProgram);

    return 0;
}

Two-thread version (fails):


/* OpenCL context shared between threads: main() assigns it from
 * clCreateContext() and mainLoop() reads it when creating the program. */
cl_context cxGPUContext;

/* Creates and builds the OpenCL program from ProgramSource; defined after main(). */
void mainLoop();

/*
 * Thread start routine handed to pthread_create() by main().
 * Ignores its argument, runs mainLoop() on the new thread and returns NULL.
 * The try/catch around the call is currently commented out.
 */
extern "C" void *startGpuManager(void* /*arg*/) {
  //try {
    mainLoop();
  //}catch(petabricks::DynamicScheduler::CleanExitException e){}
  return NULL;
}

/*
 * OpenCL C source passed to clCreateProgramWithSource().
 *
 * It defines one kernel, kernel_main: each work-item whose two global ids
 * fall inside [dim_d0_begin, dim_d0_end) x [dim_d1_begin, dim_d1_end) copies
 * one float from _region_x to _region__pb_rv.
 *
 * Every source line ends in an explicit "\n" escape. The newlines are
 * required: the leading "//" comment, the #pragma and the #define must each
 * sit on their own line, otherwise they would swallow or merge with the
 * code that follows. A raw line break inside a string literal is not valid.
 */
const char *ProgramSource =
    "// :::: main\n"
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
    "__kernel void kernel_main( __global float* _region__pb_rv, __global float* _region_x, int h, int w, int dim_d0_begin, int dim_d0_end, int dim_d1_begin, int dim_d1_end, int dim__pb_rv_d0, int dim_x_d0 ) {\n"
    "unsigned int _r3_x = get_global_id( 0 );\n"
    "unsigned int _r3_y = get_global_id( 1 );\n"
    "int i = _r3_x;\n"
    "int j = _r3_y;\n"
    "if( _r3_x >= dim_d0_begin && _r3_x < dim_d0_end && _r3_y >= dim_d1_begin && _r3_y < dim_d1_end ) {\n"
    "unsigned int idx__pb_rv = (_r3_x+(dim__pb_rv_d0*(_r3_y+0)));\n"
    "unsigned int idx_x = (_r3_x+(dim_x_d0*(_r3_y+0)));\n"
    "#define RETURN(x) _region__pb_rv[idx__pb_rv] = x; return\n"
    "RETURN ( _region_x [ idx_x ] );\n"
    "}\n"
    "}\n"
    "";

int main(int argc, const char** argv)
{
    cl_platform_id cpPlatform = NULL;
    cl_uint ciDeviceCount = 0;
    cl_device_id device_id;
    cl_int ciErrNum = CL_SUCCESS;
    cl_uint numPlatforms;

    ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(ciErrNum != CL_SUCCESS)
    {
        printf("Error: Failed to num platforms!
");
        return ciErrNum;
    }
    printf("num platforms = %d
", numPlatforms);
    if (0 < numPlatforms) 
    {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if(ciErrNum != CL_SUCCESS)
        {
          printf("Error: Failed to get platform id!
");
          return ciErrNum;
        }
            for (unsigned i = 0; i < numPlatforms; ++i) 
            {
                char pbuf[100];
                ciErrNum = clGetPlatformInfo(platforms[i],
                                           CL_PLATFORM_VENDOR,
                                           sizeof(pbuf),
                                           pbuf,
                                           NULL);

                if(ciErrNum != CL_SUCCESS)
                {
                  printf("Error: Failed to get platform info!
");
                  return ciErrNum;
                }

                cpPlatform = platforms[i];
                if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) 
                {
                    printf("platform %d
", i);
                    break;
                }
            }
        delete[] platforms;
    }

    if(NULL == cpPlatform)
    {
        printf("NULL platform found so Exiting Application.
");
        return 0;
    }

    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 0, NULL, &ciDeviceCount);    
    if (ciErrNum != CL_SUCCESS)
    {
        printf("Error: Failed to get num devices!
");
        return ciErrNum;
    }
    printf("num devices = %d
", ciDeviceCount);
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        printf("Error: Failed to get device id!
");
        return ciErrNum;
    }

    cxGPUContext = clCreateContext(0, 1, &device_id, NULL, NULL, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        printf("Error: Failed to create OpenCL context!
");
        return ciErrNum;
    }

    pthread_t _thread;
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, 0);
    pthread_create(&_thread, &attr, startGpuManager, NULL);
    pthread_attr_destroy(&attr);


    return 0;
}

/*
 * Worker-thread body: creates an OpenCL program from ProgramSource in the
 * shared context cxGPUContext and builds it with "-cl-fast-relaxed-math".
 *
 * Progress and failures are reported on stdout. The function returns as
 * soon as a step fails, so a failed program creation is never followed by a
 * build attempt or a "success" message.
 */
void mainLoop() {
    cl_int ciErrNum = CL_SUCCESS;

    // Program Setup
    cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **) &ProgramSource, NULL, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        printf("Error: Failed to create program\n");
        return;
    }
    // cl_program is an opaque handle, so print it with %p rather than %d.
    printf("clCreateProgramWithSource = %p success!\n", (void *) cpProgram);

    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        printf("Error: Failed to build program\n");
        return;
    }
    printf("clBuildProgram = %p success!\n", (void *) cpProgram);
}

What steps do you take to ensure that cxGPUContext is initialized by the time mainLoop() starts to run? Is it possible that cxGPUContext is uninitialized when the second thread calls clBuildProgram()?

I figured it out. There is actually an incompatibility between the library that my group implemented and the AMD/Intel SDKs. We suspect that they use new and malloc / delete and free interchangeably. Thank you for the reply anyway!