[OS X, AMD Radeon] Kernel execution freezes system

Hello,

I’m implementing a fairly simple N^2 potential integration algorithm with OpenCL. I’m running OS X 10.7 on a 2011 iMac with an AMD Radeon HD 6770M.

Launching the GPU threads occasionally freezes the entire OS GUI. When the system is locked up I can still SSH into it from a remote machine, but "kill -9" on the process has no effect, and a hard reboot is the only way to recover. Other times it doesn't freeze the whole system, but hangs at clEnqueueReadBuffer; again, kill has no effect. And sometimes it finishes with no problem.

Running the kernel on the CPU by switching to CL_DEVICE_TYPE_CPU in clGetDeviceIDs works fine. Just wondering if maybe I’m doing something obviously wrong (I know my kernel code is extremely inefficient and I’m calling clFinish way too much on the host). Here’s the host code:


//
//  main.cpp
//

#include <bfstream.h>   // custom binary file stream class (bifstream)
#include <cassert>
#include <cstdio>       // fopen, fread, fprintf
#include <iostream>
#include <OpenCL/opencl.h>
#include <limits>

cl_context g_context;
cl_kernel g_integrand_gpu_kernel;
cl_command_queue g_queue;

void pfn_notify(const char *errinfo, const void *private_info, size_t cb, void *user_data);
void initialize();
void run();
void destroy();

// ---------------------------------------------------------

void pfn_notify(const char *errinfo, const void *private_info, size_t cb, void *user_data)
{
   fprintf(stderr, "%s
", errinfo);
}

// ---------------------------------------------------------

void initialize()
{
   cl_int error = 0;
   
   // Platform
   cl_platform_id platform;
   error = clGetPlatformIDs(1, &platform, NULL );
   
   if (error != CL_SUCCESS) 
   {
      std::cout << "Error getting platform id: " << error << std::endl;
      exit(error);
   }
   
   // Device
   cl_device_id device;
   error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
   
   if (error != CL_SUCCESS) 
   {
      std::cout << "Error getting device ids: " << error << std::endl;
      exit(error);
   }
   
   // Context
   g_context = clCreateContext(0, 1, &device, pfn_notify, NULL, &error);
   if (error != CL_SUCCESS) 
   {
      std::cout << "Error creating context: " << error << std::endl;
      exit(error);
   }
   
   // Command-queue
   g_queue = clCreateCommandQueue(g_context, device, 0, &error);
   if (error != CL_SUCCESS) 
   {
      std::cout << "Error creating command queue: " << error << std::endl;
      exit(error);
   }
   
   std::cout << "Creating program" << std::endl;

   // Creates the program
#define MAX_SOURCE_SIZE 10000
   
   FILE *fp;
   char *source_str = new char[MAX_SOURCE_SIZE];
   size_t source_size;
   
   fp = fopen("vortexkernel.cl", "r");
   if (!fp) 
   {
      fprintf(stderr, "Failed to load kernel.
");
      exit(1);
   }
   source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
   fclose( fp );
   
   cl_program program = clCreateProgramWithSource(g_context, 1, (const char **)&source_str, (const size_t *)&source_size, &error);
   assert(error == CL_SUCCESS);
   
   // Builds the program
   error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
   
   std::cout << "finished building" << std::endl;

   delete[] source_str;
   
   if ( error != CL_SUCCESS )
   {
      // Shows the log
      char* build_log;
      size_t log_size;
      // First call to know the proper size
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
      build_log = new char[log_size+1];
      // Second call to get the log
      clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
      build_log[log_size] = '\0';
      std::cout << "build log: 
" << build_log << std::endl;
      delete[] build_log;
   }
 
   // Assert build success
   assert(error == CL_SUCCESS);
   
   // Extracting the kernel
   g_integrand_gpu_kernel = clCreateKernel(program, "compute_integrand_gpu", &error);
   assert(error == CL_SUCCESS);   

   std::cout << "finished initializing" << std::endl;

}

// ---------------------------------------------------------

void run()
{
   
   //
   // Read saved data
   // 
   
   bifstream save_data( "saved.bin" );
   save_data.read_endianity();

   int num_eval_points;
   int num_kernel_centres;

   save_data >> num_eval_points;
   save_data >> num_kernel_centres;   
   
   float* xs = new float[3*num_eval_points];
   float* centres = new float[3*num_kernel_centres];
   float* triangle_vorticities_f = new float[3*num_kernel_centres];
   
   std::cout << "num_eval_points: " << num_eval_points << std::endl;
   std::cout << "num_kernel_centres: " << num_kernel_centres << std::endl;
   
   for ( int i = 0; i < 3*num_eval_points; ++i ) 
   { 
      save_data >> xs[i]; 
      assert( xs[i] == xs[i] );
      assert( xs[i] != std::numeric_limits<float>::infinity() );
   }
   
   for ( int i = 0; i < 3*num_kernel_centres; ++i ) 
   { 
      save_data >> centres[i]; 
      assert( centres[i] == centres[i] );
      assert( centres[i] != std::numeric_limits<float>::infinity() );
   }
   
   for ( int i = 0; i < 3*num_kernel_centres; ++i ) 
   { 
      save_data >> triangle_vorticities_f[i]; 
      assert( triangle_vorticities_f[i] == triangle_vorticities_f[i] );
      assert( triangle_vorticities_f[i] != std::numeric_limits<float>::infinity() );
   }
   
   save_data.close();
   assert(save_data.good());
   
   std::cout << "data loaded" << std::endl;
   
   float *buffer = new float[ 3*num_eval_points ];
   
   size_t buffer_mem_size = num_eval_points * 3 * sizeof(float);
   size_t eval_mem_size = num_eval_points * 3 * sizeof(float);
   size_t kernel_mem_size = num_kernel_centres * 3 * sizeof(float);
   
   // Create buffers on the GPU
   
   cl_int error;
   cl_mem src_xs_d = clCreateBuffer( g_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, eval_mem_size, static_cast<void*>(xs), &error);
   assert(error == CL_SUCCESS);
   cl_mem src_ks_d = clCreateBuffer( g_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, kernel_mem_size, static_cast<void*>(centres), &error);
   assert(error == CL_SUCCESS);
   cl_mem src_vs_d = clCreateBuffer( g_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, kernel_mem_size, static_cast<void*>(triangle_vorticities_f), &error);
   assert(error == CL_SUCCESS);
   cl_mem src_buffer_d = clCreateBuffer( g_context, CL_MEM_READ_WRITE, buffer_mem_size, NULL, &error);
   assert(error == CL_SUCCESS);
   
   // Set kernel arguments
   
   error = clSetKernelArg(g_integrand_gpu_kernel, 0, sizeof(cl_mem), &src_xs_d );
   assert(error == CL_SUCCESS);
   error |= clSetKernelArg(g_integrand_gpu_kernel, 1, sizeof(int), &num_eval_points );
   assert(error == CL_SUCCESS);
   error |= clSetKernelArg(g_integrand_gpu_kernel, 2, sizeof(cl_mem), &src_ks_d );
   assert(error == CL_SUCCESS);
   error |= clSetKernelArg(g_integrand_gpu_kernel, 3, sizeof(int), &num_kernel_centres );
   assert(error == CL_SUCCESS);
   error |= clSetKernelArg(g_integrand_gpu_kernel, 4, sizeof(cl_mem), &src_vs_d );
   assert(error == CL_SUCCESS);
   error |= clSetKernelArg(g_integrand_gpu_kernel, 5, sizeof(cl_mem), &src_buffer_d );
   assert(error == CL_SUCCESS);
   
   std::cout << "paramaters queued" << std::endl;

   std::cout.flush();
   
   // Launching kernel
   
   const size_t global_ws = num_eval_points;
   
   error = clFlush(g_queue);
   assert(error == CL_SUCCESS);
   error = clFinish(g_queue);
   assert(error == CL_SUCCESS);
   
   error = clEnqueueNDRangeKernel(g_queue, g_integrand_gpu_kernel, 1, NULL, &global_ws, NULL, 0, NULL, NULL);
   assert(error == CL_SUCCESS);
   
   std::cout << "kernel enqueued" << std::endl;
   
   error = clFlush(g_queue);
   assert(error == CL_SUCCESS);
   
   error = clFinish(g_queue);
   assert(error == CL_SUCCESS);

   std::cout << "queue finished" << std::endl;
   
   // Wait for threads to finish, then read back buffer
   error = clEnqueueReadBuffer(g_queue, src_buffer_d, CL_TRUE, 0, buffer_mem_size, (void*)buffer, 0, NULL, NULL );
   assert(error == CL_SUCCESS);
   
   std::cout << "read buffer enqueued" << std::endl;
   
   error = clFinish(g_queue);
   assert(error == CL_SUCCESS);
   
   std::cout << "read buffer done" << std::endl;
   
   for ( int i = 0; i < 3*num_eval_points; ++i ) 
   { 
      assert( buffer[i] == buffer[i] );
      assert( buffer[i] != std::numeric_limits<float>::infinity() );
   }

   error = clReleaseMemObject(src_xs_d);
   error |= clReleaseMemObject(src_ks_d);
   error |= clReleaseMemObject(src_vs_d);
   error |= clReleaseMemObject(src_buffer_d);
   assert(error == CL_SUCCESS);
   
   delete[] buffer;
   delete[] triangle_vorticities_f;
   delete[] centres;
   delete[] xs;
   
}

// ---------------------------------------------------------

void destroy()
{
   // Release in reverse order of creation: kernel and queue before the context
   clReleaseKernel( g_integrand_gpu_kernel );
   clReleaseCommandQueue( g_queue );
   clReleaseContext( g_context );
}

// ---------------------------------------------------------

int main (int argc, const char * argv[])
{
   std::cout << "initializing" << std::endl;
   initialize();
   std::cout << "running" << std::endl;
   run();
   std::cout << "destroying" << std::endl;
   destroy();
   std::cout << "done" << std::endl;
}

and the kernel:


//
//  vortexkernel.cl
//

#define MY_PI  3.14159265358979323846264338327950288f   /* pi */


__kernel void compute_integrand_gpu (__global const float* eval_points, 
                                     int num_eval_points,
                                     __global const float* kernel_centres, 
                                     int num_kernel_centres,
                                     __global const float* triangle_vorticities,
                                     __global float* out_buffer )
{
   
   const int i = get_global_id(0);

   float d[3];
   float grad_kernel_eval[3];   
   float result[3];
   
   if ( i < num_eval_points )
   {
      out_buffer[3*i+0] = 0.0f;
      out_buffer[3*i+1] = 0.0f;
      out_buffer[3*i+2] = 0.0f;
      
      for ( int j = 0; j < num_kernel_centres; ++j )
      {
         d[0] = eval_points[3*i+0] - kernel_centres[3*j+0];
         d[1] = eval_points[3*i+1] - kernel_centres[3*j+1];
         d[2] = eval_points[3*i+2] - kernel_centres[3*j+2];
         
         float rsqr = d[0]*d[0] + d[1]*d[1] + d[2]*d[2];
         float r = sqrt(rsqr);
         // Use float literals: an unsuffixed 1.0 is a double constant and
         // promotes this expression to double precision.
         // (Note: r == 0 would divide by zero and put NaN/inf in out_buffer.)
         float coeff = -1.0f/(4.0f*MY_PI*r*r*r);
         
         grad_kernel_eval[0] = -coeff * d[0] / r;
         grad_kernel_eval[1] = -coeff * d[1] / r;
         grad_kernel_eval[2] = -coeff * d[2] / r;

         result[0] = grad_kernel_eval[1]*triangle_vorticities[3*j+2]-grad_kernel_eval[2]*triangle_vorticities[3*j+1];
         result[1] = grad_kernel_eval[2]*triangle_vorticities[3*j+0]-grad_kernel_eval[0]*triangle_vorticities[3*j+2];
         result[2] = grad_kernel_eval[0]*triangle_vorticities[3*j+1]-grad_kernel_eval[1]*triangle_vorticities[3*j+0];

         out_buffer[3*i+0] += result[0];
         out_buffer[3*i+1] += result[1];
         out_buffer[3*i+2] += result[2];
      }      
   }
   
}
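Incidentally, regarding the inefficiency: the inner loop read-modify-writes out_buffer (global memory) three times per iteration. Accumulating each point's sum in private variables and writing to out_buffer once at the end should cut the global memory traffic considerably. Here's an untested sketch of that variant, assuming the device supports the 3-component vector types from OpenCL 1.1 (compute_integrand_gpu_v2 is just a name I made up; the maths is identical to the kernel above):

// Untested sketch: accumulate each point's sum in private registers
// and write out_buffer once. Uses the same MY_PI define as above.
__kernel void compute_integrand_gpu_v2 (__global const float* eval_points,
                                        int num_eval_points,
                                        __global const float* kernel_centres,
                                        int num_kernel_centres,
                                        __global const float* triangle_vorticities,
                                        __global float* out_buffer )
{
   const int i = get_global_id(0);
   if ( i >= num_eval_points ) { return; }

   const float3 x = vload3(i, eval_points);
   float3 sum = (float3)(0.0f);

   for ( int j = 0; j < num_kernel_centres; ++j )
   {
      const float3 d = x - vload3(j, kernel_centres);
      const float r = length(d);
      const float coeff = -1.0f/(4.0f*MY_PI*r*r*r);

      // sum += grad_kernel_eval x triangle_vorticity, as in the loop above
      sum += cross((-coeff/r)*d, vload3(j, triangle_vorticities));
   }

   vstore3(sum, i, out_buffer);
}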

I can provide the raw data and additional supporting source code as well if anyone feels like trying to reproduce the problem.

T.

Okay, I noticed that this was failing only when chewing on a fairly large number of inputs (~30k points). So I'm now breaking the input buffers into chunks and looping over them, launching a smaller number of threads each time; a sketch of the chunked launch is below. It seems to work as long as I keep the time between clEnqueueNDRangeKernel and the following clEnqueueReadBuffer below about 5 seconds per chunk.
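For reference, here's roughly what the loop looks like, replacing the single clEnqueueNDRangeKernel call in run(). CHUNK_SIZE is an arbitrary value I tuned by hand, and the global-work-offset argument requires OpenCL 1.1 (on 1.0 you'd have to pass the offset in as an extra kernel argument instead):

// Sketch of the chunked launch. std::min needs #include <algorithm>.
const size_t CHUNK_SIZE = 4096;
const size_t total = (size_t)num_eval_points;

for ( size_t offset = 0; offset < total; offset += CHUNK_SIZE )
{
   const size_t chunk_ws = std::min( CHUNK_SIZE, total - offset );

   // The global work offset shifts get_global_id(0), so the kernel itself
   // is unchanged; it still bounds-checks against num_eval_points.
   error = clEnqueueNDRangeKernel(g_queue, g_integrand_gpu_kernel, 1,
                                  &offset, &chunk_ws, NULL, 0, NULL, NULL);
   assert(error == CL_SUCCESS);

   // Block after every chunk so no single batch runs long enough
   // to trip whatever is killing the long launches.
   error = clFinish(g_queue);
   assert(error == CL_SUCCESS);
}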

I wonder if the GUI was timing out while waiting for my threads to finish. Or maybe OS X has some kind of GPU watchdog, similar to Windows' Timeout Detection and Recovery (TDR), and it's having trouble killing my long-running kernels? Maybe this is one for the Apple engineers.

T.