OpenCL Callback freezes / hangs (deadlock?, pthread_cond_wait)

Hello,

I tried to paste here all the info but the forum is complaining… so, just small info.

Expected behavior (only occurs in the CPU, not in the IGPU):

  1. The host creates an user event. Then, the host calls a EnqueueKernelNDRange (vector addition) and waits for the user event (WaitForEvents). When the kernel finishes it triggers the callback “callback_kernel”.
  2. This “callback_kernel” calls a EnqueueReadBuffer non-blocking, and when it finishes triggers the callback “callback_read”.
  3. The “callback_read” sets CL_COMPLETE the user event.
  4. The host continues after the WaitForEvents with the content filled (buffer read).

It works in the Intel CPU but not in the Intel Integrated GPU (Graphics).

(gdb shows that is freezed in the pthread_cond_wait of the intel opencl driver).

Can anyone explain really what is the behavior with the callbacks/events and the host thread? (best practices, how to avoid deadlocks)

I need fine grained control and the fastest performance, and it looks like is callbacks, but they have weird behaviors…

The CPU does what I expect. The GPU freezes after finishing the callback_kernel (2nd step). If I kill the process inside gdb I see that it was in pthread_cond_wait from intel opencl driver


#include <CL/cl.h>

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_FILE_SIZE 1024000

#include <sys/stat.h>
#include <sys/types.h>

typedef enum ocl_type_e_t {
  OCL_TYPE_NULL = 0,
  OCL_TYPE_CPU = 1,
  OCL_TYPE_GPU = 2,
  OCL_TYPE_IGPU = 3,
  OCL_TYPE_ACC = 4
} ocl_type_e_t;


const char*
cl_device_type_to_str(cl_device_type type)
{
  static char* strings[] = {
    "(invalid)", // invalid
    "CL_DEVICE_TYPE_CPU",
    "CL_DEVICE_TYPE_GPU",
    "CL_DEVICE_TYPE_ACCELERATOR",
    "CL_DEVICE_TYPE_CUSTOM",
    "CL_DEVICE_TYPE_DEFAULT",
    "CL_DEVICE_TYPE_ALL",
  };

  char* ret;

  switch (type) {
    case CL_DEVICE_TYPE_CPU:
      ret = strings[1];
      break;
    case CL_DEVICE_TYPE_GPU:
      ret = strings[2];
      break;
    case CL_DEVICE_TYPE_ACCELERATOR:
      ret = strings[3];
      break;
    case CL_DEVICE_TYPE_CUSTOM:
      ret = strings[4];
      break;
    case CL_DEVICE_TYPE_DEFAULT:
      ret = strings[5];
      break;
    case CL_DEVICE_TYPE_ALL:
      ret = strings[6];
      break;
    default:
      ret = strings[0];
      break;
  }
  return ret;
}

const char*
file_read(char* const path)
{
  struct stat st;
  /* st = (struct stat*)malloc(sizeof(stat)); */
  int error = stat(path, &st);
  if (error != 0) {
    printf("Invalid file %s
", path);
    exit(EXIT_FAILURE);
  }

  int size_file = st.st_size;

  if (size_file > MAX_FILE_SIZE) {
    printf("File %s is bigger than the max allowed size (%d > %d bytes)
",
           path, size_file, MAX_FILE_SIZE);
    exit(EXIT_FAILURE);
  }

  FILE* fp = fopen(path, "r");
  if (fp == NULL) {
    printf("Error opening the file %s
", path);
    exit(EXIT_FAILURE);
  }

  char* const buf = (char* const)malloc(size_file);
  if (buf == NULL) {
    printf("Error allocating %d bytes for the contents of the file %s
",
           size_file, path);
    exit(EXIT_FAILURE);
  }

  int size_read;
  while ((size_read = fread(buf, sizeof(char), size_file, fp)) > 0) {
    ;
  }

  fclose(fp);

  return buf;
}



cl_event clb_events_waiting[100];
int clb_events_waiting_device[100];
int clb_events_init_read[100];
int clb_num_events_waiting = 0;

void
clbWaitEvents(int * c)
{
  if (clb_num_events_waiting > 0){
    printf("About to wait events: %d
", clb_num_events_waiting);
    int i;
    int waiting = 0;
    cl_event ev_waiting[100];
    printf("%d = CL_QUEUED, %d = CL_COMPLETE, %d = CL_SUBMITTED, %d = CL_RUNNING
", CL_QUEUED, CL_COMPLETE, CL_SUBMITTED, CL_RUNNING);
    for (i=0; i<clb_num_events_waiting; i++){
      cl_int ret;
      clGetEventInfo(clb_events_waiting[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL);
      int dev = clb_events_waiting_device[i];
      int init = clb_events_init_read[i] / sizeof(int);
      printf("cl_event %s init %6d  [%d] = status %d (ref %p)
", dev == 0 ? "CPU" : (dev == 1 ? "GPU" : "ACC"), init, i, ret, (void*)clb_events_waiting[i]);

      if (ret != CL_COMPLETE){
        ev_waiting[waiting] = clb_events_waiting[i];
        waiting++;
      }
    }

    for (i=0; i<clb_num_events_waiting; i++){
      int dev = clb_events_waiting_device[i];
      int init = clb_events_init_read[i] / sizeof(int);
      printf("%s [%d] = %d, [%d] = %d, [%d] = %d
", dev == 0 ? "CPU" : (dev == 1 ? "GPU" : "ACC"), init, c[init], init + 1, c[init + 1], init + 2, c[init + 2]);
    }

    if (waiting > 0){
      printf("about to wait %d events
", waiting);
      clWaitForEvents(waiting, ev_waiting);
      printf("wait events finished
");
    }
    /* clWaitForEvents(clb_num_events_waiting, clb_events_waiting); */
  }
}














typedef struct callback_data
{
  cl_command_queue* queue;
  cl_mem* buf_c;
  int* c_v;
  uint size;
  cl_event* end;
  bool nested_callbacks;
  bool blocking;
} callback_data;

void CL_CALLBACK callback_read_fn(cl_event event, cl_int ev_status,
                                  void* user_data);

void CL_CALLBACK callback_kernel_fn(cl_event event, cl_int ev_status,
                                    void* user_data);

int
main(int argc, char* argv[])
{

  bool use_callbacks = true;
  bool use_nested_callbacks = true;
  bool use_blocking = false;

  int numSelPlatform = 0;
  int numSelDevice = 0;
  int doUseCallbacks = 0;
  int doUseNestedCallbacks = 0;
  int doUseBlocking = 0;
  int use_type = 0;
  if (argc != 7) {
    printf("./%s (platform) (device) (type cpu 0|gpu 1|igpu 2|acc 3) (use "
           "callbacks) (use nested callbacks) (use blocking)
",
           argv[0]);
    exit(EXIT_FAILURE);
  } else {
    numSelPlatform = atoi(argv[1]);
    numSelDevice = atoi(argv[2]);
    use_type = atoi(argv[3]);
    doUseCallbacks = atoi(argv[4]);
    doUseNestedCallbacks = atoi(argv[5]);
    doUseBlocking = atoi(argv[6]);
  }

  cl_event end;

  uint size = 1024;
  int* a_v = (int*)malloc(size * sizeof(int));
  int* b_v = (int*)malloc(size * sizeof(int));
  int* c_v = (int*)malloc(size * sizeof(int));
  for (size_t i = 0; i < size; i++) {
    a_v[i] = i;
    b_v[i] = i + 1;
    c_v[i] = 0;
  }

  const char* kernel_str = file_read("src/kernel.cl");

  use_callbacks = doUseCallbacks;
  use_nested_callbacks = doUseNestedCallbacks;
  use_blocking = doUseBlocking ? CL_TRUE : CL_FALSE;

  cl_int st;
  cl_int err;

  int len = 256;
  char buflog[len];

  cl_uint numPlatforms = 0;
  st = clGetPlatformIDs(0, NULL, &numPlatforms);
  cl_platform_id* platforms = NULL;
  platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));

  st = clGetPlatformIDs(numPlatforms, platforms, NULL);
  printf("platforms: %d (%d)
", numPlatforms, st);

  cl_uint selPlatform = numSelPlatform; // 1;

  numPlatforms = 1;
  cl_platform_id platform = platforms[selPlatform];

  clGetPlatformInfo(platform, CL_PLATFORM_NAME, len, &buflog, NULL);
  if (buflog != NULL) {
    printf("platform name: %s
", buflog);
  }

  cl_uint numDevices = 0;
  st = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
  printf("num devices: %d (%d)
", numDevices, st);
  if (st != CL_SUCCESS) {
    /* printf("explain error: %s
", clErrorString(st)); */
    printf("error: %d
", st);
  }
  cl_device_id* devices = NULL;
  devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));

  st = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
  printf("devices: %d (%d)
", numDevices, st);

  // Context
  cl_context context;
  context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &err);
  printf("context (%d)
", err);

  // Select device
  cl_uint selDevice = numSelDevice; // 0;
  numDevices = 1;                   // clBuildProgram
  cl_device_id device = devices[selDevice];

  // Device Info
  clGetDeviceInfo(device, CL_DEVICE_NAME, len, &buflog, NULL);
  if (buflog != NULL) {
    printf("device name: %s
", buflog);
  }

  cl_device_type type;
  clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
  printf("device type: %s
", cl_device_type_to_str(type));

  // events
  cl_event ev_kernel;

  // CommandQueue
  /* cl_command_queue_properties props; */
  cl_command_queue queue;
  queue = clCreateCommandQueue(context, device, 0, &err);
  printf("command queue (%d)
", err);

  // CreateBuffer
  cl_mem buf_a;
  cl_mem buf_b;
  cl_mem buf_c;

  ocl_type_e_t ocl_type;
  if (use_type == 0) {
    ocl_type = OCL_TYPE_CPU;
    printf("mode CPU
");
  } else if (use_type == 1) {
    ocl_type = OCL_TYPE_GPU;
    printf("mode GPU
");
  } else if (use_type == 2) {
    ocl_type = OCL_TYPE_IGPU;
    printf("mode IGPU
");
  } else if (use_type == 3) {
    ocl_type = OCL_TYPE_ACC;
    printf("mode ACC
");
  }

  /* cl_mem buf_x; */
  switch (ocl_type) {
    case OCL_TYPE_IGPU:
      buf_a = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
                             a_v, &err);
      /* buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE |
       * CL_MEM_COPY_HOST_PTR, n * n * sizeof(int), */
      /*                      Acpy, &err); */
      break;
    case OCL_TYPE_GPU:
      buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
                             a_v, &err);
      break;
    case OCL_TYPE_ACC:
      buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                             size * sizeof(int), a_v, &err);
      break;
    case OCL_TYPE_CPU:
      buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                             size * sizeof(int), a_v, &err);
      break;
    default:
      printf("no ocl_type defined
");
      exit(EXIT_FAILURE);
      break;
  }

  printf("create buffer a (%d)
", err);
  if (err != CL_SUCCESS) {
    /* printf("create buffer error: %s
", clErrorString(err)); */
    printf("create buffer error: %d
", err);
  }

  switch (ocl_type) {
    case OCL_TYPE_IGPU:
      buf_b = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
                             b_v, &err);
      break;
    case OCL_TYPE_GPU:
      buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
                             b_v, &err);
      break;
    case OCL_TYPE_ACC:
      buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                             size * sizeof(int), b_v, &err);
      break;
    case OCL_TYPE_CPU:
      buf_b = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                             size * sizeof(int), b_v, &err);
      break;
    default:
      printf("no ocl_type defined
");
      exit(EXIT_FAILURE);
      break;
  }

  printf("create buffer b (%d)
", err);
  if (err != CL_SUCCESS) {
    printf("create buffer error: %d
", err);
    /* printf("create buffer error: %s
", clErrorString(err)); */
  }

  switch (ocl_type) {
    case OCL_TYPE_IGPU:
      buf_c = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, size * sizeof(int),
                             c_v, &err);
      /* buf_c = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, c_rows * c_cols *
       * sizeof(int), */
      /*                        c_v, &err); */
      /* buf_a = clCreateBuffer(context, CL_MEM_READ_WRITE |
       * CL_MEM_COPY_HOST_PTR, n * n * sizeof(int), */
      /*                      Acpy, &err); */
      break;
    case OCL_TYPE_GPU:
      buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE, size * sizeof(int),
                             c_v, &err);
      break;
    case OCL_TYPE_ACC:
      buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                             size * sizeof(int), c_v, &err);
      break;
    case OCL_TYPE_CPU:
      buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE |
                             CL_MEM_USE_HOST_PTR,
      /* buf_c = */
        /* clCreateBuffer(context, CL_MEM_USE_HOST_PTR, */
                       /* buf_c = clCreateBuffer(context, CL_MEM_READ_WRITE, */
                       size * sizeof(int), c_v, &err);
      break;
    default:
      printf("no ocl_type defined
");
      exit(EXIT_FAILURE);
      break;
  }

  printf("create buffer c (%d)
", err);
  if (err != CL_SUCCESS) {
    /* printf("create buffer error: %s
", clErrorString(err)); */
    printf("create buffer error: %d
", err);
  }
  /* b_x = clCreateBuffer(context, CL_MEM_WRITE_ONLY, n * sizeof(float), x,
   * &err); */
  /* printf("create buffer x (%d)
", err); */

  // WriteBuffer
  /* st = clEnqueueWriteBuffer(queue, b_a, CL_FALSE, 0, n * n * sizeof(float),
   */
  /*                           Acpy, 0, NULL, NULL); */
  /* printf("write buffer Acpy - b_a (%d)
", st); */
  /* st = clEnqueueWriteBuffer(queue, b_b, CL_FALSE, 0, n * sizeof(float), bcpy,
   * 0, */
  /*                           NULL, NULL); */
  /* printf("write buffer bcpy - b_b (%d)
", st); */

  // Create Program
  cl_program program;
  program = clCreateProgramWithSource(context, 1, (const char**)&kernel_str,
                                      NULL, &err);
  printf("create program (%d)
", err);

  // Build Program
  /* st = clBuildProgram(program, numDevices, (cl_device_id*)&device, NULL,
   * NULL, */
  /*                     NULL); */
  char* opts = "-Werror";
  st = clBuildProgram(program, numDevices, (cl_device_id*)&device, opts, NULL,
                      NULL);
  printf("build program (%d)
", st);
  if (st != CL_SUCCESS) {
    /* printf("build status: %s
", clErrorString(st)); */
    printf("build status: %d
", st);
    char log[512];
    st = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 512, &log,
                               NULL);
    printf("build info (%d)
", st);
    if (st == CL_SUCCESS) {
      printf("%s
", log);
    }
  }

  // Create Kernel
  cl_kernel kernel1;
  kernel1 = clCreateKernel(program, "kernel1", &st);
  printf("create kernel1 (%d)
", st);
  /* cl_kernel kernel2; */
  /* kernel2 = clCreateKernel(program, "ocl1_2", &st); */
  /* printf("create kernel2 (%d)
", st); */

  // workgroup size
  size_t dims = 1;
  size_t gws[] = { 1, 1, 1 };
  /* size_t gws[dims]; */
  gws[0] = size; // a_rows;
  /* gws[0] = 32; */
  /* size_t* lws = NULL; */
  /* size_t lws[dims]; */
  /* size_t lws[dims]; */
  /* size_t lws[dims] = NULL; */
  /* size_t lws[] = {0, 0, 0}; */
  size_t lws[] = { 128, 1, 1 };
  printf("gws {%lu, %lu, %lu}
", gws[0], gws[1], gws[2]);
  if (lws != NULL) {
    printf("lws {%lu, %lu, %lu}
", lws[0], lws[1], lws[2]);
  } else {
    printf("lws unspecified
");
  }

  // Set Kernel Args
  st = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &buf_a);
  printf("set arg %d (%d)
", 0, st);
  st = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &buf_b);
  printf("set arg %d (%d)
", 1, st);
  /* printf("set kernel1 arg: %d (%d)
", 0, st); */
  st = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &buf_c);
  printf("set arg %d (%d)
", 2, st);
  st = clSetKernelArg(kernel1, 3, sizeof(int), (int*)&size);
  printf("set arg %d (%d)
", 3, st);

  // Execute kernel
  st = clEnqueueNDRangeKernel(queue, kernel1, dims, NULL, (const size_t*)gws,
                              (const size_t*)lws, 0, NULL, &ev_kernel);
  /* (const size_t*)lws, 0, NULL, NULL); */
  /* printf("nd range kernel1 (%d %s)
", st, clErrorString(st)); */
  printf("nd range kernel1 (%d)
", st);

  end = clCreateUserEvent(context, &st);
  printf("create user event (%d)
", st);

  callback_data* user_data = (callback_data*)malloc(sizeof(callback_data));

  printf("c_v %p
", (void*)c_v);

  user_data->queue = &queue;
  user_data->buf_c = &buf_c;
  user_data->c_v = c_v;
  user_data->size = size;
  user_data->end = &end;
  user_data->nested_callbacks = use_nested_callbacks;
  user_data->blocking = use_blocking;

  if (use_callbacks) {
    st =
      clSetEventCallback(ev_kernel, CL_COMPLETE, callback_kernel_fn, user_data);
    printf("set event callback (%d)
", st);
  }
  /* printf("first: %2.5f
", c_v[0]); */
  /* print_matrix_float_s_t("c", c); */
  // ReadBuffer
  /* float* ptr = (float*)clEnqueueMapBuffer(queue, buf_c, CL_TRUE, CL_MAP_READ,
   * 0, c_rows * c_cols * sizeof(float), 0, NULL, NULL, &st); */
  /* printf("read buffer c_v - buf_c (%d)
", st); */
  /* printf("finish queue
"); */
  /* clFinish(queue); */
  /* printf("finished queue
"); */

  if (use_callbacks) {
    /* clWaitForCompletion(context); */

    printf("waiting for events
");
    /* /\* cl_event events[] = {ev_kernel}; *\/ */
    cl_event events[] = { end };
    clWaitForEvents(1, events); // ev_kernel);
    printf("waited for events
");

    clbWaitEvents(c_v);

  } else {
    printf("about to read the c buffer
");
    st = clEnqueueReadBuffer(queue, buf_c, use_blocking, 0, size * sizeof(int),
                             c_v, 0, NULL, NULL);
    printf("read buffer c_v - buf_c (%d)
", st);
  }

  /* print_matrix("c_v", c_v, c_rows, c_cols); */

  /* printf("first: %2.5f
", c_v[0]); */
  /* print_matrix_float_s_t("c", c); */
  free(user_data);

  clReleaseKernel(kernel1);
  /* clReleaseKernel(kernel2); */
  clReleaseProgram(program);
  clReleaseCommandQueue(queue);
  clReleaseMemObject(buf_a);
  clReleaseMemObject(buf_b);
  clReleaseMemObject(buf_c);
  /* clReleaseMemObject(b_x); */
  clReleaseContext(context);
  free(devices);
  free(platforms);

#define THRESHOLD 0
  // check
  printf("about to check (first: %d)
", c_v[0]);
  for (size_t i = 0; i < size; i++) {
    if (abs(c_v[i] - (a_v[i] + b_v[i])) > THRESHOLD) {
      printf("Wrong checking: a_v[%ld] = %d, b_v[%ld] = %d, c_v[%ld] = %d
", i,
             a_v[i], i, b_v[i], i, c_v[i]);
      exit(EXIT_FAILURE);
    }
  }

  return EXIT_SUCCESS;
}

void CL_CALLBACK
callback_read_fn(cl_event event, cl_int ev_status, void* user_data)
{
  printf("-- BEGIN callback read executed (%d)
", ev_status);
  callback_data* cb_data = (callback_data*)user_data;
  /* cl_command_queue queue = *(cb_data->queue); */
  /* cl_mem buf_c = *(cb_data->buf_c); */
  int* c_v = cb_data->c_v;
  cl_event end = *(cb_data->end);
  /* int size = cb_data->size; */

  cl_int st;

  printf("c_v %p
", (void*)c_v);
  printf("c_v[0] = %d
", c_v[0]);

  /* c_v[1] = 1; */

  st = clSetUserEventStatus(end, CL_COMPLETE);
  printf("set user event status (%d)
", st);
  // haz que salga el finish
  printf("-- END
");
}

cl_event ev_read;

void CL_CALLBACK
callback_kernel_fn(cl_event event, cl_int ev_status, void* user_data)
{
  printf("-- BEGIN callback kernel executed (%d)
", ev_status);
  callback_data* cb_data = (callback_data*)user_data;
  cl_command_queue queue = *(cb_data->queue);
  cl_mem buf_c = *(cb_data->buf_c);
  int* c_v = cb_data->c_v;
  int size = cb_data->size;
  bool nested_callbacks = cb_data->nested_callbacks;
  bool blocking = cb_data->blocking;
  cl_event end = *(cb_data->end);


  printf("c_v %p
", (void*)c_v);
  printf("c_v[0] = %d
", c_v[0]);

  cl_int st;

  /* printf("about to flush
"); */
  /* clFlush(queue); */
  /* printf("flushed
"); */

  size_t offset = 0;
  /* size = size + 4; */
  printf("about to read the c buffer
");
  printf("blocking %d
", blocking);

  clb_events_waiting_device[clb_num_events_waiting] = 0;
  clb_events_init_read[clb_num_events_waiting] = 0;


  /* why it does not work? (blocking CL_TRUE) */
  st = clEnqueueReadBuffer(queue, buf_c, blocking, offset, size * sizeof(int),
                           c_v, 0, NULL, &clb_events_waiting[clb_num_events_waiting++]);
  ev_read = clb_events_waiting[clb_num_events_waiting - 1];
  printf("enqueue read buffer (%d)
", st);
  /* size * sizeof(int), c_v, 0, NULL, NULL); */

  if (nested_callbacks) {
    st = clSetEventCallback(ev_read, CL_COMPLETE, callback_read_fn, user_data);
    printf("set event callback (%d)
", st);
    /* st = clSetUserEventStatus(end, CL_COMPLETE); */
    /* printf("set user event status (%d)
", st); */
  }
  /* c_v[1] = 1; */

  /* st = clGetEventInfo(ev_read, CL_EVENT_COMMAND_TYPE, ); */
  /* printf("event info (%d)
", st); */

  /* int len = 512; */
  /* char buflog[len]; */
  /* cl_command_type; */
  /* clGetEventInfo(ev_read, CL_EVENT_COMMAND_TYPE, len, &buflog, NULL); */
  /* if (buflog != NULL) { */
  /*   printf("- event: %s
", buflog); */
  /* } */

  if (!nested_callbacks) {
    st = clSetUserEventStatus(end, CL_COMPLETE);
    printf("set user event status (%d)
", st);

    /* printf("read buffer c_v - buf_c (%d)
", st); */
  }
  printf("-- END
");
}

And now, if I select the Intel CPU as device:


./callback 0 1 0 1 1 0

It works.

Now, if I select the Intel IGPU (Intel Integrated GPU):


./callback 0 0 2 1 1 0

It is freezes / hangs:

As you can see in the first example of execution (CPU) it should appear the two callbacks (two BEGIN/END pairs). In the case of HD Graphics GPU it hangs after the first callback (only one BEGIN/END pair).

Why?

– I cannot paste any type of results from gdb nor bash… in this forum. I don’t understand why. I tried like 10 times (complains about URL but I have no URLs…).

The basic kernel, although it is not relevant:


__kernel void
kernel1(__global int* a, __global int* b, __global int* c, int size)
{

  int idx = get_global_id(0);

  if (idx >= 0 && idx < size){
    c[idx] = a[idx] + b[idx];
  }
}