This is the code:

my_clcb.cpp

Code :
#include "my_clcb.hpp"
 
#define DEBUG 1
 
int g_remaining_size = 0;
mutex g_mutex_remaining_size;
mutex g_kernel;
vector<CBData*> scheduled;
vector<cl::Event*> events; // kernel and read
vector<cl::NDRange*> ranges; // normal and offset
 
 
void CL_CALLBACK cbRead(cl_event /* event */, cl_int /* status */, void* user_data) {
  lock_guard<mutex> guard(g_kernel);
  CBData* data = reinterpret_cast<CBData*>(user_data);
  cl::UserEvent& end = data->end;
  int chunk_size = data->chunk_size;
  int offset_size = data->offset_size;
  int index = data->index;
  if (DEBUG) cout << index << ") cbRead offset: " << offset_size << "\n";
  // cout << "end: " << end() << "\n";
  // lock_guard<mutex> guard(g_mutex_remaining_size);
  g_mutex_remaining_size.lock();
  g_remaining_size -= chunk_size;
  if (g_remaining_size == 0) {
    // cout << "set status.\n";
    end.setStatus(CL_COMPLETE);
  }
  if (DEBUG) cout << "remaining: " << g_remaining_size << "\n";
  g_mutex_remaining_size.unlock();
}
 
void CL_CALLBACK cbKernel(cl_event /* event */, cl_int /* status */, void* user_data) {
  lock_guard<mutex> guard(g_kernel);
  CBData* data = reinterpret_cast<CBData*>(user_data);
 
  cl_int st = CL_SUCCESS;
 
  cl::CommandQueue& queue = data->queue;
  cl::Buffer& c_buffer = data->buffer;
  int chunk_size = data->chunk_size;
  int offset_size = data->offset_size;
  vector<int>& c_array = data->array;
  // cl::UserEvent& end = data->end;
  cl::Event& evread = data->evread;
  int index = data->index;
  // cout << "queue: " << queue() << "\n";
  cout << index << ") c_buffer: " << c_buffer() << "\n";
  // cout << "problem_size: " << problem_size << "\n";
  cout << index << ") chunk_size: " << chunk_size << "\n";
  cout << index << ") offset_size: " << offset_size << "\n";
  cout << index << ") c_array: " << c_array[0] << "\n";
  // cout << "end: " << end() << "\n";
  cout << index << ") evread: " << evread() << "\n";
  if (DEBUG) cout << index << ") cbKernel offset: " << offset_size << "\n";
 
  st = queue.enqueueReadBuffer(c_buffer, CL_FALSE, sizeof(int) * offset_size, sizeof(int) * chunk_size,
                               c_array.data() + (offset_size * sizeof(int)), NULL, &evread);
  if (st != CL_SUCCESS) {
    cout << "error in read\n";
    return;
  }
  queue.flush();
  evread.setCallback(CL_COMPLETE, cbRead, data);
  // cout << "set callback read.\n";
}
 
void schedule(Work& work) {
  cl::CommandQueue& queue = work.queue;
  cl::Kernel& kernel = work.kernel;
  cl::Buffer& c_buffer = work.buffer;
  vector<int>& c_array = work.array;
  int problem_size = work.problem_size;
  int chunk_size = work.chunk_size;
  cl::UserEvent& end = work.end;
  g_remaining_size = problem_size;
 
  int chunks = problem_size / chunk_size;
  for (int i = 0; i < chunks; ++i) {
    cl::Event* evkernel = new cl::Event();
    cl::Event* evread = new cl::Event();
    cl::NDRange* range = new cl::NDRange(chunk_size);
    cl::NDRange* range_offset = new cl::NDRange(chunk_size * i);
    events.push_back(evkernel);
    events.push_back(evread);
    ranges.push_back(range);
    ranges.push_back(range_offset);
    CBData* data = new CBData(queue, kernel, c_buffer, c_array, chunk_size, chunk_size * i, end, *events.at(2*i), *events.at(2*i+1), *ranges.at(2*i), *ranges.at(2*i+1), i);
    scheduled.push_back(data);
    // enqueueKernel(data);
    enqueueKernel(*scheduled.at(i));
    cout << "scheduled: " << i << "\n";
  }
  queue.flush();
  cout << "flushed\n";
}
 
void enqueueKernel(CBData& data) {
  // enqueueKernel();
 
  cl::CommandQueue& queue = data.queue;
  cl::Kernel& kernel = data.kernel;
  // cl::Buffer& c_buffer = data.buffer;
  // int chunk_size = data.chunk_size;
  // int offset_size = data.offset_size;
  // vector<int>& c_array = data.array;
  // cl::UserEvent& end = data.end;
  cl::Event& evkernel = data.evkernel;
  cl::NDRange& range_offset = data.range_offset;
  cl::NDRange& range = data.range;
 
  // cl::Event event;
 
  cl_int st = CL_SUCCESS;
 
  // st = queue.enqueueNDRangeKernel(kernel, cl::NDRange(offset_size, 0, 0), cl::NDRange(chunk_size), cl::NDRange(128),
  //                                 NULL, &event);
  st = queue.enqueueNDRangeKernel(kernel, range_offset, range, cl::NDRange(128),
                                  NULL, &evkernel);
  if (st != CL_SUCCESS) {
    cout << "error in ndrange\n";
    return;
  }
  // CBData data(queue, c_buffer, problem_size, c_array, end, event);
  // data.event = event;
 
  // cout << "queue: " << queue() << "\n";
  // cout << "c_buffer: " << c_buffer() << "\n";
  // cout << "problem_size: " << problem_size << "\n";
  // cout << "chunk_size: " << chunk_size << "\n";
  // cout << "offset_size: " << offset_size << "\n";
  // cout << "c_array: " << c_array[0] << "\n";
  // cout << "end: " << end() << "\n";
  // cout << "event: " << event() << "\n";
 
  evkernel.setCallback(CL_COMPLETE, cbKernel, &data);
  // queue.flush();
}
 
int main(int argc, char* argv[]) {
  uint sel_platform = 0;
  uint sel_device = 1;
  // int problem_size = 10240000;
  int problem_size = 10240;
  // int problem_size = stoi(argv[3]);
  // int problem_size = 2048;
  // int chunk_size = problem_size;
  int chunk_size = 1024;
  // int chunk_size = stoi(argv[4]);
  if (argc == 5) {
    sel_platform = stoi(argv[1]);
    sel_device = stoi(argv[2]);
    problem_size = stoi(argv[3]);
    chunk_size = stoi(argv[4]);
  }
 
  vector<int> a_array(problem_size);
  vector<int> b_array(problem_size);
  vector<int> c_array(problem_size);
 
  for (auto i = 0; i < problem_size; i++) {
    a_array[i] = i;
    b_array[i] = i + 1;
  }
 
  cl_int err = CL_SUCCESS;
  cl_int st = CL_SUCCESS;
  string info_buffer;
  // string info_buffer(" ", 8096);
  info_buffer.reserve(8096);
 
  vector<cl::Platform> platforms;
 
  cl::Platform::get(&platforms);
  for (auto& platform : platforms) {
    st = platform.getInfo(CL_PLATFORM_NAME, &info_buffer);
    if (st != CL_SUCCESS) {
      cout << "error platform.getInfo\n";
      return 1;
    }
    cout << "platform: " << info_buffer << "\n";
 
    vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
    for (auto& device : devices) {
      st = device.getInfo(CL_DEVICE_NAME, &info_buffer);
      if (st != CL_SUCCESS) {
        cout << "error device.getInfo\n";
        return 1;
      }
      cout << "  device: " << info_buffer << "\n";
    }
  }
  cout << "num platforms: " << platforms.size() << "\n";
  if (sel_platform >= platforms.size()){
    sel_platform = platforms.size() - 1;
    cout << "sel_platform changed to: " << sel_platform << "(to fit number of platforms)\n";
  }
  cl::Platform platform = platforms.at(sel_platform);
  vector<cl::Device> devices;
  platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
  cout << "num devices in selected platform: " << devices.size() << "\n";
  if (sel_device >= devices.size()){
    sel_device = devices.size() - 1;
    cout << "sel_device changed to: " << sel_device << " (to fit number of devices)\n";
  }
  cl::Device device = devices.at(sel_device);
 
  cl::Context context(device);
 
  cl::Buffer a_buffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, problem_size * sizeof(int), a_array.data(),
                      &err);
  if (err != CL_SUCCESS) {
    cout << "error buff a\n";
    return 1;
  }
  cl::Buffer b_buffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, problem_size * sizeof(int), b_array.data(),
                      &err);
  if (err != CL_SUCCESS) {
    cout << "error buff b\n";
    return 1;
  }
  cl::Buffer c_buffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, problem_size * sizeof(int), c_array.data(),
                      &err);
  if (err != CL_SUCCESS) {
    cout << "error buff c\n";
    return 1;
  }
 
  string kernelstr = R"(
__kernel void kernel1(__global int* a, __global int* b, __global int* c, int size) {
int idx = get_global_id(0);if (idx >= 0 && idx < size){c[idx]=a[idx]+b[idx];}}
)";
 
  cl::Program::Sources sources;
  sources.push_back({kernelstr.c_str(), kernelstr.length()});
 
  cl::Program program(context, sources);
  if (program.build({device}) != CL_SUCCESS) {
    std::cout << " Error building: " << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) << "\n";
    return 1;
  }
 
  cl::CommandQueue queue(context, device, 0, &err);
  if (err != CL_SUCCESS) {
    cout << "error queue\n";
    return 1;
  }
 
  st = queue.enqueueWriteBuffer(a_buffer, CL_TRUE, 0, sizeof(int) * problem_size, a_array.data());
  if (st != CL_SUCCESS) {
    cout << "error write buffer\n";
    return 1;
  }
  st = queue.enqueueWriteBuffer(b_buffer, CL_TRUE, 0, sizeof(int) * problem_size, b_array.data());
  if (st != CL_SUCCESS) {
    cout << "error write buffer\n";
    return 1;
  }
 
  cl::Kernel kernel(program, "kernel1", &err);
  if (err != CL_SUCCESS) {
    cout << "error kernel\n";
    return 1;
  }
 
  st = kernel.setArg(0, a_buffer);
  if (st != CL_SUCCESS) {
    cout << "error set arg 0\n";
    return 1;
  }
  st = kernel.setArg(1, b_buffer);
  if (st != CL_SUCCESS) {
    cout << "error set arg 1\n";
    return 1;
  }
  st = kernel.setArg(2, c_buffer);
  if (st != CL_SUCCESS) {
    cout << "error set arg 2\n";
    return 1;
  }
  st = kernel.setArg(3, (int)problem_size);
  if (st != CL_SUCCESS) {
    cout << "error set arg 3\n";
    return 1;
  }
 
  cl::UserEvent end(context, &err);
  if (err != CL_SUCCESS) {
    cout << "error create user event\n";
    return 1;
  }
 
  cl::Event event;
 
  Work work(queue, kernel, c_buffer, c_array, problem_size, chunk_size, end);
 
  schedule(work);
 
  cout << "before wait\n";
  // end.wait();
 
  cl::Event::waitForEvents({end});  // vector<cl::Event>(end));
  cout << "after wait\n";
 
  auto ok = check(a_array, b_array, c_array, problem_size, chunk_size);
 
  if (ok) {
    cout << "Success\n";
  } else {
    cout << "Failure\n";
  }
 
  string info2_buffer;
  info2_buffer.reserve(512);
 
  auto i = 0;
  for (auto& event : events) {
    cl_int evst;
    st = event->getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS, &evst);
    if (st != CL_SUCCESS){
      cout << "error event.getInfo\n";
    }
    cout << "event " << i << ": " << evst << "\n";
    delete event;
    i++;
  }
  for (auto& data : scheduled) {
    delete data;
  }
 
  check(a_array, b_array, c_array, problem_size, chunk_size);
 
  return 0;
}
 
 
int check(vector<int> a_array, vector<int> b_array, vector<int> c_array, int problem_size, int chunk_size){
  auto ok = true;
  auto chunks = problem_size / chunk_size;
  for (auto j = 0; j < chunks; j++) {
    auto i = j * chunk_size;
    if (abs(c_array[i] - (a_array[i] + b_array[i])) > 0.001) {
      cout << "[" << i << "] a: " << a_array[i] << " b: " << b_array[i] << " c: " << c_array[i] << "\n";
      ok = false;
    }
  }
  return ok;
}

my_clcb.hpp

Code :
 
#include <CL/cl.hpp>
#include <iostream>
#include <memory>
#include <mutex>
 
using namespace std;
 
struct Work {
  cl::CommandQueue& queue;
  cl::Kernel& kernel;
  cl::Buffer& buffer;
  vector<int>& array;
  int problem_size;
  int chunk_size;
  cl::UserEvent& end;
 
  Work(cl::CommandQueue& queue_, cl::Kernel& kernel_, cl::Buffer& buffer_, vector<int>& array_, int problem_size_,
       int chunk_size_, cl::UserEvent& end_)
      : queue(queue_),
        kernel(kernel_),
        buffer(buffer_),
        array(array_),
        problem_size(problem_size_),
        chunk_size(chunk_size_),
        end(end_) {}
};
 
struct CBData {
  cl::CommandQueue& queue;
  cl::Kernel& kernel;
  cl::Buffer& buffer;
  vector<int>& array;
  int chunk_size;
  int offset_size;
  cl::UserEvent& end;
  cl::Event& evkernel;
  cl::Event& evread;
  cl::NDRange& range;
  cl::NDRange& range_offset;
  int index;
 
  CBData(cl::CommandQueue& queue_, cl::Kernel& kernel_, cl::Buffer& buffer_, vector<int>& array_, int chunk_size_,
         int offset_size_, cl::UserEvent& end_, cl::Event& evkernel_, cl::Event& evread_, cl::NDRange& range_, cl::NDRange& range_offset_, int index_)
      : queue(queue_),
        kernel(kernel_),
        buffer(buffer_),
        array(array_),
        chunk_size(chunk_size_),
        offset_size(offset_size_),
        end(end_),
        evkernel(evkernel_),
        evread(evread_),
        range(range_),
        range_offset(range_offset_),
        index(index_)
  {}
};
 
int check(vector<int> a_array, vector<int> b_array, vector<int> c_array, int problem_size, int chunk_size);
void enqueueKernel(CBData& data);
void schedule(Work& work);
 
void CL_CALLBACK cbRead(cl_event /* event */, cl_int /* status */, void* user_data);
void CL_CALLBACK cbKernel(cl_event /* event */, cl_int /* status */, void* user_data);
 
int check(vector<int> a_array, vector<int> b_array, vector<int> c_array, int problem_size, int chunk_size);

And I compile it as:

Code :
g++ -std=c++17 -O0 -g -Wall -Wextra -lOpenCL src/my_clcb.cpp -o build/my_clcb


You can execute depending on your platform/device, like this:
Code :
./build/my_clcb 0 1 128 128 # run in platform 0, device 1, problem size of 128 ints, chunk size of 128 ints (so, only 1 operation)
./build/my_clcb 0 1 768 128 # run in platform 0, device 1, problem size of 768 ints, chunk size of 128 ints (so, 6 operations)

Some of the errors:

Data race, Failure with 0 2 768 128:

Code :
....
after wait
[512] a: 512 b: 513 c: 257
Failure
event 0: 0
event 1: 0
event 2: 0
event 3: 0
event 4: 0
event 5: 0
event 6: 0
event 7: 0
event 8: 0
event 9: 0
event 10: 0
event 11: 0
[512] a: 512 b: 513 c: 257
[Thread 0x7fffc58f2700 (LWP 27683) exited]
[Thread 0x7fffe1aed700 (LWP 27682) exited]
[Thread 0x7fffb5fd5700 (LWP 27686) exited]
[Thread 0x7fffb6fea700 (LWP 27685) exited]
[Thread 0x7fffb7fff700 (LWP 27684) exited]
[Thread 0x7fffa7fff700 (LWP 27687) exited]
[Thread 0x7fffa6fea700 (LWP 27688) exited]
[Thread 0x7fffa5fd5700 (LWP 27689) exited]
[Thread 0x7fffe8051700 (LWP 27681) exited]
[Thread 0x7fffe37fe700 (LWP 27678) exited]
[Thread 0x7fffe8be1700 (LWP 27676) exited]
[Thread 0x7fffe3fff700 (LWP 27677) exited]
[Thread 0x7fffe27fc700 (LWP 27680) exited]
[Thread 0x7fffe2ffd700 (LWP 27679) exited]
[Thread 0x7fffe95e7700 (LWP 27675) exited]
[Thread 0x7fffea767700 (LWP 27674) exited]
[Inferior 1 (process 27401) exited normally]

Data race, because now it gives Success and SegFault in info_buffer string deallocate with 0 2 512 128:

Code :
...
remaining: 0
after wait
Success
event 0: 0
event 1: 0
event 2: 0
event 3: 0
event 4: 0
event 5: 0
event 6: 0
event 7: 0
[Thread 0x7fffc58f2700 (LWP 11595) exited]
[Thread 0x7fffe1aed700 (LWP 11594) exited]
[Thread 0x7fffb6fea700 (LWP 11597) exited]
[Thread 0x7fffb7fff700 (LWP 11596) exited]
[Thread 0x7fffa6fea700 (LWP 11600) exited]
 
Thread 1 "my_clcb" received signal SIGSEGV, Segmentation fault.
...
[17] id 11601 name my_clcb from 0x00007ffff5db96db in __lll_lock_wait_private+27
[15] id 11599 name my_clcb from 0x00007ffff5db96db in __lll_lock_wait_private+27
[14] id 11598 name my_clcb from 0x00007ffff5db96db in __lll_lock_wait_private+27
[9] id 11593 name my_clcb from 0x00007ffff5db96db in __lll_lock_wait_private+27
[8] id 11418 name si_shader_low:1 from 0x00007ffff5db638d in pthread_cond_wait@@GLIBC_2.3.2
[7] id 11417 name si_shader_low:0 from 0x00007ffff5db638d in pthread_cond_wait@@GLIBC_2.3.2
[6] id 11416 name si_shader:2 from 0x00007ffff5db638d in pthread_cond_wait@@GLIBC_2.3.2
[5] id 11415 name si_shader:1 from 0x00007ffff5db638d in pthread_cond_wait@@GLIBC_2.3.2
[4] id 11414 name si_shader:0 from 0x00007ffff5db638d in pthread_cond_wait@@GLIBC_2.3.2
[3] id 11404 name disk_cache:0 from 0x00007ffff5db638d in pthread_cond_wait@@GLIBC_2.3.2
[2] id 11374 name amdgpu_cs:0 from 0x00007ffff5db638d in pthread_cond_wait@@GLIBC_2.3.2
[1] id 11168 name my_clcb from 0x00007ffff6f9a003 in free+-15036461
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
0x00007ffff6f9a003 in free () from /usr/lib/libc.so.6
>>> bt
#0  0x00007ffff6f9a003 in free () from /usr/lib/libc.so.6
#1  0x000055555556164c in __gnu_cxx::new_allocator<char>::deallocate (this=0x7fffffffd460, __p=0x5555557806a0 "\t\001") at /usr/include/c++/7.2.0/ext/new_allocator.h:125
#2  0x000055555555fbc9 in std::allocator_traits<std::allocator<char> >::deallocate (__a=..., __p=0x5555557806a0 "\t\001", __n=8097) at /usr/include/c++/7.2.0/bits/alloc_traits.h:462
#3  0x000055555555dd50 in std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_destroy (this=0x7fffffffd460, __size=8096) at /usr/include/c++/7.2.0/bits/basic_string.h:210
#4  0x000055555555c746 in std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::_M_dispose (this=0x7fffffffd460) at /usr/include/c++/7.2.0/bits/basic_string.h:205
#5  0x000055555555ae58 in std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >::~basic_string (this=0x7fffffffd460, __in_chrg=<optimized out>) at /usr/include/c++/7.2.0/bits/basic_string.h:620
#6  0x0000555555558dc4 in main (argc=5, argv=0x7fffffffd5e8) at src/my_clcb.cpp:168

Please, what am I doing wrong here? How can I run always Success without segfaults?