I’m learning OpenCL 2.0 and am stuck on synchronizing child kernels with their parent kernels in a simple dynamic-parallelism algorithm.
When it's just incrementing a single value, it seems to work, but when I add more real-world code, the child kernels do not finish before the parent kernel finishes.
I tried clFinish() on the on-device queue, but it returned an invalid-queue error.
Then I tried clEnqueueMarkerWithWaitList (to connect the host queue with the device queue so that both finish before the result buffers are read), which returned the same error.
Then I realized the documentation says these functions are for host queues.
How can I wait for an on-device queue?
My intention is to finish all device-side queues before the buffer read commands start. It doesn’t matter whether the child kernel starts after the parent kernel or at the same time. I just want the child kernel's changes to the data array to be visible on the host queue, so that when I call clFinish on that host queue, memory is consistent.
A working (or at least seemingly working) example:
/*
 * Child kernel: every work-item atomically adds 2 to data[1].
 *
 * data - global int buffer; only element 1 is touched here.
 */
__kernel void testDynamic(__global int *data)
{
    atomic_add(&data[1], 2);
}
/*
 * Parent kernel: every work-item atomically adds 2 to data[0]; the
 * work-item whose global id is 0 additionally enqueues testDynamic as a
 * child kernel on the default on-device queue.
 *
 * data - global int buffer shared with the child kernel.
 */
__kernel void test(__global int * data)
{
    const int workItem = (int)get_global_id(0);

    atomic_add(&data[0], 2);

    /* Only one work-item launches the child; everyone else is done. */
    if (workItem != 0) {
        return;
    }

    /* Child ND-range built with ndrange_1D(1, 1); launched with the
     * CLK_ENQUEUE_FLAGS_WAIT_KERNEL flag. */
    enqueue_kernel(get_default_queue(),
                   CLK_ENQUEUE_FLAGS_WAIT_KERNEL,
                   ndrange_1D(1, 1),
                   ^{ testDynamic(data); });
}
A non-working example:
/*
 * Wave displacement for one vertex (one work-item per vertex).
 *
 * Reads the three floats at xyz[3*id .. 3*id+2], adds xyzn[...] scaled by a
 * sinusoidal factor, and stores the result in xyzo at the same indices.
 *
 * arguments[0] - multiplier applied to the wave factor
 * arguments[1] - value fed into the sine as 40*t (presumably time; confirm)
 * arguments[2], arguments[3] - subtracted from the first two components
 *                              before taking the distance (presumably the
 *                              wave origin; confirm)
 */
__kernel void test( __global float *xyz,__global float *xyzn,__global float *xyzo,__global float * arguments)
{
    const int vertex = get_global_id(0);
    const int base = vertex * 3;

    const float offX = xyz[base] - arguments[2];
    const float offY = xyz[base + 1] - arguments[3];
    const float t = arguments[1];
    const float scale = arguments[0];

    const float wave = 0.02f * scale * sin(40.0f * t + 100.0f * sqrt(offX * offX + offY * offY));

    /* Same displacement formula for each of the three components. */
    for (int axis = 0; axis < 3; ++axis) {
        xyzo[base + axis] = xyz[base + axis] + xyzn[base + axis] * wave;
    }
}
/*
 * Parent kernel: each work-item whose global id is below arguments[4]
 * launches test() as a child kernel on the default on-device queue.
 *
 * The child ND-range is ndrange_1D(threadId, 1, 1), i.e. global offset
 * threadId, global size 1, local size 1, so get_global_id(0) inside the
 * child equals this work-item's id.
 *
 * enqueue_kernel() returns a status and can fail (for instance when the
 * on-device queue has no room left for another command). Ignoring that
 * status means the affected vertex is silently never written. On failure
 * this kernel therefore calls test() directly as an ordinary function;
 * get_global_id(0) still yields threadId there, so the same vertex is
 * computed either way.
 *
 * xyz, xyzn, xyzo, arguments - forwarded unchanged to test().
 * arguments[4]               - upper bound (exclusive) on the ids processed.
 */
__kernel void waveEquation( __global float *xyz,__global float *xyzn,__global float *xyzo,__global float * arguments)
{
    int threadId=get_global_id(0);
    if(threadId<arguments[4])
    {
        queue_t q = get_default_queue();
        ndrange_t ndrange = ndrange_1D(threadId,1,1);
        void (^my_block_A)(void) = ^{test(xyz,xyzn,xyzo,arguments);};
        int status = enqueue_kernel(q, CLK_ENQUEUE_FLAGS_NO_WAIT,ndrange,my_block_A);
        if(status != CLK_SUCCESS)
        {
            /* Device-side launch was rejected: do the work inline instead. */
            test(xyz,xyzn,xyzo,arguments);
        }
    }
}
The code above not only malfunctions, but does so randomly for the first 1000-2000 work-items (some of them do manage to update their data before the host synchronizes on it).