Hi everyone!
I’m benchmarking a parallelized algorithm and I want to run 1000 iterations of it.
The thing is, my global range for a single iteration is 82369. To get those 1000 iterations I decided to multiply 82369 by 1000 and then compute the offset inside the kernel. The problem is that this raises CL_OUT_OF_RESOURCES at enqueueReadBuffer.
Then I tried running a for loop 1000 times inside the kernel while keeping the global range at 82369, but the same CL_OUT_OF_RESOURCES error came out, also at enqueueReadBuffer.
Then I tried some variations, such as a global range of 82369*20 combined with a for loop of 50 iterations, but CL_OUT_OF_RESOURCES always came out.
I couldn’t find any explanation for this, or any information explaining the relation I observed between the global range and the number of loop iterations inside the kernel.
If anyone has an idea why this happens (or whether what I am trying is even possible), I would appreciate it.
This is the kernel code (most of it):
// Kernel `square`: each work-item derives an index `idx` from its global id
// and scans BTSET[0..NBTS-1] for an entry equal to that index. Two parts of
// the body are elided ("...") in this listing, so what the kernel finally
// computes and stores is not visible here.
//
// Arguments:
//   NBTS            upper bound of the two inner loops over BTSET below
//   GRID_SIZE_X/_Y  multiplied together to form GRID_SIZE
//   radix           not referenced in the visible code
//   BTSET           array compared element by element against idx
//   fitness         not referenced in the visible code (presumably an
//                   output buffer - TODO confirm)
//   covered_points  single int, reset to 0 by every work-item whose idx is 0
//
// NOTE(review): the four scalar arguments carry the __constant qualifier.
// OpenCL C only allows address-space qualifiers on pointer kernel arguments;
// non-pointer arguments live in the private address space. Some compilers
// reject this signature - confirm against the target platform.
__kernel void square(__constant unsigned long NBTS,
__constant unsigned long GRID_SIZE_X,
__constant unsigned long GRID_SIZE_Y,
__constant unsigned long radix,
__global unsigned long *BTSET,
__global long *fitness,
__global int *covered_points
)
{
long globaID = get_global_id(0);
// NOTE(review): GRID_SIZE is __local (one copy per work-group) but is
// assigned only by the work-item whose *global* id is 0. The barrier below
// synchronizes work-items within a single work-group, so work-items in every
// other work-group appear to read a GRID_SIZE that was never assigned and
// then divide by it. A private `long GRID_SIZE = GRID_SIZE_X*GRID_SIZE_Y;`
// (or a get_local_id(0)==0 guard) looks like what was intended - confirm.
__local long GRID_SIZE;
if (globaID==0)
GRID_SIZE = GRID_SIZE_X*GRID_SIZE_Y;
barrier(CLK_LOCAL_MEM_FENCE);
// Scratch variables; x, y, x_k, y_k, x1, y1, rx, ry, cover_rate and fit are
// not used in the visible part of the body.
int contador;
long x, y;
long x_k, y_k;
long x1, y1,rx,ry;
float cover_rate, fit;
int check=0,j,i,m;
// Offset: idy is the quotient and idx the remainder of globaID / GRID_SIZE,
// i.e. idy selects the repetition and idx the position inside one grid.
long idy = globaID / GRID_SIZE;
long idx = globaID - idy * GRID_SIZE;
// NOTE(review): every work-item with idx==0 performs this store, and nothing
// visible here orders it against accesses to covered_points by other
// work-items - verify this cannot wipe out values already accumulated.
if (idx==0)
{
*covered_points = 0; // reset the value behind covered_points
}
for (j=0; j<1; j++) // <--- this bound (1 here) is where the 1000 iterations go
{
// contador is reset on every iteration; check is not reset anywhere in the
// visible code, so once it becomes 1 the second loop below would be skipped
// for all later iterations - TODO confirm this is intended.
contador = 0;
// Look for an entry of BTSET equal to this work-item's idx.
// (m is an int compared against the unsigned long NBTS.)
for(m=0; m<NBTS; m++)
{
if(BTSET[m]==idx)
{
...
}
}
// Runs only while check is not 1; the loop body is elided in this listing.
if (check!=1)
{
for (i=0; i<NBTS; i++)
{
...
}
}
}
}
If you would like some more information, please just ask.
Thanks in advance …