Hello,
The following algorithm is skipped when L>1 (the number of iteration), except when the INIT part is outside the for loop :
__kernel void test(__global float* f, __global float* u, __global float* v, __local float* vLocal,
__local float* p1Local, __local float* p2Local, int gHeight, int gWidth)
{
// Block index
int bi = get_group_id(1);
int bj = get_group_id(0);
// Thread index
int li = get_local_id(1);
int lj = get_local_id(0);
int lHeight = get_local_size(1)+2;
int lWidth = get_local_size(0)+2;
//global coordinates
int gi = bi * BLOCK_SIZE + li;
int gj = bj * BLOCK_SIZE + lj;
if( (gi-1<0) || (gi+1>gHeight-1) || (gj-1<0) || (gj+1>gWidth-1) )
return;
li+=1;
lj+=1;
barrier(CLK_LOCAL_MEM_FENCE);
float pij1;
float pij2;
#pragma unroll
for(int l=0 ; l<L ; l++)
{
// INIT
// fill local memory
p1Local[li*lWidth+lj] = 0;
p2Local[li*lWidth+lj] = 0;
vLocal[li*lWidth+lj] = v[gi*gWidth+gj];
// local memory edges
if(li-1 == 0)
{
p1Local[(li-1)*lWidth+lj] = p2Local[(li-1)*lWidth+lj] = 0;
vLocal[(li-1)*lWidth+lj] = v[(gi-1)*gWidth+gj];
}
else if(li+1 == lHeight-1)
{
p1Local[(li+1)*lWidth+lj] = p2Local[(li+1)*lWidth+lj] = 0;
vLocal[(li+1)*lWidth+lj] = v[(gi+1)*gWidth+gj];
}
if(lj-1 == 0)
{
p1Local[li*lWidth+lj-1] = p2Local[li*lWidth+lj-1] = 0;
vLocal[li*lWidth+lj-1] = v[gi*gWidth+gj-1];
}
else if(lj+1 == lWidth-1)
{
p1Local[li*lWidth+lj+1] = p2Local[li*lWidth+lj+1] = 0;
vLocal[li*lWidth+lj+1] = v[gi*gWidth+gj+1];
}
if(li+1 == lHeight-1 && lj-1 == 0)
p1Local[(li+1)*lWidth+lj-1] = p2Local[(li+1)*lWidth+lj-1] = 0;
else if(li-1 == 0 && lj+1 == lWidth-1)
p1Local[(li-1)*lWidth+lj+1] = p2Local[(li-1)*lWidth+lj+1] = 0;
// END INIT
// 3.30: Gradient Descent:
#pragma unroll
for(int k=0 ; k<K ; k++)
{
// 3.30
//p[li*gWidth+lj] will be used many times:
pij1 = p1Local[li*lWidth+lj];
pij2 = p2Local[li*lWidth+lj];
//div(p): divP
float dp1i = pij1-p1Local[(li-1)*lWidth+lj];
float dp2j = pij2-p2Local[li*lWidth+lj-1];
float divP = dp1i + dp2j;
//VTD:
float VTD = vLocal[li*lWidth+lj]+THETA*divP;
// dVTDi:
float dp1i_ip1 = p1Local[(li+1)*lWidth+lj] - pij1;
float dp2j_ip1 = p2Local[(li+1)*lWidth+lj] - p2Local[(li+1)*lWidth+lj-1];
float divP_ip1 = dp1i_ip1 + dp2j_ip1;
float VTD_ip1 = vLocal[(li+1)*lWidth+lj] + THETA*divP_ip1;
float dVTDi = VTD_ip1 - VTD;
// dVTDj:
float dp1i_jp1 = p1Local[li*lWidth+lj+1]-p1Local[(li-1)*lWidth+lj+1];
float dp2j_jp1 = p2Local[li*lWidth+lj+1]-pij2;
float divP_jp1 = dp1i_jp1 + dp2j_jp1;
float VTD_jp1 = vLocal[li*lWidth+lj+1] + THETA*divP_jp1;
float dVTDj = VTD_jp1 - VTD;
// p:
pij1 = pij1 + (TAU/THETA)*dVTDi;
pij2 = pij2 + (TAU/THETA)*dVTDj;
// 3.31
float normPij = sqrt(pij1*pij1 + pij2*pij2);
pij1 = pij1 / max((float)1.0, normPij);
pij2 = pij2 / max((float)1.0, normPij);
barrier(CLK_LOCAL_MEM_FENCE);
p1Local[li*lWidth+lj] = pij1;
p2Local[li*lWidth+lj] = pij2;
}
// 3.32:
barrier(CLK_LOCAL_MEM_FENCE);
float dp1i = pij1-p1Local[(li-1)*lWidth+lj];
float dp2j = pij2-p2Local[li*lWidth+lj-1];
float divP = dp1i + dp2j;
u[gi*gWidth+gj] = vLocal[li*lWidth+lj] + THETA*divP;
// 3.33:
float umf = u[gi*gWidth+gj] - f[gi*gWidth+gj];
if(umf > LAMBDA*THETA)
v[gi*gWidth+gj] = u[gi*gWidth+gj] - LAMBDA*THETA;
else if (umf < -LAMBDA*THETA)
v[gi*gWidth+gj] = u[gi*gWidth+gj] + LAMBDA*THETA;
else
v[gi*gWidth+gj] = f[gi*gWidth+gj];
}
}
Do you have any idea why ?
Thanks,
Arthur