PDA

View Full Version : for loop problem

arthur.sw
03-22-2011, 09:17 AM
Hello,

The following algorithm is skipped when L>1 (the number of iteration), except when the INIT part is outside the for loop :

__kernel void test(__global float* f, __global float* u, __global float* v, __local float* vLocal,
__local float* p1Local, __local float* p2Local, int gHeight, int gWidth)
{
// Block index
int bi = get_group_id(1);
int bj = get_group_id(0);

int li = get_local_id(1);
int lj = get_local_id(0);

int lHeight = get_local_size(1)+2;
int lWidth = get_local_size(0)+2;

//global coordinates
int gi = bi * BLOCK_SIZE + li;
int gj = bj * BLOCK_SIZE + lj;

if( (gi-1<0) || (gi+1>gHeight-1) || (gj-1<0) || (gj+1>gWidth-1) )
return;

li+=1;
lj+=1;

barrier(CLK_LOCAL_MEM_FENCE);

float pij1;
float pij2;

#pragma unroll
for(int l=0 ; l<L ; l++)
{
// INIT

// fill local memory
p1Local[li*lWidth+lj] = 0;
p2Local[li*lWidth+lj] = 0;
vLocal[li*lWidth+lj] = v[gi*gWidth+gj];

// local memory edges
if(li-1 == 0)
{
p1Local[(li-1)*lWidth+lj] = p2Local[(li-1)*lWidth+lj] = 0;
vLocal[(li-1)*lWidth+lj] = v[(gi-1)*gWidth+gj];
}
else if(li+1 == lHeight-1)
{
p1Local[(li+1)*lWidth+lj] = p2Local[(li+1)*lWidth+lj] = 0;
vLocal[(li+1)*lWidth+lj] = v[(gi+1)*gWidth+gj];
}
if(lj-1 == 0)
{
p1Local[li*lWidth+lj-1] = p2Local[li*lWidth+lj-1] = 0;
vLocal[li*lWidth+lj-1] = v[gi*gWidth+gj-1];
}
else if(lj+1 == lWidth-1)
{
p1Local[li*lWidth+lj+1] = p2Local[li*lWidth+lj+1] = 0;
vLocal[li*lWidth+lj+1] = v[gi*gWidth+gj+1];
}

if(li+1 == lHeight-1 && lj-1 == 0)
p1Local[(li+1)*lWidth+lj-1] = p2Local[(li+1)*lWidth+lj-1] = 0;
else if(li-1 == 0 && lj+1 == lWidth-1)
p1Local[(li-1)*lWidth+lj+1] = p2Local[(li-1)*lWidth+lj+1] = 0;

// END INIT

// 3.30: Gradient Descent:
#pragma unroll
for(int k=0 ; k<K ; k++)
{
// 3.30

//p[li*gWidth+lj] will be used many times:
pij1 = p1Local[li*lWidth+lj];
pij2 = p2Local[li*lWidth+lj];

//div(p): divP
float dp1i = pij1-p1Local[(li-1)*lWidth+lj];
float dp2j = pij2-p2Local[li*lWidth+lj-1];
float divP = dp1i + dp2j;

//VTD:
float VTD = vLocal[li*lWidth+lj]+THETA*divP;

// dVTDi:
float dp1i_ip1 = p1Local[(li+1)*lWidth+lj] - pij1;
float dp2j_ip1 = p2Local[(li+1)*lWidth+lj] - p2Local[(li+1)*lWidth+lj-1];
float divP_ip1 = dp1i_ip1 + dp2j_ip1;

float VTD_ip1 = vLocal[(li+1)*lWidth+lj] + THETA*divP_ip1;
float dVTDi = VTD_ip1 - VTD;

// dVTDj:
float dp1i_jp1 = p1Local[li*lWidth+lj+1]-p1Local[(li-1)*lWidth+lj+1];
float dp2j_jp1 = p2Local[li*lWidth+lj+1]-pij2;
float divP_jp1 = dp1i_jp1 + dp2j_jp1;

float VTD_jp1 = vLocal[li*lWidth+lj+1] + THETA*divP_jp1;
float dVTDj = VTD_jp1 - VTD;

// p:
pij1 = pij1 + (TAU/THETA)*dVTDi;
pij2 = pij2 + (TAU/THETA)*dVTDj;

// 3.31
float normPij = sqrt(pij1*pij1 + pij2*pij2);

pij1 = pij1 / max((float)1.0, normPij);
pij2 = pij2 / max((float)1.0, normPij);

barrier(CLK_LOCAL_MEM_FENCE);
p1Local[li*lWidth+lj] = pij1;
p2Local[li*lWidth+lj] = pij2;
}

// 3.32:
barrier(CLK_LOCAL_MEM_FENCE);

float dp1i = pij1-p1Local[(li-1)*lWidth+lj];
float dp2j = pij2-p2Local[li*lWidth+lj-1];
float divP = dp1i + dp2j;

u[gi*gWidth+gj] = vLocal[li*lWidth+lj] + THETA*divP;

// 3.33:
float umf = u[gi*gWidth+gj] - f[gi*gWidth+gj];

if(umf > LAMBDA*THETA)
v[gi*gWidth+gj] = u[gi*gWidth+gj] - LAMBDA*THETA;
else if (umf < -LAMBDA*THETA)
v[gi*gWidth+gj] = u[gi*gWidth+gj] + LAMBDA*THETA;
else
v[gi*gWidth+gj] = f[gi*gWidth+gj];

}
}

Do you have any idea why ?

Thanks,
Arthur

chai
03-22-2011, 02:20 PM
if you do

#pragma unroll
for(int l=0 ; l<L ; l++)
{
// INIT

// fill local memory
p1Local[li*lWidth+lj] = 0;
p2Local[li*lWidth+lj] = 0;
vLocal[li*lWidth+lj] = v[gi*gWidth+gj];
...
}

Then the compiler is attempting to do something like:

p1Local[li*lWidth+lj] = 0;
p2Local[li*lWidth+lj] = 0;
vLocal[li*lWidth+lj] = v[gi*gWidth+gj];
// rest of the loop operations for l=0
p1Local[li*lWidth+lj] = 0;
p2Local[li*lWidth+lj] = 0;
vLocal[li*lWidth+lj] = v[gi*gWidth+gj];
// rest of the loop operations for l=1
p1Local[li*lWidth+lj] = 0;
p2Local[li*lWidth+lj] = 0;
vLocal[li*lWidth+lj] = v[gi*gWidth+gj];
// rest of the loop operations for l=2
// etc etc

And the compiler probably realizes that you're assigning a value to the same three spots over and over, with no dependency on the for loop variable "l" so it will skip all iterations. Even if it DIDN'T skip the iterations, the output would be the exact same.

simplified version of the problem:

for(int x=0; x<bigX; x++)
{
y = 2; // a good compiler will remove the loop and leave "y=2;" only
}

arthur.sw
03-23-2011, 03:50 AM

I am aware of this compiler optimization, but in my case the whole loop is skipped, it is not even executed once ! (if L=1 it works, if L>1 it is skipped)

Moreover, the global variable v is modified, and vLocal is each time initialized with v so it should not skip any iteration.

I tried to add/remove #pragma unroll and it does not change anything.

If I remove only the lines

if(li+1 == lHeight-1 && lj-1 == 0)
p1Local[(li+1)*lWidth+lj-1] = p2Local[(li+1)*lWidth+lj-1] = 0;

or

else if(li-1 == 0 && lj+1 == lWidth-1)
p1Local[(li-1)*lWidth+lj+1] = p2Local[(li-1)*lWidth+lj+1] = 0;

or if I remove all the

barrier(CLK_LOCAL_MEM_FENCE);

it works.

The problem might be that the barriers don't work with the if() else() statement (which make some threads differ from the others).

I have the same problem on another very similar algorithm and removing the barriers did NOT solve anything. There are others if() else() statements which may cause the problem but I cannot just remove them !

Help !

arthur.sw
03-23-2011, 04:38 AM
On the other algorithm, my code ends by those lines:

vLocal[li*lWidth+lj] = uLocal[li*lWidth+lj] + LAMBDA*THETA*dI1 * (rou < -LTdI2)
- LAMBDA*THETA*dI1 * (rou > LTdI2)
- (rou / dI1) * (fabs(rou)<LTdI2);

(which is like an if() else() statement)

Now this algorithm is skipped, and it works when I replace those lines by:

vLocal[li*lWidth+lj] = uLocal[li*lWidth+lj] + LAMBDA*THETA*dI1;

Any idea why ???
I removed all the barriers in my code.

Help !!

arthur.sw
03-23-2011, 08:06 AM
Forget the last post, I had an error.