for loop problem

Hello,

The following algorithm is skipped when L>1 (the number of iteration), except when the INIT part is outside the for loop :


__kernel void test(__global float* f, __global float* u, __global float* v, __local float* vLocal,
				   __local float* p1Local, __local float* p2Local, int gHeight, int gWidth)
{
    // Block index
    int bi = get_group_id(1);
    int bj = get_group_id(0);

    // Thread index
    int li = get_local_id(1);
    int lj = get_local_id(0);
	
	int lHeight = get_local_size(1)+2;
	int lWidth  = get_local_size(0)+2;

	//global coordinates
	int gi = bi * BLOCK_SIZE + li;
	int gj = bj * BLOCK_SIZE + lj;

	if( (gi-1<0) || (gi+1>gHeight-1) || (gj-1<0) || (gj+1>gWidth-1) )
		return;
	
	li+=1;
	lj+=1;

	barrier(CLK_LOCAL_MEM_FENCE);
	
	float pij1;
	float pij2;

	#pragma unroll
	for(int l=0 ; l<L ; l++)
	{	
		// INIT

		// fill local memory
		p1Local[li*lWidth+lj] = 0;
		p2Local[li*lWidth+lj] = 0;
		vLocal[li*lWidth+lj] = v[gi*gWidth+gj];

		// local memory edges
		if(li-1 == 0)
		{
			p1Local[(li-1)*lWidth+lj] = p2Local[(li-1)*lWidth+lj] = 0;
			vLocal[(li-1)*lWidth+lj] = v[(gi-1)*gWidth+gj];
		}
		else if(li+1 == lHeight-1)
		{
			p1Local[(li+1)*lWidth+lj] = p2Local[(li+1)*lWidth+lj] = 0;
			vLocal[(li+1)*lWidth+lj] = v[(gi+1)*gWidth+gj];
		}
		if(lj-1 == 0)
		{
			p1Local[li*lWidth+lj-1] = p2Local[li*lWidth+lj-1] = 0;
			vLocal[li*lWidth+lj-1] = v[gi*gWidth+gj-1];
		}
		else if(lj+1 == lWidth-1)
		{
			p1Local[li*lWidth+lj+1] = p2Local[li*lWidth+lj+1] = 0;
			vLocal[li*lWidth+lj+1] = v[gi*gWidth+gj+1];
		}

		if(li+1 == lHeight-1 && lj-1 == 0)
			p1Local[(li+1)*lWidth+lj-1] = p2Local[(li+1)*lWidth+lj-1] = 0;
		else if(li-1 == 0 && lj+1 == lWidth-1)
			p1Local[(li-1)*lWidth+lj+1] = p2Local[(li-1)*lWidth+lj+1] = 0;

		// END INIT

		// 3.30: Gradient Descent: 
		#pragma unroll
		for(int k=0 ; k<K ; k++)
		{
			// 3.30

			//p[li*gWidth+lj] will be used many times:
			pij1 = p1Local[li*lWidth+lj];
			pij2 = p2Local[li*lWidth+lj];

			//div(p): divP
			float dp1i = pij1-p1Local[(li-1)*lWidth+lj];
			float dp2j = pij2-p2Local[li*lWidth+lj-1];
			float divP = dp1i + dp2j;

			//VTD:
			float VTD = vLocal[li*lWidth+lj]+THETA*divP;

			// dVTDi:
			float dp1i_ip1 = p1Local[(li+1)*lWidth+lj] - pij1;
			float dp2j_ip1 = p2Local[(li+1)*lWidth+lj] - p2Local[(li+1)*lWidth+lj-1];
			float divP_ip1 = dp1i_ip1 + dp2j_ip1;

			float VTD_ip1 = vLocal[(li+1)*lWidth+lj] + THETA*divP_ip1;
			float dVTDi = VTD_ip1 - VTD;

			// dVTDj:
			float dp1i_jp1 = p1Local[li*lWidth+lj+1]-p1Local[(li-1)*lWidth+lj+1];
			float dp2j_jp1 = p2Local[li*lWidth+lj+1]-pij2;
			float divP_jp1 = dp1i_jp1 + dp2j_jp1;

			float VTD_jp1 = vLocal[li*lWidth+lj+1] + THETA*divP_jp1;
			float dVTDj = VTD_jp1 - VTD;

			// p:
			pij1 = pij1 + (TAU/THETA)*dVTDi;
			pij2 = pij2 + (TAU/THETA)*dVTDj;

			// 3.31
			float normPij = sqrt(pij1*pij1 + pij2*pij2);

			pij1 = pij1 / max((float)1.0, normPij);
			pij2 = pij2 / max((float)1.0, normPij);

			barrier(CLK_LOCAL_MEM_FENCE);
			p1Local[li*lWidth+lj] = pij1;
			p2Local[li*lWidth+lj] = pij2;
		}

		// 3.32:
		barrier(CLK_LOCAL_MEM_FENCE);
		
		float dp1i = pij1-p1Local[(li-1)*lWidth+lj];
		float dp2j = pij2-p2Local[li*lWidth+lj-1];
		float divP = dp1i + dp2j;
		
		u[gi*gWidth+gj] = vLocal[li*lWidth+lj] + THETA*divP;

		// 3.33:
		float umf = u[gi*gWidth+gj] - f[gi*gWidth+gj];

		if(umf > LAMBDA*THETA)
			v[gi*gWidth+gj] = u[gi*gWidth+gj] - LAMBDA*THETA;
		else if (umf < -LAMBDA*THETA)
			v[gi*gWidth+gj] = u[gi*gWidth+gj] + LAMBDA*THETA;
		else
			v[gi*gWidth+gj] = f[gi*gWidth+gj];
		
	}
}

Do you have any idea why ?

Thanks,
Arthur

if you do


#pragma unroll
   for(int l=0 ; l<L ; l++)
   {   
      // INIT

      // fill local memory
      p1Local[li*lWidth+lj] = 0;
      p2Local[li*lWidth+lj] = 0;
      vLocal[li*lWidth+lj] = v[gi*gWidth+gj];
      ...
   }

Then the compiler is attempting to do something like:


p1Local[li*lWidth+lj] = 0;
p2Local[li*lWidth+lj] = 0;
vLocal[li*lWidth+lj] = v[gi*gWidth+gj];
// rest of the loop operations for l=0
p1Local[li*lWidth+lj] = 0;
p2Local[li*lWidth+lj] = 0;
vLocal[li*lWidth+lj] = v[gi*gWidth+gj];
// rest of the loop operations for l=1
p1Local[li*lWidth+lj] = 0;
p2Local[li*lWidth+lj] = 0;
vLocal[li*lWidth+lj] = v[gi*gWidth+gj];
// rest of the loop operations for l=2
// etc etc 

And the compiler probably realizes that you’re assigning a value to the same three spots over and over, with no dependency on the for loop variable “l” so it will skip all iterations. Even if it DIDN’T skip the iterations, the output would be the exact same.

simplified version of the problem:


for(int x=0; x<bigX; x++)
{
      y = 2; // a good compiler will remove the loop and leave "y=2;" only
}

Thanks for your answer Chai,

I am aware of this compiler optimization, but in my case the whole loop is skipped, it is not even executed once ! (if L=1 it works, if L>1 it is skipped)

Moreover, the global variable v is modified, and vLocal is each time initialized with v so it should not skip any iteration.

I tried to add/remove #pragma unroll and it does not change anything.

If I remove only the lines


		if(li+1 == lHeight-1 && lj-1 == 0)
			p1Local[(li+1)*lWidth+lj-1] = p2Local[(li+1)*lWidth+lj-1] = 0;

or


		else if(li-1 == 0 && lj+1 == lWidth-1)
			p1Local[(li-1)*lWidth+lj+1] = p2Local[(li-1)*lWidth+lj+1] = 0;

or if I remove all the


barrier(CLK_LOCAL_MEM_FENCE);

it works.

The problem might be that the barriers don’t work with the if() else() statement (which make some threads differ from the others).

I have the same problem on another very similar algorithm and removing the barriers did NOT solve anything. There are others if() else() statements which may cause the problem but I cannot just remove them !

Help !

On the other algorithm, my code ends by those lines:


		vLocal[li*lWidth+lj] = uLocal[li*lWidth+lj] + LAMBDA*THETA*dI1 * (rou < -LTdI2)
													- LAMBDA*THETA*dI1 * (rou > LTdI2)
													- (rou / dI1) * (fabs(rou)<LTdI2);

(which is like an if() else() statement)

Now this algorithm is skipped, and it works when I replace those lines by:


vLocal[li*lWidth+lj] = uLocal[li*lWidth+lj] + LAMBDA*THETA*dI1;

Any idea why ???
I removed all the barriers in my code.

Help !!

Forget the last post, I had an error.