Need feedback on OpenCL optimization

Hello Everyone,

I’ve written a OpenCL code for a function for fractal encoding (fixed partitioning). I get same encoding results with C only code and OpenCL based Code (Intel Core i5-3210M 2.5GHz HD Graphics 4000). My OpenCL code executes 5 times faster on a better GPU Nvidia GeForce 660 GTX . I want to get feedback on my code, if there is a possibility of optimizing and get better results on the Intel Integrated GPU. Please help & review the code. The steps of execution are as below:

Host side:

  1. Read 256x256 gray scale 8bit image
  2. Divide image into range(4x4) rMobj and domain blocks(range size*2) dMobj and store into 1D array(pool of ranges and domains).
  3. Average each domain block(4 pixel average of each domain block) and store into 1D array dMobj . The averaged domain block will become 4x4 size.
  4. Apply transformations i.e. rotate and flip the image at different angles (0: dMobj ,90: d90Mobj ,180,270… ,)

Kernel side :
compare (using Mean square error) each range block with each domain block and its transformations and find out the best matching domain block and store it. Here I’ve taken a tolerance parameter. If MSE < tolerance then that domain block will saved and the further domain blocks will not be searched (this saves lot of time). return best matching domain block to host and it will be written in the file.

rCount is numbers of range blocks
dCount is number of domain blocks
rSize is range size (4x4)
Curr Domain is temporary memory block used in for loop.
iType is nothing but number of transformations of domain blocks.

Kernel code *****************************************

__kernel void calculateRms( __global struct rangeBlock* rMobj, __global struct domainBlock* dMobj, __global struct domainBlock* d90Mobj,
__global struct domainBlock* d180Mobj, __global struct domainBlock* d270Mobj, __global struct domainBlock* ddiaMobj,
__global struct domainBlock* ddia90Mobj, __global struct domainBlock* ddia180Mobj, __global struct domainBlock* ddia270Mobj,
__global struct rdmapping* mappingMobj, int rCount , int dCount ,int rSize, __global struct domainBlock* Curr_domain)
{
int l = get_global_id(0);

int  i=0, j=0, iType=0, k=0;
float m_fTolerance = 10.0;
struct rdmapping data;


long sumaa=0, sumbb=0 ,sumab=0 ,suma=0, sumb=0;
long a=0, b=0;

float s=0, o=0, d=0, dd=0;
int iArea=rSize*rSize;
m_fTolerance = 10.0;
dd=9999999999.0;
Curr_domain = dMobj;

for(k =0;k&lt;dCount;k++)
{
	for(iType=0; iType&lt;8; iType++)
	{
		if (iType == 0){ Curr_domain = dMobj;} 
		if (iType == 1){ Curr_domain = d90Mobj;} 
		if (iType == 2){ Curr_domain = d180Mobj;} 
		if (iType == 3){ Curr_domain = d270Mobj;} 
		if (iType == 4){ Curr_domain = ddiaMobj;} 
		if (iType == 5){ Curr_domain = ddia90Mobj;} 
		if (iType == 6){ Curr_domain = ddia180Mobj;} 
		if (iType == 7){ Curr_domain = ddia270Mobj;}
		s = o= d=0;
		sumaa=sumbb=sumab=suma=sumb=0;
		for(i=0;i&lt;rSize;i++)
		{
			for(j=0;j&lt;rSize;j++)
			{
				a=(int)(rMobj[l].intensity[j*rSize+i]);
				b=(int)(Curr_domain[k].intensity[j*rSize+i]);
				sumaa+=a*a;
				sumbb+=b*b;
				sumab+=a*b;
				suma+=(long)a;
				sumb+=(long)b;
			}
		}
		if((iArea*sumbb-sumb*sumb)==0)
			s=0;
		else
			s=((double)(iArea*sumab-suma*sumb))/((double)(iArea*sumbb-sumb*sumb));

		o=((double)(suma-s*sumb))/((double)iArea);
		d=((double)(sumaa+s*(s*sumbb-2*sumab+2*o*sumb)+o*(o*iArea-2*suma)))/((double)iArea);
		
		if(d&lt;dd)
		{
				data.trans = iType;
				data.o=o;
				data.s=s;
				dd=d;
				data.dX=Curr_domain[k].x;
				data.dY=Curr_domain[k].y;
		}
	}
		if(d &lt; m_fTolerance)
		{
				iType=8;
				k= dCount;
		}
}

mappingMobj[l].s = data.s;
mappingMobj[l].o = data.o;
mappingMobj[l].trans = data.trans;
mappingMobj[l].dX = data.dX;
mappingMobj[l].dY =data.dY;

}

Looking at your code, two things jump out right away:

[ul]
[li] sumaa and suma do not need to be in the loop at all, they only need to be calculated once per range block/work item (rCount times total), instead of 8 * dCount * rCount times.
[/li][li] sumbb and sumb only need to be calculated once per domain block (8 * dCount times), where right now you are calculating them 8 * dCount * rCount times.
[/li][/ul]

Whoops, double post but I also just noticed, your inner loop for calculating your sums:

[code=c]
for(i=0;i<rSize;i++)
{
for(j=0;j<rSize;j++)
{
// sums
}
}


should probably have the loop indices switched
[code=c]
for(j=0;j<rSize;j++)
{
	for(i=0;i<rSize;i++)
	{
		// sums
	}
}

since right now you are iterating over columns then rows instead of rows then columns. Depending on the value of rSize it may not make a big difference though.

A few more tips, in addition to cartographer’s good advice:

  • I see you are using long and double datatypes in your arithmetic. These calculations will be much faster if you can get away with doing them in 32-bit datatypes (int, float). It looks that this might be possible for some of them, given that many of the temp variables are floats anyway.
  • It looks like suma and sumaa are recalculated each time through the inner two loops. Perhaps the compile could figure out to pull these out of the loops, but perhaps putting the code in an outer loop would help the compiler figure that out. :slight_smile:
  • It might be better to write the mappingMobj[l] = data to copy the final result to memory. It’s probably not a first-order problem here, but it might help.

Thanks for the reply cartographer and kunze.
I can not write sumaa and sumbb outside the loop.
sumaa is summation of squares of intensity of each element of range block .
sumaa = r1^2 + r2^2+ …+rn^2 where r1,r2,…rn are elements of range block. Same is for domain blocks.

[QUOTE=MenkaMore;30512]Thanks for the reply cartographer and kunze.
I can not write sumaa and sumbb outside the loop.
sumaa is summation of squares of intensity of each element of range block .
sumaa = r1^2 + r2^2+ …+rn^2 where r1,r2,…rn are elements of range block. Same is for domain blocks.[/QUOTE]

Yes, sumbb would require making a separate kernel and storing the results. It may or may not be worth it to do so. On the other hand, sumaa can be moved outside the loop, it does not depend on any other variables. I’ve modified your code a bit but here is my version (note the comments):

[code=c]

// rCount isn’t even used, currDomain can just be declared inside the function,
// and I just put all the domain block types in a pointer array
__kernel void calculateRms( __global struct rangeBlock* rangeBlocks, __global struct domainBlock* domainBlocks[8],
__global struct rdmapping* outputMapping, const int dCount, const int rSize)
{
const float tolerance = 10.0;
const int rangeIdx = get_global_id(0),
iArea = rSize * rSize;

// sumAA and sumA do not depend on the domain blocks at all,
// there's really no reason they can't be out here.
int sumAA = 0,
    sumA = 0;
for (int i = 0; i &lt; rSize; ++i)
{
    for (int j = 0; j &lt; rSize; ++j)
    {
        int a = rangeBlocks[rangeIdx].intensity[i * rSize + j];

        sumAA += a * a;
        sumA += a;
    }
}

float dMin = FLT_MAX;
struct rdmapping bestMatch;

for (int domainIdx = 0; domainIdx &lt; dCount; ++domainIdx)
{
    for (int domainType = 0; domainType &lt; 8; ++domainType)
    {
        // putting the domain block types in an array of pointers is a bit
        // nicer than a bunch of if statements, might be faster too for all I know
        __global struct domainBlock* currDomain = domainBlocks[domainType];

        // I'm not sure whats going on with all the changes between int and float are?
        // regardless, I've stuck to what you were doing
        int sumAB = 0,
            sumBB = 0,
            sumB = 0;
        for (int i = 0; i &lt; rSize; ++i)
        {
            for (int j = 0; j &lt; rSize; ++j)
            {
                int a = rangeBlocks[rangeIdx].intensity[i * rSize + j],
                    b = currDomain[domainIdx].intensity[i * rSize + j];

                // the only sum that really needs to be in here is sumAB
                sumAB += a * b;

                // but to move BB and B out would require making a separate kernel
                // and storing the results somewhere, which might not be worth it
                sumBB += b * b;
                sumB += b;
            }
        }

        float denom = iArea * sumBB - sumB * sumB,
            s = 0.0f;

        if (denom != 0)
            s = (iArea * sumAB - sumA * sumB) / denom;

        float o = (sumA - s * sumB) / iArea,
            d = (sumAA + s * (s * sumBB - 2.0f * sumAB + 2.0f * o * sumB) + o * (o * iArea - 2.0f * sumA)) / iArea;

        if (d &lt; dMin)
        {
            dMin = d;

            bestMatch.trans = domainType;
            bestMatch.o = o;
            bestMatch.s = s;
            bestMatch.dX = currDomain[domainIdx].x;
            bestMatch.dY = currDomain[domainIdx].y;
        }
    }

    // will cause thread divergence, if it matters
    if (dMin &lt; tolerance)
        break;
}

outputMapping[rangeIdx] = bestMatch;

}