Hello,
I have Bayer demosaic algorithms in C++ and in an optimized assembler version. Now I want to develop an OpenCL implementation to compare performance. I expected much higher performance, but was really surprised by how slow my kernel was. (It is my first, naive implementation.)
I am new to OpenCL, so I want to ask for hints on how I can improve the performance.
First my system: i5 with 4x3GHz 8 GB RAM, Radeon HD6870, W7 64 bit, with AMD APP SDKv2.
Performance was measured over 100 runs; the input image size is 2336x1752 and the output should be an RGB image. I know that an RGBA image has better memory alignment, but I need RGB too.
Values are bayer demosaic operations per second
C++ Single: 26
ASM Single: 120
C++ openMP: 89
ASM OpenMP: 440
openCL: 120
Here is my OpenCL kernel; thank you for your help.
/*
 * Bilinear (3x3 neighbourhood) demosaic of an 8-bit Bayer image into packed
 * 8-bit RGB.
 *
 * Each work-item reads a 4x4 window of source bytes whose top-left corner is
 * at column 2*x, row 2*y (x, y = global ids) and produces the four RGB pixels
 * for the centre 2x2 of that window.  The four pixels are written as a 2x2
 * block at column 2*x, row 2*y of the destination, whose row width is taken
 * to be 2 * get_global_size(0) pixels (3 bytes per pixel).
 *
 * The window layout used by the interpolation below is:
 *
 *        col: 0  1  2  3        (.x .y .z .w)
 *   line_0    G  R  G  R
 *   line_1    B  G  B  G
 *   line_2    G  R  G  R
 *   line_3    B  G  B  G
 *
 * oDestination  packed RGB output, 3 bytes per pixel
 * iSource       Bayer input, 1 byte per pixel, row stride iWidth
 * iWidth        source row stride in pixels
 * iHeight       not used by this kernel
 *
 * NOTE(review): no bounds check is performed.  The host presumably has to
 * choose the global range so that columns 2*x .. 2*x+3 and rows
 * 2*y .. 2*y+3 stay inside the source image - confirm against the launch code.
 */
__kernel void convert_3x3_Bayer8_to_RGB8_GR_NoBorder( __global uchar * oDestination, __global uchar * iSource, int iWidth, int iHeight)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // 4 output pixels per work-item: 2 pixels on each of 2 lines.
    const int destWidth = 2 * get_global_size(0);
    const int destX = 2 * x;
    const int destY = 2 * y;

    // Fetch the 4x4 window, one 4-byte vector load per row.  vload4 only
    // requires element (1-byte) alignment for uchar, so any column is fine.
    __global uchar *window = iSource + (2 * y) * iWidth + 2 * x;
    const uchar4 line_0 = vload4(0, window);
    const uchar4 line_1 = vload4(0, window + iWidth);
    const uchar4 line_2 = vload4(0, window + 2 * iWidth);
    const uchar4 line_3 = vload4(0, window + 3 * iWidth);

    // Scalar uchar operands are promoted to int before '+', so the
    // four-sample sums cannot overflow; every result fits in 8 bits.

    // first pixel, first line (green site at row 1, col 1)
    const uchar red_00   = hadd(line_0.y, line_2.y);
    const uchar green_00 = line_1.y;
    const uchar blue_00  = hadd(line_1.x, line_1.z);

    // second pixel, first line (blue site at row 1, col 2)
    const uchar red_01   = (uchar)((line_0.y + line_0.w + line_2.y + line_2.w) / 4);
    const uchar green_01 = (uchar)((line_0.z + line_2.z + line_1.y + line_1.w) / 4);
    const uchar blue_01  = line_1.z;

    // first pixel, second line (red site at row 2, col 1)
    const uchar red_10   = line_2.y;
    const uchar green_10 = (uchar)((line_1.y + line_3.y + line_2.x + line_2.z) / 4);
    const uchar blue_10  = (uchar)((line_1.x + line_1.z + line_3.x + line_3.z) / 4);

    // second pixel, second line (green site at row 2, col 2)
    const uchar red_11   = hadd(line_2.y, line_2.w);
    const uchar green_11 = line_2.z;
    const uchar blue_11  = hadd(line_1.z, line_3.z);

    // Write the first destination line: two adjacent RGB triplets.
    int destPixelIndex = (destY * destWidth + destX) * 3;
    oDestination[destPixelIndex]     = red_00;
    oDestination[destPixelIndex + 1] = green_00;
    oDestination[destPixelIndex + 2] = blue_00;
    oDestination[destPixelIndex + 3] = red_01;
    oDestination[destPixelIndex + 4] = green_01;
    oDestination[destPixelIndex + 5] = blue_01;

    // Write the second destination line, one destination row further down.
    destPixelIndex += destWidth * 3;
    oDestination[destPixelIndex]     = red_10;
    oDestination[destPixelIndex + 1] = green_10;
    oDestination[destPixelIndex + 2] = blue_10;
    oDestination[destPixelIndex + 3] = red_11;
    oDestination[destPixelIndex + 4] = green_11;
    oDestination[destPixelIndex + 5] = blue_11;
}