Hello everyone,
I’m working on a program to reduce the time taken for fractal image compression using OpenCL. With OpenCL, the program currently takes 3-4x as long as the non-OpenCL version. To explain my program: it reads an 8-bit grayscale image and divides it into blocks called range blocks and domain blocks. A range block size of 4x4 and a domain block size of 8x8 are used in the program. Range blocks and domain blocks are stored separately in memory. (Right now I am not worried about memory usage.)
Each domain block is compressed to a 4x4 block using a 4-pixel averaging method. Then 8 transformations (different rotations) are applied to each domain block, and all the transformed blocks are stored in memory. A range block is taken and compared, using Mean Square Error (MSE), against the transformed domain blocks one by one until the best-matching transformed block is found, and that match is stored.
This process is repeated for all the range blocks one by one. I’m trying to reduce the time consumed by the MSE calculations by parallelizing them using OpenCL, but the OpenCL program is taking more time to execute.
Below are the times taken by the OpenCL and non-OpenCL programs for a 256x256 grayscale image.
non OpenCL C code: 1.5 secs.
OpenCL code: 4.9 secs.
When I added timers to check which part consumes the most time, I found that the creation of the buffers alone takes 3.2 secs. The MSE calculation takes around 1.8 secs, which is more than I wanted.
What can be done to further improve the time?
My code is as follows :
//*******************data structures used **************
/* One range block: 16 intensity bytes (a 4x4 block of 8-bit pixels). */
struct rangeBlock {
    unsigned char intensity[16];
};

/* Short alias for struct rangeBlock. */
typedef struct rangeBlock range;
/*
 * One domain block: 16 intensity bytes (the 8x8 block after it has been
 * averaged down to 4x4) plus two short coordinates.
 */
struct domainBlock {
    unsigned char intensity[16];
    /* NOTE(review): presumably the block's position in the image - confirm. */
    short x;
    short y;
};

/* Short alias for struct domainBlock. */
typedef struct domainBlock domain;
//*****code for calculating MSE
iArea = 16;
sumaa=sumbb=sumab=suma=sumb=0;
for(i=0;i<rSize;i++)
{
for(j=0;j<rSize;j++)
{
a=(int)(rMobj[l].intensity[jrSize+i]);
b=(int)(dMobj[k].intensity[jrSize+i]);
sumaa+=aa;
sumbb+=bb;
sumab+=ab;
suma+=(long)a;
sumb+=(long)b;
}
}
if((iAreasumbb-sumbsumb)==0)
s=0;
else
s=((double)(iAreasumab-sumasumb))/((double)(iAreasumbb-sumb*sumb));
o=((double)(suma-ssumb))/((double)iArea);
d=((double)(sumaa+s(ssumbb-2sumab+2osumb)+o*(oiArea-2suma)))/((double)iArea);
/*** code for creating buffers ***/
/* Create Buffer Object */
rMobj = clCreateBuffer(context, CL_MEM_READ_WRITE, rCount * sizeof(struct rangeBlock), NULL, &ret);
mappingMobj = clCreateBuffer(context, CL_MEM_READ_WRITE, rCount * sizeof(struct rdmapping), NULL, &ret);
dMobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount * sizeof(struct domainBlock), NULL, &ret);
d90Mobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount * sizeof(struct domainBlock), NULL, &ret);
d180Mobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount * sizeof(struct domainBlock), NULL, &ret);
d270Mobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount * sizeof(struct domainBlock), NULL, &ret);
ddiaMobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount * sizeof(struct domainBlock), NULL, &ret);
ddia90Mobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount * sizeof(struct domainBlock), NULL, &ret);
ddia180Mobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount * sizeof(struct domainBlock), NULL, &ret);
ddia270Mobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount *sizeof(struct domainBlock), NULL, &ret);
/* Copy input data to the memory buffer */
ret = clEnqueueWriteBuffer(command_queue, dMobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), dAvgBlk, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, d90Mobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), d90Blk, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, rMobj, CL_TRUE, 0, rCount *sizeof(struct rangeBlock), rangeBlk, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, d270Mobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), d270Blk, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, ddiaMobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), ddiaBlk, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, ddia90Mobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), ddia90Blk, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, d180Mobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), d180Blk, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, ddia180Mobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), ddia180Blk, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, ddia270Mobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), ddia270Blk, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, mappingMobj, CL_TRUE, 0, rCount *sizeof(struct rdmapping), rdmap, 0, NULL, NULL);
/* Set OpenCL kernel arguments */
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &rMobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &dMobj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &d90Mobj);
ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d180Mobj);
ret = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &d270Mobj);
ret = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &ddiaMobj);
ret = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &ddia90Mobj);
ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &ddia180Mobj);
ret = clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &ddia270Mobj);
ret = clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &mappingMobj);
ret = clSetKernelArg(kernel, 10, sizeof(cl_int), (void *) &rCount);
ret = clSetKernelArg(kernel, 11, sizeof(cl_int), (void *) &dCount);
ret = clSetKernelArg(kernel, 12, sizeof(cl_int), (void *) &rSize);