How can I reduce the time taken to create buffers in OpenCL?

Hello everyone,

I’m working on a program that uses OpenCL to reduce the time taken for fractal image compression, but the OpenCL version takes 3-4x the time of the non-OpenCL program. To explain my program: it reads an 8-bit grayscale image and divides it into blocks called range and domain blocks. A range block size of 4x4 and a domain block size of 8x8 are used. Range blocks and domain blocks are stored separately in memory. (Right now I’m not worried about the memory usage.)

Each domain block is compressed to a 4x4 block using a 4-pixel averaging method. Then 8 transformations (different rotations) are applied to the domain blocks, and all the transformations are stored in memory. A range block is taken and compared, using the Mean Square Error (MSE), against the domain block transformations one by one until the best-matching transformed block is found, and that match is stored.
This process is repeated for all the range blocks one by one. I’m trying to reduce the time consumed in the MSE calculations by parallelizing them with OpenCL, but the OpenCL program is taking more time to execute.
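
For reference, the 4-pixel averaging is just a 2x2 box filter over the 8x8 domain block; a minimal sketch of it (the names dBlk8x8 and dAvg4x4 are illustrative, not from my actual code):

/* Average each 2x2 group of pixels of an 8x8 domain block
   down to one pixel of a 4x4 block (row-major layout). */
void averageDomainBlock(const unsigned char dBlk8x8[64], unsigned char dAvg4x4[16])
{
    int x, y;
    for (y = 0; y < 4; y++) {
        for (x = 0; x < 4; x++) {
            int sum = dBlk8x8[(2 * y) * 8 + (2 * x)]
                    + dBlk8x8[(2 * y) * 8 + (2 * x + 1)]
                    + dBlk8x8[(2 * y + 1) * 8 + (2 * x)]
                    + dBlk8x8[(2 * y + 1) * 8 + (2 * x + 1)];
            dAvg4x4[y * 4 + x] = (unsigned char)(sum / 4);
        }
    }
}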

Below is the time taken by the OpenCL and non-OpenCL versions of the program for a 256x256 grayscale image:

non-OpenCL C code: 1.5 secs
OpenCL code: 4.9 secs

When I added timers to check which part consumes the most time, I found that the creation of buffers alone takes 3.2 secs. The MSE calculation takes around 1.8 secs, which is more than I wanted.

What can be done to reduce the time further?

My code is as follows:

//******************* data structures used *******************
typedef struct rangeBlock
{
    unsigned char intensity[16]; /* 4x4 pixels */
} range;

typedef struct domainBlock
{
    unsigned char intensity[16]; /* 4x4 pixels after averaging */
    short x;                     /* block position in the image */
    short y;
} domain;

//***** code for calculating the MSE (this runs inside the kernel, in the .cl file) *****

iArea = 16;
sumaa = sumbb = sumab = suma = sumb = 0;
for (i = 0; i < rSize; i++)
{
    for (j = 0; j < rSize; j++)
    {
        a = (int)(rMobj[l].intensity[j * rSize + i]);
        b = (int)(dMobj[k].intensity[j * rSize + i]);
        sumaa += a * a;
        sumbb += b * b;
        sumab += a * b;
        suma += (long)a;
        sumb += (long)b;
    }
}
/* least-squares contrast s and brightness o, then the error d */
if ((iArea * sumbb - sumb * sumb) == 0)
    s = 0;
else
    s = ((double)(iArea * sumab - suma * sumb)) / ((double)(iArea * sumbb - sumb * sumb));

o = ((double)(suma - s * sumb)) / ((double)iArea);
d = ((double)(sumaa + s * (s * sumbb - 2 * sumab + 2 * o * sumb) + o * (o * iArea - 2 * suma))) / ((double)iArea);

//***** code for creating buffers *****

/* Create buffer objects */

rMobj = clCreateBuffer(context, CL_MEM_READ_WRITE, rCount * sizeof(struct rangeBlock), NULL, &ret);

mappingMobj = clCreateBuffer(context, CL_MEM_READ_WRITE, rCount * sizeof(struct rdmapping), NULL, &ret);

dMobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount * sizeof(struct domainBlock), NULL, &ret);

d90Mobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount * sizeof(struct domainBlock), NULL, &ret);

d180Mobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount * sizeof(struct domainBlock), NULL, &ret);

d270Mobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount * sizeof(struct domainBlock), NULL, &ret);

ddiaMobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount * sizeof(struct domainBlock), NULL, &ret);

ddia90Mobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount * sizeof(struct domainBlock), NULL, &ret);

ddia180Mobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount * sizeof(struct domainBlock), NULL, &ret);

ddia270Mobj = clCreateBuffer(context, CL_MEM_READ_WRITE, dCount *sizeof(struct domainBlock), NULL, &ret);

/* Copy input data to the memory buffer */
ret = clEnqueueWriteBuffer(command_queue, dMobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), dAvgBlk, 0, NULL, NULL);

ret = clEnqueueWriteBuffer(command_queue, d90Mobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), d90Blk, 0, NULL, NULL);

ret = clEnqueueWriteBuffer(command_queue, rMobj, CL_TRUE, 0, rCount *sizeof(struct rangeBlock), rangeBlk, 0, NULL, NULL);

ret = clEnqueueWriteBuffer(command_queue, d270Mobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), d270Blk, 0, NULL, NULL);

ret = clEnqueueWriteBuffer(command_queue, ddiaMobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), ddiaBlk, 0, NULL, NULL);

ret = clEnqueueWriteBuffer(command_queue, ddia90Mobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), ddia90Blk, 0, NULL, NULL);

ret = clEnqueueWriteBuffer(command_queue, d180Mobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), d180Blk, 0, NULL, NULL);

ret = clEnqueueWriteBuffer(command_queue, ddia180Mobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), ddia180Blk, 0, NULL, NULL);

ret = clEnqueueWriteBuffer(command_queue, ddia270Mobj, CL_TRUE, 0, dCount *sizeof(struct domainBlock), ddia270Blk, 0, NULL, NULL);

ret = clEnqueueWriteBuffer(command_queue, mappingMobj, CL_TRUE, 0, rCount *sizeof(struct rdmapping), rdmap, 0, NULL, NULL);

/* Set OpenCL kernel arguments */

ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &rMobj);

ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &dMobj);

ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &d90Mobj);

ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &d180Mobj);

ret = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *) &d270Mobj);

ret = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *) &ddiaMobj);

ret = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *) &ddia90Mobj);

ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *) &ddia180Mobj);

ret = clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *) &ddia270Mobj);

ret = clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *) &mappingMobj);

ret = clSetKernelArg(kernel, 10, sizeof(cl_int), (void *) &rCount);

ret = clSetKernelArg(kernel, 11, sizeof(cl_int), (void *) &dCount);

ret = clSetKernelArg(kernel, 12, sizeof(cl_int), (void *) &rSize);
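
For reference, the kernel prototype these arguments are set for looks roughly like this (findBestMatch is a placeholder name; the actual kernel body is in my .cl file):

__kernel void findBestMatch(__global range *rMobj,
                            __global domain *dMobj,
                            __global domain *d90Mobj,
                            __global domain *d180Mobj,
                            __global domain *d270Mobj,
                            __global domain *ddiaMobj,
                            __global domain *ddia90Mobj,
                            __global domain *ddia180Mobj,
                            __global domain *ddia270Mobj,
                            __global struct rdmapping *mappingMobj,
                            int rCount,
                            int dCount,
                            int rSize);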

I assume the performance figures you gave are not per-run, correct? They must be aggregated for a number of iterations, yes? Assuming that’s true, is it necessary to create new buffers every time you run your kernel? If not, you could cache them and amortize the cost of buffer creation across multiple kernel executions. If caching isn’t an option, you could consider using buffers of the type CL_MEM_USE_HOST_PTR and getting rid of the clEnqueueWriteBuffer calls. On some architectures, and with the right pointer alignment, this can be cheaper than creating other kinds of buffers.
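
Roughly, one clCreateBuffer call replaces each create-plus-write pair; a sketch for one of your buffers (alignment and lifetime of the host pointer matter, and I'm assuming the kernel only reads the domain blocks, hence CL_MEM_READ_ONLY):

/* The runtime may use the host allocation directly instead of copying it.
   dAvgBlk must stay valid, and should not be touched by the host,
   while the buffer is in use; page-aligned allocations tend to work best. */
dMobj = clCreateBuffer(context,
                       CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                       dCount * sizeof(struct domainBlock),
                       dAvgBlk,
                       &ret);

Keep in mind that CL_MEM_USE_HOST_PTR semantics vary by vendor; on a discrete GPU the driver may still copy behind the scenes, so measure on your target hardware.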

What Kunze said, plus you could combine the buffers into one and use pointer arithmetic inside the kernel; see the sketch below.
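
For example, all eight transformed domain-block arrays could live in a single buffer at fixed offsets (a sketch only; allDomMobj and findBestMatch are made-up names):

/* Host side: one buffer holds the 8 transformed domain-block arrays
   back to back. */
size_t chunk = dCount * sizeof(struct domainBlock);
cl_mem allDomMobj = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                   8 * chunk, NULL, &ret);
ret = clEnqueueWriteBuffer(command_queue, allDomMobj, CL_TRUE,
                           0 * chunk, chunk, dAvgBlk, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, allDomMobj, CL_TRUE,
                           1 * chunk, chunk, d90Blk, 0, NULL, NULL);
/* ... likewise for d180Blk, d270Blk, ddiaBlk, ddia90Blk,
   ddia180Blk and ddia270Blk at offsets 2*chunk .. 7*chunk ... */

/* Kernel side: take one pointer and recover each transform
   by pointer arithmetic. */
__kernel void findBestMatch(__global domain *allDom, int dCount /* ... */)
{
    __global domain *d    = allDom + 0 * dCount;
    __global domain *d90  = allDom + 1 * dCount;
    __global domain *d180 = allDom + 2 * dCount;
    /* ... */
}

If the eight host arrays were themselves laid out contiguously, the eight writes would collapse into a single clEnqueueWriteBuffer as well.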

Thanks Kunze. Yes, these performance figures are averaged values. I’ve tried the CL_MEM_USE_HOST_PTR option, but it isn’t giving a significant improvement; the time required is nearly the same. What other solution could there be? Also, the time required to execute the kernel is approximately 1.8 secs, which is quite a bit higher than the sequential version.


Thanks andrew brownsword. Actually I didn’t quite get what you said; can you please give a fuller example so that I get the idea?

Kunze,

The performance figures I provided are aggregated over a number of iterations. I should have pasted the code in its proper flow: the code under creation of buffers is executed only once, and the code for calculating the MSE is in the .cl file.

I used CL_MEM_USE_HOST_PTR in my program. I don’t know why CL_MEM_USE_HOST_PTR was taking so long earlier, but now it takes around 5-10 ms.