Hi,
I'm developing a sample app that has to do the following:
- render a single frame to an offscreen framebuffer
- analyze the generated image pixel by pixel (in several different ways)

First I developed a standard OpenGL app that draws offscreen, calls glReadPixels to retrieve the rendering result, and then does its analysis on the CPU. This one takes about 0.15 seconds for 100 runs of a small rendering (300x300).
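For reference, the readback path in the plain OpenGL version is essentially this (simplified; the CPU-side analysis is omitted):

unsigned char pixels[300 * 300 * 4];   // RGBA8, matching the 300x300 renderbuffer
Draw();                                // render to the bound offscreen framebuffer
glReadPixels(0, 0, 300, 300, GL_RGBA, GL_UNSIGNED_BYTE, pixels);
// ... analyze pixels[] on the CPU ...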
Then I developed an OpenCL app that:
- prepares an OpenGL context
- creates an OpenCL image from the framebuffer's renderbuffer
- evaluates the rendered image on the GPU side (there is no explicit data copy between RAM and VRAM)
- retrieves the result of the evaluation from GPU memory (this is just one float)
This app takes about 4 seconds to run!
A simplified snippet of the code:
glutInit(...);                       // GLUT setup, gives us a GL context
glGenFramebuffersEXT(...);
glBindFramebufferEXT(...);
glGenRenderbuffersEXT(1, &colorId);
glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, colorId);
glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_RGBA8, 300, 300);   // 300x300 color buffer
glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_RENDERBUFFER_EXT, colorId);
CGLContextObj kCGLContext = CGLGetCurrentContext();
CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);
cl_context_properties properties[] = {
    CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE,
    (cl_context_properties)kCGLShareGroup,
    0
};
// Create a context from a CGL share group
context = clCreateContext(properties, NULL, NULL, NULL, NULL, &err);
cl_device_id devices[1];
clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, devices, NULL);
program = clCreateProgramWithSource(context, 1, strings, lengths, &err);
clBuildProgram(program, 1, devices, NULL, NULL, NULL);   // build the program before creating the kernel
queue = clCreateCommandQueue(context, devices[0], 0, &err);
m_Kernel = clCreateKernel(program, "Evaluate", &err);
res_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float), NULL, &err);   // holds the single float result
clSetKernelArg(m_Kernel, 1, sizeof(cl_mem), &res_buffer);
m_Shared = clCreateFromGLRenderbuffer(context, CL_MEM_READ_ONLY, colorId, &err);      // share the GL renderbuffer with CL
clSetKernelArg(m_Kernel, 0, sizeof(cl_mem), &m_Shared);
Draw();       // OpenGL code that draws to the framebuffer
glFinish();   // make sure GL rendering is finished before CL acquires the renderbuffer
clEnqueueAcquireGLObjects(queue, 1, &m_Shared, 0, NULL, NULL);
const size_t number = 1;
clEnqueueNDRangeKernel(queue, m_Kernel, 1, NULL, &number, &number, 0, NULL, NULL);   // a single work-item
clEnqueueReleaseGLObjects(queue, 1, &m_Shared, 0, NULL, NULL);
float results[1];
clEnqueueReadBuffer(queue, res_buffer, CL_TRUE, 0, sizeof(float), results, 0, NULL, NULL);   // blocking read of the float
The kernel:
// nearest-neighbour sampler with unnormalized integer coordinates
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                          CLK_ADDRESS_CLAMP_TO_EDGE |
                          CLK_FILTER_NEAREST;

__kernel void Evaluate(
    read_only image2d_t framebuffer,
    __global float *results)
{
    int matching = 0;
    // a single work-item walks the whole 300x300 image
    for (int i = 0; i < 300; i++)
    {
        for (int j = 0; j < 300; j++)
        {
            float4 pixel = read_imagef(framebuffer, sampler, (int2)(i, j));
            if (PixelMatches(pixel))   // PixelMatches() stands in for my actual per-pixel test
                matching++;
        }
    }
    results[0] = (float)matching;      // the host reads this back as one float
}
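For completeness, this is roughly how I time the runs (gettimeofday is just what I happen to use; assume the same 100-run loop around the per-frame work as in the GL version):

#include <sys/time.h>

struct timeval t0, t1;
gettimeofday(&t0, NULL);
for (int run = 0; run < 100; run++) {
    Draw();
    glFinish();
    clEnqueueAcquireGLObjects(queue, 1, &m_Shared, 0, NULL, NULL);
    clEnqueueNDRangeKernel(queue, m_Kernel, 1, NULL, &number, &number, 0, NULL, NULL);
    clEnqueueReleaseGLObjects(queue, 1, &m_Shared, 0, NULL, NULL);
    clEnqueueReadBuffer(queue, res_buffer, CL_TRUE, 0, sizeof(float), results, 0, NULL, NULL);
}
gettimeofday(&t1, NULL);
double seconds = (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec) / 1e6;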
I can't understand why the performance is so bad.
Any help would be really appreciated.
If you have any questions, please ask.
Thanks a lot,
Christian