I'm developing a sample app that has to do the following:

1) render a single frame to an offscreen framebuffer
2) analyze the generated image pixel by pixel (in several different ways).

I first developed a standard OpenGL app, which draws offscreen, calls glReadPixels to retrieve the rendering result, and then does its processing on the CPU. This version takes about 0.15 seconds to perform 100 runs on a small rendering (300x300).

Then I developed an OpenCL app that:
1) prepares an OpenGL context
2) creates an OpenCL memory object from the framebuffer
3) evaluates the rendered image on the GPU side (there is no explicit data copy between RAM and VRAM)
4) retrieves the result of the evaluation from GPU memory (this is just one float number)

This app takes about 4 seconds to run!

a simplified snippet of code:
Code :
// Simplified host-side snippet: creates a GL renderbuffer, shares it with
// OpenCL, runs the "Evaluate" kernel once and reads back a single value.
// Declarations and error checks are not shown.

// Offscreen color target: 300x300, RGBA8.
// NOTE(review): the EXT_framebuffer_object entry point is normally spelled
// glGenRenderbuffersEXT — confirm this is only a typo in the snippet.
glGenRenderBufferEXT(1, &colorId);
glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, colorId);
// NOTE(review): the statement below has no terminating ';' in the snippet.
glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_RGBA8, 300, 300)
// Fetch the current CGL context and its share group.
CGLContextObj kCGLContext = CGLGetCurrentContext();              
CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);
// NOTE(review): the initializer list is cut off in the snippet; presumably it
// carries the share group obtained above — confirm against the real code.
cl_context_properties properties[] = { 
// Create a context from a CGL share group
// No explicit device list is passed (num_devices and devices are both NULL).
context = clCreateContext(properties, NULL, NULL, NULL, NULL, &err);
// Ask for one GPU device; devices[0] is used for the command queue below.
// NOTE(review): this query is independent of the context created above —
// verify that devices[0] actually belongs to that context.
cl_device_id devices[1];
clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, devices, NULL);
// NOTE(review): no clBuildProgram call appears between program creation and
// clCreateKernel; presumably it was dropped when simplifying — confirm.
program = clCreateProgramWithSource(context, 1, strings, lengths, &err);
queue = clCreateCommandQueue(context, devices[0], NULL, &err);
m_Kernel = clCreateKernel(program, "Evaluate", &err);
// Output buffer of sizeof(float) bytes, bound to kernel argument 1.
// NOTE(review): the Evaluate kernel declares that argument as `__global int *`
// while the host reads it back into a float — confirm the intended type.
res_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float), NULL, &err);
clSetKernelArg(m_Kernel, 1, sizeof(cl_mem), &res_buffer);	
// Wrap the GL renderbuffer as a read-only CL memory object (kernel argument 0).
m_Shared = clCreateFromGLRenderbuffer(context, CL_MEM_READ_ONLY, colorId, &err);
clSetKernelArg(m_Kernel, 0, sizeof(cl_mem), &m_Shared);	
Draw(); // openGL code to draw to the framebuffer
// NOTE(review): nothing visible here (e.g. glFinish) ensures the GL rendering
// has completed before OpenCL acquires the renderbuffer — confirm.
clEnqueueAcquireGLObjects(queue, 1, &m_Shared, NULL, NULL, NULL);
// Global work size and local work size are both `number` (= 1), so exactly one
// work-item is launched for the whole image.
const size_t number = 1;
clEnqueueNDRangeKernel(queue, m_Kernel, 1, NULL, &number, &number, 0, NULL, NULL);
// NOTE(review): the statement below has no terminating ';' in the snippet.
clEnqueueReleaseGLObjects(queue, 1, &m_Shared, 0, 0, 0)
// Blocking read (CL_TRUE) of sizeof(float) bytes from res_buffer into results.
float results[1];
clEnqueueReadBuffer(queue, res_buffer, CL_TRUE, 0, sizeof(float), results, NULL, NULL, NULL);

the kernel:
Code :
// Kernel "Evaluate": scans the image and writes a counter to results[0].
//   framebuffer - read-only 2D image, sampled with read_imagef
//   results     - global int buffer; element 0 receives `matching`
// Simplified pseudo-code: the match test, the counter update and the closing
// braces are not shown.
// The loops never use get_global_id(), so each work-item walks the full
// coordinate range by itself.
__kernel void Evaluate(
    read_only image2d_t framebuffer,
    __global int *results){
int matching = 0;
// NOTE(review): the loops cover 3000x3000 coordinates, whereas the post
// describes a 300x300 rendering — confirm the intended bounds.
for(int i=0;i<3000;i++)
	for(int j=0;j<3000;j++)
		// Nearest-filter fetch at integer coordinate (i, j), clamped to the image edge.
		float4 pixel = read_imagef(framebuffer, CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST, (int2)(i,j));
		// Placeholder condition; presumably `matching` is incremented here — TODO confirm.
		if(pixel matches)
// Store the final counter in the first element of the output buffer.
results[0] = matching;

I can't understand why the performance is so bad.
Any help would be really appreciated.

If you have any questions, please ask.

thanks a lot,