Hello notzed, thanks for the reply, but unfortunately I can’t figure out what you mean in your example. Could you please clarify it for me?
I’ve tried some things on my own too, but the performance I got was worse than just reading the whole neighborhood from global memory. Here is the code:
// Stage this work-item's pixel, plus any halo pixels it is responsible for,
// into the local tile `shared_input`.
//
// The +/-10 offsets and the tests against 7 imply an 8x8 work-group and a
// tile row stride of 10 (8 interior + 2 halo) -- TODO confirm against the
// declarations of `pos` and `shared_input`, which are outside this fragment.
//
// NOTE(review): a barrier(CLK_LOCAL_MEM_FENCE) is required after these stores
// before any work-item reads an entry written by another work-item; confirm
// that it follows this fragment.
int local_x = get_local_id(0);
int local_y = get_local_id(1);

// Centre pixel: every work-item loads its own.
shared_input[pos] = 255.f * read_imagef(input, sampler, (int2) (x, y));

// First column of the group: left halo, plus the top-left corner.
if (local_x == 0)
{
    shared_input[pos - 1] = 255.f * read_imagef(input, sampler, (int2) (x - 1, y));
    if (local_y == 0)
    {
        shared_input[pos - 10 - 1] = 255.f * read_imagef(input, sampler, (int2) (x - 1, y - 1));
    }
}

// First row of the group: top halo, plus the top-right corner.
if (local_y == 0)
{
    // Must read the row above (y - 1); reading (x, y) here would just
    // duplicate the centre pixel into the halo row.
    shared_input[pos - 10] = 255.f * read_imagef(input, sampler, (int2) (x, y - 1));
    if (local_x == 7)
    {
        shared_input[pos - 10 + 1] = 255.f * read_imagef(input, sampler, (int2) (x + 1, y - 1));
    }
}

// Last column of the group: right halo, plus the bottom-right corner.
if (local_x == 7)
{
    shared_input[pos + 1] = 255.f * read_imagef(input, sampler, (int2) (x + 1, y));
    if (local_y == 7)
    {
        shared_input[pos + 10 + 1] = 255.f * read_imagef(input, sampler, (int2) (x + 1, y + 1));
    }
}

// Last row of the group: bottom halo, plus the bottom-left corner.
if (local_y == 7)
{
    shared_input[pos + 10] = 255.f * read_imagef(input, sampler, (int2) (x, y + 1));
    if (local_x == 0)
    {
        shared_input[pos + 10 - 1] = 255.f * read_imagef(input, sampler, (int2) (x - 1, y + 1));
    }
}
As you can see, what I did was simply copy each thread’s pixel into the local array and then check whether that pixel lies on a work-group boundary; if it does, the thread also loads the necessary neighboring pixels that belong to the adjacent work-groups. The problem is that this code is very inefficient.
Thanks !