Custom FFT kernel

Hello,
I wrote a small piece of C/OpenCL code to compute the FFT of a complex vector of up to 2048 elements. Strangely, the FFT gets stuck at a vector length of 512.
I tried it on several platforms (macOS, Linux) with the same result.

Here is the OpenCL kernel code

/*
 * Radix-2 DIT butterfly for stage s of an in-place FFT.
 *
 * x, y : real and imaginary parts of the (bit-reversed) input, length N.
 * s    : stage index; r = 2^s is the distance between butterfly halves,
 *        k = 2r is the full butterfly span.
 *
 * One work-item per element; only the work-item owning the lower half of
 * a pair ((i mod k) < r) performs the butterfly, so no two work-items
 * write the same location and no synchronization is needed within a stage.
 *
 * BUG FIX: the original declared `float Wr[2048], Wi[2048]` as kernel
 * locals, i.e. in *private* (per-work-item) memory — 16 KiB per work-item.
 * Each work-item only ever touched element [i], so the arrays were pure
 * waste, and the private-memory/register pressure is the likely reason the
 * FFT broke once the problem size grew past 512. Scalars suffice.
 * (Also fixed: the comparison had been HTML-mangled to `&lt;`, and the
 * dead commented-out printf block and unused locals were removed.)
 */
__kernel void
dit2(__global float *x, __global float *y, unsigned short int s)
{
    unsigned int r = 1 << s;   /* half-span of a butterfly at this stage */
    unsigned int k = r << 1;   /* full butterfly span */
    unsigned int i = get_global_id(0);

    if ((i % k) < r) {
        unsigned int l = i + r;    /* partner element of the pair */

        /* Twiddle factor W = exp(-2*pi*j*(i mod k)/k).
         * cospi/sinpi take the angle in units of pi, so only the
         * -2*(i mod k)/k fraction is computed here. */
        float a  = -2.0f * (float)(i % k) / (float)k;
        float wr = cospi(a);
        float wi = sinpi(a);

        /* Complex butterfly: A = X[i] + W*X[l], B = X[i] - W*X[l]. */
        float Ar = x[i] + x[l] * wr - y[l] * wi;
        float Ai = y[i] + y[l] * wr + x[l] * wi;
        float Br = x[i] - x[l] * wr + y[l] * wi;
        float Bi = y[i] - y[l] * wr - x[l] * wi;

        x[i] = Ar;
        y[i] = Ai;
        x[l] = Br;
        y[l] = Bi;
    }
}

Here are the relevant parts of the host code:
//bit reversing
/*

  0: 000 -> 000 (0)    1: 001 -> 100 (4)    2: 010 -> 010 (2)    3: 011 -> 110 (6)
  4: 100 -> 001 (1)    5: 101 -> 101 (5)    6: 110 -> 011 (3)    7: 111 -> 111 (7)
    */
    /*
     * Reverse the low `stages` bits of `in` (FFT bit-reversal permutation),
     * e.g. brev(3, 0b001) == 0b100.
     *
     * stages : number of significant bits (log2 of the FFT size).
     * in     : index whose low `stages` bits are mirrored.
     * Returns the mirrored index; bits above `stages` are dropped.
     *
     * BUG FIX: the original used `unsigned char` for both the input and the
     * result, which silently truncates to 8 bits. The poster's FFT runs up
     * to 2048 points (stages = 11), where indices need 11 bits — every
     * size above 256 was being permuted incorrectly. Widened to
     * `unsigned int`; all previous 8-bit uses produce identical results.
     */
    unsigned int brev(int stages, unsigned int in)
    {
        unsigned int out = 0;

        for (int j = 0; j < stages; j++) {
            /* Move bit j of `in` to position (stages - 1 - j) of `out`. */
            out |= ((in >> j) & 1u) << (stages - 1 - j);
        }
        return out;
    }

     memcpy(x, tmp, size);
     memcpy(y, tmp2, size);
    
     err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_x);
     err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_y);
    
     for (i = 0; i < stage; i++) {
    
     err = clEnqueueWriteBuffer(queue, d_x, CL_FALSE, 0, size, x, 0, NULL, NULL);
     err |= clEnqueueWriteBuffer(queue, d_y, CL_FALSE, 0, size, y, 0, NULL, NULL);
         //Set the arguments to our compute kernel
         err |= clSetKernelArg(kernel, 2, sizeof(short), &i);
    
         // Execute the kernel over the entire range of the data set
         err =
             clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                                    &global, NULL, 0, NULL, NULL);
    

cl_event readevent;
clEnqueueReadBuffer(queue, d_x, CL_TRUE, 0, size, x, 0, NULL, &readevent);
clEnqueueReadBuffer(queue, d_y, CL_TRUE, 0, size, y, 0, NULL, &readevent);
clWaitForEvents (1, &readevent);
}
memcpy(X, x, size);
memcpy(Y, y, size);

I suppose it will work if your Wr and Wi arrays are made global buffers instead of per-work-item private arrays (they currently cost thousands of bytes of private memory per thread). But the naive textbook algorithm is a poor choice for a GPU implementation anyway.