Hello,
I wrote a small piece of C code to compute FFT of a max 2048 complexe vector size. Stangely FFT stuck at 512 vector size length.
I tryed on many architectures: MacOS, linux. Same result.
Here is the OpenCL kernel code
/* Radix-2 DIT */
__kernel void
dit2(__global float *x, __global float *y, unsigned short int s)
{
unsigned int r = 1 << s;
unsigned int k = r << 1;
unsigned int i, j, l, num;
float Ar, Ai, Br, Bi;
float a, xtw, ytw, a2, xtw2, ytw2;
float Wr[2048], Wi[2048];
i = get_global_id(0);
if ((i % k) < r) {
// stage 0 l = i+1 i+r
// stage 1 l = i+2
// stage 2 l = i+4
l = i + r;
a = -2.0f * (float) (i % k) / (float) (k);
Wr[i] = cospi(a);
Wi[i] = sinpi(a);
/*
printf("i=%d
W(%d, %d)=%3.3f + %3.3fi
", i, i % k, k, Wr[i],
Wi[i]);
a2 = -2.0f * M_PI * (float) (l % k) / (float) (k);
Wr[l] = cos(a2);
Wi[l] = sin(a2);
printf("l=%d
W(%d, %d)=%3.3f + %3.3fi
", l, l % k, k, Wr[l],
Wi[l]);
*/
// W[l] = -W[i]
Ar = x[i] + x[l] * Wr[i] - y[l] * Wi[i];
Ai = y[i] + y[l] * Wr[i] + x[l] * Wi[i];
Br = x[i] - x[l] * Wr[i] + y[l] * Wi[i];
Bi = y[i] - y[l] * Wr[i] - x[l] * Wi[i];
x[i] = Ar;
y[i] = Ai;
x[l] = Br;
y[l] = Bi;
}
}
Here is parts of the host:
//bit reversing
/*
-
0 000 -> 000 0 1 001 -> 100 4 2 010 -> 010 2 3 011 -> 110 6 4 100 -> 001 1
-
5 101 -> 101 5 6 110 -> 011 3 7 111 -> 111 7
*/
unsigned char brev(int stages, unsigned char in)
{
unsigned char out = 0;
int j = 0;for (j = 0; j < stages; j++) {
out += ((in & (1 << j)) >> j) << (stages - 1 - j);
}
return out;
}memcpy(x, tmp, size); memcpy(y, tmp2, size); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_x); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_y); for (i = 0; i < stage; i++) { err = clEnqueueWriteBuffer(queue, d_x, CL_FALSE, 0, size, x, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, d_y, CL_FALSE, 0, size, y, 0, NULL, NULL); //Set the arguments to our compute kernel err |= clSetKernelArg(kernel, 2, sizeof(short), &i); // Execute the kernel over the entire range of the data set err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
cl_event readevent;
clEnqueueReadBuffer(queue, d_x, CL_TRUE, 0, size, x, 0, NULL, &readevent);
clEnqueueReadBuffer(queue, d_y, CL_TRUE, 0, size, y, 0, NULL, &readevent);
clWaitForEvents (1, &readevent);
}
memcpy(X, x, size);
memcpy(Y, y, size);