Hi,
I am just playing around with Apple's OpenCL FFT code and added the following optimization:
Instead of calculating “(dir*2.0f*M_PI*j/64)” over and over again, I cached the result in a variable and reuse that variable in the subsequent steps.
But the code now runs SLOWER than before!
What might be the reason?
original code:
/*
 * Excerpt of the fft1 kernel as posted (original version). The post omits
 * code before and after this fragment; those gaps are marked below.
 *
 * Each visible step recomputes the angle dir*2.0f*M_PI*j/64*k (k = 1..7),
 * builds w = (cos(ang), sin(ang)) with native_cos/native_sin and, for
 * k = 1..6, multiplies a[k] by w through complexMul. The use of w for k = 7
 * falls in the omitted part.
 *
 * The declarations of ang, w, a and j are not shown in the post.
 */
__kernel void fft1(__global float2 *in, __global float2 *out, int dir, int S)
{
    /* … (code omitted in the post) */
    ang = dir*2.0f*M_PI*j/64*1;
    w = (float2)(native_cos(ang), native_sin(ang));
    a[1] = complexMul(a[1], w);
    ang = dir*2.0f*M_PI*j/64*2;
    w = (float2)(native_cos(ang), native_sin(ang));
    a[2] = complexMul(a[2], w);
    ang = dir*2.0f*M_PI*j/64*3;
    w = (float2)(native_cos(ang), native_sin(ang));
    a[3] = complexMul(a[3], w);
    ang = dir*2.0f*M_PI*j/64*4;
    w = (float2)(native_cos(ang), native_sin(ang));
    a[4] = complexMul(a[4], w);
    ang = dir*2.0f*M_PI*j/64*5;
    w = (float2)(native_cos(ang), native_sin(ang));
    a[5] = complexMul(a[5], w);
    ang = dir*2.0f*M_PI*j/64*6;
    w = (float2)(native_cos(ang), native_sin(ang));
    a[6] = complexMul(a[6], w);
    ang = dir*2.0f*M_PI*j/64*7;
    w = (float2)(native_cos(ang), native_sin(ang));
    /* … (code omitted in the post) */
}
my optimization:
/*
 * Excerpt of the fft1 kernel as posted (poster's modified version). The post
 * omits code before and after this fragment; those gaps are marked below.
 *
 * The common factor dir*2.0f*M_PI*j/64 is computed once into
 * cached_multiplicator; each step then derives its angle as
 * cached_multiplicator*k (k = 1..7), builds w = (cos(ang), sin(ang)) with
 * native_cos/native_sin and, for k = 1..6, multiplies a[k] by w through
 * complexMul. The use of w for k = 7 falls in the omitted part.
 *
 * The declarations of ang, w, a and j are not shown in the post.
 */
__kernel void fft1(__global float2 *in, __global float2 *out, int dir, int S)
{
    /* … (code omitted in the post) */
    /* Added by the poster: shared factor of all seven angles. */
    float cached_multiplicator;
    cached_multiplicator = dir*2.0f*M_PI*j/64;
    ang = cached_multiplicator;
    w = (float2)(native_cos(ang), native_sin(ang));
    a[1] = complexMul(a[1], w);
    ang = cached_multiplicator*2;
    w = (float2)(native_cos(ang), native_sin(ang));
    a[2] = complexMul(a[2], w);
    ang = cached_multiplicator*3;
    w = (float2)(native_cos(ang), native_sin(ang));
    a[3] = complexMul(a[3], w);
    ang = cached_multiplicator*4;
    w = (float2)(native_cos(ang), native_sin(ang));
    a[4] = complexMul(a[4], w);
    ang = cached_multiplicator*5;
    w = (float2)(native_cos(ang), native_sin(ang));
    a[5] = complexMul(a[5], w);
    ang = cached_multiplicator*6;
    w = (float2)(native_cos(ang), native_sin(ang));
    a[6] = complexMul(a[6], w);
    ang = cached_multiplicator*7;
    w = (float2)(native_cos(ang), native_sin(ang));
    /* … (code omitted in the post) */
}