I decide develop parallel FFT program for GPU using OpenCL. But in result my parallel program slower than program for CPU. Can you help me find out why this happen?
#include <CL\opencl.h>
#include <iostream>
#include <windows.h>
#include <time.h>
//***Serial FFT ****
#define M_PIx2 6.283185307179586476925286766559
struct Complex {
Complex( float r, float i ) : re(r), im(i) {}
Complex(){re = 0.0f; im = 0.0f;}
Complex operator+( Complex &other );
Complex operator-( Complex &other );
Complex operator*( Complex &other );
//void Display( ) { cout << re << ", " << im << endl; }
float getRe() {return re; }
float getIm() {return im; }
private:
float re, im;
};
Complex Complex::operator+( Complex &other ) {
return Complex( re + other.re, im + other.im );
}
Complex Complex::operator-( Complex &other ) {
return Complex( re - other.re, im - other.im );
}
Complex Complex::operator*( Complex &other ) {
return Complex( re*other.re - im*other.im, re*other.im + other.re * im );
}
void FFT(const double *cpfData, Complex *pSpectra, DWORD dwDataSize,
DWORD n, DWORD dwBits)
{
// Form initial array
memset(pSpectra, 0, n * sizeof(Complex));
for(DWORD k = 0; k < dwDataSize; k++)
{
pSpectra[k] = Complex(1.0f, 0.0f); //pSpectra[BitRewers32(k, (BYTE)dwBits)] = cpfData[k];
}
// Now run!
DWORD s, m, j, m2;
Complex W = Complex(0, 0), Wm = Complex(0, 0), T = Complex(0, 0), U = Complex(0, 0); //Complex W = Complex(0, 0 ), Wm, T, U;
for(s = 1; s <= dwBits; s++)
{
m = 1 << s;
m2 = m >> 1;
Wm = Complex((float)cos(M_PIx2 / m), (float)sin(M_PIx2 / m)); //Wm = std::exp(Complex(0.0, M_PIx2 / m));
W = Complex(1.0, 0); //W = 1.0;
for(j = 0; j < m2; j++)
{
for(DWORD k = j; k < n; k += m)
{
T = W * pSpectra[k + m2];
U = pSpectra[k];
pSpectra[k] = U + T;
pSpectra[k + m2] = U - T;
}
W = W * Wm; //W *= Wm;
}
}
}
//*****************
const int N = 16384;
int dwBits = 14;
const int COUNT_REPEAT_MAX = 10;
// Round Up Division function
size_t roundUpSize(int group_size, int global_size)
{
int r = global_size % group_size;
if(r == 0)
{
return global_size;
} else
{
return global_size + group_size - r;
}
}
int main()
{
cl_int error;
cl_float2 hostSpectr[N];
cl_float2 hostResult[N];
int ids[N];
for (int i = 0; i <= N-1; i++)
{
hostSpectr[i].s[0] = 1;
hostSpectr[i].s[1] = 0;
}
//---Platform
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);
//---Device
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
//---Context
cl_context context;
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
//---Source of GPU program
const char *source =
"#define ADD_COMPLEX_RE(CNUM1, CNUM2) ((CNUM1.x) + (CNUM2.x))
"
"#define ADD_COMPLEX_IMG(CNUM1, CNUM2) ((CNUM1.y) + (CNUM2.y))
"
"#define MUL_COMPLEX_RE(CNUM1, CNUM2) ((CNUM1.x)*(CNUM2.x) - (CNUM1.y)*(CNUM2.y))
"
"#define MUL_COMPLEX_IMG(CNUM1, CNUM2) ((CNUM1.x)*(CNUM2.y) + (CNUM1.y)*(CNUM2.x))
"
"__kernel void fft_gpu( __global const float2 *inSpectr,
"
" __global float2 *outResult,
"
" int n,
"
" __global int *ids)
"
"{
"
" int id = get_global_id(0);
"
" int i;
"
" float2 w;
"
" if (id < n)
"
" {
"
" ids[id] = id;
"
" outResult[id].x = 0;
"
" outResult[id].y = 0;
"
" for (i = 0; i<= n-1; i++)
"
" {
"
" w.x = cos(2* M_PI * ((i * id) % n) / n);
"
" w.y = sin(2* M_PI * ((i * id) % n) / n);
"
" float2 mul_result;
"
" mul_result.x = MUL_COMPLEX_RE(w,inSpectr[i]);
"
" mul_result.y = MUL_COMPLEX_IMG(w,inSpectr[i]);
"
" outResult[id].x = ADD_COMPLEX_RE(outResult[id], mul_result);"
" outResult[id].y = ADD_COMPLEX_IMG(outResult[id], mul_result);"
" }
"
" }
"
"}
";
//---Program
cl_program program;
program = clCreateProgramWithSource(context, 1, &source, NULL, &error);
if (error != CL_SUCCESS)
{
std::cout << "Creating program error!
";
}
//---Building program
error = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Building error!
";
}
//---Kernel
cl_kernel kernel;
kernel = clCreateKernel(program, "fft_gpu", NULL);
//---Buffers
cl_mem deviceBufSpectr;
cl_mem deviceBufResult;
cl_mem deviceBufIds;
deviceBufSpectr = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float2) * N, NULL, NULL);
deviceBufResult = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float2) * N, NULL, NULL);
deviceBufIds = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * N, NULL, NULL);
//---Command queue
cl_command_queue cmd_queue;
cmd_queue = clCreateCommandQueue(context, device, NULL, NULL);
//---Data transfer from Host to Device
error = clEnqueueWriteBuffer(cmd_queue, deviceBufSpectr, CL_FALSE, 0, sizeof(cl_float2) * N, hostSpectr, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Data transfer from Host to Device error!
";
}
//---Global and local work size
size_t localWorkSize = 512;
size_t globalWorkSize = roundUpSize(localWorkSize, N);
//---FFT start (GPU)
time_t timeWorkGPU = clock();
//---Setting kernel arguments
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&deviceBufSpectr);
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&deviceBufResult);
clSetKernelArg(kernel, 2, sizeof(unsigned int), (void *)&N);
clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&deviceBufIds);
for (int countRepeat = 0; countRepeat < COUNT_REPEAT_MAX; countRepeat++)
{
//---Kernel invocation
error = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Kernel invocation error!
";
}
}
//---Data transfer from Device to Host
error = clEnqueueReadBuffer(cmd_queue, deviceBufResult, CL_TRUE, 0, sizeof(cl_float2) * N, hostResult, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Data transfer from Device to Host error!
";
}
error = clEnqueueReadBuffer(cmd_queue, deviceBufIds, CL_TRUE, 0, sizeof(int) * N, ids, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
std::cout << "Data transfer from Device to Host error!
";
}
timeWorkGPU = clock() - timeWorkGPU;
std::cout << "GPU time: " << (double)timeWorkGPU / CLOCKS_PER_SEC<<"
";
/*for (int i = 0; i <= N-1; i++)
{
std::cout << hostResult[i].s[0] << " " << hostResult[i].s[1] << "
";
}*/
//*** DSP.cpp start
double cpfData[2] = {1.0, 1.0};
Complex pSpectra[N];
time_t timeWorkCPU = clock();
for (int countRepeat = 0; countRepeat < COUNT_REPEAT_MAX; countRepeat++)
{
FFT(cpfData, pSpectra, N, N, dwBits); //Serial FFT
}
timeWorkCPU = clock() - timeWorkCPU;
std::cout << "CPU time: " << (double)timeWorkCPU / CLOCKS_PER_SEC<<"
";
/*std::cout<< "--- Result (CPU): ---" << std::endl;
for (int i = 0; i <= N - 1; i++)
{
std::cout << "[" << i <<"]: " << pSpectra[i].getRe() << ", " << pSpectra[i].getIm() << "
";
}*/
//*** DSP.cpp end
system("PAUSE");
return 0;
}