low float precision - only on parts of the output data

hello,

I have a problem with low-precision of calculations - I try to do a one-dimensional FFT transform on matrix(using DFT matrix). Unfortunately, the FFT and IFFT results compared with the input data vary considerably (above 0.1), and the strange is that the first line has the correct results. it does not matter whether the input data(matrix) has a size of 8x8 or more (256x256).
in each case, the results differ significantly, much more than could be due to the float precision.
I tried to build a kernel with option ‘-cl-opt-disable’, unfortunately it does not improve accuracy. any ideas what I am doing wrong?

below the sample input and output data (the first and third row of the output is correct, but differs in the first column) and the host and kernel code.

input:
0.840188 0.394383 0.783099 0.798440
0.911647 0.197551 0.335223 0.768230
0.277775 0.553970 0.477397 0.628871
0.364784 0.513401 0.952230 0.916195

output:
0.840188 0.394383 0.783099 0.798440
0.638216 0.355476 0.643726 0.842212
0.277775 0.553970 0.477397 0.628871
0.638216 0.355476 0.643726 0.842212

fft1d.cl

__kernel void mul(
__global float* InA,
__global float* OutRe,
__global float* OutIm,
int N,

__global float* Wre,
__global float* Wim,
__global float* Mre,
__global float* Mim,

__global float* tempre,
__global float* tempim

)
{

int gid0 = get_global_id(0);
int gid1 = get_global_id(1);
int i,j,k;

int size = N;

float wynikre = 0;
float wynikim = 0;

if(gid0<N && gid1<N) {

// fft1d
for(j=0;j<N;j++) {

  		wynikre += Wre[gid1*N+j]*InA[j*N+gid0];	//RE
  		wynikim += Wim[gid1*N+j]*InA[j*N+gid0];	//IM	
  }
  	tempre[gid1*N+gid0] = wynikre;
  	tempim[gid1*N+gid0] = wynikim;
  	barrier(CLK_GLOBAL_MEM_FENCE);
  wynikre = 0;
  wynikim = 0;

// ifft1d
for(j=0;j<N;j++) {

  		wynikre += Mre[gid1*N+j]*tempre[j*N+gid0]-Mim[gid1*N+j]*tempim[j*N+gid0] ;
  		wynikim += Mre[gid1*N+j]*tempim[j*N+gid0]+Mim[gid1*N+j]*tempre[j*N+gid0] ;
  }
  	OutRe[gid1*N+gid0] = wynikre;	
  	OutIm[gid1*N+gid0] = wynikim;
  	barrier(CLK_GLOBAL_MEM_FENCE);

}

}

fft.cpp

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <unistd.h>
#include <time.h>
#include <cctype>
#include <vector>
#include <math.h>
#include <complex.h>

#include “CL/cl.h”

using namespace std;
const char* cSourceFile = “fft1d.cl”; // dct kernel file

cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQue; // OpenCL command que
cl_device_id* cdDevices; // OpenCL device list

cl_program cpProgram; // OpenCL program
cl_kernel ckKernel; // OpenCL kernel

size_t GlobalWorkSize[2];
size_t LocalWorkSize[2];

size_t ParmDataBytes; // Byte size of context information
cl_int ciErr1, ciErr2; // Error code var
char* cPathAndName = NULL; // Var for paths
char* cSourceCL = NULL; // Buffer to hold source for compilation

char* load_source(const char* filename)
{
FILE* stream = NULL;
size_t source_lenght;

stream = fopen(filename, “rb”);
if(stream == 0)
{
return NULL;
}

fseek(stream, 0, SEEK_END);
source_lenght = ftell(stream);
fseek(stream, 0, SEEK_SET);
size_t result =0;

char* source_string = (char *)malloc(source_lenght + 1);
result = fread(source_string, source_lenght, 1, stream);

fclose(stream);

source_string[source_lenght] = ‘\0’;
return source_string;
}

void showM(float* daneOut, int N) {

int i=0;
for(int i =0;i<N*N;i++) {
if (i%N == 0) { printf("
“); }
printf(”%f “, *(daneOut+i));
}
printf(”
");

}

int main(int argc, char **argv)
{

const char* optio = “-cl-opt-disable”;

int N = 4;
int siz = sizeof(float)NN;

GlobalWorkSize[0] = N;
GlobalWorkSize[1] = N;
LocalWorkSize[0] = 2;
LocalWorkSize[1] = 2;

float PI = 3.14159265;

complex float a = cos(2PI/N)-sin(2PI/N)*I;
complex float b;
int k, w;

////////////////////////////////////////////////////////////////////////

cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf("Error in clCreateContextFromType !!!

");}

ciErr1 = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
cdDevices = (cl_device_id*)malloc(ParmDataBytes);
ciErr1 |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, cdDevices, NULL);
if (ciErr1 != CL_SUCCESS) {printf("Error in clGetContextInfo !!!

");}

cqCommandQue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf("Error in clCreateCommandQueue !!!

");}

////////////////////////////////////////////////////////////////////////

float* daneInA;
daneInA = (float*)malloc(siz);
void * pointer_daneInA= (float )daneInA;
for(int j=0;j<N
N;j++) {
*(daneInA+j)= (float) rand()/RAND_MAX;
}

float* daneOutRe;
daneOutRe = (float*)malloc(siz);
void * pointer_daneOutRe= (float )daneOutRe;
for(int j=0;j<N
N;j++) {
*(daneOutRe+j)= 0;
}

float* daneOutIm;
daneOutIm = (float*)malloc(siz);
void * pointer_daneOutIm= (float )daneOutIm;
for(int j=0;j<N
N;j++) {
*(daneOutIm+j)=0;
}

// Wre
float* data_fft_Wre;
data_fft_Wre = (float*)malloc(siz);
void * pointer_fft_Wre= (float )data_fft_Wre;
for(k=0;k<N;k++) {
for(w=0;w<N;w++) {
b = k
w;
(data_fft_Wre+(k+wN))=crealf( cpow( a,b ) );
}
}

// Wim
float* data_fft_Wim;
data_fft_Wim = (float*)malloc(siz);
void * pointer_fft_Wim= (float )data_fft_Wim;
for(k=0;k<N;k++) {
for(w=0;w<N;w++) {
b = k
w;
(data_fft_Wim+(k+wN))=cimagf( cpow( a,b ) );
}
}

// Mre
float* data_fft_Mre;
data_fft_Mre = (float*)malloc(siz);
void * pointer_fft_Mre= (float )data_fft_Mre;
for(k=0;k<N;k++) {
for(w=0;w<N;w++) {
b = -k
w;
(data_fft_Mre+(k+wN))=crealf( cpow( a,b ) )/N;
}
}

// Mim
float* data_fft_Mim;
data_fft_Mim = (float*)malloc(siz);
void * pointer_fft_Mim= (float )data_fft_Mim;
for(k=0;k<N;k++) {
for(w=0;w<N;w++) {
b = -k
w;
(data_fft_Mim+(k+wN))=cimagf( cpow( a,b ) )/N;
}
}

float* temp1;
temp1 = (float*)malloc(siz);
void * pointer_temp1= (float )temp1;
for(int j=0;j<N
N;j++) {
*(temp1+j)=0;
}

float* temp2;
temp2 = (float*)malloc(siz);
void * pointer_temp2= (float )temp2;
for(int j=0;j<N
N;j++) {
*(temp2+j)=0;
}

cl_mem Buffer_daneInA = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneIn !!!

");}
cl_mem Buffer_daneOutRe = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut!!!

");}
cl_mem Buffer_daneOutIm = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut2!!!

");}

cl_mem Buffer_fft_Wre = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut2!!!

");}
cl_mem Buffer_fft_Wim = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut2!!!

");}
cl_mem Buffer_fft_Mre = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut2!!!

");}
cl_mem Buffer_fft_Mim = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut2!!!

");}

cl_mem Buffer_temp1 = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut2!!!

");}
cl_mem Buffer_temp2 = clCreateBuffer (cxGPUContext, CL_MEM_READ_WRITE, siz, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf("Error in clcreatebuffer Buffer_daneOut2!!!

");}

////////////////////////////////////////////////////////////////////////

cSourceCL = load_source(cSourceFile);

cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, NULL, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf("Error in clCreateProgramWithSource !!!

");}

ciErr1 = clBuildProgram(cpProgram, 0, NULL, optio, NULL, NULL);
if (ciErr1 != CL_SUCCESS) {printf("Error in clBuildProgram !!!

");}

ckKernel = clCreateKernel(cpProgram, “mul”, &ciErr1);
if (ciErr1 != CL_SUCCESS) {printf("Error in clCreateKernel !!!

");}

////////////////////////////////////////////////////////////////////////

ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&Buffer_daneInA);
if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!

");}
ciErr1 = clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&Buffer_daneOutRe);
if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!

");}
ciErr1 = clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&Buffer_daneOutIm);
if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!

");}
ciErr1 = clSetKernelArg(ckKernel, 3, sizeof(int), (void*)&N);
if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!

");}

ciErr1 = clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&Buffer_fft_Wre);
if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!

");}
ciErr1 = clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&Buffer_fft_Wre);
if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!

");}
ciErr1 = clSetKernelArg(ckKernel, 6, sizeof(cl_mem), (void*)&Buffer_fft_Mre);
if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!

");}
ciErr1 = clSetKernelArg(ckKernel, 7, sizeof(cl_mem), (void*)&Buffer_fft_Mim);
if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!

");}

ciErr1 = clSetKernelArg(ckKernel, 8, sizeof(cl_mem), (void*)&Buffer_temp1);
if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!

");}
ciErr1 = clSetKernelArg(ckKernel, 9, sizeof(cl_mem), (void*)&Buffer_temp2);
if (ciErr1 != CL_SUCCESS) {printf("Error in clSetKernelArg !!!

");}
////////////////////////////////////////////////////////////////////////

  ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_daneInA, CL_TRUE, 0, siz, pointer_daneInA, 0, NULL, NULL);
  	if (ciErr1 != CL_SUCCESS) {printf("Error in clEnqueueWriteBuffer !!!

");}

  ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Wre, CL_TRUE, 0, siz, pointer_fft_Wre, 0, NULL, NULL);
  	if (ciErr1 != CL_SUCCESS) {printf("Error in clEnqueueWriteBuffer !!!

");}
ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Wim, CL_TRUE, 0, siz, pointer_fft_Wim, 0, NULL, NULL);
if (ciErr1 != CL_SUCCESS) {printf("Error in clEnqueueWriteBuffer !!!

");}
ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Mre, CL_TRUE, 0, siz, pointer_fft_Mre, 0, NULL, NULL);
if (ciErr1 != CL_SUCCESS) {printf("Error in clEnqueueWriteBuffer !!!

");}
ciErr1 = clEnqueueWriteBuffer(cqCommandQue, Buffer_fft_Mim, CL_TRUE, 0, siz, pointer_fft_Mim, 0, NULL, NULL);
if (ciErr1 != CL_SUCCESS) {printf("Error in clEnqueueWriteBuffer !!!

");}

  ciErr1 = clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
  clFinish(cqCommandQue);
  	if (ciErr1 != CL_SUCCESS) {printf("Error in clEnqueueNDRangeKernel !!!

");}

  ciErr1 = clEnqueueReadBuffer(cqCommandQue,  Buffer_daneOutRe, CL_TRUE, 0, siz, pointer_daneOutRe, 0, NULL, NULL);
  	if (ciErr1 != CL_SUCCESS) {printf("Error in clEnqueueReadBuffer !!!

");}

showM(daneInA, N);
showM(daneOutRe, N);

}

The first thing that comes to mind are data races. Are you sure that your barrier() in your kernel is not trying to cross work-groups? Your work-group size is 2x2, so that barrier will only synchronize 4 work-items. If that is what you intended then I don’t have any other immediate thoughts.

i think about algoritm again and it seems to me that the synchronization in algorithm is not needed - they are indeed multiple reads of the same buffer, but always for a given pair global_id and local_id (in the sense of two-dimensional), there is only one record to the output buffer. what is more without the use of barriers results are the same ( incorrect as in the beggining).

I tried to divide the two ‘for’ loops from one kernel into two separate ( fft and ifft) lets say ker1 and ker2.
results from ker1 are correct but results from ker2 are not, and i have no clue why.

row1 and row3 from ker2 are incorrect, also row1==row3. only row0 and row3 are correct. this occurs independent to local size - i tested it with local (1,1), (2,2) and (4,4). global size remains the same - (4,4).

any ideas of how to solve it ?