We have tested and verified the following opencl kernel code on a Desktop machine running Windows 10 with Visual Studio 2015…
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
__kernel void loop(__global float* sig_real attribute( (max_constant_size(12012018 ))), __global float* sig_img attribute( (max_constant_size(12012018 ))), __global int* ti0 attribute( (max_constant_size(1681))), __global int* ti1 attribute( (max_constant_size(1681))), __global float* dt attribute( (max_constant_size(1681))), __global float* wtemp attribute( (max_constant_size(1681))), __global float* weight attribute( (max_constant_size(120120))), __global float pr_real attribute( (max_constant_size(1681))), __global float* pr_img attribute( (max_constant_size(1681))), __global float* im_real attribute( (max_constant_size(120120))), __global float im_img attribute( (max_constant_size(120*120))))
{
int x = get_global_id(0);
int y = get_global_id(1);
int ncx = get_global_size(0);
int ncy = get_global_size(1);
int ind = x + (y*ncx) ;
int i,offset;
float ntemp_real[18];
float ntemp_img[18];
float ntemp0_real[1681];
float ntemp0_img[1681];
float ntemp1_real[1681];
float ntemp1_img[1681];
float ndim_real[1681];
float ndim_img[1681];
float ndim_real_temp[1681];
float ndim_img_temp[1681];
for( i=0;i<18;i++){
ntemp_real[i] = sig_real[ind + (i*120*120)];
ntemp_img[i] = sig_img[ind + (i*120*120)];
}
barrier(CLK_LOCAL_MEM_FENCE);
for(i=0;i<1681;i++){
ntemp0_real[i] = ntemp_real[ti0[i]-1];
ntemp0_img[i] = ntemp_img[ti0[i]-1];
ntemp1_real[i] = ntemp_real[ti1[i]-1];
ntemp1_img[i] = ntemp_img[ti1[i]-1];
barrier(CLK_LOCAL_MEM_FENCE);
ndim_real[i] = fma((ntemp1_real[i] - ntemp0_real[i]) ,dt[i],ntemp0_real[i]);
ndim_img[i] = fma((ntemp1_img[i] - ntemp0_img[i]) ,dt[i],ntemp0_img[i]);
barrier(CLK_LOCAL_MEM_FENCE);
ndim_real[i] *= wtemp[i];
ndim_img[i] *= wtemp[i];
barrier(CLK_LOCAL_MEM_FENCE);
// phase rotate
ndim_real_temp[i] = (ndim_real[i] * pr_real[i]) - ( ndim_img[i] * pr_img[i]);
ndim_img_temp[i] = (ndim_real[i] * pr_img[i]) + (ndim_img[i] * pr_real[i]);
}
offset = 0;
int offset_dim = 0;
union{
volatile unsigned int u32;
volatile float f32;
} next, expected, current;
for(i=0;i<41;i++){
for(int j=0;j<41;j++){
offset = ((x+j) + ((y+i) * 160));
offset_dim = (j + i*41) ;
current.f32 = im_real[offset];
do{
expected.f32 = current.f32;
next.f32 = (float)(expected.f32 + ndim_real_temp[offset_dim]);
current.u32 = atomic_cmpxchg( (__global unsigned int *)&im_real[offset],
expected.u32, next.u32);
} while( current.u32 != expected.u32 );
current.f32 = im_img[offset];
do{
expected.f32 = current.f32;
next.f32 = (float)(expected.f32 + ndim_img_temp[offset_dim]);
current.u32 = atomic_cmpxchg( (__global unsigned int *)&im_img[offset],
expected.u32, next.u32);
} while( current.u32 != expected.u32 );
}
}
}
When we integrate the same code in an android NDK application, we get clBuildProgram() failed with error code -11 and error log being set to ‘Pass’. For your reference we are summarising our android NDK build environment here.
Android SDK : 21
Android Device: One Plus One
OpenCL version on mobile device : OpenCL 1.2
options passed to clBuildProgram(): " -cl-finite-math-only -cl-no-signed-zeros -cl-unsafe-math-optimizations "
In the above code, if we comment out the two lines with following code:
next.f32 = expected.f32 + ndim_real_temp[offset_dim];
then the clBuildProgram() succeeds. Please help us in understanding the possible causes of this problem.
In the above statement, two floating point variables are being added.