Issue building program on Android

We have tested and verified the following opencl kernel code on a Desktop machine running Windows 10 with Visual Studio 2015…
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
__kernel void loop(__global float* sig_real attribute( (max_constant_size(12012018 ))), __global float* sig_img attribute( (max_constant_size(12012018 ))), __global int* ti0 attribute( (max_constant_size(1681))), __global int* ti1 attribute( (max_constant_size(1681))), __global float* dt attribute( (max_constant_size(1681))), __global float* wtemp attribute( (max_constant_size(1681))), __global float* weight attribute( (max_constant_size(120120))), __global float pr_real attribute( (max_constant_size(1681))), __global float* pr_img attribute( (max_constant_size(1681))), __global float* im_real attribute( (max_constant_size(120120))), __global float im_img attribute( (max_constant_size(120*120))))
{

int x = get_global_id(0);
int y = get_global_id(1);
int ncx = get_global_size(0);
int ncy = get_global_size(1);
int ind = x + (y*ncx) ;
int i,offset;
float ntemp_real[18];
float ntemp_img[18];
float ntemp0_real[1681];
float ntemp0_img[1681];

float ntemp1_real[1681];
float ntemp1_img[1681];

float ndim_real[1681];
float ndim_img[1681];
float ndim_real_temp[1681];
float ndim_img_temp[1681];




for( i=0;i<18;i++){
ntemp_real[i] = sig_real[ind + (i*120*120)];
ntemp_img[i] = sig_img[ind + (i*120*120)];

}

barrier(CLK_LOCAL_MEM_FENCE);



for(i=0;i<1681;i++){

ntemp0_real[i] = ntemp_real[ti0[i]-1];
ntemp0_img[i] = ntemp_img[ti0[i]-1];
ntemp1_real[i] = ntemp_real[ti1[i]-1];
ntemp1_img[i] = ntemp_img[ti1[i]-1];

barrier(CLK_LOCAL_MEM_FENCE);

ndim_real[i] = fma((ntemp1_real[i] - ntemp0_real[i]) ,dt[i],ntemp0_real[i]);
ndim_img[i] = fma((ntemp1_img[i] - ntemp0_img[i]) ,dt[i],ntemp0_img[i]);

barrier(CLK_LOCAL_MEM_FENCE);

ndim_real[i] *= wtemp[i];
ndim_img[i] *= wtemp[i];

barrier(CLK_LOCAL_MEM_FENCE);

// phase rotate


ndim_real_temp[i] = (ndim_real[i] * pr_real[i]) - ( ndim_img[i] * pr_img[i]);
ndim_img_temp[i] = (ndim_real[i] * pr_img[i]) + (ndim_img[i] * pr_real[i]);

}

offset = 0;
int offset_dim = 0;

union{
      volatile unsigned int u32;
        volatile float        f32;
   } next, expected, current;

for(i=0;i<41;i++){
    for(int j=0;j<41;j++){
    
    offset = ((x+j) + ((y+i) * 160));
    offset_dim = (j + i*41) ;

    
      current.f32    = im_real[offset];
   do{
      expected.f32 = current.f32;
       next.f32     = (float)(expected.f32 + ndim_real_temp[offset_dim]);
       current.u32  = atomic_cmpxchg( (__global unsigned int *)&im_real[offset], 
                           expected.u32, next.u32);
   } while( current.u32 != expected.u32 );

  
      current.f32    = im_img[offset];
   do{
      expected.f32 = current.f32;
       next.f32     = (float)(expected.f32 + ndim_img_temp[offset_dim]);
       current.u32  = atomic_cmpxchg( (__global unsigned int *)&im_img[offset], 
                           expected.u32, next.u32);
   } while( current.u32 != expected.u32 );


    
    }
}



}

When we integrate the same code in an android NDK application, we get clBuildProgram() failed with error code -11 and error log being set to ‘Pass’. For your reference we are summarising our android NDK build environment here.
Android SDK : 21
Android Device: One Plus One
OpenCL version on mobile device : OpenCL 1.2
options passed to clBuildProgram(): " -cl-finite-math-only -cl-no-signed-zeros -cl-unsafe-math-optimizations "

In the above code, if we comment out the two lines with following code:

next.f32 = expected.f32 + ndim_real_temp[offset_dim];
then the clBuildProgram() succeeds. Please help us in understanding the possible causes of this problem.
In the above statement, two floating point variables are being added.