In the kernel function if i am doing min reduction which involves a while loop; i am getting the correct answer for smaller data sets but not getting the correct results for large datasets.

I think i have a synchronization problem between the threads, can anyone help me or tell me where do i need to put the barrier function in the kernel function and the reason for it.

__kernel void add_update_min(__global int *selected, __global float *d, __global float *A, __global float *dcopy, __global int *stVertex,__global int *stVertexcopy,__global int *destVertex, int n)

{

// Getting the global id of the thread

int tid = get_global_id(0);

int p;

p = destVertex[0];

// Updating distances of all the vertices after a new vertex is added

if(selected[tid] == 0)

{

if(d[tid] > A[tid*n + p])

{

d[tid] = A[tid*n + p];

stVertex[tid] = p;

}

}

else

{

d[tid] = 9999;

}

barrier(CLK_GLOBAL_MEM_FENCE);

stVertexcopy[tid] = tid;

dcopy[tid] = d[tid];

barrier(CLK_GLOBAL_MEM_FENCE);

//Calculating the minimum among all the d[i]'s

while(tid < n/2 )

{

if(n%2 != 0) // check if number of elements in consideration are suitable for normal min reduction (check out the else part)

{

if(tid != (n-1)/2) // if thread id is not the same as the middle element : think 9 and 4 [0,1,2,3,4]

{

if(dcopy[tid] > dcopy[tid + (n+1)/2]) // find the smalles

{

dcopy[tid] = dcopy[tid + (n+1)/2];

stVertexcopy[tid] = stVertexcopy[tid + (n+1)/2];

}

}

if(tid == 0) // shall also include the middle element in each iteration of min reduction --- think of a 23 element array [0....22]

{

// every thread is going to execute this

if(dcopy[tid] > dcopy[(n-1)/2])

{

dcopy[tid] = dcopy[(n-1)/2];

stVertexcopy[tid] = stVertexcopy[(n-1)/2];

}

}

n = (n-1)/2;

}

else

{

if(dcopy[tid] > dcopy[tid + n/2])

{

dcopy[tid] = dcopy[tid + n/2];

stVertexcopy[tid] = stVertexcopy[tid + n/2];

}

n = n/2;

}

barrier(CLK_GLOBAL_MEM_FENCE);

}

barrier(CLK_GLOBAL_MEM_FENCE);

if(tid == 0)

{

destVertex[0] = stVertexcopy[0];

p = destVertex[0];

selected[p] = 1;

}

}

Here i have the global item size as n

local item size i have specified it as NULL.