Want to write from global to local and local to global memory

Hi,
I am very new in OpenCL. I am trying to write from global to local and then local to global memory on Intel FPGA. But I am not getting the right answer. I got all output values equal zero. Could anyone please tell me what happened?
Thanks in advance.

Kernel code:
__kernel void g_l_g(__global int* restrict in, __global int* restrict out) {
local int lmem [32];
int gi = get_global_id(0);
int li = get_local_id(0);
int res = in[gi];
#pragma unroll
for ( int i = 0; i<32; i++){
lmem[li] = res;
res >>=1;}
barrier(CLK_GLOBAL_MEM_FENCE);
res = 0;
#pragma unroll
for ( int i = 0; i<32; i++){
res ^= lmem[li];}
out[gi] = res;}

Host:

unsigned N = 64; // problem size
unsigned lmem = 32; // problem size
const unsigned num_block_rows = N / lmem;
for(unsigned i = 0; i < num_devices; ++i) {
queue[i] = clCreateCommandQueue(context, device[i], CL_QUEUE_PROFILING_ENABLE, &status);
checkError(status, “Failed to create command queue”);
const char *kernel_name = “g_l_g”;
kernel[i] = clCreateKernel(program, kernel_name, &status);
checkError(status, “Failed to create kernel”);
n_per_device[i] = num_block_rows / num_devices; // number of elements handled by this device
if(i < (num_block_rows % num_devices)) {
n_per_device[i]++;
}

n_per_device[i] *= lmem;
input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
n_per_device[i] * sizeof(float), NULL, &status);
checkError(status, “Failed to create buffer for input A”);
output_buf[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
n_per_device[i] * sizeof(float), NULL, &status);
checkError(status, “Failed to create buffer for output”);
}
return true;
}

if(num_devices == 0) {
checkError(-1, “No devices”);
}
input_a.reset(num_devices);
output.reset(num_devices);
ref_output.reset(num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
input_a[i].reset(n_per_device[i]);
output[i].reset(n_per_device[i]);
ref_output[i].reset(n_per_device[i]);
for(unsigned j = 0; j < n_per_device[i]; ++j) {
input_a[i][j] = rand_float();
ref_output[i][j] = input_a[i][j];
}

cl_int status;
const double start_time = getCurrentTimestamp();
scoped_array<cl_event> kernel_event(num_devices);
scoped_array<cl_event> finish_event(num_devices);

for(unsigned i = 0; i < num_devices; ++i) {
cl_event write_event[1];
status = clEnqueueWriteBuffer(queue[i], input_a_buf[i], CL_FALSE,
0, n_per_device[i] * sizeof(float), input_a[i], 0, NULL, &write_event[0]);
checkError(status, “Failed to transfer input A”);
unsigned argi = 0;
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_a_buf[i]);
checkError(status, “Failed to set argument %d”, argi - 1);
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &output_buf[i]);
checkError(status, “Failed to set argument %d”, argi - 1);
const size_t global_work_size = n_per_device[i];
const size_t local_work_size[1] = {32};
status = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL,
&global_work_size, local_work_size, 1, write_event, &kernel_event[i]);
checkError(status, “Failed to launch kernel”)
status = clEnqueueReadBuffer(queue[i], output_buf[i], CL_FALSE,
0, n_per_device[i] * sizeof(float), output[i], 1, &kernel_event[i], &finish_event[i]);
checkError(status, “Failed to read output matrix”);
clReleaseEvent(write_event[0]);}

I figured it out. Thanks, everybody.