Problem with a multithreaded SpMV kernel (matrix in CSR format)

Hi,

I found several sources of kernel code for sparse matrix-vector multiplication (with the matrix stored in the CSR format), but I don’t get the expected result, and I don’t understand why.

Here is my kernel code:
/*
 * Sparse matrix-vector product for a matrix stored in CSR form.
 *
 * A group of 32 consecutive work-items cooperates on one matrix row: each
 * work-item accumulates every 32nd non-zero of the row, the 32 partial sums
 * are combined in local memory, and the first work-item of the group adds the
 * row total to vec_out[row].
 *
 * values   - non-zero values of the matrix
 * col_idx  - column index of each entry of `values`
 * row_ptr  - start offset of each row in `values`/`col_idx`; entries
 *            row_ptr[row] and row_ptr[row + 1] are read, so it needs
 *            num_rows + 1 entries
 * vec_in   - input vector, indexed by column
 * vec_out  - output vector, indexed by row; the row total is ADDED to the
 *            existing content, so the buffer must be zero-filled beforehand
 *            if the plain product is wanted
 * num_rows - number of matrix rows
 * red_mem  - local scratch buffer, one float per work-item of the work-group
 *
 * Launch requirements: the global size must cover num_rows * 32 work-items,
 * and the work-group size must be a multiple of 32 so that the reduction
 * never reads past the scratch buffer or into another row's slots.
 */
__kernel void spmv(__global float *values, __global int *col_idx, __global int *row_ptr, __global float *vec_in, __global float *vec_out, const int num_rows, __local float *red_mem)
{
    const int items_per_row = 32;
    const int row = get_global_id(0) / items_per_row;
    const int local_id = get_local_id(0);
    const int local_row_offset = local_id % items_per_row;

    /* Padding work-items must not return early: every work-item of the
       work-group has to reach the barriers below. They contribute 0. */
    const bool in_range = row < num_rows;

    float sum = 0;
    if (in_range)
    {
        const int s = row_ptr[row];
        const int e = row_ptr[row + 1];
        for (int i = s + local_row_offset; i < e; i += items_per_row)
        {
            sum += values[i] * vec_in[col_idx[i]];
        }
    }

    red_mem[local_id] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Tree reduction over the 32 partial sums of this row. The barrier after
       each step makes the writes of one step visible before the next step
       reads them; lock-step execution of the 32 work-items is not assumed. */
    for (int stride = items_per_row / 2; stride > 0; stride /= 2)
    {
        if (local_row_offset < stride)
        {
            red_mem[local_id] += red_mem[local_id + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (in_range && local_row_offset == 0)
    {
        vec_out[row] += red_mem[local_id];
    }
}

And here is how I launch the kernel (blockSize is initialized to 32):

cl::Event ndrEvent;
int sRes = wRes *hRes;
if ((sRes%blockSize)!=0)
sRes = sRes +blockSize-(sRes%blockSize);

cl::NDRange globalSize(sRes,1); //sRes is the size of the output vector
cl::NDRange localSize(blockSize,1);

//I set the kernel’s arguments

err = queue.enqueueNDRangeKernel (
kernel,
cl::NullRange,
globalSize,
localSize,
NULL,
&ndrEvent);

I’m kind of desperate, so if someone could help me, it would be great !!

Thanks in advance