Hi,
I found several examples of kernel code for sparse matrix-vector multiplication (with the matrix stored in CSR format), but I don’t get the expected result, and I don’t understand why.
Here is my kernel code
/*
 * Sparse matrix-vector product vec_out = A * vec_in for a matrix A stored in
 * CSR form (values / col_idx / row_ptr).
 *
 * Each row is processed by a team of 32 consecutive work-items: every member
 * accumulates a strided partial dot product, the partials are combined with a
 * tree reduction in local memory, and the first member of the team stores the
 * row result.
 *
 * values    non-zero coefficients of A
 * col_idx   column index of each entry of values
 * row_ptr   row_ptr[r] .. row_ptr[r+1]-1 are the entries of row r
 *           (so it is read up to index num_rows)
 * vec_in    dense input vector
 * vec_out   dense output vector, one element written per row
 * num_rows  number of rows of A
 * red_mem   local scratch, indexed by get_local_id(0)
 *
 * NOTE(review): the launch must provide 32 work-items per row (global size
 * >= num_rows * 32), a work-group size that is a multiple of 32, and one
 * float of red_mem per work-item of the group -- confirm on the host side.
 */
__kernel void spmv(__global float *values, __global int *col_idx, __global int *row_ptr, __global float *vec_in, __global float *vec_out, const int num_rows, __local float *red_mem)
{
    const int items_per_row = 32;
    const int row = get_global_id(0) / items_per_row;
    const int local_id = get_local_id(0);
    const int local_row_offset = local_id % items_per_row;

    /* Work-items mapped past the last row must not return early: every
       work-item of the group has to reach the barriers below. They simply
       contribute a zero partial sum instead. */
    float sum = 0.0f;
    if (row < num_rows)
    {
        const int s = row_ptr[row];
        const int e = row_ptr[row + 1];
        for (int i = s + local_row_offset; i < e; i += items_per_row)
        {
            sum += values[i] * vec_in[col_idx[i]];
        }
    }
    red_mem[local_id] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Tree reduction over the 32 partial sums of the row. A barrier after
       each step makes the writes of one step visible to the next without
       relying on lock-step execution of the hardware. */
    for (int stride = items_per_row / 2; stride > 0; stride >>= 1)
    {
        if (local_row_offset < stride)
        {
            red_mem[local_id] += red_mem[local_id + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /* Exactly one work-item owns each row, so store the result (the previous
       "+=" added it to whatever the output buffer contained before). */
    if (row < num_rows && local_row_offset == 0)
    {
        vec_out[row] = red_mem[local_id];
    }
}
and here is how I launch the kernel (blockSize is initialized to 32)
cl::Event ndrEvent;
int sRes = wRes *hRes;
if ((sRes%blockSize)!=0)
sRes = sRes +blockSize-(sRes%blockSize);
cl::NDRange globalSize(sRes,1); //sRes is the size of the output vector
cl::NDRange localSize(blockSize,1);
//I set the kernel’s arguments
err = queue.enqueueNDRangeKernel (
kernel,
cl::NullRange,
globalSize,
localSize,
NULL,
&ndrEvent);
I’m kind of desperate, so it would be great if someone could help me!
Thanks in advance