I recently wrote this kernel in open cl,which is working well and returning correct results.
spmv_csr_scalar_kernel(const int num_rows,
const int * ptr,
const int * indices,
const float * data,
const float * x,
float * y )
{
int row = get_global_id(0);
if(row < num_rows)
{
float dot = 0;
int row_start = ptr[row];
int row_end = ptr[row+1];
for (int jj = row_start; jj < row_end; jj++)
{
dot += data[jj] * x[indices[jj]];
}
y[row] += dot;
}
}
It is multiplication of a sparse matrix in compressed row storage with a column vector.It returns correct result.Uses one work item for each for loop (from row_start to row_end).
I want to convert the above code to using two work items per single for loop.But I am getting incorrect answers.here is what I could come write.
__kernel void mykernel(__global int* colvector,
__global int* val,
__global int* result,
__global int* index,
__global int* rowptr,
__global int* sync )
{
__global int vals[8]={0,0,0,0,0,0,0,0};
for(int i=0;i<4;i++)
{
result[i]=0;
}
barrier(CLK_GLOBAL_MEM_FENCE);
int thread_id=get_global_id(0);
int warp_id=thread_id/2;
int lane=(thread_id)&1;
int row=warp_id;
if(row<4)
{
int row_start = rowptr[row];
int row_end = rowptr[row+1];
vals[thread_id]=0;
for (int i = row_start+lane; i<row_end; i+=2)
{
vals[thread_id]+=val[i]*colvector[index[i]];
}
if(lane==0)
{
vals[thread_id]+=vals[thread_id+1];
}
if(lane==0)
{
result[row] += vals[thread_id];
}
}
}
Can anybody help me plzzzzz?My deadline is tomorrow and its returning incorrect results.