mtenay

10-21-2012, 05:56 AM

I recently wrote this kernel in open cl,which is working well and returning correct results.

spmv_csr_scalar_kernel(const int num_rows,

const int * ptr,

const int * indices,

const float * data,

const float * x,

float * y )

{

int row = get_global_id(0);

if(row < num_rows)

{

float dot = 0;

int row_start = ptr[row];

int row_end = ptr[row+1];

for (int jj = row_start; jj < row_end; jj++)

{

dot += data[jj] * x[indices[jj]];

}

y[row] += dot;

}

}

It is multiplication of a sparse matrix in compressed row storage with a column vector.It returns correct result.Uses one work item for each for loop (from row_start to row_end).

I want to convert the above code to using two work items per single for loop.But I am getting incorrect answers.here is what I could come write.

__kernel void mykernel(__global int* colvector,

__global int* val,

__global int* result,

__global int* index,

__global int* rowptr,

__global int* sync )

{

__global int vals[8]={0,0,0,0,0,0,0,0};

for(int i=0;i<4;i++)

{

result[i]=0;

}

barrier(CLK_GLOBAL_MEM_FENCE);

int thread_id=get_global_id(0);

int warp_id=thread_id/2;

int lane=(thread_id)&1;

int row=warp_id;

if(row<4)

{

int row_start = rowptr[row];

int row_end = rowptr[row+1];

vals[thread_id]=0;

for (int i = row_start+lane; i<row_end; i+=2)

{

vals[thread_id]+=val[i]*colvector[index[i]];

}

if(lane==0)

{

vals[thread_id]+=vals[thread_id+1];

}

if(lane==0)

{

result[row] += vals[thread_id];

}

}

}

Can anybody help me plzzzzz?My deadline is tomorrow and its returning incorrect results.

spmv_csr_scalar_kernel(const int num_rows,

const int * ptr,

const int * indices,

const float * data,

const float * x,

float * y )

{

int row = get_global_id(0);

if(row < num_rows)

{

float dot = 0;

int row_start = ptr[row];

int row_end = ptr[row+1];

for (int jj = row_start; jj < row_end; jj++)

{

dot += data[jj] * x[indices[jj]];

}

y[row] += dot;

}

}

It is multiplication of a sparse matrix in compressed row storage with a column vector.It returns correct result.Uses one work item for each for loop (from row_start to row_end).

I want to convert the above code to using two work items per single for loop.But I am getting incorrect answers.here is what I could come write.

__kernel void mykernel(__global int* colvector,

__global int* val,

__global int* result,

__global int* index,

__global int* rowptr,

__global int* sync )

{

__global int vals[8]={0,0,0,0,0,0,0,0};

for(int i=0;i<4;i++)

{

result[i]=0;

}

barrier(CLK_GLOBAL_MEM_FENCE);

int thread_id=get_global_id(0);

int warp_id=thread_id/2;

int lane=(thread_id)&1;

int row=warp_id;

if(row<4)

{

int row_start = rowptr[row];

int row_end = rowptr[row+1];

vals[thread_id]=0;

for (int i = row_start+lane; i<row_end; i+=2)

{

vals[thread_id]+=val[i]*colvector[index[i]];

}

if(lane==0)

{

vals[thread_id]+=vals[thread_id+1];

}

if(lane==0)

{

result[row] += vals[thread_id];

}

}

}

Can anybody help me plzzzzz?My deadline is tomorrow and its returning incorrect results.