Hi everyone,

I've created a simple OpenCL program that performs a matrix multiplication. My kernel works with CUDA, but with OpenCL it works only with datasets of square matrices. Could someone please help me figure out why?

Thanks

IN THE HOST

...

size_t bd[]={16,16};

size_t gd[]={bd[0]*(numCColumns/bd[0]+1), bd[1]*(numCRows/bd[1]+1)};

err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, gd,

bd, 0, NULL, NULL);

...

DEVICE

/*
 * Computes C = A * B with one work-item per element of C.
 *
 * All three matrices are indexed in row-major order: element (r, c) of a
 * matrix with W columns is read from / written to index r * W + c.
 *
 * Work-item mapping: get_global_id(0) selects the column of C and
 * get_global_id(1) selects the row. Work-items that fall outside
 * numCRows x numCColumns (padding of the global size) do nothing.
 *
 * Parameters:
 *   A, B         input matrices
 *   C            output matrix
 *   numAColumns  row stride of A and the length of each dot product
 *   numBColumns  row stride of B
 *   numCRows,
 *   numCColumns  dimensions of C, used for bounds checking and as C's stride
 *   numARows,
 *   numBRows     not used by the kernel; kept so the argument list is unchanged
 */
__kernel void matrixMultiply(__global float * A, __global float * B, __global float * C,
                             int numARows, int numAColumns, int numBRows, int numBColumns,
                             int numCRows, int numCColumns) {
    int col = get_global_id(0);
    int row = get_global_id(1);
    int len = numAColumns;

    /* Skip padding work-items outside the result matrix. */
    if (col < numCColumns && row < numCRows) {
        float sum = 0.0f;

        /* Dot product of row `row` of A with column `col` of B. */
        for (int k = 0; k < len; k++) {
            sum += A[row * numAColumns + k] * B[k * numBColumns + col];
        }

        C[row * numCColumns + col] = sum;
    }
}

dataset 0

MAT A MAT B

64x64 64x64 OK

dataset 1

128x64 64x128 OK

dataset 2

100x128 128x56 NOT WORKING

dataset 3

128x64 64x128 OK

dataset 4

128x32 32x128 OK

dataset 5

200x100 100x256 NOT WORKING

dataset 6

256x256 256x256 OK

dataset 7

300x256 256x300 OK

dataset 8

128x64 64x128 OK

dataset 9

256x256 256x257 NOT WORKING