hi, i m new in this forum, my name is luis, i m trying to do a kernel to realize a matrix inversion. I have the next code:
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
__kernel void INVERSA(__global double* C,const int T,__local double* Aux2)
{
int base = get_global_id(0);
int iloc = get_local_id(0);
int nloc = get_local_size(0);
int G = get_global_size(0);
int i,j=0,k,ban=0,ban2=0,Z;
double Aux,X,Y;
for (i=0;i<T;i++){
X=C[i + (i*G)];
for (k=iloc;k<G;k=k+nloc)
Aux2[k]=C[k + (i*G)];
barrier(CLK_LOCAL_MEM_FENCE);
while(X==0 && j<T+1){
if(ban==0){
j=i+1;
Aux=Aux2[base];
ban=1;
}
for (k=iloc;k<G;k=k+nloc)
Aux2[k]=C[k + (j*G)];
barrier(CLK_LOCAL_MEM_FENCE);
C[base + (i*G)]=Aux2[base];
C[base + (j*G)]=Aux;
X=Aux2[i];
if(X==0){
C[base + (i*G)]=Aux;
C[base + (j*G)]=Aux2[base];
}
j++;
}
ban=0;
Aux2[base]=Aux2[base]/X;
Aux=Aux2[base];
C[base + (i*G)]=Aux2[base];
Z=0;
for (j=0;j<T;j++){
for (k=iloc;k<G;k=k+nloc)
Aux2[k]=C[k + (j*G)];
barrier(CLK_LOCAL_MEM_FENCE);
if(Z!=i) {
Y=Aux2[i];
if(Y!=0){
Aux2[base]= Aux2[base]-Y*Aux;
C[base + (j*G)]=Aux2[base];
}
}
Z++;
}
}
}
variable T refers the matrix row or colum size, my global_size is 2 times more that the T size because i agree in the matriz the identy matrix, so if i have a 3 X 3 matrix, really in the code i compute 3 X 3 matrix plus the 3 X 3 identy matrix, the results of the inverse are the side of the identy matrix after the computation
this code is effective with 1000 *1000 matrix size, but with a greatter size i have bad results, my GPU card is NVIDIA Tesla C2075 with specificacion of 1024 max group size, someone have a code that help me with this problem?, or any idea that resolve this problem?, i m new in OpenCL and every review are welcome, thanks a lot.