hello all,
can somebody help me with this code?
import pyopencl as cl
import numpy
a = numpy.random.rand(8, 8).astype(numpy.float32)
b = numpy.random.rand(8, 8).astype(numpy.float32)
print a
print b
block_size = 4
width = 8
tile_width = 2
def HostMultMatrix(A, B):
C = numpy.empty_like(a)
for i in range(width):
for j in range(width):
tmp = 0
for k in range(width):
a_t = A[i][k]
b_t = B[k][j]
tmp += a_t * b_t
C[i][j] = tmp
return C
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)
prg = cl.Program(ctx, """
#define BLOCK_SIZE 4
#define WIDTH 8
__kernel __attribute__ ((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
void DevMultMatrix(__global const float *a,
__global const float *b,
__global float *c)
{
//Threads ID
int gidx = get_global_id(0);
int gidy = get_global_id(1);
//Group ID
int bx = get_group_id(0);
int by = get_group_id(1);
int row = by * BLOCK_SIZE + gidy;
int col = bx * BLOCK_SIZE + gidx;
barrier(CLK_LOCAL_MEM_FENCE);
if((row < WIDTH) && (col < WIDTH)){
float sum = 0;
for(int i = 0; i < WIDTH; i++){
sum += a[row * WIDTH + i] * b[i * WIDTH + col];
barrier(CLK_LOCAL_MEM_FENCE);
}
c[row * WIDTH + col] = sum;
}
}
""").build()
prg.DevMultMatrix(queue, a.shape,(block_size,block_size),a_buf, b_buf, dest_buf)
a_mult_b = numpy.empty_like(a)
cl.enqueue_copy(queue, a_mult_b, dest_buf)
host_c = HostMultMatrix(a, b)
print host_c
print a_mult_b
i cant run 4 workgroups to calculate the final result of the matrix,
i just get the rigth answer if a change de workgroup size to de size of the matrix, but i’d like to run some workgroups im parallel, can somebody help me with this?