PDA

View Full Version : Matrix-Matrix multiply [HELP!]

AllanMSouza
02-06-2013, 05:36 AM
hello all,
can somebody help me with this code?

import pyopencl as cl
import numpy

a = numpy.random.rand(8, 8).astype(numpy.float32)
b = numpy.random.rand(8, 8).astype(numpy.float32)

print a
print b

block_size = 4
width = 8
tile_width = 2

def HostMultMatrix(A, B):
C = numpy.empty_like(a)
for i in range(width):
for j in range(width):
tmp = 0
for k in range(width):
a_t = A[i][k]
b_t = B[k][j]
tmp += a_t * b_t
C[i][j] = tmp

return C

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

mf = cl.mem_flags
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

prg = cl.Program(ctx, """
#define BLOCK_SIZE 4
#define WIDTH 8

__kernel __attribute__ ((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
void DevMultMatrix(__global const float *a,
__global const float *b,
__global float *c)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);

//Group ID
int bx = get_group_id(0);
int by = get_group_id(1);

int row = by * BLOCK_SIZE + gidy;
int col = bx * BLOCK_SIZE + gidx;

barrier(CLK_LOCAL_MEM_FENCE);
if((row < WIDTH) && (col < WIDTH)){
float sum = 0;
for(int i = 0; i < WIDTH; i++){
sum += a[row * WIDTH + i] * b[i * WIDTH + col];
barrier(CLK_LOCAL_MEM_FENCE);
}
c[row * WIDTH + col] = sum;
}

}
""").build()
prg.DevMultMatrix(queue, a.shape,(block_size,block_size),a_buf, b_buf, dest_buf)

a_mult_b = numpy.empty_like(a)
cl.enqueue_copy(queue, a_mult_b, dest_buf)

host_c = HostMultMatrix(a, b)
print host_c
print a_mult_b

i cant run 4 workgroups to calculate the final result of the matrix,
i just get the rigth answer if a change de workgroup size to de size of the matrix, but i'd like to run some workgroups im parallel, can somebody help me with this?