Matrix-Matrix multiply [HELP!]

hello all,
can somebody help me with this code?


import pyopencl as cl
import numpy

a = numpy.random.rand(8, 8).astype(numpy.float32)
b = numpy.random.rand(8, 8).astype(numpy.float32)

print a
print b

block_size = 4
width = 8
tile_width = 2

def HostMultMatrix(A, B):
  C = numpy.empty_like(a)
  for i in range(width):
    for j in range(width):
      tmp = 0
      for k in range(width):
        a_t = A[i][k]
        b_t = B[k][j]
        tmp += a_t * b_t
      C[i][j] = tmp

  return C 


ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

mf = cl.mem_flags
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

prg = cl.Program(ctx, """
#define BLOCK_SIZE 4
#define WIDTH 8

  __kernel __attribute__ ((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
    void DevMultMatrix(__global const float *a,
	           __global const float *b, 
             __global float *c)
    {
      //Threads ID
      int gidx = get_global_id(0);
      int gidy = get_global_id(1);
      
      //Group ID      
      int bx = get_group_id(0);
      int by = get_group_id(1);

      int row = by * BLOCK_SIZE + gidy;
      int col = bx * BLOCK_SIZE + gidx;
      
      barrier(CLK_LOCAL_MEM_FENCE);
      if((row < WIDTH) && (col < WIDTH)){
          float sum = 0;
          for(int i = 0; i < WIDTH; i++){
              sum += a[row * WIDTH + i] * b[i * WIDTH + col];
              barrier(CLK_LOCAL_MEM_FENCE);
          }
          c[row * WIDTH + col] = sum;
      }

    }
    """).build()
prg.DevMultMatrix(queue, a.shape,(block_size,block_size),a_buf, b_buf, dest_buf)

a_mult_b = numpy.empty_like(a)
cl.enqueue_copy(queue, a_mult_b, dest_buf)

host_c = HostMultMatrix(a, b)
print host_c 
print a_mult_b


i cant run 4 workgroups to calculate the final result of the matrix,
i just get the rigth answer if a change de workgroup size to de size of the matrix, but i’d like to run some workgroups im parallel, can somebody help me with this?