kernel understanding

Hi dear OpenCL Community,

To improve my understanding of OpenCL, I want to port some simple code from Java to OpenCL.
The OpenCL code works for small sizes, but when I increase the vector size
the kernel crashes.
Here is the Java part:


// Reference implementation: for each element of a, add up (a[row] + b[col])
// over every element of b and print the total.
FloatBuffer a = FloatBuffer.wrap(new float[]{1, 1, 1, 1});
FloatBuffer b = FloatBuffer.wrap(new float[]{2.2f, 2, 3, 4, 5});

final int rows = a.capacity();
final int cols = b.capacity();

int row = 0;
while (row < rows) {
    float total = 0;
    // Accumulate in ascending col order so float rounding matches a plain nested loop.
    for (int col = 0; col < cols; col++) {
        total += a.get(row) + b.get(col);
    }
    System.out.println(total);
    row++;
}

And here is the OpenCL equivalent. I call the kernel with a global size of [a.capacity(), b.capacity()],
and the output buffer has the same size as a:



/*
 * For each index i in dimension 0, computes
 *     output[i] = sum over j of (a[i] + b[j])
 * staging b through local memory one tile of get_local_size(1) floats at a time.
 *
 * Launch layout (2-D NDRange):
 *   dimension 0 - selects the element of a / output
 *                 (index = get_group_id(0) * get_local_size(0) + get_local_id(0));
 *   dimension 1 - its global size is used as the number of b elements to read,
 *                 its local size as the tile width.
 *
 * Parameters:
 *   a      - input values, one per dimension-0 index.
 *   b      - input values; tiles 0 .. get_global_size(1) / get_local_size(1) - 1
 *            are read. The division is an integer division, so if the global
 *            size is not a multiple of the local size the tail of b is skipped.
 *   output - receives one sum per dimension-0 index.
 *   block  - local scratch; indices 0 .. get_local_size(1) - 1 are used, so the
 *            host must size it for at least get_local_size(1) floats.
 *
 * The kernel receives no buffer lengths and performs no bounds checks; the
 * launch sizes must match the buffers.
 *
 * NOTE(review): `kernel` is also a function-qualifier keyword in OpenCL C.
 * The name is kept so existing host code (clCreateKernel with this name)
 * still works, but a stricter compiler may reject it - consider renaming
 * kernel and host side together.
 */
__kernel void kernel(__global const float* a, __global const float* b, __global float* output, __local float* block)
{
    /* The result depends only on the dimension-0 index, so every work-group
     * along dimension 1 would recompute and store the same value. Only the
     * first one does the work. The group id is identical for all work-items
     * of a work-group, so the whole group returns together and nobody is
     * left waiting at a barrier. */
    if (get_group_id(1) != 0) {
        return;
    }

    const int ti = get_local_id(0);
    const int tj = get_local_id(1);

    const int ni = get_local_size(0);
    const int nj = get_local_size(1);

    const int gti = (int)get_group_id(0) * ni + ti;

    /* Number of tiles of width nj that cover the dimension-1 global size. */
    const int nb = (int)get_global_size(1) / nj;

    const float va = a[gti];
    float sum = 0.0f;

    for (int jb = 0; jb < nb; jb++) {
        /* One work-item per tile slot loads b; the other rows of the
         * work-group would only store the same value to the same slot. */
        if (ti == 0) {
            block[tj] = b[jb * nj + tj];
        }
        /* Both barriers stay outside the conditionals so that every
         * work-item of the group reaches them. */
        barrier(CLK_LOCAL_MEM_FENCE);

        /* Only the work-item that stores the result needs to accumulate. */
        if (tj == 0) {
            for (int k = 0; k < nj; k++) {
                sum += va + block[k];
            }
        }

        /* Keep the tile intact until all readers are done with it. */
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    /* Exactly one store per output element. */
    if (tj == 0) {
        output[gti] = sum;
    }
}


Thanks in advance