Hello,
I wrote an algorithm and I want to take advantage of local memory. I have two arrays, a[row][col] and C[col], and I do some math on them. Below is the relevant part of the code, showing how I create the buffers and choose the work-group size. My question is how to use local memory here: I am a bit confused about how to choose the size of the local memory, and if the local memory is smaller than the data held in global memory, how should I handle that?
Thank you in advance.
// Device buffers, sized to match the kernel's indexing: a holds row * col
// floats, C holds col floats, and O receives one float per row.
Buffer bufferA(context, CL_MEM_READ_ONLY, sizeof(cl_float) * col * row);
Buffer bufferC(context, CL_MEM_READ_ONLY, sizeof(cl_float) * col);
Buffer bufferO(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * row);

// Argument indices follow the kernel's parameter order: a, C, O, row, col.
kernel.setArg(0, bufferA);
kernel.setArg(1, bufferC);
kernel.setArg(2, bufferO);
// NOTE(review): the kernel declares row/col as `const int`; confirm the host
// variables are 32-bit ints (cl_int) so the argument sizes match.
kernel.setArg(3, row);
kernel.setArg(4, col);

NDRange globalNDRange(row); // Total number of work items: one per row
// A work-group size of 1 puts every work-item in its own group, which
// typically leaves most of a compute unit idle. NullRange lets the OpenCL
// runtime choose a work-group size that is valid for this global size.
NDRange localNDRange = NullRange;
queue.enqueueNDRangeKernel(kernel, NullRange, globalNDRange, localNDRange, NULL, &event);
// rmsCalculation: one work-item per row of a.
//
// For the row ar = get_global_id(0) and each column j, define
//   term(j)   = C[j] * a[ar * col + j]
//   prefix(j) = term(0) + ... + term(j - 1), accumulated left to right.
// The kernel adds
//   R(j) = (prefix(j) + term(j) > term(j)) ? prefix(j) + term(j) : term(j)
// to a running total and writes that total to O[ar].
//
// Parameters:
//   a   - input values, indexed as a[ar * col + j]
//   C   - per-column coefficients, col entries; the index used does not
//         depend on the work-item id
//   O   - output, one value written per work-item at O[ar]
//   row - number of rows; work-items with id >= row do nothing
//   col - number of columns
//
// The prefix is carried from one column to the next instead of being rebuilt
// with an inner loop over k < j, so each row costs O(col) rather than
// O(col^2). The additions happen in the same left-to-right order as the
// rebuilt sum would use.
__kernel void rmsCalculation(const __global float* a,
                             const __global float * C,
                             __global float * O,
                             const int row,
                             const int col)
{
    const int ar = (int)get_global_id(0);

    // Lets the host round the global size up without out-of-range accesses.
    if (ar >= row) {
        return;
    }

    // Widen before multiplying so ar * col cannot overflow a 32-bit int.
    const __global float *a_row = a + (size_t)ar * (size_t)col;

    float prefix = 0.0f; // term(0) + ... + term(j - 1)
    float sum = 0.0f;

    for (int j = 0; j < col; ++j) {
        const float c = C[j] * a_row[j];
        const float with_prefix = prefix + c;

        // Same value the do/while settled on: the prefix is included only
        // when adding it makes the result strictly larger than c alone.
        sum += (with_prefix > c) ? with_prefix : c;

        prefix = with_prefix; // becomes prefix(j + 1)
    }

    O[ar] = sum;
}