global & local size in 2D problem

i have also another question… :frowning:

I want to make sure I understand how `size_t global_size[]` and `size_t local_size[]` are defined for a 2-dimensional global and local work space.

My first kernel writes an array of size QxN to a buffer, and my second kernel writes an array of size Qxk, so I suppose I have to define global_size[] = {Q,N} and local_size[] = {16,16}. I think the local size is similar to the block size in CUDA, so I chose 16x16 as the most appropriate value.
For my second kernel I define global_size[] = {Q,k} and local_size[] = {16,16}.

So, when I run my program I give 3 arguments: size N, size Q and size k. If I try sizes of
{N=16, Q=16, k=16} or {N=64, Q=64, k=16} I don't have any problem, but if I try {N=128, Q=128, k=16} or another combination I get error -54 (CL_INVALID_WORK_GROUP_SIZE), so I think there is something I haven't understood well. I would be grateful if someone could help me with this issue. I have read many blogs and sites, but I am posting here because I want an answer to my specific problem.

thank you anyway!

here is the code…

//======================= GPU ==========================//

/* Create device and context */
device = create_device();
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);

if(err < 0) {
	perror("Couldn't create a context");
	exit(1);
}

/* Build program */
program1 = build_program(context, device, PROGRAM_FILE);

/* Create data buffer */
int numQuery = numQueries;
int D = numDim;

double *data = training_set.data;
double *Query = query_set.data;

/* NOTE(review): each entry of global_size must be an integer multiple of the
 * corresponding entry of local_size, otherwise clEnqueueNDRangeKernel fails
 * with CL_INVALID_WORK_GROUP_SIZE. Round the global sizes up to a multiple of
 * BLK_SIZE and add a bounds check inside the kernel for the padded items. */
size_t global_size[] = {numQuery, numObjects};
size_t local_size[] = {BLK_SIZE, BLK_SIZE};


/* Training-set matrix (numObjects x D), copied to the device read-only */
Matrix d_N;
d_N.width = D;				/* num dimensions */
d_N.height = numObjects;	/* num objects */

size_t sizeN = D * numObjects * sizeof(double);

d_N.elements = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeN, data, &err);
if(err < 0) {
	perror("Couldn't create buffer d_N");
	exit(1);
}

/* Query matrix (numQuery x D), copied to the device read-only */
Matrix d_Q;
d_Q.width = D;			/* num dimensions */
d_Q.height = numQuery;	/* num queries */

size_t sizeQ = numQuery * D * sizeof(double);

d_Q.elements = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeQ, Query, &err);
if(err < 0) {
	perror("Couldn't create buffer d_Q");
	exit(1);
}

/* Result: distance matrix (numQuery x numObjects) the kernel will fill.
 * (original note: "maybe the number of groups needs to go in here somehow") */
Matrix d_result_Dist;
d_result_Dist.width = numObjects;	/* num objects */
d_result_Dist.height = numQuery;	/* num queries */

size_t sizeDist = numObjects * numQuery * sizeof(double);

d_result_Dist.elements = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeDist, Dist, &err);
if(err < 0) {
	perror("Couldn't create buffer d_result_Dist");
	exit(1);
}

/* Create a command queue */
queue = clCreateCommandQueue(context, device,  CL_QUEUE_PROFILING_ENABLE , &err);
if(err &lt; 0){
	perror("Couldn't create a command queue 1");
	printf("error %d 

", err);
exit(1);
};

/* Create a kernel EDW THA PAREI TON PRWTO KERNEL */
kernel_createDist = clCreateKernel(program1, "kernel_createDist", &err);
if(err &lt; 0) {
	perror("Couldn't create a kernel 1");
	printf("error %d 

", err);
exit(1);
};

/* Create kernel arguments TO LOCAL TO EXEI VALEI STO PARADEIGMA EPEIDI O KERNEL TOU EXEI ORISMA TYPOU _LOCAL */
err = clSetKernelArg(kernel_createDist, 0, sizeof(d_N.elements),(void*) &d_N.elements);
err = clSetKernelArg(kernel_createDist, 1, sizeof(d_Q.elements), (void*)&d_Q.elements);
err = clSetKernelArg(kernel_createDist, 2, sizeof(d_result_Dist.elements), (void*)&d_result_Dist.elements);
err = clSetKernelArg(kernel_createDist, 3, sizeof(d_N.width), (void*)&d_N.width);
err = clSetKernelArg(kernel_createDist, 4, sizeof(d_result_Dist.width), (void*)&d_result_Dist.width);

if(err < 0) {
perror(“Couldn’t create a kernel argument 1”);
printf("error %d
", err);
exit(1);
}
//wait complete queue
clFinish(queue);

/* Enqueue kernel */
err = clEnqueueNDRangeKernel(queue, kernel_createDist, 2, NULL, global_size, local_size, 0, NULL, &event);

if(err < 0) {
perror(“Couldn’t enqueue the kernel 1 %d”);
printf("error %d
", err);
exit(1);
}
// wait for sure to finish the kernel
clWaitForEvents(1 , &event);

// compute of time in GPU
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);

float executionTimeInMilliseconds1 = (end - start) * 1.0e-6f;
printf("[OPENCL] Time elapsed for GPU first kernel: %f s
", executionTimeInMilliseconds1);

/* Read the kernel’s output */
err = clEnqueueReadBuffer(queue, d_result_Dist.elements, CL_TRUE, 0, sizeDist, Dist, 0, NULL, NULL);
if(err < 0) {
perror(“Couldn’t enqueue the kernel 1 %d”);
printf("error %d
", err);
exit(1);
}

clReleaseMemObject(d_N.elements);
clReleaseMemObject(d_Q.elements);
clReleaseMemObject(d_result_Dist.elements);


clReleaseKernel(kernel_createDist);
clReleaseCommandQueue(queue);
clReleaseProgram(program1);
clReleaseContext(context);

//~ printf("----Dist gpu---

");
//~ printMat(Dist,numQuery,numObjects);
//~

//============================= SECOND KERNEL =================================//
/* NOTE(review): a fresh device/context/program is created here even though the
 * first ones could likely be reused — TODO confirm this is intentional. */
device = create_device();
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if(err < 0) {
	perror("Couldn't create a context 2");
	printf("error %d\n", err);
	exit(1);
}

program2 = build_program(context, device, PROGRAM_FILE);

/* 2-D NDRange for the sorting kernel.
 * NOTE(review): both global sizes must be integer multiples of BLK_SIZE,
 * otherwise the enqueue below fails with CL_INVALID_WORK_GROUP_SIZE. */
size_t global_size2[] = {numQuery, k};
size_t local_size2[] = {BLK_SIZE, BLK_SIZE};

/* Input: distance matrix produced by the first kernel (numQuery x numObjects) */
Matrix d_D;
d_D.width = numObjects;	/* num objects */
d_D.height = numQuery;	/* num queries */

size_t sizeDist1 = numObjects * numQuery * sizeof(double);

d_D.elements = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeDist1, Dist, &err);
if(err < 0) {
	perror("Couldn't create buffer d_D");
	exit(1);
}

/* Result: indices of the k nearest neighbours per query (numQuery x k).
 * NOTE(review): sized with sizeof(double) — confirm NNidx really holds doubles
 * and not an integer index type. */
Matrix d_NNidx;
d_NNidx.width = k;
d_NNidx.height = numQuery;	/* num queries */

size_t sizeId = k * numQuery * sizeof(double);

d_NNidx.elements = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeId, NNidx, &err);
if(err < 0) {
	perror("Couldn't create buffer d_NNidx");
	exit(1);
}

/* Result: distances of the k nearest neighbours per query (numQuery x k) */
Matrix d_result;
d_result.width = k;
d_result.height = numQuery;	/* num queries */

size_t sizeRe = k * numQuery * sizeof(double);

d_result.elements = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeRe, NNdist, &err);
if(err < 0) {
	perror("Couldn't create buffer d_result");
	exit(1);
}

/* Create a command queue */
queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
if(err &lt; 0) {
	perror("Couldn't create a command queue 2");
	printf("error %d 

", err);
exit(1);
};

kernel_ParallelSorting = clCreateKernel(program2, "kernel_ParallelSorting", &err);

if(err &lt; 0) {
	perror("Couldn't create a kernel 2");
	printf("error %d 

", err);
exit(1);
};

//~ /* Create kernel arguments  */
err = clSetKernelArg(kernel_ParallelSorting, 0, sizeof(d_D.elements), (void*)&d_D.elements);

err = clSetKernelArg(kernel_ParallelSorting, 1, sizeof(d_result.elements), (void*)&d_result.elements);

err = clSetKernelArg(kernel_ParallelSorting, 2, sizeof(d_NNidx.elements), (void*)&d_NNidx.elements);

err = clSetKernelArg(kernel_ParallelSorting, 3, sizeof(d_D.width), (void*)&d_D.width);

err = clSetKernelArg(kernel_ParallelSorting, 4, sizeof(d_result.width), (void*)&d_result.width);

if(err < 0) {
perror(“Couldn’t create a kernel argument 2”);
printf("error num %d
",err);
exit(1);
}

clFinish(queue);

//~ /* Enqueue kernel */
err = clEnqueueNDRangeKernel(queue, kernel_ParallelSorting, 2, NULL, global_size2, local_size2, 0, NULL, &event);

if(err < 0) {
perror(“Couldn’t enqueue the kernel 2”);
printf("error %d
", err);
exit(1);
}

clWaitForEvents(1 , &event);

clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);

float executionTimeInMilliseconds2 = (end - start) * 1.0e-6f;
printf("[OPENCL] Time elapsed for GPU first kernel: %f s

", executionTimeInMilliseconds2);

//read buffer 1
err = clEnqueueReadBuffer(queue, d_result.elements , CL_TRUE, 0, sizeRe, NNdist, 0, NULL, NULL);

if(err &lt; 0) {
	perror("Couldn't read the buffer 2 for d_kDist");
	printf("error %d 

", err);
exit(1);
}

// read buffer 2
err = clEnqueueReadBuffer(queue, d_NNidx.elements, CL_TRUE, 0, sizeId, NNidx, 0, NULL, NULL);

if(err &lt; 0) {
  perror("Couldn't read the buffer 2 for d_kDist");
  exit(1);

}

printf("+++++++++++++++++++++ knns GPU +++++++++++++++++++++++++

");
printMat(NNdist,numQueries,k);

clReleaseMemObject(d_D.elements);
clReleaseMemObject(d_result.elements);
clReleaseMemObject(d_NNidx.elements);

clReleaseKernel(kernel_ParallelSorting);
clReleaseCommandQueue(queue);
clReleaseProgram(program2);

clReleaseContext(context);

I know you mentioned numbers in your question, but what are all these values actually set to here?

Hint: global_size needs to be an integer multiple of local_size

The error code you’re receiving is suggesting that this is not the case (or that BLK_SIZE is too big for the device).

Each local size must be no greater than the corresponding entry of CL_DEVICE_MAX_WORK_ITEM_SIZES (per dimension), and the total work-group size (multiply the local sizes together) must be no greater than CL_DEVICE_MAX_WORK_GROUP_SIZE and CL_KERNEL_WORK_GROUP_SIZE to work. Supposedly kernels will be faster if the work-group size is a multiple of CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, but this isn’t always true.

The local size can be left unspecified (pass NULL) and the driver will pick something legal that works. Be aware that for some global sizes this could be {1,1}, which will be inefficient — but it’s a good way to get things working before getting fancy.

The global size must be a multiple of the local size. Therefore, you need to round up the global size in some cases. Also, then you need to do a check in your kernels to only process if the global_id values are less than your actual desired size (passed as arguments).

Messy stuff that should have just been handled by the runtime…