I also have another question…
I want to make sure I understand the definition of size_t global_size[] and size_t local_size[] in two dimensions, for the global and local work spaces.
I have to read from a buffer (from my first kernel) an array of size QxN, and from my second kernel an array of size Qxk. So I suppose I have to define global_size[] = {Q,N} and local_size[] = {16,16}; I think the local size is similar to the block size in CUDA, so I chose 16x16 as it seemed most appropriate.
For my second kernel I define global_size[] = {Q,k} and local_size[] = {16,16}.
So, when I run my program I pass 3 arguments: size N, size Q, and size k. If I try to create an array with
{N=16, Q=16, k=16} or {N=64, Q=64, k=16} I don't have any problem, but if I try {N=128, Q=128, k=16} or another combination I get error -54, CL_INVALID_WORK_GROUP_SIZE. So I think there is something I haven't understood well, and I would be grateful if someone could help me with this issue. I have read many blogs and sites, but I am posting here again because I want an answer to my specific problem.
Thank you in advance!
Here is the code…
//======================= GPU ==========================//
/* Create device and context for the first kernel.
 * NOTE: the original paste used typographic quotes (“ ”) inside the
 * perror string literal, which does not compile; restored to ASCII. */
device = create_device();
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
    perror("Couldn't create a context");
    exit(1);
}
/* Build program (compiles the .cl source in PROGRAM_FILE) */
program1 = build_program(context, device, PROGRAM_FILE);
/* Create data buffers for the first kernel */
int numQuery = numQueries;
int D = numDim;                      /* number of dimensions per point */
double *data = training_set.data;
double *Query = query_set.data;
/* NOTE(review): CL_INVALID_WORK_GROUP_SIZE (-54) from clEnqueueNDRangeKernel
 * happens when (a) global_size is not an exact multiple of local_size in a
 * dimension, or (b) local_size[0]*local_size[1] (here BLK_SIZE*BLK_SIZE = 256
 * for BLK_SIZE = 16) exceeds the device's CL_DEVICE_MAX_WORK_GROUP_SIZE or the
 * per-kernel CL_KERNEL_WORK_GROUP_SIZE limit (which can shrink for
 * register-heavy / double-precision kernels). Query those limits with
 * clGetDeviceInfo / clGetKernelWorkGroupInfo instead of hard-coding 16x16. */
size_t global_size[] = {numQuery, numObjects};
size_t local_size[] = {BLK_SIZE, BLK_SIZE};
/* training set: D x numObjects, copied host -> device at creation */
Matrix d_N;
d_N.width = D;                       /* num dimensions */
d_N.height = numObjects;             /* num objects */
size_t sizeN = D * numObjects * sizeof(double);
d_N.elements = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeN, data, &err);
if (err < 0) {
    perror("Couldn't create buffer d_N");
    exit(1);
}
/* query set: D x numQuery */
Matrix d_Q;
d_Q.width = D;                       /* num dimensions */
d_Q.height = numQuery;               /* num queries */
size_t sizeQ = numQuery * D * sizeof(double);
d_Q.elements = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeQ, Query, &err);
if (err < 0) {
    perror("Couldn't create buffer d_Q");
    exit(1);
}
/* result: numObjects x numQuery distance matrix, initialised from Dist */
Matrix d_result_Dist;
d_result_Dist.width = numObjects;    /* num objects */
d_result_Dist.height = numQuery;     /* num queries */
size_t sizeDist = numObjects * numQuery * sizeof(double);
/* (original Greek note: "maybe the number of groups needs to go in here somehow") */
d_result_Dist.elements = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeDist, Dist, &err);
if (err < 0) {
    perror("Couldn't create buffer d_result_Dist");
    exit(1);
}
/* Create a command queue with profiling enabled so kernel timings can be
 * read back via clGetEventProfilingInfo. */
queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
if (err < 0) {
    perror("Couldn't create a command queue 1");
    printf("error %d\n", err);
    exit(1);
}
/* Create the first kernel (original Greek note: "this takes the first kernel") */
kernel_createDist = clCreateKernel(program1, "kernel_createDist", &err);
if (err < 0) {
    perror("Couldn't create a kernel 1");
    printf("error %d\n", err);
    exit(1);
}
/* Create kernel arguments TO LOCAL TO EXEI VALEI STO PARADEIGMA EPEIDI O KERNEL TOU EXEI ORISMA TYPOU _LOCAL */
err = clSetKernelArg(kernel_createDist, 0, sizeof(d_N.elements),(void*) &d_N.elements);
err = clSetKernelArg(kernel_createDist, 1, sizeof(d_Q.elements), (void*)&d_Q.elements);
err = clSetKernelArg(kernel_createDist, 2, sizeof(d_result_Dist.elements), (void*)&d_result_Dist.elements);
err = clSetKernelArg(kernel_createDist, 3, sizeof(d_N.width), (void*)&d_N.width);
err = clSetKernelArg(kernel_createDist, 4, sizeof(d_result_Dist.width), (void*)&d_result_Dist.width);
if(err < 0) {
perror(“Couldn’t create a kernel argument 1”);
printf("error %d
", err);
exit(1);
}
//wait complete queue
clFinish(queue);
/* Enqueue kernel */
err = clEnqueueNDRangeKernel(queue, kernel_createDist, 2, NULL, global_size, local_size, 0, NULL, &event);
if(err < 0) {
perror(“Couldn’t enqueue the kernel 1 %d”);
printf("error %d
", err);
exit(1);
}
// wait for sure to finish the kernel
clWaitForEvents(1 , &event);
// compute of time in GPU
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
float executionTimeInMilliseconds1 = (end - start) * 1.0e-6f;
printf("[OPENCL] Time elapsed for GPU first kernel: %f s
", executionTimeInMilliseconds1);
/* Read the kernel’s output */
err = clEnqueueReadBuffer(queue, d_result_Dist.elements, CL_TRUE, 0, sizeDist, Dist, 0, NULL, NULL);
if(err < 0) {
perror(“Couldn’t enqueue the kernel 1 %d”);
printf("error %d
", err);
exit(1);
}
/* Release all first-kernel resources; the context is released last because
 * the buffers, kernel, queue and program were created from it. */
clReleaseMemObject(d_N.elements);
clReleaseMemObject(d_Q.elements);
clReleaseMemObject(d_result_Dist.elements);
clReleaseKernel(kernel_createDist);
clReleaseCommandQueue(queue);
clReleaseProgram(program1);
clReleaseContext(context);
/* Debug prints kept disabled. In the original paste the closing ");" of the
 * first printf had escaped onto its own uncommented line, which would not
 * compile; rejoined here. */
//~ printf("----Dist gpu---\n");
//~ printMat(Dist, numQuery, numObjects);
//============================= SECOND KERNEL =================================//
/* NOTE(review): re-creating the device, context and program for the second
 * kernel is wasteful — the first context/queue could have been reused
 * instead of being released and rebuilt. Kept as in the original flow. */
device = create_device();
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
    perror("Couldn't create a context 2");
    printf("error %d\n", err);
    exit(1);
}
program2 = build_program(context, device, PROGRAM_FILE);
/* 2-D range for the sorting kernel: numQuery x k. The same constraints
 * apply: each global dimension must be a multiple of the corresponding
 * local dimension, so k must be a multiple of BLK_SIZE here. */
size_t global_size2[] = {numQuery, k};
size_t local_size2[] = {BLK_SIZE, BLK_SIZE};
/* input: the full numObjects x numQuery distance matrix produced by the
 * first kernel, copied back to the device. */
Matrix d_D;
d_D.width = numObjects;              /* num objects */
d_D.height = numQuery;               /* num queries */
size_t sizeDist1 = numObjects * numQuery * sizeof(double);
d_D.elements = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeDist1, Dist, &err);
if (err < 0) {
    perror("Couldn't create buffer d_D");
    exit(1);
}
/* result: indices of the k nearest neighbours per query.
 * NOTE(review): sized with sizeof(double) — confirm the host array NNidx
 * really holds doubles and not ints, otherwise this size is wrong. */
Matrix d_NNidx;
d_NNidx.width = k;
d_NNidx.height = numQuery;           /* num queries */
size_t sizeId = k * numQuery * sizeof(double);
d_NNidx.elements = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeId, NNidx, &err);
if (err < 0) {
    perror("Couldn't create buffer d_NNidx");
    exit(1);
}
/* result: distances of the k nearest neighbours per query. */
Matrix d_result;
d_result.width = k;
d_result.height = numQuery;          /* num queries */
size_t sizeRe = k * numQuery * sizeof(double);
d_result.elements = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeRe, NNdist, &err);
if (err < 0) {
    perror("Couldn't create buffer d_result");
    exit(1);
}
/* Create a command queue with profiling enabled. */
queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);
if (err < 0) {
    perror("Couldn't create a command queue 2");
    printf("error %d\n", err);
    exit(1);
}
kernel_ParallelSorting = clCreateKernel(program2, "kernel_ParallelSorting", &err);
if(err < 0) {
perror("Couldn't create a kernel 2");
printf("error %d
", err);
exit(1);
};
//~ /* Create kernel arguments */
err = clSetKernelArg(kernel_ParallelSorting, 0, sizeof(d_D.elements), (void*)&d_D.elements);
err = clSetKernelArg(kernel_ParallelSorting, 1, sizeof(d_result.elements), (void*)&d_result.elements);
err = clSetKernelArg(kernel_ParallelSorting, 2, sizeof(d_NNidx.elements), (void*)&d_NNidx.elements);
err = clSetKernelArg(kernel_ParallelSorting, 3, sizeof(d_D.width), (void*)&d_D.width);
err = clSetKernelArg(kernel_ParallelSorting, 4, sizeof(d_result.width), (void*)&d_result.width);
if(err < 0) {
perror(“Couldn’t create a kernel argument 2”);
printf("error num %d
",err);
exit(1);
}
clFinish(queue);
//~ /* Enqueue kernel */
err = clEnqueueNDRangeKernel(queue, kernel_ParallelSorting, 2, NULL, global_size2, local_size2, 0, NULL, &event);
if(err < 0) {
perror(“Couldn’t enqueue the kernel 2”);
printf("error %d
", err);
exit(1);
}
clWaitForEvents(1 , &event);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
float executionTimeInMilliseconds2 = (end - start) * 1.0e-6f;
printf("[OPENCL] Time elapsed for GPU first kernel: %f s
", executionTimeInMilliseconds2);
//read buffer 1
err = clEnqueueReadBuffer(queue, d_result.elements , CL_TRUE, 0, sizeRe, NNdist, 0, NULL, NULL);
if(err < 0) {
perror("Couldn't read the buffer 2 for d_kDist");
printf("error %d
", err);
exit(1);
}
// read buffer 2
err = clEnqueueReadBuffer(queue, d_NNidx.elements, CL_TRUE, 0, sizeId, NNidx, 0, NULL, NULL);
if(err < 0) {
perror("Couldn't read the buffer 2 for d_kDist");
exit(1);
}
printf("+++++++++++++++++++++ knns GPU +++++++++++++++++++++++++
");
printMat(NNdist,numQueries,k);
/* Release all second-kernel resources; the context is released last, after
 * the buffers, kernel, queue and program that were created from it. */
clReleaseMemObject(d_D.elements);
clReleaseMemObject(d_result.elements);
clReleaseMemObject(d_NNidx.elements);
clReleaseKernel(kernel_ParallelSorting);
clReleaseCommandQueue(queue);
clReleaseProgram(program2);
clReleaseContext(context);