How to know maximum allowed vector length in an OpenCL program?

Yogita · October 10, 2018, 3:04am

Hi,

I am new to OpenCL. I am trying to run a simple OpenCL program for Vector Addition on NVIDIA Tesla M2050.
I have set global_work_size to 1024 and local_work_size to 16. The program works fine if I enter 994 as the total number of vector elements at runtime, but it hangs for more than 994 elements. I want to run this program with a large data set so that I can compare its performance with the same computation running on the CPU.
Is this because of the mentioned values of global_work_size and local_work_size ?
I have read articles related to global_work_size but I couldn’t get clarity.
How can I know the maximum value that I can provide at runtime for total number of elements in vector ?

Any help in this regard would be really appreciated.

Thanks !

Yogita · October 12, 2018, 12:26am

Yes, you are right: the context is being created without specifying any platform (NULL is passed for the platform). Is this not correct?
And the code is running.
Please specify which errors you are talking about?
The device selected using clGetDeviceIds and device in context are same.

Please have a look at the code :

OpenCL file ::

#define MAX_SOURCE_SIZE (0x10000)
#define MEM_SIZE (1024)
#include<stdio.h>
#include<stdlib.h> 
#include "CL/cl.h" 

int main()
{
  cl_uint ret_num_platforms;
  cl_uint ret_num_devices;
  cl_platform_id platform_id = NULL;
  cl_kernel kernel2 = NULL;
  cl_program program2 = NULL;
  cl_command_queue command_queue = NULL;
  cl_context context = NULL;
  cl_device_id device_id = NULL;
  cl_int ret;
  FILE * fp2;
  char fileName2[]="./kernel.cl";
  int for_var=0;
  char * source_str2;
  size_t source_size2;
  size_t globalWorkSize[2];
  size_t localWorkSize[2];
  cl_mem cl_buffer3;
  cl_mem cl_buffer2;
  cl_mem cl_buffer1;
  cl_mem cl_buffer0;
  int *A;
  int *B;
  int *C;
  int *n;
  int i;
  n = ((int *)(malloc((sizeof(int )))));
  printf("Enter the number of elements of vector : 
");
  scanf("%d",n);
  A = ((int *)(malloc((( *n) * sizeof(int )))));
  B = ((int *)(malloc((( *n) * sizeof(int )))));
  C = ((int *)(malloc((( *n) * sizeof(int )))));
  printf("
Input Vector1 :
");
  for (i = 0; i <=  *n - 1; i += 1) {
    A[i] = (2 * i);
    printf("%d ",A[i]);
  }
  printf("

Input Vector2 :
");
  for (i = 0; i <=  *n - 1; i += 1) {
    B[i] = (3 * i);
    printf("%d ",B[i]);
  }
  ret = clGetPlatformIDs(1,&platform_id,&ret_num_platforms);
  if (ret != CL_SUCCESS) {
    printf("Platform error");
  }
  ret = clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_DEFAULT,1,&device_id,&ret_num_devices);
  if (ret != CL_SUCCESS)
  printf("device err");
  context=clCreateContext(NULL,1,&device_id,NULL,NULL,&ret);
  if (!context)
  printf("context err");
  command_queue = clCreateCommandQueue(context,device_id,0,&ret);
  if (!command_queue)
  printf("command queue error");
  localWorkSize[0] = 16;
  localWorkSize[1] = 16;
  globalWorkSize[0] =1024;
  globalWorkSize[1] =1024;
  cl_buffer0=clCreateBuffer(context, CL_MEM_WRITE_ONLY, MEM_SIZE * sizeof(int), NULL, &ret);
  cl_buffer1=clCreateBuffer(context, CL_MEM_WRITE_ONLY, MEM_SIZE * sizeof(int), NULL, &ret);
  cl_buffer3=clCreateBuffer(context, CL_MEM_WRITE_ONLY, MEM_SIZE * sizeof(int), NULL, &ret);
 cl_buffer2=clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(int), NULL, &ret);
  ret = clEnqueueWriteBuffer(command_queue, cl_buffer0 , CL_TRUE, 0,MEM_SIZE * sizeof(int), A, 0, NULL, NULL);
  ret = clEnqueueWriteBuffer(command_queue, cl_buffer1 , CL_TRUE, 0,MEM_SIZE * sizeof(int), B, 0, NULL, NULL);
  ret = clEnqueueWriteBuffer(command_queue, cl_buffer3 , CL_TRUE, 0,MEM_SIZE * sizeof(int), n, 0, NULL, NULL);
  ret = clEnqueueWriteBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,MEM_SIZE * sizeof(int), C, 0, NULL, NULL);
  fp2 = fopen(fileName2,"r");
  if (!fp2) {
    fprintf(stderr,"Failed");
    exit(1);
  }
  source_str2 = (char*)malloc(MAX_SOURCE_SIZE);
  source_size2 = fread(source_str2,1,MAX_SOURCE_SIZE,fp2);
  fclose(fp2);
  program2 = clCreateProgramWithSource(context, 1, (const char **)&source_str2,(const size_t *)&source_size2, &ret);
  if(!program2)
  printf("error creating program2");
  ret = clBuildProgram(program2, 1, &device_id, NULL, NULL, NULL);
  if (ret)
  printf("error building program2");
  kernel2 = clCreateKernel(program2, "ADD" , &ret);
  ret = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &cl_buffer0);
  ret = clSetKernelArg(kernel2, 1, sizeof(cl_mem), &cl_buffer1);
  ret = clSetKernelArg(kernel2, 2, sizeof(cl_mem), &cl_buffer2);
  ret = clSetKernelArg(kernel2, 3, sizeof(cl_mem), &cl_buffer3);
  ret = clEnqueueNDRangeKernel(command_queue, kernel2, 1, NULL, globalWorkSize, localWorkSize, 0 , NULL , NULL);
  ret = clEnqueueReadBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,MEM_SIZE * sizeof(int), C, 0, NULL, NULL);
  printf("

Addition of vectors :
");
  for (i = 0; i <=  *n - 1; i += 1) {
    printf("%d ",C[i]);
  }
  clReleaseMemObject(cl_buffer0);
  clReleaseMemObject(cl_buffer1);
  clReleaseMemObject(cl_buffer2);
  clReleaseMemObject(cl_buffer3);
clReleaseCommandQueue(command_queue);
  clReleaseContext(context);
  return 0; 
}

Kernel file (kernel.cl) ::

/*
 * Element-wise vector addition: C[i] = A[i] + B[i] for every i below *n.
 *
 * A and B are taken from __global memory instead of __constant memory.
 * A __constant argument cannot be larger than the device's
 * CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE (commonly 64 KB, i.e. 16384 ints),
 * which caps the vector length; __global buffers are limited only by
 * CL_DEVICE_MAX_MEM_ALLOC_SIZE. The host still binds a cl_mem object to
 * each of the four arguments, exactly as before.
 *
 * n stays a one-element __constant buffer holding the element count.
 * Work-items whose global id is at or beyond *n do nothing, so the host may
 * round the global work size up to a multiple of the work-group size.
 */
__kernel void ADD(__global const int *A, __global const int *B, __global int *C, __constant int *n)
{
  int i = get_global_id(0);
  if (i < *n) {
    C[i] = A[i] + B[i];
  }
}

bashbaug · October 15, 2018, 2:42pm

This code is allocating some number of integers based on user input, but then unconditionally writing MEM_SIZE integers to the allocated OpenCL buffers, so it’s reading beyond the malloc’d memory. Depending what happens to be beyond the malloc’d memory this could either execute fine… or crash unpredictably.

I’d be curious to know if OCLGrind is able to identify these errors:

You may also find the Intercept Layer for OpenCL Applications to be helpful:

I was looking at a call log from the Intercept Layer for OpenCL Applications when I observed the mismatch in the allocated buffer size and the size passed to clEnqueueWriteBuffer.

Yogita · October 17, 2018, 5:57am

Thanks @bashbaug

Here is the updated code -

#define MAX_SOURCE_SIZE (0x10000)
#include<stdio.h>
#include<stdlib.h>
#include "CL/cl.h" 

int main()
{
  cl_uint ret_num_platforms;
  cl_uint ret_num_devices;
  cl_platform_id platform_id = NULL;

  cl_kernel kernel2 = NULL;

  cl_program program2 = NULL;

  cl_command_queue command_queue = NULL;
  cl_context context = NULL;
  cl_device_id device_id = NULL;
  cl_int ret;

  FILE * fp2;
  char fileName2[]="./vec_add.cl";

  int for_var=0;

  char * source_str2;
  size_t source_size2;

  size_t globalWorkSize[1];
  size_t localWorkSize[1];
  cl_mem cl_buffer3;
  cl_mem cl_buffer2;
  cl_mem cl_buffer1;
  cl_mem cl_buffer0;
  int *A;
  int *B;
  int *C;
  int *n;
  int i;
  n = ((int*)(malloc((sizeof(int)))));
  printf("Enter the number of elements of vector : 
");
  scanf("%d",n);
  A = ((int *)(malloc((( *n) * sizeof(int )))));
  B = ((int *)(malloc((( *n) * sizeof(int )))));
  C = ((int *)(malloc((( *n) * sizeof(int )))));
  printf("
Input Vector1 :
");
  for (i = 0; i <=  *n - 1; i += 1) {
    A[i] = (2 * i);
    printf("%d ",A[i]);
  }
  printf("

Input Vector2 :
");
  for (i = 0; i <=  *n - 1; i += 1) {
    B[i] = (3 * i);
    printf("%d ",B[i]);
  }
  ret = clGetPlatformIDs(1,&platform_id,&ret_num_platforms);
  if (ret != CL_SUCCESS) {
    printf("Platform error");
  }
  ret = clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_GPU,1,&device_id,&ret_num_devices);

  if (ret != CL_SUCCESS)
  printf("device err");

  context=clCreateContext(NULL,1,&device_id,NULL,NULL,&ret);
  if (!context)
  printf("context err");

  command_queue = clCreateCommandQueue(context,device_id,0,&ret);

  if (!command_queue)
  printf("command queue error");

  cl_buffer0=clCreateBuffer(context, CL_MEM_WRITE_ONLY, (*n) * sizeof(int), NULL, &ret);
 
cl_buffer1=clCreateBuffer(context, CL_MEM_WRITE_ONLY, (*n) * sizeof(int), NULL, &ret);

  cl_buffer3=clCreateBuffer(context, CL_MEM_WRITE_ONLY,  sizeof(int), NULL, &ret);
 
cl_buffer2=clCreateBuffer(context, CL_MEM_READ_WRITE, (*n) * sizeof(int), NULL, &ret);

ret = clEnqueueWriteBuffer(command_queue, cl_buffer0 , CL_TRUE, 0,(*n) * sizeof(int), A, 0, NULL, NULL);
 
 ret = clEnqueueWriteBuffer(command_queue, cl_buffer1 , CL_TRUE, 0,(*n) * sizeof(int), B, 0, NULL, NULL);
 
 ret = clEnqueueWriteBuffer(command_queue, cl_buffer3 , CL_TRUE, 0, sizeof(int), n, 0, NULL, NULL);
 
 ret = clEnqueueWriteBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,(*n) * sizeof(int), C, 0, NULL, NULL);

  fp2 = fopen(fileName2,"r");
  if (!fp2) {
    fprintf(stderr,"Failed");
    exit(1);
  }
  source_str2 = (char*)malloc(MAX_SOURCE_SIZE);
  source_size2 = fread(source_str2,1,MAX_SOURCE_SIZE,fp2);
  fclose(fp2);

  program2 = clCreateProgramWithSource(context, 1, (const char **)&source_str2,(const size_t *)&source_size2, &ret);

      if(!program2)
  printf("error creating program2");

  ret = clBuildProgram(program2, 1, &device_id, NULL, NULL, NULL);

  if (ret)
  printf("error building program2");

  kernel2 = clCreateKernel(program2, "VEC_ADD" , &ret);

  ret = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &cl_buffer0);

  ret = clSetKernelArg(kernel2, 1, sizeof(cl_mem), &cl_buffer1);
 
 ret = clSetKernelArg(kernel2, 2, sizeof(cl_mem), &cl_buffer2);

  ret = clSetKernelArg(kernel2, 3, sizeof(cl_mem), &cl_buffer3);

 localWorkSize[0] = 16;
  
  globalWorkSize[0] = 16400;
  
  ret = clEnqueueNDRangeKernel(command_queue, kernel2, 1, NULL, globalWorkSize, localWorkSize, 0 , NULL , NULL);

  ret = clEnqueueReadBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,(*n) * sizeof(int), C, 0, NULL, NULL);
  
printf("

Addition of vectors :
");
  for (i = 0; i <=  *n - 1; i += 1) {
    printf("%d ",C[i]);
  }

    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel2);
    ret = clReleaseProgram(program2);
    ret = clReleaseMemObject(cl_buffer0);
    ret = clReleaseMemObject(cl_buffer1);
    ret = clReleaseMemObject(cl_buffer2);
    ret = clReleaseMemObject(cl_buffer3);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free(A);
    free(B);
    free(C);
    free(n);

  return 0;

}

Now this runs up to n = 16343, but I want to check the performance for a much larger data set.
How can I achieve that?

Yogita · October 21, 2018, 10:08pm

Thanks @bashbaug

Here is the updated code -

#define MAX_SOURCE_SIZE (0x10000)
#include<stdio.h>
#include<stdlib.h>
#include "CL/cl.h" 

int main()
{
  cl_uint ret_num_platforms;
  cl_uint ret_num_devices;
  cl_platform_id platform_id = NULL;

  cl_kernel kernel2 = NULL;

  cl_program program2 = NULL;

  cl_command_queue command_queue = NULL;
  cl_context context = NULL;
  cl_device_id device_id = NULL;
  cl_int ret;

  FILE * fp2;
  char fileName2[]="./vec_add.cl";

  int for_var=0;

  char * source_str2;
  size_t source_size2;

  size_t globalWorkSize[1];
  size_t localWorkSize[1];
  cl_mem cl_buffer3;
  cl_mem cl_buffer2;
  cl_mem cl_buffer1;
  cl_mem cl_buffer0;
  int *A;
  int *B;
  int *C;
  int *n;
  int i;
  n = ((int*)(malloc((sizeof(int)))));
  printf("Enter the number of elements of vector : 
");
  scanf("%d",n);
  A = ((int *)(malloc((( *n) * sizeof(int )))));
  B = ((int *)(malloc((( *n) * sizeof(int )))));
  C = ((int *)(malloc((( *n) * sizeof(int )))));
  printf("
Input Vector1 :
");
  for (i = 0; i <=  *n - 1; i += 1) {
    A[i] = (2 * i);
    printf("%d ",A[i]);
  }
  printf("

Input Vector2 :
");
  for (i = 0; i <=  *n - 1; i += 1) {
    B[i] = (3 * i);
    printf("%d ",B[i]);
  }
  ret = clGetPlatformIDs(1,&platform_id,&ret_num_platforms);
  if (ret != CL_SUCCESS) {
    printf("Platform error");
  }
  ret = clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_GPU,1,&device_id,&ret_num_devices);

  if (ret != CL_SUCCESS)
  printf("device err");

  context=clCreateContext(NULL,1,&device_id,NULL,NULL,&ret);
  if (!context)
  printf("context err");

  command_queue = clCreateCommandQueue(context,device_id,0,&ret);

  if (!command_queue)
  printf("command queue error");

  cl_buffer0=clCreateBuffer(context, CL_MEM_WRITE_ONLY, (*n) * sizeof(int), NULL, &ret);
 
cl_buffer1=clCreateBuffer(context, CL_MEM_WRITE_ONLY, (*n) * sizeof(int), NULL, &ret);

  cl_buffer3=clCreateBuffer(context, CL_MEM_WRITE_ONLY,  sizeof(int), NULL, &ret);
 
cl_buffer2=clCreateBuffer(context, CL_MEM_READ_WRITE, (*n) * sizeof(int), NULL, &ret);

ret = clEnqueueWriteBuffer(command_queue, cl_buffer0 , CL_TRUE, 0,(*n) * sizeof(int), A, 0, NULL, NULL);
 
 ret = clEnqueueWriteBuffer(command_queue, cl_buffer1 , CL_TRUE, 0,(*n) * sizeof(int), B, 0, NULL, NULL);
 
 ret = clEnqueueWriteBuffer(command_queue, cl_buffer3 , CL_TRUE, 0, sizeof(int), n, 0, NULL, NULL);
 
 ret = clEnqueueWriteBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,(*n) * sizeof(int), C, 0, NULL, NULL);

  fp2 = fopen(fileName2,"r");
  if (!fp2) {
    fprintf(stderr,"Failed");
    exit(1);
  }
  source_str2 = (char*)malloc(MAX_SOURCE_SIZE);
  source_size2 = fread(source_str2,1,MAX_SOURCE_SIZE,fp2);
  fclose(fp2);

  program2 = clCreateProgramWithSource(context, 1, (const char **)&source_str2,(const size_t *)&source_size2, &ret);

      if(!program2)
  printf("error creating program2");

  ret = clBuildProgram(program2, 1, &device_id, NULL, NULL, NULL);

  if (ret)
  printf("error building program2");

  kernel2 = clCreateKernel(program2, "VEC_ADD" , &ret);

  ret = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &cl_buffer0);

  ret = clSetKernelArg(kernel2, 1, sizeof(cl_mem), &cl_buffer1);
 
 ret = clSetKernelArg(kernel2, 2, sizeof(cl_mem), &cl_buffer2);

  ret = clSetKernelArg(kernel2, 3, sizeof(cl_mem), &cl_buffer3);

 localWorkSize[0] = 16;
  
  globalWorkSize[0] = 16400;
  
  ret = clEnqueueNDRangeKernel(command_queue, kernel2, 1, NULL, globalWorkSize, localWorkSize, 0 , NULL , NULL);

  ret = clEnqueueReadBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,(*n) * sizeof(int), C, 0, NULL, NULL);
  
printf("

Addition of vectors :
");
  for (i = 0; i <=  *n - 1; i += 1) {
    printf("%d ",C[i]);
  }

    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel2);
    ret = clReleaseProgram(program2);
    ret = clReleaseMemObject(cl_buffer0);
    ret = clReleaseMemObject(cl_buffer1);
    ret = clReleaseMemObject(cl_buffer2);
    ret = clReleaseMemObject(cl_buffer3);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free(A);
    free(B);
    free(C);
    free(n);

  return 0;

}

Now this runs up to n = 16384, but I want to check the performance for a much larger data set.
How can I make this program run for a much larger data set?