Results 1 to 5 of 5

Thread: How to know maximum allowed vector length in an OpenCL program?

  1. #1

    How to know maximum allowed vector length in an OpenCL program?

    Hi,

    I am new to OpenCL. I am trying to run a simple OpenCL program for Vector Addition on NVIDIA Tesla M2050.
    I have set global_work_size to 1024 and local_work_size to 16. The program works fine if I enter 994 as the total number of vector elements at runtime, but it hangs for more than 994 elements. I want to run this program with a large dataset so that I can compare its performance with the version running on the CPU.
    Is this because of the mentioned values of global_work_size and local_work_size ?
    I have read articles related to global_work_size but I couldn't get clarity.
    How can I know the maximum value that I can provide at runtime for total number of elements in vector ?

    Any help in this regard would be really appreciated.

    Thanks !

  2. #2
    Junior Member
    Join Date
    Oct 2018
    Posts
    3
    The context is being created without using any platform? How is this code running at all? You need to check the error codes returned by the OpenCL APIs and, importantly, call clCreateContextFromType. It seems the device selected using clGetDeviceIDs and the device in the context are different. EDIT: It looks like a context can be created from the device alone, and the platform can be NULL in that case. But the test case always crashes on my machine as of now. Let me revisit the code.

  3. #3
    Yes you are right, context is being created without using any platform(putting platform as null). Is this not correct ?
    And the code is running.
    Please specify which errors you are talking about?
    The device selected using clGetDeviceIds and device in context are same.

    Please have a look at the code :

    OpenCL file ::

    Code :
    #define MAX_SOURCE_SIZE (0x10000)
    #define MEM_SIZE (1024)
    #include<stdio.h>
    #include<stdlib.h> 
    #include "CL/cl.h" 
     
    int main()
    {
      cl_uint ret_num_platforms;
      cl_uint ret_num_devices;
      cl_platform_id platform_id = NULL;
      cl_kernel kernel2 = NULL;
      cl_program program2 = NULL;
      cl_command_queue command_queue = NULL;
      cl_context context = NULL;
      cl_device_id device_id = NULL;
      cl_int ret;
      FILE * fp2;
      char fileName2[]="./kernel.cl";
      int for_var=0;
      char * source_str2;
      size_t source_size2;
      size_t globalWorkSize[2];
      size_t localWorkSize[2];
      cl_mem cl_buffer3;
      cl_mem cl_buffer2;
      cl_mem cl_buffer1;
      cl_mem cl_buffer0;
      int *A;
      int *B;
      int *C;
      int *n;
      int i;
      n = ((int *)(malloc((sizeof(int )))));
      printf("Enter the number of elements of vector : \n");
      scanf("%d",n);
      A = ((int *)(malloc((( *n) * sizeof(int )))));
      B = ((int *)(malloc((( *n) * sizeof(int )))));
      C = ((int *)(malloc((( *n) * sizeof(int )))));
      printf("\nInput Vector1 :\n");
      for (i = 0; i <=  *n - 1; i += 1) {
        A[i] = (2 * i);
        printf("%d ",A[i]);
      }
      printf("\n\nInput Vector2 :\n");
      for (i = 0; i <=  *n - 1; i += 1) {
        B[i] = (3 * i);
        printf("%d ",B[i]);
      }
      ret = clGetPlatformIDs(1,&platform_id,&ret_num_platforms);
      if (ret != CL_SUCCESS) {
        printf("Platform error");
      }
      ret = clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_DEFAULT,1,&device_id,&ret_num_devices);
      if (ret != CL_SUCCESS)
      printf("device err");
      context=clCreateContext(NULL,1,&device_id,NULL,NULL,&ret);
      if (!context)
      printf("context err");
      command_queue = clCreateCommandQueue(context,device_id,0,&ret);
      if (!command_queue)
      printf("command queue error");
      localWorkSize[0] = 16;
      localWorkSize[1] = 16;
      globalWorkSize[0] =1024;
      globalWorkSize[1] =1024;
      cl_buffer0=clCreateBuffer(context, CL_MEM_WRITE_ONLY, MEM_SIZE * sizeof(int), NULL, &ret);
      cl_buffer1=clCreateBuffer(context, CL_MEM_WRITE_ONLY, MEM_SIZE * sizeof(int), NULL, &ret);
      cl_buffer3=clCreateBuffer(context, CL_MEM_WRITE_ONLY, MEM_SIZE * sizeof(int), NULL, &ret);
     cl_buffer2=clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(int), NULL, &ret);
      ret = clEnqueueWriteBuffer(command_queue, cl_buffer0 , CL_TRUE, 0,MEM_SIZE * sizeof(int), A, 0, NULL, NULL);
      ret = clEnqueueWriteBuffer(command_queue, cl_buffer1 , CL_TRUE, 0,MEM_SIZE * sizeof(int), B, 0, NULL, NULL);
      ret = clEnqueueWriteBuffer(command_queue, cl_buffer3 , CL_TRUE, 0,MEM_SIZE * sizeof(int), n, 0, NULL, NULL);
      ret = clEnqueueWriteBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,MEM_SIZE * sizeof(int), C, 0, NULL, NULL);
      fp2 = fopen(fileName2,"r");
      if (!fp2) {
        fprintf(stderr,"Failed");
        exit(1);
      }
      source_str2 = (char*)malloc(MAX_SOURCE_SIZE);
      source_size2 = fread(source_str2,1,MAX_SOURCE_SIZE,fp2);
      fclose(fp2);
      program2 = clCreateProgramWithSource(context, 1, (const char **)&source_str2,(const size_t *)&source_size2, &ret);
      if(!program2)
      printf("error creating program2");
      ret = clBuildProgram(program2, 1, &device_id, NULL, NULL, NULL);
      if (ret)
      printf("error building program2");
      kernel2 = clCreateKernel(program2, "ADD" , &ret);
      ret = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &cl_buffer0);
      ret = clSetKernelArg(kernel2, 1, sizeof(cl_mem), &cl_buffer1);
      ret = clSetKernelArg(kernel2, 2, sizeof(cl_mem), &cl_buffer2);
      ret = clSetKernelArg(kernel2, 3, sizeof(cl_mem), &cl_buffer3);
      ret = clEnqueueNDRangeKernel(command_queue, kernel2, 1, NULL, globalWorkSize, localWorkSize, 0 , NULL , NULL);
      ret = clEnqueueReadBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,MEM_SIZE * sizeof(int), C, 0, NULL, NULL);
      printf("\n\nAddition of vectors :\n");
      for (i = 0; i <=  *n - 1; i += 1) {
        printf("%d ",C[i]);
      }
      clReleaseMemObject(cl_buffer0);
      clReleaseMemObject(cl_buffer1);
      clReleaseMemObject(cl_buffer2);
      clReleaseMemObject(cl_buffer3);
    clReleaseCommandQueue(command_queue);
      clReleaseContext(context);
      return 0; 
    }
     
    Kernel file (kernel.cl) ::
     
    /*
     * Element-wise vector addition: C[i] = A[i] + B[i] for 0 <= i < *n.
     *
     * A and B live in __global (read-only) memory rather than __constant:
     * __constant arguments are limited by CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
     * which the OpenCL specification only guarantees to be 64 KB (16384 ints),
     * so __constant inputs cap the usable vector length. The host side is
     * unaffected: all four arguments are still passed as cl_mem buffers.
     *
     * Work-items whose global id is >= *n do nothing, so the global work size
     * may be rounded up past the element count.
     */
    __kernel void ADD(__global const int *A, __global const int *B, __global int *C, __constant int *n)
    {
      int i = get_global_id(0);
      if (i < *n) {
        C[i] = A[i] + B[i];
      }
    }
    Last edited by khronos; Yesterday at 05:52 AM. Reason: Wrap code in [ CODE ] tags

  4. #4
    Junior Member
    Join Date
    May 2017
    Posts
    3
    Quote Originally Posted by Yogita View Post
    ...
    n = ((int *)(malloc((sizeof(int )))));
    printf("Enter the number of elements of vector : \n");
    scanf("%d",n);
    A = ((int *)(malloc((( *n) * sizeof(int )))));
    B = ((int *)(malloc((( *n) * sizeof(int )))));
    C = ((int *)(malloc((( *n) * sizeof(int )))));
    ...
    ret = clEnqueueWriteBuffer(command_queue, cl_buffer0 , CL_TRUE, 0,MEM_SIZE * sizeof(int), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, cl_buffer1 , CL_TRUE, 0,MEM_SIZE * sizeof(int), B, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, cl_buffer3 , CL_TRUE, 0,MEM_SIZE * sizeof(int), n, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,MEM_SIZE * sizeof(int), C, 0, NULL, NULL);
    ...
    This code is allocating some number of integers based on user input, but then unconditionally writing MEM_SIZE integers to the allocated OpenCL buffers, so it's reading beyond the malloc'd memory. Depending what happens to be beyond the malloc'd memory this could either execute fine.... or crash unpredictably.

    I'd be curious to know if OCLGrind is able to identify these errors:
    https://github.com/jrprice/Oclgrind

    You may also find the Intercept Layer for OpenCL Applications to be helpful:
    https://github.com/intel/opencl-intercept-layer

    I was looking at a call log from the Intercept Layer for OpenCL Applications when I observed the mismatch in the allocated buffer size and the size passed to clEnqueueWriteBuffer.

  5. #5
    Thanks @bashbaug

    Here is the updated code -

    Code :
    #define MAX_SOURCE_SIZE (0x10000)
    #include<stdio.h>
    #include<stdlib.h>
    #include "CL/cl.h" 
     
    int main()
    {
      cl_uint ret_num_platforms;
      cl_uint ret_num_devices;
      cl_platform_id platform_id = NULL;
     
      cl_kernel kernel2 = NULL;
     
      cl_program program2 = NULL;
     
      cl_command_queue command_queue = NULL;
      cl_context context = NULL;
      cl_device_id device_id = NULL;
      cl_int ret;
     
      FILE * fp2;
      char fileName2[]="./vec_add.cl";
     
      int for_var=0;
     
      char * source_str2;
      size_t source_size2;
     
      size_t globalWorkSize[1];
      size_t localWorkSize[1];
      cl_mem cl_buffer3;
      cl_mem cl_buffer2;
      cl_mem cl_buffer1;
      cl_mem cl_buffer0;
      int *A;
      int *B;
      int *C;
      int *n;
      int i;
      n = ((int*)(malloc((sizeof(int)))));
      printf("Enter the number of elements of vector : \n");
      scanf("%d",n);
      A = ((int *)(malloc((( *n) * sizeof(int )))));
      B = ((int *)(malloc((( *n) * sizeof(int )))));
      C = ((int *)(malloc((( *n) * sizeof(int )))));
      printf("\nInput Vector1 :\n");
      for (i = 0; i <=  *n - 1; i += 1) {
        A[i] = (2 * i);
        printf("%d ",A[i]);
      }
      printf("\n\nInput Vector2 :\n");
      for (i = 0; i <=  *n - 1; i += 1) {
        B[i] = (3 * i);
        printf("%d ",B[i]);
      }
      ret = clGetPlatformIDs(1,&platform_id,&ret_num_platforms);
      if (ret != CL_SUCCESS) {
        printf("Platform error");
      }
      ret = clGetDeviceIDs(platform_id,CL_DEVICE_TYPE_GPU,1,&device_id,&ret_num_devices);
     
      if (ret != CL_SUCCESS)
      printf("device err");
     
      context=clCreateContext(NULL,1,&device_id,NULL,NULL,&ret);
      if (!context)
      printf("context err");
     
      command_queue = clCreateCommandQueue(context,device_id,0,&ret);
     
      if (!command_queue)
      printf("command queue error");
     
      cl_buffer0=clCreateBuffer(context, CL_MEM_WRITE_ONLY, (*n) * sizeof(int), NULL, &ret);
     
    cl_buffer1=clCreateBuffer(context, CL_MEM_WRITE_ONLY, (*n) * sizeof(int), NULL, &ret);
     
      cl_buffer3=clCreateBuffer(context, CL_MEM_WRITE_ONLY,  sizeof(int), NULL, &ret);
     
    cl_buffer2=clCreateBuffer(context, CL_MEM_READ_WRITE, (*n) * sizeof(int), NULL, &ret);
     
    ret = clEnqueueWriteBuffer(command_queue, cl_buffer0 , CL_TRUE, 0,(*n) * sizeof(int), A, 0, NULL, NULL);
     
     ret = clEnqueueWriteBuffer(command_queue, cl_buffer1 , CL_TRUE, 0,(*n) * sizeof(int), B, 0, NULL, NULL);
     
     ret = clEnqueueWriteBuffer(command_queue, cl_buffer3 , CL_TRUE, 0, sizeof(int), n, 0, NULL, NULL);
     
     ret = clEnqueueWriteBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,(*n) * sizeof(int), C, 0, NULL, NULL);
     
      fp2 = fopen(fileName2,"r");
      if (!fp2) {
        fprintf(stderr,"Failed");
        exit(1);
      }
      source_str2 = (char*)malloc(MAX_SOURCE_SIZE);
      source_size2 = fread(source_str2,1,MAX_SOURCE_SIZE,fp2);
      fclose(fp2);
     
      program2 = clCreateProgramWithSource(context, 1, (const char **)&source_str2,(const size_t *)&source_size2, &ret);
     
          if(!program2)
      printf("error creating program2");
     
      ret = clBuildProgram(program2, 1, &device_id, NULL, NULL, NULL);
     
      if (ret)
      printf("error building program2");
     
      kernel2 = clCreateKernel(program2, "VEC_ADD" , &ret);
     
      ret = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &cl_buffer0);
     
      ret = clSetKernelArg(kernel2, 1, sizeof(cl_mem), &cl_buffer1);
     
     ret = clSetKernelArg(kernel2, 2, sizeof(cl_mem), &cl_buffer2);
     
      ret = clSetKernelArg(kernel2, 3, sizeof(cl_mem), &cl_buffer3);
     
     localWorkSize[0] = 16;
     
      globalWorkSize[0] = 16400;
     
      ret = clEnqueueNDRangeKernel(command_queue, kernel2, 1, NULL, globalWorkSize, localWorkSize, 0 , NULL , NULL);
     
      ret = clEnqueueReadBuffer(command_queue, cl_buffer2 , CL_TRUE, 0,(*n) * sizeof(int), C, 0, NULL, NULL);
     
    printf("\n\nAddition of vectors :\n");
      for (i = 0; i <=  *n - 1; i += 1) {
        printf("%d ",C[i]);
      }
     
        ret = clFlush(command_queue);
        ret = clFinish(command_queue);
        ret = clReleaseKernel(kernel2);
        ret = clReleaseProgram(program2);
        ret = clReleaseMemObject(cl_buffer0);
        ret = clReleaseMemObject(cl_buffer1);
        ret = clReleaseMemObject(cl_buffer2);
        ret = clReleaseMemObject(cl_buffer3);
        ret = clReleaseCommandQueue(command_queue);
        ret = clReleaseContext(context);
        free(A);
        free(B);
        free(C);
        free(n);
     
      return 0;
     
    }

    Now this runs up to n = 16343. But I want to check the performance for a much larger data set.
    How can I achieve that?
    Last edited by khronos; Yesterday at 05:51 AM. Reason: Wrap code in [ CODE ] tags

Tags for this Thread

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •  
Proudly hosted by Digital Ocean