offline kernel compilation SPIR linux

Hi,

I would like to compile kernel offline using SPIR, and then load the binary.
Several entry in this forum touch the subject ( https://www.khronos.org/message_boards/showthread.php/9482-Offline-Binary-Kernel-Generation , https://www.khronos.org/message_boards/showthread.php/8763-Offline-Kernel-Compiling-in-Ubuntu , https://www.khronos.org/message_boards/showthread.php/5796-offline-compilation ), but none of them answers my questions.

I followed the instructions given in GitHub - KhronosGroup/SPIR to build the SPIR generator using Clang. I the following snippets of code, clang refers to this version build from KhronosGroup github.

I tried on a very simple kernel, but failed to use it offline.

Kernel Code

I have a very simple kernel, in file bidon.cl :


__kernel void vecAdd(__global float* a)
{
    int gid = get_global_id(0);
    a[gid] += a[gid];
}

Online Compilation

I can call it online, using the following file offline.cpp :


#include <stdio.h>
#include <stdlib.h>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#define MEM_SIZE (32)
#define MAX_SOURCE_SIZE (0x100000)

int main()
{
	cl_platform_id platform_id = NULL;
	cl_device_id device_id = NULL;
	cl_context context = NULL;
	cl_command_queue command_queue = NULL;
	cl_mem memobj = NULL;
	cl_program program = NULL;
	cl_kernel kernel = NULL;
	cl_uint ret_num_devices;
	cl_uint ret_num_platforms;
	cl_int ret;

	float mem[MEM_SIZE];

	FILE *fp;
	const char fileName[] = "./bidon.cl";
	size_t source_size;
	char *source_str;
	cl_int i;

	/* Load kernel source code */
	fp = fopen(fileName, "r");
	if (!fp) {


	}
	source_str = (char *)malloc(MAX_SOURCE_SIZE);
	source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose(fp);

	/*Initialize Data */
	for (i = 0; i < MEM_SIZE; i++) {
          mem[i] = i;
	}

	/* Get platform/device information */
	ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
	ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

	/* Create OpenCL Context */
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

	/* Create Command Queue */
	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

	/* Create memory buffer*/
	memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(float), NULL, &ret);

	/* Transfer data to memory buffer */
	ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL);

	/* Create Kernel program from the read in source */
	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);

	/* Build Kernel Program */
	ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

	/* Create OpenCL Kernel */
	kernel = clCreateKernel(program, "vecAdd", &ret);
        fprintf(stdout, "err: %d
", ret);

	/* Set OpenCL kernel argument */
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);

	size_t global_work_size[3] = {MEM_SIZE, 0, 0};
	size_t local_work_size[3] = {MEM_SIZE, 0, 0};

	/* Execute OpenCL kernel */
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);

	/* Transfer result from the memory buffer */
	ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL);

	/* Display result */
	for (i=0; i < MEM_SIZE; i++) {
          fprintf(stdout, "mem[%d]: %f
", i, mem[i]);
	}

	/* Finalization */
	ret = clFlush(command_queue);
	ret = clFinish(command_queue);
	ret = clReleaseKernel(kernel);
	ret = clReleaseProgram(program);
	ret = clReleaseMemObject(memobj);
	ret = clReleaseCommandQueue(command_queue);
	ret = clReleaseContext(context);

	free(source_str);

	return 0;
}

I compile this by running :

clang -I/usr/local/cuda-7.0/include online.cpp -lOpenCL

I run it and it works properly.

Offline Compilation

I compile the kernel with the follwing command :

clang -cc1 -emit-llvm-bc -triple spir64-unknown-unknown -include opencl_spir.h -o bidon.bc bidon.cl

This creates the file bidon.bc

I have a file offline.cpp (which is almost exactly the same as online.cpp, the only major difference is that clCreateProgramWithSource() is replaced with clCreateProgramWithBinary() :


#include <stdio.h>
#include <stdlib.h>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#define MEM_SIZE (32)
#define MAX_BINARY_SIZE (0x100000)

int main()
{
	cl_platform_id platform_id = NULL;
	cl_device_id device_id = NULL;
	cl_context context = NULL;
	cl_command_queue command_queue = NULL;
	cl_mem memobj = NULL;
	cl_program program = NULL;
	cl_kernel kernel = NULL;
	cl_uint ret_num_devices;
	cl_uint ret_num_platforms;
	cl_int ret;

	float mem[MEM_SIZE];

	FILE *fp;
	char fileName[] = "./bidon.bc";
	size_t binary_size;
	char *binary_buf;
	cl_int binary_status;
	cl_int i;

	/* Load kernel binary */
	fp = fopen(fileName, "r");
	if (!fp) {
          fprintf(stderr, "Could not read the kernel file: %s
", fileName);
          exit(1);
	}
	binary_buf = (char *)malloc(MAX_BINARY_SIZE);
	binary_size = fread(binary_buf, 1, MAX_BINARY_SIZE, fp);
	fclose(fp);

	/* Initialize input data */
	for (i = 0; i < MEM_SIZE; i++) {
          mem[i] = i;
	}

	/* Get platform/device information */
	ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
	ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);

	/* Create OpenCL context*/
	context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

	/* Create command queue */
	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

        /* Create memory buffer */
	memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(float), NULL, &ret);

	/* Transfer data over to the memory buffer */
	ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL);

	/* Create kernel program from the kernel binary */
	program = clCreateProgramWithBinary(context, 1, &device_id, (const size_t *)&binary_size,
	(const unsigned char **)&binary_buf, &binary_status, &ret);

        fprintf(stdout, "clCreateProgramWithBinary err: %d
", ret);
        fprintf(stdout, "binary_status: %d
", binary_status);

        ret = clBuildProgram( program, 1, &device_id, NULL, NULL, NULL );
        fprintf(stdout, "clBuildProgram: %d
", ret);


	/* Create OpenCL kernel */
	kernel = clCreateKernel(program, "vecAdd", &ret);
	printf("clCreateKernel err:%d
", ret);

	/* Set OpenCL kernel arguments */
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);

	size_t global_work_size[3] = {MEM_SIZE, 0, 0};
	size_t local_work_size[3] = {MEM_SIZE, 0, 0};

	/* Execute OpenCL kernel */
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL);

	/* Copy result from the memory buffer */
	ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL);

	/* Display results */
	for (i=0; i < MEM_SIZE; i++) {
	  fprintf(stdout, "mem[%d]: %f
", i, mem[i]);
	}

	/* Finalization */
	ret = clFlush(command_queue);
	ret = clFinish(command_queue);
	ret = clReleaseKernel(kernel);
	ret = clReleaseProgram(program);
	ret = clReleaseMemObject(memobj);
	ret = clReleaseCommandQueue(command_queue);
	ret = clReleaseContext(context);

	free(binary_buf);

	return 0;
}

I build this with the following command :

clang -I/usr/local/cuda-7.0/include offline.cpp -lOpenCL

But when I run it I get error codes.

clCreateProgramWithBinary() return code is (0) : OK
clBuildProgram() returns an error code (-42) : CL_INVALID_BINARY (I tried both with and without the call to clBuildProgram() : https://www.khronos.org/message_boards/showthread.php/9482-Offline-Binary-Kernel-Generation )
clCreateKernel() returns an error code (-45) : CL_INVALID_PROGRAM_EXECUTABLE

Did I do something wrong ? Any suggestion about what should I try to make it work ?

What is the platform/device that you’re targeting? Not all support SPIR consumption at present, so you’d need to check the platform/device extensions list for cl_khr_spir.

I work on ubuntu 15.04

I assumed that nvidia would support spir, which is wrong.

When I probe for CL_DEVICE_EXTENSIONS, I get the following result :


=== 1 OpenCL platform(s) found: ===
  -- 0 --
  PROFILE = FULL_PROFILE
  VERSION = OpenCL 1.2 CUDA 7.5.15
  NAME = NVIDIA CUDA
  VENDOR = NVIDIA Corporation
  EXTENSIONS = cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_copy_opts 
=== 3 OpenCL device(s) found on platform:
  -- 0 --
  DEVICE_NAME = GeForce GTX 960
  DEVICE_VENDOR = NVIDIA Corporation
  DEVICE_VERSION = OpenCL 1.2 CUDA
  DRIVER_VERSION = 352.30
  DEVICE_MAX_COMPUTE_UNITS = 8
  DEVICE_MAX_CLOCK_FREQUENCY = 1228
  DEVICE_GLOBAL_MEM_SIZE = 2147287040
  DEVICE_EXTENSIONS = cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_copy_opts  cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 
  -- 1 --
  DEVICE_NAME = GeForce GTX 690
  DEVICE_VENDOR = NVIDIA Corporation
  DEVICE_VERSION = OpenCL 1.2 CUDA
  DRIVER_VERSION = 352.30
  DEVICE_MAX_COMPUTE_UNITS = 8
  DEVICE_MAX_CLOCK_FREQUENCY = 1019
  DEVICE_GLOBAL_MEM_SIZE = 2146762752
  DEVICE_EXTENSIONS = cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_copy_opts  cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 
  -- 2 --
  DEVICE_NAME = GeForce GTX 690
  DEVICE_VENDOR = NVIDIA Corporation
  DEVICE_VERSION = OpenCL 1.2 CUDA
  DRIVER_VERSION = 352.30
  DEVICE_MAX_COMPUTE_UNITS = 8
  DEVICE_MAX_CLOCK_FREQUENCY = 1019
  DEVICE_GLOBAL_MEM_SIZE = 2147287040
  DEVICE_EXTENSIONS = cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_copy_opts  cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 

Looks like spir is not supported on GeForce GTX 690 nor GeForce GTX 960.

Yes, unfortunately NVIDIA currently do not support SPIR consumption. This may change in the future as the new SPIR-V specification is now a core feature in OpenCL 2.1.

I believe both Intel and AMD do support SPIR.

[QUOTE=jprice;39444]Yes, unfortunately NVIDIA currently do not support SPIR consumption. This may change in the future as the new SPIR-V specification is now a core feature in OpenCL 2.1.

I believe both Intel and AMD do support SPIR.[/QUOTE]

Hi jprice. You are indeed perfectly right. I have tried on an AMD gpu. Its DEVICE_EXTENSIONS contain cl_khr_spir, and the code I described in my first message does work as expected (with and without offline compilation).
Thank you.