Hello World not working in Linux

After writing a large program that works in Windows but not in Linux, I decided to debug with a Hello World program. I realize there are better ways to write this program; I am just trying to debug my hardware / install.

This runs correctly in Windows using CUDA 5.5 and the Intel OpenCL libs. I have a CentOS 6.4 server I would like to run on as well. I installed CUDA 5.5 and the Intel XE SDK. Neither will run Hello World. If I choose the NVIDIA GPU, I get a -45 (CL_INVALID_PROGRAM_EXECUTABLE) error on clCreateKernel. If I choose to run on my Intel Xeon Phi, I actually get the message “1 warning generated.”; however, I cannot figure out how to view the warning!

Can anyone provide insight, or suggest what to debug from here?


#include <utility>
#include <CL/cl.h> 
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <iterator>
#include <cstring>


// OpenCL C source for the "hello" kernel, assembled from adjacent string
// literals. Every literal ends in an explicit "\n" so the device compiler sees
// one source line per literal; the #pragma in particular has to end its line.
// The kernel-side string is written with "\\n" so that the OpenCL source itself
// contains the two-character escape \n inside its own string literal.
const char* kernel_text =
	"#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n" \
	"__constant char hw[] = \"Hello World\\n\";\n"
	"__kernel void hello(__global char * out){ \n"
	"size_t tid = get_global_id(0);	\n"
	"out = hw[tid];}\n";

int	main(void)
{
	cl_int err;
	cl_uint platformCount, devCount;
	cl_int status;
	char nameBuf[1000];

	// Get list of OpenCL compatible platforms
	err = clGetPlatformIDs(0, NULL, &platformCount);  

	// Allocate memory, get list of platform handles
	cl_platform_id *platforms =
		(cl_platform_id *) malloc(platformCount*sizeof(cl_platform_id));

	err = clGetPlatformIDs(platformCount, platforms, NULL);

	// List platform(s) and vendor(s)
	for(unsigned int i = 0; i < platformCount; i++)
	{    
		err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR,sizeof(nameBuf), nameBuf, NULL);

		std::cerr << "Platform " << i << " is by: " << nameBuf << std::endl;
	}

	std::string selection = "";
	std::cout << "Please select a platform (0 to " << platformCount-1 << ")" << std::endl;
	getline(std::cin, selection);

	// Let user make selection
	cl_platform_id selectedPlatform = platforms[stoi(selection)];

	// allocate memory, get list of device handles in platform
	err = clGetDeviceIDs(platforms[stoi(selection)], CL_DEVICE_TYPE_ALL, 0, NULL, &devCount);
	cl_device_id *devices =
		(cl_device_id *) malloc(devCount*sizeof(cl_device_id));
	err = clGetDeviceIDs(platforms[stoi(selection)], CL_DEVICE_TYPE_ALL, devCount, devices, NULL);
	cl_device_id device = devices[0];

	// Create platform context and don't ask why the platform ID 
	// and properties are the same value with different typecasts...
	cl_context_properties cprops[3] = 
	{CL_CONTEXT_PLATFORM, (cl_context_properties)selectedPlatform, 0};
	cl_context context = clCreateContext(cprops, 1, &device, NULL, NULL, &status);

	// create a command queue
	cl_command_queue_properties queueProps = 0;
	cl_command_queue queue = clCreateCommandQueue(context, device, queueProps, &status);

	// Setup result buffer for OpenCL
	char * outH = new char[hw.length()+1];
	cl_mem outCL = clCreateBuffer(context,
		CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
		hw.length()+1,
		outH,
		&err);
	clFinish(queue);


	//const char *srcStr = kernel_text;
	const char **str = &kernel_text;//&srcStr;
	// create an OpenCL program (may have multiple kernels)
	size_t kernelSize[] = {strlen(kernel_text)};
	cl_program program = clCreateProgramWithSource(context, 1, str, kernelSize, &status);
	std::cout << "sts " << status << std::endl;

	// build it
	status = clBuildProgram(program, devCount, &devices[0], NULL, NULL, NULL);
	if (status != 0) {
		// Determine the size of the log
		size_t log_size;
		clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

		// Allocate memory for the log
		char *log = (char *) malloc(log_size);

		// Get the log
		clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

		// Print the log
		printf("%s
", log);
		free(log);
	}

	// fish the kernel out of the program
	const char* kernelName = "hello";
	cl_kernel kernel = clCreateKernel(program, kernelName, &status);
	std::cout << "sts " << status << std::endl;
	clFinish(queue);

	// Device computations use a command queue. This is a 1-to-1 mapping
	// with the device, associated with a context.
	size_t wgSize[]= {1};
	size_t block_size[] = {1};
	clSetKernelArg(kernel, 0, hw.length(), outCL);
	clEnqueueNDRangeKernel(queue,kernel,1,NULL,block_size,wgSize,0,NULL,NULL);
	clFinish(queue);

	clEnqueueReadBuffer(queue,outCL,CL_TRUE,0,hw.length()+1,outH,0,NULL,NULL);
	std::cout << outH;

	std::cout << "Press any key to exit..." << std::endl;
	getline(std::cin, selection);
	//free(kernel_text);

	return 0;
}


There are several problems:

  • The last line of your kernel should be: out[tid]=hw[tid]
  • arg_size in clSetKernelArg is the size of the argument's type — for a memory object that is sizeof(cl_mem) — and arg_value must be a pointer to the cl_mem handle: clSetKernelArg(kernel, 0, sizeof(cl_mem), &outCL)
  • The global work-size block_size should contain hw.length()+1, not 1