Failed to build CL program on Radeon 4670

Hello,

I am using a 2009 iMac with OS X 10.6.4 and a Radeon 4670 video card. When trying to build a kernel (see the full program below) I get the error CL_BUILD_PROGRAM_FAILURE. The same kernel builds successfully on a MacBook with OS X 10.6.4 and a GF9600. If you remove any line from the kernel (even the empty function declaration), it builds without any problems. Does anyone know what could be causing this?

The program in question (yes, it was originally an FFT calculation; I just threw out all the parts that did not affect the error):


#include <string.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <OpenCL/opencl.h>


int main (int argc, char * const argv[]) {

	cl_ulong gMemSize;
	int numIter = 1;
	int batchSize = 1;
	cl_device_id device_ids[16];

	FILE *paramFile;

	cl_int err;
	unsigned int num_devices;

	err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, sizeof(device_ids), device_ids, &num_devices);
	if(err)
	{
		printf("clGetComputeDevice failed
");
		return -1;
	}

	cl_device_id device_id = NULL;
	cl_context context;

	unsigned int i;
	for(i = 0; i < num_devices; i++)
	{
	    cl_bool available;
	    err = clGetDeviceInfo(device_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL);
	    if(err)
	    {
	         printf("Cannot check device availability of device # %d
", i);
	    }

	    if(available)
	    {
	        device_id = device_ids[i];
	        break;
	    }
	    else
	    {
	        char name[200];
	        err = clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(name), name, NULL);
	        if(err == CL_SUCCESS)
	        {
	             printf("Device %s not available for compute
", name);
	        }
	        else
	        {
	             printf("Device # %d not available for compute
", i);
	        }
	    }
	}

	if(!device_id)
	{
	    printf("None of the devices available for compute ... aborting test
");
	    return -1;
	}

	context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	if(!context || err)
	{
		printf("clCreateContext failed
");
		return -1;
	}

	const char *src = "\
\
__kernel void fftInv(__global float *in)\
{\
	__local float smem[768];\
	size_t smem_store_index, smem_load_index;\
	float2 a[8] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};\
	int thread_id = get_local_id(0);\
	int blocks_num = get_num_groups(0);\
	int jj = thread_id >> 4;\
	smem_store_index = jj;\
\
	a[0] = in[0];\
	a[1] = in[64];\
	a[2] = in[128];\
	a[3] = in[192];\
	a[4] = in[256];\
	a[5] = in[320];\
\
	smem_load_index = thread_id;\
\
	a[0].x = smem[smem_load_index + 0];\
	a[1].x = smem[smem_load_index + 1];\
	a[2].x = smem[smem_load_index + 6];\
	a[3].x = smem[smem_load_index + 7];\
	a[4].x = smem[smem_load_index + 12];\
	a[5].x = smem[smem_load_index + 13];\
	a[6].x = smem[smem_load_index + 18];\
	a[7].x = smem[smem_load_index + 19];\
\
	barrier(CLK_LOCAL_MEM_FENCE);\
\
	smem[smem_store_index + 0] = a[0].y;\
	smem[smem_store_index + 3] = a[1].y;\
	smem[smem_store_index + 6] = a[2].y;\
	smem[smem_store_index + 9] = a[3].y;\
	smem[smem_store_index + 12] = a[4].y;\
	smem[smem_store_index + 15] = a[5].y;\
	smem[smem_store_index + 18] = a[6].y;\
	smem[smem_store_index + 21] = a[7].y;\
\
	barrier(CLK_LOCAL_MEM_FENCE);\
\
	smem[smem_load_index + 0] = a[0].x;\
	smem[smem_load_index + 2] = a[2].x;\
	smem[smem_load_index + 4] = a[4].x;\
	smem[smem_load_index + 6] = a[6].x;\
	smem[smem_load_index + 8] = a[1].x;\
	smem[smem_load_index + 10] = a[3].x;\
	smem[smem_load_index + 12] = a[5].x;\
	smem[smem_load_index + 14] = a[7].x;\
}\
\
__kernel void fftFwd(__global float2 *in, __global float2 *out, int S)\
{\
\
}\
	";

	cl_program program;
	program = clCreateProgramWithSource(context, 1, (const char**)&src, NULL, &err);
	if(!program || err)
	{
		printf("clCreateProgramWithSource failed
");
		return -1;
	}

	err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
	if (err != CL_SUCCESS)
	{
		char *build_log;
		char devicename[200];
		size_t log_size;

		printf("Failed to build program; error %d
", err);

		err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
		if(err != CL_SUCCESS)
		{
			printf("Failed to get build log size
");
			return -1;
		}

		build_log = (char *) malloc(log_size + 1);
		err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
		if(err != CL_SUCCESS)
		{
			printf("Failed to get build log
");
			return -1;
		}

		err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(devicename), devicename, NULL);
		if(err != CL_SUCCESS)
		{
			printf("Failed to get device info
");
			return -1;
		}

		printf("FFT program build log on device %s:
", devicename);
		printf("%s
", build_log);
		free(build_log);
	}

	clReleaseContext(context);

	return 0;
}

Output that I get:


$ ./test
Failed to build program; error -11
FFT program build log on device Radeon HD 4670: