OpenCL compile error when using constant memory

In the following kernel, if I change the declaration of `ndevice` from `__global` to `__constant`, the OpenCL program fails to build.


/*
 * kernel_fdk: each work-item takes one voxel index (i, j, k) from its 3D
 * global ID, maps the voxel through dev_matrix onto the 2D image, and adds a
 * weighted image sample to that voxel's entry in dev_vol.
 *
 * dev_vol          volume buffer; read and then written at vol_idx
 * dev_img          2D image sampled with read_imagef
 * dev_matrix       12 floats used as three rows of four (3x4, row by row)
 * nrm              x/y/z dotted with the voxel coordinates for the weighting
 * vol_offset       x/y/z added to the scaled voxel index
 * vol_pix_spacing  x/y/z scale applied to the voxel index
 * vol_dim          x and y used as strides for the linear volume index
 * ic               x/y added to the projected coordinates after the divide
 * img_dim          x/y upper bounds used to clip the pixel position
 * sad              scalar used in the weight sad^2 / (sad - s)^2
 * scale            scalar multiplier applied to the contribution
 * offset           x/y/z added to (i, j, k); w subtracted from the linear index
 * ndevice          x/y/z upper bounds on (i, j, k) for this launch
 *
 * NOTE(review): ten of these arguments are already __constant pointers.
 * CL_DEVICE_MAX_CONSTANT_ARGS is only required to be at least 8, so check the
 * device limit before moving ndevice (or anything else) to __constant.
 *
 * NOTE(review): dev_img_sampler is not declared in this block; presumably it
 * is a program-scope sampler defined elsewhere in the file -- confirm.
 */
__kernel void kernel_fdk(
	__global float *dev_vol,
	__read_only image2d_t dev_img,
	__constant float *dev_matrix,
	__constant float4 *nrm,
	__constant float4 *vol_offset,
	__constant float4 *vol_pix_spacing,
	__constant int4 *vol_dim,
	__constant float2 *ic,
	__constant int2 *img_dim,
	__constant float *sad,
	__constant float *scale,
	__constant int4 *offset,
	__global int4 *ndevice
){
	uint i = get_global_id(0);
	uint j = get_global_id(1);
	uint k = get_global_id(2);

	// Work-items outside the ndevice extent have nothing to do
	if (i >= (*ndevice).x || j >= (*ndevice).y || k >= (*ndevice).z)
		return;

	// Index row major into the volume.
	// Built from the un-offset i/j/k, then shifted by offset.w.
	long vol_idx = i + (j * (*vol_dim).x) + (k * (*vol_dim).x * (*vol_dim).y);
	vol_idx -= (*offset).w;

	// From here on i/j/k include the offset and are used for coordinates only
	i += (*offset).x;
	j += (*offset).y;
	k += (*offset).z;

	// Get volume value from global memory
	// (read before the clip test below; unused if the work-item returns early)
	float dev_vol_value = dev_vol[vol_idx];

	// offset volume coords
	// Only x/y/z are assigned; vp.w is left unset and never read.
	float4 vp;
	vp.x = (*vol_offset).x + (i * (*vol_pix_spacing).x);	// Compiler should combine into 1 FMAD.
	vp.y = (*vol_offset).y + (j * (*vol_pix_spacing).y);	// Compiler should combine into 1 FMAD.
	vp.z = (*vol_offset).z + (k * (*vol_pix_spacing).z);	// Compiler should combine into 1 FMAD.

	// matrix multiply
	// Each row is dev_matrix[4r .. 4r+3]; the 4th entry is added as a constant term.
	// Only x/y/z are assigned; ip.w is left unset and never read.
	float4 ip;
	ip.x = (dev_matrix[0] * vp.x) + (dev_matrix[1] * vp.y) + (dev_matrix[2] * vp.z) + dev_matrix[3];
	ip.y = (dev_matrix[4] * vp.x) + (dev_matrix[5] * vp.y) + (dev_matrix[6] * vp.z) + dev_matrix[7];
	ip.z = (dev_matrix[8] * vp.x) + (dev_matrix[9] * vp.y) + (dev_matrix[10] * vp.z) + dev_matrix[11];

	// Change coordinate systems
	// Divide by ip.z, then shift by ic. There is no guard against ip.z == 0.
	ip.x = (*ic).x + ip.x / ip.z;
	ip.y = (*ic).y + ip.y / ip.z;

	// Get pixel location from 2D image
	// Note the swap: ip.x feeds pos.y and ip.y feeds pos.x.
	// convert_int_rtn rounds toward negative infinity.
	int2 pos;
	pos.y = convert_int_rtn(ip.x);
	pos.x = convert_int_rtn(ip.y);

	// Clip against image dimensions
	// Positions outside the image leave dev_vol untouched.
	if (pos.x < 0 || pos.x >= (*img_dim).x || pos.y < 0 || pos.y >= (*img_dim).y)
		return;

	// Get pixel from texture memory
	// Only the .x channel of the returned float4 is used below.
	float4 voxel_data = read_imagef(dev_img, dev_img_sampler, pos);

	// Dot product
	float s = ((*nrm).x * vp.x) + ((*nrm).y * vp.y) + ((*nrm).z * vp.z);

	// Conebeam weighting factor
	// s becomes sad^2 / (sad - s)^2
	s = (*sad) - s;
	s = ((*sad) * (*sad)) / (s * s);

	// Place it into the volume
	// Adds to the value read earlier: old + scale * weight * sample
	dev_vol[vol_idx] = dev_vol_value + ((*scale) * s * voxel_data.x);
}

The errors are as follows:


Build Log:
ptxas application ptx input, line 77; error   : Illegal bank number: 11
ptxas application ptx input, line 90; error   : Illegal bank number: 11
ptxas fatal   : Ptx assembly aborted due to errors
error   : Ptx compilation failed: gpu='sm_11', device code='anonymous_jit_identity'
: Retrieving binary for 'anonymous_jit_identity', for gpu='sm_11', usage mode=''
: Considering profile 'compute_10' for gpu='sm_11' in 'anonymous_jit_identity'
: Control flags for 'anonymous_jit_identity' disable search path
: Ptx binary found for 'anonymous_jit_identity', architecture='compute_10'
: Ptx compilation for 'anonymous_jit_identity', for gpu='sm_11', ocg options=''

Clearly this is a memory limit issue, but how can it be solved? Using less constant memory?

This looks like a bug in your kernel. You are already using 10 kernel arguments declared with the __constant qualifier, and changing ndevice to __constant makes it 11. The OpenCL specification only guarantees that at least 8 __constant arguments are supported per kernel; the actual limit is device-specific. Please query the value of CL_DEVICE_MAX_CONSTANT_ARGS on the implementation you are running on, and then make sure that the number of __constant arguments you declare in the kernel is <= CL_DEVICE_MAX_CONSTANT_ARGS.