Kernel always returns an array filled with a single value

Hello, I’m having trouble reading the values back after my workers are done. I don’t get any errors but the entire array is always filled with a single value. My kernel takes a char* of random bytes as input (works as expected) and an int* for output.

/*
 * skein: each work-item hashes one 128-byte record of `input` with
 * Skein-1024 and stores countbits() of the digest in `scores`.
 *
 * input  - concatenated 128-byte messages; work-item i reads bytes
 *          [i*128, i*128 + 127].
 * scores - one int per work-item; scores[i] receives the value returned
 *          by countbits(&hashval, &goal).
 *
 * NOTE(review): `goal`, countbits() and the Skein1024_* routines are defined
 * outside this excerpt; their signatures are assumed to take private
 * (unqualified) pointers, as the original calls imply - confirm.
 */
__kernel void skein(__global char* input,
	__global int* scores)
{
	int i = get_global_id(0);

	/*
	 * Copy this work-item's record into private memory. The original code
	 * cast the byte VALUE input[i*128] to a pointer, so the hash read from a
	 * bogus address; and a __global address cannot be cast to a private
	 * pointer in OpenCL 1.x, so the data has to be copied rather than aliased.
	 */
	u08b_t msg[128];
	for (int k = 0; k < 128; k++) {
		msg[k] = (u08b_t)input[i*128 + k];
	}

	u08b_t hashval[128] = {0};
	Skein1024_Ctxt_t ctx;
	Skein1024_Init(&ctx, 1024);
	Skein1024_Update(&ctx, msg, 128);
	Skein1024_Final(&ctx, hashval);

	int cc = countbits(&hashval,&goal);
	//printf("%d, %d\n",i,cc);
	scores[i] = cc;
}

When printing from inside the kernel I get the expected output. However, back on the host, after reading the scores buffer back, all the values in it are the same.

Here is all the code I use for setting up, filling and reading back the data (with error checking removed for brevity):


// Number of 128-byte messages hashed per kernel launch (one work-item each).
#define DATA_SIZE (4096)

// Host-side staging: DATA_SIZE messages of 128 bytes, and one int score per message.
char* data = new char[DATA_SIZE * 128];
int* score = new int[DATA_SIZE];
cl_mem input;
cl_mem outputscores;

unsigned int count = DATA_SIZE;
memset(score,0,DATA_SIZE * sizeof(int));

// Device buffers: read-only message input, write-only score output.
input = clCreateBuffer(context,  CL_MEM_READ_ONLY,
			 sizeof(char) * 128 * count, NULL, NULL);
outputscores = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
			sizeof(int) * count, NULL, NULL);

// Fill every message with pseudo-random bytes.
for(int i=0;i<DATA_SIZE * 128;i++){
	  data[i] = (unsigned char)(rand() % RAND_MAX);
}

// Blocking upload of the messages; err should be checked after every call
// (the checks are omitted in this excerpt).
err = clEnqueueWriteBuffer(commands, input,
			     CL_TRUE, 0, sizeof(char) * 128 * count,
			     data, 0, NULL, NULL);

err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputscores);

// Largest work-group size this kernel can use on this device.
err = clGetKernelWorkGroupInfo(kernel, device_id,
			CL_KERNEL_WORK_GROUP_SIZE,
			sizeof(local), &local, NULL);

global = count;
// In OpenCL 1.x the global size must be an exact multiple of the local size;
// otherwise the enqueue fails with CL_INVALID_WORK_GROUP_SIZE, the kernel
// never runs, and the readback below returns stale data. When the queried
// size does not divide the global size, pass NULL so the runtime picks one.
err = clEnqueueNDRangeKernel(commands, kernel,
			       1, NULL, &global,
			       (local != 0 && global % local == 0) ? &local : NULL,
			       0, NULL, NULL);

// Wait for the kernel to complete before reading results.
clFinish(commands);

// Blocking readback: one int score per work-item.
err = clEnqueueReadBuffer( commands, outputscores,
		CL_TRUE, 0, sizeof(int) * count,
		score, 0 ,NULL, NULL);

I would expect at the end “score” to be filled with the same data I see printed from the kernel but I guess I’m missing something or made some stupid mistake. Any help would be awesome, thanks.

I haven’t figured out what was wrong, but the problem only appears when it is being run in debug mode. Running in release everything works correctly.