Problem with bitwise or

Hello again,
I have the following code:

#include <oclUtils.h>
#include <stdio.h>
#include "Timer.h"


int main()
{
   cl_platform_id platform;
   

   size_t szGlobalWorkSize;
   size_t szLocalWorkSize=64;
   szGlobalWorkSize = 64;
   printf("Global Work Size = %d 
",(int)szGlobalWorkSize);
   
   cl_long message = 0x0123456789abcdef;
   cl_long key = 0x133457799BBCDFF1;
   cl_long encrypted_message=0x0;
   Timer t1;
   cl_int err = oclGetPlatformID(&platform);
   if(err != CL_SUCCESS)
       printf("O eroare la citirea platformei= %s
",oclErrorString(err));
    
   cl_device_id gpuDevice;
   err = clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU,1,&gpuDevice,NULL);
   if(err != CL_SUCCESS)
        printf("O eroare la conectarea la dispozitivul de calcul= %s
",oclErrorString(err));
   char cBuffer[1024];
   err = clGetDeviceInfo(gpuDevice,CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(cBuffer),&cBuffer,NULL);
   if(err != CL_SUCCESS)
        printf("O eroare la interogarea dispozitivului de calcul= %s
",oclErrorString(err));
   printf("Max items: %s
",cBuffer);

   cl_context gpuContext;
   gpuContext = clCreateContext(0,1,&gpuDevice,NULL,NULL,&err);
   if(err != CL_SUCCESS)
        printf("O eroare la crearea contextului= %s
",oclErrorString(err));
   
   cl_command_queue gpuCommandQueue;
   gpuCommandQueue = clCreateCommandQueue(gpuContext,gpuDevice,0,&err);
   if(err != CL_SUCCESS)
        printf("O eroare la crearea cozii de comenzi= %s
",oclErrorString(err));

   cl_program program;
   size_t kernelLength ;
   const char *kernelStr =oclLoadProgSource("kernel.cl","",&kernelLength);
   
   program = clCreateProgramWithSource(gpuContext,1,&kernelStr,&kernelLength,&err);
   err = clBuildProgram(program,0,NULL,NULL,NULL,NULL);
   if(err != CL_SUCCESS)
        printf("O eroare la compilarea kernelului= %s
",oclErrorString(err));
	

   size_t retval;
   clGetProgramBuildInfo(program,gpuDevice,CL_PROGRAM_BUILD_LOG,0,NULL,&retval);
   char * buid_log = new char[retval+1];
   clGetProgramBuildInfo(program,gpuDevice,CL_PROGRAM_BUILD_LOG,retval,buid_log,NULL);
   buid_log[retval]='\0';
   printf("%s
",buid_log);

   cl_kernel kernel = clCreateKernel(program,"encrypt",&err);
 
   cl_mem buf_a = clCreateBuffer(gpuContext,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(cl_long), &message,&err);
   cl_mem buf_b = clCreateBuffer(gpuContext,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(cl_long), &key,&err);
   cl_mem buf_c = clCreateBuffer(gpuContext,CL_MEM_WRITE_ONLY,sizeof(cl_long),&encrypted_message,&err);

   err = clSetKernelArg(kernel,0,sizeof(cl_mem),(void *)&buf_a);
   err = clSetKernelArg(kernel,1,sizeof(cl_mem),(void *)&buf_b);
   err = clSetKernelArg(kernel,2,sizeof(cl_mem),(void *)&buf_c);
   clFlush(gpuCommandQueue);
   t1.start();
   
   err = clEnqueueNDRangeKernel(gpuCommandQueue,kernel,1,NULL,&szGlobalWorkSize,&szLocalWorkSize,0,0,0);
   clEnqueueReadBuffer(gpuCommandQueue,buf_c,CL_TRUE,0, sizeof(cl_long),&encrypted_message,0,0,0);
   
   printf("Time: %f
",t1.getElapsedTimeInMilliSec());
   printf("%ll 
",encrypted_message);
   clReleaseMemObject(buf_a);
   clReleaseMemObject(buf_b);
 

}

and the kernel code is



unsigned long permute_pc1(unsigned long src,int tid){
	unsigned long dst = 0;
	int srcPos = 0;
	unsigned char permutation[] = {
														57, 49, 41, 33, 25, 17, 9,
													    1,  58, 50, 42, 34, 26, 18,
													    10, 2,  59, 51, 43, 35, 27,
													    19, 11, 3,  60, 52, 44, 36,
													    63, 55, 47, 39, 31, 23, 15,
														7,  62, 54, 46, 38, 30, 22,
													    14, 6,  61, 53, 45, 37, 29,
													    21, 13, 5,  28, 20, 12, 4
													 };
	if(tid<56)
		{
			srcPos = 64 - permutation[tid];
			dst = (src>>srcPos & 0x01)<<(55-tid);
		}
	return dst;
}

__kernel void encrypt(__global unsigned long *message, __global unsigned long *key, __global unsigned long *encrypted)
                     {
                         unsigned int tid = get_global_id(0);
						 __local unsigned long cheie;
						 cheie = key[0];
						 unsigned long bit = permute_pc1(cheie,tid);
						 __local unsigned long result[64];
						 result[tid] = 0;
						 result[tid] = bit;
						 encrypted[0] = encrypted[0]|result[tid];
						
}

I want to do a simple parallel permutation according to the permutation matrix. All things go accordingly to purpose but when I do

encrypted[0] = encrypted[0]|result[tid];

encrypted is 0;
but if I do


if(tid==0)//or another value
encrypted[0] = encrypted[0]|result[tid];

encrypted has the designated bit set or unset according to the test values

Any hint on how to resolve this problem would be greatly appreciated!

Not sure exactly what you’re expecting here: that statement encrypted[0] = … will be executed by 64 threads simultaneously. As such, you can’t expect to get any useful value out of it. You will at best get the result of one valid calculation from some seemingly-random thread.

This is why checking tid makes it work.

You need to serialise the or somehow. You probably want something like:


barrier(CLK_LOCAL_MEM_FENCE);

if (tid == 0) {
   for (int i=1;i<64;i++) {
     bit |= result[i];
   }
  encrypted[0] = bit;
}

Although there are other ways to do it which might be more efficient, such as using atomic_or locally, or using a ‘parallel prefix sum’ type thing.

Thank you for the reply. The sugested solution works and I understand why, now. But I tried to improve things with atomic_or for 64 bits long operands. I added the pragma specifying the extension to be used and I replaced your solution with something like this:


 atomic_or(encrypted,bit);

but I get this error :

Error: Cannot yet select: 0x4f9bbc0: i64,ch = AtomicLoadOr 0x4f9c3b8, 0x4f9c550, 0x4c03898<Volatile LDST8[%def_encrypted]> [ORD=27] [ID=14]

I have enumerated my device extension and int64_base_atomics aren’t among them:

CL_DEVICE_EXTENSIONS: cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_d3d9_sharing cl_nv_d3d10_sharing cl_khr_d3d10_sharing cl_nv_d3d11_sharing cl_nv_compiler_options cl_nv_device_att
ribute_query cl_nv_pragma_unroll  cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics