Atomic compare and swap

Hi everyone,

I’m trying to use the atom_cmpxchg (OpenCL version is 1.0, GPU is NVIDIA 9600M GT) function, but I cant manage to get the expected result: the swap dont happen.

I tried to find a example (my code is rather long, so I looked for something simpler), but I can’t make it work either (this code can be found on the internet, but I just dont remember the address right now):

The output result:

	Old A = 500
	New A 500

I may not have understood what this function is supposed to do…
Given the prototype of the function atom_cmpxchg (__global int *p, int cmp, int val): I want to swap the value at *p by val if and only if *p == cmp (store the old value of *p if *p!=cmp). Is that right? Or I am missing something?

Thanks for your help!

The kernel:

__kernel void atomiccmpxchg(__global int *old, __global int *new)
{
	__local int v,v1;
        v = 500;
	v1=10;
	*old = atom_cmpxchg(new,v,v1);
}  

The host code:

#include <iostream>
#include <cstdlib>
#include <fstream>
#include <string>
#if defined __APPLE__ || defined (MACOSX)
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif


using namespace std;

void err_check( int err, string err_code ) {
	if ( err != CL_SUCCESS ) {
		cout << "Error: " << err_code << "(" << err << ")" << endl;
		exit(-1);
	}
}

int main()
{
	cl_platform_id platform_id = NULL;
	cl_device_id device_id = NULL;
	cl_context context = NULL;
	cl_command_queue command_queue = NULL;
	cl_mem mobj_a = NULL;
	cl_mem mobj_b = NULL;
	cl_program program = NULL;
	cl_kernel kernel = NULL;
	cl_uint ret_num_devices;
	cl_uint ret_num_platforms;
	cl_int err;
  
	int a, b;
	a = 500;
	b = 500;
  
  
	// Get platform/device information 
	err = clGetPlatformIDs( 1, &platform_id, &ret_num_platforms );
	err_check( err, "clGetPlatformIDs" );
  
	// Get information about the device
	err = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices );
	err_check( err, "clGetDeviceIDs" );
  
	// Create OpenCL Context
	context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &err );
	err_check( err, "clCreateContext" );
  
	// Create Command Queue
	command_queue = clCreateCommandQueue( context, device_id, CL_QUEUE_PROFILING_ENABLE, &err );
	err_check( err, "clCreateCommandQueue" );
  
	// Create memory objects and tranfer the data to memory buffer
	mobj_a = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(int), NULL, &err );
	err = clEnqueueWriteBuffer( command_queue, mobj_a, CL_TRUE, 0, sizeof(int), &a, 0, NULL, NULL );
	err_check( err, "clEnqueueWriteBuffer" );
  
	mobj_b = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(int), NULL, &err );
	err = clEnqueueWriteBuffer( command_queue, mobj_b, CL_TRUE, 0, sizeof(int), &b, 0, NULL, NULL );
	err_check( err, "clEnqueueWriteBuffer" );	
  
	// Read kernel file
	ifstream file("atomic_cmpxchg.cl");
	string prog( istreambuf_iterator<char>( file ), ( istreambuf_iterator<char>() ) );
	const char *source_str = prog.c_str();
  
	// Create Kernel program from the read in source
	program = clCreateProgramWithSource( context, 1, (const char **) &source_str, 0, &err );
	err_check( err, "clCreateProgramWithSource" );
  
	// Build Kernel Program
	err = clBuildProgram( program, 1, &device_id, NULL, NULL, NULL );
  size_t len;
  char buffer[2048];
  clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
                        sizeof(buffer), buffer, &len);
  printf("--- Build log ---
%s
", buffer);
	err_check( err, "clBuildProgram" );
  
  
	// Create OpenCL Kernel
	kernel = clCreateKernel( program, "atomiccmpxchg", &err );
	err_check( err, "clCreateKernel" );
  
	//  Set OpenCL kernel argument
	err = clSetKernelArg( kernel, 0, sizeof( cl_mem ), (void *) &mobj_a );
	err_check( err, "clSetKernelArg" );
	err = clSetKernelArg( kernel, 1, sizeof( cl_mem ), (void *) &mobj_b );
	err_check( err, "clSetKernelArg" );
  
	//  Execute OpenCL kernel in task parallel
	clEnqueueTask( command_queue, kernel, 0, NULL, NULL );
	err_check( err, "clEnqueueTask" );	
  
	//  Read (Transfer result) from the memory buffer
	err = clEnqueueReadBuffer( command_queue, mobj_a, CL_TRUE, 0, sizeof(int), &a, 0, NULL, NULL );
	err = clEnqueueReadBuffer( command_queue, mobj_b, CL_TRUE, 0, sizeof(int), &b, 0, NULL, NULL );
  
	// Free objects
	err = clFlush( command_queue );
	err = clFinish( command_queue );
	err = clReleaseKernel( kernel );
	err = clReleaseProgram( program );
	err = clReleaseMemObject( mobj_a );
	err = clReleaseMemObject( mobj_b );
	err = clReleaseCommandQueue( command_queue );
	err = clReleaseContext( context );
  
	// Display result
	cout << "	Old A = " << a << endl;
	cout << "	New A " << b << endl;
  
  
	return 0;
}

You aren’t checking the return code from clEnqueueTask. Your code is

clEnqueueTask( command_queue, kernel, 0, NULL, NULL );

But it should be

err = clEnqueueTask( command_queue, kernel, 0, NULL, NULL );

Er… well yes, indeed it was missing. It does not help though (I’ve added the return code check, just in case…).
As I was saying, it is not my code, I was just looking for a minimal example to try this function. The swap wont happen, either in this example or in my own program.

Have you enabled atomic operations? Might be disabled?

NOTE: The atomic built-in functions that use the atom_ prefix and are described by the
following extensions
cl_khr_global_int32_base_atomics
cl_khr_global_int32_extended_atomics
cl_khr_local_int32_base_atomics
cl_khr_local_int32_extended_atomics
in sections 9.5 and 9.6 of the OpenCL 1.0 specification are also supported.

Since you first write to local memory when settings v and v1, then read this local memory in the atom_cmpxchg() function, you should insert a local memory fence before calling atom_cmpxchg.

Yep, atomics operations are enabled (I tripled checked)

I’ve added the barrier, no change though, still cant manage to swap the values.
I tried with privates variables instead, so that no barrier is needed, still nothing.

I can’t see any error in your program. In fact, it works fine on a 9600M GS (driver 306.23)

Have you tried with a simple assignment:

*old = *new;
if (*new == v) *new = v1;

Yes it could be done that way, but I need to prevent any workitem to interfer in an other workitem’s instructions (would cause data inconsistency).

That’s really weird…