Code hangs when executed second time

I am naive OpenCL user and I am trying to run an OpenCL code on my AMD device. The program executes successfully when run for first time but fails when executed there after(system hangs) after successful termination of first execution. The program is:

Main.cpp

#include <cstdio>
#include <cstring>
#include <time.h>
#include <inttypes.h>
#include <iostream>
#include <cstdlib>
#include <fstream>
#include <string>

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#define MAX_SOURCE_SIZE (0x100000)

using namespace std;

const int LIST_SIZE = 1;													
char *source_str;
size_t source_size;

cl_platform_id platform_id = NULL;							
cl_device_id device_id = NULL;   
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_event events[2];

cl_context context;
cl_command_queue command_queue;
cl_mem st_mem_obj;							
cl_mem mv_mem_obj;							
cl_program program;
cl_kernel kernel;

cl_int ret;
cl_uint work_dim = 1;

struct ch {
	cl_ulong W;
	cl_ulong B;
};

struct M {
	int a;
	int b;
};

struct MB {
	int size;
	struct M mo[256];
};

int initialize_cl_device() 
{	
	ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);

	if( ret != CL_SUCCESS ) {
		printf( "Error : Unable to get Platform ID
" ); 
		fflush(stdout);
		return 1;
	}

	ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);

	if( ret != CL_SUCCESS ) {
		printf( "Error : Unable to get Device ID
" );
		fflush(stdout);
		return 1;
	}
    	
	context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);					

	if( ret != CL_SUCCESS ) {
		printf( "Error  : Unable to create context
" );
		fflush(stdout);
		return 1;
	}

	command_queue = clCreateCommandQueue(context, device_id, 0, &ret);					
	if( ret != CL_SUCCESS ) {
		printf( "Error  : Unable to create command queue
" );
		fflush(stdout);
		return 1;
	}

	return 0;
}

int initialize_cl(struct ch c, struct M m)
{
	st_mem_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(struct ch), NULL, &ret);		

	ret = clEnqueueWriteBuffer(command_queue, st_mem_obj, CL_TRUE, 0, sizeof(struct ch), &c, 0, NULL, NULL);		
	
	if( ret != CL_SUCCESS ) {
			printf( "Error : copy struct ch to memory buffer 
" );
			fflush(stdout);
			return 1;
	}

	mv_mem_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(struct M), NULL, &ret);		

	ret = clEnqueueWriteBuffer(command_queue, mv_mem_obj, CL_TRUE, 0, sizeof(struct M), &m, 0, NULL, NULL);		

	if( ret != CL_SUCCESS ) {
			printf( "Error : copy struct M to memory buffer 
" );
			fflush(stdout);
			return 1;
	}

        ifstream file("kernel.cl");
        string prog( istreambuf_iterator<char>( file ), ( istreambuf_iterator<char>() ) );
        source_str = (char*)prog.c_str();



	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret);	   

	
	ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);				

	if( ret != CL_SUCCESS ) {
		printf( "Error : Building program
" );	
		fflush(stdout);

		size_t size;
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,0, NULL, &size);	
							  
		char * log = (char*)malloc(size);
		clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,size, log, NULL);	
							  
		printf("%s
", log);
		fflush(stdout);		
	}
	return 0;
}

int run_cl_kernel(struct ch *c, struct M* m)
{
	kernel = clCreateKernel(program, "board", &ret);		
	    	
	if( ret != CL_SUCCESS ) {
		printf( "Error : Unable to create kernel
" );
		fflush(stdout);
		return 1;
	}

   	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&st_mem_obj);

	if(ret != CL_SUCCESS)  { 
		printf("Error: Setting kernel argument. (struct ch)
");
		fflush(stdout);	
		return 1;
	}

	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&mv_mem_obj);

	if(ret != CL_SUCCESS)  { 
		printf("Error: Setting kernel argument. (struct M)
");
		fflush(stdout);	
		return 1;
	}

	size_t global_item_size = 1;
	size_t local_item_size = 1; 					

	fflush(stdout);
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, &events[0]);
	fflush(stdout);

	if( ret != CL_SUCCESS ) {
		printf( "Error : execute opencl kernel" );fflush(stdout);
		return 1;
	}
   
	fflush(stdout);
	ret = clWaitForEvents(1, &events[0]);
	fflush(stdout);

	 if(ret != CL_SUCCESS)  { 
		printf("Error: Waiting for kernel run to finish. (clWaitForEvents)
");
		fflush(stdout);
		return 1;
	}

	ret = clReleaseEvent(events[0]);

	if(ret != CL_SUCCESS) { 
			printf("Error: Release event object. (clReleaseEvent)
");
			fflush(stdout);
			return 1;
	}

    	ret = clEnqueueReadBuffer(command_queue, st_mem_obj, CL_TRUE, 0, sizeof(struct ch), c, 0, NULL, &events[1]);

	ret = clEnqueueReadBuffer(command_queue, mv_mem_obj, CL_TRUE, 0, sizeof(struct M), m, 0, NULL, &events[1]);	
	

	ret = clWaitForEvents(1, &events[1]);

	if(ret != CL_SUCCESS) { 
			printf("Error: Waiting for read buffer call to finish.(st_mem_obj )
");
			fflush(stdout);
			return 1;
	}
    	
	ret = clReleaseEvent(events[1]);

	if(ret != CL_SUCCESS) { 
			printf("Error: Release event object.(st_mem_obj)
");
			fflush(stdout);
			return 1;
	}

	return 0;
		
}

int clean_res() {

	ret = clReleaseKernel(kernel);
    	if(ret != CL_SUCCESS) {
		printf("Error: In clReleaseKernel 
");
		fflush(stdout);
		return 1; 
	}

    	ret = clReleaseCommandQueue(command_queue);
	if(ret != CL_SUCCESS) {
		printf("Error: In clReleaseCommandQueue
");
		fflush(stdout);
		return 1;
	}

    	ret = clReleaseProgram(program);
    	if(ret != CL_SUCCESS) {
		printf("Error: In clReleaseProgram
");
		fflush(stdout);
		return 1; 
	}

	ret = clReleaseMemObject(st_mem_obj);
	if(ret != CL_SUCCESS) {
		printf("Error: In clReleaseMemObject (st_mem_obj)
");
		fflush(stdout);
		return 1; 
	}

    	ret = clReleaseMemObject(mv_mem_obj);
    	if(ret != CL_SUCCESS) {
		printf("Error: In clReleaseMemObject (mv_mem_obj)
");
		fflush(stdout);
		return 1; 
	}

	ret = clReleaseContext(context);
    	if(ret != CL_SUCCESS) {
		printf("Error: In clReleaseContext
");
		fflush(stdout);
		return 1;
	}

	return 0;
}

int main()
{
	struct ch c;
	FILE* filep;
	time_t t1, t2;
	double t;
	struct M m;
	int status;
	int i;

	filep = fopen( "kernel.cl", "r" );
	if (!filep) {
		exit( 1 );
	}
	
	source_str = (char*) malloc( MAX_SOURCE_SIZE );				
	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, filep );

	fclose( filep );

	c.W = 0x0EABC017ABBC8008UL;
	c.B = 0x0000C01700BC0880UL;
	i = 0;

	while( i < 1) {
	
		i++;
		printf("From main: %ld.
", c.W);

		time( &t1 );
	
		status = initialize_cl_device();
        	if (status) {
                	printf( "Error : intitialize_cl_device
" );
	        }

        	status = initialize_cl(c, m);

	        if( status ) {
        	        printf( "Error in kernel initialisation
" );
	        }

        	status = run_cl_kernel(&c, &m );
	        if( status ) {
        	        printf( "Error in kernel running
" );
	        }

	        status = clean_res();
	        if (status) {
	                printf( "Error : Releasing resources
" );
	        }

		time( &t2 );
	
		t = difftime( t2, t1 ); 
		printf( "Time taken by GPU is: %0.3f ", t );
		printf ("Returned: %d %d
", m.a,  m.b);
	}

	return 0;
}

Kernel.cl :

#pragma OPENCL EXTENSION cl_amd_printf : enable

struct __attribute__ ((aligned (128))) ch {
	unsigned long W;			
	unsigned long B;			
};

struct __attribute__ ((aligned (128))) M {
	int a;			
	int b;			
};

struct __attribute__ ((aligned (128))) MB {			
	int size;			
	struct M mo[256] __attribute__ ((packed));	
};

struct MB gen( unsigned long A )
{	
	int i,j;
	unsigned long t;
	struct MB l;
	struct M mt;

	l.size = -1;

	for( i = 0; i < 64; i++ ) {
		if( 1UL << i & A ) {
			t = 0xFFFFFFFFFFFFFFFFUL;
			for( j = 0; j < 64; j++ ) {
				if( 1UL << j & t ) {
					mt.a = i;
					mt.b = j;
					l.size++;
					l.mo[l.size] = mt;
				}
			}
		}
	}
	return l;
}

__kernel void board( __global struct ch* cc, __global struct M* mt)
{
	int sid = get_global_id(0);
	struct ch c = cc[sid];
	struct M m;
	struct MB mb;	

	mb = gen(c.W);

}

I have to reboot my system every second time I run this code. Also, code hangs in first run when I add one more call to “gen” function in kernel. Why is this happening :?: Please help (I am using 64 bit CentOS with AMD-APP-2.6 and ATI Catalyst 12.2 drivers).