strange behavior - clEnqueueWriteBuffer/ReadBuffer

When I execute the following kernel without barriers, everything works fine.
If I use barriers, the program hangs at clEnqueueWriteBuffer after kernel execution, and after 30 seconds the display driver restarts.

Is it possible that the work-items wait for each other while accessing the output_mat vector, and that there is a memory conflict or something similar when I read/write a vector on the host from/to the device? I have no more ideas where the problem could be!

please help! thanks!

/*
 * Wave2DEuler
 *
 * Each work-item handles the grid cell (get_global_id(0), get_global_id(1)) of a
 * width x height field stored row-major (index = x + y * width) and advances it
 * for `iterations` time steps inside ONE kernel launch. The three buffers
 * v1/v2/v3 are rotated every step as (current, previous, output).
 *
 * Parameters:
 *   v1, v2, v3 - field buffers; on the first step v3 is read as the current
 *                state, v2 as the previous state and v1 is written.
 *   Floats     - [0] tau, [1] c, [2] h, [3] damp.
 *   Ints       - [0] width, [1] height, [2] border, [3] iterations, [4] log_type.
 *   Act        - list of int triples terminated by -1; each triple is read as
 *                (iteration threshold, row, column).
 *
 * NOTE(review): barrier() synchronizes only the work-items of a single
 * work-group and must be reached by every work-item of that group. Step i+1
 * reads neighbor cells that other work-items wrote in step i, so unless the
 * whole grid is one work-group there is no ordering between steps - confirm
 * the launch configuration, or run one step per kernel launch.
 */
__kernel void
            Wave2DEuler(   __global       float * v1,
                           __global       float * v2,
                           __global       float * v3,                    
                           __global float * Floats,
                           __global int * Ints,
                           __global int * Act
                             ) 
		{
            /* 2D coordinates of the cell owned by this work-item. */
            int global_id_x = get_global_id(0);
            int global_id_y = get_global_id(1);

            /* Per-step roles of v1/v2/v3: state at t, state at t-1, result for t+1. */
            __global float * input_matTm0;                
            __global float * input_matTm1;
            __global float * output_mat;

            /* Unpack scalar parameters. c, h and log_type are read but not used below. */
            float tau = Floats[0];
            float c = Floats[1];
            float h = Floats[2];
            float damp = Floats[3];
            int width = Ints[0];
            int height = Ints[1];
            int border = Ints[2];
            int iterations = Ints[3];
            int log_type = Ints[4];
			int first_input = 1; /* selects the buffer rotation: cycles 1 -> 2 -> 3 -> 1 */
            int left_pos;
            int right_pos;
            int cur_pos;
            int top_pos;
            int bot_pos;
            int padding = 1; /* width of the rim of cells excluded from the stencil update */
            int i = 0;
            /* field_index, iteration_step, val and pos are never used in this kernel. */
            long field_index = 0;
            int actIndex = 0;
			int iteration_step = 5;
			float val = 0;
			int pos = 0;
			
			while(i < iterations) {
					/* Pick the buffer roles for this step. */
					if (first_input == 1)
					{
						input_matTm0 = v3;
						input_matTm1 = v2;
						output_mat = v1;
					}
					else if (first_input == 2)
					{
					   input_matTm0 = v1;
					   input_matTm1 = v3;
					   output_mat = v2; 
					}
					else if (first_input == 3)
					{
						input_matTm0 = v2;
						input_matTm1 = v1;
						output_mat = v3; 
					}
					/* Advance the rotation, wrapping back to 1 after 3. */
					first_input = first_input + 1;
					if(first_input == 4) {
					   first_input = 1;
					}
                
                /* Linear indices of this cell and its four direct neighbors. */
                cur_pos = global_id_x + 0 + (global_id_y + 0) * width;
                left_pos = cur_pos - 1;
                right_pos = cur_pos + 1;
                top_pos = global_id_x + 0 + (global_id_y + 0 - 1) * width;
                bot_pos = global_id_x + 0 + (global_id_y + 0 + 1) * width;

                /* Only interior cells are updated; work-items on the rim (or outside
                   the grid) skip this whole branch, so the branch is divergent. */
                if(global_id_x > padding - 1 && global_id_x < width - padding && global_id_y > padding - 1 && global_id_y < height - padding) {
					/* (left + right + top + bottom - 4 * center) + 2 * center - previous.
					   No tau / c / h factor is applied to the neighbor term. */
					output_mat[cur_pos] = input_matTm0[right_pos] - 4 * input_matTm0[cur_pos] + input_matTm0[left_pos] + input_matTm0[top_pos] + input_matTm0[bot_pos] + 2 * input_matTm0[cur_pos] - input_matTm1[cur_pos];
					
					if(border == 1) {
						/* NOTE(review): a barrier here sits inside the interior-cell branch,
						   which rim work-items never enter; a barrier that is not reached
						   by all work-items of a work-group has undefined behavior. */
						//barrier(CLK_GLOBAL_MEM_FENCE);
						/* Cells adjacent to the rim copy their new value onto the rim cell
						   next to them; only this work-item's own result is read. */
						if(global_id_x == 1) {
							output_mat[left_pos] = output_mat[cur_pos];
						}
						if(global_id_x == width - 2) {
							output_mat[right_pos] = output_mat[cur_pos];
						}
						if(global_id_y == 1) {
							output_mat[top_pos] = output_mat[cur_pos];
						}
						if(global_id_y == height - 2) {
							output_mat[bot_pos] = output_mat[cur_pos];
						}
					}
					
					if(damp != 0) {
						/* NOTE(review): same divergent-branch problem as the barrier above. */
						//barrier(CLK_GLOBAL_MEM_FENCE);
						/* Scale the new value by damp. The rim copies made above were taken
						   before this scaling. */
						output_mat[cur_pos] = output_mat[cur_pos] * damp;
					}	
                } 

				/* Walk the Act triples up to the -1 terminator and write tau * tau at
				   (row Act[k + 1], column Act[k + 2]) when i is past Act[k]. Every
				   work-item runs this loop and writes the same cells.
				   NOTE(review): actIndex is not reset per iteration, so after the first
				   step it already points at the terminator and this loop is skipped;
				   during step 0 the test i > Act[k] only holds for negative thresholds.
				   Confirm whether a reset to 0 was intended. */
				while(Act[actIndex] != -1) {
					if(i > Act[actIndex]) {
						output_mat[width * Act[actIndex + 1] + Act[actIndex + 2]] = tau * tau;
					}
					actIndex = actIndex + 3;
				}
				
				/* This position is outside the divergent branch, so every work-item would
				   reach it once per step. It would still order writes only within a single
				   work-group, not across the whole grid. */
				//barrier(CLK_GLOBAL_MEM_FENCE);
                
                i++;
            }
		}

While I don’t quite understand what the source code is doing, I notice that some of the calls to the function “barrier()” occur inside control flow. Is it possible that some of the work-items execute the barrier and some of them do not? Barriers must be executed either by all work-items or by zero work-items.

Looking a bit more in the source, it seems to be doing the right thing :-/

I have moved the barrier outside the “if” so that all work-items pass it. Same problem!

I changed the hardware from an ATI Radeon 5450 to an ATI 5700 series card (same driver, same ATI Stream SDK) and the problem doesn’t occur any more!

That solved the problem, but what’s the reason it doesn’t work with the ATI Radeon 5450?