kernel freezes video output

Hi,

I wrote a tile based deferred shading kernel, it compiles, but it makes the video output freeze after a few seconds of running with ~70FPS. Only the graphics card is hanging, the system is still responsive through ssh. I’m using Linux Mint 12.1 64 bit with Catalyst 12.1 64 bit, and APP SDK 2.6

here’s the kernel:


    __constant float far = -10000.0f; //far plane distance  
    __constant float near = -1.0f; //near plane distance  
    __constant sampler_t the_sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;  
    __constant float cutoff = 0.25f; //0.005f  
    __constant int attenuation_type = 0; //linear or full attenuation?  
      
    float my_abs( float var ) //these floating point operations aren't supported in opencl 1.1  
    {  
      if ( var < 0 )  
      {  
        return -var;  
      }  
      else  
      {  
        return var;  
      }  
    }  
      
    float my_mix( float x, float y, float weigth )  
    {  
      return x * ( 1.0f - weigth ) + y * weigth;  
    }  
      
    float3 my_reflect( float3 incident, float3 normal )  
    {  
      return incident - 2.0f * dot( normal, incident ) * normal;  
    }  
      
    float3 decode_normals_spheremap( float4 n ) //decode normals from spheremap encoding  
    {  
      float4 nn = n * ( float4 )( 2.0f, 2.0f, 0.0f, 0.0f ) + ( float4 )( -1.0f, -1.0f, 1.0f, -1.0f );  
      float l = dot( nn.xyz, -nn.xyw );  
      nn.z = l;  
      nn.xy *= sqrt( l );  
      return nn.xyz * 2.0f + ( float3 )( 0.0f, 0.0f, -1.0f );  
    }  
      
    float3 decode_linear_depth( float4 linear_depth, float4 position ) //decode linear depth into view space position  
    {  
      return ( float3 )( position.xy * ( far / position.z ), far ) * linear_depth.x;  
    }  
      
    __kernel void main( __read_only image2d_t albedo, //diffuse surface color from the g-buffer  
                        __read_only image2d_t normals, //normals encoded using spheremap encoding  
                        __read_only image2d_t depth, //linear depth  
                        __write_only image2d_t result, //the output buffer that stores lighting data  
                        __global const float* far_plane, //the lower left and upper right corners of the far plane  
                        __global const float* in_view_pos, //view space camera position  
                        __global const float* in_lights, //1024 lights {light pos[3], diffuse_color[3], radius[1], specular intensity[1] }  
                        __global const float* in_num_of_lights, //number of incoming lights (1024)  
                        __global const float* in_projection_matrix ) //the projection matrix is used for frustum culling  
    {  
      /* 
       * Per pixel calculations (global) 
       */  
      
      int2 coords = ( int2 )( get_global_id( 0 ), get_global_id( 1 ) );  
      float4 raw_albedo = read_imagef( albedo, the_sampler, coords );  
      float4 raw_normal = read_imagef( normals, the_sampler, coords ); //this will store the decoded normals  
      
      float4 out_color = ( float4 )( 0.0f ); //this will store the resulting color  
      out_color.w = 1.0f;  
      
      float4 raw_depth; //this will store the decoded view space position  
      int2 global_size = ( int2 )( get_global_size( 0 ), get_global_size( 1 ) );  
      
      /* 
       * Per tile data 
       */  
      
      float3 view_pos; //this will store the view space position (uniform among the workgroups, but stored as local for speedup)  
      int num_of_lights; //num of lights (same here)  
      
      int2 local_coords;  
      int2 local_size;  
      int workgroup_index;  
      
      __local int tile_lights[1024]; //index of the lights visible per tile  
      __local int num_of_tile_lights; //number of lights per tile  
      
      float2 tile_scale; //used for calculating frustum culling, taken from Intel's sample  
      float2 tile_bias;  
      
      float4 column_1;  
      float4 column_2;  
      float4 column_4;  
      
      float4 frustum_planes[6];  
      
      /* 
       * Check for skybox 
       */  
      
      bool early_rejection = ( length( raw_normal.xy ) == 0.0f );  
      
      if ( early_rejection )  
      {  
        out_color = raw_albedo;  
      }  
      else  
      {  
        local_coords = ( int2 )( get_local_id( 0 ), get_local_id( 1 ) );  
        local_size = ( int2 )( get_local_size( 0 ), get_local_size( 1 ) );  
      
        workgroup_index = local_coords.y * local_size.x + local_coords.x;  
      
        float4 ll, ur;  
      
        raw_depth = read_imagef( depth, the_sampler, coords );  
      
        ll = ( float4 )( far_plane[0], far_plane[1], far_plane[2], 1.0f );  
        ur = ( float4 )( far_plane[3], far_plane[4], far_plane[5], 1.0f );  
      
        //texture coordinate [0...1] for input processing  
        float2 texel = ( float2 )(( float )( coords.x ) / ( float )( global_size.x ), ( float )( coords.y ) / ( float )( global_size.y ) );  
      
        raw_depth.xyz = decode_linear_depth( raw_depth, ( float4 )( my_mix( ll.x, ur.x, texel.x ), my_mix( ll.y, ur.y, texel.y ), ll.z, 1.0f ) );  
        raw_normal.xyz = decode_normals_spheremap( raw_normal );  
      
        view_pos = vload3( 0, in_view_pos );  
        num_of_lights = ( int )in_num_of_lights[0];  
      
        //I'm not sure if the maths here is correct due to OpenGL and DirectX using different matrices, but this shouldnt matter  
        tile_scale = ( float2 )( global_size.x, global_size.y ) * ( 1.0f / ( float )( 2.0f * local_size.x ) );  
        tile_bias = tile_scale - ( float2 )( local_coords.x, local_coords.y );  
      
        column_1 = ( float4 )( in_projection_matrix[5] * tile_scale.x, 0.0f, tile_bias.x, 0.0f );  
        column_2 = ( float4 )( 0.0f, -in_projection_matrix[10] * tile_scale.y, tile_bias.y, 0.0f );  
        column_4 = ( float4 )( 0.0f, 0.0f, 1.0f, 0.0f );  
      
        frustum_planes[0] = column_4 - column_1;  
        frustum_planes[1] = column_4 + column_1;  
        frustum_planes[2] = column_4 - column_2;  
        frustum_planes[3] = column_4 + column_2;  
        frustum_planes[4] = ( float4 )( 0.0f, 0.0f, -1.0f, near );  
        frustum_planes[5] = ( float4 )( 0.0f, 0.0f, 1.0f, far );  
      
        for ( int c = 0; c < 4; c++ ) //normalize frustum plane normals  
        {  
          frustum_planes[c].xyz *= 1.0f / length( frustum_planes[c].xyz );  
        }  
      
        /* 
        * Per workgroup (tile) calculations (local) 
        */  
      
        if ( workgroup_index == 0 )  
        {  
          num_of_tile_lights = 0;  
        }  
      }  
      
      barrier( CLK_LOCAL_MEM_FENCE );  
      
      if ( !early_rejection )  
      {  
        for ( int c = workgroup_index; c < num_of_lights; c += local_size.x * local_size.y ) //cull each light per tile, each thread in a tile processes one light  
        {  
          if ( c < num_of_lights )  
          {  
            bool in_frustum = true;  
            float attenuation_end = 0.0f;  
      
            if ( attenuation_type == 0 )  
            {  
              attenuation_end = ( float )( in_lights[c * 8 + 6] ) / ( float )( cutoff ); //radius / cutoff  
            }  
            else  
            {  
              attenuation_end = ( float )( in_lights[c * 8 + 6] ); //radius  
            }  
      
            for ( int d = 0; d < 6; d++ ) //cull each light based on the distance where it will shine and the frustum defined by the tile  
            {  
              float e = dot( frustum_planes[d], ( float4 )( in_lights[c * 8 + 0], in_lights[c * 8 + 1], in_lights[c * 8 + 2], 1.0f ) );  
              in_frustum = in_frustum && ( e >= -attenuation_end );  
            }  
      
            if ( in_frustum ) //if the light is in the frustum, then store its index (if I comment this out, the kernel runs, but doesn't cull lights)  
            {  
              int index = atomic_inc( &num_of_tile_lights );  
              tile_lights[index] = c;  
              if ( num_of_tile_lights != 0 )  
              {  
                out_color = ( float4 )( 0.0f, 1.0f, 0.0f, 1.0f );  
              }  
            }  
          }  
        }  
      }  
      
      barrier( CLK_LOCAL_MEM_FENCE );  
      
      /* 
       * Per light calculations 
       */  
      
      if ( !early_rejection )  
      {  
        for ( int c = 0; c < num_of_tile_lights; c++ ) //draw each light per tile  
        {  
          int index = tile_lights[c]; //get back the light index  
          float3 light_pos = ( float3 )( in_lights[index * 8 + 0], in_lights[index * 8 + 1], in_lights[index * 8 + 2] ); //gather light data using the index  
          float light_radius = in_lights[index * 8 + 6];  
      
          //calculate blinn-phong lighting with custom attenuation  
          float3 light_dir = light_pos - raw_depth.xyz;  
          float distance = length( light_dir );  
          light_dir /= distance;  
      
          float coeff, attenuation;  
      
          if ( attenuation_type == 0 )  
          {  
            coeff = max( distance - light_radius, 0.0f ) / light_radius + 1.0f;  
            attenuation = max(( 1.0f / ( coeff * coeff ) - cutoff ) / ( 1.0f - cutoff ), 0.0f );  
          }  
          else  
          {  
            attenuation = ( light_radius - distance ) / ( light_radius * 0.01f ) * 0.01f;  
          }  
      
          if ( attenuation > 0.0f )  
          {  
            float3 light_diffuse_color = ( float3 )( in_lights[index * 8 + 3], in_lights[index * 8 + 4], in_lights[index * 8 + 5] );  
            float light_specular_power = ( float )in_lights[index * 8 + 3];  
            float3 view_dir = normalize( view_pos - raw_depth.xyz );  
      
            float3 half_vector = ( light_dir + view_dir ) * 0.5f;  
            float n_dot_l = max( dot( raw_normal.xyz, light_dir ), 0.0f );  
      
            out_color.xyz += raw_albedo.xyz * light_diffuse_color * n_dot_l * attenuation;  
            float n_dot_h = pow( max( dot( raw_normal.xyz, half_vector ), 0.0f ), light_specular_power );  
            out_color.xyz += light_diffuse_color * n_dot_h * attenuation;  
          }   
        }  
      }  
      
      if ( coords.x < global_size.x && coords.y < global_size.y )  
      {  
        write_imagef( result, coords, out_color ); //write the calculated light data to the result buffer (texture)  
      }  
    }   

best regards,
Yours3!f