/*
 * Applies a 3x3 mask to the image bound to tex_img_inc; each thread produces
 * 8 horizontally adjacent output pixels of one row.
 *
 * For an output pixel (x, y) the value written is
 *     sum over r,c in [0,3) of  masque[r*3 + c] * in(x - 1 + c, y - 1 + r)
 * converted to unsigned char on store.
 *
 * Thread mapping: the x thread index selects a group of 8 columns
 * (first column j = 8 * global x index), the y thread index selects the row i.
 *
 * Parameters:
 *   output  destination buffer, indexed as output[i * j_dim + column]
 *   j_dim   number of elements per output row (used as row stride and as the
 *           column bound for the stores)
 *
 * NOTE(review): tex_img_inc and masque are declared outside this block; the
 * texture's out-of-range addressing mode (columns j-1 / j+8, rows i-1 / i+1
 * at the image borders) and the element type of masque should be confirmed there.
 * NOTE(review): no row count is passed in, so rows cannot be bounds-checked
 * here; the launch grid must not extend past the last image row - confirm
 * with the caller.
 */
__global__ void kernel_convoGene8x8pL3( unsigned char  *output, int j_dim )
{
  const int k = 3 ;            // mask side length
  unsigned char pix ;
  // acc[n] accumulates the result for column j+n
  float acc[8] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f } ;

  // absolute coordinates of this thread's base (leftmost) pixel
  int j = ( blockIdx.x * blockDim.x + threadIdx.x ) << 3 ;
  int i = ( blockIdx.y * blockDim.y + threadIdx.y ) ;

  // The whole 8-pixel group lies right of the row end: nothing to compute.
  // (The kernel has no barrier, so an early exit is safe.)
  if (j >= j_dim)
	return ;

  for (int ic = 0 ; ic < k ; ic++)
	{
	  const int row = ic * k ;      // offset of mask row ic inside masque
	  const int y   = i - 1 + ic ;  // image row sampled by this mask row

	  // center pixels: each fetched pixel feeds three neighbouring outputs
	  pix = tex2D(tex_img_inc, j+1, y) ;
	  acc[0] += masque[ row +2 ]*pix ;
	  acc[1] += masque[ row +1 ]*pix ;
	  acc[2] += masque[ row    ]*pix ;
	  pix = tex2D(tex_img_inc, j+2, y) ;
	  acc[1] += masque[ row +2 ]*pix ;
	  acc[2] += masque[ row +1 ]*pix ;
	  acc[3] += masque[ row    ]*pix ;
	  pix = tex2D(tex_img_inc, j+3, y) ;
	  acc[2] += masque[ row +2 ]*pix ;
	  acc[3] += masque[ row +1 ]*pix ;
	  acc[4] += masque[ row    ]*pix ;
	  pix = tex2D(tex_img_inc, j+4, y) ;
	  acc[3] += masque[ row +2 ]*pix ;
	  acc[4] += masque[ row +1 ]*pix ;
	  acc[5] += masque[ row    ]*pix ;
	  pix = tex2D(tex_img_inc, j+5, y) ;
	  acc[4] += masque[ row +2 ]*pix ;
	  acc[5] += masque[ row +1 ]*pix ;
	  acc[6] += masque[ row    ]*pix ;
	  pix = tex2D(tex_img_inc, j+6, y) ;
	  acc[5] += masque[ row +2 ]*pix ;
	  acc[6] += masque[ row +1 ]*pix ;
	  acc[7] += masque[ row    ]*pix ;

	  // end zones: pixels at the edges of the group feed fewer outputs
	  pix = tex2D(tex_img_inc, j, y) ;
	  acc[0] += masque[ row +1 ]*pix ;
	  acc[1] += masque[ row    ]*pix ;
	  pix = tex2D(tex_img_inc, j-1, y) ;
	  acc[0] += masque[ row    ]*pix ;

	  pix = tex2D(tex_img_inc, j+7, y) ;
	  acc[6] += masque[ row +2 ]*pix ;
	  acc[7] += masque[ row +1 ]*pix ;
	  pix = tex2D(tex_img_inc, j+8, y) ;
	  acc[7] += masque[ row +2 ]*pix ;
	}

  // multiple output: store the 8 results, skipping columns past the row end
  // so a partial last group cannot spill into the next row or off the buffer.
  // Plain multiplication in size_t replaces __umul24, which only uses the low
  // 24 bits of each operand.
  unsigned char *dst = output + (size_t)i * (size_t)j_dim + j ;
#pragma unroll
  for (int c = 0 ; c < 8 ; c++)
	{
	  if (j + c < j_dim)
		dst[c] = acc[c] ;
	}
}