__global__ void kernel_medianV_sh( short *output, int i_dim, int j_dim, int r)
{
  
  int idc, val ;                        // window loop index and current pixel value
  int min, max, estim ;                 // current search interval and its midpoint
  int inf, egal, sup, mxinf, minsup ;   // counts below/equal/above estim, and the closest values bracketing it

  //coordinates in the block
  int ib = threadIdx.y ;
  int jb = threadIdx.x ;
  int idx_h = __mul24(ib+r,blockDim.x) +jb; // index of this thread's pixel in shared memory, shifted down by the r halo rows
  int offset = __mul24(blockDim.x,r) ;      // number of shared-memory elements in one r-row halo band
  
  //global coordinates of the pixel in the image
  int j = __mul24(blockIdx.x,blockDim.x) + jb ; 
  int i = __mul24(blockIdx.y,blockDim.y) + ib ;
  
  //      DATA PREFETCHING INTO SHARED MEM
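  /* Shared-memory layout (one tile per block): blockDim.x columns by
     (blockDim.y + 2*r) rows of int. Rows r .. r+blockDim.y-1 hold the
     block's own pixels; the first r and last r rows hold the vertical
     halo needed by each thread's 2r+1 window. */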
  extern __shared__ int buff[] ;	       
  buff[ idx_h ] = tex2D(tex_img_ins, j, i) ;   // each thread fetches its own pixel
				  
  if (ib < r)                          // threads in the first r rows also fetch the top halo
    {
      buff[ idx_h - offset ] = tex2D(tex_img_ins, j, i-r) ;
    }
  else if (ib >= (blockDim.y-r))       // threads in the last r rows also fetch the bottom halo
    {
      buff[ idx_h + offset ] = tex2D(tex_img_ins, j, i+r) ;
    }
  
  __syncthreads() ;

  //      TORBEN MOGENSEN MEDIAN SELECTION
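  /* Torben Mogensen's method finds the median of the 2r+1 window values
     without sorting them: it bisects the value interval [min, max] and, on
     each pass over the window, counts the values strictly below (inf),
     strictly above (sup) and equal to (egal) the current estimate, then
     tightens the interval until the median is bracketed. Only one window
     value is held in a register at a time, which keeps register pressure low. */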
  min = max = buff[ ib*blockDim.x +jb] ;   // initialise the search interval with the topmost pixel of the window
  
  // first pass: actual min and max of the 2r+1 window values (shared rows ib .. ib+2r)
  for (idc= 0 ; idc< 2*r+1 ; idc++ )
	{
	  val = buff[ __mul24(ib+idc, blockDim.x) +jb ] ;
	  if ( val  < min ) min = val ;
	  if ( val  > max ) max = val ;
	}
  
  while (1)
	{  
	  estim = (min+max)/2 ;   // candidate value: midpoint of the current interval
	  inf = sup = egal = 0  ;
	  mxinf = min ;           // largest value found below estim
	  minsup= max ;           // smallest value found above estim
	  for (idc =0; idc< 2*r+1 ; idc++)
		{
		  val = buff[ __mul24(ib+idc, blockDim.x) +jb ] ;
		  if( val < estim )
			{
			  inf++;
			  if( val > mxinf) mxinf = val ;
			} else if (val > estim)
			{
			  sup++;
			  if( val < minsup) minsup = val ;
			} else egal++ ;
		}
	  if ( (inf <= (r+1))&&(sup <=(r+1)) ) break ;   // median is bracketed by mxinf, estim and minsup
	  else if (inf>sup) max = mxinf ;                // more values below estim than above: shrink the interval from the top
	  else min = minsup ;                            // otherwise shrink it from the bottom
	}
  
  // final choice among the three candidates around the middle rank (r+1)
  if ( inf >= r+1 ) val = mxinf ;
  else if (inf+egal >= r+1) val = estim ;
  else val = minsup ;
  
  output[ __mul24(j, i_dim)  +i  ] =  val ;   // store the median (note the j*i_dim + i indexing)
}
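
/* A minimal host-side launch sketch, not part of the original listing.
   Assumptions: the legacy texture-reference API is available (pre-CUDA 12);
   tex_img_ins is declared at file scope as texture<short, 2, cudaReadModeElementType>;
   i_dim and j_dim are the image height and width, in that order, and are
   multiples of the block size; d_out is a device buffer of i_dim*j_dim shorts.
   Names such as BLOCK_W, BLOCK_H, h_in, d_out and median_v_host are hypothetical. */
#define BLOCK_W 8
#define BLOCK_H 8

void median_v_host(const short *h_in, short *d_out, int i_dim, int j_dim, int r)
{
  // Copy the input image into a CUDA array and bind the texture to it.
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<short>();
  cudaArray *a_in = NULL;
  cudaMallocArray(&a_in, &desc, j_dim, i_dim);
  cudaMemcpy2DToArray(a_in, 0, 0, h_in, j_dim * sizeof(short),
                      j_dim * sizeof(short), i_dim, cudaMemcpyHostToDevice);
  cudaBindTextureToArray(tex_img_ins, a_in);

  // One thread per output pixel; the halo only extends vertically, so the
  // dynamic shared buffer holds blockDim.x * (blockDim.y + 2r) ints.
  dim3 block(BLOCK_W, BLOCK_H);
  dim3 grid(j_dim / BLOCK_W, i_dim / BLOCK_H);
  size_t sh_size = BLOCK_W * (BLOCK_H + 2 * r) * sizeof(int);
  kernel_medianV_sh<<<grid, block, sh_size>>>(d_out, i_dim, j_dim, r);
  cudaDeviceSynchronize();

  cudaUnbindTexture(tex_img_ins);
  cudaFreeArray(a_in);
}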