// Vertical median filter (window height 2r+1) using shared memory and
// Torben Mogensen's median selection (no full sort of the window needed).
__global__ void kernel_medianV_sh( short *output, int i_dim, int j_dim, int r)
{
  int idc, val, min, max, inf, egal, sup, mxinf, minsup, estim ;

  // coordinates in the block
  int ib = threadIdx.y ;
  int jb = threadIdx.x ;
  int idx_h  = __mul24(ib+r, blockDim.x) + jb ;   // base pixel index in shared memory
  int offset = __mul24(blockDim.x, r) ;           // shared-memory offset of the halo rows
  int j = __mul24(blockIdx.x, blockDim.x) + jb ;  // global column index
  int i = __mul24(blockIdx.y, blockDim.y) + ib ;  // global row index

  // DATA PREFETCHING INTO SHARED MEM
  extern __shared__ int buff[] ;
  buff[ idx_h ] = tex2D(tex_img_ins, j, i) ;

  // threads of the first and last r rows of the block also fetch the top and bottom halos
  if (ib < r)
    buff[ idx_h - offset ] = tex2D(tex_img_ins, j, i-r) ;
  else if (ib >= (blockDim.y-r))
    buff[ idx_h + offset ] = tex2D(tex_img_ins, j, i+r) ;

  __syncthreads() ;

  // TORBEN MOGENSEN SORTING
  // initial search interval: min/max of the 2r+1 column values
  min = max = buff[ ib*blockDim.x + jb ] ;
  for (idc = 0 ; idc < 2*r+1 ; idc++)
    {
      val = buff[ __mul24(ib+idc, blockDim.x) + jb ] ;
      if ( val < min ) min = val ;
      if ( val > max ) max = val ;
    }

  // bisect [min,max] until the median is isolated
  while (1)
    {
      estim = (min+max)/2 ;
      inf = sup = egal = 0 ;
      mxinf  = min ;
      minsup = max ;
      for (idc = 0 ; idc < 2*r+1 ; idc++)
        {
          val = buff[ __mul24(ib+idc, blockDim.x) + jb ] ;
          if ( val < estim )
            {
              inf++ ;
              if ( val > mxinf ) mxinf = val ;
            }
          else if ( val > estim )
            {
              sup++ ;
              if ( val < minsup ) minsup = val ;
            }
          else egal++ ;
        }
      // stop when the median is among {mxinf, estim, minsup}
      if ( (inf <= (r+1)) && (sup <= (r+1)) ) break ;
      else if ( inf > sup ) max = mxinf ;
      else min = minsup ;
    }

  // pick the median among the three remaining candidates
  if ( inf >= r+1 ) val = mxinf ;
  else if ( inf+egal >= r+1 ) val = estim ;
  else val = minsup ;

  // write the result (note the column-major index: j*i_dim + i)
  output[ __mul24(j, i_dim) + i ] = val ;
}
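
// Host-side launch sketch (an assumption, not part of the original listing): the kernel
// reads its input through the texture reference tex_img_ins and needs a dynamic shared
// buffer of blockDim.x * (blockDim.y + 2r) ints (the block's rows plus the two halos).
// The wrapper name, the 8x8 block shape, and the use of the legacy texture-reference API
// are illustrative; image dimensions are assumed to be multiples of the block size.

texture<short, 2, cudaReadModeElementType> tex_img_ins ;  // assumed declaration matching tex2D() above

void median_v_sh(short *h_in, short *h_out, int i_dim, int j_dim, int r)
{
  short *d_in, *d_out ;
  size_t pitch ;

  // pitched allocation for the 2D texture source, plain allocation for the output
  cudaMallocPitch((void**)&d_in, &pitch, j_dim*sizeof(short), i_dim) ;
  cudaMalloc((void**)&d_out, i_dim*j_dim*sizeof(short)) ;
  cudaMemcpy2D(d_in, pitch, h_in, j_dim*sizeof(short),
               j_dim*sizeof(short), i_dim, cudaMemcpyHostToDevice) ;

  // bind the input image to the texture reference read by the kernel
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<short>() ;
  cudaBindTexture2D(NULL, tex_img_ins, d_in, desc, j_dim, i_dim, pitch) ;

  // one thread per pixel
  dim3 block(8, 8) ;
  dim3 grid(j_dim/block.x, i_dim/block.y) ;

  // shared buffer: blockDim.y + 2r rows of blockDim.x ints
  size_t sh_size = block.x * (block.y + 2*r) * sizeof(int) ;
  kernel_medianV_sh<<<grid, block, sh_size>>>(d_out, i_dim, j_dim, r) ;

  cudaMemcpy(h_out, d_out, i_dim*j_dim*sizeof(short), cudaMemcpyDeviceToHost) ;
  cudaUnbindTexture(tex_img_ins) ;
  cudaFree(d_in) ;
  cudaFree(d_out) ;
}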