// Per-thread index setup (fragment of a kernel body; the enclosing function
// header is not visible in this excerpt).
//   i   : row index     = blockIdx.y * blockDim.y + threadIdx.y
//   j0  : block base    = (blockIdx.x * blockDim.x) * 8   (the <<3 is a multiply by 8)
//   idx : linear offset = i * j_dim + j
// __umul24 is the CUDA 24-bit unsigned multiply intrinsic, so the products are
// only meaningful while both operands fit in 24 bits.
// NOTE(review): `j` is used for idx but only `j0` is computed in the visible
// lines; presumably j is derived from j0 and threadIdx.x elsewhere -- confirm.
// NOTE(review): the factor of 8 presumably means each thread covers 8
// consecutive columns -- confirm against the rest of the kernel.
int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ;
int j0= __umul24(blockIdx.x,blockDim.x)<<3 ; // block's base point
int idx = __umul24(i,j_dim) + j ; // absolute index
// NOTE(review): the three statements below repeat the three above verbatim.
// If both copies really live in the same scope this is a redeclaration of
// i, j0 and idx and will not compile; presumably they belong to two separate
// kernels (or this is an excerpt artifact) -- confirm.
int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ;
int j0= __umul24(blockIdx.x,blockDim.x)<<3 ; // block's base point
int idx = __umul24(i,j_dim) + j ; // absolute index
// Accumulate one weighted term into each of three running outputs:
//   outvalN += valMask * roi8p[baseRoi + N]   for N = 0, 1, 2
// The same valMask factor is applied to three consecutive roi8p elements
// starting at baseRoi.
// NOTE(review): valMask, roi8p and baseRoi are defined outside this excerpt;
// their types and memory spaces cannot be determined from here.
outval0 += valMask*roi8p[ baseRoi ] ;
outval1 += valMask*roi8p[ baseRoi +1 ] ;
outval2 += valMask*roi8p[ baseRoi +2 ] ;
// NOTE(review): verbatim repeat of the three accumulations above. If nothing
// updates valMask / baseRoi between the two groups, every term is added
// twice; presumably these come from different unrolled iterations or
// different kernels -- confirm.
outval0 += valMask*roi8p[ baseRoi ] ;
outval1 += valMask*roi8p[ baseRoi +1 ] ;
outval2 += valMask*roi8p[ baseRoi +2 ] ;
// Store accumulated results to consecutive elements of `output`; idx is
// post-incremented after every store, so each write lands one element after
// the previous one.
// NOTE(review): no bounds check on idx is visible in this excerpt; presumably
// the surrounding code guards the tail of the row/array -- confirm.
output[ idx++ ] = outval4 ;
output[ idx++ ] = outval5 ;
output[ idx++ ] = outval6 ;
// NOTE(review): verbatim repeat of the three stores above. If executed in the
// same scope, outval4..outval6 are written a second time to the next three
// elements (idx has already advanced by 3); presumably these lines belong to
// a different kernel or are an excerpt artifact -- confirm.
output[ idx++ ] = outval4 ;
output[ idx++ ] = outval5 ;
output[ idx++ ] = outval6 ;