for( jc=0 ; jc<k ; jc++)
{
int baseRoi = __umul24(ic+threadIdx.y,(bdimX+k-1)) + jc+tidX ;
- float valMask = masque[ __umul24(ic,k)+jc ] ;
+ float valMask = mask[ __umul24(ic,k)+jc ] ;
outval0 += valMask*roi8p[ baseRoi ] ;
outval1 += valMask*roi8p[ baseRoi +1 ] ;
outval2 += valMask*roi8p[ baseRoi +2 ] ;
output[ idx++ ] = outval4 ;
output[ idx++ ] = outval5 ;
output[ idx++ ] = outval6 ;
- output[ idx++ ] = outval7 ;
+ output[ idx ] = outval7 ;
}