}
}
__syncthreads();
-
+
// computations
for (ic=0 ; ic<k ; ic++)
for( jc=0 ; jc<k ; jc++)
{
int baseRoi = __umul24(ic+threadIdx.y,(bdimX+k-1)) + jc+tidX ;
- float valMask = masque[ __umul24(ic,k)+jc ] ;
+ float valMask = mask[ __umul24(ic,k)+jc ] ;
outval0 += valMask*roi8p[ baseRoi ] ;
outval1 += valMask*roi8p[ baseRoi +1 ] ;
outval2 += valMask*roi8p[ baseRoi +2 ] ;
}
// multiple output --> global mem
- output[ idx ] = outval0 ;
- output[ idx+1 ] = outval1 ;
- output[ idx+2 ] = outval2 ;
- output[ idx+3 ] = outval3 ;
- output[ idx+4 ] = outval4 ;
- output[ idx+5 ] = outval5 ;
- output[ idx+6 ] = outval6 ;
- output[ idx+7 ] = outval7 ;
+ output[ idx++ ] = outval0 ;
+ output[ idx++ ] = outval1 ;
+ output[ idx++ ] = outval2 ;
+ output[ idx++ ] = outval3 ;
+ output[ idx++ ] = outval4 ;
+ output[ idx++ ] = outval5 ;
+ output[ idx++ ] = outval6 ;
+ output[ idx ] = outval7 ;
}