- // 1 pixel par thread --> global mem
- output[ idx ] = outval0 ;
- output[ idx+1 ] = outval1 ;
- output[ idx+2 ] = outval2 ;
- output[ idx+3 ] = outval3 ;
- output[ idx+4 ] = outval4 ;
- output[ idx+5 ] = outval5 ;
- output[ idx+6 ] = outval6 ;
- output[ idx+7 ] = outval7 ;
+ // 8 pixels per thread --> global mem (the eight post-increments leave idx advanced by 8 after these stores)
+ output[ idx++ ] = outval0 ;
+ output[ idx++ ] = outval1 ;
+ output[ idx++ ] = outval2 ;
+ output[ idx++ ] = outval3 ;
+ output[ idx++ ] = outval4 ;
+ output[ idx++ ] = outval5 ;
+ output[ idx++ ] = outval6 ;
+ output[ idx++ ] = outval7 ;