1 __global__ void kernel_convoNonSepSh_8p(unsigned char *output, int j_dim, int r)
5 float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ;
6 int bdimX = blockDim.x<<3 ;
7 int tidX = threadIdx.x<<3 ;
9 // absolute coordinates of packet's base point
10 int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x) << 3 ;
11 int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ;
12 int j0= __umul24(blockIdx.x,blockDim.x)<<3 ; // block's base point
13 int idx = __umul24(i,j_dim) + j ; // absolute index
14 int idrow = threadIdx.y*(bdimX+k-1) ; // line's offset in sh mem
16 // shared memory declaration
17 extern __shared__ unsigned char roi8p[];
20 for (int p=0; p<8; p++)
21 roi8p[ idrow + tidX +p ] = tex2D(tex_img_inc, j-r+p , i-r) ;
24 if ( threadIdx.x < r )
26 roi8p[ idrow + bdimX + threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX+threadIdx.x , i-r ) ;
27 roi8p[ idrow + bdimX + threadIdx.x +r ] = tex2D( tex_img_inc, j0 +bdimX+threadIdx.x , i-r ) ;
30 if ( threadIdx.y < k-1 )
32 idrow = (threadIdx.y+blockDim.y)*(bdimX+k-1) ;
33 for (int p=0; p<8; p++)
34 roi8p[ idrow + tidX +p ] = tex2D( tex_img_inc, j-r+p , i+blockDim.y-r ) ;
36 if ( threadIdx.x < r )
38 roi8p[ idrow + bdimX +threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX +threadIdx.x, i+blockDim.y-r ) ;
39 roi8p[ idrow + bdimX +threadIdx.x +r] = tex2D( tex_img_inc, j0 +bdimX +threadIdx.x, i+blockDim.y-r ) ;
45 for (ic=0 ; ic<k ; ic++)
46 for( jc=0 ; jc<k ; jc++)
48 int baseRoi = __umul24(ic+threadIdx.y,(bdimX+k-1)) + jc+tidX ;
49 float valMask = masque[ __umul24(ic,k)+jc ] ;
50 outval0 += valMask*roi8p[ baseRoi ] ;
51 outval1 += valMask*roi8p[ baseRoi +1 ] ;
52 outval2 += valMask*roi8p[ baseRoi +2 ] ;
53 outval3 += valMask*roi8p[ baseRoi +3 ] ;
54 outval4 += valMask*roi8p[ baseRoi +4 ] ;
55 outval5 += valMask*roi8p[ baseRoi +5 ] ;
56 outval6 += valMask*roi8p[ baseRoi +6 ] ;
57 outval7 += valMask*roi8p[ baseRoi +7 ] ;
60 // multiple output --> global mem
61 output[ idx ] = outval0 ;
62 output[ idx+1 ] = outval1 ;
63 output[ idx+2 ] = outval2 ;
64 output[ idx+3 ] = outval3 ;
65 output[ idx+4 ] = outval4 ;
66 output[ idx+5 ] = outval5 ;
67 output[ idx+6 ] = outval6 ;
68 output[ idx+7 ] = outval7 ;