// Vertical pass of a separable convolution in which every thread produces
// 8 horizontally adjacent output pixels.
//
// The block first stages a tile of the source texture into dynamically sized
// shared memory (roi8p), then each thread accumulates k taps down its 8
// columns and writes the 8 results to `output`.
//
// Parameters:
//   output - destination buffer; elements i*j_dim + j .. i*j_dim + j + 7 are written.
//   j_dim  - row stride of `output`, in elements (used as i*j_dim + j).
//   r      - row offset subtracted from i when fetching from the texture
//            (presumably the mask radius, i.e. k == 2*r + 1 -- TODO confirm).
//
// Names used but not declared in the visible lines: k, mask, ic, tex_img_inc.
//
// NOTE(review): the leading numerals on each line look like listing line
// numbers; the gaps (2-4, 8, 16, 18-20, ...) suggest that lines holding
// braces, declarations, a loop header and possibly barriers are not shown.
// Comments below are limited to what the visible lines establish.
1 __global__ void kernel_convoSepShx8pV(unsigned char *output, int j_dim, int r)
// One float accumulator per output pixel handled by this thread.
5 float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ;
// Width of one tile row in pixels: 8 pixels per thread along x.
6 int bdimX = blockDim.x<<3 ;
// Column of this thread's first pixel inside a tile row.
7 int tidX = threadIdx.x<<3 ;
9 // absolute coordinates of the base point (j = column of the first of the 8 pixels, i = row)
10 int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x)<<3 ;
11 int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ;
12 // absolute index in the image (linear index of the first output pixel)
13 int idx = __umul24(i,j_dim) + j ;
14 // offset of one ROI row in shared memory
15 int idrow = threadIdx.y*bdimX ;
// Dynamically sized shared tile; the convolution loop below reads rows up to
// threadIdx.y + k - 1, so the launch must supply at least
// (blockDim.y + k - 1) * bdimX bytes.
17 extern __shared__ unsigned char roi8p[];
// Stage this thread's 8 pixels of source row (i - r) into tile row threadIdx.y.
// NOTE(review): `p` is not declared in the visible lines -- presumably a
// `for (int p=0; p<8; p++)` header like the one further down was omitted.
21 roi8p[ idrow + tidX +p ] = tex2D(tex_img_inc, j+p , i-r) ;
// The first k-1 thread rows also stage a second row, blockDim.y rows lower,
// to provide the extra rows the k-tap column sum needs.
// NOTE(review): this only fills tile rows up to 2*blockDim.y - 1, so it
// appears to require k - 1 <= blockDim.y -- confirm against the launch code.
24 if ( threadIdx.y < k-1 )
26 idrow = (threadIdx.y+blockDim.y)*bdimX ;
27 for (int p=0; p<8; p++)
28 roi8p[ idrow + tidX +p ] = tex2D( tex_img_inc, j+p , i+blockDim.y-r ) ;
// NOTE(review): no __syncthreads() is visible between the shared-memory writes
// above and the reads below, which touch rows written by other threads
// (row ic + threadIdx.y). Verify that a barrier exists on an omitted line.
32 // vertical convolution
33 for (ic=0 ; ic<k ; ic++)
// First of this thread's 8 columns in tile row (ic + threadIdx.y).
35 int baseRoi = __umul24(ic+threadIdx.y,bdimX) + tidX ;
// Mask coefficient for tap ic, shared by all 8 columns.
36 float valMask = mask[ ic ] ;
37 outval0 += valMask*roi8p[ baseRoi ] ;
38 outval1 += valMask*roi8p[ baseRoi +1 ] ;
39 outval2 += valMask*roi8p[ baseRoi +2 ] ;
40 outval3 += valMask*roi8p[ baseRoi +3 ] ;
41 outval4 += valMask*roi8p[ baseRoi +4 ] ;
42 outval5 += valMask*roi8p[ baseRoi +5 ] ;
43 outval6 += valMask*roi8p[ baseRoi +6 ] ;
44 outval7 += valMask*roi8p[ baseRoi +7 ] ;
47 // 8 pixels per thread --> global memory
// Each float sum is implicitly converted to unsigned char on store.
// NOTE(review): no clamping/rounding and no bounds guard on i/j are visible
// here -- confirm the grid exactly covers the image and sums stay in [0, 255].
48 output[ idx++ ] = outval0 ;
49 output[ idx++ ] = outval1 ;
50 output[ idx++ ] = outval2 ;
51 output[ idx++ ] = outval3 ;
52 output[ idx++ ] = outval4 ;
53 output[ idx++ ] = outval5 ;
54 output[ idx++ ] = outval6 ;
55 output[ idx++ ] = outval7 ;