// kernel_convoSepShx8pV: vertical pass of a separable convolution, staged through
// shared memory, with each thread producing 8 horizontally consecutive output pixels.
//
// Parameters
//   output : destination buffer of unsigned char; each thread stores its 8 results
//            starting at linear index i*j_dim + j.
//   j_dim  : multiplier applied to the row index i when forming that linear index
//            (presumably the image width / row stride in pixels -- confirm with caller).
//   r      : radius of the 1D filter; the filter length is k = 2*r + 1.
//
// Indexing, as written in the code
//   j = (blockIdx.x*blockDim.x + threadIdx.x) << 3   first of the 8 columns of this thread
//   i =  blockIdx.y*blockDim.y + threadIdx.y         row of this thread
//   bdimX = 8*blockDim.x is the length of one row of the shared tile roi8p, and
//   tidX  = 8*threadIdx.x is this thread's column offset inside that row.
//
// Shared-memory tile (extern __shared__ unsigned char roi8p[])
//   "top block"   : every thread stores 8 texels of image row i-r into tile row threadIdx.y.
//   "bottom block": threads with threadIdx.y < k-1 additionally store 8 texels of image row
//                   i+blockDim.y-r into tile row threadIdx.y+blockDim.y.
//   The highest tile row written is therefore blockDim.y + 2*r - 1, which requires
//   2*r <= blockDim.y for all 2*r halo rows to be loaded, and a dynamic shared-memory
//   size of at least (blockDim.y + 2*r) * 8 * blockDim.x bytes at launch.
//   NOTE(review): neither condition is checked here -- confirm at the launch site.
//   __syncthreads() separates these stores from the convolution that follows.
//
// External symbols
//   tex_img_inc is read with tex2D() but is not declared in this block; presumably a
//   file-scope 2D texture reference bound to the input image -- confirm.
//   __umul24 is the CUDA 24-bit integer multiply intrinsic; its operands are assumed
//   to fit in 24 bits.
//
// NOTE(review): the whole kernel is on a single physical line, so the first "//"
//   turns everything after it into a comment. The original line breaks must be
//   restored before this can compile.
// NOTE(review): the "vertical convolution" loop is truncated. The text reads
//   "for (ic=0 ; ic global mem": the loop condition, the loop body that accumulates
//   into outval0..outval7 (and presumably uses jc and the filter coefficients) and the
//   start of the following comment are missing. This looks like text between '<' and
//   '>' was stripped as markup. Recover it from the original source; do not guess.
// NOTE(review): no bounds check on i or j is visible before the 8 stores to output;
//   the grid apparently must cover the image exactly -- confirm.
// NOTE(review): the accumulators are float but initialised with double literals (0.0);
//   0.0f would avoid the implicit conversion.
// NOTE(review): p is declared at function scope and declared again by the
//   "for (int p=0; ...)" in the bottom-block loop; the inner declaration shadows the outer.
__global__ void kernel_convoSepShx8pV(unsigned char *output, int j_dim, int r) { int ic, jc, p; int k = 2*r+1 ; float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ; int bdimX = blockDim.x<<3 ; int tidX = threadIdx.x<<3 ; // absolute coordinates of the base point int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x)<<3 ; int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ; // absolute index in the image int idx = __umul24(i,j_dim) + j ; // offset of one ROI row in shared memory int idrow = threadIdx.y*bdimX ; extern __shared__ unsigned char roi8p[]; // top block for (p=0; p<8; p++) roi8p[ idrow + tidX +p ] = tex2D(tex_img_inc, j+p , i-r) ; // bottom block if ( threadIdx.y < k-1 ) { idrow = (threadIdx.y+blockDim.y)*bdimX ; for (int p=0; p<8; p++) roi8p[ idrow + tidX +p ] = tex2D( tex_img_inc, j+p , i+blockDim.y-r ) ; } __syncthreads(); // vertical convolution for (ic=0 ; ic global mem output[ idx++ ] = outval0 ; output[ idx++ ] = outval1 ; output[ idx++ ] = outval2 ; output[ idx++ ] = outval3 ; output[ idx++ ] = outval4 ; output[ idx++ ] = outval5 ; output[ idx++ ] = outval6 ; output[ idx++ ] = outval7 ; }