X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/blobdiff_plain/1b4c95e988aa2d0c5d7f6cf87750ea4214dc8171..HEAD:/BookGPU/Chapters/chapter4/code/convoSepShH.cu~ diff --git a/BookGPU/Chapters/chapter4/code/convoSepShH.cu~ b/BookGPU/Chapters/chapter4/code/convoSepShH.cu~ index 8f3e73d..19f533e 100644 --- a/BookGPU/Chapters/chapter4/code/convoSepShH.cu~ +++ b/BookGPU/Chapters/chapter4/code/convoSepShH.cu~ @@ -2,28 +2,28 @@ __global__ void kernel_convoSepShx8pH(unsigned char *output, int j_dim, int r) { int ic, jc, p; int k = 2*r+1 ; - float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ; - int bdimX = blockDim.x<<3 ; - int tidX = threadIdx.x<<3 ; + float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0 ; + float outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ; + int bdimX = blockDim.x<<3 ; // all packets width + int tidX = threadIdx.x<<3 ; // one packet offset - // coordonnees absolues du point de base + // absolute coordinates of one packet base point int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x)<<3 ; int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ; - int j0= __umul24(blockIdx.x,blockDim.x)<<3 ; - int idx = __umul24(i,j_dim) + j ; // indice dans l'image + int j0= __umul24(blockIdx.x,blockDim.x)<<3 ; + // absolute index in the image + int idx = __umul24(i,j_dim) + j ; - - // chargement en smem + // offset of one ROI row in shared memory int idrow = threadIdx.y*(bdimX+k-1) ; extern __shared__ unsigned char roi8p[]; - // bloc 0 (a gauche) + // top left block for (p=0; p<8; p++) roi8p[ idrow + tidX +p ] = tex2D(tex_img_inc, j-r+p , i) ; - - // a droite - if ( threadIdx.x < r ) //...ou plutot ce qu'il en manque + // top right block + if ( threadIdx.x < r ) { roi8p[ idrow + bdimX + threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX+threadIdx.x , i ) ; roi8p[ idrow + bdimX + threadIdx.x +r ] = tex2D( tex_img_inc, j0 +bdimX+threadIdx.x , i ) ; @@ -31,8 +31,7 @@ __global__ void kernel_convoSepShx8pH(unsigned char *output, int j_dim, int r) __syncthreads(); - // calculs de convolution - // passe horizontale + // horizontal convolution for (jc=0 ; jc global mem - output[ idx ] = outval0 ; - output[ idx+1 ] = outval1 ; - output[ idx+2 ] = outval2 ; - output[ idx+3 ] = outval3 ; - output[ idx+4 ] = outval4 ; - output[ idx+5 ] = outval5 ; - output[ idx+6 ] = outval6 ; - output[ idx+7 ] = outval7 ; + // 8 pixels per thread --> global mem + output[ idx++ ] = outval0 ; + output[ idx++ ] = outval1 ; + output[ idx++ ] = outval2 ; + output[ idx++ ] = outval3 ; + output[ idx++ ] = outval4 ; + output[ idx++ ] = outval5 ; + output[ idx++ ] = outval6 ; + output[ idx++ ] = outval7 ; }