#define PI 3.1415926 __constant__ float masque[256] ; /* noyau gaussien 3x3 *//* __constant__ float noyau[] = {1.0/16,2.0/16,1.0/16, 2.0/16,4.0/16,2.0/16, 1.0/16,2.0/16,1.0/16} ; __constant__ int rnoyau = 1 ;*/ /* noyau gaussien 5x5 *//* __constant__ int noyau[] = {1,4,6,4,1, 4,16,24,16,4, 6,24,36,24,6, 4,16,24,16,4, 1,4,6,4,1} ; __constant__ int sumNoyau = 256 ; __constant__ int rnoyau = 2 ; __constant__ int NNoyau = 25 ; */ /* noyau 5x5*/ __constant__ float noyau[] = {1.0/25,1.0/25,1.0/25,1.0/25,1.0/25, 1.0/25,1.0/25,1.0/25,1.0/25,1.0/25, 1.0/25,1.0/25,1.0/25,1.0/25,1.0/25, 1.0/25,1.0/25,1.0/25,1.0/25,1.0/25, 1.0/25,1.0/25,1.0/25,1.0/25,1.0/25} ; __constant__ int rnoyau = 2 ; /* noyau 7x7 *//* __device__ __constant__ float noyau[] = {1.0/49,1.0/49,1.0/49,1.0/49,1.0/49,1.0/49,1.0/49, 1.0/49,1.0/49,1.0/49,1.0/49,1.0/49,1.0/49,1.0/49, 1.0/49,1.0/49,1.0/49,1.0/49,1.0/49,1.0/49,1.0/49, 1.0/49,1.0/49,1.0/49,1.0/49,1.0/49,1.0/49,1.0/49, 1.0/49,1.0/49,1.0/49,1.0/49,1.0/49,1.0/49,1.0/49, 1.0/49,1.0/49,1.0/49,1.0/49,1.0/49,1.0/49,1.0/49, 1.0/49,1.0/49,1.0/49,1.0/49,1.0/49,1.0/49,1.0/49} ; __device__ __constant__ int rnoyau = 3 ; */ //__device__ __constant__ int NNoyau = 49 ; __device__ __constant__ int noyauH[] = {1,1,1,1,1,1,1} ; __device__ __constant__ int sumNoyauH = 7 ; __device__ __constant__ int rnoyauH = 3 ; // average 7x7 /* __device__ __constant__ int noyauV[] = {1,1,1,1,1,1,1} ; __device__ __constant__ int sumNoyauV = 7 ; __device__ __constant__ int rnoyauV = 3 ; */ // gaussian 5x5 __device__ __constant__ int noyauV[] = {1,4,6,4,1} ; __device__ __constant__ int rnoyauV = 2 ; __constant__ int sumKV = 16 ; //__device__ __constant__ int noyauV[] = {1,1,1} ; //__constant__ int sumNoyauV = 16 ; // declarations des textures texture tex_img_in ; texture tex_img_ins ; texture tex_img_inc ; texture tex_img_estim ; texture tex_noyau ; texture tex_noyauConvof ; texture tex_noyauConvos ; texture tex_kern ; /*************************************************************** * kernel identite pour mesures de debit max effectif possible ***************************************************************/ __global__ void kernel_ident( unsigned short*output, int i_dim, int j_dim) { // coordonnees absolues du point de base int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; int i = __mul24( blockIdx.y, blockDim.y) + threadIdx.y ; int idx = __mul24(i, j_dim) + j ; // indice dans l'image output[ idx ] = tex2D(tex_img_ins, j, i) ; } __global__ void kernel_ident2p( unsigned short *output, int i_dim, int j_dim) { // coordonnees absolues du point de base int j = __umul24(__umul24(blockIdx.x,blockDim.x) + threadIdx.x,2) ; int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ; output[ __umul24(i, j_dim) + j ] = tex2D(tex_img_inc, j, i) ; output[ __umul24(i, j_dim) + j +1 ] = tex2D(tex_img_inc, j+1, i) ; } __global__ void kernel_ident( unsigned char *output, int i_dim, int j_dim) { // coordonnees absolues du point de base int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; int i = __mul24( blockIdx.y, blockDim.y) + threadIdx.y ; output[ __umul24(i, j_dim) + j ] = tex2D(tex_img_inc, j, i) ; } __global__ void kernel_ident2p( unsigned char *output, int i_dim, int j_dim) { // coordonnees absolues du point de base int j = __umul24(__umul24(blockIdx.x,blockDim.x) + threadIdx.x,2) ; int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ; output[ __umul24(i, j_dim) + j ] = tex2D(tex_img_inc, j, i) ; output[ __umul24(i, j_dim) + j +1 ] = tex2D(tex_img_inc, j+1, i) ; } /** * * \brief calcule l'estimation initiale * \author zulu - AND * * \param[in] L Largeur de l'image * \param[in] i_dim Hauteur de l'image * \param[in] j_dim coté de la fenêtre de moyenneage * * \param[out] output Image estimee initiale * * Version texture : l'img originale est supposée en texture. * L'estimation réalisée correspond a un moyenneur de 'rayon' r * Execution sur des blocs de threads 2D et une grille 2D * selon les dimensions de l'image. * */ __global__ void kernel_moyenne_shtex( unsigned int *output, unsigned int i_dim, unsigned int j_dim, unsigned int r) { int idl, idc ; //coordonnées ds le bloc int ib = threadIdx.y ; int jb = threadIdx.x ; int idb_row = __mul24(ib,blockDim.x) + jb ; // indice dans le bloc // coordonnees absolues du point int j = __mul24(blockIdx.x,blockDim.x) + jb ; int i = __mul24(blockIdx.y,blockDim.y) + ib ; int idx = __mul24(i,j_dim) + j ; // indice dans l'image int off = __mul24(r,blockDim.x) ; // offset du bloc dans le buffer en sh mem extern __shared__ unsigned int shmem[] ; volatile unsigned int * buffer = shmem ; volatile unsigned int * bufferV = &buffer[ blockDim.x*blockDim.y + 2*off ] ; /*********************************************************************************** * PASSE HORIZONTALE ***********************************************************************************/ // charge les pixels de la bande au dessus du bloc if(ib=(blockDim.y-r)) { buffer[ idb_row + 2*off ] = 0 ; for (idc=j-r ; idc<=j+r ; idc++ ) buffer[ idb_row + 2*off ] += tex2D(tex_img_in, idc, i+r) ; } // charge les pixels du bloc buffer[ idb_row + off ] = 0 ; for (idc=j-r ; idc<=j+r ; idc++ ) buffer[ idb_row + off ] += tex2D(tex_img_in, idc, i) ; __syncthreads() ; /***************************************************************************** * PASSE VERTICALE *****************************************************************************/ bufferV[ idb_row ] = 0 ; for (idl=0 ; idl<=2*r ; idl++ ) bufferV[ idb_row ] += buffer[ (ib+idl)*blockDim.x + jb ] ; __syncthreads() ; output[idx] = bufferV[ idb_row ]/((2*r+1)*(2*r+1)) ; //pas juste sur les bords, mais c'est pô grave ! } /** * * \brief calcule la convolution avec un noyau carre * \author zulu - AND * * \param[in] L Largeur de l'image * \param[in] i_dim Hauteur de l'image * \param[in] j_dim coté de la fenêtre de moyenneage * * \param[out] output Image estimee initiale * * Version texture : l'img originale est supposée en texture. * Execution sur des blocs de threads 2D et une grille 2D * correspondant aux dimensions de l'image. * */ // convolution non separable // image en texture et noyau en mem constante __global__ void kernel_convoGene8( unsigned char *output, int i_dim, int j_dim) { int ic, jc ; int r = 2 ; int L=5 ; float outval0=0.0 ; // coordonnees absolues du point de base int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; int i = __mul24( blockIdx.y, blockDim.y) + threadIdx.y ; for (ic=0 ; ic masque[ __mul24(ic,L)+jc ]*tex2D(tex_img_inc, j-r+jc, i-r+ic) for (ic=1 ; ic masque[ __mul24(ic,L)+jc ]*tex2D(tex_img_inc, j-r+jc, i-r+ic) for (ic=1 ; ic masque[ __mul24(ic,L)+jc ]*tex2D(tex_img_inc, j-r+jc, i-r+ic) for (ic=0 ; ic masque[ __mul24(ic,L)+jc ]*tex2D(tex_img_inc, j-r+jc, i-r+ic) for (ic=0 ; ic masque[ __mul24(ic,L)+jc ]*tex2D(tex_img_inc, j-r+jc, i-r+ic) for (ic=0 ; ic masque[ __mul24(ic,L)+jc ]*tex2D(tex_img_inc, j-r+jc, i-r+ic) for (ic=1 ; ic global mem output[ idx ] = outval0 ; } __global__ void kernel_convoNonSepSh_2p(unsigned short *output, int i_dim, int j_dim) { int idb, ic, jc ; int L = 2*rnoyau+1 ; float outval0=0.0, outval1=0.0 ; int bdimX = blockDim.x<<1 ; int tidX = threadIdx.x<<1 ; // coordonnees absolues du point de base int j = __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x,2) ; int i = __mul24( blockIdx.y, blockDim.y) + threadIdx.y ; int idx = __mul24(i,j_dim) + j ; // indice dans l'image // chargement en smem int idrow = threadIdx.y*(bdimX+L-1) ; extern __shared__ int roi[]; // bloc 0 (en haut \E0 gauche) roi[ idrow + tidX ] = tex2D(tex_img_ins, j-rnoyau, i-rnoyau) ; roi[ idrow + tidX +1] = tex2D(tex_img_ins, j-rnoyau+1, i-rnoyau) ; // bloc 1 (en haut \E0 droite)... if ( threadIdx.x < rnoyau ) //...ou plutot ce qu'il en manque { roi[ idrow + tidX+bdimX ] = tex2D( tex_img_ins, j+bdimX-rnoyau, i-rnoyau ) ; roi[ idrow + tidX+bdimX +1] = tex2D( tex_img_ins, j+bdimX-rnoyau+1, i-rnoyau ) ; } // bloc 2 ( en bas \E0 gauche) if ( threadIdx.y < L-1 ) { idrow = (threadIdx.y+blockDim.y)*(bdimX+L-1) ; roi[ idrow + tidX ] = tex2D( tex_img_ins, j-rnoyau, i+blockDim.y-rnoyau ) ; roi[ idrow + tidX +1] = tex2D( tex_img_ins, j-rnoyau+1, i+blockDim.y-rnoyau ) ; //bloc 4 ( en bas \E0 doite )... if ( 2*threadIdx.x < L-1 ) //...ou ce qu'il en manque { roi[ idrow + tidX+bdimX ] = tex2D( tex_img_ins, j+bdimX-rnoyau, i+blockDim.y-rnoyau ) ; roi[ idrow + tidX+bdimX +1] = tex2D( tex_img_ins, j+bdimX-rnoyau+1, i+blockDim.y-rnoyau ) ; } } __syncthreads(); // calculs de convolution for (ic=0 ; ic global mem output[ idx ] = outval0 ; output[ idx+1 ] = outval1 ; } __global__ void kernel_convoNonSepSh_2p(unsigned char *output, int i_dim, int j_dim, int r) { int idb, ic, jc ; int L = 2*r+1 ; float outval0=0.0, outval1=0.0 ; int bdimX = blockDim.x<<1 ; int tidX = threadIdx.x<<1 ; // coordonnees absolues du point de base int j = __umul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x,2) ; int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ; int idx = __umul24(i,j_dim) + j ; // indice dans l'image // chargement en smem int idrow = threadIdx.y*(bdimX+L-1) ; extern __shared__ int roi[]; // bloc 0 (en haut \E0 gauche) roi[ idrow + tidX ] = tex2D( tex_img_inc, j-r , i-r) ; roi[ idrow + tidX +1 ] = tex2D( tex_img_inc, j-r+1, i-r) ; // bloc 1 (en haut \E0 droite)... if ( threadIdx.x < r ) //...ou plutot ce qu'il en manque { roi[ idrow + bdimX + tidX ] = tex2D( tex_img_inc, j+bdimX-r , i-r ) ; roi[ idrow + bdimX + tidX +1 ] = tex2D( tex_img_inc, j+bdimX-r+1, i-r ) ; } // bloc 2 ( en bas \E0 gauche) if ( threadIdx.y < L-1 ) { idrow = (threadIdx.y+blockDim.y)*(bdimX+L-1) ; roi[ idrow + tidX ] = tex2D( tex_img_inc, j-r, i+blockDim.y-r ) ; roi[ idrow + tidX +1] = tex2D( tex_img_inc, j-r+1, i+blockDim.y-r ) ; //bloc 4 ( en bas \E0 doite )... if ( threadIdx.x < r ) //...ou ce qu'il en manque { roi[ idrow + tidX+bdimX ] = tex2D( tex_img_inc, j+bdimX-r , i+blockDim.y-r ) ; roi[ idrow + tidX+bdimX +1] = tex2D( tex_img_inc, j+bdimX-r+1, i+blockDim.y-r ) ; } } __syncthreads(); // calculs de convolution for (ic=0 ; ic global mem output[ idx ] = outval0 ; output[ idx+1 ] = outval1 ; } //4 pixels en ligne par thread __global__ void kernel_convoNonSepSh_4p(unsigned char *output, int i_dim, int j_dim, int r) { int idb, ic, jc ; int L = 2*r+1 ; float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0 ; int bdimX = blockDim.x<<2 ; int tidX = threadIdx.x<<2 ; // coordonnees absolues du point de base int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x)<<2 ; int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ; int j0= __umul24(blockIdx.x,blockDim.x)<<2 ; int idx = __umul24(i,j_dim) + j ; // indice dans l'image // chargement en smem int idrow = threadIdx.y*(bdimX+L-1) ; extern __shared__ unsigned char roi4p[]; // bloc 0 (en haut \E0 gauche) roi4p[ idrow + tidX ] = tex2D(tex_img_inc, j-r , i-r) ; roi4p[ idrow + tidX +1] = tex2D(tex_img_inc, j-r+1, i-r) ; roi4p[ idrow + tidX +2] = tex2D(tex_img_inc, j-r+2, i-r) ; roi4p[ idrow + tidX +3] = tex2D(tex_img_inc, j-r+3, i-r) ; // bloc 1 (en haut \E0 droite)... if ( threadIdx.x < r ) //...ou plutot ce qu'il en manque { roi4p[ idrow + bdimX + threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX+threadIdx.x , i-r ) ; roi4p[ idrow + bdimX + threadIdx.x +r ] = tex2D( tex_img_inc, j0 +bdimX+threadIdx.x , i-r ) ; } // bloc 2 ( en bas \E0 gauche) if ( threadIdx.y < L-1 ) { idrow = (threadIdx.y+blockDim.y)*(bdimX+L-1) ; roi4p[ idrow + tidX ] = tex2D( tex_img_inc, j-r , i+blockDim.y-r ) ; roi4p[ idrow + tidX +1] = tex2D( tex_img_inc, j-r+1, i+blockDim.y-r ) ; roi4p[ idrow + tidX +2] = tex2D( tex_img_inc, j-r+2, i+blockDim.y-r ) ; roi4p[ idrow + tidX +3] = tex2D( tex_img_inc, j-r+3, i+blockDim.y-r ) ; //bloc 4 ( en bas \E0 doite )... if ( threadIdx.x < r ) //...ou ce qu'il en manque { roi4p[ idrow + bdimX +threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX +threadIdx.x, i+blockDim.y-r ) ; roi4p[ idrow + bdimX +threadIdx.x +r] = tex2D( tex_img_inc, j0 +bdimX +threadIdx.x, i+blockDim.y-r ) ; } } __syncthreads(); // calculs de convolution for (ic=0 ; ic global mem output[ idx ] = outval0 ; output[ idx+1 ] = outval1 ; output[ idx+2 ] = outval2 ; output[ idx+3 ] = outval3 ; } //4 pixels en carre par thread __global__ void kernel_convoNonSepSh_4pcarre(unsigned char *output, int i_dim, int j_dim, int r) { int idb, ic, jc ; int L = 2*r+1 ; float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0 ; int bdimX = blockDim.x<<1 ; int tidX = threadIdx.x<<1 ; int bdimY = blockDim.y<<1 ; int tidY = threadIdx.y<<1 ; // coordonnees absolues du point de base int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x)<<1 ; int i = (__umul24( blockIdx.y, blockDim.y) + threadIdx.y)<<1 ; int j0= __umul24(blockIdx.x,blockDim.x)<<1 ; int i0= __umul24(blockIdx.y,blockDim.y)<<1 ; int idx = __umul24(i,j_dim) + j ; // indice dans l'image // chargement en smem int idrowBase = tidY*(bdimX+L-1) ; int idrowSecond = (tidY+1)*(bdimX+L-1) ; extern __shared__ unsigned char roi4p2[]; // bloc 0 (en haut \E0 gauche) roi4p2[ idrowBase + tidX ] = tex2D(tex_img_inc, j-r , i-r ) ; roi4p2[ idrowBase + tidX +1] = tex2D(tex_img_inc, j-r+1, i-r ) ; roi4p2[ idrowSecond+ tidX ] = tex2D(tex_img_inc, j-r , i-r+1) ; roi4p2[ idrowSecond+ tidX +1] = tex2D(tex_img_inc, j-r+1, i-r+1) ; // bloc 1 (en haut \E0 droite)... if ( threadIdx.x < r ) //...ou plutot ce qu'il en manque { roi4p2[ idrowBase + bdimX + threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX+threadIdx.x , i-r ) ; roi4p2[ idrowBase + bdimX + threadIdx.x +r ] = tex2D( tex_img_inc, j0 +bdimX+threadIdx.x , i-r ) ; roi4p2[ idrowSecond + bdimX + threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX+threadIdx.x , i-r+1 ) ; roi4p2[ idrowSecond + bdimX + threadIdx.x +r ] = tex2D( tex_img_inc, j0 +bdimX+threadIdx.x , i-r+1 ) ; } // bloc 2 ( en bas \E0 gauche) if ( threadIdx.y < L-1 ) { idrowBase = (bdimY + threadIdx.y)*(bdimX+L-1) ; roi4p2[ idrowBase + tidX ] = tex2D( tex_img_inc, j-r , i0-r +bdimY +threadIdx.y ) ; roi4p2[ idrowBase + tidX +1] = tex2D( tex_img_inc, j-r+1, i0-r +bdimY +threadIdx.y ) ; //bloc 4 ( en bas \E0 doite )... if ( threadIdx.x < r ) //...ou ce qu'il en manque { roi4p2[ idrowBase + bdimX + threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX +threadIdx.x , i0-r +bdimY +threadIdx.y ) ; roi4p2[ idrowBase + bdimX + threadIdx.x +r ] = tex2D( tex_img_inc, j0 +bdimX +threadIdx.x , i0-r +bdimY +threadIdx.y ) ; } } __syncthreads(); // calculs de convolution for (ic=0 ; ic global mem output[ idx ] = outval0 ; output[ idx+1 ] = outval1 ; output[ idx+j_dim ] = outval2 ; output[ idx+j_dim+1 ] = outval3 ; } //8 pixels en ligne par thread __global__ void kernel_convoNonSepSh_8p(unsigned char *output, int i_dim, int j_dim, int r) { int idb, ic, jc; int L = 2*r+1 ; float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ; int bdimX = blockDim.x<<3 ; int tidX = threadIdx.x<<3 ; // coordonnees absolues du point de base int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x)<<3 ; int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ; int j0= __umul24(blockIdx.x,blockDim.x)<<3 ; int idx = __umul24(i,j_dim) + j ; // indice dans l'image // chargement en smem int idrow = threadIdx.y*(bdimX+L-1) ; extern __shared__ unsigned char roi8p[]; // bloc 0 (en haut \E0 gauche) for (int p=0; p<8; p++) roi8p[ idrow + tidX +p ] = tex2D(tex_img_inc, j-r+p , i-r) ; // bloc 1 (en haut \E0 droite)... if ( threadIdx.x < r ) //...ou plutot ce qu'il en manque { roi8p[ idrow + bdimX + threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX+threadIdx.x , i-r ) ; roi8p[ idrow + bdimX + threadIdx.x +r ] = tex2D( tex_img_inc, j0 +bdimX+threadIdx.x , i-r ) ; } // bloc 2 ( en bas \E0 gauche) if ( threadIdx.y < L-1 ) { idrow = (threadIdx.y+blockDim.y)*(bdimX+L-1) ; for (int p=0; p<8; p++) roi8p[ idrow + tidX +p ] = tex2D( tex_img_inc, j-r+p , i+blockDim.y-r ) ; //bloc 4 ( en bas \E0 doite )... if ( threadIdx.x < r ) //...ou ce qu'il en manque { roi8p[ idrow + bdimX +threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX +threadIdx.x, i+blockDim.y-r ) ; roi8p[ idrow + bdimX +threadIdx.x +r] = tex2D( tex_img_inc, j0 +bdimX +threadIdx.x, i+blockDim.y-r ) ; } } __syncthreads(); // calculs de convolution for (ic=0 ; ic global mem output[ idx ] = outval0 ; output[ idx+1 ] = outval1 ; output[ idx+2 ] = outval2 ; output[ idx+3 ] = outval3 ; output[ idx+4 ] = outval4 ; output[ idx+5 ] = outval5 ; output[ idx+6 ] = outval6 ; output[ idx+7 ] = outval7 ; } //16 pixels en ligne par thread __global__ void kernel_convoNonSepSh_16p(unsigned char *output, int i_dim, int j_dim, int r) { int idb, ic, jc; int L = 2*r+1 ; float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ; float outval8=0.0, outval9=0.0, outval10=0.0, outval11=0.0, outval12=0.0, outval13=0.0, outval14=0.0, outval15=0.0 ; int bdimX = blockDim.x<<4 ; int tidX = threadIdx.x<<4 ; // coordonnees absolues du point de base int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x)<<4 ; int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ; int j0= __umul24(blockIdx.x,blockDim.x)<<4 ; int idx = __umul24(i,j_dim) + j ; // indice dans l'image // chargement en smem int idrow = threadIdx.y*(bdimX+L-1) ; extern __shared__ unsigned char roi16p[]; // bloc 0 (en haut \E0 gauche) for (int p=0; p<16; p++) roi16p[ idrow + tidX +p ] = tex2D(tex_img_inc, j-r+p , i-r) ; // bloc 1 (en haut \E0 droite)... if ( threadIdx.x < r ) //...ou plutot ce qu'il en manque { roi16p[ idrow + bdimX + threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX+threadIdx.x , i-r ) ; roi16p[ idrow + bdimX + threadIdx.x +r ] = tex2D( tex_img_inc, j0 +bdimX+threadIdx.x , i-r ) ; } // bloc 2 ( en bas \E0 gauche) if ( threadIdx.y < L-1 ) { idrow = (threadIdx.y+blockDim.y)*(bdimX+L-1) ; for (int p=0; p<16; p++) roi16p[ idrow + tidX +p ] = tex2D( tex_img_inc, j-r+p , i+blockDim.y-r ) ; //bloc 4 ( en bas \E0 doite )... if ( threadIdx.x < r ) //...ou ce qu'il en manque { roi16p[ idrow + bdimX +threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX +threadIdx.x, i+blockDim.y-r ) ; roi16p[ idrow + bdimX +threadIdx.x +r] = tex2D( tex_img_inc, j0 +bdimX +threadIdx.x, i+blockDim.y-r ) ; } } __syncthreads(); // calculs de convolution for (ic=0 ; ic global mem output[ idx ] = outval0 ; output[ idx+1 ] = outval1 ; output[ idx+2 ] = outval2 ; output[ idx+3 ] = outval3 ; output[ idx+4 ] = outval4 ; output[ idx+5 ] = outval5 ; output[ idx+6 ] = outval6 ; output[ idx+7 ] = outval7 ; output[ idx+8 ] = outval8 ; output[ idx+9 ] = outval9 ; output[ idx+10 ] = outval10 ; output[ idx+11 ] = outval11 ; output[ idx+12 ] = outval12 ; output[ idx+13 ] = outval13 ; output[ idx+14 ] = outval14 ; output[ idx+15 ] = outval15 ; } // convolution non separable // image en texture et noyau en mem constante // fetch direct des pixels // 2 pixels traités par thread => meilleur débit __global__ void kernel_convoNonSep_2p( int *output, int i_dim, int j_dim) { int idb, ic, jc ; int r = rnoyau ; int L = 2*r+1 ; int N = L*L ; float outval0=0, outval1=0 ; // coordonnees absolues du point de base int j = __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x, 2) ; int i = __mul24( blockIdx.y, blockDim.y) + threadIdx.y ; int idx = __mul24(i, j_dim) + j ; // indice dans l'image #pragma unroll for (idb=0 ; idb< N ; idb++) { ic = i-r + idb/L ; jc = j-r + idb - (idb/L)*L ; outval0 += ( noyau[ idb ]*tex2D(tex_img_in, jc, ic)) ; outval1 += ( noyau[ idb ]*tex2D(tex_img_in, jc+1, ic)) ; } output[ idx ] = outval0 ; output[ idx+1 ] = outval1 ; } // convolution non separable // image en texture et noyau en mem constante // fetch direct des pixels // 2 pixels traités par thread => meilleur débit __global__ void kernel_convoNonSep_2p_s( unsigned short*output, int i_dim, int j_dim) { int idb, ic, jc ; int r = rnoyau ; int L = 2*r+1 ; int N = L*L ; float outval0=0, outval1=0 ; // coordonnees absolues du point de base int j = __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x, 2) ; int i = __mul24( blockIdx.y, blockDim.y) + threadIdx.y ; int idx = __mul24(i, j_dim) + j ; // indice dans l'image #pragma unroll for (idb=0 ; idb< N ; idb++) { ic = i-r + idb/L ; jc = j-r + idb - (idb/L)*L ; outval0 += ( noyau[ idb ]*tex2D(tex_img_ins, jc, ic)) ; outval1 += ( noyau[ idb ]*tex2D(tex_img_ins, jc+1, ic)) ; } output[ idx ] = outval0 ; output[ idx+1 ] = outval1 ; } // convolution separable // image en texture et noyau en mem constante __global__ void kernel_convoSep8V( unsigned char *output, int i_dim, int j_dim, int r) { int ic ; int L=2*r+1 ; float outval0=0.0 ; // coordonnees absolues du point de base int j = __mul24( blockIdx.x, blockDim.x ) + threadIdx.x ; int i = __mul24( blockIdx.y, blockDim.y ) + threadIdx.y ; //vertical for (ic=0 ; ic0 ) id = baseIdx-1 ; else id = baseIdx ; val = input[id] ; //p1 outval0 += masque[0]*val ; //p2 val = input[baseIdx++] ; outval0 += masque[1]*val ; outval1 += masque[0]*val ; //p3 val = input[baseIdx++] ; outval0 += masque[2]*val ; outval1 += masque[1]*val ; outval2 += masque[0]*val ; //p4 val = input[baseIdx++] ; outval1 += masque[2]*val ; outval2 += masque[1]*val ; outval3 += masque[0]*val ; //p5 val = input[baseIdx++] ; outval2 += masque[2]*val ; outval3 += masque[1]*val ; outval4 += masque[0]*val ; //p6 val = input[baseIdx++] ; outval3 += masque[2]*val ; outval4 += masque[1]*val ; outval5 += masque[0]*val ; //p7 val = input[baseIdx++] ; outval4 += masque[2]*val ; outval5 += masque[1]*val ; outval6 += masque[0]*val ; //p8 val = input[baseIdx++] ; outval5 += masque[2]*val ; outval6 += masque[1]*val ; outval7 += masque[0]*val ; //p9 val = input[baseIdx++] ; outval6 += masque[2]*val ; outval7 += masque[1]*val ; //p10 val = input[baseIdx++] ; outval7 += masque[2]*val ; baseIdx = __mul24(i , j_dim) + j ; output[ baseIdx++ ] = outval0 ; output[ baseIdx++ ] = outval1 ; output[ baseIdx++ ] = outval2 ; output[ baseIdx++ ] = outval3 ; output[ baseIdx++ ] = outval4 ; output[ baseIdx++ ] = outval5 ; output[ baseIdx++ ] = outval6 ; output[ baseIdx++ ] = outval7 ; } // convolution separable // image en texture et noyau en mem constante __global__ void kernel_convoSep8HL5x8pG( unsigned char *input, unsigned char *output, int i_dim, int j_dim) { int jc, baseIdx, id ; float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ; float val ; // coordonnees absolues du point de base int j = (__mul24( blockIdx.x, blockDim.x ) + threadIdx.x)<<3 ; int i = __mul24( blockIdx.y, blockDim.y ) + threadIdx.y ; //horizontal baseIdx = __mul24(i , j_dim) + j ; if (baseIdx >1 ) id = baseIdx-2 ; else id = baseIdx ; val = input[id] ; //p1 outval0 += masque[0]*val ; //p2 if (baseIdx >0 ) id = baseIdx-1 ; else id = baseIdx ; val = input[id] ; outval0 += masque[1]*val ; outval1 += masque[0]*val ; //p3 val = input[baseIdx++] ; outval0 += masque[2]*val ; outval1 += masque[1]*val ; outval2 += masque[0]*val ; //p4 val = input[baseIdx++] ; outval0 += masque[3]*val ; outval1 += masque[2]*val ; outval2 += masque[1]*val ; outval3 += masque[0]*val ; //p5 val = input[baseIdx++] ; outval0 += masque[4]*val ; outval1 += masque[3]*val ; outval2 += masque[2]*val ; outval3 += masque[1]*val ; outval4 += masque[0]*val ; //p6 val = input[baseIdx++] ; outval1 += masque[4]*val ; outval2 += masque[3]*val ; outval3 += masque[2]*val ; outval4 += masque[1]*val ; outval5 += masque[0]*val ; //p7 val = input[baseIdx++] ; outval2 += masque[4]*val ; outval3 += masque[3]*val ; outval4 += masque[2]*val ; outval5 += masque[1]*val ; outval6 += masque[0]*val ; //p8 val = input[baseIdx++] ; outval3 += masque[4]*val ; outval4 += masque[3]*val ; outval5 += masque[2]*val ; outval6 += masque[1]*val ; outval7 += masque[0]*val ; //p9 val = input[baseIdx++] ; outval4 += masque[4]*val ; outval5 += masque[3]*val ; outval6 += masque[2]*val ; outval7 += masque[1]*val ; //p10 val = input[baseIdx++] ; outval5 += masque[4]*val ; outval6 += masque[3]*val ; outval7 += masque[2]*val ; //p11 val = input[baseIdx++] ; outval6 += masque[4]*val ; outval7 += masque[3]*val ; //p11 val = input[baseIdx++] ; outval7 += masque[4]*val ; baseIdx = __mul24(i , j_dim) + j ; output[ baseIdx++ ] = outval0 ; output[ baseIdx++ ] = outval1 ; output[ baseIdx++ ] = outval2 ; output[ baseIdx++ ] = outval3 ; output[ baseIdx++ ] = outval4 ; output[ baseIdx++ ] = outval5 ; output[ baseIdx++ ] = outval6 ; output[ baseIdx++ ] = outval7 ; } // convolution separable // image en texture et noyau en mem constante __global__ void kernel_convoSep8HL7x8pG( unsigned char *input, unsigned char *output, int i_dim, int j_dim) { int jc, baseIdx, id ; float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ; float val ; // coordonnees absolues du point de base int j = (__mul24( blockIdx.x, blockDim.x ) + threadIdx.x)<<3 ; int i = __mul24( blockIdx.y, blockDim.y ) + threadIdx.y ; //horizontal baseIdx = __mul24(i , j_dim) + j ; if (baseIdx >2 ) id = baseIdx-3 ; else id = baseIdx ; val = input[id] ; //p1 outval0 += masque[0]*val ; //p2 if (baseIdx >1 ) id = baseIdx-2 ; else id = baseIdx ; val = input[id] ; outval0 += masque[1]*val ; outval1 += masque[0]*val ; //p3 if (baseIdx >0 ) id = baseIdx-1 ; else id = baseIdx ; val = input[id] ; outval0 += masque[2]*val ; outval1 += masque[1]*val ; outval2 += masque[0]*val ; //p4 val = input[baseIdx++] ; outval0 += masque[3]*val ; outval1 += masque[2]*val ; outval2 += masque[1]*val ; outval3 += masque[0]*val ; //p5 val = input[baseIdx++] ; outval0 += masque[4]*val ; outval1 += masque[3]*val ; outval2 += masque[2]*val ; outval3 += masque[1]*val ; outval4 += masque[0]*val ; //p6 val = input[baseIdx++] ; outval0 += masque[5]*val ; outval1 += masque[4]*val ; outval2 += masque[3]*val ; outval3 += masque[2]*val ; outval4 += masque[1]*val ; outval5 += masque[0]*val ; //p7 val = input[baseIdx++] ; outval0 += masque[6]*val ; outval1 += masque[5]*val ; outval2 += masque[4]*val ; outval3 += masque[3]*val ; outval4 += masque[2]*val ; outval5 += masque[1]*val ; outval6 += masque[0]*val ; //p8 val = input[baseIdx++] ; outval1 += masque[6]*val ; outval2 += masque[5]*val ; outval3 += masque[4]*val ; outval4 += masque[3]*val ; outval5 += masque[2]*val ; outval6 += masque[1]*val ; outval7 += masque[0]*val ; //p9 val = input[baseIdx++] ; outval2 += masque[6]*val ; outval3 += masque[5]*val ; outval4 += masque[4]*val ; outval5 += masque[3]*val ; outval6 += masque[2]*val ; outval7 += masque[1]*val ; //p10 val = input[baseIdx++] ; outval3 += masque[6]*val ; outval4 += masque[5]*val ; outval5 += masque[4]*val ; outval6 += masque[3]*val ; outval7 += masque[2]*val ; //p11 val = input[baseIdx++] ; outval4 += masque[6]*val ; outval5 += masque[5]*val ; outval6 += masque[4]*val ; outval7 += masque[3]*val ; //p12 val = input[baseIdx++] ; outval5 += masque[6]*val ; outval6 += masque[5]*val ; outval7 += masque[4]*val ; //p13 val = input[baseIdx++] ; outval6 += masque[6]*val ; outval7 += masque[5]*val ; //p14 val = input[baseIdx++] ; outval7 += masque[6]*val ; baseIdx = __mul24(i , j_dim) + j ; output[ baseIdx++ ] = outval0 ; output[ baseIdx++ ] = outval1 ; output[ baseIdx++ ] = outval2 ; output[ baseIdx++ ] = outval3 ; output[ baseIdx++ ] = outval4 ; output[ baseIdx++ ] = outval5 ; output[ baseIdx++ ] = outval6 ; output[ baseIdx++ ] = outval7 ; } //8 pixels en ligne par thread __global__ void kernel_convoSepShx8pV(unsigned char *output, int i_dim, int j_dim, int r) { int idb, ic, jc, p; int L = 2*r+1 ; float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ; int bdimX = blockDim.x<<3 ; int tidX = threadIdx.x<<3 ; // coordonnees absolues du point de base int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x)<<3 ; int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ; int idx = __umul24(i,j_dim) + j ; // indice dans l'image // chargement en smem int idrow = threadIdx.y*bdimX ; extern __shared__ unsigned char roi8p[]; // bloc 0 (en haut) for (p=0; p<8; p++) roi8p[ idrow + tidX +p ] = tex2D(tex_img_inc, j+p , i-r) ; // bloc 2 ( en bas) if ( threadIdx.y < L-1 ) { idrow = (threadIdx.y+blockDim.y)*bdimX ; for (int p=0; p<8; p++) roi8p[ idrow + tidX +p ] = tex2D( tex_img_inc, j+p , i+blockDim.y-r ) ; } __syncthreads(); // calculs de convolution // passe verticale for (ic=0 ; ic global mem output[ idx ] = outval0 ; output[ idx+1 ] = outval1 ; output[ idx+2 ] = outval2 ; output[ idx+3 ] = outval3 ; output[ idx+4 ] = outval4 ; output[ idx+5 ] = outval5 ; output[ idx+6 ] = outval6 ; output[ idx+7 ] = outval7 ; } //8 pixels en ligne par thread __global__ void kernel_convoSepShx8pVG(unsigned char *input, unsigned char *output, int i_dim, int j_dim, int r) { int idb, ic, jc, p; int L = 2*r+1 ; float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ; int bdimX = blockDim.x<<3 ; int tidX = threadIdx.x<<3 ; // coordonnees absolues du point de base int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x)<<3 ; int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ; int idx = __umul24(i,j_dim) + j ; // indice dans l'image // chargement en smem int idrow = threadIdx.y*bdimX ; extern __shared__ unsigned char roi8p[]; // bloc 0 (en haut) for (p=0; p<8; p++) roi8p[ idrow + tidX +p ] = input[ (i-r)*j_dim + j+p ] ; // bloc 2 ( en bas) if ( threadIdx.y < L-1 ) { idrow = (threadIdx.y+blockDim.y)*bdimX ; for (int p=0; p<8; p++) roi8p[ idrow + tidX +p ] = input[ (i+blockDim.y-r)*j_dim + j+p ] ; } __syncthreads(); // calculs de convolution // passe verticale for (ic=0 ; ic global mem output[ idx ] = outval0 ; output[ idx+1 ] = outval1 ; output[ idx+2 ] = outval2 ; output[ idx+3 ] = outval3 ; output[ idx+4 ] = outval4 ; output[ idx+5 ] = outval5 ; output[ idx+6 ] = outval6 ; output[ idx+7 ] = outval7 ; } __global__ void kernel_convoSepShx8pH(unsigned char *output, int i_dim, int j_dim, int r) { int idb, ic, jc, p; int L = 2*r+1 ; float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ; int bdimX = blockDim.x<<3 ; int tidX = threadIdx.x<<3 ; // coordonnees absolues du point de base int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x)<<3 ; int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ; int j0= __umul24(blockIdx.x,blockDim.x)<<3 ; int idx = __umul24(i,j_dim) + j ; // indice dans l'image // chargement en smem int idrow = threadIdx.y*(bdimX+L-1) ; extern __shared__ unsigned char roi8p[]; // bloc 0 (a gauche) for (p=0; p<8; p++) roi8p[ idrow + tidX +p ] = tex2D(tex_img_inc, j-r+p , i) ; // a droite if ( threadIdx.x < r ) //...ou plutot ce qu'il en manque { roi8p[ idrow + bdimX + threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX+threadIdx.x , i ) ; roi8p[ idrow + bdimX + threadIdx.x +r ] = tex2D( tex_img_inc, j0 +bdimX+threadIdx.x , i ) ; } __syncthreads(); // calculs de convolution // passe horizontale for (jc=0 ; jc global mem output[ idx ] = outval0 ; output[ idx+1 ] = outval1 ; output[ idx+2 ] = outval2 ; output[ idx+3 ] = outval3 ; output[ idx+4 ] = outval4 ; output[ idx+5 ] = outval5 ; output[ idx+6 ] = outval6 ; output[ idx+7 ] = outval7 ; } /************************************************************************************************* *********************************************************************************************** FIN DES kERNELS de CONVOLUTION *********************************************************************************************** *************************************************************************************************/ // kernel de la libjacket // Exchange trick: Morgan McGuire, ShaderX 2008 #define s2(a,b) { float tmp = a; a = min(a,b); b = max(tmp,b); } #define mn3(a,b,c) s2(a,b); s2(a,c); #define mx3(a,b,c) s2(b,c); s2(a,c); #define mnmx3(a,b,c) mx3(a,b,c); s2(a,b); // 3 exchanges #define mnmx4(a,b,c,d) s2(a,b); s2(c,d); s2(a,c); s2(b,d); // 4 exchanges #define mnmx5(a,b,c,d,e) s2(a,b); s2(c,d); mn3(a,c,e); mx3(b,d,e); // 6 exchanges #define mnmx6(a,b,c,d,e,f) s2(a,d); s2(b,e); s2(c,f); mn3(a,b,c); mx3(d,e,f); // 7 exchanges #define IN(X,Y) ((0 <= (X) && (X) < nx && 0 <= (Y) && (Y) < ny) ? d_in[(Y)*nx+(X)] : 0) __global__ static void kernel_medjacket(int nx, int ny, unsigned short*d_out, unsigned short*d_in) { int x = __mul24(blockIdx.x , blockDim.x) + threadIdx.x; int y = __mul24(blockIdx.y , blockDim.y) + threadIdx.y; // pull top six from image float v[6] = { IN(x-1, y-1), IN(x , y-1), IN(x+1, y-1), IN(x-1, y ), IN(x , y ), IN(x+1, y ) }; // with each pass, remove min and max values and add new value mnmx6(v[0], v[1], v[2], v[3], v[4], v[5]); v[5] = IN(x-1, y+1); // add new contestant mnmx5(v[1], v[2], v[3], v[4], v[5]); v[5] = IN(x , y+1); // add new contestant mnmx4(v[2], v[3], v[4], v[5]); v[5] = IN(x+1, y+1); // add last contenstant mnmx3(v[3], v[4], v[5]); // pick the middle one d_out[y*nx + x] = v[4]; } /*************************************************************** * fonction de tri de 2 valeurs entieres (min en premier) ***************************************************************/ __device__ inline void s(int* a, int* b) { int tmp ; if (*a > *b) { tmp = *b ; *b = *a ; *a = tmp ; } } __device__ inline void s(unsigned short * a, unsigned short* b) { unsigned short tmp ; if (*a > *b) { tmp = *b ; *b = *a ; *a = tmp ; } } __device__ inline void s(unsigned char * a, unsigned char * b) { unsigned short tmp ; if (*a > *b) { tmp = *b ; *b = *a ; *a = tmp ; } } /*************************************************************** * fonction de min-max d'un tableau de n valeurs entieres ***************************************************************/ __device__ void minmaxN(unsigned short* v, int n) { for (int i=1; i< n; i++) s( v, v+i) ; for (int i=n-2; i>0; i--) s( v+i, v+n-1) ; } /*************************************************************** * fonction de tri naif d'un tableau de n valeurs entieres ***************************************************************/ __device__ void bubTriN(int * v, int n) { for (int i=0; i< n-1; i++) for (int j=i+1; j ((2*r+1)*(2*r+1))>>1 ) break ; } output[ __mul24(i, j_dim) +j ] = ic ; } __global__ void kernel_medianRSH( unsigned short * output, int i_dim, int j_dim, int r) { // coordonnees absolues du point int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; unsigned short cpt ; unsigned short ic, jc ; // chargement en smem int idrow = threadIdx.y*(blockDim.x+2*r) ; extern __shared__ int medRoi[]; // bloc 0 (en haut \E0 gauche) medRoi[ idrow + threadIdx.x ] = tex2D(tex_img_ins, j-r, i-r) ; // bloc 1 (en haut \E0 droite)... if ( threadIdx.x < 2*r ) //...ou plutot ce qu'il en manque medRoi[ idrow + threadIdx.x + blockDim.x ] = tex2D( tex_img_ins, j+blockDim.x-r, i-r ) ; // bloc 2 ( en bas \E0 gauche) if ( threadIdx.y < 2*r ) { idrow = (threadIdx.y+blockDim.y)*(blockDim.x+2*r) ; medRoi[ idrow + threadIdx.x ] = tex2D( tex_img_ins, j-r, i+blockDim.y-r ) ; //bloc 4 ( en bas \E0 doite )... if ( threadIdx.x < 2*r ) //...ou ce qu'il en manque medRoi[ idrow + threadIdx.x + blockDim.x ] = tex2D( tex_img_ins, j+blockDim.x-r, i+blockDim.y-r ) ; } __syncthreads(); int hist[256] ; for (ic =0; ic<256; ic++) hist[ic]=0; // init histogramme //generation histogramme for(ic=0; ic<=2*r; ic++ ) for(jc=0; jc<=2*r; jc++) hist[medRoi[(threadIdx.y+ic)*(blockDim.x+2*r)+ (threadIdx.x+jc)]]++ ; //parcours histogramme cpt = 0 ; for(ic=0; ic<256; ic++) { cpt += hist[ic] ; //selection de la valeur du percentile (ici 50%=SUM/2) if ( cpt > (((2*r+1)*(2*r+1))>>1) ) break ; } output[ __mul24(i, j_dim) +j ] = ic ; } __global__ void kernel_median5_2pix( unsigned short*output, int i_dim, int j_dim) { // coordonnees absolues du point int j = __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x,2) ; int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; /************************************************************************** * tri(s) **************************************************************************/ int a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13 ; int b7, b8, b9, b10, b11, b12, b13 ; /******************************************************************************** * les 14 premieres valeurs (suffisant pour median 5x5 par forgetfull selection) ********************************************************************************/ //premiere ligne a0 = tex2D(tex_img_ins, j-1, i-2) ; a1 = tex2D(tex_img_ins, j , i-2) ; a2 = tex2D(tex_img_ins, j+1, i-2) ; a3 = tex2D(tex_img_ins, j+2, i-2) ; //deuxieme ligne a4 = tex2D(tex_img_ins, j-1, i-1) ; a5 = tex2D(tex_img_ins, j , i-1) ; a6 = tex2D(tex_img_ins, j+1, i-1) ; a7 = tex2D(tex_img_ins, j+2, i-1) ; //troisieme ligne a8 = tex2D(tex_img_ins, j-1, i) ; a9 = tex2D(tex_img_ins, j , i) ; a10 = tex2D(tex_img_ins, j+1, i) ; a11 = tex2D(tex_img_ins, j+2, i) ; //les 2 premiers de la 4eme ligne a12 = tex2D(tex_img_ins, j-1, i+1) ; a13 = tex2D(tex_img_ins, j , i+1) ; //min max aux extremites minmax14(&a0,&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); //chargement valeurs suivante (15) a13 = tex2D(tex_img_ins, j+1, i+1); //minmax aux extremites minmax13(&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); //chargement valeur suivante (16) a13 = tex2D(tex_img_ins, j+2, i+1); //minmax aux extremites minmax12(&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); //chargement valeur suivante (17) a13 = tex2D(tex_img_ins, j-1, i+2); //minmax aux extremites minmax11(&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); //chargement valeur suivante (18) a13 = tex2D(tex_img_ins, j , i+2); //minmax aux extremites minmax10(&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); //chargement valeur suivante (19) a13 = tex2D(tex_img_ins, j+1, i+2); //minmax aux extremites minmax9(&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); //chargement valeur suivante (20) a13 = tex2D(tex_img_ins, j+2, i+2); //minmax aux extmites minmax8(&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); // fin des pixels voisins communs deux pixels centraux b7=a7; b8=a8; b9=a9; b10=a10; b11=a11; b12=a12; b13=a13; //chargement valeur suivante (21) a13 = tex2D(tex_img_ins, j-2, i-2); b13 = tex2D(tex_img_ins, j+3, i-2); //minmax aux extremites minmax7(&a7,&a8,&a9,&a10,&a11,&a12,&a13); minmax7(&b7,&b8,&b9,&b10,&b11,&b12,&b13); //chargement valeur suivante (22) a13 = tex2D(tex_img_ins, j-2, i-1); b13 = tex2D(tex_img_ins, j+3, i-1); //minmax aux extremites minmax6(&a8,&a9,&a10,&a11,&a12,&a13); minmax6(&b8,&b9,&b10,&b11,&b12,&b13); //chargement valeur suivante (23) a13 = tex2D(tex_img_ins, j-2, i ); b13 = tex2D(tex_img_ins, j+3, i ); //minmax aux extremites minmax5(&a9,&a10,&a11,&a12,&a13); minmax5(&b9,&b10,&b11,&b12,&b13); //chargement valeur suivante (24) a13 = tex2D(tex_img_ins, j-2, i+1); b13 = tex2D(tex_img_ins, j+3, i+1); //minmax aux extremites minmax4(&a10,&a11,&a12,&a13); minmax4(&b10,&b11,&b12,&b13); //chargement valeur suivante (25) a13 = tex2D(tex_img_ins, j-2, i+2); b13 = tex2D(tex_img_ins, j+3, i+2); //minmax aux extremites minmax3(&a11,&a12,&a13); minmax3(&b11,&b12,&b13); //median au milieu ! output[ __mul24(i, j_dim) +j ] = a12 ; output[ __mul24(i, j_dim) +j+1 ] = b12 ; } __global__ void kernel_median5_2pix( unsigned char *output, int i_dim, int j_dim) { // coordonnees absolues du point int j = __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x,2) ; int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; /************************************************************************** * tri(s) **************************************************************************/ int a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13 ; int b7, b8, b9, b10, b11, b12, b13 ; /******************************************************************************** * les 14 premieres valeurs (suffisant pour median 5x5 par forgetfull selection) ********************************************************************************/ //premiere ligne a0 = tex2D(tex_img_inc, j-1, i-2) ; a1 = tex2D(tex_img_inc, j , i-2) ; a2 = tex2D(tex_img_inc, j+1, i-2) ; a3 = tex2D(tex_img_inc, j+2, i-2) ; //deuxieme ligne a4 = tex2D(tex_img_inc, j-1, i-1) ; a5 = tex2D(tex_img_inc, j , i-1) ; a6 = tex2D(tex_img_inc, j+1, i-1) ; a7 = tex2D(tex_img_inc, j+2, i-1) ; //troisieme ligne a8 = tex2D(tex_img_inc, j-1, i) ; a9 = tex2D(tex_img_inc, j , i) ; a10 = tex2D(tex_img_inc, j+1, i) ; a11 = tex2D(tex_img_inc, j+2, i) ; //les 2 premiers de la 4eme ligne a12 = tex2D(tex_img_inc, j-1, i+1) ; a13 = tex2D(tex_img_inc, j , i+1) ; //min max aux extremites minmax14(&a0,&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); //chargement valeurs suivante (15) a13 = tex2D(tex_img_inc, j+1, i+1); //minmax aux extremites minmax13(&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); //chargement valeur suivante (16) a13 = tex2D(tex_img_inc, j+2, i+1); //minmax aux extremites minmax12(&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); //chargement valeur suivante (17) a13 = tex2D(tex_img_inc, j-1, i+2); //minmax aux extremites minmax11(&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); //chargement valeur suivante (18) a13 = tex2D(tex_img_inc, j , i+2); //minmax aux extremites minmax10(&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); //chargement valeur suivante (19) a13 = tex2D(tex_img_inc, j+1, i+2); //minmax aux extremites minmax9(&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); //chargement valeur suivante (20) a13 = tex2D(tex_img_inc, j+2, i+2); //minmax aux extmites minmax8(&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); // fin des pixels voisins communs deux pixels centraux b7=a7; b8=a8; b9=a9; b10=a10; b11=a11; b12=a12; b13=a13; //chargement valeur suivante (21) a13 = tex2D(tex_img_inc, j-2, i-2); b13 = tex2D(tex_img_inc, j+3, i-2); //minmax aux extremites minmax7(&a7,&a8,&a9,&a10,&a11,&a12,&a13); minmax7(&b7,&b8,&b9,&b10,&b11,&b12,&b13); //chargement valeur suivante (22) a13 = tex2D(tex_img_inc, j-2, i-1); b13 = tex2D(tex_img_inc, j+3, i-1); //minmax aux extremites minmax6(&a8,&a9,&a10,&a11,&a12,&a13); minmax6(&b8,&b9,&b10,&b11,&b12,&b13); //chargement valeur suivante (23) a13 = tex2D(tex_img_inc, j-2, i ); b13 = tex2D(tex_img_inc, j+3, i ); //minmax aux extremites minmax5(&a9,&a10,&a11,&a12,&a13); minmax5(&b9,&b10,&b11,&b12,&b13); //chargement valeur suivante (24) a13 = tex2D(tex_img_inc, j-2, i+1); b13 = tex2D(tex_img_inc, j+3, i+1); //minmax aux extremites minmax4(&a10,&a11,&a12,&a13); minmax4(&b10,&b11,&b12,&b13); //chargement valeur suivante (25) a13 = tex2D(tex_img_inc, j-2, i+2); b13 = tex2D(tex_img_inc, j+3, i+2); //minmax aux extremites minmax3(&a11,&a12,&a13); minmax3(&b11,&b12,&b13); //median au milieu ! output[ __mul24(i, j_dim) +j ] = a12 ; output[ __mul24(i, j_dim) +j+1 ] = b12 ; } __global__ void kernel_median7( unsigned short*output, int i_dim, int j_dim) { // coordonnees absolues du point int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; /************************************************************************** * tri(s) **************************************************************************/ int a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16,a17,a18,a19,a20,a21,a22,a23,a24,a25 ; /**************************************** * les 26 premieres valeurs ****************************************/ //premiere ligne a0 = tex2D(tex_img_ins, j-2, i-3) ; a1 = tex2D(tex_img_ins, j-1, i-3) ; a2 = tex2D(tex_img_ins, j , i-3) ; a3 = tex2D(tex_img_ins, j+1, i-3) ; a4 = tex2D(tex_img_ins, j+2, i-3) ; a5 = tex2D(tex_img_ins, j+3, i-3) ; //deuxieme ligne a6 = tex2D(tex_img_ins, j-2, i-2) ; a7 = tex2D(tex_img_ins, j-1, i-2) ; a8 = tex2D(tex_img_ins, j , i-2) ; a9 = tex2D(tex_img_ins, j+1, i-2) ; a10 = tex2D(tex_img_ins, j+2, i-2) ; a11 = tex2D(tex_img_ins, j+3, i-2) ; //troisieme ligne a12 = tex2D(tex_img_ins, j-2, i-1) ; a13 = tex2D(tex_img_ins, j-1, i-1) ; a14 = tex2D(tex_img_ins, j , i-1) ; a15 = tex2D(tex_img_ins, j+1, i-1) ; a16 = tex2D(tex_img_ins, j+2, i-1) ; a17 = tex2D(tex_img_ins, j+3, i-1) ; //quatrieme ligne a18 = tex2D(tex_img_ins, j-2, i ) ; a19 = tex2D(tex_img_ins, j-1, i ) ; a20 = tex2D(tex_img_ins, j , i ) ; a21 = tex2D(tex_img_ins, j+1, i ) ; a22 = tex2D(tex_img_ins, j+2, i ) ; a23 = tex2D(tex_img_ins, j+3, i ) ; //cinquieme ligne a24 = tex2D(tex_img_ins, j-2, i+1) ; a25 = tex2D(tex_img_ins, j-1, i+1) ; //min max aux extremites minmax26(&a0,&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeurs suivante (26) a25 = tex2D(tex_img_ins, j , i+1); //minmax aux extremites minmax25(&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (27) a25 = tex2D(tex_img_ins, j+1, i+1); //minmax aux extremites minmax24(&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (28) a25 = tex2D(tex_img_ins, j+2, i+1); //minmax aux extremites minmax23(&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (29) a25 = tex2D(tex_img_ins, j+3, i+1); //minmax aux extremites minmax22(&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (30) a25 = tex2D(tex_img_ins, j-2, i+2); //minmax aux extremites minmax21(&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (31) a25 = tex2D(tex_img_ins, j-1, i+2); //minmax aux extmites minmax20(&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (32) a25 = tex2D(tex_img_ins, j , i+2); //minmax aux extmites minmax19(&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (33) a25 = tex2D(tex_img_ins, j+1, i+2); //minmax aux extmites minmax18(&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (34) a25 = tex2D(tex_img_ins, j+2, i+2); //minmax aux extmites minmax17(&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (35) a25 = tex2D(tex_img_ins, j+3, i+2); //minmax aux extmites minmax16(&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (36) a25 = tex2D(tex_img_ins, j-2, i+3); //minmax aux extmites minmax15(&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (37) a25 = tex2D(tex_img_ins, j-1, i+3); //minmax aux extmites minmax14(&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (38) a25 = tex2D(tex_img_ins, j , i+3); //minmax aux extmites minmax13(&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (39) a25 = tex2D(tex_img_ins, j+1, i+3); //minmax aux extmites minmax12(&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (40) a25 = tex2D(tex_img_ins, j+2, i+3); //minmax aux extmites minmax11(&a15, &a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (41) a25 = tex2D(tex_img_ins, j+3, i+3); //minmax aux extmites minmax10(&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeurs suivantes a25 = tex2D(tex_img_ins, j-3, i+3); //minmax aux extremites minmax9(&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeurs suivantes a25 = tex2D(tex_img_ins, j-3, i+2); //minmax aux extremites minmax8(&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeurs suivantes a25 = tex2D(tex_img_ins, j-3, i+1); //minmax aux extremites minmax7(&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeurs suivantes a25 = tex2D(tex_img_ins, j-3, i); //minmax aux extremites minmax6(&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeurs suivantes a25 = tex2D(tex_img_ins, j-3, i-1); //minmax aux extremites minmax5(&a21,&a22,&a23,&a24,&a25); //chargement valeurs suivantes a25 = tex2D(tex_img_ins, j-3, i-2); //minmax aux extremites minmax4(&a22,&a23,&a24,&a25); //chargement valeurs suivantes a25 = tex2D(tex_img_ins, j-3, i-3); //minmax aux extremites minmax3(&a23,&a24,&a25); //medians au milieu ! output[ __mul24(i, j_dim) +j ] = a24 ; } __global__ void kernel_median7_2pix( unsigned short*output, int i_dim, int j_dim) { // coordonnees absolues du point int j = __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x,2) ; int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; /************************************************************************** * tri(s) **************************************************************************/ int a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16,a17,a18,a19,a20,a21,a22,a23,a24,a25 ; int b17,b18,b19,b20,b21,b22,b23,b24,b25; /**************************************** * les 26 premieres valeurs ****************************************/ //premiere ligne a0 = tex2D(tex_img_ins, j-2, i-3) ; a1 = tex2D(tex_img_ins, j-1, i-3) ; a2 = tex2D(tex_img_ins, j , i-3) ; a3 = tex2D(tex_img_ins, j+1, i-3) ; a4 = tex2D(tex_img_ins, j+2, i-3) ; a5 = tex2D(tex_img_ins, j+3, i-3) ; //deuxieme ligne a6 = tex2D(tex_img_ins, j-2, i-2) ; a7 = tex2D(tex_img_ins, j-1, i-2) ; a8 = tex2D(tex_img_ins, j , i-2) ; a9 = tex2D(tex_img_ins, j+1, i-2) ; a10 = tex2D(tex_img_ins, j+2, i-2) ; a11 = tex2D(tex_img_ins, j+3, i-2) ; //troisieme ligne a12 = tex2D(tex_img_ins, j-2, i-1) ; a13 = tex2D(tex_img_ins, j-1, i-1) ; a14 = tex2D(tex_img_ins, j , i-1) ; a15 = tex2D(tex_img_ins, j+1, i-1) ; a16 = tex2D(tex_img_ins, j+2, i-1) ; a17 = tex2D(tex_img_ins, j+3, i-1) ; //quatrieme ligne a18 = tex2D(tex_img_ins, j-2, i ) ; a19 = tex2D(tex_img_ins, j-1, i ) ; a20 = tex2D(tex_img_ins, j , i ) ; a21 = tex2D(tex_img_ins, j+1, i ) ; a22 = tex2D(tex_img_ins, j+2, i ) ; a23 = tex2D(tex_img_ins, j+3, i ) ; //cinquieme ligne a24 = tex2D(tex_img_ins, j-2, i+1) ; a25 = tex2D(tex_img_ins, j-1, i+1) ; //min max aux extremites minmax26(&a0,&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeurs suivante (26) a25 = tex2D(tex_img_ins, j , i+1); //minmax aux extremites minmax25(&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (27) a25 = tex2D(tex_img_ins, j+1, i+1); //minmax aux extremites minmax24(&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (28) a25 = tex2D(tex_img_ins, j+2, i+1); //minmax aux extremites minmax23(&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (29) a25 = tex2D(tex_img_ins, j+3, i+1); //minmax aux extremites minmax22(&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (30) a25 = tex2D(tex_img_ins, j-2, i+2); //minmax aux extremites minmax21(&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (31) a25 = tex2D(tex_img_ins, j-1, i+2); //minmax aux extmites minmax20(&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (32) a25 = tex2D(tex_img_ins, j , i+2); //minmax aux extmites minmax19(&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (33) a25 = tex2D(tex_img_ins, j+1, i+2); //minmax aux extmites minmax18(&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (34) a25 = tex2D(tex_img_ins, j+2, i+2); //minmax aux extmites minmax17(&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (35) a25 = tex2D(tex_img_ins, j+3, i+2); //minmax aux extmites minmax16(&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (36) a25 = tex2D(tex_img_ins, j-2, i+3); //minmax aux extmites minmax15(&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (37) a25 = tex2D(tex_img_ins, j-1, i+3); //minmax aux extmites minmax14(&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (38) a25 = tex2D(tex_img_ins, j , i+3); //minmax aux extmites minmax13(&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (39) a25 = tex2D(tex_img_ins, j+1, i+3); //minmax aux extmites minmax12(&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (40) a25 = tex2D(tex_img_ins, j+2, i+3); //minmax aux extmites minmax11(&a15, &a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (41) a25 = tex2D(tex_img_ins, j+3, i+3); //minmax aux extmites minmax10(&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); // fin des pixels voisins communs deux pixels centraux b17=a17; b18=a18; b19=a19; b20=a20; b21=a21; b22=a22; b23=a23; b24=a24; b25=a25; //chargement valeurs suivantes a25 = tex2D(tex_img_ins, j-3, i+3); b25 = tex2D(tex_img_ins, j+4, i+3); //minmax aux extremites minmax9(&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); minmax9(&b17,&b18,&b19,&b20,&b21,&b22,&b23,&b24,&b25); //chargement valeurs suivantes a25 = tex2D(tex_img_ins, j-3, i+2); b25 = tex2D(tex_img_ins, j+4, i+2); //minmax aux extremites minmax8(&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); minmax8(&b18,&b19,&b20,&b21,&b22,&b23,&b24,&b25); //chargement valeurs suivantes a25 = tex2D(tex_img_ins, j-3, i+1); b25 = tex2D(tex_img_ins, j+4, i+1); //minmax aux extremites minmax7(&a19,&a20,&a21,&a22,&a23,&a24,&a25); minmax7(&b19,&b20,&b21,&b22,&b23,&b24,&b25); //chargement valeurs suivantes a25 = tex2D(tex_img_ins, j-3, i); b25 = tex2D(tex_img_ins, j+4, i); //minmax aux extremites minmax6(&a20,&a21,&a22,&a23,&a24,&a25); minmax6(&b20,&b21,&b22,&b23,&b24,&b25); //chargement valeurs suivantes a25 = tex2D(tex_img_ins, j-3, i-1); b25 = tex2D(tex_img_ins, j+4, i-1); //minmax aux extremites minmax5(&a21,&a22,&a23,&a24,&a25); minmax5(&b21,&b22,&b23,&b24,&b25); //chargement valeurs suivantes a25 = tex2D(tex_img_ins, j-3, i-2); b25 = tex2D(tex_img_ins, j+4, i-2); //minmax aux extremites minmax4(&a22,&a23,&a24,&a25); minmax4(&b22,&b23,&b24,&b25); //chargement valeurs suivantes a25 = tex2D(tex_img_ins, j-3, i-3); b25 = tex2D(tex_img_ins, j+4, i-3); //minmax aux extremites minmax3(&a23,&a24,&a25); minmax3(&b23,&b24,&b25); //medians au milieu ! output[ __mul24(i, j_dim) +j ] = a24 ; output[ __mul24(i, j_dim) +j+1 ] = b24 ; } __global__ void kernel_median7_2pix( unsigned char *output, int i_dim, int j_dim) { // coordonnees absolues du point int j = __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x,2) ; int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; /************************************************************************** * tri(s) **************************************************************************/ int a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16,a17,a18,a19,a20,a21,a22,a23,a24,a25 ; int b17,b18,b19,b20,b21,b22,b23,b24,b25; /**************************************** * les 26 premieres valeurs ****************************************/ //premiere ligne a0 = tex2D(tex_img_inc, j-2, i-3) ; a1 = tex2D(tex_img_inc, j-1, i-3) ; a2 = tex2D(tex_img_inc, j , i-3) ; a3 = tex2D(tex_img_inc, j+1, i-3) ; a4 = tex2D(tex_img_inc, j+2, i-3) ; a5 = tex2D(tex_img_inc, j+3, i-3) ; //deuxieme ligne a6 = tex2D(tex_img_inc, j-2, i-2) ; a7 = tex2D(tex_img_inc, j-1, i-2) ; a8 = tex2D(tex_img_inc, j , i-2) ; a9 = tex2D(tex_img_inc, j+1, i-2) ; a10 = tex2D(tex_img_inc, j+2, i-2) ; a11 = tex2D(tex_img_inc, j+3, i-2) ; //troisieme ligne a12 = tex2D(tex_img_inc, j-2, i-1) ; a13 = tex2D(tex_img_inc, j-1, i-1) ; a14 = tex2D(tex_img_inc, j , i-1) ; a15 = tex2D(tex_img_inc, j+1, i-1) ; a16 = tex2D(tex_img_inc, j+2, i-1) ; a17 = tex2D(tex_img_inc, j+3, i-1) ; //quatrieme ligne a18 = tex2D(tex_img_inc, j-2, i ) ; a19 = tex2D(tex_img_inc, j-1, i ) ; a20 = tex2D(tex_img_inc, j , i ) ; a21 = tex2D(tex_img_inc, j+1, i ) ; a22 = tex2D(tex_img_inc, j+2, i ) ; a23 = tex2D(tex_img_inc, j+3, i ) ; //cinquieme ligne a24 = tex2D(tex_img_inc, j-2, i+1) ; a25 = tex2D(tex_img_inc, j-1, i+1) ; //min max aux extremites minmax26(&a0,&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeurs suivante (26) a25 = tex2D(tex_img_inc, j , i+1); //minmax aux extremites minmax25(&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (27) a25 = tex2D(tex_img_inc, j+1, i+1); //minmax aux extremites minmax24(&a2,&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (28) a25 = tex2D(tex_img_inc, j+2, i+1); //minmax aux extremites minmax23(&a3,&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (29) a25 = tex2D(tex_img_inc, j+3, i+1); //minmax aux extremites minmax22(&a4,&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (30) a25 = tex2D(tex_img_inc, j-2, i+2); //minmax aux extremites minmax21(&a5,&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (31) a25 = tex2D(tex_img_inc, j-1, i+2); //minmax aux extmites minmax20(&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (32) a25 = tex2D(tex_img_inc, j , i+2); //minmax aux extmites minmax19(&a7,&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (33) a25 = tex2D(tex_img_inc, j+1, i+2); //minmax aux extmites minmax18(&a8,&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (34) a25 = tex2D(tex_img_inc, j+2, i+2); //minmax aux extmites minmax17(&a9,&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (35) a25 = tex2D(tex_img_inc, j+3, i+2); //minmax aux extmites minmax16(&a10,&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (36) a25 = tex2D(tex_img_inc, j-2, i+3); //minmax aux extmites minmax15(&a11,&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (37) a25 = tex2D(tex_img_inc, j-1, i+3); //minmax aux extmites minmax14(&a12,&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (38) a25 = tex2D(tex_img_inc, j , i+3); //minmax aux extmites minmax13(&a13,&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (39) a25 = tex2D(tex_img_inc, j+1, i+3); //minmax aux extmites minmax12(&a14,&a15,&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (40) a25 = tex2D(tex_img_inc, j+2, i+3); //minmax aux extmites minmax11(&a15, &a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); //chargement valeur suivante (41) a25 = tex2D(tex_img_inc, j+3, i+3); //minmax aux extmites minmax10(&a16,&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); // fin des pixels voisins communs deux pixels centraux b17=a17; b18=a18; b19=a19; b20=a20; b21=a21; b22=a22; b23=a23; b24=a24; b25=a25; //chargement valeurs suivantes a25 = tex2D(tex_img_inc, j-3, i+3); b25 = tex2D(tex_img_inc, j+4, i+3); //minmax aux extremites minmax9(&a17,&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); minmax9(&b17,&b18,&b19,&b20,&b21,&b22,&b23,&b24,&b25); //chargement valeurs suivantes a25 = tex2D(tex_img_inc, j-3, i+2); b25 = tex2D(tex_img_inc, j+4, i+2); //minmax aux extremites minmax8(&a18,&a19,&a20,&a21,&a22,&a23,&a24,&a25); minmax8(&b18,&b19,&b20,&b21,&b22,&b23,&b24,&b25); //chargement valeurs suivantes a25 = tex2D(tex_img_inc, j-3, i+1); b25 = tex2D(tex_img_inc, j+4, i+1); //minmax aux extremites minmax7(&a19,&a20,&a21,&a22,&a23,&a24,&a25); minmax7(&b19,&b20,&b21,&b22,&b23,&b24,&b25); //chargement valeurs suivantes a25 = tex2D(tex_img_inc, j-3, i); b25 = tex2D(tex_img_inc, j+4, i); //minmax aux extremites minmax6(&a20,&a21,&a22,&a23,&a24,&a25); minmax6(&b20,&b21,&b22,&b23,&b24,&b25); //chargement valeurs suivantes a25 = tex2D(tex_img_inc, j-3, i-1); b25 = tex2D(tex_img_inc, j+4, i-1); //minmax aux extremites minmax5(&a21,&a22,&a23,&a24,&a25); minmax5(&b21,&b22,&b23,&b24,&b25); //chargement valeurs suivantes a25 = tex2D(tex_img_inc, j-3, i-2); b25 = tex2D(tex_img_inc, j+4, i-2); //minmax aux extremites minmax4(&a22,&a23,&a24,&a25); minmax4(&b22,&b23,&b24,&b25); //chargement valeurs suivantes a25 = tex2D(tex_img_inc, j-3, i-3); b25 = tex2D(tex_img_inc, j+4, i-3); //minmax aux extremites minmax3(&a23,&a24,&a25); minmax3(&b23,&b24,&b25); //medians au milieu ! output[ __mul24(i, j_dim) +j ] = a24 ; output[ __mul24(i, j_dim) +j+1 ] = b24 ; } /***************************************************************************** * median generic shared mem + forgetfull *****************************************************************************/ __global__ void kernel_medianForgetRSH( unsigned short * output, int i_dim, int j_dim, int r) { // coordonnees absolues du point int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; unsigned short ic, jc ; unsigned short Nreg = ((2*r+1)*(2*r+1))/2 + 2 ; int bdimX = blockDim.x; //<<1 ; // pour le cas \E0 2 pixels par thread int tidX = threadIdx.x; //<<1 ; // chargement en smem int idrow = threadIdx.y*(blockDim.x+2*r) ; extern __shared__ int medRoi[]; // bloc 0 (en haut \E0 gauche) medRoi[ idrow + threadIdx.x ] = tex2D(tex_img_ins, j-r, i-r) ; // bloc 1 (en haut \E0 droite)... if ( threadIdx.x < 2*r ) //...ou plutot ce qu'il en manque medRoi[ idrow + threadIdx.x + blockDim.x ] = tex2D( tex_img_ins, j+blockDim.x-r, i-r ) ; // bloc 2 ( en bas \E0 gauche) if ( threadIdx.y < 2*r ) { idrow = (threadIdx.y+blockDim.y)*(blockDim.x+2*r) ; medRoi[ idrow + threadIdx.x ] = tex2D( tex_img_ins, j-r, i+blockDim.y-r ) ; //bloc 4 ( en bas \E0 doite )... if ( threadIdx.x < 2*r ) //...ou ce qu'il en manque medRoi[ idrow + threadIdx.x + blockDim.x ] = tex2D( tex_img_ins, j+blockDim.x-r, i+blockDim.y-r ) ; } __syncthreads() ; // remplissage du vecteur de tri minmax unsigned short vect[8066] ; int Freg=Nreg ; for (ic=0; ic<2*r+1; ic++) { for (jc=0; jc<2*r+1; jc++) { if ( ic*(2*r+1)+jc < Nreg ) { vect[ ic*(2*r+1)+jc ] = medRoi[ (threadIdx.y+ic)*(bdimX+2*r)+ (tidX+jc) ] ; } else { minmaxN(vect, Freg--) ; vect[ Nreg-1 ] = medRoi[ (threadIdx.y+ic)*(bdimX+2*r)+ (tidX+jc) ] ; } } } minmax3(&vect[Nreg-3], &vect[Nreg-2], &vect[Nreg-1]) //medRoi[ (threadIdx.y+ic)*(bdimX+L-1)+ (tidX+jc) ] output[ __mul24(i, j_dim) +j ] = vect[ Nreg-2 ]; } /***************************************************************************** * median generic shared mem + forgetfull *****************************************************************************/ __global__ void kernel_medianForgetR( unsigned short * output, int i_dim, int j_dim, int r) { // coordonnees absolues du point int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; unsigned short ic, jc ; unsigned short Nreg = ((2*r+1)*(2*r+1))/2 + 2 ; // remplissage du vecteur de tri minmax unsigned short vect[8066] ; int Freg=Nreg ; for (ic=0; ic<2*r+1; ic++) { for (jc=0; jc<2*r+1; jc++) { if ( ic*(2*r+1)+jc < Nreg ) { vect[ ic*(2*r+1)+jc ] = tex2D(tex_img_ins, j-r+jc, i-r+ic) ; } else { minmaxN(vect, Freg--) ; vect[ Nreg-1 ] = tex2D(tex_img_ins, j-r+jc, i-r+ic) ; } } } minmax3(&vect[Nreg-3], &vect[Nreg-2], &vect[Nreg-1]) //medRoi[ (threadIdx.y+ic)*(bdimX+L-1)+ (tidX+jc) ] output[ __mul24(i, j_dim) +j ] = vect[ Nreg-2 ]; } /********************************************************************************** * MEDIAN PSEUDO-SEPARABLE POUR GRANDS NOYAUX **********************************************************************************/ __global__ void kernel_medianSepR( unsigned short *output, int i_dim, int j_dim, int r) { int idc, val, min, max, inf, egal, sup, mxinf, minsup, estim ; //coordonnées ds le bloc int ib = threadIdx.y ; int jb = threadIdx.x ; //int idx_h = __mul24(ib+r,blockDim.x) + jb ; // index pixel deans shmem (bloc+halo) //int offset = __mul24(blockDim.x,r) ; // coordonnees absolues du point int j = __mul24(blockIdx.x,blockDim.x) + jb ; int i = __mul24(blockIdx.y,blockDim.y) + ib ; extern __shared__ int buff[] ; /*********************************************************************************** * CHARGEMENT DATA EN SHARED MEM ***********************************************************************************/ for (idc = 0 ; idc < (2*(blockDim.y+r)-1)/blockDim.y; idc++) { if (idc*blockDim.y +ib < i_dim) buff[ (idc*blockDim.y +ib)*blockDim.x + jb ] = tex2D(tex_img_ins, j, i-r+ idc*blockDim.y) ; } __syncthreads() ; /********************************************************************************************** * TRI VERTICAL par algo TORBEN MOGENSEN * (a little bit slow but saves memory => faster !) **********************************************************************************************/ min = max = buff[ ib*blockDim.x +jb] ; for (idc= 0 ; idc< 2*r+1 ; idc++ ) { val = buff[ __mul24(ib+idc, blockDim.x) +jb ] ; if ( val < min ) min = val ; if ( val > max ) max = val ; } while (1) { estim = (min+max)/2 ; inf = sup = egal = 0 ; mxinf = min ; minsup= max ; for (idc =0; idc< 2*r+1 ; idc++) { val = buff[ __mul24(ib+idc, blockDim.x) +jb ] ; if( val < estim ) { inf++; if( val > mxinf) mxinf = val ; } else if (val > estim) { sup++; if( val < minsup) minsup = val ; } else egal++ ; } if ( (inf <= (r+1))&&(sup <=(r+1)) ) break ; else if (inf>sup) max = mxinf ; else min = minsup ; } if ( inf >= r+1 ) val = mxinf ; else if (inf+egal >= r+1) val = estim ; else val = minsup ; output[ __mul24(j, i_dim) +i ] = val ; } /** * * correction de staircase */ __global__ void kernel_staircase_reduc3(unsigned int * img_out, unsigned int L, unsigned int H) { // coordonnees du point dans le bloc //unsigned int iib = threadIdx.x ; //unsigned int jib = threadIdx.y ; // coordonnees du point dans l'image unsigned int y = blockIdx.y*blockDim.y + threadIdx.y; unsigned int x = blockIdx.x*blockDim.x + threadIdx.x; int a, b, c, d, e, f, g, h, i ; // gl des voisins float wa, wb, wc, wd, we, wf, wg, wh, wi ; // poids float S1, S2, S11, S22, S12, S0, Sx, Sx1, Sx2 ; float c1,c2 ; // chargement des valeurs GL des pixels du voisinage a = tex2D(tex_img_estim, x-1, y-1) ; b = tex2D(tex_img_estim, x , y-1) ; c = tex2D(tex_img_estim, x+1, y-1) ; d = tex2D(tex_img_estim, x-1, y ) ; e = tex2D(tex_img_estim, x , y ) ; f = tex2D(tex_img_estim, x+1, y ) ; g = tex2D(tex_img_estim, x-1, y+1) ; h = tex2D(tex_img_estim, x , y+1) ; i = tex2D(tex_img_estim, x+1, y+1) ; wa = tex1D(tex_noyau, abs(a-e)) ; wb = tex1D(tex_noyau, abs(b-e)) ; wc = tex1D(tex_noyau, abs(c-e)) ; wd = tex1D(tex_noyau, abs(d-e)) ; we = 1 ; wf = tex1D(tex_noyau, abs(f-e)) ; wg = tex1D(tex_noyau, abs(g-e)) ; wh = tex1D(tex_noyau, abs(h-e)) ; wi = tex1D(tex_noyau, abs(i-e)) ; //construction des elements du systeme lineaire S0 = wa+wb+wc+wd+we+wf+wg+wh+wi ; S1 = wc+wf+wi-wa-wd-wg ; S2 = wg+wh+wi-wa-wb-wc ; Sx = wa*a + wb*b + wc*c + wd*d + we*e + wf*f + wg*g + wh*h + wi*i ; Sx1 = wc*c + wf*f + wi*i - wa*a - wd*d - wg*g ; Sx2 = wg*g + wh*h + wi*i - wa*a - wb*b - wc*c ; S11 = wc+wf+wi+wa+wd+wg ; S22 = wg+wh+wi+wa+wb+wc ; S12 = wa + wi -wc -wg ; c1 = S22*(S11*Sx-S1*Sx1) + S12*(S1*Sx2-S12*Sx) + S2*(S12*Sx1-S11*Sx2) ; c2 = S22*(S11*S0-S1*S1) + S12*(S2*S1-S12*S0) + S2*(S1*S12-S2*S11) ; img_out[y*L + x] = c1/c2 ; }