src/lib_kernels_cumuls.cu~

   1
   2
   3
   4 __global__ void calcul_cumuls_gpu(unsigned short * img, t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2, uint32 h, uint32 l, uint64 * gsomblocs, unsigned int N_BLOCS_LIGNE, unsigned int baseline, unsigned int nb_lines )
   5 {
   6   int bs = blockDim.x ;
   7   int offsetSom = N_BLOCS_LIGNE * nb_lines;         //indice du premier element de cumul_x2 dans le tableau
   8
   9   Int tib = threadIdx.x ;                           //indice du thread dans le bloc
  10   int idx = blockIdx.x*bs + tib ;                   //indice global du thread dans la portion d'image
  11   int num_ligne = blockIdx.x / N_BLOCS_LIGNE ;      //indice de la ligne du thread dans la portion d'image
  12   int til = idx - num_ligne*bs*N_BLOCS_LIGNE ;      //indice du thread ds la ligne
  13   int pos = num_ligne*l + til ;                     //indice du pixel dans la 'tranche' courante
  14   int pos_img = pos + baseline*l ;                  //indice correspondant dans l'image originale
  15   int id, stride=1;
  16
  17   extern __shared__ tcumuls sdata[];
  18   //chargement en smem avec complement de 0
  19   id = CFI(tib);
  20   if (til < l){
  21             sdata[id].x = img[ pos_img ] ;
  22             sdata[id].x2 = img[ pos_img ]*img[ pos_img ] ;
  23          } else {
  24            sdata[id].x = 0 ;
  25            sdata[id].x2 = 0 ;
  26         }
  27         /*
  28          *prefix sum en smem
  29          */
  30         //passe 1 : construction des sommes
  31
  32         for (int d = bs/2; d > 0; d >>= 1)
  33           {
  34                 __syncthreads();
  35
  36                 if (tib < d)
  37                   {
  38                         int i  = __mul24(__mul24(2, stride), tib);
  39                         int ai = i + stride - 1;
  40                         int bi = CFI(ai + stride);
  41
  42                         ai = CFI(ai);
  43                         //bi = CFI(bi);
  44
  45                         sdata[bi].x += sdata[ai].x;
  46                         sdata[bi].x2 += sdata[ai].x2;
  47
  48                   }
  49                 stride *= 2;
  50           }
  51
  52         //stockage somme du bloc en gmem
  53         // et
  54         //mise a zero dernier element du bloc
  55
  56         __syncthreads();
  57
  58         if (tib == bs-1) {
  59           t_cumul_x som_x = sdata[CFI(tib)].x;
  60           t_cumul_x2 som_x2 = sdata[CFI(tib)].x2;
  61           if (til < l){                                 //il ne faut pas ecrire de somme dans le dernier bloc, elle serait hors image
  62              cumul_x[ pos_img ] = som_x;
  63              cumul_x2[pos_img ] = som_x2;
  64              }
  65           gsomblocs[blockIdx.x] = som_x;
  66           gsomblocs[blockIdx.x + offsetSom] = som_x2;
  67           sdata[CFI(tib)].x = 0;
  68           sdata[CFI(tib)].x2 = 0;
  69         }
  70
  71         //Passe 2 : construction des scans
  72         for (int d = 1; d <= (bs/2); d *= 2)
  73           {
  74                 stride >>= 1;
  75                 __syncthreads();
  76
  77                 if (tib < d)
  78                   {
  79                         int i  = __mul24(__mul24(2, stride), tib);
  80                         int ai = i + stride - 1;
  81                         int bi = CFI(ai + stride);
  82
  83                         ai = CFI(ai);
  84                         //bi = CFI(bi);
  85
  86                         t_cumul_x tx = sdata[ai].x;
  87                         t_cumul_x2 tx2 = sdata[ai].x2;
  88
  89                         sdata[ai].x = sdata[bi].x;
  90                         sdata[bi].x += tx;
  91
  92                         sdata[ai].x2 = sdata[bi].x2;
  93                         sdata[bi].x2 += tx2;
  94                   }
  95           }
  96
  97         //transfert en gmem des scans
  98         //il faut décaler à gauche de 1 element
  99         __syncthreads();
 100         if ( (til < l) && ( tib < (bs-1) ) ){
 101                 cumul_x[ pos_img ] = sdata[CFI(tib+1)].x;
 102                 cumul_x2[pos_img ] = sdata[CFI(tib+1)].x2;
 103          }
 104
 105 }
 106
 107
 108
 109 /*
 110   kernel pour faire le prefixsum des sommes de blocs des lignes de l'image
 111   les params d'executions kivonbien sont :
 112   - threads : la moitié de la nextpow2 du nb de blocs par ligne
 113   - grid : 2 x le nombre de blocs de sommes dans l'image , soit le nb_blocs_par_ligne x nb_lignes x 2
 114   - shared mem : 2 x nextPow2(nb_blocs par ligne)
 115 */
 116
 117
 118
 119 __global__ void scan_somblocs(uint64 * g_idata, int n_bl_l){
 120
 121   extern __shared__  uint64 temp[];
 122
 123   int tib = threadIdx.x;
 124   int toff = blockIdx.x*n_bl_l;         // offset des positions des sommes partielles dans le tableau g_idata
 125   int nmax = blockDim.x*2;              //nb max de sommes partielles par ligne
 126
 127
 128   int aai = tib;
 129   int bbi = tib + blockDim.x;
 130
 131
 132   // chargement data en smem
 133   temp[ CFI(aai) ] = (aai < n_bl_l)? g_idata[ toff + aai ] : 0;
 134   temp[ CFI(bbi) ] = (bbi < n_bl_l)? g_idata[ toff + bbi ] : 0;
 135
 136   int offset = 1;
 137
 138   // passe 1 de bas en haut
 139   for (int d = nmax/2; d > 0; d >>= 1)
 140     {
 141           __syncthreads();
 142
 143           if (tib < d)
 144         {
 145                   int ai = CFI(offset*(2*tib+1)-1);
 146                   int bi = CFI(offset*(2*tib+2)-1);
 147
 148                   temp[bi] += temp[ai];
 149         }
 150
 151           offset *= 2;
 152     }
 153
 154   // zero le dernier element
 155   __syncthreads();
 156   if (tib == 0)
 157     {
 158           int index = CFI(nmax - 1);
 159           g_idata[ toff + n_bl_l - 1 ] =  temp[ index ]; //memorisation de la somme du bloc a sa place en gmem
 160           temp[index] = 0;
 161     }
 162
 163   // passe 2 de haut en bas
 164   for (int d = 1; d < nmax; d *= 2)
 165     {
 166           offset /= 2;
 167
 168           __syncthreads();
 169
 170           if (tib < d)
 171         {
 172                   int ai = CFI(offset*(2*tib+1)-1);
 173                   int bi = CFI(offset*(2*tib+2)-1);
 174
 175                   uint64 t  = temp[ai];
 176                   temp[ai] = temp[bi];
 177                   temp[bi] += t; //tester atomicadd 64 bits
 178                   }
 179
 180     }
 181
 182   __syncthreads();
 183
 184   // transfert resultats en gmem decales d'un element vers la gauche
 185   g_idata[ toff + aai ] = temp[ CFI(aai + 1) ]; //pour le demi tableau smem inferieur
 186   if ( bbi < n_bl_l-1 ) g_idata[ toff + bbi ] = temp[CFI( bbi + 1) ]; //demi tableau smem sup. sauf  dernier=somme.
 187
 188 }
 189
 190
 191 __global__ void add_soms_to_cumuls(t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2, uint32 h, uint32 l, uint64 * gsomblocs,
 192                                                                    unsigned int  N_BLOCS_LIGNE, unsigned int baseline, unsigned int nb_lines){
 193
 194   int bs = blockDim.x ;
 195   int offsetSom = N_BLOCS_LIGNE * nb_lines;
 196   int tib = threadIdx.x ;                           //indice du thread dans le bloc
 197   int idx = blockIdx.x*blockDim.x + tib ;           //indice global du thread
 198   int num_ligne = blockIdx.x / N_BLOCS_LIGNE ;      //indice de la ligne du thread dans l'image partielle
 199   int til = idx - num_ligne*bs*N_BLOCS_LIGNE ;      //indice du thread ds la ligne
 200   int pos = num_ligne*l + til;
 201   int pos_img = pos + baseline*l ;
 202
 203
 204   //chargement des valeurs a ajouter en smem 0<-->x et 1<-->x2
 205   uint64 __shared__ ajout[2];
 206
 207   if ( til >= bs ){
 208           ajout[0] = gsomblocs[blockIdx.x -1 ];
 209           ajout[1] = gsomblocs[blockIdx.x -1 + offsetSom];
 210         __syncthreads(); //tester les nouveaux __sync__
 211
 212         //addition par bloc
 213         if (til < l) {
 214           cumul_x[pos_img] += ajout[0];
 215           cumul_x2[pos_img]+= ajout[1];
 216         }
 217   }
 218 }
 219
 220 //calcul des SUM_1, SUM_X et SUM_X2 de l'image
 221
 222 __global__ void calcul_stats_image( t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2, uint32 h, uint32 l, uint64 * d_stats_snake){
 223   uint64 sigX = 0, sigX2 = 0 ;
 224   for (int i=l-1; i<h*l; i+=l)
 225         {
 226           sigX += cumul_x[i] ;
 227           sigX2+= cumul_x2[i];
 228         }
 229   d_stats_snake[3] = h*l ;
 230   d_stats_snake[4] = sigX ;
 231   d_stats_snake[5] = sigX2;
 232
 233 }