src/lib_kernel_snake_2_gpu.cu

   1
   2
   3 __global__ void genere_snake_rectangle_4nodes_gpu(snake_node_gpu * d_snake, int dist_bords, int i_dim, int j_dim){
   4   if (threadIdx.x == 0){
   5         int n = 0;
   6         /* n0 */
   7         d_snake[n].posi = dist_bords ;
   8         d_snake[n].posj = dist_bords ;
   9         n++ ;
  10         /* n1 */
  11         d_snake[n].posi = i_dim - dist_bords ;
  12         d_snake[n].posj = dist_bords ;
  13         n++ ;
  14         /* n2 */
  15         d_snake[n].posi = i_dim - dist_bords ;
  16         d_snake[n].posj = j_dim - dist_bords ;
  17         n++ ;
  18         /* n3 */
  19         d_snake[n].posi = dist_bords ;
  20         d_snake[n].posj = j_dim - dist_bords ;
  21
  22         for (int i=0; i<4; i++)
  23           {
  24                 d_snake[i].freeman_in = 0;
  25                 d_snake[i].freeman_out = 0;
  26                 d_snake[i].centre_i = 0;
  27                 d_snake[i].centre_j = 0;
  28                 d_snake[i].last_move = 0;
  29                 d_snake[i].nb_pixels = 123;
  30                 d_snake[i].code_segment = 0;
  31
  32           }
  33   }
  34 }
  35
  36 __global__ void genere_diagos_rectangle(uint4 * d_diagos, int h, int l, int q, int * n_diagos){
  37   int inci = h/q;
  38   int incj = l/q;
  39   int iM,jM, iN, jN ;
  40   int idxDiago = 0 ;
  41         // boucle double pour les positions du point NO de la diagonale
  42         for ( iM = 0; iM < q-1; iM++){
  43           for (jM = 0 ; jM < q-1 ; jM++){
  44                 //boucle double pour les positions du point SE de la diagonale
  45                 for (iN = iM+1; iN < q; iN++){
  46                   for (jN = jM+1; jN < q; jN++){
  47                         d_diagos[idxDiago].x = iM*inci;
  48                         d_diagos[idxDiago].y = jM*incj;
  49                         d_diagos[idxDiago].z = iN*inci;
  50                         d_diagos[idxDiago].w = jN*incj;
  51                         idxDiago++;
  52                   }
  53                 }
  54           }
  55         }
  56         *n_diagos = --idxDiago ;
  57 }
  58
  59 __global__ void genere_snake_rectangle_Nnodes_gpu(snake_node_gpu * d_snake, int dist_bords, int i_dim, int j_dim){
  60   int nb_node_seg = 9 ;
  61   int limite = 64 ;
  62
  63   int i , h= i_dim-2*dist_bords, l= j_dim-2*dist_bords ;
  64   int inch = h/(nb_node_seg+1), incl= l/(nb_node_seg+1) ;
  65   if (threadIdx.x == 0){
  66         int n = 0;
  67         /* n0 */
  68         d_snake[n].posi = dist_bords ;
  69         d_snake[n].posj = dist_bords ;
  70         n++ ;
  71         /*entre sommet 0 et 1*/
  72         i = 0 ;
  73         while (i < nb_node_seg)
  74           {
  75                 if ( (d_snake[n-1].posi + inch)-(i_dim - dist_bords) > limite )
  76                   d_snake[n].posi = d_snake[n-1].posi + inch ;
  77                 else
  78                   d_snake[n].posi = d_snake[n-1].posi + inch/2 ;
  79                 d_snake[n].posj = dist_bords ;
  80                 d_snake[n-1].nb_pixels =  d_snake[n].posi -  d_snake[n-1].posi ;
  81                 n++ ; i++ ;
  82           }
  83         /* n1 */
  84         d_snake[n].posi = i_dim - dist_bords ;
  85         d_snake[n].posj = dist_bords ;
  86         d_snake[n-1].nb_pixels =  d_snake[n].posi -  d_snake[n-1].posi ;
  87         n++ ;
  88         /*entre S1 et S2*/
  89         i = 0 ;
  90         while (i< nb_node_seg)
  91           {
  92                 if ( (j_dim - dist_bords) - (d_snake[n-1].posj + incl) > limite )
  93                   d_snake[n].posj = d_snake[n-1].posj + incl ;
  94                 else
  95                   d_snake[n].posj = d_snake[n-1].posj + incl/2 ;
  96                 d_snake[n].posi = i_dim - dist_bords ;
  97                 d_snake[n-1].nb_pixels =  d_snake[n].posj -  d_snake[n-1].posj ;
  98                 n++ ; i++ ;
  99           }
 100         /* n2 */
 101         d_snake[n].posi = i_dim - dist_bords ;
 102         d_snake[n].posj = j_dim - dist_bords ;
 103         d_snake[n-1].nb_pixels =  d_snake[n].posj -  d_snake[n-1].posj ;
 104         n++ ;
 105         /*entre S2 et S3*/
 106         i = 0 ;
 107         while (i< nb_node_seg)
 108           {
 109                 if ( (d_snake[n-1].posi - inch) - dist_bords > limite )
 110                   d_snake[n].posi = d_snake[n-1].posi - inch ;
 111                 else
 112                   d_snake[n].posi = d_snake[n-1].posi - inch/2 ;
 113                 d_snake[n].posj = j_dim - dist_bords ;
 114                 d_snake[n-1].nb_pixels =  d_snake[n-1].posi -  d_snake[n].posi ;
 115                 n++ ; i++ ;
 116           }
 117         /* n3 */
 118         d_snake[n].posi = dist_bords ;
 119         d_snake[n].posj = j_dim - dist_bords ;
 120         d_snake[n-1].nb_pixels =  d_snake[n-1].posi -  d_snake[n].posi ;
 121         n++ ;
 122         /*entre S3 et S0*/
 123         i = 0 ;
 124         while (i< nb_node_seg)
 125           {
 126                 if ( (d_snake[n-1].posj - incl) - dist_bords > limite)
 127                   d_snake[n].posj = d_snake[n-1].posj - incl ;
 128                 else
 129                   d_snake[n].posj = d_snake[n-1].posj - incl/2 ;
 130                 d_snake[n].posi = dist_bords ;
 131                 d_snake[n-1].nb_pixels =  d_snake[n-1].posj -  d_snake[n].posj ;
 132                 n++ ; i++ ;
 133           }
 134         d_snake[n-1].nb_pixels =  d_snake[n-1].posj -  d_snake[0].posj ;
 135         for (i=0; i<n; i++)
 136           {
 137                 d_snake[i].freeman_in = 0;
 138                 d_snake[i].freeman_out = 0;
 139                 d_snake[i].centre_i = 0;
 140                 d_snake[i].centre_j = 0;
 141                 d_snake[i].last_move = 1;
 142                 d_snake[i].code_segment = 0;
 143
 144           }
 145   }
 146 }
 147
 148 __global__ void calcul_contribs_segments_snake(snake_node_gpu * d_snake, int nb_nodes,
 149                                                                                            t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2,
 150                                                                                            int l, uint2 * liste_pix, t_sum_x2 * gsombloc, int * d_table_freeman)
 151 {
 152   // indices des elements
 153   int blockSize = blockDim.x ;
 154   int tib = threadIdx.x ;
 155   int nblocs_seg =  gridDim.x / nb_nodes ;
 156   int idx = blockDim.x*blockIdx.x + threadIdx.x ;
 157   int segment = blockIdx.x / nblocs_seg ;
 158   int tis = idx - segment*nblocs_seg*blockDim.x ;
 159
 160   //tab pour coordonnées pixels & contribs pixels de taille = (blockDim.x+offset(dec,dec2) )*(sizeof(t_sum_1+t_sum_x+t_sum_x2))
 161   extern __shared__ t_sum_1 scumuls_1[] ; // blockDim varie selon la longueur des segments => taille smem dynamique
 162   t_sum_x* scumuls_x = (t_sum_x*) &scumuls_1[CFI(blockDim.x)] ;
 163   t_sum_x2* scumuls_x2 = (t_sum_x2*) &scumuls_x[CFI(blockDim.x)] ;
 164
 165   //indices des noeuds
 166   uint x1, y1, x2, y2 ;
 167   int n1, n2 ;
 168
 169   n1 = segment ;
 170   n2 = segment +1 ;
 171   //gestion du bouclage du snake
 172   if (n2 >= nb_nodes) n2 = 0 ;
 173
 174   //affectation des differentes positions aux différents segments 'blocs de threads'
 175         x1 = d_snake[n1].posj ;
 176         y1 = d_snake[n1].posi ;
 177         x2 = d_snake[n2].posj ;
 178         y2 = d_snake[n2].posi ;
 179
 180   //params des deplacements
 181   int dx=x2-x1;
 182   int dy=y2-y1;
 183   uint abs_dx = ABS(dx);
 184   uint abs_dy = ABS(dy);
 185   uint nb_pix = abs_dy>abs_dx?(abs_dy+1):(abs_dx+1); // alternative -> lecture ds liste_points[]
 186   int incx=0, incy=0;
 187   uint2 p ;
 188   int xprec, xsuiv ;
 189
 190   //calcul liste des pixels du segment (x1,y1)-(x2,y2)
 191   if (dy > 0) incy=1; else incy=-1 ;
 192   if (dx > 0) incx=1; else incx=-1 ;
 193
 194   if (tis < nb_pix){
 195         if (abs_dy > abs_dx){
 196           //1 thread par ligne
 197           double k = (double)dx/dy ;
 198           p.x = y1 + incy*tis ;
 199           p.y =  x1 + floor((double)incy*k*tis+0.5) ;
 200           //enreg. coords. pixels en global mem pour freemans
 201
 202           if ((tis < 2)||(tis > nb_pix - 3)||(tis == nb_pix/2))
 203                 {
 204                   liste_pix[idx].x = p.x ;
 205                   liste_pix[idx].y = p.y ;
 206                   }
 207
 208         } else {
 209           //1 thread par colonne
 210           double k=(double)dy/dx ;
 211           p.x = y1 + floor((double)(incx*k*tis)+0.5) ;
 212           p.y = x1 + incx*tis ;
 213           if ( tis > 0 ){
 214                 xsuiv = y1 + floor((double)(incx*k*(tis+1))+0.5) ;
 215                 xprec = y1 + floor((double)(incx*k*(tis-1))+0.5) ;
 216           }
 217           //enreg. coords. pixels en global mem pour freeman
 218           //TODO
 219           //on peut calculer les freemans des segments
 220           //sans stocker l'ensemble des valeurs des pixels
 221           //juste avec les derivees aux extremites a calculer ici
 222
 223           if ((tis < 2)||(tis > nb_pix - 3)||(tis == nb_pix/2))
 224                 {
 225                   liste_pix[idx].x = p.x ;
 226                   liste_pix[idx].y = p.y ;
 227                   }
 228
 229         }
 230   }
 231   __syncthreads();
 232
 233   //calcul contribs individuelles des pixels
 234
 235   if ( (tis >0) && (tis < nb_pix-1)
 236            && ( (abs_dy <= abs_dx)
 237                         && ( (xprec > p.x) || (xsuiv > p.x))
 238                         || (abs_dy > abs_dx) ) )
 239         {
 240         int pos = p.x * l + p.y ;
 241         scumuls_1[ CFI(tib)] = 1+p.y;
 242         scumuls_x[ CFI(tib)] = cumul_x[ pos ] ;
 243         scumuls_x2[CFI(tib)] = cumul_x2[ pos ];
 244   } else {
 245         scumuls_1[ CFI(tib)] = 0;
 246         scumuls_x[ CFI(tib)] = 0;
 247         scumuls_x2[CFI(tib)] = 0;
 248   }
 249
 250    __syncthreads();
 251   //somme des contribs individuelles
 252   // unroll des  sommes partielles en smem
 253
 254   if (blockSize >= 512) {
 255         if (tib < 256) {
 256           scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib + 256) ];
 257           scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib + 256) ];
 258           scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib + 256) ];
 259         }
 260         __syncthreads();
 261   }
 262
 263   if (blockSize >= 256) {
 264         if (tib < 128) {
 265           scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib + 128) ];
 266           scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib + 128) ];
 267           scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib + 128) ];
 268         }
 269         __syncthreads();
 270   }
 271   if (blockSize >= 128) {
 272         if (tib <  64) {
 273           scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib +  64) ];
 274           scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib +  64) ];
 275           scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib +  64) ];
 276         }
 277         __syncthreads();
 278   }
 279
 280   //32 threads <==> 1 warp
 281   if (tib < 32)
 282         {
 283           {
 284                 scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib + 32) ];
 285                 scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib + 32) ];
 286                 scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib + 32) ];
 287           }
 288           {
 289                 scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib + 16) ];
 290                 scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib + 16) ];
 291                 scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib + 16) ];
 292           }
 293           {
 294                 scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib +  8) ];
 295                 scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib +  8) ];
 296                 scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib +  8) ];
 297           }
 298           scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib +  4) ];
 299           scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib +  4) ];
 300           scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib +  4) ];
 301
 302           scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib +  2) ];
 303           scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib +  2) ];
 304           scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib +  2) ];
 305
 306           scumuls_1[ CFI(tib)] += scumuls_1[ CFI(tib +  1) ];
 307           scumuls_x[ CFI(tib)] += scumuls_x[ CFI(tib +  1) ];
 308           scumuls_x2[CFI(tib)] += scumuls_x2[CFI(tib +  1) ];
 309         }
 310
 311   // resultat sommes partielles en gmem
 312   if (tib == 0) {
 313         gsombloc[ blockIdx.x ] = (t_sum_x2) scumuls_1[0];
 314         gsombloc[ blockIdx.x + gridDim.x ] = (t_sum_x2) scumuls_x[0];
 315         gsombloc[ blockIdx.x + 2*gridDim.x ] = (t_sum_x2) scumuls_x2[0];
 316   }
 317
 318   //calculs freemans, centre et code segment
 319   //1 uint4 par segment
 320
 321   int Di, Dj;
 322   if (tis == 0){
 323         Di = 1 + liste_pix[idx+1].x - liste_pix[idx].x ;
 324         Dj = 1 + liste_pix[idx+1].y - liste_pix[idx].y ;
 325         d_snake[segment].freeman_out = d_table_freeman[3*Di + Dj] ;
 326         //code seg
 327         if (dy > 0 ) d_snake[segment].code_segment = -1 ;
 328         if (dy < 0 ) d_snake[segment].code_segment = 1 ;
 329         if (dy == 0) d_snake[segment].code_segment = 0 ;
 330   }
 331
 332   if (tis == nb_pix-1){
 333         Di = 1 + liste_pix[idx].x - liste_pix[idx-1].x  ;
 334         Dj = 1 + liste_pix[idx].y - liste_pix[idx-1].y;
 335         d_snake[segment].freeman_in = d_table_freeman[3*Di + Dj] ;
 336   }
 337
 338   if (tis == (nb_pix/2)){
 339         d_snake[segment].centre_i = liste_pix[idx].x ;
 340         d_snake[segment].centre_j = liste_pix[idx].y ;
 341         }
 342 }
 343
 344 /*
 345   sommme des contribs par bloc -> contribs segment, pour le snake
 346
 347   execution sur : 1bloc / 1 thread par segment
 348  */
 349
 350 __global__ void somsom_snake(t_sum_x2 * somblocs, int nb_nodes, unsigned int nb_bl_seg, snake_node_gpu * d_snake){
 351
 352   t_sum_x2 sdata[3];
 353   unsigned int seg = blockIdx.x ;
 354
 355   //un thread par segment
 356   {
 357         sdata[0] = 0;
 358         sdata[1] = 0;
 359         sdata[2] = 0;
 360   }
 361
 362   for (int b=0; b < nb_bl_seg ; b++){
 363         sdata[0] += somblocs[seg*nb_bl_seg + b];
 364         sdata[1] += somblocs[(seg + nb_nodes)*nb_bl_seg + b];
 365         sdata[2] += somblocs[(seg + 2*nb_nodes)*nb_bl_seg + b];
 366   }
 367
 368   //totaux en gmem
 369   {
 370         d_snake[seg].sum_1 = sdata[0];
 371         d_snake[seg].sum_x = sdata[1];
 372         d_snake[seg].sum_x2 = sdata[2];
 373   }
 374 }
 375
 376 __device__ double codage_gl_gauss(uint64 stat_sum_1, uint64 stat_sum_x, uint64 stat_sum_x2,
 377                                                                                    uint64 n_dim, uint64 SUM_X, uint64 SUM_X2){
 378   uint64 stat_sum_xe ;  /* somme des xn region exterieure */
 379   uint32 ne ;             /* nombre de pixel region exterieure */
 380   double sigi2, sige2; /* variance region interieure et exterieure */
 381
 382   /* variance des valeurs des niveaux de gris a l'interieur du snake */
 383   sigi2 =
 384     ((double)stat_sum_x2/(double)stat_sum_1) -
 385     ((double)stat_sum_x/(uint64)stat_sum_1)*((double)stat_sum_x/(uint64)stat_sum_1) ;
 386
 387   /* variance des valeurs des niveaux de gris a l'exterieur du snake */
 388   ne = n_dim-stat_sum_1 ;
 389   stat_sum_xe = SUM_X - stat_sum_x ;
 390   sige2 =
 391     ((double)SUM_X2-stat_sum_x2)/(double)ne -
 392     ((double)stat_sum_xe/(uint64)ne)*((double)stat_sum_xe/(uint64)ne) ;
 393
 394   if ((sigi2 > 0)|(sige2 > 0))
 395   return  0.5*((double)stat_sum_1*log(sigi2) + (double)ne*log(sige2)) ;
 396   return -1 ;
 397 }
 398
 399
 400 __global__ void calcul_stats_snake(snake_node_gpu * d_snake, int  nnodes, int64 * d_stats_snake, double * vrais_min,
 401                                                                    t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2, int * TABLE_CODAGE, uint32 l
 402                                                                    )
 403 {
 404
 405   int id_nx, id_nprec, id_nprecprec ;
 406   int code_noeud, code_segment, pos ;
 407   __shared__ int64 s_stats_snake[3] ;
 408
 409   //init stats en shared mem
 410   s_stats_snake[0] = 0 ;
 411   s_stats_snake[1] = 0 ;
 412   s_stats_snake[2] = 0 ;
 413
 414
 415   for (id_nx = 0; id_nx < nnodes; id_nx++)
 416     {
 417           if (id_nx == 0) id_nprec = nnodes - 1;
 418           else id_nprec = id_nx - 1;
 419           if (id_nprec == 0) id_nprecprec = nnodes -1 ;
 420           else id_nprecprec = id_nprec - 1 ;
 421       /* gestion des segments partant du noeud */
 422       /* vers le noeud suivant dans l'ordre trigo */
 423       code_segment = d_snake[id_nprec].code_segment ;
 424       if (code_segment > 0)
 425                 {
 426                   /* on somme les contributions */
 427                   s_stats_snake[0] += d_snake[id_nprec].sum_1 ;
 428                   s_stats_snake[1] += d_snake[id_nprec].sum_x ;
 429                   s_stats_snake[2] += d_snake[id_nprec].sum_x2 ;
 430                 }
 431       else if (code_segment < 0)
 432                 {
 433                   /* on soustrait les contributions */
 434                   s_stats_snake[0] -= d_snake[id_nprec].sum_1 ;
 435                   s_stats_snake[1] -= d_snake[id_nprec].sum_x ;
 436                   s_stats_snake[2] -= d_snake[id_nprec].sum_x2 ;
 437                 }
 438       // else (code_segment == 0), on ne fait rien
 439       /* gestion des pixels connectant les segments */
 440       /* pixel de depart du segment actuel np --> np->noeud_suiv */
 441       /* freeman_out = np->freeman_out ; */
 442       /* freeman_in = np->noeud_prec->freeman_in ; */
 443           pos = d_snake[id_nprecprec].freeman_in*8 + d_snake[id_nprec].freeman_out ;
 444       code_noeud = TABLE_CODAGE[pos] ;
 445           pos = d_snake[id_nprec].posi*l + d_snake[id_nprec].posj ;
 446
 447       if (code_noeud > 0)
 448                 {
 449                   /* on somme les contributions */
 450                   s_stats_snake[0] += 1 + d_snake[id_nprec].posj ;
 451                   s_stats_snake[1] += cumul_x[pos] ;
 452                   s_stats_snake[2] += cumul_x2[pos] ;
 453                 }
 454       else if (code_noeud < 0)
 455                 {
 456                   /* on soustrait les contributions */
 457                   s_stats_snake[0] -= 1 + d_snake[id_nprec].posj ;
 458                   s_stats_snake[1] -= cumul_x[pos] ;
 459                   s_stats_snake[2] -= cumul_x2[pos] ;
 460                 }
 461       // else (code_pixel == 0), on ne fait rien
 462     }
 463   d_stats_snake[0] = s_stats_snake[0] ;
 464   d_stats_snake[1] = s_stats_snake[1] ;
 465   d_stats_snake[2] = s_stats_snake[2] ;
 466
 467   *vrais_min = codage_gl_gauss(s_stats_snake[0], s_stats_snake[1], s_stats_snake[2],
 468                                                            d_stats_snake[3], d_stats_snake[4], d_stats_snake[5]);
 469 }
 470
 471 // kernel d'init rectangulaire au niveau snake
 472 // un block par point de base et un point opposé par thread du bloc
 473
 474 __global__ void calcul_contribs_snake4(snake_node_gpu * snake, t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2, int h, int l,
 475                                                                            int64 * sums, t_rectangle_snake * critere)
 476 {
 477   // nb de diagonales testees par bloc (ie. par point de base NO)
 478   int blockSize = blockDim.x ;
 479   // indice du second point de chaque diagonale (=Opposite Point, = point SE)
 480   int OPib = threadIdx.x ;
 481   // coordonnees de chaque point de base (NO)
 482   int BPi = blockIdx.x ;
 483   int BPj = blockIdx.y ;
 484   //coordonnees de chaque Opposite Point (SE)
 485   double incThread = ((h-BPi)*(l-BPj) + blockSize -1)/(double)blockSize ;
 486   int OPi = (double)(OPib*incThread)/(l - BPj) ;
 487   int OPj = OPib*incThread - OPi*(l-BPj) ;
 488   OPi += BPi ;
 489   OPj += BPj ;
 490   if (OPi >= h) OPi = h-1 ;
 491   if (OPj >= l) OPj = l-1 ;
 492   //indices des pixels dans les images cumulees
 493   int posG, posD;
 494   //contrib 1 du snake
 495   int C1 = (OPi - BPi)*(OPj - BPj) ;
 496
 497   //pour stocker contribs de chaque snake d'un block
 498   //TODO on peut utiliser une structure restreinte (sans le c1)
 499   extern __shared__ tcontribs scumuls[];
 500
 501   //calcul contribs du snake
 502   scumuls[CFI(OPib)].cx = 0;
 503   scumuls[CFI(OPib)].cx2 = 0;
 504   for (int k=BPi ; k < OPi ; k++)
 505         {
 506           posG = (BPi+k)*l + BPj ;
 507           posD = posG - BPj + OPj ;
 508           scumuls[CFI(OPib)].cx  += (cumul_x[ posD ] - cumul_x[ posG ]) ;
 509           scumuls[CFI(OPib)].cx2 += (cumul_x2[ posD ] - cumul_x2[ posG ]);
 510   }
 511
 512   //calcul de critère pour chaque snake
 513   uint64 stat_sum_xe ;  /* somme des xn region exterieure */
 514   uint32 ne ;           /* nombre de pixel region exterieure */
 515   double sigi2, sige2;  /* variance region interieure et exterieure */
 516   int index_crit;
 517
 518   /* variance des valeurs des niveaux de gris a l'interieur du snake */
 519   sigi2 =
 520     ((double)scumuls[CFI(OPib)].cx2/(double)C1) -
 521     ((double)scumuls[CFI(OPib)].cx/(uint64)C1)*((double)scumuls[CFI(OPib)].cx/(uint64)C1) ;
 522
 523   /* variance des valeurs des niveaux de gris a l'exterieur du snake */
 524
 525   ne = sums[3] - C1 ;
 526   stat_sum_xe = sums[4] - scumuls[CFI(OPib)].cx ;
 527   sige2 =
 528     ((double)sums[5]-scumuls[CFI(OPib)].cx2)/(double)ne -
 529     ((double)stat_sum_xe/(uint64)ne)*((double)stat_sum_xe/(uint64)ne) ;
 530
 531   index_crit = blockSize*(BPi*gridDim.y + BPj) + OPib ;
 532
 533
 534   if ((sigi2 > 0)|(sige2 > 0))
 535         {
 536           critere[ index_crit ].bpi = BPi;
 537           critere[ index_crit ].bpj = BPj;
 538           critere[ index_crit ].opi = OPi;
 539           critere[ index_crit ].opj = OPj;
 540           critere[ index_crit ].crit = 0.5*((double)C1*log(sigi2) + (double)ne*log(sige2)) ;
 541         }
 542   else
 543         {
 544           critere[ index_crit ].bpi = BPi;
 545           critere[ index_crit ].bpj = BPj;
 546           critere[ index_crit ].opi = OPi;
 547           critere[ index_crit ].opj = OPj;
 548           critere[ index_crit ].crit = -1 ;
 549         }
 550
 551   // identification meilleur snake du bloc
 552   // laissé au CPU pour test mais le principe de ce kernel n'est pas efficace.
 553
 554 }