src/lib_gpu.cu

   1
   2 #include <stdio.h>
   3 #include <cfloat>
   4
   5 extern "C"{
   6 #include "structures.h"
   7 #include "lib_math.h"
   8 #include "defines.h"
   9 #include "lib_gpu.h"
  10 #include "lib_snake_2_gpu.h"
  11 }
  12 #include "lib_test_gpu.h"
  13 #include "lib_kernels_cumuls.cu"
  14 #include "lib_kernel_snake_2_gpu.cu"
  15
  16 #define DEBUG_IMG_CUMUL 1
  17 bool DISPLAY_ERR_IMG_CUMUL = 1;
  18 //#define DEBUG_POSITIONS
  19 //#define DEBUG_MOVE
  20 //#define DEBUG_CRST
  21 //#define DEBUG_MV
  22 //#define DEBUG_SOMSOM
  23 //#define DEBUG_SOMBLOCS
  24 //#define DEBUG_LISTES
  25 //#define DEBUG_STATS_REF
  26
  27 double critere_gauss_cpu( int n, int ni, uint64 sxi, uint64 sxi2, int64 *stats_img){
  28   int ne = n - ni ;
  29   uint64 sxe = stats_img[4] - sxi ;
  30   uint64 sxe2= stats_img[5] - sxi2;
  31   double sigi2, sige2, critere ;
  32
  33   /* variance des valeurs des niveaux de gris a l'interieur */
  34   sigi2 =
  35         ((double)sxi2/ni) -
  36         ((double)sxi/ni)*((double)sxi/ni) ;
  37
  38   /* variance des valeurs des niveaux de gris a l'exterieur */
  39   sige2 =
  40         ((double)sxe2)/ne -
  41         ((double)sxe/ne)*((double)sxe/ne) ;
  42
  43   if ( (sigi2 > 0) && (sige2 > 0) )
  44         critere =  0.5*((double)ni*log(sigi2) + (double)ne*log(sige2)) ;
  45   else critere = DBL_MAX ;
  46   return critere;
  47 }
  48
  49 void cuda_init_img_cumul(unsigned short ** img_in, int H, int L, int nb_nodes,
  50                                                  unsigned short ** d_img, t_cumul_x ** d_img_x, t_cumul_x2 ** d_img_x2,
  51                                                  int ** d_freemanDiDj, int ** d_codeNoeud,
  52                                                  snake_node_gpu ** d_snake, uint32 ** d_nb_pix_max,
  53                                                  uint4 ** d_positions, uint64 ** d_contribs_segments, uint4 ** d_freemans_centres,
  54                                                  int ** d_codes_segments, int64 ** d_stats_snake,
  55                                                  int64 ** d_stats, int64 ** d_stats_ref, double ** d_vrais, double ** d_vrais_snake,
  56                                                  uint2 ** d_liste_pixels, uint64 ** d_contribs_segments_blocs,
  57                                                  bool ** d_move
  58                                                  )
  59 {
  60   unsigned int taille = H*L;
  61   timeval chrono;
  62
  63
  64   //allocation cumuls en memoire GPU
  65   tic(&chrono, NULL);
  66   /*
  67         MAX_PIX 20000
  68         MAX_NODES 10000
  69         MAX_LISTE_PIX 10000000
  70    */
  71   cudaMalloc( (void**) d_snake, MAX_NODES*sizeof(snake_node_gpu) );
  72
  73   cudaMalloc( (void**) d_img, taille*sizeof(unsigned short) );
  74   cudaMalloc( (void**) d_img_x, taille*sizeof(t_cumul_x) );
  75   cudaMalloc( (void**) d_img_x2, taille*sizeof(t_cumul_x2) );
  76
  77   cudaMalloc( (void**) d_freemanDiDj, 9*sizeof(int) );
  78   cudaMalloc( (void**) d_codeNoeud, 64*sizeof(int) );
  79
  80   cudaMalloc( (void**) d_stats_snake, 6*sizeof(int64)) ;
  81   cudaMalloc( (void**) d_positions, 8*MAX_NODES*sizeof(uint4)) ;
  82   cudaMalloc( (void**) d_contribs_segments, 3*16*MAX_NODES*sizeof(uint64)) ;
  83   cudaMalloc( (void**) d_contribs_segments_blocs, (3*MAX_LISTE_PIX/32)*sizeof(uint64)) ;
  84   cudaMalloc( (void**) d_freemans_centres, 16*MAX_NODES*sizeof(uint4)) ;
  85   cudaMalloc( (void**) d_codes_segments, 16*MAX_NODES*sizeof(int)) ;
  86   cudaMalloc( (void**) d_stats, 3*8*MAX_NODES*sizeof(int64)) ;
  87   cudaMalloc( (void**) d_stats_ref, 3*MAX_NODES*sizeof(int64)) ;
  88   cudaMalloc( (void**) d_vrais, 8*MAX_NODES*sizeof(double)) ;
  89   cudaMalloc( (void**) d_move, MAX_NODES*sizeof(bool)) ;
  90   cudaMalloc( (void**) d_nb_pix_max, sizeof(uint32)) ;
  91   cudaMalloc( (void**) d_vrais_snake, sizeof(double)) ;
  92
  93   cudaMalloc( (void**) d_liste_pixels, 16*5*(MAX_NODES)*sizeof(uint2) );
  94
  95   printf("TOTAL MEM = %ld octets\n",
  96                  (2*MAX_NODES*(sizeof(snake_node_gpu)+(8+16)*sizeof(uint4)+3*16*8+16*4+24*8+3*8+8*sizeof(double)+sizeof(bool))
  97                  +(MAX_LISTE_PIX)*(sizeof(uint2)+1)
  98                  +taille*(8+sizeof(t_cumul_x)+sizeof(t_cumul_x2))
  99                   +9*4+64*4+6*8+4+sizeof(double)) );
 100
 101   int64 * h_stats_snake = new int64[6];
 102
 103   toc(chrono, "temps alloc mem GPU");
 104
 105   /*detection-choix-initialisation de la carte GPU*/
 106   tic(&chrono, NULL) ;
 107   cudaDeviceProp deviceProp;
 108   deviceProp.major = 2;
 109   deviceProp.minor = 0;
 110   int dev;
 111   cudaChooseDevice(&dev, &deviceProp);
 112   cudaGetDeviceProperties(&deviceProp, dev);
 113   if(deviceProp.major >= 2 )
 114         {
 115           printf("Using Device %d: \"%s\"\n", dev, deviceProp.name);
 116           cudaSetDevice(dev);
 117         }
 118   toc(chrono, "temps acces GPU") ;
 119
 120   //copie tables correspondances freeman en mem GPU
 121   tic(&chrono, NULL) ;
 122   cudaMemcpy( *d_freemanDiDj, CORRESPONDANCE_Di_Dj_FREEMAN , 9*sizeof(int), cudaMemcpyHostToDevice);
 123   cudaMemcpy( *d_codeNoeud, TABLE_CODAGE , 64*sizeof(unsigned int), cudaMemcpyHostToDevice);
 124   toc(chrono, "temps transfert tables de codage") ;
 125
 126   /*transfert image en global mem GPU*/
 127   tic(&chrono, NULL);
 128   cudaMemcpy( *d_img, img_in[0], taille*sizeof(unsigned short), cudaMemcpyHostToDevice);
 129   toc(chrono, "transfert image vers GPU");
 130
 131   //calculs images cumulees sur GPU
 132   int blocs_max = 65536 ;
 133   int bs = 256 ; //arbitraire, d'apres les observations c'est souvent l'optimu
 134   unsigned int base = 0 ;
 135   unsigned int bl_l  = (L+bs-1)/bs ;
 136   unsigned int nb_lines =  blocs_max / bl_l ;
 137   unsigned int lines ;
 138   unsigned int tranches = ( 1 + H / nb_lines ) ;
 139   nb_lines = (H +tranches -1)/ tranches ; // equilibre la taille des tranches
 140
 141   dim3 threads(bs,1,1);
 142   int smem = nextPow2(bl_l)*2; //smem pour le prefixscan des sommes de blocs (etape 2)
 143   smem += smem >> DEC;
 144   smem += smem >> DEC;
 145   int smem_size = smem*sizeof(uint64);
 146   uint64 * d_somblocs ; // sommes des cumuls par bloc de calcul
 147
 148
 149   if(DEBUG_IMG_CUMUL)
 150         {
 151           printf("--- CALCULS IMAGES CUMULEES+STATS GPU  ----\n");
 152           printf("\t%d threads par bloc  -- %u blocs par ligne -- %u tranches -- %u lignes par tranche \n",bs, bl_l, tranches,nb_lines);
 153           printf(" Smem totale pour cumuls : %d\n", CFI(bs)*(sizeof(t_cumul_x)+sizeof(t_cumul_x2)) );
 154           tic(&chrono, NULL);
 155         }
 156   //calculs cumuls generiques : necessitent 3 etapes / 3 kernels
 157   cudaMalloc( (void**) &d_somblocs, 2*bl_l*nb_lines*sizeof(uint64) );
 158   cudaFuncSetCacheConfig(calcul_cumuls_gpu, cudaFuncCachePreferShared);
 159   do
 160         {
 161           if  ( H-base < nb_lines ) lines = H - base ; else lines = nb_lines ;
 162           printf("base = ligne %d -- traitement de %d lignes \n", base, lines) ;
 163           dim3 grid(bl_l*lines,1,1) ;
 164           calcul_cumuls_gpu<<<grid, threads, CFI(bs)*sizeof(tcumuls)>>>(*d_img, *d_img_x, *d_img_x2, H, L, d_somblocs, bl_l, base, lines) ;
 165           scan_somblocs<<<2*lines, nextPow2(bl_l)/2, smem_size>>>(d_somblocs, bl_l) ;
 166           add_soms_to_cumuls<<<grid, threads>>>(*d_img_x, *d_img_x2, H, L, d_somblocs, bl_l, base, lines) ;
 167           base += lines ;
 168         }
 169   while (base < H) ;
 170   cudaFree(d_somblocs) ;
 171
 172   //calcul des sommes totales N, sigX et sigX2 sur l'image
 173   calcul_stats_image<<<1, 1>>>( *d_img_x, *d_img_x2, H, L, (uint64*)*d_stats_snake);
 174
 175   cudaThreadSynchronize()   ;
 176   toc(chrono, "\tTemps GPU");
 177   //allocation memoire CPU
 178           t_cumul_x  * img_x = new t_cumul_x [H*L];
 179           t_cumul_x2 *  img_x2 = new t_cumul_x2 [H*L];
 180           uint64 sigX = 0, sigX2 = 0 ;
 181   if(DEBUG_IMG_CUMUL)
 182         {
 183
 184           /*pour test comparaison*/
 185           t_cumul_x * img_xb = new t_cumul_x [H*L];
 186           t_cumul_x2 * img_x2b = new t_cumul_x2 [H*L];
 187
 188           cudaMemcpy( img_xb, *d_img_x, taille*sizeof(t_cumul_x), cudaMemcpyDeviceToHost);
 189           cudaMemcpy( img_x2b, *d_img_x2, taille*sizeof(t_cumul_x2), cudaMemcpyDeviceToHost);
 190
 191           //cumuls : etape 1 CPU
 192           /*
 193                 for (int i=0; i<H; i++)
 194                 {
 195                         for (int b=0; b<bl_l; b++)
 196                         {
 197                                 int offset = b*bs ;
 198                                 img_x[i*L+offset] = img_in[i][offset] ;
 199                                 img_x2[i*L+offset]= img_in[i][offset]*img_in[i][offset] ;
 200                                 for (int p=1; p<bs; p++)
 201                                 {
 202                                         int j = p+offset ;
 203                                         if (j<L)
 204                                         {
 205                                                 img_x[i*L+j] = img_x[i*L+j-1] + img_in[i][j];
 206                                                 img_x2[i*L+j] = img_x2[i*L+j-1] + img_in[i][j]*img_in[i][j] ;
 207                                         }
 208                                 }
 209                         }
 210                 }
 211           */
 212           //cumuls complets CPU
 213
 214           for (int i=0; i<H; i++)
 215                 {
 216                   img_x[i*L+0] = img_in[i][0] ;
 217                   img_x2[i*L+0]= img_in[i][0]*img_in[i][0] ;
 218                   for (int j=1; j<L; j++)
 219                         {
 220                           img_x[i*L+j]  = img_x[i*L+j-1]  + img_in[i][j] ;
 221                           img_x2[i*L+j] = img_x2[i*L+j-1] + img_in[i][j]*img_in[i][j] ;
 222                         }
 223                 }
 224
 225           int cpt = 0;
 226           int cpt_errx=0, cpt_errx2 = 0;
 227           for (int i=0; i< H; i++){
 228                 for (int j=0; j< L; j++){
 229                   if ( (img_x[i*L+j] !=  img_xb[i*L+j]) ) cpt_errx++ ;
 230                   if ( (img_x2[i*L+j] !=  img_x2b[i*L+j]) ) cpt_errx2++ ;
 231                   if ( (img_x[i*L+j] !=  img_xb[i*L+j]) || (img_x2[i*L+j] !=  img_x2b[i*L+j]))
 232                   {
 233                         //printf("(%d,%d)sxCPU=%lu  sxGPU=%lu -- sx2CPU=%lu  sx2GPU=%lu\n",i,j,img_x[i*L+j], img_xb[i*L+j], img_x2[i*L+j], img_x2b[i*L+j]);
 234                   }
 235                   cpt++;
 236                 }
 237           }
 238           printf("%d erreurs sur CX / %d points\n", cpt_errx, cpt );
 239           printf("%d erreurs sur CX2 / %d points\n", cpt_errx2, cpt );
 240
 241           for (int i=0; i<H; i++)
 242                 {
 243                   sigX += img_x[i*L+L-1] ;
 244                   sigX2+= img_x2[i*L+L-1];
 245                 }
 246           printf("STATS IMAGE  N = %d - sigX = %lu - sigX2 = %lu\n",  H*L, sigX, sigX2 );
 247         }
 248
 249   /*
 250    * generation snake en mem GPU
 251    */
 252   int dist = 140 ;
 253
 254   /* Test de determination de la fenetre verticale  optimale*/
 255   int div = 256;//nb de divisions de l'image : cela définit le pas. La valeur max découle du nb max de threads possible ds une grille
 256   tcontribs *d_contribs_part ; // pour les contribs partielles par bloc
 257   tcontribs *d_contribs_cols, *h_contribs_cols, *h_contribs_cols_cpu ; // pour les contrib des colonnes
 258   double2 *d_miniblocs, *h_miniblocs ;       // pour les minima de chaque colonne haute
 259
 260   //parametres execution
 261   bs = div ;
 262   int bpc= (H + bs -1)/bs ;
 263   dim3 grid = dim3(div, bpc, 1);
 264   smem = CFI(bs)*sizeof(tcontribs) ;
 265
 266   //allocations
 267   cudaMalloc((void**) &d_contribs_part, div*bpc*sizeof(tcontribs));
 268   cudaMalloc((void**) &d_contribs_cols, div*sizeof(tcontribs)) ;
 269   cudaMalloc((void**) &d_miniblocs, div*sizeof(double2)) ;
 270   h_contribs_cols = new tcontribs[div] ;
 271   h_contribs_cols_cpu = new tcontribs[div] ;
 272   h_miniblocs = new double2[div] ;
 273
 274   int pas = L / div ;
 275
 276   tic(&chrono, NULL);
 277
 278   //execution kernels
 279   calcul_contribs_cols<<<grid, bs, smem>>>( *d_img_x, *d_img_x2, H, L, d_contribs_part );
 280   somsom_contribs<<<div,1>>>( d_contribs_part, bpc, d_contribs_cols ) ;
 281   calcul_contribs_permutations<<<div,div, CFI(div)*sizeof(double2)>>>(d_contribs_cols, bpc, H, L, (uint64*) *d_stats_snake, d_miniblocs) ;
 282   cudaMemcpy( h_miniblocs, d_miniblocs, div*sizeof(double2), cudaMemcpyDeviceToHost ) ;
 283
 284   //verif minimum des blocs
 285   double crit_mini = h_miniblocs[0].x;
 286   int j1=0, id_mini=0;
 287   for (j1=1 ; j1 < div ; j1++){
 288         if (h_miniblocs[j1].x < crit_mini) {
 289           crit_mini = h_miniblocs[j1].x ;
 290           id_mini = j1 ;
 291         }
 292   }
 293   toc(chrono, "\nCALCUL RECTANGLE");
 294   j1 = pas * id_mini ;
 295   int j2 = (int)(pas*h_miniblocs[ id_mini ].y) ;
 296   printf("pas = %d cols -- critere mini =%f , positions j1=%d j2=%d\n", pas, h_miniblocs[ id_mini ].x, j1, j2);
 297
 298
 299   // transfert datas GPU -> CPU
 300   cudaMemcpy( h_contribs_cols, d_contribs_cols, div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;
 301
 302   //verif contribs colonnes
 303   for (int c=0 ; c < div ; c++){
 304         // calcul valeurs de ref en CPU
 305         h_contribs_cols_cpu[c].cx = 0 ;
 306         h_contribs_cols_cpu[c].cx2= 0 ;
 307         for (int ip=0; ip < H; ip++){
 308           h_contribs_cols_cpu[c].cx  += img_x[ ip*L + c*pas] ;
 309           h_contribs_cols_cpu[c].cx2 += img_x2[ ip*L + c*pas] ;
 310         }
 311         //comparaison avec valeurs GPU
 312         if ( (h_contribs_cols_cpu[c].cx != h_contribs_cols[c].cx) || (h_contribs_cols_cpu[c].cx2 != h_contribs_cols[c].cx2) )
 313           printf("ERR colonne %d -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n",
 314                          c*pas, h_contribs_cols_cpu[c].cx, h_contribs_cols_cpu[c].cx2, h_contribs_cols[c].cx, h_contribs_cols[c].cx2 );
 315   }
 316   cudaFree(d_contribs_part) ;
 317   cudaFree(d_contribs_cols) ;
 318   cudaFree(d_miniblocs) ;
 319   free(h_contribs_cols);
 320   free(h_contribs_cols_cpu);
 321   free(h_miniblocs) ;
 322
 323   //realloc pour lignes horizontales
 324   bs = 128 ;
 325
 326   div = (H+bs-1)/bs ;
 327   printf("DIV = %d\n", div ) ;
 328
 329   int divpow2 = nextPow2(div) ;
 330   printf("DIVPOW2 = %d\n", divpow2) ;
 331
 332   grid = dim3(div, 1, 1) ;
 333   smem = CFI(bs)*sizeof(tcontribs) ;
 334   cudaMalloc((void**) &d_contribs_part, div*sizeof(tcontribs)) ;
 335   cudaMalloc((void**) &d_contribs_cols, div*div*sizeof(tcontribs)) ;
 336   cudaMalloc((void**) &d_miniblocs, div*sizeof(double2)) ;
 337   h_contribs_cols = new tcontribs[div*div] ;
 338   tcontribs * h_contribs_part = new tcontribs[div] ;
 339   h_miniblocs = new double2[div] ;
 340
 341   tic(&chrono, NULL);
 342   // Appels kernels optim lignes horizontales
 343   calcul_contrib_conjuguee_colonnes<<<grid, bs, CFI(bs)*sizeof(tcontribs) >>>( *d_img_x, *d_img_x2, H, L, j1, j2, d_contribs_part) ;
 344
 345   /*verif CPU
 346   int cpt = 0 ;
 347   int cpterr = 0 ;
 348   tcontribs * h_contribs_part_cpu = new tcontribs[div] ;
 349   cudaMemcpy( h_contribs_part, d_contribs_part, div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;
 350   for (int bloc=0; bloc < div; bloc++){
 351         h_contribs_part_cpu[ bloc ].cx = 0 ;
 352         h_contribs_part_cpu[ bloc ].cx2 = 0 ;
 353           for (int line=0; ((line < bs)&&(bloc*bs+line < H)); line++){
 354                 h_contribs_part_cpu[bloc].cx += img_x[ (bloc*bs+line)*L + j2] - img_x[ (bloc*bs+line)*L + j1 ];
 355                 h_contribs_part_cpu[bloc].cx2 += img_x2[ (bloc*bs+line)*L + j2] - img_x2[ (bloc*bs+line)*L + j1 ];
 356           }
 357           if ( ( h_contribs_part_cpu[bloc].cx != h_contribs_part[bloc].cx ) || ( h_contribs_part_cpu[bloc].cx2 != h_contribs_part[bloc].cx2 ) )
 358                 {
 359                   printf("ERREUR bloc %d -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n", bloc,
 360                                  h_contribs_part_cpu[bloc].cx, h_contribs_part_cpu[bloc].cx2, h_contribs_part[bloc].cx, h_contribs_part[bloc].cx2 ) ;
 361                   cpterr++;
 362                 }
 363           cpt++ ;
 364   }
 365   printf("VERIF CONTRIB CONJUGUEES BLOCS -->  %d ERREURS / %d BLOCS\n", cpterr, cpt) ;
 366   fin verif*/
 367
 368   grid = dim3(div, div, 1) ;
 369   calcul_contribs_snake_rectangle<<<grid,divpow2, CFI(divpow2)*sizeof(tcontribs) >>>(d_contribs_part, d_contribs_cols) ;
 370
 371   /* verif CPU
 372   h_contribs_cols_cpu = new tcontribs[div*div] ;
 373   cudaMemcpy( h_contribs_cols, d_contribs_cols, div*div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;
 374   cpt = 0 ;
 375   cpterr = 0 ;
 376   for (int i1=0; i1 < div ; i1++){
 377         for (int i2=0 ; i2 < div ; i2++){
 378           if (i2 >= i1){
 379                 h_contribs_cols_cpu[ i1*div +i2 ].cx = 0 ;
 380                 h_contribs_cols_cpu[ i1*div +i2 ].cx2= 0 ;
 381                 for (int d=i1 ; d <= i2 ; d++){
 382                   h_contribs_cols_cpu[ i1*div +i2 ].cx += h_contribs_part_cpu[ d ].cx ;
 383                   h_contribs_cols_cpu[ i1*div +i2 ].cx2+= h_contribs_part_cpu[ d ].cx2 ;
 384                 }
 385           } else {
 386                 h_contribs_cols_cpu[ i1*div +i2 ].cx = 0 ;
 387                 h_contribs_cols_cpu[ i1*div +i2 ].cx2= 0 ;
 388           }
 389
 390           if (( ( h_contribs_cols_cpu[ i1*div +i2 ].cx != h_contribs_cols[ i1*div +i2 ].cx ) || ( h_contribs_cols_cpu[ i1*div +i2].cx2 != h_contribs_cols[ i1*div +i2].cx2 ) )
 391                 && (i2 >= i1))
 392                 {
 393                   printf("ERREUR combinaison (%d, %d) -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n", i1, i2,
 394                                  h_contribs_cols_cpu[ i1*div +i2].cx, h_contribs_cols_cpu[ i1*div +i2 ].cx2, h_contribs_cols[ i1*div +i2 ].cx, h_contribs_cols[ i1*div +i2 ].cx2 ) ;
 395                   cpterr++;
 396                 }
 397           cpt++ ;
 398         }
 399   }
 400   printf("VERIF COMBINAISONS LIGNES -->  %d ERREURS / %d COMBINAISONS\n", cpterr, cpt) ;
 401   fin verif */
 402
 403
 404   grid = dim3(div, 1, 1) ;
 405   calcul_critere_permutations_verticales<<< grid, divpow2, CFI(divpow2)*sizeof(double2) >>>(d_contribs_cols, bs, j1, j2, H, L, sigX, sigX2, d_miniblocs) ;
 406
 407   /* verif CPU
 408   cpt = 0 ;
 409   cpterr = 0 ;
 410   double2 * h_miniblocs_cpu = new double2[ div ] ;
 411   cudaMemcpy( h_miniblocs, d_miniblocs, div*sizeof(double2), cudaMemcpyDeviceToHost) ;
 412   cudaMemcpy( h_stats_snake, *d_stats_snake, 6*sizeof(int64), cudaMemcpyDeviceToHost) ;
 413   for (int lb=0 ; lb < div ; lb++){
 414         h_miniblocs_cpu[lb].x = DBL_MAX ;
 415         for (int lh=lb ; lh < div ; lh++){
 416           if ( critere_gauss_cpu(H*L, (lh-lb+1)*bs*(j2 - j1), h_contribs_cols_cpu[ lb*div +lh ].cx, h_contribs_cols_cpu[ lb*div + lh ].cx2, h_stats_snake ) < h_miniblocs_cpu[lb].x )
 417                 {
 418                   h_miniblocs_cpu[lb].x = critere_gauss_cpu(H*L, (lh-lb+1)*bs*(j2 - j1), h_contribs_cols_cpu[ lb*div +lh ].cx, h_contribs_cols_cpu[ lb*div + lh ].cx2, h_stats_snake) ;
 419                   h_miniblocs_cpu[lb].y = (double)lh ;
 420                 }
 421         }
 422         if ( ( h_miniblocs_cpu[lb].x > 1.000001*h_miniblocs[lb].x ) || ( h_miniblocs_cpu[lb].x < 0.999999*h_miniblocs[lb].x ) || ( h_miniblocs_cpu[lb].y != h_miniblocs[lb].y ) )
 423           {
 424                  printf("ERREUR MINIMUM BLOC LIGNE %d -> CPU=%lf en i2=%d | GPU=%lf en i2=%d\n", lb,
 425                                 h_miniblocs_cpu[ lb ].x, (int)h_miniblocs_cpu[ lb ].y, h_miniblocs[ lb ].x, (int)h_miniblocs[ lb ].y ) ;
 426                   cpterr++;
 427           }
 428         cpt++ ;
 429   }
 430   printf("VERIF MINIMA PAR BLOC -->  %d ERREURS / %d BLOCS\n", cpterr, cpt) ;
 431   */
 432   /* fin verif */
 433
 434   /*
 435    * determination  minimum absolu
 436    * a conserver sur CPU
 437    */
 438   cudaMemcpy( h_miniblocs, d_miniblocs, div*sizeof(double2), cudaMemcpyDeviceToHost) ;
 439   crit_mini = h_miniblocs[0].x ;
 440   int i1=0  ;
 441   id_mini=0 ;
 442   for (i1=1 ; i1 < div ; i1++){
 443         if (h_miniblocs[i1].x < crit_mini) {
 444           crit_mini = h_miniblocs[i1].x ;
 445           id_mini = i1 ;
 446         }
 447   }
 448
 449   i1 = bs * id_mini ;
 450   int i2 = (int)(bs*h_miniblocs[ id_mini ].y) ;
 451
 452   toc(chrono, "CALCUL RECTANGLE");
 453
 454   printf("pas = %d lignes -- critere mini =%f , positions i1=%d i2=%d\n", bs, h_miniblocs[ id_mini ].x, i1, i2);
 455   /*fin test snake rectangle initial optimal*/
 456   cudaFree(d_contribs_part) ;
 457   cudaFree(d_contribs_cols) ;
 458   cudaFree(d_miniblocs) ;
 459
 460   tic(&chrono, NULL);
 461   //genere_snake_rectangle_4nodes_gpu<<< 1, 1>>>(*d_snake, 140, H, L) ;
 462   //genere_snake_bande_gpu<<<1,1>>>(*d_snake, pas*id_mini, (int)(pas*h_miniblocs[ id_mini ].y), H);
 463   genere_snake_rectangle<<<1,1>>>(*d_snake, i1, i2, j1, j2);
 464
 465
 466   int nnodes = nb_nodes ;
 467   snake_node_gpu * h_snake = new snake_node_gpu[nnodes];
 468   snake_node * h_snake_ll = new snake_node[nnodes] ;
 469   uint4 * h_liste_positions = new uint4[nnodes*8];
 470   double * h_vrais_snake = new double ;
 471   //init les stats du snake
 472   uint2 * d_liste_temp  ;
 473   t_sum_x2 * d_sompart  ;
 474   int tpb, bps, npixmax ;
 475
 476   //calcul nb threads par bloc
 477   npixmax = 2*(H+L-4*dist)/(nnodes-1) ;
 478   tpb = nextPow2(npixmax) ;
 479   if (tpb >= 256) tpb = 256 ;//  /!\ le kernel <<< calcul_contrib...>>> ne supporte pas un bs>256 a cause de la shared-mem nécessaire
 480   if (tpb < 32 ) tpb = 32 ;
 481   tpb=128 ;
 482   bps = (npixmax+tpb-1)/tpb ;
 483   printf("PARAMS EXEC INIT : %d pix max, %d threads/bloc, %d blocs/seg, %d blocs/grille\n", npixmax, tpb, bps, nnodes*bps);
 484
 485   //alloc
 486   cudaMalloc((void**) &d_liste_temp, nnodes*bps*tpb*sizeof(uint2));
 487   cudaMalloc((void**) &d_sompart, 3*nnodes*bps*sizeof(t_sum_x2));
 488   cudaMalloc((void**) &d_stats_ref, 3*nnodes*sizeof(int64));
 489
 490   //DEBUG : pour forcer la mise à zero du tableau intermediaire d_stats_ref
 491   /*
 492   int64 h_stats_ref[3*nnodes] ;
 493   for (int a=0; a<3*nnodes ; a++) h_stats_ref[a] = 0 ;
 494   cudaMemcpy( h_stats_ref, d_stats_ref, sizeof(int64), cudaMemcpyHostToDevice) ;
 495   */
 496   //fin forçage a 0
 497
 498   //DEBUG : pour forcer la mise à zero du tableau intermediaire d_sompart
 499   /*
 500      t_sum_x2 h_sompart[ 3*nnodes*bps ] ;
 501      for (int a=0; a<3*nnodes*bps ; a++) h_sompart[a] = 0 ;
 502      cudaMemcpy( h_sompart, d_sompart, sizeof(t_sum_x2), cudaMemcpyHostToDevice) ;
 503   */
 504   //fin forçage a 0
 505
 506   calcul_contribs_segments_snake<<< nnodes*bps, tpb, (CFI(tpb))*(3*sizeof(t_sum_x2))>>>
 507         (*d_snake, nnodes,
 508          *d_img_x, *d_img_x2,
 509          L, d_liste_temp, d_sompart, *d_freemanDiDj );
 510
 511   //TODO
 512   //parametrer pour ne pas appeler qd tpb=1
 513   //oblige a modifier le kernel <<< calcul_contrib...>>> pour ecrire directement ds d_snake
 514   // au lieu de d_sompart
 515   somsom_snake<<< nnodes , 1 >>>(d_sompart, nnodes, bps, *d_snake);
 516
 517
 518   calcul_stats_snake<<< 1 , 1 >>>(*d_snake, nnodes, *d_stats_snake, *d_vrais_snake,
 519                                                                   *d_img_x, *d_img_x2,
 520                                                                   *d_codeNoeud, L
 521                                                                   );
 522   cudaThreadSynchronize() ;
 523   toc(chrono, "\tTemps") ;
 524
 525   /*
 526         verif stats initiales du snake
 527   */
 528   cudaMemcpy( h_vrais_snake, *d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost) ;
 529   cudaMemcpy( h_stats_snake, *d_stats_snake, 6*sizeof(int64), cudaMemcpyDeviceToHost) ;
 530
 531   printf("STATS SNAKE log vrais=%lf : c1=%lu - cx=%lu - cx2=%lu - N=%lu - SUMX=%lu - SUMX2=%lu\n",
 532                  *h_vrais_snake,
 533                  h_stats_snake[0],  h_stats_snake[1],  h_stats_snake[2],
 534                  h_stats_snake[3],  h_stats_snake[4],  h_stats_snake[5] );
 535
 536   /*
 537         verif stats diminuees des contribs des 2 segments associes a chaque noeud
 538   */
 539 #ifdef DEBUG_STATS_REF
 540   cudaMemcpy( h_stats_ref, d_stats_ref, 3*nnodes*sizeof(int64), cudaMemcpyDeviceToHost) ;
 541   cudaMemcpy( h_snake, *d_snake, nnodes*sizeof(snake_node_gpu), cudaMemcpyDeviceToHost) ;
 542
 543
 544   printf("******* STATS DIMINUEES\n");
 545   for(int n=0; n<nnodes;n++)
 546         {
 547           int i = h_snake[n].posi, j = h_snake[n].posj ;
 548           printf("node %d (%d,%d) : %ld - %ld - %ld - img1= %lu - imgx= %lu - imgx2= %lu \n", n, i, j,
 549                          h_stats_ref[3*n], h_stats_ref[3*n +1], h_stats_ref[3*n +2],
 550                          img_1[i][j], img_x[i][j], img_x2[i][j]);
 551         }
 552 #endif //DEBUG_STATS_REF
 553
 554   //snake2gpu(d_snake, snake, nb_nodes);
 555   //gpu2snake(*d_snake, &h_snake_ll, nnodes);
 556
 557
 558 #ifdef DEBUG_POSITIONS
 559   for (int n=0; n<nnodes; n++)
 560         {
 561           printf("Node %d :\n", n);
 562           for (int pos=0; pos<8; pos++)
 563                 {
 564                   printf("(%d , %d):%d:%d | ", h_liste_positions[8*n + pos].x, h_liste_positions[8*n + pos].y,
 565                                  h_liste_positions[8*n + pos].z, h_liste_positions[8*n + pos].w);
 566                 }
 567           printf("\n");
 568         }
 569 #endif //DEBUG_POSITIONS
 570
 571 //verif liste pixels noeuds pairs/impairs selon
 572
 573 #ifdef DEBUG_LISTES
 574   printf("NOMBRE PIXELS pour LISTE = %d\n", *h_nb_pix_max) ;
 575   printf("bs = %d - grid = %d - nblocs_seg = %d - npix_max = %d - taille mem = %d\n",
 576                  bs, grid.x, nblocs_seg, *h_nb_pix_max, taille_mem);
 577
 578   cudaMemcpy( h_liste_pix, d_liste_pix, taille_mem*sizeof(uint2), cudaMemcpyDeviceToHost ) ;
 579   cudaMemcpy( h_snake, *d_snake, nnodes*sizeof(snake_node_gpu), cudaMemcpyDeviceToHost );
 580   uint32 * h_liste_pixels_segment = new uint32[2*(*h_nb_pix_max)] ;
 581   int idx_n, idx_nprec, idx_nsuiv ;
 582
 583   printf("********* LISTE PIX  ***********\n");
 584   printf("bs = %d - grid = %d - nblocs_seg = %d - npix_max = %d - taille mem = %d\n",
 585                  bs, grid.x, nblocs_seg, *h_nb_pix_max, taille_mem);
 586
 587   for (int n=0; n<(nnodes/2 + (nnodes%2)*pairs); n++)
 588         {
 589           idx_n = 2*n + !pairs ;
 590           if (idx_n == 0) idx_nprec = nnodes - 1;
 591           else idx_nprec = idx_n - 1;
 592           if (idx_n == nnodes-1) idx_nsuiv = 0;
 593           else idx_nsuiv = idx_n + 1 ;
 594
 595           for (int pos=0; pos < 8 ; pos++) //test des segments avant le noeud
 596                 {
 597
 598                   int nb_pix = calcul_liste_pixel_segment(h_snake[idx_nprec].posi,h_snake[idx_nprec].posj,
 599                                                                                                   h_liste_positions[8*idx_n+pos].x, h_liste_positions[8*idx_n+pos].y,
 600                                                                                                   h_liste_pixels_segment, 0);
 601
 602                   for (int pix=0; pix < nb_pix; pix++)
 603                         {
 604
 605                           if ( (h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].x != h_liste_pixels_segment[2*pix] )
 606                                    || ( h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].y != h_liste_pixels_segment[2*pix+1] ) )
 607                                 printf("erreur avant n=%d pix %d/%d segment %d noeuds[ %d-%d-%d ] , CPU (%d,%d) - GPU (%d, %d)\n", n, pix, nb_pix, pos,
 608                                            idx_nprec, idx_n, idx_nsuiv,
 609                                            h_liste_pixels_segment[2*pix], h_liste_pixels_segment[2*pix+1],
 610                                            h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].x,  h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].y);
 611
 612                         }
 613
 614                 }
 615           for (int pos=0; pos < 8 ; pos++) //test des segments apres le noeud
 616                 {
 617
 618                   int nb_pix = calcul_liste_pixel_segment(h_liste_positions[8*idx_n+pos].x, h_liste_positions[8*idx_n+pos].y,
 619                                                                                                   h_snake[idx_nsuiv].posi,h_snake[idx_nsuiv].posj,
 620                                                                                                   h_liste_pixels_segment, 0);
 621
 622                   for (int pix=0; pix < nb_pix; pix++)
 623                         {
 624
 625                           if ( (h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].x != h_liste_pixels_segment[2*pix] )
 626                              || ( h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].y != h_liste_pixels_segment[2*pix+1] ) )
 627                                 printf("erreur apres n=%d pix %d/%d segment %d noeuds[ %d-%d-%d ] , CPU (%d,%d) - GPU (%d, %d)\n", n, pix, nb_pix, pos+8,
 628                                            idx_nprec, idx_n, idx_nsuiv,
 629                                            h_liste_pixels_segment[2*pix], h_liste_pixels_segment[2*pix+1],
 630                                            h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].x,  h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].y);
 631
 632                         }
 633
 634                 }
 635
 636                 }
 637
 638 #endif //DEBUG_LISTES
 639
 640   /*
 641
 642         Test du calcul des sommes partielles 'somblocs' faites par le kernel 'calcul_contribs_segments_blocs_full'
 643
 644    */
 645
 646 #ifdef DEBUG_SOMBLOCS
 647   printf("********* SOMMES PARTIELLES  ***********\n");
 648   printf("bs = %d - grid = %d -  intervalles = %d - nblocs_seg = %d - pairs = %d \n", bs, grid.x, n_interval, nblocs_seg, pairs);
 649   for (int n=0; n< n_interval; n++)
 650         {
 651           idx_n = 2*n + !pairs ;
 652           if (idx_n == 0) idx_nprec = nnodes - 1 ;
 653           else idx_nprec = idx_n - 1 ;
 654           if (idx_n == nnodes-1) idx_nsuiv = 0 ;
 655           else idx_nsuiv = idx_n + 1 ;
 656           printf("******** node %d\n", idx_n) ;
 657           for(int s=0; s<8; s++)
 658                 {
 659                   int nb_pix = calcul_liste_pixel_segment(h_snake[idx_nprec].posi,h_snake[idx_nprec].posj,
 660                                                                                                   h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
 661                                                                                                   h_liste_pixels_segment, 0);
 662                   for (int b=0; b<nblocs_seg; b++)
 663                         {
 664                           uint64 c1=0, cx=0, cx2=0 ;
 665                           int i,j;
 666                           for (int p=0; p<bs; p++)
 667                                 {
 668                                   if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
 669                                         {
 670                                           //  /!\ penser a oter le test de prise en
 671                                           // compte pour les pix sur la même ligne dans
 672                                           // le kernel, sinon les comparaisons des
 673                                           // sommes par colonne seront fausses
 674                                           i = h_liste_pixels_segment[2*(b*bs + p)] ;
 675                                           j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
 676                                           c1 += img_1[i][j] ;
 677                                           cx += img_x[i][j] ;
 678                                           cx2+= img_x2[i][j];
 679                                         }
 680                                 }
 681                           if ( ( c1 != h_sombloc[(16*n + s)*nblocs_seg + b ] ) || ( cx != h_sombloc[(16*n + s)*nblocs_seg + b + grid.x] )
 682                                    ||  ( cx2 != h_sombloc[ (16*n + s)*nblocs_seg + b + 2*grid.x] ) )
 683                                 printf("seg %d - %d pix : bloc %d -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix, b,
 684                                            c1, cx, cx2, h_sombloc[(16*n+s)*nblocs_seg + b], h_sombloc[(16*n+s)*nblocs_seg + b + grid.x],
 685                                            h_sombloc[(16*n+s)*nblocs_seg + b + 2*grid.x]) ;
 686                         }
 687
 688                 }
 689            for(int s=0; s<8; s++)
 690                 {
 691                   int nb_pix = calcul_liste_pixel_segment( h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
 692                                                                                                   h_snake[idx_nsuiv].posi,h_snake[idx_nsuiv].posj,
 693                                                                                                   h_liste_pixels_segment, 0);
 694                   for (int b=0; b<nblocs_seg; b++)
 695                         {
 696                           uint64 c1=0, cx=0, cx2=0 ;
 697                           int i,j;
 698                           for (int p=0; p<bs; p++)
 699                                 {
 700                                   if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
 701                                         {
 702                                           //  /!\ penser a oter le test de prise en
 703                                           // compte pour les pix sur la même ligne dans
 704                                           // le kernel, sinon les comparaisons des
 705                                           // sommes par colonne seront fausses
 706                                           i = h_liste_pixels_segment[2*(b*bs + p)] ;
 707                                           j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
 708                                           c1 += img_1[i][j] ;
 709                                           cx += img_x[i][j] ;
 710                                           cx2+= img_x2[i][j];
 711                                         }
 712                                 }
 713                           if ( ( c1 != h_sombloc[(16*n + s + 8)*nblocs_seg + b ] ) || ( cx != h_sombloc[(16*n + s + 8)*nblocs_seg + b + grid.x] )
 714                                    ||  ( cx2 != h_sombloc[ (16*n + s + 8)*nblocs_seg + b + 2*grid.x] ) )
 715                                 printf("seg %d - %d pix : bloc %d -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix, b,
 716                                            c1, cx, cx2, h_sombloc[(16*n+s+8)*nblocs_seg + b], h_sombloc[(16*n+s+8)*nblocs_seg + b + grid.x],
 717                                            h_sombloc[(16*n+s+8)*nblocs_seg + b + 2*grid.x]) ;
 718                         }
 719
 720                 }
 721
 722         }
 723 #endif //DEBUG_SOMBLOCS
 724
 725
 726  /*
 727
 728         Test du calcul des sommes totales 'somsom' faites par le kernel 'somsom_full'
 729
 730    */
 731
 732 #ifdef DEBUG_SOMSOM
 733  printf("********* SOMMES TOTALES  ***********\n");
 734   printf("bs = %d - grid = %d -  intervalles = %d - nblocs_seg = %d - pairs = %d \n", bs, grid.x, n_interval, nblocs_seg, pairs);
 735   for (int n=0; n< n_interval; n++)
 736         {
 737           idx_n = 2*n + !pairs ;
 738           if (idx_n == 0) idx_nprec = nnodes - 1 ;
 739           else idx_nprec = idx_n - 1 ;
 740           if (idx_n == nnodes-1) idx_nsuiv = 0 ;
 741           else idx_nsuiv = idx_n + 1 ;
 742           printf("******** node %d\n", idx_n) ;
 743           for(int s=0; s<8; s++)
 744                 {
 745                   int nb_pix = calcul_liste_pixel_segment(h_snake[idx_nprec].posi,h_snake[idx_nprec].posj,
 746                                                                                                   h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
 747                                                                                                   h_liste_pixels_segment, 0);
 748                   uint64 c1=0, cx=0, cx2=0 ;
 749                   for (int b=0; b<nblocs_seg; b++)
 750                         {
 751                           int i,j;
 752                           for (int p=0; p<bs; p++)
 753                                 {
 754                                   if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
 755                                         {
 756                                           //  /!\ penser a oter le test de prise en
 757                                           // compte pour les pix sur la même ligne dans
 758                                           // le kernel, sinon les comparaisons des
 759                                           // sommes par colonne seront fausses
 760                                           i = h_liste_pixels_segment[2*(b*bs + p)] ;
 761                                           j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
 762                                           c1 += img_1[i][j] ;
 763                                           cx += img_x[i][j] ;
 764                                           cx2+= img_x2[i][j];
 765                                         }
 766                                 }
 767                         }
 768                   if ( ( c1 != h_somsom[3*(16*n + s)] ) || ( cx != h_somsom[3*(16*n + s) + 1] )
 769                            ||  ( cx2 != h_somsom[3*(16*n + s) + 2] ) )
 770                                 printf("seg %d - %d pix -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix,
 771                                            c1, cx, cx2, h_somsom[3*(16*n + s)], h_somsom[3*(16*n + s) + 1],
 772                                            h_somsom[3*(16*n + s) + 2]) ;
 773
 774                 }
 775
 776            for(int s=0; s<8; s++)
 777                 {
 778                   int nb_pix = calcul_liste_pixel_segment( h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
 779                                                                                                   h_snake[idx_nsuiv].posi,h_snake[idx_nsuiv].posj,
 780                                                                                                    h_liste_pixels_segment, 0);
 781                   uint64 c1=0, cx=0, cx2=0 ;
 782                   for (int b=0; b<nblocs_seg; b++)
 783                         {
 784
 785                           int i,j;
 786                           for (int p=0; p<bs; p++)
 787                                 {
 788                                   if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
 789                                         {
 790                                           //  /!\ penser a oter le test de prise en
 791                                           // compte pour les pix sur la même ligne dans
 792                                           // le kernel, sinon les comparaisons des
 793                                           // sommes par colonne seront fausses
 794                                           i = h_liste_pixels_segment[2*(b*bs + p)] ;
 795                                           j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
 796                                           c1 += img_1[i][j] ;
 797                                           cx += img_x[i][j] ;
 798                                           cx2+= img_x2[i][j];
 799                                         }
 800                                 }
 801                         }
 802                   if ( ( c1 != h_somsom[3*(16*n + s + 8)]  ) || ( cx != h_somsom[3*(16*n + s + 8) + 1] )
 803                            ||  ( cx2 != h_somsom[3*(16*n + s + 8) + 2] ) )
 804                         printf("seg %d - %d pix -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix,
 805                                    c1, cx, cx2, h_somsom[3*(16*n + s + 8)], h_somsom[3*(16*n + s + 8) + 1],
 806                                    h_somsom[3*(16*n + s + 8)  + 2]) ;
 807
 808                 }
 809
 810         }
 811
 812 #endif
 813
 814
 815 #ifdef DEBUG_MV
 816   printf("**** STATS - REF : %lf \n", *h_vrais_snake);
 817   for(int n=0; n<n_interval; n++)
 818         {
 819           for(int p=0; p<8; p++)
 820                 {
 821                   printf("test %d du node %d : %lu - %lu - %lu - - log_vrais = %lf\n", p, (2*n + !pairs),
 822                                  h_stats[3*(8*n+p)], h_stats[3*(8*n+p)+1], h_stats[3*(8*n+p)+2], h_vrais[8*n+p]);
 823                 }
 824         }
 825 #endif //DEBUG_MV
 826
 827
 828 #ifdef DEBUG_CRST
 829   printf("**** CROISEMENTS \n");
 830   for(int n=0; n<nnodes; n++)
 831         {
 832           printf("test du seg %d : ",  n);
 833           if ( h_croist[n] ) printf("CROISEMENT\n"); else printf("\n");
 834         }
 835 #endif //DEBUG_CRST
 836
 837
 838 #ifdef DEBUG_MOVE
 839   printf("**** MOUVEMENTS \n");
 840   for(int n=0; n<nnodes; n++)
 841         {
 842           printf("Node %d : (%s) ",n, (h_move[n])? "yes":"no");
 843         }
 844 #endif //DEBUG_MOVE
 845
 846   delete h_liste_positions ;
 847   delete h_snake;
 848
 849   /*
 850    * fin generation snake GPU
 851    */
 852 }
 853
 854