src/lib_gpu.cu

   1
   2 #include <stdio.h>
   3
   4
   5 extern "C"{
   6 #include "structures.h"
   7 #include "lib_math.h"
   8 #include "defines.h"
   9 #include "lib_gpu.h"
  10 #include "lib_snake_2_gpu.h"
  11 }
  12 #include "lib_test_gpu.h"
  13 #include "lib_kernels_cumuls.cu"
  14 #include "lib_kernel_snake_2_gpu.cu"
  15
  16 #define DEBUG_IMG_CUMUL 1
  17 bool DISPLAY_ERR_IMG_CUMUL = 1;
  18 //#define DEBUG_POSITIONS
  19 //#define DEBUG_MOVE
  20 //#define DEBUG_CRST
  21 //#define DEBUG_MV
  22 //#define DEBUG_SOMSOM
  23 //#define DEBUG_SOMBLOCS
  24 //#define DEBUG_LISTES
  25 //#define DEBUG_STATS_REF
  26
  27
  28
  29 void cuda_init_img_cumul(unsigned short ** img_in, int H, int L, int nb_nodes,
  30                                                  unsigned short ** d_img, t_cumul_x ** d_img_x, t_cumul_x2 ** d_img_x2,
  31                                                  int ** d_freemanDiDj, int ** d_codeNoeud,
  32                                                  snake_node_gpu ** d_snake, uint32 ** d_nb_pix_max,
  33                                                  uint4 ** d_positions, uint64 ** d_contribs_segments, uint4 ** d_freemans_centres,
  34                                                  int ** d_codes_segments, int64 ** d_stats_snake,
  35                                                  int64 ** d_stats, int64 ** d_stats_ref, double ** d_vrais, double ** d_vrais_snake,
  36                                                  uint2 ** d_liste_pixels, uint64 ** d_contribs_segments_blocs,
  37                                                  bool ** d_move
  38                                                  )
  39 {
  40   unsigned int taille = H*L;
  41   timeval chrono;
  42
  43
  44   //allocation cumuls en memoire GPU
  45   tic(&chrono, NULL);
  46   /*
  47         MAX_PIX 20000
  48         MAX_NODES 10000
  49         MAX_LISTE_PIX 10000000
  50    */
  51   cudaMalloc( (void**) d_snake, MAX_NODES*sizeof(snake_node_gpu) );
  52
  53   cudaMalloc( (void**) d_img, taille*sizeof(unsigned short) );
  54   cudaMalloc( (void**) d_img_x, taille*sizeof(t_cumul_x) );
  55   cudaMalloc( (void**) d_img_x2, taille*sizeof(t_cumul_x2) );
  56
  57   cudaMalloc( (void**) d_freemanDiDj, 9*sizeof(int) );
  58   cudaMalloc( (void**) d_codeNoeud, 64*sizeof(int) );
  59
  60   cudaMalloc( (void**) d_stats_snake, 6*sizeof(int64)) ;
  61   cudaMalloc( (void**) d_positions, 8*MAX_NODES*sizeof(uint4)) ;
  62   cudaMalloc( (void**) d_contribs_segments, 3*16*MAX_NODES*sizeof(uint64)) ;
  63   cudaMalloc( (void**) d_contribs_segments_blocs, (3*MAX_LISTE_PIX/32)*sizeof(uint64)) ;
  64   cudaMalloc( (void**) d_freemans_centres, 16*MAX_NODES*sizeof(uint4)) ;
  65   cudaMalloc( (void**) d_codes_segments, 16*MAX_NODES*sizeof(int)) ;
  66   cudaMalloc( (void**) d_stats, 3*8*MAX_NODES*sizeof(int64)) ;
  67   cudaMalloc( (void**) d_stats_ref, 3*MAX_NODES*sizeof(int64)) ;
  68   cudaMalloc( (void**) d_vrais, 8*MAX_NODES*sizeof(double)) ;
  69   cudaMalloc( (void**) d_move, MAX_NODES*sizeof(bool)) ;
  70   cudaMalloc( (void**) d_nb_pix_max, sizeof(uint32)) ;
  71   cudaMalloc( (void**) d_vrais_snake, sizeof(double)) ;
  72
  73   cudaMalloc( (void**) d_liste_pixels, 16*5*(MAX_NODES)*sizeof(uint2) );
  74
  75   printf("TOTAL MEM = %ld octets\n",
  76                  (2*MAX_NODES*(sizeof(snake_node_gpu)+(8+16)*sizeof(uint4)+3*16*8+16*4+24*8+3*8+8*sizeof(double)+sizeof(bool))
  77                  +(MAX_LISTE_PIX)*(sizeof(uint2)+1)
  78                  +taille*(8+sizeof(t_cumul_x)+sizeof(t_cumul_x2))
  79                   +9*4+64*4+6*8+4+sizeof(double)) );
  80
  81   int64 * h_stats_snake = new int64[6];
  82
  83   toc(chrono, "temps alloc mem GPU");
  84
  85   /*detection-choix-initialisation de la carte GPU*/
  86   tic(&chrono, NULL) ;
  87   cudaDeviceProp deviceProp;
  88   deviceProp.major = 2;
  89   deviceProp.minor = 0;
  90   int dev;
  91   cudaChooseDevice(&dev, &deviceProp);
  92   cudaGetDeviceProperties(&deviceProp, dev);
  93   if(deviceProp.major >= 2 )
  94         {
  95           printf("Using Device %d: \"%s\"\n", dev, deviceProp.name);
  96           cudaSetDevice(dev);
  97         }
  98   toc(chrono, "temps acces GPU") ;
  99
 100   //copie tables correspondances freeman en mem GPU
 101   tic(&chrono, NULL) ;
 102   cudaMemcpy( *d_freemanDiDj, CORRESPONDANCE_Di_Dj_FREEMAN , 9*sizeof(int), cudaMemcpyHostToDevice);
 103   cudaMemcpy( *d_codeNoeud, TABLE_CODAGE , 64*sizeof(unsigned int), cudaMemcpyHostToDevice);
 104   toc(chrono, "temps transfert tables de codage") ;
 105
 106   /*transfert image en global mem GPU*/
 107   tic(&chrono, NULL);
 108   cudaMemcpy( *d_img, img_in[0], taille*sizeof(unsigned short), cudaMemcpyHostToDevice);
 109   toc(chrono, "transfert image vers GPU");
 110
 111   //calculs images cumulees sur GPU
 112   int blocs_max = 65536 ;
 113   int bs = 256 ; //arbitraire, d'apres les observations c'est souvent l'optimu
 114   unsigned int base = 0 ;
 115   unsigned int bl_l  = (L+bs-1)/bs ;
 116   unsigned int nb_lines =  blocs_max / bl_l ;
 117   unsigned int lines ;
 118   unsigned int tranches = ( 1 + H / nb_lines ) ;
 119   nb_lines = (H +tranches -1)/ tranches ; // equilibre la taille des tranches
 120
 121   dim3 threads(bs,1,1);
 122   int smem = nextPow2(bl_l)*2; //smem pour le prefixscan des sommes de blocs (etape 2)
 123   smem += smem >> DEC;
 124   smem += smem >> DEC;
 125   int smem_size = smem*sizeof(uint64);
 126   uint64 * d_somblocs ; // sommes des cumuls par bloc de calcul
 127
 128
 129   if(DEBUG_IMG_CUMUL)
 130         {
 131           printf("--- CALCULS IMAGES CUMULEES+STATS GPU  ----\n");
 132           printf("\t%d threads par bloc  -- %u blocs par ligne -- %u tranches -- %u lignes par tranche \n",bs, bl_l, tranches,nb_lines);
 133           printf(" Smem totale pour cumuls : %d\n", CFI(bs)*(sizeof(t_cumul_x)+sizeof(t_cumul_x2)) );
 134           tic(&chrono, NULL);
 135         }
 136   //calculs cumuls generiques : necessitent 3 etapes / 3 kernels
 137   cudaMalloc( (void**) &d_somblocs, 2*bl_l*nb_lines*sizeof(uint64) );
 138   cudaFuncSetCacheConfig(calcul_cumuls_gpu, cudaFuncCachePreferShared);
 139   do
 140         {
 141           if  ( H-base < nb_lines ) lines = H - base ; else lines = nb_lines ;
 142           printf("base = ligne %d -- traitement de %d lignes \n", base, lines) ;
 143           dim3 grid(bl_l*lines,1,1) ;
 144           calcul_cumuls_gpu<<<grid, threads, CFI(bs)*sizeof(tcumuls)>>>(*d_img, *d_img_x, *d_img_x2, H, L, d_somblocs, bl_l, base, lines) ;
 145           scan_somblocs<<<2*lines, nextPow2(bl_l)/2, smem_size>>>(d_somblocs, bl_l) ;
 146           add_soms_to_cumuls<<<grid, threads>>>(*d_img_x, *d_img_x2, H, L, d_somblocs, bl_l, base, lines) ;
 147           base += lines ;
 148         }
 149   while (base < H) ;
 150   cudaFree(d_somblocs) ;
 151
 152   //calcul des sommes totales N, sigX et sigX2 sur l'image
 153   calcul_stats_image<<<1, 1>>>( *d_img_x, *d_img_x2, H, L, (uint64*)*d_stats_snake);
 154
 155
 156           cudaThreadSynchronize()   ;
 157           toc(chrono, "\tTemps GPU");
 158          if(DEBUG_IMG_CUMUL)
 159         {
 160
 161           //allocation memoire CPU
 162           t_cumul_x  * img_x = new t_cumul_x [H*L];
 163           t_cumul_x2 *  img_x2 = new t_cumul_x2 [H*L];
 164
 165           /*pour test comparaison*/
 166           t_cumul_x * img_xb = new t_cumul_x [H*L];
 167           t_cumul_x2 * img_x2b = new t_cumul_x2 [H*L];
 168
 169           cudaMemcpy( img_xb, *d_img_x, taille*sizeof(t_cumul_x), cudaMemcpyDeviceToHost);
 170           cudaMemcpy( img_x2b, *d_img_x2, taille*sizeof(t_cumul_x2), cudaMemcpyDeviceToHost);
 171
 172           //cumuls : etape 1 CPU
 173           /*
 174                 for (int i=0; i<H; i++)
 175                 {
 176                         for (int b=0; b<bl_l; b++)
 177                         {
 178                                 int offset = b*bs ;
 179                                 img_x[i*L+offset] = img_in[i][offset] ;
 180                                 img_x2[i*L+offset]= img_in[i][offset]*img_in[i][offset] ;
 181                                 for (int p=1; p<bs; p++)
 182                                 {
 183                                         int j = p+offset ;
 184                                         if (j<L)
 185                                         {
 186                                                 img_x[i*L+j] = img_x[i*L+j-1] + img_in[i][j];
 187                                                 img_x2[i*L+j] = img_x2[i*L+j-1] + img_in[i][j]*img_in[i][j] ;
 188                                         }
 189                                 }
 190                         }
 191                 }
 192           */
 193           //cumuls complets CPU
 194
 195           for (int i=0; i<H; i++)
 196                 {
 197                   img_x[i*L+0] = img_in[i][0] ;
 198                   img_x2[i*L+0]= img_in[i][0]*img_in[i][0] ;
 199                   for (int j=1; j<L; j++)
 200                         {
 201                           img_x[i*L+j]  = img_x[i*L+j-1]  + img_in[i][j] ;
 202                           img_x2[i*L+j] = img_x2[i*L+j-1] + img_in[i][j]*img_in[i][j] ;
 203                         }
 204                 }
 205
 206           int cpt = 0;
 207           int cpt_errx=0, cpt_errx2 = 0;
 208           for (int i=0; i< H; i++){
 209                 for (int j=0; j< L; j++){
 210                   if ( (img_x[i*L+j] !=  img_xb[i*L+j]) ) cpt_errx++ ;
 211                   if ( (img_x2[i*L+j] !=  img_x2b[i*L+j]) ) cpt_errx2++ ;
 212                   if ( (img_x[i*L+j] !=  img_xb[i*L+j]) || (img_x2[i*L+j] !=  img_x2b[i*L+j]))
 213                   {
 214                         //printf("(%d,%d)sxCPU=%lu  sxGPU=%lu -- sx2CPU=%lu  sx2GPU=%lu\n",i,j,img_x[i*L+j], img_xb[i*L+j], img_x2[i*L+j], img_x2b[i*L+j]);
 215                   }
 216                   cpt++;
 217                 }
 218           }
 219           printf("%d erreurs sur CX / %d points\n", cpt_errx, cpt );
 220           printf("%d erreurs sur CX2 / %d points\n", cpt_errx2, cpt );
 221           uint64 sigX = 0, sigX2 = 0 ;
 222           for (int i=0; i<H; i++)
 223                 {
 224                   sigX += img_x[i*L+L-1] ;
 225                   sigX2+= img_x2[i*L+L-1];
 226                 }
 227           printf("STATS IMAGE  N = %d - sigX = %lu - sigX2 = %lu\n",  H*L, sigX, sigX2 );
 228         }
 229
 230   /*
 231    * generation snake en mem GPU
 232    */
 233   int dist = 140 ;
 234
 235   tic(&chrono, NULL);
 236
 237   int MAX_DIAGOS = 1024*65536 ;
 238   int ret, Q = 120 ;
 239   int * d_n_diagos, h_n_diagos;
 240   uint4 * d_diagos_snake ;
 241   uint4 * h_diagos_snake = new uint4[MAX_DIAGOS];
 242
 243
 244   cudaMalloc( (void**) &d_n_diagos,  sizeof(int)) ;
 245   cudaMalloc( (void**) &d_diagos_snake, MAX_DIAGOS*sizeof(uint4)) ;
 246
 247   genere_diagos_rectangle<<<1,1>>>(d_diagos_snake, H,L,Q, d_n_diagos);
 248
 249   cudaMemcpy( &h_n_diagos, d_n_diagos, sizeof(int), cudaMemcpyDeviceToHost) ;
 250   ret = cudaMemcpy( h_diagos_snake, d_diagos_snake, MAX_DIAGOS*sizeof(uint4), cudaMemcpyDeviceToHost) ;
 251
 252   toc(chrono, "\tCalcul diagos");
 253
 254   printf("COPY : %s, DIAGOS = %d / %d\n", (ret==0)?"OK":"ERREUR", h_n_diagos,MAX_DIAGOS);
 255   for (int d=0; d<200;d++){
 256         printf("| (%d,%d)-(%d,%d) ", h_diagos_snake[d].x, h_diagos_snake[d].y, h_diagos_snake[d].z, h_diagos_snake[d].w );
 257   }
 258
 259   exit(0);
 260   genere_snake_rectangle_4nodes_gpu<<< 1, 1>>>(*d_snake, 140, H, L) ;
 261
 262
 263   int nnodes = nb_nodes ;
 264   snake_node_gpu * h_snake = new snake_node_gpu[nnodes];
 265   snake_node * h_snake_ll = new snake_node[nnodes] ;
 266   uint4 * h_liste_positions = new uint4[nnodes*8];
 267   double * h_vrais_snake = new double ;
 268   //init les stats du snake
 269   uint2 * d_liste_temp  ;
 270   t_sum_x2 * d_sompart  ;
 271   int tpb, bps, npixmax ;
 272
 273   //calcul nb threads par bloc
 274   npixmax = 2*(H+L-4*dist)/(nnodes-1) ;
 275   tpb = nextPow2(npixmax) ;
 276   if (tpb >= 256) tpb = 256 ;//  /!\ le kernel <<< calcul_contrib...>>> ne supporte pas un bs>256 a cause de la shared-mem nécessaire
 277   if (tpb < 32 ) tpb = 32 ;
 278   tpb=128 ;
 279   bps = (npixmax+tpb-1)/tpb ;
 280   printf("PARAMS EXEC INIT : %d pix max, %d threads/bloc, %d blocs/seg, %d blocs/grille\n", npixmax, tpb, bps, nnodes*bps);
 281
 282   //alloc
 283   cudaMalloc((void**) &d_liste_temp, nnodes*bps*tpb*sizeof(uint2));
 284   cudaMalloc((void**) &d_sompart, 3*nnodes*bps*sizeof(t_sum_x2));
 285   cudaMalloc((void**) &d_stats_ref, 3*nnodes*sizeof(int64));
 286
 287   //DEBUG : pour forcer la mise à zero du tableau intermediaire d_stats_ref
 288   /*
 289   int64 h_stats_ref[3*nnodes] ;
 290   for (int a=0; a<3*nnodes ; a++) h_stats_ref[a] = 0 ;
 291   cudaMemcpy( h_stats_ref, d_stats_ref, sizeof(int64), cudaMemcpyHostToDevice) ;
 292   */
 293   //fin forçage a 0
 294
 295   //DEBUG : pour forcer la mise à zero du tableau intermediaire d_sompart
 296   /*
 297      t_sum_x2 h_sompart[ 3*nnodes*bps ] ;
 298      for (int a=0; a<3*nnodes*bps ; a++) h_sompart[a] = 0 ;
 299      cudaMemcpy( h_sompart, d_sompart, sizeof(t_sum_x2), cudaMemcpyHostToDevice) ;
 300   */
 301   //fin forçage a 0
 302
 303   calcul_contribs_segments_snake<<< nnodes*bps, tpb, (CFI(tpb))*(3*sizeof(t_sum_x2))>>>
 304         (*d_snake, nnodes,
 305          *d_img_x, *d_img_x2,
 306          L, d_liste_temp, d_sompart, *d_freemanDiDj );
 307
 308   //TODO
 309   //parametrer pour ne pas appeler qd tpb=1
 310   //oblige a modifier le kernel <<< calcul_contrib...>>> pour ecrire directement ds d_snake
 311   // au lieu de d_sompart
 312   somsom_snake<<< nnodes , 1 >>>(d_sompart, nnodes, bps, *d_snake);
 313
 314
 315   calcul_stats_snake<<< 1 , 1 >>>(*d_snake, nnodes, *d_stats_snake, *d_vrais_snake,
 316                                                                   *d_img_x, *d_img_x2,
 317                                                                   *d_codeNoeud, L
 318                                                                   );
 319   cudaThreadSynchronize() ;
 320   toc(chrono, "\tTemps") ;
 321
 322   /*
 323         verif stats initiales du snake
 324   */
 325   cudaMemcpy( h_vrais_snake, *d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost) ;
 326   cudaMemcpy( h_stats_snake, *d_stats_snake, 6*sizeof(int64), cudaMemcpyDeviceToHost) ;
 327
 328   printf("STATS SNAKE log vrais=%lf : c1=%lu - cx=%lu - cx2=%lu - N=%lu - SUMX=%lu - SUMX2=%lu\n",
 329                  *h_vrais_snake,
 330                  h_stats_snake[0],  h_stats_snake[1],  h_stats_snake[2],
 331                  h_stats_snake[3],  h_stats_snake[4],  h_stats_snake[5] );
 332
 333   /*
 334         verif stats diminuees des contribs des 2 segments associes a chaque noeud
 335   */
 336 #ifdef DEBUG_STATS_REF
 337   cudaMemcpy( h_stats_ref, d_stats_ref, 3*nnodes*sizeof(int64), cudaMemcpyDeviceToHost) ;
 338   cudaMemcpy( h_snake, *d_snake, nnodes*sizeof(snake_node_gpu), cudaMemcpyDeviceToHost) ;
 339
 340
 341   printf("******* STATS DIMINUEES\n");
 342   for(int n=0; n<nnodes;n++)
 343         {
 344           int i = h_snake[n].posi, j = h_snake[n].posj ;
 345           printf("node %d (%d,%d) : %ld - %ld - %ld - img1= %lu - imgx= %lu - imgx2= %lu \n", n, i, j,
 346                          h_stats_ref[3*n], h_stats_ref[3*n +1], h_stats_ref[3*n +2],
 347                          img_1[i][j], img_x[i][j], img_x2[i][j]);
 348         }
 349 #endif //DEBUG_STATS_REF
 350
 351   //snake2gpu(d_snake, snake, nb_nodes);
 352   //gpu2snake(*d_snake, &h_snake_ll, nnodes);
 353
 354
 355 #ifdef DEBUG_POSITIONS
 356   for (int n=0; n<nnodes; n++)
 357         {
 358           printf("Node %d :\n", n);
 359           for (int pos=0; pos<8; pos++)
 360                 {
 361                   printf("(%d , %d):%d:%d | ", h_liste_positions[8*n + pos].x, h_liste_positions[8*n + pos].y,
 362                                  h_liste_positions[8*n + pos].z, h_liste_positions[8*n + pos].w);
 363                 }
 364           printf("\n");
 365         }
 366 #endif //DEBUG_POSITIONS
 367
 368 //verif liste pixels noeuds pairs/impairs selon
 369
 370 #ifdef DEBUG_LISTES
 371   printf("NOMBRE PIXELS pour LISTE = %d\n", *h_nb_pix_max) ;
 372   printf("bs = %d - grid = %d - nblocs_seg = %d - npix_max = %d - taille mem = %d\n",
 373                  bs, grid.x, nblocs_seg, *h_nb_pix_max, taille_mem);
 374
 375   cudaMemcpy( h_liste_pix, d_liste_pix, taille_mem*sizeof(uint2), cudaMemcpyDeviceToHost ) ;
 376   cudaMemcpy( h_snake, *d_snake, nnodes*sizeof(snake_node_gpu), cudaMemcpyDeviceToHost );
 377   uint32 * h_liste_pixels_segment = new uint32[2*(*h_nb_pix_max)] ;
 378   int idx_n, idx_nprec, idx_nsuiv ;
 379
 380   printf("********* LISTE PIX  ***********\n");
 381   printf("bs = %d - grid = %d - nblocs_seg = %d - npix_max = %d - taille mem = %d\n",
 382                  bs, grid.x, nblocs_seg, *h_nb_pix_max, taille_mem);
 383
 384   for (int n=0; n<(nnodes/2 + (nnodes%2)*pairs); n++)
 385         {
 386           idx_n = 2*n + !pairs ;
 387           if (idx_n == 0) idx_nprec = nnodes - 1;
 388           else idx_nprec = idx_n - 1;
 389           if (idx_n == nnodes-1) idx_nsuiv = 0;
 390           else idx_nsuiv = idx_n + 1 ;
 391
 392           for (int pos=0; pos < 8 ; pos++) //test des segments avant le noeud
 393                 {
 394
 395                   int nb_pix = calcul_liste_pixel_segment(h_snake[idx_nprec].posi,h_snake[idx_nprec].posj,
 396                                                                                                   h_liste_positions[8*idx_n+pos].x, h_liste_positions[8*idx_n+pos].y,
 397                                                                                                   h_liste_pixels_segment, 0);
 398
 399                   for (int pix=0; pix < nb_pix; pix++)
 400                         {
 401
 402                           if ( (h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].x != h_liste_pixels_segment[2*pix] )
 403                                    || ( h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].y != h_liste_pixels_segment[2*pix+1] ) )
 404                                 printf("erreur avant n=%d pix %d/%d segment %d noeuds[ %d-%d-%d ] , CPU (%d,%d) - GPU (%d, %d)\n", n, pix, nb_pix, pos,
 405                                            idx_nprec, idx_n, idx_nsuiv,
 406                                            h_liste_pixels_segment[2*pix], h_liste_pixels_segment[2*pix+1],
 407                                            h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].x,  h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].y);
 408
 409                         }
 410
 411                 }
 412           for (int pos=0; pos < 8 ; pos++) //test des segments apres le noeud
 413                 {
 414
 415                   int nb_pix = calcul_liste_pixel_segment(h_liste_positions[8*idx_n+pos].x, h_liste_positions[8*idx_n+pos].y,
 416                                                                                                   h_snake[idx_nsuiv].posi,h_snake[idx_nsuiv].posj,
 417                                                                                                   h_liste_pixels_segment, 0);
 418
 419                   for (int pix=0; pix < nb_pix; pix++)
 420                         {
 421
 422                           if ( (h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].x != h_liste_pixels_segment[2*pix] )
 423                              || ( h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].y != h_liste_pixels_segment[2*pix+1] ) )
 424                                 printf("erreur apres n=%d pix %d/%d segment %d noeuds[ %d-%d-%d ] , CPU (%d,%d) - GPU (%d, %d)\n", n, pix, nb_pix, pos+8,
 425                                            idx_nprec, idx_n, idx_nsuiv,
 426                                            h_liste_pixels_segment[2*pix], h_liste_pixels_segment[2*pix+1],
 427                                            h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].x,  h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].y);
 428
 429                         }
 430
 431                 }
 432
 433                 }
 434
 435 #endif //DEBUG_LISTES
 436
 437   /*
 438
 439         Test du calcul des sommes partielles 'somblocs' faites par le kernel 'calcul_contribs_segments_blocs_full'
 440
 441    */
 442
 443 #ifdef DEBUG_SOMBLOCS
 444   printf("********* SOMMES PARTIELLES  ***********\n");
 445   printf("bs = %d - grid = %d -  intervalles = %d - nblocs_seg = %d - pairs = %d \n", bs, grid.x, n_interval, nblocs_seg, pairs);
 446   for (int n=0; n< n_interval; n++)
 447         {
 448           idx_n = 2*n + !pairs ;
 449           if (idx_n == 0) idx_nprec = nnodes - 1 ;
 450           else idx_nprec = idx_n - 1 ;
 451           if (idx_n == nnodes-1) idx_nsuiv = 0 ;
 452           else idx_nsuiv = idx_n + 1 ;
 453           printf("******** node %d\n", idx_n) ;
 454           for(int s=0; s<8; s++)
 455                 {
 456                   int nb_pix = calcul_liste_pixel_segment(h_snake[idx_nprec].posi,h_snake[idx_nprec].posj,
 457                                                                                                   h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
 458                                                                                                   h_liste_pixels_segment, 0);
 459                   for (int b=0; b<nblocs_seg; b++)
 460                         {
 461                           uint64 c1=0, cx=0, cx2=0 ;
 462                           int i,j;
 463                           for (int p=0; p<bs; p++)
 464                                 {
 465                                   if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
 466                                         {
 467                                           //  /!\ penser a oter le test de prise en
 468                                           // compte pour les pix sur la même ligne dans
 469                                           // le kernel, sinon les comparaisons des
 470                                           // sommes par colonne seront fausses
 471                                           i = h_liste_pixels_segment[2*(b*bs + p)] ;
 472                                           j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
 473                                           c1 += img_1[i][j] ;
 474                                           cx += img_x[i][j] ;
 475                                           cx2+= img_x2[i][j];
 476                                         }
 477                                 }
 478                           if ( ( c1 != h_sombloc[(16*n + s)*nblocs_seg + b ] ) || ( cx != h_sombloc[(16*n + s)*nblocs_seg + b + grid.x] )
 479                                    ||  ( cx2 != h_sombloc[ (16*n + s)*nblocs_seg + b + 2*grid.x] ) )
 480                                 printf("seg %d - %d pix : bloc %d -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix, b,
 481                                            c1, cx, cx2, h_sombloc[(16*n+s)*nblocs_seg + b], h_sombloc[(16*n+s)*nblocs_seg + b + grid.x],
 482                                            h_sombloc[(16*n+s)*nblocs_seg + b + 2*grid.x]) ;
 483                         }
 484
 485                 }
 486            for(int s=0; s<8; s++)
 487                 {
 488                   int nb_pix = calcul_liste_pixel_segment( h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
 489                                                                                                   h_snake[idx_nsuiv].posi,h_snake[idx_nsuiv].posj,
 490                                                                                                   h_liste_pixels_segment, 0);
 491                   for (int b=0; b<nblocs_seg; b++)
 492                         {
 493                           uint64 c1=0, cx=0, cx2=0 ;
 494                           int i,j;
 495                           for (int p=0; p<bs; p++)
 496                                 {
 497                                   if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
 498                                         {
 499                                           //  /!\ penser a oter le test de prise en
 500                                           // compte pour les pix sur la même ligne dans
 501                                           // le kernel, sinon les comparaisons des
 502                                           // sommes par colonne seront fausses
 503                                           i = h_liste_pixels_segment[2*(b*bs + p)] ;
 504                                           j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
 505                                           c1 += img_1[i][j] ;
 506                                           cx += img_x[i][j] ;
 507                                           cx2+= img_x2[i][j];
 508                                         }
 509                                 }
 510                           if ( ( c1 != h_sombloc[(16*n + s + 8)*nblocs_seg + b ] ) || ( cx != h_sombloc[(16*n + s + 8)*nblocs_seg + b + grid.x] )
 511                                    ||  ( cx2 != h_sombloc[ (16*n + s + 8)*nblocs_seg + b + 2*grid.x] ) )
 512                                 printf("seg %d - %d pix : bloc %d -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix, b,
 513                                            c1, cx, cx2, h_sombloc[(16*n+s+8)*nblocs_seg + b], h_sombloc[(16*n+s+8)*nblocs_seg + b + grid.x],
 514                                            h_sombloc[(16*n+s+8)*nblocs_seg + b + 2*grid.x]) ;
 515                         }
 516
 517                 }
 518
 519         }
 520 #endif //DEBUG_SOMBLOCS
 521
 522
 523  /*
 524
 525         Test du calcul des sommes totales 'somsom' faites par le kernel 'somsom_full'
 526
 527    */
 528
 529 #ifdef DEBUG_SOMSOM
 530  printf("********* SOMMES TOTALES  ***********\n");
 531   printf("bs = %d - grid = %d -  intervalles = %d - nblocs_seg = %d - pairs = %d \n", bs, grid.x, n_interval, nblocs_seg, pairs);
 532   for (int n=0; n< n_interval; n++)
 533         {
 534           idx_n = 2*n + !pairs ;
 535           if (idx_n == 0) idx_nprec = nnodes - 1 ;
 536           else idx_nprec = idx_n - 1 ;
 537           if (idx_n == nnodes-1) idx_nsuiv = 0 ;
 538           else idx_nsuiv = idx_n + 1 ;
 539           printf("******** node %d\n", idx_n) ;
 540           for(int s=0; s<8; s++)
 541                 {
 542                   int nb_pix = calcul_liste_pixel_segment(h_snake[idx_nprec].posi,h_snake[idx_nprec].posj,
 543                                                                                                   h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
 544                                                                                                   h_liste_pixels_segment, 0);
 545                   uint64 c1=0, cx=0, cx2=0 ;
 546                   for (int b=0; b<nblocs_seg; b++)
 547                         {
 548                           int i,j;
 549                           for (int p=0; p<bs; p++)
 550                                 {
 551                                   if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
 552                                         {
 553                                           //  /!\ penser a oter le test de prise en
 554                                           // compte pour les pix sur la même ligne dans
 555                                           // le kernel, sinon les comparaisons des
 556                                           // sommes par colonne seront fausses
 557                                           i = h_liste_pixels_segment[2*(b*bs + p)] ;
 558                                           j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
 559                                           c1 += img_1[i][j] ;
 560                                           cx += img_x[i][j] ;
 561                                           cx2+= img_x2[i][j];
 562                                         }
 563                                 }
 564                         }
 565                   if ( ( c1 != h_somsom[3*(16*n + s)] ) || ( cx != h_somsom[3*(16*n + s) + 1] )
 566                            ||  ( cx2 != h_somsom[3*(16*n + s) + 2] ) )
 567                                 printf("seg %d - %d pix -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix,
 568                                            c1, cx, cx2, h_somsom[3*(16*n + s)], h_somsom[3*(16*n + s) + 1],
 569                                            h_somsom[3*(16*n + s) + 2]) ;
 570
 571                 }
 572
 573            for(int s=0; s<8; s++)
 574                 {
 575                   int nb_pix = calcul_liste_pixel_segment( h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
 576                                                                                                   h_snake[idx_nsuiv].posi,h_snake[idx_nsuiv].posj,
 577                                                                                                    h_liste_pixels_segment, 0);
 578                   uint64 c1=0, cx=0, cx2=0 ;
 579                   for (int b=0; b<nblocs_seg; b++)
 580                         {
 581
 582                           int i,j;
 583                           for (int p=0; p<bs; p++)
 584                                 {
 585                                   if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
 586                                         {
 587                                           //  /!\ penser a oter le test de prise en
 588                                           // compte pour les pix sur la même ligne dans
 589                                           // le kernel, sinon les comparaisons des
 590                                           // sommes par colonne seront fausses
 591                                           i = h_liste_pixels_segment[2*(b*bs + p)] ;
 592                                           j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
 593                                           c1 += img_1[i][j] ;
 594                                           cx += img_x[i][j] ;
 595                                           cx2+= img_x2[i][j];
 596                                         }
 597                                 }
 598                         }
 599                   if ( ( c1 != h_somsom[3*(16*n + s + 8)]  ) || ( cx != h_somsom[3*(16*n + s + 8) + 1] )
 600                            ||  ( cx2 != h_somsom[3*(16*n + s + 8) + 2] ) )
 601                         printf("seg %d - %d pix -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix,
 602                                    c1, cx, cx2, h_somsom[3*(16*n + s + 8)], h_somsom[3*(16*n + s + 8) + 1],
 603                                    h_somsom[3*(16*n + s + 8)  + 2]) ;
 604
 605                 }
 606
 607         }
 608
 609 #endif
 610
 611
 612 #ifdef DEBUG_MV
 613   printf("**** STATS - REF : %lf \n", *h_vrais_snake);
 614   for(int n=0; n<n_interval; n++)
 615         {
 616           for(int p=0; p<8; p++)
 617                 {
 618                   printf("test %d du node %d : %lu - %lu - %lu - - log_vrais = %lf\n", p, (2*n + !pairs),
 619                                  h_stats[3*(8*n+p)], h_stats[3*(8*n+p)+1], h_stats[3*(8*n+p)+2], h_vrais[8*n+p]);
 620                 }
 621         }
 622 #endif //DEBUG_MV
 623
 624
 625 #ifdef DEBUG_CRST
 626   printf("**** CROISEMENTS \n");
 627   for(int n=0; n<nnodes; n++)
 628         {
 629           printf("test du seg %d : ",  n);
 630           if ( h_croist[n] ) printf("CROISEMENT\n"); else printf("\n");
 631         }
 632 #endif //DEBUG_CRST
 633
 634
 635 #ifdef DEBUG_MOVE
 636   printf("**** MOUVEMENTS \n");
 637   for(int n=0; n<nnodes; n++)
 638         {
 639           printf("Node %d : (%s) ",n, (h_move[n])? "yes":"no");
 640         }
 641 #endif //DEBUG_MOVE
 642
 643   delete h_liste_positions ;
 644   delete h_snake;
 645
 646   /*
 647    * fin generation snake GPU
 648    */
 649 }
 650
 651