#include "structures.h"
#include "lib_snake_2_gpu.h"

#include "lib_test_gpu.h"

#include "lib_kernels_cumuls.cu"
#include "lib_kernel_snake_2_gpu.cu"

#define DEBUG_IMG_CUMUL 1
bool DISPLAY_ERR_IMG_CUMUL = true;
//#define DEBUG_POSITIONS
//#define DEBUG_SOMSOM
//#define DEBUG_SOMBLOCS
//#define DEBUG_LISTES
//#define DEBUG_STATS_REF
double critere_gauss_cpu( int n, int ni, uint64 sxi, uint64 sxi2, int64 *stats_img){
  int ne = n - ni ;
  uint64 sxe = stats_img[4] - sxi ;
  uint64 sxe2= stats_img[5] - sxi2;
  double sigi2, sige2, critere ;

  /* variance of the gray-level values inside the target */
  sigi2 = ((double)sxi2/ni) -
          ((double)sxi/ni)*((double)sxi/ni) ;
  /* variance of the gray-level values outside the target */
  sige2 = ((double)sxe2/ne) -
          ((double)sxe/ne)*((double)sxe/ne) ;

  if ( (sigi2 > 0) && (sige2 > 0) )
    critere = 0.5*((double)ni*log(sigi2) + (double)ne*log(sige2)) ;
  else critere = DBL_MAX ;
  return critere ;
}
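/* The value minimized above is (up to constants) the negative Gaussian
   log-likelihood of a two-region model: a region of np pixels with variance
   s2 contributes -0.5*np*log(s2), hence
     critere = 0.5*( ni*log(sigi2) + ne*log(sige2) )
   and the best window is the one with the smallest value. Sketch of a direct
   call with hypothetical values (assuming stats_img[4]/[5] hold the
   image-wide sums SUM(x) and SUM(x^2), as filled by calcul_stats_image;
   here a 512x512 image of mean 100 and variance 400):

     int64 stats[6] = {0, 0, 0, 262144, 26214400, 2726297600LL};
     double c = critere_gauss_cpu(262144, 1000, 100000, 10500000, stats);
*/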
void cuda_init_img_cumul(unsigned short ** img_in, int H, int L, int nb_nodes,
                         unsigned short ** d_img, t_cumul_x ** d_img_x, t_cumul_x2 ** d_img_x2,
                         int ** d_freemanDiDj, int ** d_codeNoeud,
                         snake_node_gpu ** d_snake, uint32 ** d_nb_pix_max,
                         uint4 ** d_positions, uint64 ** d_contribs_segments, uint4 ** d_freemans_centres,
                         int ** d_codes_segments, int64 ** d_stats_snake,
                         int64 ** d_stats, int64 ** d_stats_ref, double ** d_vrais, double ** d_vrais_snake,
                         uint2 ** d_liste_pixels, uint64 ** d_contribs_segments_blocs,
                         bool ** d_move)
{
  unsigned int taille = H*L;

  //allocation of the cumulated images in GPU memory
#define MAX_LISTE_PIX 10000000
  cudaMalloc( (void**) d_snake, MAX_NODES*sizeof(snake_node_gpu) );

  cudaMalloc( (void**) d_img, taille*sizeof(unsigned short) );
  cudaMalloc( (void**) d_img_x, taille*sizeof(t_cumul_x) );
  cudaMalloc( (void**) d_img_x2, taille*sizeof(t_cumul_x2) );

  cudaMalloc( (void**) d_freemanDiDj, 9*sizeof(int) );
  cudaMalloc( (void**) d_codeNoeud, 64*sizeof(int) );

  cudaMalloc( (void**) d_stats_snake, 6*sizeof(int64)) ;
  cudaMalloc( (void**) d_positions, 8*MAX_NODES*sizeof(uint4)) ;
  cudaMalloc( (void**) d_contribs_segments, 3*16*MAX_NODES*sizeof(uint64)) ;
  cudaMalloc( (void**) d_contribs_segments_blocs, (3*MAX_LISTE_PIX/32)*sizeof(uint64)) ;
  cudaMalloc( (void**) d_freemans_centres, 16*MAX_NODES*sizeof(uint4)) ;
  cudaMalloc( (void**) d_codes_segments, 16*MAX_NODES*sizeof(int)) ;
  cudaMalloc( (void**) d_stats, 3*8*MAX_NODES*sizeof(int64)) ;
  cudaMalloc( (void**) d_stats_ref, 3*MAX_NODES*sizeof(int64)) ;
  cudaMalloc( (void**) d_vrais, 8*MAX_NODES*sizeof(double)) ;
  cudaMalloc( (void**) d_move, MAX_NODES*sizeof(bool)) ;
  cudaMalloc( (void**) d_nb_pix_max, sizeof(uint32)) ;
  cudaMalloc( (void**) d_vrais_snake, sizeof(double)) ;

  cudaMalloc( (void**) d_liste_pixels, 16*5*(MAX_NODES)*sizeof(uint2) );

  printf("TOTAL MEM = %ld bytes\n",
         (2*MAX_NODES*(sizeof(snake_node_gpu)+(8+16)*sizeof(uint4)+3*16*8+16*4+24*8+3*8+8*sizeof(double)+sizeof(bool))
          +(MAX_LISTE_PIX)*(sizeof(uint2)+1)
          +taille*(8+sizeof(t_cumul_x)+sizeof(t_cumul_x2))
          +9*4+64*4+6*8+4+sizeof(double)) );
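  /* None of the cudaMalloc calls above checks its return code. A minimal
     guard (sketch only, deliberately left commented out so the original flow
     is unchanged) could wrap each call like this: */
  //#define CUDA_CHECK(call) do { cudaError_t _e = (call); if (_e != cudaSuccess) \
  //  printf("CUDA error '%s' at %s:%d\n", cudaGetErrorString(_e), __FILE__, __LINE__); } while(0)
  //CUDA_CHECK( cudaMalloc((void**) d_snake, MAX_NODES*sizeof(snake_node_gpu)) );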
  int64 * h_stats_snake = new int64[6];

  toc(chrono, "GPU memory allocation time");

  /* detection-selection-initialization of the GPU device */
  int dev = 0 ;
  cudaDeviceProp deviceProp;
  deviceProp.major = 2;
  deviceProp.minor = 0;
  cudaChooseDevice(&dev, &deviceProp);
  cudaGetDeviceProperties(&deviceProp, dev);
  if (deviceProp.major >= 2)
    printf("Using Device %d: \"%s\"\n", dev, deviceProp.name);
  toc(chrono, "GPU access time") ;
  //copy the Freeman correspondence tables to GPU memory
  cudaMemcpy( *d_freemanDiDj, CORRESPONDANCE_Di_Dj_FREEMAN, 9*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy( *d_codeNoeud, TABLE_CODAGE, 64*sizeof(int), cudaMemcpyHostToDevice);
  toc(chrono, "coding tables transfer time") ;

  /* transfer the image to GPU global memory */
  cudaMemcpy( *d_img, img_in[0], taille*sizeof(unsigned short), cudaMemcpyHostToDevice);
  toc(chrono, "image transfer to GPU");
  //compute the cumulated images on GPU
  int blocs_max = 65536 ;
  int bs = 256 ;   // arbitrary; observations show it is often the optimum
  unsigned int base = 0 ;
  unsigned int bl_l = (L+bs-1)/bs ;
  unsigned int nb_lines = blocs_max / bl_l ;
  unsigned int tranches = ( 1 + H / nb_lines ) ;
  nb_lines = (H +tranches -1)/ tranches ;   // balances the size of the slices
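  /* Example of the balancing (hypothetical sizes): with H = L = 4096 and
     bs = 256, bl_l = 16 blocks per line, so nb_lines = 65536/16 = 4096 lines
     fit in one grid; tranches = 1 + 4096/4096 = 2 and nb_lines is rebalanced
     to (4096+1)/2 = 2048 lines per slice instead of a 4096/0 split. */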
  dim3 threads(bs,1,1);
  int smem = nextPow2(bl_l)*2;   // smem for the prefix scan of the block sums (step 2)
  int smem_size = smem*sizeof(uint64);
  uint64 * d_somblocs ;          // per-block partial sums of the cumuls

  printf("--- COMPUTING CUMULATED IMAGES + STATS ON GPU ----\n");
  printf("\t%d threads per block -- %u blocks per line -- %u slices -- %u lines per slice \n", bs, bl_l, tranches, nb_lines);
  printf(" Total smem for cumuls : %d\n", (int)(CFI(bs)*(sizeof(t_cumul_x)+sizeof(t_cumul_x2))) );

  //generic cumul computation: requires 3 steps / 3 kernels
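  /* The three kernels appear to follow the classic scan-then-propagate
     scheme:
       1. calcul_cumuls_gpu  : per-block prefix sums of x and x^2 along each
                               image line, block sums written to d_somblocs;
       2. scan_somblocs      : prefix scan of the block sums of each line;
       3. add_soms_to_cumuls : each block adds its scanned offset back to its
                               partial cumuls, yielding full row-wise
                               cumulated images. */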
  cudaMalloc( (void**) &d_somblocs, 2*bl_l*nb_lines*sizeof(uint64) );
  cudaFuncSetCacheConfig(calcul_cumuls_gpu, cudaFuncCachePreferShared);
  int lines ;
  while ( base < (unsigned int)H )
    {
      if ( H-base < nb_lines ) lines = H - base ; else lines = nb_lines ;
      printf("base = line %d -- processing %d lines \n", base, lines) ;
      dim3 grid(bl_l*lines,1,1) ;
      calcul_cumuls_gpu<<<grid, threads, CFI(bs)*sizeof(tcumuls)>>>(*d_img, *d_img_x, *d_img_x2, H, L, d_somblocs, bl_l, base, lines) ;
      scan_somblocs<<<2*lines, nextPow2(bl_l)/2, smem_size>>>(d_somblocs, bl_l) ;
      add_soms_to_cumuls<<<grid, threads>>>(*d_img_x, *d_img_x2, H, L, d_somblocs, bl_l, base, lines) ;
      base += lines ;
    }
  cudaFree(d_somblocs) ;
  //compute the total sums N, sigX and sigX2 over the whole image
  calcul_stats_image<<<1, 1>>>( *d_img_x, *d_img_x2, H, L, (uint64*)*d_stats_snake);

  cudaThreadSynchronize() ;
  toc(chrono, "\tGPU time");
  //CPU memory allocation
  t_cumul_x * img_x = new t_cumul_x [H*L];
  t_cumul_x2 * img_x2 = new t_cumul_x2 [H*L];
  uint64 sigX = 0, sigX2 = 0 ;

  /* for comparison tests */
  t_cumul_x * img_xb = new t_cumul_x [H*L];
  t_cumul_x2 * img_x2b = new t_cumul_x2 [H*L];
  cudaMemcpy( img_xb, *d_img_x, taille*sizeof(t_cumul_x), cudaMemcpyDeviceToHost);
  cudaMemcpy( img_x2b, *d_img_x2, taille*sizeof(t_cumul_x2), cudaMemcpyDeviceToHost);
  //cumuls: CPU replay of step 1 (per-block partial sums)
  for (int i=0; i<H; i++)
    {
      for (int b=0; b<bl_l; b++)
        {
          int offset = b*bs ;
          if (offset >= L) break ;
          img_x[i*L+offset] = img_in[i][offset] ;
          img_x2[i*L+offset]= (t_cumul_x2)img_in[i][offset]*img_in[i][offset] ;
          for (int p=1; p<bs; p++)
            {
              int j = offset + p ;
              if (j < L)
                {
                  img_x[i*L+j] = img_x[i*L+j-1] + img_in[i][j];
                  img_x2[i*L+j] = img_x2[i*L+j-1] + (t_cumul_x2)img_in[i][j]*img_in[i][j] ;
                }
            }
        }
    }
  //complete CPU cumuls (reference)
  for (int i=0; i<H; i++)
    {
      img_x[i*L+0] = img_in[i][0] ;
      img_x2[i*L+0]= (t_cumul_x2)img_in[i][0]*img_in[i][0] ;
      for (int j=1; j<L; j++)
        {
          img_x[i*L+j] = img_x[i*L+j-1] + img_in[i][j] ;
          img_x2[i*L+j] = img_x2[i*L+j-1] + (t_cumul_x2)img_in[i][j]*img_in[i][j] ;
        }
    }
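  /* Reference definition used by the checks below: img_x and img_x2 are
     row-wise inclusive prefix sums, i.e. for every row i,
       img_x [i*L+j] = img_in[i][0] + ... + img_in[i][j]
       img_x2[i*L+j] = img_in[i][0]^2 + ... + img_in[i][j]^2
     e.g. for a row {2, 3, 1}: img_x = {2, 5, 6} and img_x2 = {4, 13, 14}. */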
  //compare CPU and GPU cumuls
  int cpt = H*L ;
  int cpt_errx=0, cpt_errx2 = 0;
  for (int i=0; i< H; i++){
    for (int j=0; j< L; j++){
      if ( (img_x[i*L+j] != img_xb[i*L+j]) ) cpt_errx++ ;
      if ( (img_x2[i*L+j] != img_x2b[i*L+j]) ) cpt_errx2++ ;
      if ( (img_x[i*L+j] != img_xb[i*L+j]) || (img_x2[i*L+j] != img_x2b[i*L+j]) )
        {
          //printf("(%d,%d)sxCPU=%lu sxGPU=%lu -- sx2CPU=%lu sx2GPU=%lu\n",i,j,img_x[i*L+j], img_xb[i*L+j], img_x2[i*L+j], img_x2b[i*L+j]);
        }
    }
  }
  printf("%d errors on CX / %d points\n", cpt_errx, cpt );
  printf("%d errors on CX2 / %d points\n", cpt_errx2, cpt );
  for (int i=0; i<H; i++)
    {
      sigX += img_x[i*L+L-1] ;
      sigX2+= img_x2[i*L+L-1];
    }
  printf("IMAGE STATS N = %d - sigX = %lu - sigX2 = %lu\n", H*L, sigX, sigX2 );
  /*
   * generation of the snake in GPU memory
   */

  /* Search for the optimal vertical window */
  int div = 256;   // number of divisions of the image: defines the step 'pas'.
                   // The max value follows from the max number of threads per grid.
  tcontribs *d_contribs_part ;   // partial contribs per compute block
  tcontribs *d_contribs_cols, *h_contribs_cols, *h_contribs_cols_cpu ;   // contribs of the columns
  double2 *d_miniblocs, *h_miniblocs ;   // minima of each top column
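  /* Strategy of the rectangle search, as exercised below: the image width is
     sampled every 'pas' columns; a first kernel accumulates the cumul values
     of each sampled column, a second reduces the per-block partial sums, and
     a third evaluates the Gaussian criterion for every (left, right) column
     pair, keeping for each left column the best right column in d_miniblocs.
     The CPU then only scans the 'div' remaining minima. */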
  //execution parameters
  int pas = L / div ;   // column sampling step (assumed from the use of 'c*pas' below)
  int bpc = (H + bs -1)/bs ;
  dim3 grid = dim3(div, bpc, 1);
  smem = CFI(bs)*sizeof(tcontribs) ;

  cudaMalloc((void**) &d_contribs_part, div*bpc*sizeof(tcontribs));
  cudaMalloc((void**) &d_contribs_cols, div*sizeof(tcontribs)) ;
  cudaMalloc((void**) &d_miniblocs, div*sizeof(double2)) ;
  h_contribs_cols = new tcontribs[div] ;
  h_contribs_cols_cpu = new tcontribs[div] ;
  h_miniblocs = new double2[div] ;
  calcul_contribs_cols<<<grid, bs, smem>>>( *d_img_x, *d_img_x2, H, L, d_contribs_part );
  somsom_contribs<<<div,1>>>( d_contribs_part, bpc, d_contribs_cols ) ;
  calcul_contribs_permutations<<<div,div, CFI(div)*sizeof(double2)>>>(d_contribs_cols, bpc, H, L, (uint64*) *d_stats_snake, d_miniblocs) ;
  cudaMemcpy( h_miniblocs, d_miniblocs, div*sizeof(double2), cudaMemcpyDeviceToHost ) ;

  //find the minimum over the blocks
  double crit_mini = h_miniblocs[0].x;
  int j1, id_mini = 0 ;
  for (j1=1 ; j1 < div ; j1++){
    if (h_miniblocs[j1].x < crit_mini) {
      crit_mini = h_miniblocs[j1].x ;
      id_mini = j1 ;
    }
  }
  toc(chrono, "\nRECTANGLE COMPUTATION");
  j1 = pas*id_mini ;
  int j2 = (int)(pas*h_miniblocs[ id_mini ].y) ;
  printf("pas = %d cols -- min criterion =%f , positions j1=%d j2=%d\n", pas, h_miniblocs[ id_mini ].x, j1, j2);
  // GPU -> CPU data transfer
  cudaMemcpy( h_contribs_cols, d_contribs_cols, div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;

  //check the column contribs
  for (int c=0 ; c < div ; c++){
    // compute reference values on CPU
    h_contribs_cols_cpu[c].cx = 0 ;
    h_contribs_cols_cpu[c].cx2= 0 ;
    for (int ip=0; ip < H; ip++){
      h_contribs_cols_cpu[c].cx += img_x[ ip*L + c*pas] ;
      h_contribs_cols_cpu[c].cx2 += img_x2[ ip*L + c*pas] ;
    }
    //compare with GPU values
    if ( (h_contribs_cols_cpu[c].cx != h_contribs_cols[c].cx) || (h_contribs_cols_cpu[c].cx2 != h_contribs_cols[c].cx2) )
      printf("ERR column %d -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n",
             c*pas, h_contribs_cols_cpu[c].cx, h_contribs_cols_cpu[c].cx2, h_contribs_cols[c].cx, h_contribs_cols[c].cx2 );
  }

  cudaFree(d_contribs_part) ;
  cudaFree(d_contribs_cols) ;
  cudaFree(d_miniblocs) ;
  delete [] h_contribs_cols ;
  delete [] h_contribs_cols_cpu ;
  //re-allocate for the horizontal lines
  printf("DIV = %d\n", div ) ;
  int divpow2 = nextPow2(div) ;
  printf("DIVPOW2 = %d\n", divpow2) ;
  grid = dim3(div, 1, 1) ;
  smem = CFI(bs)*sizeof(tcontribs) ;
  cudaMalloc((void**) &d_contribs_part, div*sizeof(tcontribs)) ;
  cudaMalloc((void**) &d_contribs_cols, div*div*sizeof(tcontribs)) ;
  cudaMalloc((void**) &d_miniblocs, div*sizeof(double2)) ;
  h_contribs_cols = new tcontribs[div*div] ;
  tcontribs * h_contribs_part = new tcontribs[div] ;
  h_miniblocs = new double2[div] ;
  // kernel calls for the horizontal-line optimization
  calcul_contrib_conjuguee_colonnes<<<grid, bs, CFI(bs)*sizeof(tcontribs) >>>( *d_img_x, *d_img_x2, H, L, j1, j2, d_contribs_part) ;

  //check the conjugate contribs of the blocks
  int cpterr = 0, cpt = 0 ;
  tcontribs * h_contribs_part_cpu = new tcontribs[div] ;
  cudaMemcpy( h_contribs_part, d_contribs_part, div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;
  for (int bloc=0; bloc < div; bloc++){
    h_contribs_part_cpu[ bloc ].cx = 0 ;
    h_contribs_part_cpu[ bloc ].cx2 = 0 ;
    for (int line=0; ((line < bs)&&(bloc*bs+line < H)); line++){
      h_contribs_part_cpu[bloc].cx += img_x[ (bloc*bs+line)*L + j2] - img_x[ (bloc*bs+line)*L + j1 ];
      h_contribs_part_cpu[bloc].cx2 += img_x2[ (bloc*bs+line)*L + j2] - img_x2[ (bloc*bs+line)*L + j1 ];
    }
    if ( ( h_contribs_part_cpu[bloc].cx != h_contribs_part[bloc].cx ) || ( h_contribs_part_cpu[bloc].cx2 != h_contribs_part[bloc].cx2 ) )
      {
        cpterr++ ;
        printf("ERROR block %d -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n", bloc,
               h_contribs_part_cpu[bloc].cx, h_contribs_part_cpu[bloc].cx2, h_contribs_part[bloc].cx, h_contribs_part[bloc].cx2 ) ;
      }
    cpt++ ;
  }
  printf("CHECK CONJUGATE BLOCK CONTRIBS --> %d ERRORS / %d BLOCKS\n", cpterr, cpt) ;
  grid = dim3(div, div, 1) ;
  calcul_contribs_snake_rectangle<<<grid,divpow2, CFI(divpow2)*sizeof(tcontribs) >>>(d_contribs_part, d_contribs_cols) ;

  //check the line combinations
  h_contribs_cols_cpu = new tcontribs[div*div] ;
  cudaMemcpy( h_contribs_cols, d_contribs_cols, div*div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;
  cpterr = 0 ; cpt = 0 ;
  for (int i1=0; i1 < div ; i1++){
    for (int i2=0 ; i2 < div ; i2++){
      if ( i2 >= i1 )
        {
          h_contribs_cols_cpu[ i1*div +i2 ].cx = 0 ;
          h_contribs_cols_cpu[ i1*div +i2 ].cx2= 0 ;
          for (int d=i1 ; d <= i2 ; d++){
            h_contribs_cols_cpu[ i1*div +i2 ].cx += h_contribs_part_cpu[ d ].cx ;
            h_contribs_cols_cpu[ i1*div +i2 ].cx2+= h_contribs_part_cpu[ d ].cx2 ;
          }
        }
      else
        {
          h_contribs_cols_cpu[ i1*div +i2 ].cx = 0 ;
          h_contribs_cols_cpu[ i1*div +i2 ].cx2= 0 ;
        }
      if ( ( h_contribs_cols_cpu[ i1*div +i2 ].cx != h_contribs_cols[ i1*div +i2 ].cx ) || ( h_contribs_cols_cpu[ i1*div +i2].cx2 != h_contribs_cols[ i1*div +i2].cx2 ) )
        {
          cpterr++ ;
          printf("ERROR combination (%d, %d) -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n", i1, i2,
                 h_contribs_cols_cpu[ i1*div +i2].cx, h_contribs_cols_cpu[ i1*div +i2 ].cx2, h_contribs_cols[ i1*div +i2 ].cx, h_contribs_cols[ i1*div +i2 ].cx2 ) ;
        }
      cpt++ ;
    }
  }
  printf("CHECK LINE COMBINATIONS --> %d ERRORS / %d COMBINATIONS\n", cpterr, cpt) ;
  grid = dim3(div, 1, 1) ;
  calcul_critere_permutations_verticales<<< grid, divpow2, CFI(divpow2)*sizeof(double2) >>>(d_contribs_cols, bs, j1, j2, H, L, sigX, sigX2, d_miniblocs) ;

  //check the per-block minima
  double2 * h_miniblocs_cpu = new double2[ div ] ;
  cudaMemcpy( h_miniblocs, d_miniblocs, div*sizeof(double2), cudaMemcpyDeviceToHost) ;
  cudaMemcpy( h_stats_snake, *d_stats_snake, 6*sizeof(int64), cudaMemcpyDeviceToHost) ;
  cpterr = 0 ; cpt = 0 ;
  for (int lb=0 ; lb < div ; lb++){
    h_miniblocs_cpu[lb].x = DBL_MAX ;
    for (int lh=lb ; lh < div ; lh++){
      double crit = critere_gauss_cpu(H*L, (lh-lb+1)*bs*(j2 - j1), h_contribs_cols_cpu[ lb*div +lh ].cx, h_contribs_cols_cpu[ lb*div + lh ].cx2, h_stats_snake ) ;
      if ( crit < h_miniblocs_cpu[lb].x )
        {
          h_miniblocs_cpu[lb].x = crit ;
          h_miniblocs_cpu[lb].y = (double)lh ;
        }
    }
    if ( ( h_miniblocs_cpu[lb].x > 1.000001*h_miniblocs[lb].x ) || ( h_miniblocs_cpu[lb].x < 0.999999*h_miniblocs[lb].x ) || ( h_miniblocs_cpu[lb].y != h_miniblocs[lb].y ) )
      {
        cpterr++ ;
        printf("MINIMUM ERROR, LINE BLOCK %d -> CPU=%lf at i2=%d | GPU=%lf at i2=%d\n", lb,
               h_miniblocs_cpu[ lb ].x, (int)h_miniblocs_cpu[ lb ].y, h_miniblocs[ lb ].x, (int)h_miniblocs[ lb ].y ) ;
      }
    cpt++ ;
  }
  printf("CHECK PER-BLOCK MINIMA --> %d ERRORS / %d BLOCKS\n", cpterr, cpt) ;
  /*
   * determination of the absolute minimum,
   * kept on the CPU side
   */
  cudaMemcpy( h_miniblocs, d_miniblocs, div*sizeof(double2), cudaMemcpyDeviceToHost) ;
  crit_mini = h_miniblocs[0].x ;
  int i1 ;
  id_mini = 0 ;
  for (i1=1 ; i1 < div ; i1++){
    if (h_miniblocs[i1].x < crit_mini) {
      crit_mini = h_miniblocs[i1].x ;
      id_mini = i1 ;
    }
  }
  i1 = bs*id_mini ;
  int i2 = (int)(bs*h_miniblocs[ id_mini ].y) ;
  toc(chrono, "RECTANGLE COMPUTATION");
  printf("pas = %d lines -- min criterion =%f , positions i1=%d i2=%d\n", bs, h_miniblocs[ id_mini ].x, i1, i2);
  /* end of the optimal initial rectangular snake search */
  cudaFree(d_contribs_part) ;
  cudaFree(d_contribs_cols) ;
  cudaFree(d_miniblocs) ;

  //genere_snake_rectangle_4nodes_gpu<<< 1, 1>>>(*d_snake, 140, H, L) ;
  //genere_snake_bande_gpu<<<1,1>>>(*d_snake, pas*id_mini, (int)(pas*h_miniblocs[ id_mini ].y), H);
  genere_snake_rectangle<<<1,1>>>(*d_snake, i1, i2, j1, j2);
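  /* genere_snake_rectangle presumably builds the initial snake as the
     rectangle whose opposite corners are (i1,j1) and (i2,j2), i.e. the
     window that minimized the Gaussian criterion above; the two commented
     variants generate a fixed-margin rectangle or a vertical band instead. */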
  int nnodes = nb_nodes ;
  snake_node_gpu * h_snake = new snake_node_gpu[nnodes];
  snake_node * h_snake_ll = new snake_node[nnodes] ;
  uint4 * h_liste_positions = new uint4[nnodes*8];
  double * h_vrais_snake = new double ;

  //initialize the snake stats
  uint2 * d_liste_temp ;
  t_sum_x2 * d_sompart ;
  int tpb, bps, npixmax ;
  int dist = 140 ;   // node spacing margin; value assumed from the commented
                     // genere_snake_rectangle_4nodes_gpu call above

  //compute the number of threads per block
  npixmax = 2*(H+L-4*dist)/(nnodes-1) ;
  tpb = nextPow2(npixmax) ;
  if (tpb >= 256) tpb = 256 ;   // /!\ the <<< calcul_contrib...>>> kernel cannot run with bs>256 because of the shared memory it needs
  if (tpb < 32 ) tpb = 32 ;
  bps = (npixmax+tpb-1)/tpb ;
  printf("INIT EXEC PARAMS : %d pix max, %d threads/block, %d blocks/seg, %d blocks/grid\n", npixmax, tpb, bps, nnodes*bps);
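  /* tpb is the smallest power of two >= npixmax, clamped to [32, 256]:
     e.g. npixmax = 100 -> tpb = nextPow2(100) = 128 threads and bps = 1
     block per segment; npixmax = 1000 -> tpb capped at 256 and bps = 4. */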
  cudaMalloc((void**) &d_liste_temp, nnodes*bps*tpb*sizeof(uint2));
  cudaMalloc((void**) &d_sompart, 3*nnodes*bps*sizeof(t_sum_x2));
  cudaMalloc((void**) &d_stats_ref, 3*nnodes*sizeof(int64));
  //DEBUG: force-clear the intermediate d_stats_ref array
  int64 h_stats_ref[3*nnodes] ;
  for (int a=0; a<3*nnodes ; a++) h_stats_ref[a] = 0 ;
  cudaMemcpy( d_stats_ref, h_stats_ref, 3*nnodes*sizeof(int64), cudaMemcpyHostToDevice) ;

  //DEBUG: force-clear the intermediate d_sompart array
  t_sum_x2 h_sompart[ 3*nnodes*bps ] ;
  for (int a=0; a<3*nnodes*bps ; a++) h_sompart[a] = 0 ;
  cudaMemcpy( d_sompart, h_sompart, 3*nnodes*bps*sizeof(t_sum_x2), cudaMemcpyHostToDevice) ;
  calcul_contribs_segments_snake<<< nnodes*bps, tpb, (CFI(tpb))*(3*sizeof(t_sum_x2))>>>
    (*d_snake, nnodes, *d_img_x, *d_img_x2, H,   /* leading arguments reconstructed: snake, cumul images, dims */
     L, d_liste_temp, d_sompart, *d_freemanDiDj );

  //TODO: parameterize so this is not called when tpb == 1;
  //that requires modifying the <<< calcul_contrib...>>> kernel to write
  //directly into d_snake instead of d_sompart
  somsom_snake<<< nnodes , 1 >>>(d_sompart, nnodes, bps, *d_snake);

  calcul_stats_snake<<< 1 , 1 >>>(*d_snake, nnodes, *d_stats_snake, *d_vrais_snake,
                                  *d_img_x, *d_img_x2, *d_codeNoeud, L );   /* trailing arguments reconstructed */

  cudaThreadSynchronize() ;
  toc(chrono, "\tTime") ;
  /*
    check the initial snake stats
  */
  cudaMemcpy( h_vrais_snake, *d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost) ;
  cudaMemcpy( h_stats_snake, *d_stats_snake, 6*sizeof(int64), cudaMemcpyDeviceToHost) ;
  printf("SNAKE STATS log vrais=%lf : c1=%lu - cx=%lu - cx2=%lu - N=%lu - SUMX=%lu - SUMX2=%lu\n",
         *h_vrais_snake,
         h_stats_snake[0], h_stats_snake[1], h_stats_snake[2],
         h_stats_snake[3], h_stats_snake[4], h_stats_snake[5] );
  /*
    check the stats decreased by the contribs of the 2 segments attached to each node
  */
#ifdef DEBUG_STATS_REF
  cudaMemcpy( h_stats_ref, d_stats_ref, 3*nnodes*sizeof(int64), cudaMemcpyDeviceToHost) ;
  cudaMemcpy( h_snake, *d_snake, nnodes*sizeof(snake_node_gpu), cudaMemcpyDeviceToHost) ;

  printf("******* DECREASED STATS\n");
  for(int n=0; n<nnodes;n++)
    {
      int i = h_snake[n].posi, j = h_snake[n].posj ;
      printf("node %d (%d,%d) : %ld - %ld - %ld - img1= %lu - imgx= %lu - imgx2= %lu \n", n, i, j,
             h_stats_ref[3*n], h_stats_ref[3*n +1], h_stats_ref[3*n +2],
             (uint64)(j+1), (uint64)img_x[i*L+j], (uint64)img_x2[i*L+j]);
             // img_1[i][j] of the CPU reference is the cumul of ones, i.e. j+1 (assumed)
    }
#endif //DEBUG_STATS_REF
  //snake2gpu(d_snake, snake, nb_nodes);
  //gpu2snake(*d_snake, &h_snake_ll, nnodes);

#ifdef DEBUG_POSITIONS
  // h_liste_positions is assumed to have been filled from *d_positions beforehand
  for (int n=0; n<nnodes; n++)
    {
      printf("Node %d :\n", n);
      for (int pos=0; pos<8; pos++)
        {
          printf("(%d , %d):%d:%d | ", h_liste_positions[8*n + pos].x, h_liste_positions[8*n + pos].y,
                 h_liste_positions[8*n + pos].z, h_liste_positions[8*n + pos].w);
        }
    }
#endif //DEBUG_POSITIONS
  //check the pixel lists of even/odd nodes according to 'pairs'
#ifdef DEBUG_LISTES
  printf("NUMBER OF PIXELS FOR LIST = %d\n", *h_nb_pix_max) ;
  printf("bs = %d - grid = %d - nblocs_seg = %d - npix_max = %d - taille mem = %d\n",
         bs, grid.x, nblocs_seg, *h_nb_pix_max, taille_mem);
  cudaMemcpy( h_liste_pix, d_liste_pix, taille_mem*sizeof(uint2), cudaMemcpyDeviceToHost ) ;
  cudaMemcpy( h_snake, *d_snake, nnodes*sizeof(snake_node_gpu), cudaMemcpyDeviceToHost );
  uint32 * h_liste_pixels_segment = new uint32[2*(*h_nb_pix_max)] ;
  int idx_n, idx_nprec, idx_nsuiv ;

  printf("********* PIXEL LIST ***********\n");
  printf("bs = %d - grid = %d - nblocs_seg = %d - npix_max = %d - taille mem = %d\n",
         bs, grid.x, nblocs_seg, *h_nb_pix_max, taille_mem);
  for (int n=0; n<(nnodes/2 + (nnodes%2)*pairs); n++)
    {
      idx_n = 2*n + !pairs ;
      if (idx_n == 0) idx_nprec = nnodes - 1;
      else idx_nprec = idx_n - 1;
      if (idx_n == nnodes-1) idx_nsuiv = 0;
      else idx_nsuiv = idx_n + 1 ;
      //test the segments before the node
      for (int pos=0; pos < 8 ; pos++)
        {
          int nb_pix = calcul_liste_pixel_segment(h_snake[idx_nprec].posi,h_snake[idx_nprec].posj,
                                                  h_liste_positions[8*idx_n+pos].x, h_liste_positions[8*idx_n+pos].y,
                                                  h_liste_pixels_segment, 0);
          for (int pix=0; pix < nb_pix; pix++)
            {
              if ( (h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].x != h_liste_pixels_segment[2*pix] )
                   || ( h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].y != h_liste_pixels_segment[2*pix+1] ) )
                printf("error before n=%d pix %d/%d segment %d nodes[ %d-%d-%d ] , CPU (%d,%d) - GPU (%d, %d)\n", n, pix, nb_pix, pos,
                       idx_nprec, idx_n, idx_nsuiv,
                       h_liste_pixels_segment[2*pix], h_liste_pixels_segment[2*pix+1],
                       h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].x, h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].y);
            }
        }
      //test the segments after the node
      for (int pos=0; pos < 8 ; pos++)
        {
          int nb_pix = calcul_liste_pixel_segment(h_liste_positions[8*idx_n+pos].x, h_liste_positions[8*idx_n+pos].y,
                                                  h_snake[idx_nsuiv].posi,h_snake[idx_nsuiv].posj,
                                                  h_liste_pixels_segment, 0);
          for (int pix=0; pix < nb_pix; pix++)
            {
              if ( (h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].x != h_liste_pixels_segment[2*pix] )
                   || ( h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].y != h_liste_pixels_segment[2*pix+1] ) )
                printf("error after n=%d pix %d/%d segment %d nodes[ %d-%d-%d ] , CPU (%d,%d) - GPU (%d, %d)\n", n, pix, nb_pix, pos+8,
                       idx_nprec, idx_n, idx_nsuiv,
                       h_liste_pixels_segment[2*pix], h_liste_pixels_segment[2*pix+1],
                       h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].x, h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].y);
            }
        }
    }
#endif //DEBUG_LISTES
  /*
    Test of the partial sums 'somblocs' computed by the 'calcul_contribs_segments_blocs_full' kernel
  */
#ifdef DEBUG_SOMBLOCS
  printf("********* PARTIAL SUMS ***********\n");
  printf("bs = %d - grid = %d - intervals = %d - nblocs_seg = %d - pairs = %d \n", bs, grid.x, n_interval, nblocs_seg, pairs);
  for (int n=0; n< n_interval; n++)
    {
      idx_n = 2*n + !pairs ;
      if (idx_n == 0) idx_nprec = nnodes - 1 ;
      else idx_nprec = idx_n - 1 ;
      if (idx_n == nnodes-1) idx_nsuiv = 0 ;
      else idx_nsuiv = idx_n + 1 ;
      printf("******** node %d\n", idx_n) ;
      for(int s=0; s<8; s++)
        {
          int nb_pix = calcul_liste_pixel_segment(h_snake[idx_nprec].posi,h_snake[idx_nprec].posj,
                                                  h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
                                                  h_liste_pixels_segment, 0);
          for (int b=0; b<nblocs_seg; b++)
            {
              uint64 c1=0, cx=0, cx2=0 ;
              int i, j ;
              for (int p=0; p<bs; p++)
                {
                  if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
                    {
                      // /!\ remember to remove the same-line pixel test from the
                      // kernel, otherwise the per-column sum comparisons will be wrong
                      i = h_liste_pixels_segment[2*(b*bs + p)] ;
                      j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
                      // pixel contribution = cumulated sums up to column j
                      // (accumulation reconstructed; the cumul of ones at (i,j) is j+1)
                      c1 += 1 + j ;
                      cx += img_x[ i*L + j ] ;
                      cx2 += img_x2[ i*L + j ] ;
                    }
                }
              if ( ( c1 != h_sombloc[(16*n + s)*nblocs_seg + b ] ) || ( cx != h_sombloc[(16*n + s)*nblocs_seg + b + grid.x] )
                   || ( cx2 != h_sombloc[ (16*n + s)*nblocs_seg + b + 2*grid.x] ) )
                printf("seg %d - %d pix : block %d -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix, b,
                       c1, cx, cx2, h_sombloc[(16*n+s)*nblocs_seg + b], h_sombloc[(16*n+s)*nblocs_seg + b + grid.x],
                       h_sombloc[(16*n+s)*nblocs_seg + b + 2*grid.x]) ;
            }
        }
      for(int s=0; s<8; s++)
        {
          int nb_pix = calcul_liste_pixel_segment( h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
                                                   h_snake[idx_nsuiv].posi,h_snake[idx_nsuiv].posj,
                                                   h_liste_pixels_segment, 0);
          for (int b=0; b<nblocs_seg; b++)
            {
              uint64 c1=0, cx=0, cx2=0 ;
              int i, j ;
              for (int p=0; p<bs; p++)
                {
                  if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
                    {
                      // /!\ remember to remove the same-line pixel test from the
                      // kernel, otherwise the per-column sum comparisons will be wrong
                      i = h_liste_pixels_segment[2*(b*bs + p)] ;
                      j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
                      c1 += 1 + j ;
                      cx += img_x[ i*L + j ] ;
                      cx2 += img_x2[ i*L + j ] ;
                    }
                }
              if ( ( c1 != h_sombloc[(16*n + s + 8)*nblocs_seg + b ] ) || ( cx != h_sombloc[(16*n + s + 8)*nblocs_seg + b + grid.x] )
                   || ( cx2 != h_sombloc[ (16*n + s + 8)*nblocs_seg + b + 2*grid.x] ) )
                printf("seg %d - %d pix : block %d -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix, b,
                       c1, cx, cx2, h_sombloc[(16*n+s+8)*nblocs_seg + b], h_sombloc[(16*n+s+8)*nblocs_seg + b + grid.x],
                       h_sombloc[(16*n+s+8)*nblocs_seg + b + 2*grid.x]) ;
            }
        }
    }
#endif //DEBUG_SOMBLOCS
  /*
    Test of the total sums 'somsom' computed by the 'somsom_full' kernel
  */
#ifdef DEBUG_SOMSOM
  printf("********* TOTAL SUMS ***********\n");
  printf("bs = %d - grid = %d - intervals = %d - nblocs_seg = %d - pairs = %d \n", bs, grid.x, n_interval, nblocs_seg, pairs);
  for (int n=0; n< n_interval; n++)
    {
      idx_n = 2*n + !pairs ;
      if (idx_n == 0) idx_nprec = nnodes - 1 ;
      else idx_nprec = idx_n - 1 ;
      if (idx_n == nnodes-1) idx_nsuiv = 0 ;
      else idx_nsuiv = idx_n + 1 ;
      printf("******** node %d\n", idx_n) ;
      for(int s=0; s<8; s++)
        {
          int nb_pix = calcul_liste_pixel_segment(h_snake[idx_nprec].posi,h_snake[idx_nprec].posj,
                                                  h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
                                                  h_liste_pixels_segment, 0);
          uint64 c1=0, cx=0, cx2=0 ;
          int i, j ;
          for (int b=0; b<nblocs_seg; b++)
            {
              for (int p=0; p<bs; p++)
                {
                  if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
                    {
                      // /!\ remember to remove the same-line pixel test from the
                      // kernel, otherwise the per-column sum comparisons will be wrong
                      i = h_liste_pixels_segment[2*(b*bs + p)] ;
                      j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
                      c1 += 1 + j ;
                      cx += img_x[ i*L + j ] ;
                      cx2 += img_x2[ i*L + j ] ;
                    }
                }
            }
          if ( ( c1 != h_somsom[3*(16*n + s)] ) || ( cx != h_somsom[3*(16*n + s) + 1] )
               || ( cx2 != h_somsom[3*(16*n + s) + 2] ) )
            printf("seg %d - %d pix -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix,
                   c1, cx, cx2, h_somsom[3*(16*n + s)], h_somsom[3*(16*n + s) + 1],
                   h_somsom[3*(16*n + s) + 2]) ;
        }
      for(int s=0; s<8; s++)
        {
          int nb_pix = calcul_liste_pixel_segment( h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
                                                   h_snake[idx_nsuiv].posi,h_snake[idx_nsuiv].posj,
                                                   h_liste_pixels_segment, 0);
          uint64 c1=0, cx=0, cx2=0 ;
          int i, j ;
          for (int b=0; b<nblocs_seg; b++)
            {
              for (int p=0; p<bs; p++)
                {
                  if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
                    {
                      // /!\ remember to remove the same-line pixel test from the
                      // kernel, otherwise the per-column sum comparisons will be wrong
                      i = h_liste_pixels_segment[2*(b*bs + p)] ;
                      j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
                      c1 += 1 + j ;
                      cx += img_x[ i*L + j ] ;
                      cx2 += img_x2[ i*L + j ] ;
                    }
                }
            }
          if ( ( c1 != h_somsom[3*(16*n + s + 8)] ) || ( cx != h_somsom[3*(16*n + s + 8) + 1] )
               || ( cx2 != h_somsom[3*(16*n + s + 8) + 2] ) )
            printf("seg %d - %d pix -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix,
                   c1, cx, cx2, h_somsom[3*(16*n + s + 8)], h_somsom[3*(16*n + s + 8) + 1],
                   h_somsom[3*(16*n + s + 8) + 2]) ;
        }
    }
#endif //DEBUG_SOMSOM
  printf("**** STATS - REF : %lf \n", *h_vrais_snake);
  for(int n=0; n<n_interval; n++)
    {
      for(int p=0; p<8; p++)
        {
          printf("test %d of node %d : %lu - %lu - %lu - - log_vrais = %lf\n", p, (2*n + !pairs),
                 h_stats[3*(8*n+p)], h_stats[3*(8*n+p)+1], h_stats[3*(8*n+p)+2], h_vrais[8*n+p]);
        }
    }
  printf("**** CROSSINGS \n");
  for(int n=0; n<nnodes; n++)
    {
      printf("test of seg %d : ", n);
      if ( h_croist[n] ) printf("CROSSING\n"); else printf("\n");
    }

  printf("**** MOVES \n");
  for(int n=0; n<nnodes; n++)
    {
      printf("Node %d : (%s) ", n, (h_move[n])? "yes":"no");
    }
  delete [] h_liste_positions ;

  /*
   * end of GPU snake generation
   */