+ tic(&chrono, NULL);
+ // Kernel calls for the optimized horizontal-line version
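+ // Step 1: per-block partial sums. Judging by the CPU check below, each thread block
+ // accumulates img_x[row*L + j2] - img_x[row*L + j1] (and likewise for img_x2) over its
+ // bs rows into d_contribs_part[block], presumably via the shared buffer of CFI(bs) tcontribs.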
+ calcul_contrib_conjuguee_colonnes<<<grid, bs, CFI(bs)*sizeof(tcontribs) >>>( *d_img_x, *d_img_x2, H, L, j1, j2, d_contribs_part) ;
+
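+ // Commented-out host-side check: recompute the per-block contributions on the CPU
+ // and compare them with the values copied back from d_contribs_part.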
+ /* CPU verification
+ int cpt = 0 ;
+ int cpterr = 0 ;
+ tcontribs * h_contribs_part_cpu = new tcontribs[div] ;
+ cudaMemcpy( h_contribs_part, d_contribs_part, div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;
+ for (int bloc=0; bloc < div; bloc++){
+ h_contribs_part_cpu[ bloc ].cx = 0 ;
+ h_contribs_part_cpu[ bloc ].cx2 = 0 ;
+ for (int line=0; ((line < bs)&&(bloc*bs+line < H)); line++){
+ h_contribs_part_cpu[bloc].cx += img_x[ (bloc*bs+line)*L + j2] - img_x[ (bloc*bs+line)*L + j1 ];
+ h_contribs_part_cpu[bloc].cx2 += img_x2[ (bloc*bs+line)*L + j2] - img_x2[ (bloc*bs+line)*L + j1 ];
+ }
+ if ( ( h_contribs_part_cpu[bloc].cx != h_contribs_part[bloc].cx ) || ( h_contribs_part_cpu[bloc].cx2 != h_contribs_part[bloc].cx2 ) )
+ {
+ printf("ERROR block %d -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n", bloc,
+ h_contribs_part_cpu[bloc].cx, h_contribs_part_cpu[bloc].cx2, h_contribs_part[bloc].cx, h_contribs_part[bloc].cx2 ) ;
+ cpterr++;
+ }
+ cpt++ ;
+ }
+ printf("BLOCK CONJUGATE CONTRIBUTIONS CHECK --> %d ERRORS / %d BLOCKS\n", cpterr, cpt) ;
+ end of verification */
+
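+ // Step 2: combine the partial sums. With a div x div grid, the kernel presumably assigns
+ // one thread block to each pair (i1, i2) and, for i2 >= i1, reduces d_contribs_part[i1..i2]
+ // into d_contribs_cols[i1*div + i2]; the CPU check below rebuilds exactly these sums.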
+ grid = dim3(div, div, 1) ;
+ calcul_contribs_snake_rectangle<<<grid,divpow2, CFI(divpow2)*sizeof(tcontribs) >>>(d_contribs_part, d_contribs_cols) ;
+
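+ // Commented-out host-side check: rebuild every (i1, i2) combination from the CPU partial
+ // sums and compare with the kernel output copied back from d_contribs_cols.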
+ /* CPU verification
+ tcontribs * h_contribs_cols_cpu = new tcontribs[div*div] ;
+ cudaMemcpy( h_contribs_cols, d_contribs_cols, div*div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;
+ cpt = 0 ;
+ cpterr = 0 ;
+ for (int i1=0; i1 < div ; i1++){
+ for (int i2=0 ; i2 < div ; i2++){
+ if (i2 >= i1){
+ h_contribs_cols_cpu[ i1*div +i2 ].cx = 0 ;
+ h_contribs_cols_cpu[ i1*div +i2 ].cx2= 0 ;
+ for (int d=i1 ; d <= i2 ; d++){
+ h_contribs_cols_cpu[ i1*div +i2 ].cx += h_contribs_part_cpu[ d ].cx ;
+ h_contribs_cols_cpu[ i1*div +i2 ].cx2+= h_contribs_part_cpu[ d ].cx2 ;
+ }
+ } else {
+ h_contribs_cols_cpu[ i1*div +i2 ].cx = 0 ;
+ h_contribs_cols_cpu[ i1*div +i2 ].cx2= 0 ;
+ }
+
+ if (( ( h_contribs_cols_cpu[ i1*div +i2 ].cx != h_contribs_cols[ i1*div +i2 ].cx ) || ( h_contribs_cols_cpu[ i1*div +i2].cx2 != h_contribs_cols[ i1*div +i2].cx2 ) )
+ && (i2 >= i1))
+ {
+ printf("ERROR combination (%d, %d) -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n", i1, i2,
+ h_contribs_cols_cpu[ i1*div +i2].cx, h_contribs_cols_cpu[ i1*div +i2 ].cx2, h_contribs_cols[ i1*div +i2 ].cx, h_contribs_cols[ i1*div +i2 ].cx2 ) ;
+ cpterr++;
+ }
+ cpt++ ;
+ }
+ }
+ printf("ROW COMBINATIONS CHECK --> %d ERRORS / %d COMBINATIONS\n", cpterr, cpt) ;
+ end of verification */
+
+
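+ // Step 3: evaluate the criterion. For each lower row block lb, the kernel presumably
+ // evaluates the Gaussian criterion over every upper block lh >= lb and keeps the minimum
+ // in d_miniblocs[lb].x with the corresponding lh in d_miniblocs[lb].y, as mirrored by
+ // the CPU check below.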
+ grid = dim3(div, 1, 1) ;
+ calcul_critere_permutations_verticales<<< grid, divpow2, CFI(divpow2)*sizeof(double2) >>>(d_contribs_cols, bs, j1, j2, H, L, sigX, sigX2, d_miniblocs) ;
+
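+ // Commented-out host-side check: redo the minimisation with critere_gauss_cpu() and
+ // compare against d_miniblocs, allowing a small relative tolerance on the criterion value.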
+ /* CPU verification
+ cpt = 0 ;
+ cpterr = 0 ;
+ double2 * h_miniblocs_cpu = new double2[ div ] ;
+ cudaMemcpy( h_miniblocs, d_miniblocs, div*sizeof(double2), cudaMemcpyDeviceToHost) ;
+ cudaMemcpy( h_stats_snake, *d_stats_snake, 6*sizeof(int64), cudaMemcpyDeviceToHost) ;
+ for (int lb=0 ; lb < div ; lb++){
+ h_miniblocs_cpu[lb].x = DBL_MAX ;
+ for (int lh=lb ; lh < div ; lh++){
+ double crit = critere_gauss_cpu( H*L, (lh-lb+1)*bs*(j2 - j1), h_contribs_cols_cpu[ lb*div +lh ].cx, h_contribs_cols_cpu[ lb*div + lh ].cx2, h_stats_snake ) ;
+ if ( crit < h_miniblocs_cpu[lb].x )
+ {
+ h_miniblocs_cpu[lb].x = crit ;
+ h_miniblocs_cpu[lb].y = (double)lh ;
+ }
+ }
+ if ( ( h_miniblocs_cpu[lb].x > 1.000001*h_miniblocs[lb].x ) || ( h_miniblocs_cpu[lb].x < 0.999999*h_miniblocs[lb].x ) || ( h_miniblocs_cpu[lb].y != h_miniblocs[lb].y ) )
+ {
+ printf("ERROR MINIMUM ROW BLOCK %d -> CPU=%lf at i2=%d | GPU=%lf at i2=%d\n", lb,
+ h_miniblocs_cpu[ lb ].x, (int)h_miniblocs_cpu[ lb ].y, h_miniblocs[ lb ].x, (int)h_miniblocs[ lb ].y ) ;
+ cpterr++;
+ }
+ cpt++ ;