+ div = (H+bs-1)/bs ;               // number of thread blocks: ceil(H/bs)
+ printf("DIV = %d\n", div ) ;
+
+ int divpow2 = nextPow2(div) ;
+ printf("DIVPOW2 = %d\n", divpow2) ;
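+
+ /* Sketch only (an assumption, not necessarily the project's definition): nextPow2 is
+    taken to be the usual "round up to the next power of two" helper, presumably used to
+    size a later tree reduction over the div partial results.
+ static int nextPow2(int n) {
+     int p = 1 ;
+     while (p < n) p <<= 1 ;   // double p until it reaches or exceeds n
+     return p ;
+ }
+ */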
+
+ grid = dim3(div, 1, 1) ;
+ smem = CFI(bs)*sizeof(tcontribs) ;
+ cudaMalloc((void**) &d_contribs_part, div*sizeof(tcontribs)) ;
+ cudaMalloc((void**) &d_contribs_cols, div*div*sizeof(tcontribs)) ;
+ cudaMalloc((void**) &d_miniblocs, div*sizeof(double2)) ;
+ h_contribs_cols = new tcontribs[div*div] ;
+ tcontribs * h_contribs_part = new tcontribs[div] ;
+ h_miniblocs = new double2[div] ;
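+
+ /* Assumed layout of tcontribs (a sketch inferred from the cx / cx2 fields used in the
+    CPU check further down, not the project's actual definition; the exact integer type
+    may differ):
+ struct tcontribs {
+     unsigned long cx ;    // partial sum of (img_x[.,j2]  - img_x[.,j1])  over a block of rows
+     unsigned long cx2 ;   // partial sum of (img_x2[.,j2] - img_x2[.,j1]) over the same rows
+ } ;
+ */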
+
+ tic(&chrono, NULL);
+ // Kernel calls, optimized version for horizontal lines
+ calcul_contrib_conjuguee_colonnes<<<grid, bs, smem>>>( *d_img_x, *d_img_x2, H, L, j1, j2, d_contribs_part) ;
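+
+ /* Sketch of what the kernel is expected to compute (an assumption reconstructed from the
+    CPU check below, not the actual implementation): each thread block reduces, over its bs
+    rows, the column-wise differences at j2 and j1 into one tcontribs entry. CFI is assumed
+    to be a padding macro for bank-conflict-free shared-memory indexing, and bs a power of two.
+ __global__ void calcul_contrib_conjuguee_colonnes(unsigned long *img_x, unsigned long *img_x2,
+                                                   int H, int L, int j1, int j2,
+                                                   tcontribs *contribs_part)
+ {
+     extern __shared__ tcontribs scontrib[] ;
+     int row = blockIdx.x*blockDim.x + threadIdx.x ;
+     scontrib[CFI(threadIdx.x)].cx  = (row < H) ? img_x [row*L + j2] - img_x [row*L + j1] : 0 ;
+     scontrib[CFI(threadIdx.x)].cx2 = (row < H) ? img_x2[row*L + j2] - img_x2[row*L + j1] : 0 ;
+     __syncthreads() ;
+     for (int s = blockDim.x/2 ; s > 0 ; s >>= 1) {   // tree reduction in shared memory
+         if (threadIdx.x < s) {
+             scontrib[CFI(threadIdx.x)].cx  += scontrib[CFI(threadIdx.x + s)].cx  ;
+             scontrib[CFI(threadIdx.x)].cx2 += scontrib[CFI(threadIdx.x + s)].cx2 ;
+         }
+         __syncthreads() ;
+     }
+     if (threadIdx.x == 0) contribs_part[blockIdx.x] = scontrib[CFI(0)] ;
+ }
+ */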
+
+ /* CPU check of the per-block results (disabled)
+ int cpt = 0 ;
+ int cpterr = 0 ;
+ tcontribs * h_contribs_part_cpu = new tcontribs[div] ;
+ cudaMemcpy( h_contribs_part, d_contribs_part, div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;
+ for (int bloc=0; bloc < div; bloc++){
+ h_contribs_part_cpu[ bloc ].cx = 0 ;
+ h_contribs_part_cpu[ bloc ].cx2 = 0 ;
+ for (int line=0; ((line < bs)&&(bloc*bs+line < H)); line++){
+ h_contribs_part_cpu[bloc].cx += img_x[ (bloc*bs+line)*L + j2] - img_x[ (bloc*bs+line)*L + j1 ];
+ h_contribs_part_cpu[bloc].cx2 += img_x2[ (bloc*bs+line)*L + j2] - img_x2[ (bloc*bs+line)*L + j1 ];
+ }
+ if ( ( h_contribs_part_cpu[bloc].cx != h_contribs_part[bloc].cx ) || ( h_contribs_part_cpu[bloc].cx2 != h_contribs_part[bloc].cx2 ) )
+ {
+ printf("ERROR block %d -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n", bloc,
+ h_contribs_part_cpu[bloc].cx, h_contribs_part_cpu[bloc].cx2, h_contribs_part[bloc].cx, h_contribs_part[bloc].cx2 ) ;
+ cpterr++;
+ }
+ cpt++ ;
+ }
+ printf("PER-BLOCK CONJUGATE CONTRIBUTION CHECK --> %d ERRORS / %d BLOCKS\n", cpterr, cpt) ;
+ delete [] h_contribs_part_cpu ;
+ end of CPU check */