Fin de test multisnake sur iter 1

[snake_gpu.git] / src / lib_gpu.cu
diff --git a/src/lib_gpu.cu b/src/lib_gpu.cu

index d6df5ec11acac0a9142c5057996e1bf170bf4b27..5a90c1ccf8169373919d07eeeedc449488f8dcdd 100644 (file)
--- a/src/lib_gpu.cu
+++ b/src/lib_gpu.cu
@@ -153,9 +153,9 @@ void cuda_init_img_cumul(unsigned short ** img_in, int H, int L, int nb_nodes,
    calcul_stats_image<<<1, 1>>>( *d_img_x, *d_img_x2, H, L, (uint64*)*d_stats_snake);
    
    
    calcul_stats_image<<<1, 1>>>( *d_img_x, *d_img_x2, H, L, (uint64*)*d_stats_snake);
    
    
-         cudaThreadSynchronize()   ;
-         toc(chrono, "\tTemps GPU");
-        if(DEBUG_IMG_CUMUL)
+  cudaThreadSynchronize()   ;
+  toc(chrono, "\tTemps GPU");
+  if(DEBUG_IMG_CUMUL)
         { 
           
           //allocation memoire CPU
         { 
           
           //allocation memoire CPU
@@ -231,10 +231,53 @@ void cuda_init_img_cumul(unsigned short ** img_in, int H, int L, int nb_nodes,
     * generation snake en mem GPU
     */
    int dist = 140 ;
     * generation snake en mem GPU
     */
    int dist = 140 ;
+
+  /* Test de détermination du snake rectangle initial optimal */
+  int div = 100;//nb de divisions de l'image : cela définit le pas. La valeur max découle du nb max de threads possible ds une grille
+  int Nperm = div*div*bs;//nb total de rectangles a tester. La distribution est ainsi irrégulière, mais plus simple.
+  double best_crit ;
+  int ind_best_crit ;
+  
+  t_rectangle_snake * d_all_crit, d_best_crit;//tableaux pour les résultats des différents rectangles / le meilleur
+  t_rectangle_snake * h_all_crit = new t_rectangle_snake[Nperm];//correspondant CPU
+
+  //allocations
+  cudaMalloc((void**) &d_all_crit, Nperm*sizeof(t_rectangle_snake));
+  cudaMalloc((void**) &d_best_crit, sizeof(t_rectangle_snake));
    
    tic(&chrono, NULL);
    
    tic(&chrono, NULL);
-  if (nb_nodes == 4)  genere_snake_rectangle_4nodes_gpu<<< 1, 1>>>(*d_snake, 140, H, L) ;
-  else if (nb_nodes == 40) genere_snake_rectangle_Nnodes_gpu<<< 1, 1>>>(*d_snake, (H+L)/20, H, L) ;
+
+  //exécution du kernel
+  dim3 grid = dim3(H/div, L/div, 1); 
+  calcul_contribs_snake4<<<grid, bs, CFI(bs)*sizeof(tcontribs) >>>(*d_snake, *d_img_x, *d_img_x2, H, L, *d_stats_snake, d_all_crit) ;
+  cudaThreadSynchronize();
+  toc(chrono, "\nCALCULS RECTANGLES");
+
+  //récupération des données des rectangles (GPU -> CPU)
+  int ret;
+  ret = cudaMemcpy( h_all_crit, d_all_crit, Nperm*sizeof(t_rectangle_snake), cudaMemcpyDeviceToHost) ;
+  printf("COPIE DATA = %s\n",(ret==0)?"OK":"ERR");
+  
+  //optimum sur CPU
+  best_crit = h_all_crit[0].crit ;
+  ind_best_crit = 0 ;
+  for (int k=1; k<100; k++){
+       if ((h_all_crit[k].crit > 0) && (h_all_crit[k].crit < best_crit)) {
+         best_crit = h_all_crit[k].crit ;
+         ind_best_crit = k ;
+       }
+       printf("%d -> ( %d, %d )--( %d, %d)  CRITERE = %f\n", k, h_all_crit[k].bpi, h_all_crit[k].bpj,
+                h_all_crit[k].opi, h_all_crit[k].opj,  h_all_crit[k].crit );
+  }
+
+  printf("BEST RECTANGLE/%d tests : %d -> ( %d, %d )--( %d, %d)  CRITERE = %f\n", Nperm, ind_best_crit, h_all_crit[ind_best_crit].bpi, h_all_crit[ind_best_crit].bpj,
+                h_all_crit[ind_best_crit].opi, h_all_crit[ind_best_crit].opj, best_crit );
+  
+  exit(0);
+  /*fin test snake rectangle initial optimal*/
+ 
+  //genere_snake_rectangle_4nodes_gpu<<< 1, 1>>>(*d_snake, 140, H, L) ;
+ 
  
    int nnodes = nb_nodes ;
    snake_node_gpu * h_snake = new snake_node_gpu[nnodes];
  
    int nnodes = nb_nodes ;
    snake_node_gpu * h_snake = new snake_node_gpu[nnodes];
@@ -261,15 +304,19 @@ void cuda_init_img_cumul(unsigned short ** img_in, int H, int L, int nb_nodes,
    cudaMalloc((void**) &d_stats_ref, 3*nnodes*sizeof(int64));
  
    //DEBUG : pour forcer la mise à zero du tableau intermediaire d_stats_ref
    cudaMalloc((void**) &d_stats_ref, 3*nnodes*sizeof(int64));
  
    //DEBUG : pour forcer la mise à zero du tableau intermediaire d_stats_ref
+  /*
    int64 h_stats_ref[3*nnodes] ;
    for (int a=0; a<3*nnodes ; a++) h_stats_ref[a] = 0 ;
    cudaMemcpy( h_stats_ref, d_stats_ref, sizeof(int64), cudaMemcpyHostToDevice) ;
    int64 h_stats_ref[3*nnodes] ;
    for (int a=0; a<3*nnodes ; a++) h_stats_ref[a] = 0 ;
    cudaMemcpy( h_stats_ref, d_stats_ref, sizeof(int64), cudaMemcpyHostToDevice) ;
+  */
    //fin forçage a 0
    
    //DEBUG : pour forcer la mise à zero du tableau intermediaire d_sompart
    //fin forçage a 0
    
    //DEBUG : pour forcer la mise à zero du tableau intermediaire d_sompart
+  /*
       t_sum_x2 h_sompart[ 3*nnodes*bps ] ;
       for (int a=0; a<3*nnodes*bps ; a++) h_sompart[a] = 0 ;
       cudaMemcpy( h_sompart, d_sompart, sizeof(t_sum_x2), cudaMemcpyHostToDevice) ;
       t_sum_x2 h_sompart[ 3*nnodes*bps ] ;
       for (int a=0; a<3*nnodes*bps ; a++) h_sompart[a] = 0 ;
       cudaMemcpy( h_sompart, d_sompart, sizeof(t_sum_x2), cudaMemcpyHostToDevice) ;
+  */
    //fin forçage a 0
    
    calcul_contribs_segments_snake<<< nnodes*bps, tpb, (CFI(tpb))*(3*sizeof(t_sum_x2))>>>
    //fin forçage a 0
    
    calcul_contribs_segments_snake<<< nnodes*bps, tpb, (CFI(tpb))*(3*sizeof(t_sum_x2))>>>