AND Private Git Repository - snake_gpu.git/blobdiff - src/snake2D_gpu.cu
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
test tex
[snake_gpu.git] / src / snake2D_gpu.cu
index d5a42bb88c6a2a7a473938652d2fdc423767f574..1ba2edaa300fb15c778945a9e54143eaa7e220aa 100644 (file)
@@ -12,6 +12,7 @@
 #include <stdio.h>
 #include <malloc.h>
 #include "structures.h"
+#include "cutil_inline.h"
 extern "C"{
 #include "lib_alloc.h"
 #include "lib_images.h"
@@ -158,7 +159,7 @@ int main(int argc, char **argv)
         
       if (VERBOSE)
                {
-                 cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost);
+                 cutilSafeCall( cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost) );
                  printf("\n#%d : pas %d pixels, LV = %lf \n", iter, Pas, h_vrais_snake) ;
                  tic(&chrono, NULL) ;
                }
@@ -177,7 +178,7 @@ int main(int argc, char **argv)
                liste_positions_a_tester<<<nnodes, 8>>>(d_snake, d_positions, d_nb_pix_max, Pas, nnodes, I_dim, J_dim) ;
                
                // recupere la taille maxi des segments
-               cudaMemcpy( &h_nb_pix_max, d_nb_pix_max, sizeof(uint32), cudaMemcpyDeviceToHost) ;
+               cutilSafeCallNoSync( cudaMemcpy( &h_nb_pix_max, d_nb_pix_max, sizeof(uint32), cudaMemcpyDeviceToHost) ) ;
                
                // determination des parametres des kernels
                bs = nextPow2(h_nb_pix_max) ;
@@ -186,16 +187,16 @@ int main(int argc, char **argv)
                nblocs_seg = (h_nb_pix_max+bs-1)/bs ;
 
                pairs = false ;
-               n_interval = nnodes/2 + pairs*(nnodes%2) ;
+               n_interval = nnodes ;
                taille_smem =  CFI(bs)*sizeof(tcontribs) ;
                threads = dim3(bs,1,1) ;
                grid = dim3( n_interval*16*nblocs_seg ,1,1) ; 
                
                  //calcul listes pix + contrib partielles + freemans + centres  
                  calcul_contribs_segments_blocs_full<<< grid , threads, taille_smem >>>( d_snake, nnodes, d_positions, h_nb_pix_max,
-                                                                                         d_img_x, d_img_x2, d_codes_segments,
-                                                                                         J_dim, d_listes_pixels, d_contribs_segments_blocs,
-                                                                                                                                                                 pairs);
+                                                                                                                                                                         d_img_x, d_img_x2, d_codes_segments,
+                                                                                                                                                                         J_dim, d_listes_pixels, d_contribs_segments_blocs,
+                                                                                                                                                                         pairs);
                  
           calcul_freemans_centre<<<n_interval, 16>>>( d_listes_pixels,  d_freemanDiDj, d_freemans_centres);
                  //printf("EXEC impairs : %d max pix - %d intervalles => %d blocs de %d threads - %d octets de smem\n", h_nb_pix_max, n_interval, grid.x, threads.x, taille_smem);
@@ -203,42 +204,38 @@ int main(int argc, char **argv)
                  somsom_full<<< 16*n_interval , 1>>>(d_contribs_segments_blocs, nnodes, nblocs_seg, d_contribs_segments) ;
                  
                  //calcul des stats associees a chaque position de test
-                 calcul_stats_full<<< n_interval, 8 >>>(d_snake, nnodes, pairs, d_stats_snake, d_stats_ref, d_stats, d_contribs_segments,
+                 calcul_stats_full<<< n_interval, 8 >>>(d_snake, d_snake_tmp, nnodes, pairs, d_stats_snake, d_stats_ref, d_stats, d_contribs_segments,
                                                                                                 d_positions, d_codes_segments,  d_freemans_centres, d_codeNoeud,
                                                                                                 d_img_x, d_img_x2, I_dim, J_dim, d_vrais, d_vrais_snake, d_move);
                
-                 pairs = true ;
-                 n_interval = nnodes/2 + pairs*(nnodes%2) ;
-                 grid = dim3( n_interval*16*nblocs_seg ,1,1) ; 
 
-                 //calcul listes pix + contrib partielles + freemans + centres
-                 calcul_contribs_segments_blocs_full<<< grid , threads, taille_smem >>>( d_snake, nnodes, d_positions, h_nb_pix_max,
-                                                                                         d_img_x, d_img_x2, d_codes_segments,
-                                                                                         J_dim, d_listes_pixels, d_contribs_segments_blocs,
-                                                                                         pairs);
-                 calcul_freemans_centre<<<n_interval, 16>>>( d_listes_pixels, d_freemanDiDj, d_freemans_centres);                                                                                      
-                 //printf("EXEC pairs : %d max pix - %d intervalles => %d blocs de %d threads - %d octets de smem\n", h_nb_pix_max, n_interval, grid.x, threads.x, taille_smem);
-               //sommes des contribs partielles -> contribs segments
-                 somsom_full<<< 16*n_interval , 1>>>(d_contribs_segments_blocs, nnodes, nblocs_seg, d_contribs_segments) ;
+                 //parametres d'execution des kernels pour le recalcul des contribs et stats du snake
+                 npixmax = h_nb_pix_max ;
+                 tpb = nextPow2(npixmax) ;
+                 if (tpb >= BSMAX) tpb = BSMAX ;//  /!\ le kernel <<< calcul_contrib...>>> ne supporte pas un bs>BSMAX a cause de la shared-mem nécessaire
+                 if (tpb < 32 ) tpb = 32 ;
+                 bps = (npixmax+tpb-1)/tpb ;
+                 //calcul sommes partielles des contribs + codes segments
+                 recalcul_contribs_segments_snake<<< nnodes*bps, tpb, CFI(tpb)*sizeof(tcontribs)>>>(d_snake_tmp, nnodes, 
+                                                                                                                                                                                        d_img_x, d_img_x2, 
+                                                                                                                                                                                        J_dim, d_liste_temp, d_sompart );
+                 //calcul des freemans et des centres a partir des 5 points stockes par segment dans 'd_liste_temp'
+                 recalcul_freemans_centre<<<nnodes, 1>>>(d_snake_tmp, d_liste_temp, d_freemanDiDj);
+                 //somme des sommes partielles
+                 resomsom_snake<<< nnodes , 1 >>>(d_sompart, nnodes, bps, d_snake_tmp);
+                 //calcul des stats 
+                 recalcul_stats_snake<<< 1 , 1 >>>(d_snake_tmp, nnodes, d_stats_snake, d_vrais_snake,
+                                                                                       d_img_x, d_img_x2,
+                                                                                       d_codeNoeud, J_dim
+                                                                                       );
+                 copie_snake<<< nnodes, 1 >>>(d_snake_tmp, d_snake) ;
                  
-                 //calcul des stats associees a chaque position de test
-                 calcul_stats_full<<< n_interval, 8 >>>(d_snake, nnodes, pairs, d_stats_snake, d_stats_ref, d_stats, d_contribs_segments,
-                                                                                                d_positions, d_codes_segments,  d_freemans_centres, d_codeNoeud,
-                                                                                                d_img_x, d_img_x2, I_dim, J_dim, d_vrais, d_vrais_snake, d_move);
-               
-                 
-               //il faut recalculer les stats du snake apres modif
-               recalcul_stats_snake<<< 1 , 1  >>>(d_snake, nnodes, d_stats_snake, d_vrais_snake,
-                                                                                  d_img_x, d_img_x2,
-                                                                                  d_codeNoeud, J_dim
-                                                                                  );
-               
-               cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost);
-               //printf("iter %d apres recalcul du move LV = %lf - ",  iter, h_vrais_snake) ;
+                 cutilSafeCallNoSync( cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost) );
+               printf("iter %d apres recalcul du move LV = %lf - \n",  iter, h_vrais_snake) ;
                
                nb_move = 0;
                //recup move
-               cudaMemcpy( h_move, d_move, nnodes*sizeof(bool), cudaMemcpyDeviceToHost);
+               cutilSafeCallNoSync( cudaMemcpy( h_move, d_move, nnodes*sizeof(bool), cudaMemcpyDeviceToHost) );
                i = 0;
                while (i<nnodes)
                  {
@@ -314,7 +311,7 @@ int main(int argc, char **argv)
        delete Liste_pixel_segment ; 
        delete h_snake_ll;
        */
-       cudaMemcpy( h_snake, d_snake, nnodes*sizeof(snake_node_gpu), cudaMemcpyDeviceToHost);
+       cudaMemcpy( h_snake, d_snake_tmp, nnodes*sizeof(snake_node_gpu), cudaMemcpyDeviceToHost);
        //affiche coordonnees
        for (int node=0; node<nnodes; node++){
            printf("NODE %d  %d  %d \n", node, h_snake[node].posi, h_snake[node].posj);
@@ -329,6 +326,7 @@ int main(int argc, char **argv)
   cudaFree(d_freemanDiDj);
   cudaFree(d_codeNoeud);
   cudaFree(d_snake);
+  cudaFree(d_snake_tmp);
   cudaFree(d_nb_pix_max); 
   cudaFree(d_positions);
   cudaFree(d_contribs_segments);