/** * \file snake2D.c * \brief snake polygonale approche region sous hypothese Gaussienne ou Gamma. * \author NB - PhyTI * \version x.x * \date 20 decembre 2009 * * traitement d'images en entiers 16 bits non signe : ppm * USAGE : SNAKE2D image.pgm 0 (ou 1) */ #include #include #include "structures.h" extern "C"{ #include "lib_alloc.h" #include "lib_images.h" #include "lib_snake_common.h" #include "lib_math.h" #include "lib_gpu.h" #include "defines.h" #include "lib_snake_2_gpu.h" } #include "lib_kernels_maths.cu" #include "lib_kernels_contribs.cu" int main(int argc, char **argv) { /* declaration des variables */ int ret ; int Prof ; /* profondeur en octets */ uint32 I_dim ; /* hauteur de l'image */ uint32 J_dim ; /* largeur de l'image */ int Nb_level ; /* dynamique de l'image */ char *File_name ; /* images */ unsigned short **Image_in; struct timeval chrono, chrono2, chrono_all ; /* lecture argument entree (basique!) */ File_name = argv[3] ; /* verif type image (pgm 8/16) */ ret = type_image_ppm(&Prof, &I_dim, &J_dim, &Nb_level, File_name) ; if ((ret == 0) | (Prof == 3)) { printf("format non pris en charge ... exit\n") ; return(0) ; } /* infos */ printf("Image : %s\n", File_name) ; printf("lecture OK : %d\n", ret) ; printf("Image (%d x %d) pixels\n", I_dim, J_dim) ; printf("Dynamique : %d\n", Nb_level) ; /* Allocation */ Image_in = new_matrix_ushort(I_dim, J_dim) ; /* chargement image d'entree */ load_pgm2ushort(Image_in, I_dim, J_dim, Nb_level, File_name) ; //POINTEURS VARIABLES MEMOIRE GLOBALE GPU unsigned short * d_img ; // image t_cumul_x * d_img_x ; // images cumulees t_cumul_x2 * d_img_x2; // snake_node_gpu * d_snake ; //image du snake CPU dans un tableau en gmem GPUe int * d_freemanDiDj ; // table de correspondance [Di][Dj]->Freemans int * d_codeNoeud ; // table de correspondance [F_in][F_out]->codeNoeud uint4 * d_positions ; // positions de test autour des noeuds uint2 * d_listes_pixels ; // coordonnees des pixels des segments correspondants aux 8 posiionstest uint2 * d_liste_temp ; uint32 * d_nb_pix_max ; // taille max des segments a tester uint64 * d_contribs_segments_blocs ;// sommes des contribs pixels par blocs de calcul uint64 * d_contribs_segments ; // contribs segments 1, x et x2 uint64 * d_sompart ; // vecteur de resultats intermediaires (sommes partielles = sommes par blocs) int64 * d_stats, * d_stats_ref ; // stats des positions de test, du snake sans les segments en test int64 * d_stats_snake; // stats du snake + stats de l'image complete double * d_vrais, * d_vrais_snake ; // valeurs de la log-vraisemblance des positions de test, du snake uint4 * d_freemans_centres ; // valeurs des F_in, F_out et coord. // centres des 16 segments associes aux 8 positions de test int * d_codes_segments ; // valeurs de codes des 16 segments bool * d_move ; // nb de deplacement effectues lors d'une iteration int * d_nb_nodes ; // nb de noeuds du snake snake_node_gpu * d_snake_tmp ; // snake tampon pour l'etape d'ajout de noeuds /* pointeurs sur mem CPU */ int *h_nb_nodes = new int; // image CPU du nb de noeud du snake snake_node_gpu h_snake[MAX_NODES] ; uint2 h_listes_pixels[MAX_NODES*16*5] ; double h_vrais_snake, h_vrais_mem, h_vrais[8*MAX_NODES] ; // image CPU de la log-vraisemblance bool * h_move = new bool[MAX_NODES];// image CPU du vecteur identifiant les noeuds qui ont bouge uint32 h_nb_pix_max, npixmax ; // taille max des segments a tester : utile pour determiner les params d'execution int nnodes = 4 ; // 4 ou 40 pour l'instant /*allocation memoire GPU */ int retour_cmd = 0; cudaMalloc((void**) &d_nb_nodes, sizeof(int)); cudaMalloc((void**) &d_sompart, MAX_NODES*256*16*sizeof(uint64)); cudaMalloc((void**) &d_liste_temp, MAX_NODES*5*16*sizeof(uint2)); retour_cmd = cudaMalloc((void**) &d_snake_tmp, MAX_NODES*sizeof(snake_node_gpu) ); printf("RESULTAT ALLOC snake_tmp = %d\n", retour_cmd); /*init snake (positions/contribs/stats/freemans/centres/codes)*/ cuda_init_img_cumul(Image_in, I_dim, J_dim, nnodes, &d_img, &d_img_x, &d_img_x2, &d_freemanDiDj, &d_codeNoeud, &d_snake, &d_nb_pix_max, &d_positions, &d_contribs_segments, &d_freemans_centres, &d_codes_segments, &d_stats_snake, &d_stats, &d_stats_ref, &d_vrais, &d_vrais_snake, &d_listes_pixels, &d_contribs_segments_blocs, &d_move ); /* debug : affichage snake */ int Verbose = 1 ; int VERBOSE = 1 ; int Display = 1 ; /* debug : tests fonctions*/ int H= I_dim, L= J_dim ; //snake_node * h_snake_ll; uint64 h_stats_snake[6]; //gpu2snake(d_snake, &h_snake_ll, nnodes); // variables de debug int nb_move, nb_newnode, iter, i ; int nb_move_total=0, nb_test_total=0 ; int NB_iter_max = atoi(argv[1]); int dist = (I_dim+J_dim)/5; int Pas = atoi(argv[2]) ; // dist entre la position actuelle et les positions de test int Dist_min_entre_noeud = 4*Pas ; int bs, nblocs_seg, tpb, bps ; // nb de threads par blocs pour l'execution des kernels, nb de blocs de threads par segment a tester dim3 threads, grid ; // params d'execution des kernels int n_interval ; // nombre d'intervalles Na--Nx--Nb concernes int taille_smem ; // quantite de shared memory allouee pour le calcul des contribs des segments de test bool pairs = true ; // mouvement des noeuds pairs/impairs if (Verbose) { printf("nb noeuds : %d\n", nnodes) ; tic(&chrono_all, NULL) ; } for (iter=1; (iter<=NB_iter_max)&&(Pas>0); iter++, Pas>>=1) { if (VERBOSE) { cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost); printf("\n#%d : pas %d pixels, LV = %lf \n", iter, Pas, h_vrais_snake) ; tic(&chrono, NULL) ; } // DEBUT MOVE SNAKE do { //memorisation precedente LV h_vrais_mem = h_vrais_snake ; // calcul stats sans les paires de segments a bouger soustrait_aux_stats_2N_segments_noeud<<< nnodes , 1 >>>(d_snake, d_stats_snake, d_stats_ref, d_img_x, d_img_x2, d_codeNoeud, J_dim ); // calcul des coordonnées de toutes les positions possibles des noeud a l'etape N+1 liste_positions_a_tester<<>>(d_snake, d_positions, d_nb_pix_max, Pas, nnodes, I_dim, J_dim) ; // recupere la taille maxi des segments cudaMemcpy( &h_nb_pix_max, d_nb_pix_max, sizeof(uint32), cudaMemcpyDeviceToHost) ; // determination des parametres des kernels bs = nextPow2(h_nb_pix_max) ; if (bs>=BSMAX) bs = BSMAX ; // /!\ le kernel <<< calcul_contrib...>>> ne supporte pas un bs>256 a cause de la shared-mem nécessaire if (bs<32) bs = 32 ; nblocs_seg = (h_nb_pix_max+bs-1)/bs ; pairs = false ; n_interval = nnodes/2 + pairs*(nnodes%2) ; taille_smem = CFI(bs)*sizeof(tcontribs) ; threads = dim3(bs,1,1) ; grid = dim3( n_interval*16*nblocs_seg ,1,1) ; //calcul listes pix + contrib partielles + freemans + centres calcul_contribs_segments_blocs_full<<< grid , threads, taille_smem >>>( d_snake, nnodes, d_positions, h_nb_pix_max, d_img_x, d_img_x2, d_codes_segments, J_dim, d_listes_pixels, d_contribs_segments_blocs, pairs); /*debug*/ cudaMemcpy( h_listes_pixels, d_listes_pixels, 16*5*n_interval*sizeof(uint2), cudaMemcpyDeviceToHost); for(int inter=0; inter < 3; inter++){ for(int seg=0; seg<16; seg++){ printf(" intervalle %d segment %d : (%d,%d)-(%d,%d)-(%d,%d)-(%d,%d)-(%d,%d)\n", inter, seg, h_listes_pixels[5*(16*inter+seg)].x,h_listes_pixels[5*(16*inter+seg)].y,h_listes_pixels[5*(16*inter+seg)+1].x, h_listes_pixels[5*(16*inter+seg)+1].y,h_listes_pixels[5*(16*inter+seg)+2].x,h_listes_pixels[5*(16*inter+seg)+2].y, h_listes_pixels[5*(16*inter+seg)+3].x,h_listes_pixels[5*(16*inter+seg)+3].y,h_listes_pixels[5*(16*inter+seg)+4].x, h_listes_pixels[5*(16*inter+seg)+4].y); } } /*fin*/ calcul_freemans_centre<<>>( d_listes_pixels, d_freemanDiDj, d_freemans_centres); //printf("EXEC impairs : %d max pix - %d intervalles => %d blocs de %d threads - %d octets de smem\n", h_nb_pix_max, n_interval, grid.x, threads.x, taille_smem); //sommes des contribs partielles -> contribs segments somsom_full<<< 16*n_interval , 1>>>(d_contribs_segments_blocs, nnodes, nblocs_seg, d_contribs_segments) ; //calcul des stats associees a chaque position de test calcul_stats_full<<< n_interval, 8 >>>(d_snake, nnodes, pairs, d_stats_snake, d_stats_ref, d_stats, d_contribs_segments, d_positions, d_codes_segments, d_freemans_centres, d_codeNoeud, d_img_x, d_img_x2, I_dim, J_dim, d_vrais, d_vrais_snake, d_move); pairs = true ; n_interval = nnodes/2 + pairs*(nnodes%2) ; grid = dim3( n_interval*16*nblocs_seg ,1,1) ; //calcul listes pix + contrib partielles + freemans + centres calcul_contribs_segments_blocs_full<<< grid , threads, taille_smem >>>( d_snake, nnodes, d_positions, h_nb_pix_max, d_img_x, d_img_x2, d_codes_segments, J_dim, d_listes_pixels, d_contribs_segments_blocs, pairs); calcul_freemans_centre<<>>( d_listes_pixels, d_freemanDiDj, d_freemans_centres); //printf("EXEC pairs : %d max pix - %d intervalles => %d blocs de %d threads - %d octets de smem\n", h_nb_pix_max, n_interval, grid.x, threads.x, taille_smem); //sommes des contribs partielles -> contribs segments somsom_full<<< 16*n_interval , 1>>>(d_contribs_segments_blocs, nnodes, nblocs_seg, d_contribs_segments) ; //calcul des stats associees a chaque position de test calcul_stats_full<<< n_interval, 8 >>>(d_snake, nnodes, pairs, d_stats_snake, d_stats_ref, d_stats, d_contribs_segments, d_positions, d_codes_segments, d_freemans_centres, d_codeNoeud, d_img_x, d_img_x2, I_dim, J_dim, d_vrais, d_vrais_snake, d_move); //il faut recalculer les stats du snake apres modif recalcul_stats_snake<<< 1 , 1 >>>(d_snake, nnodes, d_stats_snake, d_vrais_snake, d_img_x, d_img_x2, d_codeNoeud, J_dim ); cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost); //printf("iter %d apres recalcul du move LV = %lf - ", iter, h_vrais_snake) ; nb_move = 0; //recup move cudaMemcpy( h_move, d_move, nnodes*sizeof(bool), cudaMemcpyDeviceToHost); i = 0; while (i>>(d_snake, d_snake_tmp, nnodes, Dist_min_entre_noeud, d_nb_nodes ); cudaMemcpy( h_nb_nodes, d_nb_nodes, sizeof(int), cudaMemcpyDeviceToHost); nnodes += (*h_nb_nodes) ; cudaMemcpy( d_snake, d_snake_tmp, nnodes*sizeof(snake_node_gpu), cudaMemcpyDeviceToDevice); npixmax = h_nb_pix_max ; tpb = nextPow2(npixmax) ; if (tpb >= BSMAX) tpb = BSMAX ;// /!\ le kernel <<< calcul_contrib...>>> ne supporte pas un bs>256 a cause de la shared-mem nécessaire if (tpb < 32 ) tpb = 32 ; bps = (npixmax+tpb-1)/tpb ; recalcul_contribs_segments_snake<<< nnodes*bps, tpb, CFI(tpb)*sizeof(tcontribs)>>>(d_snake, nnodes, d_img_x, d_img_x2, J_dim, d_liste_temp, d_sompart ); recalcul_freemans_centre<<>>(d_snake, d_liste_temp, d_freemanDiDj); resomsom_snake<<< nnodes , 1 >>>(d_sompart, nnodes, bps, d_snake); recalcul_stats_snake<<< 1 , 1 >>>(d_snake, nnodes, d_stats_snake, d_vrais_snake, d_img_x, d_img_x2, d_codeNoeud, J_dim ); if (*h_nb_nodes == 0) break ; cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost); printf("\niter %d LV apres ajout noeuds = %lf \n ", iter, h_vrais_snake) ; } if (VERBOSE) { toc(chrono, "temps sequence move"); printf("nb deplacements : %d\n", nb_move) ; printf("nb deplacements total/test : %d/%d\n", nb_move_total, nb_test_total) ; printf("nb nouveaux noeuds : %d (total: %d)\n", *h_nb_nodes, nnodes) ; printf("\nlongueur de codage de gl : %lf \n", h_vrais_snake) ; } } if (Verbose) { toc(chrono_all, "temps move mv") ; cudaMemcpy( h_stats_snake, d_stats_snake, 6*sizeof(uint64), cudaMemcpyDeviceToHost); cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost); printf("\nFIN : longueur de codage de gl : %lf (%d)\n", h_vrais_snake, h_stats_snake[0]) ; printf("nb noeuds : %d, nb_iter : %d\n", nnodes, iter-1) ; printf("nb deplacements total/test : %d/%d\n", nb_move_total, nb_test_total) ; } if (Display) { /* old fashion way to draw the snake gpu2snake(d_snake, &h_snake_ll, nnodes); uint32 * Liste_pixel_segment = new uint32[I_dim+J_dim]; affiche_snake_ushort(Image_in, h_snake_ll, 255, 0, Liste_pixel_segment) ; delete Liste_pixel_segment ; delete h_snake_ll; */ cudaMemcpy( h_snake, d_snake, nnodes*sizeof(snake_node_gpu), cudaMemcpyDeviceToHost); //affiche coordonnees for (int node=0; node