X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/snake_gpu.git/blobdiff_plain/086196b2c97166f2420134552efb0a20195fece3..refs/heads/master:/src/snake2D_gpu.cu diff --git a/src/snake2D_gpu.cu b/src/snake2D_gpu.cu index d5a42bb..1ba2eda 100644 --- a/src/snake2D_gpu.cu +++ b/src/snake2D_gpu.cu @@ -12,6 +12,7 @@ #include #include #include "structures.h" +#include "cutil_inline.h" extern "C"{ #include "lib_alloc.h" #include "lib_images.h" @@ -158,7 +159,7 @@ int main(int argc, char **argv) if (VERBOSE) { - cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost); + cutilSafeCall( cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost) ); printf("\n#%d : pas %d pixels, LV = %lf \n", iter, Pas, h_vrais_snake) ; tic(&chrono, NULL) ; } @@ -177,7 +178,7 @@ int main(int argc, char **argv) liste_positions_a_tester<<>>(d_snake, d_positions, d_nb_pix_max, Pas, nnodes, I_dim, J_dim) ; // recupere la taille maxi des segments - cudaMemcpy( &h_nb_pix_max, d_nb_pix_max, sizeof(uint32), cudaMemcpyDeviceToHost) ; + cutilSafeCallNoSync( cudaMemcpy( &h_nb_pix_max, d_nb_pix_max, sizeof(uint32), cudaMemcpyDeviceToHost) ) ; // determination des parametres des kernels bs = nextPow2(h_nb_pix_max) ; @@ -186,16 +187,16 @@ int main(int argc, char **argv) nblocs_seg = (h_nb_pix_max+bs-1)/bs ; pairs = false ; - n_interval = nnodes/2 + pairs*(nnodes%2) ; + n_interval = nnodes ; taille_smem = CFI(bs)*sizeof(tcontribs) ; threads = dim3(bs,1,1) ; grid = dim3( n_interval*16*nblocs_seg ,1,1) ; //calcul listes pix + contrib partielles + freemans + centres calcul_contribs_segments_blocs_full<<< grid , threads, taille_smem >>>( d_snake, nnodes, d_positions, h_nb_pix_max, - d_img_x, d_img_x2, d_codes_segments, - J_dim, d_listes_pixels, d_contribs_segments_blocs, - pairs); + d_img_x, d_img_x2, d_codes_segments, + J_dim, d_listes_pixels, d_contribs_segments_blocs, + pairs); calcul_freemans_centre<<>>( d_listes_pixels, d_freemanDiDj, d_freemans_centres); //printf("EXEC impairs : %d max pix - %d intervalles => %d blocs de %d threads - %d octets de smem\n", h_nb_pix_max, n_interval, grid.x, threads.x, taille_smem); @@ -203,42 +204,38 @@ int main(int argc, char **argv) somsom_full<<< 16*n_interval , 1>>>(d_contribs_segments_blocs, nnodes, nblocs_seg, d_contribs_segments) ; //calcul des stats associees a chaque position de test - calcul_stats_full<<< n_interval, 8 >>>(d_snake, nnodes, pairs, d_stats_snake, d_stats_ref, d_stats, d_contribs_segments, + calcul_stats_full<<< n_interval, 8 >>>(d_snake, d_snake_tmp, nnodes, pairs, d_stats_snake, d_stats_ref, d_stats, d_contribs_segments, d_positions, d_codes_segments, d_freemans_centres, d_codeNoeud, d_img_x, d_img_x2, I_dim, J_dim, d_vrais, d_vrais_snake, d_move); - pairs = true ; - n_interval = nnodes/2 + pairs*(nnodes%2) ; - grid = dim3( n_interval*16*nblocs_seg ,1,1) ; - //calcul listes pix + contrib partielles + freemans + centres - calcul_contribs_segments_blocs_full<<< grid , threads, taille_smem >>>( d_snake, nnodes, d_positions, h_nb_pix_max, - d_img_x, d_img_x2, d_codes_segments, - J_dim, d_listes_pixels, d_contribs_segments_blocs, - pairs); - calcul_freemans_centre<<>>( d_listes_pixels, d_freemanDiDj, d_freemans_centres); - //printf("EXEC pairs : %d max pix - %d intervalles => %d blocs de %d threads - %d octets de smem\n", h_nb_pix_max, n_interval, grid.x, threads.x, taille_smem); - //sommes des contribs partielles -> contribs segments - somsom_full<<< 16*n_interval , 1>>>(d_contribs_segments_blocs, nnodes, nblocs_seg, d_contribs_segments) ; + //parametres d'execution des kernels pour le recalcul des contribs et stats du snake + npixmax = h_nb_pix_max ; + tpb = nextPow2(npixmax) ; + if (tpb >= BSMAX) tpb = BSMAX ;// /!\ le kernel <<< calcul_contrib...>>> ne supporte pas un bs>BSMAX a cause de la shared-mem nécessaire + if (tpb < 32 ) tpb = 32 ; + bps = (npixmax+tpb-1)/tpb ; + //calcul sommes partielles des contribs + codes segments + recalcul_contribs_segments_snake<<< nnodes*bps, tpb, CFI(tpb)*sizeof(tcontribs)>>>(d_snake_tmp, nnodes, + d_img_x, d_img_x2, + J_dim, d_liste_temp, d_sompart ); + //calcul des freemans et des centres a partir des 5 points stockes par segment dans 'd_liste_temp' + recalcul_freemans_centre<<>>(d_snake_tmp, d_liste_temp, d_freemanDiDj); + //somme des sommes partielles + resomsom_snake<<< nnodes , 1 >>>(d_sompart, nnodes, bps, d_snake_tmp); + //calcul des stats + recalcul_stats_snake<<< 1 , 1 >>>(d_snake_tmp, nnodes, d_stats_snake, d_vrais_snake, + d_img_x, d_img_x2, + d_codeNoeud, J_dim + ); + copie_snake<<< nnodes, 1 >>>(d_snake_tmp, d_snake) ; - //calcul des stats associees a chaque position de test - calcul_stats_full<<< n_interval, 8 >>>(d_snake, nnodes, pairs, d_stats_snake, d_stats_ref, d_stats, d_contribs_segments, - d_positions, d_codes_segments, d_freemans_centres, d_codeNoeud, - d_img_x, d_img_x2, I_dim, J_dim, d_vrais, d_vrais_snake, d_move); - - - //il faut recalculer les stats du snake apres modif - recalcul_stats_snake<<< 1 , 1 >>>(d_snake, nnodes, d_stats_snake, d_vrais_snake, - d_img_x, d_img_x2, - d_codeNoeud, J_dim - ); - - cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost); - //printf("iter %d apres recalcul du move LV = %lf - ", iter, h_vrais_snake) ; + cutilSafeCallNoSync( cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost) ); + printf("iter %d apres recalcul du move LV = %lf - \n", iter, h_vrais_snake) ; nb_move = 0; //recup move - cudaMemcpy( h_move, d_move, nnodes*sizeof(bool), cudaMemcpyDeviceToHost); + cutilSafeCallNoSync( cudaMemcpy( h_move, d_move, nnodes*sizeof(bool), cudaMemcpyDeviceToHost) ); i = 0; while (i