#include <stdio.h>
#include <malloc.h>
#include "structures.h"
+#include "cutil_inline.h"
extern "C"{
#include "lib_alloc.h"
#include "lib_images.h"
if (VERBOSE)
{
- cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost);
+ cutilSafeCall( cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost) );
printf("\n#%d : pas %d pixels, LV = %lf \n", iter, Pas, h_vrais_snake) ;
tic(&chrono, NULL) ;
}
liste_positions_a_tester<<<nnodes, 8>>>(d_snake, d_positions, d_nb_pix_max, Pas, nnodes, I_dim, J_dim) ;
// fetch the maximum segment size (in pixels) back to the host
- cudaMemcpy( &h_nb_pix_max, d_nb_pix_max, sizeof(uint32), cudaMemcpyDeviceToHost) ;
+ cutilSafeCallNoSync( cudaMemcpy( &h_nb_pix_max, d_nb_pix_max, sizeof(uint32), cudaMemcpyDeviceToHost) ) ;
// determine the kernel launch parameters: bs threads per block, nblocs_seg = ceil(h_nb_pix_max/bs) blocks per segment. NOTE(review): bs is not capped at BSMAX here, unlike tpb in the snake recomputation further down - confirm nextPow2(h_nb_pix_max) cannot exceed the block-size / shared-memory limits
bs = nextPow2(h_nb_pix_max) ;
nblocs_seg = (h_nb_pix_max+bs-1)/bs ;
pairs = false ;
- n_interval = nnodes/2 + pairs*(nnodes%2) ;
+ n_interval = nnodes ;
taille_smem = CFI(bs)*sizeof(tcontribs) ;
threads = dim3(bs,1,1) ;
grid = dim3( n_interval*16*nblocs_seg ,1,1) ;
//compute pixel lists + partial contribs + freemans + centres
calcul_contribs_segments_blocs_full<<< grid , threads, taille_smem >>>( d_snake, nnodes, d_positions, h_nb_pix_max,
- d_img_x, d_img_x2, d_codes_segments,
- J_dim, d_listes_pixels, d_contribs_segments_blocs,
- pairs);
+ d_img_x, d_img_x2, d_codes_segments,
+ J_dim, d_listes_pixels, d_contribs_segments_blocs,
+ pairs);
calcul_freemans_centre<<<n_interval, 16>>>( d_listes_pixels, d_freemanDiDj, d_freemans_centres);
//printf("EXEC impairs : %d max pix - %d intervalles => %d blocs de %d threads - %d octets de smem\n", h_nb_pix_max, n_interval, grid.x, threads.x, taille_smem);
somsom_full<<< 16*n_interval , 1>>>(d_contribs_segments_blocs, nnodes, nblocs_seg, d_contribs_segments) ;
//compute the stats associated with each test position
- calcul_stats_full<<< n_interval, 8 >>>(d_snake, nnodes, pairs, d_stats_snake, d_stats_ref, d_stats, d_contribs_segments,
+ calcul_stats_full<<< n_interval, 8 >>>(d_snake, d_snake_tmp, nnodes, pairs, d_stats_snake, d_stats_ref, d_stats, d_contribs_segments,
d_positions, d_codes_segments, d_freemans_centres, d_codeNoeud,
d_img_x, d_img_x2, I_dim, J_dim, d_vrais, d_vrais_snake, d_move);
- pairs = true ;
- n_interval = nnodes/2 + pairs*(nnodes%2) ;
- grid = dim3( n_interval*16*nblocs_seg ,1,1) ;
- //calcul listes pix + contrib partielles + freemans + centres
- calcul_contribs_segments_blocs_full<<< grid , threads, taille_smem >>>( d_snake, nnodes, d_positions, h_nb_pix_max,
- d_img_x, d_img_x2, d_codes_segments,
- J_dim, d_listes_pixels, d_contribs_segments_blocs,
- pairs);
- calcul_freemans_centre<<<n_interval, 16>>>( d_listes_pixels, d_freemanDiDj, d_freemans_centres);
- //printf("EXEC pairs : %d max pix - %d intervalles => %d blocs de %d threads - %d octets de smem\n", h_nb_pix_max, n_interval, grid.x, threads.x, taille_smem);
- //sommes des contribs partielles -> contribs segments
- somsom_full<<< 16*n_interval , 1>>>(d_contribs_segments_blocs, nnodes, nblocs_seg, d_contribs_segments) ;
+ //kernel launch parameters for recomputing the snake's contribs and stats: tpb = nextPow2(npixmax) capped at BSMAX then raised to at least 32, bps = ceil(npixmax/tpb)
+ npixmax = h_nb_pix_max ;
+ tpb = nextPow2(npixmax) ;
+ if (tpb >= BSMAX) tpb = BSMAX ;// /!\ the <<< calcul_contrib...>>> kernel does not support bs>BSMAX because of the shared memory it requires
+ if (tpb < 32 ) tpb = 32 ;
+ bps = (npixmax+tpb-1)/tpb ;
+ //compute the partial sums of the contribs + segment codes
+ recalcul_contribs_segments_snake<<< nnodes*bps, tpb, CFI(tpb)*sizeof(tcontribs)>>>(d_snake_tmp, nnodes,
+ d_img_x, d_img_x2,
+ J_dim, d_liste_temp, d_sompart );
+ //compute the freemans and the centres from the 5 points stored per segment in 'd_liste_temp'
+ recalcul_freemans_centre<<<nnodes, 1>>>(d_snake_tmp, d_liste_temp, d_freemanDiDj);
+ //sum of the partial sums
+ resomsom_snake<<< nnodes , 1 >>>(d_sompart, nnodes, bps, d_snake_tmp);
+ //compute the stats
+ recalcul_stats_snake<<< 1 , 1 >>>(d_snake_tmp, nnodes, d_stats_snake, d_vrais_snake,
+ d_img_x, d_img_x2,
+ d_codeNoeud, J_dim
+ );
+ copie_snake<<< nnodes, 1 >>>(d_snake_tmp, d_snake) ;
- //calcul des stats associees a chaque position de test
- calcul_stats_full<<< n_interval, 8 >>>(d_snake, nnodes, pairs, d_stats_snake, d_stats_ref, d_stats, d_contribs_segments,
- d_positions, d_codes_segments, d_freemans_centres, d_codeNoeud,
- d_img_x, d_img_x2, I_dim, J_dim, d_vrais, d_vrais_snake, d_move);
-
-
- //il faut recalculer les stats du snake apres modif
- recalcul_stats_snake<<< 1 , 1 >>>(d_snake, nnodes, d_stats_snake, d_vrais_snake,
- d_img_x, d_img_x2,
- d_codeNoeud, J_dim
- );
-
- cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost);
- //printf("iter %d apres recalcul du move LV = %lf - ", iter, h_vrais_snake) ;
+ cutilSafeCallNoSync( cudaMemcpy( &h_vrais_snake, d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost) );
+ printf("iter %d apres recalcul du move LV = %lf - \n", iter, h_vrais_snake) ;
nb_move = 0;
//recup move
- cudaMemcpy( h_move, d_move, nnodes*sizeof(bool), cudaMemcpyDeviceToHost);
+ cutilSafeCallNoSync( cudaMemcpy( h_move, d_move, nnodes*sizeof(bool), cudaMemcpyDeviceToHost) );
i = 0;
while (i<nnodes)
{
delete Liste_pixel_segment ;
delete h_snake_ll;
*/
- cudaMemcpy( h_snake, d_snake, nnodes*sizeof(snake_node_gpu), cudaMemcpyDeviceToHost);
+ cudaMemcpy( h_snake, d_snake_tmp, nnodes*sizeof(snake_node_gpu), cudaMemcpyDeviceToHost);
//affiche coordonnees
for (int node=0; node<nnodes; node++){
printf("NODE %d %d %d \n", node, h_snake[node].posi, h_snake[node].posj);
cudaFree(d_freemanDiDj);
cudaFree(d_codeNoeud);
cudaFree(d_snake);
+ cudaFree(d_snake_tmp);
cudaFree(d_nb_pix_max);
cudaFree(d_positions);
cudaFree(d_contribs_segments);