X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/snake_gpu.git/blobdiff_plain/48b1bac747f398161b53a67ad80c4596f531c88a..5f3b433716d7b93cda65ae4ce49969105fdafdbd:/src/lib_gpu.cu?ds=inline

diff --git a/src/lib_gpu.cu b/src/lib_gpu.cu
index 5a90c1c..5406160 100644
--- a/src/lib_gpu.cu
+++ b/src/lib_gpu.cu
@@ -1,6 +1,6 @@
 #include
-
+#include
 extern "C"{
 #include "structures.h"
@@ -24,7 +24,27 @@ bool DISPLAY_ERR_IMG_CUMUL = 1;
 //#define DEBUG_LISTES
 //#define DEBUG_STATS_REF
-
+double critere_gauss_cpu( int n, int ni, uint64 sxi, uint64 sxi2, int64 *stats_img){
+  int ne = n - ni ;
+  uint64 sxe = stats_img[4] - sxi ;
+  uint64 sxe2= stats_img[5] - sxi2;
+  double sigi2, sige2, critere ;
+
+  /* variance of the gray levels inside the region */
+  sigi2 =
+    ((double)sxi2/ni) -
+    ((double)sxi/ni)*((double)sxi/ni) ;
+
+  /* variance of the gray levels outside the region */
+  sige2 =
+    ((double)sxe2)/ne -
+    ((double)sxe/ne)*((double)sxe/ne) ;
+
+  if ( (sigi2 > 0) && (sige2 > 0) )
+    critere = 0.5*((double)ni*log(sigi2) + (double)ne*log(sige2)) ;
+  else critere = DBL_MAX ;
+  return critere;
+}
 void cuda_init_img_cumul(unsigned short ** img_in, int H, int L, int nb_nodes,
                          unsigned short ** d_img, t_cumul_x ** d_img_x, t_cumul_x2 ** d_img_x2,
@@ -151,16 +171,15 @@ void cuda_init_img_cumul(unsigned short ** img_in, int H, int L, int nb_nodes,
   //compute the total sums N, sigX and sigX2 over the image
   calcul_stats_image<<<1, 1>>>( *d_img_x, *d_img_x2, H, L, (uint64*)*d_stats_snake);
-
-
+
   cudaThreadSynchronize() ;
   toc(chrono, "\tTemps GPU");
-  if(DEBUG_IMG_CUMUL)
-    {
-
-      //CPU memory allocation
+  //CPU memory allocation
   t_cumul_x  * img_x  = new t_cumul_x  [H*L];
   t_cumul_x2 * img_x2 = new t_cumul_x2 [H*L];
+  uint64 sigX = 0, sigX2 = 0 ;
+  if(DEBUG_IMG_CUMUL)
+    {
      /*for comparison testing*/
      t_cumul_x * img_xb = new t_cumul_x [H*L];
@@ -218,7 +237,7 @@ void cuda_init_img_cumul(unsigned short ** img_in, int H, int L, int nb_nodes,
        }
      printf("%d erreurs sur CX / %d points\n", cpt_errx, cpt );
      printf("%d erreurs sur CX2 / %d points\n", cpt_errx2, cpt );
-     uint64 sigX = 0, sigX2 = 0 ;
+
      for (int i=0; i<[...]
-[...]>>>(*d_snake, *d_img_x, *d_img_x2, H, L, *d_stats_snake, d_all_crit) ;
-     cudaThreadSynchronize();
-     toc(chrono, "\nCALCULS RECTANGLES");
-
-     //fetch the rectangle data
-     int ret;
-     ret = cudaMemcpy( h_all_crit, d_all_crit, Nperm*sizeof(t_rectangle_snake), cudaMemcpyDeviceToHost) ;
-     printf("COPIE DATA = %s\n",(ret==0)?"OK":"ERR");
-
-     //optimum search on CPU
-     best_crit = h_all_crit[0].crit ;
-     ind_best_crit = 0 ;
-     for (int k=1; k<100; k++){
-       if ((h_all_crit[k].crit > 0) && (h_all_crit[k].crit < best_crit)) {
-         best_crit = h_all_crit[k].crit ;
-         ind_best_crit = k ;
+     //kernel execution
+     calcul_contribs_cols<<< ... >>>( *d_img_x, *d_img_x2, H, L, d_contribs_part );
+     somsom_contribs<<< ... >>>( d_contribs_part, bpc, d_contribs_cols ) ;
+     calcul_contribs_permutations<<< ... >>>(d_contribs_cols, bpc, H, L, (uint64*) *d_stats_snake, d_miniblocs) ;
+     cudaMemcpy( h_miniblocs, d_miniblocs, div*sizeof(double2), cudaMemcpyDeviceToHost ) ;
+
+     //scan the blocks for the minimum
+     double crit_mini = h_miniblocs[0].x;
+     int j1=0, id_mini=0;
+     for (j1=1 ; j1 < div ; j1++){
+       if (h_miniblocs[j1].x < crit_mini) {
+         crit_mini = h_miniblocs[j1].x ;
+         id_mini = j1 ;
+       }
+     }
+     toc(chrono, "\nCALCUL RECTANGLE");
+     j1 = pas * id_mini ;
+     int j2 = (int)(pas*h_miniblocs[ id_mini ].y) ;
+     printf("pas = %d cols -- critere mini =%f , positions j1=%d j2=%d\n", pas, h_miniblocs[ id_mini ].x, j1, j2);
+
+
+     // GPU -> CPU data transfer
+     cudaMemcpy( h_contribs_cols, d_contribs_cols, div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;
+
+     //check the per-column contributions
+     for (int c=0 ; c < div ; c++){
+       // reference values computed on the CPU
+       h_contribs_cols_cpu[c].cx = 0 ;
+       h_contribs_cols_cpu[c].cx2= 0 ;
+       for (int ip=0; ip < H; ip++){
+         h_contribs_cols_cpu[c].cx  += img_x[ ip*L + c*pas] ;
+         h_contribs_cols_cpu[c].cx2 += img_x2[ ip*L + c*pas] ;
+       }
+       //compare against the GPU values
+       if ( (h_contribs_cols_cpu[c].cx != h_contribs_cols[c].cx) || (h_contribs_cols_cpu[c].cx2 != h_contribs_cols[c].cx2) )
+         printf("ERR colonne %d -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n",
+                c*pas, h_contribs_cols_cpu[c].cx, h_contribs_cols_cpu[c].cx2, h_contribs_cols[c].cx, h_contribs_cols[c].cx2 );
+     }
+     cudaFree(d_contribs_part) ;
+     cudaFree(d_contribs_cols) ;
+     cudaFree(d_miniblocs) ;
+     free(h_contribs_cols);
+     free(h_contribs_cols_cpu);
+     free(h_miniblocs) ;
+
+     //reallocation for the horizontal-line pass
+     bs = 128 ;
+
+     div = (H+bs-1)/bs ;
+     printf("DIV = %d\n", div ) ;
+
+     int divpow2 = nextPow2(div) ;
+     printf("DIVPOW2 = %d\n", divpow2) ;
+
+     grid = dim3(div, 1, 1) ;
+     smem = CFI(bs)*sizeof(tcontribs) ;
+     cudaMalloc((void**) &d_contribs_part, div*sizeof(tcontribs)) ;
+     cudaMalloc((void**) &d_contribs_cols, div*div*sizeof(tcontribs)) ;
+     cudaMalloc((void**) &d_miniblocs, div*sizeof(double2)) ;
+     h_contribs_cols = new tcontribs[div*div] ;
+     tcontribs * h_contribs_part = new tcontribs[div] ;
+     h_miniblocs = new double2[div] ;
+
+     tic(&chrono, NULL);
+     // kernel calls for the horizontal-line optimization
+     calcul_contrib_conjuguee_colonnes<<< ... >>>( *d_img_x, *d_img_x2, H, L, j1, j2, d_contribs_part) ;
+
+     /* CPU check
+     int cpt = 0 ;
+     int cpterr = 0 ;
+     tcontribs * h_contribs_part_cpu = new tcontribs[div] ;
+     cudaMemcpy( h_contribs_part, d_contribs_part, div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;
+     for (int bloc=0; bloc < div; bloc++){
+       h_contribs_part_cpu[ bloc ].cx  = 0 ;
+       h_contribs_part_cpu[ bloc ].cx2 = 0 ;
+       for (int line=0; ((line < bs)&&(bloc*bs+line < H)); line++){
+         h_contribs_part_cpu[bloc].cx  += img_x[ (bloc*bs+line)*L + j2]  - img_x[ (bloc*bs+line)*L + j1 ];
+         h_contribs_part_cpu[bloc].cx2 += img_x2[ (bloc*bs+line)*L + j2] - img_x2[ (bloc*bs+line)*L + j1 ];
+       }
+       if ( ( h_contribs_part_cpu[bloc].cx != h_contribs_part[bloc].cx ) || ( h_contribs_part_cpu[bloc].cx2 != h_contribs_part[bloc].cx2 ) )
+         {
+           printf("ERREUR bloc %d -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n", bloc,
+                  h_contribs_part_cpu[bloc].cx, h_contribs_part_cpu[bloc].cx2, h_contribs_part[bloc].cx, h_contribs_part[bloc].cx2 ) ;
+           cpterr++;
+         }
+       cpt++ ;
+     }
+     printf("VERIF CONTRIB CONJUGUEES BLOCS --> %d ERREURS / %d BLOCS\n", cpterr, cpt) ;
+     end of check*/
+
+     grid = dim3(div, div, 1) ;
+     calcul_contribs_snake_rectangle<<< ... >>>(d_contribs_part, d_contribs_cols) ;
+
+     /* CPU check
+     h_contribs_cols_cpu = new tcontribs[div*div] ;
+     cudaMemcpy( h_contribs_cols, d_contribs_cols, div*div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;
+     cpt = 0 ;
+     cpterr = 0 ;
+     for (int i1=0; i1 < div ; i1++){
+       for (int i2=0 ; i2 < div ; i2++){
+         if (i2 >= i1){
+           h_contribs_cols_cpu[ i1*div +i2 ].cx = 0 ;
+           h_contribs_cols_cpu[ i1*div +i2 ].cx2= 0 ;
+           for (int d=i1 ; d <= i2 ; d++){
+             h_contribs_cols_cpu[ i1*div +i2 ].cx += h_contribs_part_cpu[ d ].cx ;
+             h_contribs_cols_cpu[ i1*div +i2 ].cx2+= h_contribs_part_cpu[ d ].cx2 ;
+           }
+         } else {
+           h_contribs_cols_cpu[ i1*div +i2 ].cx = 0 ;
+           h_contribs_cols_cpu[ i1*div +i2 ].cx2= 0 ;
+         }
+
+         if (( ( h_contribs_cols_cpu[ i1*div +i2 ].cx != h_contribs_cols[ i1*div +i2 ].cx ) || ( h_contribs_cols_cpu[ i1*div +i2].cx2 != h_contribs_cols[ i1*div +i2].cx2 ) )
+              && (i2 >= i1))
+           {
+             printf("ERREUR combinaison (%d, %d) -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n", i1, i2,
+                    h_contribs_cols_cpu[ i1*div +i2].cx, h_contribs_cols_cpu[ i1*div +i2 ].cx2, h_contribs_cols[ i1*div +i2 ].cx, h_contribs_cols[ i1*div +i2 ].cx2 ) ;
+             cpterr++;
+           }
+         cpt++ ;
       }
-       printf("%d -> ( %d, %d )--( %d, %d) CRITERE = %f\n", k, h_all_crit[k].bpi, h_all_crit[k].bpj,
-              h_all_crit[k].opi, h_all_crit[k].opj, h_all_crit[k].crit );
     }
+     printf("VERIF COMBINAISONS LIGNES --> %d ERREURS / %d COMBINAISONS\n", cpterr, cpt) ;
+     end of check */
-     printf("BEST RECTANGLE/%d tests : %d -> ( %d, %d )--( %d, %d) CRITERE = %f\n", Nperm, ind_best_crit, h_all_crit[ind_best_crit].bpi, h_all_crit[ind_best_crit].bpj,
-            h_all_crit[ind_best_crit].opi, h_all_crit[ind_best_crit].opj, best_crit );
-     exit(0);
-     /*end of the optimal initial rectangle snake test*/
-
-     //genere_snake_rectangle_4nodes_gpu<<< 1, 1>>>(*d_snake, 140, H, L) ;
+     grid = dim3(div, 1, 1) ;
+     calcul_critere_permutations_verticales<<< grid, divpow2, CFI(divpow2)*sizeof(double2) >>>(d_contribs_cols, bs, j1, j2, H, L, sigX, sigX2, d_miniblocs) ;
+
+     /* CPU check
+     cpt = 0 ;
+     cpterr = 0 ;
+     double2 * h_miniblocs_cpu = new double2[ div ] ;
+     cudaMemcpy( h_miniblocs, d_miniblocs, div*sizeof(double2), cudaMemcpyDeviceToHost) ;
+     cudaMemcpy( h_stats_snake, *d_stats_snake, 6*sizeof(int64), cudaMemcpyDeviceToHost) ;
+     for (int lb=0 ; lb < div ; lb++){
+       h_miniblocs_cpu[lb].x = DBL_MAX ;
+       for (int lh=lb ; lh < div ; lh++){
+         if ( critere_gauss_cpu(H*L, (lh-lb+1)*bs*(j2 - j1), h_contribs_cols_cpu[ lb*div +lh ].cx, h_contribs_cols_cpu[ lb*div + lh ].cx2, h_stats_snake ) < h_miniblocs_cpu[lb].x )
+           {
+             h_miniblocs_cpu[lb].x = critere_gauss_cpu(H*L, (lh-lb+1)*bs*(j2 - j1), h_contribs_cols_cpu[ lb*div +lh ].cx, h_contribs_cols_cpu[ lb*div + lh ].cx2, h_stats_snake) ;
+             h_miniblocs_cpu[lb].y = (double)lh ;
+           }
+       }
+       if ( ( h_miniblocs_cpu[lb].x > 1.000001*h_miniblocs[lb].x ) || ( h_miniblocs_cpu[lb].x < 0.999999*h_miniblocs[lb].x ) || ( h_miniblocs_cpu[lb].y != h_miniblocs[lb].y ) )
+         {
+           printf("ERREUR MINIMUM BLOC LIGNE %d -> CPU=%lf en i2=%d | GPU=%lf en i2=%d\n", lb,
+                  h_miniblocs_cpu[ lb ].x, (int)h_miniblocs_cpu[ lb ].y, h_miniblocs[ lb ].x, (int)h_miniblocs[ lb ].y ) ;
+           cpterr++;
+         }
+       cpt++ ;
+     }
+     printf("VERIF MINIMA PAR BLOC --> %d ERREURS / %d BLOCS\n", cpterr, cpt) ;
+     */
+     /* end of check */
+
+     /*
+      * determine the absolute minimum
+      * this part stays on the CPU
+      */
+     cudaMemcpy( h_miniblocs, d_miniblocs, div*sizeof(double2), cudaMemcpyDeviceToHost) ;
+     crit_mini = h_miniblocs[0].x ;
+     int i1=0 ;
+     id_mini=0 ;
+     for (i1=1 ; i1 < div ; i1++){
+       if (h_miniblocs[i1].x < crit_mini) {
+         crit_mini = h_miniblocs[i1].x ;
+         id_mini = i1 ;
+       }
+     }
+     i1 = bs * id_mini ;
+     int i2 = (int)(bs*h_miniblocs[ id_mini ].y) ;
+     toc(chrono, "CALCUL RECTANGLE");
+
+     printf("pas = %d lignes -- critere mini =%f , positions i1=%d i2=%d\n", bs, h_miniblocs[ id_mini ].x, i1, i2);
+     /*end of the optimal initial rectangle snake test*/
+     cudaFree(d_contribs_part) ;
+     cudaFree(d_contribs_cols) ;
+     cudaFree(d_miniblocs) ;
+
+     tic(&chrono, NULL);
+     //genere_snake_rectangle_4nodes_gpu<<< 1, 1>>>(*d_snake, 140, H, L) ;
+     //genere_snake_bande_gpu<<<1,1>>>(*d_snake, pas*id_mini, (int)(pas*h_miniblocs[ id_mini ].y), H);
+     genere_snake_rectangle<<<1,1>>>(*d_snake, i1, i2, j1, j2);
+
      int nnodes = nb_nodes ;
      snake_node_gpu * h_snake = new snake_node_gpu[nnodes];
      snake_node * h_snake_ll = new snake_node[nnodes] ;
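
Note on the new criterion: critere_gauss_cpu scores a candidate rectangle with the two-class Gaussian log-likelihood cost 0.5*(ni*log(sigi2) + ne*log(sige2)), where ni, sxi and sxi2 are the pixel count and the sums of x and x^2 inside the rectangle, stats_img[4] and stats_img[5] hold the same sums over the whole image, and the outside statistics are obtained by subtraction. The rectangle search above therefore keeps the candidate with the smallest criterion (crit_mini). Below is a minimal standalone host-side sketch of that contract: only critere_gauss_cpu is taken from the diff; the toy image, the score_rect helper, the 4x6 dimensions and the uint64/int64 typedefs are illustrative assumptions.

    /* Standalone sketch (assumptions: fixed-width typedefs, toy data). */
    #include <stdio.h>
    #include <stdint.h>
    #include <math.h>
    #include <float.h>

    typedef uint64_t uint64;   /* assumed equivalent of the repo's uint64 */
    typedef int64_t  int64;    /* assumed equivalent of the repo's int64  */

    #define H 4
    #define L 6

    /* copied from the diff above */
    double critere_gauss_cpu( int n, int ni, uint64 sxi, uint64 sxi2, int64 *stats_img){
      int ne = n - ni ;
      uint64 sxe = stats_img[4] - sxi ;
      uint64 sxe2= stats_img[5] - sxi2;
      double sigi2, sige2, critere ;
      sigi2 = ((double)sxi2/ni) - ((double)sxi/ni)*((double)sxi/ni) ;
      sige2 = ((double)sxe2)/ne - ((double)sxe/ne)*((double)sxe/ne) ;
      if ( (sigi2 > 0) && (sige2 > 0) )
        critere = 0.5*((double)ni*log(sigi2) + (double)ne*log(sige2)) ;
      else critere = DBL_MAX ;
      return critere;
    }

    /* hypothetical helper: accumulate the inside sums of rectangle
       [i1..i2] x [j1..j2], then score the inside/outside split */
    static double score_rect(unsigned short img[H][L], int i1, int i2,
                             int j1, int j2, int64 *stats_img){
      uint64 sxi = 0, sxi2 = 0 ;
      int ni = 0 ;
      for (int i = i1; i <= i2; i++)
        for (int j = j1; j <= j2; j++){
          sxi  += img[i][j] ;
          sxi2 += (uint64)img[i][j]*img[i][j] ;
          ni++ ;
        }
      return critere_gauss_cpu(H*L, ni, sxi, sxi2, stats_img) ;
    }

    int main(void){
      /* toy image: noisy bright block on rows 1-2, cols 1-3, dark elsewhere */
      unsigned short img[H][L] ;
      for (int i = 0; i < H; i++)
        for (int j = 0; j < L; j++){
          int inside = (i >= 1 && i <= 2 && j >= 1 && j <= 3) ;
          img[i][j] = inside ? ((i+j)%2 ? 210 : 190) : ((i+j)%2 ? 15 : 5) ;
        }

      /* global sums, stored at the indices critere_gauss_cpu reads */
      int64 stats_img[6] = {0} ;
      for (int i = 0; i < H; i++)
        for (int j = 0; j < L; j++){
          stats_img[4] += img[i][j] ;
          stats_img[5] += (int64)img[i][j]*img[i][j] ;
        }

      /* smaller is better: the true block should beat a shifted guess */
      printf("true    (1,1)-(2,3): %f\n", score_rect(img, 1, 2, 1, 3, stats_img)) ;
      printf("shifted (0,0)-(1,2): %f\n", score_rect(img, 0, 1, 0, 2, stats_img)) ;
      return 0 ;
    }

On this toy input the true rectangle scores about 43 against roughly 106 for the shifted one: the same "smallest criterion wins" ordering the code above applies when it scans h_miniblocs for crit_mini.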