#include <stdio.h>
-
+#include <cfloat>
extern "C"{
#include "structures.h"
//#define DEBUG_LISTES
//#define DEBUG_STATS_REF
-
+double critere_gauss_cpu( int n, int ni, uint64 sxi, uint64 sxi2, int64 *stats_img){
+ int ne = n - ni ;
+ uint64 sxe = stats_img[4] - sxi ;
+ uint64 sxe2= stats_img[5] - sxi2;
+ double sigi2, sige2, critere ;
+
+ /* variance des valeurs des niveaux de gris a l'interieur */
+ sigi2 =
+ ((double)sxi2/ni) -
+ ((double)sxi/ni)*((double)sxi/ni) ;
+
+ /* variance des valeurs des niveaux de gris a l'exterieur */
+ sige2 =
+ ((double)sxe2)/ne -
+ ((double)sxe/ne)*((double)sxe/ne) ;
+
+ if ( (sigi2 > 0) && (sige2 > 0) )
+ critere = 0.5*((double)ni*log(sigi2) + (double)ne*log(sige2)) ;
+ else critere = DBL_MAX ;
+ return critere;
+}
void cuda_init_img_cumul(unsigned short ** img_in, int H, int L, int nb_nodes,
unsigned short ** d_img, t_cumul_x ** d_img_x, t_cumul_x2 ** d_img_x2,
//calcul des sommes totales N, sigX et sigX2 sur l'image
calcul_stats_image<<<1, 1>>>( *d_img_x, *d_img_x2, H, L, (uint64*)*d_stats_snake);
-
-
+
cudaThreadSynchronize() ;
toc(chrono, "\tTemps GPU");
- if(DEBUG_IMG_CUMUL)
- {
-
- //allocation memoire CPU
+ //allocation memoire CPU
t_cumul_x * img_x = new t_cumul_x [H*L];
t_cumul_x2 * img_x2 = new t_cumul_x2 [H*L];
+ uint64 sigX = 0, sigX2 = 0 ;
+ if(DEBUG_IMG_CUMUL)
+ {
/*pour test comparaison*/
t_cumul_x * img_xb = new t_cumul_x [H*L];
}
printf("%d erreurs sur CX / %d points\n", cpt_errx, cpt );
printf("%d erreurs sur CX2 / %d points\n", cpt_errx2, cpt );
- uint64 sigX = 0, sigX2 = 0 ;
+
for (int i=0; i<H; i++)
{
sigX += img_x[i*L+L-1] ;
*/
int dist = 140 ;
- /* Test de determination du snake rectangle initial optimal*/
- int div = 100;//nb de divisions de l'image : cela définit le pas. La valeur max découle du nb max de threads possible ds une grille
- int Nperm = div*div*bs;//nb total de rectangles a tester. La distribution est ainsi irrégulière, mais plus simple.
- double best_crit ;
- int ind_best_crit ;
+ /* Test de determination de la fenetre verticale optimale*/
+ int div = 256;//nb de divisions de l'image : cela définit le pas. La valeur max découle du nb max de threads possible ds une grille
+ tcontribs *d_contribs_part ; // pour les contribs partielles par bloc
+ tcontribs *d_contribs_cols, *h_contribs_cols, *h_contribs_cols_cpu ; // pour les contrib des colonnes
+ double2 *d_miniblocs, *h_miniblocs ; // pour les minima de chaque colonne haute
+
+ //parametres execution
+ bs = div ;
+ int bpc= (H + bs -1)/bs ;
+ dim3 grid = dim3(div, bpc, 1);
+ smem = CFI(bs)*sizeof(tcontribs) ;
- t_rectangle_snake * d_all_crit, d_best_crit;//tableaux pour les résultats des différents rectangles / le meilleur
- t_rectangle_snake * h_all_crit = new t_rectangle_snake[Nperm];//correspondant CPU
-
//allocations
- cudaMalloc((void**) &d_all_crit, Nperm*sizeof(t_rectangle_snake));
- cudaMalloc((void**) &d_best_crit, sizeof(t_rectangle_snake));
+ cudaMalloc((void**) &d_contribs_part, div*bpc*sizeof(tcontribs));
+ cudaMalloc((void**) &d_contribs_cols, div*sizeof(tcontribs)) ;
+ cudaMalloc((void**) &d_miniblocs, div*sizeof(double2)) ;
+ h_contribs_cols = new tcontribs[div] ;
+ h_contribs_cols_cpu = new tcontribs[div] ;
+ h_miniblocs = new double2[div] ;
+
+ int pas = L / div ;
tic(&chrono, NULL);
- //execution kernel
- dim3 grid = dim3(H/div, L/div, 1);
- calcul_contribs_snake4<<<grid, bs, CFI(bs)*sizeof(tcontribs) >>>(*d_snake, *d_img_x, *d_img_x2, H, L, *d_stats_snake, d_all_crit) ;
- cudaThreadSynchronize();
- toc(chrono, "\nCALCULS RECTANGLES");
-
- //recup data rectangles
- int ret;
- ret = cudaMemcpy( h_all_crit, d_all_crit, Nperm*sizeof(t_rectangle_snake), cudaMemcpyDeviceToHost) ;
- printf("COPIE DATA = %s\n",(ret==0)?"OK":"ERR");
-
- //optimum sur CPU
- best_crit = h_all_crit[0].crit ;
- ind_best_crit = 0 ;
- for (int k=1; k<100; k++){
- if ((h_all_crit[k].crit > 0) && (h_all_crit[k].crit < best_crit)) {
- best_crit = h_all_crit[k].crit ;
- ind_best_crit = k ;
+ //execution kernels
+ calcul_contribs_cols<<<grid, bs, smem>>>( *d_img_x, *d_img_x2, H, L, d_contribs_part );
+ somsom_contribs<<<div,1>>>( d_contribs_part, bpc, d_contribs_cols ) ;
+ calcul_contribs_permutations<<<div,div, CFI(div)*sizeof(double2)>>>(d_contribs_cols, bpc, H, L, (uint64*) *d_stats_snake, d_miniblocs) ;
+ cudaMemcpy( h_miniblocs, d_miniblocs, div*sizeof(double2), cudaMemcpyDeviceToHost ) ;
+
+ //verif minimum des blocs
+ double crit_mini = h_miniblocs[0].x;
+ int j1=0, id_mini=0;
+ for (j1=1 ; j1 < div ; j1++){
+ if (h_miniblocs[j1].x < crit_mini) {
+ crit_mini = h_miniblocs[j1].x ;
+ id_mini = j1 ;
+ }
+ }
+ toc(chrono, "\nCALCUL RECTANGLE");
+ j1 = pas * id_mini ;
+ int j2 = (int)(pas*h_miniblocs[ id_mini ].y) ;
+ printf("pas = %d cols -- critere mini =%f , positions j1=%d j2=%d\n", pas, h_miniblocs[ id_mini ].x, j1, j2);
+
+
+ // transfert datas GPU -> CPU
+ cudaMemcpy( h_contribs_cols, d_contribs_cols, div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;
+
+ //verif contribs colonnes
+ for (int c=0 ; c < div ; c++){
+ // calcul valeurs de ref en CPU
+ h_contribs_cols_cpu[c].cx = 0 ;
+ h_contribs_cols_cpu[c].cx2= 0 ;
+ for (int ip=0; ip < H; ip++){
+ h_contribs_cols_cpu[c].cx += img_x[ ip*L + c*pas] ;
+ h_contribs_cols_cpu[c].cx2 += img_x2[ ip*L + c*pas] ;
+ }
+ //comparaison avec valeurs GPU
+ if ( (h_contribs_cols_cpu[c].cx != h_contribs_cols[c].cx) || (h_contribs_cols_cpu[c].cx2 != h_contribs_cols[c].cx2) )
+ printf("ERR colonne %d -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n",
+ c*pas, h_contribs_cols_cpu[c].cx, h_contribs_cols_cpu[c].cx2, h_contribs_cols[c].cx, h_contribs_cols[c].cx2 );
+ }
+ cudaFree(d_contribs_part) ;
+ cudaFree(d_contribs_cols) ;
+ cudaFree(d_miniblocs) ;
+ free(h_contribs_cols);
+ free(h_contribs_cols_cpu);
+ free(h_miniblocs) ;
+
+ //realloc pour lignes horizontales
+ bs = 128 ;
+
+ div = (H+bs-1)/bs ;
+ printf("DIV = %d\n", div ) ;
+
+ int divpow2 = nextPow2(div) ;
+ printf("DIVPOW2 = %d\n", divpow2) ;
+
+ grid = dim3(div, 1, 1) ;
+ smem = CFI(bs)*sizeof(tcontribs) ;
+ cudaMalloc((void**) &d_contribs_part, div*sizeof(tcontribs)) ;
+ cudaMalloc((void**) &d_contribs_cols, div*div*sizeof(tcontribs)) ;
+ cudaMalloc((void**) &d_miniblocs, div*sizeof(double2)) ;
+ h_contribs_cols = new tcontribs[div*div] ;
+ tcontribs * h_contribs_part = new tcontribs[div] ;
+ h_miniblocs = new double2[div] ;
+
+ tic(&chrono, NULL);
+ // Appels kernels optim lignes horizontales
+ calcul_contrib_conjuguee_colonnes<<<grid, bs, CFI(bs)*sizeof(tcontribs) >>>( *d_img_x, *d_img_x2, H, L, j1, j2, d_contribs_part) ;
+
+ /*verif CPU
+ int cpt = 0 ;
+ int cpterr = 0 ;
+ tcontribs * h_contribs_part_cpu = new tcontribs[div] ;
+ cudaMemcpy( h_contribs_part, d_contribs_part, div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;
+ for (int bloc=0; bloc < div; bloc++){
+ h_contribs_part_cpu[ bloc ].cx = 0 ;
+ h_contribs_part_cpu[ bloc ].cx2 = 0 ;
+ for (int line=0; ((line < bs)&&(bloc*bs+line < H)); line++){
+ h_contribs_part_cpu[bloc].cx += img_x[ (bloc*bs+line)*L + j2] - img_x[ (bloc*bs+line)*L + j1 ];
+ h_contribs_part_cpu[bloc].cx2 += img_x2[ (bloc*bs+line)*L + j2] - img_x2[ (bloc*bs+line)*L + j1 ];
+ }
+ if ( ( h_contribs_part_cpu[bloc].cx != h_contribs_part[bloc].cx ) || ( h_contribs_part_cpu[bloc].cx2 != h_contribs_part[bloc].cx2 ) )
+ {
+ printf("ERREUR bloc %d -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n", bloc,
+ h_contribs_part_cpu[bloc].cx, h_contribs_part_cpu[bloc].cx2, h_contribs_part[bloc].cx, h_contribs_part[bloc].cx2 ) ;
+ cpterr++;
+ }
+ cpt++ ;
+ }
+ printf("VERIF CONTRIB CONJUGUEES BLOCS --> %d ERREURS / %d BLOCS\n", cpterr, cpt) ;
+ fin verif*/
+
+ grid = dim3(div, div, 1) ;
+ calcul_contribs_snake_rectangle<<<grid,divpow2, CFI(divpow2)*sizeof(tcontribs) >>>(d_contribs_part, d_contribs_cols) ;
+
+ /* verif CPU
+ h_contribs_cols_cpu = new tcontribs[div*div] ;
+ cudaMemcpy( h_contribs_cols, d_contribs_cols, div*div*sizeof(tcontribs), cudaMemcpyDeviceToHost ) ;
+ cpt = 0 ;
+ cpterr = 0 ;
+ for (int i1=0; i1 < div ; i1++){
+ for (int i2=0 ; i2 < div ; i2++){
+ if (i2 >= i1){
+ h_contribs_cols_cpu[ i1*div +i2 ].cx = 0 ;
+ h_contribs_cols_cpu[ i1*div +i2 ].cx2= 0 ;
+ for (int d=i1 ; d <= i2 ; d++){
+ h_contribs_cols_cpu[ i1*div +i2 ].cx += h_contribs_part_cpu[ d ].cx ;
+ h_contribs_cols_cpu[ i1*div +i2 ].cx2+= h_contribs_part_cpu[ d ].cx2 ;
+ }
+ } else {
+ h_contribs_cols_cpu[ i1*div +i2 ].cx = 0 ;
+ h_contribs_cols_cpu[ i1*div +i2 ].cx2= 0 ;
+ }
+
+ if (( ( h_contribs_cols_cpu[ i1*div +i2 ].cx != h_contribs_cols[ i1*div +i2 ].cx ) || ( h_contribs_cols_cpu[ i1*div +i2].cx2 != h_contribs_cols[ i1*div +i2].cx2 ) )
+ && (i2 >= i1))
+ {
+ printf("ERREUR combinaison (%d, %d) -> CPUx=%lu CPUx2=%lu | GPUx=%lu GPUx2=%lu\n", i1, i2,
+ h_contribs_cols_cpu[ i1*div +i2].cx, h_contribs_cols_cpu[ i1*div +i2 ].cx2, h_contribs_cols[ i1*div +i2 ].cx, h_contribs_cols[ i1*div +i2 ].cx2 ) ;
+ cpterr++;
+ }
+ cpt++ ;
}
- printf("%d -> ( %d, %d )--( %d, %d) CRITERE = %f\n", k, h_all_crit[k].bpi, h_all_crit[k].bpj,
- h_all_crit[k].opi, h_all_crit[k].opj, h_all_crit[k].crit );
}
+ printf("VERIF COMBINAISONS LIGNES --> %d ERREURS / %d COMBINAISONS\n", cpterr, cpt) ;
+ fin verif */
- printf("BEST RECTANGLE/%d tests : %d -> ( %d, %d )--( %d, %d) CRITERE = %f\n", Nperm, ind_best_crit, h_all_crit[ind_best_crit].bpi, h_all_crit[ind_best_crit].bpj,
- h_all_crit[ind_best_crit].opi, h_all_crit[ind_best_crit].opj, best_crit );
- exit(0);
- /*fin test snake rectangle initial optimal*/
-
- //genere_snake_rectangle_4nodes_gpu<<< 1, 1>>>(*d_snake, 140, H, L) ;
+ grid = dim3(div, 1, 1) ;
+ calcul_critere_permutations_verticales<<< grid, divpow2, CFI(divpow2)*sizeof(double2) >>>(d_contribs_cols, bs, j1, j2, H, L, sigX, sigX2, d_miniblocs) ;
+
+ /* verif CPU
+ cpt = 0 ;
+ cpterr = 0 ;
+ double2 * h_miniblocs_cpu = new double2[ div ] ;
+ cudaMemcpy( h_miniblocs, d_miniblocs, div*sizeof(double2), cudaMemcpyDeviceToHost) ;
+ cudaMemcpy( h_stats_snake, *d_stats_snake, 6*sizeof(int64), cudaMemcpyDeviceToHost) ;
+ for (int lb=0 ; lb < div ; lb++){
+ h_miniblocs_cpu[lb].x = DBL_MAX ;
+ for (int lh=lb ; lh < div ; lh++){
+ if ( critere_gauss_cpu(H*L, (lh-lb+1)*bs*(j2 - j1), h_contribs_cols_cpu[ lb*div +lh ].cx, h_contribs_cols_cpu[ lb*div + lh ].cx2, h_stats_snake ) < h_miniblocs_cpu[lb].x )
+ {
+ h_miniblocs_cpu[lb].x = critere_gauss_cpu(H*L, (lh-lb+1)*bs*(j2 - j1), h_contribs_cols_cpu[ lb*div +lh ].cx, h_contribs_cols_cpu[ lb*div + lh ].cx2, h_stats_snake) ;
+ h_miniblocs_cpu[lb].y = (double)lh ;
+ }
+ }
+ if ( ( h_miniblocs_cpu[lb].x > 1.000001*h_miniblocs[lb].x ) || ( h_miniblocs_cpu[lb].x < 0.999999*h_miniblocs[lb].x ) || ( h_miniblocs_cpu[lb].y != h_miniblocs[lb].y ) )
+ {
+ printf("ERREUR MINIMUM BLOC LIGNE %d -> CPU=%lf en i2=%d | GPU=%lf en i2=%d\n", lb,
+ h_miniblocs_cpu[ lb ].x, (int)h_miniblocs_cpu[ lb ].y, h_miniblocs[ lb ].x, (int)h_miniblocs[ lb ].y ) ;
+ cpterr++;
+ }
+ cpt++ ;
+ }
+ printf("VERIF MINIMA PAR BLOC --> %d ERREURS / %d BLOCS\n", cpterr, cpt) ;
+ */
+ /* fin verif */
+
+ /*
+ * determination minimum absolu
+ * a conserver sur CPU
+ */
+ cudaMemcpy( h_miniblocs, d_miniblocs, div*sizeof(double2), cudaMemcpyDeviceToHost) ;
+ crit_mini = h_miniblocs[0].x ;
+ int i1=0 ;
+ id_mini=0 ;
+ for (i1=1 ; i1 < div ; i1++){
+ if (h_miniblocs[i1].x < crit_mini) {
+ crit_mini = h_miniblocs[i1].x ;
+ id_mini = i1 ;
+ }
+ }
+ i1 = bs * id_mini ;
+ int i2 = (int)(bs*h_miniblocs[ id_mini ].y) ;
+ toc(chrono, "CALCUL RECTANGLE");
+
+ printf("pas = %d lignes -- critere mini =%f , positions i1=%d i2=%d\n", bs, h_miniblocs[ id_mini ].x, i1, i2);
+ /*fin test snake rectangle initial optimal*/
+
+ tic(&chrono, NULL);
+ //genere_snake_rectangle_4nodes_gpu<<< 1, 1>>>(*d_snake, 140, H, L) ;
+ //genere_snake_bande_gpu<<<1,1>>>(*d_snake, pas*id_mini, (int)(pas*h_miniblocs[ id_mini ].y), H);
+ genere_snake_rectangle<<<1,1>>>(*d_snake, i1, i2, j1, j2);
+
+
int nnodes = nb_nodes ;
snake_node_gpu * h_snake = new snake_node_gpu[nnodes];
snake_node * h_snake_ll = new snake_node[nnodes] ;