X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/snake_gpu.git/blobdiff_plain/af4b787ce73a80f23e9e2b1ef9ac52660e8ab754..fd872eca52592617f52015c1d4d086d0e24b12e1:/src/lib_kernel_snake_2_gpu.cu

diff --git a/src/lib_kernel_snake_2_gpu.cu b/src/lib_kernel_snake_2_gpu.cu
index 5293308..2002a7e 100644
--- a/src/lib_kernel_snake_2_gpu.cu
+++ b/src/lib_kernel_snake_2_gpu.cu
@@ -26,125 +26,107 @@ __global__ void genere_snake_rectangle_4nodes_gpu(snake_node_gpu * d_snake, int
 		d_snake[i].centre_i = 0;
 		d_snake[i].centre_j = 0;
 		d_snake[i].last_move = 0;
-		d_snake[i].nb_pixels = 123;
+		d_snake[i].nb_pixels = 0;
 		d_snake[i].code_segment = 0;
 		
 	  }
   }
 }
 
-__global__ void genere_diagos_rectangle(uint4 * d_diagos, int h, int l, int q, int * n_diagos){
-  int inci = h/q;
-  int incj = l/q;
-  int iM,jM, iN, jN ;
-  int idxDiago = 0 ;
-  	// boucle double pour les positions du point NO de la diagonale
-	for ( iM = 0; iM < q-1; iM++){
-	  for (jM = 0 ; jM < q-1 ; jM++){
-		//boucle double pour les positions du point SE de la diagonale
-		for (iN = iM+1; iN < q; iN++){
-		  for (jN = jM+1; jN < q; jN++){
-			d_diagos[idxDiago].x = iM*inci;
-			d_diagos[idxDiago].y = jM*incj;
-			d_diagos[idxDiago].z = iN*inci;
-			d_diagos[idxDiago].w = jN*incj;
-			idxDiago++;
-		  }
-		}
-	  }
-	}
-	*n_diagos = --idxDiago ;
-}
-
-__global__ void genere_snake_rectangle_Nnodes_gpu(snake_node_gpu * d_snake, int dist_bords, int i_dim, int j_dim){
-  int nb_node_seg = 9 ;
-  int limite = 64 ;
-  
-  int i , h= i_dim-2*dist_bords, l= j_dim-2*dist_bords ;
-  int inch = h/(nb_node_seg+1), incl= l/(nb_node_seg+1) ;
+__global__ void genere_snake_bande_gpu(snake_node_gpu * d_snake, int j1, int j2, int h){
   if (threadIdx.x == 0){
-	int n = 0;
+	int n = 0 ;
 	/* n0 */
-	d_snake[n].posi = dist_bords ;
-	d_snake[n].posj = dist_bords ;
+	d_snake[n].posi = 0 ;
+	d_snake[n].posj = j1 ;
 	n++ ;
-	/*entre sommet 0 et 1*/
-	i = 0 ;
-	while (i < nb_node_seg)
-	  {
-		if ( (d_snake[n-1].posi + inch)-(i_dim - dist_bords) > limite )
-		  d_snake[n].posi = d_snake[n-1].posi + inch ;
-		else
-		  d_snake[n].posi = d_snake[n-1].posi + inch/2 ;
-		d_snake[n].posj = dist_bords ;
-		d_snake[n-1].nb_pixels =  d_snake[n].posi -  d_snake[n-1].posi ;
-		n++ ; i++ ;
-	  }
 	/* n1 */
-	d_snake[n].posi = i_dim - dist_bords ;
-	d_snake[n].posj = dist_bords ;
-	d_snake[n-1].nb_pixels =  d_snake[n].posi -  d_snake[n-1].posi ;
+	d_snake[n].posi = h-1 ;
+	d_snake[n].posj = j1 ; 
 	n++ ;
-	/*entre S1 et S2*/
-	i = 0 ; 
-	while (i< nb_node_seg)
-	  {
-		if ( (j_dim - dist_bords) - (d_snake[n-1].posj + incl) > limite )
-		  d_snake[n].posj = d_snake[n-1].posj + incl ;
-		else
-		  d_snake[n].posj = d_snake[n-1].posj + incl/2 ;
-		d_snake[n].posi = i_dim - dist_bords ;
-		d_snake[n-1].nb_pixels =  d_snake[n].posj -  d_snake[n-1].posj ;
-		n++ ; i++ ;
-	  }
 	/* n2 */
-	d_snake[n].posi = i_dim - dist_bords ;
-	d_snake[n].posj = j_dim - dist_bords ;
-	d_snake[n-1].nb_pixels =  d_snake[n].posj -  d_snake[n-1].posj ;
+	d_snake[n].posi = h-1 ;
+	d_snake[n].posj = j2 ; 
 	n++ ;
-	/*entre S2 et S3*/
-	i = 0 ;
-	while (i< nb_node_seg)
+	/* n3 */
+	d_snake[n].posi = 0 ;
+	d_snake[n].posj = j2 ;
+
+	for (int i=0; i<4; i++)
 	  {
-		if ( (d_snake[n-1].posi - inch) - dist_bords > limite )
-		  d_snake[n].posi = d_snake[n-1].posi - inch ;
-		else
-		  d_snake[n].posi = d_snake[n-1].posi - inch/2 ;
-		d_snake[n].posj = j_dim - dist_bords ;
-		d_snake[n-1].nb_pixels =  d_snake[n-1].posi -  d_snake[n].posi ;
-		n++ ; i++ ;
+		d_snake[i].freeman_in = 0;
+		d_snake[i].freeman_out = 0;
+		d_snake[i].centre_i = 0;
+		d_snake[i].centre_j = 0;
+		d_snake[i].last_move = 0;
+		d_snake[i].nb_pixels = 0;
+		d_snake[i].code_segment = 0;
+		
 	  }
-	/* n3 */ 
-	d_snake[n].posi = dist_bords ;
-	d_snake[n].posj = j_dim - dist_bords ;
-	d_snake[n-1].nb_pixels =  d_snake[n-1].posi -  d_snake[n].posi ;
+  }
+}
+
+
+__global__ void genere_snake_rectangle(snake_node_gpu * d_snake, int i1, int i2, int j1, int j2){
+  if (threadIdx.x == 0){
+	int n = 0 ;
+	/* n0 */
+	d_snake[n].posi = i1 ;
+	d_snake[n].posj = j1 ;
 	n++ ;
-	/*entre S3 et S0*/
-	i = 0 ;
-	while (i< nb_node_seg)
-	  {
-		if ( (d_snake[n-1].posj - incl) - dist_bords > limite)
-		  d_snake[n].posj = d_snake[n-1].posj - incl ;
-		else
-		  d_snake[n].posj = d_snake[n-1].posj - incl/2 ;
-		d_snake[n].posi = dist_bords ;
-		d_snake[n-1].nb_pixels =  d_snake[n-1].posj -  d_snake[n].posj ;
-		n++ ; i++ ;
-	  }
-	d_snake[n-1].nb_pixels =  d_snake[n-1].posj -  d_snake[0].posj ;
-	for (i=0; i<n; i++)
+	/* n1 */
+	d_snake[n].posi = i2 ;
+	d_snake[n].posj = j1 ; 
+	n++ ;
+	/* n2 */
+	d_snake[n].posi = i2 ;
+	d_snake[n].posj = j2 ; 
+	n++ ;
+	/* n3 */
+	d_snake[n].posi = i1 ;
+	d_snake[n].posj = j2 ;
+
+	for (int i=0; i<4; i++)
 	  {
 		d_snake[i].freeman_in = 0;
 		d_snake[i].freeman_out = 0;
 		d_snake[i].centre_i = 0;
 		d_snake[i].centre_j = 0;
-		d_snake[i].last_move = 1;
+		d_snake[i].last_move = 0;
+		d_snake[i].nb_pixels = 0;
 		d_snake[i].code_segment = 0;
 		
 	  }
   }
 }
 
+
+/*
+  Evalue le critere dans l'hypothese gaussienne
+ */
+__device__ double critere_gauss( int n, int ni, uint64 sxi, uint64 sxi2, uint64 sigX, uint64 sigX2){
+  int ne = n - ni ;
+  uint64 sxe = sigX - sxi ;
+  uint64 sxe2= sigX2 - sxi2;
+  double sigi2, sige2, critere ;
+  
+  /* variance des valeurs des niveaux de gris a l'interieur */
+  sigi2 =
+	((double)sxi2/ni) - 
+	((double)sxi/ni)*((double)sxi/ni) ;
+  
+  /* variance des valeurs des niveaux de gris a l'exterieur */
+  sige2 =
+	((double)sxe2)/ne - 
+	((double)sxe/ne)*((double)sxe/ne) ;
+  
+  if ( (sigi2 > 0) && (sige2 > 0) )
+	critere =  0.5*((double)ni*log(sigi2) + (double)ne*log(sige2)) ;
+  else critere = DBL_MAX ;
+  return critere;
+}
+
+
 __global__ void calcul_contribs_segments_snake(snake_node_gpu * d_snake, int nb_nodes,
 											   t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2, 
 											   int l, uint2 * liste_pix, t_sum_x2 * gsombloc, int * d_table_freeman)
@@ -391,7 +373,7 @@ __device__ double codage_gl_gauss(uint64 stat_sum_1, uint64 stat_sum_x, uint64 s
     ((double)SUM_X2-stat_sum_x2)/(double)ne - 
     ((double)stat_sum_xe/(uint64)ne)*((double)stat_sum_xe/(uint64)ne) ;
   
-  if ((sigi2 > 0)|(sige2 > 0))
+  if ((sigi2 > 0) && (sige2 > 0))
   return  0.5*((double)stat_sum_1*log(sigi2) + (double)ne*log(sige2)) ;
   return -1 ;
 }
@@ -468,46 +450,52 @@ __global__ void calcul_stats_snake(snake_node_gpu * d_snake, int  nnodes, int64
 							   d_stats_snake[3], d_stats_snake[4], d_stats_snake[5]);
 }
 
+// kernel d'init rectangulaire au niveau snake
+// un block par point de base et un point opposÃ© par thread du bloc
 
-__global__ void calcul_contribs_snake4(t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2, int h, int l, tcontribs * gcontribs,
-									   uint64 SUM_1, uint64 SUM_X, uint64 SUM_X2)
+__global__ void calcul_contribs_snake4(snake_node_gpu * snake, t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2, int h, int l,
+									   int64 * sums, t_rectangle_snake * critere)
 {
   // nb de diagonales testees par bloc (ie. par point de base NO)
-  int blockSize = blockDim.x ; 
+  int blockSize = blockDim.x ;
   // indice du second point de chaque diagonale (=Opposite Point, = point SE)
   int OPib = threadIdx.x ;              
   // coordonnees de chaque point de base (NO)
   int BPi = blockIdx.x ;
   int BPj = blockIdx.y ;
   //coordonnees de chaque Opposite Point (SE)
-  int OPi = OPib / (l - BPj) ;
-  int OPj = OPib - (l - BPj)*OPi ;
+  double incThread = ((h-BPi)*(l-BPj) + blockSize -1)/(double)blockSize ;
+  int OPi = (double)(OPib*incThread)/(l - BPj) ;
+  int OPj = OPib*incThread - OPi*(l-BPj) ;
   OPi += BPi ;
   OPj += BPj ;
+  if (OPi >= h) OPi = h-1 ;
+  if (OPj >= l) OPj = l-1 ;
   //indices des pixels dans les images cumulees
   int posG, posD;
   //contrib 1 du snake
   int C1 = (OPi - BPi)*(OPj - BPj) ; 
-
-  
+ 
   //pour stocker contribs de chaque snake d'un block
-  //TODO on peut utiliser une structure restreinte (sans le c1) = gain d'espace
+  //TODO on peut utiliser une structure restreinte (sans le c1)
   extern __shared__ tcontribs scumuls[]; 
-   
+  
   //calcul contribs du snake
+  scumuls[CFI(OPib)].cx = 0;
+  scumuls[CFI(OPib)].cx2 = 0;
   for (int k=BPi ; k < OPi ; k++)
 	{
 	  posG = (BPi+k)*l + BPj ;
 	  posD = posG - BPj + OPj ;
-	  scumuls[CFI(OPib)].cx  += cumul_x[ posD ] - cumul_x[ posG ] ;
-	  scumuls[CFI(OPib)].cx2 += cumul_x2[ posD ] - cumul_x2[ posG ];
+	  scumuls[CFI(OPib)].cx  += (cumul_x[ posD ] - cumul_x[ posG ]) ;
+	  scumuls[CFI(OPib)].cx2 += (cumul_x2[ posD ] - cumul_x2[ posG ]);
   } 
   
   //calcul de critÃ¨re pour chaque snake
   uint64 stat_sum_xe ;  /* somme des xn region exterieure */
   uint32 ne ;           /* nombre de pixel region exterieure */
   double sigi2, sige2;  /* variance region interieure et exterieure */ 
-  double criterion;
+  int index_crit;
   
   /* variance des valeurs des niveaux de gris a l'interieur du snake */
   sigi2 = 
@@ -515,15 +503,530 @@ __global__ void calcul_contribs_snake4(t_cumul_x * cumul_x, t_cumul_x2 * cumul_x
     ((double)scumuls[CFI(OPib)].cx/(uint64)C1)*((double)scumuls[CFI(OPib)].cx/(uint64)C1) ;
 
   /* variance des valeurs des niveaux de gris a l'exterieur du snake */
-  ne = SUM_1 - C1 ;
-  stat_sum_xe = SUM_X - scumuls[CFI(OPib)].cx ;
+  
+  ne = sums[3] - C1 ;
+  stat_sum_xe = sums[4] - scumuls[CFI(OPib)].cx ;
   sige2 =
-    ((double)SUM_X2-scumuls[CFI(OPib)].cx2)/(double)ne - 
-    ((double)stat_sum_xe/(uint64)ne)*((double)stat_sum_xe/(uint64)ne) ;
+    ((double)sums[5]-scumuls[CFI(OPib)].cx2)/(double)ne - 
+    ((double)stat_sum_xe/(uint64)ne)*((double)stat_sum_xe/(uint64)ne) ;																		
+  index_crit = blockSize*(BPi*gridDim.y + BPj) + OPib ;
+
+   critere[ index_crit ].bpi = BPi;
+   critere[ index_crit ].bpj = BPj;
+   critere[ index_crit ].opi = OPi;
+   critere[ index_crit ].opj = OPj;
   
   if ((sigi2 > 0)|(sige2 > 0))
-  criterion = 0.5*((double)C1*log(sigi2) + (double)ne*log(sige2)) ;
+	critere[ index_crit ].crit = 0.5*((double)C1*log(sigi2) + (double)ne*log(sige2)) ;
+  else
+	critere[ index_crit ].crit = -1 ;
+	
+  // identification meilleur snake du bloc
+  // laissÃ© au CPU pour test mais le principe de ce kernel n'est pas efficace.
+    
+}
+__global__ void calcul_contribs_cols(t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2, int h, int l, tcontribs * sums )
+{
+  
+  int bs  = blockDim.x ;       // nb pixels par bloc
+  int nbC = gridDim.x ;        // nb de colonnes traitees
+  int bpc = gridDim.y ;        // nb de blocs par colonne
+  int idC = blockIdx.x ;       // indice de la colonne ds la grille
+  int idB = blockIdx.y ;       // indice du bloc de pixels ds la colonne
+  int tib = threadIdx.x ;      // indice du thread ds le bloc courant        
+
+  int i = idB*bs + tib ;       // ligne du pixel
+  int j = idC * (l/nbC) ;      // colonne du pixel
+  int pos =  i*l + j ;         // index absolu du pixel ds l'image
+
+  extern __shared__ tcontribs scontrib[]; // pour stocker les contribs partielles
+
+  if (i < h)
+	{
+	  scontrib[ CFI(tib) ].cx = cumul_x[ pos ] ;
+	  scontrib[ CFI(tib) ].cx2= cumul_x2[pos ] ;
+	}
+  else
+	{
+	  scontrib[ CFI(tib) ].cx = 0 ;
+	  scontrib[ CFI(tib) ].cx2= 0 ;
+	}
+  __syncthreads();
+
+  // somme des contribs individuelles par bloc
+  // unroll des  sommes partielles en smem
+  if (bs >= 1024) {
+	if (tib < 512) {
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib + 512) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib + 512) ].cx2;
+	}
+	__syncthreads();
+  }
+  
+  if (bs >= 512) {
+	if (tib < 256) {
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib + 256) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib + 256) ].cx2;
+	}
+	__syncthreads();
+  }
+ 
+  if (bs >= 256) {
+	if (tib < 128) {
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib + 128) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib + 128) ].cx2; 
+	}
+	__syncthreads();
+  }
+  if (bs >= 128) {
+	if (tib <  64) {
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib +  64) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib +  64) ].cx2;	  
+	}
+	__syncthreads();
+  }
+  
+  //32 threads <==> 1 warp
+  if (tib < 32)
+	{
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib + 32) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib + 32) ].cx2;
+	}
+  if (tib < 16)	  
+	{
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib + 16) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib + 16) ].cx2;
+	}
+  if (tib < 8)
+	{
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib +  8) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib +  8) ].cx2;
+	}
+  if (tib < 4)
+	{
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib +  4) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib +  4) ].cx2;
+	}
+  if (tib < 2)
+	{
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib +  2) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib +  2) ].cx2;
+	}
+  if (tib < 1)
+	{
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib +  1) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib +  1) ].cx2;
+	}
+  
+  __syncthreads();
+  
+  // resultat contribs partielles ( par bloc) en gmem
+  if (tib == 0) {
+	// c1 s'obtient directement car les sommes sont Ã  j constant ( j=idC )
+	sums[ idC*bpc +idB ].cx = scontrib[0].cx ;            
+	sums[ idC*bpc +idB ].cx2  = scontrib[0].cx2 ;
+  }
+}
+
+/*
+  sommme des contribs par bloc -> contribs colonnes
+  execution sur : (1 bloc de 1 thread) par colonne
+ */
+
+__global__ void somsom_contribs(tcontribs * somblocs, int bpc, tcontribs * somcols){
+
+  tcontribs scontrib ;
+  int col = blockIdx.x ;
+  int base = col*bpc ;
+  
+  //un thread par colonne
+  {
+	scontrib.cx = 0;
+	scontrib.cx2 = 0;
+  }
+
+  for (int b=0; b < bpc ; b++){
+	scontrib.cx += somblocs[ base +b ].cx ;
+	scontrib.cx2 += somblocs[ base +b ].cx2 ;
+  }
+  
+  //totaux en gmem
+  {
+	somcols[ col ].cx  = scontrib.cx ;
+	somcols[ col ].cx2 = scontrib.cx2;
+  }
+  
+}
+
+/*
+  recherche des criteres minis dans les blocs
+ */
+__device__ void mini_critere_bloc(double2 *critere, int bs){
+
+  int tib = threadIdx.x ; 
+
+  critere[ CFI(tib) ].y = tib ; //initialisation des indices pour les minima
+  
+  if ( (bs >= 1024) && (tib < 512) ) {
+	if ( critere[ CFI(tib + 512) ].x < critere[ CFI(tib) ].x ){
+	  critere[ CFI(tib) ] = critere[ CFI(tib + 512) ] ;
+	  __syncthreads() ;
+	} 
+  }
+  
+  if ( (bs >= 512) && (tib < 256) ) {
+	if ( critere[ CFI(tib + 256) ].x < critere[ CFI(tib) ].x ){
+	  critere[ CFI(tib) ] = critere[ CFI(tib + 256) ] ;
+	  __syncthreads() ;
+	}
+  }
   
-  //tri meilleur snake du bloc ( necessite de passer SUM_1, SUM_X et SUM_X2 )
+  if ( (bs >= 256) && (tib < 128) ) {
+	if ( critere[ CFI(tib + 128) ].x < critere[ CFI(tib) ].x ){
+	  critere[ CFI(tib) ] = critere[ CFI(tib + 128) ] ;
+	  __syncthreads() ;
+	}
+  }
+  
+  if ( (bs >= 128) && (tib < 64) ) {
+	if ( critere[ CFI(tib + 64) ].x < critere[ CFI(tib) ].x ){
+	  critere[ CFI(tib) ] = critere[ CFI(tib + 64) ] ;
+	  __syncthreads() ;
+	}
+  }
+  
+  //32 threads <==> 1 warp
+  if (tib < 32){
+	if ( critere[ CFI(tib + 32) ].x < critere[ CFI(tib) ].x ){
+	  critere[ CFI(tib) ] = critere[ CFI(tib + 32) ] ;
+	}
+  }
+  
+  if (tib < 16){
+	if ( critere[ CFI(tib + 16) ].x < critere[ CFI(tib) ].x ){
+	  critere[ CFI(tib) ] = critere[ CFI(tib + 16) ] ;
+	}
+  }
+  
+  if (tib < 8){
+	if ( critere[ CFI(tib + 8) ].x < critere[ CFI(tib) ].x ){
+	  critere[ CFI(tib) ] = critere[ CFI(tib + 8) ] ;
+	}
+  }
+  
+  if (tib < 4){
+	if ( critere[ CFI(tib + 4) ].x < critere[ CFI(tib) ].x ){
+	  critere[ CFI(tib) ] = critere[ CFI(tib + 4) ] ;
+	}
+  }
+  
+  if (tib < 2){
+	if ( critere[ CFI(tib + 2) ].x < critere[ CFI(tib) ].x ){
+	  critere[ CFI(tib) ] = critere[ CFI(tib + 2) ] ;
+	}
+  }
+  
+  if (tib < 1){
+	if ( critere[ CFI(tib + 1) ].x < critere[ CFI(tib) ].x ){
+	  critere[ CFI(tib) ] = critere[ CFI(tib + 1) ] ;
+	}
+  }
+  __syncthreads() ;
+}
+
+
+/*
+  calcule, pour l'ensemble des permutations possibles
+  des colonnes j1 et j2 (j1<J2) :
+  - le critÃ¨re correspondant
+ */
+__global__ void calcul_contribs_permutations(tcontribs * somcols, int bpc, int h, int l, uint64 * stats_img, double2 * miniblocs){
+
+  int bs = blockDim.x ;
+  int j1 = blockIdx.x ;
+  int j2 = threadIdx.x ;
+
+  //shared mem dynamique 'critere' et 'indice' 
+  extern __shared__ double2 critere[] ;
+
+  critere[ CFI(j2) ].y = j2 ; //initialisation des indices pour les minima
+
+  __syncthreads() ;
+  
+  if (j1 < j2){
+	int colj1 = j1*bpc ;
+	int colj2 = j2*bpc ;
+	int ni = h*(colj2 - colj1) ;
+	int ne = h*l - ni ;                                  // h*l ou stats_img[3]
+	uint64 stat_sum_xi = somcols[j2].cx - somcols[j1].cx ;
+	uint64 stat_sum_xi2 = somcols[j2].cx2 - somcols[j1].cx2 ;
+	uint64 stat_sum_xe = stats_img[4] - stat_sum_xi ;    /* somme des xn region exterieure */
+	uint64 stat_sum_xe2 = stats_img[5] - stat_sum_xi2 ;
+    double sigi2, sige2;                                 /* variances regions interieure et exterieure */ 
+
+	/* variance des valeurs des niveaux de gris a l'interieur */
+	sigi2 = 
+	  ((double)stat_sum_xi2/ni) - 
+	  ((double)stat_sum_xi/ni)*((double)stat_sum_xi/ni) ;
+	
+	/* variance des valeurs des niveaux de gris a l'exterieur */
+	sige2 =
+	  ((double)stat_sum_xe2)/ne - 
+	  ((double)stat_sum_xe/ne)*((double)stat_sum_xe/ne) ;
+	
+	if ((sigi2 > 0)|(sige2 > 0))
+	  critere[CFI(j2)].x =  0.5*((double)ni*log(sigi2) + (double)ne*log(sige2)) ;
+	else critere[CFI(j2)].x = DBL_MAX;
+  }
+  else critere[CFI(j2)].x = DBL_MAX ;
+
+  __syncthreads(); // on s'assure que toutes les valeurs des criteres sont calculees avant de chercher le mini
+
+  /*
+  if ( (bs >= 1024) && (j2 < 512) ) {
+	if ( critere[ CFI(j2 + 512) ].x < critere[ CFI(j2) ].x ){
+	  critere[ CFI(j2) ] = critere[ CFI(j2 + 512) ] ;
+	  __syncthreads() ;
+	} 
+  }
+  
+  if ( (bs >= 512) && (j2 < 256) ) {
+	if ( critere[ CFI(j2 + 256) ].x < critere[ CFI(j2) ].x ){
+	  critere[ CFI(j2) ] = critere[ CFI(j2 + 256) ] ;
+	  __syncthreads() ;
+	}
+  }
+  
+  if ( (bs >= 256) && (j2 < 128) ) {
+	if ( critere[ CFI(j2 + 128) ].x < critere[ CFI(j2) ].x ){
+	  critere[ CFI(j2) ] = critere[ CFI(j2 + 128) ] ;
+	  __syncthreads() ;
+	}
+  }
+  
+  if ( (bs >= 128) && (j2 < 64) ) {
+	if ( critere[ CFI(j2 + 64) ].x < critere[ CFI(j2) ].x ){
+	  critere[ CFI(j2) ] = critere[ CFI(j2 + 64) ] ;
+	  __syncthreads() ;
+	}
+  }
+  
+  //32 threads <==> 1 warp
+  if (j2 < 32){
+	if ( critere[ CFI(j2 + 32) ].x < critere[ CFI(j2) ].x ){
+	  critere[ CFI(j2) ] = critere[ CFI(j2 + 32) ] ;
+	}
+  }
+  
+  if (j2 < 16){
+	if ( critere[ CFI(j2 + 16) ].x < critere[ CFI(j2) ].x ){
+	  critere[ CFI(j2) ] = critere[ CFI(j2 + 16) ] ;
+	}
+  }
+  
+  if (j2 < 8){
+	if ( critere[ CFI(j2 + 8) ].x < critere[ CFI(j2) ].x ){
+	  critere[ CFI(j2) ] = critere[ CFI(j2 + 8) ] ;
+	}
+  }
+  
+  if (j2 < 4){
+	if ( critere[ CFI(j2 + 4) ].x < critere[ CFI(j2) ].x ){
+	  critere[ CFI(j2) ] = critere[ CFI(j2 + 4) ] ;
+	}
+  }
+  
+  if (j2 < 2){
+	if ( critere[ CFI(j2 + 2) ].x < critere[ CFI(j2) ].x ){
+	  critere[ CFI(j2) ] = critere[ CFI(j2 + 2) ] ;
+	}
+  }
+  
+  if (j2 < 1){
+	if ( critere[ CFI(j2 + 1) ].x < critere[ CFI(j2) ].x ){
+	  critere[ CFI(j2) ] = critere[ CFI(j2 + 1) ] ;
+	}
+	//critere[0] contient maintenant les params du mini du bloc
+	miniblocs[ j1 ] = critere[0] ;
+	}
+  */
+  mini_critere_bloc(critere, bs) ;
+  if (j2 == 0) miniblocs[ j1 ] = critere[0] ;
+}
+
+
+__device__ void somme_contribs_blocs(tcontribs * scontrib, int bs){
+  int tib = threadIdx.x ;
+  // somme des contribs individuelles par bloc
+  // unroll des  sommes partielles en smem
+  if (bs >= 1024) {
+	if (tib < 512) {
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib + 512) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib + 512) ].cx2;
+	}
+	__syncthreads();
+  }
+  
+  if (bs >= 512) {
+	if (tib < 256) {
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib + 256) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib + 256) ].cx2;
+	}
+	__syncthreads();
+  }
+ 
+  if (bs >= 256) {
+	if (tib < 128) {
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib + 128) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib + 128) ].cx2; 
+	}
+	__syncthreads();
+  }
+  if (bs >= 128) {
+	if (tib <  64) {
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib +  64) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib +  64) ].cx2;	  
+	}
+	__syncthreads();
+  }
+  
+  //32 threads <==> 1 warp
+  if (tib < 32)
+	{
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib + 32) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib + 32) ].cx2;
+	}
+  if (tib < 16)	  
+	{
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib + 16) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib + 16) ].cx2;
+	}
+  if (tib < 8)
+	{
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib +  8) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib +  8) ].cx2;
+	}
+  if (tib < 4)
+	{
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib +  4) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib +  4) ].cx2;
+	}
+  if (tib < 2)
+	{
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib +  2) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib +  2) ].cx2;
+	}
+  if (tib < 1)
+	{
+	  scontrib[ CFI(tib) ].cx += scontrib[ CFI(tib +  1) ].cx;
+	  scontrib[ CFI(tib) ].cx2 += scontrib[CFI(tib +  1) ].cx2;
+	}
+  
+  __syncthreads();
+}
+
+/*
+  effectue la somme (pix par pix) ds un bloc des contribs sur les colonnes j1 et j2 (j1 < j2)
+  Grille de calcul GPU :
+  -> threads = bs selon capacitÃ©s/perfs
+  -> grid = div = (h+bs-1)/bs (h = nb lignes)
+ */
+__global__ void calcul_contrib_conjuguee_colonnes(t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2, int h, int l, int j1, int j2, tcontribs * vect_contrib)
+{
+  int bs = blockDim.x ;
+  int id_bloc = blockIdx.x ;
+  int tib = threadIdx.x ;
+  int idx = id_bloc * bs + tib ;
+
+  extern __shared__ tcontribs sh_contrib[] ;
+  
+  if (idx < h){
+	sh_contrib[CFI(tib)].cx = cumul_x[ idx * l + j2 ] - cumul_x[ idx * l + j1 ] ;
+	sh_contrib[CFI(tib)].cx2= cumul_x2[ idx * l + j2 ] - cumul_x2[ idx * l + j1 ] ;
+  } else {
+	sh_contrib[CFI(tib)].cx =  0 ;
+	sh_contrib[CFI(tib)].cx2=  0 ;
+  }
+  __syncthreads(); // synchro avant de sommer
+
+  /*sommes par bloc*/
+  somme_contribs_blocs(sh_contrib, bs);
+
+  /*enregistrement somme des blocs*/
+  if (tib == 0)	vect_contrib[ id_bloc ] = sh_contrib[0] ;
+  
+}
+
+/*
+  evalue les hauteurs les plus probables des horizontales
+  du rectangle 'englobant' (qui ne l'est pas nÃ©cessairement en fait)
+  Grille de calcul GPU :
+  -> threads = div = nb de blocs de lignes (Cf. calcul_contrib_conjuguee_colonnes() )
+  -> grid = div x div x 1
+*/
+__global__ void calcul_contribs_snake_rectangle(tcontribs * vect_contribs, tcontribs * soms)
+{
+  int bs = blockDim.x ;
+  int div = gridDim.x ;
+  int tib = threadIdx.x ;
+  int i1 = blockIdx.x ;
+  int i2 = blockIdx.y ;
+
+  extern __shared__ tcontribs scontribs[] ;
+  
+  if (i2 >= i1){
+	if ( (tib >= i1) && (tib <= i2) ) {
+	  scontribs[CFI(tib)].cx = vect_contribs[tib].cx ;
+	  scontribs[CFI(tib)].cx2= vect_contribs[tib].cx2;
+	} else {
+	  scontribs[CFI(tib)].cx  = 0;
+	  scontribs[CFI(tib)].cx2 = 0;
+	}
+	__syncthreads() ;
+	/*sommes par bloc*/
+	somme_contribs_blocs(scontribs, bs);
+
+	/*ranger les sommes ds un vecteur en gmem*/
+	if (tib == 0) soms[ i1*div + i2 ] = scontribs[0] ;
+  } else {
+	if (tib == 0) {
+	  soms[ i1*div + i2 ].cx = 0 ;
+	  soms[ i1*div + i2 ].cx = 0 ;
+	}
+  } 
+}
+
+/*
+  Evalue la valeur du critere pour chaque combinaison possible
+  des blocs de lignes dÃ©finis par  calcul_contrib_conjuguee_colonnes()
+  grille de calcul GPU :
+  threads -> nextPow2 (div)
+  grid -> div x 1 x1
+
+  TODO voir a inverser les indices i1/i2
+ */
+
+__global__ void calcul_critere_permutations_verticales(tcontribs *soms, int lpb, int j1, int j2, int h, int l, uint64 sigX, uint64 sigX2, double2 *miniblocs){
+
+  int n = h*l ;
+  int bs = blockDim.x ;
+  int div = gridDim.x ;
+  int i2 = threadIdx.x ;
+  int i1 = blockIdx.x ;
+  
+  extern __shared__ double2 sh_mini[];
+
+  if ( (i2 < div) && (i2 >= i1) )  
+	sh_mini[ CFI(i2) ].x = critere_gauss(n, (i2-i1+1)*lpb*(j2 - j1), soms[ i1*div +i2 ].cx, soms[ i1*div + i2 ].cx2, sigX, sigX2);
+  else sh_mini[ CFI(i2) ].x = DBL_MAX ;
+  __syncthreads() ;
+
+  mini_critere_bloc(sh_mini, bs) ;
+  
+  if (i2==0) {
+	//miniblocs[i1] = sh_mini[0] ;
+	miniblocs[i1].x = sh_mini[0].x ;
+	miniblocs[i1].y = sh_mini[0].y ;
+	}
   
 }