#include <stdio.h>


extern "C"{
#include "structures.h"
#include "lib_math.h"
#include "defines.h"
#include "lib_gpu.h"
#include "lib_snake_2_gpu.h"
}
#include "lib_test_gpu.h"
#include "lib_kernels_cumuls.cu"
#include "lib_kernel_snake_2_gpu.cu"

// Debug switch tested with `if(DEBUG_IMG_CUMUL)` in cuda_init_img_cumul():
// when non-zero, launch parameters are printed and the GPU cumulative images
// are compared against a CPU reference.
#define DEBUG_IMG_CUMUL 1
// Global flag; not referenced in the visible part of this file.
bool DISPLAY_ERR_IMG_CUMUL = true;
// Optional debug sections of cuda_init_img_cumul(), each guarded by the
// matching #ifdef. All are disabled by default.
//#define DEBUG_POSITIONS
//#define DEBUG_MOVE
//#define DEBUG_CRST
//#define DEBUG_MV
//#define DEBUG_SOMSOM
//#define DEBUG_SOMBLOCS
//#define DEBUG_LISTES
//#define DEBUG_STATS_REF


/*
 * Rounds x up to the nearest power of two (a power of two is returned
 * unchanged). Relies on 32-bit unsigned wrap-around: x == 0 and
 * x > 2^31 both yield 0.
 */
inline unsigned int nextPow2( unsigned int x ) {
  unsigned int v = x - 1;
  // Smear the highest set bit into every lower position (shifts 1,2,4,8,16).
  for (unsigned int shift = 1; shift <= 16; shift <<= 1)
    v |= v >> shift;
  return v + 1;
}


/*
 * Prints a diagnostic on stderr if a CUDA runtime error is pending.
 * cudaGetLastError() also clears the error, so each call reports only what
 * happened since the previous call. Execution continues in every case.
 */
static void cuda_init_img_cumul_report(const char * phase)
{
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    fprintf(stderr, "cuda_init_img_cumul: CUDA error during %s: %s\n", phase, cudaGetErrorString(err));
}

/*
 * Initialises the GPU side of the snake segmentation:
 *   1. selects the CUDA device,
 *   2. allocates every device buffer and stores its address in the matching
 *      output parameter,
 *   3. uploads the Freeman / node-coding tables and the image,
 *   4. computes the row-wise cumulative images (sum of x and sum of x^2) on
 *      the GPU and the whole-image statistics,
 *   5. when DEBUG_IMG_CUMUL is non-zero, checks the cumulative images against
 *      a CPU reference,
 *   6. generates the initial snake on the GPU and computes its statistics.
 *
 * Parameters:
 *   img_in    host image, read as img_in[i][j] with i in [0,H) and j in [0,L).
 *             The upload copies H*L elements starting at img_in[0].
 *   H, L      number of rows / columns; both must be > 0.
 *   nb_nodes  number of snake nodes; must be in [2, MAX_NODES]. A generation
 *             kernel is launched only for the values 4 and 40.
 *   d_*       output parameters; each *d_xxx receives a device pointer
 *             returned by cudaMalloc (sizes are visible next to each call).
 *             None of these buffers is released here.
 *
 * On invalid H, L, img_in or nb_nodes the function prints a message on
 * stderr and returns without allocating anything.
 */
void cuda_init_img_cumul(unsigned short ** img_in, int H, int L, int nb_nodes,
						 unsigned short ** d_img, t_cumul_x ** d_img_x, t_cumul_x2 ** d_img_x2,
						 int ** d_freemanDiDj, int ** d_codeNoeud,
						 snake_node_gpu ** d_snake, uint32 ** d_nb_pix_max,
						 uint4 ** d_positions, uint64 ** d_contribs_segments, uint4 ** d_freemans_centres,
						 int ** d_codes_segments, int64 ** d_stats_snake,
						 int64 ** d_stats, int64 ** d_stats_ref, double ** d_vrais, double ** d_vrais_snake,
						 uint2 ** d_liste_pixels, uint64 ** d_contribs_segments_blocs,
						 bool ** d_move
						 )
{
  // Reject inputs that would otherwise end in a division by zero
  // (L == 0 in the slice computation, nb_nodes == 1 in the pixel estimate)
  // or in out-of-bounds accesses to the MAX_NODES-sized buffers.
  if (img_in == NULL || H <= 0 || L <= 0)
    {
      fprintf(stderr, "cuda_init_img_cumul: invalid image (H=%d, L=%d)\n", H, L);
      return;
    }
  if (nb_nodes < 2 || nb_nodes > MAX_NODES)
    {
      fprintf(stderr, "cuda_init_img_cumul: invalid nb_nodes=%d (expected 2..%d)\n", nb_nodes, (int)MAX_NODES);
      return;
    }

  unsigned int taille = H*L;
  timeval chrono;

  /* GPU detection / selection. Done BEFORE any allocation so that every
     buffer is created on the device that will run the kernels. */
  tic(&chrono, NULL) ;
  // Value-initialised so that cudaChooseDevice does not match against
  // uninitialised fields.
  cudaDeviceProp deviceProp = cudaDeviceProp();
  deviceProp.major = 1;
  deviceProp.minor = 0;
  int desiredMinorRevision = 3;
  int dev = 0;
  cudaChooseDevice(&dev, &deviceProp);
  cudaGetDeviceProperties(&deviceProp, dev);
  if(deviceProp.major > 1 || deviceProp.minor >= desiredMinorRevision)
	{
	  printf("Using Device %d: \"%s\"\n", dev, deviceProp.name);
	  cudaSetDevice(dev);
	}
  toc(chrono, "temps acces GPU") ;
  cuda_init_img_cumul_report("device selection");

  // allocation of the buffers in GPU memory
  tic(&chrono, NULL);
  /*
	MAX_PIX 20000
	MAX_NODES 10000
	MAX_LISTE_PIX 10000000
   */
  cudaMalloc( (void**) d_snake, MAX_NODES*sizeof(snake_node_gpu) );

  cudaMalloc( (void**) d_img, taille*sizeof(unsigned short) );
  cudaMalloc( (void**) d_img_x, taille*sizeof(t_cumul_x) );
  cudaMalloc( (void**) d_img_x2, taille*sizeof(t_cumul_x2) );

  cudaMalloc( (void**) d_freemanDiDj, 9*sizeof(int) );
  cudaMalloc( (void**) d_codeNoeud, 64*sizeof(int) );

  cudaMalloc( (void**) d_stats_snake, 6*sizeof(int64)) ;
  cudaMalloc( (void**) d_positions, 8*MAX_NODES*sizeof(uint4)) ;
  cudaMalloc( (void**) d_contribs_segments, 3*16*MAX_NODES*sizeof(uint64)) ;
  cudaMalloc( (void**) d_contribs_segments_blocs, (3*MAX_LISTE_PIX/32)*sizeof(uint64)) ;
  cudaMalloc( (void**) d_freemans_centres, 16*MAX_NODES*sizeof(uint4)) ;
  cudaMalloc( (void**) d_codes_segments, 16*MAX_NODES*sizeof(int)) ;
  cudaMalloc( (void**) d_stats, 3*8*MAX_NODES*sizeof(int64)) ;
  cudaMalloc( (void**) d_stats_ref, 3*MAX_NODES*sizeof(int64)) ;
  cudaMalloc( (void**) d_vrais, 8*MAX_NODES*sizeof(double)) ;
  cudaMalloc( (void**) d_move, MAX_NODES*sizeof(bool)) ;
  cudaMalloc( (void**) d_nb_pix_max, sizeof(uint32)) ;
  cudaMalloc( (void**) d_vrais_snake, sizeof(double)) ;

  cudaMalloc( (void**) d_liste_pixels, 16*5*(MAX_NODES)*sizeof(uint2) );

  // The expression has type size_t; cast so that it matches the %ld conversion.
  printf("TOTAL MEM = %ld octets\n",
		 (long)(2*MAX_NODES*(sizeof(snake_node_gpu)+(8+16)*sizeof(uint4)+3*16*8+16*4+24*8+3*8+8*sizeof(double)+sizeof(bool))
		 +(MAX_LISTE_PIX)*(sizeof(uint2)+1)
		 +taille*(8+sizeof(t_cumul_x)+sizeof(t_cumul_x2))
		  +9*4+64*4+6*8+4+sizeof(double)) );

  int64 * h_stats_snake = new int64[6];

  toc(chrono, "temps alloc mem GPU");
  cuda_init_img_cumul_report("device memory allocation");

  // copy of the Freeman correspondence / coding tables to GPU memory
  tic(&chrono, NULL) ;
  cudaMemcpy( *d_freemanDiDj, CORRESPONDANCE_Di_Dj_FREEMAN , 9*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy( *d_codeNoeud, TABLE_CODAGE , 64*sizeof(unsigned int), cudaMemcpyHostToDevice);
  toc(chrono, "temps transfert tables de codage") ;

  /* transfer of the image to GPU global memory */
  // NOTE(review): copies H*L elements starting at img_in[0], which assumes
  // the rows of img_in are stored contiguously - confirm against the caller.
  tic(&chrono, NULL);
  cudaMemcpy( *d_img, img_in[0], taille*sizeof(unsigned short), cudaMemcpyHostToDevice);
  toc(chrono, "transfert image vers GPU");
  cuda_init_img_cumul_report("host to device transfers");

  // computation of the cumulative images on the GPU
  int blocs_max = 65536 ;            // upper bound on the number of blocks per launch
  int bs = 256 ;                     // arbitrary; observed to be often the optimum
  unsigned int base = 0 ;            // first image row of the current slice
  unsigned int bl_l  = (L+bs-1)/bs ; // blocks needed to cover one row
  unsigned int nb_lines =  blocs_max / bl_l ;
  unsigned int lines ;
  unsigned int tranches = ( 1 + H / nb_lines ) ;
  nb_lines = (H +tranches -1)/ tranches ; // balances the size of the slices
  dim3 threads(bs,1,1);
  int smem = nextPow2(bl_l)*2; // shared memory for the prefix scan of the block sums (step 2)
  // NOTE(review): the two increments below presumably add padding for the scan kernel - confirm.
  smem += smem >> DEC;
  smem += smem >> DEC;
  int smem_size = smem*sizeof(uint64);
  uint64 * d_somblocs = NULL ; // per-block sums of the cumulative values


  if(DEBUG_IMG_CUMUL)
	{
	  printf("--- CALCULS IMAGES CUMULEES+STATS GPU  ----\n");
	  printf("\t%d threads par bloc  -- %u blocs par ligne -- %u tranches -- %u lignes par tranche \n",bs, bl_l, tranches,nb_lines);
	  // size_t expression cast to int to match the %d conversion
	  printf(" Smem totale pour cumuls : %d\n", (int)(CFI(bs)*(sizeof(t_cumul_x)+sizeof(t_cumul_x2))) );
	}
  // Started unconditionally so that it always pairs with the toc() below.
  tic(&chrono, NULL);

  // generic cumulative sums: require 3 steps / 3 kernels
  cudaMalloc( (void**) &d_somblocs, 2*bl_l*nb_lines*sizeof(uint64) );
  cudaFuncSetCacheConfig(calcul_cumuls_gpu, cudaFuncCachePreferShared);
  do
	{
	  // the last slice may hold fewer than nb_lines rows
	  if  ( H-base < nb_lines ) lines = H - base ; else lines = nb_lines ;
	  printf("base = ligne %d -- traitement de %d lignes \n", base, lines) ;
	  dim3 grid(bl_l*lines,1,1) ;
	  calcul_cumuls_gpu<<<grid, threads, CFI(bs)*sizeof(tcumuls)>>>(*d_img, *d_img_x, *d_img_x2, H, L, d_somblocs, bl_l, base, lines) ;
	  scan_somblocs<<<2*lines, nextPow2(bl_l)/2, smem_size>>>(d_somblocs, bl_l) ;
	  add_soms_to_cumuls<<<grid, threads>>>(*d_img_x, *d_img_x2, H, L, d_somblocs, bl_l, base, lines) ;
	  base += lines ;
	}
  while (base < H) ;
  cudaFree(d_somblocs) ;

  // computation of the total sums N, sigX and sigX2 over the image
  calcul_stats_image<<<1, 1>>>( *d_img_x, *d_img_x2, H, L, (uint64*)*d_stats_snake);


  // NOTE: cudaThreadSynchronize() is deprecated in favour of cudaDeviceSynchronize().
  cudaThreadSynchronize()   ;
  toc(chrono, "\tTemps GPU");
  cuda_init_img_cumul_report("cumulative image kernels");

  if(DEBUG_IMG_CUMUL)
	{

	  // CPU memory allocation: reference results ...
	  t_cumul_x  * img_x = new t_cumul_x [H*L];
	  t_cumul_x2 *  img_x2 = new t_cumul_x2 [H*L];

	  /* ... and copies of the GPU results, for the comparison */
	  t_cumul_x * img_xb = new t_cumul_x [H*L];
	  t_cumul_x2 * img_x2b = new t_cumul_x2 [H*L];

	  cudaMemcpy( img_xb, *d_img_x, taille*sizeof(t_cumul_x), cudaMemcpyDeviceToHost);
	  cudaMemcpy( img_x2b, *d_img_x2, taille*sizeof(t_cumul_x2), cudaMemcpyDeviceToHost);

	  // cumulative sums: step 1 only, on CPU (kept for reference, disabled)
	  /*
		for (int i=0; i<H; i++)
		{
			for (int b=0; b<bl_l; b++)
			{
				int offset = b*bs ;
				img_x[i*L+offset] = img_in[i][offset] ;
				img_x2[i*L+offset]= img_in[i][offset]*img_in[i][offset] ;
				for (int p=1; p<bs; p++)
				{
					int j = p+offset ;
					if (j<L)
					{
						img_x[i*L+j] = img_x[i*L+j-1] + img_in[i][j];
						img_x2[i*L+j] = img_x2[i*L+j-1] + img_in[i][j]*img_in[i][j] ;
					}
				}
			}
		}
	  */
	  // complete cumulative sums on CPU (one running sum per row)
	  for (int i=0; i<H; i++)
		{
		  // The square is computed in t_cumul_x2: unsigned short operands are
		  // promoted to int, and 65535*65535 does not fit in a 32-bit int.
		  t_cumul_x2 v = img_in[i][0] ;
		  img_x[i*L+0] = img_in[i][0] ;
		  img_x2[i*L+0]= v*v ;
		  for (int j=1; j<L; j++)
			{
			  v = img_in[i][j] ;
			  img_x[i*L+j]  = img_x[i*L+j-1]  + img_in[i][j] ;
			  img_x2[i*L+j] = img_x2[i*L+j-1] + v*v ;
			}
		}

	  // count the mismatches between CPU and GPU results
	  int cpt = 0;
	  int cpt_errx=0, cpt_errx2 = 0;
	  for (int i=0; i< H; i++){
		for (int j=0; j< L; j++){
		  if ( (img_x[i*L+j] !=  img_xb[i*L+j]) ) cpt_errx++ ;
		  if ( (img_x2[i*L+j] !=  img_x2b[i*L+j]) ) cpt_errx2++ ;
		  //printf("(%d,%d)sxCPU=%lu  sxGPU=%lu -- sx2CPU=%lu  sx2GPU=%lu\n",i,j,img_x[i*L+j], img_xb[i*L+j], img_x2[i*L+j], img_x2b[i*L+j]);
		  cpt++;
		}
	  }
	  printf("%d erreurs sur CX / %d points\n", cpt_errx, cpt );
	  printf("%d erreurs sur CX2 / %d points\n", cpt_errx2, cpt );
	  // image totals = sum of the last cumulative value of every row
	  uint64 sigX = 0, sigX2 = 0 ;
	  for (int i=0; i<H; i++)
		{
		  sigX += img_x[i*L+L-1] ;
		  sigX2+= img_x2[i*L+L-1];
		}
	  printf("STATS IMAGE  N = %d - sigX = %lu - sigX2 = %lu\n",  H*L, sigX, sigX2 );

	  delete[] img_x ;
	  delete[] img_x2 ;
	  delete[] img_xb ;
	  delete[] img_x2b ;
	}

  /*
   * generation of the snake in GPU memory
   */
  int dist = 140 ;

  tic(&chrono, NULL);
  if (nb_nodes == 4)  genere_snake_rectangle_4nodes_gpu<<< 1, 1>>>(*d_snake, dist, H, L) ;
  else if (nb_nodes == 40) genere_snake_rectangle_Nnodes_gpu<<< 1, 1>>>(*d_snake, (H+L)/20, H, L) ;
  else fprintf(stderr, "cuda_init_img_cumul: no snake generator for nb_nodes=%d, *d_snake is left uninitialised\n", nb_nodes);

  int nnodes = nb_nodes ;
  snake_node_gpu * h_snake = new snake_node_gpu[nnodes];
  snake_node * h_snake_ll = new snake_node[nnodes] ;
  uint4 * h_liste_positions = new uint4[nnodes*8];
  double * h_vrais_snake = new double ;
  // initialise the snake statistics
  uint2 * d_liste_temp = NULL ;
  t_sum_x2 * d_sompart = NULL ;
  int tpb, bps, npixmax ;

  // computation of the number of threads per block
  // NOTE(review): npixmax is <= 0 when H+L <= 4*dist; the allocations and
  // launches below then fail (reported on stderr).
  npixmax = 2*(H+L-4*dist)/(nnodes-1) ;
  tpb = nextPow2(npixmax) ;
  if (tpb >= 256) tpb = 256 ;//  /!\ the kernel <<< calcul_contrib...>>> does not support bs>256 because of the shared memory it needs
  if (tpb < 32 ) tpb = 32 ;
  tpb=128 ; // overrides the value computed above
  bps = (npixmax+tpb-1)/tpb ;
  printf("PARAMS EXEC INIT : %d pix max, %d threads/bloc, %d blocs/seg, %d blocs/grille\n", npixmax, tpb, bps, nnodes*bps);

  // temporary device buffers (released at the end of this function)
  cudaMalloc((void**) &d_liste_temp, nnodes*bps*tpb*sizeof(uint2));
  cudaMalloc((void**) &d_sompart, 3*nnodes*bps*sizeof(t_sum_x2));

  // DEBUG: force the intermediate arrays to zero. *d_stats_ref was already
  // allocated above (3*MAX_NODES entries, nnodes <= MAX_NODES); it is zeroed
  // in place instead of being re-allocated through the parameter itself.
  cudaMemset( *d_stats_ref, 0, 3*nnodes*sizeof(int64) ) ;
  cudaMemset( d_sompart, 0, 3*nnodes*bps*sizeof(t_sum_x2) ) ;
  // end of forced zeroing

  calcul_contribs_segments_snake<<< nnodes*bps, tpb, (CFI(tpb))*(3*sizeof(t_sum_x2))>>>
	(*d_snake, nnodes,
	 *d_img_x, *d_img_x2,
	 L, d_liste_temp, d_sompart, *d_freemanDiDj );

  //TODO
  // add a parameter so that this is not called when tpb=1;
  // requires modifying the kernel <<< calcul_contrib...>>> so that it writes directly into d_snake
  // instead of d_sompart
  somsom_snake<<< nnodes , 1 >>>(d_sompart, nnodes, bps, *d_snake);


  calcul_stats_snake<<< 1 , 1 >>>(*d_snake, nnodes, *d_stats_snake, *d_vrais_snake,
								  *d_img_x, *d_img_x2,
								  *d_codeNoeud, L
								  );
  cudaThreadSynchronize() ;
  toc(chrono, "\tTemps") ;
  cuda_init_img_cumul_report("snake initialisation kernels");

  /*
	check of the initial snake statistics
  */
  cudaMemcpy( h_vrais_snake, *d_vrais_snake, sizeof(double), cudaMemcpyDeviceToHost) ;
  cudaMemcpy( h_stats_snake, *d_stats_snake, 6*sizeof(int64), cudaMemcpyDeviceToHost) ;

  printf("STATS SNAKE log vrais=%lf : c1=%lu - cx=%lu - cx2=%lu - N=%lu - SUMX=%lu - SUMX2=%lu\n",
		 *h_vrais_snake,
		 h_stats_snake[0],  h_stats_snake[1],  h_stats_snake[2],
		 h_stats_snake[3],  h_stats_snake[4],  h_stats_snake[5] );

  // NOTE(review): the #ifdef debug sections below use identifiers that do not
  // appear to be declared in this function (img_1, h_liste_pix, nblocs_seg,
  // pairs, n_interval, h_sombloc, h_somsom, h_stats, h_vrais, h_croist,
  // h_move, ...). They are kept as they were and presumably need those
  // declarations restored before being enabled.

  /*
	check of the statistics minus the contributions of the 2 segments attached to each node
  */
#ifdef DEBUG_STATS_REF
  int64 * h_stats_ref = new int64[3*nnodes] ;
  cudaMemcpy( h_stats_ref, *d_stats_ref, 3*nnodes*sizeof(int64), cudaMemcpyDeviceToHost) ;
  cudaMemcpy( h_snake, *d_snake, nnodes*sizeof(snake_node_gpu), cudaMemcpyDeviceToHost) ;


  printf("******* STATS DIMINUEES\n");
  for(int n=0; n<nnodes;n++)
	{
	  int i = h_snake[n].posi, j = h_snake[n].posj ;
	  printf("node %d (%d,%d) : %ld - %ld - %ld - img1= %lu - imgx= %lu - imgx2= %lu \n", n, i, j,
			 h_stats_ref[3*n], h_stats_ref[3*n +1], h_stats_ref[3*n +2],
			 img_1[i][j], img_x[i][j], img_x2[i][j]);
	}
  delete[] h_stats_ref ;
#endif //DEBUG_STATS_REF

  //snake2gpu(d_snake, snake, nb_nodes);
  //gpu2snake(*d_snake, &h_snake_ll, nnodes);


#ifdef DEBUG_POSITIONS
  for (int n=0; n<nnodes; n++)
	{
	  printf("Node %d :\n", n);
	  for (int pos=0; pos<8; pos++)
		{
		  printf("(%d , %d):%d:%d | ", h_liste_positions[8*n + pos].x, h_liste_positions[8*n + pos].y,
				 h_liste_positions[8*n + pos].z, h_liste_positions[8*n + pos].w);
		}
	  printf("\n");
	}
#endif //DEBUG_POSITIONS

// check of the pixel lists for even/odd nodes

#ifdef DEBUG_LISTES
  printf("NOMBRE PIXELS pour LISTE = %d\n", *h_nb_pix_max) ;
  printf("bs = %d - grid = %d - nblocs_seg = %d - npix_max = %d - taille mem = %d\n",
		 bs, grid.x, nblocs_seg, *h_nb_pix_max, taille_mem);

  cudaMemcpy( h_liste_pix, d_liste_pix, taille_mem*sizeof(uint2), cudaMemcpyDeviceToHost ) ;
  cudaMemcpy( h_snake, *d_snake, nnodes*sizeof(snake_node_gpu), cudaMemcpyDeviceToHost );
  uint32 * h_liste_pixels_segment = new uint32[2*(*h_nb_pix_max)] ;
  int idx_n, idx_nprec, idx_nsuiv ;

  printf("********* LISTE PIX  ***********\n");
  printf("bs = %d - grid = %d - nblocs_seg = %d - npix_max = %d - taille mem = %d\n",
		 bs, grid.x, nblocs_seg, *h_nb_pix_max, taille_mem);

  for (int n=0; n<(nnodes/2 + (nnodes%2)*pairs); n++)
	{
	  idx_n = 2*n + !pairs ;
	  if (idx_n == 0) idx_nprec = nnodes - 1;
	  else idx_nprec = idx_n - 1;
	  if (idx_n == nnodes-1) idx_nsuiv = 0;
	  else idx_nsuiv = idx_n + 1 ;

	  for (int pos=0; pos < 8 ; pos++) // check of the segments before the node
		{

		  int nb_pix = calcul_liste_pixel_segment(h_snake[idx_nprec].posi,h_snake[idx_nprec].posj,
												  h_liste_positions[8*idx_n+pos].x, h_liste_positions[8*idx_n+pos].y,
												  h_liste_pixels_segment, 0);

		  for (int pix=0; pix < nb_pix; pix++)
			{

			  if ( (h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].x != h_liste_pixels_segment[2*pix] )
				   || ( h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].y != h_liste_pixels_segment[2*pix+1] ) )
				printf("erreur avant n=%d pix %d/%d segment %d noeuds[ %d-%d-%d ] , CPU (%d,%d) - GPU (%d, %d)\n", n, pix, nb_pix, pos,
					   idx_nprec, idx_n, idx_nsuiv,
					   h_liste_pixels_segment[2*pix], h_liste_pixels_segment[2*pix+1],
					   h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].x,  h_liste_pix[(16*n + pos)*nblocs_seg*bs + pix].y);

			}

		}
	  for (int pos=0; pos < 8 ; pos++) // check of the segments after the node
		{

		  int nb_pix = calcul_liste_pixel_segment(h_liste_positions[8*idx_n+pos].x, h_liste_positions[8*idx_n+pos].y,
												  h_snake[idx_nsuiv].posi,h_snake[idx_nsuiv].posj,
												  h_liste_pixels_segment, 0);

		  for (int pix=0; pix < nb_pix; pix++)
			{

			  if ( (h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].x != h_liste_pixels_segment[2*pix] )
			     || ( h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].y != h_liste_pixels_segment[2*pix+1] ) )
				printf("erreur apres n=%d pix %d/%d segment %d noeuds[ %d-%d-%d ] , CPU (%d,%d) - GPU (%d, %d)\n", n, pix, nb_pix, pos+8,
					   idx_nprec, idx_n, idx_nsuiv,
					   h_liste_pixels_segment[2*pix], h_liste_pixels_segment[2*pix+1],
					   h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].x,  h_liste_pix[(16*n + pos + 8)*nblocs_seg*bs + pix].y);

			}

		}

		}

#endif //DEBUG_LISTES

  /*

	Test of the partial sums 'somblocs' computed by the kernel 'calcul_contribs_segments_blocs_full'

   */

#ifdef DEBUG_SOMBLOCS
  printf("********* SOMMES PARTIELLES  ***********\n");
  printf("bs = %d - grid = %d -  intervalles = %d - nblocs_seg = %d - pairs = %d \n", bs, grid.x, n_interval, nblocs_seg, pairs);
  for (int n=0; n< n_interval; n++)
	{
	  idx_n = 2*n + !pairs ;
	  if (idx_n == 0) idx_nprec = nnodes - 1 ;
	  else idx_nprec = idx_n - 1 ;
	  if (idx_n == nnodes-1) idx_nsuiv = 0 ;
	  else idx_nsuiv = idx_n + 1 ;
	  printf("******** node %d\n", idx_n) ;
	  for(int s=0; s<8; s++)
		{
		  int nb_pix = calcul_liste_pixel_segment(h_snake[idx_nprec].posi,h_snake[idx_nprec].posj,
												  h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
												  h_liste_pixels_segment, 0);
		  for (int b=0; b<nblocs_seg; b++)
			{
			  uint64 c1=0, cx=0, cx2=0 ;
			  int i,j;
			  for (int p=0; p<bs; p++)
				{
				  if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
					{
					  //  /!\ remember to remove, in the kernel, the test
					  // that takes into account pixels lying on the same
					  // line, otherwise the comparisons of the per-column
					  // sums will be wrong
					  i = h_liste_pixels_segment[2*(b*bs + p)] ;
					  j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
					  c1 += img_1[i][j] ;
					  cx += img_x[i][j] ;
					  cx2+= img_x2[i][j];
					}
				}
			  if ( ( c1 != h_sombloc[(16*n + s)*nblocs_seg + b ] ) || ( cx != h_sombloc[(16*n + s)*nblocs_seg + b + grid.x] )
				   ||  ( cx2 != h_sombloc[ (16*n + s)*nblocs_seg + b + 2*grid.x] ) )
				printf("seg %d - %d pix : bloc %d -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix, b,
					   c1, cx, cx2, h_sombloc[(16*n+s)*nblocs_seg + b], h_sombloc[(16*n+s)*nblocs_seg + b + grid.x],
					   h_sombloc[(16*n+s)*nblocs_seg + b + 2*grid.x]) ;
			}

		}
	   for(int s=0; s<8; s++)
		{
		  int nb_pix = calcul_liste_pixel_segment( h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
												  h_snake[idx_nsuiv].posi,h_snake[idx_nsuiv].posj,
												  h_liste_pixels_segment, 0);
		  for (int b=0; b<nblocs_seg; b++)
			{
			  uint64 c1=0, cx=0, cx2=0 ;
			  int i,j;
			  for (int p=0; p<bs; p++)
				{
				  if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
					{
					  //  /!\ remember to remove, in the kernel, the test
					  // that takes into account pixels lying on the same
					  // line, otherwise the comparisons of the per-column
					  // sums will be wrong
					  i = h_liste_pixels_segment[2*(b*bs + p)] ;
					  j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
					  c1 += img_1[i][j] ;
					  cx += img_x[i][j] ;
					  cx2+= img_x2[i][j];
					}
				}
			  if ( ( c1 != h_sombloc[(16*n + s + 8)*nblocs_seg + b ] ) || ( cx != h_sombloc[(16*n + s + 8)*nblocs_seg + b + grid.x] )
				   ||  ( cx2 != h_sombloc[ (16*n + s + 8)*nblocs_seg + b + 2*grid.x] ) )
				printf("seg %d - %d pix : bloc %d -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix, b,
					   c1, cx, cx2, h_sombloc[(16*n+s+8)*nblocs_seg + b], h_sombloc[(16*n+s+8)*nblocs_seg + b + grid.x],
					   h_sombloc[(16*n+s+8)*nblocs_seg + b + 2*grid.x]) ;
			}

		}

	}
#endif //DEBUG_SOMBLOCS


 /*

	Test of the total sums 'somsom' computed by the kernel 'somsom_full'

   */

#ifdef DEBUG_SOMSOM
 printf("********* SOMMES TOTALES  ***********\n");
  printf("bs = %d - grid = %d -  intervalles = %d - nblocs_seg = %d - pairs = %d \n", bs, grid.x, n_interval, nblocs_seg, pairs);
  for (int n=0; n< n_interval; n++)
	{
	  idx_n = 2*n + !pairs ;
	  if (idx_n == 0) idx_nprec = nnodes - 1 ;
	  else idx_nprec = idx_n - 1 ;
	  if (idx_n == nnodes-1) idx_nsuiv = 0 ;
	  else idx_nsuiv = idx_n + 1 ;
	  printf("******** node %d\n", idx_n) ;
	  for(int s=0; s<8; s++)
		{
		  int nb_pix = calcul_liste_pixel_segment(h_snake[idx_nprec].posi,h_snake[idx_nprec].posj,
												  h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
												  h_liste_pixels_segment, 0);
		  uint64 c1=0, cx=0, cx2=0 ;
		  for (int b=0; b<nblocs_seg; b++)
			{
			  int i,j;
			  for (int p=0; p<bs; p++)
				{
				  if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
					{
					  //  /!\ remember to remove, in the kernel, the test
					  // that takes into account pixels lying on the same
					  // line, otherwise the comparisons of the per-column
					  // sums will be wrong
					  i = h_liste_pixels_segment[2*(b*bs + p)] ;
					  j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
					  c1 += img_1[i][j] ;
					  cx += img_x[i][j] ;
					  cx2+= img_x2[i][j];
					}
				}
			}
		  if ( ( c1 != h_somsom[3*(16*n + s)] ) || ( cx != h_somsom[3*(16*n + s) + 1] )
			   ||  ( cx2 != h_somsom[3*(16*n + s) + 2] ) )
				printf("seg %d - %d pix -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix,
					   c1, cx, cx2, h_somsom[3*(16*n + s)], h_somsom[3*(16*n + s) + 1],
					   h_somsom[3*(16*n + s) + 2]) ;

		}

	   for(int s=0; s<8; s++)
		{
		  int nb_pix = calcul_liste_pixel_segment( h_liste_positions[8*idx_n+s].x, h_liste_positions[8*idx_n+s].y,
												  h_snake[idx_nsuiv].posi,h_snake[idx_nsuiv].posj,
												   h_liste_pixels_segment, 0);
		  uint64 c1=0, cx=0, cx2=0 ;
		  for (int b=0; b<nblocs_seg; b++)
			{

			  int i,j;
			  for (int p=0; p<bs; p++)
				{
				  if ( ((b*bs+p) < (nb_pix-1)) && ((b*bs+p)>0) )
					{
					  //  /!\ remember to remove, in the kernel, the test
					  // that takes into account pixels lying on the same
					  // line, otherwise the comparisons of the per-column
					  // sums will be wrong
					  i = h_liste_pixels_segment[2*(b*bs + p)] ;
					  j = h_liste_pixels_segment[2*(b*bs + p) + 1] ;
					  c1 += img_1[i][j] ;
					  cx += img_x[i][j] ;
					  cx2+= img_x2[i][j];
					}
				}
			}
		  if ( ( c1 != h_somsom[3*(16*n + s + 8)]  ) || ( cx != h_somsom[3*(16*n + s + 8) + 1] )
			   ||  ( cx2 != h_somsom[3*(16*n + s + 8) + 2] ) )
			printf("seg %d - %d pix -> CPU : %lu - %lu - %lu \t|| GPU : %lu - %lu - %lu \n", s, nb_pix,
				   c1, cx, cx2, h_somsom[3*(16*n + s + 8)], h_somsom[3*(16*n + s + 8) + 1],
				   h_somsom[3*(16*n + s + 8)  + 2]) ;

		}

	}

#endif


#ifdef DEBUG_MV
  printf("**** STATS - REF : %lf \n", *h_vrais_snake);
  for(int n=0; n<n_interval; n++)
	{
	  for(int p=0; p<8; p++)
		{
		  printf("test %d du node %d : %lu - %lu - %lu - - log_vrais = %lf\n", p, (2*n + !pairs),
				 h_stats[3*(8*n+p)], h_stats[3*(8*n+p)+1], h_stats[3*(8*n+p)+2], h_vrais[8*n+p]);
		}
	}
#endif //DEBUG_MV


#ifdef DEBUG_CRST
  printf("**** CROISEMENTS \n");
  for(int n=0; n<nnodes; n++)
	{
	  printf("test du seg %d : ",  n);
	  if ( h_croist[n] ) printf("CROISEMENT\n"); else printf("\n");
	}
#endif //DEBUG_CRST


#ifdef DEBUG_MOVE
  printf("**** MOUVEMENTS \n");
  for(int n=0; n<nnodes; n++)
	{
	  printf("Node %d : (%s) ",n, (h_move[n])? "yes":"no");
	}
#endif //DEBUG_MOVE

  // Release the temporary device buffers. The kernels using them have
  // completed (synchronisation above); cudaFree(NULL) is a no-op if an
  // allocation failed.
  cudaFree(d_liste_temp) ;
  cudaFree(d_sompart) ;

  // Release the host buffers; arrays created with new[] need delete[].
  delete[] h_liste_positions ;
  delete[] h_snake ;
  delete[] h_snake_ll ;
  delete[] h_stats_snake ;
  delete h_vrais_snake ;

  /*
   * end of the GPU snake generation
   */
}