__global__ void calcul_cumuls_gpu(unsigned short * img, t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2, uint32 h, uint32 l, uint64 * gsomblocs, unsigned int N_BLOCS_LIGNE, unsigned int baseline, unsigned int nb_lines )
{
  int bs = blockDim.x ;
  int offsetSom = N_BLOCS_LIGNE * nb_lines;         //indice du premier element de cumul_x2 dans le tableau
    
  Int tib = threadIdx.x ;                           //indice du thread dans le bloc
  int idx = blockIdx.x*bs + tib ;                   //indice global du thread dans la portion d'image
  int num_ligne = blockIdx.x / N_BLOCS_LIGNE ;      //indice de la ligne du thread dans la portion d'image
  int til = idx - num_ligne*bs*N_BLOCS_LIGNE ;      //indice du thread ds la ligne
  int pos = num_ligne*l + til ;                     //indice du pixel dans la 'tranche' courante
  int pos_img = pos + baseline*l ;                  //indice correspondant dans l'image originale
  int id, stride=1;
  
  extern __shared__ tcumuls sdata[];
  //chargement en smem avec complement de 0  
  id = CFI(tib);
  if (til < l){
     	    sdata[id].x = img[ pos_img ] ;
  	    sdata[id].x2 = img[ pos_img ]*img[ pos_img ] ;
	 } else {
	   sdata[id].x = 0 ;
	   sdata[id].x2 = 0 ;
	}
	/*
	 *prefix sum en smem
	 */
	//passe 1 : construction des sommes
  
	for (int d = bs/2; d > 0; d >>= 1)
	  {
		__syncthreads();
	
		if (tib < d)
		  {
			int i  = __mul24(__mul24(2, stride), tib);
			int ai = i + stride - 1;
			int bi = CFI(ai + stride);

			ai = CFI(ai);
			//bi = CFI(bi);
			
			sdata[bi].x += sdata[ai].x;
			sdata[bi].x2 += sdata[ai].x2;
			
		  }
		stride *= 2;
	  }
  
	//stockage somme du bloc en gmem
	// et 
	//mise a zero dernier element du bloc
	
	__syncthreads();
	
	if (tib == bs-1) {
	  t_cumul_x som_x = sdata[CFI(tib)].x;
	  t_cumul_x2 som_x2 = sdata[CFI(tib)].x2;
	  if (til < l){					//il ne faut pas ecrire de somme dans le dernier bloc, elle serait hors image
	     cumul_x[ pos_img ] = som_x;  
	     cumul_x2[pos_img ] = som_x2;
	     }	     
	  gsomblocs[blockIdx.x] = som_x;
	  gsomblocs[blockIdx.x + offsetSom] = som_x2;  
	  sdata[CFI(tib)].x = 0;
	  sdata[CFI(tib)].x2 = 0;
	}
	
	//Passe 2 : construction des scans
	for (int d = 1; d <= (bs/2); d *= 2)
	  {
		stride >>= 1;
		__syncthreads();

		if (tib < d)
		  {
			int i  = __mul24(__mul24(2, stride), tib);
			int ai = i + stride - 1;
			int bi = CFI(ai + stride);

			ai = CFI(ai);
			//bi = CFI(bi);
			
			t_cumul_x tx = sdata[ai].x;
			t_cumul_x2 tx2 = sdata[ai].x2;
		
			sdata[ai].x = sdata[bi].x;
			sdata[bi].x += tx;

			sdata[ai].x2 = sdata[bi].x2;
			sdata[bi].x2 += tx2;
		  }
	  }
	
	//transfert en gmem des scans
	//il faut décaler à gauche de 1 element
	__syncthreads();
	if ( (til < l) && ( tib < (bs-1) ) ){
		cumul_x[ pos_img ] = sdata[CFI(tib+1)].x;  
		cumul_x2[pos_img ] = sdata[CFI(tib+1)].x2;
	 }
	
}


/*
  kernel pour faire le prefixsum des sommes de blocs des lignes de l'image
  les params d'executions kivonbien sont :
  - threads : la moitié de la nextpow2 du nb de blocs par ligne
  - grid : 2 x le nombre de blocs de sommes dans l'image , soit le nb_blocs_par_ligne x nb_lignes x 2 
  - shared mem : 2 x nextPow2(nb_blocs par ligne) 
*/


__global__ void scan_somblocs(uint64 * g_idata, int n_bl_l){

  extern __shared__  uint64 temp[];
    
  int tib = threadIdx.x;
  int toff = blockIdx.x*n_bl_l;         // offset des positions des sommes partielles dans le tableau g_idata
  int nmax = blockDim.x*2;              //nb max de sommes partielles par ligne

  
  int aai = tib;
  int bbi = tib + blockDim.x;
  
   
  // chargement data en smem
  temp[ CFI(aai) ] = (aai < n_bl_l)? g_idata[ toff + aai ] : 0; 
  temp[ CFI(bbi) ] = (bbi < n_bl_l)? g_idata[ toff + bbi ] : 0; 
    
  int offset = 1;

  // passe 1 de bas en haut
  for (int d = nmax/2; d > 0; d >>= 1)
    {
	  __syncthreads();

	  if (tib < d)      
        {
		  int ai = CFI(offset*(2*tib+1)-1);
		  int bi = CFI(offset*(2*tib+2)-1);

		  temp[bi] += temp[ai];
        }

	  offset *= 2;
    }
  
  // zero le dernier element
  __syncthreads();
  if (tib == 0)
    {
	  int index = CFI(nmax - 1);
	  g_idata[ toff + n_bl_l - 1 ] =  temp[ index ]; //memorisation de la somme du bloc a sa place en gmem
	  temp[index] = 0;
    }   
  
  // passe 2 de haut en bas
  for (int d = 1; d < nmax; d *= 2)
    {
	  offset /= 2;
	  
	  __syncthreads();

	  if (tib < d)
        {
		  int ai = CFI(offset*(2*tib+1)-1);
		  int bi = CFI(offset*(2*tib+2)-1);

		  uint64 t  = temp[ai];
		  temp[ai] = temp[bi];
		  temp[bi] += t; //tester atomicadd 64 bits
		  }
	  
    }
  
  __syncthreads();
  
  // transfert resultats en gmem decales d'un element vers la gauche
  g_idata[ toff + aai ] = temp[ CFI(aai + 1) ]; //pour le demi tableau smem inferieur
  if ( bbi < n_bl_l-1 ) g_idata[ toff + bbi ] = temp[CFI( bbi + 1) ]; //demi tableau smem sup. sauf  dernier=somme.
   
}


__global__ void add_soms_to_cumuls(t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2, uint32 h, uint32 l, uint64 * gsomblocs,
								   unsigned int  N_BLOCS_LIGNE, unsigned int baseline, unsigned int nb_lines){

  int bs = blockDim.x ;
  int offsetSom = N_BLOCS_LIGNE * nb_lines;
  int tib = threadIdx.x ;                           //indice du thread dans le bloc
  int idx = blockIdx.x*blockDim.x + tib ;           //indice global du thread
  int num_ligne = blockIdx.x / N_BLOCS_LIGNE ;      //indice de la ligne du thread dans l'image partielle
  int til = idx - num_ligne*bs*N_BLOCS_LIGNE ;      //indice du thread ds la ligne
  int pos = num_ligne*l + til;
  int pos_img = pos + baseline*l ;

  
  //chargement des valeurs a ajouter en smem 0<-->x et 1<-->x2
  uint64 __shared__ ajout[2];
  
  if ( til >= bs ){
	  ajout[0] = gsomblocs[blockIdx.x -1 ];
	  ajout[1] = gsomblocs[blockIdx.x -1 + offsetSom];
	__syncthreads(); //tester les nouveaux __sync__
	
	//addition par bloc
	if (til < l) {
	  cumul_x[pos_img] += ajout[0];
	  cumul_x2[pos_img]+= ajout[1];
	}
  } 
}

//calcul des SUM_1, SUM_X et SUM_X2 de l'image

__global__ void calcul_stats_image( t_cumul_x * cumul_x, t_cumul_x2 * cumul_x2, uint32 h, uint32 l, uint64 * d_stats_snake){
  uint64 sigX = 0, sigX2 = 0 ;
  for (int i=l-1; i<h*l; i+=l)
	{
	  sigX += cumul_x[i] ;
	  sigX2+= cumul_x2[i];
	}
  d_stats_snake[3] = h*l ;
  d_stats_snake[4] = sigX ;
  d_stats_snake[5] = sigX2;
  
}