X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/lniv_gpu.git/blobdiff_plain/1a59c41c272e22861e759e660a64578b47147d85..HEAD:/main.cu?ds=sidebyside

diff --git a/main.cu b/main.cu
index d9db4f8..63bf259 100644
--- a/main.cu
+++ b/main.cu
@@ -13,127 +13,77 @@
 // lib spec
 #include "defines.h"
 #include "levelines_common.h"
-
 #include "levelines_kernels.cu"
 
 
-__global__ void kernel_debil(unsigned int * ptr1, unsigned int * ptr2, unsigned int L, int val){
-  
-  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
-  unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
-  unsigned int pos = i*L +j ;
+int main(int argc, char **argv){
 
-  ptr2[pos] = val - ptr1[pos] ;
+  // use device with highest Gflops/s
+  cudaSetDevice( 0 );
   
-}
-
-int main(int argc, char **argv){
+  unsigned int timer ;
+  cutilCheckError( cutCreateTimer(&timer) );
+  cutilCheckError( cutResetTimer(timer) );
+  cutilCheckError( cutStartTimer(timer) );
 
+ 
+  //alloc bidon pour anticiper l'initialisation du GPU
+  short * d_bidon ;
+  cutilSafeCall( cudaMalloc( (void**) &d_bidon, sizeof(short))) ;  
 
   //float coef_regul = atof( argv[1] ) ;
 
-  unsigned int timer ;
-  cutilCheckError( cutCreateTimer(&timer) );
-  cutilCheckError( cutResetTimer(timer) );
   /*****************************
    *    CHARGEMENT IMAGE
    *****************************/
   char* image_path = argv[argc-1];
-  char* image_out = "./image_out.pgm" ;
+  unsigned int r = atoi(argv[1]) ;
+  bool seq_out = atoi(argv[2]) ;
+  unsigned int iter , nb_iter = atoi(argv[3]) ;
+  unsigned int  poids = 15 ;
+  char * image_out_base = "./image_out" ;
+  char * pgm_ext = ".pgm" ;
+  char image_out[80] ;
   unsigned int * h_data = NULL ;
   unsigned int * h_data_out = NULL ;
-  unsigned int H, L, size;
-
-  cutilCheckError( cutStartTimer(timer) );
-  cutilCheckError( cutLoadPGMi(image_path, &h_data, &L, &H));
-  cutilCheckError( cutStopTimer(timer) );
+  unsigned int H, L, size, psize ;
   
-  size = H * L * sizeof( unsigned int ); 
-  printf("Loaded  %d x %d = %d pixels from '%s' en %f ms,\n", L, H, size, image_path,  cutGetTimerValue(timer));
-
-
-  //essai alloc mapped
-  /*
-  cutilCheckError( cutResetTimer(timer) );
-  cutilCheckError( cutStartTimer(timer) );
-  unsigned int * h_ptr1, * d_ptr1 ;
-  unsigned int * h_ptr2, * d_ptr2 ;
-  int  h =  ;
-  int l = h ;
-  int mem = h*l*sizeof(unsigned int) ;
-  cutilSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost));
-  cutilCheckError( cutStopTimer(timer) );
-  printf("Temps set flag Mapped : %f ms\n", cutGetTimerValue(timer)) ;
-
-  cutilCheckError( cutStartTimer(timer) );
-  cutilSafeCall(cudaHostAlloc((void **)&h_ptr1, mem, cudaHostAllocMapped));
-  cutilSafeCall(cudaHostAlloc((void **)&h_ptr2, mem, cudaHostAllocMapped));
+  // chargt image
+  cutilCheckError( cutLoadPGMi(image_path, &h_data, &L, &H));
   cutilCheckError( cutStopTimer(timer) );
-  printf("Temps cumul alloc Mapped : %f ms\n", cutGetTimerValue(timer)) ;
   
-  for (int i = 0; i<h*l ; i++) h_ptr1[i] = 200 ;
+  size = H * L * sizeof( unsigned int );
+  psize = (r-1)*PSIZE_I*sizeof(ushort) ;
 
-  cutilCheckError( cutStartTimer(timer) );
-  cutilSafeCall(cudaHostGetDevicePointer((void **)&d_ptr1, (void *)h_ptr1, 0));
-  cutilSafeCall(cudaHostGetDevicePointer((void **)&d_ptr2, (void *)h_ptr2, 0));
-  cutilCheckError( cutStopTimer(timer) );
-  printf("Temps cumul get pointer  Mapped : %f ms\n", cutGetTimerValue(timer)) ;
-  
-  cutilCheckError( cutStartTimer(timer) );
-  dim3 blocks(16,16,1) ;
-  dim3 grid( h / blocks.x, l / blocks.y, 1 ) ;
-  
-  kernel_debil<<< grid, blocks >>>(d_ptr1, d_ptr2, l, 255) ;
+  printf("Longueur des chemins = %d pixels\n", r);
+  printf("Init GPU + Image %s  %d x %d = %d pixels en %f ms,\n", image_path, L, H, size, cutGetTimerValue(timer));
 
-  cutilCheckError( cutStopTimer(timer) );
-  printf("Temps total Mapped : %f ms\n", cutGetTimerValue(timer)) ;
-  
-  char * image_1 = "./image_1.pgm" ;
-  char * image_2 = "./image_2.pgm" ;
-  
-  cutilCheckError( cutSavePGMi(image_1, h_ptr1, l, h) ) ;
-  cutilCheckError( cutSavePGMi(image_2, h_ptr2, l, h) ) ;
-  */
   /*****************************
    *     FIN CHARGEMENT IMAGE
    *****************************/
-  
-  
-    
-  // use device with highest Gflops/s
-  cudaSetDevice( cutGetMaxGflopsDeviceId() );
-  
-  
-  /*
-	cutilSafeCall( cudaMallocArray(&a_Src, &floatTex, imageW, imageH) );
-	cutilSafeCall( cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float)) );
-	cutilSafeCall( cudaThreadSynchronize() );
-	cutilCheckError( cutResetTimer(hTimer) );
-	cutilCheckError( cutStartTimer(hTimer) );
-    
-	cutilSafeCall( cudaThreadSynchronize() );
-	cutilCheckError( cutStopTimer(hTimer) );
-	gpuTime = cutGetTimerValue(hTimer) / (float)iterations;
-  */
 
   cutilCheckError( cutResetTimer(timer) );
   cutilCheckError( cutStartTimer(timer) );
   // allocation mem GPU
   unsigned int * d_directions =NULL ;
   unsigned int * d_lniv, * d_estim = NULL ;
+  ushort * d_paths ;
 
   cutilSafeCall( cudaMalloc( (void**) &d_directions, size)) ;
   cutilSafeCall( cudaMalloc( (void**) &d_lniv, size ) );
   cutilSafeCall( cudaMalloc( (void**) &d_estim, size ) );
+  cutilSafeCall( cudaMalloc( (void**) &d_paths, psize ) );
 
   
   // allocate array and copy image data
   cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);
-  cudaArray * array_img_in, *array_img_estim, *array_img_lniv;
+  //cudaChannelFormatDesc channelDescP = cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSigned);
+  cudaChannelFormatDesc channelDescP = cudaCreateChannelDesc<ushort>();
+  
+  cudaArray * array_img_in, *array_img_estim, *array_img_lniv, *array_paths;
   cutilSafeCall( cudaMallocArray( &array_img_in, &channelDesc, L, H )); 
   cutilSafeCall( cudaMemcpyToArray( array_img_in, 0, 0, h_data, size, cudaMemcpyHostToDevice)) ;
   cutilSafeCall( cudaBindTextureToArray( tex_img_in, array_img_in, channelDesc));
-  cutilCheckError( cutStopTimer(timer) );
   
   cutilSafeCall( cudaMallocArray( &array_img_estim, &channelDesc, L, H )); 
   cutilSafeCall( cudaBindTextureToArray( tex_img_estim, array_img_estim, channelDesc));
@@ -141,28 +91,76 @@ int main(int argc, char **argv){
   cutilSafeCall( cudaMallocArray( &array_img_lniv, &channelDesc, L, H )); 
   cutilSafeCall( cudaBindTextureToArray( tex_img_lniv, array_img_lniv, channelDesc));
 
+  cutilSafeCall( cudaMallocArray( &array_paths, &channelDescP, (r-1), PSIZE_I )); 
+  cutilSafeCall( cudaBindTextureToArray( tex_paths, array_paths, channelDescP));
+
+  cutilCheckError( cutStopTimer(timer) );
   printf("Temps alloc + transferts en Textures : %f ms\n", cutGetTimerValue(timer)) ;
+
   /*****************************
-   * APPELS KERNELS et chronos
+   * GENERATION des CHEMINS
    *****************************/
   cutilCheckError( cutResetTimer(timer) );
   cutilCheckError( cutStartTimer(timer) );
   
-	unsigned int iter , nb_iter = 15 ;
-	unsigned int  poids = 15 ;
-	dim3 dimBlock(8,8,1) ;
-	dim3 dimGrid( H / dimBlock.x, L / dimBlock.y, 1 ) ;
-	unsigned int smem_size = dimBlock.x * dimBlock.y * sizeof(unsigned int) ;
-	// init image estimee avec image_in
+  dim3 dimBlock(1,1,1) ;
+  dim3 dimGrid(1,1,1) ;
+  // calcul des chemins
+  kernel_calcul_paths<<< dimGrid, dimBlock, 0 >>>(d_paths, r);
+  
+  // copie du tableau en texture
+  cutilSafeCall( cudaMemcpyToArray( array_paths, 0, 0, d_paths, psize, cudaMemcpyDeviceToDevice)) ;
+  
+  cutilCheckError( cutStopTimer(timer) );
+  printf("Temps generation chemin + transfert en texture : %f ms\n", cutGetTimerValue(timer)) ;
+  
+  /*****************************
+   * APPELS KERNELS et chronos
+   *****************************/
+	dimBlock = dim3(8,8,1) ;
+	dimGrid = dim3( H / dimBlock.x,L / dimBlock.y, 1 ) ;
+
+	// pour enregistrement image lniv GPU
+	free(h_data_out) ;
+	h_data_out = new unsigned int[H*L] ;
+	
+	//init image estimee avec image_in
+	cutilCheckError( cutResetTimer(timer) );
+	cutilCheckError( cutStartTimer(timer) );
 	kernel_init_estim_from_img_in<<< dimGrid, dimBlock, 0 >>>(d_estim, L, H, 7);
 	
-	printf("Grille : %d x %d de Blocs : %d x %d - Shared mem : %d octets\n", dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y, smem_size) ;
+	// kernel de copie
+	/*
+	kernel_neutre_img2estim<<< dimGrid, dimBlock, 0>>>(d_estim, L, H);
+	*/
+	cudaThreadSynchronize() ;
+	cutilCheckError( cutStopTimer(timer) );
+	printf("Temps kernel init : %f ms\n", cutGetTimerValue(timer)) ;
+	// a remplacer par
+	/*
+	cutilCheckError( cutResetTimer(timer) );
+	cutilCheckError( cutStartTimer(timer) );
+	cutilSafeCall( cudaMemcpyFromArray( d_estim, array_img_in, 0, 0, size, cudaMemcpyDeviceToDevice)) ;
+	cutilCheckError( cutStopTimer(timer) );
+	printf("Temps memcpyFromArray : %f ms\n", cutGetTimerValue(timer)) ;
+	*/
+	printf("Grille : %d x %d de Blocs : %d x %d \n", dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y) ;
+
+	cutilCheckError( cutResetTimer(timer) );
+	cutilCheckError( cutStartTimer(timer) );
 	
 	for ( iter =0 ; iter < nb_iter ; iter++ )
 	  {
 		cutilSafeCall( cudaMemcpyToArray( array_img_estim, 0, 0, d_estim, size, cudaMemcpyDeviceToDevice)) ;
-		kernel_levelines_texture<<< dimGrid, dimBlock, 0 >>>( d_lniv, L, H );
+		//version avec/sans tentative d'utilisation de smem ( pas probante )
+		kernel_levelines_texture<<< dimGrid, dimBlock, 24*(r-1)*sizeof(short) >>>( d_lniv, L, H, r );
 		cutilSafeCall( cudaMemcpyToArray( array_img_lniv, 0, 0, d_lniv, size, cudaMemcpyDeviceToDevice)) ;
+		if (seq_out){
+		  sprintf(image_out, "%s%d%s", image_out_base, iter+1, pgm_ext) ;
+		  printf("chaine : %s\n", image_out);
+		  cutilSafeCall( cudaMemcpy(h_data_out , d_estim, size, cudaMemcpyDeviceToHost) );
+		  cutilCheckError( cutSavePGMi(image_out, h_data_out, L, H) ) ;
+		  }
 		kernel_estim_next_step_texture<<< dimGrid, dimBlock, 0 >>>(d_estim, L, H, poids) ;
 	}
 	
@@ -171,26 +169,95 @@ int main(int argc, char **argv){
 	cutilCheckError( cutStopTimer(timer) );
 	printf("Execution moy par kernel : %f ms\n", cutGetTimerValue(timer)/(float)nb_iter) ;
 	printf("Total pour %d kernels : %f ms\n", nb_iter, cutGetTimerValue(timer)) ;
-
+  
 	/**************************
 	 * VERIFS 
 	 **************************/
-	//trace des lniv sur grille de 'pas x pas'
-	//kernel_trace_levelines<<< dimGrid, dimBlock, 0 >>>(d_data, d_directions, d_data2, L, H, 16, 255) ;
-	//cudaThreadSynchronize();
-	
-	// enregistrement image lniv GPU
-	h_data_out = new unsigned int[H*L] ;
-	if ( h_data_out != NULL)
-	  cutilSafeCall( cudaMemcpy(h_data_out , d_estim, size, cudaMemcpyDeviceToHost) );
-        else
-	  printf("Echec allocation mem CPU\n"); 		
 
-	cutilCheckError( cutSavePGMi(image_out, h_data_out, L, H) ) ;
+	/**************************************************
+	 * recuperation matrice des chemins pour affichage
+	 **************************************************/
+	/*
+	int2 * h_paths = new int2[(r-1)*PSIZE_I] ;
+	cutilSafeCall( cudaMemcpyFromArray(h_paths , array_paths, 0, 0, psize, cudaMemcpyDeviceToHost) );
+	
+	//verif Di
+	printf("matrice  Di");
+	for(int idpath=0; idpath< PSIZE_I; idpath++){b
+	
+	  printf("\n");
+	  for(int idpix=0; idpix< r-1; idpix++){
+		printf(" % d ", h_paths[idpath*(r-1) + idpix].x );
+	  }
+	  printf("\t// %dÂ°", idpath*15) ;
+	}
+	//verif Dj
+	printf("\nmatrice  Dj");
+	for(int idpath=0; idpath< PSIZE_I; idpath++){
+	  printf("\n");
+	  for(int idpix=0; idpix< r-1; idpix++){
+		printf(" % d ", h_paths[idpath*(r-1) + idpix].y);
+	  }
+	  printf("\t// %dÂ°", idpath*15) ;
+	}
+	printf("\n");
+	*/
+	/***************************************************
+	 *   fin verif visuelle matrices des chemins
+	 ***************************************************/
 
-	// calcul lniv CPU
+	/***************************************************
+	 *	 execution sequentielle pour comparaison
+	 *  la comparaison n'est pertinente que
+	 * si d_lniv contient les lniv de l'image se dÃ©part
+	 **************************************************/
+	/*
+	// calcul sequentiel
+	cutilCheckError( cutResetTimer(timer) );
+	cutilCheckError( cutStartTimer(timer) );
 	
+	h_data_out = new unsigned int[H*L] ;
+	int * dout  = new int[H*L] ;
+	for ( iter =0 ; iter < nb_iter ; iter++ ){
+	  for (int i=r; i<= H-r; i++){
+		for (int j=r; j<= L-r; j++){
+		  h_data_out[i*L+j] = lniv4_value(h_data, h_paths, i, j, H, L, &dout[i*L+j], r) ;
+		}
+	  }
+	}
+	cutilCheckError( cutStopTimer(timer) );
+	printf("Execution sequentielle CPU : %f ms\n", cutGetTimerValue(timer)) ;
 	
+	// comparaison
+	unsigned int * h_lniv = new unsigned int[H*L] ;
+	int pos, cpt_err=0, cpt_pix=0 ;
+	cutilSafeCall( cudaMemcpy(h_lniv , d_lniv, size, cudaMemcpyDeviceToHost) );
+	for ( iter =0 ; iter < nb_iter ; iter++ ){
+	  for (int i=r; i<= H-r; i++){
+		for (int j=r; j<= L-r; j++){
+		  pos = i*L + j ;
+		  if ( h_data_out[ pos ] != h_lniv[ pos ] ) {
+			cpt_err++ ;
+			printf(" pixel ( %d , %d ) -> GPU= %d  CPU= %d \n", i, j, h_lniv[pos], h_data_out[pos]);
+		  }
+		  cpt_pix++ ;
+		}
+	  }
+	}
+	printf("TAUX ERREUR GPU/CPU : %d / %d \n", cpt_err, cpt_pix );
+	*/
+	//trace des lniv sur grille de 'pas x pas'
+	//kernel_trace_levelines<<< dimGrid, dimBlock, 0 >>>(d_data, d_directions, d_data2, L, H, 16, 255, r) ;
+	
+	if (!seq_out){
+	  if ( h_data_out != NULL)
+		cutilSafeCall( cudaMemcpy(h_data_out , d_estim, size, cudaMemcpyDeviceToHost) );
+	  else
+		printf("Echec allocation mem CPU\n"); 		
+	  sprintf(image_out, "%s%d%s", image_out_base, iter+1, pgm_ext) ;
+	  cutilCheckError( cutSavePGMi(image_out, h_data_out, L, H) ) ;
+	}
+		
 	// TODO verifier pourquoi les deux lignes suivantes produisent une erreur
 	//cutilExit(argc, argv);
 	//cudaThreadExit();