main.cu

   1 // libs C
   2 #include <stdlib.h>
   3 #include <stdio.h>
   4 #include <string.h>
   5 #include <math.h>
   6
   7 #include "lib_lniv.h"
   8
   9 // libs NV
  10 #include <cuda_runtime.h>
  11 #include <cutil_inline.h>
  12
  13 // lib spec
  14 #include "defines.h"
  15 #include "levelines_common.h"
  16 #include "levelines_kernels.cu"
  17
  18
  19 int main(int argc, char **argv){
  20
  21   // use device with highest Gflops/s
  22   cudaSetDevice( 0 );
  23
  24   unsigned int timer ;
  25   cutilCheckError( cutCreateTimer(&timer) );
  26   cutilCheckError( cutResetTimer(timer) );
  27   cutilCheckError( cutStartTimer(timer) );
  28
  29
  30   //alloc bidon pour anticiper l'initialisation du GPU
  31   short * d_bidon ;
  32   cutilSafeCall( cudaMalloc( (void**) &d_bidon, sizeof(short))) ;
  33
  34   //float coef_regul = atof( argv[1] ) ;
  35
  36   /*****************************
  37    *    CHARGEMENT IMAGE
  38    *****************************/
  39   char* image_path = argv[argc-1];
  40   unsigned int r = atoi(argv[1]) ;
  41   bool seq_out = atoi(argv[2]) ;
  42   unsigned int iter , nb_iter = atoi(argv[3]) ;
  43   unsigned int  poids = 15 ;
  44   char * image_out_base = "./image_out" ;
  45   char * pgm_ext = ".pgm" ;
  46   char image_out[80] ;
  47   unsigned int * h_data = NULL ;
  48   unsigned int * h_data_out = NULL ;
  49   unsigned int H, L, size, psize ;
  50
  51   // chargt image
  52   cutilCheckError( cutLoadPGMi(image_path, &h_data, &L, &H));
  53   cutilCheckError( cutStopTimer(timer) );
  54
  55   size = H * L * sizeof( unsigned int );
  56   psize = (r-1)*PSIZE_I*sizeof(ushort) ;
  57
  58   printf("Longueur des chemins = %d pixels\n", r);
  59   printf("Init GPU + Image %s  %d x %d = %d pixels en %f ms,\n", image_path, L, H, size, cutGetTimerValue(timer));
  60
  61   /*****************************
  62    *     FIN CHARGEMENT IMAGE
  63    *****************************/
  64
  65   cutilCheckError( cutResetTimer(timer) );
  66   cutilCheckError( cutStartTimer(timer) );
  67   // allocation mem GPU
  68   unsigned int * d_directions =NULL ;
  69   unsigned int * d_lniv, * d_estim = NULL ;
  70   ushort * d_paths ;
  71
  72   cutilSafeCall( cudaMalloc( (void**) &d_directions, size)) ;
  73   cutilSafeCall( cudaMalloc( (void**) &d_lniv, size ) );
  74   cutilSafeCall( cudaMalloc( (void**) &d_estim, size ) );
  75   cutilSafeCall( cudaMalloc( (void**) &d_paths, psize ) );
  76
  77
  78   // allocate array and copy image data
  79   cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);
  80   //cudaChannelFormatDesc channelDescP = cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSigned);
  81   cudaChannelFormatDesc channelDescP = cudaCreateChannelDesc<ushort>();
  82
  83   cudaArray * array_img_in, *array_img_estim, *array_img_lniv, *array_paths;
  84   cutilSafeCall( cudaMallocArray( &array_img_in, &channelDesc, L, H ));
  85   cutilSafeCall( cudaMemcpyToArray( array_img_in, 0, 0, h_data, size, cudaMemcpyHostToDevice)) ;
  86   cutilSafeCall( cudaBindTextureToArray( tex_img_in, array_img_in, channelDesc));
  87
  88   cutilSafeCall( cudaMallocArray( &array_img_estim, &channelDesc, L, H ));
  89   cutilSafeCall( cudaBindTextureToArray( tex_img_estim, array_img_estim, channelDesc));
  90
  91   cutilSafeCall( cudaMallocArray( &array_img_lniv, &channelDesc, L, H ));
  92   cutilSafeCall( cudaBindTextureToArray( tex_img_lniv, array_img_lniv, channelDesc));
  93
  94   cutilSafeCall( cudaMallocArray( &array_paths, &channelDescP, (r-1), PSIZE_I ));
  95   cutilSafeCall( cudaBindTextureToArray( tex_paths, array_paths, channelDescP));
  96
  97   cutilCheckError( cutStopTimer(timer) );
  98   printf("Temps alloc + transferts en Textures : %f ms\n", cutGetTimerValue(timer)) ;
  99
 100   /*****************************
 101    * GENERATION des CHEMINS
 102    *****************************/
 103   cutilCheckError( cutResetTimer(timer) );
 104   cutilCheckError( cutStartTimer(timer) );
 105
 106   dim3 dimBlock(1,1,1) ;
 107   dim3 dimGrid(1,1,1) ;
 108   // calcul des chemins
 109   kernel_calcul_paths<<< dimGrid, dimBlock, 0 >>>(d_paths, r);
 110
 111   // copie du tableau en texture
 112   cutilSafeCall( cudaMemcpyToArray( array_paths, 0, 0, d_paths, psize, cudaMemcpyDeviceToDevice)) ;
 113
 114   cutilCheckError( cutStopTimer(timer) );
 115   printf("Temps generation chemin + transfert en texture : %f ms\n", cutGetTimerValue(timer)) ;
 116
 117   /*****************************
 118    * APPELS KERNELS et chronos
 119    *****************************/
 120         dimBlock = dim3(8,8,1) ;
 121         dimGrid = dim3( H / dimBlock.x,L / dimBlock.y, 1 ) ;
 122
 123         // pour enregistrement image lniv GPU
 124         free(h_data_out) ;
 125         h_data_out = new unsigned int[H*L] ;
 126
 127         //init image estimee avec image_in
 128         cutilCheckError( cutResetTimer(timer) );
 129         cutilCheckError( cutStartTimer(timer) );
 130         kernel_init_estim_from_img_in<<< dimGrid, dimBlock, 0 >>>(d_estim, L, H, 7);
 131
 132         // kernel de copie
 133         /*
 134         kernel_neutre_img2estim<<< dimGrid, dimBlock, 0>>>(d_estim, L, H);
 135         */
 136         cudaThreadSynchronize() ;
 137         cutilCheckError( cutStopTimer(timer) );
 138         printf("Temps kernel init : %f ms\n", cutGetTimerValue(timer)) ;
 139         // a remplacer par
 140         /*
 141         cutilCheckError( cutResetTimer(timer) );
 142         cutilCheckError( cutStartTimer(timer) );
 143         cutilSafeCall( cudaMemcpyFromArray( d_estim, array_img_in, 0, 0, size, cudaMemcpyDeviceToDevice)) ;
 144         cutilCheckError( cutStopTimer(timer) );
 145         printf("Temps memcpyFromArray : %f ms\n", cutGetTimerValue(timer)) ;
 146         */
 147         printf("Grille : %d x %d de Blocs : %d x %d \n", dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y) ;
 148
 149         cutilCheckError( cutResetTimer(timer) );
 150         cutilCheckError( cutStartTimer(timer) );
 151
 152         for ( iter =0 ; iter < nb_iter ; iter++ )
 153           {
 154                 cutilSafeCall( cudaMemcpyToArray( array_img_estim, 0, 0, d_estim, size, cudaMemcpyDeviceToDevice)) ;
 155                 //version avec/sans tentative d'utilisation de smem ( pas probante )
 156                 kernel_levelines_texture<<< dimGrid, dimBlock, 24*(r-1)*sizeof(short) >>>( d_lniv, L, H, r );
 157                 cutilSafeCall( cudaMemcpyToArray( array_img_lniv, 0, 0, d_lniv, size, cudaMemcpyDeviceToDevice)) ;
 158                 if (seq_out){
 159                   sprintf(image_out, "%s%d%s", image_out_base, iter+1, pgm_ext) ;
 160                   printf("chaine : %s\n", image_out);
 161                   cutilSafeCall( cudaMemcpy(h_data_out , d_estim, size, cudaMemcpyDeviceToHost) );
 162                   cutilCheckError( cutSavePGMi(image_out, h_data_out, L, H) ) ;
 163                   }
 164                 kernel_estim_next_step_texture<<< dimGrid, dimBlock, 0 >>>(d_estim, L, H, poids) ;
 165         }
 166
 167         cudaThreadSynchronize();
 168
 169         cutilCheckError( cutStopTimer(timer) );
 170         printf("Execution moy par kernel : %f ms\n", cutGetTimerValue(timer)/(float)nb_iter) ;
 171         printf("Total pour %d kernels : %f ms\n", nb_iter, cutGetTimerValue(timer)) ;
 172
 173         /**************************
 174          * VERIFS
 175          **************************/
 176
 177         /**************************************************
 178          * recuperation matrice des chemins pour affichage
 179          **************************************************/
 180         /*
 181         int2 * h_paths = new int2[(r-1)*PSIZE_I] ;
 182         cutilSafeCall( cudaMemcpyFromArray(h_paths , array_paths, 0, 0, psize, cudaMemcpyDeviceToHost) );
 183
 184         //verif Di
 185         printf("matrice  Di");
 186         for(int idpath=0; idpath< PSIZE_I; idpath++){b
 187
 188           printf("\n");
 189           for(int idpix=0; idpix< r-1; idpix++){
 190                 printf(" % d ", h_paths[idpath*(r-1) + idpix].x );
 191           }
 192           printf("\t// %d°", idpath*15) ;
 193         }
 194         //verif Dj
 195         printf("\nmatrice  Dj");
 196         for(int idpath=0; idpath< PSIZE_I; idpath++){
 197           printf("\n");
 198           for(int idpix=0; idpix< r-1; idpix++){
 199                 printf(" % d ", h_paths[idpath*(r-1) + idpix].y);
 200           }
 201           printf("\t// %d°", idpath*15) ;
 202         }
 203         printf("\n");
 204         */
 205         /***************************************************
 206          *   fin verif visuelle matrices des chemins
 207          ***************************************************/
 208
 209         /***************************************************
 210          *       execution sequentielle pour comparaison
 211          *  la comparaison n'est pertinente que
 212          * si d_lniv contient les lniv de l'image se départ
 213          **************************************************/
 214         /*
 215         // calcul sequentiel
 216         cutilCheckError( cutResetTimer(timer) );
 217         cutilCheckError( cutStartTimer(timer) );
 218
 219         h_data_out = new unsigned int[H*L] ;
 220         int * dout  = new int[H*L] ;
 221         for ( iter =0 ; iter < nb_iter ; iter++ ){
 222           for (int i=r; i<= H-r; i++){
 223                 for (int j=r; j<= L-r; j++){
 224                   h_data_out[i*L+j] = lniv4_value(h_data, h_paths, i, j, H, L, &dout[i*L+j], r) ;
 225                 }
 226           }
 227         }
 228         cutilCheckError( cutStopTimer(timer) );
 229         printf("Execution sequentielle CPU : %f ms\n", cutGetTimerValue(timer)) ;
 230
 231         // comparaison
 232         unsigned int * h_lniv = new unsigned int[H*L] ;
 233         int pos, cpt_err=0, cpt_pix=0 ;
 234         cutilSafeCall( cudaMemcpy(h_lniv , d_lniv, size, cudaMemcpyDeviceToHost) );
 235         for ( iter =0 ; iter < nb_iter ; iter++ ){
 236           for (int i=r; i<= H-r; i++){
 237                 for (int j=r; j<= L-r; j++){
 238                   pos = i*L + j ;
 239                   if ( h_data_out[ pos ] != h_lniv[ pos ] ) {
 240                         cpt_err++ ;
 241                         printf(" pixel ( %d , %d ) -> GPU= %d  CPU= %d \n", i, j, h_lniv[pos], h_data_out[pos]);
 242                   }
 243                   cpt_pix++ ;
 244                 }
 245           }
 246         }
 247         printf("TAUX ERREUR GPU/CPU : %d / %d \n", cpt_err, cpt_pix );
 248         */
 249         //trace des lniv sur grille de 'pas x pas'
 250         //kernel_trace_levelines<<< dimGrid, dimBlock, 0 >>>(d_data, d_directions, d_data2, L, H, 16, 255, r) ;
 251
 252         if (!seq_out){
 253           if ( h_data_out != NULL)
 254                 cutilSafeCall( cudaMemcpy(h_data_out , d_estim, size, cudaMemcpyDeviceToHost) );
 255           else
 256                 printf("Echec allocation mem CPU\n");
 257           sprintf(image_out, "%s%d%s", image_out_base, iter+1, pgm_ext) ;
 258           cutilCheckError( cutSavePGMi(image_out, h_data_out, L, H) ) ;
 259         }
 260
 261         // TODO verifier pourquoi les deux lignes suivantes produisent une erreur
 262         //cutilExit(argc, argv);
 263         //cudaThreadExit();
 264         return EXIT_SUCCESS ;
 265 }