main_gmem.cu

   1 // libs C
   2 #include <stdlib.h>
   3 #include <stdio.h>
   4 #include <string.h>
   5 #include <math.h>
   6
   7 #include "lib_lniv.h"
   8
   9 // libs NV
  10 #include <cuda_runtime.h>
  11 #include <cutil_inline.h>
  12
  13 // lib spec
  14 #include "defines.h"
  15 #include "levelines_common.h"
  16
  17 #include "levelines_kernels.cu"
  18
  19
  20 int main(int argc, char **argv){
  21
  22
  23   //float coef_regul = atof( argv[1] ) ;
  24
  25   unsigned int timer ;
  26   float time_cumul = 0.0 ;
  27   cutilCheckError( cutCreateTimer(&timer) );
  28   cutilCheckError( cutResetTimer(timer) );
  29
  30   /*****************************
  31    *    CHARGEMENT IMAGE
  32    *****************************/
  33   char* image_path = argv[argc-1];
  34   char* image_out = "./image_out.pgm" ;
  35   unsigned int * h_data = NULL ;
  36   unsigned int * h_data_out = NULL ;
  37   unsigned int H, L, size;
  38
  39   cutilCheckError( cutStartTimer(timer) );
  40   cutilCheckError( cutLoadPGMi(image_path, &h_data, &L, &H));
  41   cutilCheckError( cutStopTimer(timer) );
  42
  43   size = H * L * sizeof( unsigned int );
  44   printf("Loaded  %d x %d = %d pixels from '%s' en %f ms,\n", L, H, size, image_path,  cutGetTimerValue(timer));
  45   time_cumul += cutGetTimerValue(timer) ;
  46   /*****************************
  47    *     FIN CHARGEMENT IMAGE
  48    *****************************/
  49
  50
  51
  52   // use device with highest Gflops/s
  53   cudaSetDevice( cutGetMaxGflopsDeviceId() );
  54
  55   /*
  56         cutilSafeCall( cudaMallocArray(&a_Src, &floatTex, imageW, imageH) );
  57         cutilSafeCall( cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float)) );
  58         cutilSafeCall( cudaThreadSynchronize() );
  59         cutilCheckError( cutResetTimer(hTimer) );
  60         cutilCheckError( cutStartTimer(hTimer) );
  61
  62         cutilSafeCall( cudaThreadSynchronize() );
  63         cutilCheckError( cutStopTimer(hTimer) );
  64         gpuTime = cutGetTimerValue(hTimer) / (float)iterations;
  65   */
  66
  67   cutilCheckError( cutResetTimer(timer) );
  68   cutilCheckError( cutStartTimer(timer) );
  69   // allocation mem GPU
  70   unsigned int * d_directions =NULL ;
  71   unsigned int * d_lniv, * d_estim, * d_data ;
  72   cutilSafeCall( cudaMalloc( (void**) &d_directions, size)) ;
  73   cutilSafeCall( cudaMalloc( (void**) &d_lniv, size ) );
  74   cutilSafeCall( cudaMalloc( (void**) &d_estim, size ) );
  75   cutilSafeCall( cudaMalloc( (void**) &d_data, size ) );
  76   cutilCheckError( cutStopTimer(timer) );
  77   printf("Temps alloc global mem : %f ms\n", cutGetTimerValue(timer)) ;
  78   time_cumul += cutGetTimerValue(timer) ;
  79
  80   // transfert data -> GPU global mem
  81   cutilCheckError( cutStartTimer(timer) );
  82   cutilSafeCall( cudaMemcpy( d_data , h_data, size, cudaMemcpyHostToDevice) );
  83   cutilCheckError( cutStopTimer(timer) );
  84   printf("Temps transferts en global mem : %f ms\n", cutGetTimerValue(timer)) ;
  85   time_cumul += cutGetTimerValue(timer) ;
  86   /*****************************
  87    * APPELS KERNELS et chronos
  88    *****************************/
  89   cutilCheckError( cutResetTimer(timer) );
  90   cutilCheckError( cutStartTimer(timer) );
  91
  92         unsigned int iter , nb_iter = 15 ;
  93         unsigned int  poids = 15 ;
  94         dim3 dimBlock(8,8,1) ;
  95         dim3 dimGrid( H / dimBlock.x, L / dimBlock.y, 1 ) ;
  96         unsigned int smem_size = dimBlock.x * dimBlock.y * sizeof(unsigned int) ;
  97         // init image estimee avec image_in
  98         kernel_init_estim_from_img_in_global_mem<<< dimGrid, dimBlock, 0 >>>(d_data, d_estim, L, H, 7);
  99         cutilCheckError( cutStopTimer(timer) );
 100         printf("Execution moy par kernel : %f ms\n", cutGetTimerValue(timer)) ;
 101         time_cumul += cutGetTimerValue(timer) ;
 102
 103
 104         // iterations
 105         cutilCheckError( cutStartTimer(timer) );
 106         printf("Grille : %d x %d de Blocs : %d x %d - Shared mem : %d octets\n", dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y, smem_size) ;
 107         for ( iter =0 ; iter < nb_iter ; iter++ )
 108           {
 109                 kernel_levelines_global_mem<<< dimGrid, dimBlock, 0 >>>( d_estim, d_lniv, L, H );
 110                 kernel_estim_next_step_global_mem<<< dimGrid, dimBlock, 0 >>>(d_estim, d_lniv, d_data, L, H, poids) ;
 111         }
 112         cutilCheckError( cutStopTimer(timer) );
 113         printf("Execution moy par kernel : %f ms\n", cutGetTimerValue(timer)) ;
 114         time_cumul += cutGetTimerValue(timer) ;
 115
 116         printf("Execution moy par kernel : %f ms\n", cutGetTimerValue(timer)/(float)nb_iter) ;
 117         printf("Total pour %d kernels : %f ms\n", nb_iter, cutGetTimerValue(timer)) ;
 118         printf("Total execution : %f ms\n", time_cumul) ;
 119
 120         /**************************
 121          * VERIFS
 122          **************************/
 123         //trace des lniv sur grille de 'pas x pas'
 124         //kernel_trace_levelines<<< dimGrid, dimBlock, 0 >>>(d_data, d_directions, d_data2, L, H, 16, 255) ;
 125         //cudaThreadSynchronize();
 126
 127         // enregistrement image lniv GPU
 128         h_data_out = new unsigned int[H*L] ;
 129         if ( h_data_out != NULL)
 130           cutilSafeCall( cudaMemcpy(h_data_out , d_estim, size, cudaMemcpyDeviceToHost) );
 131         else
 132           printf("Echec allocation mem CPU\n");
 133
 134         cutilCheckError( cutSavePGMi(image_out, h_data_out, L, H) ) ;
 135
 136         // calcul lniv CPU
 137
 138
 139         // TODO verifier pourquoi les deux lignes suivantes produisent une erreur
 140         //cutilExit(argc, argv);
 141         //cudaThreadExit();
 142         return EXIT_SUCCESS ;
 143 }