]> AND Private Git Repository - lniv_gpu.git/commitdiff
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
version opérationnelle master
authorGilles Perrot <gilles.perrot@univ-fcomte.fr>
Tue, 6 Sep 2011 11:49:31 +0000 (13:49 +0200)
committerGilles Perrot <gilles.perrot@univ-fcomte.fr>
Tue, 6 Sep 2011 11:49:31 +0000 (13:49 +0200)
chemins de longueur variable mais rectilignes tous les 15 degres

Makefile
defines.h
image_out16.pgm
image_out6.pgm [new file with mode: 0644]
levelines_kernels.cu
lniv.cvp
lniv_Session17_Context_0.csv [new file with mode: 0644]
lniv_smemPaths_Context_0.csv [new file with mode: 0644]
main.cu
obj/release/main.cu.o

index 20361caf68a025b383e906c6aaa496ef1311d5f4..7c8c11f2730c8b1ddffb037a8d4ca2fd683fa2ec 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -47,3 +47,6 @@ CCFILES               :=
 # Rules and targets
 
 include ../../common/common.mk
+
+zul:
+       echo $(NVCCFLAGS)
\ No newline at end of file
index cddf7c62e8e2aff2765545854990a815c4e191dd..70fcaa8c14adecf2d436236bbd2aba74214919d0 100644 (file)
--- a/defines.h
+++ b/defines.h
@@ -16,8 +16,8 @@
 #define BSMAX 512
 #define MAX(x,y) ( ( (x)>=(y) )?(x):(y) )
 #define ABS(x) ( ((x)>0)?(x):-(x))
-#define DEC 4
-#define DEC2 8 
+#define DEC 3
+#define DEC2 6 
 #define CONFLICT_FREE_OFFSET(index)  ( ((index) >>(DEC)) + ((index) >>(DEC2) ) )
 #define CFO(index)  ( ( (index) >>(DEC) ) + ( (index) >>(DEC2) ) ) 
 #define CFI(index)  ( (index) + (CFO(index)) )
index 2cbfd03ebf66f20d36432e03834e159c1c7a19f1..ebca7c03587f452770d4b020e3e2293b6e0d0826 100644 (file)
Binary files a/image_out16.pgm and b/image_out16.pgm differ
diff --git a/image_out6.pgm b/image_out6.pgm
new file mode 100644 (file)
index 0000000..44ea1aa
Binary files /dev/null and b/image_out6.pgm differ
index 6838e8322a7e2e786910c26cbaa0a5132f5ccd09..6d616c66c3c1f11e7810da00329c8d9c2095e5e4 100644 (file)
@@ -78,7 +78,7 @@ __constant__ float tangente[] = {0.000, 0.268, 0.577, 1.000} ;
 texture<int, 2, cudaReadModeElementType> tex_img_in ;
 texture<int, 2, cudaReadModeElementType> tex_img_estim ;
 texture<int, 2, cudaReadModeElementType> tex_img_lniv ;
-texture<int2, 2, cudaReadModeElementType> tex_paths ;
+texture<ushort, 2, cudaReadModeElementType> tex_paths ;
 
 
 
@@ -95,20 +95,24 @@ texture<int2, 2, cudaReadModeElementType> tex_paths ;
  * considérés pour le calcul de chemins (float tangente[]).
  * 
  */
-__global__ void kernel_calcul_paths( int2 * d_paths, unsigned int r){
+__global__ void kernel_calcul_paths( ushort * d_paths, unsigned int r){
 
   unsigned int idpath = 0 ;
   int ic, jc, iprec, jprec ;
   float offset = 0.5 ;
   unsigned int basepath = 0 ;
+  char MSQ, LSQ ;
 
   // Q1 inf
   for (int a=0 ; a< 4 ; a++){        // les 4 angles 0,15,30 et 45
        for (int p=0 ; p< r ; p++){      // les r points
          ic = r-1 - floor(tangente[a]*p + offset) ;
          if ( p > 0 ){
-               d_paths[idpath*(r-1)+p-1].x = ic - iprec ;
-               d_paths[idpath*(r-1)+p-1].y = 1 ;
+               MSQ = ic - iprec ;
+               LSQ = 1 ;
+               //d_paths[idpath*(r-1)+p-1].x = ic - iprec ;
+               //d_paths[idpath*(r-1)+p-1].y = 1 ;
+               d_paths[idpath*(r-1)+p-1] = ((short)MSQ << 8) | LSQ ;
          }
          iprec = ic ;
        }
@@ -119,8 +123,11 @@ __global__ void kernel_calcul_paths( int2 * d_paths, unsigned int r){
        for (int p=0 ; p< r ; p++){      // les r points
          jc = floor(tangente[a]*p + offset) ; 
          if ( p > 0 ){
-               d_paths[idpath*(r-1)+p-1].x = -1 ;
-               d_paths[idpath*(r-1)+p-1].y = jc - jprec ;
+               MSQ = -1 ;
+               LSQ = jc - jprec ;
+               d_paths[idpath*(r-1)+p-1] = ((short)MSQ << 8) | LSQ ;
+               //d_paths[idpath*(r-1)+p-1].x = -1 ;
+               //d_paths[idpath*(r-1)+p-1].y = jc - jprec ;
          }
          jprec = jc ;
        }
@@ -131,8 +138,11 @@ __global__ void kernel_calcul_paths( int2 * d_paths, unsigned int r){
   basepath += 6 ;
   for (int a=0 ; a< 6 ; a++){         // les 6 angles 90,105,120,135,150,165
        for (int p=0 ; p<r-1 ; p++){      // les r points
-         d_paths[idpath*(r-1)+p].x = -d_paths[(idpath - basepath)*(r-1)+p].y ;
-         d_paths[idpath*(r-1)+p].y =  d_paths[(idpath - basepath)*(r-1)+p].x ;
+         MSQ = - ( d_paths[(idpath - basepath)*(r-1)+p] & 0x00FF ) ;
+         LSQ = ( d_paths[(idpath - basepath)*(r-1)+p] >> 8 ) ;
+         d_paths[idpath*(r-1)+p-1] = ((short)MSQ << 8) | LSQ ;
+         //d_paths[idpath*(r-1)+p].x = -d_paths[(idpath - basepath)*(r-1)+p].y ;
+         //d_paths[idpath*(r-1)+p].y =  d_paths[(idpath - basepath)*(r-1)+p].x ;
          }
        idpath++ ;
   }
@@ -141,8 +151,11 @@ __global__ void kernel_calcul_paths( int2 * d_paths, unsigned int r){
   basepath += 6 ;
   for (int a=0 ; a< 6 ; a++){         // les 6 angles 180,195,210,225,240,255
        for (int p=0 ; p<r-1 ; p++){      // les r points
-         d_paths[idpath*(r-1)+p].x = -d_paths[(idpath - basepath)*(r-1)+p].x ;
-         d_paths[idpath*(r-1)+p].y = -d_paths[(idpath - basepath)*(r-1)+p].y ;
+         MSQ = - ( d_paths[(idpath - basepath)*(r-1)+p] >> 8 ) ;
+         LSQ = - ( d_paths[(idpath - basepath)*(r-1)+p] & 0x00FF ) ;
+         d_paths[idpath*(r-1)+p-1] = ((short)MSQ << 8) | LSQ ;
+         //d_paths[idpath*(r-1)+p].x = -d_paths[(idpath - basepath)*(r-1)+p].x ;
+         //d_paths[idpath*(r-1)+p].y = -d_paths[(idpath - basepath)*(r-1)+p].y ;
          }
        idpath++ ;
   }
@@ -151,11 +164,15 @@ __global__ void kernel_calcul_paths( int2 * d_paths, unsigned int r){
   basepath += 6 ;
   for (int a=0 ; a< 6 ; a++){         // les 6 angles 270,285,300,315,330,345
        for (int p=0 ; p<r-1 ; p++){      // les r points
-         d_paths[idpath*(r-1)+p].x =  d_paths[(idpath - basepath)*(r-1)+p].y ;
-         d_paths[idpath*(r-1)+p].y = -d_paths[(idpath - basepath)*(r-1)+p].x ;
+         MSQ = d_paths[(idpath - basepath)*(r-1)+p] & 0x00FF ;
+         LSQ = - ( d_paths[(idpath - basepath)*(r-1)+p] >> 8 ) ;
+         d_paths[idpath*(r-1)+p-1] = ((short)MSQ << 8) | LSQ ;
+         //d_paths[idpath*(r-1)+p].x =  d_paths[(idpath - basepath)*(r-1)+p].y ;
+         //d_paths[idpath*(r-1)+p].y = -d_paths[(idpath - basepath)*(r-1)+p].x ;
          }
        idpath++ ;
   }
+
 }
 
 /**
@@ -368,12 +385,15 @@ __global__ void kernel_estim_next_step_global_mem(unsigned int * d_estim, unsign
  * Execution sur des blocs de threads 2D et une grille 2D
  * selon les dimensions de l'image.
  * L'image d'entrée doit être au préalable en mémoire texture pointée par "tex_img_estim".
- * Les matrices des chemins sont, elles, pointées par "tex_paths"
+ * Les matrices des chemins sont, elles, préalablement chargées en SHMEM depuis la texture"
  * Cette version ne fournit pas les indices des chemins pour les tracé éventuel des lniv.
  */
 __global__ void kernel_levelines_texture(unsigned int * img_out, unsigned int L, unsigned int H, unsigned int r)
 {
-  // coordonnes du point dans l'image
+  // coordonnees du point dans le bloc
+  unsigned int iib = threadIdx.x ;
+  unsigned int jib = threadIdx.y ;
+  // coordonnees du point dans l'image
   unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
   unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
     
@@ -383,7 +403,18 @@ __global__ void kernel_levelines_texture(unsigned int * img_out, unsigned int L,
   int idpath, idpix ;
   unsigned int mse_min, mse_cur, val ;
   uint2 mse ;
+  short texVal ;
+
   
+  extern __shared__ short shPath[] ;
+
+  unsigned int absPos = jib*8 + iib ;
+  if ( absPos < PSIZE_I ){
+       for ( idpix = 0; idpix < lpath-1; idpix++){
+         shPath[ idpix*24 + absPos ] = tex2D(tex_paths, idpix, absPos) ; 
+       }
+       syncthreads() ;
+  }
   
   if((i>=lpath)&&(i<=H-lpath)&&(j>=lpath)&&(j<=L-lpath)){
        z = tex2D(tex_img_estim, j, i) ;
@@ -393,8 +424,9 @@ __global__ void kernel_levelines_texture(unsigned int * img_out, unsigned int L,
          mse.x = z ;
          mse.y = z*z ;
          for( idpix=0; idpix < lpath-1 ; idpix++ ) {
-               ic += tex2D(tex_paths, idpix, idpath).x ;
-               jc += tex2D(tex_paths, idpix, idpath).y ;
+               texVal = shPath[ idpix*24 + idpath ] ;
+               ic += (char)(texVal>>8) ;
+               jc += (char)(texVal) ;
                zc = tex2D(tex_img_estim, jc, ic) ;
                mse.x += zc ;
                mse.y += zc*zc ; 
@@ -412,73 +444,6 @@ __global__ void kernel_levelines_texture(unsigned int * img_out, unsigned int L,
 }
 
 
-/**
- *
- * \brief determine les lniv en chaque point de l'image
- * \author zulu - AND
- *
- * \param[in] L         Largeur de l'image
- * \param[in] H         Hauteur de l'image
- * \param[in] r         longueur des segments
- *
- * \param[out] img_out  image des lniv 
- *
- * Execution sur des blocs de threads 2D et une grille 2D
- * selon les dimensions de l'image.
- * L'image d'entrée doit être au préalable en mémoire texture pointée par "tex_img_estim".
- * Les matrices des chemins sont, elles, pointées par "tex_paths"
- * Cette version ne fournit pas les indices des chemins pour les tracé éventuel des lniv.
- * Cette version tente d'utiliser la shared memory pour compenser la baisse de perf due aux chemins
- * paramétrables non constants.
- */
-
-__global__ void kernel_levelines_texture_smem(unsigned int * img_out, unsigned int L, unsigned int H, unsigned int r)
-{
-  // coordonnées du point dans le bloc
-  unsigned int iib = threadIdx.x ;
-  unsigned int jib = threadIdx.y ;
-  // coordonnes du point dans l'image
-  unsigned int i = blockIdx.x*blockDim.x + iib ;
-  unsigned int j = blockIdx.y*blockDim.y + jib ;
-    
-  // nb de points par chemin
-  int lpath =  r ;
-  int ic, jc ;
-  int idpath, idpix ;
-  unsigned int val, mse_cur, mse_min, z, zc ;
-  uint2 mse_data ;
-
-  //__shared__ unsigned int val_img[16*16] ;
-
-  //val_img[jib*16+iib] = tex2D(tex_img_estim, j, i) ;
-  
-  if((i>=lpath)&&(i<=H-lpath)&&(j>=lpath)&&(j<=L-lpath)){
-       z = tex2D(tex_img_estim, j, i) ;
-       for( idpath=0; idpath < PSIZE_I ; idpath++) {
-         ic = i ;
-         jc = j ;
-         mse_data.x = z ;
-         mse_data.y = z*z ;
-         mse_min = mse_data.y - mse_data.x/lpath*mse_data.y ;
-         for( idpix=0; idpix < lpath-1 ; idpix++ ) {
-               ic += tex2D(tex_paths, idpix, idpath).x ;
-               jc += tex2D(tex_paths, idpix, idpath).y ;
-               zc = tex2D(tex_img_estim, jc, ic) ;
-               mse_data.x += zc ;
-               mse_data.y += zc*zc ; 
-         }
-         // critere de selection du chemin ( SUM_(X2) - SUM_(X)2 / lpath )
-         // TODO cherchera  ameliorer pour vitesse
-         mse_cur = ( mse_data.y - ( mse_data.x / lpath ) * mse_data.x ) ;
-         if ( mse_cur < mse_min ){
-               mse_min = mse_cur ;
-               val = mse_data.x ;
-         } 
-       }
-       img_out[ i*L + j ] = val / lpath ; 
-  }
-}
-
 /**
  *
  * \brief trace les segments sur un maillage carré
@@ -498,6 +463,7 @@ __global__ void kernel_levelines_texture_smem(unsigned int * img_out, unsigned i
  * execution sur des blocs de threads 2D et une grille 2D
  * selon les dimensions de l'image
  */
+/*
 __global__ void kernel_trace_levelines(unsigned int * img_in, unsigned int * dir, unsigned int * img_out,
                                                                           unsigned int L, unsigned int H, unsigned int pas, unsigned int ng,
                                                                           unsigned int r ){
@@ -525,3 +491,4 @@ __global__ void kernel_trace_levelines(unsigned int * img_in, unsigned int * dir
   }
   
 }
+*/
index 741b8ce832c95867911fabe8858da7eda5fe9893..1802ee0b53a4320ec684ebdf396a3ca322ca85b0 100644 (file)
--- a/lniv.cvp
+++ b/lniv.cvp
@@ -7,7 +7,7 @@
   <workdirpath>/home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu</workdirpath>
   <datetime>21 Jun 2011 10:47:10</datetime>
   <normalizedcounter>false</normalizedcounter>
-  <device multProcessorCount="30" warpSize="32" textureAlignment="256" maxRegPerBlock="16384" computeCapability="1.3" maxThreadPerBlock="512" totalConstMem="65536" maxGridDim="65535 65535" maxShareMemPerBlock="16384" id="0" chip="160" maxBlockDim="512 512 64" GpuOverlap="1" name="Tesla C1060" label="Device_0" clockFreq="1296000" maxPitchMem="2147483647" >
+  <device multProcessorCount="30" warpSize="32" textureAlignment="256" maxRegPerBlock="16384" computeCapability="1.3" maxThreadPerBlock="512" maxGridDim="65535 65535" maxShareMemPerBlock="16384" id="0" chip="160" maxBlockDim="512 512 64" GpuOverlap="1" name="Tesla C1060" label="Device_0" clockFreq="1296000" maxPitchMem="2147483647" >
    <context totalcountercount="0" invalidcountercount="0" id="0" label="Context_0" truncatedcountercount="0" trace="1" >
     <run number="2" >branch,divergent_branch,instructions,warp_serialize</run>
     <run number="3" >cta_launched,local_load,local_store,gld_32b</run>
@@ -25,7 +25,7 @@
   <workdirpath>/home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu</workdirpath>
   <datetime>21 Jun 2011 10:50:54</datetime>
   <normalizedcounter>false</normalizedcounter>
-  <device multProcessorCount="30" warpSize="32" textureAlignment="256" maxRegPerBlock="16384" computeCapability="1.3" maxThreadPerBlock="512" totalConstMem="65536" maxGridDim="65535 65535" maxShareMemPerBlock="16384" id="0" chip="160" maxBlockDim="512 512 64" GpuOverlap="1" name="Tesla C1060" label="Device_0" clockFreq="1296000" maxPitchMem="2147483647" >
+  <device multProcessorCount="30" warpSize="32" textureAlignment="256" maxRegPerBlock="16384" computeCapability="1.3" maxThreadPerBlock="512" maxGridDim="65535 65535" maxShareMemPerBlock="16384" id="0" chip="160" maxBlockDim="512 512 64" GpuOverlap="1" name="Tesla C1060" label="Device_0" clockFreq="1296000" maxPitchMem="2147483647" >
    <context totalcountercount="0" invalidcountercount="0" id="0" label="Context_0" truncatedcountercount="0" trace="0" >
     <run number="2" >branch,divergent_branch,instructions,warp_serialize</run>
     <run number="3" >cta_launched,local_load,local_store,gld_32b</run>
@@ -43,7 +43,7 @@
   <workdirpath>/home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu</workdirpath>
   <datetime>21 Jun 2011 11:10:33</datetime>
   <normalizedcounter>false</normalizedcounter>
-  <device multProcessorCount="30" warpSize="32" textureAlignment="256" maxRegPerBlock="16384" computeCapability="1.3" maxThreadPerBlock="512" totalConstMem="65536" maxGridDim="65535 65535" maxShareMemPerBlock="16384" id="0" chip="160" maxBlockDim="512 512 64" GpuOverlap="1" name="Tesla C1060" label="Device_0" clockFreq="1296000" maxPitchMem="2147483647" >
+  <device multProcessorCount="30" warpSize="32" textureAlignment="256" maxRegPerBlock="16384" computeCapability="1.3" maxThreadPerBlock="512" maxGridDim="65535 65535" maxShareMemPerBlock="16384" id="0" chip="160" maxBlockDim="512 512 64" GpuOverlap="1" name="Tesla C1060" label="Device_0" clockFreq="1296000" maxPitchMem="2147483647" >
    <context totalcountercount="0" invalidcountercount="0" id="0" label="Context_0" truncatedcountercount="0" trace="0" >
     <run number="2" >branch,divergent_branch,instructions,warp_serialize</run>
     <run number="3" >cta_launched,local_load,local_store,gld_32b</run>
@@ -61,7 +61,7 @@
   <workdirpath>/home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu</workdirpath>
   <datetime>21 Jun 2011 11:16:24</datetime>
   <normalizedcounter>false</normalizedcounter>
-  <device multProcessorCount="30" warpSize="32" textureAlignment="256" maxRegPerBlock="16384" computeCapability="1.3" maxThreadPerBlock="512" totalConstMem="65536" maxGridDim="65535 65535" maxShareMemPerBlock="16384" id="0" chip="160" maxBlockDim="512 512 64" GpuOverlap="1" name="Tesla C1060" label="Device_0" clockFreq="1296000" maxPitchMem="2147483647" >
+  <device multProcessorCount="30" warpSize="32" textureAlignment="256" maxRegPerBlock="16384" computeCapability="1.3" maxThreadPerBlock="512" maxGridDim="65535 65535" maxShareMemPerBlock="16384" id="0" chip="160" maxBlockDim="512 512 64" GpuOverlap="1" name="Tesla C1060" label="Device_0" clockFreq="1296000" maxPitchMem="2147483647" >
    <context totalcountercount="0" invalidcountercount="0" id="0" label="Context_0" truncatedcountercount="0" trace="0" >
     <run number="2" >branch,divergent_branch,instructions,warp_serialize</run>
     <run number="3" >cta_launched,local_load,local_store,gld_32b</run>
   <workdirpath>/home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu</workdirpath>
   <datetime>21 Jun 2011 11:30:11</datetime>
   <normalizedcounter>false</normalizedcounter>
-  <device multProcessorCount="30" warpSize="32" textureAlignment="256" maxRegPerBlock="16384" computeCapability="1.3" maxThreadPerBlock="512" totalConstMem="65536" maxGridDim="65535 65535" maxShareMemPerBlock="16384" id="0" chip="160" maxBlockDim="512 512 64" GpuOverlap="1" name="Tesla C1060" label="Device_0" clockFreq="1296000" maxPitchMem="2147483647" >
-   <context totalcountercount="0" invalidcountercount="0" id="0" label="Context_0" truncatedcountercount="0" trace="0" >
-    <run number="2" >branch,divergent_branch,instructions,warp_serialize</run>
-    <run number="3" >cta_launched,local_load,local_store,gld_32b</run>
-    <run number="4" >gld_64b,gld_128b,gst_32b,gst_64b</run>
-    <run number="5" >gst_128b</run>
-    <run number="6" >gld_request,gst_request</run>
-    <run number="7" >tex_cache_hit,tex_cache_miss</run>
-   </context>
-  </device>
- </session>
- <session>
-  <name>Session19</name>
-  <progpath>"/home/perrot/NVIDIA_GPU_Computing_SDK/C/bin/linux/release/lniv"</progpath>
-  <cmdlineargs>4 0 15 /home/perrot/Images/cochons/cochon_512b.pgm</cmdlineargs>
-  <workdirpath>/home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu</workdirpath>
-  <datetime>21 Jun 2011 14:10:47</datetime>
-  <normalizedcounter>false</normalizedcounter>
-  <device multProcessorCount="30" warpSize="32" textureAlignment="256" maxRegPerBlock="16384" computeCapability="1.3" maxThreadPerBlock="512" totalConstMem="65536" maxGridDim="65535 65535" maxShareMemPerBlock="16384" id="0" chip="160" maxBlockDim="512 512 64" GpuOverlap="1" name="Tesla C1060" label="Device_0" clockFreq="1296000" maxPitchMem="2147483647" >
+  <device multProcessorCount="30" warpSize="32" textureAlignment="256" maxRegPerBlock="16384" computeCapability="1.3" maxThreadPerBlock="512" maxGridDim="65535 65535" maxShareMemPerBlock="16384" id="0" chip="160" maxBlockDim="512 512 64" GpuOverlap="1" name="Tesla C1060" label="Device_0" clockFreq="1296000" maxPitchMem="2147483647" >
    <context totalcountercount="0" invalidcountercount="0" id="0" label="Context_0" truncatedcountercount="0" trace="0" >
     <run number="2" >branch,divergent_branch,instructions,warp_serialize</run>
     <run number="3" >cta_launched,local_load,local_store,gld_32b</run>
   </device>
  </session>
  <session>
-  <name>Session21</name>
+  <name>smemPaths</name>
   <progpath>"/home/perrot/NVIDIA_GPU_Computing_SDK/C/bin/linux/release/lniv"</progpath>
   <cmdlineargs>4 0 15 /home/perrot/Images/cochons/cochon_512b.pgm</cmdlineargs>
   <workdirpath>/home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu</workdirpath>
-  <datetime>21 Jun 2011 14:27:50</datetime>
+  <datetime>22 Jun 2011 11:18:26</datetime>
   <normalizedcounter>false</normalizedcounter>
   <device multProcessorCount="30" warpSize="32" textureAlignment="256" maxRegPerBlock="16384" computeCapability="1.3" maxThreadPerBlock="512" totalConstMem="65536" maxGridDim="65535 65535" maxShareMemPerBlock="16384" id="0" chip="160" maxBlockDim="512 512 64" GpuOverlap="1" name="Tesla C1060" label="Device_0" clockFreq="1296000" maxPitchMem="2147483647" >
    <context totalcountercount="0" invalidcountercount="0" id="0" label="Context_0" truncatedcountercount="0" trace="0" >
   </device>
  </session>
  <session>
-  <name>Session22</name>
+  <name>Session17</name>
   <progpath>"/home/perrot/NVIDIA_GPU_Computing_SDK/C/bin/linux/release/lniv"</progpath>
   <cmdlineargs>4 0 15 /home/perrot/Images/cochons/cochon_512b.pgm</cmdlineargs>
   <workdirpath>/home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu</workdirpath>
-  <datetime>21 Jun 2011 14:34:00</datetime>
+  <datetime>22 Jun 2011 11:27:01</datetime>
   <normalizedcounter>false</normalizedcounter>
   <device multProcessorCount="30" warpSize="32" textureAlignment="256" maxRegPerBlock="16384" computeCapability="1.3" maxThreadPerBlock="512" totalConstMem="65536" maxGridDim="65535 65535" maxShareMemPerBlock="16384" id="0" chip="160" maxBlockDim="512 512 64" GpuOverlap="1" name="Tesla C1060" label="Device_0" clockFreq="1296000" maxPitchMem="2147483647" >
    <context totalcountercount="0" invalidcountercount="0" id="0" label="Context_0" truncatedcountercount="0" trace="0" >
diff --git a/lniv_Session17_Context_0.csv b/lniv_Session17_Context_0.csv
new file mode 100644 (file)
index 0000000..1f63b5c
--- /dev/null
@@ -0,0 +1,70 @@
+# CUDA_PROFILE_LOG_VERSION 2.0
+# CUDA_PROFILE_CSV 1
+# TIMESTAMPFACTOR fffff6ef6d4ddfa0
+# CUDA_DEVICE 0 Tesla C1060
+gpustarttimestamp,method,gputime,cputime,occupancy,gridSizeX,gridSizeY,gridSizeZ,blockSizeX,blockSizeY,blockSizeZ,dynSmemPerBlock,staSmemPerBlock,registerPerThread,streamID,localblocksize,memTransferSize,memtransferhostmemtype,branch,divergent_branch,instructions,warp_serialize,cta_launched,local_load,local_store,gld_32b,gld_64b,gld_128b,gst_32b,gst_64b,gst_128b,gld_request,gst_request,tex_cache_hit,tex_cache_miss
+1229865247212140,memcpyHtoA,177.856,336,,,,,,,,,,,0,0,1048576,0
+1229865247279f80,_Z19kernel_calcul_pathsPtj,47.808,13,0.031,1,1,13486368,1,1,1,0,32,10,0,-1,,0,168,0,1834,0,1,0,0,108,0,0,72,0,0,108,72,0,0
+1229865247286460,memcpyDtoA,4.352,6,,,,,,,,,,,0,0,144,0
+122986524728b1e0,_Z29kernel_init_estim_from_img_inPjjjj,1600.64,8,0.5,64,64,13,8,8,1,0,48,12,0,-1,,0,78756,10,478914,0,409,0,0,0,0,0,12416,0,0,0,266,625958,89992
+122986524741da80,memcpyDtoA,47.872,6,,,,,,,,,,,0,0,1048576,0
+122986524742eea0,_Z24kernel_levelines_texturePjjjj,1925.92,9,0.5,64,64,12923796,8,8,1,144,48,16,0,-1,,0,49113,148,716401,1106,410,0,0,0,0,0,12802,0,0,0,272,184897,54656
+1229865247605c40,memcpyDtoA,47.68,4,,,,,,,,,,,0,0,1048576,0
+1229865247615220,_Z30kernel_estim_next_step_texturePjjjj,479.392,6,0.5,64,64,1,8,8,1,0,48,9,0,-1,,0,816,0,15510,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636
+122986524768ad20,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0
+1229865247698320,_Z24kernel_levelines_texturePjjjj,1940.35,4,0.5,64,64,14781808,8,8,1,144,48,16,0,-1,,0,49113,149,716405,1062,410,0,0,0,0,0,12804,0,0,0,272,185864,53908
+12298652478728e0,memcpyDtoA,47.616,4,,,,,,,,,,,0,0,1048576,0
+122986524787ff60,_Z30kernel_estim_next_step_texturePjjjj,472.448,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15511,0,410,0,0,0,0,0,13120,0,0,0,272,4920,1640
+12298652478f3f40,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0
+1229865247901580,_Z24kernel_levelines_texturePjjjj,1938.46,4,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,49113,147,716400,1080,409,0,0,0,0,0,12800,0,0,0,272,184958,54513
+1229865247adb480,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0
+1229865247ae8ae0,_Z30kernel_estim_next_step_texturePjjjj,472.8,3,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15511,0,410,0,0,0,0,0,13120,0,0,0,272,4920,1640
+1229865247b5cc20,memcpyDtoA,47.616,4,,,,,,,,,,,0,0,1048576,0
+1229865247b6a240,_Z24kernel_levelines_texturePjjjj,1946.3,3,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,49113,149,716406,1089,409,0,0,0,0,0,12788,0,0,0,272,184803,54668
+1229865247d45f40,memcpyDtoA,47.552,4,,,,,,,,,,,0,0,1048576,0
+1229865247d53520,_Z30kernel_estim_next_step_texturePjjjj,468.192,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15512,0,410,0,0,0,0,0,13120,0,0,0,272,4920,1640
+1229865247dc6460,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0
+1229865247dd3aa0,_Z24kernel_levelines_texturePjjjj,1931.3,4,0.5,64,64,14780256,8,8,1,144,48,16,0,-1,,0,49113,147,716403,1082,410,0,0,0,0,0,12815,0,0,0,272,185389,54456
+1229865247fabd20,memcpyDtoA,47.552,3,,,,,,,,,,,0,0,1048576,0
+1229865247fb9300,_Z30kernel_estim_next_step_texturePjjjj,468.128,3,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15511,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636
+122986524802c220,memcpyDtoA,47.552,4,,,,,,,,,,,0,0,1048576,0
+12298652480397e0,_Z24kernel_levelines_texturePjjjj,1923.81,3,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,48941,149,713807,1061,410,0,0,0,0,0,12802,0,0,0,271,184460,55093
+122986524820fd20,memcpyDtoA,47.584,4,,,,,,,,,,,0,0,1048576,0
+122986524821d340,_Z30kernel_estim_next_step_texturePjjjj,482.88,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15510,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636
+1229865248293bc0,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0
+12298652482a11a0,_Z24kernel_levelines_texturePjjjj,1931.9,3,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,48941,147,713807,1039,410,0,0,0,0,0,12804,0,0,0,271,186162,53610
+12298652484796a0,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0
+1229865248486c80,_Z30kernel_estim_next_step_texturePjjjj,478.944,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15625,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640
+12298652484fc5a0,memcpyDtoA,47.68,4,,,,,,,,,,,0,0,1048576,0
+1229865248509c20,_Z24kernel_levelines_texturePjjjj,1942.02,4,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,48752,146,711135,1071,409,0,0,0,0,0,12800,0,0,0,270,184874,54597
+12298652486e48a0,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0
+12298652486f1ee0,_Z30kernel_estim_next_step_texturePjjjj,479.84,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15624,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640
+1229865248767b80,memcpyDtoA,47.552,3,,,,,,,,,,,0,0,1048576,0
+1229865248775160,_Z24kernel_levelines_texturePjjjj,1925.86,4,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,48752,146,711137,1052,409,0,0,0,0,0,12788,0,0,0,270,184892,54579
+122986524894be80,memcpyDtoA,47.584,4,,,,,,,,,,,0,0,1048576,0
+12298652489594c0,_Z30kernel_estim_next_step_texturePjjjj,473.632,3,0.5,64,64,32767,8,8,1,0,48,9,0,-1,,0,822,0,15624,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640
+12298652489cd920,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0
+12298652489dafc0,_Z24kernel_levelines_texturePjjjj,1946.14,4,0.5,64,64,144,8,8,1,144,48,16,0,-1,,0,48752,146,711139,1029,410,0,0,0,0,0,12815,0,0,0,270,185136,54709
+1229865248bb6c40,memcpyDtoA,47.712,4,,,,,,,,,,,0,0,1048576,0
+1229865248bc4320,_Z30kernel_estim_next_step_texturePjjjj,470.08,4,0.5,64,64,48,8,8,1,0,48,9,0,-1,,0,822,0,15624,0,409,0,0,0,0,0,13088,0,0,0,274,4908,1636
+1229865248c379c0,memcpyDtoA,47.552,4,,,,,,,,,,,0,0,1048576,0
+1229865248c44fe0,_Z24kernel_levelines_texturePjjjj,1932.83,4,0.5,64,64,13964298,8,8,1,144,48,16,0,-1,,0,48752,146,711138,1048,410,0,0,0,0,0,12802,0,0,0,270,184798,54755
+1229865248e1d840,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0
+1229865248e2aea0,_Z30kernel_estim_next_step_texturePjjjj,467.36,4,0.5,64,64,7602289,8,8,1,0,48,9,0,-1,,0,822,0,15626,0,409,0,0,0,0,0,13088,0,0,0,274,4908,1636
+1229865248e9daa0,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0
+1229865248eab120,_Z24kernel_levelines_texturePjjjj,1921.12,4,0.5,64,64,6226030,8,8,1,144,48,16,0,-1,,0,48752,146,711133,1089,410,0,0,0,0,0,12804,0,0,0,270,185686,54086
+1229865249080bc0,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0
+122986524908e240,_Z30kernel_estim_next_step_texturePjjjj,467.456,4,0.5,64,64,3145780,8,8,1,0,48,9,0,-1,,0,822,0,15626,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640
+1229865249100e80,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0
+122986524910e480,_Z24kernel_levelines_texturePjjjj,1931.62,3,0.5,64,64,3670071,8,8,1,144,48,16,0,-1,,0,48752,146,711135,1087,409,0,0,0,0,0,12800,0,0,0,270,185448,54023
+12298652492e6840,memcpyDtoA,47.616,4,,,,,,,,,,,0,0,1048576,0
+12298652492f3ea0,_Z30kernel_estim_next_step_texturePjjjj,485.568,4,0.5,64,64,3145776,8,8,1,0,48,9,0,-1,,0,822,0,15626,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640
+122986524936b1c0,memcpyDtoA,47.552,4,,,,,,,,,,,0,0,1048576,0
+12298652493787c0,_Z24kernel_levelines_texturePjjjj,1927.55,4,0.5,64,64,6225971,8,8,1,144,48,16,0,-1,,0,48752,146,711136,1043,409,0,0,0,0,0,12788,0,0,0,270,185851,53620
+122986524954fb80,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0
+122986524955d1e0,_Z30kernel_estim_next_step_texturePjjjj,475.648,3,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15626,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640
+12298652495d1e40,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0
+12298652495df480,_Z24kernel_levelines_texturePjjjj,1971.1,4,0.5,64,64,64,8,8,1,144,48,16,0,-1,,0,49113,150,716402,1074,410,0,0,0,0,0,12815,0,0,0,272,185344,54501
+12298652497c1280,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0
+12298652497ce8a0,_Z30kernel_estim_next_step_texturePjjjj,476.032,3,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15510,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636
+122986524988d200,memcpyDtoH,191.296,740,,,,,,,,,,,0,0,1048576,0
diff --git a/lniv_smemPaths_Context_0.csv b/lniv_smemPaths_Context_0.csv
new file mode 100644 (file)
index 0000000..359a966
--- /dev/null
@@ -0,0 +1,70 @@
+# CUDA_PROFILE_LOG_VERSION 2.0
+# CUDA_PROFILE_CSV 1
+# TIMESTAMPFACTOR fffff6ef6cfa6038
+# CUDA_DEVICE 0 Tesla C1060
+gpustarttimestamp,method,gputime,cputime,occupancy,gridSizeX,gridSizeY,gridSizeZ,blockSizeX,blockSizeY,blockSizeZ,dynSmemPerBlock,staSmemPerBlock,registerPerThread,streamID,localblocksize,memTransferSize,memtransferhostmemtype,branch,divergent_branch,instructions,warp_serialize,cta_launched,local_load,local_store,gld_32b,gld_64b,gld_128b,gst_32b,gst_64b,gst_128b,gld_request,gst_request,tex_cache_hit,tex_cache_miss
+122985da5f305f60,memcpyHtoA,177.92,331,,,,,,,,,,,0,0,1048576,0
+122985da5f36ee60,_Z19kernel_calcul_pathsPtj,47.808,14,0.031,1,1,13732448,1,1,1,0,32,10,0,-1,,0,168,0,1834,0,1,0,0,108,0,0,72,0,0,108,72,0,0
+122985da5f37b380,memcpyDtoA,4.32,5,,,,,,,,,,,0,0,144,0
+122985da5f3800e0,_Z29kernel_init_estim_from_img_inPjjjj,1586.94,8,0.5,64,64,8,8,8,1,0,48,12,0,-1,,0,78756,10,478915,0,409,0,0,0,0,0,12416,0,0,0,266,620103,95847
+122985da5f50d800,memcpyDtoA,48,6,,,,,,,,,,,0,0,1048576,0
+122985da5f51ebc0,_Z24kernel_levelines_texturePjjjj,1930.4,9,0.5,64,64,12923760,8,8,1,144,48,16,0,-1,,0,49113,148,690702,1881,410,0,0,0,0,0,12802,0,0,0,272,236380,3173
+122985da5f6f6ae0,memcpyDtoA,47.616,3,,,,,,,,,,,0,0,1048576,0
+122985da5f706100,_Z30kernel_estim_next_step_texturePjjjj,466.528,6,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15511,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636
+122985da5f7789e0,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0
+122985da5f786060,_Z24kernel_levelines_texturePjjjj,1930.21,4,0.5,64,64,-64,8,8,1,144,48,16,0,-1,,0,49113,149,690700,1877,410,0,0,0,0,0,12804,0,0,0,272,236557,3215
+122985da5f95dec0,memcpyDtoA,47.584,4,,,,,,,,,,,0,0,1048576,0
+122985da5f96b520,_Z30kernel_estim_next_step_texturePjjjj,472.32,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15511,0,410,0,0,0,0,0,13120,0,0,0,272,4920,1640
+122985da5f9df480,memcpyDtoA,47.68,4,,,,,,,,,,,0,0,1048576,0
+122985da5f9ecac0,_Z24kernel_levelines_texturePjjjj,1921.98,4,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,49113,147,690706,1882,409,0,0,0,0,0,12800,0,0,0,272,236235,3236
+122985da5fbc28c0,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0
+122985da5fbcfea0,_Z30kernel_estim_next_step_texturePjjjj,465.536,4,0.5,64,64,-64,8,8,1,0,48,9,0,-1,,0,816,0,15513,0,410,0,0,0,0,0,13120,0,0,0,272,4920,1640
+122985da5fc42380,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0
+122985da5fc4f980,_Z24kernel_levelines_texturePjjjj,1916.54,4,0.5,64,64,8057472,8,8,1,144,48,16,0,-1,,0,49113,149,690708,1895,409,0,0,0,0,0,12788,0,0,0,272,236073,3398
+122985da5fe24300,memcpyDtoA,47.552,3,,,,,,,,,,,0,0,1048576,0
+122985da5fe318e0,_Z30kernel_estim_next_step_texturePjjjj,470.784,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15510,0,410,0,0,0,0,0,13120,0,0,0,272,4920,1640
+122985da5fea5280,memcpyDtoA,47.712,3,,,,,,,,,,,0,0,1048576,0
+122985da5feb2900,_Z24kernel_levelines_texturePjjjj,1916.61,4,0.5,64,64,144,8,8,1,144,48,16,0,-1,,0,49113,147,690703,1870,410,0,0,0,0,0,12815,0,0,0,272,236510,3335
+122985da60087220,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0
+122985da60094840,_Z30kernel_estim_next_step_texturePjjjj,465.344,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15510,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636
+122985da60106c60,memcpyDtoA,47.776,3,,,,,,,,,,,0,0,1048576,0
+122985da60114320,_Z24kernel_levelines_texturePjjjj,1914.43,4,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,48941,149,688205,1889,410,0,0,0,0,0,12802,0,0,0,271,236103,3450
+122985da602e83c0,memcpyDtoA,47.616,3,,,,,,,,,,,0,0,1048576,0
+122985da602f5a00,_Z30kernel_estim_next_step_texturePjjjj,477.952,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15511,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636
+122985da6036af60,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0
+122985da603785a0,_Z24kernel_levelines_texturePjjjj,1909.12,3,0.5,64,64,6881375,8,8,1,144,48,16,0,-1,,0,48941,147,688211,1849,410,0,0,0,0,0,12804,0,0,0,271,236529,3243
+122985da6054b180,memcpyDtoA,47.616,7,,,,,,,,,,,0,0,1048576,0
+122985da605587e0,_Z30kernel_estim_next_step_texturePjjjj,474.72,4,0.5,64,64,3276855,8,8,1,0,48,9,0,-1,,0,822,0,15624,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640
+122985da605cd080,memcpyDtoA,47.552,3,,,,,,,,,,,0,0,1048576,0
+122985da605da680,_Z24kernel_levelines_texturePjjjj,1920.1,4,0.5,64,64,3145783,8,8,1,144,48,16,0,-1,,0,48752,146,685626,1884,409,0,0,0,0,0,12800,0,0,0,270,235824,3647
+122985da607afd40,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0
+122985da607bd3a0,_Z30kernel_estim_next_step_texturePjjjj,490.048,4,0.5,64,64,3604536,8,8,1,0,48,9,0,-1,,0,822,0,15624,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640
+122985da60835860,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0
+122985da60842ea0,_Z24kernel_levelines_texturePjjjj,1915.78,4,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,48752,146,685627,1911,409,0,0,0,0,0,12788,0,0,0,270,236322,3149
+122985da60a174c0,memcpyDtoA,47.584,4,,,,,,,,,,,0,0,1048576,0
+122985da60a24aa0,_Z30kernel_estim_next_step_texturePjjjj,480.128,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15624,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640
+122985da60a9a8a0,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0
+122985da60aa7ec0,_Z24kernel_levelines_texturePjjjj,1911.39,3,0.5,64,64,48,8,8,1,144,48,16,0,-1,,0,48752,146,685632,1882,410,0,0,0,0,0,12815,0,0,0,270,236367,3478
+122985da60c7b3a0,memcpyDtoA,47.616,4,,,,,,,,,,,0,0,1048576,0
+122985da60c889e0,_Z30kernel_estim_next_step_texturePjjjj,476.992,3,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15624,0,409,0,0,0,0,0,13088,0,0,0,274,4908,1636
+122985da60cfdb80,memcpyDtoA,47.552,4,,,,,,,,,,,0,0,1048576,0
+122985da60d0b160,_Z24kernel_levelines_texturePjjjj,1912.67,4,0.5,64,64,9532554,8,8,1,144,48,16,0,-1,,0,48752,146,685631,1906,410,0,0,0,0,0,12802,0,0,0,270,236347,3206
+122985da60edeb00,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0
+122985da60eec160,_Z30kernel_estim_next_step_texturePjjjj,486.112,3,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15627,0,409,0,0,0,0,0,13088,0,0,0,274,4908,1636
+122985da60f63660,memcpyDtoA,47.584,4,,,,,,,,,,,0,0,1048576,0
+122985da60f70c40,_Z24kernel_levelines_texturePjjjj,1913.22,3,0.5,64,64,97,8,8,1,144,48,16,0,-1,,0,48752,146,685626,1867,410,0,0,0,0,0,12804,0,0,0,270,236389,3383
+122985da61144800,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0
+122985da61151e40,_Z30kernel_estim_next_step_texturePjjjj,487.296,3,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15626,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640
+122985da611c9820,memcpyDtoA,47.584,4,,,,,,,,,,,0,0,1048576,0
+122985da611d6e40,_Z24kernel_levelines_texturePjjjj,1912.86,4,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,48752,146,685627,1873,409,0,0,0,0,0,12800,0,0,0,270,236113,3358
+122985da613aa8e0,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0
+122985da613b7f40,_Z30kernel_estim_next_step_texturePjjjj,480.544,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15626,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640
+122985da6142dea0,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0
+122985da6143b500,_Z24kernel_levelines_texturePjjjj,1914.14,4,0.5,64,64,12032,8,8,1,144,48,16,0,-1,,0,48752,146,685630,1884,409,0,0,0,0,0,12788,0,0,0,270,235919,3552
+122985da6160f480,memcpyDtoA,47.744,3,,,,,,,,,,,0,0,1048576,0
+122985da6161cb60,_Z30kernel_estim_next_step_texturePjjjj,488.672,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15627,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640
+122985da61694a80,memcpyDtoA,47.616,4,,,,,,,,,,,0,0,1048576,0
+122985da616a20a0,_Z24kernel_levelines_texturePjjjj,1930.24,3,0.5,64,64,12032,8,8,1,144,48,16,0,-1,,0,49113,150,690707,1868,410,0,0,0,0,0,12815,0,0,0,272,236723,3122
+122985da61879f20,memcpyDtoA,47.584,4,,,,,,,,,,,0,0,1048576,0
+122985da61887520,_Z30kernel_estim_next_step_texturePjjjj,476,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15512,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636
+122985da619422a0,memcpyDtoH,191.328,755,,,,,,,,,,,0,0,1048576,0
diff --git a/main.cu b/main.cu
index 0bb7c3d43d83f8b5c55cb37551253ef3e859037e..63bf259d42b679536d30a44871be75f65c70b6ef 100644 (file)
--- a/main.cu
+++ b/main.cu
@@ -53,7 +53,7 @@ int main(int argc, char **argv){
   cutilCheckError( cutStopTimer(timer) );
   
   size = H * L * sizeof( unsigned int );
-  psize = (r-1)*PSIZE_I*sizeof(int2) ;
+  psize = (r-1)*PSIZE_I*sizeof(ushort) ;
 
   printf("Longueur des chemins = %d pixels\n", r);
   printf("Init GPU + Image %s  %d x %d = %d pixels en %f ms,\n", image_path, L, H, size, cutGetTimerValue(timer));
@@ -67,7 +67,7 @@ int main(int argc, char **argv){
   // allocation mem GPU
   unsigned int * d_directions =NULL ;
   unsigned int * d_lniv, * d_estim = NULL ;
-  int2 * d_paths ;
+  ushort * d_paths ;
 
   cutilSafeCall( cudaMalloc( (void**) &d_directions, size)) ;
   cutilSafeCall( cudaMalloc( (void**) &d_lniv, size ) );
@@ -77,7 +77,8 @@ int main(int argc, char **argv){
   
   // allocate array and copy image data
   cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);
-  cudaChannelFormatDesc channelDescP = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned);
+  //cudaChannelFormatDesc channelDescP = cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSigned);
+  cudaChannelFormatDesc channelDescP = cudaCreateChannelDesc<ushort>();
   
   cudaArray * array_img_in, *array_img_estim, *array_img_lniv, *array_paths;
   cutilSafeCall( cudaMallocArray( &array_img_in, &channelDesc, L, H )); 
@@ -116,7 +117,7 @@ int main(int argc, char **argv){
   /*****************************
    * APPELS KERNELS et chronos
    *****************************/
-       dimBlock = dim3(16,16,1) ;
+       dimBlock = dim3(8,8,1) ;
        dimGrid = dim3( H / dimBlock.x,L / dimBlock.y, 1 ) ;
 
        // pour enregistrement image lniv GPU
@@ -151,7 +152,8 @@ int main(int argc, char **argv){
        for ( iter =0 ; iter < nb_iter ; iter++ )
          {
                cutilSafeCall( cudaMemcpyToArray( array_img_estim, 0, 0, d_estim, size, cudaMemcpyDeviceToDevice)) ;
-               kernel_levelines_texture_smem<<< dimGrid, dimBlock, 0 >>>( d_lniv, L, H, r );
+               //version avec/sans tentative d'utilisation de smem ( pas probante )
+               kernel_levelines_texture<<< dimGrid, dimBlock, 24*(r-1)*sizeof(short) >>>( d_lniv, L, H, r );
                cutilSafeCall( cudaMemcpyToArray( array_img_lniv, 0, 0, d_lniv, size, cudaMemcpyDeviceToDevice)) ;
                if (seq_out){
                  sprintf(image_out, "%s%d%s", image_out_base, iter+1, pgm_ext) ;
@@ -175,10 +177,10 @@ int main(int argc, char **argv){
        /**************************************************
         * recuperation matrice des chemins pour affichage
         **************************************************/
-       
+       /*
        int2 * h_paths = new int2[(r-1)*PSIZE_I] ;
        cutilSafeCall( cudaMemcpyFromArray(h_paths , array_paths, 0, 0, psize, cudaMemcpyDeviceToHost) );
-       /*
+       
        //verif Di
        printf("matrice  Di");
        for(int idpath=0; idpath< PSIZE_I; idpath++){
index 20ef00afbeab84646c758e4387eca156cdbc11f7..9c75383f9f181daf09590cf3acd55398f9ca56e9 100644 (file)
Binary files a/obj/release/main.cu.o and b/obj/release/main.cu.o differ