From: Gilles Perrot Date: Tue, 6 Sep 2011 11:49:31 +0000 (+0200) Subject: version opérationnelle X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/lniv_gpu.git/commitdiff_plain/refs/heads/master?hp=7cc36a5689750ee7d77b959362edc65948a79427 version opérationnelle chemins de longueur variable mais rectilignes tous les 15 degres --- diff --git a/Makefile b/Makefile index 20361ca..7c8c11f 100644 --- a/Makefile +++ b/Makefile @@ -47,3 +47,6 @@ CCFILES := # Rules and targets include ../../common/common.mk + +zul: + echo $(NVCCFLAGS) \ No newline at end of file diff --git a/defines.h b/defines.h index cddf7c6..70fcaa8 100644 --- a/defines.h +++ b/defines.h @@ -16,8 +16,8 @@ #define BSMAX 512 #define MAX(x,y) ( ( (x)>=(y) )?(x):(y) ) #define ABS(x) ( ((x)>0)?(x):-(x)) -#define DEC 4 -#define DEC2 8 +#define DEC 3 +#define DEC2 6 #define CONFLICT_FREE_OFFSET(index) ( ((index) >>(DEC)) + ((index) >>(DEC2) ) ) #define CFO(index) ( ( (index) >>(DEC) ) + ( (index) >>(DEC2) ) ) #define CFI(index) ( (index) + (CFO(index)) ) diff --git a/image_out16.pgm b/image_out16.pgm index 2cbfd03..ebca7c0 100644 Binary files a/image_out16.pgm and b/image_out16.pgm differ diff --git a/image_out6.pgm b/image_out6.pgm new file mode 100644 index 0000000..44ea1aa Binary files /dev/null and b/image_out6.pgm differ diff --git a/levelines_kernels.cu b/levelines_kernels.cu index 6838e83..6d616c6 100644 --- a/levelines_kernels.cu +++ b/levelines_kernels.cu @@ -78,7 +78,7 @@ __constant__ float tangente[] = {0.000, 0.268, 0.577, 1.000} ; texture tex_img_in ; texture tex_img_estim ; texture tex_img_lniv ; -texture tex_paths ; +texture tex_paths ; @@ -95,20 +95,24 @@ texture tex_paths ; * considérés pour le calcul de chemins (float tangente[]). * */ -__global__ void kernel_calcul_paths( int2 * d_paths, unsigned int r){ +__global__ void kernel_calcul_paths( ushort * d_paths, unsigned int r){ unsigned int idpath = 0 ; int ic, jc, iprec, jprec ; float offset = 0.5 ; unsigned int basepath = 0 ; + char MSQ, LSQ ; // Q1 inf for (int a=0 ; a< 4 ; a++){ // les 4 angles 0,15,30 et 45 for (int p=0 ; p< r ; p++){ // les r points ic = r-1 - floor(tangente[a]*p + offset) ; if ( p > 0 ){ - d_paths[idpath*(r-1)+p-1].x = ic - iprec ; - d_paths[idpath*(r-1)+p-1].y = 1 ; + MSQ = ic - iprec ; + LSQ = 1 ; + //d_paths[idpath*(r-1)+p-1].x = ic - iprec ; + //d_paths[idpath*(r-1)+p-1].y = 1 ; + d_paths[idpath*(r-1)+p-1] = ((short)MSQ << 8) | LSQ ; } iprec = ic ; } @@ -119,8 +123,11 @@ __global__ void kernel_calcul_paths( int2 * d_paths, unsigned int r){ for (int p=0 ; p< r ; p++){ // les r points jc = floor(tangente[a]*p + offset) ; if ( p > 0 ){ - d_paths[idpath*(r-1)+p-1].x = -1 ; - d_paths[idpath*(r-1)+p-1].y = jc - jprec ; + MSQ = -1 ; + LSQ = jc - jprec ; + d_paths[idpath*(r-1)+p-1] = ((short)MSQ << 8) | LSQ ; + //d_paths[idpath*(r-1)+p-1].x = -1 ; + //d_paths[idpath*(r-1)+p-1].y = jc - jprec ; } jprec = jc ; } @@ -131,8 +138,11 @@ __global__ void kernel_calcul_paths( int2 * d_paths, unsigned int r){ basepath += 6 ; for (int a=0 ; a< 6 ; a++){ // les 6 angles 90,105,120,135,150,165 for (int p=0 ; p> 8 ) ; + d_paths[idpath*(r-1)+p-1] = ((short)MSQ << 8) | LSQ ; + //d_paths[idpath*(r-1)+p].x = -d_paths[(idpath - basepath)*(r-1)+p].y ; + //d_paths[idpath*(r-1)+p].y = d_paths[(idpath - basepath)*(r-1)+p].x ; } idpath++ ; } @@ -141,8 +151,11 @@ __global__ void kernel_calcul_paths( int2 * d_paths, unsigned int r){ basepath += 6 ; for (int a=0 ; a< 6 ; a++){ // les 6 angles 180,195,210,225,240,255 for (int p=0 ; p> 8 ) ; + LSQ = - ( d_paths[(idpath - basepath)*(r-1)+p] & 0x00FF ) ; + d_paths[idpath*(r-1)+p-1] = ((short)MSQ << 8) | LSQ ; + //d_paths[idpath*(r-1)+p].x = -d_paths[(idpath - basepath)*(r-1)+p].x ; + //d_paths[idpath*(r-1)+p].y = -d_paths[(idpath - basepath)*(r-1)+p].y ; } idpath++ ; } @@ -151,11 +164,15 @@ __global__ void kernel_calcul_paths( int2 * d_paths, unsigned int r){ basepath += 6 ; for (int a=0 ; a< 6 ; a++){ // les 6 angles 270,285,300,315,330,345 for (int p=0 ; p> 8 ) ; + d_paths[idpath*(r-1)+p-1] = ((short)MSQ << 8) | LSQ ; + //d_paths[idpath*(r-1)+p].x = d_paths[(idpath - basepath)*(r-1)+p].y ; + //d_paths[idpath*(r-1)+p].y = -d_paths[(idpath - basepath)*(r-1)+p].x ; } idpath++ ; } + } /** @@ -368,12 +385,15 @@ __global__ void kernel_estim_next_step_global_mem(unsigned int * d_estim, unsign * Execution sur des blocs de threads 2D et une grille 2D * selon les dimensions de l'image. * L'image d'entrée doit être au préalable en mémoire texture pointée par "tex_img_estim". - * Les matrices des chemins sont, elles, pointées par "tex_paths" + * Les matrices des chemins sont, elles, préalablement chargées en SHMEM depuis la texture" * Cette version ne fournit pas les indices des chemins pour les tracé éventuel des lniv. */ __global__ void kernel_levelines_texture(unsigned int * img_out, unsigned int L, unsigned int H, unsigned int r) { - // coordonnes du point dans l'image + // coordonnees du point dans le bloc + unsigned int iib = threadIdx.x ; + unsigned int jib = threadIdx.y ; + // coordonnees du point dans l'image unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; unsigned int j = blockIdx.y*blockDim.y + threadIdx.y; @@ -383,7 +403,18 @@ __global__ void kernel_levelines_texture(unsigned int * img_out, unsigned int L, int idpath, idpix ; unsigned int mse_min, mse_cur, val ; uint2 mse ; + short texVal ; + + extern __shared__ short shPath[] ; + + unsigned int absPos = jib*8 + iib ; + if ( absPos < PSIZE_I ){ + for ( idpix = 0; idpix < lpath-1; idpix++){ + shPath[ idpix*24 + absPos ] = tex2D(tex_paths, idpix, absPos) ; + } + syncthreads() ; + } if((i>=lpath)&&(i<=H-lpath)&&(j>=lpath)&&(j<=L-lpath)){ z = tex2D(tex_img_estim, j, i) ; @@ -393,8 +424,9 @@ __global__ void kernel_levelines_texture(unsigned int * img_out, unsigned int L, mse.x = z ; mse.y = z*z ; for( idpix=0; idpix < lpath-1 ; idpix++ ) { - ic += tex2D(tex_paths, idpix, idpath).x ; - jc += tex2D(tex_paths, idpix, idpath).y ; + texVal = shPath[ idpix*24 + idpath ] ; + ic += (char)(texVal>>8) ; + jc += (char)(texVal) ; zc = tex2D(tex_img_estim, jc, ic) ; mse.x += zc ; mse.y += zc*zc ; @@ -412,73 +444,6 @@ __global__ void kernel_levelines_texture(unsigned int * img_out, unsigned int L, } -/** - * - * \brief determine les lniv en chaque point de l'image - * \author zulu - AND - * - * \param[in] L Largeur de l'image - * \param[in] H Hauteur de l'image - * \param[in] r longueur des segments - * - * \param[out] img_out image des lniv - * - * Execution sur des blocs de threads 2D et une grille 2D - * selon les dimensions de l'image. - * L'image d'entrée doit être au préalable en mémoire texture pointée par "tex_img_estim". - * Les matrices des chemins sont, elles, pointées par "tex_paths" - * Cette version ne fournit pas les indices des chemins pour les tracé éventuel des lniv. - * Cette version tente d'utiliser la shared memory pour compenser la baisse de perf due aux chemins - * paramétrables non constants. - */ - -__global__ void kernel_levelines_texture_smem(unsigned int * img_out, unsigned int L, unsigned int H, unsigned int r) -{ - // coordonnées du point dans le bloc - unsigned int iib = threadIdx.x ; - unsigned int jib = threadIdx.y ; - // coordonnes du point dans l'image - unsigned int i = blockIdx.x*blockDim.x + iib ; - unsigned int j = blockIdx.y*blockDim.y + jib ; - - // nb de points par chemin - int lpath = r ; - int ic, jc ; - int idpath, idpix ; - unsigned int val, mse_cur, mse_min, z, zc ; - uint2 mse_data ; - - //__shared__ unsigned int val_img[16*16] ; - - //val_img[jib*16+iib] = tex2D(tex_img_estim, j, i) ; - - if((i>=lpath)&&(i<=H-lpath)&&(j>=lpath)&&(j<=L-lpath)){ - z = tex2D(tex_img_estim, j, i) ; - for( idpath=0; idpath < PSIZE_I ; idpath++) { - ic = i ; - jc = j ; - mse_data.x = z ; - mse_data.y = z*z ; - mse_min = mse_data.y - mse_data.x/lpath*mse_data.y ; - for( idpix=0; idpix < lpath-1 ; idpix++ ) { - ic += tex2D(tex_paths, idpix, idpath).x ; - jc += tex2D(tex_paths, idpix, idpath).y ; - zc = tex2D(tex_img_estim, jc, ic) ; - mse_data.x += zc ; - mse_data.y += zc*zc ; - } - // critere de selection du chemin ( SUM_(X2) - SUM_(X)2 / lpath ) - // TODO cherchera ameliorer pour vitesse - mse_cur = ( mse_data.y - ( mse_data.x / lpath ) * mse_data.x ) ; - if ( mse_cur < mse_min ){ - mse_min = mse_cur ; - val = mse_data.x ; - } - } - img_out[ i*L + j ] = val / lpath ; - } -} - /** * * \brief trace les segments sur un maillage carré @@ -498,6 +463,7 @@ __global__ void kernel_levelines_texture_smem(unsigned int * img_out, unsigned i * execution sur des blocs de threads 2D et une grille 2D * selon les dimensions de l'image */ +/* __global__ void kernel_trace_levelines(unsigned int * img_in, unsigned int * dir, unsigned int * img_out, unsigned int L, unsigned int H, unsigned int pas, unsigned int ng, unsigned int r ){ @@ -525,3 +491,4 @@ __global__ void kernel_trace_levelines(unsigned int * img_in, unsigned int * dir } } +*/ diff --git a/lniv.cvp b/lniv.cvp index 741b8ce..1802ee0 100644 --- a/lniv.cvp +++ b/lniv.cvp @@ -7,7 +7,7 @@ /home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu 21 Jun 2011 10:47:10 false - + branch,divergent_branch,instructions,warp_serialize cta_launched,local_load,local_store,gld_32b @@ -25,7 +25,7 @@ /home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu 21 Jun 2011 10:50:54 false - + branch,divergent_branch,instructions,warp_serialize cta_launched,local_load,local_store,gld_32b @@ -43,7 +43,7 @@ /home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu 21 Jun 2011 11:10:33 false - + branch,divergent_branch,instructions,warp_serialize cta_launched,local_load,local_store,gld_32b @@ -61,7 +61,7 @@ /home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu 21 Jun 2011 11:16:24 false - + branch,divergent_branch,instructions,warp_serialize cta_launched,local_load,local_store,gld_32b @@ -79,25 +79,7 @@ /home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu 21 Jun 2011 11:30:11 false - - - branch,divergent_branch,instructions,warp_serialize - cta_launched,local_load,local_store,gld_32b - gld_64b,gld_128b,gst_32b,gst_64b - gst_128b - gld_request,gst_request - tex_cache_hit,tex_cache_miss - - - - - Session19 - "/home/perrot/NVIDIA_GPU_Computing_SDK/C/bin/linux/release/lniv" - 4 0 15 /home/perrot/Images/cochons/cochon_512b.pgm - /home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu - 21 Jun 2011 14:10:47 - false - + branch,divergent_branch,instructions,warp_serialize cta_launched,local_load,local_store,gld_32b @@ -109,11 +91,11 @@ - Session21 + smemPaths "/home/perrot/NVIDIA_GPU_Computing_SDK/C/bin/linux/release/lniv" 4 0 15 /home/perrot/Images/cochons/cochon_512b.pgm /home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu - 21 Jun 2011 14:27:50 + 22 Jun 2011 11:18:26 false @@ -127,11 +109,11 @@ - Session22 + Session17 "/home/perrot/NVIDIA_GPU_Computing_SDK/C/bin/linux/release/lniv" 4 0 15 /home/perrot/Images/cochons/cochon_512b.pgm /home/perrot/NVIDIA_GPU_Computing_SDK/C/src/lniv_gpu - 21 Jun 2011 14:34:00 + 22 Jun 2011 11:27:01 false diff --git a/lniv_Session17_Context_0.csv b/lniv_Session17_Context_0.csv new file mode 100644 index 0000000..1f63b5c --- /dev/null +++ b/lniv_Session17_Context_0.csv @@ -0,0 +1,70 @@ +# CUDA_PROFILE_LOG_VERSION 2.0 +# CUDA_PROFILE_CSV 1 +# TIMESTAMPFACTOR fffff6ef6d4ddfa0 +# CUDA_DEVICE 0 Tesla C1060 +gpustarttimestamp,method,gputime,cputime,occupancy,gridSizeX,gridSizeY,gridSizeZ,blockSizeX,blockSizeY,blockSizeZ,dynSmemPerBlock,staSmemPerBlock,registerPerThread,streamID,localblocksize,memTransferSize,memtransferhostmemtype,branch,divergent_branch,instructions,warp_serialize,cta_launched,local_load,local_store,gld_32b,gld_64b,gld_128b,gst_32b,gst_64b,gst_128b,gld_request,gst_request,tex_cache_hit,tex_cache_miss +1229865247212140,memcpyHtoA,177.856,336,,,,,,,,,,,0,0,1048576,0 +1229865247279f80,_Z19kernel_calcul_pathsPtj,47.808,13,0.031,1,1,13486368,1,1,1,0,32,10,0,-1,,0,168,0,1834,0,1,0,0,108,0,0,72,0,0,108,72,0,0 +1229865247286460,memcpyDtoA,4.352,6,,,,,,,,,,,0,0,144,0 +122986524728b1e0,_Z29kernel_init_estim_from_img_inPjjjj,1600.64,8,0.5,64,64,13,8,8,1,0,48,12,0,-1,,0,78756,10,478914,0,409,0,0,0,0,0,12416,0,0,0,266,625958,89992 +122986524741da80,memcpyDtoA,47.872,6,,,,,,,,,,,0,0,1048576,0 +122986524742eea0,_Z24kernel_levelines_texturePjjjj,1925.92,9,0.5,64,64,12923796,8,8,1,144,48,16,0,-1,,0,49113,148,716401,1106,410,0,0,0,0,0,12802,0,0,0,272,184897,54656 +1229865247605c40,memcpyDtoA,47.68,4,,,,,,,,,,,0,0,1048576,0 +1229865247615220,_Z30kernel_estim_next_step_texturePjjjj,479.392,6,0.5,64,64,1,8,8,1,0,48,9,0,-1,,0,816,0,15510,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636 +122986524768ad20,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0 +1229865247698320,_Z24kernel_levelines_texturePjjjj,1940.35,4,0.5,64,64,14781808,8,8,1,144,48,16,0,-1,,0,49113,149,716405,1062,410,0,0,0,0,0,12804,0,0,0,272,185864,53908 +12298652478728e0,memcpyDtoA,47.616,4,,,,,,,,,,,0,0,1048576,0 +122986524787ff60,_Z30kernel_estim_next_step_texturePjjjj,472.448,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15511,0,410,0,0,0,0,0,13120,0,0,0,272,4920,1640 +12298652478f3f40,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0 +1229865247901580,_Z24kernel_levelines_texturePjjjj,1938.46,4,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,49113,147,716400,1080,409,0,0,0,0,0,12800,0,0,0,272,184958,54513 +1229865247adb480,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0 +1229865247ae8ae0,_Z30kernel_estim_next_step_texturePjjjj,472.8,3,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15511,0,410,0,0,0,0,0,13120,0,0,0,272,4920,1640 +1229865247b5cc20,memcpyDtoA,47.616,4,,,,,,,,,,,0,0,1048576,0 +1229865247b6a240,_Z24kernel_levelines_texturePjjjj,1946.3,3,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,49113,149,716406,1089,409,0,0,0,0,0,12788,0,0,0,272,184803,54668 +1229865247d45f40,memcpyDtoA,47.552,4,,,,,,,,,,,0,0,1048576,0 +1229865247d53520,_Z30kernel_estim_next_step_texturePjjjj,468.192,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15512,0,410,0,0,0,0,0,13120,0,0,0,272,4920,1640 +1229865247dc6460,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0 +1229865247dd3aa0,_Z24kernel_levelines_texturePjjjj,1931.3,4,0.5,64,64,14780256,8,8,1,144,48,16,0,-1,,0,49113,147,716403,1082,410,0,0,0,0,0,12815,0,0,0,272,185389,54456 +1229865247fabd20,memcpyDtoA,47.552,3,,,,,,,,,,,0,0,1048576,0 +1229865247fb9300,_Z30kernel_estim_next_step_texturePjjjj,468.128,3,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15511,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636 +122986524802c220,memcpyDtoA,47.552,4,,,,,,,,,,,0,0,1048576,0 +12298652480397e0,_Z24kernel_levelines_texturePjjjj,1923.81,3,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,48941,149,713807,1061,410,0,0,0,0,0,12802,0,0,0,271,184460,55093 +122986524820fd20,memcpyDtoA,47.584,4,,,,,,,,,,,0,0,1048576,0 +122986524821d340,_Z30kernel_estim_next_step_texturePjjjj,482.88,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15510,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636 +1229865248293bc0,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0 +12298652482a11a0,_Z24kernel_levelines_texturePjjjj,1931.9,3,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,48941,147,713807,1039,410,0,0,0,0,0,12804,0,0,0,271,186162,53610 +12298652484796a0,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0 +1229865248486c80,_Z30kernel_estim_next_step_texturePjjjj,478.944,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15625,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640 +12298652484fc5a0,memcpyDtoA,47.68,4,,,,,,,,,,,0,0,1048576,0 +1229865248509c20,_Z24kernel_levelines_texturePjjjj,1942.02,4,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,48752,146,711135,1071,409,0,0,0,0,0,12800,0,0,0,270,184874,54597 +12298652486e48a0,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0 +12298652486f1ee0,_Z30kernel_estim_next_step_texturePjjjj,479.84,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15624,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640 +1229865248767b80,memcpyDtoA,47.552,3,,,,,,,,,,,0,0,1048576,0 +1229865248775160,_Z24kernel_levelines_texturePjjjj,1925.86,4,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,48752,146,711137,1052,409,0,0,0,0,0,12788,0,0,0,270,184892,54579 +122986524894be80,memcpyDtoA,47.584,4,,,,,,,,,,,0,0,1048576,0 +12298652489594c0,_Z30kernel_estim_next_step_texturePjjjj,473.632,3,0.5,64,64,32767,8,8,1,0,48,9,0,-1,,0,822,0,15624,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640 +12298652489cd920,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0 +12298652489dafc0,_Z24kernel_levelines_texturePjjjj,1946.14,4,0.5,64,64,144,8,8,1,144,48,16,0,-1,,0,48752,146,711139,1029,410,0,0,0,0,0,12815,0,0,0,270,185136,54709 +1229865248bb6c40,memcpyDtoA,47.712,4,,,,,,,,,,,0,0,1048576,0 +1229865248bc4320,_Z30kernel_estim_next_step_texturePjjjj,470.08,4,0.5,64,64,48,8,8,1,0,48,9,0,-1,,0,822,0,15624,0,409,0,0,0,0,0,13088,0,0,0,274,4908,1636 +1229865248c379c0,memcpyDtoA,47.552,4,,,,,,,,,,,0,0,1048576,0 +1229865248c44fe0,_Z24kernel_levelines_texturePjjjj,1932.83,4,0.5,64,64,13964298,8,8,1,144,48,16,0,-1,,0,48752,146,711138,1048,410,0,0,0,0,0,12802,0,0,0,270,184798,54755 +1229865248e1d840,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0 +1229865248e2aea0,_Z30kernel_estim_next_step_texturePjjjj,467.36,4,0.5,64,64,7602289,8,8,1,0,48,9,0,-1,,0,822,0,15626,0,409,0,0,0,0,0,13088,0,0,0,274,4908,1636 +1229865248e9daa0,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0 +1229865248eab120,_Z24kernel_levelines_texturePjjjj,1921.12,4,0.5,64,64,6226030,8,8,1,144,48,16,0,-1,,0,48752,146,711133,1089,410,0,0,0,0,0,12804,0,0,0,270,185686,54086 +1229865249080bc0,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0 +122986524908e240,_Z30kernel_estim_next_step_texturePjjjj,467.456,4,0.5,64,64,3145780,8,8,1,0,48,9,0,-1,,0,822,0,15626,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640 +1229865249100e80,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0 +122986524910e480,_Z24kernel_levelines_texturePjjjj,1931.62,3,0.5,64,64,3670071,8,8,1,144,48,16,0,-1,,0,48752,146,711135,1087,409,0,0,0,0,0,12800,0,0,0,270,185448,54023 +12298652492e6840,memcpyDtoA,47.616,4,,,,,,,,,,,0,0,1048576,0 +12298652492f3ea0,_Z30kernel_estim_next_step_texturePjjjj,485.568,4,0.5,64,64,3145776,8,8,1,0,48,9,0,-1,,0,822,0,15626,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640 +122986524936b1c0,memcpyDtoA,47.552,4,,,,,,,,,,,0,0,1048576,0 +12298652493787c0,_Z24kernel_levelines_texturePjjjj,1927.55,4,0.5,64,64,6225971,8,8,1,144,48,16,0,-1,,0,48752,146,711136,1043,409,0,0,0,0,0,12788,0,0,0,270,185851,53620 +122986524954fb80,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0 +122986524955d1e0,_Z30kernel_estim_next_step_texturePjjjj,475.648,3,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15626,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640 +12298652495d1e40,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0 +12298652495df480,_Z24kernel_levelines_texturePjjjj,1971.1,4,0.5,64,64,64,8,8,1,144,48,16,0,-1,,0,49113,150,716402,1074,410,0,0,0,0,0,12815,0,0,0,272,185344,54501 +12298652497c1280,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0 +12298652497ce8a0,_Z30kernel_estim_next_step_texturePjjjj,476.032,3,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15510,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636 +122986524988d200,memcpyDtoH,191.296,740,,,,,,,,,,,0,0,1048576,0 diff --git a/lniv_smemPaths_Context_0.csv b/lniv_smemPaths_Context_0.csv new file mode 100644 index 0000000..359a966 --- /dev/null +++ b/lniv_smemPaths_Context_0.csv @@ -0,0 +1,70 @@ +# CUDA_PROFILE_LOG_VERSION 2.0 +# CUDA_PROFILE_CSV 1 +# TIMESTAMPFACTOR fffff6ef6cfa6038 +# CUDA_DEVICE 0 Tesla C1060 +gpustarttimestamp,method,gputime,cputime,occupancy,gridSizeX,gridSizeY,gridSizeZ,blockSizeX,blockSizeY,blockSizeZ,dynSmemPerBlock,staSmemPerBlock,registerPerThread,streamID,localblocksize,memTransferSize,memtransferhostmemtype,branch,divergent_branch,instructions,warp_serialize,cta_launched,local_load,local_store,gld_32b,gld_64b,gld_128b,gst_32b,gst_64b,gst_128b,gld_request,gst_request,tex_cache_hit,tex_cache_miss +122985da5f305f60,memcpyHtoA,177.92,331,,,,,,,,,,,0,0,1048576,0 +122985da5f36ee60,_Z19kernel_calcul_pathsPtj,47.808,14,0.031,1,1,13732448,1,1,1,0,32,10,0,-1,,0,168,0,1834,0,1,0,0,108,0,0,72,0,0,108,72,0,0 +122985da5f37b380,memcpyDtoA,4.32,5,,,,,,,,,,,0,0,144,0 +122985da5f3800e0,_Z29kernel_init_estim_from_img_inPjjjj,1586.94,8,0.5,64,64,8,8,8,1,0,48,12,0,-1,,0,78756,10,478915,0,409,0,0,0,0,0,12416,0,0,0,266,620103,95847 +122985da5f50d800,memcpyDtoA,48,6,,,,,,,,,,,0,0,1048576,0 +122985da5f51ebc0,_Z24kernel_levelines_texturePjjjj,1930.4,9,0.5,64,64,12923760,8,8,1,144,48,16,0,-1,,0,49113,148,690702,1881,410,0,0,0,0,0,12802,0,0,0,272,236380,3173 +122985da5f6f6ae0,memcpyDtoA,47.616,3,,,,,,,,,,,0,0,1048576,0 +122985da5f706100,_Z30kernel_estim_next_step_texturePjjjj,466.528,6,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15511,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636 +122985da5f7789e0,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0 +122985da5f786060,_Z24kernel_levelines_texturePjjjj,1930.21,4,0.5,64,64,-64,8,8,1,144,48,16,0,-1,,0,49113,149,690700,1877,410,0,0,0,0,0,12804,0,0,0,272,236557,3215 +122985da5f95dec0,memcpyDtoA,47.584,4,,,,,,,,,,,0,0,1048576,0 +122985da5f96b520,_Z30kernel_estim_next_step_texturePjjjj,472.32,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15511,0,410,0,0,0,0,0,13120,0,0,0,272,4920,1640 +122985da5f9df480,memcpyDtoA,47.68,4,,,,,,,,,,,0,0,1048576,0 +122985da5f9ecac0,_Z24kernel_levelines_texturePjjjj,1921.98,4,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,49113,147,690706,1882,409,0,0,0,0,0,12800,0,0,0,272,236235,3236 +122985da5fbc28c0,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0 +122985da5fbcfea0,_Z30kernel_estim_next_step_texturePjjjj,465.536,4,0.5,64,64,-64,8,8,1,0,48,9,0,-1,,0,816,0,15513,0,410,0,0,0,0,0,13120,0,0,0,272,4920,1640 +122985da5fc42380,memcpyDtoA,47.584,3,,,,,,,,,,,0,0,1048576,0 +122985da5fc4f980,_Z24kernel_levelines_texturePjjjj,1916.54,4,0.5,64,64,8057472,8,8,1,144,48,16,0,-1,,0,49113,149,690708,1895,409,0,0,0,0,0,12788,0,0,0,272,236073,3398 +122985da5fe24300,memcpyDtoA,47.552,3,,,,,,,,,,,0,0,1048576,0 +122985da5fe318e0,_Z30kernel_estim_next_step_texturePjjjj,470.784,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15510,0,410,0,0,0,0,0,13120,0,0,0,272,4920,1640 +122985da5fea5280,memcpyDtoA,47.712,3,,,,,,,,,,,0,0,1048576,0 +122985da5feb2900,_Z24kernel_levelines_texturePjjjj,1916.61,4,0.5,64,64,144,8,8,1,144,48,16,0,-1,,0,49113,147,690703,1870,410,0,0,0,0,0,12815,0,0,0,272,236510,3335 +122985da60087220,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0 +122985da60094840,_Z30kernel_estim_next_step_texturePjjjj,465.344,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15510,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636 +122985da60106c60,memcpyDtoA,47.776,3,,,,,,,,,,,0,0,1048576,0 +122985da60114320,_Z24kernel_levelines_texturePjjjj,1914.43,4,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,48941,149,688205,1889,410,0,0,0,0,0,12802,0,0,0,271,236103,3450 +122985da602e83c0,memcpyDtoA,47.616,3,,,,,,,,,,,0,0,1048576,0 +122985da602f5a00,_Z30kernel_estim_next_step_texturePjjjj,477.952,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15511,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636 +122985da6036af60,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0 +122985da603785a0,_Z24kernel_levelines_texturePjjjj,1909.12,3,0.5,64,64,6881375,8,8,1,144,48,16,0,-1,,0,48941,147,688211,1849,410,0,0,0,0,0,12804,0,0,0,271,236529,3243 +122985da6054b180,memcpyDtoA,47.616,7,,,,,,,,,,,0,0,1048576,0 +122985da605587e0,_Z30kernel_estim_next_step_texturePjjjj,474.72,4,0.5,64,64,3276855,8,8,1,0,48,9,0,-1,,0,822,0,15624,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640 +122985da605cd080,memcpyDtoA,47.552,3,,,,,,,,,,,0,0,1048576,0 +122985da605da680,_Z24kernel_levelines_texturePjjjj,1920.1,4,0.5,64,64,3145783,8,8,1,144,48,16,0,-1,,0,48752,146,685626,1884,409,0,0,0,0,0,12800,0,0,0,270,235824,3647 +122985da607afd40,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0 +122985da607bd3a0,_Z30kernel_estim_next_step_texturePjjjj,490.048,4,0.5,64,64,3604536,8,8,1,0,48,9,0,-1,,0,822,0,15624,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640 +122985da60835860,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0 +122985da60842ea0,_Z24kernel_levelines_texturePjjjj,1915.78,4,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,48752,146,685627,1911,409,0,0,0,0,0,12788,0,0,0,270,236322,3149 +122985da60a174c0,memcpyDtoA,47.584,4,,,,,,,,,,,0,0,1048576,0 +122985da60a24aa0,_Z30kernel_estim_next_step_texturePjjjj,480.128,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15624,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640 +122985da60a9a8a0,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0 +122985da60aa7ec0,_Z24kernel_levelines_texturePjjjj,1911.39,3,0.5,64,64,48,8,8,1,144,48,16,0,-1,,0,48752,146,685632,1882,410,0,0,0,0,0,12815,0,0,0,270,236367,3478 +122985da60c7b3a0,memcpyDtoA,47.616,4,,,,,,,,,,,0,0,1048576,0 +122985da60c889e0,_Z30kernel_estim_next_step_texturePjjjj,476.992,3,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15624,0,409,0,0,0,0,0,13088,0,0,0,274,4908,1636 +122985da60cfdb80,memcpyDtoA,47.552,4,,,,,,,,,,,0,0,1048576,0 +122985da60d0b160,_Z24kernel_levelines_texturePjjjj,1912.67,4,0.5,64,64,9532554,8,8,1,144,48,16,0,-1,,0,48752,146,685631,1906,410,0,0,0,0,0,12802,0,0,0,270,236347,3206 +122985da60edeb00,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0 +122985da60eec160,_Z30kernel_estim_next_step_texturePjjjj,486.112,3,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15627,0,409,0,0,0,0,0,13088,0,0,0,274,4908,1636 +122985da60f63660,memcpyDtoA,47.584,4,,,,,,,,,,,0,0,1048576,0 +122985da60f70c40,_Z24kernel_levelines_texturePjjjj,1913.22,3,0.5,64,64,97,8,8,1,144,48,16,0,-1,,0,48752,146,685626,1867,410,0,0,0,0,0,12804,0,0,0,270,236389,3383 +122985da61144800,memcpyDtoA,47.648,3,,,,,,,,,,,0,0,1048576,0 +122985da61151e40,_Z30kernel_estim_next_step_texturePjjjj,487.296,3,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15626,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640 +122985da611c9820,memcpyDtoA,47.584,4,,,,,,,,,,,0,0,1048576,0 +122985da611d6e40,_Z24kernel_levelines_texturePjjjj,1912.86,4,0.5,64,64,0,8,8,1,144,48,16,0,-1,,0,48752,146,685627,1873,409,0,0,0,0,0,12800,0,0,0,270,236113,3358 +122985da613aa8e0,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0 +122985da613b7f40,_Z30kernel_estim_next_step_texturePjjjj,480.544,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15626,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640 +122985da6142dea0,memcpyDtoA,47.68,3,,,,,,,,,,,0,0,1048576,0 +122985da6143b500,_Z24kernel_levelines_texturePjjjj,1914.14,4,0.5,64,64,12032,8,8,1,144,48,16,0,-1,,0,48752,146,685630,1884,409,0,0,0,0,0,12788,0,0,0,270,235919,3552 +122985da6160f480,memcpyDtoA,47.744,3,,,,,,,,,,,0,0,1048576,0 +122985da6161cb60,_Z30kernel_estim_next_step_texturePjjjj,488.672,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,822,0,15627,0,410,0,0,0,0,0,13120,0,0,0,274,4920,1640 +122985da61694a80,memcpyDtoA,47.616,4,,,,,,,,,,,0,0,1048576,0 +122985da616a20a0,_Z24kernel_levelines_texturePjjjj,1930.24,3,0.5,64,64,12032,8,8,1,144,48,16,0,-1,,0,49113,150,690707,1868,410,0,0,0,0,0,12815,0,0,0,272,236723,3122 +122985da61879f20,memcpyDtoA,47.584,4,,,,,,,,,,,0,0,1048576,0 +122985da61887520,_Z30kernel_estim_next_step_texturePjjjj,476,4,0.5,64,64,0,8,8,1,0,48,9,0,-1,,0,816,0,15512,0,409,0,0,0,0,0,13088,0,0,0,272,4908,1636 +122985da619422a0,memcpyDtoH,191.328,755,,,,,,,,,,,0,0,1048576,0 diff --git a/main.cu b/main.cu index 0bb7c3d..63bf259 100644 --- a/main.cu +++ b/main.cu @@ -53,7 +53,7 @@ int main(int argc, char **argv){ cutilCheckError( cutStopTimer(timer) ); size = H * L * sizeof( unsigned int ); - psize = (r-1)*PSIZE_I*sizeof(int2) ; + psize = (r-1)*PSIZE_I*sizeof(ushort) ; printf("Longueur des chemins = %d pixels\n", r); printf("Init GPU + Image %s %d x %d = %d pixels en %f ms,\n", image_path, L, H, size, cutGetTimerValue(timer)); @@ -67,7 +67,7 @@ int main(int argc, char **argv){ // allocation mem GPU unsigned int * d_directions =NULL ; unsigned int * d_lniv, * d_estim = NULL ; - int2 * d_paths ; + ushort * d_paths ; cutilSafeCall( cudaMalloc( (void**) &d_directions, size)) ; cutilSafeCall( cudaMalloc( (void**) &d_lniv, size ) ); @@ -77,7 +77,8 @@ int main(int argc, char **argv){ // allocate array and copy image data cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned); - cudaChannelFormatDesc channelDescP = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned); + //cudaChannelFormatDesc channelDescP = cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSigned); + cudaChannelFormatDesc channelDescP = cudaCreateChannelDesc(); cudaArray * array_img_in, *array_img_estim, *array_img_lniv, *array_paths; cutilSafeCall( cudaMallocArray( &array_img_in, &channelDesc, L, H )); @@ -116,7 +117,7 @@ int main(int argc, char **argv){ /***************************** * APPELS KERNELS et chronos *****************************/ - dimBlock = dim3(16,16,1) ; + dimBlock = dim3(8,8,1) ; dimGrid = dim3( H / dimBlock.x,L / dimBlock.y, 1 ) ; // pour enregistrement image lniv GPU @@ -151,7 +152,8 @@ int main(int argc, char **argv){ for ( iter =0 ; iter < nb_iter ; iter++ ) { cutilSafeCall( cudaMemcpyToArray( array_img_estim, 0, 0, d_estim, size, cudaMemcpyDeviceToDevice)) ; - kernel_levelines_texture_smem<<< dimGrid, dimBlock, 0 >>>( d_lniv, L, H, r ); + //version avec/sans tentative d'utilisation de smem ( pas probante ) + kernel_levelines_texture<<< dimGrid, dimBlock, 24*(r-1)*sizeof(short) >>>( d_lniv, L, H, r ); cutilSafeCall( cudaMemcpyToArray( array_img_lniv, 0, 0, d_lniv, size, cudaMemcpyDeviceToDevice)) ; if (seq_out){ sprintf(image_out, "%s%d%s", image_out_base, iter+1, pgm_ext) ; @@ -175,10 +177,10 @@ int main(int argc, char **argv){ /************************************************** * recuperation matrice des chemins pour affichage **************************************************/ - + /* int2 * h_paths = new int2[(r-1)*PSIZE_I] ; cutilSafeCall( cudaMemcpyFromArray(h_paths , array_paths, 0, 0, psize, cudaMemcpyDeviceToHost) ); - /* + //verif Di printf("matrice Di"); for(int idpath=0; idpath< PSIZE_I; idpath++){b diff --git a/obj/release/main.cu.o b/obj/release/main.cu.o index 20ef00a..9c75383 100644 Binary files a/obj/release/main.cu.o and b/obj/release/main.cu.o differ