From: Gilles Perrot Date: Fri, 10 Jun 2011 11:58:57 +0000 (+0200) Subject: generation de chemins de taille parametrable X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/lniv_gpu.git/commitdiff_plain/refs/heads/genPaths?ds=inline generation de chemins de taille parametrable --- diff --git a/.gitignore b/.gitignore index e69de29..94c2ef1 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1 @@ +obj/*.o \ No newline at end of file diff --git a/Makefile b/Makefile index 2801dda..20361ca 100644 --- a/Makefile +++ b/Makefile @@ -34,9 +34,9 @@ ################################################################################ # Add source files here -EXECUTABLE := levelines +EXECUTABLE := lniv # CUDA source files (compiled with cudacc) -CUFILES := main_gmem.cu +CUFILES := main.cu # CUDA dependency files CU_DEPS := levelines_common.h # C/C++ source files (compiled with gcc / c++) diff --git a/image_cpu.pgm b/image_cpu.pgm new file mode 100644 index 0000000..309cc8a Binary files /dev/null and b/image_cpu.pgm differ diff --git a/image_out.pgm b/image_out.pgm new file mode 100644 index 0000000..690310e Binary files /dev/null and b/image_out.pgm differ diff --git a/levelines_common.c b/levelines_common.c new file mode 100644 index 0000000..bb77107 --- /dev/null +++ b/levelines_common.c @@ -0,0 +1,131 @@ +#include "levelines_common.h" + +unsigned int lniv4_value(unsigned int **image, int i, int j, int idim, int jdim, int *dout) +{ + unsigned int value_c ; + unsigned int value2_c ; + unsigned int d, v, p, d_min, eq_min, eq, sum, sum2 ; + unsigned int sum_eq_min ; + unsigned int it, jt ; + + + /* mem */ + value_c = image[ i*l + j ] ; + value2_c = value_c*value_c ; + + // direction d=0 + sum = value_c ; + sum2 = value2_c ; + it = i ; + jt = j ; + for (p=0; p<3; p++) + { + it += Di_Q1[0][p] ; + jt += Dj_Q1[0][p] ; + v = image[ it*l + jt ] ; + sum += v ; + sum2 += v*v ; + } + eq_min = sum2 - sum*sum/4 ; /* *4 */ + sum_eq_min = sum ; + d_min = 0 ; + + /* direction 1 a 5 */ + for (d=1; d<6; d++) + { + sum = value_c ; + sum2 = value2_c ; + it = i ; + jt = j ; + for (p=0; p<3; p++) + { + it += Di_Q1[d][p] ; + jt += Dj_Q1[d][p] ; + v = image[it*l + jt] ; + sum += v ; + sum2 += v*v ; + } + eq = sum2 - sum*sum/4 ; /* *4 */ + if (eq < eq_min) + { + eq_min = eq ; + sum_eq_min = sum ; + d_min = d ; /* pour info */ + } + } + + /* direction 6 a 11 */ + for (d=0; d<6; d++) + { + sum = value_c ; + sum2 = value2_c ; + it = i ; + jt = j ; + for (p=0; p<3; p++) + { + it += Dj_Q1[d][p] ; + jt -= Di_Q1[d][p] ; + v = image[it*l + jt] ; + sum += v ; + sum2 += v*v ; + } + eq = sum2 - sum*sum/4 ; /* *4 */ + if (eq < eq_min) + { + eq_min = eq ; + sum_eq_min = sum ; + d_min = d+6 ; /* pour info */ + } + } + + /* direction 12 a 17 */ + for (d=0; d<6; d++) + { + sum = value_c ; + sum2 = value2_c ; + it = i ; + jt = j ; + for (p=0; p<3; p++) + { + it -= Di_Q1[d][p] ; + jt -= Dj_Q1[d][p] ; + v = image[it*l + jt] ; + sum += v ; + sum2 += v*v ; + } + eq = sum2 - sum*sum/4 ; /* *4 */ + if (eq < eq_min) + { + eq_min = eq ; + sum_eq_min = sum ; + d_min = d+12 ; /* pour info */ + } + } + + /* direction 18 a 23 */ + for (d=0; d<6; d++) + { + sum = value_c ; + sum2 = value2_c ; + it = i ; + jt = j ; + for (p=0; p<3; p++) + { + it -= Dj_Q1[d][p] ; + jt += Di_Q1[d][p] ; + v = image[it*l + jt] ; + sum += v ; + sum2 += v*v ; + } + eq = sum2 - sum*sum/4 ; /* *4 */ + if (eq < eq_min) + { + eq_min = eq ; + sum_eq_min = sum ; + d_min = d+18 ; /* pour info */ + } + } + + *dout = d_min ; + return sum_eq_min/4 ; +} diff --git a/levelines_common.h b/levelines_common.h index ddb0ea2..97d9bd7 100644 --- a/levelines_common.h +++ b/levelines_common.h @@ -3,17 +3,171 @@ #ifndef LEVELINES_COMMON_H #define LEVELINES_COMMON_H +#include -#include +/** + * \var Di_Q1 + * \brief table de coordonnees des segments pour le quadrant 1 + * + */ +const int Di_Q1[6][3] = + {{ -1, -1, -1}, /* 0 */ + { -1, -1, -1}, /* 1 */ + { -1, -1, -1}, /* 2 */ + { -1, -1, -1}, /* 3 */ + { -1, 0, -1}, /* 4 */ + { 0, -1, 0}}; /* 5 */ + +/** + * \var Dj_Q1 + * \brief table de coordonnees des segments pour le quadrant 1 + * + */ +const int Dj_Q1[6][3] = + {{ 0, 0, 0}, /* 0 */ + { 0, 1, 0}, /* 1 */ + { 1, 0, 1}, /* 2 */ + { 1, 1, 1}, /* 3 */ + { 1, 1, 1}, /* 4 */ + { 1, 1, 1}}; /* 5 */ //////////////////////////////////////////////////////////////////////////////// // Reference CPU functions //////////////////////////////////////////////////////////////////////////////// -//extern "C" void fonc(...); +extern "C" unsigned int lniv_cpu(unsigned int *image, int i, int j, int idim, int l, int *dout){ + + unsigned int value_c ; + unsigned int value2_c ; + unsigned int d, v, p, d_min, eq_min, eq, sum, sum2 ; + unsigned int sum_eq_min ; + unsigned int it, jt ; + + + /* mem */ + value_c = image[ i*l + j ] ; + value2_c = value_c*value_c ; + + // direction d=0 + sum = value_c ; + sum2 = value2_c ; + it = i ; + jt = j ; + for (p=0; p<3; p++) + { + it += Di_Q1[0][p] ; + jt += Dj_Q1[0][p] ; + v = image[ it*l + jt ] ; + sum += v ; + sum2 += v*v ; + } + eq_min = sum2 - sum*sum/4 ; /* *4 */ + sum_eq_min = sum ; + d_min = 0 ; + + /* direction 1 a 5 */ + for (d=1; d<6; d++) + { + sum = value_c ; + sum2 = value2_c ; + it = i ; + jt = j ; + for (p=0; p<3; p++) + { + it += Di_Q1[d][p] ; + jt += Dj_Q1[d][p] ; + v = image[it*l + jt] ; + sum += v ; + sum2 += v*v ; + } + eq = sum2 - sum*sum/4 ; /* *4 */ + if (eq < eq_min) + { + eq_min = eq ; + sum_eq_min = sum ; + d_min = d ; /* pour info */ + } + } + + /* direction 6 a 11 */ + for (d=0; d<6; d++) + { + sum = value_c ; + sum2 = value2_c ; + it = i ; + jt = j ; + for (p=0; p<3; p++) + { + it += Dj_Q1[d][p] ; + jt -= Di_Q1[d][p] ; + v = image[it*l + jt] ; + sum += v ; + sum2 += v*v ; + } + eq = sum2 - sum*sum/4 ; /* *4 */ + if (eq < eq_min) + { + eq_min = eq ; + sum_eq_min = sum ; + d_min = d+6 ; /* pour info */ + } + } + + /* direction 12 a 17 */ + for (d=0; d<6; d++) + { + sum = value_c ; + sum2 = value2_c ; + it = i ; + jt = j ; + for (p=0; p<3; p++) + { + it -= Di_Q1[d][p] ; + jt -= Dj_Q1[d][p] ; + v = image[it*l + jt] ; + sum += v ; + sum2 += v*v ; + } + eq = sum2 - sum*sum/4 ; /* *4 */ + if (eq < eq_min) + { + eq_min = eq ; + sum_eq_min = sum ; + d_min = d+12 ; /* pour info */ + } + } + + /* direction 18 a 23 */ + for (d=0; d<6; d++) + { + sum = value_c ; + sum2 = value2_c ; + it = i ; + jt = j ; + for (p=0; p<3; p++) + { + it -= Dj_Q1[d][p] ; + jt += Di_Q1[d][p] ; + v = image[it*l + jt] ; + sum += v ; + sum2 += v*v ; + } + eq = sum2 - sum*sum/4 ; /* *4 */ + if (eq < eq_min) + { + eq_min = eq ; + sum_eq_min = sum ; + d_min = d+18 ; /* pour info */ + } + } + + *dout = d_min ; + return sum_eq_min/4 ; + }; + //////////////////////////////////////////////////////////////////////////////// // GPU functions (in file.cu) diff --git a/levelines_kernels.cu b/levelines_kernels.cu index 491c600..754f3bf 100644 --- a/levelines_kernels.cu +++ b/levelines_kernels.cu @@ -6,65 +6,65 @@ __constant__ int pathDi[PSIZE_I][PSIZE_J-1] = { // Q1 - { -1, -1, -1}, // - { -1, -1, -1}, // - { -1, -1, -1}, // - { -1, -1, -1}, // - { -1, 0, 1}, // - { 0, -1, 0}, + { -1, -1, -1}, // 90 + { -1, -1, -1}, // 75 + { -1, -1, -1}, // 60 + { -1, -1, -1}, // 45 + { -1, 0, -1}, // 30 + { 0, -1, 0}, // 15 // Q4 - { 0, 0, 0}, // - { 0, 1, 1}, // - { 1, 0, 1}, // - { 1, 1, 1}, // - { 1, 1, 1}, // - { 1, 1, 1}, + { 0, 0, 0}, // 0 + { 0, 1, 0}, // 345 + { 1, 0, 1}, // 330 + { 1, 1, 1}, // 315 + { 1, 1, 1}, // 300 + { 1, 1, 1}, // 285 // Q3 - { 1, 1, 1}, // - { 1, 1, 1}, // - { 1, 1, 1}, // - { 1, 1, 1}, // - { 1, 0, -1}, // - { 0, 1, 0}, + { 1, 1, 1}, // 270 + { 1, 1, 1}, // 255 + { 1, 1, 1}, // 240 + { 1, 1, 1}, // 225 + { 1, 0, 1}, // 210 + { 0, 1, 0}, // 195 // Q2 - { 0, 0, 0}, // - { 0, -1, 0}, // - { -1, 0, -1}, // - { -1, -1, -1}, // - { -1, -1, -1}, // - { -1, -1, -1} + { 0, 0, 0}, // 180 + { 0, -1, 0}, // 165 + { -1, 0, -1}, // 150 + { -1, -1, -1}, // 135 + { -1, -1, -1}, // 120 + { -1, -1, -1} // 105 } ; // __constant__ int pathDj[PSIZE_I][PSIZE_J-1] = { // Q1 - { 0, 0, 0}, // - { 0, 1, 0}, - { 1, 0, 1}, - { 1, 1, 1}, - { 1, 1, 1}, - { 1, 1, 1}, + { 0, 0, 0}, // 90 + { 0, 1, 0}, // 75 + { 1, 0, 1}, // 60 + { 1, 1, 1}, // 45 + { 1, 1, 1}, // 30 + { 1, 1, 1}, // 15 // Q4 - { 1, 1, 1}, // - { 1, 1, 1}, - { 1, 1, 1}, - { 1, 1, 1}, - { 1, 0, -1}, - { 0, 1, 0}, - // Q3 - { 0, 0, 0}, // - { 0, -1, 0}, - { -1, 0, -1}, - { -1, -1, -1}, - { -1, -1, -1}, - { -1, -1, -1}, + { 1, 1, 1}, // 0 + { 1, 1, 1}, // 345 + { 1, 1, 1}, // 330 + { 1, 1, 1}, // 315 + { 1, 0, 1}, // 300 + { 0, 1, 0}, // 285 + // Q3 + { 0, 0, 0}, // 270 + { 0, -1, 0}, // 255 + { -1, 0, -1}, // 240 + { -1, -1, -1}, // 225 + { -1, -1, -1}, // 210 + { -1, -1, -1}, // 195 // Q2 - { -1, -1, -1}, // - { -1, -1, -1}, - { -1, -1, -1}, - { -1, -1, -1}, - { -1, 0, 1}, - { 0, -1, 0} + { -1, -1, -1}, // 180 + { -1, -1, -1}, // 165 + { -1, -1, -1}, // 150 + { -1, -1, -1}, // 135 + { -1, 0, -1}, // 120 + { 0, -1, 0} // 105 } ; @@ -74,6 +74,72 @@ texture tex_img_estim ; texture tex_img_lniv ; +__constant__ float tangente[] = {0.000, 0.268, 0.577, 1.000} ; + +__global__ void kernel_calcul_paths( int2 * d_paths, unsigned int r){ + + unsigned int idpath = 0 ; + int ic, jc, iprec, jprec ; + float offset = 0.5 ; + unsigned int basepath = 0 ; + + // Q1 inf + for (int a=0 ; a< 4 ; a++){ // les 4 angles 0,15,30 et 45 + for (int p=0 ; p< r ; p++){ // les r points + ic = r-1 - floor(tangente[a]*p + offset) ; + if ( p > 0 ){ + d_paths[idpath*(r-1)+p-1].x = ic - iprec ; + d_paths[idpath*(r-1)+p-1].y = 1 ; + } + iprec = ic ; + } + idpath++ ; + } + // Q1 sup + for (int a=2 ; a>0 ; a--){ // les 2 angles 60 et 75 + for (int p=0 ; p< r ; p++){ // les r points + jc = floor(tangente[a]*p + offset) ; + if ( p > 0 ){ + d_paths[idpath*(r-1)+p-1].x = -1 ; + d_paths[idpath*(r-1)+p-1].y = jc - jprec ; + } + jprec = jc ; + } + idpath++ ; + } + + // Q2 + basepath += 6 ; + for (int a=0 ; a< 6 ; a++){ // les 6 angles 90,105,120,135,150,165 + for (int p=0 ; plpath)&&(ilpath)&&(j=lpath )&&( i<= (H-lpath) )&&( j>=lpath )&&( j<=(L-lpath) ) ){ for( idpath=0; idpath < PSIZE_I ; idpath++) { ic = i ; jc = j ; @@ -225,7 +302,7 @@ __global__ void kernel_levelines_texture(unsigned int * img_out, unsigned int L, } // critere de selection du chemin ( SUM_(X2) - SUM_(X)2 / lpath ) // a ameliorer pour vitesse - mse_cur = ( mse.y - ( mse.x / lpath ) * mse.x ) ; + mse_cur = ( mse.y - ( mse.x*mse.x / lpath ) ) ; if (idpath == 0) { mse_min = mse_cur ; val = mse.x ; @@ -238,56 +315,61 @@ __global__ void kernel_levelines_texture(unsigned int * img_out, unsigned int L, } img_out[ i*L + j ] = val / lpath ; } + else img_out[ i*L + j ] = 0 ; } - - -__global__ void kernel_levelines_only_texture(unsigned int * img_out, unsigned int L, unsigned int H) +__global__ void kernel_dir_levelines_texture(unsigned int * dir, unsigned int L, unsigned int H) { // coordonnes du point dans l'image unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; - unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;; + unsigned int j = blockIdx.y*blockDim.y + threadIdx.y; //unsigned int spos = threadIdx.x * blockDim.y + threadIdx.y ; // nb de points par chemin int lpath = PSIZE_J ; - unsigned int ic, jc ; + unsigned int ic, jc, zc ; int idpath, idpix ; - unsigned int mse_min, mse_cur ; - - //extern __shared__ uint2 mse[] ; + unsigned int mse_min, mse_cur, val ; uint2 mse ; - if((i>lpath)&&(ilpath)&&(j=lpath )&&( i<= (H-lpath) )&&( j>=lpath )&&( j<=(L-lpath) ) ){ for( idpath=0; idpath < PSIZE_I ; idpath++) { ic = i ; jc = j ; - mse.x = tex2D(tex_img_in, i, j) ; - mse.y = tex2D(tex_img_in, i, j)*tex2D(tex_img_in, i, j) ; + zc = tex2D(tex_img_estim, j, i) ; + mse.x = zc ; + mse.y = zc*zc ; for( idpix=0; idpix < lpath-1 ; idpix++ ) { ic += pathDi[idpath][idpix] ; jc += pathDj[idpath][idpix] ; - mse.x += tex2D( tex_img_in, ic, jc ) ; - mse.y += tex2D( tex_img_in, ic, jc ) * tex2D( tex_img_in, ic, jc ) ; + zc = tex2D(tex_img_estim, jc, ic) ; + mse.x += zc ; + mse.y += zc*zc ; } // critere de selection du chemin ( SUM_(X2) - SUM_(X)2 / lpath ) // a ameliorer pour vitesse - mse_cur = ( mse.y - ( mse.x / lpath ) * mse.x ) ; - if (idpath > 0) { + mse_cur = ( mse.y - ( mse.x*mse.x / lpath ) ) ; + if (idpath == 0) { + mse_min = mse_cur ; + val = idpath ; + } else { if ( mse_cur < mse_min ) { mse_min = mse_cur ; + val = idpath ; } - } else { - mse_min = mse_cur ; - } + } } - img_out[ i*L + j ] = mse_min / lpath ; + dir[ i*L + j ] = val ; } + } -__global__ void kernel_trace_levelines(unsigned int * img_in, unsigned int * dir, unsigned int * img_out, + + +__global__ void kernel_trace_levelines_texture(unsigned int * dir, unsigned int * img_out, unsigned int L, unsigned int H, unsigned int pas, unsigned int ng){ // coordonnes du point dans l'image unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; @@ -298,9 +380,9 @@ __global__ void kernel_trace_levelines(unsigned int * img_in, unsigned int * dir unsigned int ic, jc, idpix ; unsigned int idpath ; - img_out[ i*L + j ] = img_in[ i*L + j ] ; + img_out[ i*L + j ] = tex2D(tex_img_estim, j, i ) ; - if ( !(i%pas+j%pas)&&(i>lpath)&&(ilpath)&&(j=lpath)&&(i<=H-lpath)&&(j>=lpath)&&(j<=L-lpath) ){ ic = i ; jc = j ; idpath = dir[ic*L+jc] ; diff --git a/main.cu b/main.cu index d9db4f8..b2c1681 100644 --- a/main.cu +++ b/main.cu @@ -16,181 +16,232 @@ #include "levelines_kernels.cu" +const float tang[] = {0, 0.268, 0.577, 1} ; -__global__ void kernel_debil(unsigned int * ptr1, unsigned int * ptr2, unsigned int L, int val){ - - unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; - unsigned int j = blockIdx.y*blockDim.y + threadIdx.y; - unsigned int pos = i*L +j ; - ptr2[pos] = val - ptr1[pos] ; +void genPaths(unsigned int *h_paths, int *p_i, int *p_j, unsigned int r ){ + unsigned int idpath = 0 ; + int ic, jc ; + float offset = 0.5 ; + + // Q1 inf + for (int a=0 ; a< 4 ; a++){ // les 4 angles 0,15,30 et 45 + for (int p=0 ; p< r ; p++){ // les r points + jc = p ; + ic = r-1 - floor(tang[a]*p + offset) ; + h_paths[ idpath*r*r + ic*r + jc ] = 255 ; + if ( p > 0 ){ + p_i[idpath*(r-1)+p-1] = ic ; + p_j[idpath*(r-1)+p-1] = jc ; + } + } + idpath++ ; + } + // Q1 sup + for (int a=2 ; a>0 ; a--){ // les 2 angles 60 et 75 + for (int p=0 ; p< r ; p++){ // les r points + ic = r-1 - p ; + jc = floor(tang[a]*p + offset) ; + h_paths[ idpath*r*r + ic*r + jc ] = 255 ; + if ( p > 0 ){ + p_i[idpath*(r-1)+p-1] = ic ; + p_j[idpath*(r-1)+p-1] = jc ; + } + } + idpath++ ; + } + + // Q2 inf + for (int a=0 ; a< 4 ; a++){ // les 4 angles 90,105,130 et 145 + for (int p=0 ; p< r ; p++){ // les r points + ic = r-1 - p ; + jc = r-1 - floor(tang[a]*p + offset) ; + h_paths[ idpath*r*r + ic*r + jc ] = 255 ; + if ( p > 0 ){ + p_i[idpath*(r-1)+p-1] = ic ; + p_j[idpath*(r-1)+p-1] = jc ; + } + } + idpath++ ; + } + // Q2 sup + for (int a=2 ; a>0 ; a--){ // les 2 angles 60 et 75 + for (int p=0 ; p< r ; p++){ // les r points + jc = r-1 - p ; + ic = r-1 - floor(tang[a]*p + offset) ; + h_paths[ idpath*r*r + ic*r + jc ] = 255 ; + if ( p > 0 ){ + p_i[idpath*(r-1)+p-1] = ic ; + p_j[idpath*(r-1)+p-1] = jc ; + } + } + idpath++ ; + } + + + // Q3 inf + for (int a=0 ; a< 4 ; a++){ // les 4 angles 90,105,130 et 145 + for (int p=0 ; p< r ; p++){ // les r points + jc = r-1 - p ; + ic = floor(tang[a]*p + offset) ; + h_paths[ idpath*r*r + ic*r + jc ] = 255 ; + if ( p > 0 ){ + p_i[idpath*(r-1)+p-1] = ic ; + p_j[idpath*(r-1)+p-1] = jc ; + } + } + idpath++ ; + } + // Q3 sup + for (int a=2 ; a>0 ; a--){ // les 2 angles 60 et 75 + for (int p=0 ; p< r ; p++){ // les r points + ic = p ; + jc = r-1 - floor(tang[a]*p + offset) ; + h_paths[ idpath*r*r + ic*r + jc ] = 255 ; + if ( p > 0 ){ + p_i[idpath*(r-1)+p-1] = ic ; + p_j[idpath*(r-1)+p-1] = jc ; + } + } + idpath++ ; + } + + + // Q4 inf + for (int a=0 ; a< 4 ; a++){ // les 4 angles 90,105,130 et 145 + for (int p=0 ; p< r ; p++){ // les r points + ic = p ; + jc = floor(tang[a]*p + offset) ; + h_paths[ idpath*r*r + ic*r + jc ] = 255 ; + if ( p > 0 ){ + p_i[idpath*(r-1)+p-1] = ic ; + p_j[idpath*(r-1)+p-1] = jc ; + } + } + idpath++ ; + } + // Q4 sup + for (int a=2 ; a>0 ; a--){ // les 2 angles 60 et 75 + for (int p=0 ; p< r ; p++){ // les r points + jc = p ; + ic = floor(tang[a]*p + offset) ; + h_paths[ idpath*r*r + ic*r + jc ] = 255 ; + if ( p > 0 ){ + p_i[idpath*(r-1)+p-1] = ic ; + p_j[idpath*(r-1)+p-1] = jc ; + } + } + idpath++ ; + } } -int main(int argc, char **argv){ +int main(int argc, char **argv){ - //float coef_regul = atof( argv[1] ) ; - + // use device with highest Gflops/s + cudaSetDevice( cutGetMaxGflopsDeviceId() ); unsigned int timer ; cutilCheckError( cutCreateTimer(&timer) ); cutilCheckError( cutResetTimer(timer) ); - /***************************** - * CHARGEMENT IMAGE - *****************************/ - char* image_path = argv[argc-1]; - char* image_out = "./image_out.pgm" ; - unsigned int * h_data = NULL ; - unsigned int * h_data_out = NULL ; - unsigned int H, L, size; - cutilCheckError( cutStartTimer(timer) ); - cutilCheckError( cutLoadPGMi(image_path, &h_data, &L, &H)); - cutilCheckError( cutStopTimer(timer) ); + // une alloc debile pour initialiser la carte GPU + int * d_void ; + cutilSafeCall( cudaMalloc( (void**) &d_void, 4)) ; - size = H * L * sizeof( unsigned int ); - printf("Loaded %d x %d = %d pixels from '%s' en %f ms,\n", L, H, size, image_path, cutGetTimerValue(timer)); - + + + /********************************* + * DEFINITION PARAMS CHEMINS + *********************************/ + char* image_out = "./image_out.pgm" ; + int *p_i, *p_j ; + int2 * d_paths ; + unsigned int * h_paths ; + unsigned int R = atoi(argv[1]); + + + //unsigned int size = R * R * sizeof( unsigned int ); - //essai alloc mapped - /* - cutilCheckError( cutResetTimer(timer) ); - cutilCheckError( cutStartTimer(timer) ); - unsigned int * h_ptr1, * d_ptr1 ; - unsigned int * h_ptr2, * d_ptr2 ; - int h = ; - int l = h ; - int mem = h*l*sizeof(unsigned int) ; - cutilSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost)); - cutilCheckError( cutStopTimer(timer) ); - printf("Temps set flag Mapped : %f ms\n", cutGetTimerValue(timer)) ; - cutilCheckError( cutStartTimer(timer) ); - cutilSafeCall(cudaHostAlloc((void **)&h_ptr1, mem, cudaHostAllocMapped)); - cutilSafeCall(cudaHostAlloc((void **)&h_ptr2, mem, cudaHostAllocMapped)); - cutilCheckError( cutStopTimer(timer) ); - printf("Temps cumul alloc Mapped : %f ms\n", cutGetTimerValue(timer)) ; + // allocation mem + int memsize = 24*(R-1)*sizeof(int2) ; + cutilSafeCall( cudaMalloc( (void**) &d_paths, memsize ) ); - for (int i = 0; i>>(d_ptr1, d_ptr2, l, 255) ; + for (int j=0; j<24*R*R ; j++) h_paths[j] = 0 ; - cutilCheckError( cutStopTimer(timer) ); - printf("Temps total Mapped : %f ms\n", cutGetTimerValue(timer)) ; + genPaths(h_paths, p_i, p_j, R) ; + + printf("Rayon : %d pixels \n", R) ; + + //matrice p_i + printf("P_I\n"); + for (int idpath=0; idpath < 24; idpath++){ + printf("\n"); + for (int idpix=0; idpix < R-1; idpix++){ + printf(" %d ", p_i[idpath*(R-1)+idpix]); + } + } + //matrice p_j + printf("\nP_J\n"); + for (int idpath=0; idpath < 24; idpath++){ + printf("\n"); + for (int idpix=0; idpix < R-1; idpix++){ + printf(" %d ", p_j[idpath*(R-1)+idpix]); + } + } - char * image_1 = "./image_1.pgm" ; - char * image_2 = "./image_2.pgm" ; - cutilCheckError( cutSavePGMi(image_1, h_ptr1, l, h) ) ; - cutilCheckError( cutSavePGMi(image_2, h_ptr2, l, h) ) ; - */ /***************************** - * FIN CHARGEMENT IMAGE + * APPELS KERNELS et chronos *****************************/ + dim3 dimBlock( 1, 1, 1 ) ; + dim3 dimGrid( 1, 1, 1 ) ; + + kernel_calcul_paths<<< dimGrid, dimBlock, 0 >>>(d_paths, R) ; - - // use device with highest Gflops/s - cudaSetDevice( cutGetMaxGflopsDeviceId() ); - + printf("\nGrille : %d x %d de Blocs : %d x %d \n", dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y) ; - /* - cutilSafeCall( cudaMallocArray(&a_Src, &floatTex, imageW, imageH) ); - cutilSafeCall( cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float)) ); - cutilSafeCall( cudaThreadSynchronize() ); - cutilCheckError( cutResetTimer(hTimer) ); - cutilCheckError( cutStartTimer(hTimer) ); - - cutilSafeCall( cudaThreadSynchronize() ); - cutilCheckError( cutStopTimer(hTimer) ); - gpuTime = cutGetTimerValue(hTimer) / (float)iterations; - */ - - cutilCheckError( cutResetTimer(timer) ); - cutilCheckError( cutStartTimer(timer) ); - // allocation mem GPU - unsigned int * d_directions =NULL ; - unsigned int * d_lniv, * d_estim = NULL ; - - cutilSafeCall( cudaMalloc( (void**) &d_directions, size)) ; - cutilSafeCall( cudaMalloc( (void**) &d_lniv, size ) ); - cutilSafeCall( cudaMalloc( (void**) &d_estim, size ) ); + + /************************** + * VERIFS + **************************/ + int2 * paths = new int2[24*(R-1)] ; + cutilSafeCall( cudaMemcpy(paths , d_paths, memsize, cudaMemcpyDeviceToHost) ); - // allocate array and copy image data - cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned); - cudaArray * array_img_in, *array_img_estim, *array_img_lniv; - cutilSafeCall( cudaMallocArray( &array_img_in, &channelDesc, L, H )); - cutilSafeCall( cudaMemcpyToArray( array_img_in, 0, 0, h_data, size, cudaMemcpyHostToDevice)) ; - cutilSafeCall( cudaBindTextureToArray( tex_img_in, array_img_in, channelDesc)); - cutilCheckError( cutStopTimer(timer) ); - - cutilSafeCall( cudaMallocArray( &array_img_estim, &channelDesc, L, H )); - cutilSafeCall( cudaBindTextureToArray( tex_img_estim, array_img_estim, channelDesc)); - - cutilSafeCall( cudaMallocArray( &array_img_lniv, &channelDesc, L, H )); - cutilSafeCall( cudaBindTextureToArray( tex_img_lniv, array_img_lniv, channelDesc)); + //matrice p_i + printf("P_I\n"); + for (int idpath=0; idpath < 24; idpath++){ + printf("\n"); + for (int idpix=0; idpix < R-1; idpix++){ + printf(" %d ", paths[idpath*(R-1)+idpix].x); + } + printf("\t // %d", 15*idpath); + } + //matrice p_j + printf("\nP_J\n"); + for (int idpath=0; idpath < 24; idpath++){ + printf("\n"); + for (int idpix=0; idpix < R-1; idpix++){ + printf(" %d ", paths[idpath*(R-1)+idpix].y); + } + printf("\t // %d", 15*idpath); + } - printf("Temps alloc + transferts en Textures : %f ms\n", cutGetTimerValue(timer)) ; - /***************************** - * APPELS KERNELS et chronos - *****************************/ - cutilCheckError( cutResetTimer(timer) ); - cutilCheckError( cutStartTimer(timer) ); + printf("\n"); - unsigned int iter , nb_iter = 15 ; - unsigned int poids = 15 ; - dim3 dimBlock(8,8,1) ; - dim3 dimGrid( H / dimBlock.x, L / dimBlock.y, 1 ) ; - unsigned int smem_size = dimBlock.x * dimBlock.y * sizeof(unsigned int) ; - // init image estimee avec image_in - kernel_init_estim_from_img_in<<< dimGrid, dimBlock, 0 >>>(d_estim, L, H, 7); - - printf("Grille : %d x %d de Blocs : %d x %d - Shared mem : %d octets\n", dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y, smem_size) ; - - for ( iter =0 ; iter < nb_iter ; iter++ ) - { - cutilSafeCall( cudaMemcpyToArray( array_img_estim, 0, 0, d_estim, size, cudaMemcpyDeviceToDevice)) ; - kernel_levelines_texture<<< dimGrid, dimBlock, 0 >>>( d_lniv, L, H ); - cutilSafeCall( cudaMemcpyToArray( array_img_lniv, 0, 0, d_lniv, size, cudaMemcpyDeviceToDevice)) ; - kernel_estim_next_step_texture<<< dimGrid, dimBlock, 0 >>>(d_estim, L, H, poids) ; - } - - cudaThreadSynchronize(); - - cutilCheckError( cutStopTimer(timer) ); - printf("Execution moy par kernel : %f ms\n", cutGetTimerValue(timer)/(float)nb_iter) ; - printf("Total pour %d kernels : %f ms\n", nb_iter, cutGetTimerValue(timer)) ; - - /************************** - * VERIFS - **************************/ - //trace des lniv sur grille de 'pas x pas' - //kernel_trace_levelines<<< dimGrid, dimBlock, 0 >>>(d_data, d_directions, d_data2, L, H, 16, 255) ; - //cudaThreadSynchronize(); - - // enregistrement image lniv GPU - h_data_out = new unsigned int[H*L] ; - if ( h_data_out != NULL) - cutilSafeCall( cudaMemcpy(h_data_out , d_estim, size, cudaMemcpyDeviceToHost) ); - else - printf("Echec allocation mem CPU\n"); - - cutilCheckError( cutSavePGMi(image_out, h_data_out, L, H) ) ; - - // calcul lniv CPU - - + // enregistrement image des PATHS + //cutilSafeCall( cudaMemcpy(h_paths , d_paths, size, cudaMemcpyDeviceToHost) ); + + cutilCheckError( cutSavePGMi(image_out, h_paths, R, 24*R) ) ; + // TODO verifier pourquoi les deux lignes suivantes produisent une erreur //cutilExit(argc, argv); //cudaThreadExit(); diff --git a/obj/release/main_gmem.cu.o b/obj/release/main_gmem.cu.o new file mode 100644 index 0000000..264c1c6 Binary files /dev/null and b/obj/release/main_gmem.cu.o differ