X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/lniv_gpu.git/blobdiff_plain/1a59c41c272e22861e759e660a64578b47147d85..refs/heads/pathVar:/main.cu diff --git a/main.cu b/main.cu index d9db4f8..0bb7c3d 100644 --- a/main.cu +++ b/main.cu @@ -13,127 +13,76 @@ // lib spec #include "defines.h" #include "levelines_common.h" - #include "levelines_kernels.cu" -__global__ void kernel_debil(unsigned int * ptr1, unsigned int * ptr2, unsigned int L, int val){ - - unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; - unsigned int j = blockIdx.y*blockDim.y + threadIdx.y; - unsigned int pos = i*L +j ; +int main(int argc, char **argv){ - ptr2[pos] = val - ptr1[pos] ; + // use device with highest Gflops/s + cudaSetDevice( 0 ); -} - -int main(int argc, char **argv){ + unsigned int timer ; + cutilCheckError( cutCreateTimer(&timer) ); + cutilCheckError( cutResetTimer(timer) ); + cutilCheckError( cutStartTimer(timer) ); + + //alloc bidon pour anticiper l'initialisation du GPU + short * d_bidon ; + cutilSafeCall( cudaMalloc( (void**) &d_bidon, sizeof(short))) ; //float coef_regul = atof( argv[1] ) ; - unsigned int timer ; - cutilCheckError( cutCreateTimer(&timer) ); - cutilCheckError( cutResetTimer(timer) ); /***************************** * CHARGEMENT IMAGE *****************************/ char* image_path = argv[argc-1]; - char* image_out = "./image_out.pgm" ; + unsigned int r = atoi(argv[1]) ; + bool seq_out = atoi(argv[2]) ; + unsigned int iter , nb_iter = atoi(argv[3]) ; + unsigned int poids = 15 ; + char * image_out_base = "./image_out" ; + char * pgm_ext = ".pgm" ; + char image_out[80] ; unsigned int * h_data = NULL ; unsigned int * h_data_out = NULL ; - unsigned int H, L, size; - - cutilCheckError( cutStartTimer(timer) ); - cutilCheckError( cutLoadPGMi(image_path, &h_data, &L, &H)); - cutilCheckError( cutStopTimer(timer) ); + unsigned int H, L, size, psize ; - size = H * L * sizeof( unsigned int ); - printf("Loaded %d x %d = %d pixels from '%s' en %f ms,\n", L, H, size, image_path, cutGetTimerValue(timer)); - - - //essai alloc mapped - /* - cutilCheckError( cutResetTimer(timer) ); - cutilCheckError( cutStartTimer(timer) ); - unsigned int * h_ptr1, * d_ptr1 ; - unsigned int * h_ptr2, * d_ptr2 ; - int h = ; - int l = h ; - int mem = h*l*sizeof(unsigned int) ; - cutilSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost)); - cutilCheckError( cutStopTimer(timer) ); - printf("Temps set flag Mapped : %f ms\n", cutGetTimerValue(timer)) ; - - cutilCheckError( cutStartTimer(timer) ); - cutilSafeCall(cudaHostAlloc((void **)&h_ptr1, mem, cudaHostAllocMapped)); - cutilSafeCall(cudaHostAlloc((void **)&h_ptr2, mem, cudaHostAllocMapped)); + // chargt image + cutilCheckError( cutLoadPGMi(image_path, &h_data, &L, &H)); cutilCheckError( cutStopTimer(timer) ); - printf("Temps cumul alloc Mapped : %f ms\n", cutGetTimerValue(timer)) ; - for (int i = 0; i>>(d_ptr1, d_ptr2, l, 255) ; + printf("Longueur des chemins = %d pixels\n", r); + printf("Init GPU + Image %s %d x %d = %d pixels en %f ms,\n", image_path, L, H, size, cutGetTimerValue(timer)); - cutilCheckError( cutStopTimer(timer) ); - printf("Temps total Mapped : %f ms\n", cutGetTimerValue(timer)) ; - - char * image_1 = "./image_1.pgm" ; - char * image_2 = "./image_2.pgm" ; - - cutilCheckError( cutSavePGMi(image_1, h_ptr1, l, h) ) ; - cutilCheckError( cutSavePGMi(image_2, h_ptr2, l, h) ) ; - */ /***************************** * FIN CHARGEMENT IMAGE *****************************/ - - - - // use device with highest Gflops/s - cudaSetDevice( cutGetMaxGflopsDeviceId() ); - - - /* - cutilSafeCall( cudaMallocArray(&a_Src, &floatTex, imageW, imageH) ); - cutilSafeCall( cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float)) ); - cutilSafeCall( cudaThreadSynchronize() ); - cutilCheckError( cutResetTimer(hTimer) ); - cutilCheckError( cutStartTimer(hTimer) ); - - cutilSafeCall( cudaThreadSynchronize() ); - cutilCheckError( cutStopTimer(hTimer) ); - gpuTime = cutGetTimerValue(hTimer) / (float)iterations; - */ cutilCheckError( cutResetTimer(timer) ); cutilCheckError( cutStartTimer(timer) ); // allocation mem GPU unsigned int * d_directions =NULL ; unsigned int * d_lniv, * d_estim = NULL ; + int2 * d_paths ; cutilSafeCall( cudaMalloc( (void**) &d_directions, size)) ; cutilSafeCall( cudaMalloc( (void**) &d_lniv, size ) ); cutilSafeCall( cudaMalloc( (void**) &d_estim, size ) ); + cutilSafeCall( cudaMalloc( (void**) &d_paths, psize ) ); // allocate array and copy image data cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned); - cudaArray * array_img_in, *array_img_estim, *array_img_lniv; + cudaChannelFormatDesc channelDescP = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindSigned); + + cudaArray * array_img_in, *array_img_estim, *array_img_lniv, *array_paths; cutilSafeCall( cudaMallocArray( &array_img_in, &channelDesc, L, H )); cutilSafeCall( cudaMemcpyToArray( array_img_in, 0, 0, h_data, size, cudaMemcpyHostToDevice)) ; cutilSafeCall( cudaBindTextureToArray( tex_img_in, array_img_in, channelDesc)); - cutilCheckError( cutStopTimer(timer) ); cutilSafeCall( cudaMallocArray( &array_img_estim, &channelDesc, L, H )); cutilSafeCall( cudaBindTextureToArray( tex_img_estim, array_img_estim, channelDesc)); @@ -141,28 +90,75 @@ int main(int argc, char **argv){ cutilSafeCall( cudaMallocArray( &array_img_lniv, &channelDesc, L, H )); cutilSafeCall( cudaBindTextureToArray( tex_img_lniv, array_img_lniv, channelDesc)); + cutilSafeCall( cudaMallocArray( &array_paths, &channelDescP, (r-1), PSIZE_I )); + cutilSafeCall( cudaBindTextureToArray( tex_paths, array_paths, channelDescP)); + + cutilCheckError( cutStopTimer(timer) ); printf("Temps alloc + transferts en Textures : %f ms\n", cutGetTimerValue(timer)) ; + /***************************** - * APPELS KERNELS et chronos + * GENERATION des CHEMINS *****************************/ cutilCheckError( cutResetTimer(timer) ); cutilCheckError( cutStartTimer(timer) ); - unsigned int iter , nb_iter = 15 ; - unsigned int poids = 15 ; - dim3 dimBlock(8,8,1) ; - dim3 dimGrid( H / dimBlock.x, L / dimBlock.y, 1 ) ; - unsigned int smem_size = dimBlock.x * dimBlock.y * sizeof(unsigned int) ; - // init image estimee avec image_in + dim3 dimBlock(1,1,1) ; + dim3 dimGrid(1,1,1) ; + // calcul des chemins + kernel_calcul_paths<<< dimGrid, dimBlock, 0 >>>(d_paths, r); + + // copie du tableau en texture + cutilSafeCall( cudaMemcpyToArray( array_paths, 0, 0, d_paths, psize, cudaMemcpyDeviceToDevice)) ; + + cutilCheckError( cutStopTimer(timer) ); + printf("Temps generation chemin + transfert en texture : %f ms\n", cutGetTimerValue(timer)) ; + + /***************************** + * APPELS KERNELS et chronos + *****************************/ + dimBlock = dim3(16,16,1) ; + dimGrid = dim3( H / dimBlock.x,L / dimBlock.y, 1 ) ; + + // pour enregistrement image lniv GPU + free(h_data_out) ; + h_data_out = new unsigned int[H*L] ; + + //init image estimee avec image_in + cutilCheckError( cutResetTimer(timer) ); + cutilCheckError( cutStartTimer(timer) ); kernel_init_estim_from_img_in<<< dimGrid, dimBlock, 0 >>>(d_estim, L, H, 7); - printf("Grille : %d x %d de Blocs : %d x %d - Shared mem : %d octets\n", dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y, smem_size) ; + // kernel de copie + /* + kernel_neutre_img2estim<<< dimGrid, dimBlock, 0>>>(d_estim, L, H); + */ + cudaThreadSynchronize() ; + cutilCheckError( cutStopTimer(timer) ); + printf("Temps kernel init : %f ms\n", cutGetTimerValue(timer)) ; + // a remplacer par + /* + cutilCheckError( cutResetTimer(timer) ); + cutilCheckError( cutStartTimer(timer) ); + cutilSafeCall( cudaMemcpyFromArray( d_estim, array_img_in, 0, 0, size, cudaMemcpyDeviceToDevice)) ; + cutilCheckError( cutStopTimer(timer) ); + printf("Temps memcpyFromArray : %f ms\n", cutGetTimerValue(timer)) ; + */ + printf("Grille : %d x %d de Blocs : %d x %d \n", dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y) ; + + cutilCheckError( cutResetTimer(timer) ); + cutilCheckError( cutStartTimer(timer) ); for ( iter =0 ; iter < nb_iter ; iter++ ) { cutilSafeCall( cudaMemcpyToArray( array_img_estim, 0, 0, d_estim, size, cudaMemcpyDeviceToDevice)) ; - kernel_levelines_texture<<< dimGrid, dimBlock, 0 >>>( d_lniv, L, H ); + kernel_levelines_texture_smem<<< dimGrid, dimBlock, 0 >>>( d_lniv, L, H, r ); cutilSafeCall( cudaMemcpyToArray( array_img_lniv, 0, 0, d_lniv, size, cudaMemcpyDeviceToDevice)) ; + if (seq_out){ + sprintf(image_out, "%s%d%s", image_out_base, iter+1, pgm_ext) ; + printf("chaine : %s\n", image_out); + cutilSafeCall( cudaMemcpy(h_data_out , d_estim, size, cudaMemcpyDeviceToHost) ); + cutilCheckError( cutSavePGMi(image_out, h_data_out, L, H) ) ; + } kernel_estim_next_step_texture<<< dimGrid, dimBlock, 0 >>>(d_estim, L, H, poids) ; } @@ -171,26 +167,95 @@ int main(int argc, char **argv){ cutilCheckError( cutStopTimer(timer) ); printf("Execution moy par kernel : %f ms\n", cutGetTimerValue(timer)/(float)nb_iter) ; printf("Total pour %d kernels : %f ms\n", nb_iter, cutGetTimerValue(timer)) ; - + /************************** * VERIFS **************************/ - //trace des lniv sur grille de 'pas x pas' - //kernel_trace_levelines<<< dimGrid, dimBlock, 0 >>>(d_data, d_directions, d_data2, L, H, 16, 255) ; - //cudaThreadSynchronize(); - - // enregistrement image lniv GPU - h_data_out = new unsigned int[H*L] ; - if ( h_data_out != NULL) - cutilSafeCall( cudaMemcpy(h_data_out , d_estim, size, cudaMemcpyDeviceToHost) ); - else - printf("Echec allocation mem CPU\n"); - cutilCheckError( cutSavePGMi(image_out, h_data_out, L, H) ) ; + /************************************************** + * recuperation matrice des chemins pour affichage + **************************************************/ + + int2 * h_paths = new int2[(r-1)*PSIZE_I] ; + cutilSafeCall( cudaMemcpyFromArray(h_paths , array_paths, 0, 0, psize, cudaMemcpyDeviceToHost) ); + /* + //verif Di + printf("matrice Di"); + for(int idpath=0; idpath< PSIZE_I; idpath++){b + + printf("\n"); + for(int idpix=0; idpix< r-1; idpix++){ + printf(" % d ", h_paths[idpath*(r-1) + idpix].x ); + } + printf("\t// %d°", idpath*15) ; + } + //verif Dj + printf("\nmatrice Dj"); + for(int idpath=0; idpath< PSIZE_I; idpath++){ + printf("\n"); + for(int idpix=0; idpix< r-1; idpix++){ + printf(" % d ", h_paths[idpath*(r-1) + idpix].y); + } + printf("\t// %d°", idpath*15) ; + } + printf("\n"); + */ + /*************************************************** + * fin verif visuelle matrices des chemins + ***************************************************/ - // calcul lniv CPU + /*************************************************** + * execution sequentielle pour comparaison + * la comparaison n'est pertinente que + * si d_lniv contient les lniv de l'image se départ + **************************************************/ + /* + // calcul sequentiel + cutilCheckError( cutResetTimer(timer) ); + cutilCheckError( cutStartTimer(timer) ); + h_data_out = new unsigned int[H*L] ; + int * dout = new int[H*L] ; + for ( iter =0 ; iter < nb_iter ; iter++ ){ + for (int i=r; i<= H-r; i++){ + for (int j=r; j<= L-r; j++){ + h_data_out[i*L+j] = lniv4_value(h_data, h_paths, i, j, H, L, &dout[i*L+j], r) ; + } + } + } + cutilCheckError( cutStopTimer(timer) ); + printf("Execution sequentielle CPU : %f ms\n", cutGetTimerValue(timer)) ; + // comparaison + unsigned int * h_lniv = new unsigned int[H*L] ; + int pos, cpt_err=0, cpt_pix=0 ; + cutilSafeCall( cudaMemcpy(h_lniv , d_lniv, size, cudaMemcpyDeviceToHost) ); + for ( iter =0 ; iter < nb_iter ; iter++ ){ + for (int i=r; i<= H-r; i++){ + for (int j=r; j<= L-r; j++){ + pos = i*L + j ; + if ( h_data_out[ pos ] != h_lniv[ pos ] ) { + cpt_err++ ; + printf(" pixel ( %d , %d ) -> GPU= %d CPU= %d \n", i, j, h_lniv[pos], h_data_out[pos]); + } + cpt_pix++ ; + } + } + } + printf("TAUX ERREUR GPU/CPU : %d / %d \n", cpt_err, cpt_pix ); + */ + //trace des lniv sur grille de 'pas x pas' + //kernel_trace_levelines<<< dimGrid, dimBlock, 0 >>>(d_data, d_directions, d_data2, L, H, 16, 255, r) ; + + if (!seq_out){ + if ( h_data_out != NULL) + cutilSafeCall( cudaMemcpy(h_data_out , d_estim, size, cudaMemcpyDeviceToHost) ); + else + printf("Echec allocation mem CPU\n"); + sprintf(image_out, "%s%d%s", image_out_base, iter+1, pgm_ext) ; + cutilCheckError( cutSavePGMi(image_out, h_data_out, L, H) ) ; + } + // TODO verifier pourquoi les deux lignes suivantes produisent une erreur //cutilExit(argc, argv); //cudaThreadExit();