// lib spec
#include "defines.h"
#include "levelines_common.h"
-
#include "levelines_kernels.cu"
-__global__ void kernel_debil(unsigned int * ptr1, unsigned int * ptr2, unsigned int L, int val){
-
- unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
- unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
- unsigned int pos = i*L +j ;
+int main(int argc, char **argv){
- ptr2[pos] = val - ptr1[pos] ;
+ // select GPU device 0
+ cudaSetDevice( 0 );
-}
-
-int main(int argc, char **argv){
+ unsigned int timer ;
+ cutilCheckError( cutCreateTimer(&timer) );
+ cutilCheckError( cutResetTimer(timer) );
+ cutilCheckError( cutStartTimer(timer) );
+
+ // dummy allocation to force GPU context initialization up front
+ short * d_bidon ;
+ cutilSafeCall( cudaMalloc( (void**) &d_bidon, sizeof(short))) ;
//float coef_regul = atof( argv[1] ) ;
- unsigned int timer ;
- cutilCheckError( cutCreateTimer(&timer) );
- cutilCheckError( cutResetTimer(timer) );
/*****************************
 * IMAGE LOADING
*****************************/
char* image_path = argv[argc-1];
- char* image_out = "./image_out.pgm" ;
+ unsigned int r = atoi(argv[1]) ;
+ bool seq_out = atoi(argv[2]) ;
+ unsigned int iter , nb_iter = atoi(argv[3]) ;
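+ // expected command line (inferred from the parsing above, no validation is performed):
+ //   argv[1] = r       : path length in pixels
+ //   argv[2] = seq_out : 1 to save the estimate after every iteration, 0 for the final image only
+ //   argv[3] = nb_iter : number of iterations
+ //   last argument     : input PGM image path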
+ unsigned int poids = 15 ;
+ const char * image_out_base = "./image_out" ;
+ const char * pgm_ext = ".pgm" ;
+ char image_out[80] ;
unsigned int * h_data = NULL ;
unsigned int * h_data_out = NULL ;
- unsigned int H, L, size;
-
- cutilCheckError( cutStartTimer(timer) );
- cutilCheckError( cutLoadPGMi(image_path, &h_data, &L, &H));
- cutilCheckError( cutStopTimer(timer) );
+ unsigned int H, L, size, psize ;
- size = H * L * sizeof( unsigned int );
- printf("Loaded %d x %d = %d pixels from '%s' en %f ms,\n", L, H, size, image_path, cutGetTimerValue(timer));
-
-
- //essai alloc mapped
- /*
- cutilCheckError( cutResetTimer(timer) );
- cutilCheckError( cutStartTimer(timer) );
- unsigned int * h_ptr1, * d_ptr1 ;
- unsigned int * h_ptr2, * d_ptr2 ;
- int h = ;
- int l = h ;
- int mem = h*l*sizeof(unsigned int) ;
- cutilSafeCall(cudaSetDeviceFlags(cudaDeviceMapHost));
- cutilCheckError( cutStopTimer(timer) );
- printf("Temps set flag Mapped : %f ms\n", cutGetTimerValue(timer)) ;
-
- cutilCheckError( cutStartTimer(timer) );
- cutilSafeCall(cudaHostAlloc((void **)&h_ptr1, mem, cudaHostAllocMapped));
- cutilSafeCall(cudaHostAlloc((void **)&h_ptr2, mem, cudaHostAllocMapped));
+ // load the input image
+ cutilCheckError( cutLoadPGMi(image_path, &h_data, &L, &H));
cutilCheckError( cutStopTimer(timer) );
- printf("Temps cumul alloc Mapped : %f ms\n", cutGetTimerValue(timer)) ;
- for (int i = 0; i<h*l ; i++) h_ptr1[i] = 200 ;
+ size = H * L * sizeof( unsigned int );
+ psize = (r-1)*PSIZE_I*sizeof(ushort) ;
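+ // psize: size in bytes of the path table, i.e. (r-1) offsets for each of the
+ // PSIZE_I precomputed directions (PSIZE_I is assumed to be defined in defines.h)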
- cutilCheckError( cutStartTimer(timer) );
- cutilSafeCall(cudaHostGetDevicePointer((void **)&d_ptr1, (void *)h_ptr1, 0));
- cutilSafeCall(cudaHostGetDevicePointer((void **)&d_ptr2, (void *)h_ptr2, 0));
- cutilCheckError( cutStopTimer(timer) );
- printf("Temps cumul get pointer Mapped : %f ms\n", cutGetTimerValue(timer)) ;
-
- cutilCheckError( cutStartTimer(timer) );
- dim3 blocks(16,16,1) ;
- dim3 grid( h / blocks.x, l / blocks.y, 1 ) ;
-
- kernel_debil<<< grid, blocks >>>(d_ptr1, d_ptr2, l, 255) ;
+ printf("Longueur des chemins = %d pixels\n", r);
+ printf("Init GPU + Image %s %d x %d = %d pixels en %f ms,\n", image_path, L, H, size, cutGetTimerValue(timer));
- cutilCheckError( cutStopTimer(timer) );
- printf("Temps total Mapped : %f ms\n", cutGetTimerValue(timer)) ;
-
- char * image_1 = "./image_1.pgm" ;
- char * image_2 = "./image_2.pgm" ;
-
- cutilCheckError( cutSavePGMi(image_1, h_ptr1, l, h) ) ;
- cutilCheckError( cutSavePGMi(image_2, h_ptr2, l, h) ) ;
- */
/*****************************
 * END IMAGE LOADING
*****************************/
-
-
-
- // use device with highest Gflops/s
- cudaSetDevice( cutGetMaxGflopsDeviceId() );
-
-
- /*
- cutilSafeCall( cudaMallocArray(&a_Src, &floatTex, imageW, imageH) );
- cutilSafeCall( cudaMalloc((void **)&d_Output, imageW * imageH * sizeof(float)) );
- cutilSafeCall( cudaThreadSynchronize() );
- cutilCheckError( cutResetTimer(hTimer) );
- cutilCheckError( cutStartTimer(hTimer) );
-
- cutilSafeCall( cudaThreadSynchronize() );
- cutilCheckError( cutStopTimer(hTimer) );
- gpuTime = cutGetTimerValue(hTimer) / (float)iterations;
- */
cutilCheckError( cutResetTimer(timer) );
cutilCheckError( cutStartTimer(timer) );
// GPU memory allocation
unsigned int * d_directions = NULL ;
unsigned int * d_lniv, * d_estim = NULL ;
+ ushort * d_paths ;
cutilSafeCall( cudaMalloc( (void**) &d_directions, size)) ;
cutilSafeCall( cudaMalloc( (void**) &d_lniv, size ) );
cutilSafeCall( cudaMalloc( (void**) &d_estim, size ) );
+ cutilSafeCall( cudaMalloc( (void**) &d_paths, psize ) );
// allocate array and copy image data
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindUnsigned);
- cudaArray * array_img_in, *array_img_estim, *array_img_lniv;
+ //cudaChannelFormatDesc channelDescP = cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSigned);
+ cudaChannelFormatDesc channelDescP = cudaCreateChannelDesc<ushort>();
+
+ cudaArray * array_img_in, *array_img_estim, *array_img_lniv, *array_paths;
cutilSafeCall( cudaMallocArray( &array_img_in, &channelDesc, L, H ));
cutilSafeCall( cudaMemcpyToArray( array_img_in, 0, 0, h_data, size, cudaMemcpyHostToDevice)) ;
cutilSafeCall( cudaBindTextureToArray( tex_img_in, array_img_in, channelDesc));
- cutilCheckError( cutStopTimer(timer) );
cutilSafeCall( cudaMallocArray( &array_img_estim, &channelDesc, L, H ));
cutilSafeCall( cudaBindTextureToArray( tex_img_estim, array_img_estim, channelDesc));
cutilSafeCall( cudaMallocArray( &array_img_lniv, &channelDesc, L, H ));
cutilSafeCall( cudaBindTextureToArray( tex_img_lniv, array_img_lniv, channelDesc));
+ cutilSafeCall( cudaMallocArray( &array_paths, &channelDescP, (r-1), PSIZE_I ));
+ cutilSafeCall( cudaBindTextureToArray( tex_paths, array_paths, channelDescP));
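+ // the path table is exposed as an (r-1) x PSIZE_I ushort texture (tex_paths),
+ // presumably so the level-line kernels can fetch path offsets through the texture cache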
+
+ cutilCheckError( cutStopTimer(timer) );
printf("Temps alloc + transferts en Textures : %f ms\n", cutGetTimerValue(timer)) ;
+
/*****************************
- * APPELS KERNELS et chronos
+ * PATH GENERATION
*****************************/
cutilCheckError( cutResetTimer(timer) );
cutilCheckError( cutStartTimer(timer) );
- unsigned int iter , nb_iter = 15 ;
- unsigned int poids = 15 ;
- dim3 dimBlock(8,8,1) ;
- dim3 dimGrid( H / dimBlock.x, L / dimBlock.y, 1 ) ;
- unsigned int smem_size = dimBlock.x * dimBlock.y * sizeof(unsigned int) ;
- // init image estimee avec image_in
+ dim3 dimBlock(1,1,1) ;
+ dim3 dimGrid(1,1,1) ;
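+ // single 1x1 launch: the path table is presumably small and computed only once,
+ // so no parallelism is needed here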
+ // compute the paths
+ kernel_calcul_paths<<< dimGrid, dimBlock, 0 >>>(d_paths, r);
+
+ // copy the path table into the texture array
+ cutilSafeCall( cudaMemcpyToArray( array_paths, 0, 0, d_paths, psize, cudaMemcpyDeviceToDevice)) ;
+
+ cutilCheckError( cutStopTimer(timer) );
+ printf("Temps generation chemin + transfert en texture : %f ms\n", cutGetTimerValue(timer)) ;
+
+ /*****************************
+ * KERNEL CALLS and timing
+ *****************************/
+ dimBlock = dim3(8,8,1) ;
+ dimGrid = dim3( H / dimBlock.x, L / dimBlock.y, 1 ) ;
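+ // note: H and L are assumed to be multiples of 8 here, otherwise the trailing
+ // rows/columns are not covered by the grid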
+
+ // buffer used to save the lniv image computed on the GPU
+ free(h_data_out) ;
+ h_data_out = new unsigned int[H*L] ;
+
+ // initialize the estimated image from image_in
+ cutilCheckError( cutResetTimer(timer) );
+ cutilCheckError( cutStartTimer(timer) );
kernel_init_estim_from_img_in<<< dimGrid, dimBlock, 0 >>>(d_estim, L, H, 7);
- printf("Grille : %d x %d de Blocs : %d x %d - Shared mem : %d octets\n", dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y, smem_size) ;
+ // copy kernel (alternative)
+ /*
+ kernel_neutre_img2estim<<< dimGrid, dimBlock, 0>>>(d_estim, L, H);
+ */
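+ // kernel launches are asynchronous: synchronize before stopping the timer so the
+ // measurement includes the kernel execution itself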
+ cudaThreadSynchronize() ;
+ cutilCheckError( cutStopTimer(timer) );
+ printf("Temps kernel init : %f ms\n", cutGetTimerValue(timer)) ;
+ // could be replaced by:
+ /*
+ cutilCheckError( cutResetTimer(timer) );
+ cutilCheckError( cutStartTimer(timer) );
+ cutilSafeCall( cudaMemcpyFromArray( d_estim, array_img_in, 0, 0, size, cudaMemcpyDeviceToDevice)) ;
+ cutilCheckError( cutStopTimer(timer) );
+ printf("Temps memcpyFromArray : %f ms\n", cutGetTimerValue(timer)) ;
+ */
+ printf("Grille : %d x %d de Blocs : %d x %d \n", dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y) ;
+
+ cutilCheckError( cutResetTimer(timer) );
+ cutilCheckError( cutStartTimer(timer) );
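+ // each iteration: publish the current estimate as a texture, compute the level
+ // lines into d_lniv, optionally dump the estimate to a numbered PGM, then update
+ // the estimate from the level lines with weight 'poids'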
for ( iter =0 ; iter < nb_iter ; iter++ )
{
cutilSafeCall( cudaMemcpyToArray( array_img_estim, 0, 0, d_estim, size, cudaMemcpyDeviceToDevice)) ;
- kernel_levelines_texture<<< dimGrid, dimBlock, 0 >>>( d_lniv, L, H );
+ // version with/without an attempt at using shared memory (not conclusive)
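+ // the third launch parameter reserves 24*(r-1)*sizeof(short) bytes of dynamic shared
+ // memory, presumably to cache path offsets per block; per the comment above the
+ // kernel may not actually use it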
+ kernel_levelines_texture<<< dimGrid, dimBlock, 24*(r-1)*sizeof(short) >>>( d_lniv, L, H, r );
cutilSafeCall( cudaMemcpyToArray( array_img_lniv, 0, 0, d_lniv, size, cudaMemcpyDeviceToDevice)) ;
+ if (seq_out){
+ sprintf(image_out, "%s%d%s", image_out_base, iter+1, pgm_ext) ;
+ printf("chaine : %s\n", image_out);
+ cutilSafeCall( cudaMemcpy(h_data_out , d_estim, size, cudaMemcpyDeviceToHost) );
+ cutilCheckError( cutSavePGMi(image_out, h_data_out, L, H) ) ;
+ }
kernel_estim_next_step_texture<<< dimGrid, dimBlock, 0 >>>(d_estim, L, H, poids) ;
}
cutilCheckError( cutStopTimer(timer) );
printf("Execution moy par kernel : %f ms\n", cutGetTimerValue(timer)/(float)nb_iter) ;
printf("Total pour %d kernels : %f ms\n", nb_iter, cutGetTimerValue(timer)) ;
-
+
/**************************
 * CHECKS
**************************/
- //trace des lniv sur grille de 'pas x pas'
- //kernel_trace_levelines<<< dimGrid, dimBlock, 0 >>>(d_data, d_directions, d_data2, L, H, 16, 255) ;
- //cudaThreadSynchronize();
-
- // enregistrement image lniv GPU
- h_data_out = new unsigned int[H*L] ;
- if ( h_data_out != NULL)
- cutilSafeCall( cudaMemcpy(h_data_out , d_estim, size, cudaMemcpyDeviceToHost) );
- else
- printf("Echec allocation mem CPU\n");
- cutilCheckError( cutSavePGMi(image_out, h_data_out, L, H) ) ;
+ /**************************************************
+ * retrieve the path matrix for display
+ **************************************************/
+ /*
+ int2 * h_paths = new int2[(r-1)*PSIZE_I] ;
+ cutilSafeCall( cudaMemcpyFromArray(h_paths , array_paths, 0, 0, psize, cudaMemcpyDeviceToHost) );
+
+ // check Di
+ printf("Di matrix");
+ for(int idpath=0; idpath< PSIZE_I; idpath++){
+
+ printf("\n");
+ for(int idpix=0; idpix< r-1; idpix++){
+ printf(" % d ", h_paths[idpath*(r-1) + idpix].x );
+ }
+ printf("\t// %d°", idpath*15) ;
+ }
+ // check Dj
+ printf("\nDj matrix");
+ for(int idpath=0; idpath< PSIZE_I; idpath++){
+ printf("\n");
+ for(int idpix=0; idpix< r-1; idpix++){
+ printf(" % d ", h_paths[idpath*(r-1) + idpix].y);
+ }
+ printf("\t// %d°", idpath*15) ;
+ }
+ printf("\n");
+ */
+ /***************************************************
+ * end of visual check of the path matrices
+ ***************************************************/
- // calcul lniv CPU
+ /***************************************************
+ * sequential execution for comparison
+ * the comparison is only meaningful if
+ * d_lniv contains the lniv of the initial image
+ **************************************************/
+ /*
+ // sequential computation
+ cutilCheckError( cutResetTimer(timer) );
+ cutilCheckError( cutStartTimer(timer) );
+ h_data_out = new unsigned int[H*L] ;
+ int * dout = new int[H*L] ;
+ for ( iter =0 ; iter < nb_iter ; iter++ ){
+ for (int i=r; i<= H-r; i++){
+ for (int j=r; j<= L-r; j++){
+ h_data_out[i*L+j] = lniv4_value(h_data, h_paths, i, j, H, L, &dout[i*L+j], r) ;
+ }
+ }
+ }
+ cutilCheckError( cutStopTimer(timer) );
+ printf("Execution sequentielle CPU : %f ms\n", cutGetTimerValue(timer)) ;
+ // comparison
+ unsigned int * h_lniv = new unsigned int[H*L] ;
+ int pos, cpt_err=0, cpt_pix=0 ;
+ cutilSafeCall( cudaMemcpy(h_lniv , d_lniv, size, cudaMemcpyDeviceToHost) );
+ for ( iter =0 ; iter < nb_iter ; iter++ ){
+ for (int i=r; i<= H-r; i++){
+ for (int j=r; j<= L-r; j++){
+ pos = i*L + j ;
+ if ( h_data_out[ pos ] != h_lniv[ pos ] ) {
+ cpt_err++ ;
+ printf(" pixel ( %d , %d ) -> GPU= %d CPU= %d \n", i, j, h_lniv[pos], h_data_out[pos]);
+ }
+ cpt_pix++ ;
+ }
+ }
+ }
+ printf("TAUX ERREUR GPU/CPU : %d / %d \n", cpt_err, cpt_pix );
+ */
+ // draw the lniv on a 'step x step' grid
+ //kernel_trace_levelines<<< dimGrid, dimBlock, 0 >>>(d_data, d_directions, d_data2, L, H, 16, 255, r) ;
+
+ if (!seq_out){
+ if ( h_data_out != NULL)
+ cutilSafeCall( cudaMemcpy(h_data_out , d_estim, size, cudaMemcpyDeviceToHost) );
+ else
+ printf("Echec allocation mem CPU\n");
+ sprintf(image_out, "%s%d%s", image_out_base, iter+1, pgm_ext) ;
+ cutilCheckError( cutSavePGMi(image_out, h_data_out, L, H) ) ;
+ }
+
// TODO: figure out why the next two lines cause an error
//cutilExit(argc, argv);
//cudaThreadExit();