X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/blobdiff_plain/8ad1643f80bdf5681bdb9cca04ff30378cb44cb8..1ac5b5a535d9154c4f080e94f2f9a49ab6e299b7:/BookGPU/Chapters/chapter3/ch3.aux?ds=sidebyside diff --git a/BookGPU/Chapters/chapter3/ch3.aux b/BookGPU/Chapters/chapter3/ch3.aux index 0d1505e..8740459 100644 --- a/BookGPU/Chapters/chapter3/ch3.aux +++ b/BookGPU/Chapters/chapter3/ch3.aux @@ -1,126 +1,114 @@ \relax \@writefile{toc}{\author{Gilles Perrot}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {3}Setting up the environnement.}{23}} +\@writefile{toc}{\contentsline {chapter}{\numberline {3}Setting up the environnement.}{25}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\newlabel{algo:memcopy:H2D}{{7}{23}} -\newlabel{algo:memcopy:kernel}{{8}{23}} -\newlabel{algo:memcopy:D2H}{{9}{23}} -\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{23}} -\newlabel{algo:memcopy}{{1}{23}} -\@writefile{toc}{\contentsline {section}{\numberline {3.1}Data transfers, memory management.}{24}} -\newlabel{lst:main1}{{3.1}{25}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}Generic main.cu file used to launch CUDA kernels}{25}} -\newlabel{lst:fkern1}{{3.2}{25}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}fast\_kernels.cu file featuring one kernel skeleton}{25}} -\newlabel{lst:mkfile}{{3.3}{26}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}Generic Makefile based on those provided by NV SDK}{26}} -\@writefile{toc}{\contentsline {section}{\numberline {3.2}Performance measurements}{26}} -\newlabel{lst:chronos}{{3.4}{26}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Time measurement technique using cutil functions}{26}} +\@writefile{toc}{\contentsline {section}{\numberline {3.1}Data transfers, memory management.}{25}} +\newlabel{algo:memcopy:H2D}{{7}{26}} +\newlabel{algo:memcopy:kernel}{{8}{26}} +\newlabel{algo:memcopy:D2H}{{9}{26}} +\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces global memory management on CPU and GPU sides\relax }}{26}} +\newlabel{algo:memcopy}{{1}{26}} +\newlabel{lst:main1}{{3.1}{27}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}generic main.cu file used to launch CUDA kernels}{27}} +\newlabel{lst:fkern1}{{3.2}{27}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}fast\_kernels.cu file featuring one kernel skeleton}{27}} +\newlabel{lst:mkfile}{{3.3}{28}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}generic makefile based on those provided by NVIDIA SDK}{28}} +\@writefile{toc}{\contentsline {section}{\numberline {3.2}Performance measurements}{28}} +\newlabel{lst:chronos}{{3.4}{28}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Time measurement technique using cutil functions}{28}} +\@writefile{toc}{\author{Gilles Perrot}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {4}Implementing a fast median filter}{29}} +\@writefile{toc}{\contentsline {chapter}{\numberline {4}Implementing a fast median filter}{31}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {section}{\numberline {4.1}Introduction}{29}} -\@writefile{toc}{\contentsline {section}{\numberline {4.2}Median filtering}{30}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.1}Basic principles}{30}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.2}A naive implementation}{30}} -\newlabel{img:sap_example_ref}{{4.1(a)}{31}} -\newlabel{sub@img:sap_example_ref}{{(a)}{31}} -\newlabel{img:sap_example_med3}{{4.1(b)}{31}} -\newlabel{sub@img:sap_example_med3}{{(b)}{31}} -\newlabel{img:sap_example_med5}{{4.1(c)}{31}} -\newlabel{sub@img:sap_example_med5}{{(c)}{31}} -\newlabel{img:sap_example_med3_it2}{{4.1(d)}{31}} -\newlabel{sub@img:sap_example_med3_it2}{{(d)}{31}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Exemple of median filtering, applied to salt \& pepper noise reduction.\relax }}{31}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted by salt and pepper noise of density 0.25}}}{31}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ median filter}}}{31}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ median filter}}}{31}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image denoised by 2 iterations of a $3\times 3$ median filter}}}{31}} -\newlabel{fig:sap_examples}{{4.1}{31}} -\newlabel{lst:medianGeneric}{{4.1}{32}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.1}Generic CUDA kernel achieving median filtering}{32}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Exemple of 5x5 median filtering\relax }}{33}} -\newlabel{fig:median_1}{{4.2}{33}} +\@writefile{toc}{\contentsline {section}{\numberline {4.1}Introduction}{31}} +\@writefile{toc}{\contentsline {section}{\numberline {4.2}Median filtering}{32}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.1}Basic principles}{32}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Example of 5x5 median filtering\relax }}{32}} +\newlabel{fig:median_1}{{4.1}{32}} \newlabel{algoMedianGeneric}{{2}{33}} \newlabel{algoMedianGeneric:memcpyH2D}{{1}{33}} \newlabel{algoMedianGeneric:cptstart}{{3}{33}} \newlabel{algoMedianGeneric:cptend}{{5}{33}} \newlabel{algoMedianGeneric:memcpyD2H}{{7}{33}} \@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{33}} -\@writefile{toc}{\contentsline {section}{\numberline {4.3}NVidia GPU tuning recipes}{33}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{34}} -\newlabel{fig:median_overlap}{{4.3}{34}} -\@writefile{lot}{\contentsline {table}{\numberline {4.1}{\ignorespaces Performance results of \texttt {kernel medianR}. \relax }}{34}} -\newlabel{tab:medianHisto1}{{4.1}{34}} -\@writefile{toc}{\contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers }{35}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.1}The simplest way}{35}} -\newlabel{lst:kernelMedian3RegTri9}{{4.2}{36}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}3$\times $3 median filter kernel using one register per neighborhood pixel and bubble sort}{36}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Further optimization}{36}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs on GPU C2070 and CPU for generic median, in 3$\times $3 median register-only and \textit {libJacket}.\relax }}{37}} -\newlabel{fig:compMedians1}{{4.4}{37}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{37}} -\newlabel{fig:forgetful_selection}{{4.5}{37}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count}{37}} -\newlabel{lst:medianForget1pix3}{{4.3}{38}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method}{38}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{39}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{40}} -\newlabel{fig:median3_overlap}{{4.6}{40}} -\newlabel{lst:medianForget2pix3}{{4.4}{40}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{40}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{41}} -\newlabel{fig:compMedians2}{{4.7}{41}} -\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{41}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.\relax }}{42}} -\newlabel{fig:median5overlap}{{4.8}{42}} -\newlabel{sec:median5}{{4.5.1}{42}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{42}} -\newlabel{lst:medianForget2pix5}{{4.5}{42}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{42}} -\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{44}} -\newlabel{tab:median5comp}{{4.2}{44}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{44}} -\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{45}} -\newlabel{tab:medianSeparable}{{4.3}{45}} -\newlabel{lst:medianSeparable}{{4.6}{45}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{45}} -\newlabel{img:sap_example_ref}{{4.9(a)}{46}} -\newlabel{sub@img:sap_example_ref}{{(a)}{46}} -\newlabel{img:sap_example_sep_med3}{{4.9(b)}{46}} -\newlabel{sub@img:sap_example_sep_med3}{{(b)}{46}} -\newlabel{img:sap_example_sep_med5}{{4.9(c)}{46}} -\newlabel{sub@img:sap_example_sep_med5}{{(c)}{46}} -\newlabel{img:sap_example_sep_med3_it2}{{4.9(d)}{46}} -\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{46}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{46}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{46}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{46}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{46}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{46}} -\newlabel{fig:sap_examples2}{{4.9}{46}} -\@writefile{toc}{\contentsline {section}{Bibliography}{47}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.2}A naive implementation}{33}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{34}} +\newlabel{fig:median_overlap}{{4.2}{34}} +\newlabel{lst:medianGeneric}{{4.1}{34}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.1}generic CUDA kernel achieving median filtering}{34}} +\@writefile{lot}{\contentsline {table}{\numberline {4.1}{\ignorespaces Performance results of \texttt {kernel medianR}. \relax }}{35}} +\newlabel{tab:medianHisto1}{{4.1}{35}} +\@writefile{toc}{\contentsline {section}{\numberline {4.3}NVIDIA GPU tuning recipes}{35}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Example of median filtering, applied to salt and pepper noise reduction.\relax }}{36}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted by salt and pepper noise of density 0.25}}}{36}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ median filter}}}{36}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ median filter}}}{36}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image denoised by 2 iterations of a $3\times 3$ median filter}}}{36}} +\newlabel{fig:sap_examples}{{4.3}{36}} +\@writefile{toc}{\contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers}{37}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.1}The simplest way}{37}} +\newlabel{lst:kernelMedian3RegTri9}{{4.2}{38}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}$3\times 3$ median filter kernel using one register per neighborhood pixel and bubble sort}{38}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Further optimization}{38}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs for CPU generic median, CPU 3$\times $3 median register-only with bubble sort, GPU generic median, GPU 3$\times $3 median register-only with bubble sort, and GPU libJacket.}}{39}} +\newlabel{fig:compMedians1}{{4.4}{39}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{39}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for $3\times 3$ pixel window represented in a row and supposed sorted.\relax }}{40}} +\newlabel{fig:forgetful_selection}{{4.5}{40}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the median value by the \textit {forgetful selection} process, applied to a $3\times 3$ neighborhood window.\relax }}{41}} +\newlabel{fig:forgetful3}{{4.6}{41}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network.}}{41}} +\newlabel{fig:bitonic}{{4.7}{41}} +\newlabel{lst:medianForget1pix3}{{4.3}{42}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070}{42}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{42}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a $3\times 3$ median kernel.\relax }}{43}} +\newlabel{fig:median3_overlap}{{4.8}{43}} +\newlabel{lst:medianForget2pix3}{{4.4}{43}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}$3\times 3$ median filter kernel processing 2 output pixel values per thread using combined forgetful selection}{43}} +\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{44}} +\newlabel{sec:median5}{{4.5.1}{44}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{44}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{45}} +\newlabel{fig:compMedians2}{{4.9}{45}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously.}}{45}} +\newlabel{fig:median5overlap}{{4.10}{45}} +\newlabel{lst:medianForget2pix5}{{4.5}{46}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection}{46}} +\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{47}} +\newlabel{tab:median5comp}{{4.2}{47}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated $n\times n$ median filter }{47}} +\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{48}} +\newlabel{tab:medianSeparable}{{4.3}{48}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Example of separable median filtering (smoother), applied to salt and pepper noise reduction.\relax }}{49}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{49}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{49}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{49}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{49}} +\newlabel{fig:sap_examples2}{{4.11}{49}} +\newlabel{lst:medianSeparable}{{4.6}{50}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{50}} \@setckpt{Chapters/chapter3/ch3}{ -\setcounter{page}{49} +\setcounter{page}{53} \setcounter{equation}{0} \setcounter{enumi}{3} \setcounter{enumii}{0} \setcounter{enumiii}{0} -\setcounter{enumiv}{9} +\setcounter{enumiv}{12} \setcounter{footnote}{0} \setcounter{mpfootnote}{0} -\setcounter{part}{1} +\setcounter{part}{2} \setcounter{chapter}{4} \setcounter{section}{5} \setcounter{subsection}{2} \setcounter{subsubsection}{0} \setcounter{paragraph}{0} \setcounter{subparagraph}{0} -\setcounter{figure}{9} +\setcounter{figure}{11} \setcounter{table}{3} \setcounter{numauthors}{0} \setcounter{parentequation}{0} @@ -134,6 +122,13 @@ \setcounter{algocfline}{2} \setcounter{algocfproc}{2} \setcounter{algocf}{2} +\setcounter{nprt@mantissa@digitsbefore}{0} +\setcounter{nprt@mantissa@digitsafter}{0} +\setcounter{nprt@exponent@digitsbefore}{0} +\setcounter{nprt@exponent@digitsafter}{0} +\setcounter{nprt@digitsfirstblock}{0} +\setcounter{nprt@blockcnt}{0} +\setcounter{nprt@cntprint}{0} \setcounter{proposition}{0} \setcounter{theorem}{0} \setcounter{exercise}{0}