X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/blobdiff_plain/620e57fe130fbf0a4aa2cba23938159c7ab14719..078df8cd2b7dba79784d8e374afb7c5ee3dc4d58:/BookGPU/Chapters/chapter3/ch3.aux?ds=sidebyside diff --git a/BookGPU/Chapters/chapter3/ch3.aux b/BookGPU/Chapters/chapter3/ch3.aux index 298a450..b6d8bfd 100644 --- a/BookGPU/Chapters/chapter3/ch3.aux +++ b/BookGPU/Chapters/chapter3/ch3.aux @@ -1,121 +1,120 @@ \relax \@writefile{toc}{\author{Gilles Perrot}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {3}Setting up the environnement.}{25}} +\@writefile{toc}{\contentsline {chapter}{\numberline {3}Setting up the environnement.}{23}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\newlabel{algo:memcopy:H2D}{{7}{25}} -\newlabel{algo:memcopy:kernel}{{8}{25}} -\newlabel{algo:memcopy:D2H}{{9}{25}} -\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{25}} -\newlabel{algo:memcopy}{{1}{25}} -\@writefile{toc}{\contentsline {section}{\numberline {3.1}Data transfers, memory management.}{26}} -\newlabel{lst:main1}{{3.1}{27}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}Generic main.cu file used to launch CUDA kernels}{27}} -\newlabel{lst:fkern1}{{3.2}{27}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}fast\_kernels.cu file featuring one kernel skeleton}{27}} -\newlabel{lst:mkfile}{{3.3}{28}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}Generic Makefile based on those provided by NV SDK}{28}} -\@writefile{toc}{\contentsline {section}{\numberline {3.2}Performance measurements}{28}} -\newlabel{lst:chronos}{{3.4}{28}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Time measurement technique using cutil functions}{28}} +\newlabel{algo:memcopy:H2D}{{7}{23}} +\newlabel{algo:memcopy:kernel}{{8}{23}} +\newlabel{algo:memcopy:D2H}{{9}{23}} +\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{23}} +\newlabel{algo:memcopy}{{1}{23}} +\@writefile{toc}{\contentsline {section}{\numberline {3.1}Data transfers, memory management.}{24}} +\newlabel{lst:main1}{{3.1}{25}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}Generic main.cu file used to launch CUDA kernels}{25}} +\newlabel{lst:fkern1}{{3.2}{25}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}fast\_kernels.cu file featuring one kernel skeleton}{25}} +\newlabel{lst:mkfile}{{3.3}{26}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}Generic Makefile based on those provided by NV SDK}{26}} +\@writefile{toc}{\contentsline {section}{\numberline {3.2}Performance measurements}{26}} +\newlabel{lst:chronos}{{3.4}{26}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Time measurement technique using cutil functions}{26}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {4}Implementing a fast median filter}{31}} +\@writefile{toc}{\contentsline {chapter}{\numberline {4}Implementing a fast median filter}{29}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} \@writefile{toc}{\author{Gilles Perrot}{}} -\@writefile{toc}{\contentsline {section}{\numberline {4.1}Introduction}{31}} -\@writefile{toc}{\contentsline {section}{\numberline {4.2}Median filtering}{32}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.1}Basic principles}{32}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.2}A naive implementation}{32}} -\newlabel{img:sap_example_ref}{{4.1(a)}{33}} -\newlabel{sub@img:sap_example_ref}{{(a)}{33}} -\newlabel{img:sap_example_med3}{{4.1(b)}{33}} -\newlabel{sub@img:sap_example_med3}{{(b)}{33}} -\newlabel{img:sap_example_med5}{{4.1(c)}{33}} -\newlabel{sub@img:sap_example_med5}{{(c)}{33}} -\newlabel{img:sap_example_med3_it2}{{4.1(d)}{33}} -\newlabel{sub@img:sap_example_med3_it2}{{(d)}{33}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Exemple of median filtering, applied to salt \& pepper noise reduction.\relax }}{33}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted by salt and pepper noise of density 0.25}}}{33}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ median filter}}}{33}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ median filter}}}{33}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image denoised by 2 iterations of a $3\times 3$ median filter}}}{33}} -\newlabel{fig:sap_examples}{{4.1}{33}} -\newlabel{lst:medianGeneric}{{4.1}{34}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.1}Generic CUDA kernel achieving median filtering}{34}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Exemple of 5x5 median filtering\relax }}{35}} -\newlabel{fig:median_1}{{4.2}{35}} -\newlabel{algoMedianGeneric}{{2}{35}} -\newlabel{algoMedianGeneric:memcpyH2D}{{1}{35}} -\newlabel{algoMedianGeneric:cptstart}{{3}{35}} -\newlabel{algoMedianGeneric:cptend}{{5}{35}} -\newlabel{algoMedianGeneric:memcpyD2H}{{7}{35}} -\@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{35}} -\@writefile{toc}{\contentsline {section}{\numberline {4.3}NVidia GPU tuning recipes}{35}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{36}} -\newlabel{fig:median_overlap}{{4.3}{36}} -\@writefile{lot}{\contentsline {table}{\numberline {4.1}{\ignorespaces Performance results of \texttt {kernel medianR}. \relax }}{36}} -\newlabel{tab:medianHisto1}{{4.1}{36}} -\@writefile{toc}{\contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers }{37}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.1}The simplest way}{37}} -\newlabel{lst:kernelMedian3RegTri9}{{4.2}{38}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}3$\times $3 median filter kernel using one register per neighborhood pixel and bubble sort}{38}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Further optimization}{38}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs on GPU C2070 and CPU for generic median, 3$\times $3 median register-only and \textit {libJacket}.\relax }}{39}} -\newlabel{fig:compMedians1}{{4.4}{39}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{39}} -\newlabel{fig:forgetful_selection}{{4.5}{39}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{39}} -\newlabel{lst:medianForget1pix3}{{4.3}{40}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{40}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{41}} -\newlabel{fig:forgetful3}{{4.6}{41}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{42}} -\newlabel{fig:median3_overlap}{{4.7}{42}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{42}} -\newlabel{lst:medianForget2pix3}{{4.4}{42}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{42}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{43}} -\newlabel{fig:compMedians2}{{4.8}{43}} -\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{44}} -\newlabel{sec:median5}{{4.5.1}{44}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{44}} -\newlabel{lst:medianForget2pix5}{{4.5}{44}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{44}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.\relax }}{45}} -\newlabel{fig:median5overlap}{{4.9}{45}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.\relax }}{45}} -\newlabel{fig:median5overlap}{{4.10}{45}} -\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{46}} -\newlabel{tab:median5comp}{{4.2}{46}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{47}} -\newlabel{lst:medianSeparable}{{4.6}{47}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{47}} -\newlabel{img:sap_example_ref}{{4.11(a)}{48}} -\newlabel{sub@img:sap_example_ref}{{(a)}{48}} -\newlabel{img:sap_example_sep_med3}{{4.11(b)}{48}} -\newlabel{sub@img:sap_example_sep_med3}{{(b)}{48}} -\newlabel{img:sap_example_sep_med5}{{4.11(c)}{48}} -\newlabel{sub@img:sap_example_sep_med5}{{(c)}{48}} -\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{48}} -\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{48}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{48}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{48}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{48}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{48}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{48}} -\newlabel{fig:sap_examples2}{{4.11}{48}} -\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{49}} -\newlabel{tab:medianSeparable}{{4.3}{49}} -\@writefile{toc}{\contentsline {section}{Bibliography}{50}} +\@writefile{toc}{\contentsline {section}{\numberline {4.1}Introduction}{29}} +\@writefile{toc}{\contentsline {section}{\numberline {4.2}Median filtering}{30}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.1}Basic principles}{30}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.2}A naive implementation}{30}} +\newlabel{img:sap_example_ref}{{4.1(a)}{31}} +\newlabel{sub@img:sap_example_ref}{{(a)}{31}} +\newlabel{img:sap_example_med3}{{4.1(b)}{31}} +\newlabel{sub@img:sap_example_med3}{{(b)}{31}} +\newlabel{img:sap_example_med5}{{4.1(c)}{31}} +\newlabel{sub@img:sap_example_med5}{{(c)}{31}} +\newlabel{img:sap_example_med3_it2}{{4.1(d)}{31}} +\newlabel{sub@img:sap_example_med3_it2}{{(d)}{31}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Exemple of median filtering, applied to salt \& pepper noise reduction.\relax }}{31}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted by salt and pepper noise of density 0.25}}}{31}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ median filter}}}{31}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ median filter}}}{31}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image denoised by 2 iterations of a $3\times 3$ median filter}}}{31}} +\newlabel{fig:sap_examples}{{4.1}{31}} +\newlabel{lst:medianGeneric}{{4.1}{32}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.1}Generic CUDA kernel achieving median filtering}{32}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Exemple of 5x5 median filtering\relax }}{33}} +\newlabel{fig:median_1}{{4.2}{33}} +\newlabel{algoMedianGeneric}{{2}{33}} +\newlabel{algoMedianGeneric:memcpyH2D}{{1}{33}} +\newlabel{algoMedianGeneric:cptstart}{{3}{33}} +\newlabel{algoMedianGeneric:cptend}{{5}{33}} +\newlabel{algoMedianGeneric:memcpyD2H}{{7}{33}} +\@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{33}} +\@writefile{toc}{\contentsline {section}{\numberline {4.3}NVidia GPU tuning recipes}{33}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{34}} +\newlabel{fig:median_overlap}{{4.3}{34}} +\@writefile{lot}{\contentsline {table}{\numberline {4.1}{\ignorespaces Performance results of \texttt {kernel medianR}. \relax }}{34}} +\newlabel{tab:medianHisto1}{{4.1}{34}} +\@writefile{toc}{\contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers }{35}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.1}The simplest way}{35}} +\newlabel{lst:kernelMedian3RegTri9}{{4.2}{36}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}3$\times $3 median filter kernel using one register per neighborhood pixel and bubble sort}{36}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Further optimization}{36}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{36}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs on GPU C2070 and CPU for generic median, 3$\times $3 median register-only and \textit {libJacket}.\relax }}{37}} +\newlabel{fig:compMedians1}{{4.4}{37}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{37}} +\newlabel{fig:forgetful_selection}{{4.5}{37}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{38}} +\newlabel{fig:forgetful3}{{4.6}{38}} +\newlabel{lst:medianForget1pix3}{{4.3}{39}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{39}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{40}} +\newlabel{fig:median3_overlap}{{4.7}{40}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{40}} +\newlabel{lst:medianForget2pix3}{{4.4}{40}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{40}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{41}} +\newlabel{fig:compMedians2}{{4.8}{41}} +\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{42}} +\newlabel{sec:median5}{{4.5.1}{42}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{42}} +\newlabel{lst:medianForget2pix5}{{4.5}{42}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{42}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.\relax }}{43}} +\newlabel{fig:median5overlap}{{4.9}{43}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.\relax }}{43}} +\newlabel{fig:median5overlap}{{4.10}{43}} +\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{44}} +\newlabel{tab:median5comp}{{4.2}{44}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{45}} +\newlabel{lst:medianSeparable}{{4.6}{45}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{45}} +\newlabel{img:sap_example_ref}{{4.11(a)}{46}} +\newlabel{sub@img:sap_example_ref}{{(a)}{46}} +\newlabel{img:sap_example_sep_med3}{{4.11(b)}{46}} +\newlabel{sub@img:sap_example_sep_med3}{{(b)}{46}} +\newlabel{img:sap_example_sep_med5}{{4.11(c)}{46}} +\newlabel{sub@img:sap_example_sep_med5}{{(c)}{46}} +\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{46}} +\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{46}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{46}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{46}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{46}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{46}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{46}} +\newlabel{fig:sap_examples2}{{4.11}{46}} +\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{47}} +\newlabel{tab:medianSeparable}{{4.3}{47}} \@setckpt{Chapters/chapter3/ch3}{ -\setcounter{page}{52} +\setcounter{page}{49} \setcounter{equation}{0} \setcounter{enumi}{3} \setcounter{enumii}{0} \setcounter{enumiii}{0} -\setcounter{enumiv}{10} +\setcounter{enumiv}{0} \setcounter{footnote}{0} \setcounter{mpfootnote}{0} \setcounter{part}{2}