From: couturie Date: Wed, 24 Jul 2013 20:12:52 +0000 (+0200) Subject: ch zulu X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/17d1891ee5feec4f52ef9c51bfa60b78f0bd14c2?ds=inline ch zulu --- diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index 3202b3b..90baace 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -37,6 +37,7 @@ \usepackage{commath} \usepackage{numprint} \usepackage{placeins} +%\usepackage{float} %\usepackage{lmodern} %% \usepackage{listings} %% \usepackage{subfigure} diff --git a/BookGPU/Chapters/chapter3/biblio3.bib b/BookGPU/Chapters/chapter3/biblio3.bib index 219118b..7cf9431 100755 --- a/BookGPU/Chapters/chapter3/biblio3.bib +++ b/BookGPU/Chapters/chapter3/biblio3.bib @@ -332,7 +332,9 @@ doi = {10.1109/NSSMIC.2009.5402323}, issn = {1095-7863}, keywords = {CUDA-based BVM filter; NVIDIA compute unified device architecture; O(M In M) computational complexity; O(M2) computational complexity; branchless vectorized median filter; computerised tomography; data-level parallelism; fast accessing scheme; high performance median filtering; memory layout; modern commodity graphics processing units; pivot median filter; vectorized median computation; biology computing; computerised tomography; medical image processing}, - month = {24 2009-nov. 1}, + year = {2009}, + month = {November}, + day = {24}, pages = {4142--4147}, title = {High performance median filtering using commodity graphics hardware}, year = {2009} @@ -351,12 +353,13 @@ @INPROCEEDINGS{6036776, author={Perrot, G. and Domas, S. and Couturier, R. and Bertaux, N.}, -booktitle={Computer and Information Technology (CIT), 2011 IEEE 11th International Conference on}, title={GPU Implementation of a Region Based Algorithm for Large Images Segmentation}, +booktitle={2011 IEEE 11th International Conference on Computer and Information Technology (CIT)}, +title={GPU Implementation of a Region Based Algorithm for Large Images Segmentation}, year={2011}, -month={31 2011-sept. 2}, +month={Sept.}, volume={}, number={}, -pages={291 -298}, +pages={291-298}, keywords={GPU implementation;Nvidia GPU architecture;algorithmic optimization;graphical processing units;image computing;image segmentation;image size;multicore CPU;multithreaded execution capability;region based algorithm;region-based active contour technique;snake algorithm;computer graphic equipment;coprocessors;image enhancement;image segmentation;multi-threading;multiprocessing systems;optimisation;}, doi={10.1109/CIT.2011.60}, ISSN={},} @@ -372,8 +375,9 @@ year = 1977 @INPROCEEDINGS{5402362, author={Kachelriess, M.}, booktitle={Nuclear Science Symposium Conference Record (NSS/MIC), 2009 IEEE}, title={Branchless vectorized median filtering}, -year={2009}, -month={24 2009-nov. 1}, +year = {2009}, +month = {November}, +day = {24}, volume={}, number={}, pages={4099 -4105}, @@ -383,7 +387,7 @@ ISSN={1095-7863},} @article{Weiss:2006:FMB:1141911.1141918, - author = {Weiss, B}, + author = {Weiss, B.}, title = {Fast median and bilateral filtering}, journal = {ACM Trans. Graph.}, issue_date = {July 2006}, @@ -451,12 +455,13 @@ ISSN={1520-6149},} @ARTICLE{4287006, author={Perreault, S. and Hebert, P.}, -journal={Image Processing, IEEE Transactions on}, title={Median Filtering in Constant Time}, +journal={IEEE Transactions on Image Processing} , +title={Median Filtering in Constant Time}, year={2007}, -month={sept. }, +month={Sept.}, volume={16}, number={9}, -pages={2389 -2394}, +pages={2389-2394}, keywords={algorithmic runtime complexity;filter kernel radius;image processing;median filtering algorithm;computational complexity;filtering theory;image processing;median filters;Algorithms;Computer Graphics;Image Enhancement;Image Interpretation, Computer-Assisted;Numerical Analysis, Computer-Assisted;Reproducibility of Results;Sensitivity and Specificity;Time Factors;User-Computer Interface;}, doi={10.1109/TIP.2007.902329}, ISSN={1057-7149},} @@ -466,7 +471,7 @@ author={Y. Wu and M. Eghbali and J. Ou and R. Lu and L. Toro and E. Stefani}, journal={Biophysical Journal}, title={Quantitative determination of spatial protein-protein correlations in fluorescence confocal microscopy.}, year={2010}, -month={feb. }, +month={Feb. }, volume={98}, number={3}, pages={493-504}, @@ -476,20 +481,22 @@ doi={10.1016/j.bpj.2009.10.037}, year={2012}, issn={1939-8018}, journal={Journal of Signal Processing Systems}, +volume={71}, +number={3}, doi={10.1007/s11265-012-0715-1}, title={Highly Parallelable Bidimensional Median Filter for Modern Parallel Programming Models}, url={http://dx.doi.org/10.1007/s11265-012-0715-1}, publisher={Springer US}, keywords={Nonlinear filters; Parallel algorithms; Image processing}, author={Sánchez, R. M. and Rodríguez, P. A.}, -pages={1-15}, +pages={221-235}, language={English} } @inproceedings{Batcher:1968:SNA:1468075.1468121, author = {Batcher, K. E.}, title = {Sorting networks and their applications}, - booktitle = {Proceedings of the April 30--May 2, 1968, spring joint computer conference}, + booktitle = {Proceedings of the April 30--May 2, 1968, Spring Joint Computer Conference}, series = {AFIPS '68 (Spring)}, year = {1968}, location = {Atlantic City, New Jersey}, @@ -506,4 +513,17 @@ language={English} author={Cormen, Thomas H and Leiserson, Charles E and Rivest, Ronald L and Stein, Clifford}, year={2001}, publisher={MIT press} +} +@article{median_zul, +year={2013}, +issn={1939-8018}, +journal={Journal of Signal Processing Systems}, +doi={10.1007/s11265-013-0799-2}, +title={Fine-tuned High-speed Implementation of a GPU-based Median Filter}, +url={http://dx.doi.org/10.1007/s11265-013-0799-2}, +publisher={Springer US}, +keywords={Median; Filter; GPU}, +author={Perrot, G. and Domas, S. and Couturier, R.}, +pages={1-6}, +language={English} } \ No newline at end of file diff --git a/BookGPU/Chapters/chapter3/ch3.aux b/BookGPU/Chapters/chapter3/ch3.aux index 69d76c6..0fb9a0d 100644 --- a/BookGPU/Chapters/chapter3/ch3.aux +++ b/BookGPU/Chapters/chapter3/ch3.aux @@ -4,18 +4,18 @@ \@writefile{toc}{\contentsline {chapter}{\numberline {3}Setting up the environnement.}{25}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\newlabel{algo:memcopy:H2D}{{7}{25}} -\newlabel{algo:memcopy:kernel}{{8}{25}} -\newlabel{algo:memcopy:D2H}{{9}{25}} -\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{25}} -\newlabel{algo:memcopy}{{1}{25}} -\@writefile{toc}{\contentsline {section}{\numberline {3.1}Data transfers, memory management.}{26}} +\@writefile{toc}{\contentsline {section}{\numberline {3.1}Data transfers, memory management.}{25}} +\newlabel{algo:memcopy:H2D}{{7}{26}} +\newlabel{algo:memcopy:kernel}{{8}{26}} +\newlabel{algo:memcopy:D2H}{{9}{26}} +\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces global memory management on CPU and GPU sides\relax }}{26}} +\newlabel{algo:memcopy}{{1}{26}} \newlabel{lst:main1}{{3.1}{27}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}Generic main.cu file used to launch CUDA kernels}{27}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}generic main.cu file used to launch CUDA kernels}{27}} \newlabel{lst:fkern1}{{3.2}{27}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}fast\_kernels.cu file featuring one kernel skeleton}{27}} \newlabel{lst:mkfile}{{3.3}{28}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}Generic Makefile based on those provided by NV SDK}{28}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}generic makefile based on those provided by NVIDIA SDK}{28}} \@writefile{toc}{\contentsline {section}{\numberline {3.2}Performance measurements}{28}} \newlabel{lst:chronos}{{3.4}{28}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Time measurement technique using cutil functions}{28}} @@ -27,69 +27,69 @@ \@writefile{toc}{\contentsline {section}{\numberline {4.1}Introduction}{31}} \@writefile{toc}{\contentsline {section}{\numberline {4.2}Median filtering}{32}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.2.1}Basic principles}{32}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.2}A naive implementation}{32}} -\newlabel{img:sap_example_ref}{{4.1(a)}{33}} -\newlabel{sub@img:sap_example_ref}{{(a)}{33}} -\newlabel{img:sap_example_med3}{{4.1(b)}{33}} -\newlabel{sub@img:sap_example_med3}{{(b)}{33}} -\newlabel{img:sap_example_med5}{{4.1(c)}{33}} -\newlabel{sub@img:sap_example_med5}{{(c)}{33}} -\newlabel{img:sap_example_med3_it2}{{4.1(d)}{33}} -\newlabel{sub@img:sap_example_med3_it2}{{(d)}{33}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Example of median filtering, applied to salt \& pepper noise reduction.\relax }}{33}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted by salt and pepper noise of density 0.25}}}{33}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ median filter}}}{33}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ median filter}}}{33}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image denoised by 2 iterations of a $3\times 3$ median filter}}}{33}} -\newlabel{fig:sap_examples}{{4.1}{33}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Example of 5x5 median filtering\relax }}{32}} +\newlabel{fig:median_1}{{4.1}{32}} +\newlabel{algoMedianGeneric}{{2}{33}} +\newlabel{algoMedianGeneric:memcpyH2D}{{1}{33}} +\newlabel{algoMedianGeneric:cptstart}{{3}{33}} +\newlabel{algoMedianGeneric:cptend}{{5}{33}} +\newlabel{algoMedianGeneric:memcpyD2H}{{7}{33}} +\@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{33}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.2}A naive implementation}{33}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{34}} +\newlabel{fig:median_overlap}{{4.2}{34}} \newlabel{lst:medianGeneric}{{4.1}{34}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.1}Generic CUDA kernel achieving median filtering}{34}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Example of 5x5 median filtering\relax }}{35}} -\newlabel{fig:median_1}{{4.2}{35}} -\newlabel{algoMedianGeneric}{{2}{35}} -\newlabel{algoMedianGeneric:memcpyH2D}{{1}{35}} -\newlabel{algoMedianGeneric:cptstart}{{3}{35}} -\newlabel{algoMedianGeneric:cptend}{{5}{35}} -\newlabel{algoMedianGeneric:memcpyD2H}{{7}{35}} -\@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{35}} -\@writefile{toc}{\contentsline {section}{\numberline {4.3}NVidia GPU tuning recipes}{35}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{36}} -\newlabel{fig:median_overlap}{{4.3}{36}} -\@writefile{lot}{\contentsline {table}{\numberline {4.1}{\ignorespaces Performance results of \texttt {kernel medianR}. \relax }}{36}} -\newlabel{tab:medianHisto1}{{4.1}{36}} -\@writefile{toc}{\contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers }{37}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.1}generic CUDA kernel achieving median filtering}{34}} +\@writefile{lot}{\contentsline {table}{\numberline {4.1}{\ignorespaces Performance results of \texttt {kernel medianR}. \relax }}{35}} +\newlabel{tab:medianHisto1}{{4.1}{35}} +\@writefile{toc}{\contentsline {section}{\numberline {4.3}NVIDIA GPU tuning recipes}{35}} +\newlabel{img:sap_example_ref}{{4.3(a)}{36}} +\newlabel{sub@img:sap_example_ref}{{(a)}{36}} +\newlabel{img:sap_example_med3}{{4.3(b)}{36}} +\newlabel{sub@img:sap_example_med3}{{(b)}{36}} +\newlabel{img:sap_example_med5}{{4.3(c)}{36}} +\newlabel{sub@img:sap_example_med5}{{(c)}{36}} +\newlabel{img:sap_example_med3_it2}{{4.3(d)}{36}} +\newlabel{sub@img:sap_example_med3_it2}{{(d)}{36}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Example of median filtering, applied to salt and pepper noise reduction.\relax }}{36}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted by salt and pepper noise of density 0.25}}}{36}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ median filter}}}{36}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ median filter}}}{36}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image denoised by 2 iterations of a $3\times 3$ median filter}}}{36}} +\newlabel{fig:sap_examples}{{4.3}{36}} +\@writefile{toc}{\contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers}{37}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.4.1}The simplest way}{37}} \newlabel{lst:kernelMedian3RegTri9}{{4.2}{38}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}3$\times $3 median filter kernel using one register per neighborhood pixel and bubble sort}{38}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}$3\times 3$ median filter kernel using one register per neighborhood pixel and bubble sort}{38}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Further optimization}{38}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs on GPU C2070 and CPU for generic median, 3$\times $3 median register-only and \textit {libJacket}.\relax }}{39}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs for CPU generic median, CPU 3$\times $3 median register-only with bubble sort, GPU generic median, GPU 3$\times $3 median register-only with bubble sort, and GPU libJacket.}}{39}} \newlabel{fig:compMedians1}{{4.4}{39}} \@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{39}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{40}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for $3\times 3$ pixel window represented in a row and supposed sorted.\relax }}{40}} \newlabel{fig:forgetful_selection}{{4.5}{40}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{41}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the median value by the \textit {forgetful selection} process, applied to a $3\times 3$ neighborhood window.\relax }}{41}} \newlabel{fig:forgetful3}{{4.6}{41}} -\newlabel{lst:medianForget1pix3}{{4.3}{41}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{41}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network.}}{41}} +\newlabel{fig:bitonic}{{4.7}{41}} +\newlabel{lst:medianForget1pix3}{{4.3}{42}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070}{42}} \@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{42}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{42}} -\newlabel{fig:median3_overlap}{{4.7}{42}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a $3\times 3$ median kernel.\relax }}{43}} +\newlabel{fig:median3_overlap}{{4.8}{43}} \newlabel{lst:medianForget2pix3}{{4.4}{43}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{43}} -\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{43}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{44}} -\newlabel{fig:compMedians2}{{4.8}{44}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}$3\times 3$ median filter kernel processing 2 output pixel values per thread using combined forgetful selection}{43}} +\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{44}} \newlabel{sec:median5}{{4.5.1}{44}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{44}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously.}}{45}} -\newlabel{fig:median5overlap}{{4.9}{45}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network.}}{45}} -\newlabel{fig:bitonic}{{4.10}{45}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{45}} +\newlabel{fig:compMedians2}{{4.9}{45}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously.}}{45}} +\newlabel{fig:median5overlap}{{4.10}{45}} \newlabel{lst:medianForget2pix5}{{4.5}{46}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{46}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection}{46}} \@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{47}} \newlabel{tab:median5comp}{{4.2}{47}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{47}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated $n\times n$ median filter }{47}} \@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{48}} \newlabel{tab:medianSeparable}{{4.3}{48}} \newlabel{img:sap_example_ref}{{4.11(a)}{49}} @@ -100,7 +100,7 @@ \newlabel{sub@img:sap_example_sep_med5}{{(c)}{49}} \newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{49}} \newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{49}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Example of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{49}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Example of separable median filtering (smoother), applied to salt and pepper noise reduction.\relax }}{49}} \@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{49}} \@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{49}} \@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{49}} @@ -114,7 +114,7 @@ \setcounter{enumi}{3} \setcounter{enumii}{0} \setcounter{enumiii}{0} -\setcounter{enumiv}{11} +\setcounter{enumiv}{12} \setcounter{footnote}{0} \setcounter{mpfootnote}{0} \setcounter{part}{2} diff --git a/BookGPU/Chapters/chapter3/ch3.tex b/BookGPU/Chapters/chapter3/ch3.tex index 6a5181b..64709ea 100755 --- a/BookGPU/Chapters/chapter3/ch3.tex +++ b/BookGPU/Chapters/chapter3/ch3.tex @@ -15,8 +15,18 @@ Obviously, our code originally accepts various image dimensions and can process However, so as to propose concise and more readable code, we will assume the following limitations: 16~bit-coded gray-level input images whose dimensions $H\times W$ are multiples of 512 pixels. +\section{Data transfers, memory management.} +This section deals with the following issues: +\begin{enumerate} +\item Data transfer from CPU memory to GPU global memory: several GPU memory areas are available as destination memory but the 2D caching mechanism of texture memory, \index{memory~hierarchy!texture~memory} specifically designed for fetching neighboring pixels, is currently the fastest way to fetch gray-level pixel values inside a kernel computation. This has led us to choose \textbf{texture memory} as primary GPU memory area for input images. +\item Data fetching from GPU global memory to kernel local memory: as said above, we use texture memory. \index{memory~hierarchy!texture~memory} Depending on which process is run, texture data is used either by direct fetching in kernel local memory or through a prefetching \index{prefetching} in shared memory. \index{memory~hierarchy!shared~memory} +\item Data outputting from kernels to GPU memory: there is actually no alternative to global memory, as kernels cannot directly write into texture memory and as copying from texture to CPU memory would not be faster than from simple global memory. +\item Data transfer from GPU global memory to CPU memory: it can be drastically accelerated by use of \textbf{pinned memory}, \index{memory~hierarchy!pinned~memory} keeping in mind it has to be used sparingly. +\end{enumerate} +Algorithm \ref{algo:memcopy} summarizes all the above considerations and describes how data are handled in our examples. For more information on how to handle the different types of GPU memory, we suggest referring to the CUDA programmer's guide. + \begin{algorithm} -\SetNlSty{}{}{:} +%\SetNlSty{}{}{:} allocate and populate CPU memory \textbf{h\_in}\; allocate CPU pinned-memory \textbf{h\_out}\; allocate GPU global memory \textbf{d\_out}\; @@ -26,119 +36,81 @@ However, so as to propose concise and more readable code, we will assume the fol copy data from \textbf{h\_in} to \textbf{array\_img\_in}\label{algo:memcopy:H2D}\; kernel\kl gridDim,blockDim\kr()\tcc*[f]{outputs to d\_out}\label{algo:memcopy:kernel}\; copy data from \textbf{d\_out} to \textbf{h\_out} \label{algo:memcopy:D2H}\; -\caption{Global memory management on CPU and GPU sides.} +\caption{global memory management on CPU and GPU sides} \label{algo:memcopy} \end{algorithm} -\section{Data transfers, memory management.} -This section deals with the following issues: -\begin{enumerate} -\item Data transfer from CPU memory to GPU global memory: several GPU memory areas are available as destination memory but the 2-D caching mechanism of texture memory \index{memory~hierarchy!texture~memory}, specifically designed for fetching neighboring pixels, is currently the fastest way to fetch gray-level pixel values inside a kernel computation. This has led us to choose \textbf{texture memory} as primary GPU memory area for input images. -\item Data fetching from GPU global memory to kernel local memory: as said above, we use texture memory. \index{memory~hierarchy!texture~memory} Depending on which process is run, texture data is used either by direct fetching in kernel local memory or through a prefetching \index{prefetching} in shared memory \index{memory~hierarchy!shared~memory}. -\item Data outputting from kernels to GPU memory: there is actually no alternative to global memory, as kernels can not directly write into texture memory and as copying from texture to CPU memory would not be faster than from simple global memory. -\item Data transfer from GPU global memory to CPU memory: it can be drastically accelerated by use of \textbf{pinned memory}, \index{memory~hierarchy!pinned~memory} keeping in mind it has to be used sparingly. -\end{enumerate} -Algorithm \ref{algo:memcopy} summarizes all the above considerations and describes how data are handled in our examples. For more information on how to handle the different types of GPU memory, we suggest to refer to the CUDA programmer's guide. -At debug stage, for simplicity's sake, we use the \textbf{cutil} \index{Cutil library} library supplied by the NVidia development kit (SDK). Thus, in order to easily implement our examples, we suggest readers to download and to install the latest NVidia-SDK (ours is SDK4.0), create a new directory \textit{SDK-root-dir/C/src/fast\_kernels} and adapt the generic \textit{Makefile} that can be found in each sub-directory of \textit{SDK-root-dir/C/src/}. Then, only two more files will be enough to have a fully operational environnement: \textit{main.cu} and \textit{fast\_kernels.cu}. +At debug stage, for simplicity's sake, we use the \textbf{cutil} \index{Cutil library} library supplied by the NVIDIA software development kit (SDK). Thus, in order to easily implement our examples, we suggest readers download and install the latest NVIDIA-SDK (ours is SDK4.0), create a new directory \textit{SDK-root-dir/C/src/fast\_kernels} and adapt the generic \textit{Makefile} that can be found in each subdirectory of \textit{SDK-root-dir/C/src/}. Then, only two more files will be needed to have a fully operational environnement: \textit{main.cu} and \textit{fast\_kernels.cu}. Listings \ref{lst:main1}, \ref{lst:fkern1} and \ref{lst:mkfile} implement all the above considerations minimally, while remaining functional. The main file of Listing \ref{lst:main1} is a simplified version of our actual main file. -It has to be noticed that cutil functions \texttt{cutLoadPGMi} \index{Cutil library!cutLoadPGMi} and \texttt{cutSavePGMi} \index{Cutil library!cutSavePGMi} only operate on unsigned integer data. As data is coded in short integer format for performance reasons, the use of these functions involves casting data after loading and before saving. This may be overcome by use of a different library. Actually, our choice was to modify the above mentioned cutil functions. +It has to be noted that functions \texttt{cutLoadPGMi} \index{Cutil library!cutLoadPGMi} and \texttt{cutSavePGMi} \index{Cutil library!cutSavePGMi} of the \textbf{cutil} library operate only on unsigned integer data. As data is coded in short integer format for performance reasons, the use of these functions involves one data cast after loading and before saving. This may be overcome by use of a different library. Actually, our choice was to modify the above mentioned cutil functions. Listing \ref{lst:fkern1} gives a minimal kernel skeleton that will serve as the basis for all other kernels. Lines 5 and 6 determine the coordinates $(i, j)$ of the pixel to be processed, each pixel being associated to one thread. -The instruction in line 8 combines writing the output gray-level value into global memory and fetching the input gray-level value from 2-D texture memory. +The instruction in line 8 combines writing the output gray-level value into global memory and fetching the input gray-level value from 2D texture memory. The Makefile given in Listing \ref{lst:mkfile} shows how to adapt examples given in SDK. -\lstinputlisting[label={lst:main1},caption=Generic main.cu file used to launch CUDA kernels]{Chapters/chapter3/code/mainSkel.cu} +\lstinputlisting[label={lst:main1},caption=generic main.cu file used to launch CUDA kernels]{Chapters/chapter3/code/mainSkel.cu} \lstinputlisting[label={lst:fkern1},caption=fast\_kernels.cu file featuring one kernel skeleton]{Chapters/chapter3/code/kernSkel.cu} -\lstinputlisting[label={lst:mkfile},caption=Generic Makefile based on those provided by NV SDK]{Chapters/chapter3/code/Makefile} +\lstinputlisting[label={lst:mkfile},caption=generic makefile based on those provided by NVIDIA SDK]{Chapters/chapter3/code/Makefile} \section{Performance measurements} -As our goal is to design very fast implementations of basic image processing algorithms, we need to make quite accurate time-measurements, within the order of magnitude of $0.01~ms$. Again, the easiest way of doing so is to use the helper functions of the cutil library. As usual, as the durations we are measuring are short and possibly subject to non neglectable variations, a good practice is to measure multiple executions and issue the mean runtime. All time results given in this chapter have been obtained through 1000 calls to each kernel. +As our goal is to design very fast implementations of basic image processing algorithms, we need to make quite accurate time-measurements, within the order of magnitude of $0.01$~ms. Again, the easiest way of doing so is to use the helper functions of the \textbf{cutil} library. As usual, because the durations we are measuring are short and possibly subject to non negligible variations, a good practice is to measure multiple executions and report the mean runtime. All time results given in this chapter have been obtained through 1000 calls to each kernel. -Listing \ref{lst:chronos} shows how to use the dedicated cutil functions \index{Cutil library!Timer usage}. Timer declaration and creation only need to be performed once while reset, start and stop can be used as often as necessary. Synchronization is mandatory before stopping the timer (Line 7), to avoid runtime measurement being biased. +Listing \ref{lst:chronos} shows how to use the dedicated \textbf{cutil} functions \index{Cutil library!Timer usage}. Timer declaration and creation need to be performed only once while reset, start and stop functions can be used as often as necessary. Synchronization is mandatory before stopping the timer (Line 7), to avoid runtime measurement being biased. \lstinputlisting[label={lst:chronos},caption=Time measurement technique using cutil functions]{Chapters/chapter3/code/exChronos.cu} -In an attempt to provide relevant speedup values, we either implemented CPU versions of the algorithms studied, or used the values found in existing literature. Still, the large number and diversity of hardware platforms and GPU cards makes it impossible to benchmark every possible combination and significant differences may occur between the speedups we announce and those obtained with different devices. As a reference, our developing platform details as follows: +In an attempt to provide relevant speedup values, we either implemented CPU versions of the algorithms studied or used the values found in existing literature. Still, the large number and diversity of hardware platforms and GPU cards makes it impossible to benchmark every possible combination and significant differences may occur between the speedups we report and those obtained with different devices. As a reference, our developing platform details as follows: \begin{itemize} -\item CPU codes run on: +\item CPU codes run on \begin{itemize} \item \textbf{Xeon}: a recent and very efficient Quad Core Xeon E31245 at 3.3GHz-8GByte RAM running Linux kernel 3.2. \end{itemize} -\item GPU codes run on: +\item GPU codes run on \begin{itemize} - \item \textbf{C2070}: Nvidia Tesla C2070 hosted by a PC QuadCore Xeon E5620 at 2.4GHz-12GByte RAM, running Linux kernel 2.6.18 - \item \textbf{GTX280}: NVidia GeForce GTX 280 hosted by a PC QuadCore Xeon X5482 at 3.20GHz-4GByte RAM, running Linux kernel 2.6.32 + \item \textbf{C2070}: NVIDIA Tesla C2070 hosted by a PC QuadCore Xeon E5620 at 2.4GHz-12GByte RAM, running Linux kernel 2.6.18 + \item \textbf{GTX280}: NVIDIA GeForce GTX 280 hosted by a PC QuadCore Xeon X5482 at 3.20GHz-4GByte RAM, running Linux kernel 2.6.32 \end{itemize} \end{itemize} -All kernels have also been tested with various image sizes from 512$\times$512 to 4096$\times$4096 pixels. This allows to guess runtime dependancy over image size. +All kernels have also been tested with various image sizes from 512$\times$512 to 4096$\times$4096 pixels. This allows estimating runtime dependancy over image size. Last, like many authors, we chose to use the pixel throughput value of each process in Mega Pixels per second (MP/s) as a performance indicator, including data transfers and kernel runtimes. -In order to estimate the potential for improvement of each kernel, a reference throughput measurement, involving identity kernel of Listing \ref{lst:fkern1}, was performed. As this kernel only fetches input values from texture memory and outputs them to global memory without doing any computation, it represents the smallest, thus fastest, possible process and is taken as the reference throughput value (100\%). The same measurement was performed on CPU, with a maximum effective pixel throughput of 130~Mpixel per second. On GPU, depending on grid parameters it amounts to 800~MPixels/s on GTX280 and 1300~Mpixels/s on C2070. +In order to estimate the potential for improvement of each kernel, a reference throughput measurement, involving the identity kernel of Listing \ref{lst:fkern1}, was performed. As this kernel only fetches input values from texture memory and outputs them to global memory without doing any computation, it represents the smallest, thus fastest, possible process and is taken as the reference throughput value (100\%). The same measurement was performed on CPU, with a maximum effective pixel throughput of $130~MP/s$. On GPU, depending on grid parameters this measurement was $800~MP/s$ on GTX280 and $1300~MP/s$ on C2070. \chapterauthor{Gilles Perrot}{Femto-ST Institute, University of Franche-Comte, France} \chapter{Implementing a fast median filter} \section{Introduction} -Median filtering is a well-known method used in a wide range of application frameworks as well as a standalone filter especially for \textit{salt and pepper} denoising. It is able to highly reduce power of noise without blurring edges too much. That is actually why we originally focused on this filtering technique as a preprocessing stage when we were in the process of designing a GPU implementation of one region-based image segmentation algorithm \cite{6036776}. +Median filtering is a well-known method used in a wide range of application frameworks as well as a standalone filter especially for \textit{salt and pepper} denoising. It is able to greatly reduce the power of noise without blurring edges too much. That is actually why we originally focused on this filtering technique as a preprocessing stage when we were in the process of designing a GPU implementation of one region-based image segmentation algorithm \cite{6036776}. -First introduced by Tukey in \cite{tukey77}, it has been widely studied since then, and many researchers have proposed efficient implementations of it, adapted to various hypothesis, architectures and processors. -Originally, its main drawbacks were its compute complexity, its non linearity and its data-dependent runtime. Several researchers have addressed these issues and designed, for example, efficient histogram-based median filter with predictible runtimes \cite{Huang:1981:TDS:539567, Weiss:2006:FMB:1179352.1141918}. +First introduced by Tukey in \cite{tukey77}, it has been widely studied since then, and many researchers have proposed efficient implementations of it, adapted to various hypotheses, architectures and processors. +Originally, its main drawbacks were its compute complexity, its nonlinearity and its data-dependent runtime. Several researchers have addressed these issues and designed, for example, efficient histogram-based median filters with predictible runtimes \cite{Huang:1981:TDS:539567, Weiss:2006:FMB:1179352.1141918}. -More recently, the advent of GPUs opened new perspectives in terms of image processing performance, and some researchers managed to take advantage of the new graphic capabilities: in that respect, we can cite the Branchless Vectorized Median filter (BVM) \cite{5402362, chen09} which allows very interesting runtimes on CUDA-enabled devices but, as far as we know, the fastest implementation to date is the histogram-based PCMF median filter \cite{Sanchez-2-2012}. +More recently, the advent of GPUs opened new perspectives in terms of image processing performance, and some researchers managed to take advantage of the new graphics capabilities: in that respect, we can cite the Branchless Vectorized Median (BVM) filter \cite{5402362, chen09} which allows very interesting runtimes on CUDA-enabled devices but, as far as we know, the fastest implementation to date is the histogram-based PCMF median filter \cite{Sanchez-2-2012}. -Some of the following implementations feature very fast runtimes; They are targeted on Nvidia Tesla GPU (Fermi architecture, compute capability 2.x) but may easily be adapted to other models e.g. those of compute capability 1.3. +Some of the following implementations feature very fast runtimes. They are targeted on NVIDIA Tesla GPU (Fermi architecture, compute capability 2.x) but may easily be adapted to other models, e.g., those of compute capability 1.3. -The fastest ones are based on one efficient parallel implementation of the BVM algorithm described in \cite{mcguire2008median}, improving its performance through fine tuning of its implementation. +The fastest ones are based on one efficient parallel implementation of the BVM algorithm described in \cite{mcguire2008median}, improving its performance through fine tuning of its implementation as presented in \cite{median_zul} and detailed in the following sections. \section{Median filtering} \subsection{Basic principles} -Designing a 2-D median filter basically consists in defining a square window $H(i,j)$ for each pixel $I(i,j)$ of the input image, containing $n\times n$ pixels and centered on $I(i,j)$. The output value $I'(i,j)$ is the median value of the gray level values of the $n\times n$ pixels of $H(i,j)$. Figure \ref{fig:median_1} illustrates this principle with an example of a 5x5 median filter applied on pixel $I(5,6)$. The output value is the median value of the 25 values of the dark gray window centered on pixel $I(5,6)$. -Figure \ref{fig:sap_examples} shows an example of a $512\times 512$ pixel image, corrupted by a \textit{salt and pepper} noise and the denoised versions, output respectively by a $3\times 3$, a $5\times 5$ and a 2 iterations $3\times 3 $ median filter. -\begin{figure} -\centering - \subfigure[Airplane image, corrupted by salt and pepper noise of density 0.25]{\label{img:sap_example_ref} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25.png}}\qquad - \subfigure[Image denoised by a $3\times 3$ median filter]{\label{img:sap_example_med3} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_med3.png}}\\ - \subfigure[Image denoised by a $5\times 5$ median filter]{\label{img:sap_example_med5} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_med5.png}}\qquad - \subfigure[Image denoised by 2 iterations of a $3\times 3$ median filter]{\label{img:sap_example_med3_it2} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_med3_it2.png}}\\ - \caption{Example of median filtering, applied to salt \& pepper noise reduction.} - \label{fig:sap_examples} -\end{figure} - - The generic filtering method is given by Algorithm \ref{algoMedianGeneric}. After the data transfer stage of the first line, which copies data from CPU memory to GPU texture memory, the actual median computing occurs, before the final transfer which copies data back to CPU memory at the last line. Obviously, one key issue is the selection method that identifies the median value. But, as shown in figure \ref{fig:median_overlap}, since two neighboring pixels share part of the values to be sorted, a second key issue is how to rule redundancy between consecutive positions of the running window $H(i,j)$. -As mentioned earlier, the selection of the median value can be performed by more than one technique, using either histogram-based or sorting methods, each of them having its own benefits and drawbacks as will be discussed further down. - -\subsection{A naive implementation} -As a reference, Listing \ref{lst:medianGeneric} gives a simple, not to say simplistic implementation of a CUDA kernel (\texttt{kernel\_medianR}) achieving generic $n\times n$ histogram-based median filtering. Its runtime has a very low data dependency, but this implementation does not suit very well GPU architecture. Each pixel loads the whole of its $n\times n$ neighborhood meaning that one pixel is loaded multiple times inside one single thread block, and above all, the use of a local vector (histogram[]) considerably downgrades performance, as the compiler automatically stores such vectors in local memory (slow) \index{memory~hierarchy!local~memory}. - -Table \ref{tab:medianHisto1} displays measured runtimes of \texttt{kernel\_medianR} and pixel throughputs for each GPU version and for both CPU and GPU implementations. Usual window sizes of $3\times 3$, $5\times 5$ and $7\times 7$ are shown. Though some specific applications require larger window sizes and dedicated algorithms, such small square window sizes are most widely used in general purpose image processing. GPU runtimes have been obtained with a grid of 64-thread blocks. This block size, is a good compromise in this case. - -The first observation to make when analysing results of Table \ref{tab:medianHisto1} is that, on CPU, window size has almost no influence on the effective pixel throughput. -Since inner loops that fill the histogram vector contain very few fetching instructions (from 9 to 49, depending on the window size), it is not surprising to note their neglectable impact compared to outer loops that fetch image pixels (from 256k to 16M instructions). -One could be tempted to claim that CPU has no chance to win, which is not so obvious as it highly depends on what kind of algorithm is run and above all, how it is implemented. To illustrate this, we can notice that, despite a maximum effective throughput potential that is almost five times higher, measured GTX280 throughput values sometimes prove slower than CPU values, as shown in Table \ref{tab:medianHisto1}. - - -\lstinputlisting[label={lst:medianGeneric},caption=Generic CUDA kernel achieving median filtering]{Chapters/chapter3/code/medianGeneric.cu} - - -On the GPU's side, we note high dependence on window size due to the redundancy induced by the multiple fetches of each pixel inside each block, becoming higher with the window size as illustrated by Figure \ref{fig:median_overlap}. On C2070 card, thanks to a more efficient caching mechanism, this effect is lesser. On GPUs, dependency over image size is low, and due to slightly more efficient data transfers when copying larger data amounts, pixel throughputs increases with image size. As an example, transferring a 4096$\times$4096 pixel image (32~MBytes) is a bit faster than transferring 64 times a 512$\times$512 pixel image (0.5~MBytes). - -%% mettre l'eau à la bouche - -\begin{figure} +Designing a 2D median filter basically consists of defining a square window $H(i,j)$ for each pixel $I(i,j)$ of the input image, containing $n\times n$ pixels and centered on $I(i,j)$. The output value $I'(i,j)$ is the median value of the gray-level values of the $n\times n$ pixels of $H(i,j)$. Figure \ref{fig:median_1} illustrates this principle with an example of a 5x5 median filter applied on pixel $I(5,6)$. The output value is the median value of the 25 values of the dark gray window centered on pixel $I(5,6)$. +\begin{figure}[b] \centering \includegraphics[width=8cm]{Chapters/chapter3/img/median_1.png} \caption{Example of 5x5 median filtering} \label{fig:median_1} \end{figure} +Figure \ref{fig:sap_examples} shows an example of a $512\times 512$ pixel image, corrupted by a \textit{salt and pepper} noise and the denoised versions, output respectively by a $3\times 3$, a $5\times 5$, and 2 iterations of a $3\times 3$ median filter. + The generic filtering method is given by Algorithm \ref{algoMedianGeneric}. After the data transfer stage of the first line, which copies data from CPU memory to GPU texture memory, the actual median computing occurs, before the final transfer which copies data back to CPU memory at the last line. Obviously, one key issue is the selection method that identifies the median value. But, as shown in Figure \ref{fig:median_overlap}, since two neighboring pixels share part of the values to be sorted, a second key issue is how to rule redundancy between consecutive positions of the running window $H(i,j)$. \begin{algorithm} %\SetNlSty{}{}{:} % \SetLine @@ -152,8 +124,19 @@ On the GPU's side, we note high dependence on window size due to the redundancy copy data from GPU global memory to CPU memory\label{algoMedianGeneric:memcpyD2H}\; \caption{\label{algoMedianGeneric}generic n$\times$n median filter} \end{algorithm} +As mentioned earlier, the selection of the median value can be performed by more than one technique, using either histogram-based or sorting methods, each having its own benefits and drawbacks as will be discussed further down. -\begin{figure} +\subsection{A naive implementation} +As a reference, Listing \ref{lst:medianGeneric} gives a simple, not to say simplistic, implementation of a CUDA kernel (\texttt{kernel\_medianR}) achieving generic $n\times n$ histogram-based median filtering. Its runtime has a very low data dependency, but this implementation does not suit GPU architecture very well. Each pixel loads the whole of its $n\times n$ neighborhood, meaning that one pixel is loaded multiple times inside one single thread block, and even more time-consuming, the use of a local vector (histogram[]) considerably downgrades performance, as the compiler automatically stores such vectors in local memory (slow) \index{memory~hierarchy!local~memory}. + +Table \ref{tab:medianHisto1} displays measured runtimes of \texttt{kernel\_medianR} and pixel throughputs for each GPU version (C2070 and GTX480 targets) and for both CPU and GPU implementations. Usual window sizes of $3\times 3$, $5\times 5$, and $7\times 7$ are shown. Though some specific applications require larger window sizes and dedicated algorithms, such small square window sizes are most widely used in general purpose image processing. GPU runtimes have been obtained with a grid of 64-thread blocks. + +The first observation to make when analysing results of Table \ref{tab:medianHisto1} is that, on CPU, window size has almost no influence on the effective pixel throughput. +Since inner loops that fill the histogram vector contain very few fetching instructions (from 9 to 49, depending on the window size), it is not surprising to note their negligible impact compared to outer loops that fetch image pixels (from 256k to 16M instructions). +One could be tempted to claim that CPU has no chance to win, which is not so obvious as it highly depends on what kind of algorithm is run and, above all, how it is implemented. To illustrate this, we can observe that, despite a maximum effective throughput potential that is almost five times higher, measured GTX280 throughput values sometimes prove slower than CPU values, as shown in Table \ref{tab:medianHisto1}. + +On the GPU's side, we note high dependence on window size due to the redundancy induced by the multiple fetches of each pixel inside each block, becoming higher with the window size. Figure \ref{fig:median_overlap} shows for example that two $5\times 5$ windows, centered on two neighbor pixels share at least 16 pixels. On C2070 card, thanks to a more efficient caching mechanism, this effect is less. On GPUs, dependency on image size is low, and due to slightly more efficient data transfers when copying larger data amounts, pixel throughputs increases with image size. As an example, transferring a 4096$\times$4096 pixel image (32~MBytes) is a bit faster than transferring a 512$\times$512 pixel image (0.5~MBytes) 64 times. +\begin{figure}[h] \centering \includegraphics[width=5cm]{Chapters/chapter3/img/median_overlap.png} \caption{Illustration of window overlapping in 5x5 median filtering} @@ -161,6 +144,8 @@ copy data from GPU global memory to CPU memory\label{algoMedianGeneric:memcpyD2H \end{figure} +\lstinputlisting[label={lst:medianGeneric},caption=generic CUDA kernel achieving median filtering]{Chapters/chapter3/code/medianGeneric.cu} + \begin{table}[h] %\newcolumntype{I}{!{\vrule width 1.5pt}} \newlength\savedwidth @@ -174,7 +159,7 @@ copy data from GPU global memory to CPU memory\label{algoMedianGeneric:memcpyD2H {\tiny \begin{tabular}{|c|l||c|c|c|c|c|c|c|c|c|} \hline -\multicolumn{2}{|l||}{Processor} & \multicolumn{3}{c|}{\textbf{GTX280}} & \multicolumn{3}{c|}{\textbf{C2070}} & \multicolumn{3}{c|}{\textbf{Xeon}} \\ \hline +\multicolumn{2}{|l||}{Processor} & \multicolumn{3}{c|}{\textbf{GTX280}} & \multicolumn{3}{c|}{\textbf{C2070}} & \multicolumn{3}{c|}{\textbf{CPU (Xeon)}} \\ \hline \multicolumn{2}{|l||}{\shortstack{Performances$\rightarrow$\\sizes (pixels)$\downarrow$}} & \shortstack{t\\(ms)}& \shortstack{output\\(MP/s)}& \shortstack{rate\\\% }&\shortstack{t\\(ms)}& \shortstack{output\\(MP/s)}& \shortstack{rate\\\% }&\shortstack{t\\(ms)}& \shortstack{output\\(MP/s)}& \shortstack{rate\\\% } \\ \whline \multirow{3}{*}{\rotatebox{90}{512$^2$}} &3$\times$3&11.50 &22 &2.2 &7.58 &33 &3.4 & 19.25& 14&11\\ &5$\times$5&19.10 &14 &1.3 &8.60 &30 &3.0 &18.49 &14 &11\\ @@ -194,43 +179,53 @@ copy data from GPU global memory to CPU memory\label{algoMedianGeneric:memcpyD2H \label{tab:medianHisto1} \end{table} -\section{NVidia GPU tuning recipes} -When designing GPU code, besides thinking of the actual data computing process, one must choose the memory type into which to store temporary data. Three types of GPU memory are available: +\begin{figure}[t] +\centering + \subfigure[Airplane image, corrupted by salt and pepper noise of density 0.25]{\label{img:sap_example_ref} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25.png}}\qquad + \subfigure[Image denoised by a $3\times 3$ median filter]{\label{img:sap_example_med3} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_med3.png}}\\ + \subfigure[Image denoised by a $5\times 5$ median filter]{\label{img:sap_example_med5} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_med5.png}}\qquad + \subfigure[Image denoised by 2 iterations of a $3\times 3$ median filter]{\label{img:sap_example_med3_it2} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_med3_it2.png}}\\ + \caption{Example of median filtering, applied to salt and pepper noise reduction.} + \label{fig:sap_examples} +\end{figure} + +\section{NVIDIA GPU tuning recipes} +When designing GPU code, besides thinking of the actual data computing process, one must choose the memory type in which to store temporary data. Three types of GPU memory are available: \begin{enumerate} -\item \textbf{Global memory, the most versatile:} \index{memory~hierarchy!global~memory}\\Offers the largest storing space and global scope but is slowest (400-800 clock cycles latency). \textbf{Texture memory} is physically included into it, but allows access through an efficient 2-D caching mechanism. -\item \textbf{Registers, the fastest:} \index{memory~hierarchy!registers}\\Allow access wihtout latency, but only 63 registers are available per thread (thread scope), with a maximum of 32K per Symetric Multiprocessor (SM) \index{register count}. -\item \textbf{Shared memory, a complex compromise:} \index{memory~hierarchy!shared~memory}\\All threads in one block can access 48~KBytes of shared memory, which is faster than global memory (\~20 cycles latency) but slower than registers. -However, bank conflicts can occur if two threads of a warp try to access data stored in one single memory bank. In such cases, the parallel process is re-serialized which may cause significant performance decrease. One easy way to avoid it is to ensure that two consecutive threads in one block always access 32-bit data at two consecutive adresses. +\item \textbf{Global memory, the most versatile:} \index{memory~hierarchy!global~memory}\\Offers the largest storing space and global scope but is the slowest (400 to 800 clock cycles latency). \textbf{Texture memory} is physically included into it, but allows access through an efficient 2D caching mechanism. +\item \textbf{Registers, the fastest:} \index{memory~hierarchy!registers}\\Allow access without latency, but only 63 registers are available per thread (thread scope), with a maximum of 32K per Streaming Multiprocessor (SM). \index{register count} +\item \textbf{Shared memory, a complex compromise:} \index{memory~hierarchy!shared~memory}\\All threads in one block can access $48~KBytes$ of shared memory, which is faster than global memory (20 clock cycles latency) but slower than registers. +However, bank conflicts can occur if two threads of a warp try to access data stored in one single memory bank. In such cases, the parallel process is serialized which may cause significant performance decrease. One easy way to avoid this is to ensure that two consecutive threads in one block always access 32-bit data at two consecutive addresses. \end{enumerate} -\noindent As observed earlier, designing a median filter GPU implementation using only global memory is fairly straightforward, but its performance remains quite low even if it is faster than CPU. -To overcome this, the most frequent choice made in efficient implementations found in literature is to use shared memory. Such option implies prefetching \index{prefetching}data prior to doing the actual computations, a relevant choice, as each pixel of an image belongs to n$\times$n different neighborhoods. Thus, it can be expected that fetching each gray-level value from global memory only once should be more efficient than doing it each time it is required. One of the most efficient implementations using shared memory is presented in \cite{5402362}. In the case of the generic kernel of Listing \ref{lst:medianGeneric}, using shared memory without further optimization would not bring valuable speedup because that would just move redundancy from texture to shared memory fetching and would generate bank conflicts. For information, we wrote such a version of the generic median kernel and our measurements showed a speedup of around 3\% (as an example: 32ms for 5$\times$5 median on a 1024$^2$ pixel image, i.e. 33~Mpixel/s ). +As observed earlier, designing a median filter GPU implementation using only global memory is fairly straightforward, but its performance remains quite low even if it is faster than CPU. +To overcome this, the most frequent choice made in efficient implementations found in literature is to use shared memory. Such option implies prefetching \index{prefetching}data prior to doing the actual computations, a relevant choice, as each pixel of an image belongs to $n^2$ different neighborhoods. Thus, it can be expected that fetching each gray-level value from global memory only once should be more efficient than doing it each time it is required. One of the most efficient implementations using shared memory is presented in \cite{5402362}. In the case of the generic kernel of Listing \ref{lst:medianGeneric}, using shared memory without further optimization would not bring valuable speedup because that would just move redundancy from texture to shared memory fetching and would generate bank conflicts. For information, we wrote such a version of the generic median kernel and our measurements showed a speedup of around 3\% (as an example, $32~ms$ for $5\times 5$ median on a 1024$^2$ pixel image, i.e., $33~MP/s$ ). -As for registers, designing a generic median filter that would only use that type of memory seems difficult, due to the above mentioned 63 register-per-thread limitation \index{register count}. +As for registers, designing a generic median filter that would use only that type of memory seems difficult, due to the above mentioned 63 register-per-thread limitation. \index{register count} Yet, nothing forbids us to design fixed-size filters, each of them specific to one of the most popular window sizes. It might be worth the effort as dramatic increase in performance could be expected. -Another track to follow in order to improve performance of GPU implementations consists in hiding latencies generated by arithmetic instruction calls and memory accesses. Both can be partially hidden by introducing Instruction-Level Parallelism \index{Instruction-Level Parallelism}(ILP) and by increasing the data count output by each thread. Though such techniques may seem to break the NVidia occupancy paradigm, they can lead to dramatically higher data throughput values. +Another track to follow in order to improve performance of GPU implementations consists of hiding latencies generated by arithmetic instruction calls and memory accesses. Both can be partially hidden by introducing Instruction-Level Parallelism \index{Instruction-Level Parallelism}(ILP) and by increasing the data count outputted by each thread. Though such techniques may seem to break the NVIDIA occupancy paradigm, they can lead to dramatically higher data throughput values. The following sections illustrate these ideas and detail the design of the fastest CUDA median filter known to date. -\section{A 3$\times$3 median filter: using registers } +\section{A 3$\times$3 median filter: using registers} Designing a median filter dedicated to the smallest possible square window size is a good challenge to start using registers. -One first issue is that the exclusive use of registers forbids us to implement a naive histogram-based method. In a \textit{8-bit gray level pixel per thread} rule, each histogram requires one 256-element vector to store its values, i.e. four times the maximum register count allowed per thread (63)\index{register count}. Considering that a 3$\times$3 median filter involves only 9 pixel values per thread, it seem obvious they can be sorted within the 63-register limit. +One first issue is that the exclusive use of registers forbids us to implement a naive histogram-based method. In a \textit{8-bit gray-level pixel per thread} rule, each histogram requires one 256-element vector to store its values, i.e., more than four times the maximum register count allowed per thread (63).\index{register count} Considering that a $3\times 3$ median filter involves only 9 pixel values per thread, it seem obvious they can be sorted within the 63-register limit. \subsection{The simplest way} -In the case of a 3$\times$3 median filter, the simplest solution consists in associating one register to each gray-level value, then sorting those 9 values and selecting the fifth one, i.e. the median value. For such a small amount of data to sort, a simple selection method is well indicated. As shown in Listing \ref{lst:kernelMedian3RegTri9} (\texttt{kernelMedian3RegSort9()}), the constraint of only using registers leads to adopt an unusual manner of coding. However, results are persuasive: runtimes are divided by around 120 on GTX280 and 80 on C2070, while only reduced by a 3.5 factor on CPU. -The diagram of Figure \ref{fig:compMedians1} summarizes these first results, obtained with a block size of 128 threads on GTX280 and 256 on C2070. Only C2070 throughputs are shown and compared to CPU results. We included the maximum effective pixel throughput in order to see the improvement potential of the different implementations. We also introduced throughputd achieved by \textit{libJacket}, a commercial implementation, as it was the fastest known implementation of a 3$\times$3 median filter to date, as illustrated in \cite{chen09}. One of the authors of libJacket kindly posted the CUDA code of its 3$\times$3 median filter, that we inserted into our own coding structure. The algorithm itself is quite similar to ours, but running it in our own environement produced higher throughput values than those published in \cite{chen09}, not due to different hardware capabilities between our GTX280 and the GTX260 used in the paper, but to the way we perform memory transfers and to our register-only method of storing temporary data. +In the case of a 3$\times$3 median filter, the simplest solution consists of associating one register to each gray-level value, then sorting those 9 values and selecting the fifth one, i.e., the median value. For such a small amount of data to sort, a simple selection method is well indicated. As shown in Listing \ref{lst:kernelMedian3RegTri9} (\texttt{kernel\_Median3RegSort9()}), the constraint of using only registers forces the adoption of an unusual manner of coding. However, results are persuasive: runtimes are divided by around 120 on GTX280 and 80 on C2070, while only reduced by a 3.5 factor on CPU (CPU median3 bubble sort). +The diagram of Figure \ref{fig:compMedians1} summarizes these first results for C2070, obtained with a block size of 256 threads, and Xeon CPU. We included the maximum effective pixel throughput in order to see the improvement potential of the different implementations. We also introduced throughput achieved by libJacket, a commercial implementation, as it was the fastest known implementation of a $3\times 3$ median filter to date, as illustrated in \cite{chen09}. One of the authors of libJacket kindly posted the CUDA code of its $3\times 3$ median filter, which we inserted into our own coding structure. The algorithm itself is quite similar to ours, but running it in our own environement produced higher throughput values than those published in \cite{chen09}, not due to different hardware capabilities between our GTX280 and the GTX260 those authors used, but due to the way we perform memory transfers and our register-only method of storing temporary data. -\lstinputlisting[label={lst:kernelMedian3RegTri9},caption= 3$\times$3 median filter kernel using one register per neighborhood pixel and bubble sort]{Chapters/chapter3/code/kernMedianRegTri9.cu} +\lstinputlisting[label={lst:kernelMedian3RegTri9},caption= $3\times 3$ median filter kernel using one register per neighborhood pixel and bubble sort]{Chapters/chapter3/code/kernMedianRegTri9.cu} \begin{figure} \centering \includegraphics[width=15cm]{Chapters/chapter3/img/debitPlot1.pdf} - \caption{Comparison of pixel throughputs on GPU C2070 and CPU for generic median, 3$\times$3 median register-only and \textit{libJacket}.} + \caption[Comparison of pixel throughputs for CPU generic median, CPU 3$\times$3 median register-only with bubble sort, GPU generic median, GPU 3$\times$3 median register-only with bubble sort, and GPU libJacket.]{Comparison of pixel throughputs for CPU generic median, CPU 3$\times$3 median register-only with bubble sort, GPU generic median, GPU 3$\times$3 median register-only with bubble sort, and GPU libJacket. The GPU is the C2070 card and the CPU is the Xeon processor. The maximum effective C2070 throughput is also shown.} \label{fig:compMedians1} \end{figure} \subsection{Further optimization} -Running the above register-only 3$\times$3 median filter through the NVidia CUDA profiler teaches us that the memory throughput achieved by the kernel remains quite low. To improve this, two methods can be used: +Running the above register-only 3$\times$3 median filter through the NVIDIA CUDA profiler teaches us that the memory throughput achieved by the kernel remains quite low. To improve this, two methods can be used: \begin{itemize} \item increasing the number of concurrent threads, which can be achieved by reducing the number of registers used by each thread. \item having each thread process more data which can be achieved at thread level by processing and outputting the gray-level value of two pixels or more. @@ -238,57 +233,68 @@ Running the above register-only 3$\times$3 median filter through the NVidia CUDA \subsubsection{Reducing register count \index{register count}} -Our current kernel (\texttt{kernelMedian3RegSort9}) uses one register per gray-level value, which amounts to 9 registers for the entire 3$\times$3 window. -This count can be reduced by use of an iterative sorting process called \textit{forgetful selection}, where both \textit{extrema} are eliminated at each sorting stage, until only 3 elements remain. The question is to find out the minimal register count $k_{n^2}$ that allows the selection of the median amoung $n^2$ values. The answer can be evaluated considering that, when eliminating the maximum and the minimum values, one has to make sure not to eliminate the global median value. Such a situation is illustrated in Figure \ref{fig:forgetful_selection} for a 3$\times$3 median filter. For better comprehension, the 9 elements of the 3$\times$3 pixel window have been represented in a row. -\begin{figure} - \centering - \includegraphics[width=6cm]{Chapters/chapter3/img/forgetful_selection.png} - \caption{Forgetful selection with the minimal element register count. Illustration for 3$\times$3 pixel window represented in a row and supposed sorted.} - \label{fig:forgetful_selection} -\end{figure} -We must remember that, in the fully sorted vector, the median value will have the middle index \textit{i.e.} $\lfloor n^2/2\rfloor$. -Moreover, assuming that both \textit{extrema} are eliminated from the first $k$ elements and that the global median is one of them would mean that: +Our current kernel (\texttt{kernel\_Median3RegSort9}) uses one register per gray-level value, which amounts to 9 registers for the entire 3$\times$3 window. +This count can be reduced by use of an iterative sorting process called \textit{forgetful selection}, where both \textit{extrema} are eliminated at each sorting stage, until only 3 elements remain. The question is to learn the minimal register count $k_{n^2}$ that allows the selection of the median amoung $n^2$ values. The answer can be evaluated considering that, when eliminating the maximum and the minimum values, one has to make sure not to eliminate the global median value. Such a situation is illustrated in Figure \ref{fig:forgetful_selection} for a $3\times 3$ median filter. For better comprehension, the 9 elements of the $3\times 3$ pixel window have been represented in a row. + +We must remember that by definition, in the fully sorted vector, the median value will have the middle index, i.e., $\lfloor n^2/2\rfloor$. +Moreover, assuming that both \textit{extrema} are eliminated from the first $k$ elements and that the global median is one of them would mean that \begin{itemize} \item if the global median was the minimum among the $k$ elements, then at least $k-1$ elements would have a higher index. Considering the above median definition, at least $k-1$ elements should also have a lower index in the entire vector. \item if the global median was the maximum among the $k$ elements, then at least $k-1$ elements would have a lower index. Considering the above median definition, at least $k-1$ elements should also have a higher index in the entire vector. -\end{itemize} + Therefore, the number $k$ of elements that are part of the first selection stage can be defined by the condition $$n^2-k \leq \lfloor \frac{n^2}{2} \rfloor -1$$ -which leads to: +which leads to $$k_{n^2}=\lceil \frac{n^2}{2}\rceil+1 $$ + This rule can be applied to the first eliminating stage and remains true with the next ones as each stage suppresses exactly two values, one above and one below the median value. -In our 3$\times$3 pixel window example, the minimum register count becomes $k_9=\lceil 9/2\rceil+1 = 6$. +In our $3\times 3$ pixel window example, the minimum register count becomes $k_9=\lceil 9/2\rceil+1 = 6$. This iterative process is illustrated in Figure \ref{fig:forgetful3}, where it achieves one entire $3\times 3$ median selection, beginning with $k_9=6$ elements. + +The \textit{forgetful selection} method, used in \cite{mcguire2008median}, does not imply full sorting of values, but only selecting minimum and maximum values, which, at the price of a few iteration steps ($n^2-k$), reduces arithmetic complexity. +Listing \ref{lst:medianForget1pix3} details this process where forgetful selection is achieved by use of simple 2-value swapping function ($s()$, lines 1 to 5) that swaps input values if necessary, so as to achieve the first steps of an incomplete sorting network \cite{Batcher:1968:SNA:1468075.1468121}. Moreover, whenever possible, in order to increase the ILP, \index{Instruction-Level Parallelism} successive calls to $s()$ are done with independant elements as arguments. This is illustrated by the macro definitions of lines 7 to 12 and by Figure \ref{fig:bitonic} which details the first iteration of the $5\times 5$ selection, starting with $k_{25}=14$ elements. +\begin{figure}[b] + \centering + \includegraphics[width=6cm]{Chapters/chapter3/img/forgetful_selection.png} + \caption{Forgetful selection with the minimal element register count. Illustration for $3\times 3$ pixel window represented in a row and supposed sorted.} + \label{fig:forgetful_selection} +\end{figure} \begin{figure} \centering \includegraphics[width=5cm]{Chapters/chapter3/img/forgetful_selectionb.png} - \caption{Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.} + \caption{Determination of the median value by the \textit{forgetful selection} process, applied to a $3\times 3$ neighborhood window.} \label{fig:forgetful3} \end{figure} - +\end{itemize} -The \textit{forgetful selection} method, used in \cite{mcguire2008median}, does not imply full sorting of values, but only selecting minimum and maximum values, which, at the price of a few iteration steps ($n^2-k$), reduces arithmetic complexity. -Listing \ref{lst:medianForget1pix3} details this process where forgetful selection is achieved by use of simple 2-value swapping function ($s()$, lines 1 to 5) that swaps input values if necessary, so as to achieve the first steps of an incomplete sorting network \cite{Batcher:1968:SNA:1468075.1468121}. Moreover, whenever possible, in order to increase the Instruction-Level Parallelism \index{Instruction-Level Parallelism}, successive calls to $s()$ are done with independant elements as arguments. This is illustrated by the macro definitions of lines 7 to 14 and by Figure \ref{fig:bitonic} which details the first iteration of the $5\times 5$ selection, starting with $k_{25}=14$ elements. +\begin{figure} + \centering + \includegraphics[width=6cm]{Chapters/chapter3/img/fig3.jpg} + \caption[First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network.]{First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lower value at the starting point and the higher value at the end point.} + \label{fig:bitonic} +\end{figure} + +\lstinputlisting[label={lst:medianForget1pix3},caption= 3$\times$3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070]{Chapters/chapter3/code/kernMedianForget1pix3.cu} -\lstinputlisting[label={lst:medianForget1pix3},caption= 3$\times$3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070. ]{Chapters/chapter3/code/kernMedianForget1pix3.cu} +Our such modified kernel provides significantly improved runtimes: an average speedup of 16\% is obtained, and pixel throughput reaches around $1000~MP/s$ on C2070. -Our such modified kernel provides significantly improved runtimes: a speedup of around 16\% is obtained, and pixel throughput reaches around 1000~MPixel/s on C2070. \subsubsection{More data output per thread} -In the case of a kernel achieving an effective memory throughput value far from the GPU peak value, and if enough threads are run, another technique may help hiding memory latency and thus leverage performance: make sure that each thread generates multiple pixel outputs.\\ -Attentive readers could remark that it would increase the register count per thread, which can be compensated by dividing thread block size accordingly, thus allowing to keep the same register count per block. -Moreover, it is now possible to take advantage of window overlapping, first illustrated In Figure \ref{fig:median_overlap}, and further detailed in Figure \ref{fig:median3_overlap}. As the selection is first processed on the first 6 gray-level values, i.e. exactly the number of pixels that overlap between the neighborhoods of two adjacent center pixels, 6 texture fetches and one \texttt{minmax6} selection per thread can be saved. There again, some speedup can be expected through our modified kernel source code presented in Listing \ref{lst:medianForget2pix3}. One important difference with previous versions lies in the way pixel coordinates are computed from thread indexes. As each thread has to process two pixels, the number of threads in each block is divided by 2, while the grid size remains unchanged. Consequently, in our kernel code, each thread whose block-related coordinates are $(tx, ty)$ will be in charge of processing pixels of block-related coordinates $(2tx, ty)$ and $(2tx+1, ty)$; lines 5 and 6 implement this. +In the case of a kernel achieving an effective memory throughput value far from the GPU peak value, and if enough threads are run, another technique may help with hiding memory latency and thus leverage performance: making sure that each thread generates multiple pixel outputs. + +Attentive readers could remark that it would increase the register count per thread, which can be compensated by dividing thread block size accordingly, thus keeping the same register count per block. +Moreover, it is now possible to take advantage of window overlapping, first illustrated in Figure \ref{fig:median_overlap}, and further detailed in Figure \ref{fig:median3_overlap}. As the selection is first processed on the first 6 gray-level values, i.e., exactly the number of pixels that overlap between the neighborhoods of two adjacent center pixels, 6 texture fetches, and one \texttt{minmax6} selection per thread can be saved. There again, some speedup can be expected through our modified kernel source code presented in Listing \ref{lst:medianForget2pix3}. One important difference from previous versions lies in the way pixel coordinates are computed from thread indexes. As each thread has to process two pixels, the number of threads in each block is divided by 2, while the grid size remains unchanged. Consequently, in our kernel code, each thread whose block-related coordinates are $(tx, ty)$ will be in charge of processing pixels of block-related coordinates $(2tx, ty)$ and $(2tx+1, ty)$; lines 5 and 6 implement this. \begin{figure} \centering \includegraphics[width=4cm]{Chapters/chapter3/img/median3_overlap.png} - \caption{Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times$3 median kernel.} + \caption{Illustration of how window overlapping is used to combine 2 pixel selections in a $3\times 3$ median kernel.} \label{fig:median3_overlap} \end{figure} -\lstinputlisting[label={lst:medianForget2pix3},caption=3$\times$3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.]{Chapters/chapter3/code/kernMedian2pix3.cu} +\lstinputlisting[label={lst:medianForget2pix3},caption=$3\times 3$ median filter kernel processing 2 output pixel values per thread using combined forgetful selection]{Chapters/chapter3/code/kernMedian2pix3.cu} -Running this $3\times 3$ kernel saves another 10\% runtime, as shown in Figure \ref{fig:compMedians2} and provides the best peak pixel throughput value known so far on C2070: 1155~Mpixel/s which is 86\% the maximum effective throughput. +Running this $3\times 3$ kernel saves another 10\% runtime, as shown in Figure \ref{fig:compMedians2} and provides the best peak pixel throughput value known so far on the C2070: $1155~MP/s$ which is 86\% of the maximum effective throughput. \begin{figure} \centering @@ -298,11 +304,11 @@ Running this $3\times 3$ kernel saves another 10\% runtime, as shown in Figure \ \end{figure} \section{A 5$\times$5 and more median filter } -Considering the maximum register count allowed per thread (63) and trying to push this technique to its limit potentially allows designing up to 9$\times$9 median filters. Such maximum would actually use $k_{81}=\lceil 81/2\rceil+1 = 42$ registers per thread plus 9, used by the compiler to complete arithmetic operations and 9 more when outputting 2 pixels per thread. This leads to a total register count of 60, which would limit the number of concurrent threads per block. Our measurements show that this technique is still worth using for the 7$\times$7 median. As for larger window sizes, one option could be using shared memory. +Considering the maximum register count allowed per thread (63) and trying to push this technique to its limit potentially allows designing up to 9$\times$9 median filters. Such maximum would actually use $k_{81}=\lceil 81/2\rceil+1 = 42$ registers per thread plus 9, used by the compiler to complete arithmetic operations, and 9 more when outputting 2 pixels per thread. This leads to a total register count of 60, which would limit the number of concurrent threads per block. As for larger window sizes, one option could be using shared memory. The next two sections will first detail the particular case of the 5$\times$5 median through register-only method and eventually a generic kernel for larger window sizes. \subsection{A register-only 5$\times$5 median filter \label{sec:median5}} -The minimum register count required to apply the forgetful selection method to a 5$\times$5 median filter is $k_{25}=\lceil 25/2\rceil+1 = 14$. Moreover, two adjacent overlapping windows share 20 pixels ($n^2-one\_column$) so that, when processing 2 pixels simultaneously, a count of 7 common selection stages can be carried out from the first selection stage with 14 common values to the processing of the last common value. That allows to limit register count to 22 per thread. Figure \ref{fig:median5overlap} describes the distribution of overlapping pixels, implemented in Listing \ref{lst:medianForget2pix5}: common selection stages take place from line 25 to line 37, while the remaining separate selection stages occur between lines 45 and 62 after the separation of line 40. +The minimum register count required to apply the forgetful selection method to a 5$\times$5 median filter is $k_{25}=\lceil 25/2\rceil+1 = 14$. Moreover, two adjacent overlapping windows share 20 pixels ($n^2-one\_column$) so that, when processing 2 pixels simultaneously, a count of 7 common selection stages can be carried out from the first selection stage with 14 common values to the processing of the last common value. This allows limiting register count to 22 per thread. Figure \ref{fig:median5overlap} describes the distribution of overlapping pixels, implemented in Listing \ref{lst:medianForget2pix5}: common selection stages take place from line 25 to line 37, while the remaining separate selection stages occur between lines 45 and 62 after the separation of line 40. \begin{figure} \centering \includegraphics[width=6cm]{Chapters/chapter3/img/median5_overlap4.png} @@ -310,14 +316,7 @@ The minimum register count required to apply the forgetful selection method to a \label{fig:median5overlap} \end{figure} -\begin{figure} - \centering - \includegraphics[width=6cm]{Chapters/chapter3/img/fig3.jpg} - \caption[First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network.]{First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.} - \label{fig:bitonic} -\end{figure} - -\lstinputlisting[label={lst:medianForget2pix5},caption=kernel 5$\times$5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.]{Chapters/chapter3/code/kernMedian2pix5.cu} +\lstinputlisting[label={lst:medianForget2pix5},caption=kernel 5$\times$5 median filter processing 2 output pixel values per thread by a combined forgetfull selection]{Chapters/chapter3/code/kernMedian2pix5.cu} Timing results follow the same variations with image size as in previously presented kernels. That is why Table \ref{tab:median5comp} shows only throughput values obtained for C2070 card and 4096$\times$4096 pixel image. @@ -340,23 +339,23 @@ Timing results follow the same variations with image size as in previously prese \label{tab:median5comp} \end{table} -\subsection{Fast approximated n$\times$n median filter } -Large window median filters are less widespread and used in more specific fields, such as digital microscopy where, for example, background estimation of images is achieved through $64\times 64$ or $128\times 128$ median filters \cite{Wu2010}. In such cases, a possible technique is to split median selection into two separate 1-D stages: one in the vertical direction and the other in the horizontal direction. Image processing specialists may object that this method does not select the actual median value. This is true but, in the case of large window sizes and \textit{real-life} images, the so selected value is statistically near the actual median value and often represents an acceptable approximation. Such a filter is sometimes called \textit{smoother}. +\subsection{Fast approximated $n\times n$ median filter } +Large window median filters are less widespread but are used in more specific fields, such as digital microscopy where, for example, background estimation of images is achieved through $64\times 64$ or $128\times 128$ median filters \cite{Wu2010}. In such cases, a possible technique is to split median selection into two separate 1D stages: one in the vertical direction and the other in the horizontal direction. Image processing specialists may object that this method does not select the actual median value. This is true but, in the case of large window sizes and \textit{real-life} images, the value selected in this manner is statistically near the actual median value and often represents an acceptable approximation. Such a filter is sometimes called a \textit{smoother}. As explained earlier in this section, the use of large window median filters rules out register-only implementation, -which suggests to privilege the use of shared memory. The 1-D operation almost completely avoids bank conflicts in shared memory accesses. -Furthermore, the above-described forgetful selection method cannot be used anymore, as too many registers would be required.\\Instead, the Torben Morgensen sorting algorithm is used, as its required register count is both low and constant, and avoids the use of a local vector, unlike histogram-based methods. +which favors the use of shared memory. The 1D operation almost completely avoids bank conflicts in shared memory accesses. +Furthermore, the above-described forgetful selection method cannot be used anymore, as too many registers would be required. Instead, the Torben Morgensen sorting algorithm is used, as its required register count is both low and constant, and avoids the use of a local vector, unlike histogram-based methods. -Listing \ref{lst:medianSeparable} presents a kernel code that implements the above considerations and achieves a 1-D vertical $n \times 1$ median filter. The shared memory vector is declared as \texttt{extern} (Line 16) as its size is determined at runtime and passed to the kernel call as an argument. Lines 20 to 29 perform data prefetching, including the $2n$-row halo ($n$ at the bottom and $n$ at the top of each block). Then one synchronization barrier is mandatory (line 31) to ensure that all needed data is ready prior to its use by the different threads. -Torben Morgensen sorting takes place between lines 37 and 71 and eventually, the transposed output value is stored in global memory at line 73. Outputting the transposed image in global memory saves time and allows to re-use the same kernel to achieve the second step, e.g 1-D horizontal $n \times 1$ median filtering. The final transpose is done at transfer time, when copying data from GPU to CPU memory, which once more saves time while actually generates the expected image. -It has to be noticed that this smoother, unlike the technique we proposed for fixed-size median filters, can not be considered as a state-of-the-art technique, as for example the one presented in \cite{4287006}. However, it may be considered as a good, easy to use and efficient alternative as confirmed by the results presented in Table \ref{tab:medianSeparable}. Pixel throughput values achieved by our kernel, though not constant with window size, remain very competitive if window size is kept under $120\times 120$ pixels, especially when outputting 2 pixels per thread (in \cite{4287006}, pixel throughput is around 7MP/s). -Figure \ref{fig:sap_examples2} shows an example of a $512\times 512$ pixel image, corrupted by a \textit{salt and pepper} noise and the denoised versions, output respectively by a $3\times 3$, a $5\times 5$ and a $55\times 55 $ separable smoother. +Listing \ref{lst:medianSeparable} presents a kernel code that implements the above considerations and achieves a 1D vertical $n \times 1$ median filter. The shared memory vector is declared as \texttt{extern} (Line 16) as its size is determined at runtime and passed to the kernel call as an argument. Lines 20 to 29 perform data prefetching, including the $2n$-row halo ($n$ at the bottom and $n$ at the top of each block). Then one synchronization barrier is mandatory (line 31) to ensure that all needed data is ready prior to its use by the different threads. +Torben Morgensen sorting takes place between lines 37 and 66 and eventually, the transposed output value is stored in global memory at line 69. Outputting the transposed image in global memory saves time and allows to reuse the same kernel to achieve the second step, e.g 1D horizontal $n \times 1$ median filtering. +It has to be noticed that this smoother, unlike the technique we proposed for fixed-size median filters, cannot be considered as a state-of-the-art technique as, for example, the one presented in \cite{4287006}. However, it may be considered as a good, easy to use and efficient alternative as confirmed by the results presented in Table \ref{tab:medianSeparable}. Pixel throughput values achieved by our kernel, though not constant with window size, remain very competitive if window size is kept under $120\times 120$ pixels, especially when outputting 2 pixels per thread (in \cite{4287006}, pixel throughput is around 7MP/s). +Figure \ref{fig:sap_examples2} shows an example of a $512\times 512$ pixel image, corrupted by a \textit{salt and pepper} noise, and the denoised versions, outputted respectively by a $3\times 3$, a $5\times 5$, and a $55\times 55 $ separable smoother. \begin{figure} \subfigure[Airplane image, corrupted with by salt and pepper noise of density 0.25]{\label{img:sap_example_ref} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25.png}}\qquad \subfigure[Image denoised by a $3\times 3$ separable smoother]{\label{img:sap_example_sep_med3} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_sep_med3.png}}\\ \subfigure[Image denoised by a $5\times 5$ separable smoother]{\label{img:sap_example_sep_med5} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_sep_med5.png}}\qquad \subfigure[Image background estimation by a $55\times 55$ separable smoother]{\label{img:sap_example_sep_med3_it2} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_sep_med111.png}}\\ - \caption{Example of separable median filtering (smoother), applied to salt \& pepper noise reduction.} + \caption{Example of separable median filtering (smoother), applied to salt and pepper noise reduction.} \label{fig:sap_examples2} \end{figure} diff --git a/BookGPU/Chapters/chapter3/code/kernMedianForget1pix3.cu b/BookGPU/Chapters/chapter3/code/kernMedianForget1pix3.cu index a34d784..5a04bd2 100755 --- a/BookGPU/Chapters/chapter3/code/kernMedianForget1pix3.cu +++ b/BookGPU/Chapters/chapter3/code/kernMedianForget1pix3.cu @@ -11,8 +11,7 @@ __device__ inline void s(int* a, int* b) #define minmax5(a, b, c, d, e) s(a, b); s(c, d); min3(a, c, e); max3(b, d, e); #define minmax6(a, b, c, d, e, f) s(a,d); s(b, e); s(c, f); min3(a, b, c); max3(d, e, f); -__global__ void kernel_medianForget1pix3( short *output, - int i_dim, int j_dim) +__global__ void kernel_medianForget1pix3( short *output, int i_dim, int j_dim) { int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; diff --git a/BookGPU/Chapters/chapter3/code/kernMedianForget1pix3.cu~ b/BookGPU/Chapters/chapter3/code/kernMedianForget1pix3.cu~ index c842715..a34d784 100755 --- a/BookGPU/Chapters/chapter3/code/kernMedianForget1pix3.cu~ +++ b/BookGPU/Chapters/chapter3/code/kernMedianForget1pix3.cu~ @@ -1,13 +1,7 @@ __device__ inline void s(int* a, int* b) -{ - +{ int tmp ; - if (*a > *b) - { - tmp = *b ; - *b = *a ; - *a = tmp ; - } + if (*a > *b) { tmp = *b; *b = *a; *a = tmp;} } #define min3(a, b, c) s(a, b); s(a, c); @@ -17,52 +11,28 @@ __device__ inline void s(int* a, int* b) #define minmax5(a, b, c, d, e) s(a, b); s(c, d); min3(a, c, e); max3(b, d, e); #define minmax6(a, b, c, d, e, f) s(a,d); s(b, e); s(c, f); min3(a, b, c); max3(d, e, f); -__global__ void kernel_median3( short *output, int i_dim, int j_dim) +__global__ void kernel_medianForget1pix3( short *output, + int i_dim, int j_dim) { - - // coordonnees absolues du point int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; - - /************************************************************************** - * tri(s) - **************************************************************************/ int a0, a1, a2, a3, a4, a5 ; - /******************************************************************************** - * les six premieres valeurs (suffisant pour median 3x3 par forgetfull selection) - ********************************************************************************/ - a0 = tex2D(tex_img_ins, j-1, i-1) ; + a0 = tex2D(tex_img_ins, j-1, i-1) ; // first 6 values a1 = tex2D(tex_img_ins, j, i-1) ; a2 = tex2D(tex_img_ins, j+1, i-1) ; a3 = tex2D(tex_img_ins, j-1, i) ; a4 = tex2D(tex_img_ins, j, i) ; a5 = tex2D(tex_img_ins, j+1, i) ; + minmax6(&a0, &a1, &a2, &a3, &a4, &a5);//min->a0 max->a5 + a5 = tex2D(tex_img_in, j-1, i+1) ; //next value in a5 + minmax5(&a1, &a2, &a3, &a4, &a5) ; //min->a1 max->a5 + a5 = tex2D(tex_img_ins, j, i+1) ; //next value in a5 + minmax4(&a2, &a3, &a4, &a5) ; //min->a1 max->a5 + a5 = tex2D(tex_img_ins, j+1, i+1) ; //next value in a5 + minmax3(&a3, &a4, &a5) ; //min->a1 max->a5 - //min max aux extremites - minmax6(&a0, &a1, &a2, &a3, &a4, &a5) ; - - /******************************************** - * les deux valeurs suivantes aux extremites - ********************************************/ - a5 = tex2D(tex_img_in, j-1, i+1) ; - - minmax5(&a1, &a2, &a3, &a4, &a5) ; - - /******************************************** - * la derniere valeur a la fin - ********************************************/ - - a5 = tex2D(tex_img_ins, j, i+1) ; - - minmax4(&a2, &a3, &a4, &a5) ; - - a5 = tex2D(tex_img_ins, j+1, i+1) ; - minmax3(&a3, &a4, &a5) ; - - - //median au milieu ! - output[ __mul24(i, j_dim) +j ] = a4 ; + output[ __mul24(i, j_dim) +j ] = a4 ; //middle value } diff --git a/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu b/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu index 29922a3..c613706 100755 --- a/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu +++ b/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu @@ -8,9 +8,9 @@ __global__ void kernel_Median3RegSort9( short *output, a0 = tex2D(tex_img_ins, j-1, i-1) ; // fetching values a1 = tex2D(tex_img_ins, j , i-1) ; a2 = tex2D(tex_img_ins, j+1, i-1) ; - a3 = tex2D(tex_img_ins, j-1, i) ; - a4 = tex2D(tex_img_ins, j , i) ; - a5 = tex2D(tex_img_ins, j+1, i) ; + a3 = tex2D(tex_img_ins, j-1, i ) ; + a4 = tex2D(tex_img_ins, j , i ) ; + a5 = tex2D(tex_img_ins, j+1, i ) ; a6 = tex2D(tex_img_ins, j-1, i+1) ; a7 = tex2D(tex_img_ins, j , i+1) ; a8 = tex2D(tex_img_ins, j+1, i+1) ; diff --git a/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu~ b/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu~ index 363b181..29922a3 100755 --- a/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu~ +++ b/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu~ @@ -1,4 +1,4 @@ -__global__ void kernel_Median3RegTri9( short *output, +__global__ void kernel_Median3RegSort9( short *output, int i_dim, int j_dim) { int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; diff --git a/BookGPU/Chapters/chapter3/code/medianGeneric.cu b/BookGPU/Chapters/chapter3/code/medianGeneric.cu index bf41842..dff2a92 100755 --- a/BookGPU/Chapters/chapter3/code/medianGeneric.cu +++ b/BookGPU/Chapters/chapter3/code/medianGeneric.cu @@ -18,7 +18,7 @@ __global__ void kernel_medianR( short *output, for(ic=0; ic<256; ic++) { cpt += histogram[ ic ] ; - // selection of 50% percentile + // selection of the median value if ( cpt > ((2*r+1)*(2*r+1))>>1 ) break ; } output[ __mul24(i, j_dim) +j ] = ic ; diff --git a/BookGPU/Chapters/chapter3/code/medianGeneric.cu~ b/BookGPU/Chapters/chapter3/code/medianGeneric.cu~ index 7280904..bf41842 100755 --- a/BookGPU/Chapters/chapter3/code/medianGeneric.cu~ +++ b/BookGPU/Chapters/chapter3/code/medianGeneric.cu~ @@ -1,11 +1,25 @@ -__device__ inline void s(int* a, int* b) -{ +__global__ void kernel_medianR( short *output, + int i_dim, int j_dim, int r) +{ + // absolute coordinates of the center pixel + int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; + int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; - int tmp ; - if (*a > *b) + short cpt, ic, jc ; + short histogram[256] ; // 8 bit image + // zeroing histogram data + for (ic =0; ic<256; ic++) histogram[ ic ]=0 ; + // histogram filling + for(ic=i-r; ic<=i+r; ic++ ) + for(jc=j-r; jc<=j+r; jc++) + histogram[ tex2D(tex_img_ins, jc, ic) ]++ ; + // histogram parsing + cpt = 0 ; + for(ic=0; ic<256; ic++) { - tmp = *b ; - *b = *a ; - *a = tmp ; - } + cpt += histogram[ ic ] ; + // selection of 50% percentile + if ( cpt > ((2*r+1)*(2*r+1))>>1 ) break ; + } + output[ __mul24(i, j_dim) +j ] = ic ; } diff --git a/BookGPU/Chapters/chapter3/img/debitPlot1.pdf b/BookGPU/Chapters/chapter3/img/debitPlot1.pdf index 9cd73d0..a4794f4 100644 Binary files a/BookGPU/Chapters/chapter3/img/debitPlot1.pdf and b/BookGPU/Chapters/chapter3/img/debitPlot1.pdf differ diff --git a/BookGPU/Chapters/chapter3/img/debitPlot2.pdf b/BookGPU/Chapters/chapter3/img/debitPlot2.pdf index e0b3d0d..cd1fc58 100644 Binary files a/BookGPU/Chapters/chapter3/img/debitPlot2.pdf and b/BookGPU/Chapters/chapter3/img/debitPlot2.pdf differ diff --git a/BookGPU/Chapters/chapter4/biblio4.bib b/BookGPU/Chapters/chapter4/biblio4.bib index 463e4e4..49063f2 100644 --- a/BookGPU/Chapters/chapter4/biblio4.bib +++ b/BookGPU/Chapters/chapter4/biblio4.bib @@ -1,8 +1,10 @@ -@unpublished{convolutionsoup, +@inproceedings{convolutionsoup, title = {Convolution Soup}, + booktitle = {GPU Technology Conference}, author = {Stam, J.}, abstract = {Graphics processors can be easily programmed to provide significant acceleration in many common parallel tasks. However, with additional architecture knowledge and understanding of optimization strategies, a savvy programmer can unleash the full potential of the GPU's massive memory bandwidth and ensure the processing resources are utilized to their fullest extent. In this talk, we'll explore several different approaches to a very simple but ubiquitous image processing algorithm, the convolution. A naive approach shows the detrimental impact of poorly written code, a simple approach achieves decent results with little effort or code complexity, and a few highly optimized techniques realize the GPUs full power for the most demanding tasks. The techniques explored in this simple but illustrative example will serve as a base for understanding the optimization strategies to apply towards more complex algorithms.}, year = {2010}, - month ={8}, + month ={Aug.}, pdf = {http://fr.slideshare.net/NVIDIA/1412-gtc09}, + url = {http://fr.slideshare.net/NVIDIA/1412-gtc09}, } \ No newline at end of file diff --git a/BookGPU/Chapters/chapter4/ch4.tex b/BookGPU/Chapters/chapter4/ch4.tex index 8788ab4..0a0d6cb 100644 --- a/BookGPU/Chapters/chapter4/ch4.tex +++ b/BookGPU/Chapters/chapter4/ch4.tex @@ -10,13 +10,13 @@ In this chapter, after dealing with GPU median filter implementations, we propose to explore how convolutions\index{Convolution} can be implemented on modern GPUs. Widely used in digital image processing filters, the \emph{convolution -operation} basically consists in taking the sum of products of elements -from two 2-D functions, letting one of the two functions move over +operation} basically consists of taking the sum of products of elements +from two 2D functions, letting one of the two functions move over every element of the other, producing a third function that is typically viewed as a modified version of one of the original functions. To -begin with, we shall examine non-separable or generic convolutions, -before adressing the matter of separable convolutions. We shall refer -to $I$ as an H x L pixel gray-level image, and to $I(x,y)$ as the gray-level +begin with, we shall examine non separable or generic convolutions, +before addressing the matter of separable convolutions. We shall refer +to $I$ as an $H\times L$ pixel gray-level image and to $I(x,y)$ as the gray-level value of each pixel of coordinates $(x,y)$. @@ -25,15 +25,15 @@ value of each pixel of coordinates $(x,y)$. Within a digital image $I$, the convolution operation is performed between image $I$ and convolution mask \emph{h} (To avoid confusion with other GPU functions referred to as kernels, we shall use\emph{ convolution -mask} instead of \emph{convolution kernel}) is defined by: +mask} instead of \emph{convolution kernel}) is defined by \begin{equation} -I'(x, y) = \left(I * h\right) = \sum_{(i < H)} \sum_{(j < L)}I(x-j, y-j).h(j,i) +I'(x, y) = \left(I * h\right) = \sum_{(i < H)} \sum_{(j < L)}I(x-j, y-j)h(j,i) \label{convoDef} \end{equation} While processing an image, function \emph{h} is often bounded by a square -window of size \emph{k = 2r + 1}, \textit{i.e} an uneven number, to ensure +window of size \emph{k = 2r + 1}, i.e., an uneven number, to ensure there is a center. We shall also point out that, as stated earlier, -the square shape is no limiting factor to the process, as any shape +the square shape is not a limiting factor to the process, as any shape can be inscribed into a square. In the case of a more complex shape, the remaining space is filled by null values (padding). @@ -41,7 +41,7 @@ the remaining space is filled by null values (padding). \section{Implementation} The basic principle of computing a convolution between one $I$ picture and one \emph{h} convolution mask defined on domain $\Omega$ is given -by algorithm \ref{algo_genconv} and illustrated by Figure \ref{fig:convoPrinciple}, which mainly shows how gray-level values of the center pixel's neighborhood are combined with the convolution mask values to compute the output value. +by Algorithm \ref{algo_genconv} and illustrated by Figure \ref{fig:convoPrinciple}, which mainly shows how gray-level values of the center pixel's neighborhood are combined with the convolution mask values to compute the output value. For more readability, only part of the connecting lines are shown. \begin{figure} \centering @@ -54,9 +54,9 @@ For more readability, only part of the connecting lines are shown. \label{algo_genconv} \ForEach{pixel at position $(x, y)$}{ Read all gray-level values $I(x, y)$ in the neighborhood\; - Compute the weighted sum \( I_\Omega = \sum_{(j,i) \in \Omega}I(x-j, y-j).h(j,i) \)\; + Compute the weighted sum \( I_\Omega = \sum_{(j,i) \in \Omega}I(x-j, y-j)h(j,i) \)\; Normalize $I'(x, y)$ value\; - Outputs the new gray-level value + Output the new gray-level value } \end{algorithm} @@ -68,7 +68,7 @@ brightness of the image will be altered and a normalization stage has to take place, as, for example, in the case of an 8-bit coded image: \begin{enumerate} -\item if $S \ge 0$ then $I' = I_\Omega / S$ +\item if $S > 0$ then $I' = I_\Omega / S$ \item if $S = 0$ then $I' = I_\Omega + 128$ \item if $S < 0$ then $I' = I_\Omega + 255$ \end{enumerate} @@ -78,69 +78,84 @@ each pixel, which will be quite time-costly when performed on a GPU. A simple wo \subsection{First test implementation} This first implementation consists of a rather naive application to -convolutions of the tuning recipes applied to median filters in the -previous chapter, as a reminder : texture memory used with incoming +convolutions of the techniques applied to median filters in the +previous chapter, as a reminder: texture memory used with incoming data, pinned memory with output data, optimized use of registers while processing data and multiple output per thread\index{Multiple output per thread}. One significant difference lies in the fact that the median filter uses only one parameter, the size of the window mask, -which can be hard-coded, while a convolution mask requires referring to several; hard-coding -its elements would lead to severe lack of flexibility (one function +which can be hard-coded, while a convolution mask requires referring to several parameters; hard-coding +the elements of the mask would lead to severe lack of flexibility (one function per filter, no external settings) so we will just use it as a starting point in our approach. Let us assume that we are planning to implement the convolution defined by the following $3\times 3$ mask (low-pass filter or averaging filter): $$h=\frac{1}{9}\begin{bmatrix}1&1&1\\1&1&1\\1&1&1\end{bmatrix}$$ -The kernel code presented in Listing \ref{lst:convoGene3Reg8} implements the convolution operation and applies all above optimizations except, for clarity reasons, multiple output per thread. -In the particular case of a generic convolution, it is important to note how mask coefficients are applied to image pixels in order to fit the definition of equation \ref{convoDef}: if the coordinates of the center pixel had been set to (0,0), then the pixel of coordinates $(i,j)$ would have been multiplied by the element $(-i,-j)$ of the mask, which, transposed in our kernel code, leads to multiply the $p^{th}$ pixel of the window by the $(n-p)^{th}$ element of the convolution mask. +The kernel code presented in Listing \ref{lst:convoGene3Reg8} implements the convolution operation and applies all above optimizations except, for clarity reasons, multiple outputs per thread. +In the particular case of a generic convolution, it is important to note how mask coefficients are applied to image pixels in order to fit the definition of equation \ref{convoDef}: if the coordinates of the center pixel had been set to (0,0), then the gray-level value of pixel of coordinates $(i,j)$ would have been multiplied by the element $(-i,-j)$ of the mask, which, transposed in our kernel code, leads to multiplying the $p^{th}$ pixel of the window by the $(n-p)^{th}$ element of the convolution mask. -\lstinputlisting[label={lst:convoGene3Reg8},caption=Generic CUDA kernel achieving a convolution operation with hard-coded mask values]{Chapters/chapter4/code/convoGene3Reg8.cu} +\lstinputlisting[label={lst:convoGene3Reg8},caption=generic CUDA kernel achieving a convolution operation with hard-coded mask values]{Chapters/chapter4/code/convoGene3Reg8.cu} Table \ref{tab:convoNonSepReg1} shows kernel timings and throughput values for such a low-pass filter extended to $5\times 5$ and $7\times 7$ masks applied on 8-bit coded gray-level -images of sizes $512\times 512$, $1024\times 1024$, $2048\times 2048$, $4096\times 4096$ and run on a C2070 card with $32\times 8$ thread blocks. -As a reminder, Table \ref{tab:memcpy1} details the data transfer costs that helped computing throughput values. +images of sizes $512\times 512$, $1024\times 1024$, $2048\times 2048$, and $4096\times 4096$ run on a C2070 card with $32\times 8$ thread blocks. -\begin{table}[h] +\begin{table}[htbp] \centering {\normalsize -\begin{tabular}{|c||r|r|r|r|r|r|} +\begin{tabular}{|c||r|r||r|r||r|r|} \hline -\textbf{Mask size$\rightarrow$}&\multicolumn{2}{|c|}{\textbf{3x3}}&\multicolumn{2}{|c|}{\textbf{5x5}}&\multicolumn{2}{|c|}{\textbf{7x7}}\\ -\textbf{Image size$\downarrow$}&time (ms)&TP&time (ms)&TP&time (ms)&TP\\\hline\hline +\textbf{Mask size}$\rightarrow$&\multicolumn{2}{c||}{$\mathbf{3\times 3}$}&\multicolumn{2}{c||}{$\mathbf{5\times 5}$}&\multicolumn{2}{c|}{$\mathbf{7\times 7}$}\\ +\textbf{Image size}$\downarrow$&time (ms)&TP&time (ms)&TP&time (ms)&TP\\\hline\hline $\mathbf{512\times 512}$ &0.077&1165 &0.209&559 &0.407 &472 \\\hline $\mathbf{1024\times 1024}$&0.297&1432 &0.820&836 &1.603 &515 \\\hline $\mathbf{2048\times 2048}$&1.178&1549 &\bf 3.265&\bf 875 &6.398&529 \\\hline $\mathbf{4096\times 4096}$&4.700&1585 &13.05&533 &25.56&533 \\\hline \end{tabular} } -\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card (fermi architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.} +\caption[Timings (time) and throughput values (TP in MP/s) of one register-only non-separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a C2070 card.]{Timings (time) and throughput values (TP in MPx/s) of one register-only non-separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a C2070 card (fermi architecture). Data transfer duration are those of Table \ref{tab:memcpy1}. The bold value points out the result obtained in the reference situation.} \label{tab:convoNonSepReg1} \end{table} -\begin{table}[h] + + + + + + +Table \ref{tab:convoNonSepReg3} shows timings and global throughput values achieved by those convolution masks on an NVIDIA GT200 Tesla architecture (GTX280 card) with $16\times 8$ thread blocks. This measurement has been done in order to make a relevant comparison with a reference given by NVIDIA in \cite{convolutionsoup} in which they state that their fastest kernel achieves a $5\times 5$ convolution of an 8-bit $2048\times 2048$ pixel image in $1.4~ms$, leading to a throughput value of 945~MP/s. In all the result tables, the values associated to this reference will be presented in boldface. +Our current value of 802~MP/s, though not unsatisfactory, remains lower to the one reached by the manufacturer's own coding. +Tested in the same conditions, the newer Fermi architecture of +NVIDIA's GPUs proved slower (3.3 ms, see Table \ref{tab:convoNonSepReg1}) due to the lower maximum +register count allowed (63 as opposed to 128 for Tesla GT200). + +\begin{table}[htbp] \centering {\normalsize -\begin{tabular}{|c||r|r|r|r|r|r|} +\begin{tabular}{|c||r|r||r|r||r|r|} \hline -\textbf{Mask size$\rightarrow$}&\multicolumn{2}{|c|}{\textbf{3x3}}&\multicolumn{2}{|c|}{\textbf{5x5}}&\multicolumn{2}{|c|}{\textbf{7x7}}\\ -\textbf{Image size$\downarrow$}&time (ms)&TP&time (ms)&TP&time(ms)&TP\\\hline\hline +\textbf{Mask size}$\rightarrow$&\multicolumn{2}{c||}{$\mathbf{3\times 3}$}&\multicolumn{2}{c||}{$\mathbf{5\times 5}$}&\multicolumn{2}{c|}{$\mathbf{7\times 7}$}\\ +\textbf{Image size}$\downarrow$&time (ms)&TP&time (ms)&TP&time(ms)&TP\\\hline\hline $\mathbf{512\times 512}$ &0.060&1186 &0.148&848 &0.280&594 \\\hline $\mathbf{1024\times 1024}$&0.209&1407 &0.556&960 &1.080&649 \\\hline $\mathbf{2048\times 2048}$&0.801&1092 &\bf 2.189&\bf 802 &4.278&573 \\\hline $\mathbf{4096\times 4096}$&3.171&1075 &8.720&793 &17.076&569 \\\hline \end{tabular} } -\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.} +\caption[Timings (time) and throughput values (TP in MP/s) of one register-only non-separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a GTX280.]{Timings (time) and throughput values (TP in MP/s) of one register-only non-separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer duration are those of Table \ref{tab:memcpy1}. The bold value points out the result obtained in the reference situation.} \label{tab:convoNonSepReg3} \end{table} +It is interesting to note that, as long as each thread processes one single pixel, kernel execution time is ruled in proportion +with the number of pixels in the image multiplied by that of the mask. +The proportionality factor, that we call \textit{slope}, is $3.14.10^{-8}$~ms/pix on C2070 in this first implementation. +As a reminder, Table \ref{tab:memcpy1} details the data transfer costs that helped in computing throughput values. \begin{table}[h] \centering {\normalsize \begin{tabular}{|c||r|r|} \hline -\shortstack{\textbf{GPU card$\rightarrow$}\\\textbf{Image size$\downarrow$}}&\textbf{C2070}&\textbf{GTX280}\\\hline\hline +\shortstack{\textbf{GPU card}$\rightarrow$\\\textbf{Image size$\downarrow$}}&\textbf{C2070}&\textbf{GTX280}\\\hline\hline $\mathbf{512\times 512}$ &0.148 &0.161 \\\hline $\mathbf{1024\times 1024}$&0.435 &0.536 \\\hline $\mathbf{2048\times 2048}$&1.530 &3.039 \\\hline @@ -151,25 +166,14 @@ $\mathbf{4096\times 4096}$&5.882 &12.431 \\\hline \label{tab:memcpy1} \end{table} -Table \ref{tab:convoNonSepReg3} shows timings and global throughput values achieved by those convolution masks on an Nvidia GT200 Tesla architecture (GTX480 card) with $16x8$ thread blocks. This measurement has been done in order to make a relevant comparison with a reference given by Nvidia in \cite{convolutionsoup} where they state that their fastest kernel achieves a $5\times5$ convolution of an 8-bit $2048\times 2048$ pixelimage in $1.4~ms$, which lead to a throughput value of 945~Mpix/s. -Our current value of 802~Mpix/s, though not unsatisfactory, remains lower to the one reached by the manufacturer's own coding. -Tested in the same conditions, the newer Fermi architecture of -Nvidia's GPUs proved slower (3.3 ms, see Table \ref{tab:convoNonSepReg1}) due to the lower maximum -register count allowed (63, against 128 for Tesla GT200). - -It is interesting to note that, as long as each thread processes one single pixel, kernel execution time is ruled in proportion -with the number of pixels in the image multiplied by that of the mask. -The slope in this first implementaion is $3.14.10^{-8}~ms/pix$ on C2070. - \subsection{Using parameterizable masks} - To further improve the above implementation, it becomes necessary to free ourselves from the hard-coding constraint. To achieve this, as was the case with input image storing, several memory options are available, but, since the amount of data involved in processing a mask is quite small and constant, we considered it relevant to copy data -into \emph{symbol memory}. Listing \ref{lst:symbolmem} details the process, involving -the Cuda function \emph{CudaMemCopyToSymbol()}. +into \emph{symbol memory}. Listing \ref{lst:symbolmem} details this process, involving +the CUDA function \emph{cudaMemcpyToSymbol()}. \lstinputlisting[label={lst:symbolmem},caption=code snippet showing how to setup a mask in GPU symbol memory]{Chapters/chapter4/code/maskInSymbol.cu} @@ -179,10 +183,10 @@ a generic convolution kernel, whose code immediately appears both simple and concise. Its global time performance, however, is comparatively lower than the register-only process, due to the use of constant memory and of the \emph{r} parameter -(radius of the mask). The average slope amounts to $3.81~ms/pix$ on C2070, -which means a time-cost increase of around $20~\%$. +(radius of the mask). The average slope amounts to $3.81.10^{-8}$~ms/pix on C2070, +which means a time-cost increase of around 20~\%. -\lstinputlisting[label={lst:convoGene8r},caption=Generic CUDA kernel achieving a convolution operation with the mask in symbol memory and its radius passed as a parameter]{Chapters/chapter4/code/convoGene8r.cu} +\lstinputlisting[label={lst:convoGene8r},caption=generic CUDA kernel achieving a convolution operation with the mask in symbol memory and its radius passed as a parameter]{Chapters/chapter4/code/convoGene8r.cu} \subsection{Increasing the number of pixels processed by each thread} Much in the same way as we did with the Median Filter, we shall now @@ -193,87 +197,88 @@ of the size of the convolution mask, one can envisage processing 2 or more pixels per thread while keeping safely within the 63-per-thread rule. -However, when doing so, \textit{e.g} processing what we shall call a \textit{packet} of pixels, window mask overlapping has to be taken into account -to avoid multiple texture fetches of each pixel's gray-level value, while benefiting from the 2-D cache. +However, when doing so, e.g., processing what we shall call a \textit{packet} of pixels, window mask overlapping has to be taken into account +to avoid multiple texture fetches of each pixel's gray-level value, while benefiting from the 2D cache. In that case, both mask size and pixel packet shape determine the number of texture fetches to be performed for each pixel value. -Figure \ref{fig:convoOverlap1} illustrates two different situations: on top, a mask of radius 1 ($3\times 3$) applied to a packet of 8 pixels in row; at bottom, a mask of radius 2 ($5\times 5$). +Figure \ref{fig:convoOverlap1} illustrates two different situations: (a) a mask of radius 1 ($3\times 3$) applied to a packet of 8 pixels in a row; (b) a mask of radius 2 ($5\times 5$). The dark gray pixels are the center pixels (pixels of the packet), while light gray pixels belong to the halo around the packet. The number in each pixel box corresponds to the convolution count in which it is involved. -There would be little interest in using different \textit{packet} shapes, as the final global memory writes would not be coalescent; generating multiple latencies. - \begin{figure} +There would be little interest in using different \textit{packet} shapes, as the final global memory writes would not be coalescent, generating multiple latencies. + \begin{figure}[htbp] \centering - \subfigure[$3\times 3$ mask: there are 18 center pixels (out of 30) involved in 3 computations.]{ \includegraphics[width=5.8cm]{Chapters/chapter4/img/convoOverlap1.png}}\\ - \subfigure[$5\times 5$ mask: only 20 center pixels (out of 60), involved in 5 computations.]{ \includegraphics[width=7cm]{Chapters/chapter4/img/convoOverlap2.png}} - \caption{Mask window overlapping when processing 8 pixels per thread. Top: $3\times 3$ mask. Bottom: $5\times 5$ mask.} + \subfigure[$3\times 3$ mask: there are 18 pixels (out of 30) involved in 3 computations.]{ \includegraphics[width=5.8cm]{Chapters/chapter4/img/convoOverlap1.png}}\\ + \subfigure[$5\times 5$ mask: only 20 pixels (out of 60) are involved in 5 computations.]{ \includegraphics[width=7cm]{Chapters/chapter4/img/convoOverlap2.png}} + \caption[Mask window overlapping when processing a packet of 8 pixels per thread.]{Mask window overlapping when processing a packet of 8 pixels per thread. The dark gray pixels are the center pixels, while light gray pixels belong to the halo. The number in each pixel box is the convolution count in which it is involved. (a) $3\times 3$ mask; (b) $5\times 5$ mask.} \label{fig:convoOverlap1} \end{figure} -Altough we actually wrote GPU kernels able to process 2, 4, 8 and 16 pixels per thread, only the one that processes 8 pixels per thread is presented below, as it proved to be the fastest one. Listing \ref{lst:convoGene8x8pL3} reproduce the source code of the kernel for $3\times 3$ masks. -The bottom line is that each thread is associated with one base pixel of coordinates $(x,y)$ which is the first of the packet to be processed, the last one being $(x+7,y)$. -\lstinputlisting[label={lst:convoGene8x8pL3},caption=CUDA kernel achieving a $3\times 3$ convolution operation with the mask in symbol memory and direct data fetches in texture memory]{Chapters/chapter4/code/convoGene8x8pL3.cu} +Although we actually have written GPU kernels able to process 2, 4, 8, and 16 pixels per thread, only the one that processes 8 pixels per thread is presented below, as it proved to be the fastest one. Listing \ref{lst:convoGene8x8pL3} reproduces the source code of the kernel for $3\times 3$ masks. +The bottom line is that each thread is associated with one base pixel of coordinates $(x,y)$ which is the first, in the packet, to be processed, the last one being $(x+7,y)$. -In this particular case of a $3\times 3$ mask, each pixel value is used in 3 different convolution sums, except pixels located near both ends of the packet, whose values are used in fewer sums. -The general rule, when performing a $n\times n$ convolution (radius $k$) by 8-pixel packets is that each of the $(8-2k).(2k+1)$ \textit{center} pixels of the halo is used in $k$ sums, while the $4k.(2k+1)$ remaining pixels, located around the ends of the packet are used in fewer sums, from $k-1$ to $1$ ($2.(2k+1)$ pixels each). -\begin{table}[h] +In this particular case of a $3\times 3$ mask, each pixel value is used in 3 different convolution sums, except for pixels located near both ends of the packet, whose values are used in fewer sums. +The general rule, when performing an $n\times n$ convolution (radius $k$) by 8-pixel packets is that each of the $(8-2k).(2k+1)$ \textit{center} pixels of the halo is used in $k$ sums, while the $4k.(2k+1)$ remaining pixels, located around the ends of the packet, are used in fewer sums, from $k-1$ to $1$ ($2(2k+1)$ pixels each). +\begin{table}[htbp] \centering {\normalsize -\begin{tabular}{|c||r|r|r|r|r|r|} +\begin{tabular}{|c||r|r||r|r||r|r|} \hline -\textbf{Mask size$\rightarrow$}&\multicolumn{2}{|c|}{\textbf{3x3}}&\multicolumn{2}{|c|}{\textbf{5x5}}&\multicolumn{2}{|c|}{\textbf{7x7}}\\ -\textbf{Image size$\downarrow$}&time (ms)&TP&time (ms)&TP&time (ms)&TP\\\hline\hline +\textbf{Mask size}$\rightarrow$&\multicolumn{2}{c||}{$\mathbf{3\times 3}$}&\multicolumn{2}{c||}{$\mathbf{5\times 5}$}&\multicolumn{2}{c|}{$\mathbf{7\times 7}$}\\ +\textbf{Image size}$\downarrow$&time (ms)&TP&time (ms)&TP&time (ms)&TP\\\hline\hline $\mathbf{512\times 512}$ &0.036&1425 &0.069&1208 &0.110&1016 \\\hline $\mathbf{1024\times 1024}$&0.128&1862 &0.253&1524 &0.413&1237 \\\hline $\mathbf{2048\times 2048}$&0.495&2071 &\bf 0.987&1666 &1.615&1334 \\\hline $\mathbf{4096\times 4096}$&1.964&2138 &3.926&1711 &6.416&1364 \\\hline \end{tabular} } -\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.} +\caption[Timings (time) and throughput values (TP in MP/s) of our generic fixed mask size convolution kernel run on a C2070 card.]{Timings (time) and throughput values (TP in MP/s) of our generic fixed mask size convolution kernel run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}. The bold value points out the result obtained in the reference situation.} \label{tab:convoGene8x8p} \end{table} -Timing results and throughput values are shown in Table \ref{tab:convoGene8x8p}, and show that this solution now outperforms Nvidia references. +Timing results and throughput values are shown in Table \ref{tab:convoGene8x8p}, and show that this solution now outperforms NVIDIA references. It is important to remember that the above kernels have been optimized for the Fermi architecture, unlike those mentioned earlier, which were more efficient on the GT200 architecture. -However, our technique requires to write one kernel per mask size, which can be seen as a major constraint. To make it easier to use this method, we shall propose a kernel code generator that will be available in the near future. +However, our technique requires writing one kernel per mask size, which can be seen as a major constraint. To make it easier to use this method, we are working on a kernel code generator that is currently under development and will be made available in the near future. + +\lstinputlisting[label={lst:convoGene8x8pL3},caption=CUDA kernel achieving a $3\times 3$ convolution operation with the mask in symbol memory and direct data fetches in texture memory]{Chapters/chapter4/code/convoGene8x8pL3.cu} \subsection{Using shared memory to store prefetched data\index{Prefetching}.} \index{memory~hierarchy!shared~memory} A more convenient way of coding a convolution kernel is to use shared memory to perform a prefetching stage of the whole halo before computing the convolution sums. -This proves to be quite efficient and more versatile, but it obviously generates some overhead as: +This proves to be quite efficient and more versatile, but it obviously generates some overhead because \begin{itemize} \item Each pixel value has to be read at least twice, first from texture memory into shared memory and then one or several more times from shared memory to be used in convolution computations. \item Reducing the number of times a single pixel value is read from shared memory is bound to generate bank conflicts, hence once again performance loss. \end{itemize} - \begin{figure} + \begin{figure}[htbp] \centering \includegraphics[width=12cm]{Chapters/chapter4/img/convoShMem.png} - \caption[Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$.]{Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$. Threads in both top corners of the top figure are identified either by a circle or by a star symbol. The image tile, loaded into shared memory includes the pixels to be updated by the threads of the block, as well as its 2-pixel wide halo. Here, circle and star symbols in the image tile show which pixels are actually loaded into one shared memory vector by its corresponding thread. } + \caption[Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$.]{Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$. Threads in both top corners of the top figure are identified either by a circle or by a star symbol. The image tile, loaded into shared memory, includes the pixels to be updated by the threads of the block, as well as its 2-pixel wide halo. Here, circle and star symbols in the image tile show which pixels are actually loaded into one shared memory vector by its corresponding thread. } \label{fig:ShMem1} \end{figure} -Still, we also implemented this method, in a similar manner as Nvidia did in its SDK sample code. +Still, we also implemented this method, in a similar manner as NVIDIA did in its SDK sample code. Some improvement has been obtained by increasing the number of pixels processed by each thread, to an optimum 8 pixels per thread. The principle is to prefetch all pixel values involved in the computations performed by all threads of a block, including 8 pixels per thread plus the halo of radius $r$ (the radius of the convolution mask). As this obviously represents more values than the thread count in one block, some threads have to load more than one value. The general organization is reproduced in Figure \ref{fig:ShMem1} for $5\times 5$ mask and a $8\times 4$ thread block, while Listing \ref{lst:convoGeneSh1} gives the details of the implementation with its two distinct code blocks: preload in shared memory (Lines 20 to 42) and convolution computations (Lines 45 to 57). -Table \ref{tab:convoGeneSh1} details timing results of this implementation ($16\times 8$ threads/block), up to $13\times 13$ masks, that will serve as a reference in the next section, devoted to separable convolution. -\begin{table}[h] +Tables \ref{tab:convoGeneSh1} and \ref{tab:convoGeneSh2} detail timing results and throughput values of this implementation ($16\times 8$ threads/block), up to $13\times 13$ masks, that will serve as a reference in the next section, devoted to separable convolution. +\begin{table}[htbp] \centering {\normalsize \begin{tabular}{|c||r|r|r|r|r|r|} \hline -\shortstack{\textbf{Mask size$\rightarrow$}\\\textbf{Image size$\downarrow$}}&\textbf{3x3}&\textbf{5x5}&\textbf{7x7}&\textbf{9x9}&\textbf{11x11}&\textbf{13x13}\\\hline\hline +\shortstack{\textbf{Mask size}$\rightarrow$\\\textbf{Image size$\downarrow$}}&$\mathbf{3\times 3}$&$\mathbf{5\times 5}$&$\mathbf{7\times 7}$&$\mathbf{9\times 9}$&$\mathbf{11\times 11}$&$\mathbf{13\times 13}$\\\hline\hline $\mathbf{512\times 512}$ &0.040 &0.075 &0.141 &0.243&0.314&0.402\\\hline $\mathbf{1024\times 1024}$&0.141 &0.307 &0.524 &0.917&1.192&1.535\\\hline $\mathbf{2048\times 2048}$&0.543 &\bf 1.115&2.048 &3.598&4.678&6.037\\\hline $\mathbf{4096\times 4096}$&2.146 &4.364 &8.156 &14.341&18.652&24.020\\\hline \end{tabular} } -\caption{Performances, in milliseconds, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card.} +\caption{Performances, in milliseconds, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card. Data transfers duration are not included.} \label{tab:convoGeneSh1} \end{table} -\begin{table}[h] +\begin{table}[htbp] \centering {\normalsize \begin{tabular}{|c||r|r|r|r|r|r|} \hline -\shortstack{\textbf{Mask size$\rightarrow$}\\\textbf{Image size$\downarrow$}}&\textbf{3x3}&\textbf{5x5}&\textbf{7x7}&\textbf{9x9}&\textbf{11x11}&\textbf{13x13}\\\hline\hline +\shortstack{\textbf{Mask size}$\rightarrow$\\\textbf{Image size$\downarrow$}}&$\mathbf{3\times 3}$&$\mathbf{5\times 5}$&$\mathbf{7\times 7}$&$\mathbf{9\times 9}$&$\mathbf{11\times 11}$&$\mathbf{13\times 13}$\\\hline\hline $\mathbf{512\times 512}$ &1394 &1176 &907 &670&567&477\\\hline $\mathbf{1024\times 1024}$&1820 &1413 &1093 &776&644&532\\\hline $\mathbf{2048\times 2048}$&2023 &\bf 1586 &1172 &818&676&554\\\hline @@ -283,7 +288,7 @@ $\mathbf{4096\times 4096}$&2090 &1637 &1195 &830&684&561\\\hline \caption[Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.} \label{tab:convoGeneSh2} \end{table} -\lstinputlisting[label={lst:convoGeneSh1},caption=CUDA kernel achieving a generic convolution operation after a preloading of data in shared memory.]{Chapters/chapter4/code/convoGeneSh1.cu} +\lstinputlisting[label={lst:convoGeneSh1},caption=CUDA kernel achieving a generic convolution operation after a preloading of data in shared memory]{Chapters/chapter4/code/convoGeneSh1.cu} \section{Separable convolution} A convolution operation is said separable when its masks $h$ is the product of 2 vectors $h_v$ and $h_h$, as is the case in the following example: @@ -292,45 +297,32 @@ $$h = h_v \times h_h = \begin{bmatrix}1\\2\\1\end{bmatrix} \times \begin{bmatrix -2&4&-2\\ -1&2&-1 \end{bmatrix}$$ -Such a mask allows to replace a generic 2-D convolution operation by two consecutive stages of a 1-D convolution operation: a vertical of mask $h_v$ and a horizontal of mask $h_h$. -This saves a lot of arithmetic operations, as a generic $n\times n$ convolution applied on a $H\times L$ image basically represents $H.L.n^2$ multiplications and as many additions, while two consecutive $n\times 1$ convolutions only represents $2.H.L.n$ of each, \textit{e.g} 60\% operations are saved per pixel of the image for a $5\times 5$ mask.\\ -However, beside reducing the operation count, performing a separable convolution also means writing an intermediate image into global memory. -CPU implementations of separable convolutions often use a single function to perform both 1-D convolution stages. To do so, this function reads the input image and actually ouputs the transposed filtered image. -Applying that principle to GPUs is not efficient, as outputting the transposed image means non-coalescent writes into global memory, generating severe performance loss. Hence the idea of developing two different kernels, one for each of both vertical and horizontal convolutions. - -Here, the use of Shared memory is the best choice, as there is no overlapping between neighbor windows and thus no possible optimization. -Moreover, to ensure efficiency, it is important to read the input image from texture memory, which implies an internal GPU data copy between both 1-D convolution stages. -Which, even if it is faster than CPU/GPU data transfer, makes separable convolutions slower than generic convolutions for small mask sizes. On C2070, the lower limit is $7\times 7$ pixels ($9\times 9$ for $512\times 512$ images). - -Both vertical and horizontal kernels feature similar runtimes: Table \ref{tab:convoSepSh1} only contains their average execution time, including the internal data copy stage, while Table \ref{tab:convoSepSh2} shows the achieved global throughput values. Timings of the data copy stage are given in Table \ref{tab:cpyToArray}. -Listings \ref{lst:convoSepShV} and \ref{lst:convoSepShH} detail the implementation of both 1-D kernels, while Listing \ref{lst:convoSepSh} shows how to use them in addition with the data copy function in order to achieve a whole separable convolution. The shared memory size is dynamically passed as a parameter at kernel call time. Its expression is given in the comment line before its declaration. -\begin{table}[h] -\centering -{\normalsize -\begin{tabular}{|c||r|} -\hline -\textbf{Image size}&\textbf{C2070}\\\hline\hline -$\mathbf{512\times 512}$ &0.029 \\\hline -$\mathbf{1024\times 1024}$&0.101 \\\hline -$\mathbf{2048\times 2048}$&0.387 \\\hline -$\mathbf{4096\times 4096}$&1.533 \\\hline -\end{tabular} -} -\caption{Time cost of data copy between the vertical and the horizontal 1-D convolution stages, on a C2070 cards (in milliseconds).} -\label{tab:cpyToArray} -\end{table} +Such a mask allows us to replace a generic 2D convolution operation by two consecutive stages of a 1D convolution operation: a vertical of mask $h_v$ and a horizontal of mask $h_h$. +This saves a lot of arithmetic operations, as a generic $n\times n$ convolution applied on an $H\times L$ image basically represents $HLn^2$ multiplications and as many additions, while two consecutive $n\times 1$ convolutions represents only $2HLn$ of each, e.g., 60\% operations are saved per pixel of the image for a $5\times 5$ mask. + +However, besides reducing the operation count, performing a separable convolution also means writing an intermediate image into global memory. +CPU implementations of separable convolutions often use a single function to perform both 1D convolution stages. To do so, this function reads the input image and actually ouputs the transposed filtered image. +Applying this principle to GPUs is not efficient, as outputting the transposed image means non coalescent writes into global memory, generating severe performance loss. Hence the idea of developing two different kernels, one for each of the vertical and horizontal convolutions. + +Here, the use of shared memory is the best choice, as there is no overlapping between neighbor windows and thus no possible optimization. +Moreover, to ensure efficiency, it is important to read the input image from texture memory, which implies an internal GPU data copy between both 1D convolution stages. +This, even if it is faster than CPU/GPU data transfer, makes separable convolutions slower than generic convolutions for small mask sizes. On C2070, the lower limit is $7\times 7$ pixels ($9\times 9$ for $512\times 512$ images). + +Both vertical and horizontal kernels feature similar runtimes: Table \ref{tab:convoSepSh1} contains only their average execution time, including the internal data copy stage, while Table \ref{tab:convoSepSh2} shows the achieved global throughput values. Timings of the data copy stage are given in Table \ref{tab:cpyToArray}. +Listings \ref{lst:convoSepShV} and \ref{lst:convoSepShH} detail the implementation of both 1D kernels, while Listing \ref{lst:convoSepSh} shows how to use them in addition with the data copy function in order to achieve a whole separable convolution. The shared memory size is dynamically passed as a parameter at kernel call time. Its expression is given in both Listings (\ref{lst:convoSepShV} and \ref{lst:convoSepShH}), in the comment lines before its declaration. + \begin{table}[h] \centering {\normalsize \begin{tabular}{|c||r|r|r|r|r|r|} \hline -\shortstack{\textbf{Mask size$\rightarrow$}\\\textbf{Image size$\downarrow$}}&\textbf{3x3}&\textbf{5x5}&\textbf{7x7}&\textbf{9x9}&\textbf{11x11}&\textbf{13x13}\\\hline\hline +\shortstack{\textbf{Mask size}$\rightarrow$\\\textbf{Image size$\downarrow$}}&$\mathbf{3\times 3}$&$\mathbf{5\times 5}$&$\mathbf{7\times 7}$&$\mathbf{9\times 9}$&$\mathbf{11\times 11}$&$\mathbf{13\times 13}$\\\hline\hline $\mathbf{512\times 512}$ &0.080 &0.087 &0.095 &\bf 0.108&\bf 0.115&\bf 0.126\\\hline $\mathbf{1024\times 1024}$&0.306 &0.333 &\bf 0.333 &\bf 0.378&\bf 0.404&\bf 0.468\\\hline $\mathbf{2048\times 2048}$&1.094 &1.191 &\bf 1.260 &\bf 1.444&\bf 1.545&\bf 1.722\\\hline $\mathbf{4096\times 4096}$&4.262 &4.631 &\bf 5.000 &\bf 5.676&\bf 6.105&\bf 6.736\\\hline \end{tabular}} -\caption[Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run on a C2070 card.]{Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run on a C2070 card. Timings include data copy. Bold values correspond to situations where separable-convolution kernels run faster than non separable ones.} +\caption[Performances, in milliseconds, of our generic 8 pixels per thread 1D convolution kernels using shared memory, run on a C2070 card.]{Performances, in milliseconds, of our generic 8 pixels per thread 1D convolution kernels using shared memory, run on a C2070 card. Timings include data copy. Bold values correspond to situations where separable-convolution kernels run faster than non separable ones.} \label{tab:convoSepSh1} \end{table} \begin{table}[h] @@ -338,28 +330,42 @@ $\mathbf{4096\times 4096}$&4.262 &4.631 &\bf 5.000 &\bf 5.676&\bf 6.105&\bf 6.73 {\normalsize \begin{tabular}{|c||r|r|r|r|r|r|} \hline -\shortstack{\textbf{Mask size$\rightarrow$}\\\textbf{Image size$\downarrow$}}&\textbf{3x3}&\textbf{5x5}&\textbf{7x7}&\textbf{9x9}&\textbf{11x11}&\textbf{13x13}\\\hline\hline +\shortstack{\textbf{Mask size}$\rightarrow$\\\textbf{Image size$\downarrow$}}&$\mathbf{3\times 3}$&$\mathbf{5\times 5}$&$\mathbf{7\times 7}$&$\mathbf{9\times 9}$&$\mathbf{11\times 11}$&$\mathbf{13\times 13}$\\\hline\hline $\mathbf{512\times 512}$ &1150 &1116 &1079 &\bf 1024&\bf 997 &\bf 957\\\hline $\mathbf{1024\times 1024}$&1415 &1365 &\bf 1365 &\bf 1290&\bf 1250&\bf 1169\\\hline $\mathbf{2048\times 2048}$&1598 &1541 &\bf 1503 &\bf 1410&\bf 1364&\bf 1290\\\hline $\mathbf{4096\times 4096}$&1654 &1596 &\bf 1542 &\bf 1452&\bf 1400&\bf 1330\\\hline \end{tabular} } -\caption[Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.} +\caption[Throughput values, in megapixel per second, of our generic 8 pixels per thread 1D convolution kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1D convolution kernel using shared memory, run on a C2070 card. Bold values correspond to situations where separable-convolution kernels run faster than non separable ones (data transfer durations are those of Table \ref{tab:memcpy1}).} \label{tab:convoSepSh2} \end{table} - -\lstinputlisting[label={lst:convoSepSh},caption=data copy between the calls to 1-D convolution kernels achieving a 2-D separable convolution operation.]{Chapters/chapter4/code/convoSepSh.cu} -\lstinputlisting[label={lst:convoSepShV},caption=CUDA kernel achieving a horizontal 1-D convolution operation after a preloading \index{Prefetching} of data in shared memory.]{Chapters/chapter4/code/convoSepShV.cu} -\lstinputlisting[label={lst:convoSepShH},caption=CUDA kernel achieving a vertical 1-D convolution operation after a preloading of data in shared memory.]{Chapters/chapter4/code/convoSepShH.cu} +\begin{table}[h] +\centering +{\normalsize +\begin{tabular}{|c||r|} +\hline +\textbf{Image size}&\textbf{C2070}\\\hline\hline +$\mathbf{512\times 512}$ &0.029 \\\hline +$\mathbf{1024\times 1024}$&0.101 \\\hline +$\mathbf{2048\times 2048}$&0.387 \\\hline +$\mathbf{4096\times 4096}$&1.533 \\\hline +\end{tabular} +} +\caption{Time cost of data copy between the vertical and the horizontal 1D convolution stages, on a C2070 cards (in milliseconds).} +\label{tab:cpyToArray} +\end{table} +\lstinputlisting[label={lst:convoSepSh},caption=data copy between the calls to 1D convolution kernels achieving a 2D separable convolution operation]{Chapters/chapter4/code/convoSepSh.cu} +\lstinputlisting[label={lst:convoSepShV},caption=CUDA kernel achieving a horizontal 1D convolution operation after a preloading \index{Prefetching} of data into shared memory]{Chapters/chapter4/code/convoSepShV.cu} +\lstinputlisting[label={lst:convoSepShH},caption=CUDA kernel achieving a vertical 1D convolution operation after a preloading of data into shared memory]{Chapters/chapter4/code/convoSepShH.cu} \section{Conclusion} -Extensively detailing the various techniques that may be applied when designing a median or a convolution operation on GPU has enabled us determine that: +Extensively detailing the various techniques that may be applied when designing a median or a convolution operation on GPU has enabled us determine that \begin{itemize} -\item the use of registers with direct data fetching from texture often allows kernels to run faster than those which use the more conventionnal way of prefetching data from texture memory and storing them into shared memory. -\item increasing the pixel count processed by each thread brings important speedups. In this case, if neighboring windows overlap, optimized direct data fetching from texture will likely outperform the shared memory prefetching technique. That is the case for generic convolution kernels. -\item coding such optimized data fetching is not straightforward. Consequently, we are planning to provide a kernel code generator that will make our kernels more accessible by GPU users. +\item the use of registers with direct data fetching from texture often allows kernels to run faster than those which use the more conventionnal way of prefetching data from texture memory and storing them in shared memory. +\item increasing the pixel count processed by each thread brings important speedups. In this case, if neighboring windows overlap, optimized direct data fetching from texture will likely outperform the shared memory prefetching technique. This is the case for generic convolution kernels. +\item coding such optimized data fetching is not straightforward. Consequently, we are currently developing a kernel code generator that will make our kernels more accessible by GPU users. \end{itemize} -The presented kernels, optimized for a C2070 card, achieve up to 2138~Mpix/s including data transfers, which comes close to the absolute maximum throughput value allowed by the Fermi architecture. The next GPU generation (Kepler) may allow us not only to benefit from new dynamic parallelism capability to increase kernel paralelism level, but also to take advantage of an increase of the register count allowed per thread block which would allow us, for example, to extend our register-only median filter technique to larger mask sizes. +The presented kernels, optimized for a C2070 card, achieve up to 2138~MP/s including data transfers, which comes close to the absolute maximum throughput value allowed by the Fermi architecture. The next GPU generation (called Kepler) may allow us not only to benefit from new dynamic parallelism capability to increase kernel paralelism level, but also to take advantage of an increase in the register count allowed per thread block which would allow us, for example, to extend our register-only median filter technique to larger mask sizes. \putbib[Chapters/chapter4/biblio4]