paper_fast_median/paper_fast_median.tex~

   1 \documentclass[a4paper, 11pt, twocolumn]{article}
   2
   3 %\usepackage[T1]{fontenc}
   4 %\usepackage[latin1]{inputenc}
   5 %\usepackage[cyr]{aeguill}
   6 %\usepackage[frenchb]{babel}
   7
   8 \usepackage{amssymb}
   9 \usepackage{amsmath}
  10 \usepackage{graphicx}
  11 \usepackage{color, xcolor}
  12 \graphicspath{{img/}}
  13 \usepackage{subfigure}
  14 \usepackage[caption=false,font=footnotesize]{subfig}
  15 %\usepackage{epsfig}
  16 \usepackage{makeidx}
  17 %\usepackage[sectionbib]{bibunits}
  18 \usepackage{multicol}
  19 \usepackage{multirow}
  20 \usepackage{tabularx}
  21 \usepackage[ruled,lined,linesnumbered]{algorithm2e}
  22 \usepackage{array}
  23
  24
  25 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  26 %      Listings
  27 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  28 \usepackage{listings}
  29 \usepackage{courier}
  30 \lstset{
  31   language=C,
  32   columns=fixed,
  33   basicstyle=\footnotesize\ttfamily,
  34   numbers=left,
  35   firstnumber=1,
  36   numberstyle=\tiny,
  37   stepnumber=5,
  38   numbersep=5pt,
  39   tabsize=3,
  40   extendedchars=true,
  41   breaklines=true,
  42   keywordstyle=\textbf,
  43   frame=single,
  44   % keywordstyle=[1]\textbf,
  45   %identifierstyle=\textbf,
  46   commentstyle=\color{white}\textbf,
  47   stringstyle=\color{white}\ttfamily,
  48   % xleftmargin=17pt,
  49   % framexleftmargin=17pt,
  50   % framexrightmargin=5pt,
  51   % framexbottommargin=4pt,
  52   backgroundcolor=\color{lightgray},
  53   }
  54
  55 \usepackage{caption}
  56 %\DeclareCaptionFont{blue}{\color{blue}}
  57 %\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}}
  58
  59 %\DeclareCaptionFont{white}{\color{white}}
  60 %\DeclareCaptionFormat{listing}{\colorbox{gray}{\parbox{\textwidth}{\hspace{15pt}#1#2#3}}}
  61 %\captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white, singleline}
  62 %%%%%%%%%%%%%%%%%%%%%%%% Fin Listings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  63
  64 \newcommand{\kl}{\includegraphics[scale=0.4]{kernLeft.png}~}
  65 \newcommand{\kr}{\includegraphics[scale=0.4]{kernRight.png}}
  66
  67
  68 % \begin{VF}
  69 % ``A ''
  70
  71 % \VA{Thomas Davenport}{Senior Adjutant to the Junior Marketing VP}
  72 % \end{VF}
  73
  74
  75
  76 % \begin{shadebox}
  77 % A component part for an electronic item is
  78 % manufactured at one of three different factories, and then delivered to
  79 % the main assembly line.Of the total number supplied, factory A supplies
  80 % 50\%, factory B 30\%, and factory C 20\%. Of the components
  81 % manufactured at factory A, 1\% are faulty and the corresponding
  82 % proportions for factories B and C are 4\% and 2\% respectively. A
  83 % component is picked at random from the assembly line. What is the
  84 % probability that it is faulty?
  85 % \end{shadebox}
  86
  87
  88 % \begin{equation}
  89 % \mbox{var}\widehat{\Delta} = \sum_{j = 1}^t \sum_{k = j+1}^t
  90 % \mbox{var}\,(\hat{\alpha}_j - \hat{\alpha}_k)  = \sum_{j = 1}^t
  91 % \sum_{k = j+1}^t \sigma^2(1/n_j + 1/n_k). \label{2delvart2}
  92 % \end{equation}
  93
  94
  95 % \begin{shortbox}
  96 % \Boxhead{Box Title Here}
  97 % \end{shortbox}
  98
  99 % \begin{theorem}\label{1th:Z_m}
 100 % Let $m$ be a prime number. With the addition and multiplication as
 101 % defined above, $Z_m$ is a field.
 102 % \end{theorem}
 103
 104 % \begin{proof}
 105 % \end{proof}
 106
 107 % \begin{notelist}{000000}
 108 %  \notes{Note:}{The process of integrating reengineering is best accomplished with an engineer, a dog, and a cat.}
 109 % \end{notelist}
 110
 111
 112 % \begin{VT1}
 113 % \VH{Think About It...}
 114 % Com
 115 % \VT
 116 % \VTA{The Information Revolution}{Business Week}
 117 % \end{VT1}
 118
 119
 120 %\begin{definition}\label{1def:linearcomb}{}\end{definition}
 121
 122
 123
 124 % \begin{extract}
 125 % text
 126 % \end{extract}
 127
 128 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 129 %      Listings
 130 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 131 \lstset{
 132   language=C,
 133   columns=fixed,
 134   basicstyle=\footnotesize\ttfamily,
 135   numbers=left,
 136   firstnumber=1,
 137   numberstyle=\tiny,
 138   stepnumber=5,
 139   numbersep=5pt,
 140   tabsize=3,
 141   extendedchars=true,
 142   breaklines=true,
 143   keywordstyle=\textbf,
 144   frame=single,
 145   % keywordstyle=[1]\textbf,
 146   %identifierstyle=\textbf,
 147   commentstyle=\color{white}\textbf,
 148   stringstyle=\color{white}\ttfamily,
 149   % xleftmargin=17pt,
 150   % framexleftmargin=17pt,
 151   % framexrightmargin=5pt,
 152   % framexbottommargin=4pt,
 153   backgroundcolor=\color{lightgray},
 154   }
 155
 156 %\DeclareCaptionFont{blue}{\color{blue}}
 157 %\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}}
 158
 159 %\DeclareCaptionFont{white}{\color{white}}
 160 %\DeclareCaptionFormat{listing}{\colorbox{gray}{\parbox{\textwidth}{\hspace{15pt}#1#2#3}}}
 161 %\captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white, singleline}
 162 %%%%%%%%%%%%%%%%%%%%%%%% Fin Listings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 163
 164
 165 \begin{document}
 166
 167 \title{Fine-tuned high-speed implementation \\of a GPU-based median filter.}
 168 \twocolumn[
 169   \begin{@twocolumnfalse}
 170     \maketitle
 171 \begin{abstract}
 172   Median filtering is a well-known method used in a wide range of application frameworks as well as a standalone filter, especially for \textit{salt-and-pepper} denoising. It is able to highly reduce power of noise while minimizing edge blurring.
 173 Currently, its existing algorithms and implementations have proved quite efficient but leave room for improvement as far as processing speed is concerned, which has lead us to further investigate the specificities of modern GPUs.
 174 In this paper, we propose the GPU implementation of fixed-size kernel median filters, able to output up to 1.8 billion pixels per second.
 175 Based on a Branchless Vectorized Median class algorithm and implemented through memory fine tuning and the use of GPU registers, our median drastically outperforms existing implementations, resulting, as far as we know, in the fastest median filter to date.
 176 \end{abstract}
 177   \end{@twocolumnfalse}
 178 ]
 179
 180 \section{Introduction}
 181 First introduced by Tukey in \cite{tukey77}, median filtering has been widely studied since then, and many researchers have proposed efficient implementations of it, adapted to various hypothesis, architectures and processors.
 182 Originally, its main drawbacks were its compute complexity, its non linearity and its data-dependent runtime. Several researchers have adressed these issues and designed, for example, efficient histogram-based median filters featuring predictable runtimes \cite{Huang:1981:TDS:539567, Weiss:2006:FMB:1179352.1141918}.
 183 More recently, authors managed to take advantage of the newly opened perspectives offered by modern GPUs, to develop such CUDA-based filters such as the Branchless Vectorized Median filter (BVM) \cite{5402362, chen09} which allows very interesting runtimes and  the histogram-based, PCMF median filter \cite{6288187} which was the fastest median filter implementation known to us.
 184
 185 The use of a GPU as a general-purpose computing processor raises the issue of data transfers, especially when kernel runtime is fast and/or when large data sets are processed. In certain cases, data transfers between GPU and CPU are slower than the actual computation on GPU, even though global GPU processes can prove faster than similar ones run on CPU.
 186 In the following section, we propose the overall code structure to be used with our median kernels.
 187 For more concision and readability, our coding will be restricted to 8 or 16 bit gray-level input images whose height ($H$) and width ($W$) are both multiples of 512 pixels.
 188 Let us also point out that the following implementation, targeted on Nvidia Tesla GPU (Fermi architecture, compute capability 2.x), may easily be adapted to other models e.g. those of compute capability 1.3.
 189
 190 \section{General structure}
 191 Algorithm \ref{algo:memcopy} describes how data is handled in our code.
 192 Input image data is stored in the GPU's texture memory so as to benefit from the 2-D caching mechanism. After kernel execution, copying output image back to CPU memory is done by use of pinned memory, which drastically accelerates data transfer.
 193
 194 \begin{algorithm}
 195  \SetNlSty{textbf}{}{:}
 196 \footnotesize
 197  allocate and populate CPU memory \textbf{h\_in}\;
 198  allocate CPU pinned-memory \textbf{h\_out}\;
 199  allocate GPU global memory \textbf{d\_out}\;
 200  declare GPU texture reference \textbf{tex\_img\_in}\;
 201  allocate GPU array \textbf{array\_img\_in}\;
 202  bind \textbf{array\_img\_in} to texture \textbf{tex\_img\_in}\;
 203  copy data from \textbf{h\_in} to \textbf{array\_img\_in}\label{algo:memcopy:H2D}\;
 204  kernel\kl gridDim,blockDim\kr\tcc*[f]{to d\_out}\label{algo:memcopy:kernel}\;
 205  copy data from \textbf{d\_out} to \textbf{h\_out} \label{algo:memcopy:D2H}\;
 206 \caption{Global memory management on CPU and GPU sides.}
 207 \label{algo:memcopy}
 208 \end{algorithm}
 209
 210
 211 \section{Implementing a fast median filter}
 212 \subsection{Basic principles}
 213 Designing a 2-D median filter basically consists in defining a square window $H(i,j)$ for each pixel $I(i,j)$ of the input image, containing $n=k\times k$ pixels and centered on $I(i,j)$. The output value $I'(i,j)$ is the median value of the gray level values of the $k\times k$ pixels of $H(i,j)$. Figure \ref{median1} illustrates this principle with an example of a 5x5 median filter applied on pixel $I(5,6)$.
 214
 215 Obviously, one key issue is the selection method that identifies the median value, which can be done using either  histogram-based or sorting methods.
 216 But, as shown in figure \ref{median1}, since two neighboring pixels share part of the values to be sorted, a second key issue is how to rule redundancy between consecutive positions of the running window $H(i,j)$.
 217
 218 \begin{figure}
 219    \centering
 220    \subfigure[$5\times 5$ median filtering applied on pixel of coordinates (5,6)]{ \includegraphics[width=5cm]{median_1.png}}\qquad
 221    \subfigure[window overlapping in $5\times 5$ median filtering]{\includegraphics[width=3cm]{median_overlap.png}}\\
 222 \caption{Illustration of $5\times 5$ median filtering}
 223 \label{median1}
 224 \end{figure}
 225
 226 \subsection{Using registers}
 227 As register access is at least 20 times faster than all the other memory types available on the GPU, it is natural to turn to the use of registers as a means to store temporary data, keeping in mind that on the \textit{fermi} architecture, each individual thread can use a maximum of 63 registers within the limit of 32K per thread block.
 228 Consequently, it is important to use registers sparingly in order to preserve high pixel throughput values: to do so, we use the iterative \textit{forgetfull selection} algorithm.
 229 Its principle is, at each iteration, to identify then to eliminate (forget) both elements showing the maximum and the minimum values among the current list, then to add the next candidate element left in the original list, till none is left; the last value remaining actually is the global median value.
 230 Figure \ref{forget} illustrates the \textit{forgetfull selection} process applied to a $3\times3$ pixel median filter. For clarity reasons, the nine values have been represented in a row.
 231 \begin{figure}
 232    \centering
 233    \includegraphics[width=4cm]{forgetful_selectionb.png}
 234    \caption{Determination of the Median value by the Forgetfull selection process, applied to a $3\times 3$ neighborhood window.}
 235    \label{forget}
 236 \end{figure}
 237 The minimum register count $R_{n}$ needed to start this iterative selection process among $k \times k$  values is given by:
 238 $$R_{n}=\lceil \frac{n}{2}\rceil+1$$
 239 The selection of both \textit{extrema} is implemented through an basic 2-element swapping function.
 240 This ensures that the GPU kernel code is free of divergent branches liable to severely impact performance.
 241
 242 \subsection{Hiding Latencies}
 243 Optimizing a GPU kernel also means hiding latencies. The offered massive thread parallelism helps in doing so transparently but, considering the actual computation performed by each thread, optimization may be taken a few steps further:\\
 244 First, we maximized the Instruction Level Parallelism inside the \textit{forgetfull selection} method by re-arranging the instruction sequence of an incomplete \textit{bitonic sort} \cite{Batcher:1968:SNA:1468075.1468121, cormen2001introduction} so as to reduce the data dependency of consecutive instructions.\\
 245 second, we divided thread block size by 2, having each thread perform the computation of two neighbor input pixels instead of just one, thus keeping grid size unchanged while reducing the effect of global memory access latency.
 246 Additionally, window overlapping also had to be managed, as illustrated by Figure \ref{median1}, in order to minimize the increase of register count per thread brought by the new computing organization.
 247 Two $k\times k$ consecutive neighboring windows share $S_{n}=n-\sqrt{n}$ pixels, which is more than the number of registers needed to perform the median selection, i.e. $R_{n}$ (or equal for $3\times 3$ median filter). The $(S_{n}-R_{n}+1)$ first selection stages can then be considered common to both windows, leaving only the $k$ non-shared pixels of each window to be processed separately.
 248 The above technique saves $k+1$ registers for each pair of input pixels, which means that each thread block now uses fewer registers while processing the same pixel count, thus allowing a higher level of parallelism.
 249 Figure \ref{median5overlap} illustrates this by representing the different classes of pixels in the $5\times5$ median example.
 250
 251 \subsection{Compute complexity}
 252  Arithmetic instructions and texture fetches are easy to count but the number of element swaps needed to select the median value is data dependent and only its maximum can be evaluated. The incomplete sorting (\textit{forgetful selection}) and the redundancy management performed when outputing two pixels per thread lead to the instruction count below:
 253 \begin{itemize}
 254 \item 5 integer multiplications and 5 integer additions to compute thread indexes and ouput pixel coordinates.
 255 \item $2n-1$ additions to compute neighbor pixel coordinates.
 256 \item $n-\sqrt{n}$ texture fetches to load gray-level values.
 257 \item within a $m$-element vector and according to our selection method, the maximum number of element swaps needed to move the minimum value at first position and the maximum at last is given by: $$sc(m)=-2+\lceil\frac{3.m}{2}\rceil$$
 258 Consequently, the number of element swaps needed by the entire selection of two median values performed by one thread is inferior or equal to:
 259 $$\left(\sum_{i=\lceil\frac{n}{2}\rceil+1}^{n-\sqrt{n}}{sc(i)}\right) + 2\left(\sum_{i=n-\sqrt{n}+1}^{n}{sc(i)}\right)$$
 260 The above sum equals 42,156 and 474 for typical window sizes of $3\times 3$, $5\times 5$ and $7\times 7$. The total instruction count is thus kept near a  $O(nlog(n))$ rule.
 261 \end{itemize}
 262
 263 \begin{figure}
 264    \centering
 265    \includegraphics[width=4cm]{median5_overlap.png}
 266    \caption{Reducing register count in a 5$\times$5 register-only median kernel processing 2 input pixels. The first 7 forgetful selection stages are common to both processed center pixels.}
 267    \label{median5overlap}
 268 \end{figure}
 269
 270 \section{Experiments}
 271 Runtimes have been obtained by averaging 1000 executions  on a C2070 GPU card hosted by a system with one Xeon E5620\@2.40GHz processor running a linux kernel 2.6.18x86\_64 and CUDA v4.0. Each kernel has been run on images of sizes 512$\times$512, 1024$\times$1024, 2048$\times$2048 and 4096$\times$4096. Like many authors, we computed the pixel throughput value as our main performance indicator. It includes kernel runtime as well as transfer times to and from the GPU. We also measured the maximum effective pixel throughput that our GPU/host couple is able to achieve, by running an identity kernel that fetches the gray-level of each pixel in texture memory and outputs it into global memory exclusive of  any other instruction. Knowing this peak value allows us to evaluate the potential for improvement of each kernel and helps in deciding on further investigation. Those peak values are detailed in Table \ref{tabmaxtp}, which shows that the larger the image, the higher the expected throughput.
 272
 273 \begin{figure}
 274    \centering
 275    \subfigure[512$\times$512 pixel input image.]{\includegraphics[width=7cm]{comp512.png}}
 276    \subfigure[4096$\times$4096 pixel input image]{\includegraphics[width=7cm]{comp4k.png}}
 277    \caption{Pixel throughput value comparison of several implementation against our PRMF.}
 278    \label{figcomp}
 279 \end{figure}
 280
 281 \begin{table*}[ht]
 282 %\newcolumntype{I}{!{\vrule width 1.5pt}}
 283 \newlength\savedwidth
 284 \newcommand\whline{\noalign{\global\savedwidth
 285   \arrayrulewidth\global\arrayrulewidth 1.5pt}
 286   \hline \noalign{\global\arrayrulewidth
 287   \savedwidth}
 288 }
 289 \renewcommand{\arraystretch}{1.5}
 290 \centering
 291 {\footnotesize
 292 \begin{tabular}{|c||c|c|c|c|c|c|}
 293 \hline\hline
 294 Gray level format & \multicolumn{3}{c|}{\textbf{8 bits}} & \multicolumn{3}{c|}{\textbf{16 bits}} \\ \hline
 295 \shortstack{time costs$\rightarrow$\\image size (pixels)$\downarrow$} & \shortstack{to GPU\\(ms)}& \shortstack{from GPU\\(ms)}& \shortstack{Total\\(ms) }& \shortstack{$\rightarrow$GPU\\(ms)}& \shortstack{$\rightarrow$GPU\\(ms)}& \shortstack{Total\\(ms) } \\ \hline\hline
 296 512$\times$512   &0.08 &0.06 &0.14 &0.14 &0.10 &0.24 \\\hline
 297 1024$\times$1024 &0.24 &0.19 &0.43 &0.45 &0.35 &0.80 \\\hline
 298 2048$\times$2048 &0.85 &0.68 &1.53 &1.59 &1.32 &2.91 \\\hline
 299 4096$\times$4096 &3.27 &2.61 &5.88 &6.21 &5.21 &11.42 \\\hline
 300 \end{tabular}}
 301 \caption{Time cost of data transfer for each image size and gray-level format on C2070 GPU.}
 302 \label{tabmemcpy}
 303 \end{table*}
 304
 305 \section{Results}
 306 The main contribution of this work is to show how to tune a CUDA GPU implementation in order to achieve highest pixel throughput values. Runtimes, as well as pixel throughput values are gathered in Table \ref{tabresults}. Figure \ref{figcomp} compares the throughput values of several implementations against ours, labeled PRMF for Parallel Register-only Median Filter. Due to the lack of available source code, comparison is based on the most recent results published in \cite{Sanchez-2-2012}, obtained with the same GPU as ours and with 8 bit-coded gray-level images.
 307 While the algorithm implemented here is similar to the one in \textit{ArrayFire}, what makes the difference is our fine tuning of the implementation that leads to the  fastest GPU median filter known to date with 1854~MPix/s.
 308 Let us also note that such considerable throughput values come very close to the peak effective pixel throughput value of 2444~Mpix/s allowed by our developpement platform. Consequently further investigation would likely bring little performance improvement.
 309
 310
 311 \begin{table}[h]
 312 %\newcolumntype{I}{!{\vrule width 1.5pt}}
 313 %\newlength\savedwidth
 314 \newcommand\whline{\noalign{\global\savedwidth
 315   \arrayrulewidth\global\arrayrulewidth 1.5pt}
 316   \hline \noalign{\global\arrayrulewidth
 317   \savedwidth}
 318 }
 319 \renewcommand{\arraystretch}{1.5}
 320 \centering
 321 {\small
 322 \begin{tabular}{|c||c|c|}\hline\hline
 323 \shortstack{Gray-level format$\rightarrow$\\image size$\downarrow$} & {$\mathbf{ T_8}$} & {$\mathbf{T_{16}}$} \\ \hline\hline
 324 512$\times$512   &1598 &975 \\\hline
 325 1024$\times$1024 &2101 &1200 \\\hline
 326 2048$\times$2048 &2359 &1308 \\\hline
 327 4096$\times$4096 &2444 &1335 \\\hline
 328 \end{tabular} }
 329 \caption{Maximum effective pixel throughput values for $T_8$ and $T_{16}$ (in MPixel per second) on C2070, achieved when processing 8 and 16 bit-coded gray-level images.}
 330 \label{tabmaxtp}
 331 \end{table}
 332
 333 \begin{table}[h]
 334 %\newcolumntype{I}{!{\vrule width 1.5pt}}
 335 %\newlength\savedwidth
 336 \newcommand\whline{\noalign{\global\savedwidth
 337   \arrayrulewidth\global\arrayrulewidth 1.5pt}
 338   \hline \noalign{\global\arrayrulewidth
 339   \savedwidth}
 340 }
 341 \renewcommand{\arraystretch}{1.5}
 342 \centering
 343 {\footnotesize
 344 \begin{tabular}{|c|l||c|c|c|c|c|c|c|c|c|}
 345 \hline\hline
 346 \multicolumn{2}{|l||}{\shortstack{Window size$\rightarrow$\\Image size - perf.$\downarrow$}} & \textbf{3$\times$3} & \textbf{5$\times$5} & \textbf{7$\times$7} \\ \whline
 347 \multirow{3}{*}{\rotatebox{90}{512$^2$}} &t (ms)    &0.05 &0.19 &0.60 \\
 348                                          &$T_{8}$ (Mpix/s)&1291 &773  &348 \\
 349                                          &$T_{16}$ (Mpix/s)&865  &607   &307 \\\whline
 350 \multirow{3}{*}{\rotatebox{90}{1024$^2$}}&t (ms)    &0.20 &0.74 &2.39 \\
 351                                          &$T_{8}$ (Mpix/s)&1644 &889  &371 \\
 352                                          &$T_{16}$ (Mpix/s)&1045   &692   &329 \\\whline
 353 \multirow{3}{*}{\rotatebox{90}{2048$^2$}}&t (ms)    &0.79 &2.95 &9.53 \\
 354                                          &$T_{8}$ (Mpix/s)&1805 &936  &379 \\
 355                                          &$T_{16}$ (Mpix/s)&1130   &729   &338 \\\whline
 356 \multirow{3}{*}{\rotatebox{90}{4096$^2$}}&t (ms)    &3.17 &11.77 &38.06 \\
 357                                          &$T_{8}$ (Mpix/s)&1854 &951   &382 \\
 358                                          &$T_{16}$ (Mpix/s)&1151   &738    &340  \\\whline
 359
 360 \end{tabular}}
 361 \caption{Runtime and pixel throughput of fast median kernels processing 8 and 16 bit-coded gray-level images and run by C2070 GPU.}
 362 \label{tabresults}
 363 \end{table}
 364
 365 \bibliographystyle{plain}
 366 \bibliography{biblio3}
 367 \end{document}
 368