\documentclass[a4paper, 11pt, twocolumn]{article}

%\usepackage[T1]{fontenc}
%\usepackage[latin1]{inputenc}
%\usepackage[cyr]{aeguill}
%\usepackage[frenchb]{babel}

\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{graphicx}
\usepackage{color, xcolor}
\graphicspath{{img/}}
\usepackage{subfigure}
\usepackage[caption=false,font=footnotesize]{subfig}
%\usepackage{epsfig}
\usepackage{makeidx}
%\usepackage[sectionbib]{bibunits}
\usepackage{multicol}
\usepackage{multirow}
\usepackage{tabularx}
\usepackage[ruled,lined,linesnumbered]{algorithm2e}
\usepackage{array}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%      Listings
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{listings}
\usepackage{courier}
\lstset{
  language=C,
  columns=fixed,
  basicstyle=\footnotesize\ttfamily,
  numbers=left,
  firstnumber=1,
  numberstyle=\tiny,
  stepnumber=5,             
  numbersep=5pt,              
  tabsize=3,                  
  extendedchars=true,         
  breaklines=true,       
  keywordstyle=\textbf,
  frame=single,         
  % keywordstyle=[1]\textbf,   
  %identifierstyle=\textbf,
  commentstyle=\color{white}\textbf,
  stringstyle=\color{white}\ttfamily,
  % xleftmargin=17pt,
  % framexleftmargin=17pt,
  % framexrightmargin=5pt,
  % framexbottommargin=4pt,
  backgroundcolor=\color{lightgray},
  }

\usepackage{caption}
%\DeclareCaptionFont{blue}{\color{blue}} 
%\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}}

%\DeclareCaptionFont{white}{\color{white}}
%\DeclareCaptionFormat{listing}{\colorbox{gray}{\parbox{\textwidth}{\hspace{15pt}#1#2#3}}}
%\captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white, singleline}
%%%%%%%%%%%%%%%%%%%%%%%% Fin Listings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newcommand{\kl}{\includegraphics[scale=0.4]{kernLeft.png}~}
\newcommand{\kr}{\includegraphics[scale=0.4]{kernRight.png}}


% \begin{VF}
% ``A ''

% \VA{Thomas Davenport}{Senior Adjutant to the Junior Marketing VP}
% \end{VF}


% \begin{shadebox}
% A component part for an electronic item is
% manufactured at one of three different factories, and then delivered to
% the main assembly line.Of the total number supplied, factory A supplies
% 50\%, factory B 30\%, and factory C 20\%. Of the components
% manufactured at factory A, 1\% are faulty and the corresponding
% proportions for factories B and C are 4\% and 2\% respectively. A
% component is picked at random from the assembly line. What is the
% probability that it is faulty? 
% \end{shadebox}


% \begin{equation}
% \mbox{var}\widehat{\Delta} = \sum_{j = 1}^t \sum_{k = j+1}^t
% \mbox{var}\,(\hat{\alpha}_j - \hat{\alpha}_k)  = \sum_{j = 1}^t
% \sum_{k = j+1}^t \sigma^2(1/n_j + 1/n_k). \label{2delvart2}
% \end{equation}


% \begin{shortbox}
% \Boxhead{Box Title Here}
% \end{shortbox}

% \begin{theorem}\label{1th:Z_m}
% Let $m$ be a prime number. With the addition and multiplication as 
% defined above, $Z_m$ is a field.
% \end{theorem}

% \begin{proof}
% \end{proof}

% \begin{notelist}{000000}
%  \notes{Note:}{The process of integrating reengineering is best accomplished with an engineer, a dog, and a cat.}
% \end{notelist}


% \begin{VT1}
% \VH{Think About It...}
% Com
% \VT
% \VTA{The Information Revolution}{Business Week}
% \end{VT1}


%\begin{definition}\label{1def:linearcomb}{}\end{definition}


% \begin{extract}
% text 
% \end{extract}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%      Listings
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\lstset{
  language=C,
  columns=fixed,
  basicstyle=\footnotesize\ttfamily,
  numbers=left,
  firstnumber=1,
  numberstyle=\tiny,
  stepnumber=5,             
  numbersep=5pt,              
  tabsize=3,                  
  extendedchars=true,         
  breaklines=true,       
  keywordstyle=\textbf,
  frame=single,         
  % keywordstyle=[1]\textbf,   
  %identifierstyle=\textbf,
  commentstyle=\color{white}\textbf,
  stringstyle=\color{white}\ttfamily,
  % xleftmargin=17pt,
  % framexleftmargin=17pt,
  % framexrightmargin=5pt,
  % framexbottommargin=4pt,
  backgroundcolor=\color{lightgray},
  }

%\DeclareCaptionFont{blue}{\color{blue}} 
%\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}}

%\DeclareCaptionFont{white}{\color{white}}
%\DeclareCaptionFormat{listing}{\colorbox{gray}{\parbox{\textwidth}{\hspace{15pt}#1#2#3}}}
%\captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white, singleline}
%%%%%%%%%%%%%%%%%%%%%%%% Fin Listings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\begin{document}

\title{Fine-tuned high-speed implementation \\of a GPU-based median filter.}
\twocolumn[
  \begin{@twocolumnfalse}
    \maketitle
\begin{abstract}
  Median filtering is a well-known method used in a wide range of application frameworks as well as a standalone filter, especially for \textit{salt-and-pepper} denoising. It is able to highly reduce power of noise while minimizing edge blurring.
Currently, its existing algorithms and implementations have proved quite efficient but leave room for improvement as far as processing speed is concerned, which has lead us to further investigate the specificities of modern GPUs.
In this paper, we propose the GPU implementation of fixed-size kernel median filters, able to output up to 1.8 billion pixels per second. 
Based on a Branchless Vectorized Median class algorithm and implemented through memory fine tuning and the use of GPU registers, our median drastically outperforms existing implementations, resulting, as far as we know, in the fastest median filter to date.
\end{abstract}
  \end{@twocolumnfalse}
]

\section{Introduction}
First introduced by Tukey in \cite{tukey77}, median filtering has been widely studied since then, and many researchers have proposed efficient implementations of it, adapted to various hypothesis, architectures and processors. 
Originally, its main drawbacks were its compute complexity, its non linearity and its data-dependent runtime. Several researchers have adressed these issues and designed, for example, efficient histogram-based median filters featuring predictable runtimes \cite{Huang:1981:TDS:539567, Weiss:2006:FMB:1179352.1141918}.  
More recently, authors managed to take advantage of the newly opened perspectives offered by modern GPUs, to develop such CUDA-based filters such as the Branchless Vectorized Median filter (BVM) \cite{5402362, chen09} which allows very interesting runtimes and  the histogram-based, PCMF median filter \cite{6288187} which was the fastest median filter implementation known to us.

The use of a GPU as a general-purpose computing processor raises the issue of data transfers, especially when kernel runtime is fast and/or when large data sets are processed. In certain cases, data transfers between GPU and CPU are slower than the actual computation on GPU, even though global GPU processes can prove faster than similar ones run on CPU. 
In the following section, we propose the overall code structure to be used with our median kernels. 
For more concision and readability, our coding will be restricted to 8 or 16 bit gray-level input images whose height ($H$) and width ($W$) are both multiples of 512 pixels. 
Let us also point out that the following implementation, targeted on Nvidia Tesla GPU (Fermi architecture, compute capability 2.x), may easily be adapted to other models e.g. those of compute capability 1.3.

\section{General structure}
Algorithm \ref{algo:memcopy} describes how data is handled in our code. 
Input image data is stored in the GPU's texture memory so as to benefit from the 2-D caching mechanism. After kernel execution, copying output image back to CPU memory is done by use of pinned memory, which drastically accelerates data transfer.
 
\begin{algorithm}
 \SetNlSty{textbf}{}{:}
\footnotesize
 allocate and populate CPU memory \textbf{h\_in}\;
 allocate CPU pinned-memory \textbf{h\_out}\;
 allocate GPU global memory \textbf{d\_out}\;
 declare GPU texture reference \textbf{tex\_img\_in}\;
 allocate GPU array \textbf{array\_img\_in}\;
 bind \textbf{array\_img\_in} to texture \textbf{tex\_img\_in}\;
 copy data from \textbf{h\_in} to \textbf{array\_img\_in}\label{algo:memcopy:H2D}\; 
 kernel\kl gridDim,blockDim\kr\tcc*[f]{to d\_out}\label{algo:memcopy:kernel}\;
 copy data from \textbf{d\_out} to \textbf{h\_out} \label{algo:memcopy:D2H}\;
\caption{Global memory management on CPU and GPU sides.}
\label{algo:memcopy}
\end{algorithm}


\section{Implementing a fast median filter}
\subsection{Basic principles}
Designing a 2-D median filter basically consists in defining a square window $H(i,j)$ for each pixel $I(i,j)$ of the input image, containing $n=k\times k$ pixels and centered on $I(i,j)$. The output value $I'(i,j)$ is the median value of the gray level values of the $k\times k$ pixels of $H(i,j)$. Figure \ref{median1} illustrates this principle with an example of a 5x5 median filter applied on pixel $I(5,6)$.

Obviously, one key issue is the selection method that identifies the median value, which can be done using either  histogram-based or sorting methods.
But, as shown in figure \ref{median1}, since two neighboring pixels share part of the values to be sorted, a second key issue is how to rule redundancy between consecutive positions of the running window $H(i,j)$.   

\begin{figure}
   \centering
   \subfigure[$5\times 5$ median filtering applied on pixel of coordinates (5,6)]{ \includegraphics[width=5cm]{median_1.png}}\qquad
   \subfigure[window overlapping in $5\times 5$ median filtering]{\includegraphics[width=3cm]{median_overlap.png}}\\
\caption{Illustration of $5\times 5$ median filtering}
\label{median1}
\end{figure}

\subsection{Using registers}
As register access is at least 20 times faster than all the other memory types available on the GPU, it is natural to turn to the use of registers as a means to store temporary data, keeping in mind that on the \textit{fermi} architecture, each individual thread can use a maximum of 63 registers within the limit of 32K per thread block.
Consequently, it is important to use registers sparingly in order to preserve high pixel throughput values: to do so, we use the iterative \textit{forgetfull selection} algorithm. 
Its principle is, at each iteration, to identify then to eliminate (forget) both elements showing the maximum and the minimum values among the current list, then to add the next candidate element left in the original list, till none is left; the last value remaining actually is the global median value. 
Figure \ref{forget} illustrates the \textit{forgetfull selection} process applied to a $3\times3$ pixel median filter. For clarity reasons, the nine values have been represented in a row.
\begin{figure}
   \centering
   \includegraphics[width=4cm]{forgetful_selectionb.png}
   \caption{Determination of the Median value by the Forgetfull selection process, applied to a $3\times 3$ neighborhood window.}
   \label{forget}
\end{figure}
The minimum register count $R_{n}$ needed to start this iterative selection process among $k \times k$  values is given by:
$$R_{n}=\lceil \frac{n}{2}\rceil+1$$
The selection of both \textit{extrema} is implemented through an basic 2-element swapping function.
This ensures that the GPU kernel code is free of divergent branches liable to severely impact performance.

\subsection{Hiding Latencies}
Optimizing a GPU kernel also means hiding latencies. The offered massive thread parallelism helps in doing so transparently but, considering the actual computation performed by each thread, optimization may be taken a few steps further:\\ 
First, we maximized the Instruction Level Parallelism inside the \textit{forgetfull selection} method by re-arranging the instruction sequence of an incomplete \textit{bitonic sort} \cite{Batcher:1968:SNA:1468075.1468121, cormen2001introduction} so as to reduce the data dependency of consecutive instructions.\\
second, we divided thread block size by 2, having each thread perform the computation of two neighbor input pixels instead of just one, thus keeping grid size unchanged while reducing the effect of global memory access latency. 
Additionally, window overlapping also had to be managed, as illustrated by Figure \ref{median1}, in order to minimize the increase of register count per thread brought by the new computing organization.
Two $k\times k$ consecutive neighboring windows share $S_{n}=n-\sqrt{n}$ pixels, which is more than the number of registers needed to perform the median selection, i.e. $R_{n}$ (or equal for $3\times 3$ median filter). The $(S_{n}-R_{n}+1)$ first selection stages can then be considered common to both windows, leaving only the $k$ non-shared pixels of each window to be processed separately.
The above technique saves $k+1$ registers for each pair of input pixels, which means that each thread block now uses fewer registers while processing the same pixel count, thus allowing a higher level of parallelism.
Figure \ref{median5overlap} illustrates this by representing the different classes of pixels in the $5\times5$ median example. 

\subsection{Compute complexity}
 Arithmetic instructions and texture fetches are easy to count but the number of element swaps needed to select the median value is data dependent and only its maximum can be evaluated. The incomplete sorting (\textit{forgetful selection}) and the redundancy management performed when outputing two pixels per thread lead to the instruction count below:
\begin{itemize}
\item 5 integer multiplications and 5 integer additions to compute thread indexes and ouput pixel coordinates.
\item $2n-1$ additions to compute neighbor pixel coordinates.
\item $n-\sqrt{n}$ texture fetches to load gray-level values.
\item within a $m$-element vector and according to our selection method, the maximum number of element swaps needed to move the minimum value at first position and the maximum at last is given by: $$sc(m)=-2+\lceil\frac{3.m}{2}\rceil$$
Consequently, the number of element swaps needed by the entire selection of two median values performed by one thread is inferior or equal to:
$$\left(\sum_{i=\lceil\frac{n}{2}\rceil+1}^{n-\sqrt{n}}{sc(i)}\right) + 2\left(\sum_{i=n-\sqrt{n}+1}^{n}{sc(i)}\right)$$
The above sum equals 42,156 and 474 for typical window sizes of $3\times 3$, $5\times 5$ and $7\times 7$. The total instruction count is thus kept near a  $O(nlog(n))$ rule.
\end{itemize}

\begin{figure}
   \centering
   \includegraphics[width=4cm]{median5_overlap.png}
   \caption{Reducing register count in a 5$\times$5 register-only median kernel processing 2 input pixels. The first 7 forgetful selection stages are common to both processed center pixels.}
   \label{median5overlap}
\end{figure}

\section{Experiments}
Runtimes have been obtained by averaging 1000 executions  on a C2070 GPU card hosted by a system with one Xeon E5620\@2.40GHz processor running a linux kernel 2.6.18x86\_64 and CUDA v4.0. Each kernel has been run on images of sizes 512$\times$512, 1024$\times$1024, 2048$\times$2048 and 4096$\times$4096. Like many authors, we computed the pixel throughput value as our main performance indicator. It includes kernel runtime as well as transfer times to and from the GPU. We also measured the maximum effective pixel throughput that our GPU/host couple is able to achieve, by running an identity kernel that fetches the gray-level of each pixel in texture memory and outputs it into global memory exclusive of  any other instruction. Knowing this peak value allows us to evaluate the potential for improvement of each kernel and helps in deciding on further investigation. Those peak values are detailed in Table \ref{tabmaxtp}, which shows that the larger the image, the higher the expected throughput.

\begin{figure}
   \centering
   \subfigure[512$\times$512 pixel input image.]{\includegraphics[width=7cm]{comp512.png}}
   \subfigure[4096$\times$4096 pixel input image]{\includegraphics[width=7cm]{comp4k.png}}
   \caption{Pixel throughput value comparison of several implementation against our PRMF.}
   \label{figcomp}
\end{figure}

\begin{table*}[ht]
%\newcolumntype{I}{!{\vrule width 1.5pt}}
\newlength\savedwidth
\newcommand\whline{\noalign{\global\savedwidth
  \arrayrulewidth\global\arrayrulewidth 1.5pt}
  \hline \noalign{\global\arrayrulewidth
  \savedwidth}
}
\renewcommand{\arraystretch}{1.5}
\centering
{\footnotesize
\begin{tabular}{|c||c|c|c|c|c|c|}
\hline\hline
Gray level format & \multicolumn{3}{c|}{\textbf{8 bits}} & \multicolumn{3}{c|}{\textbf{16 bits}} \\ \hline
\shortstack{time costs$\rightarrow$\\image size (pixels)$\downarrow$} & \shortstack{to GPU\\(ms)}& \shortstack{from GPU\\(ms)}& \shortstack{Total\\(ms) }& \shortstack{$\rightarrow$GPU\\(ms)}& \shortstack{$\rightarrow$GPU\\(ms)}& \shortstack{Total\\(ms) } \\ \hline\hline
512$\times$512   &0.08 &0.06 &0.14 &0.14 &0.10 &0.24 \\\hline
1024$\times$1024 &0.24 &0.19 &0.43 &0.45 &0.35 &0.80 \\\hline
2048$\times$2048 &0.85 &0.68 &1.53 &1.59 &1.32 &2.91 \\\hline
4096$\times$4096 &3.27 &2.61 &5.88 &6.21 &5.21 &11.42 \\\hline
\end{tabular}}  
\caption{Time cost of data transfer for each image size and gray-level format on C2070 GPU.}
\label{tabmemcpy}
\end{table*}

\section{Results}
The main contribution of this work is to show how to tune a CUDA GPU implementation in order to achieve highest pixel throughput values. Runtimes, as well as pixel throughput values are gathered in Table \ref{tabresults}. Figure \ref{figcomp} compares the throughput values of several implementations against ours, labeled PRMF for Parallel Register-only Median Filter. Due to the lack of available source code, comparison is based on the most recent results published in \cite{Sanchez-2-2012}, obtained with the same GPU as ours and with 8 bit-coded gray-level images. 
While the algorithm implemented here is similar to the one in \textit{ArrayFire}, what makes the difference is our fine tuning of the implementation that leads to the  fastest GPU median filter known to date with 1854~MPix/s.
Let us also note that such considerable throughput values come very close to the peak effective pixel throughput value of 2444~Mpix/s allowed by our developpement platform. Consequently further investigation would likely bring little performance improvement.


\begin{table}[h]
%\newcolumntype{I}{!{\vrule width 1.5pt}}
%\newlength\savedwidth
\newcommand\whline{\noalign{\global\savedwidth
  \arrayrulewidth\global\arrayrulewidth 1.5pt}
  \hline \noalign{\global\arrayrulewidth
  \savedwidth}
}
\renewcommand{\arraystretch}{1.5}
\centering
{\small
\begin{tabular}{|c||c|c|}\hline\hline
\shortstack{Gray-level format$\rightarrow$\\image size$\downarrow$} & {$\mathbf{ T_8}$} & {$\mathbf{T_{16}}$} \\ \hline\hline
512$\times$512   &1598 &975 \\\hline
1024$\times$1024 &2101 &1200 \\\hline
2048$\times$2048 &2359 &1308 \\\hline
4096$\times$4096 &2444 &1335 \\\hline
\end{tabular} }
\caption{Maximum effective pixel throughput values for $T_8$ and $T_{16}$ (in MPixel per second) on C2070, achieved when processing 8 and 16 bit-coded gray-level images.}
\label{tabmaxtp}
\end{table}

\begin{table}[h]
%\newcolumntype{I}{!{\vrule width 1.5pt}}
%\newlength\savedwidth
\newcommand\whline{\noalign{\global\savedwidth
  \arrayrulewidth\global\arrayrulewidth 1.5pt}
  \hline \noalign{\global\arrayrulewidth
  \savedwidth}
}
\renewcommand{\arraystretch}{1.5}
\centering
{\footnotesize
\begin{tabular}{|c|l||c|c|c|c|c|c|c|c|c|}
\hline\hline
\multicolumn{2}{|l||}{\shortstack{Window size$\rightarrow$\\Image size - perf.$\downarrow$}} & \textbf{3$\times$3} & \textbf{5$\times$5} & \textbf{7$\times$7} \\ \whline
\multirow{3}{*}{\rotatebox{90}{512$^2$}} &t (ms)    &0.05 &0.19 &0.60 \\
                                         &$T_{8}$ (Mpix/s)&1291 &773  &348 \\
                                         &$T_{16}$ (Mpix/s)&865  &607   &307 \\\whline
\multirow{3}{*}{\rotatebox{90}{1024$^2$}}&t (ms)    &0.20 &0.74 &2.39 \\
                                         &$T_{8}$ (Mpix/s)&1644 &889  &371 \\
                                         &$T_{16}$ (Mpix/s)&1045   &692   &329 \\\whline
\multirow{3}{*}{\rotatebox{90}{2048$^2$}}&t (ms)    &0.79 &2.95 &9.53 \\
                                         &$T_{8}$ (Mpix/s)&1805 &936  &379 \\
                                         &$T_{16}$ (Mpix/s)&1130   &729   &338 \\\whline
\multirow{3}{*}{\rotatebox{90}{4096$^2$}}&t (ms)    &3.17 &11.77 &38.06 \\
                                         &$T_{8}$ (Mpix/s)&1854 &951   &382 \\
                                         &$T_{16}$ (Mpix/s)&1151   &738    &340  \\\whline

\end{tabular}}  
\caption{Runtime and pixel throughput of fast median kernels processing 8 and 16 bit-coded gray-level images and run by C2070 GPU.}
\label{tabresults}
\end{table}

\bibliographystyle{plain}
\bibliography{biblio3}
\end{document}