modif gillou

[book_gpu.git] / BookGPU / Chapters / chapter3 / ch3.tex
diff --git a/BookGPU/Chapters/chapter3/ch3.tex b/BookGPU/Chapters/chapter3/ch3.tex

index 1a991e2623d26b7bc5ae70688fd204183814e75b..6a5181b2bae55aa036692849c96e9926692a93c2 100755 (executable)
--- a/BookGPU/Chapters/chapter3/ch3.tex
+++ b/BookGPU/Chapters/chapter3/ch3.tex
@@ -1,102 +1,4 @@
-\chapterauthor{Gilles Perrot}{FEMTO-ST Institute}
-%\graphicspath{{img/}}
-
-
-% \begin{VF}
-% ``A ''
-
-% \VA{Thomas Davenport}{Senior Adjutant to the Junior Marketing VP}
-% \end{VF}
-
-
-
-% \begin{shadebox}
-% A component part for an electronic item is
-% manufactured at one of three different factories, and then delivered to
-% the main assembly line.Of the total number supplied, factory A supplies
-% 50\%, factory B 30\%, and factory C 20\%. Of the components
-% manufactured at factory A, 1\% are faulty and the corresponding
-% proportions for factories B and C are 4\% and 2\% respectively. A
-% component is picked at random from the assembly line. What is the
-% probability that it is faulty? 
-% \end{shadebox}
-
-
-% \begin{equation}
-% \mbox{var}\widehat{\Delta} = \sum_{j = 1}^t \sum_{k = j+1}^t
-% \mbox{var}\,(\hat{\alpha}_j - \hat{\alpha}_k)  = \sum_{j = 1}^t
-% \sum_{k = j+1}^t \sigma^2(1/n_j + 1/n_k). \label{2delvart2}
-% \end{equation}
-
-
-% \begin{shortbox}
-% \Boxhead{Box Title Here}
-% \end{shortbox}
-
-% \begin{theorem}\label{1th:Z_m}
-% Let $m$ be a prime number. With the addition and multiplication as 
-% defined above, $Z_m$ is a field.
-% \end{theorem}
-
-% \begin{proof}
-% \end{proof}
-
-% \begin{notelist}{000000}
-%  \notes{Note:}{The process of integrating reengineering is best accomplished with an engineer, a dog, and a cat.}
-% \end{notelist}
-
-
-% \begin{VT1}
-% \VH{Think About It...}
-% Com
-% \VT
-% \VTA{The Information Revolution}{Business Week}
-% \end{VT1}
-
-
-%\begin{definition}\label{1def:linearcomb}{}\end{definition}
-
-
-
-% \begin{extract}
-% text 
-% \end{extract}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%      Listings
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%% \lstset{
-%%   language=C,
-%%   columns=fixed,
-%%   basicstyle=\footnotesize\ttfamily,
-%%   numbers=left,
-%%   firstnumber=1,
-%%   numberstyle=\tiny,
-%%   stepnumber=5,             
-%%   numbersep=5pt,              
-%%   tabsize=3,                  
-%%   extendedchars=true,         
-%%   breaklines=true,       
-%%   keywordstyle=\textbf,
-%%   frame=single,         
-%%   % keywordstyle=[1]\textbf,   
-%%   %identifierstyle=\textbf,
-%%   commentstyle=\color{white}\textbf,
-%%   stringstyle=\color{white}\ttfamily,
-%%   % xleftmargin=17pt,
-%%   % framexleftmargin=17pt,
-%%   % framexrightmargin=5pt,
-%%   % framexbottommargin=4pt,
-%%   backgroundcolor=\color{lightgray},
-%%   }
-
-%\DeclareCaptionFont{blue}{\color{blue}} 
-%\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}}
-
-%\DeclareCaptionFont{white}{\color{white}}
-%\DeclareCaptionFormat{listing}{\colorbox{gray}{\parbox{\textwidth}{\hspace{15pt}#1#2#3}}}
-%\captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white, singleline}
-%%%%%%%%%%%%%%%%%%%%%%%% Fin Listings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\chapterauthor{Gilles Perrot}{Femto-ST Institute, University of Franche-Comte, France}
  
  \newcommand{\kl}{\includegraphics[scale=0.6]{Chapters/chapter3/img/kernLeft.png}~}
  \newcommand{\kr}{\includegraphics[scale=0.6]{Chapters/chapter3/img/kernRight.png}}
  
  \newcommand{\kl}{\includegraphics[scale=0.6]{Chapters/chapter3/img/kernLeft.png}~}
  \newcommand{\kr}{\includegraphics[scale=0.6]{Chapters/chapter3/img/kernRight.png}}
@@ -131,10 +33,10 @@ However, so as to propose concise and more readable code, we will assume the fol
  \section{Data transfers, memory management.}
  This section deals with the following issues: 
  \begin{enumerate}
  \section{Data transfers, memory management.}
  This section deals with the following issues: 
  \begin{enumerate}
-\item Data transfer from CPU memory to GPU global memory: several GPU memory areas are available as destination memory but the 2-D caching mechanism of texture memory \index{memory~hierarchy!texture~memory}, specifically designed for fetching neighboring pixels, is currently the fastest way to fetch gray-level pixel values inside a kernel computation. This has led us to choose \textbf{texture memory} as primary GPU memory area for images.
-\item Data fetching from GPU global memory to kernel local memory: as said above, we use texture memory \index{memory~hierarchy!texture~memory}. Depending on which process is run, texture data is used either by direct fetching in kernel local memory or through a prefetching \index{prefetching} in shared memory \index{memory~hierarchy!shared~memory}.
+\item Data transfer from CPU memory to GPU global memory: several GPU memory areas are available as destination memory but the 2-D caching mechanism of texture memory \index{memory~hierarchy!texture~memory}, specifically designed for fetching neighboring pixels, is currently the fastest way to fetch gray-level pixel values inside a kernel computation. This has led us to choose \textbf{texture memory} as primary GPU memory area for input images.
+\item Data fetching from GPU global memory to kernel local memory: as said above, we use texture memory. \index{memory~hierarchy!texture~memory} Depending on which process is run, texture data is used either by direct fetching in kernel local memory or through a prefetching \index{prefetching} in shared memory \index{memory~hierarchy!shared~memory}.
  \item Data outputting from kernels to GPU memory: there is actually no alternative to global memory, as kernels can not directly write into texture memory and as copying from texture to CPU memory would not be faster than from simple global memory.
  \item Data outputting from kernels to GPU memory: there is actually no alternative to global memory, as kernels can not directly write into texture memory and as copying from texture to CPU memory would not be faster than from simple global memory.
-\item Data transfer from GPU global memory to CPU memory: it can be drastically accelerated by use of \textbf{pinned memory} \index{memory~hierarchy!pinned~memory}, keeping in mind it has to be used sparingly.
+\item Data transfer from GPU global memory to CPU memory: it can be drastically accelerated by use of \textbf{pinned memory}, \index{memory~hierarchy!pinned~memory} keeping in mind it has to be used sparingly.
  \end{enumerate}
  Algorithm \ref{algo:memcopy} summarizes all the above considerations and describes how data are handled in our examples. For more information on how to handle the different types of GPU memory, we suggest to refer to the CUDA programmer's guide. 
  
  \end{enumerate}
  Algorithm \ref{algo:memcopy} summarizes all the above considerations and describes how data are handled in our examples. For more information on how to handle the different types of GPU memory, we suggest to refer to the CUDA programmer's guide. 
  
@@ -181,10 +83,11 @@ Last, like many authors, we chose to use the pixel throughput value of each proc
  In order to estimate the potential for improvement of each kernel, a reference throughput measurement, involving identity kernel of Listing \ref{lst:fkern1}, was performed. As this kernel only fetches input values from texture memory and outputs them to global memory without doing any computation, it represents the smallest, thus fastest, possible process and is taken as the reference throughput value (100\%). The same measurement was performed on CPU, with a maximum effective pixel throughput of 130~Mpixel per second. On GPU, depending on grid parameters it amounts to 800~MPixels/s on GTX280 and 1300~Mpixels/s on C2070.
  
  
  In order to estimate the potential for improvement of each kernel, a reference throughput measurement, involving identity kernel of Listing \ref{lst:fkern1}, was performed. As this kernel only fetches input values from texture memory and outputs them to global memory without doing any computation, it represents the smallest, thus fastest, possible process and is taken as the reference throughput value (100\%). The same measurement was performed on CPU, with a maximum effective pixel throughput of 130~Mpixel per second. On GPU, depending on grid parameters it amounts to 800~MPixels/s on GTX280 and 1300~Mpixels/s on C2070.
  
  
+\chapterauthor{Gilles Perrot}{Femto-ST Institute, University of Franche-Comte, France}
+
  \chapter{Implementing a fast median filter}
  \chapter{Implementing a fast median filter}
-\chapterauthor{Gilles Perrot}{FEMTO-ST Institute}
  \section{Introduction}
  \section{Introduction}
-Median filtering is a well-known method used in a wide range of application frameworks as well as a standalone filter especially for \textit{salt and pepper} denoising. It is able to highly reduce power of noise without blurring edges too much.
+Median filtering is a well-known method used in a wide range of application frameworks as well as a standalone filter especially for \textit{salt and pepper} denoising. It is able to highly reduce power of noise without blurring edges too much. That is actually why we originally focused on this filtering technique as a preprocessing stage when we were in the process of designing a GPU implementation of one region-based image segmentation algorithm \cite{6036776}.
  
  First introduced by Tukey in \cite{tukey77}, it has been widely studied since then, and many researchers have proposed efficient implementations of it, adapted to various hypothesis, architectures and processors. 
  Originally, its main drawbacks were its compute complexity, its non linearity and its data-dependent runtime. Several researchers have addressed these issues and designed, for example, efficient histogram-based median filter with predictible runtimes \cite{Huang:1981:TDS:539567, Weiss:2006:FMB:1179352.1141918}.  
  
  First introduced by Tukey in \cite{tukey77}, it has been widely studied since then, and many researchers have proposed efficient implementations of it, adapted to various hypothesis, architectures and processors. 
  Originally, its main drawbacks were its compute complexity, its non linearity and its data-dependent runtime. Several researchers have addressed these issues and designed, for example, efficient histogram-based median filter with predictible runtimes \cite{Huang:1981:TDS:539567, Weiss:2006:FMB:1179352.1141918}.  
@@ -294,9 +197,9 @@ copy data from GPU global memory to CPU memory\label{algoMedianGeneric:memcpyD2H
  \section{NVidia GPU tuning recipes}
  When designing GPU code, besides thinking of the actual data computing process, one must choose the memory type into which to store temporary data. Three types of GPU memory are available:
  \begin{enumerate}
  \section{NVidia GPU tuning recipes}
  When designing GPU code, besides thinking of the actual data computing process, one must choose the memory type into which to store temporary data. Three types of GPU memory are available:
  \begin{enumerate}
-\item \textbf{Global memory, the most versatile:} \index{memory~hierarchy!global~memory}\\Offers the largest storing space and global scope but is slowest (400 cycles latency). \textbf{Texture memory} is physically included into it, but allows access through an efficient 2-D caching mechanism.
-\item \textbf{Registers, the fastest:} \index{memory~hierarchy!registers}\\Allow access wihtout latency, but only 63 registers are available per thread (thread scope), with a maximum of 32K per Symetric Multiprocessor (SM) \index{register count}w.
-\item \textbf{Shared memory, a complex compromise:} \index{memory~hierarchy!shared~memory}\\All threads in one block can access 48~KBytes of shared memory, which is faster than global memory (20 cycles latency) but slower than registers. 
+\item \textbf{Global memory, the most versatile:} \index{memory~hierarchy!global~memory}\\Offers the largest storing space and global scope but is slowest (400-800 clock cycles latency). \textbf{Texture memory} is physically included into it, but allows access through an efficient 2-D caching mechanism.
+\item \textbf{Registers, the fastest:} \index{memory~hierarchy!registers}\\Allow access wihtout latency, but only 63 registers are available per thread (thread scope), with a maximum of 32K per Symetric Multiprocessor (SM) \index{register count}.
+\item \textbf{Shared memory, a complex compromise:} \index{memory~hierarchy!shared~memory}\\All threads in one block can access 48~KBytes of shared memory, which is faster than global memory (\~20 cycles latency) but slower than registers. 
  However, bank conflicts can occur if two threads of a warp try to access data stored in one single memory bank. In such cases, the parallel process is re-serialized which may cause significant performance decrease. One easy way to avoid it is to ensure that two consecutive threads in one block always access 32-bit data at two consecutive adresses.  
  \end{enumerate}
  
  However, bank conflicts can occur if two threads of a warp try to access data stored in one single memory bank. In such cases, the parallel process is re-serialized which may cause significant performance decrease. One easy way to avoid it is to ensure that two consecutive threads in one block always access 32-bit data at two consecutive adresses.  
  \end{enumerate}
  
@@ -343,7 +246,7 @@ This count can be reduced by use of an iterative sorting process called \textit{
     \caption{Forgetful selection with the minimal element register count. Illustration for 3$\times$3 pixel window represented in a row and supposed sorted.}
     \label{fig:forgetful_selection}
  \end{figure}
     \caption{Forgetful selection with the minimal element register count. Illustration for 3$\times$3 pixel window represented in a row and supposed sorted.}
     \label{fig:forgetful_selection}
  \end{figure}
-We must remember that, in the fully sorted vector, the median value will have the middle index i.e. $\lfloor n^2/2\rfloor$. 
+We must remember that, in the fully sorted vector, the median value will have the middle index \textit{i.e.} $\lfloor n^2/2\rfloor$. 
  Moreover, assuming that both \textit{extrema} are eliminated from the first $k$ elements and that the global median is one of them would mean that:
  \begin{itemize}
  \item if the global median was the minimum among the $k$ elements, then at least $k-1$ elements would have a higher index. Considering the above median definition, at least $k-1$ elements should also have a lower index in the entire vector.
  Moreover, assuming that both \textit{extrema} are eliminated from the first $k$ elements and that the global median is one of them would mean that:
  \begin{itemize}
  \item if the global median was the minimum among the $k$ elements, then at least $k-1$ elements would have a higher index. Considering the above median definition, at least $k-1$ elements should also have a lower index in the entire vector.
@@ -364,8 +267,8 @@ This iterative process is illustrated in Figure \ref{fig:forgetful3}, where it a
  \end{figure}
   
  
  \end{figure}
   
  
-The \textit{forgetful selection} method, used in \cite{mcguire2008median} does not imply full sorting of values, but only selecting minimum and maximum values, which, at the price of a few iteration steps ($n^2-k$), reduces arithmetic complexity.
-Listing \ref{lst:medianForget1pix3} details this process where forgetful selection is achieved by use of simple 2-value swapping function ($s()$, lines 1 to 5) that swaps input values if necessary, so as to achieve the first steps of a \textit{bitonic}-like sorting network (\cite{Batcher:1968:SNA:1468075.1468121}). Moreover, whenever possible, in order to increase the Instruction-Level Parallelism \index{Instruction-Level Parallelism}, successive calls to $s()$ are done with independant elements as arguments. This is illustrated by the macro definitions of lines 7 to 14 and by Figure \ref{fig:bitonic} which details the first iteration of the $5\times 5$ selection, starting with $k_{25}=14$ elements. 
+The \textit{forgetful selection} method, used in \cite{mcguire2008median}, does not imply full sorting of values, but only selecting minimum and maximum values, which, at the price of a few iteration steps ($n^2-k$), reduces arithmetic complexity.
+Listing \ref{lst:medianForget1pix3} details this process where forgetful selection is achieved by use of simple 2-value swapping function ($s()$, lines 1 to 5) that swaps input values if necessary, so as to achieve the first steps of an incomplete sorting network \cite{Batcher:1968:SNA:1468075.1468121}. Moreover, whenever possible, in order to increase the Instruction-Level Parallelism \index{Instruction-Level Parallelism}, successive calls to $s()$ are done with independant elements as arguments. This is illustrated by the macro definitions of lines 7 to 14 and by Figure \ref{fig:bitonic} which details the first iteration of the $5\times 5$ selection, starting with $k_{25}=14$ elements. 
  
  \lstinputlisting[label={lst:medianForget1pix3},caption= 3$\times$3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070. ]{Chapters/chapter3/code/kernMedianForget1pix3.cu}
  
  
  \lstinputlisting[label={lst:medianForget1pix3},caption= 3$\times$3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070. ]{Chapters/chapter3/code/kernMedianForget1pix3.cu}
  
@@ -395,7 +298,7 @@ Running this $3\times 3$ kernel saves another 10\% runtime, as shown in Figure \
  \end{figure}
  
  \section{A 5$\times$5 and more median filter }
  \end{figure}
  
  \section{A 5$\times$5 and more median filter }
-Considering the maximum register count allowed per thread (63) and trying to push this technique to its limit potentially allows designing up to 9$\times$9 median filters. Such maximum would actually use  $k_{81}=\lceil 81/2\rceil+1 = 42$ registers per thread plus 9, used by the compiler to complete arithmetic operations. This leads to a total register count of 51, which would forbid to compute more than one pixel per thread, but also would limit the number of concurrent threads per block. Our measurements show that this technique is still worth using for the 5$\times$5 median. As for larger window sizes, one option could be using shared memory.
+Considering the maximum register count allowed per thread (63) and trying to push this technique to its limit potentially allows designing up to 9$\times$9 median filters. Such maximum would actually use  $k_{81}=\lceil 81/2\rceil+1 = 42$ registers per thread plus 9, used by the compiler to complete arithmetic operations and 9 more when outputting 2 pixels per thread. This leads to a total register count of 60, which would limit the number of concurrent threads per block. Our measurements show that this technique is still worth using for the 7$\times$7 median. As for larger window sizes, one option could be using shared memory.
  The next two sections will first detail the particular case of the 5$\times$5 median through register-only method and eventually a generic kernel for larger window sizes.
  
  \subsection{A register-only 5$\times$5 median filter \label{sec:median5}}
  The next two sections will first detail the particular case of the 5$\times$5 median through register-only method and eventually a generic kernel for larger window sizes.
  
  \subsection{A register-only 5$\times$5 median filter \label{sec:median5}}
@@ -409,9 +312,9 @@ The minimum register count required to apply the forgetful selection method to a
  
  \begin{figure}
     \centering
  
  \begin{figure}
     \centering
-   \includegraphics[width=6cm]{Chapters/chapter3/img/forgetful_selection4.png}
+   \includegraphics[width=6cm]{Chapters/chapter3/img/fig3.jpg}
     \caption[First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network.]{First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.}
     \caption[First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network.]{First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.}
-   \label{fig:median5overlap}
+   \label{fig:bitonic}
  \end{figure}
  
  \lstinputlisting[label={lst:medianForget2pix5},caption=kernel 5$\times$5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.]{Chapters/chapter3/code/kernMedian2pix5.cu}
  \end{figure}
  
  \lstinputlisting[label={lst:medianForget2pix5},caption=kernel 5$\times$5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.]{Chapters/chapter3/code/kernMedian2pix5.cu}