new

[book_gpu.git] / BookGPU / Chapters / chapter4 / ch4.tex
diff --git a/BookGPU/Chapters/chapter4/ch4.tex b/BookGPU/Chapters/chapter4/ch4.tex

index ea473a15bec738d54b08222571fd45558eab4530..8788ab424eaa95a8f38a05104566a6c18dfbbbe9 100644 (file)
--- a/BookGPU/Chapters/chapter4/ch4.tex
+++ b/BookGPU/Chapters/chapter4/ch4.tex
@@ -1,38 +1,14 @@
-\chapterauthor{Gilles Perrot}{FEMTO-ST Institute}
-
-%\newcommand{\kl}{\includegraphics[scale=0.6]{Chapters/chapter4/img/kernLeft.png}~}
-%\newcommand{\kr}{\includegraphics[scale=0.6]{Chapters/chapter4/img/kernRight.png}}
-
-%% \lstset{
-%%   language=C,
-%%   columns=fixed,
-%%   basicstyle=\footnotesize\ttfamily,
-%%   numbers=left,
-%%   firstnumber=1,
-%%   numberstyle=\tiny,
-%%   stepnumber=5,             
-%%   numbersep=5pt,              
-%%   tabsize=3,                  
-%%   extendedchars=true,         
-%%   breaklines=true,       
-%%   keywordstyle=\textbf,
-%%   frame=single,         
-%%   % keywordstyle=[1]\textbf,   
-%%   %identifierstyle=\textbf,
-%%   commentstyle=\color{white}\textbf,
-%%   stringstyle=\color{white}\ttfamily,
-%%   % xleftmargin=17pt,
-%%   % framexleftmargin=17pt,
-%%   % framexrightmargin=5pt,
-%%   % framexbottommargin=4pt,
-%%   backgroundcolor=\color{lightgray},
-%%   }
-
-
-\chapter{Implementing an efficient convolution \index{Convolution} operation on GPU}
+\chapterauthor{Gilles Perrot}{Femto-ST Institute, University of Franche-Comte, France}
+
+\chapter{Implementing an efficient convolution operation on GPU}
+
+
+
+
+
  \section{Overview}
  In this chapter, after dealing with GPU median filter implementations,
  \section{Overview}
  In this chapter, after dealing with GPU median filter implementations,
-we propose to explore how convolutions can be implemented on modern
+we propose to explore how convolutions\index{Convolution}  can be implemented on modern
  GPUs. Widely used in digital image processing filters, the \emph{convolution
  operation} basically consists in taking the sum of products of elements
  from two 2-D functions, letting one of the two functions move over
  GPUs. Widely used in digital image processing filters, the \emph{convolution
  operation} basically consists in taking the sum of products of elements
  from two 2-D functions, letting one of the two functions move over
@@ -42,7 +18,7 @@ begin with, we shall examine non-separable or generic convolutions,
  before adressing the matter of separable convolutions. We shall refer
  to $I$ as an H x L pixel gray-level image, and to $I(x,y)$ as the gray-level
  value of each pixel of coordinates $(x,y)$.
  before adressing the matter of separable convolutions. We shall refer
  to $I$ as an H x L pixel gray-level image, and to $I(x,y)$ as the gray-level
  value of each pixel of coordinates $(x,y)$.
-%dire qqs mots sur le filtrage IIR/FIR ?
+
  
  
  \section{Definition}
  
  
  \section{Definition}
@@ -51,11 +27,11 @@ image $I$ and convolution mask \emph{h} (To avoid confusion with other
  GPU functions referred to as kernels, we shall use\emph{ convolution
  mask} instead of \emph{convolution kernel}) is defined by:
  \begin{equation}
  GPU functions referred to as kernels, we shall use\emph{ convolution
  mask} instead of \emph{convolution kernel}) is defined by:
  \begin{equation}
-I'(x, y) = \left(I * h\right) = \sum_{i < H} \sum_{j < L}I(x-j, y-j).h(j,i)
+I'(x, y) = \left(I * h\right) = \sum_{(i < H)} \sum_{(j < L)}I(x-j, y-j).h(j,i)
  \label{convoDef}
  \end{equation}
  \label{convoDef}
  \end{equation}
-While processing an image, function \emph{h} is bounded by a square
-window of size \emph{k = 2r + 1}, i.e an uneven number, to ensure
+While processing an image, function \emph{h} is often bounded by a square
+window of size \emph{k = 2r + 1}, \textit{i.e} an uneven number, to ensure
  there is a center. We shall also point out that, as stated earlier,
  the square shape is no limiting factor to the process, as any shape
  can be inscribed into a square. In the case of a more complex shape,
  there is a center. We shall also point out that, as stated earlier,
  the square shape is no limiting factor to the process, as any shape
  can be inscribed into a square. In the case of a more complex shape,
@@ -70,16 +46,16 @@ For more readability, only part of the connecting lines are shown.
   \begin{figure}
  \centering
     \includegraphics[width=11cm]{Chapters/chapter4/img/convo1.png}
   \begin{figure}
  \centering
     \includegraphics[width=11cm]{Chapters/chapter4/img/convo1.png}
-   \caption{Principle of a generic convolution implementation. The center pixel is represented with a black background and the pixels of its neighborhood are denoted $I_{p,q}$ where $(p,q)$ is the relative position of the neighbor pixel. Elements $h_{t,u}$ are the values of the convolution mask.}
+   \caption[Principle of a generic convolution implementation.]{Principle of a generic convolution implementation. The center pixel is represented with a black background and the pixels of its neighborhood are denoted $I_{p,q}$ where $(p,q)$ is the relative position of the neighbor pixel. Elements $h_{t,u}$ are the values of the convolution mask.}
     \label{fig:convoPrinciple}
  \end{figure}
  \begin{algorithm}
  \caption{generic convolution}   
  \label{algo_genconv}
    \ForEach{pixel at position $(x, y)$}{
     \label{fig:convoPrinciple}
  \end{figure}
  \begin{algorithm}
  \caption{generic convolution}   
  \label{algo_genconv}
    \ForEach{pixel at position $(x, y)$}{
-    Read all gray-level values $I(x, y)$ in the neighborhood\\\;
-    Compute the weighted sum \( I_\Omega = \sum_{(j,i) \in \Omega}I(x-j, y-j).h(j,i) \)\\\;
-    Normalize $I'(x, y)$ value\\\;
+    Read all gray-level values $I(x, y)$ in the neighborhood\;
+    Compute the weighted sum \( I_\Omega = \sum_{(j,i) \in \Omega}I(x-j, y-j).h(j,i) \)\;
+    Normalize $I'(x, y)$ value\;
      Outputs the new gray-level value 
    }
  \end{algorithm}
      Outputs the new gray-level value 
    }
  \end{algorithm}
@@ -106,7 +82,7 @@ convolutions of the tuning recipes applied to median filters in the
  previous chapter, as a reminder : texture memory used with incoming
  data, pinned memory with output data, optimized use of registers
  while processing data and multiple output per thread\index{Multiple output per thread}. 
  previous chapter, as a reminder : texture memory used with incoming
  data, pinned memory with output data, optimized use of registers
  while processing data and multiple output per thread\index{Multiple output per thread}. 
-One signifcant difference lies in the fact
+One significant difference lies in the fact
  that the median filter uses only one parameter, the size of the window mask,
  which can be hard-coded, while a convolution mask requires referring to several; hard-coding
  its elements would lead to severe lack of flexibility (one function
  that the median filter uses only one parameter, the size of the window mask,
  which can be hard-coded, while a convolution mask requires referring to several; hard-coding
  its elements would lead to severe lack of flexibility (one function
@@ -114,7 +90,7 @@ per filter, no external settings) so we will just use it as a starting
  point in our approach. 
  
  Let us assume that we are planning to implement the convolution defined by the following $3\times 3$ mask (low-pass filter or averaging filter):
  point in our approach. 
  
  Let us assume that we are planning to implement the convolution defined by the following $3\times 3$ mask (low-pass filter or averaging filter):
-$$h=\begin{bmatrix}1&1&1\\1&1&1\\1&1&1\end{bmatrix}$$ 
+$$h=\frac{1}{9}\begin{bmatrix}1&1&1\\1&1&1\\1&1&1\end{bmatrix}$$ 
  The kernel code presented in Listing \ref{lst:convoGene3Reg8} implements the convolution operation and applies all above optimizations except, for clarity reasons, multiple output per thread.
  In the particular case of a generic convolution, it is important to note how mask coefficients are applied to image pixels in order to fit the definition of equation \ref{convoDef}: if the coordinates of the center pixel had been set to (0,0), then the pixel of coordinates $(i,j)$ would have been multiplied by the element $(-i,-j)$ of the mask, which, transposed in our kernel code, leads to multiply the $p^{th}$ pixel of the window by the $(n-p)^{th}$ element of the convolution mask.
  
  The kernel code presented in Listing \ref{lst:convoGene3Reg8} implements the convolution operation and applies all above optimizations except, for clarity reasons, multiple output per thread.
  In the particular case of a generic convolution, it is important to note how mask coefficients are applied to image pixels in order to fit the definition of equation \ref{convoDef}: if the coordinates of the center pixel had been set to (0,0), then the pixel of coordinates $(i,j)$ would have been multiplied by the element $(-i,-j)$ of the mask, which, transposed in our kernel code, leads to multiply the $p^{th}$ pixel of the window by the $(n-p)^{th}$ element of the convolution mask.
  
@@ -138,7 +114,7 @@ $\mathbf{2048\times 2048}$&1.178&1549 &\bf 3.265&\bf 875 &6.398&529 \\\hline
  $\mathbf{4096\times 4096}$&4.700&1585 &13.05&533     &25.56&533 \\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&4.700&1585 &13.05&533     &25.56&533 \\\hline
  \end{tabular}
  }  
-\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card (fermi architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.}
+\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card (fermi architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoNonSepReg1}
  \end{table} 
  
  \label{tab:convoNonSepReg1}
  \end{table} 
  
@@ -155,7 +131,7 @@ $\mathbf{2048\times 2048}$&0.801&1092 &\bf 2.189&\bf 802 &4.278&573 \\\hline
  $\mathbf{4096\times 4096}$&3.171&1075 &8.720&793 &17.076&569 \\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&3.171&1075 &8.720&793 &17.076&569 \\\hline
  \end{tabular}
  }  
-\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.}
+\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoNonSepReg3}
  \end{table}
  
  \label{tab:convoNonSepReg3}
  \end{table}
  
@@ -176,7 +152,7 @@ $\mathbf{4096\times 4096}$&5.882 &12.431 \\\hline
  \end{table}
  
  Table \ref{tab:convoNonSepReg3} shows timings and global throughput values achieved by those convolution masks on an Nvidia GT200 Tesla architecture (GTX480 card) with $16x8$ thread blocks. This measurement has been done in order to make a relevant comparison with a reference given by Nvidia in \cite{convolutionsoup} where they state that their fastest kernel achieves a $5\times5$ convolution of an 8-bit  $2048\times 2048$ pixelimage in $1.4~ms$, which lead to a throughput value of 945~Mpix/s.
  \end{table}
  
  Table \ref{tab:convoNonSepReg3} shows timings and global throughput values achieved by those convolution masks on an Nvidia GT200 Tesla architecture (GTX480 card) with $16x8$ thread blocks. This measurement has been done in order to make a relevant comparison with a reference given by Nvidia in \cite{convolutionsoup} where they state that their fastest kernel achieves a $5\times5$ convolution of an 8-bit  $2048\times 2048$ pixelimage in $1.4~ms$, which lead to a throughput value of 945~Mpix/s.
-Our current value of 802~Mpix/s, though not unsatisfactory, remains lower to the one reached by the manufacturer'own coding. 
+Our current value of 802~Mpix/s, though not unsatisfactory, remains lower to the one reached by the manufacturer's own coding. 
  Tested in the same conditions, the newer Fermi architecture of
  Nvidia's GPUs proved slower (3.3 ms, see Table \ref{tab:convoNonSepReg1}) due to the lower maximum
  register count allowed (63, against 128 for Tesla GT200).
  Tested in the same conditions, the newer Fermi architecture of
  Nvidia's GPUs proved slower (3.3 ms, see Table \ref{tab:convoNonSepReg1}) due to the lower maximum
  register count allowed (63, against 128 for Tesla GT200).
@@ -217,7 +193,7 @@ of the size of the convolution mask, one can envisage processing 2
  or more pixels per thread while keeping safely within the 63-per-thread
  rule.
  
  or more pixels per thread while keeping safely within the 63-per-thread
  rule.
  
-However, when doing so, \textit{eg} processing what we shall call a \textit{packet} of pixels, window mask overlapping has to be taken into account
+However, when doing so, \textit{e.g} processing what we shall call a \textit{packet} of pixels, window mask overlapping has to be taken into account
  to avoid multiple texture fetches of each pixel's gray-level value, while benefiting from the 2-D cache.
  In that case, both mask size and pixel packet shape determine the number of texture fetches to be performed for each pixel value.
  Figure \ref{fig:convoOverlap1} illustrates two different situations: on top, a mask of radius 1 ($3\times 3$) applied to a packet of 8 pixels in row; at bottom, a mask of radius 2 ($5\times 5$).
  to avoid multiple texture fetches of each pixel's gray-level value, while benefiting from the 2-D cache.
  In that case, both mask size and pixel packet shape determine the number of texture fetches to be performed for each pixel value.
  Figure \ref{fig:convoOverlap1} illustrates two different situations: on top, a mask of radius 1 ($3\times 3$) applied to a packet of 8 pixels in row; at bottom, a mask of radius 2 ($5\times 5$).
@@ -250,7 +226,7 @@ $\mathbf{2048\times 2048}$&0.495&2071 &\bf 0.987&1666 &1.615&1334 \\\hline
  $\mathbf{4096\times 4096}$&1.964&2138 &3.926&1711 &6.416&1364 \\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&1.964&2138 &3.926&1711 &6.416&1364 \\\hline
  \end{tabular}
  }  
-\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
+\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoGene8x8p}
  \end{table}
   
  \label{tab:convoGene8x8p}
  \end{table}
   
@@ -269,7 +245,7 @@ This proves to be quite efficient and more versatile, but it obviously generates
   \begin{figure}
  \centering
     \includegraphics[width=12cm]{Chapters/chapter4/img/convoShMem.png}
   \begin{figure}
  \centering
     \includegraphics[width=12cm]{Chapters/chapter4/img/convoShMem.png}
-   \caption{Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$. Threads in both top corners of the top figure are identified either by a circle or by a star symbol. The image tile, loaded into shared memory includes the pixels to be updated by the threads of the block, as well as its 2-pixel wide halo. Here, circle and star symbols in the image tile show which pixels are actually loaded into one shared memory vector by its corresponding thread. }
+   \caption[Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$.]{Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$. Threads in both top corners of the top figure are identified either by a circle or by a star symbol. The image tile, loaded into shared memory includes the pixels to be updated by the threads of the block, as well as its 2-pixel wide halo. Here, circle and star symbols in the image tile show which pixels are actually loaded into one shared memory vector by its corresponding thread. }
     \label{fig:ShMem1}
  \end{figure}
  Still, we also implemented this method, in a similar manner as Nvidia did in its SDK sample code.
     \label{fig:ShMem1}
  \end{figure}
  Still, we also implemented this method, in a similar manner as Nvidia did in its SDK sample code.
@@ -304,7 +280,7 @@ $\mathbf{2048\times 2048}$&2023 &\bf 1586 &1172 &818&676&554\\\hline
  $\mathbf{4096\times 4096}$&2090 &1637 &1195     &830&684&561\\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&2090 &1637 &1195     &830&684&561\\\hline
  \end{tabular}
  }  
-\caption{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
+\caption[Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoGeneSh2}
  \end{table} 
  \lstinputlisting[label={lst:convoGeneSh1},caption=CUDA kernel achieving a generic convolution operation after a preloading of data in shared memory.]{Chapters/chapter4/code/convoGeneSh1.cu}
  \label{tab:convoGeneSh2}
  \end{table} 
  \lstinputlisting[label={lst:convoGeneSh1},caption=CUDA kernel achieving a generic convolution operation after a preloading of data in shared memory.]{Chapters/chapter4/code/convoGeneSh1.cu}
@@ -317,7 +293,7 @@ $$h = h_v \times h_h = \begin{bmatrix}1\\2\\1\end{bmatrix} \times \begin{bmatrix
  -1&2&-1
  \end{bmatrix}$$
  Such a mask allows to replace a generic 2-D convolution operation by two consecutive stages of a 1-D convolution operation: a vertical of mask $h_v$ and a horizontal of mask $h_h$.
  -1&2&-1
  \end{bmatrix}$$
  Such a mask allows to replace a generic 2-D convolution operation by two consecutive stages of a 1-D convolution operation: a vertical of mask $h_v$ and a horizontal of mask $h_h$.
-This saves a lot of arithmetic operations, as a generic $n\times n$ convolution applied on a $H\times L$ image basically represents $H.L.n^2$ multiplications and as many additions, while two consecutive $n\times 1$ convolutions only represents $2.H.L.n$ of each, \textit{eg} 60\% operations are saved per pixel of the image for a $5\times 5$ mask.\\
+This saves a lot of arithmetic operations, as a generic $n\times n$ convolution applied on a $H\times L$ image basically represents $H.L.n^2$ multiplications and as many additions, while two consecutive $n\times 1$ convolutions only represents $2.H.L.n$ of each, \textit{e.g} 60\% operations are saved per pixel of the image for a $5\times 5$ mask.\\
  However, beside reducing the operation count, performing a separable convolution also means writing an intermediate image into global memory.
  CPU implementations of separable convolutions often use a single function to perform both 1-D convolution stages. To do so, this function reads the input image and actually ouputs the transposed filtered image. 
  Applying that principle to GPUs is not efficient, as outputting the transposed image means non-coalescent writes into global memory, generating severe performance loss. Hence the idea of developing two different kernels, one for each of both vertical and horizontal convolutions.
  However, beside reducing the operation count, performing a separable convolution also means writing an intermediate image into global memory.
  CPU implementations of separable convolutions often use a single function to perform both 1-D convolution stages. To do so, this function reads the input image and actually ouputs the transposed filtered image. 
  Applying that principle to GPUs is not efficient, as outputting the transposed image means non-coalescent writes into global memory, generating severe performance loss. Hence the idea of developing two different kernels, one for each of both vertical and horizontal convolutions.
@@ -354,7 +330,7 @@ $\mathbf{1024\times 1024}$&0.306 &0.333 &\bf 0.333 &\bf 0.378&\bf 0.404&\bf 0.46
  $\mathbf{2048\times 2048}$&1.094 &1.191 &\bf 1.260 &\bf 1.444&\bf 1.545&\bf 1.722\\\hline
  $\mathbf{4096\times 4096}$&4.262 &4.631 &\bf 5.000 &\bf 5.676&\bf 6.105&\bf 6.736\\\hline
  \end{tabular}}  
  $\mathbf{2048\times 2048}$&1.094 &1.191 &\bf 1.260 &\bf 1.444&\bf 1.545&\bf 1.722\\\hline
  $\mathbf{4096\times 4096}$&4.262 &4.631 &\bf 5.000 &\bf 5.676&\bf 6.105&\bf 6.736\\\hline
  \end{tabular}}  
-\caption{Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run  on a C2070 card. Timings include data copy. Bold values correspond to situations where separable-convolution kernels run faster than non separable ones.}
+\caption[Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run  on a C2070 card.]{Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run  on a C2070 card. Timings include data copy. Bold values correspond to situations where separable-convolution kernels run faster than non separable ones.}
  \label{tab:convoSepSh1}
  \end{table}
  \begin{table}[h]
  \label{tab:convoSepSh1}
  \end{table}
  \begin{table}[h]
@@ -369,7 +345,7 @@ $\mathbf{2048\times 2048}$&1598 &1541 &\bf 1503 &\bf 1410&\bf 1364&\bf 1290\\\hl
  $\mathbf{4096\times 4096}$&1654 &1596 &\bf 1542 &\bf 1452&\bf 1400&\bf 1330\\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&1654 &1596 &\bf 1542 &\bf 1452&\bf 1400&\bf 1330\\\hline
  \end{tabular}
  }  
-\caption{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
+\caption[Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoSepSh2}
  \end{table} 
  
  \label{tab:convoSepSh2}
  \end{table}