new

[book_gpu.git] / BookGPU / Chapters / chapter4 / ch4.tex
diff --git a/BookGPU/Chapters/chapter4/ch4.tex b/BookGPU/Chapters/chapter4/ch4.tex

index ea473a15bec738d54b08222571fd45558eab4530..e3dbfd5c1fd87fef6ae39c74c9f8bf86b8964b10 100644 (file)
--- a/BookGPU/Chapters/chapter4/ch4.tex
+++ b/BookGPU/Chapters/chapter4/ch4.tex
@@ -1,4 +1,7 @@
-\chapterauthor{Gilles Perrot}{FEMTO-ST Institute}
+\chapterauthor{Gilles Perrot}{Femto-ST Institute, University of Franche-Comt\'e, France}
+
+\chapter{Implementing an efficient convolution operation on GPU}
+
  
  %\newcommand{\kl}{\includegraphics[scale=0.6]{Chapters/chapter4/img/kernLeft.png}~}
  %\newcommand{\kr}{\includegraphics[scale=0.6]{Chapters/chapter4/img/kernRight.png}}
  
  %\newcommand{\kl}{\includegraphics[scale=0.6]{Chapters/chapter4/img/kernLeft.png}~}
  %\newcommand{\kr}{\includegraphics[scale=0.6]{Chapters/chapter4/img/kernRight.png}}
@@ -29,10 +32,10 @@
  %%   }
  
  
  %%   }
  
  
-\chapter{Implementing an efficient convolution \index{Convolution} operation on GPU}
+
  \section{Overview}
  In this chapter, after dealing with GPU median filter implementations,
  \section{Overview}
  In this chapter, after dealing with GPU median filter implementations,
-we propose to explore how convolutions can be implemented on modern
+we propose to explore how convolutions\index{Convolution}  can be implemented on modern
  GPUs. Widely used in digital image processing filters, the \emph{convolution
  operation} basically consists in taking the sum of products of elements
  from two 2-D functions, letting one of the two functions move over
  GPUs. Widely used in digital image processing filters, the \emph{convolution
  operation} basically consists in taking the sum of products of elements
  from two 2-D functions, letting one of the two functions move over
@@ -70,7 +73,7 @@ For more readability, only part of the connecting lines are shown.
   \begin{figure}
  \centering
     \includegraphics[width=11cm]{Chapters/chapter4/img/convo1.png}
   \begin{figure}
  \centering
     \includegraphics[width=11cm]{Chapters/chapter4/img/convo1.png}
-   \caption{Principle of a generic convolution implementation. The center pixel is represented with a black background and the pixels of its neighborhood are denoted $I_{p,q}$ where $(p,q)$ is the relative position of the neighbor pixel. Elements $h_{t,u}$ are the values of the convolution mask.}
+   \caption[Principle of a generic convolution implementation.]{Principle of a generic convolution implementation. The center pixel is represented with a black background and the pixels of its neighborhood are denoted $I_{p,q}$ where $(p,q)$ is the relative position of the neighbor pixel. Elements $h_{t,u}$ are the values of the convolution mask.}
     \label{fig:convoPrinciple}
  \end{figure}
  \begin{algorithm}
     \label{fig:convoPrinciple}
  \end{figure}
  \begin{algorithm}
@@ -138,7 +141,7 @@ $\mathbf{2048\times 2048}$&1.178&1549 &\bf 3.265&\bf 875 &6.398&529 \\\hline
  $\mathbf{4096\times 4096}$&4.700&1585 &13.05&533     &25.56&533 \\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&4.700&1585 &13.05&533     &25.56&533 \\\hline
  \end{tabular}
  }  
-\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card (fermi architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.}
+\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card (Fermi architecture). Data transfer durations are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoNonSepReg1}
  \end{table} 
  
  \label{tab:convoNonSepReg1}
  \end{table} 
  
@@ -155,7 +158,7 @@ $\mathbf{2048\times 2048}$&0.801&1092 &\bf 2.189&\bf 802 &4.278&573 \\\hline
  $\mathbf{4096\times 4096}$&3.171&1075 &8.720&793 &17.076&569 \\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&3.171&1075 &8.720&793 &17.076&569 \\\hline
  \end{tabular}
  }  
-\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.}
+\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer durations are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoNonSepReg3}
  \end{table}
  
  \label{tab:convoNonSepReg3}
  \end{table}
  
@@ -250,7 +253,7 @@ $\mathbf{2048\times 2048}$&0.495&2071 &\bf 0.987&1666 &1.615&1334 \\\hline
  $\mathbf{4096\times 4096}$&1.964&2138 &3.926&1711 &6.416&1364 \\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&1.964&2138 &3.926&1711 &6.416&1364 \\\hline
  \end{tabular}
  }  
-\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
+\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoGene8x8p}
  \end{table}
   
  \label{tab:convoGene8x8p}
  \end{table}
   
@@ -269,7 +272,7 @@ This proves to be quite efficient and more versatile, but it obviously generates
   \begin{figure}
  \centering
     \includegraphics[width=12cm]{Chapters/chapter4/img/convoShMem.png}
   \begin{figure}
  \centering
     \includegraphics[width=12cm]{Chapters/chapter4/img/convoShMem.png}
-   \caption{Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$. Threads in both top corners of the top figure are identified either by a circle or by a star symbol. The image tile, loaded into shared memory includes the pixels to be updated by the threads of the block, as well as its 2-pixel wide halo. Here, circle and star symbols in the image tile show which pixels are actually loaded into one shared memory vector by its corresponding thread. }
+   \caption[Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$.]{Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$. Threads in both top corners of the top figure are identified either by a circle or by a star symbol. The image tile loaded into shared memory includes the pixels to be updated by the threads of the block, as well as its 2-pixel wide halo. Here, circle and star symbols in the image tile show which pixels are actually loaded into one shared memory vector by its corresponding thread. }
     \label{fig:ShMem1}
  \end{figure}
  Still, we also implemented this method, in a similar manner as Nvidia did in its SDK sample code.
     \label{fig:ShMem1}
  \end{figure}
  Still, we also implemented this method, in a similar manner as Nvidia did in its SDK sample code.
@@ -304,7 +307,7 @@ $\mathbf{2048\times 2048}$&2023 &\bf 1586 &1172 &818&676&554\\\hline
  $\mathbf{4096\times 4096}$&2090 &1637 &1195     &830&684&561\\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&2090 &1637 &1195     &830&684&561\\\hline
  \end{tabular}
  }  
-\caption{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
+\caption[Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoGeneSh2}
  \end{table} 
  \lstinputlisting[label={lst:convoGeneSh1},caption=CUDA kernel achieving a generic convolution operation after a preloading of data in shared memory.]{Chapters/chapter4/code/convoGeneSh1.cu}
  \label{tab:convoGeneSh2}
  \end{table} 
  \lstinputlisting[label={lst:convoGeneSh1},caption=CUDA kernel achieving a generic convolution operation after a preloading of data in shared memory.]{Chapters/chapter4/code/convoGeneSh1.cu}
@@ -354,7 +357,7 @@ $\mathbf{1024\times 1024}$&0.306 &0.333 &\bf 0.333 &\bf 0.378&\bf 0.404&\bf 0.46
  $\mathbf{2048\times 2048}$&1.094 &1.191 &\bf 1.260 &\bf 1.444&\bf 1.545&\bf 1.722\\\hline
  $\mathbf{4096\times 4096}$&4.262 &4.631 &\bf 5.000 &\bf 5.676&\bf 6.105&\bf 6.736\\\hline
  \end{tabular}}  
  $\mathbf{2048\times 2048}$&1.094 &1.191 &\bf 1.260 &\bf 1.444&\bf 1.545&\bf 1.722\\\hline
  $\mathbf{4096\times 4096}$&4.262 &4.631 &\bf 5.000 &\bf 5.676&\bf 6.105&\bf 6.736\\\hline
  \end{tabular}}  
-\caption{Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run  on a C2070 card. Timings include data copy. Bold values correspond to situations where separable-convolution kernels run faster than non separable ones.}
+\caption[Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run  on a C2070 card.]{Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run  on a C2070 card. Timings include data copy. Bold values correspond to situations where separable-convolution kernels run faster than non separable ones.}
  \label{tab:convoSepSh1}
  \end{table}
  \begin{table}[h]
  \label{tab:convoSepSh1}
  \end{table}
  \begin{table}[h]
@@ -369,7 +372,7 @@ $\mathbf{2048\times 2048}$&1598 &1541 &\bf 1503 &\bf 1410&\bf 1364&\bf 1290\\\hl
  $\mathbf{4096\times 4096}$&1654 &1596 &\bf 1542 &\bf 1452&\bf 1400&\bf 1330\\\hline
  \end{tabular}
  }  
  $\mathbf{4096\times 4096}$&1654 &1596 &\bf 1542 &\bf 1452&\bf 1400&\bf 1330\\\hline
  \end{tabular}
  }  
-\caption{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
+\caption[Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
  \label{tab:convoSepSh2}
  \end{table} 
  
  \label{tab:convoSepSh2}
  \end{table}