-\chapterauthor{Gilles Perrot}{FEMTO-ST Institute}
+\chapterauthor{Gilles Perrot}{Femto-ST Institute, University of Franche-Comt\'e, France}
+
+\chapter{Implementing an efficient convolution operation on GPU}
+
%\newcommand{\kl}{\includegraphics[scale=0.6]{Chapters/chapter4/img/kernLeft.png}~}
%\newcommand{\kr}{\includegraphics[scale=0.6]{Chapters/chapter4/img/kernRight.png}}
%% }
-\chapter{Implementing an efficient convolution \index{Convolution} operation on GPU}
+
\section{Overview}
In this chapter, after dealing with GPU median filter implementations,
-we propose to explore how convolutions can be implemented on modern
+we propose to explore how convolutions\index{Convolution} can be implemented on modern
GPUs. Widely used in digital image processing filters, the \emph{convolution
operation} basically consists in taking the sum of products of elements
from two 2-D functions, letting one of the two functions move over
\begin{figure}
\centering
\includegraphics[width=11cm]{Chapters/chapter4/img/convo1.png}
- \caption{Principle of a generic convolution implementation. The center pixel is represented with a black background and the pixels of its neighborhood are denoted $I_{p,q}$ where $(p,q)$ is the relative position of the neighbor pixel. Elements $h_{t,u}$ are the values of the convolution mask.}
+ \caption[Principle of a generic convolution implementation.]{Principle of a generic convolution implementation. The center pixel is represented with a black background and the pixels of its neighborhood are denoted $I_{p,q}$ where $(p,q)$ is the relative position of the neighbor pixel. Elements $h_{t,u}$ are the values of the convolution mask.}
\label{fig:convoPrinciple}
\end{figure}
\begin{algorithm}
$\mathbf{4096\times 4096}$&4.700&1585 &13.05&533 &25.56&533 \\\hline
\end{tabular}
}
-\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card (fermi architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.}
+\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card (Fermi architecture). Data transfer durations are those of Table \ref{tab:memcpy1}.}
\label{tab:convoNonSepReg1}
\end{table}
$\mathbf{4096\times 4096}$&3.171&1075 &8.720&793 &17.076&569 \\\hline
\end{tabular}
}
-\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.}
+\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer durations are those of Table \ref{tab:memcpy1}.}
\label{tab:convoNonSepReg3}
\end{table}
$\mathbf{4096\times 4096}$&1.964&2138 &3.926&1711 &6.416&1364 \\\hline
\end{tabular}
}
-\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
+\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
\label{tab:convoGene8x8p}
\end{table}
\begin{figure}
\centering
\includegraphics[width=12cm]{Chapters/chapter4/img/convoShMem.png}
- \caption{Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$. Threads in both top corners of the top figure are identified either by a circle or by a star symbol. The image tile, loaded into shared memory includes the pixels to be updated by the threads of the block, as well as its 2-pixel wide halo. Here, circle and star symbols in the image tile show which pixels are actually loaded into one shared memory vector by its corresponding thread. }
+ \caption[Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$.]{Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$. Threads in both top corners of the top figure are identified either by a circle or by a star symbol. The image tile, loaded into shared memory, includes the pixels to be updated by the threads of the block, as well as its 2-pixel wide halo. Here, circle and star symbols in the image tile show which pixels are actually loaded into one shared memory vector by its corresponding thread.}
\label{fig:ShMem1}
\end{figure}
Still, we also implemented this method, in a manner similar to what Nvidia did in its SDK sample code.
$\mathbf{4096\times 4096}$&2090 &1637 &1195 &830&684&561\\\hline
\end{tabular}
}
-\caption{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
+\caption[Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
\label{tab:convoGeneSh2}
\end{table}
\lstinputlisting[label={lst:convoGeneSh1},caption=CUDA kernel achieving a generic convolution operation after a preloading of data in shared memory.]{Chapters/chapter4/code/convoGeneSh1.cu}
$\mathbf{2048\times 2048}$&1.094 &1.191 &\bf 1.260 &\bf 1.444&\bf 1.545&\bf 1.722\\\hline
$\mathbf{4096\times 4096}$&4.262 &4.631 &\bf 5.000 &\bf 5.676&\bf 6.105&\bf 6.736\\\hline
\end{tabular}}
-\caption{Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run on a C2070 card. Timings include data copy. Bold values correspond to situations where separable-convolution kernels run faster than non separable ones.}
+\caption[Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run on a C2070 card.]{Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run on a C2070 card. Timings include data copy. Bold values correspond to situations where separable-convolution kernels run faster than non separable ones.}
\label{tab:convoSepSh1}
\end{table}
\begin{table}[h]
$\mathbf{4096\times 4096}$&1654 &1596 &\bf 1542 &\bf 1452&\bf 1400&\bf 1330\\\hline
\end{tabular}
}
-\caption{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
+\caption[Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.}
\label{tab:convoSepSh2}
\end{table}