\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously.}}{45}}
\newlabel{fig:median5overlap}{{4.9}{45}}
\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network.}}{45}}
-\newlabel{fig:median5overlap}{{4.10}{45}}
+\newlabel{fig:bitonic}{{4.10}{45}}
\newlabel{lst:medianForget2pix5}{{4.5}{46}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{46}}
\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{47}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{50}}
\@writefile{toc}{\contentsline {section}{Bibliography}{51}}
\@setckpt{Chapters/chapter3/ch3}{
-\setcounter{page}{52}
+\setcounter{page}{53}
\setcounter{equation}{0}
\setcounter{enumi}{3}
\setcounter{enumii}{0}
\setcounter{enumiii}{0}
-\setcounter{enumiv}{10}
+\setcounter{enumiv}{11}
\setcounter{footnote}{0}
\setcounter{mpfootnote}{0}
\setcounter{part}{2}
\chapterauthor{Gilles Perrot}{Femto-ST Institute, University of Franche-Comte, France}
-%\graphicspath{{img/}}
-
-
-% \begin{VF}
-% ``A ''
-
-% \VA{Thomas Davenport}{Senior Adjutant to the Junior Marketing VP}
-% \end{VF}
-
-
-
-% \begin{shadebox}
-% A component part for an electronic item is
-% manufactured at one of three different factories, and then delivered to
-% the main assembly line.Of the total number supplied, factory A supplies
-% 50\%, factory B 30\%, and factory C 20\%. Of the components
-% manufactured at factory A, 1\% are faulty and the corresponding
-% proportions for factories B and C are 4\% and 2\% respectively. A
-% component is picked at random from the assembly line. What is the
-% probability that it is faulty?
-% \end{shadebox}
-
-
-% \begin{equation}
-% \mbox{var}\widehat{\Delta} = \sum_{j = 1}^t \sum_{k = j+1}^t
-% \mbox{var}\,(\hat{\alpha}_j - \hat{\alpha}_k) = \sum_{j = 1}^t
-% \sum_{k = j+1}^t \sigma^2(1/n_j + 1/n_k). \label{2delvart2}
-% \end{equation}
-
-
-% \begin{shortbox}
-% \Boxhead{Box Title Here}
-% \end{shortbox}
-
-% \begin{theorem}\label{1th:Z_m}
-% Let $m$ be a prime number. With the addition and multiplication as
-% defined above, $Z_m$ is a field.
-% \end{theorem}
-
-% \begin{proof}
-% \end{proof}
-
-% \begin{notelist}{000000}
-% \notes{Note:}{The process of integrating reengineering is best accomplished with an engineer, a dog, and a cat.}
-% \end{notelist}
-
-
-% \begin{VT1}
-% \VH{Think About It...}
-% Com
-% \VT
-% \VTA{The Information Revolution}{Business Week}
-% \end{VT1}
-
-
-%\begin{definition}\label{1def:linearcomb}{}\end{definition}
-
-
-
-% \begin{extract}
-% text
-% \end{extract}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-% Listings
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%% \lstset{
-%% language=C,
-%% columns=fixed,
-%% basicstyle=\footnotesize\ttfamily,
-%% numbers=left,
-%% firstnumber=1,
-%% numberstyle=\tiny,
-%% stepnumber=5,
-%% numbersep=5pt,
-%% tabsize=3,
-%% extendedchars=true,
-%% breaklines=true,
-%% keywordstyle=\textbf,
-%% frame=single,
-%% % keywordstyle=[1]\textbf,
-%% %identifierstyle=\textbf,
-%% commentstyle=\color{white}\textbf,
-%% stringstyle=\color{white}\ttfamily,
-%% % xleftmargin=17pt,
-%% % framexleftmargin=17pt,
-%% % framexrightmargin=5pt,
-%% % framexbottommargin=4pt,
-%% backgroundcolor=\color{lightgray},
-%% }
-
-%\DeclareCaptionFont{blue}{\color{blue}}
-%\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}}
-
-%\DeclareCaptionFont{white}{\color{white}}
-%\DeclareCaptionFormat{listing}{\colorbox{gray}{\parbox{\textwidth}{\hspace{15pt}#1#2#3}}}
-%\captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white, singleline}
-%%%%%%%%%%%%%%%%%%%%%%%% Fin Listings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newcommand{\kl}{\includegraphics[scale=0.6]{Chapters/chapter3/img/kernLeft.png}~}
\newcommand{\kr}{\includegraphics[scale=0.6]{Chapters/chapter3/img/kernRight.png}}
\section{Data transfers, memory management.}
This section deals with the following issues:
\begin{enumerate}
-\item Data transfer from CPU memory to GPU global memory: several GPU memory areas are available as destination memory but the 2-D caching mechanism of texture memory \index{memory~hierarchy!texture~memory}, specifically designed for fetching neighboring pixels, is currently the fastest way to fetch gray-level pixel values inside a kernel computation. This has led us to choose \textbf{texture memory} as primary GPU memory area for images.
-\item Data fetching from GPU global memory to kernel local memory: as said above, we use texture memory \index{memory~hierarchy!texture~memory}. Depending on which process is run, texture data is used either by direct fetching in kernel local memory or through a prefetching \index{prefetching} in shared memory \index{memory~hierarchy!shared~memory}.
+\item Data transfer from CPU memory to GPU global memory: several GPU memory areas are available as destination memory but the 2-D caching mechanism of texture memory \index{memory~hierarchy!texture~memory}, specifically designed for fetching neighboring pixels, is currently the fastest way to fetch gray-level pixel values inside a kernel computation. This has led us to choose \textbf{texture memory} as primary GPU memory area for input images.
+\item Data fetching from GPU global memory to kernel local memory: as said above, we use texture memory. \index{memory~hierarchy!texture~memory} Depending on which process is run, texture data is used either by direct fetching in kernel local memory or through a prefetching \index{prefetching} in shared memory \index{memory~hierarchy!shared~memory}.
\item Data outputting from kernels to GPU memory: there is actually no alternative to global memory, as kernels can not directly write into texture memory and as copying from texture to CPU memory would not be faster than from simple global memory.
-\item Data transfer from GPU global memory to CPU memory: it can be drastically accelerated by use of \textbf{pinned memory} \index{memory~hierarchy!pinned~memory}, keeping in mind it has to be used sparingly.
+\item Data transfer from GPU global memory to CPU memory: it can be drastically accelerated by use of \textbf{pinned memory}, \index{memory~hierarchy!pinned~memory} keeping in mind it has to be used sparingly.
\end{enumerate}
Algorithm \ref{algo:memcopy} summarizes all the above considerations and describes how data are handled in our examples. For more information on how to handle the different types of GPU memory, we suggest to refer to the CUDA programmer's guide.
\chapter{Implementing a fast median filter}
\section{Introduction}
-Median filtering is a well-known method used in a wide range of application frameworks as well as a standalone filter especially for \textit{salt and pepper} denoising. It is able to highly reduce power of noise without blurring edges too much.
+Median filtering is a well-known method used in a wide range of application frameworks as well as a standalone filter especially for \textit{salt and pepper} denoising. It is able to highly reduce power of noise without blurring edges too much. That is actually why we originally focused on this filtering technique as a preprocessing stage when we were in the process of designing a GPU implementation of one region-based image segmentation algorithm \cite{6036776}.
First introduced by Tukey in \cite{tukey77}, it has been widely studied since then, and many researchers have proposed efficient implementations of it, adapted to various hypothesis, architectures and processors.
Originally, its main drawbacks were its compute complexity, its non linearity and its data-dependent runtime. Several researchers have addressed these issues and designed, for example, efficient histogram-based median filter with predictible runtimes \cite{Huang:1981:TDS:539567, Weiss:2006:FMB:1179352.1141918}.
\section{NVidia GPU tuning recipes}
When designing GPU code, besides thinking of the actual data computing process, one must choose the memory type into which to store temporary data. Three types of GPU memory are available:
\begin{enumerate}
-\item \textbf{Global memory, the most versatile:} \index{memory~hierarchy!global~memory}\\Offers the largest storing space and global scope but is slowest (400 cycles latency). \textbf{Texture memory} is physically included into it, but allows access through an efficient 2-D caching mechanism.
-\item \textbf{Registers, the fastest:} \index{memory~hierarchy!registers}\\Allow access wihtout latency, but only 63 registers are available per thread (thread scope), with a maximum of 32K per Symetric Multiprocessor (SM) \index{register count}w.
-\item \textbf{Shared memory, a complex compromise:} \index{memory~hierarchy!shared~memory}\\All threads in one block can access 48~KBytes of shared memory, which is faster than global memory (20 cycles latency) but slower than registers.
+\item \textbf{Global memory, the most versatile:} \index{memory~hierarchy!global~memory}\\Offers the largest storing space and global scope but is slowest (400-800 clock cycles latency). \textbf{Texture memory} is physically included into it, but allows access through an efficient 2-D caching mechanism.
+\item \textbf{Registers, the fastest:} \index{memory~hierarchy!registers}\\Allow access wihtout latency, but only 63 registers are available per thread (thread scope), with a maximum of 32K per Symetric Multiprocessor (SM) \index{register count}.
+\item \textbf{Shared memory, a complex compromise:} \index{memory~hierarchy!shared~memory}\\All threads in one block can access 48~KBytes of shared memory, which is faster than global memory (\~20 cycles latency) but slower than registers.
However, bank conflicts can occur if two threads of a warp try to access data stored in one single memory bank. In such cases, the parallel process is re-serialized which may cause significant performance decrease. One easy way to avoid it is to ensure that two consecutive threads in one block always access 32-bit data at two consecutive adresses.
\end{enumerate}
\caption{Forgetful selection with the minimal element register count. Illustration for 3$\times$3 pixel window represented in a row and supposed sorted.}
\label{fig:forgetful_selection}
\end{figure}
-We must remember that, in the fully sorted vector, the median value will have the middle index i.e. $\lfloor n^2/2\rfloor$.
+We must remember that, in the fully sorted vector, the median value will have the middle index \textit{i.e.} $\lfloor n^2/2\rfloor$.
Moreover, assuming that both \textit{extrema} are eliminated from the first $k$ elements and that the global median is one of them would mean that:
\begin{itemize}
\item if the global median was the minimum among the $k$ elements, then at least $k-1$ elements would have a higher index. Considering the above median definition, at least $k-1$ elements should also have a lower index in the entire vector.
\end{figure}
-The \textit{forgetful selection} method, used in \cite{mcguire2008median} does not imply full sorting of values, but only selecting minimum and maximum values, which, at the price of a few iteration steps ($n^2-k$), reduces arithmetic complexity.
-Listing \ref{lst:medianForget1pix3} details this process where forgetful selection is achieved by use of simple 2-value swapping function ($s()$, lines 1 to 5) that swaps input values if necessary, so as to achieve the first steps of a \textit{bitonic}-like sorting network (\cite{Batcher:1968:SNA:1468075.1468121}). Moreover, whenever possible, in order to increase the Instruction-Level Parallelism \index{Instruction-Level Parallelism}, successive calls to $s()$ are done with independant elements as arguments. This is illustrated by the macro definitions of lines 7 to 14 and by Figure \ref{fig:bitonic} which details the first iteration of the $5\times 5$ selection, starting with $k_{25}=14$ elements.
+The \textit{forgetful selection} method, used in \cite{mcguire2008median}, does not imply full sorting of values, but only selecting minimum and maximum values, which, at the price of a few iteration steps ($n^2-k$), reduces arithmetic complexity.
+Listing \ref{lst:medianForget1pix3} details this process where forgetful selection is achieved by use of simple 2-value swapping function ($s()$, lines 1 to 5) that swaps input values if necessary, so as to achieve the first steps of an incomplete sorting network \cite{Batcher:1968:SNA:1468075.1468121}. Moreover, whenever possible, in order to increase the Instruction-Level Parallelism \index{Instruction-Level Parallelism}, successive calls to $s()$ are done with independant elements as arguments. This is illustrated by the macro definitions of lines 7 to 14 and by Figure \ref{fig:bitonic} which details the first iteration of the $5\times 5$ selection, starting with $k_{25}=14$ elements.
\lstinputlisting[label={lst:medianForget1pix3},caption= 3$\times$3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070. ]{Chapters/chapter3/code/kernMedianForget1pix3.cu}
\end{figure}
\section{A 5$\times$5 and more median filter }
-Considering the maximum register count allowed per thread (63) and trying to push this technique to its limit potentially allows designing up to 9$\times$9 median filters. Such maximum would actually use $k_{81}=\lceil 81/2\rceil+1 = 42$ registers per thread plus 9, used by the compiler to complete arithmetic operations. This leads to a total register count of 51, which would forbid to compute more than one pixel per thread, but also would limit the number of concurrent threads per block. Our measurements show that this technique is still worth using for the 5$\times$5 median. As for larger window sizes, one option could be using shared memory.
+Considering the maximum register count allowed per thread (63) and trying to push this technique to its limit potentially allows designing up to 9$\times$9 median filters. Such maximum would actually use $k_{81}=\lceil 81/2\rceil+1 = 42$ registers per thread plus 9, used by the compiler to complete arithmetic operations and 9 more when outputting 2 pixels per thread. This leads to a total register count of 60, which would limit the number of concurrent threads per block. Our measurements show that this technique is still worth using for the 7$\times$7 median. As for larger window sizes, one option could be using shared memory.
The next two sections will first detail the particular case of the 5$\times$5 median through register-only method and eventually a generic kernel for larger window sizes.
\subsection{A register-only 5$\times$5 median filter \label{sec:median5}}
\begin{figure}
\centering
- \includegraphics[width=6cm]{Chapters/chapter3/img/forgetful_selection4.png}
+ \includegraphics[width=6cm]{Chapters/chapter3/img/fig3.jpg}
\caption[First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network.]{First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.}
- \label{fig:median5overlap}
+ \label{fig:bitonic}
\end{figure}
\lstinputlisting[label={lst:medianForget2pix5},caption=kernel 5$\times$5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.]{Chapters/chapter3/code/kernMedian2pix5.cu}
int j= __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x,2);
int i= __mul24(blockIdx.y,blockDim.y) + threadIdx.y;
int a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13;//left window
- int b6,b7,b8,b9,b10,b11,b12,b13 ; //right window
+ int b7,b8,b9,b10,b11,b12,b13 ; //right window
//first 14 common pixels
a0 = tex2D(tex_img_ins, j-1, i-2) ; // first line
a1 = tex2D(tex_img_ins, j , i-2) ;
minmax8(&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13);
// separation
- b6=a6; b7=a7; b8=a8; b9=a9; b10=a10; b11=a11; b12=a12; b13=a13;
+ b7=a7; b8=a8; b9=a9; b10=a10; b11=a11; b12=a12; b13=a13;
// separate selections: 5 remaining pixels in both windows
a13 = tex2D(tex_img_ins, j-2, i-2);
-__global__ void kernel_median5_2pix( short *output, int i_dim, int j_dim)
+__global__ void kernel_median5_2pix( short *output,
+ int i_dim, int j_dim)
{
- int j = __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x,2) ;
- int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ;
- int a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13 ;//left window
- int b6, b7, b8, b9, b10, b11, b12, b13 ; //right window
-
+ int j= __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x,2);
+ int i= __mul24(blockIdx.y,blockDim.y) + threadIdx.y;
+ int a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13;//left window
+ int b6,b7,b8,b9,b10,b11,b12,b13 ; //right window
+ //first 14 common pixels
a0 = tex2D(tex_img_ins, j-1, i-2) ; // first line
a1 = tex2D(tex_img_ins, j , i-2) ;
a2 = tex2D(tex_img_ins, j+1, i-2) ;
// separation
b6=a6; b7=a7; b8=a8; b9=a9; b10=a10; b11=a11; b12=a12; b13=a13;
- // separate selections for the 5 remaining pixels in both windows
+ // separate selections: 5 remaining pixels in both windows
a13 = tex2D(tex_img_ins, j-2, i-2);
b13 = tex2D(tex_img_ins, j+3, i-2);
minmax7(&a7,&a8,&a9,&a10,&a11,&a12,&a13);
output[ __mul24(i, j_dim) +j ] = a12 ; //middle values
output[ __mul24(i, j_dim) +j+1 ] = b12 ;
-
}
\chapter{Implementing an efficient convolution operation on GPU}
-%\newcommand{\kl}{\includegraphics[scale=0.6]{Chapters/chapter4/img/kernLeft.png}~}
-%\newcommand{\kr}{\includegraphics[scale=0.6]{Chapters/chapter4/img/kernRight.png}}
-
-%% \lstset{
-%% language=C,
-%% columns=fixed,
-%% basicstyle=\footnotesize\ttfamily,
-%% numbers=left,
-%% firstnumber=1,
-%% numberstyle=\tiny,
-%% stepnumber=5,
-%% numbersep=5pt,
-%% tabsize=3,
-%% extendedchars=true,
-%% breaklines=true,
-%% keywordstyle=\textbf,
-%% frame=single,
-%% % keywordstyle=[1]\textbf,
-%% %identifierstyle=\textbf,
-%% commentstyle=\color{white}\textbf,
-%% stringstyle=\color{white}\ttfamily,
-%% % xleftmargin=17pt,
-%% % framexleftmargin=17pt,
-%% % framexrightmargin=5pt,
-%% % framexbottommargin=4pt,
-%% backgroundcolor=\color{lightgray},
-%% }
before adressing the matter of separable convolutions. We shall refer
to $I$ as an H x L pixel gray-level image, and to $I(x,y)$ as the gray-level
value of each pixel of coordinates $(x,y)$.
-%dire qqs mots sur le filtrage IIR/FIR ?
+
\section{Definition}
GPU functions referred to as kernels, we shall use\emph{ convolution
mask} instead of \emph{convolution kernel}) is defined by:
\begin{equation}
-I'(x, y) = \left(I * h\right) = \sum_{i < H} \sum_{j < L}I(x-j, y-j).h(j,i)
+I'(x, y) = \left(I * h\right) = \sum_{(i < H)} \sum_{(j < L)}I(x-j, y-j).h(j,i)
\label{convoDef}
\end{equation}
-While processing an image, function \emph{h} is bounded by a square
-window of size \emph{k = 2r + 1}, i.e an uneven number, to ensure
+While processing an image, function \emph{h} is often bounded by a square
+window of size \emph{k = 2r + 1}, \textit{i.e} an uneven number, to ensure
there is a center. We shall also point out that, as stated earlier,
the square shape is no limiting factor to the process, as any shape
can be inscribed into a square. In the case of a more complex shape,
\caption{generic convolution}
\label{algo_genconv}
\ForEach{pixel at position $(x, y)$}{
- Read all gray-level values $I(x, y)$ in the neighborhood\\\;
- Compute the weighted sum \( I_\Omega = \sum_{(j,i) \in \Omega}I(x-j, y-j).h(j,i) \)\\\;
- Normalize $I'(x, y)$ value\\\;
+ Read all gray-level values $I(x, y)$ in the neighborhood\;
+ Compute the weighted sum \( I_\Omega = \sum_{(j,i) \in \Omega}I(x-j, y-j).h(j,i) \)\;
+ Normalize $I'(x, y)$ value\;
Outputs the new gray-level value
}
\end{algorithm}
previous chapter, as a reminder : texture memory used with incoming
data, pinned memory with output data, optimized use of registers
while processing data and multiple output per thread\index{Multiple output per thread}.
-One signifcant difference lies in the fact
+One significant difference lies in the fact
that the median filter uses only one parameter, the size of the window mask,
which can be hard-coded, while a convolution mask requires referring to several; hard-coding
its elements would lead to severe lack of flexibility (one function
point in our approach.
Let us assume that we are planning to implement the convolution defined by the following $3\times 3$ mask (low-pass filter or averaging filter):
-$$h=\begin{bmatrix}1&1&1\\1&1&1\\1&1&1\end{bmatrix}$$
+$$h=\frac{1}{9}\begin{bmatrix}1&1&1\\1&1&1\\1&1&1\end{bmatrix}$$
The kernel code presented in Listing \ref{lst:convoGene3Reg8} implements the convolution operation and applies all above optimizations except, for clarity reasons, multiple output per thread.
In the particular case of a generic convolution, it is important to note how mask coefficients are applied to image pixels in order to fit the definition of equation \ref{convoDef}: if the coordinates of the center pixel had been set to (0,0), then the pixel of coordinates $(i,j)$ would have been multiplied by the element $(-i,-j)$ of the mask, which, transposed in our kernel code, leads to multiply the $p^{th}$ pixel of the window by the $(n-p)^{th}$ element of the convolution mask.
\end{table}
Table \ref{tab:convoNonSepReg3} shows timings and global throughput values achieved by those convolution masks on an Nvidia GT200 Tesla architecture (GTX480 card) with $16x8$ thread blocks. This measurement has been done in order to make a relevant comparison with a reference given by Nvidia in \cite{convolutionsoup} where they state that their fastest kernel achieves a $5\times5$ convolution of an 8-bit $2048\times 2048$ pixelimage in $1.4~ms$, which lead to a throughput value of 945~Mpix/s.
-Our current value of 802~Mpix/s, though not unsatisfactory, remains lower to the one reached by the manufacturer'own coding.
+Our current value of 802~Mpix/s, though not unsatisfactory, remains lower to the one reached by the manufacturer's own coding.
Tested in the same conditions, the newer Fermi architecture of
Nvidia's GPUs proved slower (3.3 ms, see Table \ref{tab:convoNonSepReg1}) due to the lower maximum
register count allowed (63, against 128 for Tesla GT200).
or more pixels per thread while keeping safely within the 63-per-thread
rule.
-However, when doing so, \textit{eg} processing what we shall call a \textit{packet} of pixels, window mask overlapping has to be taken into account
+However, when doing so, \textit{e.g} processing what we shall call a \textit{packet} of pixels, window mask overlapping has to be taken into account
to avoid multiple texture fetches of each pixel's gray-level value, while benefiting from the 2-D cache.
In that case, both mask size and pixel packet shape determine the number of texture fetches to be performed for each pixel value.
Figure \ref{fig:convoOverlap1} illustrates two different situations: on top, a mask of radius 1 ($3\times 3$) applied to a packet of 8 pixels in row; at bottom, a mask of radius 2 ($5\times 5$).
-1&2&-1
\end{bmatrix}$$
Such a mask allows to replace a generic 2-D convolution operation by two consecutive stages of a 1-D convolution operation: a vertical of mask $h_v$ and a horizontal of mask $h_h$.
-This saves a lot of arithmetic operations, as a generic $n\times n$ convolution applied on a $H\times L$ image basically represents $H.L.n^2$ multiplications and as many additions, while two consecutive $n\times 1$ convolutions only represents $2.H.L.n$ of each, \textit{eg} 60\% operations are saved per pixel of the image for a $5\times 5$ mask.\\
+This saves a lot of arithmetic operations, as a generic $n\times n$ convolution applied on a $H\times L$ image basically represents $H.L.n^2$ multiplications and as many additions, while two consecutive $n\times 1$ convolutions only represents $2.H.L.n$ of each, \textit{e.g} 60\% operations are saved per pixel of the image for a $5\times 5$ mask.\\
However, beside reducing the operation count, performing a separable convolution also means writing an intermediate image into global memory.
CPU implementations of separable convolutions often use a single function to perform both 1-D convolution stages. To do so, this function reads the input image and actually ouputs the transposed filtered image.
Applying that principle to GPUs is not efficient, as outputting the transposed image means non-coalescent writes into global memory, generating severe performance loss. Hence the idea of developing two different kernels, one for each of both vertical and horizontal convolutions.
for (ic=0 ; ic<k ; ic++)
{
pix = tex2D(tex_img_inc, j+1, i-1+ic) ;
- outval0 += masque[ __umul24(ic,k) +2 ]*pix ;
- outval1 += masque[ __umul24(ic,k) +1 ]*pix ;
- outval2 += masque[ __umul24(ic,k) ]*pix ;
+ outval0 += mask[ __umul24(ic,k) +2 ]*pix ;
+ outval1 += mask[ __umul24(ic,k) +1 ]*pix ;
+ outval2 += mask[ __umul24(ic,k) ]*pix ;
pix = tex2D(tex_img_inc, j+2, i-1+ic) ;
- outval1 += masque[ __umul24(ic,k) +2 ]*pix ;
- outval2 += masque[ __umul24(ic,k) +1 ]*pix ;
- outval3 += masque[ __umul24(ic,k) ]*pix ;
+ outval1 += mask[ __umul24(ic,k) +2 ]*pix ;
+ outval2 += mask[ __umul24(ic,k) +1 ]*pix ;
+ outval3 += mask[ __umul24(ic,k) ]*pix ;
pix = tex2D(tex_img_inc, j+3, i-1+ic) ;
- outval2 += masque[ __umul24(ic,k) +2 ]*pix ;
- outval3 += masque[ __umul24(ic,k) +1 ]*pix ;
- outval4 += masque[ __umul24(ic,k) ]*pix ;
+ outval2 += mask[ __umul24(ic,k) +2 ]*pix ;
+ outval3 += mask[ __umul24(ic,k) +1 ]*pix ;
+ outval4 += mask[ __umul24(ic,k) ]*pix ;
pix = tex2D(tex_img_inc, j+4, i-1+ic) ;
- outval3 += masque[ __umul24(ic,k) +2 ]*pix ;
- outval4 += masque[ __umul24(ic,k) +1 ]*pix ;
- outval5 += masque[ __umul24(ic,k) ]*pix ;
+ outval3 += mask[ __umul24(ic,k) +2 ]*pix ;
+ outval4 += mask[ __umul24(ic,k) +1 ]*pix ;
+ outval5 += mask[ __umul24(ic,k) ]*pix ;
pix = tex2D(tex_img_inc, j+5, i-1+ic) ;
- outval4 += masque[ __umul24(ic,k) +2 ]*pix ;
- outval5 += masque[ __umul24(ic,k) +1 ]*pix ;
- outval6 += masque[ __umul24(ic,k) ]*pix ;
+ outval4 += mask[ __umul24(ic,k) +2 ]*pix ;
+ outval5 += mask[ __umul24(ic,k) +1 ]*pix ;
+ outval6 += mask[ __umul24(ic,k) ]*pix ;
pix = tex2D(tex_img_inc, j+6, i-1+ic) ;
- outval5 += masque[ __umul24(ic,k) +2 ]*pix ;
- outval6 += masque[ __umul24(ic,k) +1 ]*pix ;
- outval7 += masque[ __umul24(ic,k) ]*pix ;
+ outval5 += mask[ __umul24(ic,k) +2 ]*pix ;
+ outval6 += mask[ __umul24(ic,k) +1 ]*pix ;
+ outval7 += mask[ __umul24(ic,k) ]*pix ;
// end zones
pix = tex2D(tex_img_inc, j, i-1+ic) ;
- outval0 += masque[ __umul24(ic,k) +1 ]*pix ;
- outval1 += masque[ __umul24(ic,k) ]*pix ;
+ outval0 += mask[ __umul24(ic,k) +1 ]*pix ;
+ outval1 += mask[ __umul24(ic,k) ]*pix ;
pix = tex2D(tex_img_inc, j-1, i-1+ic) ;
- outval0 += masque[ __umul24(ic,k) ]*pix ;
+ outval0 += mask[ __umul24(ic,k) ]*pix ;
pix = tex2D(tex_img_inc, j+7, i-1+ic) ;
- outval6 += masque[ __umul24(ic,k) +2 ]*pix ;
- outval7 += masque[ __umul24(ic,k) +1 ]*pix ;
+ outval6 += mask[ __umul24(ic,k) +2 ]*pix ;
+ outval7 += mask[ __umul24(ic,k) +1 ]*pix ;
pix = tex2D(tex_img_inc, j+8, i-1+ic) ;
- outval7 += masque[ __umul24(ic,k) +2 ]*pix ;
+ outval7 += mask[ __umul24(ic,k) +2 ]*pix ;
}
// multiple output
output[ __umul24(i, j_dim) + j ] = outval0 ;
output[ __umul24(i, j_dim) + j ] = outval0 ;
output[ __umul24(i, j_dim) + j+1 ] = outval1 ;
output[ __umul24(i, j_dim) + j+2 ] = outval2 ;
- output[ __umul24(i, j_dim) + j+3 ] = outval3;
- output[ __umul24(i, j_dim) + j+4 ] = outval4;
- output[ __umul24(i, j_dim) + j+5 ] = outval5;
+ output[ __umul24(i, j_dim) + j+3 ] = outval3 ;
+ output[ __umul24(i, j_dim) + j+4 ] = outval4 ;
+ output[ __umul24(i, j_dim) + j+5 ] = outval5 ;
output[ __umul24(i, j_dim) + j+6 ] = outval6 ;
output[ __umul24(i, j_dim) + j+7 ] = outval7 ;
}
for( jc=0 ; jc<k ; jc++)
{
int baseRoi = __umul24(ic+threadIdx.y,(bdimX+k-1)) + jc+tidX ;
- float valMask = masque[ __umul24(ic,k)+jc ] ;
+ float valMask = mask[ __umul24(ic,k)+jc ] ;
outval0 += valMask*roi8p[ baseRoi ] ;
outval1 += valMask*roi8p[ baseRoi +1 ] ;
outval2 += valMask*roi8p[ baseRoi +2 ] ;
output[ idx++ ] = outval4 ;
output[ idx++ ] = outval5 ;
output[ idx++ ] = outval6 ;
- output[ idx++ ] = outval7 ;
+ output[ idx ] = outval7 ;
}
}
}
__syncthreads();
-
+
// computations
for (ic=0 ; ic<k ; ic++)
for( jc=0 ; jc<k ; jc++)
{
int baseRoi = __umul24(ic+threadIdx.y,(bdimX+k-1)) + jc+tidX ;
- float valMask = masque[ __umul24(ic,k)+jc ] ;
+ float valMask = mask[ __umul24(ic,k)+jc ] ;
outval0 += valMask*roi8p[ baseRoi ] ;
outval1 += valMask*roi8p[ baseRoi +1 ] ;
outval2 += valMask*roi8p[ baseRoi +2 ] ;
}
// multiple output --> global mem
- output[ idx ] = outval0 ;
- output[ idx+1 ] = outval1 ;
- output[ idx+2 ] = outval2 ;
- output[ idx+3 ] = outval3 ;
- output[ idx+4 ] = outval4 ;
- output[ idx+5 ] = outval5 ;
- output[ idx+6 ] = outval6 ;
- output[ idx+7 ] = outval7 ;
+ output[ idx++ ] = outval0 ;
+ output[ idx++ ] = outval1 ;
+ output[ idx++ ] = outval2 ;
+ output[ idx++ ] = outval3 ;
+ output[ idx++ ] = outval4 ;
+ output[ idx++ ] = outval5 ;
+ output[ idx++ ] = outval6 ;
+ output[ idx ] = outval7 ;
}
output[ idx++ ] = outval4 ;
output[ idx++ ] = outval5 ;
output[ idx++ ] = outval6 ;
- output[ idx++ ] = outval7 ;
+ output[ idx ] = outval7 ;
}
{
int ic, jc, p;
int k = 2*r+1 ;
- float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ;
- int bdimX = blockDim.x<<3 ;
- int tidX = threadIdx.x<<3 ;
+ float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0 ;
+ float outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ;
+ int bdimX = blockDim.x<<3 ; // all packets width
+ int tidX = threadIdx.x<<3 ; // one packet offset
- // coordonnees absolues du point de base
+ // absolute coordinates of one packet base point
int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x)<<3 ;
int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ;
- int j0= __umul24(blockIdx.x,blockDim.x)<<3 ;
- int idx = __umul24(i,j_dim) + j ; // indice dans l'image
+ int j0= __umul24(blockIdx.x,blockDim.x)<<3 ;
+ // absolute index in the image
+ int idx = __umul24(i,j_dim) + j ;
-
- // chargement en smem
+ // offset of one ROI row in shared memory
int idrow = threadIdx.y*(bdimX+k-1) ;
extern __shared__ unsigned char roi8p[];
- // bloc 0 (a gauche)
+ // top left block
for (p=0; p<8; p++)
roi8p[ idrow + tidX +p ] = tex2D(tex_img_inc, j-r+p , i) ;
-
- // a droite
- if ( threadIdx.x < r ) //...ou plutot ce qu'il en manque
+ // top right block
+ if ( threadIdx.x < r )
{
roi8p[ idrow + bdimX + threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX+threadIdx.x , i ) ;
roi8p[ idrow + bdimX + threadIdx.x +r ] = tex2D( tex_img_inc, j0 +bdimX+threadIdx.x , i ) ;
__syncthreads();
- // calculs de convolution
- // passe horizontale
+ // horizontal convolution
for (jc=0 ; jc<k ; jc++)
{
int baseRoi = idrow + tidX +jc ;
outval7 += valMask*roi8p[ baseRoi +7 ] ;
}
- // 1 pixel par thread --> global mem
- output[ idx ] = outval0 ;
- output[ idx+1 ] = outval1 ;
- output[ idx+2 ] = outval2 ;
- output[ idx+3 ] = outval3 ;
- output[ idx+4 ] = outval4 ;
- output[ idx+5 ] = outval5 ;
- output[ idx+6 ] = outval6 ;
- output[ idx+7 ] = outval7 ;
+ // 8 pixels per thread --> global mem
+ output[ idx++ ] = outval0 ;
+ output[ idx++ ] = outval1 ;
+ output[ idx++ ] = outval2 ;
+ output[ idx++ ] = outval3 ;
+ output[ idx++ ] = outval4 ;
+ output[ idx++ ] = outval5 ;
+ output[ idx++ ] = outval6 ;
+ output[ idx++ ] = outval7 ;
}
output[ idx++ ] = outval4 ;
output[ idx++ ] = outval5 ;
output[ idx++ ] = outval6 ;
- output[ idx++ ] = outval7 ;
+ output[ idx ] = outval7 ;
}
{
int ic, jc, p;
int k = 2*r+1 ;
- float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ;
- int bdimX = blockDim.x<<3 ;
- int tidX = threadIdx.x<<3 ;
+ float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0 ;
+ float outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ;
+ int bdimX = blockDim.x<<3 ; // all packets width
+ int tidX = threadIdx.x<<3 ; // one packet offset
// absolute coordinates of the base point
int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x)<<3 ;
outval7 += valMask*roi8p[ baseRoi +7 ] ;
}
- // 8 pixel par thread --> global mem
+ // 8 pixels per thread --> global mem
output[ idx++ ] = outval0 ;
output[ idx++ ] = outval1 ;
output[ idx++ ] = outval2 ;