From: Raphael Couturier Date: Tue, 21 May 2013 15:01:44 +0000 (+0200) Subject: modif gillou X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/f045ded06189b82188fcba9dd6ca383823e34aaa?ds=inline modif gillou --- diff --git a/BookGPU/Chapters/chapter3/ch3.aux b/BookGPU/Chapters/chapter3/ch3.aux index 0f932fd..8a1e22e 100644 --- a/BookGPU/Chapters/chapter3/ch3.aux +++ b/BookGPU/Chapters/chapter3/ch3.aux @@ -84,7 +84,7 @@ \@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously.}}{45}} \newlabel{fig:median5overlap}{{4.9}{45}} \@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network.}}{45}} -\newlabel{fig:median5overlap}{{4.10}{45}} +\newlabel{fig:bitonic}{{4.10}{45}} \newlabel{lst:medianForget2pix5}{{4.5}{46}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{46}} \@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{47}} @@ -110,12 +110,12 @@ \@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{50}} \@writefile{toc}{\contentsline {section}{Bibliography}{51}} \@setckpt{Chapters/chapter3/ch3}{ -\setcounter{page}{52} +\setcounter{page}{53} \setcounter{equation}{0} \setcounter{enumi}{3} \setcounter{enumii}{0} \setcounter{enumiii}{0} -\setcounter{enumiv}{10} +\setcounter{enumiv}{11} \setcounter{footnote}{0} \setcounter{mpfootnote}{0} \setcounter{part}{2} diff --git a/BookGPU/Chapters/chapter3/ch3.tex b/BookGPU/Chapters/chapter3/ch3.tex index 8cd1767..6a5181b 100755 --- a/BookGPU/Chapters/chapter3/ch3.tex +++ b/BookGPU/Chapters/chapter3/ch3.tex @@ -1,102 +1,4 @@ \chapterauthor{Gilles Perrot}{Femto-ST Institute, University of Franche-Comte, France} -%\graphicspath{{img/}} - - -% \begin{VF} -% ``A '' - -% \VA{Thomas Davenport}{Senior Adjutant to the Junior Marketing VP} -% \end{VF} - - - -% \begin{shadebox} -% A component part for an electronic item is -% manufactured at one of three different factories, and then delivered to -% the main assembly line.Of the total number supplied, factory A supplies -% 50\%, factory B 30\%, and factory C 20\%. Of the components -% manufactured at factory A, 1\% are faulty and the corresponding -% proportions for factories B and C are 4\% and 2\% respectively. A -% component is picked at random from the assembly line. What is the -% probability that it is faulty? -% \end{shadebox} - - -% \begin{equation} -% \mbox{var}\widehat{\Delta} = \sum_{j = 1}^t \sum_{k = j+1}^t -% \mbox{var}\,(\hat{\alpha}_j - \hat{\alpha}_k) = \sum_{j = 1}^t -% \sum_{k = j+1}^t \sigma^2(1/n_j + 1/n_k). \label{2delvart2} -% \end{equation} - - -% \begin{shortbox} -% \Boxhead{Box Title Here} -% \end{shortbox} - -% \begin{theorem}\label{1th:Z_m} -% Let $m$ be a prime number. With the addition and multiplication as -% defined above, $Z_m$ is a field. -% \end{theorem} - -% \begin{proof} -% \end{proof} - -% \begin{notelist}{000000} -% \notes{Note:}{The process of integrating reengineering is best accomplished with an engineer, a dog, and a cat.} -% \end{notelist} - - -% \begin{VT1} -% \VH{Think About It...} -% Com -% \VT -% \VTA{The Information Revolution}{Business Week} -% \end{VT1} - - -%\begin{definition}\label{1def:linearcomb}{}\end{definition} - - - -% \begin{extract} -% text -% \end{extract} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Listings -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% \lstset{ -%% language=C, -%% columns=fixed, -%% basicstyle=\footnotesize\ttfamily, -%% numbers=left, -%% firstnumber=1, -%% numberstyle=\tiny, -%% stepnumber=5, -%% numbersep=5pt, -%% tabsize=3, -%% extendedchars=true, -%% breaklines=true, -%% keywordstyle=\textbf, -%% frame=single, -%% % keywordstyle=[1]\textbf, -%% %identifierstyle=\textbf, -%% commentstyle=\color{white}\textbf, -%% stringstyle=\color{white}\ttfamily, -%% % xleftmargin=17pt, -%% % framexleftmargin=17pt, -%% % framexrightmargin=5pt, -%% % framexbottommargin=4pt, -%% backgroundcolor=\color{lightgray}, -%% } - -%\DeclareCaptionFont{blue}{\color{blue}} -%\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}} - -%\DeclareCaptionFont{white}{\color{white}} -%\DeclareCaptionFormat{listing}{\colorbox{gray}{\parbox{\textwidth}{\hspace{15pt}#1#2#3}}} -%\captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white, singleline} -%%%%%%%%%%%%%%%%%%%%%%%% Fin Listings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \newcommand{\kl}{\includegraphics[scale=0.6]{Chapters/chapter3/img/kernLeft.png}~} \newcommand{\kr}{\includegraphics[scale=0.6]{Chapters/chapter3/img/kernRight.png}} @@ -131,10 +33,10 @@ However, so as to propose concise and more readable code, we will assume the fol \section{Data transfers, memory management.} This section deals with the following issues: \begin{enumerate} -\item Data transfer from CPU memory to GPU global memory: several GPU memory areas are available as destination memory but the 2-D caching mechanism of texture memory \index{memory~hierarchy!texture~memory}, specifically designed for fetching neighboring pixels, is currently the fastest way to fetch gray-level pixel values inside a kernel computation. This has led us to choose \textbf{texture memory} as primary GPU memory area for images. -\item Data fetching from GPU global memory to kernel local memory: as said above, we use texture memory \index{memory~hierarchy!texture~memory}. Depending on which process is run, texture data is used either by direct fetching in kernel local memory or through a prefetching \index{prefetching} in shared memory \index{memory~hierarchy!shared~memory}. +\item Data transfer from CPU memory to GPU global memory: several GPU memory areas are available as destination memory but the 2-D caching mechanism of texture memory \index{memory~hierarchy!texture~memory}, specifically designed for fetching neighboring pixels, is currently the fastest way to fetch gray-level pixel values inside a kernel computation. This has led us to choose \textbf{texture memory} as primary GPU memory area for input images. +\item Data fetching from GPU global memory to kernel local memory: as said above, we use texture memory. \index{memory~hierarchy!texture~memory} Depending on which process is run, texture data is used either by direct fetching in kernel local memory or through a prefetching \index{prefetching} in shared memory \index{memory~hierarchy!shared~memory}. \item Data outputting from kernels to GPU memory: there is actually no alternative to global memory, as kernels can not directly write into texture memory and as copying from texture to CPU memory would not be faster than from simple global memory. -\item Data transfer from GPU global memory to CPU memory: it can be drastically accelerated by use of \textbf{pinned memory} \index{memory~hierarchy!pinned~memory}, keeping in mind it has to be used sparingly. +\item Data transfer from GPU global memory to CPU memory: it can be drastically accelerated by use of \textbf{pinned memory}, \index{memory~hierarchy!pinned~memory} keeping in mind it has to be used sparingly. \end{enumerate} Algorithm \ref{algo:memcopy} summarizes all the above considerations and describes how data are handled in our examples. For more information on how to handle the different types of GPU memory, we suggest to refer to the CUDA programmer's guide. @@ -185,7 +87,7 @@ In order to estimate the potential for improvement of each kernel, a reference t \chapter{Implementing a fast median filter} \section{Introduction} -Median filtering is a well-known method used in a wide range of application frameworks as well as a standalone filter especially for \textit{salt and pepper} denoising. It is able to highly reduce power of noise without blurring edges too much. +Median filtering is a well-known method used in a wide range of application frameworks as well as a standalone filter especially for \textit{salt and pepper} denoising. It is able to highly reduce power of noise without blurring edges too much. That is actually why we originally focused on this filtering technique as a preprocessing stage when we were in the process of designing a GPU implementation of one region-based image segmentation algorithm \cite{6036776}. First introduced by Tukey in \cite{tukey77}, it has been widely studied since then, and many researchers have proposed efficient implementations of it, adapted to various hypothesis, architectures and processors. Originally, its main drawbacks were its compute complexity, its non linearity and its data-dependent runtime. Several researchers have addressed these issues and designed, for example, efficient histogram-based median filter with predictible runtimes \cite{Huang:1981:TDS:539567, Weiss:2006:FMB:1179352.1141918}. @@ -295,9 +197,9 @@ copy data from GPU global memory to CPU memory\label{algoMedianGeneric:memcpyD2H \section{NVidia GPU tuning recipes} When designing GPU code, besides thinking of the actual data computing process, one must choose the memory type into which to store temporary data. Three types of GPU memory are available: \begin{enumerate} -\item \textbf{Global memory, the most versatile:} \index{memory~hierarchy!global~memory}\\Offers the largest storing space and global scope but is slowest (400 cycles latency). \textbf{Texture memory} is physically included into it, but allows access through an efficient 2-D caching mechanism. -\item \textbf{Registers, the fastest:} \index{memory~hierarchy!registers}\\Allow access wihtout latency, but only 63 registers are available per thread (thread scope), with a maximum of 32K per Symetric Multiprocessor (SM) \index{register count}w. -\item \textbf{Shared memory, a complex compromise:} \index{memory~hierarchy!shared~memory}\\All threads in one block can access 48~KBytes of shared memory, which is faster than global memory (20 cycles latency) but slower than registers. +\item \textbf{Global memory, the most versatile:} \index{memory~hierarchy!global~memory}\\Offers the largest storing space and global scope but is slowest (400-800 clock cycles latency). \textbf{Texture memory} is physically included into it, but allows access through an efficient 2-D caching mechanism. +\item \textbf{Registers, the fastest:} \index{memory~hierarchy!registers}\\Allow access wihtout latency, but only 63 registers are available per thread (thread scope), with a maximum of 32K per Symetric Multiprocessor (SM) \index{register count}. +\item \textbf{Shared memory, a complex compromise:} \index{memory~hierarchy!shared~memory}\\All threads in one block can access 48~KBytes of shared memory, which is faster than global memory (\~20 cycles latency) but slower than registers. However, bank conflicts can occur if two threads of a warp try to access data stored in one single memory bank. In such cases, the parallel process is re-serialized which may cause significant performance decrease. One easy way to avoid it is to ensure that two consecutive threads in one block always access 32-bit data at two consecutive adresses. \end{enumerate} @@ -344,7 +246,7 @@ This count can be reduced by use of an iterative sorting process called \textit{ \caption{Forgetful selection with the minimal element register count. Illustration for 3$\times$3 pixel window represented in a row and supposed sorted.} \label{fig:forgetful_selection} \end{figure} -We must remember that, in the fully sorted vector, the median value will have the middle index i.e. $\lfloor n^2/2\rfloor$. +We must remember that, in the fully sorted vector, the median value will have the middle index \textit{i.e.} $\lfloor n^2/2\rfloor$. Moreover, assuming that both \textit{extrema} are eliminated from the first $k$ elements and that the global median is one of them would mean that: \begin{itemize} \item if the global median was the minimum among the $k$ elements, then at least $k-1$ elements would have a higher index. Considering the above median definition, at least $k-1$ elements should also have a lower index in the entire vector. @@ -365,8 +267,8 @@ This iterative process is illustrated in Figure \ref{fig:forgetful3}, where it a \end{figure} -The \textit{forgetful selection} method, used in \cite{mcguire2008median} does not imply full sorting of values, but only selecting minimum and maximum values, which, at the price of a few iteration steps ($n^2-k$), reduces arithmetic complexity. -Listing \ref{lst:medianForget1pix3} details this process where forgetful selection is achieved by use of simple 2-value swapping function ($s()$, lines 1 to 5) that swaps input values if necessary, so as to achieve the first steps of a \textit{bitonic}-like sorting network (\cite{Batcher:1968:SNA:1468075.1468121}). Moreover, whenever possible, in order to increase the Instruction-Level Parallelism \index{Instruction-Level Parallelism}, successive calls to $s()$ are done with independant elements as arguments. This is illustrated by the macro definitions of lines 7 to 14 and by Figure \ref{fig:bitonic} which details the first iteration of the $5\times 5$ selection, starting with $k_{25}=14$ elements. +The \textit{forgetful selection} method, used in \cite{mcguire2008median}, does not imply full sorting of values, but only selecting minimum and maximum values, which, at the price of a few iteration steps ($n^2-k$), reduces arithmetic complexity. +Listing \ref{lst:medianForget1pix3} details this process where forgetful selection is achieved by use of simple 2-value swapping function ($s()$, lines 1 to 5) that swaps input values if necessary, so as to achieve the first steps of an incomplete sorting network \cite{Batcher:1968:SNA:1468075.1468121}. Moreover, whenever possible, in order to increase the Instruction-Level Parallelism \index{Instruction-Level Parallelism}, successive calls to $s()$ are done with independant elements as arguments. This is illustrated by the macro definitions of lines 7 to 14 and by Figure \ref{fig:bitonic} which details the first iteration of the $5\times 5$ selection, starting with $k_{25}=14$ elements. \lstinputlisting[label={lst:medianForget1pix3},caption= 3$\times$3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070. ]{Chapters/chapter3/code/kernMedianForget1pix3.cu} @@ -396,7 +298,7 @@ Running this $3\times 3$ kernel saves another 10\% runtime, as shown in Figure \ \end{figure} \section{A 5$\times$5 and more median filter } -Considering the maximum register count allowed per thread (63) and trying to push this technique to its limit potentially allows designing up to 9$\times$9 median filters. Such maximum would actually use $k_{81}=\lceil 81/2\rceil+1 = 42$ registers per thread plus 9, used by the compiler to complete arithmetic operations. This leads to a total register count of 51, which would forbid to compute more than one pixel per thread, but also would limit the number of concurrent threads per block. Our measurements show that this technique is still worth using for the 5$\times$5 median. As for larger window sizes, one option could be using shared memory. +Considering the maximum register count allowed per thread (63) and trying to push this technique to its limit potentially allows designing up to 9$\times$9 median filters. Such maximum would actually use $k_{81}=\lceil 81/2\rceil+1 = 42$ registers per thread plus 9, used by the compiler to complete arithmetic operations and 9 more when outputting 2 pixels per thread. This leads to a total register count of 60, which would limit the number of concurrent threads per block. Our measurements show that this technique is still worth using for the 7$\times$7 median. As for larger window sizes, one option could be using shared memory. The next two sections will first detail the particular case of the 5$\times$5 median through register-only method and eventually a generic kernel for larger window sizes. \subsection{A register-only 5$\times$5 median filter \label{sec:median5}} @@ -410,9 +312,9 @@ The minimum register count required to apply the forgetful selection method to a \begin{figure} \centering - \includegraphics[width=6cm]{Chapters/chapter3/img/forgetful_selection4.png} + \includegraphics[width=6cm]{Chapters/chapter3/img/fig3.jpg} \caption[First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network.]{First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.} - \label{fig:median5overlap} + \label{fig:bitonic} \end{figure} \lstinputlisting[label={lst:medianForget2pix5},caption=kernel 5$\times$5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.]{Chapters/chapter3/code/kernMedian2pix5.cu} diff --git a/BookGPU/Chapters/chapter3/code/kernMedian2pix5.cu b/BookGPU/Chapters/chapter3/code/kernMedian2pix5.cu index b01619e..50511ce 100755 --- a/BookGPU/Chapters/chapter3/code/kernMedian2pix5.cu +++ b/BookGPU/Chapters/chapter3/code/kernMedian2pix5.cu @@ -4,7 +4,7 @@ __global__ void kernel_median5_2pix( short *output, int j= __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x,2); int i= __mul24(blockIdx.y,blockDim.y) + threadIdx.y; int a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13;//left window - int b6,b7,b8,b9,b10,b11,b12,b13 ; //right window + int b7,b8,b9,b10,b11,b12,b13 ; //right window //first 14 common pixels a0 = tex2D(tex_img_ins, j-1, i-2) ; // first line a1 = tex2D(tex_img_ins, j , i-2) ; @@ -37,7 +37,7 @@ __global__ void kernel_median5_2pix( short *output, minmax8(&a6,&a7,&a8,&a9,&a10,&a11,&a12,&a13); // separation - b6=a6; b7=a7; b8=a8; b9=a9; b10=a10; b11=a11; b12=a12; b13=a13; + b7=a7; b8=a8; b9=a9; b10=a10; b11=a11; b12=a12; b13=a13; // separate selections: 5 remaining pixels in both windows a13 = tex2D(tex_img_ins, j-2, i-2); diff --git a/BookGPU/Chapters/chapter3/code/kernMedian2pix5.cu~ b/BookGPU/Chapters/chapter3/code/kernMedian2pix5.cu~ index bb8d5ee..b01619e 100755 --- a/BookGPU/Chapters/chapter3/code/kernMedian2pix5.cu~ +++ b/BookGPU/Chapters/chapter3/code/kernMedian2pix5.cu~ @@ -1,10 +1,11 @@ -__global__ void kernel_median5_2pix( short *output, int i_dim, int j_dim) +__global__ void kernel_median5_2pix( short *output, + int i_dim, int j_dim) { - int j = __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x,2) ; - int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; - int a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13 ;//left window - int b6, b7, b8, b9, b10, b11, b12, b13 ; //right window - + int j= __mul24(__mul24(blockIdx.x,blockDim.x) + threadIdx.x,2); + int i= __mul24(blockIdx.y,blockDim.y) + threadIdx.y; + int a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13;//left window + int b6,b7,b8,b9,b10,b11,b12,b13 ; //right window + //first 14 common pixels a0 = tex2D(tex_img_ins, j-1, i-2) ; // first line a1 = tex2D(tex_img_ins, j , i-2) ; a2 = tex2D(tex_img_ins, j+1, i-2) ; @@ -38,7 +39,7 @@ __global__ void kernel_median5_2pix( short *output, int i_dim, int j_dim) // separation b6=a6; b7=a7; b8=a8; b9=a9; b10=a10; b11=a11; b12=a12; b13=a13; - // separate selections for the 5 remaining pixels in both windows + // separate selections: 5 remaining pixels in both windows a13 = tex2D(tex_img_ins, j-2, i-2); b13 = tex2D(tex_img_ins, j+3, i-2); minmax7(&a7,&a8,&a9,&a10,&a11,&a12,&a13); @@ -62,5 +63,4 @@ __global__ void kernel_median5_2pix( short *output, int i_dim, int j_dim) output[ __mul24(i, j_dim) +j ] = a12 ; //middle values output[ __mul24(i, j_dim) +j+1 ] = b12 ; - } diff --git a/BookGPU/Chapters/chapter4/ch4.tex b/BookGPU/Chapters/chapter4/ch4.tex index e3dbfd5..8788ab4 100644 --- a/BookGPU/Chapters/chapter4/ch4.tex +++ b/BookGPU/Chapters/chapter4/ch4.tex @@ -3,33 +3,6 @@ \chapter{Implementing an efficient convolution operation on GPU} -%\newcommand{\kl}{\includegraphics[scale=0.6]{Chapters/chapter4/img/kernLeft.png}~} -%\newcommand{\kr}{\includegraphics[scale=0.6]{Chapters/chapter4/img/kernRight.png}} - -%% \lstset{ -%% language=C, -%% columns=fixed, -%% basicstyle=\footnotesize\ttfamily, -%% numbers=left, -%% firstnumber=1, -%% numberstyle=\tiny, -%% stepnumber=5, -%% numbersep=5pt, -%% tabsize=3, -%% extendedchars=true, -%% breaklines=true, -%% keywordstyle=\textbf, -%% frame=single, -%% % keywordstyle=[1]\textbf, -%% %identifierstyle=\textbf, -%% commentstyle=\color{white}\textbf, -%% stringstyle=\color{white}\ttfamily, -%% % xleftmargin=17pt, -%% % framexleftmargin=17pt, -%% % framexrightmargin=5pt, -%% % framexbottommargin=4pt, -%% backgroundcolor=\color{lightgray}, -%% } @@ -45,7 +18,7 @@ begin with, we shall examine non-separable or generic convolutions, before adressing the matter of separable convolutions. We shall refer to $I$ as an H x L pixel gray-level image, and to $I(x,y)$ as the gray-level value of each pixel of coordinates $(x,y)$. -%dire qqs mots sur le filtrage IIR/FIR ? + \section{Definition} @@ -54,11 +27,11 @@ image $I$ and convolution mask \emph{h} (To avoid confusion with other GPU functions referred to as kernels, we shall use\emph{ convolution mask} instead of \emph{convolution kernel}) is defined by: \begin{equation} -I'(x, y) = \left(I * h\right) = \sum_{i < H} \sum_{j < L}I(x-j, y-j).h(j,i) +I'(x, y) = \left(I * h\right) = \sum_{(i < H)} \sum_{(j < L)}I(x-j, y-j).h(j,i) \label{convoDef} \end{equation} -While processing an image, function \emph{h} is bounded by a square -window of size \emph{k = 2r + 1}, i.e an uneven number, to ensure +While processing an image, function \emph{h} is often bounded by a square +window of size \emph{k = 2r + 1}, \textit{i.e} an uneven number, to ensure there is a center. We shall also point out that, as stated earlier, the square shape is no limiting factor to the process, as any shape can be inscribed into a square. In the case of a more complex shape, @@ -80,9 +53,9 @@ For more readability, only part of the connecting lines are shown. \caption{generic convolution} \label{algo_genconv} \ForEach{pixel at position $(x, y)$}{ - Read all gray-level values $I(x, y)$ in the neighborhood\\\; - Compute the weighted sum \( I_\Omega = \sum_{(j,i) \in \Omega}I(x-j, y-j).h(j,i) \)\\\; - Normalize $I'(x, y)$ value\\\; + Read all gray-level values $I(x, y)$ in the neighborhood\; + Compute the weighted sum \( I_\Omega = \sum_{(j,i) \in \Omega}I(x-j, y-j).h(j,i) \)\; + Normalize $I'(x, y)$ value\; Outputs the new gray-level value } \end{algorithm} @@ -109,7 +82,7 @@ convolutions of the tuning recipes applied to median filters in the previous chapter, as a reminder : texture memory used with incoming data, pinned memory with output data, optimized use of registers while processing data and multiple output per thread\index{Multiple output per thread}. -One signifcant difference lies in the fact +One significant difference lies in the fact that the median filter uses only one parameter, the size of the window mask, which can be hard-coded, while a convolution mask requires referring to several; hard-coding its elements would lead to severe lack of flexibility (one function @@ -117,7 +90,7 @@ per filter, no external settings) so we will just use it as a starting point in our approach. Let us assume that we are planning to implement the convolution defined by the following $3\times 3$ mask (low-pass filter or averaging filter): -$$h=\begin{bmatrix}1&1&1\\1&1&1\\1&1&1\end{bmatrix}$$ +$$h=\frac{1}{9}\begin{bmatrix}1&1&1\\1&1&1\\1&1&1\end{bmatrix}$$ The kernel code presented in Listing \ref{lst:convoGene3Reg8} implements the convolution operation and applies all above optimizations except, for clarity reasons, multiple output per thread. In the particular case of a generic convolution, it is important to note how mask coefficients are applied to image pixels in order to fit the definition of equation \ref{convoDef}: if the coordinates of the center pixel had been set to (0,0), then the pixel of coordinates $(i,j)$ would have been multiplied by the element $(-i,-j)$ of the mask, which, transposed in our kernel code, leads to multiply the $p^{th}$ pixel of the window by the $(n-p)^{th}$ element of the convolution mask. @@ -179,7 +152,7 @@ $\mathbf{4096\times 4096}$&5.882 &12.431 \\\hline \end{table} Table \ref{tab:convoNonSepReg3} shows timings and global throughput values achieved by those convolution masks on an Nvidia GT200 Tesla architecture (GTX480 card) with $16x8$ thread blocks. This measurement has been done in order to make a relevant comparison with a reference given by Nvidia in \cite{convolutionsoup} where they state that their fastest kernel achieves a $5\times5$ convolution of an 8-bit $2048\times 2048$ pixelimage in $1.4~ms$, which lead to a throughput value of 945~Mpix/s. -Our current value of 802~Mpix/s, though not unsatisfactory, remains lower to the one reached by the manufacturer'own coding. +Our current value of 802~Mpix/s, though not unsatisfactory, remains lower to the one reached by the manufacturer's own coding. Tested in the same conditions, the newer Fermi architecture of Nvidia's GPUs proved slower (3.3 ms, see Table \ref{tab:convoNonSepReg1}) due to the lower maximum register count allowed (63, against 128 for Tesla GT200). @@ -220,7 +193,7 @@ of the size of the convolution mask, one can envisage processing 2 or more pixels per thread while keeping safely within the 63-per-thread rule. -However, when doing so, \textit{eg} processing what we shall call a \textit{packet} of pixels, window mask overlapping has to be taken into account +However, when doing so, \textit{e.g} processing what we shall call a \textit{packet} of pixels, window mask overlapping has to be taken into account to avoid multiple texture fetches of each pixel's gray-level value, while benefiting from the 2-D cache. In that case, both mask size and pixel packet shape determine the number of texture fetches to be performed for each pixel value. Figure \ref{fig:convoOverlap1} illustrates two different situations: on top, a mask of radius 1 ($3\times 3$) applied to a packet of 8 pixels in row; at bottom, a mask of radius 2 ($5\times 5$). @@ -320,7 +293,7 @@ $$h = h_v \times h_h = \begin{bmatrix}1\\2\\1\end{bmatrix} \times \begin{bmatrix -1&2&-1 \end{bmatrix}$$ Such a mask allows to replace a generic 2-D convolution operation by two consecutive stages of a 1-D convolution operation: a vertical of mask $h_v$ and a horizontal of mask $h_h$. -This saves a lot of arithmetic operations, as a generic $n\times n$ convolution applied on a $H\times L$ image basically represents $H.L.n^2$ multiplications and as many additions, while two consecutive $n\times 1$ convolutions only represents $2.H.L.n$ of each, \textit{eg} 60\% operations are saved per pixel of the image for a $5\times 5$ mask.\\ +This saves a lot of arithmetic operations, as a generic $n\times n$ convolution applied on a $H\times L$ image basically represents $H.L.n^2$ multiplications and as many additions, while two consecutive $n\times 1$ convolutions only represents $2.H.L.n$ of each, \textit{e.g} 60\% operations are saved per pixel of the image for a $5\times 5$ mask.\\ However, beside reducing the operation count, performing a separable convolution also means writing an intermediate image into global memory. CPU implementations of separable convolutions often use a single function to perform both 1-D convolution stages. To do so, this function reads the input image and actually ouputs the transposed filtered image. Applying that principle to GPUs is not efficient, as outputting the transposed image means non-coalescent writes into global memory, generating severe performance loss. Hence the idea of developing two different kernels, one for each of both vertical and horizontal convolutions. diff --git a/BookGPU/Chapters/chapter4/code/convoGene8x8pL3.cu b/BookGPU/Chapters/chapter4/code/convoGene8x8pL3.cu index ce619d7..523dde5 100644 --- a/BookGPU/Chapters/chapter4/code/convoGene8x8pL3.cu +++ b/BookGPU/Chapters/chapter4/code/convoGene8x8pL3.cu @@ -14,41 +14,41 @@ __global__ void kernel_convoGene8x8pL3( unsigned char *output, int j_dim ) for (ic=0 ; ic global mem - output[ idx ] = outval0 ; - output[ idx+1 ] = outval1 ; - output[ idx+2 ] = outval2 ; - output[ idx+3 ] = outval3 ; - output[ idx+4 ] = outval4 ; - output[ idx+5 ] = outval5 ; - output[ idx+6 ] = outval6 ; - output[ idx+7 ] = outval7 ; + output[ idx++ ] = outval0 ; + output[ idx++ ] = outval1 ; + output[ idx++ ] = outval2 ; + output[ idx++ ] = outval3 ; + output[ idx++ ] = outval4 ; + output[ idx++ ] = outval5 ; + output[ idx++ ] = outval6 ; + output[ idx ] = outval7 ; } diff --git a/BookGPU/Chapters/chapter4/code/convoSepShH.cu b/BookGPU/Chapters/chapter4/code/convoSepShH.cu index 19f533e..d85b340 100644 --- a/BookGPU/Chapters/chapter4/code/convoSepShH.cu +++ b/BookGPU/Chapters/chapter4/code/convoSepShH.cu @@ -54,5 +54,5 @@ __global__ void kernel_convoSepShx8pH(unsigned char *output, int j_dim, int r) output[ idx++ ] = outval4 ; output[ idx++ ] = outval5 ; output[ idx++ ] = outval6 ; - output[ idx++ ] = outval7 ; + output[ idx ] = outval7 ; } diff --git a/BookGPU/Chapters/chapter4/code/convoSepShH.cu~ b/BookGPU/Chapters/chapter4/code/convoSepShH.cu~ index 8f3e73d..19f533e 100644 --- a/BookGPU/Chapters/chapter4/code/convoSepShH.cu~ +++ b/BookGPU/Chapters/chapter4/code/convoSepShH.cu~ @@ -2,28 +2,28 @@ __global__ void kernel_convoSepShx8pH(unsigned char *output, int j_dim, int r) { int ic, jc, p; int k = 2*r+1 ; - float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ; - int bdimX = blockDim.x<<3 ; - int tidX = threadIdx.x<<3 ; + float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0 ; + float outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ; + int bdimX = blockDim.x<<3 ; // all packets width + int tidX = threadIdx.x<<3 ; // one packet offset - // coordonnees absolues du point de base + // absolute coordinates of one packet base point int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x)<<3 ; int i = __umul24( blockIdx.y, blockDim.y) + threadIdx.y ; - int j0= __umul24(blockIdx.x,blockDim.x)<<3 ; - int idx = __umul24(i,j_dim) + j ; // indice dans l'image + int j0= __umul24(blockIdx.x,blockDim.x)<<3 ; + // absolute index in the image + int idx = __umul24(i,j_dim) + j ; - - // chargement en smem + // offset of one ROI row in shared memory int idrow = threadIdx.y*(bdimX+k-1) ; extern __shared__ unsigned char roi8p[]; - // bloc 0 (a gauche) + // top left block for (p=0; p<8; p++) roi8p[ idrow + tidX +p ] = tex2D(tex_img_inc, j-r+p , i) ; - - // a droite - if ( threadIdx.x < r ) //...ou plutot ce qu'il en manque + // top right block + if ( threadIdx.x < r ) { roi8p[ idrow + bdimX + threadIdx.x ] = tex2D( tex_img_inc, j0-r +bdimX+threadIdx.x , i ) ; roi8p[ idrow + bdimX + threadIdx.x +r ] = tex2D( tex_img_inc, j0 +bdimX+threadIdx.x , i ) ; @@ -31,8 +31,7 @@ __global__ void kernel_convoSepShx8pH(unsigned char *output, int j_dim, int r) __syncthreads(); - // calculs de convolution - // passe horizontale + // horizontal convolution for (jc=0 ; jc global mem - output[ idx ] = outval0 ; - output[ idx+1 ] = outval1 ; - output[ idx+2 ] = outval2 ; - output[ idx+3 ] = outval3 ; - output[ idx+4 ] = outval4 ; - output[ idx+5 ] = outval5 ; - output[ idx+6 ] = outval6 ; - output[ idx+7 ] = outval7 ; + // 8 pixels per thread --> global mem + output[ idx++ ] = outval0 ; + output[ idx++ ] = outval1 ; + output[ idx++ ] = outval2 ; + output[ idx++ ] = outval3 ; + output[ idx++ ] = outval4 ; + output[ idx++ ] = outval5 ; + output[ idx++ ] = outval6 ; + output[ idx++ ] = outval7 ; } diff --git a/BookGPU/Chapters/chapter4/code/convoSepShV.cu b/BookGPU/Chapters/chapter4/code/convoSepShV.cu index a8bf71e..3dc2f1b 100644 --- a/BookGPU/Chapters/chapter4/code/convoSepShV.cu +++ b/BookGPU/Chapters/chapter4/code/convoSepShV.cu @@ -53,5 +53,5 @@ __global__ void kernel_convoSepShx8pV(unsigned char *output, int j_dim, int r) output[ idx++ ] = outval4 ; output[ idx++ ] = outval5 ; output[ idx++ ] = outval6 ; - output[ idx++ ] = outval7 ; + output[ idx ] = outval7 ; } diff --git a/BookGPU/Chapters/chapter4/code/convoSepShV.cu~ b/BookGPU/Chapters/chapter4/code/convoSepShV.cu~ index 4437a56..a8bf71e 100644 --- a/BookGPU/Chapters/chapter4/code/convoSepShV.cu~ +++ b/BookGPU/Chapters/chapter4/code/convoSepShV.cu~ @@ -2,9 +2,10 @@ __global__ void kernel_convoSepShx8pV(unsigned char *output, int j_dim, int r) { int ic, jc, p; int k = 2*r+1 ; - float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0, outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ; - int bdimX = blockDim.x<<3 ; - int tidX = threadIdx.x<<3 ; + float outval0=0.0, outval1=0.0, outval2=0.0, outval3=0.0 ; + float outval4=0.0, outval5=0.0, outval6=0.0, outval7=0.0 ; + int bdimX = blockDim.x<<3 ; // all packets width + int tidX = threadIdx.x<<3 ; // one packet offset // absolute coordinates of the base point int j = (__umul24(blockIdx.x,blockDim.x) + threadIdx.x)<<3 ; @@ -44,7 +45,7 @@ __global__ void kernel_convoSepShx8pV(unsigned char *output, int j_dim, int r) outval7 += valMask*roi8p[ baseRoi +7 ] ; } - // 8 pixel par thread --> global mem + // 8 pixels per thread --> global mem output[ idx++ ] = outval0 ; output[ idx++ ] = outval1 ; output[ idx++ ] = outval2 ;