X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/blobdiff_plain/b0cfcc742771497c83313352b59170ead2f99f40..32cc2188d02725dd461b5ab139938bb89af67469:/BookGPU/Chapters/chapter3/ch3.tex

diff --git a/BookGPU/Chapters/chapter3/ch3.tex b/BookGPU/Chapters/chapter3/ch3.tex
index 2afb337..8b7b55e 100755
--- a/BookGPU/Chapters/chapter3/ch3.tex
+++ b/BookGPU/Chapters/chapter3/ch3.tex
@@ -1,4 +1,4 @@
-\chapterauthor{Zulu pero}{Zulumachine Institute}
+\chapterauthor{Gilles Perrot}{FEMTO-ST Institute}
 %\graphicspath{{img/}}
 
 
@@ -112,6 +112,21 @@ Obviously, our code originally accepts various image dimensions and can process
 However, so as to propose concise and more readable code, we will assume the following limitations:
 8 or 16~bit-coded gray-level input images whose dimensions $H\times W$ are multiples of 512 pixels. 
 
+\begin{algorithm}
+ \SetNlSty{textbf}{}{:}
+ allocate and populate CPU memory \textbf{h\_in}\;\\
+ allocate CPU pinned-memory \textbf{h\_out}\;\\
+ allocate GPU global memory \textbf{d\_out}\;\\
+ declare GPU texture reference \textbf{tex\_img\_in}\;\\
+ allocate GPU array in global memory \textbf{array\_img\_in}\;\\
+ bind GPU array \textbf{array\_img\_in} to texture \textbf{tex\_img\_in}\;\\
+ copy data from \textbf{h\_in} to \textbf{array\_img\_in}\label{algo:memcopy:H2D}\; \\
+ kernel\kl gridDim,blockDim\kr()\tcc*[f]{outputs to d\_out}\label{algo:memcopy:kernel}\;\\
+ copy data from \textbf{d\_out} to \textbf{h\_out} \label{algo:memcopy:D2H}\;\\
+\caption{Global memory management on CPU and GPU sides.}
+\label{algo:memcopy}
+ \end{algorithm}
+
 \section{Data transfers, memory management.}
 This section deals with the following issues: 
 \begin{enumerate}
@@ -132,21 +147,6 @@ Listing \ref{lst:fkern1} gives a minimal kernel skeleton that will serve as the
 The instruction in line 8 combines writing the output gray-level value into global memory and fetching the input gray-level value from 2-D texture memory.
 The Makefile given in Listing \ref{lst:mkfile} shows how to adapt examples given in SDK.
 
-\begin{algorithm}
- \SetNlSty{textbf}{}{:}
- allocate and populate CPU memory \textbf{h\_in}\;
- allocate CPU pinned-memory \textbf{h\_out}\;
- allocate GPU global memory \textbf{d\_out}\;
- declare GPU texture reference \textbf{tex\_img\_in}\;
- allocate GPU array in global memory \textbf{array\_img\_in}\;
- bind GPU array \textbf{array\_img\_in} to texture \textbf{tex\_img\_in}\;
- copy data from \textbf{h\_in} to \textbf{array\_img\_in}\label{algo:memcopy:H2D}\; 
- kernel\kl gridDim,blockDim\kr()\tcc*[f]{outputs to d\_out}\label{algo:memcopy:kernel}\;
- copy data from \textbf{d\_out} to \textbf{h\_out} \label{algo:memcopy:D2H}\;
-\caption{Global memory management on CPU and GPU sides.}
-\label{algo:memcopy}
-\end{algorithm}
-
 \lstinputlisting[label={lst:main1},caption=Generic main.cu file used to launch CUDA kernels]{Chapters/chapter3/code/mainSkel.cu}
 
 \lstinputlisting[label={lst:fkern1},caption=fast\_kernels.cu file featuring one kernel skeleton]{Chapters/chapter3/code/kernSkel.cu}
@@ -313,36 +313,57 @@ The diagram of Figure \ref{fig:compMedians1} summarizes these first results. Onl
 \end{figure}
 
 \subsection{Further optimization}
-Running the above register-only 3$\times$3 median filter through the NVidia CUDA profiler teaches us that the memory throughput achieved by the kernel remains quite low. To improve this, two methods can be used: one is to increase the number of concurrent threads by reducing the number of registers used, the other to have each thread process more data which can be achieved by outputting the gray-level value of two pixels or more.
+Running the above register-only 3$\times$3 median filter through the NVidia CUDA profiler teaches us that the memory throughput achieved by the kernel remains quite low. To improve this, two methods can be used: 
+\begin{itemize}
+\item increasing the number of concurrent threads, which can be achieved by reducing the number of registers used by each thread.
+\item having each thread process more data which can be achieved at thread level by processing and outputting the gray-level value of two pixels or more.
+\end{itemize}
+
+ 
 \subsubsection{Reducing register count}
 Our current kernel (\texttt{kernelMedian3RegTri9}) uses one register per gray-level value, which amounts to 9 registers for the entire 3$\times$3 window. 
-This count can be reduced by use of an iterative sorting process called \textit{forgetful selection}, where both \textit{extrema} are eliminated at each sorting stage, until only 3 elements remain. The question is to find out the minimal register count $k_n$ that allows the selection of the median amoung $n^2$ values. The answer can be evaluated  considering that, when eliminating the maximum and the minimum values, one has to make sure not to eliminate the global median value, e.g. $k_n=\lceil n^2/2\rceil+1$.
-%To ensure this, the number of values that are not part of the process must remain lower than the number of values that would have had an index higher (or lower) than the middle one in the fully sorted $n^2$ value vector. 
+This count can be reduced by use of an iterative sorting process called \textit{forgetful selection}, where both \textit{extrema} are eliminated at each sorting stage, until only 3 elements remain. The question is to find out the minimal register count $k_n$ that allows the selection of the median amoung $n^2$ values. The answer can be evaluated  considering that, when eliminating the maximum and the minimum values, one has to make sure not to eliminate the global median value. Such a situation is illustrated in Figure \ref{fig:forgetful_selection} for a  3$\times$3 median filter. For better comprehension, the 9 elements of the  3$\times$3 pixel window have been represented in a row.
+\begin{figure}
+   \centering
+   \includegraphics[width=6cm]{Chapters/chapter3/img/forgetful_selection.png}
+   \caption{Forgetful selection with the minimal element register count. Illustration for 3$\times$3 pixel window represented in a row and supposed sorted.}
+   \label{fig:forgetful_selection}
+\end{figure}
+We must remember that, in the fully sorted vector, the median value will have the middle index e.g. $\lfloor n^2/2\rfloor$. 
+Moreover, assuming that both \textit{extrema} are eliminated from the first $k$ elements and that the global median is one of them would mean that:
+\begin{itemize}
+\item if the global median was the minimum among the $k$ elements, then at least $k-1$ elements would have a higher index. Considering the above median definition, at least $k-1$ elements should also have a lower index in the entire vector.
+\item if the global median was the maximum among the $k$ elements, then at least $k-1$ elements would have a lower index. Considering the above median definition, at least $k-1$ elements should also have a higher index in the entire vector.
+\end{itemize}
+Therefore, the number $k$ of elements that are part of the first selection stage can be defined by the condition 
+$$n^2-k \leq \lfloor \frac{n^2}{2} \rfloor -1$$
+which leads to:
+$$k_n=\lceil \frac{n^2}{2}\rceil+1 $$
 This rule can be applied to the first eliminating stage and remains true with the next ones as each stage suppresses exactly two values.
 In our 3$\times$3 pixel window example, the minimum register count becomes $k_9=\lceil 9/2\rceil+1 = 6$.
 
 The \textit{forgetful selection} method, used in \cite{mcguire2008median} does not imply full sorting of values, but only selecting minimum and maximum values, which, at the price of a few iteration steps ($n^2-k$), reduces arithmetic complexity.
 Listing \ref{lst:medianForget1pix3} details this process where forgetful selection is achieved by use of simple 2-value sorting function ($s()$, lines 1 to 5) that swaps input values if necessary. Moreover, whenever possible, in order to increase the Instruction-Level Parallelism, successive calls to $s()$ are done with independant elements as arguments. This is illustrated by the macro definitions of lines 7 to 14.
 
-\lstinputlisting[label={lst:medianForget1pix3},caption= 3$\times$3 median filter kernel using the minimum register count of 6 and finding the median value by forgetful selection method]{Chapters/chapter3/code/kernMedianForget1pix3.cu}
+\lstinputlisting[label={lst:medianForget1pix3},caption= 3$\times$3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method]{Chapters/chapter3/code/kernMedianForget1pix3.cu}
 
 Our such modified kernel provides significantly improved runtimes: a speedup of around 16\% is obtained, and pixel throughput reaches around 1000~MPixel/s on C2070.
 
 \subsubsection{More data output per thread}
-In the case of a kernel achieving an effective memory throughput much lower than the peak value, and if enough threads are run, another technique may help hiding memory latency and thus leverage performance: one thread produces multiple pixel outputs.
-Attentive readers should notice that it would increase the register count per thread. That's true, but dividing thread block size by the same quantity allow, at least, to keep the same register count per block, which is the parallelism limiting factor.
-Moreover, it is now possible to take advantage of the window overlapping, first illustrated In Figure \ref{fig:median_overlap}, and more detailed in Figure \ref{fig:median3_overlap}. As the selection is first processed on the first 6 gray-level values and as it is exactly the number of pixels that overlap between two neighbor window of adjacent pixels, it allows to save 6 texture fetches and one \texttt{minmax6} selection per thread. Again, speedup is expected through the modified kernel source code and the associated grid dimensions presented in Listing \ref{lst:medianForget2pix3}. Important differences to be noticed are pixel coordinates computation given thread index. As each thread has to process two pixels, the number of threads in each block is divided by 2, while the grid size remains the same. Consequently, in kernel code, each thread of block coordinates $(tx, ty)$ will be in charge of processing pixels of block coordinates $(2tx, ty)$ and $(2tx+1, ty)$; lines 5 and 6 implement this.
+In the case of a kernel achieving an effective memory throughput value far from the GPU peak value, and if enough threads are run, another technique may help hiding memory latency and thus leverage performance: make sure that each thread generates multiple pixel outputs.\\
+Attentive readers could remark that it would increase the register count per thread, which can be compensated by dividing thread block size accordingly, thus allowing to keep the same register count per block.
+Moreover, it is now possible to take advantage of window overlapping, first illustrated In Figure \ref{fig:median_overlap}, and further detailed in Figure \ref{fig:median3_overlap}. As the selection is first processed on the first 6 gray-level values, e.g. exactly the number of pixels that overlap between the neighborhoods of two adjacent center pixels, 6 texture fetches and one \texttt{minmax6} selection per thread can be saved. There again, some speedup can be  expected through our modified kernel source code presented in Listing \ref{lst:medianForget2pix3}. One important difference lies in the way pixel coordinates are computed from thread indexes. As each thread has to process two pixels, the number of threads in each block is divided by 2, while the grid size remains unchanged. Consequently, in our kernel code, each thread whose block-related coordinates are $(tx, ty)$ will be in charge of processing pixels of block-related coordinates $(2tx, ty)$ and $(2tx+1, ty)$; lines 5 and 6 implement this.
 
 \begin{figure}
    \centering
    \includegraphics[width=4cm]{Chapters/chapter3/img/median3_overlap.png}
-   \caption{Illustration of how window overlapping is used to combine 2 pixel selections in 3$\times$3 median kernel.}
+   \caption{Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times$3 median kernel.}
    \label{fig:median3_overlap}
 \end{figure}
 
-\lstinputlisting[label={lst:medianForget2pix3},caption=kernel 3$\times$3 median filter processing 2 output pixel values per thread by a combined forgetfull selection.]{Chapters/chapter3/code/kernMedian2pix3.cu}
+\lstinputlisting[label={lst:medianForget2pix3},caption=3$\times$3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.]{Chapters/chapter3/code/kernMedian2pix3.cu}
 
-Running this ultimate kernel saves another 10\% of runtime, as shown in Figure \ref{fig:compMedians2} and provides the best peak pixel throughput known so far on C2070 of 1155~Mpixel/s which is 86\% of the maximum effective throughput.
+Running this $3\times 3$ kernel saves another 10\% runtime, as shown in Figure \ref{fig:compMedians2} and provides the best peak pixel throughput value known so far on C2070: 1155~Mpixel/s which is 86\% the maximum effective throughput.
 
 \begin{figure}
    \centering
@@ -351,24 +372,22 @@ Running this ultimate kernel saves another 10\% of runtime, as shown in Figure \
    \label{fig:compMedians2}
 \end{figure}
 
-\section{Median filter 5$\times$5 and more}
-Considering the maximum register count allowed dper thread (63) and trying to push this technique to its limit would let us design median filters up to 9$\times$9 pixel window. This maximum would actually use  $k_{81}=\lceil 81/2\rceil+1 = 42$ registers per thread plus a few ones used by the compiler to complete arithmetic operations (9) leading to a total register count of 51. 
-This would oviously forbids us to compute more than one pixel per thread, but also would limit the number of concurrent threads per block. Our measurements show that this technique is still worth using for the 5$\times$5 median but that larger window sizes could take advantage of using shared memory.
-The next two sections will first detail the particular case of the 5$\times$5 median through register-only method and then a generic kernel for larger window sizes.
+\section{A 5$\times$5 and more median filter }
+Considering the maximum register count allowed per thread (63) and trying to push this technique to its limit potentially allows designing up to 9$\times$9 median filters. Such maximum would actually use  $k_{81}=\lceil 81/2\rceil+1 = 42$ registers per thread plus 9, used by the compiler to complete arithmetic operations. This leads to a total register count of 51, which would forbid to compute more than one pixel per thread, but also would limit the number of concurrent threads per block. Our measurements show that this technique is still worth using for the 5$\times$5 median. As for larger window sizes, one option could be using shared memory.
+The next two sections will first detail the particular case of the 5$\times$5 median through register-only method and eventually a generic kernel for larger window sizes.
 
-\subsection{Median filter 5$\times$5: register only }
-The minimum register count allowing to apply the forgetfull selection method to 5$\times$5 median filter is $k_{25}=\lceil 25/2\rceil+1 = 14$. Moreover, two adjacent overlapping windows share 20 pixels ($n^2-one\_column$) so that, when processing 2 pixels at once, from the first selection stage with 14 common values to the passing of the last common value, a count of 6 common selection stages can be carried out. That allows to limit the register count to 14+8=22 per thread. Figure \ref{fig:median5overlap}
+\subsection{A register-only 5$\times$5 median filter \label{sec:median5}}
+The minimum register count required to apply the forgetful selection method to a 5$\times$5 median filter is $k_{25}=\lceil 25/2\rceil+1 = 14$. Moreover, two adjacent overlapping windows share 20 pixels ($n^2-one\_column$) so that, when processing 2 pixels simultaneously, a count of 6 common selection stages can be carried out from the first selection stage with 14 common values to the processing of the last common value. That allows to limit register count to 14+8=22 per thread. Figure \ref{fig:median5overlap} describes the distribution of overlapping pixels, implemented in Listing \ref{lst:medianForget2pix5}: common selection stages take place from line 25 to line 37, while the remaining separate selection stages occur between lines 45 and 62 after the separation of line 40.
 \begin{figure}
    \centering
-   \includegraphics[width=8cm]{Chapters/chapter3/img/median5_overlap.png}
-   \caption{Reduction of register count in 5$\times$5 register only median kernel, outputting 2 pixel at once. The first 6 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.}
+   \includegraphics[width=6cm]{Chapters/chapter3/img/median5_overlap.png}
+   \caption{Reducing register count in a 5$\times$5 register-only median kernel outputting 2 pixels simultaneously. The first 6 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.}
    \label{fig:median5overlap}
 \end{figure}
-Listing \ref{lst:medianForget2pix5} reproduces the kernel \texttt{kernel\_medianForget2pix5} code where the common selection stages take place from line XX to line YY. The remaining separate selection stages occur between lines XX and YY after the separation of line GG.
 
 \lstinputlisting[label={lst:medianForget2pix5},caption=kernel 5$\times$5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.]{Chapters/chapter3/code/kernMedian2pix5.cu}
 
-Timing results follow the same variations with image size than previous ones. That is why  Table \ref{tab:median5comp} shows only throughput values obtained for C2070 card and 4096$\times$4096 pixel image.
+Timing results follow the same variations with image size as in previously presented kernels. That is why  Table \ref{tab:median5comp} shows only throughput values obtained for C2070 card and 4096$\times$4096 pixel image.
 
 \begin{table}[h]
 %\newlength\savedwidth
@@ -385,44 +404,20 @@ Timing results follow the same variations with image size than previous ones. Th
  \shortstack{\textbf{Throughput}\\\textbf{(MP/s)}}&551&738&152&540\\\hline
 \end{tabular}
 }  
-\caption{Performance of various 5$\times$5 median kernel implementations, applied on 4096$\times$4096 pixel image with C2070 GPU card..}
+\caption{Performance of various 5$\times$5 median kernel implementations, applied on 4096$\times$4096 pixel image with C2070 GPU card.}
 \label{tab:median5comp}
 \end{table}  
 
-\subsection{True median filter n$\times$n}
-Shared memory can represent an efficient way to reduce global or texture loads, but it is also a limiting factor for performance.
-On Fermi GPUs (as C2070), a maximum of 48~kB of per block shared memory is avalaible. With 16-bit coded gray levels, that allows to store up to 24576 values, which can be organised as a square of 156$\times$156 pixels maximum.
-A point is that it is not efficient to use the shared memory at its maximum, as it would reduce the number of blocks beeing run in parallel on each SM.
-Another point is that it is not  possible to avoid bank conflicts when designing a generic median kernel.
-Thus, the most efficient way to code a generic, large window, median filter, is to do without shared memory but use texture direct fetching.
-Listing \ref{lst:medianForgetGeneric} reproduce such a code, where the most interesting part is between lines XX and YY, where the forgetfull selection has been generalized to an arbitrary window size.
-Performance results summarized in table \ref{tab:medianForgetGeneric} demonstrate that such a method is far from beeing as efficient as small fixed-size implementations. 
-
-\begin{table}[h]
-%\newlength\savedwidth
-\newcommand\whline{\noalign{\global\savedwidth
-  \arrayrulewidth\global\arrayrulewidth 1.5pt}
-  \hline \noalign{\global\arrayrulewidth
-  \savedwidth}
-}
-\centering
-{\scriptsize
-\begin{tabular}{|l||c|c|c|c|}
-\hline
-\shortstack{\textbf{Window size}\\(in pixels)}&\textbf{121}&\textbf{169}&\textbf{225}&\textbf{441}\\\whline
- \shortstack{\textbf{Throughput}\\\textbf{(MP/s)}}& & & & \\\hline
-\end{tabular}
-}  
-\caption{Performance of generic median kernel applied to various window sizes on 4096$\times$4096 pixel image.}
-\label{tab:medianForgetGeneric}
-\end{table} 
-
-\lstinputlisting[label={lst:medianForgetGeneric},caption= generic median kernel by forgetfull selection.]{Chapters/chapter3/code/kernMedianForgetGeneric.cu}
+\subsection{Fast approximated n$\times$n median filter }
+Large window median filters are less widespread and used in more specific fields, such as digital microscopy: for example in \cite{paper_bio_background}, an estimation of the background gray-level intensity is achieved through a $111\times 111$ median filtering (processed in around 2s for a 4MPixel image). In such cases, a possible technique is to split median selection into two separate 1-D stages: one in the vertical direction and the other in the horizontal direction. Image processing specialists may object that this method does not select the actual median value. This is true but, in the case of large window sizes and \textit{real-life} images, the so selected value is statistically near the actual median value and often represents an acceptable approximation. Such a filter is sometimes called \textit{smoother}.
 
-\subsection{Fast approximated median filter n$\times$n}
-If faster process is required, a possible technique is to split median selection in two separate 1-D stages: one in the vertical direction and the other in the horizontal direction. Image processing specialists would say that this method does not selects the actual median value. They would be right, but for large window sizes and \textit{real-life} images, the so selected value is statically near the true median value and often represents an acceptable approximation.
-In this particular case, we use a Torben Morgensen sorting algorithm, as it only needs a few and fixed register count. 
+As explained earlier in this section, the use of large window median filters rules out register-only implementation,
+which suggests to privilege the use of shared memory. The 1-D operation almost completely avoids bank conflicts in shared memory accesses. 
+Furthermore, the above-described forgetful selection method cannot be used anymore, as too many registers would be required.\\Instead, the Torben Morgensen sorting algorithm is used, as its required register count is both low and constant, and avoids the use of a local vector, unlike histogram-based methods.
 
+Listing \ref{lst:medianSeparable} presents a kernel code that implements the above considerations and achieves a 1-D vertical $n \times 1$ median filter. The shared memory vector is declared as \texttt{extern} (Line 16) as its size is determined at runtime and passed to the kernel call as an argument. Lines 20 to 29 perform data prefetching, including the $2n$-row halo ($n$ at the bottom and $n$ at the top of each block). Then one synchronization barrier is mandatory (line 31) to ensure that all needed data is ready prior to its use by the different threads.
+Torben Morgensen sorting takes place between lines 37 and 71 and eventually, the transposed output value is stored in global memory at line 73. Outputting the transposed image in global memory saves time and allows to re-use the same kernel to achieve the second step, e.g 1-D horizontal $n \times 1$ median filtering. The final transpose is done at transfer time, when copying data from GPU to CPU memory, which once more saves time while actually generating the expected image. 
+It has to be noticed that this smoother, unlike the technique we proposed for fixed-size median filters, can not be considered as a state-of-the-art technique, as for example the one presented in \cite{4287006}. However, it may be considered as a good, easy to use and efficient alternative as confirmed by the results presented in Table \ref{tab:medianSeparable}. Pixel throughput values achieved by our kernel, though not constant with window size, remain very competitive if window size is kept under $120\times 120$ pixels (in \cite{4287006}, pixel throughput is around 7MP/s).  
 \begin{table}[h]
 %\newlength\savedwidth
 \newcommand\whline{\noalign{\global\savedwidth
@@ -434,17 +429,16 @@ In this particular case, we use a Torben Morgensen sorting algorithm, as it only
 {\scriptsize
 \begin{tabular}{|l||c|c|c|c|}
 \hline
-\shortstack{\textbf{Window size}\\(in pixels)}&\textbf{121}&\textbf{169}&\textbf{225}&\textbf{441}\\\whline
- \shortstack{\textbf{Throughput}\\\textbf{(MP/s)}}& & & & \\\hline
+\shortstack{\textbf{Window edge size}\\(in pixels)}&\textbf{41}&\textbf{81}&\textbf{111}&\textbf{121}\\\whline
+ \shortstack{\textbf{Throughput}\\\textbf{(MP/s)}}&54 &27 & 20& 18\\\hline
 \end{tabular}
 }  
-\caption{Performance of generic pseudo separable median kernel applied to various window sizes on 4096$\times$4096 pixel image.}
+\caption{Measured performance of one generic pseudo-separable median kernel applied to 4096$\times$4096 pixel image with various window sizes.}
 \label{tab:medianSeparable}
 \end{table} 
 
 \lstinputlisting[label={lst:medianSeparable},caption= generic pseudo median kernel.]{Chapters/chapter3/code/kernMedianSeparable.cu}
 
-
 \section{Glossary}
 \begin{Glossary}
 \item[CUDA] Compute Unified Device Architecture.