\frenchspacing
\tolerance=5000
-\include{Chapters/chapter1/preamble}
+%\include{Chapters/chapter1/preamble}
\include{Chapters/chapter5/preamble}
\newcommand{\scalprod}[2]%
\newcommand{\mymat}[1]{\mathcal{#1}}
\newcommand{\myvec}[1]{\mathbf{#1}}
-
- \lstset{
- basicstyle=\footnotesize\ttfamily, % Standardschrift
- %numbers=left, % Ort der Zeilennummern
- numberstyle=\tiny, % Stil der Zeilennummern
- %stepnumber=2, % Abstand zwischen den Zeilennummern
- numbersep=5pt, % Abstand der Nummern zum Text
- tabsize=2, % Groesse von Tabs
- extendedchars=true, %
- breaklines=true, % Zeilen werden Umgebrochen
- keywordstyle=\color{red},
- frame=b,
- % keywordstyle=[1]\textbf, % Stil der Keywords
- % keywordstyle=[2]\textbf, %
- % keywordstyle=[3]\textbf, %
- % keywordstyle=[4]\textbf, \sqrt{\sqrt{}} %
- stringstyle=\color{white}\ttfamily, % Farbe der String
- showspaces=false, % Leerzeichen anzeigen ?
- showtabs=false, % Tabs anzeigen ?
- xleftmargin=17pt,
- framexleftmargin=17pt,
- framexrightmargin=5pt,
- framexbottommargin=4pt,
- %backgroundcolor=\color{lightgray},
- showstringspaces=false % Leerzeichen in Strings anzeigen ?
- }
- \lstloadlanguages{% Check Dokumentation for further languages ...
- %[Visual]Basic
- %Pascal
- C
- %C++
- %XML
- %HTML
- %Java
- }
- %\DeclareCaptionFont{blue}{\color{blue}}
-
- %\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}}
+\lstset{morekeywords={HALF4,HALF3,float2,float3,float4,half,half2,half3,half4,tex2D,dim3,endif,threadIdx,blockIdx,blockDim,gridDim,Dim3,__host__,__global__,__shared__,float}}
+\lstset{
+ language=C,
+ columns=fixed,
+ basicstyle=\footnotesize\ttfamily,
+ numbers=left,
+ firstnumber=1,
+ numberstyle=\tiny,
+ stepnumber=5,
+ numbersep=5pt,
+ tabsize=3,
+ extendedchars=true,
+ breaklines=true,
+ keywordstyle=\textbf,
+ frame=single,
+ % keywordstyle=[1]\textbf,
+ %identifierstyle=\textbf,
+ commentstyle=\color{white}\textbf,
+ stringstyle=\color{white}\ttfamily,
+ % xleftmargin=17pt,
+ % framexleftmargin=17pt,
+ % framexrightmargin=5pt,
+ % framexbottommargin=4pt,
+ backgroundcolor=\color{lightgray},
+ basicstyle=\scriptsize
+ }
+
+ %% \lstset{
+%% basicstyle=\footnotesize\ttfamily, % Standardschrift
+%% %numbers=left, % Ort der Zeilennummern
+%% numberstyle=\tiny, % Stil der Zeilennummern
+%% %stepnumber=2, % Abstand zwischen den Zeilennummern
+%% numbersep=5pt, % Abstand der Nummern zum Text
+%% tabsize=2, % Groesse von Tabs
+%% extendedchars=true, %
+%% breaklines=true, % Zeilen werden Umgebrochen
+%% keywordstyle=\color{red},
+%% frame=b,
+%% % keywordstyle=[1]\textbf, % Stil der Keywords
+%% % keywordstyle=[2]\textbf, %
+%% % keywordstyle=[3]\textbf, %
+%% % keywordstyle=[4]\textbf, \sqrt{\sqrt{}} %
+%% stringstyle=\color{white}\ttfamily, % Farbe der String
+%% showspaces=false, % Leerzeichen anzeigen ?
+%% showtabs=false, % Tabs anzeigen ?
+%% xleftmargin=17pt,
+%% framexleftmargin=17pt,
+%% framexrightmargin=5pt,
+%% framexbottommargin=4pt,
+%% %backgroundcolor=\color{lightgray},
+%% showstringspaces=false % Leerzeichen in Strings anzeigen ?
+%% }
+%% \lstloadlanguages{% Check Dokumentation for further languages ...
+%% %[Visual]Basic
+%% %Pascal
+%% C
+%% %C++
+%% %XML
+%% %HTML
+%% %Java
+%% }
+%% %\DeclareCaptionFont{blue}{\color{blue}}
+
+%% %\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}}
\DeclareCaptionFont{white}{\color{white}}
\DeclareCaptionFormat{listing}{\colorbox[cmyk]{0.43, 0.35, 0.35,0.01}{\parbox{\textwidth}{\hspace{15pt}#1#2#3}}}
\captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white, singlelinecheck=false, margin=0pt, font={bf,footnotesize}}
\newlabel{ch13:fig:02}{{12.2}{288}}
\newlabel{ch13:list:01}{{12.1}{288}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.1}Skeleton codes of a GPU kernel and a CPU function}{288}}
+\newlabel{ch13:eq:17}{{12.18}{289}}
\@writefile{lof}{\contentsline {figure}{\numberline {12.3}{\ignorespaces Matrix constant coefficients in a three-dimensional domain.\relax }}{290}}
\newlabel{ch13:fig:03}{{12.3}{290}}
-\newlabel{ch13:eq:17}{{12.18}{290}}
\newlabel{ch13:list:02}{{12.2}{290}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.2}GPU kernels of the projected Richardson method}{290}}
-\@writefile{lof}{\contentsline {figure}{\numberline {12.4}{\ignorespaces Computation of a vector element with the projected Richardson method.\relax }}{292}}
-\newlabel{ch13:fig:04}{{12.4}{292}}
+\@writefile{lof}{\contentsline {figure}{\numberline {12.4}{\ignorespaces Computation of a vector element with the projected Richardson method.\relax }}{291}}
+\newlabel{ch13:fig:04}{{12.4}{291}}
\newlabel{ch13:list:03}{{12.3}{292}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.3}Memory access to the cache texture memory}{292}}
\@writefile{toc}{\contentsline {section}{\numberline {12.5}Experimental tests on a GPU cluster}{293}}
\newlabel{ch13:sec:05}{{12.5}{293}}
-\@writefile{lof}{\contentsline {figure}{\numberline {12.5}{\ignorespaces GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs.\relax }}{295}}
-\newlabel{ch13:fig:05}{{12.5}{295}}
+\@writefile{lof}{\contentsline {figure}{\numberline {12.5}{\ignorespaces GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs.\relax }}{294}}
+\newlabel{ch13:fig:05}{{12.5}{294}}
\@writefile{lot}{\contentsline {table}{\numberline {12.1}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 24 CPU cores.\relax }}{295}}
\newlabel{ch13:tab:01}{{12.1}{295}}
-\@writefile{lot}{\contentsline {table}{\numberline {12.2}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.\relax }}{296}}
-\newlabel{ch13:tab:02}{{12.2}{296}}
+\@writefile{lot}{\contentsline {table}{\numberline {12.2}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.\relax }}{295}}
+\newlabel{ch13:tab:02}{{12.2}{295}}
\@writefile{toc}{\contentsline {section}{\numberline {12.6}Red-Black ordering technique}{296}}
\newlabel{ch13:sec:06}{{12.6}{296}}
+\newlabel{ch13:fig:06.01}{{12.6(a)}{297}}
+\newlabel{sub@ch13:fig:06.01}{{(a)}{297}}
+\newlabel{ch13:fig:06.02}{{12.6(b)}{297}}
+\newlabel{sub@ch13:fig:06.02}{{(b)}{297}}
+\@writefile{lof}{\contentsline {figure}{\numberline {12.6}{\ignorespaces Red-Black ordering for computing the iterate vector elements in a three-dimensional space.\relax }}{297}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Red-Black ordering on x, y and z axises}}}{297}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Red-Black ordering on y axis}}}{297}}
\newlabel{ch13:list:04}{{12.4}{297}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.4}GPU kernels of the projected Richardson method using the red-black technique}{297}}
-\newlabel{ch13:fig:06.01}{{12.6(a)}{298}}
-\newlabel{sub@ch13:fig:06.01}{{(a)}{298}}
-\newlabel{ch13:fig:06.02}{{12.6(b)}{298}}
-\newlabel{sub@ch13:fig:06.02}{{(b)}{298}}
-\@writefile{lof}{\contentsline {figure}{\numberline {12.6}{\ignorespaces Red-Black ordering for computing the iterate vector elements in a three-dimensional space.\relax }}{298}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Red-Black ordering on x, y and z axises}}}{298}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Red-Black ordering on y axis}}}{298}}
\@writefile{lot}{\contentsline {table}{\numberline {12.3}{\ignorespaces Execution times in seconds of the parallel projected Richardson method using read-black ordering technique implemented on a cluster of 12 GPUs.\relax }}{299}}
\newlabel{ch13:tab:03}{{12.3}{299}}
\@writefile{lof}{\contentsline {figure}{\numberline {12.7}{\ignorespaces Weak scaling of both synchronous and asynchronous algorithms of the projected Richardson method using red-black ordering technique.\relax }}{300}}
\newlabel{lst:kernelMedian3RegTri9}{{4.2}{36}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}3$\times $3 median filter kernel using one register per neighborhood pixel and bubble sort}{36}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Further optimization}{36}}
+\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{36}}
\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs on GPU C2070 and CPU for generic median, 3$\times $3 median register-only and \textit {libJacket}.\relax }}{37}}
\newlabel{fig:compMedians1}{{4.4}{37}}
-\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{37}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{38}}
-\newlabel{fig:forgetful_selection}{{4.5}{38}}
-\newlabel{lst:medianForget1pix3}{{4.3}{38}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{38}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{39}}
-\newlabel{fig:forgetful3}{{4.6}{39}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{37}}
+\newlabel{fig:forgetful_selection}{{4.5}{37}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{38}}
+\newlabel{fig:forgetful3}{{4.6}{38}}
+\newlabel{lst:medianForget1pix3}{{4.3}{39}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{39}}
\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{40}}
\newlabel{fig:median3_overlap}{{4.7}{40}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{40}}
-\newlabel{lst:medianForget2pix3}{{4.4}{41}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{41}}
-\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{41}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{42}}
-\newlabel{fig:compMedians2}{{4.8}{42}}
+\newlabel{lst:medianForget2pix3}{{4.4}{40}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{40}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{41}}
+\newlabel{fig:compMedians2}{{4.8}{41}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{42}}
\newlabel{sec:median5}{{4.5.1}{42}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{42}}
\newlabel{lst:medianForget2pix5}{{4.5}{42}}
\newlabel{fig:median5overlap}{{4.9}{43}}
\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.\relax }}{43}}
\newlabel{fig:median5overlap}{{4.10}{43}}
-\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{45}}
-\newlabel{tab:median5comp}{{4.2}{45}}
+\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{44}}
+\newlabel{tab:median5comp}{{4.2}{44}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{45}}
-\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{46}}
-\newlabel{tab:medianSeparable}{{4.3}{46}}
-\newlabel{lst:medianSeparable}{{4.6}{46}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{46}}
-\newlabel{img:sap_example_ref}{{4.11(a)}{47}}
-\newlabel{sub@img:sap_example_ref}{{(a)}{47}}
-\newlabel{img:sap_example_sep_med3}{{4.11(b)}{47}}
-\newlabel{sub@img:sap_example_sep_med3}{{(b)}{47}}
-\newlabel{img:sap_example_sep_med5}{{4.11(c)}{47}}
-\newlabel{sub@img:sap_example_sep_med5}{{(c)}{47}}
-\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{47}}
-\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{47}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{47}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{47}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{47}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{47}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{47}}
-\newlabel{fig:sap_examples2}{{4.11}{47}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{49}}
+\newlabel{lst:medianSeparable}{{4.6}{45}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{45}}
+\newlabel{img:sap_example_ref}{{4.11(a)}{46}}
+\newlabel{sub@img:sap_example_ref}{{(a)}{46}}
+\newlabel{img:sap_example_sep_med3}{{4.11(b)}{46}}
+\newlabel{sub@img:sap_example_sep_med3}{{(b)}{46}}
+\newlabel{img:sap_example_sep_med5}{{4.11(c)}{46}}
+\newlabel{sub@img:sap_example_sep_med5}{{(c)}{46}}
+\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{46}}
+\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{46}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{46}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{46}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{46}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{46}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{46}}
+\newlabel{fig:sap_examples2}{{4.11}{46}}
+\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{47}}
+\newlabel{tab:medianSeparable}{{4.3}{47}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{48}}
\@setckpt{Chapters/chapter3/ch3}{
\setcounter{page}{50}
\setcounter{equation}{0}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Listings
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\lstset{
- language=C,
- columns=fixed,
- basicstyle=\footnotesize\ttfamily,
- numbers=left,
- firstnumber=1,
- numberstyle=\tiny,
- stepnumber=5,
- numbersep=5pt,
- tabsize=3,
- extendedchars=true,
- breaklines=true,
- keywordstyle=\textbf,
- frame=single,
- % keywordstyle=[1]\textbf,
- %identifierstyle=\textbf,
- commentstyle=\color{white}\textbf,
- stringstyle=\color{white}\ttfamily,
- % xleftmargin=17pt,
- % framexleftmargin=17pt,
- % framexrightmargin=5pt,
- % framexbottommargin=4pt,
- backgroundcolor=\color{lightgray},
- }
+%% \lstset{
+%% language=C,
+%% columns=fixed,
+%% basicstyle=\footnotesize\ttfamily,
+%% numbers=left,
+%% firstnumber=1,
+%% numberstyle=\tiny,
+%% stepnumber=5,
+%% numbersep=5pt,
+%% tabsize=3,
+%% extendedchars=true,
+%% breaklines=true,
+%% keywordstyle=\textbf,
+%% frame=single,
+%% % keywordstyle=[1]\textbf,
+%% %identifierstyle=\textbf,
+%% commentstyle=\color{white}\textbf,
+%% stringstyle=\color{white}\ttfamily,
+%% % xleftmargin=17pt,
+%% % framexleftmargin=17pt,
+%% % framexrightmargin=5pt,
+%% % framexbottommargin=4pt,
+%% backgroundcolor=\color{lightgray},
+%% }
%\DeclareCaptionFont{blue}{\color{blue}}
%\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}}
% lstlisting settings
%\lstset{language=c++, tabsize=4, extendedchars=false, numbers=left,stepnumber=1, breaklines=true, basicstyle=\scriptsize\ttfamily,commentstyle=\color{gray}\rm,frame=single,showstringspaces=false,morekeywords={HALF4, HALF3, float2, float3, float4, half, half2, half3, half4, tex2D,dim3, endif,threadIdx, blockIdx, blockDim, gridDim, Dim3, __host__, __global__,__shared__,float},backgroundcolor=\color{lightgray},keywordstyle=\bfseries\ttfamily,numberstyle=\tiny}
-\lstset{morekeywords={HALF4,HALF3,float2,float3,float4,half,half2,half3,half4,tex2D,dim3,endif,threadIdx,blockIdx,blockDim,gridDim,Dim3,__host__,__global__,__shared__,float}}
+%\lstset{morekeywords={HALF4,HALF3,float2,float3,float4,half,half2,half3,half4,tex2D,dim3,endif,threadIdx,blockIdx,blockDim,gridDim,Dim3,__host__,__global__,__shared__,float}}
% This is for tikz/pgf plots
iterative process and by $x^t=(x_1^t,...,x_n^t)$ the values of the $n$ elements of
the system at iteration $t$, we pass from a synchronous iterative scheme of the
form:
+%% \begin{algorithm}[H]
+%% \caption{Synchronous iterative scheme}\label{algo:ch6p2sync}
+%% \begin{Algo}
+%% $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\
+%% \textbf{for} $t=0,1,...$\\
+%% \>\textbf{for} $i=1,...,n$\\
+%% \>\>$x_{i}^{t+1}=f_{i}(x_{1}^t,...,x_i^t,...,x_{n}^t)$\\
+%% \>\textbf{endfor}\\
+%% \textbf{endfor}
+%% \end{Algo}
+%% \end{algorithm}
\begin{algorithm}[H]
\caption{Synchronous iterative scheme}\label{algo:ch6p2sync}
- \begin{Algo}
- $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\
- \textbf{for} $t=0,1,...$\\
- \>\textbf{for} $i=1,...,n$\\
- \>\>$x_{i}^{t+1}=f_{i}(x_{1}^t,...,x_i^t,...,x_{n}^t)$\\
- \>\textbf{endfor}\\
- \textbf{endfor}
- \end{Algo}
+ $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\;
+ \For{ $t=0,1,...$} {
+ \For{ $i=1,...,n$}{
+ $x_{i}^{t+1}=f_{i}(x_{1}^t,...,x_i^t,...,x_{n}^t)$\;
+ }
+ }
\end{algorithm}
+
+
\noindent
-to an asynchronous iterative scheme of the form:
+to an asynchronous iterative scheme of the form:\\
+%% \begin{algorithm}[H]
+%% \caption{Asynchronous iterative scheme}\label{algo:ch6p2async}
+%% \begin{Algo}
+%% $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\
+%% \textbf{for} $t=0,1,...$\\
+%% \>\textbf{for} $i=1,...,n$\\
+%% \>\>$x_{i}^{t+1}=\left\{
+%% \begin{array}[h]{ll}
+%% x_i^t & \text{if } i \text{ is \emph{not} updated at iteration } i\\
+%% f_i(x_1^{s_1^i(t)},...,x_n^{s_n^i(t)}) & \text{if } i \text{ is updated at iteration } i
+%% \end{array}
+%% \right.$\\
+%% \>\textbf{endfor}\\
+%% \textbf{endfor}
+%% \end{Algo}
+%% \end{algorithm}
\begin{algorithm}[H]
\caption{Asynchronous iterative scheme}\label{algo:ch6p2async}
- \begin{Algo}
- $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\
- \textbf{for} $t=0,1,...$\\
- \>\textbf{for} $i=1,...,n$\\
- \>\>$x_{i}^{t+1}=\left\{
+ $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\;
+ \For {$t=0,1,...$} {
+ \For{ $i=1,...,n$} {
+ $x_{i}^{t+1}=\left\{
\begin{array}[h]{ll}
x_i^t & \text{if } i \text{ is \emph{not} updated at iteration } i\\
f_i(x_1^{s_1^i(t)},...,x_n^{s_n^i(t)}) & \text{if } i \text{ is updated at iteration } i
\end{array}
- \right.$\\
- \>\textbf{endfor}\\
- \textbf{endfor}
- \end{Algo}
+ \right.$
+ }
+ }
\end{algorithm}
where $s_j^i(t)$ is the iteration number of the production of the value $x_j$ of
element $j$ that is used on element $i$ at iteration $t$ (see for example~\cite{BT89,
\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{88}}
\newlabel{algo:ch6p1overlapseqsequence}{{6.2}{89}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{89}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{90}}
-\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{90}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{91}}
+\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{91}}
\newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{91}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{91}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{93}}
-\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{93}}
\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{93}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{94}}
+\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{94}}
\newlabel{algo:ch6p1overlapinterleaved}{{6.4}{94}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{94}}
\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{96}}
\newlabel{ch6:part2}{{6.3}{98}}
\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{98}}
\newlabel{algo:ch6p2sync}{{3}{98}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{98}}
-\newlabel{algo:ch6p2async}{{4}{98}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{99}}
-\newlabel{ch6:p2BasicAsync}{{6.3.1}{99}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{99}}
+\newlabel{algo:ch6p2async}{{4}{99}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{100}}
+\newlabel{ch6:p2BasicAsync}{{6.3.1}{100}}
\newlabel{algo:ch6p2BasicAsync}{{6.5}{100}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{100}}
\newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{101}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{101}}
-\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{102}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{102}}
+\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{103}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{103}}
\newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{103}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{103}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{104}}
-\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{104}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{105}}
+\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{105}}
\newlabel{algo:ch6p2Sync}{{6.9}{105}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{105}}
\newlabel{algo:ch6p2SyncComp}{{6.10}{106}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{106}}
-\newlabel{algo:ch6p2SyncReceptions}{{6.11}{107}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{107}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{108}}
-\newlabel{ch6:p2GPUAsync}{{6.3.3}{108}}
-\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{109}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{109}}
+\newlabel{algo:ch6p2SyncReceptions}{{6.11}{108}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{108}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{109}}
+\newlabel{ch6:p2GPUAsync}{{6.3.3}{109}}
+\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{110}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{110}}
\newlabel{algo:ch6p2syncGPU}{{6.13}{111}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{111}}
-\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{113}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{113}}
-\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{114}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{114}}
-\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{115}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{115}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{116}}
-\newlabel{sec:ch6p2expes}{{6.3.4}{116}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{117}}
-\newlabel{fig:ch6p2syncasync}{{6.6}{117}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{118}}
-\newlabel{fig:ch6p2aux}{{6.7}{118}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{119}}
-\newlabel{sec:ch6p3unify}{{6.4}{119}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{119}}
-\newlabel{sec:ch6p3resources}{{6.4.1}{119}}
-\newlabel{algo:ch6p3ORWLresources}{{6.17}{120}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{120}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{120}}
-\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{120}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{121}}
-\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{121}}
-\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{121}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{121}}
-\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{122}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{122}}
-\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{122}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{122}}
-\newlabel{algo:ch6p3ORWLtrans}{{6.21}{122}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{122}}
-\newlabel{algo:ch6p3ORWLdecl}{{6.22}{123}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{123}}
+\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{114}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{114}}
+\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{115}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{115}}
+\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{116}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{116}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{117}}
+\newlabel{sec:ch6p2expes}{{6.3.4}{117}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{118}}
+\newlabel{fig:ch6p2syncasync}{{6.6}{118}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{119}}
+\newlabel{fig:ch6p2aux}{{6.7}{119}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{120}}
+\newlabel{sec:ch6p3unify}{{6.4}{120}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{120}}
+\newlabel{sec:ch6p3resources}{{6.4.1}{120}}
+\newlabel{algo:ch6p3ORWLresources}{{6.17}{121}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{121}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{121}}
+\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{121}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{122}}
+\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{122}}
+\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{122}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{122}}
+\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{123}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{123}}
+\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{123}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{123}}
+\newlabel{algo:ch6p3ORWLtrans}{{6.21}{123}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{123}}
+\newlabel{algo:ch6p3ORWLdecl}{{6.22}{124}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{124}}
\newlabel{algo:ch6p3ORWLinit}{{6.23}{124}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{124}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{124}}
-\newlabel{sec:ch6p3tasks}{{6.4.4}{124}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{125}}
-\newlabel{ch6:conclu}{{6.5}{125}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{125}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{126}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{125}}
+\newlabel{sec:ch6p3tasks}{{6.4.4}{125}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{126}}
+\newlabel{ch6:conclu}{{6.5}{126}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{126}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{127}}
\@setckpt{Chapters/chapter6/ch6}{
-\setcounter{page}{128}
+\setcounter{page}{129}
\setcounter{equation}{0}
\setcounter{enumi}{4}
\setcounter{enumii}{0}
\setcounter{lotdepth}{1}
\setcounter{lstnumber}{17}
\setcounter{ContinuedFloat}{0}
-\setcounter{AlgoLine}{0}
+\setcounter{AlgoLine}{6}
\setcounter{algocfline}{4}
\setcounter{algocfproc}{4}
\setcounter{algocf}{4}
\def\Reals{\mathbb{R}}
\definecolor{shadecolor}{rgb}{0.95,0.95,0.95}
-\newenvironment{Algo}{\vspace{-1em}\begin{center}\begin{minipage}[h]{0.95\columnwidth}\begin{shaded}\begin{tabbing}%
- \hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\= \kill} %
- { \end{tabbing}\vspace{-1em}\end{shaded}\end{minipage}\end{center}\vspace{-1em}}
+%\newenvironment{Algo}{\vspace{-1em}\begin{center}\begin{minipage}[h]{0.95\columnwidth}\begin{shaded}\begin{tabbing}%
+% \hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\= \kill} %
+% { \end{tabbing}\vspace{-1em}\end{shaded}\end{minipage}\end{center}\vspace{-1em}}
-\lstnewenvironment{Listing}[2]{\lstset{basicstyle=\scriptsize\ttfamily,%
- breaklines=true, breakatwhitespace=true, language=C, keywordstyle=\color{black},%
- prebreak = \raisebox{0ex}[0ex][0ex]{\ensuremath{\hookleftarrow}},%
- commentstyle=\textit, numbersep=1em, numberstyle=\tiny, numbers=left,%
- numberblanklines=false, mathescape, escapechar=@, label=#1, caption={#2}}}{}
+\lstnewenvironment{Listing}[2]{\lstset{
+%% basicstyle=\scriptsize\ttfamily,%
+%% breaklines=true, breakatwhitespace=true, language=C, keywordstyle=\color{black},%
+%% prebreak = \raisebox{0ex}[0ex][0ex]{\ensuremath{\hookleftarrow}},%
+%% commentstyle=\textit, numbersep=1em, numberstyle=\tiny, numbers=left,%
+%% numberblanklines=false, mathescape, escapechar=@,
+ label=#1, caption={#2}}
+}{}
\def\N{$\mathbb N$ }
\def\R{$\mathbb R$ }
For the unified potential flow model the user will need to provide implementations of the following components; the right hand side operator for the semi-discrete free surface variables \eqref{ch7:FSorigin}, the matrix-vector operator for the discretized $\sigma$-transformed Laplace equation \eqref{ch7:TransformedLaplace}, a smoother for the multigrid relaxation step, and the potential flow solver itself, that reads initial data and advance the solution in time. In order to make the library as generic as possible, all components are template-based, which makes it possible to assemble the PDE solver by combining type definitions in the preamble of the application. An excerpt of the potential flow assembling is given in listing \ref{ch7:lst:solversetup}.
-\lstset{label=ch7:lst:solversetup,caption={Generic assembling of the potential flow solver for fully nonlinear free surface water waves.},basicstyle=\scriptsize}
+\lstset{label=ch7:lst:solversetup,caption={Generic assembling of the potential flow solver for fully nonlinear free surface water waves.}
+%,basicstyle=\scriptsize
+}
\begin{lstlisting}
// Basics
typedef double value_type;
Hereafter, the potential flow solver is aware of all component types that should be used to solve the entire PDE system, and it will be easy for developers to exchange parts at later times. The \texttt{laplace\_sigma\_stencil\_3d} class implements both the matrix-vector and right hand side operator. The flexible-order finite difference kernel for the matrix-free matrix-vector product for the two-dimensional Laplace problem is presented in listing \ref{ch7:lst:fd2d}. Library macros and reusable kernel routines are used throughout the implementations to enhance developer productivity and hide hardware specific details. This kernel can be used both for matrix-vector products for the original system and for the preconditioning.
-\lstset{label=ch7:lst:fd2d,caption={CUDA kernel implementation for the two dimensional finite difference approximation to the transformed Laplace equation.},basicstyle=\scriptsize\ttfamily}
+\lstset{label=ch7:lst:fd2d,caption={CUDA kernel implementation for the two dimensional finite difference approximation to the transformed Laplace equation.}
+%,basicstyle=\scriptsize\ttfamily
+}
\begin{lstlisting}
template <typename value_type, typename size_type>
__global__ void laplace_sigma_transformed(
In a similar template-based approach, the kernel for the right hand side operator of the two dimensional problem is implemented and listed in Listing \ref{ch7:lst:rhs2d}. The kernel computes the right hand side updates for both surface variables, $\eta$ and $\tilde{\phi}$, and applies an embedded penalty forcing \eqref{ch7:eq:penalty}, for all nodes within generation or absorption zones. The penalty forcing functions are computed based on linear or non-linear wave theory in a separate device function.
-\lstset{label=ch7:lst:rhs2d,caption={CUDA kernel implementation for the 2D right hand side.},basicstyle=\scriptsize\ttfamily}
+\lstset{label=ch7:lst:rhs2d,caption={CUDA kernel implementation for the 2D right hand side.}%,basicstyle=\scriptsize\ttfamily
+}
\begin{lstlisting}
template <typename value_type, typename size_type>
__global__ void rhs(value_type const* p , value_type const* p_surf