]> AND Private Git Repository - book_gpu.git/commitdiff
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
pas mal de petites modifs
authorcouturie <couturie@carcariass.(none)>
Thu, 14 Mar 2013 20:40:41 +0000 (21:40 +0100)
committercouturie <couturie@carcariass.(none)>
Thu, 14 Mar 2013 20:40:41 +0000 (21:40 +0100)
BookGPU/BookGPU.tex
BookGPU/Chapters/chapter13/ch13.aux
BookGPU/Chapters/chapter3/ch3.aux
BookGPU/Chapters/chapter3/ch3.tex
BookGPU/Chapters/chapter5/preamble.tex
BookGPU/Chapters/chapter6/PartieAsync.tex
BookGPU/Chapters/chapter6/ch6.aux
BookGPU/Chapters/chapter6/ch6.tex
BookGPU/Chapters/chapter7/ch7.tex

index b58372df8f4291be13b0a62008fe0881f22c8ef6..46546ef7d70eee7d14d170ca160e6fdfbde89d23 100755 (executable)
@@ -42,7 +42,7 @@
 \frenchspacing
 \tolerance=5000
 
-\include{Chapters/chapter1/preamble}
+%\include{Chapters/chapter1/preamble}
 \include{Chapters/chapter5/preamble}
 
 \newcommand{\scalprod}[2]%
 \newcommand{\mymat}[1]{\mathcal{#1}}
 \newcommand{\myvec}[1]{\mathbf{#1}} 
 
-
- \lstset{
-         basicstyle=\footnotesize\ttfamily, % Standardschrift
-         %numbers=left,               % Ort der Zeilennummern
-         numberstyle=\tiny,          % Stil der Zeilennummern
-         %stepnumber=2,               % Abstand zwischen den Zeilennummern
-         numbersep=5pt,              % Abstand der Nummern zum Text
-         tabsize=2,                  % Groesse von Tabs
-         extendedchars=true,         %
-         breaklines=true,            % Zeilen werden Umgebrochen
-         keywordstyle=\color{red},
-               frame=b,         
- %        keywordstyle=[1]\textbf,    % Stil der Keywords
- %        keywordstyle=[2]\textbf,    %
- %        keywordstyle=[3]\textbf,    %
- %        keywordstyle=[4]\textbf,   \sqrt{\sqrt{}} %
-         stringstyle=\color{white}\ttfamily, % Farbe der String
-         showspaces=false,           % Leerzeichen anzeigen ?
-         showtabs=false,             % Tabs anzeigen ?
-         xleftmargin=17pt,
-         framexleftmargin=17pt,
-         framexrightmargin=5pt,
-         framexbottommargin=4pt,
-         %backgroundcolor=\color{lightgray},
-         showstringspaces=false      % Leerzeichen in Strings anzeigen ?        
- }
- \lstloadlanguages{% Check Dokumentation for further languages ...
-         %[Visual]Basic
-         %Pascal
-         C
-         %C++
-         %XML
-         %HTML
-         %Java
- }
-  %\DeclareCaptionFont{blue}{\color{blue}} 
-
-  %\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}}
+\lstset{morekeywords={HALF4,HALF3,float2,float3,float4,half,half2,half3,half4,tex2D,dim3,endif,threadIdx,blockIdx,blockDim,gridDim,Dim3,__host__,__global__,__shared__,float}}
+\lstset{
+  language=C,
+  columns=fixed,
+  basicstyle=\footnotesize\ttfamily,
+  numbers=left,
+  firstnumber=1,
+  numberstyle=\tiny,
+  stepnumber=5,             
+  numbersep=5pt,              
+  tabsize=3,                  
+  extendedchars=true,         
+  breaklines=true,       
+  keywordstyle=\textbf,
+  frame=single,         
+  % keywordstyle=[1]\textbf,   
+  %identifierstyle=\textbf,
+  commentstyle=\color{white}\textbf,
+  stringstyle=\color{white}\ttfamily,
+  % xleftmargin=17pt,
+  % framexleftmargin=17pt,
+  % framexrightmargin=5pt,
+  % framexbottommargin=4pt,
+  backgroundcolor=\color{lightgray},
+  basicstyle=\scriptsize
+  }
+
+ %% \lstset{
+%%          basicstyle=\footnotesize\ttfamily, % Standardschrift
+%%          %numbers=left,               % Ort der Zeilennummern
+%%          numberstyle=\tiny,          % Stil der Zeilennummern
+%%          %stepnumber=2,               % Abstand zwischen den Zeilennummern
+%%          numbersep=5pt,              % Abstand der Nummern zum Text
+%%          tabsize=2,                  % Groesse von Tabs
+%%          extendedchars=true,         %
+%%          breaklines=true,            % Zeilen werden Umgebrochen
+%%          keywordstyle=\color{red},
+%%                     frame=b,         
+%%  %        keywordstyle=[1]\textbf,    % Stil der Keywords
+%%  %        keywordstyle=[2]\textbf,    %
+%%  %        keywordstyle=[3]\textbf,    %
+%%  %        keywordstyle=[4]\textbf,   \sqrt{\sqrt{}} %
+%%          stringstyle=\color{white}\ttfamily, % Farbe der String
+%%          showspaces=false,           % Leerzeichen anzeigen ?
+%%          showtabs=false,             % Tabs anzeigen ?
+%%          xleftmargin=17pt,
+%%          framexleftmargin=17pt,
+%%          framexrightmargin=5pt,
+%%          framexbottommargin=4pt,
+%%          %backgroundcolor=\color{lightgray},
+%%          showstringspaces=false      % Leerzeichen in Strings anzeigen ?        
+%%  }
+%%  \lstloadlanguages{% Check Dokumentation for further languages ...
+%%          %[Visual]Basic
+%%          %Pascal
+%%          C
+%%          %C++
+%%          %XML
+%%          %HTML
+%%          %Java
+%%  }
+%%   %\DeclareCaptionFont{blue}{\color{blue}} 
+
+%%   %\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}}
 \DeclareCaptionFont{white}{\color{white}}
 \DeclareCaptionFormat{listing}{\colorbox[cmyk]{0.43, 0.35, 0.35,0.01}{\parbox{\textwidth}{\hspace{15pt}#1#2#3}}}
 \captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white, singlelinecheck=false, margin=0pt, font={bf,footnotesize}}
index bb6c472d158d70af7843f8a7e7c7638b35868893..d3b6d7dc3306078dc5bf2088f826c87ff2ea75c4 100644 (file)
 \newlabel{ch13:fig:02}{{12.2}{288}}
 \newlabel{ch13:list:01}{{12.1}{288}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {12.1}Skeleton codes of a GPU kernel and a CPU function}{288}}
+\newlabel{ch13:eq:17}{{12.18}{289}}
 \@writefile{lof}{\contentsline {figure}{\numberline {12.3}{\ignorespaces Matrix constant coefficients in a three-dimensional domain.\relax }}{290}}
 \newlabel{ch13:fig:03}{{12.3}{290}}
-\newlabel{ch13:eq:17}{{12.18}{290}}
 \newlabel{ch13:list:02}{{12.2}{290}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {12.2}GPU kernels of the projected Richardson method}{290}}
-\@writefile{lof}{\contentsline {figure}{\numberline {12.4}{\ignorespaces Computation of a vector element with the projected Richardson method.\relax }}{292}}
-\newlabel{ch13:fig:04}{{12.4}{292}}
+\@writefile{lof}{\contentsline {figure}{\numberline {12.4}{\ignorespaces Computation of a vector element with the projected Richardson method.\relax }}{291}}
+\newlabel{ch13:fig:04}{{12.4}{291}}
 \newlabel{ch13:list:03}{{12.3}{292}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {12.3}Memory access to the cache texture memory}{292}}
 \@writefile{toc}{\contentsline {section}{\numberline {12.5}Experimental tests on a GPU cluster}{293}}
 \newlabel{ch13:sec:05}{{12.5}{293}}
-\@writefile{lof}{\contentsline {figure}{\numberline {12.5}{\ignorespaces GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs.\relax }}{295}}
-\newlabel{ch13:fig:05}{{12.5}{295}}
+\@writefile{lof}{\contentsline {figure}{\numberline {12.5}{\ignorespaces GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs.\relax }}{294}}
+\newlabel{ch13:fig:05}{{12.5}{294}}
 \@writefile{lot}{\contentsline {table}{\numberline {12.1}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 24 CPU cores.\relax }}{295}}
 \newlabel{ch13:tab:01}{{12.1}{295}}
-\@writefile{lot}{\contentsline {table}{\numberline {12.2}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.\relax }}{296}}
-\newlabel{ch13:tab:02}{{12.2}{296}}
+\@writefile{lot}{\contentsline {table}{\numberline {12.2}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.\relax }}{295}}
+\newlabel{ch13:tab:02}{{12.2}{295}}
 \@writefile{toc}{\contentsline {section}{\numberline {12.6}Red-Black ordering technique}{296}}
 \newlabel{ch13:sec:06}{{12.6}{296}}
+\newlabel{ch13:fig:06.01}{{12.6(a)}{297}}
+\newlabel{sub@ch13:fig:06.01}{{(a)}{297}}
+\newlabel{ch13:fig:06.02}{{12.6(b)}{297}}
+\newlabel{sub@ch13:fig:06.02}{{(b)}{297}}
+\@writefile{lof}{\contentsline {figure}{\numberline {12.6}{\ignorespaces Red-Black ordering for computing the iterate vector elements in a three-dimensional space.\relax }}{297}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Red-Black ordering on x, y and z axises}}}{297}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Red-Black ordering on y axis}}}{297}}
 \newlabel{ch13:list:04}{{12.4}{297}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {12.4}GPU kernels of the projected Richardson method using the red-black technique}{297}}
-\newlabel{ch13:fig:06.01}{{12.6(a)}{298}}
-\newlabel{sub@ch13:fig:06.01}{{(a)}{298}}
-\newlabel{ch13:fig:06.02}{{12.6(b)}{298}}
-\newlabel{sub@ch13:fig:06.02}{{(b)}{298}}
-\@writefile{lof}{\contentsline {figure}{\numberline {12.6}{\ignorespaces Red-Black ordering for computing the iterate vector elements in a three-dimensional space.\relax }}{298}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Red-Black ordering on x, y and z axises}}}{298}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Red-Black ordering on y axis}}}{298}}
 \@writefile{lot}{\contentsline {table}{\numberline {12.3}{\ignorespaces Execution times in seconds of the parallel projected Richardson method using read-black ordering technique implemented on a cluster of 12 GPUs.\relax }}{299}}
 \newlabel{ch13:tab:03}{{12.3}{299}}
 \@writefile{lof}{\contentsline {figure}{\numberline {12.7}{\ignorespaces Weak scaling of both synchronous and asynchronous algorithms of the projected Richardson method using red-black ordering technique.\relax }}{300}}
index cb0cf96245fd0647c2108a24ca16c321a47fcc9f..91483edb7edc673fa600b4e6a310c9a4febf6a1c 100644 (file)
 \newlabel{lst:kernelMedian3RegTri9}{{4.2}{36}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}3$\times $3 median filter kernel using one register per neighborhood pixel and bubble sort}{36}}
 \@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Further optimization}{36}}
+\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{36}}
 \@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs on GPU C2070 and CPU for generic median, 3$\times $3 median register-only and \textit  {libJacket}.\relax }}{37}}
 \newlabel{fig:compMedians1}{{4.4}{37}}
-\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{37}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{38}}
-\newlabel{fig:forgetful_selection}{{4.5}{38}}
-\newlabel{lst:medianForget1pix3}{{4.3}{38}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{38}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{39}}
-\newlabel{fig:forgetful3}{{4.6}{39}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{37}}
+\newlabel{fig:forgetful_selection}{{4.5}{37}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{38}}
+\newlabel{fig:forgetful3}{{4.6}{38}}
+\newlabel{lst:medianForget1pix3}{{4.3}{39}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{39}}
 \@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{40}}
 \newlabel{fig:median3_overlap}{{4.7}{40}}
 \@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{40}}
-\newlabel{lst:medianForget2pix3}{{4.4}{41}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{41}}
-\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{41}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{42}}
-\newlabel{fig:compMedians2}{{4.8}{42}}
+\newlabel{lst:medianForget2pix3}{{4.4}{40}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{40}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{41}}
+\newlabel{fig:compMedians2}{{4.8}{41}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{42}}
 \newlabel{sec:median5}{{4.5.1}{42}}
 \@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{42}}
 \newlabel{lst:medianForget2pix5}{{4.5}{42}}
 \newlabel{fig:median5overlap}{{4.9}{43}}
 \@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.\relax }}{43}}
 \newlabel{fig:median5overlap}{{4.10}{43}}
-\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{45}}
-\newlabel{tab:median5comp}{{4.2}{45}}
+\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{44}}
+\newlabel{tab:median5comp}{{4.2}{44}}
 \@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{45}}
-\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{46}}
-\newlabel{tab:medianSeparable}{{4.3}{46}}
-\newlabel{lst:medianSeparable}{{4.6}{46}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{46}}
-\newlabel{img:sap_example_ref}{{4.11(a)}{47}}
-\newlabel{sub@img:sap_example_ref}{{(a)}{47}}
-\newlabel{img:sap_example_sep_med3}{{4.11(b)}{47}}
-\newlabel{sub@img:sap_example_sep_med3}{{(b)}{47}}
-\newlabel{img:sap_example_sep_med5}{{4.11(c)}{47}}
-\newlabel{sub@img:sap_example_sep_med5}{{(c)}{47}}
-\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{47}}
-\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{47}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{47}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{47}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{47}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{47}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{47}}
-\newlabel{fig:sap_examples2}{{4.11}{47}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{49}}
+\newlabel{lst:medianSeparable}{{4.6}{45}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{45}}
+\newlabel{img:sap_example_ref}{{4.11(a)}{46}}
+\newlabel{sub@img:sap_example_ref}{{(a)}{46}}
+\newlabel{img:sap_example_sep_med3}{{4.11(b)}{46}}
+\newlabel{sub@img:sap_example_sep_med3}{{(b)}{46}}
+\newlabel{img:sap_example_sep_med5}{{4.11(c)}{46}}
+\newlabel{sub@img:sap_example_sep_med5}{{(c)}{46}}
+\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{46}}
+\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{46}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{46}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{46}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{46}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{46}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{46}}
+\newlabel{fig:sap_examples2}{{4.11}{46}}
+\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{47}}
+\newlabel{tab:medianSeparable}{{4.3}{47}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{48}}
 \@setckpt{Chapters/chapter3/ch3}{
 \setcounter{page}{50}
 \setcounter{equation}{0}
index 94c3e97c9895bdcec13d0650e7b5bdf17950d774..78396a9d77639a501ea825164021995d4109a222 100755 (executable)
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %      Listings
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\lstset{
-  language=C,
-  columns=fixed,
-  basicstyle=\footnotesize\ttfamily,
-  numbers=left,
-  firstnumber=1,
-  numberstyle=\tiny,
-  stepnumber=5,             
-  numbersep=5pt,              
-  tabsize=3,                  
-  extendedchars=true,         
-  breaklines=true,       
-  keywordstyle=\textbf,
-  frame=single,         
-  % keywordstyle=[1]\textbf,   
-  %identifierstyle=\textbf,
-  commentstyle=\color{white}\textbf,
-  stringstyle=\color{white}\ttfamily,
-  % xleftmargin=17pt,
-  % framexleftmargin=17pt,
-  % framexrightmargin=5pt,
-  % framexbottommargin=4pt,
-  backgroundcolor=\color{lightgray},
-  }
+%% \lstset{
+%%   language=C,
+%%   columns=fixed,
+%%   basicstyle=\footnotesize\ttfamily,
+%%   numbers=left,
+%%   firstnumber=1,
+%%   numberstyle=\tiny,
+%%   stepnumber=5,             
+%%   numbersep=5pt,              
+%%   tabsize=3,                  
+%%   extendedchars=true,         
+%%   breaklines=true,       
+%%   keywordstyle=\textbf,
+%%   frame=single,         
+%%   % keywordstyle=[1]\textbf,   
+%%   %identifierstyle=\textbf,
+%%   commentstyle=\color{white}\textbf,
+%%   stringstyle=\color{white}\ttfamily,
+%%   % xleftmargin=17pt,
+%%   % framexleftmargin=17pt,
+%%   % framexrightmargin=5pt,
+%%   % framexbottommargin=4pt,
+%%   backgroundcolor=\color{lightgray},
+%%   }
 
 %\DeclareCaptionFont{blue}{\color{blue}} 
 %\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}}
index f4f093e0f48713b23346dbf46265f7394b64c325..f3bd355efb5f2c5ead190af68d53fa5f2ad30d56 100644 (file)
@@ -12,7 +12,7 @@
 % lstlisting settings
 %\lstset{language=c++, tabsize=4, extendedchars=false, numbers=left,stepnumber=1, breaklines=true, basicstyle=\scriptsize\ttfamily,commentstyle=\color{gray}\rm,frame=single,showstringspaces=false,morekeywords={HALF4, HALF3, float2, float3, float4, half, half2, half3, half4, tex2D,dim3, endif,threadIdx, blockIdx, blockDim, gridDim, Dim3, __host__, __global__,__shared__,float},backgroundcolor=\color{lightgray},keywordstyle=\bfseries\ttfamily,numberstyle=\tiny}
 
-\lstset{morekeywords={HALF4,HALF3,float2,float3,float4,half,half2,half3,half4,tex2D,dim3,endif,threadIdx,blockIdx,blockDim,gridDim,Dim3,__host__,__global__,__shared__,float}}
+%\lstset{morekeywords={HALF4,HALF3,float2,float3,float4,half,half2,half3,half4,tex2D,dim3,endif,threadIdx,blockIdx,blockDim,gridDim,Dim3,__host__,__global__,__shared__,float}}
 
 
 % This is for tikz/pgf plots
index 44eadd5609ee4032d59adf459dfb4654dcc60360..4a2b6d2aee7e1bae827e28527f39cd5e039b42d7 100644 (file)
@@ -15,34 +15,59 @@ Formally, if we denote by $f=(f_1,...,f_n)$ the function representing the
 iterative process and by $x^t=(x_1^t,...,x_n^t)$ the values of the $n$ elements of
 the system at iteration $t$, we pass from a synchronous iterative scheme of the
 form:
+%% \begin{algorithm}[H]
+%%   \caption{Synchronous iterative scheme}\label{algo:ch6p2sync}
+%%   \begin{Algo}
+%%     $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\
+%%     \textbf{for} $t=0,1,...$\\
+%%     \>\textbf{for} $i=1,...,n$\\
+%%     \>\>$x_{i}^{t+1}=f_{i}(x_{1}^t,...,x_i^t,...,x_{n}^t)$\\
+%%     \>\textbf{endfor}\\
+%%     \textbf{endfor}
+%%   \end{Algo}
+%% \end{algorithm}
 \begin{algorithm}[H]
   \caption{Synchronous iterative scheme}\label{algo:ch6p2sync}
-  \begin{Algo}
-    $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\
-    \textbf{for} $t=0,1,...$\\
-    \>\textbf{for} $i=1,...,n$\\
-    \>\>$x_{i}^{t+1}=f_{i}(x_{1}^t,...,x_i^t,...,x_{n}^t)$\\
-    \>\textbf{endfor}\\
-    \textbf{endfor}
-  \end{Algo}
+  $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\;
+  \For{ $t=0,1,...$} {
+    \For{ $i=1,...,n$}{
+      $x_{i}^{t+1}=f_{i}(x_{1}^t,...,x_i^t,...,x_{n}^t)$\;
+    }
+  }
 \end{algorithm}
+
+
 \noindent
-to an asynchronous iterative scheme of the form:
+to an asynchronous iterative scheme of the form:\\
+%% \begin{algorithm}[H]
+%%   \caption{Asynchronous iterative scheme}\label{algo:ch6p2async}
+%%   \begin{Algo}
+%%     $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\
+%%     \textbf{for} $t=0,1,...$\\
+%%     \>\textbf{for} $i=1,...,n$\\
+%%     \>\>$x_{i}^{t+1}=\left\{
+%%       \begin{array}[h]{ll}
+%%         x_i^t & \text{if } i \text{ is \emph{not} updated at iteration } i\\
+%%         f_i(x_1^{s_1^i(t)},...,x_n^{s_n^i(t)}) & \text{if } i \text{ is updated at iteration } i
+%%       \end{array}
+%%     \right.$\\
+%%     \>\textbf{endfor}\\
+%%     \textbf{endfor}
+%%   \end{Algo}
+%% \end{algorithm}
 \begin{algorithm}[H]
   \caption{Asynchronous iterative scheme}\label{algo:ch6p2async}
-  \begin{Algo}
-    $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\
-    \textbf{for} $t=0,1,...$\\
-    \>\textbf{for} $i=1,...,n$\\
-    \>\>$x_{i}^{t+1}=\left\{
+    $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\;
+    \For {$t=0,1,...$} {
+      \For{ $i=1,...,n$} {
+        $x_{i}^{t+1}=\left\{
       \begin{array}[h]{ll}
         x_i^t & \text{if } i \text{ is \emph{not} updated at iteration } i\\
         f_i(x_1^{s_1^i(t)},...,x_n^{s_n^i(t)}) & \text{if } i \text{ is updated at iteration } i
       \end{array}
-    \right.$\\
-    \>\textbf{endfor}\\
-    \textbf{endfor}
-  \end{Algo}
+    \right.$
+      }
+    }
 \end{algorithm}
 where $s_j^i(t)$ is the iteration number of the production of the value $x_j$ of
 element $j$  that is  used on element  $i$ at iteration  $t$ (see for example~\cite{BT89,
index 2be0894822ca03e6e2ca469306377311e09a80ed..af524e9c27187bce5c11e149f365d2f1b8519497 100644 (file)
 \@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{88}}
 \newlabel{algo:ch6p1overlapseqsequence}{{6.2}{89}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{89}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{90}}
-\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{90}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{91}}
+\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{91}}
 \newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{91}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{91}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{93}}
-\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{93}}
 \@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{93}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{94}}
+\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{94}}
 \newlabel{algo:ch6p1overlapinterleaved}{{6.4}{94}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{94}}
 \@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{96}}
 \newlabel{ch6:part2}{{6.3}{98}}
 \@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{98}}
 \newlabel{algo:ch6p2sync}{{3}{98}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{98}}
-\newlabel{algo:ch6p2async}{{4}{98}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{99}}
-\newlabel{ch6:p2BasicAsync}{{6.3.1}{99}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{99}}
+\newlabel{algo:ch6p2async}{{4}{99}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{100}}
+\newlabel{ch6:p2BasicAsync}{{6.3.1}{100}}
 \newlabel{algo:ch6p2BasicAsync}{{6.5}{100}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{100}}
 \newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{101}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{101}}
-\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{102}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{102}}
+\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{103}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{103}}
 \newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{103}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{103}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{104}}
-\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{104}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{105}}
+\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{105}}
 \newlabel{algo:ch6p2Sync}{{6.9}{105}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{105}}
 \newlabel{algo:ch6p2SyncComp}{{6.10}{106}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{106}}
-\newlabel{algo:ch6p2SyncReceptions}{{6.11}{107}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{107}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{108}}
-\newlabel{ch6:p2GPUAsync}{{6.3.3}{108}}
-\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{109}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{109}}
+\newlabel{algo:ch6p2SyncReceptions}{{6.11}{108}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{108}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{109}}
+\newlabel{ch6:p2GPUAsync}{{6.3.3}{109}}
+\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{110}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{110}}
 \newlabel{algo:ch6p2syncGPU}{{6.13}{111}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{111}}
-\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{113}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{113}}
-\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{114}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{114}}
-\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{115}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{115}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{116}}
-\newlabel{sec:ch6p2expes}{{6.3.4}{116}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{117}}
-\newlabel{fig:ch6p2syncasync}{{6.6}{117}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{118}}
-\newlabel{fig:ch6p2aux}{{6.7}{118}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{119}}
-\newlabel{sec:ch6p3unify}{{6.4}{119}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{119}}
-\newlabel{sec:ch6p3resources}{{6.4.1}{119}}
-\newlabel{algo:ch6p3ORWLresources}{{6.17}{120}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{120}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{120}}
-\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{120}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{121}}
-\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{121}}
-\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{121}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{121}}
-\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{122}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{122}}
-\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{122}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{122}}
-\newlabel{algo:ch6p3ORWLtrans}{{6.21}{122}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{122}}
-\newlabel{algo:ch6p3ORWLdecl}{{6.22}{123}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{123}}
+\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{114}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{114}}
+\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{115}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{115}}
+\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{116}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{116}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{117}}
+\newlabel{sec:ch6p2expes}{{6.3.4}{117}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{118}}
+\newlabel{fig:ch6p2syncasync}{{6.6}{118}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{119}}
+\newlabel{fig:ch6p2aux}{{6.7}{119}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{120}}
+\newlabel{sec:ch6p3unify}{{6.4}{120}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{120}}
+\newlabel{sec:ch6p3resources}{{6.4.1}{120}}
+\newlabel{algo:ch6p3ORWLresources}{{6.17}{121}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{121}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{121}}
+\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{121}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{122}}
+\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{122}}
+\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{122}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{122}}
+\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{123}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{123}}
+\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{123}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{123}}
+\newlabel{algo:ch6p3ORWLtrans}{{6.21}{123}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{123}}
+\newlabel{algo:ch6p3ORWLdecl}{{6.22}{124}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{124}}
 \newlabel{algo:ch6p3ORWLinit}{{6.23}{124}}
 \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{124}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{124}}
-\newlabel{sec:ch6p3tasks}{{6.4.4}{124}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{125}}
-\newlabel{ch6:conclu}{{6.5}{125}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{125}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{126}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{125}}
+\newlabel{sec:ch6p3tasks}{{6.4.4}{125}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{126}}
+\newlabel{ch6:conclu}{{6.5}{126}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{126}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{127}}
 \@setckpt{Chapters/chapter6/ch6}{
-\setcounter{page}{128}
+\setcounter{page}{129}
 \setcounter{equation}{0}
 \setcounter{enumi}{4}
 \setcounter{enumii}{0}
 \setcounter{lotdepth}{1}
 \setcounter{lstnumber}{17}
 \setcounter{ContinuedFloat}{0}
-\setcounter{AlgoLine}{0}
+\setcounter{AlgoLine}{6}
 \setcounter{algocfline}{4}
 \setcounter{algocfproc}{4}
 \setcounter{algocf}{4}
index 79cd4290d33b8b157fd6199310bff4d64431c772..bc5ea3f8deb7abd4d0b862e7389867d1790b3e97 100755 (executable)
 \def\Reals{\mathbb{R}}
 
 \definecolor{shadecolor}{rgb}{0.95,0.95,0.95}
-\newenvironment{Algo}{\vspace{-1em}\begin{center}\begin{minipage}[h]{0.95\columnwidth}\begin{shaded}\begin{tabbing}%
-  \hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\= \kill} %
- { \end{tabbing}\vspace{-1em}\end{shaded}\end{minipage}\end{center}\vspace{-1em}}
+%\newenvironment{Algo}{\vspace{-1em}\begin{center}\begin{minipage}[h]{0.95\columnwidth}\begin{shaded}\begin{tabbing}%
+%  \hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\= \kill} %
+% { \end{tabbing}\vspace{-1em}\end{shaded}\end{minipage}\end{center}\vspace{-1em}}
 
-\lstnewenvironment{Listing}[2]{\lstset{basicstyle=\scriptsize\ttfamily,%
-    breaklines=true, breakatwhitespace=true, language=C, keywordstyle=\color{black},%
-    prebreak = \raisebox{0ex}[0ex][0ex]{\ensuremath{\hookleftarrow}},%
-    commentstyle=\textit, numbersep=1em, numberstyle=\tiny, numbers=left,%
-    numberblanklines=false, mathescape, escapechar=@, label=#1, caption={#2}}}{}
+\lstnewenvironment{Listing}[2]{\lstset{
+%% basicstyle=\scriptsize\ttfamily,%
+%%      breaklines=true, breakatwhitespace=true, language=C, keywordstyle=\color{black},%
+%%      prebreak = \raisebox{0ex}[0ex][0ex]{\ensuremath{\hookleftarrow}},%
+%%      commentstyle=\textit, numbersep=1em, numberstyle=\tiny, numbers=left,%
+%%      numberblanklines=false, mathescape, escapechar=@,
+ label=#1, caption={#2}}
+}{}
 
 \def\N{$\mathbb N$ }
 \def\R{$\mathbb R$ }
index 585515861281423bec92a004543c496983cbe3a9..401f132aea15bd79801faeeb035002c23a068cf1 100644 (file)
@@ -468,7 +468,9 @@ The CUDA-based numerical wave model has been developed based on all the numerica
 
 For the unified potential flow model the user will need to provide implementations of the following components; the right hand side operator for the semi-discrete free surface variables \eqref{ch7:FSorigin}, the matrix-vector operator for the discretized $\sigma$-transformed Laplace equation \eqref{ch7:TransformedLaplace}, a smoother for the multigrid relaxation step, and the potential flow solver itself, that reads initial data and advance the solution in time. In order to make the library as generic as possible, all components are template-based, which makes it possible to assemble the PDE solver by combining type definitions in the preamble of the application. An excerpt of the potential flow assembling is given in listing \ref{ch7:lst:solversetup}.
 
-\lstset{label=ch7:lst:solversetup,caption={Generic assembling of the potential flow solver for fully nonlinear free surface water waves.},basicstyle=\scriptsize}
+\lstset{label=ch7:lst:solversetup,caption={Generic assembling of the potential flow solver for fully nonlinear free surface water waves.}
+%,basicstyle=\scriptsize
+}
 \begin{lstlisting}
 // Basics
 typedef double value_type;
@@ -501,7 +503,9 @@ typedef free_surface::potential_flow_solver_3d<potential_flow_types> potential_f
 
 Hereafter, the potential flow solver is aware of all component types that should be used to solve the entire PDE system, and it will be easy for developers to exchange parts at later times. The \texttt{laplace\_sigma\_stencil\_3d} class implements both the matrix-vector and right hand side operator. The flexible-order finite difference kernel for the matrix-free matrix-vector product for the two-dimensional Laplace problem is presented in listing \ref{ch7:lst:fd2d}. Library macros and reusable kernel routines are used throughout the implementations to enhance developer productivity and hide hardware specific details. This kernel can be used both for matrix-vector products for the original system and for the preconditioning.
 
-\lstset{label=ch7:lst:fd2d,caption={CUDA kernel implementation for the two dimensional finite difference approximation to the transformed Laplace equation.},basicstyle=\scriptsize\ttfamily}
+\lstset{label=ch7:lst:fd2d,caption={CUDA kernel implementation for the two dimensional finite difference approximation to the transformed Laplace equation.}
+%,basicstyle=\scriptsize\ttfamily
+}
 \begin{lstlisting}
 template <typename value_type, typename size_type>
 __global__ void laplace_sigma_transformed(
@@ -589,7 +593,8 @@ __global__ void laplace_sigma_transformed(
 
 In a similar template-based approach, the kernel for the right hand side operator of the two dimensional problem is implemented and listed in Listing \ref{ch7:lst:rhs2d}. The kernel computes the right hand side updates for both surface variables, $\eta$ and $\tilde{\phi}$, and applies an embedded penalty forcing \eqref{ch7:eq:penalty}, for all nodes within generation or absorption zones. The penalty forcing functions are computed based on linear or non-linear wave theory in a separate device function.
 
-\lstset{label=ch7:lst:rhs2d,caption={CUDA kernel implementation for the 2D right hand side.},basicstyle=\scriptsize\ttfamily}
+\lstset{label=ch7:lst:rhs2d,caption={CUDA kernel implementation for the 2D right hand side.}%,basicstyle=\scriptsize\ttfamily
+}
 \begin{lstlisting}
 template <typename value_type, typename size_type>
 __global__ void rhs(value_type const* p    , value_type const* p_surf