update

[book_gpu.git] / BookGPU / Chapters / chapter7 / ch7.tex
diff --git a/BookGPU/Chapters/chapter7/ch7.tex b/BookGPU/Chapters/chapter7/ch7.tex

index 585515861281423bec92a004543c496983cbe3a9..6ca20807315fb588e52dffa71e8f67fe0120f003 100644 (file)
--- a/BookGPU/Chapters/chapter7/ch7.tex
+++ b/BookGPU/Chapters/chapter7/ch7.tex
@@ -9,7 +9,7 @@
  
  \begin{figure}[!htb]
  \centering
  
  \begin{figure}[!htb]
  \centering
-\includegraphics[width=0.95\textwidth]{Chapters/chapter7/figures/figSeries60CB06Type7StedaySnapshot.eps}
+\includegraphics[width=0.95\textwidth]{Chapters/chapter7/figures/figSeries60CB06Type7StedaySnapshot-eps-converted-to.pdf}
  %\caption{Snapshot of steady state wave field generated by a Series 60 ship hull.}
  \end{figure}
  
  %\caption{Snapshot of steady state wave field generated by a Series 60 ship hull.}
  \end{figure}
  
@@ -313,21 +313,21 @@ Similar results were reported for the first time in the context of high-order Bo
  \centering
  \subfigure[Grid scaling, $x=(1-a)\xi^3+a\xi$.]{
  % MainLaplace2D_ex03.m
  \centering
  \subfigure[Grid scaling, $x=(1-a)\xi^3+a\xi$.]{
  % MainLaplace2D_ex03.m
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/scalingNx25.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/scalingNx25-eps-converted-to.pdf}
  }
  \subfigure[High-order spatial discretisation and stable explicit time-stepping with large time steps for a nonlinear standing wave. Scaling based on $a=0$. ]{
  % MainLaplace2D_ex03.m
  }
  \subfigure[High-order spatial discretisation and stable explicit time-stepping with large time steps for a nonlinear standing wave. Scaling based on $a=0$. ]{
  % MainLaplace2D_ex03.m
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/standingwaveglozman.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/standingwaveglozman-eps-converted-to.pdf}
  }
  \subfigure[Uniform grid ($a=1$).]{
  % MainLaplace2D_ex035_nonlinearLaplace.m
  }
  \subfigure[Uniform grid ($a=1$).]{
  % MainLaplace2D_ex035_nonlinearLaplace.m
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/SFwaves_snapshots_uniform.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/SFwaves_snapshots_uniform-eps-converted-to.pdf}
  }
  \subfigure[Clustered grid ($a=0.05$).]{
  % MainLaplace2D_ex035_nonlinearLaplace.m
  }
  \subfigure[Clustered grid ($a=0.05$).]{
  % MainLaplace2D_ex035_nonlinearLaplace.m
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/SFwaves_snapshots_clustered.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/SFwaves_snapshots_clustered-eps-converted-to.pdf}
  }
  }
-\caption{Numerical experiments to assess stability properties of numerical wave model. In three cases, computed snapshots are taken of the wave elevation over one wave period of time. In a) the grid distribution of nodes in a one-parameter mapping for the grid is illustrated. Results from changes in wave elevation are illustrated for b) a mildly nonlinear standing wave on a highly clustered grid, c) regular stream function wave of medium steepness in shallow water $(kh,H/L)=(0.5,0.0292)$ on a uniform grid ($N_x=80$) and d) for a nonuniform grid with a minimal grid spacing 20 times smaller(!). In every case the step size remains fixed at $\Delta t = T/160$ s corresponding to a Courant number $C_r=c\tfrac{\Delta t}{\Delta x}=0.5$ for the uniform grid. A 6'$th$ order scheme and explicit EKR4 time-stepping is used in each test case.}
+\caption[Numerical experiments to assess stability properties of the numerical wave model.]{Numerical experiments to assess stability properties of the numerical wave model. In three cases, computed snapshots are taken of the wave elevation over one wave period of time. In a) the grid distribution of nodes in a one-parameter mapping for the grid is illustrated. Results from changes in wave elevation are illustrated for b) a mildly nonlinear standing wave on a highly clustered grid, c) a regular stream function wave of medium steepness in shallow water $(kh,H/L)=(0.5,0.0292)$ on a uniform grid ($N_x=80$) and d) for a nonuniform grid with a minimal grid spacing 20 times smaller(!). In every case the step size remains fixed at $\Delta t = T/160$ s corresponding to a Courant number $C_r=c\tfrac{\Delta t}{\Delta x}=0.5$ for the uniform grid. A $6^{th}$ order scheme and explicit EKR4 time-stepping are used in each test case.}
  \label{ch7:numexp}
  \end{figure}
  %\newpage
  \label{ch7:numexp}
  \end{figure}
  %\newpage
@@ -376,15 +376,15 @@ The profiles can be reversed by a change of coordinate, i.e. $\Gamma(1-x)$, and
  \centering
  \subfigure[Wave generation, reflection and absorption of small-amplitude waves.]{
  % Script : MainLaplace2D_ex03penalityLINEAR_REFLECTEDWAVES.m
  \centering
  \subfigure[Wave generation, reflection and absorption of small-amplitude waves.]{
  % Script : MainLaplace2D_ex03penalityLINEAR_REFLECTEDWAVES.m
-\includegraphics[width=0.98\textwidth]{Chapters/chapter7/figures/standingwavespenalty.eps}
+\includegraphics[width=0.98\textwidth]{Chapters/chapter7/figures/standingwavespenalty-eps-converted-to.pdf}
  % Nx = 480, 6th order, vertical clustering, Nz=6;
  }
  \subfigure[Wave generation and absorption of steep finite-amplitude waves.]{
  % Script : MainLaplace2D_ex03penalityNONLINEAR_GENERATEWAVES.m
  % Nx = 480, 6th order, vertical clustering, Nz=6;
  }
  \subfigure[Wave generation and absorption of steep finite-amplitude waves.]{
  % Script : MainLaplace2D_ex03penalityNONLINEAR_GENERATEWAVES.m
-\includegraphics[width=0.98\textwidth]{Chapters/chapter7/figures/nonlinearwavespenalty.eps}
+\includegraphics[width=0.98\textwidth]{Chapters/chapter7/figures/nonlinearwavespenalty-eps-converted-to.pdf}
  % Nx = 540, 6th order, vertical clustering, Nz=6;
  }
  % Nx = 540, 6th order, vertical clustering, Nz=6;
  }
-\caption{Snapshots at intervals $T/8$ over one wave period in time of computed a) small-amplitude $(kh,kH)=(0.63,0.005)$ and b) finite-amplitude $(kh,kH)=(1,0.41)$ stream function waves elevations having reached a steady state after transient startup. Combined wave generation and absorption zones in the western relaxation zone of both a) and b). In b) an absorption zone is positioned next to the eastern boundary and causes minor visible reflections. }
+\caption[Snapshots at intervals $T/8$ over one wave period in time.]{Snapshots at intervals $T/8$ over one wave period in time of computed a) small-amplitude $(kh,kH)=(0.63,0.005)$ and b) finite-amplitude $(kh,kH)=(1,0.41)$ stream function wave elevations having reached a steady state after transient startup. Combined wave generation and absorption zones in the western relaxation zone of both a) and b). In b) an absorption zone is positioned next to the eastern boundary and causes minor visible reflections. }
  \label{ch7:figstandwave}
  \end{figure}
  
  \label{ch7:figstandwave}
  \end{figure}
  
@@ -424,9 +424,6 @@ Numerical modelling of large ocean areas to account for nonlinear wave-wave inte
  
  The ratio between necessary data transfers and computational work for the proposed numerical model for free surface water waves is high enough to expect reasonable latency hiding. The data domain decomposition method consists of a logically structured division of the computational domain into multiple subdomains. Each of these subdomains is connected via fictitious ghost layers at the artificial boundaries of width corresponding to the half-width of the finite difference stencils employed. This results in a favourable volume-to-boundary ratio as the problem size increases, diminishing communication overhead for message passing. Information between subdomains is exchanged through ghost layers at every step of the iterative PDC method, in connection with the matrix-vector evaluation for the $\sigma$-transformed Laplace problem, and before relaxation steps in the multigrid method. A single global synchronization point occurs at most once each iteration, if convergence is monitored, where a global reduction step (inner product) between all processor nodes takes place. The main advantage of this decomposition strategy is that the decomposition into multiple subdomains is straightforward. However, it comes with the cost of extra data transfers to update the set of fictitious ghost layers.
  
  
  The ratio between necessary data transfers and computational work for the proposed numerical model for free surface water waves is high enough to expect reasonable latency hiding. The data domain decomposition method consists of a logically structured division of the computational domain into multiple subdomains. Each of these subdomains is connected via fictitious ghost layers at the artificial boundaries of width corresponding to the half-width of the finite difference stencils employed. This results in a favourable volume-to-boundary ratio as the problem size increases, diminishing communication overhead for message passing. Information between subdomains is exchanged through ghost layers at every step of the iterative PDC method, in connection with the matrix-vector evaluation for the $\sigma$-transformed Laplace problem, and before relaxation steps in the multigrid method. A single global synchronization point occurs at most once each iteration, if convergence is monitored, where a global reduction step (inner product) between all processor nodes takes place. The main advantage of this decomposition strategy is that the decomposition into multiple subdomains is straightforward. However, it comes with the cost of extra data transfers to update the set of fictitious ghost layers.
  
-The parallel domain decomposition solver has been validated against the sequential solvers with respect to algorithmic efficiency to establish that the code produce correct results. An analysis of the numerical efficiency have also been carried out on different GPU systems to identify comparative behaviors as both the problems sizes and number of compute nodes vary. For example, performance scalings on Test environment 1 and Test environment 2 are presented in figure \ref{ch7:fig:multigpuperformance}. The figure confirms that there is only a limited benefit from using multiple GPUs for small problem sizes, since the computational intensity is simply too low to efficiently hide the latency of message passing. A substantial speedup is achieved compared to the single GPU version, while being able to solve even larger systems.
-With the linear scaling of memory requirements and improved computational speed, the methodology based on multiple GPUs makes it possible to simulate water waves in very large numerical wave tanks with improved performance.
-
  \begin{figure}[!htb]
      \setlength\figureheight{0.30\textwidth}
      \setlength\figurewidth{0.33\textwidth}
  \begin{figure}[!htb]
      \setlength\figureheight{0.30\textwidth}
      \setlength\figurewidth{0.33\textwidth}
@@ -438,9 +435,14 @@ With the linear scaling of memory requirements and improved computational speed,
      {\scriptsize\input{Chapters/chapter7/figures/TeslaK20SpeedupGPUvsCPU3D.tikz}}
      }
      \end{center}
      {\scriptsize\input{Chapters/chapter7/figures/TeslaK20SpeedupGPUvsCPU3D.tikz}}
      }
      \end{center}
-    \caption{Performance timings per PDC iteration as a function of increasing problem size $N$, for single, mixed and double precision arithmetics. Three dimensional nonlinear waves, using $6^{th}$ order finite difference approximations, preconditioned with one multigrid V-cycle and one pre- and post- Red-black Gauss-Seidel smoothing. Speedup compared to fastest known serial implementation. Using Test environment 3. CPU timings represent starting point for our investigations and has been obtained using Fortran 90 code and is based on a single-core run on a Intel Core i7, 2.80GHz processor.}\label{ch7:fig:perftimings}
+    \caption[Performance timings per PDC iteration as a function of increasing problem size $N$, for single, mixed and double precision arithmetics.]{Performance timings per PDC iteration as a function of increasing problem size $N$, for single, mixed and double precision arithmetics. Three dimensional nonlinear waves, using $6^{th}$ order finite difference approximations, preconditioned with one multigrid V-cycle and one pre- and post- Red-black Gauss-Seidel smoothing. Speedup compared to fastest known serial implementation. Using Test environment 3. CPU timings represent the starting point for our investigations and have been obtained using Fortran 90 code based on a single-core run on an Intel Core i7, 2.80GHz processor.}\label{ch7:fig:perftimings}
  \end{figure}
  
  \end{figure}
  
+The parallel domain decomposition solver has been validated against the sequential solvers with respect to algorithmic efficiency to establish that the code produces correct results. An analysis of the numerical efficiency has also been carried out on different GPU systems to identify comparative behaviors as both the problem sizes and number of compute nodes vary. For example, performance scalings on Test environment 1 and Test environment 2 are presented in figure \ref{ch7:fig:multigpuperformance}. The figure confirms that there is only a limited benefit from using multiple GPUs for small problem sizes, since the computational intensity is simply too low to efficiently hide the latency of message passing. A substantial speedup is achieved compared to the single GPU version, while being able to solve even larger systems.
+With the linear scaling of memory requirements and improved computational speed, the methodology based on multiple GPUs makes it possible to simulate water waves in very large numerical wave tanks with improved performance.
+
+
+
  
  
  \begin{figure}[!htb]
  
  
  \begin{figure}[!htb]
@@ -454,7 +456,7 @@ With the linear scaling of memory requirements and improved computational speed,
      {\scriptsize\input{Chapters/chapter7/figures/TeslaM2050MultiGPUScaling3D.tikz}}
      }
      \end{center}
      {\scriptsize\input{Chapters/chapter7/figures/TeslaM2050MultiGPUScaling3D.tikz}}
      }
      \end{center}
-    \caption{Domain decomposition performance on multi-GPU systems. Performance timings per PDC iteration as a function of increasing problem sizes using single precision. Same setup as in figure \ref{ch7:fig:perftimings}.}
+    \caption[Domain decomposition performance on multi-GPU systems.]{Domain decomposition performance on multi-GPU systems. Performance timings per PDC iteration as a function of increasing problem sizes using single precision. Same setup as in figure \ref{ch7:fig:perftimings}.}
      \label{ch7:fig:multigpuperformance}
  \end{figure}
  
      \label{ch7:fig:multigpuperformance}
  \end{figure}
  
@@ -468,7 +470,9 @@ The CUDA-based numerical wave model has been developed based on all the numerica
  
  For the unified potential flow model the user will need to provide implementations of the following components: the right hand side operator for the semi-discrete free surface variables \eqref{ch7:FSorigin}, the matrix-vector operator for the discretized $\sigma$-transformed Laplace equation \eqref{ch7:TransformedLaplace}, a smoother for the multigrid relaxation step, and the potential flow solver itself, that reads initial data and advances the solution in time. In order to make the library as generic as possible, all components are template-based, which makes it possible to assemble the PDE solver by combining type definitions in the preamble of the application. An excerpt of the potential flow assembling is given in listing \ref{ch7:lst:solversetup}.
  
  
  For the unified potential flow model the user will need to provide implementations of the following components: the right hand side operator for the semi-discrete free surface variables \eqref{ch7:FSorigin}, the matrix-vector operator for the discretized $\sigma$-transformed Laplace equation \eqref{ch7:TransformedLaplace}, a smoother for the multigrid relaxation step, and the potential flow solver itself, that reads initial data and advances the solution in time. In order to make the library as generic as possible, all components are template-based, which makes it possible to assemble the PDE solver by combining type definitions in the preamble of the application. An excerpt of the potential flow assembling is given in listing \ref{ch7:lst:solversetup}.
  
-\lstset{label=ch7:lst:solversetup,caption={Generic assembling of the potential flow solver for fully nonlinear free surface water waves.},basicstyle=\scriptsize}
+\lstset{label=ch7:lst:solversetup,caption={Generic assembling of the potential flow solver for fully nonlinear free surface water waves.}
+%,basicstyle=\scriptsize
+}
  \begin{lstlisting}
  // Basics
  typedef double value_type;
  \begin{lstlisting}
  // Basics
  typedef double value_type;
@@ -501,7 +505,9 @@ typedef free_surface::potential_flow_solver_3d<potential_flow_types> potential_f
  
  Hereafter, the potential flow solver is aware of all component types that should be used to solve the entire PDE system, and it will be easy for developers to exchange parts at later times. The \texttt{laplace\_sigma\_stencil\_3d} class implements both the matrix-vector and right hand side operator. The flexible-order finite difference kernel for the matrix-free matrix-vector product for the two-dimensional Laplace problem is presented in listing \ref{ch7:lst:fd2d}. Library macros and reusable kernel routines are used throughout the implementations to enhance developer productivity and hide hardware specific details. This kernel can be used both for matrix-vector products for the original system and for the preconditioning.
  
  
  Hereafter, the potential flow solver is aware of all component types that should be used to solve the entire PDE system, and it will be easy for developers to exchange parts at later times. The \texttt{laplace\_sigma\_stencil\_3d} class implements both the matrix-vector and right hand side operator. The flexible-order finite difference kernel for the matrix-free matrix-vector product for the two-dimensional Laplace problem is presented in listing \ref{ch7:lst:fd2d}. Library macros and reusable kernel routines are used throughout the implementations to enhance developer productivity and hide hardware specific details. This kernel can be used both for matrix-vector products for the original system and for the preconditioning.
  
-\lstset{label=ch7:lst:fd2d,caption={CUDA kernel implementation for the two dimensional finite difference approximation to the transformed Laplace equation.},basicstyle=\scriptsize\ttfamily}
+\lstset{label=ch7:lst:fd2d,caption={CUDA kernel implementation for the two dimensional finite difference approximation to the transformed Laplace equation.}
+%,basicstyle=\scriptsize\ttfamily
+}
  \begin{lstlisting}
  template <typename value_type, typename size_type>
  __global__ void laplace_sigma_transformed(
  \begin{lstlisting}
  template <typename value_type, typename size_type>
  __global__ void laplace_sigma_transformed(
@@ -531,7 +537,8 @@ __global__ void laplace_sigma_transformed(
         {                       
                 size_type offset_i = i < alpha ? 2*alpha-i : i >= Ns-alpha ? Ns-1-i : alpha;
                 size_type row_i    = offset_i*rank;
         {                       
                 size_type offset_i = i < alpha ? 2*alpha-i : i >= Ns-alpha ? Ns-1-i : alpha;
                 size_type row_i    = offset_i*rank;
-               size_type offset_j = alpha;  // Always centered stencils in x-dir
+    // Always centered stencils in x-dir
+               size_type offset_j = alpha;  
                 size_type row_j    = alpha*rank;
                         
                 value_type dhdx    = hx[j];
                 size_type row_j    = alpha*rank;
                         
                 value_type dhdx    = hx[j];
@@ -589,7 +596,8 @@ __global__ void laplace_sigma_transformed(
  
  In a similar template-based approach, the kernel for the right hand side operator of the two dimensional problem is implemented and listed in Listing \ref{ch7:lst:rhs2d}. The kernel computes the right hand side updates for both surface variables, $\eta$ and $\tilde{\phi}$, and applies an embedded penalty forcing \eqref{ch7:eq:penalty}, for all nodes within generation or absorption zones. The penalty forcing functions are computed based on linear or non-linear wave theory in a separate device function.
  
  
  In a similar template-based approach, the kernel for the right hand side operator of the two dimensional problem is implemented and listed in Listing \ref{ch7:lst:rhs2d}. The kernel computes the right hand side updates for both surface variables, $\eta$ and $\tilde{\phi}$, and applies an embedded penalty forcing \eqref{ch7:eq:penalty}, for all nodes within generation or absorption zones. The penalty forcing functions are computed based on linear or non-linear wave theory in a separate device function.
  
-\lstset{label=ch7:lst:rhs2d,caption={CUDA kernel implementation for the 2D right hand side.},basicstyle=\scriptsize\ttfamily}
+\lstset{label=ch7:lst:rhs2d,caption={CUDA kernel implementation for the 2D right hand side.}%,basicstyle=\scriptsize\ttfamily
+}
  \begin{lstlisting}
  template <typename value_type, typename size_type>
  __global__ void rhs(value_type const* p    , value_type const* p_surf
  \begin{lstlisting}
  template <typename value_type, typename size_type>
  __global__ void rhs(value_type const* p    , value_type const* p_surf
@@ -676,10 +684,11 @@ where $m$ is one of the scalar functions $\phi,u,w$ describing kinematics, $c$ i
  \begin{figure}[!htb]
  \begin{center}
  \subfigure[Uniform vertical grid.]{
  \begin{figure}[!htb]
  \begin{center}
  \subfigure[Uniform vertical grid.]{
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/lineardispersion_Nx30-HL90-p6-vergrid0_Linear.eps}
+%\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/lineardispersion_Nx30-HL90-p6-vergrid0_Linear.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/lineardispersion_Nx30-HL90-p6-vergrid0_Linear-eps-converted-to.pdf}
  }
  \subfigure[Cosine-clustered vertical grid.]{
  }
  \subfigure[Cosine-clustered vertical grid.]{
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/lineardispersion_Nx30-HL90-p6_Linear.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/lineardispersion_Nx30-HL90-p6_Linear-eps-converted-to.pdf}
  }
  \end{center}
  \caption{The accuracy in phase celerity $c$ determined by \eqref{ch7:errdisp} for small-amplitude (linear) wave.
  }
  \end{center}
  \caption{The accuracy in phase celerity $c$ determined by \eqref{ch7:errdisp} for small-amplitude (linear) wave.
@@ -691,19 +700,19 @@ $N_z\in[6,12]$. Sixth order scheme.}
  \begin{figure}[!htb]
  \begin{center}
  \subfigure[Linear]{
  \begin{figure}[!htb]
  \begin{center}
  \subfigure[Linear]{
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/kinematicsPHI_Nx30-HL90-p6_Linear.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/kinematicsPHI_Nx30-HL90-p6_Linear-eps-converted-to.pdf}
  }
  \subfigure[Linear]{
  }
  \subfigure[Linear]{
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/kinematicsW_Nx30-HL90-p6_Linear.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/kinematicsW_Nx30-HL90-p6_Linear-eps-converted-to.pdf}
  }
  \subfigure[Nonlinear]{
  }
  \subfigure[Nonlinear]{
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/kinematicsPHI_Nx30-HL90-p6_Nonlinear.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/kinematicsPHI_Nx30-HL90-p6_Nonlinear-eps-converted-to.pdf}
  }
  \subfigure[Nonlinear]{
  }
  \subfigure[Nonlinear]{
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/kinematicsW_Nx30-HL90-p6_Nonlinear.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/kinematicsW_Nx30-HL90-p6_Nonlinear-eps-converted-to.pdf}
  }
  \end{center}
  }
  \end{center}
-\caption{Assessment of kinematic error is presented in terms of the depth-averaged error determined by \eqref{ch7:errkin} for a) scalar velocity potential and b) vertical velocity for a small-amplitude (linear) wave, and c) scalar velocity potential and d) vertical velocity for a finite-amplitude (nonlinear) wave with wave height $H/L=90\%(H/L)_\textrm{max}$.
+\caption[Assessment of kinematic error is presented in terms of the depth-averaged error.]{Assessment of kinematic error is presented in terms of the depth-averaged error determined by \eqref{ch7:errkin} for a) scalar velocity potential and b) vertical velocity for a small-amplitude (linear) wave, and c) scalar velocity potential and d) vertical velocity for a finite-amplitude (nonlinear) wave with wave height $H/L=90\%(H/L)_\textrm{max}$.
  $N_z\in[6,12]$. Sixth order scheme. Clustered vertical grid. }
  \label{ch7:figlinear2}
  \end{figure}
  $N_z\in[6,12]$. Sixth order scheme. Clustered vertical grid. }
  \label{ch7:figlinear2}
  \end{figure}
@@ -725,13 +734,13 @@ Previously reported performance results for the wave model can be taken a step f
  \begin{center}
  % MainLaplace2D_ex025nonlinearLaplaceSINGLE.m
  \subfigure[Single precision.]{
  \begin{center}
  % MainLaplace2D_ex025nonlinearLaplaceSINGLE.m
  \subfigure[Single precision.]{
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/PrecisionSINGLE.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/PrecisionSINGLE-eps-converted-to.pdf}
  }
  \subfigure[Double precision.]{
  }
  \subfigure[Double precision.]{
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/PrecisionDOUBLE.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/PrecisionDOUBLE-eps-converted-to.pdf}
  }
  \end{center}
  }
  \end{center}
-\caption{Comparison between convergence histories for single and double precision computations using a PDC method for the solution of the transformed Laplace problem. Very steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0903)$. Discretizaiton based on $(N_x,N_z)=(15,9)$ with 6'$th$ order stencils.}
+\caption[Comparison between convergence histories for single and double precision computations using a PDC method for the solution of the transformed Laplace problem.]{Comparison between convergence histories for single and double precision computations using a PDC method for the solution of the transformed Laplace problem. Very steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0903)$. Discretization based on $(N_x,N_z)=(15,9)$ with $6^{th}$ order stencils.}
  \label{ch7:convhist}
  \end{figure}
  
  \label{ch7:convhist}
  \end{figure}
  
@@ -753,19 +762,19 @@ Results from numerical experiments are presented in figure \ref{ch7:filtering} a
  \begin{center}
  % DriverWavemodelDecomposition.m
  \subfigure[Direct solve without filter.]{
  \begin{center}
  % DriverWavemodelDecomposition.m
  \subfigure[Direct solve without filter.]{
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/ComparisonLUNoFiltering.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/ComparisonLUNoFiltering-eps-converted-to.pdf}
  }
  \subfigure[Direct solve with filter.]{
  }
  \subfigure[Direct solve with filter.]{
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/ComparisonLUWithFiltering.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/ComparisonLUWithFiltering-eps-converted-to.pdf}
  } \\
  \subfigure[Iterative PDC solve without filter.]{
  } \\
  \subfigure[Iterative PDC solve without filter.]{
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/ComparisonDCNoFiltering.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/ComparisonDCNoFiltering-eps-converted-to.pdf}
  }
  \subfigure[Iterative PDC solve with filter.]{
  }
  \subfigure[Iterative PDC solve with filter.]{
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/ComparisonDCWithFiltering.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/ComparisonDCWithFiltering-eps-converted-to.pdf}
  }
  \end{center}
  }
  \end{center}
-\caption{Comparison between accuracy as a function of time for double precision calculations vs. single precision with and without filtering. The double precision result are unfiltered in each comparison and shows to be less sensitive to roundoff-errors. Medium steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0502)$. Discretization is based on $(N_x,N_z)=(30,6)$, A courant number of $C_r=0.5$ and 6'$th$ order stencils.}
+\caption[Comparison between accuracy as a function of time for double precision calculations vs. single precision with and without filtering.]{Comparison between accuracy as a function of time for double precision calculations vs. single precision with and without filtering. The double precision results are unfiltered in each comparison and are shown to be less sensitive to roundoff errors. Medium steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0502)$. Discretization is based on $(N_x,N_z)=(30,6)$, a Courant number of $C_r=0.5$ and $6^{th}$ order stencils.}
  \label{ch7:filtering}
  \end{figure}
  
  \label{ch7:filtering}
  \end{figure}
  
@@ -791,7 +800,7 @@ A harmonic analysis of the wave spectrum at the shoal center line is computed an
      {\scriptsize\input{Chapters/chapter7/figures/WhalinWaveHarmonics_T3_single.tikz}}
      }
  %    \end{center}
      {\scriptsize\input{Chapters/chapter7/figures/WhalinWaveHarmonics_T3_single.tikz}}
      }
  %    \end{center}
-    \caption{Harmonic analysis for the experiment of Whalin for $T=1,2,3\,s$ respectively. Measured experimental and computed results (single precision) are in good agreement. Test environment 1.}\label{ch7:whalinresults}
+    \caption[Harmonic analysis for the experiment of Whalin for $T=1,2,3\,s$ respectively.]{Harmonic analysis for the experiment of Whalin for $T=1,2,3\,s$ respectively. Measured experimental and computed results (single precision) are in good agreement. Test environment 1.}\label{ch7:whalinresults}
  \end{figure}
  
  \subsection{Acceleration via parallelism in time using 'Parareal'}\label{ch7:parareal}\index{parareal}
  \end{figure}
  
  \subsection{Acceleration via parallelism in time using 'Parareal'}\label{ch7:parareal}\index{parareal}
@@ -813,13 +822,15 @@ Ideally, the ratio $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}$ is small an
      \setlength\figureheight{0.35\textwidth}
      \setlength\figurewidth{0.37\textwidth}
      \subfigure[Performance scaling]{
      \setlength\figureheight{0.35\textwidth}
      \setlength\figurewidth{0.37\textwidth}
      \subfigure[Performance scaling]{
-        {\small\input{Chapters/chapter7/figures/PararealScaletestGTX590.tikz}}
+%        {\small\input{Chapters/chapter7/figures/PararealScaletestGTX590.tikz}}
+      \includegraphics[width=0.5\textwidth]{Chapters/chapter7/figures/PararealScaletestGTX590_conv.pdf}
      }
      \subfigure[Speedup]{
      }
      \subfigure[Speedup]{
-        {\small\input{Chapters/chapter7/figures/PararealSpeedupGTX590.tikz}}
+       % {\small\input{Chapters/chapter7/figures/PararealSpeedupGTX590.tikz}}
+ \includegraphics[width=0.5\textwidth]{Chapters/chapter7/figures/PararealSpeedupGTX590_conv.pdf}
      }
      \end{center}
      }
      \end{center}
-    \caption{(a) Parareal absolute timings for an increasingly number of water waves traveling one wave length, each wave resolution is ($33\times 9$). (b) Parareal speedup for two to sixteen compute nodes compared to the purely sequential single GPU solver. Notice how insensitive the parareal scheme is to the size of the problem solved. Test environment 2.}\label{ch7:fig:DDPA_SPEEDUP}
+    \caption[Parareal absolute timings and parareal speedup.]{(a) Parareal absolute timings for an increasing number of water waves traveling one wave length; each wave resolution is ($33\times 9$). (b) Parareal speedup for two to sixteen compute nodes compared to the purely sequential single GPU solver. Notice how insensitive the parareal scheme is to the size of the problem solved. Test environment 2.}\label{ch7:fig:DDPA_SPEEDUP}
  \end{figure}
  %
  
  \end{figure}
  %
  
@@ -838,7 +849,7 @@ Performance results for the Whalin test case have also been reported in figure \
      {\small\input{Chapters/chapter7/figures/WhalinPararealEfficiency.tikz}}
      }
  %    \end{center}
      {\small\input{Chapters/chapter7/figures/WhalinPararealEfficiency.tikz}}
      }
  %    \end{center}
-    \caption{Parallel time integration using the parareal method. $R$ is the ratio between the complexity of the fine and coarse propagators. Test environment 2.}\label{ch7:fig:whalinparareal}
+    \caption[Parallel time integration using the parareal method.]{Parallel time integration using the parareal method. $R$ is the ratio between the complexity of the fine and coarse propagators. Test environment 2.}\label{ch7:fig:whalinparareal}
  \end{figure}
  
  % Comparison with DD
  \end{figure}
  
  % Comparison with DD
@@ -889,10 +900,10 @@ The modified numerical model can still be based on flexible-order finite differe
  \begin{figure}[!htb]
  \begin{center}
  \subfigure[Hydrodynamic force calculations.]{
  \begin{figure}[!htb]
  \begin{center}
  \subfigure[Hydrodynamic force calculations.]{
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/figSeries60CB06Type7Resistance.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/figSeries60CB06Type7Resistance-eps-converted-to.pdf}
  }
  \subfigure[Kelvin pattern.]{
  }
  \subfigure[Kelvin pattern.]{
-\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/figSeries60CB06Type7kelvin.eps}
+\includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/figSeries60CB06Type7kelvin-eps-converted-to.pdf}
  }
  \end{center}
  \caption{Computed results. Comparison with experiments for hydrodynamic force calculations confirming engineering accuracy for low Froude numbers.}
  }
  \end{center}
  \caption{Computed results. Comparison with experiments for hydrodynamic force calculations confirming engineering accuracy for low Froude numbers.}