preface

[book_gpu.git] / BookGPU / Chapters / chapter7 / ch7.tex
diff --git a/BookGPU/Chapters/chapter7/ch7.tex b/BookGPU/Chapters/chapter7/ch7.tex

index c18d417bf1e84650ab1605d1a22e9bb926009c8b..1e0dd3d7495b1eeffaa76ce728832455e79db10f 100644 (file)
--- a/BookGPU/Chapters/chapter7/ch7.tex
+++ b/BookGPU/Chapters/chapter7/ch7.tex
@@ -1,8 +1,8 @@
  
-\chapterauthor{Allan P. Engsig-Karup}{Technical University of Denmark}
-\chapterauthor{Stefan L. Glimberg}{Technical University of Denmark}
-\chapterauthor{Allan S. Nielsen}{Technical University of Denmark}
-\chapterauthor{Ole Lindberg}{Technical University of Denmark}
+\chapterauthor{Allan P. Engsig-Karup, Stefan L. Glimberg, Allan S. Nielsen and Ole Lindberg}{Technical University of Denmark}
+%\chapterauthor{Stefan L. Glimberg}{Technical University of Denmark}
+%\chapterauthor{Allan S. Nielsen}{Technical University of Denmark}
+%\chapterauthor{Ole Lindberg}{Technical University of Denmark}
  
  \chapter{Fast hydrodynamics on heterogeneous many-core hardware}
  \label{ch7}
@@ -327,7 +327,7 @@ Similar results were reported for the first time in the context of high-order Bo
  % MainLaplace2D_ex035_nonlinearLaplace.m
  \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/SFwaves_snapshots_clustered-eps-converted-to.pdf}
  }
-\caption{Numerical experiments to assess stability properties of numerical wave model. In three cases, computed snapshots are taken of the wave elevation over one wave period of time. In a) the grid distribution of nodes in a one-parameter mapping for the grid is illustrated. Results from changes in wave elevation are illustrated for b) a mildly nonlinear standing wave on a highly clustered grid, c) regular stream function wave of medium steepness in shallow water $(kh,H/L)=(0.5,0.0292)$ on a uniform grid ($N_x=80$) and d) for a nonuniform grid with a minimal grid spacing 20 times smaller(!). In every case the step size remains fixed at $\Delta t = T/160$ s corresponding to a Courant number $C_r=c\tfrac{\Delta t}{\Delta x}=0.5$ for the uniform grid. A 6'$th$ order scheme and explicit EKR4 time-stepping is used in each test case.}
+\caption[Numerical experiments to assess stability properties of the numerical wave model.]{Numerical experiments to assess stability properties of the numerical wave model. In three cases, computed snapshots are taken of the wave elevation over one wave period of time. In a) the grid distribution of nodes in a one-parameter mapping for the grid is illustrated. Results from changes in wave elevation are illustrated for b) a mildly nonlinear standing wave on a highly clustered grid, c) a regular stream function wave of medium steepness in shallow water $(kh,H/L)=(0.5,0.0292)$ on a uniform grid ($N_x=80$) and d) for a nonuniform grid with a minimal grid spacing 20 times smaller(!). In every case the step size remains fixed at $\Delta t = T/160$ s corresponding to a Courant number $C_r=c\tfrac{\Delta t}{\Delta x}=0.5$ for the uniform grid. A $6^{th}$ order scheme and explicit EKR4 time-stepping are used in each test case.}
  \label{ch7:numexp}
  \end{figure}
  %\newpage
@@ -384,7 +384,7 @@ The profiles can be reversed by a change of coordinate, i.e. $\Gamma(1-x)$, and
  \includegraphics[width=0.98\textwidth]{Chapters/chapter7/figures/nonlinearwavespenalty-eps-converted-to.pdf}
  % Nx = 540, 6th order, vertical clustering, Nz=6;
  }
-\caption{Snapshots at intervals $T/8$ over one wave period in time of computed a) small-amplitude $(kh,kH)=(0.63,0.005)$ and b) finite-amplitude $(kh,kH)=(1,0.41)$ stream function waves elevations having reached a steady state after transient startup. Combined wave generation and absorption zones in the western relaxation zone of both a) and b). In b) an absorption zone is positioned next to the eastern boundary and causes minor visible reflections. }
+\caption[Snapshots at intervals $T/8$ over one wave period in time.]{Snapshots at intervals $T/8$ over one wave period in time of computed a) small-amplitude $(kh,kH)=(0.63,0.005)$ and b) finite-amplitude $(kh,kH)=(1,0.41)$ stream function wave elevations having reached a steady state after transient startup. Combined wave generation and absorption zones in the western relaxation zone of both a) and b). In b) an absorption zone is positioned next to the eastern boundary and causes minor visible reflections. }
  \label{ch7:figstandwave}
  \end{figure}
  
@@ -424,9 +424,6 @@ Numerical modelling of large ocean areas to account for nonlinear wave-wave inte
  
  The ratio between necessary data transfers and computational work for the proposed numerical model for free surface water waves is high enough to expect reasonable latency hiding. The data domain decomposition method consists of a logically structured division of the computational domain into multiple subdomains. Each of these subdomains is connected via fictitious ghost layers at the artificial boundaries of width corresponding to the half-width of the finite difference stencils employed. This results in a favourable volume-to-boundary ratio as the problem size increases, diminishing communication overhead for message passing. Information between subdomains is exchanged through ghost layers at every step of the iterative PDC method, in connection with the matrix-vector evaluation for the $\sigma$-transformed Laplace problem, and before relaxation steps in the multigrid method. A single global synchronization point occurs at most once each iteration, if convergence is monitored, where a global reduction step (inner product) between all processor nodes takes place. The main advantage of this decomposition strategy is that the decomposition into multiple subdomains is straightforward. However, it comes with the cost of extra data transfers to update the set of fictitious ghost layers.
  
-The parallel domain decomposition solver has been validated against the sequential solvers with respect to algorithmic efficiency to establish that the code produce correct results. An analysis of the numerical efficiency have also been carried out on different GPU systems to identify comparative behaviors as both the problems sizes and number of compute nodes vary. For example, performance scalings on Test environment 1 and Test environment 2 are presented in figure \ref{ch7:fig:multigpuperformance}. The figure confirms that there is only a limited benefit from using multiple GPUs for small problem sizes, since the computational intensity is simply too low to efficiently hide the latency of message passing. A substantial speedup is achieved compared to the single GPU version, while being able to solve even larger systems.
-With the linear scaling of memory requirements and improved computational speed, the methodology based on multiple GPUs makes it possible to simulate water waves in very large numerical wave tanks with improved performance.
-
  \begin{figure}[!htb]
      \setlength\figureheight{0.30\textwidth}
      \setlength\figurewidth{0.33\textwidth}
@@ -438,9 +435,14 @@ With the linear scaling of memory requirements and improved computational speed,
      {\scriptsize\input{Chapters/chapter7/figures/TeslaK20SpeedupGPUvsCPU3D.tikz}}
      }
      \end{center}
-    \caption{Performance timings per PDC iteration as a function of increasing problem size $N$, for single, mixed and double precision arithmetics. Three dimensional nonlinear waves, using $6^{th}$ order finite difference approximations, preconditioned with one multigrid V-cycle and one pre- and post- Red-black Gauss-Seidel smoothing. Speedup compared to fastest known serial implementation. Using Test environment 3. CPU timings represent starting point for our investigations and has been obtained using Fortran 90 code and is based on a single-core run on a Intel Core i7, 2.80GHz processor.}\label{ch7:fig:perftimings}
+    \caption[Performance timings per PDC iteration as a function of increasing problem size $N$, for single, mixed and double precision arithmetics.]{Performance timings per PDC iteration as a function of increasing problem size $N$, for single, mixed and double precision arithmetics. Three dimensional nonlinear waves, using $6^{th}$ order finite difference approximations, preconditioned with one multigrid V-cycle and one pre- and post- Red-black Gauss-Seidel smoothing. Speedup compared to fastest known serial implementation. Using Test environment 3. CPU timings represent the starting point for our investigations and have been obtained using Fortran 90 code and are based on a single-core run on an Intel Core i7, 2.80GHz processor.}\label{ch7:fig:perftimings}
  \end{figure}
  
+The parallel domain decomposition solver has been validated against the sequential solvers with respect to algorithmic efficiency to establish that the code produces correct results. An analysis of the numerical efficiency has also been carried out on different GPU systems to identify comparative behaviors as both the problem sizes and number of compute nodes vary. For example, performance scalings on Test environment 1 and Test environment 2 are presented in figure \ref{ch7:fig:multigpuperformance}. The figure confirms that there is only a limited benefit from using multiple GPUs for small problem sizes, since the computational intensity is simply too low to efficiently hide the latency of message passing. A substantial speedup is achieved compared to the single GPU version, while being able to solve even larger systems.
+With the linear scaling of memory requirements and improved computational speed, the methodology based on multiple GPUs makes it possible to simulate water waves in very large numerical wave tanks with improved performance.
+
+
+
  
  
  \begin{figure}[!htb]
@@ -454,7 +456,7 @@ With the linear scaling of memory requirements and improved computational speed,
      {\scriptsize\input{Chapters/chapter7/figures/TeslaM2050MultiGPUScaling3D.tikz}}
      }
      \end{center}
-    \caption{Domain decomposition performance on multi-GPU systems. Performance timings per PDC iteration as a function of increasing problem sizes using single precision. Same setup as in figure \ref{ch7:fig:perftimings}.}
+    \caption[Domain decomposition performance on multi-GPU systems.]{Domain decomposition performance on multi-GPU systems. Performance timings per PDC iteration as a function of increasing problem sizes using single precision. Same setup as in figure \ref{ch7:fig:perftimings}.}
      \label{ch7:fig:multigpuperformance}
  \end{figure}
  
@@ -535,7 +537,8 @@ __global__ void laplace_sigma_transformed(
         {                       
                 size_type offset_i = i < alpha ? 2*alpha-i : i >= Ns-alpha ? Ns-1-i : alpha;
                 size_type row_i    = offset_i*rank;
-               size_type offset_j = alpha;  // Always centered stencils in x-dir
+    // Always centered stencils in x-dir
+               size_type offset_j = alpha;  
                 size_type row_j    = alpha*rank;
                         
                 value_type dhdx    = hx[j];
@@ -709,7 +712,7 @@ $N_z\in[6,12]$. Sixth order scheme.}
  \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/kinematicsW_Nx30-HL90-p6_Nonlinear-eps-converted-to.pdf}
  }
  \end{center}
-\caption{Assessment of kinematic error is presented in terms of the depth-averaged error determined by \eqref{ch7:errkin} for a) scalar velocity potential and b) vertical velocity for a small-amplitude (linear) wave, and c) scalar velocity potential and d) vertical velocity for a finite-amplitude (nonlinear) wave with wave height $H/L=90\%(H/L)_\textrm{max}$.
+\caption[Assessment of kinematic error is presented in terms of the depth-averaged error.]{Assessment of kinematic error is presented in terms of the depth-averaged error determined by \eqref{ch7:errkin} for a) scalar velocity potential and b) vertical velocity for a small-amplitude (linear) wave, and c) scalar velocity potential and d) vertical velocity for a finite-amplitude (nonlinear) wave with wave height $H/L=90\%(H/L)_\textrm{max}$.
  $N_z\in[6,12]$. Sixth order scheme. Clustered vertical grid. }
  \label{ch7:figlinear2}
  \end{figure}
@@ -737,7 +740,7 @@ Previously reported performance results for the wave model can be taken a step f
  \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/PrecisionDOUBLE-eps-converted-to.pdf}
  }
  \end{center}
-\caption{Comparison between convergence histories for single and double precision computations using a PDC method for the solution of the transformed Laplace problem. Very steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0903)$. Discretizaiton based on $(N_x,N_z)=(15,9)$ with 6'$th$ order stencils.}
+\caption[Comparison between convergence histories for single and double precision computations using a PDC method for the solution of the transformed Laplace problem.]{Comparison between convergence histories for single and double precision computations using a PDC method for the solution of the transformed Laplace problem. Very steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0903)$. Discretization based on $(N_x,N_z)=(15,9)$ with $6^{th}$ order stencils.}
  \label{ch7:convhist}
  \end{figure}
  
@@ -771,7 +774,7 @@ Results from numerical experiments are presented in figure \ref{ch7:filtering} a
  \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/ComparisonDCWithFiltering-eps-converted-to.pdf}
  }
  \end{center}
-\caption{Comparison between accuracy as a function of time for double precision calculations vs. single precision with and without filtering. The double precision result are unfiltered in each comparison and shows to be less sensitive to roundoff-errors. Medium steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0502)$. Discretization is based on $(N_x,N_z)=(30,6)$, A courant number of $C_r=0.5$ and 6'$th$ order stencils.}
+\caption[Comparison between accuracy as a function of time for double precision calculations vs. single precision with and without filtering.]{Comparison between accuracy as a function of time for double precision calculations vs. single precision with and without filtering. The double precision results are unfiltered in each comparison and are shown to be less sensitive to roundoff-errors. Medium steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0502)$. Discretization is based on $(N_x,N_z)=(30,6)$, a Courant number of $C_r=0.5$ and $6^{th}$ order stencils.}
  \label{ch7:filtering}
  \end{figure}
  
@@ -797,7 +800,7 @@ A harmonic analysis of the wave spectrum at the shoal center line is computed an
      {\scriptsize\input{Chapters/chapter7/figures/WhalinWaveHarmonics_T3_single.tikz}}
      }
  %    \end{center}
-    \caption{Harmonic analysis for the experiment of Whalin for $T=1,2,3\,s$ respectively. Measured experimental and computed results (single precision) are in good agreement. Test environment 1.}\label{ch7:whalinresults}
+    \caption[Harmonic analysis for the experiment of Whalin for $T=1,2,3\,s$ respectively.]{Harmonic analysis for the experiment of Whalin for $T=1,2,3\,s$ respectively. Measured experimental and computed results (single precision) are in good agreement. Test environment 1.}\label{ch7:whalinresults}
  \end{figure}
  
  \subsection{Acceleration via parallelism in time using 'Parareal'}\label{ch7:parareal}\index{parareal}
@@ -817,17 +820,17 @@ Ideally, the ratio $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}$ is small an
  \begin{figure}[!htb]
      \begin{center}
      \setlength\figureheight{0.35\textwidth}
-    \setlength\figurewidth{0.37\textwidth}
+    \setlength\figurewidth{0.35\textwidth}
      \subfigure[Performance scaling]{
  %        {\small\input{Chapters/chapter7/figures/PararealScaletestGTX590.tikz}}
-      \includegraphics[width=0.5\textwidth]{Chapters/chapter7/figures/PararealScaletestGTX590_conv.pdf}
+      \includegraphics[width=0.47\textwidth]{Chapters/chapter7/figures/PararealScaletestGTX590_conv.pdf}
      }
      \subfigure[Speedup]{
         % {\small\input{Chapters/chapter7/figures/PararealSpeedupGTX590.tikz}}
- \includegraphics[width=0.5\textwidth]{Chapters/chapter7/figures/PararealSpeedupGTX590_conv.pdf}
+ \includegraphics[width=0.47\textwidth]{Chapters/chapter7/figures/PararealSpeedupGTX590_conv.pdf}
      }
      \end{center}
-    \caption{(a) Parareal absolute timings for an increasingly number of water waves traveling one wave length, each wave resolution is ($33\times 9$). (b) Parareal speedup for two to sixteen compute nodes compared to the purely sequential single GPU solver. Notice how insensitive the parareal scheme is to the size of the problem solved. Test environment 2.}\label{ch7:fig:DDPA_SPEEDUP}
+    \caption[Parareal absolute timings and parareal speedup.]{(a) Parareal absolute timings for an increasing number of water waves traveling one wave length, each wave resolution is ($33\times 9$). (b) Parareal speedup for two to sixteen compute nodes compared to the purely sequential single GPU solver. Notice how insensitive the parareal scheme is to the size of the problem solved. Test environment 2.}\label{ch7:fig:DDPA_SPEEDUP}
  \end{figure}
  %
  
@@ -846,7 +849,7 @@ Performance results for the Whalin test case have also been reported in figure \
      {\small\input{Chapters/chapter7/figures/WhalinPararealEfficiency.tikz}}
      }
  %    \end{center}
-    \caption{Parallel time integration using the parareal method. $R$ is the ratio between the complexity of the fine and coarse propagators. Test environment 2.}\label{ch7:fig:whalinparareal}
+    \caption[Parallel time integration using the parareal method.]{Parallel time integration using the parareal method. $R$ is the ratio between the complexity of the fine and coarse propagators. Test environment 2.}\label{ch7:fig:whalinparareal}
  \end{figure}
  
  % Comparison with DD