ch17

[book_gpu.git] / BookGPU / Chapters / chapter7 / ch7.tex
diff --git a/BookGPU/Chapters/chapter7/ch7.tex b/BookGPU/Chapters/chapter7/ch7.tex

index f9f7fd43dce5afb53b36a92a3571adc7456caa33..e084f01a0e514e08a28a7a54521401e30064c70f 100644 (file)
--- a/BookGPU/Chapters/chapter7/ch7.tex
+++ b/BookGPU/Chapters/chapter7/ch7.tex
@@ -37,7 +37,7 @@ A key problem is that improvements in performance require porting legacy codes\f
  %However, increasing amounts of applications are utilizing accelerators to parts of their code to gain speedups albeit with less dramatic improvements of performance as one can potentially find by adapting most, if not all of the application to modern hardware.
  
  In this work, we explore some of these trends by developing, by bottom-up-design, a water-wave model which can be utilized in maritime engineering and with the intended use on affordable office desktops as well as on more expensive modern compute clusters for engineering analysis purposes.  
  %However, increasing amounts of applications are utilizing accelerators to parts of their code to gain speedups albeit with less dramatic improvements of performance as one can potentially find by adapting most, if not all of the application to modern hardware.
  
  In this work, we explore some of these trends by developing, by bottom-up-design, a water-wave model which can be utilized in maritime engineering and with the intended use on affordable office desktops as well as on more expensive modern compute clusters for engineering analysis purposes.  
-
+\clearpage
  \section{On modeling paradigms for highly nonlinear and dispersive water waves}
  \label{ch7:sec:modernwavemodellingparadigms}
  
  \section{On modeling paradigms for highly nonlinear and dispersive water waves}
  \label{ch7:sec:modernwavemodellingparadigms}
  
@@ -200,7 +200,7 @@ From a numerical point of view, an efficient and scalable discretization strateg
  
  We present scalability and performance tests based on the same two test environments outlined in Chapter \ref{ch5}, Section \ref{ch5:sec:testenvironments}, plus a fourth test environment based on the most recent hardware generation:
  \begin{description}
  
  We present scalability and performance tests based on the same two test environments outlined in Chapter \ref{ch5}, Section \ref{ch5:sec:testenvironments}, plus a fourth test environment based on the most recent hardware generation:
  \begin{description}
-\item[Test environment 4.] Desktop with dual-socket Sandy Bridge Intel Xeon E5-2670 (2.60 GHz) processors, 64GB RAM, 2x Nvidia Tesla K20 GPUs.
+\item[Test environment 4.] Desktop with dual-socket Sandy Bridge Intel Xeon E5-2670 (2.60 GHz) processors, 64GB RAM, 2x NVIDIA Tesla K20 GPUs.
  \end{description}
  Performance results can be used to predict actual runtimes as described in \cite{ch7:EngsigKarupEtAl2011}, e.g., for estimation of whether a real-time constraint for a given problem size can be met.
  
  \end{description}
  Performance results can be used to predict actual runtimes as described in \cite{ch7:EngsigKarupEtAl2011}, e.g., for estimation of whether a real-time constraint for a given problem size can be met.
  
@@ -359,7 +359,7 @@ Subtracting $g^n$ in \eqref{ch7:eq:discreteupdate} and dividing by a pseudo time
  \frac{g^{*,n+1}-g^n}{\tau} =\frac{(1-\Gamma)}{\tau} (g_e^n-g^n).
  \end{align}
  %
  \frac{g^{*,n+1}-g^n}{\tau} =\frac{(1-\Gamma)}{\tau} (g_e^n-g^n).
  \end{align}
  %
-The first term is similar to a first-order accurate Forward Euler\index{forward Euler} approximation of a rate of change term. This motivates an {\em embedded penalty forcing technique} based on adding a correction term of the form
+The first term is similar to a first-order accurate Forward Euler\index{Euler!forward Euler} approximation of a rate of change term. This motivates an {\em embedded penalty forcing technique} based on adding a correction term of the form
  %
  \begin{align}\label{ch7:eq:penalty}
  \partial_t g = \mathcal{N}(g) + \frac{1-\Gamma(x)}{\tau} (g_e(t,x)-g(t,x)), \quad {\bf x}\in\Omega_\Gamma,
  %
  \begin{align}\label{ch7:eq:penalty}
  \partial_t g = \mathcal{N}(g) + \frac{1-\Gamma(x)}{\tau} (g_e(t,x)-g(t,x)), \quad {\bf x}\in\Omega_\Gamma,
@@ -761,7 +761,7 @@ Elaborating on this example, we examine the propagation of a regular stream func
  \label{ch7:filter}
  \mathcal{F}u(x_i) = \sum_{n=-\alpha}^{\alpha} c_n u(x_{i+n}),
  \end{align}
  \label{ch7:filter}
  \mathcal{F}u(x_i) = \sum_{n=-\alpha}^{\alpha} c_n u(x_{i+n}),
  \end{align}
-where $c_n\in\mathbb{R}$ are the stencil coefficients and $\alpha\in\mathbb{Z}_+$ is the stencil half-width. An active filter can for example be based on employing a Savitzky-Golay smoothening filter \cite{ch7:PT90}, e.g., the mild 7-point SG(6,10) filter, and applying it after every 10th time step to each of the collocation nodes defining the free surface variables. The same procedure can be used for stabilization of nonlinear simulations to remove high-frequency "saw-tooth" instabilities as shown in \cite{ch7:EBL08}. This filtering technique can also remove high-frequency noise resulting from round-off errors in computations that would otherwise potentially pollute the computational results and in the worst case leave them useless. The effect of this type of filtering on the numerical efficiency of the model is insignificant.
+where $c_n\in\mathbb{R}$ are the stencil coefficients and $\alpha\in\mathbb{Z}_+$ is the stencil half-width. An active filter can for example be based on employing a Savitzky-Golay smoothening filter \cite{ch7:PT90}, e.g., the mild 7-point SG(6,10) filter, and applying it after every 10th time step to each of the collocation nodes defining the free surface variables. The same procedure can be used for stabilization of nonlinear simulations to remove high-frequency ``saw-tooth'' instabilities as shown in \cite{ch7:EBL08}. This filtering technique can also remove high-frequency noise resulting from round-off errors in computations that would otherwise potentially pollute the computational results and in the worst case leave them useless. The effect of this type of filtering on the numerical efficiency of the model is insignificant.
  
  Results from numerical experiments are presented in Figure \ref{ch7:filtering}, and most of the errors can be attributed to phase errors resulting from difference in exact versus numerical phase speed. In numerical experiments, we find that while results computed in double-precision are not significantly affected by accumulation of round-off errors, the single-precision results are. In Figures \ref{ch7:filtering} (a) and (b), a direct solver based on sparse Gaussian elimination within MATLAB\footnote{\url{http://www.mathworks.com}.} is used to solve the linear system at every stage and a comparison is made between single- and unfiltered double-precision calculations. It is shown in Figure \ref{ch7:filtering} a) that without a filter, the single-precision calculations result in ``blow-up'' after which the solver fails just before 50 wave periods of calculation time. However, in Figure \ref{ch7:filtering} (b) it is demonstrated that invoking a smoothening filter, cf. \eqref{ch7:filter}, stabilizes the accumulation of round-off errors and the calculations continue and achieve reduced accuracy compared to the computed double-precision results. Thus, it is confirmed that such a filter can be used to control and suppress high-frequency oscillations that results from accumulation of round-off errors. In contrast, replacing the direct solver with an iterative PDC method using the GPU-accelerated wave model appears to be much more attractive upon inspection of Figures \ref{ch7:filtering} (c) and (d). The single-precision results are found to be stable with and {\em without} the filter-based strategy for this problem. The calculations show that single-precision math leads to slightly faster error accumulation for this choice of resolution, however, with only small differences in error level during long time integration. This highlights that fault-tolerance of the iterative PDC method contributes to securing robustness of the calculations.
  
  
  Results from numerical experiments are presented in Figure \ref{ch7:filtering}, and most of the errors can be attributed to phase errors resulting from difference in exact versus numerical phase speed. In numerical experiments, we find that while results computed in double-precision are not significantly affected by accumulation of round-off errors, the single-precision results are. In Figures \ref{ch7:filtering} (a) and (b), a direct solver based on sparse Gaussian elimination within MATLAB\footnote{\url{http://www.mathworks.com}.} is used to solve the linear system at every stage and a comparison is made between single- and unfiltered double-precision calculations. It is shown in Figure \ref{ch7:filtering} a) that without a filter, the single-precision calculations result in ``blow-up'' after which the solver fails just before 50 wave periods of calculation time. However, in Figure \ref{ch7:filtering} (b) it is demonstrated that invoking a smoothening filter, cf. \eqref{ch7:filter}, stabilizes the accumulation of round-off errors and the calculations continue and achieve reduced accuracy compared to the computed double-precision results. Thus, it is confirmed that such a filter can be used to control and suppress high-frequency oscillations that results from accumulation of round-off errors. In contrast, replacing the direct solver with an iterative PDC method using the GPU-accelerated wave model appears to be much more attractive upon inspection of Figures \ref{ch7:filtering} (c) and (d). The single-precision results are found to be stable with and {\em without} the filter-based strategy for this problem. The calculations show that single-precision math leads to slightly faster error accumulation for this choice of resolution, however, with only small differences in error level during long time integration. This highlights that fault-tolerance of the iterative PDC method contributes to securing robustness of the calculations.
  
@@ -789,7 +789,8 @@ Last, we demonstrate using a classical benchmark for propagation of nonlinear wa
  %
  
  A harmonic analysis of the wave spectrum at the shoal center line is computed and plotted in Figure \ref{ch7:whalinresults} for comparison with the analogous results obtained from the experiments data. The three harmonic amplitudes are computed via a Fast Fourier Transform (FFT) method using the last three wave periods up to $t=50\,$s. There is a satisfactory agreement between the computed and experimental results and no noticeable loss in accuracy resulting from the use of single-precision math.
  %
  
  A harmonic analysis of the wave spectrum at the shoal center line is computed and plotted in Figure \ref{ch7:whalinresults} for comparison with the analogous results obtained from the experiments data. The three harmonic amplitudes are computed via a Fast Fourier Transform (FFT) method using the last three wave periods up to $t=50\,$s. There is a satisfactory agreement between the computed and experimental results and no noticeable loss in accuracy resulting from the use of single-precision math.
-%
+
+\pagebreak
  \begin{figure}[!htb]
      \setlength\figureheight{0.3\textwidth}
      \setlength\figurewidth{0.32\textwidth}
  \begin{figure}[!htb]
      \setlength\figureheight{0.3\textwidth}
      \setlength\figurewidth{0.32\textwidth}
@@ -846,8 +847,8 @@ We have performed a scalability study for parareal using 2D nonlinear stream fun
  Performance results for the Whalin test case are also shown in Figure \ref{ch7:fig:whalinparareal}. There is a natural limitation to how much we can increase $R$ (the ratio between the complexity of the fine and coarse propagators), because of stability issues with the coarse propagator. In this test case we simulate from $t=[0,1]$s, using up to $32$ GPUs. For low $R$ and only two GPUs, there is no speedup gain, but for the configuration with eight or more GPUs and $R\geq6$, we are able to get more than $2$ times speedup. Though these hyperbolic systems are not optimal for performance tuning using the parareal method, results still confirm that reasonable speedups are in fact possible on heterogenous systems.
  
  \begin{figure}[!htb]
  Performance results for the Whalin test case are also shown in Figure \ref{ch7:fig:whalinparareal}. There is a natural limitation to how much we can increase $R$ (the ratio between the complexity of the fine and coarse propagators), because of stability issues with the coarse propagator. In this test case we simulate from $t=[0,1]$s, using up to $32$ GPUs. For low $R$ and only two GPUs, there is no speedup gain, but for the configuration with eight or more GPUs and $R\geq6$, we are able to get more than $2$ times speedup. Though these hyperbolic systems are not optimal for performance tuning using the parareal method, results still confirm that reasonable speedups are in fact possible on heterogenous systems.
  
  \begin{figure}[!htb]
-    \setlength\figureheight{0.3\textwidth}
-    \setlength\figurewidth{0.32\textwidth}
+    \setlength\figureheight{0.29\textwidth}
+    \setlength\figurewidth{0.29\textwidth}
  %    \begin{center}
      \subfigure[Speedup]{
      {\small\input{Chapters/chapter7/figures/WhalinPararealSpeedup.tikz}}
  %    \begin{center}
      \subfigure[Speedup]{
      {\small\input{Chapters/chapter7/figures/WhalinPararealSpeedup.tikz}}
@@ -930,5 +931,5 @@ We anticipate that a tool based on the proposed parallel solution strategies wil
  \section{Acknowledgments}
  
  This work was supported by grant no. 09-070032 from the Danish Research Council for Technology and Production Sciences. A special thank goes to Professor Jan S. Hesthaven for supporting parts of this work. Scalability and performance tests was done in the GPUlab at DTU Informatics, Technical University of Denmark and using the GPU-cluster at Center for Computing and Visualization, Brown University, USA. NVIDIA Corporation is acknowledged for generous hardware donations to facilities of the GPUlab.
  \section{Acknowledgments}
  
  This work was supported by grant no. 09-070032 from the Danish Research Council for Technology and Production Sciences. A special thank goes to Professor Jan S. Hesthaven for supporting parts of this work. Scalability and performance tests was done in the GPUlab at DTU Informatics, Technical University of Denmark and using the GPU-cluster at Center for Computing and Visualization, Brown University, USA. NVIDIA Corporation is acknowledged for generous hardware donations to facilities of the GPUlab.
-
+\clearpage
  \putbib[Chapters/chapter7/biblio7]
  \putbib[Chapters/chapter7/biblio7]