From: couturie <couturie@extinction>
Date: Sat, 3 Aug 2013 14:19:37 +0000 (+0200)
Subject: new ch5 reread
X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/fd66513f28a83144dbb902570a63fc94787d95d2

new ch5 reread
---

diff --git a/BookGPU/Chapters/chapter5/biblio5.bib b/BookGPU/Chapters/chapter5/biblio5.bib
index 04267a1..f0d230b 100644
--- a/BookGPU/Chapters/chapter5/biblio5.bib
+++ b/BookGPU/Chapters/chapter5/biblio5.bib
@@ -3,7 +3,7 @@
 
 @INCOLLECTION{ch5:Bajrovic2011,
   author = {Bajrovic, E. and Traff, J.},
-  title = {Using MPI Derived Datatypes in Numerical Libraries},
+  title = {Using {MPI} Derived Datatypes in Numerical Libraries},
   booktitle = {Recent Advances in the Message Passing Interface},
   publisher = {Springer Berlin / Heidelberg},
   year = {2011},
@@ -24,13 +24,11 @@
 
 @ARTICLE{ch5:Bell2011,
   author = {N. Bell and J. Hoberock},
-  title = {Thrust: A Productivity-Oriented Library for CUDA},
-  journal = {in GPU Computing Gems, Jade Edition, Edited by Wen-mei W. Hwu},
+  title = {Thrust: A Productivity-Oriented Library for {CUDA}},
+  journal = {In GPU Computing Gems, {\rm Jade Edition, Edited by Wen-mei W. Hwu. Elsevier Science}},
   year = {2011},
   volume = {2},
   pages = {359-371},
-  owner = {slgl},
-  timestamp = {2012.09.12}
 }
 
 @ARTICLE{ch5:Engsig-Karup2011,
@@ -45,7 +43,7 @@
 }
 
 @BOOK{ch5:Gamma1995,
-  title = {Design Patterns - Elements of Reusable Object-Oriented Software},
+  title = {Design Patterns--Elements of Reusable Object-Oriented Software},
   publisher = {Addison-Wesley Professional Computing Series},
   year = {1995},
   author = {E. Gamma and R. Helm and R. Johnson and J. Vlissides},
@@ -76,10 +74,10 @@
 
 @INCOLLECTION{ch5:Hoefler2011,
   author = {Hoefler, T. and Snir, M.},
-  title = {Writing Parallel Libraries with MPI - Common Practice, Issues, and
+  title = {Writing Parallel Libraries with {MPI}--{C}ommon Practice, Issues, and
 	Extensions},
   booktitle = {Recent Advances in the Message Passing Interface},
-  publisher = {Springer Berlin / Heidelberg},
+  publisher = {Springer, Berlin/Heidelberg},
   year = {2011},
   editor = {Cotronis, Y. and Danalis, A. and Nikolopoulos, D.
 	and Dongarra, J.},
@@ -96,9 +94,9 @@
 }
 
 @BOOK{ch5:LeVeque2007,
-  title = {Finite difference methods for ordinary and partial differential equations
-	- steady-state and time-dependent problems},
+  title = {Finite Difference Methods for Ordinary and Partial Differential Equations--Steady-state and Time-dependent Problems},
   publisher = {SIAM},
+  address = {Philadelphia, PA},
   year = {2007},
   author = {R. J. LeVeque},
   pages = {I-XV, 1-341},
@@ -109,13 +107,12 @@
 
 @TECHREPORT{ch5:Skjellum1994,
   author = {A. Skjellum and N. E. Doss and P. V. Bangaloret},
-  title = {Writing Libraries in MPI},
+  title = {Writing Libraries in {MPI}},
   institution = {Department of Computer Science and NSF Engineering Research Center
 	for Computational Fiels Simulation. Mississippi State University},
   year = {1994},
   file = {Skjellum1994.pdf:Skjellum1994.pdf:PDF},
-  owner = {slgl},
-  timestamp = {2012.09.10}
+  type = {Technical {R}eport},
 }
 
 @BOOK{ch5:Smith1996,
@@ -135,7 +132,6 @@
   year = {2002},
   author = {D. Vandevoorde and N. M. Josuttis},
   pages = {552},
-  edition = {1st},
   month = {November},
   owner = {slgl},
   timestamp = {2011.07.15},
@@ -177,28 +173,12 @@
 }
 
 @techreport{ch5:Asanovic:EECS-2006-183,
-  Author = {Asanovic, K. and Bodik, R. and Catanzaro, B. C. and Gebis, Joseph, J. and Husbands, P. and Keutzer, K. and Patterson, D. A. and Plishker, W. L. and Shalf, J. and Williams, S. W. and Yelick, K. A.},
-  Title = {The Landscape of Parallel Computing Research: A View from Berkeley},
+  Author = {Asanovic, K. and Bodik, R. and Catanzaro, B. C. and Gebis, J. J. and Husbands, P. and Keutzer, K. and Patterson, D. A. and Plishker, W. L. and Shalf, J. and Williams, S. W. and Yelick, K. A.},
+  Title = {The Landscape of Parallel Computing Research: A View from {B}erkeley},
   Institution = {EECS Department, University of California, Berkeley},
   Year = {2006},
   Month = {Dec},
   Number = {UCB/EECS-2006-183},
-  Abstract = {The recent switch to parallel microprocessors is a milestone in the history of computing. Industry has laid out a roadmap for multicore designs that preserves the programming paradigm of the past via binary compatibility and cache coherence. Conventional wisdom is now to double the number of cores on a chip with each silicon generation.
-A multidisciplinary group of Berkeley researchers met nearly two years to discuss this change. Our view is that this evolutionary approach to parallel hardware and software may work from 2 or 8 processor systems, but is likely to face diminishing returns as 16 and 32 processor systems are realized, just as returns fell with greater instruction-level parallelism.
-We believe that much can be learned by examining the success of parallelism at the extremes of the computing spectrum, namely embedded computing and high performance computing. This led us to frame the parallel landscape with seven questions, and to recommend the following:
-<ul>
-<li>The overarching goal should be to make it easy to write programs that execute efficiently on highly parallel computing systems
-<li>The target should be 1000s of cores per chip, as these chips are built from processing elements that are the most efficient in MIPS (Million Instructions per Second) per watt, MIPS per area of silicon, and MIPS per development dollar.
-<li>Instead of traditional benchmarks, use 13 "Dwarfs" to design and evaluate parallel programming models and architectures. (A dwarf is an algorithmic method that captures a pattern of computation and communication.)
-<li>"Autotuners" should play a larger role than conventional compilers in translating parallel programs.
-<li>To maximize programmer productivity, future programming models must be more human-centric than the conventional focus on hardware or applications.
-<li>To be successful, programming models should be independent of the number of processors.
-<li>To maximize application efficiency, programming models should support a wide range of data types and successful models of parallelism: task-level parallelism, word-level parallelism, and bit-level parallelism.
-<li>Architects should not include features that significantly affect performance or energy if programmers cannot accurately measure their impact via performance counters and energy counters.
-<li>Traditional operating systems will be deconstructed and operating system functionality will be orchestrated using libraries and virtual machines.
-<li>To explore the design space rapidly, use system emulators based on Field Programmable Gate Arrays (FPGAs) that are highly scalable and low cost.
-</ul>
-Since real world applications are naturally parallel and hardware is naturally parallel, what we need is a programming model, system software, and a supporting architecture that are naturally parallel. Researchers have the rare opportunity to re-invent these cornerstones of computing, provided they simplify the efficient programming of highly parallel systems.}
 }
 
 @article{ch5:mooreslaw1965,
@@ -216,20 +196,20 @@ Since real world applications are naturally parallel and hardware is naturally p
   author = "A. Kloeckner, T. Warburton and J. S. Hesthaven",
   institution = "Scientific Computing Group, Brown University",
   number = "2011-13",
-  address = "Providence, RI, USA",
+  address = "Providence, RI",
   year = "2011",
   month = jun,
 }
 
 @book{ch5:Ferziger1996,
-  title={Computational methods for fluid dynamics},
+  title={Computational Methods for Fluid Dynamics},
   author={Ferziger, J.H. and Peri{\'c}, M.},
   isbn={9783540594345},
   lccn={98231766},
-  series={Numerical methods: Research and development},
+  series={Numerical Methods: Research and Development},
   url={http://books.google.dk/books?id=SJkeAQAAIAAJ},
   year={1996},
-  publisher={Springer-Verlag}
+  publisher={Springer-Verlag, Berlin Heidelberg New York}
 }
 
 @book{ch5:chorin1993,
@@ -240,9 +220,11 @@ Since real world applications are naturally parallel and hardware is naturally p
   series={Texts in Applied Mathematics},
   url={http://books.google.dk/books?id=0Iglq1WA5PQC},
   year={1993},
-  publisher={Springer}
+  publisher={Springer, New York, NY},
+  edition={3rd},  
 }
 
+
 @book{ch5:Saad2003,
  author = {Saad, Y.},
  title = {Iterative Methods for Sparse Linear Systems},
@@ -250,7 +232,7 @@ Since real world applications are naturally parallel and hardware is naturally p
  isbn = {0898715342},
  edition = {2nd},
  publisher = {Society for Industrial and Applied Mathematics},
- address = {Philadelphia, PA, USA},
+ address = {Philadelphia, PA},
 }
 
 @book{ch5:Kelley1995,
@@ -261,13 +243,13 @@ Since real world applications are naturally parallel and hardware is naturally p
   series={Frontiers in Applied Mathematics Series},
   url={http://books.google.dk/books?id=3J4XEAooQOoC},
   year={1995},
-  publisher={Society for Industrial and Applied Mathematics (SIAM, 3600 Market Street, Floor 6, Philadelphia, PA 19104)}
+  publisher={Society for Industrial and Applied Mathematics, Philadelphia, PA,}
 }
 
 @techreport{ch5:YMTR08,
 	author = {Y. Maday},
 	title = {The parareal in time algorithm},
-	institution = {Universite Pierr\'{e} et Marie Curie},
+	institution = {Universite Pierr\'{e} et Marie Curie, Paris},
 	year = {2008},
 	type = {Technical Report},
 	number = {R08030}
@@ -276,7 +258,7 @@ Since real world applications are naturally parallel and hardware is naturally p
 @article{ch5:MS07,
 	author = {M. Gander and S. Vandewalle},
 	title = {Analysis of the parareal time-parallel time-integration method},
-	journal = {SIAM Journal of scientific computing},
+	journal = {SIAM Journal of Scientific Computing},
 	year = {2007},
 	volume = {29},
 	number = {2},
@@ -286,25 +268,31 @@ Since real world applications are naturally parallel and hardware is naturally p
 @article{ch5:LMT01,
 	author = {J.-L. Lions and Y. Maday and G. Turinici},
 	title = {R\'{e}solution d'EDP par un sch\'{e}ma en temps parar\'{e}el},
-	journal = {C.R. Acad Sci. Paris S\'{e}r. I math},
+	journal = {C.R. Acad. Sci. Paris S\'{e}r. I Math},
 	year = {2001},
 	volume = {332},
 	pages = {661-668}
 }
 
 @article{ch5:LSY02,
-	author = {L. Baffico and S. Bernard and Y. Maday and G. Turinici and G. Z\'{e}rah},
-	title = {Parallel in time molecular dynamics simulations},
-	journal = {Physical Review E.},
-	year = {2002},
-	volume = {66},
-	number = {057701}
+  title = {Parallel-in-time molecular-dynamics simulations},
+  author = {Baffico, L. and Bernard, S. and Maday, Y. and Turinici, G. and Z\'erah, G.},
+  journal = {Phys. Rev. E},
+  volume = {66},
+  issue = {5},
+  pages = {057701},
+  numpages = {4},
+  year = {2002},
+  month = {Nov},
+  doi = {10.1103/PhysRevE.66.057701},
+  url = {http://link.aps.org/doi/10.1103/PhysRevE.66.057701},
+  publisher = {American Physical Society}
 }
 
 @mastersthesis{ch5:ASNP12,
 	author = {A. S. Nielsen},
 	title = {Feasibility study of the Parareal algorithm},
-	school = {Technical University of Denmark, Department of Informatics and Mathematical Modeling},
+	school = {Technical University of Denmark, Department of Informatics and Mathematical Modeling, Lyngby},
 	year = {2012},
 	type = {Master Thesis}
 }
@@ -320,19 +308,21 @@ Since real world applications are naturally parallel and hardware is naturally p
 
 @BOOK{ch5:Barrett1994,
   AUTHOR = {R. Barrett and M. Berry and T. F. Chan and J. Demmel and J. Donato and J. Dongarra and V. Eijkhout and R. Pozo and C. Romine and H. Van der Vorst },
-  TITLE = {Templates for the Solution of Linear Systems: Building Blocks for Iterative Methods, 2nd Edition},
+  TITLE = {Templates for the Solution of Linear Systems: Building Blocks for Iterative Methods},
+  edition = {2nd},  
   PUBLISHER = {SIAM},
   YEAR = {1994},
   ADDRESS = {Philadelphia, PA}
 }
 
 @TECHREPORT{ch5:ScientificGrandChallenges2010,
-  author = {D. L. Brown and P. Messina et. al},
+  author = {D. L. Brown, P. Messina et. al},
   title = {Scientific Grand Challenges, Crosscutting technologies for computing at the exascale},
   institution = {U.S. Department of Energy},
   year = {2010},
   month = {February},
   address = {Washington, D.C.},
+  type = {Technical {R}eport},  
 }
 
 @article{ch5:Keyes2011,
@@ -366,7 +356,7 @@ masid = {49649121}
 
 @misc{ch5:cudaguide,
    author = {{NVIDIA Corporation}},
-   title = {CUDA C Programming Guide},
+   title = {{CUDA C} {P}rogramming {G}uide},
    publisher = {NVIDIA Corporation},
    year = {2012},
    url = {http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html}
@@ -374,7 +364,7 @@ masid = {49649121}
 
 @misc{ch5:cudapractice,
    author = {{NVIDIA Corporation}},
-   title = {CUDA C Best Practices Guide},
+   title = {\rm {CUDA C} {B}est {P}ractices {G}uide},
    publisher = {NVIDIA Corporation},
    year = {2012},
    url = {http://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html}
@@ -390,7 +380,7 @@ masid = {49649121}
  location = {Portland, Oregon},
  doi = {http://doi.acm.org/10.1145/1654059.1654078},
  publisher = {ACM},
- address = {New York, NY, USA},
+ address = {New York, NY},
  }
 
 @book{ch5:Kirk2010,
@@ -398,9 +388,8 @@ masid = {49649121}
  title = {Programming Massively Parallel Processors: A Hands-on Approach},
  year = {2010},
  isbn = {0123814723, 9780123814722},
- edition = {1st},
  publisher = {Morgan Kaufmann Publishers Inc.},
- address = {San Francisco, CA, USA},
+ address = {San Francisco, CA},
 }
 
 @book{ch5:Trottenberg2001,
@@ -410,5 +399,6 @@ masid = {49649121}
   lccn={00103940},
   url={http://books.google.dk/books?id=9ysyNPZoR24C},
   year={2001},
-  publisher={Academic Press}
+  publisher={Elsevier Academic Press},
+  address={London}
 }
diff --git a/BookGPU/Chapters/chapter5/ch5.tex b/BookGPU/Chapters/chapter5/ch5.tex
index 8ee7775..f7eabb4 100644
--- a/BookGPU/Chapters/chapter5/ch5.tex
+++ b/BookGPU/Chapters/chapter5/ch5.tex
@@ -1,12 +1,9 @@
-\chapterauthor{Stefan L. Glimberg, Allan P. Engsig-Karup, Allan S. Nielsen and Bernd Dammann}{Technical University of Denmark}
+\chapterauthor{Stefan L. Glimberg, Allan P. Engsig-Karup, Allan S. Nielsen, and Bernd Dammann}{Technical University of Denmark}
 %\chapterauthor{Allan P. Engsig-Karup}{Technical University of Denmark}
 %\chapterauthor{Allan S. Nielsen}{Technical University of Denmark}
 %\chapterauthor{Bernd Dammann}{Technical University of Denmark}
 
-
-
-\chapter{Development of software components for heterogeneous many-core architectures}\label{ch5}
-
+\chapter[Software components for heterogeneous manycore architectures]{Development of software components for heterogeneous manycore architectures}\label{ch5}
 
 %Subjects:
 %\begin{itemize}
@@ -15,7 +12,6 @@
 %\item efficient and scalable iterative methods for solution of high-order numerical methods and strategies for efficient implementations on desktop architectures.
 %\end{itemize}
 
-
 \section{Software development for heterogeneous architectures}
 %Our library facilitates massively parallelization through GPU computing and contains components for various iterative strategies such as DC, CG, CGNR, BiCGSTAB and GMRES(m) for solution of large linear systems along with support for preconditioning strategies. The goal is to create a reusable library and framework which provide general components with performance similar to that of a dedicated solver. Preliminary results show that performance overhead can be kept minimal in this new software framework [8].
 
@@ -29,25 +25,25 @@
 
 % there is a price/penalty to pay when using libraries, i.e. one doesn't necessarily get the best performance, because the architectural details are not visible to the programmer (keyword: Visibility  [Berkeley dwarf paper]).  However, if the library permits to write own add-ons/kernels (flexibility), this is no longer an issue.
 
-Massively parallel processors, such as graphical processing units (GPUs), have in recent years proven to be effective for a vast amount of scientific applications. Today, most desktop computers are equipped with one or more powerful GPUs, offering heterogeneous high-performance computing to a broad range of scientific researchers and software developers. Though GPUs are now programmable and can be highly effective compute units, they still pose challenges for software developers to fully utilize their efficiency. Sequential legacy codes are not always easily parallelized, and the time spent on conversion might not pay off in the end. This is particular true for heterogenous computers, where the architectural differences between the main- and co-processor can be so significant, that they call for completely different optimization strategies. The cache hierarchy management of CPUs and GPUs being an evident example hereof. In the past, industrial companies were able to boost application performance solely by upgrading their hardware systems, with an overt balance between investment and performance speedup. Today, the picture is different, not only do they have to invest in new hardware, but also account for the adaption and training of their software developers. What traditionally used to be a hardware problem, addressed by the chip manufacturers, has now become a software problem for application developers.
+Massively parallel processors, such as graphical processing units (GPUs), have in recent years proven to be effective for a vast amount of scientific applications. Today, most desktop computers are equipped with one or more powerful GPUs, offering heterogeneous high-performance computing to a broad range of scientific researchers and software developers. Though GPUs are now programmable and can be highly effective computing units, they still pose challenges for software developers to fully utilize their efficiency. Sequential legacy codes are not always easily parallelized, and the time spent on conversion might not pay off in the end. This is particular true for heterogenous computers, where the architectural differences between the main and coprocessor can be so significant that they require completely different optimization strategies. The cache hierarchy management of CPUs and GPUs are an evident example hereof. In the past, industrial companies were able to boost application performance solely by upgrading their hardware systems, with an overt balance between investment and performance speedup. Today, the picture is different; not only do they have to invest in new hardware, but they also must account for the adaption and training of their software developers. What traditionally used to be a hardware problem, addressed by the chip manufacturers, has now become a software problem for application developers.
 
-Software libraries \index{software library} \index{library|see{software library}} can be a tremendous help for developers as they make it easier to implement an application, without having to know about the complexity of the underlying computer hardware, known as \emph{opacity}~\cite{ch5:Asanovic:EECS-2006-183}. The ultimate goal for a successful library is to simplify the process of writing new software and thus to increase developer productivity. Since programmable heterogeneous CPU/GPU systems are a rather new phenomenon, there is yet a limited number of established software libraries that take full advantage of such heterogeneous high performance systems, and there are no de-facto design standards for such systems either. Some existing libraries for conventional homogeneous systems have already added support for offloading computational intense operations onto co-processing GPUs. However, this approach comes with the cost of frequent memory transfers across the low bandwidth PCIe bus.
+Software libraries\index{software library}\index{library|see{software library}} can be a tremendous help for developers as they make it easier to implement an application, without requiring special knowledge of the underlying computer architecture and hardware. A library may be referred to as \emph{opaque} when it automatically utilize the available resources, without requiring specific details from the developer\cite{ch5:Asanovic:EECS-2006-183}. The ultimate goal for a successful library is to simplify the process of writing new software and thus to increase developer productivity. Since programmable heterogeneous CPU/GPU systems are a rather new phenomenon, there is yet a limited number of established software libraries that take full advantage of such heterogeneous high performance systems, and there are no de facto design standards for such systems either. Some existing libraries for conventional homogeneous systems have already added support for offloading computationally intense operations onto coprocessing GPUs. However, this approach comes at the cost of frequent memory transfers across the low bandwidth PCIe bus.
 
-In this chapter, we focus on the use of a software library to help application developers achieve their goals without spending immense time on optimization details, while still offering close-to-optimal performance. A good library provides performance portable implementations with intuitive interfaces, that hide the complexity of underlaying hardware optimizations. Unfortunately, this opaqueness sometimes comes with a price, as one does not necessarily get the best performance when the architectural details are not \emph{visible} to the programmer~\cite{ch5:Asanovic:EECS-2006-183}. If, however, the library is flexible enough and permits developers to supply their own low-level implementations as well, this does not need to be an issue. These are some of the considerations library developers should make, and what we will try to address in this chapter.
+In this chapter, we focus on the use of a software library to help application developers achieve their goals without spending an immense amount of time on optimization details, while still offering close-to-optimal performance. A good library provides performance-portable implementations with intuitive interfaces, that hide the complexity of underlaying hardware optimizations. Unfortunately, opaqueness sometimes comes at a price, as one does not necessarily get the best performance when the architectural details are not \emph{visible} to the programmer~\cite{ch5:Asanovic:EECS-2006-183}. If, however, the library is flexible enough and permits developers to supply their own low-level implementations as well, this does not need to be an issue. These are some of the considerations library developers should take into account, and what we will try to address in this chapter.
 
 
-For demonstrative purposes we present details from an in-house generic CUDA-based C++ library for fast assembling of partial differential equation (PDE) solvers, utilizing the computational resources of GPUs. It has been developed as part of research activities associated with GPUlab, at the Technical University of Denmark, and therefore referred to as the GPUlab library. It falls into the category of computational libraries, as categorized by Hoefler and Snir~\cite{ch5:Hoefler2011}. Memory allocation and basic algebraic operations are supported via object oriented components, without the user having to write CUDA specific kernels. As back-end vector class, the CUDA Thrust library is used, enabling easy memory allocation and a high-level interface for vector manipulation~\cite{ch5:Bell2011}. Inspirations for \emph{good library design}, some of which we will present in this chapter, originates from guidelines proposed throughout literature~\cite{ch5:Hoefler2011,ch5:Gamma1995,ch5:Skjellum1994}. An identification of desirable properties, that any library should strive to achieve, is pointed out by Korson and McGregor~\cite{ch5:Korson1992}, in particular we mention easy-to-use, extensible, and intuitive.
+For demonstrative purposes we present details from an in-house generic CUDA-based C++ library for fast assembling of partial differential equation (PDE) solvers, utilizing the computational resources of GPUs. This library has been developed as part of research activities associated with the GPUlab, at the Technical University of Denmark and, therefore, is referred to as the {\em GPUlab library}. It falls into the category of computational libraries, as categorized by Hoefler and Snir~\cite{ch5:Hoefler2011}. Memory allocation and basic algebraic operations are supported via object-oriented components, without the user having to write CUDA specific kernels. As a back-end vector class, the parallel CUDA Thrust template-based library is used, enabling easy memory allocation and a high-level interface for vector manipulation~\cite{ch5:Bell2011}. Inspirations for \emph{good library design}, some of which we will present in this chapter, originate from guidelines proposed throughout the literature~\cite{ch5:Hoefler2011,ch5:Gamma1995,ch5:Skjellum1994}. An identification of desirable properties, which any library should strive to achieve, is pointed out by Korson and McGregor~\cite{ch5:Korson1992}. In particular we mention being easy-to-use, extensible, and intuitive.
 
 
 %We rely on template-based implementations to allow full flexibility to change and assemble types~\cite{ch5:Vandevoorde2002}
 
 %The library presented in this chapter falls into the category of computational libraries, as defined by Hoefler and Snir~\cite{ch5:Hoefler2011}, and seeks to fulfil the relevant objectives presented, such as designing components around data structures (classes), utilizing inheritance for component reuse, and hiding irrelevant information from the end users. An identification of desirable attributes for library development is also pointed out by Korson and McGregor~\cite{ch5:Korson1992}, in particular the library is designed to be easy-to-use, extensibility, and intuitive.
 
-The library is designed to be effective and scalable for fast prototyping of PDE solvers, (primarily) based on matrix-free implementations of finite difference (stencil) approximations on logically structured grids. It offers functionalities that will help assemble PDE solvers that automatically exploit heterogeneous architectures, much faster than manually having to manage GPU memory allocation, memory transfers, kernel launching, etc.
+The library is designed to be effective and scalable for fast prototyping of PDE solvers, (primarily) based on matrix-free implementations of finite difference (stencil) approximations on logically structured grids. It offers functionalities that will help assemble PDE solvers that automatically exploit heterogeneous architectures much faster than manually having to manage GPU memory allocation, memory transfers, kernel launching, etc.
 
-In the following sections we demonstrate how software components that play important roles in many scientific applications can be designed to fit a simple framework that will run efficiently on heterogeneous systems. One example is finite difference approximations, commonly used to find numerical solutions to differential equations. Matrix-free implementations minimize both memory consumption and main memory access, two important features for efficient GPU utilization and for enabling solution of large-scale problems. The bottleneck problem for many PDE applications, is to find the solution of a large sparse linear systems, arising from the discretization. In order to help solve these systems, the library includes a set of iterative solvers. All iterative solvers are template-based, such that vector and matrix classes, along with their underlying implementations, can be freely interchanged or whole new solvers can even be implemented without much coding effort. The generic nature of the library, along with a predefined set of interface rules, allow assembling of components into PDE solvers. The use of parameterized type binding, allows the user to control the assembling of PDE solvers at a high abstraction level, without having to change the remaining implementation.
+In the following sections we demonstrate how software components that play important roles in scientific applications can be designed to fit a simple framework that will run efficiently on heterogeneous systems. One example is finite difference approximations, commonly used to find numerical solutions to differential equations. Matrix-free implementations minimize both memory consumption and memory access, two important features for efficient GPU utilization and for enabling the solution of large-scale problems. The bottleneck problem for many PDE applications is to solve large sparse linear systems, arising from the discretization. In order to help solve these systems, the library includes a set of iterative solvers. All iterative solvers are template-based, such that vector and matrix classes, along with their underlying implementations, can be freely interchanged. New solvers can also be implemented without much coding effort. The generic nature of the library, along with a predefined set of interface rules, allows assembling components into PDE solvers. The use of parameterized-type binding allows the user to assemble PDE solvers at a high abstraction level, without having to change the remaining implementation.
 
-Since this chapter is mostly dedicated to the discussion of software development for high performance heterogeneous systems, the focus will be merely on the development and usage of an in-house GPU-based library, more than on specific scientific applications. We demonstrate how to use the library on two elementary model problems, and refer to Chapter \ref{ch7} for a detailed description of an advanced application tool for free surface water wave simulations. These examples are assembled using the library components presented in this chapter.
+Since this chapter is mostly dedicated to the discussion of software development for high performance heterogeneous systems, the focus will be more on the development and usage of an in-house GPU-based library, than on specific scientific applications. We demonstrate how to use the library on two elementary model problems and refer the reader to Chapter \ref{ch7} for a detailed description of an advanced application tool for free surface water wave simulations. These examples are assembled using the library components presented in this chapter.
 
 % Make sure that we present a way to implement PDE solver components, and that our library is an example that demonstrate the use.
 
@@ -66,43 +62,43 @@ Since this chapter is mostly dedicated to the discussion of software development
 %- The following sections are ordered as follows...
 
 \subsection{Test environments}\label{ch5:sec:testenvironments}
-Throughout the chapter we have used two different test environments; a high-end desktop computer located at the GPUlab -- Technical University of Denmark, and a GPU cluster located at the Center for Computing and Visualization, Brown University, USA. Hardware details for the two systems are as follows:
+Throughout the chapter we use three different test environments: two high-end desktop computers located at the GPUlab--Technical University of Denmark, and a GPU cluster located at the Center for Computing and Visualization, Brown University, USA. Hardware details for the two systems are as follows:
 \begin{description}
-\item[Test environment 1.] Desktop computer, Linux Ubuntu v10.04, dual Intel Xeon E5620 (2.4 GHz) quad-core Westmere processor, 12 GB of DDR-3 memory (1066 MHz), 2x Nvidia GeForce 590 GPU with 3GB DDR5 memory, PCIe 2.0.
-\item[Test environment 2.] GPU cluster, Linux, up to 44 compute nodes based on dual Intel Xeon E5540 (2.53 GHz) quad-core Nehalem processors, 24 GB of DDR-3 memory (1333 MHz), 2x Nvidia Tesla M2050 GPUs with 3GB GDDR5 memory, 40 Gb/s Quad-Data-Rate (QDR) InfiniBand interconnect.
+\item[Test environment 1.] Desktop computer, Linux Ubuntu, Intel Xeon E5620 (2.4GHz) quad-core Westmere processor, 12GB of DDR-3 memory (1066 MHz), 2x NVIDIA GeForce GTX590 GPU with 3GB DDR5 memory, PCIe 2.0.
+\item[Test environment 2.] Desktop computer, Linux Ubuntu, Intel Core i7-3820 (3.60GHz) Sandy Bridge processors, 32GB RAM, 2x NVIDIA Tesla K20 GPUs with 5GB DDR5 memory, PCIe 2.0.
+\item[Test environment 3.] GPU cluster, Linux, up to 44 compute nodes based on dual Intel Xeon E5540 (2.53GHz) quad-core Nehalem processors, 24 GB of DDR-3 memory (1333MHz), 2x NVIDIA Tesla M2050 GPUs with 3GB GDDR5 memory, 40 Gb/s Quad-Data-Rate (QDR) InfiniBand interconnect.
 \end{description}
 
 
-\section{Heterogenous library design for PDE solvers}
-A generic CUDA-based C++ library has been developed to ease the assembling of PDE solvers. The template \index{templates} based design allows users to assemble solver parts easily, and to supply their own implementation of problem specific parts. In the following, we present an overview of the library and the supported features, we introduce the concepts of the library components and give short code examples to ease understanding. The library is a starting point for fast assembling of GPU-based PDE solvers, developed mainly to support finite difference operations on regular grids. However, this is not a limitation, since existing vector objects could be used as base classes for extending to other discretization methods or grid types as well.
+\section{Heterogeneous library design for PDE solvers}
+A generic CUDA-based C++ library has been developed to ease the assembling of PDE solvers. The template-based\index{templates} design allows users to assemble solver parts easily and to supply their own implementation of problem specific parts. In the following, we present an overview of the library and the supported features, introduce the concepts of the library components, and give short code examples to ease understanding. The library is a starting point for fast assembling of GPU-based PDE solvers, developed mainly to support finite difference operations on regular grids. However, this is not a limitation, since existing vector objects could be used as base classes for extending to other discretization methods or grid types as well.
 
 \subsection{Component and concept design}
-The library is grouped into component classes. Each component should fulfill a set of simple interface and template rules, called concepts \index{concept}, in order to guarantee compatibility with the rest of the library. In the context of PDE solving, we present five component types; vectors, matrices, iterative solvers for linear system of equations, preconditioners for the iterative solvers, and time integrators. Figure \ref{ch5:fig:componentdesign} lists the five components along with the type definitions they must provide, and the methods they should implement. It is possible to extend the implementation of these components with more functionalities, that relate to specific problems, but this is the minimum requirement for compatibility with the remaining library. With these rather concept rules fulfilled, components can rely on other components to have implementations of their member functions.
+The library is grouped into component classes. Each component should fulfill a set of simple interface and template rules, called concepts\index{concept}, in order to guarantee compatibility with the rest of the library. In the context of PDE solving, we present five component classes: vectors, matrices, iterative solvers for linear system of equations, preconditioners for the iterative solvers, and time integrators. Figure \ref{ch5:fig:componentdesign} lists the five components along with a subset of the type definitions they should provide and the methods they should implement. It is possible to extend the implementation of these components with more functionality that relate to specific problems, but this is the minimum requirement for compatibility with the remaining library. With these concept rules fulfilled, components can rely on other components to have their respective functions implemented.
 
 \begin{figure}[!htb]
-\begin{center}
+\centering
 \input{Chapters/chapter5/figures/component_design.tikz}
-\end{center}
-\caption[Schematic representation of the five main components, their type definitions, and member functions.]{Schematic representation of the five main components, their type definitions, and member functions. Because components are template based, the argument types cannot be known in beforehand. This concept rule set ensures compliance between components.}\label{ch5:fig:componentdesign}
+\caption[Schematic representation of the five main components, their type definitions, and member functions.]{Schematic representation of the five main components, their type definitions, and member functions. Because components are template based, the argument types cannot be known beforehand. The concepts ensure compliance among components.}\label{ch5:fig:componentdesign}
 \end{figure}
 
-A component is implemented as a generic C++ class, that normally takes as template arguments the same types that it offers through type definitions: a matrix takes a vector class as template argument, and a vector class takes a working precision value type. The matrix can then access the working precision via the vector class. Components that rely on multiple template arguments can combine these arguments via type binders to maintain code simplicity. We will demonstrate use of such type binders in the model problem examples. A thorough introduction to template-based programming in C++ can be found in \cite{ch5:Vandevoorde2002}.
+A component is implemented as a generic C++ class, and normally takes as a template arguments the same types that it offers through type definitions: a matrix takes a vector as template argument, and a vector takes the working precision type. The matrix can then access the working precision through the vector class. Components that rely on multiple template arguments can combine these arguments via type binders to reduce the number of arguments and maintain code simplicity. We will demonstrate use of such type binders in the model problem examples. A thorough introduction to template-based programming in C++ can be found in \cite{ch5:Vandevoorde2002}.
 
-The generic configuration allows the developer to define and assemble solver parts at the very beginning of the program, using type definitions. Changing PDE solver parts at a later time is then only a matter of changing the proper definitions. We will give examples of how to assemble PDE solvers in section \ref{ch5:sec:modelproblems}.
+The generic configuration allows the developer to define and assemble solver parts at the very beginning of the program using type definitions. Changing PDE parts at a later time is then only a matter of changing type definitions. We will give two model examples of how to assemble PDE solvers in Section \ref{ch5:sec:modelproblems}.
 
 \subsection{A matrix-free finite difference component}\index{finite difference}\index{matrix-free}
-Common vector operations, such as memory allocation, element-wise assignments, and basic algebraic transformations, require many lines of codes for a purely CUDA-based implementation. These CUDA specific operations and kernels are hidden from the user behind library implementations, to ensure a high abstraction level. The vector classes inherit from the CUDA-based Thrust library and therefore offer the same level of abstraction, that enhance developer productivity and enables performance portability. Creating and allocating device (GPU) memory for two vectors can be done in a simple and intuitive way using the GPUlab library, as shown in the following example where two vectors are added together:
+Common vector operations, such as memory allocation, element-wise assignments, and basic algebraic transformations, require many lines of codes for a purely CUDA-based implementation. These CUDA-specific operations and kernels are hidden from the user behind library implementations, to ensure a high abstraction level. The vector class inherits from the CUDA-based Thrust library and therefore offer the same level of abstraction that enhances developer productivity and enables performance portability. Creating and allocating device (GPU) memory for two vectors can be done in a simple and intuitive way using the GPUlab library, as shown in Listing \ref{ch5:lst:vector} where two vectors are added together.
 
-\lstinputlisting[label=ch5:lst:vector,caption={Allocating, initializing, and adding together two vectors on the GPU. First example are using pure CUDA C, second example use the built-in library template-based vector class.}]{Chapters/chapter5/code/ex1.cu}
+\lstinputlisting[label=ch5:lst:vector,caption={allocating, initializing, and adding together two vectors on the GPU: first example uses pure CUDA C; second example uses the built-in library template-based vector class}]{Chapters/chapter5/code/ex1.cu}
 
-The vector class (and derived classes hereof) works with the rest of the library components. Therefore we encourage to use this class as the base vector object. Matrix-vector multiplications are usually what makes PDE-based applications differ, and the need to write a user specific implementation of the matrix-vector product is essential to solve specific PDE problems. The PDE and the choice of discretization method determine the structure and sparsity of the resulting matrix. Spatial discretization is supported by the library with finite difference approximations, and it offers an efficient low-storage (matrix-free) flexible order\index{flexible order} implementation, to help developers tailor their custom codes. These matrix-free operators are feasible for problems where the matrix structure is known in advance and can be exploited, such that the matrix values can either be precomputed or computed on-the-fly. Furthermore, the low constant memory requirement makes them perfect in the context of solving large scale problems, whereas traditional sparse matrix formats require increasingly more memory~\cite{ch5:Bell2009}.
+The vector class (and derived classes hereof) is compliant with the rest of the library components. Matrix-vector multiplications are usually what makes PDE-based applications different from each other, and the need to write a user specific implementation of the matrix-vector product is essential when solving specific PDE problems. The PDE and the choice of discretization method determine the structure and sparsity of the resulting matrix. Spatial discretization is supported by the library with finite difference approximations, and it offers an efficient low-storage (matrix-free) flexible order\index{flexible order} implementation to help developers tailor their custom codes. These matrix-free operators are feasible for problems where the matrix structure is known in advance and can be exploited, such that the matrix values can be either precomputed or computed on the fly. Furthermore, the low constant memory requirement makes them perfect in the context of solving large scale problems, whereas traditional sparse matrix formats require increasingly more memory, see e.g., ~\cite{ch5:Bell2009} for details on GPU sparse matrix formats.
 
 Finite differences approximate the derivative of some function $u(x)$ as a weighted sum of neighboring elements. In compact notation we write
 \begin{align}\label{ch5:eq:fdstencil}
 \frac{\partial^q u(x_i)}{\partial x^q} \approx \sum_{n=-\alpha}^{\beta}c_n u(x_{i+n}),
 \end{align}
-where $q$ is the order of the derivative, $c_n$ is a set of finite difference coefficients, where $\alpha$ plus $\beta$ define the number of coefficients that are used for the approximation. The total set of contributing elements is called the stencil,\index{stencil} and the size of the stencil is called the rank, given as $\alpha + \beta + 1$. The stencil coefficients $c_n$ can be derived from a Taylor expansion based on the values of $\alpha$, $\beta$, and for a specific order $q$, using the method of undetermined coefficients~\cite{ch5:LeVeque2007}.
-An example of a three-point finite difference matrix that approximates the first ($q=1$) or second ($q=2$) derivative of a one dimensional uniformly distributed vector $u$ of length $8$ is given here
+where $q$ is the order of the derivative, $c_n$ is a set of finite difference coefficients, and $\alpha$ plus $\beta$ define the number of coefficients that are used for the approximation. The total set of contributing elements is called the stencil,\index{stencil} and the size of the stencil is called the rank, given as $\alpha + \beta + 1$. The stencil coefficients $c_n$ can be derived from a Taylor expansion based on the values of $\alpha$ and $\beta$, and $q$, using the method of undetermined coefficients~\cite{ch5:LeVeque2007}.
+An example of a three-point finite difference matrix that approximates the first ($q=1$) or second ($q=2$) derivative of a one-dimensional uniformly distributed vector $u$ of length $8$ is given here:
 \begin{eqnarray}\label{ch5:eq:stencilmatrix}
 \left[\begin{array}{c c c c c c c c}
 c_{00} & c_{01} & c_{02} &      0 &      0 &      0 &      0 &      0 \\
@@ -136,7 +132,7 @@ u^{(q)}_6 \\
 u^{(q)}_7
 \end{array}\right].
 \end{eqnarray}
-It is clear from this example, that the matrix is sparse and that there are many repetitions of the same coefficients. Notice, that the coefficients only differ near the boundaries, where off-centered stencils are used. It is natural to pack this information into a stencil operator, that stores only the unique set of coefficients:
+It is clear from this example that the matrix is sparse and that the same coefficients are repeated for all centered rows. The coefficients differ only near the boundaries, where off-centered stencils are used. It is natural to pack this information into a stencil operator that stores only the unique set of coefficients:
 \begin{eqnarray}\label{ch5:eq:stencilcoeffs}
 \myvec{c} & = &
 \left[\begin{array}{c c c}
@@ -145,62 +141,65 @@ c_{10} & c_{11} & c_{12} \\
 c_{20} & c_{21} & c_{22} \\
 \end{array}\right].
 \end{eqnarray}
-Library matrix components calculate these compact stencil coefficient matrices, and implements member functions that compute the finite difference approximation to vector objects. Unit scaled coefficients (assuming grid spacing is one) are computed and stored to be accessible both via CPU and GPU memory. On the GPU, the constant memory space is used for faster memory access~\cite{ch5:cudaguide}. In order to apply a stencil on a non-unit spaced grid, with grid space $\Delta x$, a scale factor of $1/(\Delta x)^q$ will have to be multiplied to the finite difference sum, i.e. $(c_{00}u_0 + c_{01}u_1 + c_{02}u_2)/(\Delta x)^q \approx u^{(q)}_0$.
+Matrix components precompute these compact stencil coefficients and provides member functions that computes the finite difference approximation of input vectors. Unit scaled coefficients (assuming grid spacing is one) are computed and stored to be accessible via both CPU and GPU memory. On the GPU, the constant memory space is used for faster memory access~\cite{ch5:cudaguide}. In order to apply a stencil on a non unit-spaced grid, with grid space $\Delta x$, the scale factor $1/(\Delta x)^q$ will have to be multiplied by the finite difference sum, i.e., $(c_{00}u_0 + c_{01}u_1 + c_{02}u_2)/(\Delta x)^q \approx u^{(q)}_0$ as in the first row of \eqref{ch5:eq:stencilmatrix}.
 
-Setting up a two dimensional grid of size $N_x \times N_y$ in the unit square and computing the first derivative hereof, is illustrated in Listing~\ref{ch5:lst:stencil}. The grid is a vector component, derived from the vector class. It is by default treated as a device object and memory is automatically allocated on the device to fit the grid size. The finite difference approximation as in \eqref{ch5:eq:fdstencil}, are performed via a CUDA kernel behind the scenes during the calls to mult and diff\_x, utilizing the memory hierarchy as the CUDA guidelines prescribe~\cite{ch5:cudaguide,ch5:cudapractice}. To increase developer productivity, kernel launch configurations have default settings, based on CUDA guidelines principles and experiences from performance testings, such that the user does not have to explicitly specify them. For problem specific finite difference approximations, where the built-in stencil operators are insufficient, a pointer to the coefficient matrix \eqref{ch5:eq:stencilcoeffs} can be passed to customized kernels.
+Setting up a two-dimensional grid of size $N_x \times N_y$ in the unit square and computing the first derivative hereof is illustrated in Listing~\ref{ch5:lst:stencil}. The grid is a vector component, derived from the vector class. It is by default treated as a device object and memory is automatically allocated on the device to fit the grid size. The finite difference approximation as in \eqref{ch5:eq:fdstencil}, is performed via a CUDA kernel behind the scenes during the calls to \texttt{mult} and \texttt{diff\_x}, utilizing the memory hierarchy as the CUDA guidelines prescribe~\cite{ch5:cudaguide,ch5:cudapractice}. To increase developer productivity, kernel launch configurations have default settings, based on CUDA guidelines, principles, and experiences from performance testings, such that the user does not have to explicitly specify them. For problem-specific finite difference approximations, where the built-in stencil operators are insufficient, a pointer to the coefficient matrix \eqref{ch5:eq:stencilcoeffs} can be accessed as demonstrated in Listing \ref{ch5:lst:stencil} and passed to customized kernels.
 
-\lstinputlisting[label=ch5:lst:stencil,caption={Two dimensional finite difference stencil example. Computing the first derivative using five points ($\alpha=\beta=2$) per dimension, a total nine point stencil.}]{Chapters/chapter5/code/ex2.cu}
+\lstinputlisting[label=ch5:lst:stencil,caption={two-dimensional finite difference stencil example: computing the first derivative using five points ($\alpha=\beta=2$) per dimension, a total nine-point stencil}]{Chapters/chapter5/code/ex2.cu}
 
-In the following sections we demonstrate how to go from a PDE posed as an initial value problem (IVP) or a boundary value problem (BVP) to a working application solver, by combining existing library components along with new custom tailored components. We also demonstrate how to apply spatial and temporal domain decomposition strategies, that can make existing solvers take advantage of systems equipped with multiple GPUs. In the following we demonstrate how to rapidly assemble a PDE solver using library components.
+In the following sections we demonstrate how to go from an initial value problem (IVP) or a boundary value problem (BVP) to a working application solver by combining existing library components along with new custom-tailored components. We also demonstrate how to apply spatial and temporal domain decomposition strategies that can make existing solvers take advantage of systems equipped with multiple GPUs. Next section demonstrates how to rapidly assemble a PDE solver using library components.
 
 \section{Model problems}\label{ch5:sec:modelproblems}
 
 %Physical phenomenons are at all times present around us. Many of them are well defined via the laws of physics and can be described via differential equations. Though many differential equations are well defined, their solutions are not, and the need for computers to approximate and simulate solutions are inevitable. The accuracy of these simulations is critical for the quality of scientific applications within all fields of engineering. Improved accuracy often comes with the price of increased computational times, that can even be too overwhelming for a single computer to run. Fortunately, the fundamental change in chip-design led development of single core systems into multi- and many core high-performance architectures, that perfectly fits the engineering needs for increased computational resources. The massively parallel architecture of GPUs is a particular good match for accurate methods with high computational intensity\cite{ch5:Kloeckner2011}.
 
-We present two elementary PDE model problems, to demonstrate how to assemble PDE solvers, using library components that follow the guidelines described above. The first model problem is the unsteady parabolic heat conduction equation, the second model problem is the elliptic Poisson equation. The two model problems consist of elements that play important roles in solving a broad range of more advanced PDE problems.
+We present two elementary PDE model problems, to demonstrate how to assemble PDE solvers, using library components that follow the guidelines described above. The first model problem is the unsteady parabolic heat conduction equation; the second model problem is the elliptic Poisson equation. The two model problems consist of elements that play important roles in solving a broad range of more advanced PDE problems.
 
-We refer to Chapter \ref{ch7} for an example of a scientific application relevant for coastal and maritime engineering analysis, that has been assembled using customized library components similar to those presented in the following.
+We refer the reader to Chapter \ref{ch7} for an example of a scientific application relevant for coastal and maritime engineering analysis that has been assembled using customized library components similar to those presented in the following.
 
 \subsection{Heat conduction equation}\index{heat conduction}
-First, we consider a two dimensional heat conduction problem defined on a unit square. The heat conduction equation is a parabolic partial differential diffusion equation, including both spatial and temporal derivatives. It describes how the diffusion of heat in a medium changes with time. Diffusion equations find great importance in many fields of sciences, e.g. fluid dynamics, where the fluid motion is uniquely described by the Navier-Stokes equations, which include a diffusive viscous term~\cite{ch5:chorin1993,ch5:Ferziger1996}.%, or in financial science where diffusive terms are present in the Black-Scholes equations for estimation of option price trends~\cite{}.
+First, we consider a two-dimensional heat conduction problem defined on a unit square. The heat conduction equation is a parabolic partial differential diffusion equation, including both spatial and temporal derivatives. It describes how the diffusion of heat in a medium changes with time. Diffusion equations are of great importance in many fields of sciences, e.g., fluid dynamics, where the fluid motion is uniquely described by the Navier-Stokes equations, which include a diffusive viscous term~\cite{ch5:chorin1993,ch5:Ferziger1996}.%, or in financial science where diffusive terms are present in the Black-Scholes equations for estimation of option price trends~\cite{}.
 
-The heat problem is an IVP \index{initial value problem} that describes how the heat distribution evolves from a specified initial state. Together with homogeneous Dirichlet boundary conditions\index{boundary conditions}, the heat problem is given as
+The heat problem is an IVP \index{initial value problem}, it describes how the heat distribution evolves from a specified initial state. Together with homogeneous Dirichlet boundary conditions\index{boundary conditions}, the heat problem in the unit square is given as
 \begin{subequations}\begin{align}
-\frac{\partial u}{\partial t} - \kappa\nabla^2u =  0, & \qquad (x,y)\in \Omega([0,1]\times[0,1]),\label{ch5:eq:heateqdt}\\
+\frac{\partial u}{\partial t} - \kappa\nabla^2u =  0, & \qquad (x,y)\in \Omega([0,1]\times[0,1]),\quad t\geq 0, \label{ch5:eq:heateqdt}\\
 u =  0, & \qquad (x,y) \in \partial\Omega,\label{ch5:eq:heateqbc}
 \end{align}\label{ch5:eq:heateq}\end{subequations}
-where $u(x,y,t)$ is the unknown heat distribution, $\kappa$ is a heat conductivity constant (assume $\kappa=1$ from now) and $\nabla^2$ is the two dimensional Laplace\index{Laplace operator} differential operator $(\partial_{xx}+\partial_{yy})$. We use the following initial condition, because it has a known analytical solution over the entire time span, and it satisfies the homogeneous boundary condition given by \eqref{ch5:eq:heateqbc},
+where $u(x,y,t)$ is the unknown heat distribution defined within the domain $\Omega$, $t$ is the time, $\kappa$ is a heat conductivity constant (let $\kappa=1$), and $\nabla^2$ is the two-dimensional Laplace\index{Laplace operator} differential operator $(\partial_{xx}+\partial_{yy})$. We use the following initial condition:
 \begin{align}\label{ch5:eq:heatinit}
-u(x,y,t_0) = \sin(\pi x)\,\sin(\pi y), & \qquad (x,y) \in \Omega.
+u(x,y,t_0) = \sin(\pi x)\,\sin(\pi y), & \qquad (x,y) \in \Omega,
 \end{align}
-An illustrative example of the numerical solution to the heat problem, using \eqref{ch5:eq:heatinit} as the initial condition is given in Figure \ref{ch5:fig:heatsolution}.
+because it has a known analytic solution over the entire time span, and it satisfies the homogeneous boundary condition given by \eqref{ch5:eq:heateqbc}. An illustrative example of the numerical solution to the heat problem, using \eqref{ch5:eq:heatinit} as the initial condition, is given in Figure \ref{ch5:fig:heatsolution}.
 \begin{figure}[!htbp]
-    \begin{center}
-    \setlength\figurewidth{0.3\textwidth}        %
-    \setlength\figureheight{0.3\textwidth}       %
-    \subfigure[$t=0.00s$]%{\input{Chapters/chapter5/figures/HeatSolution0.tikz}}
-{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_conv.pdf}}
-    \subfigure[$t=0.05s$]%{\input{Chapters/chapter5/figures/HeatSolution0.049307.tikz}}
-{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_049307_conv.pdf}}
-    %\subfigure[$t=0.10s$]{\input{Chapters/chapter5/figures/HeatSolution0.099723.tikz}}
-{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_099723_conv.pdf}}
-    \end{center}
-    \caption{Discrete solution at times $t=0s$ and $t=0.05s$, using \eqref{ch5:eq:heatinit} as initial condition and a small $20\times20$ numerical grid.}\label{ch5:fig:heatsolution}
+	\scriptsize
+    \centering
+    \setlength\figurewidth{0.26\textwidth}        %
+    \setlength\figureheight{0.26\textwidth}       %
+    \subfigure[$t=0.00s$]{\input{Chapters/chapter5/figures/HeatSolution0.tikz}}%
+%{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_conv.pdf}}
+    \subfigure[$t=0.05s$]{\input{Chapters/chapter5/figures/HeatSolution0.049307.tikz}}%
+%{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_049307_conv.pdf}}
+    \subfigure[$t=0.10s$]{\input{Chapters/chapter5/figures/HeatSolution0.099723.tikz}}%
+%{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_099723_conv.pdf}}
+    \caption{Discrete solution, at times $t=0s$ and $t=0.05s$, using \eqref{ch5:eq:heatinit} as the initial condition and a small $20\times20$ numerical grid.}\label{ch5:fig:heatsolution}
 \end{figure}
 
 We use a Method of Lines (MoL)\index{method of lines} approach to solve \eqref{ch5:eq:heateq}. Thus,  the spatial derivatives are replaced with finite difference approximations, leaving only the temporal derivative as unknown. The spatial derivatives are approximated from $\myvec{u}^n$, where $\myvec{u}^n$ represents the approximate solution to $u(t_n)$ at a given time $t_n$ with time step size $\delta t$ such that $t_n=n\delta t$ for $n=0,1,\ldots$. The finite difference approximation\index{finite difference} can be interpreted as a matrix-vector product as sketched in \eqref{ch5:eq:stencilmatrix}, and so the semi-discrete heat conduction problem becomes
 \begin{align}\label{ch5:eq:discreteheateq}
 \frac{\partial u}{\partial t} = \mymat{A}\myvec{u}, \qquad \mymat{A} \in \mathbb{R}^{N\times N}, \quad \myvec{u} \in \mathbb{R}^{N},
 \end{align}
-where $\mymat{A}$ is the sparse finite difference matrix and $N$ is the number of unknowns in the discrete system. The temporal derivative is now free to be approximated by any suitable choice of a time integration method\index{time integration}. The most simple integration scheme would be the first order explicit forward Euler's method\index{forward Euler}, resulting in an update on the form
+where $\mymat{A}$ is the sparse finite difference matrix and $N$ is the number of unknowns in the discrete system. The temporal derivative is now free to be approximated by any suitable choice of a time-integration method\index{time integration}. The most simple integration scheme would be the first-order accurate explicit forward Euler method\index{forward Euler},
 \begin{align}\label{ch5:eq:forwardeuler}
 \myvec{u}^{n+1} = \myvec{u}^n + \delta t\,\mymat{A}\myvec{u}^n,
 \end{align}
-where $n+1$ refers to the solution at the next time step. The forward Euler method can be exchanged with alternative high-order accurate time integration methods, such as Runge-Kutta methods or linear multistep methods, if numerical instability becomes an issue, see e.g. \cite{ch5:LeVeque2007} for details on numerical stability analysis. For demonstrative purpose, we simply use conservative time step sizes to avoid stability issues. However, the component based library design provides exactly the flexibility for the application developer to select or change PDE solver parts, such as the time integrator, with little coding effort. A generic implementation of the forward Euler method, that satisfy the library concept rules is illustrated in Listing~\ref{ch5:lst:euler}. According to the component guidelines in Figure \ref{ch5:fig:componentdesign}, a time integrator is basically a functor, which means that it implements the parenthesis operator, taking five template arguments: a right hand side operator, the state vector, integration start time, integration end time, and a time step size. The method takes as many time steps necessary to integrate from the start till end, continuously updating the state vector according to \eqref{ch5:eq:forwardeuler}. Notice, that nothing in Listing~\ref{ch5:lst:euler} indicates wether GPUs are used or not. However, it is likely that the underlying implementation of the right hand side functor and the \texttt{axpy} vector function, that can be used for scaling and summation of vectors, do rely on fast GPU kernels. However, it is not something that the developer of the component has to account for. For this reason, the template-based approach, along with simple interface concepts, make it easy to create new components that will fit well into the library.
+where $n+1$ refers to the solution at the next time step. The forward Euler method can be exchanged with alternative high-order accurate time integration methods, such as Runge-Kutta methods or linear multistep methods, if numerical instability becomes an issue, see, e.g., \cite{ch5:LeVeque2007} for details on numerical stability analysis. For demonstrative purpose, we simply use conservative time step sizes to avoid stability issues. However, the component-based library design provides exactly the flexibility for the application developer to select or change PDE solver parts, such as the time integrator, with little coding effort. A generic implementation of the forward Euler method that satisfies the library concept rules is illustrated in Listing~\ref{ch5:lst:euler}. According to the component guidelines in Figure \ref{ch5:fig:componentdesign}, a time integrator is basically a functor, which means that it implements the parenthesis operator, taking five template arguments: a right hand side operator, the state vector, integration start time, integration end time, and a time step size. The method takes as many time steps necessary to integrate from the start to the end, continuously updating the state vector according to \eqref{ch5:eq:forwardeuler}. Notice, that nothing in Listing~\ref{ch5:lst:euler} indicates wether GPUs are used or not. However, it is likely that the underlying implementation of the right hand side functor and the \texttt{axpy} vector function, do rely on fast GPU kernels. However, it is not something that the developer of the component has to account for. For this reason, the template-based approach, along with simple interface concepts, make it easy to create new components that will fit well into a generic library.
 
-\lstinputlisting[label=ch5:lst:euler,caption={Generic implementation of explicit first order forward Euler integration.}]{Chapters/chapter5/code/ex3.cu}
 
-A basic approach to solve the heat conduction problem has now been outlined, and we are ready to assemble the PDE solver.
+The basic numerical approach to solve the heat conduction problem has now been outlined, and we are ready to assemble the PDE solver.
+
+\pagebreak
+\lstinputlisting[label=ch5:lst:euler,caption={generic implementation of explicit first-order forward Euler integration}]{Chapters/chapter5/code/ex3.cu}
+
 
 %More accurate methods, like variations of Runge-Kutta methods or multi-step methods might be desirable, particulary if the accuracy of the spatial approximation is also high, see e.g. LeVeque for examples hereof~\cite{ch5:LeVeque2007}.
 
@@ -209,31 +208,31 @@ A basic approach to solve the heat conduction problem has now been outlined, and
 
 
 \subsubsection{Assembling the heat conduction solver}
-Before we are able to numerically solve the discrete heat conduction problem \eqref{ch5:eq:heateq}, we need implementations to handle the the following items
+Before we are able to numerically solve the discrete heat conduction problem \eqref{ch5:eq:heateq}, we need implementations to handle the the following items:
 \begin{description}
-\item[Grid] - A discrete numerical grid, to represent the two dimensional heat distribution domain and the arithmetical working precision ($32$-bit single precision or $64$-bit double precision).
-\item[RHS] - A right hand side operator for \eqref{ch5:eq:discreteheateq}, that approximates the second order spatial derivatives (matrix-vector product).
-\item[Boundary conditions] - A strategy, that ensures that the Dirichlet conditions are satisfied on the boundary.
-\item[Time integrator] - A time integration scheme, that approximates the time derivative from equation \eqref{ch5:eq:discreteheateq}.
+\item[Grid.] A discrete numerical grid to represent the two-dimensional heat distribution domain and the arithmetical working precision ($32$-bit single precision or $64$-bit double precision).
+\item[RHS.] A right-hand side operator for \eqref{ch5:eq:discreteheateq} that approximates the second-order spatial derivatives (matrix-vector product).
+\item[Boundary conditions.] A strategy that ensures that the Dirichlet conditions are satisfied on the boundary.
+\item[Time integrator.] A time integration scheme, that approximates the time derivative from \eqref{ch5:eq:discreteheateq}.
 \end{description}
-All items are either directly available in the library or can be designed from components herein. The built-in stencil operator may assist in implementing the matrix-vector product, but we need to explicitly ensure that the Dirichlet boundary conditions are satisfied. We demonstrated in Listing \ref{ch5:lst:stencil} how to approximate the derivative using flexible order finite difference stencils. However, from \eqref{ch5:eq:heateqbc} we know, that boundary values are zero. Therefore we extend the stencil operator with a simple kernel call that assigns zero to the entire boundary. Listing \ref{ch5:lst:laplaceimpl} shows the code for the two dimensional Laplace right hand side operator. The constructor takes as argument the stencil half size $\alpha$ and assumes $\alpha=\beta$. Thus the total two dimensional stencil rank will be $4\alpha+1$. For simplicity we also assume that the grid is uniformly distributed, $N_x=N_y$. Performance optimizations for the stencil kernel, such as shared memory utilization, is taken care of in the underlying implementation, accordingly to CUDA guidelines~\cite{ch5:cudaguide,ch5:cudapractice}. The two macros, BLOCK1D and GRID1D, are used to help set up kernel configurations based on grid sizes.
+All items are either directly available in the library or can be designed from components herein. The built-in stencil operator may assist in implementing the matrix-vector product, but we need to explicitly ensure that the Dirichlet boundary conditions are satisfied. We demonstrated in Listing \ref{ch5:lst:stencil} how to approximate the derivative using flexible order finite difference stencils. However, from \eqref{ch5:eq:heateqbc} we know that boundary values are zero. Therefore, we extend the stencil operator with a simple kernel call that assigns zero to the entire boundary. Listing \ref{ch5:lst:laplaceimpl} shows the code for the two-dimensional Laplace right-hand side operator. The constructor takes as an argument the stencil half size $\alpha$ and assumes $\alpha=\beta$. Thus, the total two-dimensional stencil rank will be $4\alpha+1$. For simplicity we also assume that the grid is uniformly distributed, $N_x=N_y$. Performance optimizations for the stencil kernel, such as shared memory utilization, are handled in the underlying implementation, accordingly to CUDA guidelines~\cite{ch5:cudaguide,ch5:cudapractice}. The macros, \texttt{BLOCK1D} and \texttt{GRID1D}, are used to help set up kernel configurations based on grid sizes and \texttt{RAW\_PTR} is used to cast the vector object to a valid device memory pointer.
 
-\lstinputlisting[label=ch5:lst:laplaceimpl,caption={The right hand side Laplace operator. The built-in stencil approximates the two dimensional spatial derivatives, while the custom set\_dirichlet\_bc kernel takes care of satisfying boundary conditions.}]{Chapters/chapter5/code/laplacian.cu}
+\lstinputlisting[label=ch5:lst:laplaceimpl,caption={the right-hand side Laplace operator: the built-in stencil approximates the two dimensional spatial derivatives, while the custom set\_dirichlet\_bc kernel takes care of satisfying boundary conditions}]{Chapters/chapter5/code/laplacian.cu}
 
-With the right hand side operator in place, we are ready to implement the solver. For this simple PDE problem we compute all necessary initial data in the body of the main function and use the forward Euler time integrator to compute the solution till $t=t_{end}$. For more advanced PDE solvers, a built-in \texttt{ode\_solver} class is defined, that helps taking care of initialization and storage of multiple state variables. Declaring type definitions for all components at the beginning of the main file gives a good overview of the solver composition. In this way, it will be easy to control or change solver components at later times. Listing~\ref{ch5:lst:heattypedefs} lists the type definitions\index{type definitions} that are used to assemble the heat conduction solver. %\stefan{elaborate on the choices}.
+With the right-hand side operator in place, we are ready to implement the solver. For this simple PDE problem we compute all necessary initial data in the body of the main function and use the forward Euler time integrator to compute the solution until $t=t_{end}$. For more advanced solvers, a built-in \texttt{ode\_solver} class is defined that helps take care of initialization and storage of multiple state variables. Declaring type definitions for all components at the beginning of the main file gives a good overview of the solver composition. In this way, it will be easy to control or change solver components at later times. Listing~\ref{ch5:lst:heattypedefs} lists the type definitions\index{type definitions} that are used to assemble the heat conduction solver. %\stefan{elaborate on the choices}.
 
-\lstset{caption={Type definitions for all the heat conduction solver components used throughout the remaining code.}}
-\begin{lstlisting}[label=ch5:lst:heattypedefs]
+\lstset{label=ch5:lst:heattypedefs,caption={type definitions for all the heat conduction solver components used throughout the remaining code}}
+\begin{lstlisting}
 typedef double                              value_type;
 typedef laplacian<value_type>               rhs_type;
 typedef gpulab::grid<value_type>            vector_type;
 typedef vector_type::property_type          property_type;
 typedef gpulab::integration::forward_euler  time_integrator_type;
 \end{lstlisting}
-The grid is by default treated as a device object and memory is allocated on the GPU upon initialization of the grid. Setting up the grid can be done via the property type class. The property class holds information about the discrete and physical dimensions, along with fictitious ghost (halo) layers and periodicity conditions. For the heat conduction problem we use a non-periodic domain within the unit square and with no ghost layers. Listing \ref{ch5:lst:gridsetup} illustrates the grid assembly.
+The grid is by default treated as a device object, and memory is allocated on the GPU upon initialization of the grid. Setting up the grid can be done via the property type class. The property class holds information about the discrete and physical dimensions, along with fictitious ghost (halo) layers and periodicity conditions. For the heat conduction problem we use a non periodic domain of size $N\times N$ within the unit square with no ghost layers. Listing \ref{ch5:lst:gridsetup} illustrates the grid assembly.
 
-\lstset{caption={Create 2D grid of discrete dimension \texttt{N} times \texttt{N} and physical dimension $0$ to $1$.}}
-\begin{lstlisting}[label=ch5:lst:gridsetup]
+\lstset{label=ch5:lst:gridsetup,caption={creating a two-dimensional grid of size \texttt{N} times \texttt{N} and physical dimension $0$ to $1$}}
+\begin{lstlisting}
 // Setup discrete and physical dimensions
 gpulab::grid_dim<int>          dim(N,N,1);
 gpulab::grid_dim<value_type>   p0(0,0);
@@ -243,15 +242,15 @@ property_type                  props(dim,p0,p1);
 // Initialize vector
 vector_type                    u(props);
 \end{lstlisting}
-Hereafter the vector \texttt{u} can be initialized accordingly to \eqref{ch5:eq:heatinit}. Finally we need to instantiate the right hand side Laplacian operator from Listing \ref{ch5:lst:laplaceimpl} and the forward Euler time integrator in order to integrate from $t_0$ till $t_{end}$.
+Hereafter the vector \texttt{u} can be initialized accordingly to \eqref{ch5:eq:heatinit}. Finally we need to instantiate the right-hand side Laplacian operator from Listing \ref{ch5:lst:laplaceimpl} and the forward Euler time integrator in order to integrate from $t_0$ until $t_{end}$.
 
-\lstset{caption=Create time integrator and the right hand side Laplacian operator.}
-\begin{lstlisting}[label=ch5:lst:timeintegrator]
-rhs_type rhs(alpha);           // Create right hand side operator
+\lstset{label=ch5:lst:timeintegrator,caption=creating a time integrator and the right-hand side Laplacian operator.}
+\begin{lstlisting}
+rhs_type rhs(alpha);           // Create right-hand side operator
 time_integrator_type solver;   // Create time integrator
-solver(&rhs,u,0.0f,tend,dt);   // Integrate from to tend using dt
+solver(&rhs,u,0.0f,tend,dt);   // Integrate from 0 to tend using dt
 \end{lstlisting}
-The last line invokes the forward Euler time integration scheme, defined in Listing \ref{ch5:lst:heattypedefs}. If the developer decides to change the integrator into another explicit scheme, only the time integrator type definition in Listing \ref{ch5:lst:heattypedefs} needs to be changed. The heat conduction solver is now complete.%The simple model problem illustrated, that a well designed library upholds developer productivity, by minimizing the
+The last line invokes the forward Euler time integration scheme defined in Listing \ref{ch5:lst:heattypedefs}. If the developer decides to change the integrator into another explicit scheme, only the time integrator type definition in Listing \ref{ch5:lst:heattypedefs} needs to be changed. The heat conduction solver is now complete.%The simple model problem illustrated, that a well designed library upholds developer productivity, by minimizing the
 
 % how few CUDA implementations that were needed, and thereby upholding developer productivity.
 
@@ -259,53 +258,55 @@ The last line invokes the forward Euler time integration scheme, defined in List
 
 \subsubsection{Numerical solutions to the heat conduction problem}
 
-Solution times for the heat conduction problem is in itself not very interesting, as it is only a simple model problem. What is interesting for GPU kernels, like the finite differences kernel, is that increased computational work often comes with a very little price. This is because the relative fast computations can be hidden by the slower memory fetches. Therefore we are able to improve the accuracy of the numerical solution via more accurate finite differences (larger stencil sizes), while improving the computational performance in terms of floating point operations per second (flops). Figure \ref{ch5:fig:stencilperformance} confirms, that larger stencils improve the kernel performance. Notice that even though these performance results are favorable compared to single core systems ($\sim 10$ GFlops double precision on a $2.5$-GHz processor), it is still far from the peak performance of the GeForce GTX590 ($\sim 2.4$ TFlops single precision). The reason is that the kernel is bandwidth bound, i.e. performance is limited by the time it takes to move memory between the global GPU memory and the chip. However, this is a general limitation for matrix-vector-like operations that approximate solutions to discretized PDE problems. Only matrix-matrix operations, that have a high ratio of computations versus memory transactions, are able to reach near-optimal performance results~\cite{ch5:Kirk2010}. These kind of operators are, however, rarely used to solve PDE problems.
+Solution time for the heat conduction problem is in itself not very interesting, as it is only a simple model problem. What is interesting for GPU kernels, such as the finite differences kernel, is that increased computational work often comes with a very small price, because the fast computations can be hidden by the relatively slower memory fetches. Therefore, we are able to improve the accuracy of the numerical solution via more accurate finite differences (larger stencil sizes), while improving the computational performance in terms of floating point operations per second (flops). Figure \ref{ch5:fig:stencilperformance} confirms, that larger stencils improve the kernel performance. Notice that even though these performance results are favorable compared to single core systems ($\sim 10$ GFlops double precision on a $2.5$-GHz processor), they are still far from their peak performance, e.g., $\sim 2.4$ TFlops single precision for the GeForce GTX590. The reason is that the kernel is bandwidth bound, i.e., performance is limited by the time it takes to move memory between the global GPU memory and the chip. The Tesla K20 performs better than the GeForce GTX590 because it obtains the highest bandwidth. Being bandwidth bound is a general limitation for matrix-vector-like operations that arise from the discretization of PDE problems. Only matrix-matrix multiplications, which have a high ratio of computations versus memory transactions, are able to reach near-optimal performance results~\cite{ch5:Kirk2010}. These kinds of operators are, however, rarely used to solve PDE problems.
 
 \begin{figure}[!htb]
-\setlength\figureheight{0.3\textwidth}
+\setlength\figureheight{0.35\textwidth}
 \setlength\figurewidth{0.4\textwidth}
-\begin{center}
-{\small
-%\input{Chapters/chapter5/figures/AlphaPerformanceGTX590_N16777216.tikz}
-{\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/AlphaPerformanceGTX590_N16777216_conv.pdf}}
+\centering
+{\scriptsize
+\subfigure[GeForce GTX590, test environment 1.]{
+	\input{Chapters/chapter5/figures/AlphaPerformanceGTX590_N16777216.tikz}%
+}%
+\subfigure[Tesla K20c, test environment 2.]{
+	\input{Chapters/chapter5/figures/AlphaPerformanceTeslaK20c_N16777216.tikz}
+}%
+%{\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/AlphaPerformanceGTX590_N16777216_conv.pdf}}
 }
-\end{center}
-\caption[Single and double precision floating point operations per second for a two dimensional stencil operator on a numerical grid of size $4096^2$.]{Single and double precision floating point operations per second for a two dimensional stencil operator on a numerical grid of size $4096^2$. Various stencil sizes are used $\alpha=1,2,3,4$, equivalent to $5$pt, $9$pt, $13$pt, and $17$pt stencils. Test environment 1.}\label{ch5:fig:stencilperformance}
+\caption[Single- and double-precision floating point operations per second for a two-dimensional stencil operator on a numerical grid of size $4096^2$.]{Single and double precision floating point operations per second for a two dimensional stencil operator on a numerical grid of size $4096^2$. Various stencil sizes are used $\alpha=1,2,3,4$, equivalent to $5$pt, $9$pt, $13$pt, and $17$pt stencils.}\label{ch5:fig:stencilperformance}
 \end{figure}
 
 
 
 
 \subsection{Poisson equation}\index{Poisson equation}
-The Poisson equation is a second-order elliptic differential equation, often encountered in applications within scientific fields like electrostatics and mechanics. We consider the two dimensional BVP \index{boundary volume problem} defined in terms of Poisson's equation with homogeneous Dirichlet boundary conditions on the form
+The Poisson equation is a second-order elliptic differential equation, often encountered in applications within scientific fields such as electrostatics and mechanics. We consider the two- dimensional BVP \index{boundary volume problem} defined in terms of Poisson's equation with homogeneous Dirichlet boundary conditions on the form
 \begin{subequations}\begin{align}
 \nabla^2 u = f(x,y),& \qquad (x,y) \in \Omega([0,1]\times[0,1]), \\
 u = 0,& \qquad (x,y) \in \partial\Omega.
 \end{align}\label{ch5:eq:poissoneq}\end{subequations}
-Notice the similarities to the heat conduction equation \eqref{ch5:eq:heateq}. In fact, this could be a steady state solution to the heat equation, when there is no temporal change $\frac{\partial u}{\partial t}=0$, but a source term $f(x,y)$. Since the Laplace operator and the boundary conditions are the same for both problems, we are able to reuse the same implementation with few modifications.
+Notice the similarities to the heat conduction equation \eqref{ch5:eq:heateq}. In fact, \eqref{ch5:eq:poissoneq} could be a steady-state solution to the heat equation, when there is no temporal change $\frac{\partial u}{\partial t}=0$, but a source term $f(x,y)$. Since the Laplace operator and the boundary conditions are the same for both problems, we are able to reuse the same implementation with few modifications.
 
-Opposite to the heat equation, there are no initial conditions. Instead, we seek some $u(x,y)$ that satisfies \eqref{ch5:eq:poissoneq}, given a source term $f(x,y)$, on the right hand side. For simplicity, assume that we know the exact solution, $u_{\textrm{true}}$, given as \eqref{ch5:eq:heatinit}. Then we use the method of manufactured solutions to derive an expression for the corresponding right hand side $f(x,y)$:
+Opposite to the heat equation, there are no initial conditions. Instead, we seek some $u(x,y)$ that satisfies \eqref{ch5:eq:poissoneq}, given a source term $f(x,y)$, on the right-hand side. For simplicity, assume that we know the exact solution, $u_{\textrm{true}}$, corresponding to \eqref{ch5:eq:heatinit}. Then we use the method of manufactured solutions to derive an expression for the corresponding right-hand side $f(x,y)$:
 \begin{align}
 f(x,y) = \nabla^2 u_{\textrm{true}} = -2\pi^2\,\sin(\pi x)\,\sin(\pi y). \label{ch5:eq:poissonrhs}
 \end{align}
-The spatial derivative in \eqref{ch5:eq:poissoneq} is again approximated with finite differences\index{finite difference}, similar to the example in \eqref{ch5:eq:stencilmatrix}, except boundary values are explicitly set to zero. The discrete form of the system can now be written as a sparse linear system of equations
+The spatial derivative in \eqref{ch5:eq:poissoneq} is again approximated with finite differences\index{finite difference}, similar to the example in \eqref{ch5:eq:stencilmatrix}, except boundary values are explicitly set to zero. The discrete form of the system can now be written as a sparse linear system of equations:
 \begin{align}
-\mymat{A}\myvec{u}=\myvec{f}, \quad \mymat{A} \in \mathbb{R}^{N\times N}, \label{ch5:eq:poissonsystem}
+\mymat{A}\myvec{u}=\myvec{f}, \qquad \myvec{u},\myvec{f} \in \mathbb{R}^{N}, \quad \mymat{A} \in \mathbb{R}^{N\times N}, \label{ch5:eq:poissonsystem}
 \end{align}
-where $\mymat{A}$ is the sparse matrix formed by finite difference coefficients, $N$ is the number of unknowns and $\myvec{f}$ is given by \eqref{ch5:eq:poissonrhs}. Equation \eqref{ch5:eq:poissonsystem} can be solved in numerous ways, but a few observations may help us do it efficiently. Direct solvers based on Gaussian elimination are accurate and use a finite number of operations for a constant problem size. However, the arithmetic complexity grows with the problem size with as much as $\mathcal{O}(N^3)$ and do not exploit the sparsity of $\mymat{A}$. Direct solvers are therefore mostly feasible for dense systems of limited sizes. Sparse direct solvers exist, but they are often difficult to parallelize, or only applicable for certain types of matrices. Regardless of the discretization technique, the discretization of an elliptic PDE into a linear system like \eqref{ch5:eq:poissonsystem}, yields a very sparse matrix $\mymat{A}$ when $N$ is very large. Iterative methods\index{iterative methods} for solving large sparse linear systems find broad use in scientific applications, because they only require an implementation of the matrix-vector product, and often use a limited amount of additional memory. Comprehensive introductions to iterative methods may be found in any of~\cite{ch5:Saad2003,ch5:Kelley1995,ch5:Barrett1994}.
+where $\mymat{A}$ is the sparse matrix formed by finite difference coefficients, $N$ is the number of unknowns, and $\myvec{f}$ is given by \eqref{ch5:eq:poissonrhs}. Equation \eqref{ch5:eq:poissonsystem} can be solved in numerous ways, but a few observations may help do it more efficiently. Direct solvers based on Gaussian elimination are accurate and use a finite number of operations for a constant problem size. However, the arithmetic complexity grows with the problem size by as much as $\mathcal{O}(N^3)$ and does not exploit the sparsity of $\mymat{A}$. Direct solvers are therefore mostly feasible for dense systems of limited sizes. Sparse direct solvers exist, but they are often difficult to parallelize, or applicable for only certain types of matrices. Regardless of the discretization technique, the discretization of an elliptic PDE into a linear system as in \eqref{ch5:eq:poissonsystem} yields a very sparse matrix $\mymat{A}$ when $N$ is large. Iterative methods\index{iterative methods} for solving large sparse linear systems find broad use in scientific applications, because they require only an implementation of the matrix-vector product, and they often use a limited amount of additional memory. Comprehensive introductions to iterative methods may be found in any of~\cite{ch5:Saad2003,ch5:Kelley1995,ch5:Barrett1994}.
 
-One benefit of the high abstraction level and the template-based library design, is to allow developers to implement their own components, such as iterative methods for solving sparse linear systems. The library includes three popular iterative methods: conjugate gradient, defect correction\index{defect correction}, and geometric multigrid. The conjugate gradient method is applicable only to systems with symmetric positive definite matrices. This is true for the two dimensional Poisson problem, when it is discretized with a five point finite difference stencil, because then there will be no off-centered approximations near the boundary. For high-order approximations ($\alpha>1$), we use the defect correction method with multigrid preconditioning. See \cite{ch5:Trottenberg2001} for details on high-order multigrid methods.
+One benefit of the high abstraction level and the template-based library design is to allow developers to implement their own components, such as iterative methods for solving sparse linear systems. The library includes three popular iterative methods: conjugate gradient, defect correction\index{defect correction}, and geometric multigrid. The conjugate gradient method is applicable only to systems with symmetric positive definite matrices. This is true for the two-dimensional Poisson problem, when it is discretized with a five-point finite difference stencil, because then there will be no off-centered approximations near the boundary. For high-order approximations ($\alpha>1$), we use the defect correction method with multigrid preconditioning. See e.g., \cite{ch5:Trottenberg2001} for details on multigrid methods.
 
-We will not present the implementation details for all three methods, but briefly demonstrate the simplicity of implementing the body of such an iterative solver, given a textbook recipe or mathematical formulation. The defect correction method iteratively improves the solution to $\mymat{A}\myvec{x}=\myvec{b}$, given an initial start guess $\myvec{x}^0$, by continuously solving a preconditioned error equation. One defect correction \index{defect correction!iteration} iteration can be written via mathematical notation as
+We will not present the implementation details for all three methods but briefly demonstrate the simplicity of implementing the body of such an iterative solver, given a textbook recipe or mathematical formulation. The defect correction method iteratively improves the solution to $\mymat{A}\myvec{x}=\myvec{b}$, given an initial start guess $\myvec{x}^0$, by continuously solving a preconditioned error equation. The defect correction\index{defect correction!iteration} iteration can be written as
 \begin{align}
-\myvec{x}^{k+1} = \myvec{x}^{k} + \mymat{M}^{-1}(\myvec{b}-\mymat{A}\myvec{x}^{k}), \quad \mathcal{M}\in\mathbb{R}^{N\times N}, \quad {\bf x},{\bf b}\in\mathbb{R}^N \label{ch5:eq:dc}
+\myvec{x}^{k+1} = \myvec{x}^{k} + \mymat{M}^{-1}(\myvec{b}-\mymat{A}\myvec{x}^{k}), \quad \mymat{A},\mathcal{M}\in\mathbb{R}^{N\times N}, \quad {\bf x},{\bf b}\in\mathbb{R}^N, \label{ch5:eq:dc}
 \end{align}
-where $k$ is the iteration number and $\mymat{M}$ is the preconditioner\index{preconditioning} which is based on approximation to the original coefficient matrix. To achieve efficient algebraic convergence\index{convergence}, a solve step with the preconditioner should be computational inexpensive compared to using $\mymat{A}$. How to implement \eqref{ch5:eq:dc} within the library context is illustrated in Listing \ref{ch5:lst:dc}. The host CPU is responsible for traversing through each line in Listing \ref{ch5:lst:dc} and testing for convergence, while the computationally expensive matrix-vector operation and preconditioning, can be executed on the GPU, if GPU-based components are used. The defect correction method has two attractive properties. First, global reduction is only required to monitor convergence once per iteration during convergence evaluation, which reduces communication requirements and provides a basis for efficient and scalable parallelization. Second, it has minimal memory footprint, which is good for the solution of very large systems.
+where $k$ is the iteration number and $\mymat{M}$ is the preconditioner\index{preconditioning} which should be an approximation to the original coefficient matrix $\mymat{A}$. To achieve fast numerical convergence\index{convergence}, applying the preconditioner should be a computationally inexpensive operation compared to solving the original system. How to implement \eqref{ch5:eq:dc} within the library context is illustrated in Listing \ref{ch5:lst:dc}. The host CPU traverses each line in Listing \ref{ch5:lst:dc} and tests for convergence, while the computationally expensive matrix-vector operation and preconditioning, can be executed on the GPU, if GPU-based components are used. The defect correction method has two attractive properties. First, global reduction is required to monitor convergence only once per iteration during convergence evaluation, which reduces communication requirements and provides a basis for efficient and scalable parallelization. Second, it has a minimal constant memory footprint, making it a suitable method for solving very large systems.
 
-In the following section we demonstrate how to assemble a solver for the discrete Poisson problem, using one of the tree iterative methods to efficiently solve \eqref{ch5:eq:poissonsystem}.
-
-\lstset{caption={Main loop for the iterative defect correction solver. The solver is instantiated with template argument types for the matrix and vector classes, allowing underlying implementations to be based on GPU kernels.}}
-\begin{lstlisting}[label=ch5:lst:dc]
+\lstset{label=ch5:lst:dc,caption={main loop for the iterative defect correction solver: the solver is instantiated with template argument types for the matrix and vector classes, allowing underlying implementations to be based on GPU kernels}}
+\begin{lstlisting}
 while(r.nrm2() > tol)
 {
     // Calculate residual
@@ -323,15 +324,16 @@ while(r.nrm2() > tol)
 }
 \end{lstlisting}
 
+In the following section we demonstrate how to assemble a solver for the discrete Poisson problem, using one of the three iterative methods to efficiently solve \eqref{ch5:eq:poissonsystem}.
 
-\subsubsection{Assembling the Poisson Solver}
+\subsubsection{Assembling the Poisson solver}
 
-Assembling the Poisson solver follows almost the same procedure as the heat conduction solver, except the time integration part is exchanged with an iterative method to to solve the system of linear equations \eqref{ch5:eq:poissonsystem}. For the discrete matrix-vector product we reuse the Laplace operator from the heat conduction problem in Listing \ref{ch5:lst:laplaceimpl} with a few modifications. The Laplace operator now appears as a matrix component, so to be compatible with the component interface rules in Figure \ref{ch5:fig:componentdesign} the parenthesis operator is renamed to \texttt{mult}, taking two vector components as argument.
+Assembling the Poisson solver follows almost the same procedure as the heat conduction solver, except the time integration part is exchanged with an iterative method to solve the system of linear equations \eqref{ch5:eq:poissonsystem}. For the discrete matrix-vector product we reuse the Laplace operator from the heat conduction problem in Listing \ref{ch5:lst:laplaceimpl} with few modifications. The Laplace operator is now a matrix component, so to be compatible with the component interface rules in Figure \ref{ch5:fig:componentdesign}, a \texttt{mult} function taking two vector arguments is implemented instead of the parentheses operator. We leave out this code example as it almost identical to the one in Listing \ref{ch5:lst:laplaceimpl}.
 
-At the beginning of the solver implementation we list the type definitions for the Poisson solver that will be used throughout the implementation. Here we use a geometric multigrid\index{multigrid} method as a preconditioner for the defect correction method. Therefore the multigrid solver is assembled first, so that it can be used in the assembling of the defect correction method. Listing \ref{ch5:lst:poissontypedefs} defines the types for the vector, the matrix, the multigrid preconditioner and the defect correction solver. The geometric multigrid method needs two additional template arguments, that are specific for multigrid, namely a smoother and a grid restriction/interpolation operator. These arguments are free to be implemented and supplied by the developer if special care is required for their specific problems, e.g. for a custom grid structure. For the Poisson problem on a regular grid, the library contains built-in restriction and interpolation operators, and a red-black Gauss-Seidel smoother. We refer to \cite{ch5:Trottenberg2001} for extensive details on multigrid methods. The monitor and config types that appear in Listing \ref{ch5:lst:poissontypedefs} are used for convergence monitoring within the iterative solver and to control run time parameters, such as tolerances, iteration limits, etc.
+At the beginning of the solver implementation we list the type definitions for the Poisson solver that will be used throughout the implementation. Here we use a geometric multigrid\index{multigrid} method as a preconditioner for the defect correction method. Therefore the multigrid solver is assembled first, so that it can be used in the assembling of the defect correction solver. Listing \ref{ch5:lst:poissontypedefs} defines the types for the vector, the matrix, the multigrid preconditioner, and the defect correction solver. The geometric multigrid method needs two additional template arguments that are specific for multigrid, namely, a smoother and a grid restriction/interpolation operator. These arguments are free to be implemented and supplied by the developer if special care is required, e.g., for a custom grid structure. For the Poisson problem on a regular grid, the library contains built-in restriction and interpolation operators, and a red-black Gauss-Seidel smoother. We refer the reader to \cite{ch5:Trottenberg2001} for extensive details on multigrid methods. The monitor and config types that appear in Listing \ref{ch5:lst:poissontypedefs} are used for convergence monitoring within the iterative solver and to control runtime parameters, such as tolerances and iteration limits.
 
-\lstset{caption={Type definitions for the Laplacian matrix component and the multigrid preconditioned iterative defect correction\index{defect correction} solver.}}
-\begin{lstlisting}[label=ch5:lst:poissontypedefs]
+\lstset{label=ch5:lst:poissontypedefs,caption={type definitions for the Laplacian matrix component and the multigrid preconditioned iterative defect correction\index{defect correction} solver}}
+\begin{lstlisting}
 typedef double                                   value_type;
 typedef gpulab::grid<value_type>                 vector_type;
 typedef laplacian<vector_type>                   matrix_type;
@@ -341,7 +343,7 @@ typedef gpulab::solvers::multigrid_types<
 	  vector_type
 	, matrix_type
 	, gpulab::solvers::gauss_seidel_rb_2d
-	, gpulab::solvers::grid_handler_2d_boundary>   mg_types;
+	, gpulab::solvers::grid_handler_2d>           mg_types;
 typedef gpulab::solvers::multigrid<mg_types>     mg_solver_type;
 typedef mg_solver_type::monitor_type             monitor_type;
 typedef monitor_type::config_type                config_type;
@@ -351,14 +353,15 @@ typedef gpulab::solvers::defect_correction_types<
 	  vector_type
 	, matrix_type
 	, monitor_type
-	, mg_solver_type>                              dc_types;
+	, mg_solver_type>                                 dc_types;
 typedef gpulab::solvers::defect_correction<dc_types> dc_solver_type;
 \end{lstlisting}
 
-With the type definitions set up, the implementation for the Poisson solver follows in Listing \ref{ch5:lst:poissonsolver}. Some of the initializations are left out, as they follow the same procedure as for the Heat conduction example. The defect correction and geometric multigrid solvers are initiated and then the multigrid is set as a preconditioner to the defect correction method. Finally the system is solved via a call to \texttt{solve()}.
+With the type definitions set up, the implementation for the Poisson solver follows in Listing \ref{ch5:lst:poissonsolver}. Some of the initializations are left out, as they follow the same procedure as for the heat conduction example. The defect correction and geometric multigrid solvers are initialized and then multigrid is set as a preconditioner to the defect correction method. Finally the system is solved via a call to \texttt{solve()}.
 
-\lstset{caption={Initializing the preconditioned defect correction solver to approximate the solution to $\mymat{A}\myvec{u}=\myvec{f}$.}}
-\begin{lstlisting}[label=ch5:lst:poissonsolver]
+\pagebreak
+\lstset{label=ch5:lst:poissonsolver,caption={initializing the preconditioned defect correction solver to approximate the solution to $\mymat{A}\myvec{u}=\myvec{f}$}}
+\begin{lstlisting}
 matrix_type A(alpha);               // High-order matrix
 matrix_type M(1);                   // Low-order matrix
 
@@ -380,26 +383,22 @@ if(monitor.converged())
 \subsubsection{Numerical solutions to the Poisson problem}\label{ch5:sec:poissonresults}
 
 
-The discrete Poisson problem \eqref{ch5:eq:poissonsystem}, has been solved using the three iterative methods presented above. Convergence histories for the conjugate gradient method and geometric multigrid method, using two different resolutions are illustrated in Figure \ref{ch5:fig:poissonconvergence:a}. Multigrid methods have the attractive properties, that they are very robust and algorithmic efficient, independently of the problem size. Figure \ref{ch5:fig:poissonconvergence:a} confirms that the rate of convergence for the multigrid method is unchanged for both problem sizes. Only the attainable accuracy is slightly worsened, as a consequence of a more ill-conditioned system for large problem sizes.
+The discrete Poisson problem \eqref{ch5:eq:poissonsystem} has been solved using the three iterative methods presented above. Convergence histories for the conjugate gradient method and geometric multigrid method, using two different resolutions, are illustrated in Figure \ref{ch5:fig:poissonconvergence:a}. Multigrid methods are very robust and algorithmic efficient, independent of the problem size. Figure \ref{ch5:fig:poissonconvergence:a} confirms that the rate of convergence for the multigrid method is unchanged for both problem sizes. Only the attainable accuracy is slightly worsened, as a consequence of a more ill-conditioned system for large problem sizes.
 
-Defect correction in combination with multigrid preconditioning, enables efficient solution of high-order approximations of the Poisson problem, illustrated in Figure \ref{ch5:fig:poissonconvergence:b}. The multigrid preconditioning matrix $\mymat{M}$ is based on a low order approximation to \eqref{ch5:eq:poissonsystem}, whereas matrix $\mymat{A}$ is a high-order approximation. When $\mymat{M}$ is a close approximation to $\mymat{A}$, defect correction converges most rapidly. This is the effect that can be seen between the three convergence lines in Figure \ref{ch5:fig:poissonconvergence:b}.
+Defect correction in combination with multigrid preconditioning enables efficient solution of high-order approximations of the Poisson problem, illustrated in Figure \ref{ch5:fig:poissonconvergence:b}. The multigrid preconditioning matrix $\mymat{M}$ is based on a low-order approximation to \eqref{ch5:eq:poissonsystem}, whereas matrix $\mymat{A}$ is a high-order approximation. When $\mymat{M}$ is a close approximation to $\mymat{A}$, defect correction converges most rapidly. This is the effect that can be seen between the three convergence lines in Figure \ref{ch5:fig:poissonconvergence:b}.
 
 
 \begin{figure}[!htb]
-\setlength\figureheight{0.33\textwidth}
-\setlength\figurewidth{0.33\textwidth}
-\begin{center}
-\subfigure[Convergence history for the conjugate gradient and multigrid methods, for two different problem sizes.]{\label{ch5:fig:poissonconvergence:a}
+\centering
+\subfigure[Convergence histories for the conjugate gradient (CG) and multigrid (MG) methods, for two different problem sizes.]{\label{ch5:fig:poissonconvergence:a}
     %{\scriptsize \input{Chapters/chapter5/figures/ConvergenceMGvsCG.tikz}}
-    {\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/ConvergenceMGvsCG_conv.pdf}}
-}
-
- \hspace{0.2cm}%
+    {\includegraphics[width=0.45\textwidth]{Chapters/chapter5/figures/ConvergenceMGvsCG_conv.pdf}}
+}%
+\hspace{0.5cm}%
 \subfigure[Defect correction convergence history for three different stencil sizes.]{\label{ch5:fig:poissonconvergence:b}
     %{\scriptsize \input{Chapters/chapter5/figures/ConvergenceDC.tikz}}
- {\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/ConvergenceDC_conv.pdf}}
+ {\includegraphics[width=0.45\textwidth]{Chapters/chapter5/figures/ConvergenceDC_conv.pdf}}
 }
-\end{center}
 \caption{Algorithmic performance for the conjugate gradient, multigrid, and defect correction methods, measured in terms of the relative residual per iteration.}\label{ch5:fig:poissonconvergence}
 \end{figure}
 
@@ -409,85 +408,84 @@ Defect correction in combination with multigrid preconditioning, enables efficie
 
 \section{Optimization strategies for multi-GPU systems}\label{ch5:sec:multigpu}\index{multi-GPU}
 
-CUDA enabled GPUs are optimized for high memory bandwidth and fast on-chip performance. However, the role as a separate co-processor to the CPU can be a limiting factor for large scale scientific applications, because the per GPU memory capacity is fixed and is only in the range of a few Gigabytes. In comparison, it is not unusual for a high-end workstation to be equipped with $\sim32$GB of main memory, plus a terabyte capacity hard disk for secondary storage. Therefore, large scale scientific applications that process Gigabytes of data, require distributed computations on multiple GPU devices. Multi-GPU desktop computers and clusters can have a very attractive peak performance, but the addition of multiple devices introduces the potential performance bottleneck of slow data transfers across PCIe busses and network interconnections, as illustrated in Figure \ref{ch5:fig:gpu2gputransfer}. The ratio between data transfers and computational work has a significant impact on the possibility for latency hiding and thereby overall application performance.
+CUDA enabled GPUs are optimized for high memory bandwidth and fast on-chip performance. However, the role as a separate coprocessor to the CPU can be a limiting factor for large scale scientific applications, because the GPU memory capacity is fixed and is only in the range of a few gigabytes. In comparison, it is not unusual for a high-end workstation to be equipped with $\sim32$GB of main memory, plus a terabyte hard disk capacity for secondary storage. Therefore, large scale scientific applications that process gigabytes of data, require distributed computations on multiple GPU devices. Multi-GPU desktop computers and clusters can have a very attractive peak performance, but the addition of multiple devices introduces the potential performance bottleneck of slow data transfers across PCIe busses and network interconnections, as illustrated in Figure \ref{ch5:fig:gpu2gputransfer}. The ratio between data transfers and computational work has a significant impact on the possibility for latency hiding and thereby overall application performance.
 
 \begin{figure}[!htb]
 \begin{center}
 %\includegraphics[width=0.6\textwidth]{Chapters/chapter5/figures/GPU2GPU.png}
 \input{Chapters/chapter5/figures/GPU2GPU.tikz}
 \end{center}
-\caption[Message passing between two GPUs involves several memory transfers across lower bandwidth connections.]{Message passing between two GPUs involves several memory transfers across lower bandwidth connections. A kernel that aligns the data to be transferred is required, if the data is not already sequentially stored in device memory.}\label{ch5:fig:gpu2gputransfer}
+\caption[Message passing between two GPUs involves several memory transfers across lower bandwidth connections.]{Message passing between two GPUs involves several memory transfers across lower bandwidth connections. The kernel call is required if the data is not already sequentially stored in device memory. Recent generations of NVIDIA GPUs, CUDA, and MPI support direct transfers without explicitely transfering data to the host first.}\label{ch5:fig:gpu2gputransfer}
 \end{figure}
 
 
-Developing applications that exploit the full computational capabilities of modern clusters -- GPU-based or not -- is no trivial matter. Developers are faced with the complexity of distributing and coordinating computations on nodes consisting of many-core CPUs, GPUs and potentially other types of accelerators as well. These complexities give rise to challenges in finding numerical algorithms, that are well suited for such systems, forcing developers to search for novel methods for introducing concurrency.
+Developing applications that exploit the full computational capabilities of modern clusters--GPU-based or not--is no trivial matter. Developers are faced with the complexity of distributing and coordinating computations on nodes consisting of many-core CPUs, GPUs and potentially other types of accelerators as well. These complexities give rise to challenges in finding numerical algorithms, that are well suited for such systems, forcing developers to search for novel methods that utilize concurrency.
 
-To ease software development, we use MPI-2 for message passing and ensure a safe and private communication space by creation of a communicator private to the library during initialization, as recommended by Hoefler and Snir~\cite{ch5:Hoefler2011}. With the addition of GPUDirect to CUDA, it is possible to make direct transfers between GPUs, eliminating CPU overhead. Unfortunately there are rather strict system requirements to enable these direct transfers, and there is no stable MPI implementation that supports message passing via pointers to device memory. Therefore device memory is first transferred to the CPU main memory before invoking any MPI calls. We do provide device to device transfers via template-based library routines, that work directly with GPU vector objects. It hides the complexity of message passing from the developer and help developers design new components for multi-GPU execution.
+To ease software development, we use MPI-2 for message passing and ensure a safe and private communication space by creation of a communicator private to the library during initialization, as recommended by Hoefler and Snir~\cite{ch5:Hoefler2011}. With the addition of remote direct memory access (RDMA) for GPUDirect it is possible to make direct memory transfers between recent generation of GPUs (Kepler), eliminating CPU overhead. Unfortunately there are some strict system and driver requirements to enable these features. Therefore, in the following examples, device memory is first transferred to the CPU main memory before invoking any MPI calls. The library provides device-to-device transfers via template-based routines that work directly with GPU vector objects. This hides the complexity of message passing from the developer and helps developers design new components for multi-GPU execution. 
 
 %(GPUDirect v2 and Unified Virtual Addressing can help). Effort is being made to create GPU aware MPI implementations, that will allow direct memory transfers(cite: openMPI, MVAPICH2). However, only 64-bit systems with Unified Virtual Addressing (UVA) from CUDA v4.0 and Tesla 20-serie (Kepler) GPUs are capable of direct transfers. Therefore only a limited number of today's GPU clusters will be able to utilize these features.
 %For now, the simple solution is to first transfer data to the CPU main memory before invoking the appropriate MPI calls.
 
-In the following sections we present two very different methods for distributed computing based on spatial and temporal decomposition. Each method has its own characteristic, which make them attractive for various types of PDE problems, and for different problem sizes.
+In the following sections we present two very different methods for distributed computing based on spatial and temporal decomposition. Each method has its own characteristic, which makes the method attractive for various types of PDE problems and for different problem sizes.
 
 
 
 \subsection{Spatial domain decomposition}\label{ch5:sec:dd}\index{domain decomposition}
 
-Domain decomposition methods can be used for distributing computational work in the numerical solution of boundary value problems~\cite{ch5:Smith1996}. These methods add parallelism, by splitting the spatial dimensions, on which the boundary values are defined, into a number of smaller boundary value problems, and coordinating the solution between adjacent subdomains. Domain decomposition techniques, such as the classical overlapping Schwarz methods, may be considered as preconditioners \index{preconditioning} to the system of equations, that arise from discretization of the PDE, e.g. as for the Poisson problem in \eqref{ch5:eq:poissonsystem}. The algorithmic efficiency of such methods depends on the size of the domain overlaps, while an additional coarse grid correction method is sometimes necessary to maintain fast global convergence.
+Domain decomposition methods can be used for distributing computational work in the numerical solution of boundary value problems~\cite{ch5:Smith1996}. These methods add parallelism by splitting the spatial dimensions, on which the boundary values are defined, into a number of smaller boundary value problems and then coordinating the solution between adjacent subdomains. Domain decomposition techniques, such as the classical overlapping Schwarz methods, may be considered as preconditioners \index{preconditioning} to the system of equations that arise from the discretization of PDEs, e.g., as for the Poisson problem in \eqref{ch5:eq:poissonsystem}. The algorithmic efficiency of such methods depends on the size of the domain overlaps, while an additional coarse grid correction method is sometimes necessary to maintain fast global convergence.
 %A detailed study and analysis of convergence criterions, overlapping zones, local domain solvers etc. can lead to parallel efficiencies that scale linearly with the number of processors for large problem sizes, see e.g. \cite{ch5:Cai2005}.
 
-An alternative to the preconditioning strategy is to have each subdomain query information from adjacent subdomains whenever needed. For PDEs that are discretized onto regular shaped grids, this can be an attractive strategy, as the decomposition of subdomains, and thereby the communication topology\index{topology}, is straightforward. The library supports decomposition of regular grids by either pre- or user-defined topologies. A topology in this context, is a description of connectivity between processors, that share the same grid along with information about the local and global discretization. Layers of ghost points are inserted at the artificially introduced boundaries to account for grid points that reside in adjacent subdomains. The ghost point values can be updated upon request from the user. An illustrative example is given in Figure \ref{ch5:fig:dd2d}; the arrows indicate message passing between adjacent subdomains, when updating grid points within the ghost layers.
+An alternative to the preconditioning strategy is to have each subdomain query information from adjacent subdomains whenever needed. For PDEs that are discretized onto regular shaped grids, this can be an attractive strategy, as the decomposition of subdomains, and thereby the communication topology\index{topology}, is straightforward. The library supports decomposition of regular grids by either pre- or user-defined topologies. A topology in this context, is a description of connectivity between processors that share the same grid along with information about the local and global discretization. Layers of ghost points are inserted at the artificially introduced boundaries to account for grid points that reside in adjacent subdomains. The ghost point values can be updated upon request from the user. An illustrative example is given in Figure \ref{ch5:fig:dd2d}; the arrows indicate message passing between adjacent subdomains, when updating grid points within the ghost layers.
 \begin{figure}[!htb]
 \begin{center}
 \input{Chapters/chapter5/figures/dd2d.tikz}
 \end{center}
-\caption[Domain distribution along the horizontal direction of a two dimensional grid into three subdomains.]{Domain distribution along the horizontal direction of a two dimensional grid into three subdomains. {\large$\bullet$} and {\scriptsize$\textcolor[rgb]{0.5,0.5,0.5}{\blacksquare}$} represent internal grid points and ghost points respectively.}\label{ch5:fig:dd2d}
+\caption[Domain distribution of a two-dimensional grid into three subdomains.]{Domain distribution of a two-dimensional grid into three subdomains. {\large$\bullet$} and {\scriptsize$\textcolor[rgb]{0.5,0.5,0.5}{\blacksquare}$} represent internal grid points and ghost points, respectively.}\label{ch5:fig:dd2d}
 \end{figure}
 
-Topologies are introduced via an extra template argument to the grid component. A grid is by default not distributed, because the default template argument is a non-distribution topology implementation. The grid class is extended with a new member function \texttt{update()}, that makes sure that the topology instance updates all ghost points. The library contains two topology strategies, based on one dimensional and two dimensional distributions of the grid. The number of grid subdomains will be equal to the number of MPI processes executing the program.
+Topologies are introduced via an extra template argument to the grid class. A grid is by default not decomposed, because the default template argument is based on a non distribution topology implementation. The grid class is extended with a new member function \texttt{update()}, which makes sure that all ghost points are updated according to the grid topology. The library contains topologies based on one-dimensional and two-dimensional distributions of the grid. The number of grid subdomains will be equal to the number of MPI processes executing the program.
 
-If these updates are performed whenever information from adjacent subdomains are needed, i.e. before a stencil operation, all distributed interior points will be exactly the same as they would be for the non-distributed setup. Therefore, one advantage of this method is, that the algorithmic efficiency of an application can be preserved, if grid updates are invoked at the proper times.
+If grid ghost layers are updated whenever information from adjacent subdomains is needed, e.g., before a stencil operation, all interior points will be exactly the same as they would be for the non distributed setup. Therefore, one advantage of this approach is that the algorithmic efficiency of an application can be preserved, if grid updates are consistently invoked at the proper times.
 
-Distributed performance for the finite difference stencil operation is illustrated in Figure \ref{ch5:fig:multigpu}. The timings include the compute time for the finite difference approximation and the time for updating ghost layers via message passing, cf. Figure \ref{ch5:fig:gpu2gputransfer}. It is obvious from Figure \ref{ch5:fig:multigpu:a}, that communication overhead dominates for smaller problem sizes, where the non-distributed grid (1 GPU) is fastest. However, communication times do not grow as rapidly for larger problems as computation times, due to the surface to volume ratio. Therefore message passing becomes less influential for large problems, where reasonable performance speedups are obtained. Figure \ref{ch5:fig:multigpu:b} demonstrates how the computational performance on multi-GPU systems can be significantly improved for various stencil sizes. With this simple domain decomposition technique, developers are able to implement applications based on heterogeneous distributed computing, without explicitly dealing with message passing, while it is still possible to provide user specific implementations of the topology class for customized grid updates.
+Distributed performance for the finite difference stencil operation is illustrated in Figure \ref{ch5:fig:multigpu}. The timings include the compute time for the finite difference approximation and the time for updating ghost layers via message passing. It is obvious from Figure \ref{ch5:fig:multigpu:a} that communication overhead dominates for the smallest problem sizes, where the non distributed grid (1 GPU) is fastest. However, communication overhead does not grow as rapidly as computation times, due to the surface-to-volume ratio. Therefore message passing becomes less influential for large problems, where reasonable performance speedups are obtained. Figure \ref{ch5:fig:multigpu:b} demonstrates how the computational performance on multi-GPU systems can be significantly improved for various stencil sizes. With this simple domain decomposition technique, developers are able to implement applications based on heterogeneous distributed computing, without explicitly dealing with message passing and it is still possible to provide user specific implementations of the topology class for customized grid updates.
 
 % TODO: Should we put in the DD algebra?
 
 \begin{figure}[!htb]
-    \setlength\figureheight{0.27\textwidth}
-    \setlength\figurewidth{0.55\textwidth}
-    \begin{center}
+    \setlength\figureheight{0.39\textwidth}
+    \setlength\figurewidth{0.39\textwidth}
+%	\centering
     \subfigure[Absolute timings, $\alpha=3$.]{
-    %{\small\input{Chapters/chapter5/figures/MultiGPUAlpha3TimingsTeslaM2050.tikz}}
-    {\includegraphics[width=0.6\textwidth]{Chapters/chapter5/figures/MultiGPUAlpha3TimingsTeslaM2050_conv.pdf}}
+    {\scriptsize\input{Chapters/chapter5/figures/MultiGPUAlpha3TimingsTeslaM2050.tikz}}
+    %{\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/MultiGPUAlpha3TimingsTeslaM2050_conv.pdf}}
     \label{ch5:fig:multigpu:a}
-    }
+    }%
     \subfigure[Performance at $N=4069^2$, single precision.]{
-   % {\small\input{Chapters/chapter5/figures/MultiGPUAlphaPerformanceTeslaM2050_N16777216.tikz}}
-{\includegraphics[width=0.6\textwidth]{Chapters/chapter5/figures/MultiGPUAlphaPerformanceTeslaM2050_N16777216_conv.pdf}}
+    {\scriptsize\input{Chapters/chapter5/figures/MultiGPUAlphaPerformanceTeslaM2050_N16777216.tikz}}
+	%{\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/MultiGPUAlphaPerformanceTeslaM2050_N16777216_conv.pdf}}
     \label{ch5:fig:multigpu:b}
     }
-    \end{center}
-    \caption{Performance timings for distributed stencil operations, including communication and computation times. Test environment 2.}\label{ch5:fig:multigpu}
+    \caption{Performance timings for distributed stencil operations, including communication and computation times. Executed on test environment 3.}\label{ch5:fig:multigpu}
 \end{figure}
 
-\subsection{Parareal -- Parallel time integration}\label{ch5:sec:parareal}\index{parareal}
+\subsection{Parareal--parallel time integration}\label{ch5:sec:parareal}\index{parareal}
 
-The use of spatial domain decomposition methods is widespread, as they have proven to be efficient on a wide range of problems. Unfortunately, applications that are concerned with limited numerical problem sizes, rapidly reach their speedup limit for a low number of processors, due to a degradation of the scalability when the number of processors become large as this leads to an increasingly unfavourable communication to compute ratio of the individual subdomains. This issue is continuously worsened by the fact that communication speed has been increasing at a far slower pace than compute speed for the past many years, and is expected to continue to do so for years to come. It is often referred to as "the memory wall"~\cite{ch5:Asanovic:EECS-2006-183}, one of the grand challenges facing development and architectural design of future high-performance systems~\cite{ch5:Keyes2011,ch5:ScientificGrandChallenges2010}. Finally, there are applications based on ordinary differential equations, where classical domain decomposition methods are not even applicable~\cite{ch5:YMTR08}. For these type of applications, a method of adding parallelism in the temporal integration is of great interest. Contrary to space however, time is -- by its very nature -- sequential, which precludes a straightforward implementation of a parallel approach.
+The use of spatial domain decomposition methods is widespread, as they have proven to be efficient on a wide range of problems. Unfortunately, applications that are concerned with limited numerical problem sizes can rapidly reach a speedup limit for a low number of processors due to scalability degradation when the number of processors becomes large, as this leads to an increasingly unfavourable communication-to-compute ratio. This issue is continuously worsened by the fact that communication speed has been increasing at a far slower pace than compute speed for the past several years, and this trend is expected to continue for years to come. It is often referred to as \emph{the memory wall}~\cite{ch5:Asanovic:EECS-2006-183}, one of the grand challenges facing development and architectural design of future high-performance systems~\cite{ch5:Keyes2011,ch5:ScientificGrandChallenges2010}. Also, there are applications based on ordinary differential equations, where classical domain decomposition methods are not even applicable~\cite{ch5:YMTR08}. For these type of applications, a method of adding parallelism in the temporal integration is of great interest. Contrary to space however, time is--by its very nature--sequential, which precludes a straightforward implementation of a parallel approach.
 
-One method that introduces concurrency to the solution of evolution problems is the parareal algorithm. Parareal is an iterative method imposed on a time decomposition. Gander and Vandewalle showed in \cite{ch5:MS07}, that the algorithm can be written as both a multiple shooting method and as a two level multigrid in time approach, even though the leading idea came from spatial domain decomposition. The method has many exciting features: it is fault tolerant and has different communication characteristics than those of the classical domain decomposition methods. It has also been demonstrated to work effectively on a wide range of problems, and most importantly, once the proper distribution infrastructure is in place, it can easily be wrapped around any type of numerical integrator, for any type of initial value problem.
+One method that introduces concurrency to the solution of evolution problems is the parareal algorithm. Parareal is an iterative method imposed on a time decomposition. Gander and Vandewalle showed in \cite{ch5:MS07} that the algorithm can be written both as a multiple shooting method and as a two-level multigrid-in-time approach, even though the leading idea came from spatial domain decomposition. The method has many exciting features: it is fault tolerant and has different communication characteristics than those of the classical domain decomposition methods. It has also been demonstrated to work effectively on a wide range of problems, and most importantly, once the proper distribution infrastructure is in place, it can easily be wrapped around any type of numerical integrator, for any type of initial value problem.
+\label{ch5:parareal}
+
+\subsubsection{The parareal algorithm}
 
+The parareal algorithm was first presented in 2001, in a paper by Lions et al.~\cite{ch5:LMT01}, and later introduced in a slightly revised predictor-corrector form in 2002 by Baffico et al.~\cite{ch5:LSY02}. The parareal-in-time approach proposes to break the global problem of time evolution into a series of independent evolution problems on smaller intervals, see Figure \ref{ch5:fig:ParallelInTime}. %
 \begin{figure}[!htb]
 \def\FigureSize{0.6}
 \begin{center}
 \input{Chapters/chapter5/figures/ParallelInTime.tikz}
 \end{center}
-\caption[Time domain decomposition.]{Time domain decomposition. A compute node is assigned to each individual time sub-main to compute the initial value problem. Consistency at the time sub-domain boundaries is obtained with the application of a computationally cheap integrator in conjunction with the parareal iterative predictor-corrector algorithm, eq. \eqref{ch5:eq:PARAREAL}. }\label{ch5:fig:ParallelInTime}
-\end{figure}
-\label{ch5:parareal}
-
-\subsubsection{The Parareal algorithm}
-
-The parareal algorithm was first presented in 2001, in a paper by Lions et al.~\cite{ch5:LMT01}, and later introduced in a slightly revised predictor-corrector form in 2002 by Baffico et al.~\cite{ch5:LSY02}. The parareal in time approach proposes to break the global problem of time evolution into a series of independent evolution problems on smaller intervals, see Figure \ref{ch5:fig:ParallelInTime}. Initial states for these problems are needed, and supplied by a simple, less accurate, but computationally cheap sequential integrator. The smaller independent evolution problems can then be solved in parallel. The information, generated during the concurrent solution of the independent evolution problems with accurate propagators and inaccurate initial states, is used in a predictor-corrector fashion in conjunction with the coarse integrator to propagate the solution faster, now using the information generated in parallel. We define the decomposition into $N$ intervals, that is
+\caption[Time domain decomposition.]{Time domain decomposition. A compute node is assigned to each individual time subdomain to compute the initial value problem. Consistency at the time subdomain boundaries is obtained with the application of a computationally cheap integrator in conjunction with the parareal iterative predictor-corrector algorithm}\label{ch5:fig:ParallelInTime}
+\end{figure}%
+Initial states for these problems are needed and supplied by a simple, less accurate, but computationally cheap sequential integrator. The smaller independent evolution problems can then be solved in parallel. The information, generated during the concurrent solution of the independent evolution problems with accurate propagators and inaccurate initial states, is used in a predictor-corrector fashion in conjunction with the coarse integrator to propagate the solution faster, now using the information generated in parallel. We define the decomposition into $N$ intervals, that is,
 \begin{align}
  & T_{0}<T_{1}<\cdots<T_{n}=n\Delta T<T_{n+1}<T_{N},
 \end{align}
@@ -495,26 +493,27 @@ where $\Delta T$ is the size of the time intervals and $n=0,1,\ldots,N$. The gen
 \begin{align}\label{ch5:eq:difproblem}
  \frac{\partial u}{dt}+\mathcal{A}u=0, \qquad u(T_{0})=u^{0}, \quad t\in\left[T_{0},T_{N}\right],
 \end{align}
-where $\mathcal{A}$ is an operator from one Hilbert space to another. To solve the differential problem \eqref{ch5:eq:difproblem} we define an operator $\mathcal{F}_{\Delta T}$ that operates on some initial state $U_{n} \approx u(T_{n})$, and return an approximate solution to \eqref{ch5:eq:difproblem}, at time $T_{n}+\Delta T$. Such an operator is achieved by the implementation of a numerical time integrator, using some small time-step $\delta t\ll\Delta T$ in the integration. The numerical solution to \eqref{ch5:eq:difproblem} is then obtained by applying the fine propagator sequentially for $n=1,2,\ldots,N$.
+where $\mathcal{A}$ is an operator from one Hilbert space to another. To solve the differential problem \eqref{ch5:eq:difproblem} we define an operator $\mathcal{F}_{\Delta T}$ that operates on some initial state $U_{n} \approx u(T_{n})$ and returns an approximate solution to \eqref{ch5:eq:difproblem}, at time $T_{n}+\Delta T$. Such an operator is achieved by the implementation of a numerical time integrator, using some small time-step $\delta t\ll\Delta T$ in the integration. The numerical solution to \eqref{ch5:eq:difproblem} can then be obtained by applying the fine propagator sequentially for $n=1,2,\ldots,N$.
 \begin{align}\label{ch5:eq:fineOp}
 \hat{U}_{n}=\mathcal{F}_{\Delta T}\left(T_{n-1},\hat{U}_{n-1}\right), \qquad\hat{U}_{0}=u^{0}.
 \end{align}
-For the purpose of parallel acceleration of the otherwise purely sequential process of obtaining $\mathcal{F}_{\Delta T}^{N} u^{0} \approx u(T_{N})$, we define the coarse propagator $\mathcal{G}_{\Delta T}$. $\mathcal{G}_{\Delta T}$ also operates on some initial state $U_{n}$, propagating the solution over the time interval $\Delta T$, but now using a time step $\delta T$. Typically $\delta t<\delta T<\Delta T$. For the parareal algorithm to be effective, the coarse propagator $\mathcal{G}_{\Delta T}$ has to be substantially faster to evaluate than the fine propagator $\mathcal{F}_{\Delta T}$. There are many ways of constructing the coarse propagator, the simplest one being to apply the same numerical integrator as for the fine propagator, but using a coarser time discretization. We refer to \cite{ch5:ASNP12} for an introduction to other methods. The coarse operator reads
+
+For the purpose of parallel acceleration of the otherwise purely sequential process of obtaining $\mathcal{F}_{\Delta T}^{N} u^{0} \approx u(T_{N})$, we define the coarse propagator $\mathcal{G}_{\Delta T}$. $\mathcal{G}_{\Delta T}$ also operates on some initial state $U_{n}$, propagating the solution over the time interval $\Delta T$, but now using a time step $\delta T$. Typically $\delta t<\delta T<\Delta T$. For the parareal algorithm to be effective, the coarse propagator $\mathcal{G}_{\Delta T}$ has to be substantially faster to evaluate than the fine propagator $\mathcal{F}_{\Delta T}$. There are many ways of constructing the coarse propagator, the simplest one being to apply the same numerical integrator as for the fine propagator, but using a coarser time discretization. We refer the reader to \cite{ch5:ASNP12} for an introduction to other methods. The coarse operator reads
 \begin{align}\label{ch5:eq:coarseOp}
 \tilde{U}_{n}=\mathcal{\mathcal{G}}_{\Delta T}\left(T_{n-1},\tilde{U}_{n-1}\right),\qquad\tilde{U_{0}}=u^{0}.
 \end{align}
-Using the defined $\mathcal{F}_{\Delta T}$ and $\mathcal{G}_{\Delta T}$ operators, the predictor-corrector form of the parareal algorithm can be written in a single line
+Using the defined $\mathcal{F}_{\Delta T}$ and $\mathcal{G}_{\Delta T}$ operators, the predictor-corrector form of the parareal algorithm can be written in a single line as
 \begin{equation}\label{ch5:eq:PARAREAL}
 U_{n}^{k+1}=\mathcal{\mathcal{G}}_{\Delta T}\left(U_{n-1}^{k+1}\right)+\mathcal{\mathcal{F}}_{\Delta T}\left(U_{n-1}^{k}\right)-\mathcal{\mathcal{G}}_{\Delta T}\left(U_{n-1}^{k}\right),\quad U_{0}^{k}=u^{0},
 \end{equation}
-with the initial prediction $U^{0}_{n} = \mathcal{G}_{\Delta T}^{n} u^{0}$ for $n=1\ldots N$ and $k=1\ldots K$. $N$ being the number of time subdomains, while $K\geq1$ is the number of predictor-corrector iterations applied. The parareal algorithm is implemented in the library as a separate time integration component, using a fully distributed work scheduling model, as proposed by Aubanel (2010)~\cite{ch5:EA10}. The model is schematically presented in Figure \ref{ch5:fig:FullyDistributedCores}. The parareal component hides all communication and work distribution from the application developer.  It is defined such that a user only has to decide what coarse and fine propagators to use. Setting up the type definitions for parareal time integration using forward Euler for coarse propagation and fourth order Runge-Kutta for fine propagation, could then be defined as follows:
-\lstset{caption={Assembling a parareal time integrator using forward Euler for coarse propagation and a Runge-Kutta method for fine propagation.}}
-\begin{lstlisting}[label=ch5:lst:parareal]
+with the initial prediction $U^{0}_{n} = \mathcal{G}_{\Delta T}^{n} u^{0}$ for $n=1\ldots N$ and $k=1\ldots K$. $N$ being the number of time subdomains, while $K\geq1$ is the number of predictor-corrector iterations applied. The parareal algorithm is implemented in the library as a separate time-integration component, using a fully distributed work scheduling model, as proposed by Aubanel~\cite{ch5:EA10}. The model is schematically presented in Figure \ref{ch5:fig:FullyDistributedCores}. The parareal component hides all communication and work distribution from the application developer.  It is defined such that a user only has to decide what coarse and fine propagators to use. Setting up the type definitions for parareal time-integration using forward Euler for coarse propagation and fourth order Runge-Kutta for fine propagation could then be defined as in Listings \ref{ch5:lst:parareal}. The number of GPUs used for parallelization depends on the number of MPI processes executing the application.
+\lstset{label=ch5:lst:parareal,caption={assembling a parareal time integrator using forward Euler for coarse propagation and a Runge-Kutta method for fine propagation}}
+\begin{lstlisting}
 typedef gpulab::integration::forward_euler          coarse;
 typedef gpulab::integration::ERK4                   fine;
 typedef gpulab::integration::parareal<coarse,fine>  integrator;
 \end{lstlisting}
-The number of GPUs used for parallelization depends on the number of MPI processes that the application is initiated with.
+
 
 %Each application of $\mathcal{\mathcal{G}}_{\Delta T}$ or $\mathcal{\mathcal{F}}_{\Delta T}$ to a state $\tilde{U}_{n}^{k}$ is performed on a GPU while the corrections themselves are performed on CPUs. In addition to an efficient distribution model, a stopping strategy is needed as parareal is an iterative algorithm. Various choices has been implemented in the library, these are presented in section \ref{ch5:subsec:Lib_impl}.
 
@@ -523,24 +522,28 @@ The number of GPUs used for parallelization depends on the number of MPI process
 \begin{center}
 \input{Chapters/chapter5/figures/FullyDistributed.tikz}
 \end{center}
-\caption[Schematic visualization of a fully distributed work scheduling model for the parareal algorithm as proposed by Aubanel.]{\label{ch5:fig:FullyDistributedCores} Schematic visualization of a fully distributed work scheduling model for the parareal algorithm as proposed by Aubanel. Each GPU is responsible for computing the solution on a single time sub-domain. The computation is initiated at rank $0$ and cascades trough to rank $N$ where the final solution can be fetched. }
+\caption[Schematic visualization of a fully distributed work scheduling model for the parareal algorithm as proposed by Aubanel.]{\label{ch5:fig:FullyDistributedCores} Schematic visualization of a fully distributed work scheduling model for the parareal algorithm as proposed by Aubanel~\cite{ch5:EA10}. Each GPU is responsible for computing the solution on a single time subdomain. The computation is initiated at rank $0$ and cascades through to rank $N$ where the final solution can be fetched.}
 \end{figure}
 
 \subsubsection{Computational complexity}
 
-In the analysis of the computational complexity, we first recognize that both the coarse and the fine propagator, regardless of the type of discretization scheme, involves a complexity that is proportional to the number of time steps being used. Let us define two scalar values $\mathcal{C}_\mathcal{F}$ and $\mathcal{C}_\mathcal{G}$ as the computational cost of performing a single step with the fine and coarse propagators. The computational complexity of a propagator integrating over an interval $\Delta T$ is then given by $\mathcal{C}_\mathcal{F}\frac{\Delta T}{\delta t}$ and $\mathcal{C}_\mathcal{G}\frac{\Delta T}{\delta T}$ respectively. $R$ is introduced as the relation between the two, that is, $R$ is a measure of how much faster the coarse propagator is compared to the fine propagator over the time interval $\Delta T$. The total computational cost for parareal over $N$ intervals is proportional to
+In the analysis of the computational complexity, we first recognize that both the coarse and the fine propagators, regardless of the type of discretization scheme, involve a complexity that is proportional to the number of time steps being used. Let us define two scalar values $\mathcal{C}_\mathcal{F}$ and $\mathcal{C}_\mathcal{G}$ as the computational cost of performing a single step with the fine and coarse propagators. The computational complexity of a propagator integrating over an interval $\Delta T$ is then given by $\mathcal{C}_\mathcal{F}\frac{\Delta T}{\delta t}$ and $\mathcal{C}_\mathcal{G}\frac{\Delta T}{\delta T}$, respectively. $R$ is introduced as the relation between the two; that is, $R$ is a measure of how much faster the coarse propagator is compared to the fine propagator in integrating the time interval $\Delta T$. The total computational cost for parareal over $N$ intervals is then proportional to
 \begin{align}
-\left(k+1\right)N\mathcal{C_{G}}\frac{\Delta T}{\delta T}+kN\mathcal{C_{F}}\frac{\Delta T}{\delta t},
+\left(k+1\right)N\mathcal{C_{G}}\frac{\Delta T}{\delta T}+kN\mathcal{C_{F}}\frac{\Delta T}{\delta t}.
 \end{align}
-recognising that the second term can be distributed over $N$ processors, we are left with
+Recognizing that the second term can be distributed over $N$ processors, we are left with
 \begin{align}\label{ch5:eq:ComComplexPara}
 \left(k+1\right)N\mathcal{C_{G}}\frac{\Delta T}{\delta T}+k\mathcal{C_{F}}\frac{\Delta T}{\delta t}.
 \end{align}
-The above should be compared to the computational complexity of a purely sequential propagation, using only the fine operator, $\frac{T_{N}-T_{0}}{\delta t}\mathcal{C}_\mathcal{F}=N\frac{\Delta T}{\delta t}\mathcal{C}_\mathcal{F}$. From the latter we can estimate the speed-up, here denoted $\psi$, as the ratio between the computational complexity of the purely sequential solution, and the complexity of the solution obtained by the parareal algorithm \eqref{ch5:eq:ComComplexPara}. Neglecting the influence of communication speed and correction time, we are left with the estimate
+The above should be compared to the computational complexity of a purely sequential propagation, using only the fine operator,
+\begin{align}
+\frac{T_{N}-T_{0}}{\delta t}\mathcal{C}_\mathcal{F}=N\frac{\Delta T}{\delta t}\mathcal{C}_\mathcal{F}.
+\end{align}
+We can now estimate the speedup, here denoted $\psi$, as the ratio between the computational complexity of the purely sequential solution, and the complexity of the solution obtained by the parareal algorithm \eqref{ch5:eq:ComComplexPara}. Neglecting the influence of communication speed and correction time, we are left with the estimate
 \begin{align}\label{ch5:eq:EstiSpeedBasicPara}
 \psi=\frac{N\frac{\Delta T}{\delta t}\mathcal{C}_\mathcal{F}}{\left(k+1\right)N\mathcal{C}_\mathcal{G}\frac{\Delta T}{\delta T}+k\mathcal{C}_\mathcal{F}\frac{\Delta T}{\delta t}}=\frac{N}{\left(k+1\right)N\frac{\mathcal{C}_\mathcal{G}}{\mathcal{C}_\mathcal{F}}\frac{\delta t}{\delta T}+k}.
 \end{align}
-If we in addition assume that the time spend on coarse propagation is negligible compared to time spend on the fine propagation, i.e. the limit $\frac{\mathcal{C}_\mathcal{G}}{\mathcal{C}_\mathcal{F}}\frac{\delta t}{\delta T}\rightarrow0$, the estimate reduces to $\psi=\frac{N}{k}$. It is thus clear that the number of iterations $k$ for the algorithm to converge pose an upper bound on obtainable parallel efficiency. The number of iterations needed for convergence is intimately coupled with the ratio between the speed of the coarse and the fine integrator $\frac{\mathcal{C}_\mathcal{G}}{\mathcal{C}_\mathcal{F}}\frac{\delta t}{\delta T}$, denoted $R$. Using a slow, but more accurate coarse integrator, will lead to convergence in fewer iterations $k$, but at the same time it also makes $\frac{\mathcal{C}_\mathcal{G}}{\mathcal{C}_\mathcal{F}}\frac{\delta t}{\delta T}$ larger. Ultimately, this will degrade the obtained speedup as can be deduced from \eqref{ch5:eq:EstiSpeedBasicPara}, and by Amdahl's law also lower the upper bound on possible attainable speedup. Thus, the ratio $\frac{\mathcal{C}_\mathcal{G}}{\mathcal{C}_\mathcal{F}}\frac{\delta t}{\delta T}$ \emph{can not} be made arbitrarily small since the relation is inversely proportional to the iterations $k$, needed for convergence. This poses a challenge in obtaining speedup and is a trade-off between time spend on the fundamentally sequential part of the algorithm and the number of iterations needed for convergence. It is particularly important to consider this trade-off in the choice of stopping strategy; a more thorough discussion on this topic is available in \cite{ch5:ASNP12} for the interested reader. Measurements on parallel efficiency is typically observed to be in the range of 20--50\% in the literature, depending on the problem and the number of time subdomains, which is also confirmed by our measurements using GPUs. Here we include a demonstration of the obtained speedup of parareal applied to the two dimensional heat problem \ref{ch5:eq:heateqbc}. In Figure \ref{ch5:fig:pararealRvsGPUs} the iterations needed for convergence using the forward Euler method as both fine and coarse propagator is presented. R is regulated by changing the coarse time discretization. In Figure \ref{ch5:fig:pararealGPUs} speed-up and parallel efficiency measurements are presented. Notice how when using many GPUs it is advantageous to use a faster, less accurate coarse propagator, despite this requiring an extra parareal iteration that increases the total computational complexity.
+If we additionally assume that the time spent on coarse propagation is negligible compared to the time spent on the fine propagation, i.e., the limit $\frac{\mathcal{C}_\mathcal{G}}{\mathcal{C}_\mathcal{F}}\frac{\delta t}{\delta T}\rightarrow0$, the estimate reduces to $\psi=\frac{N}{k}$. It is thus clear that the number of iterations $k$ for the algorithm to converge poses an upper bound on obtainable parallel efficiency. The number of iterations needed for convergence is intimately coupled with the ratio $R$ between the speed of the fine and the coarse integrators $\frac{\mathcal{C}_\mathcal{F}}{\mathcal{C}_\mathcal{G}}\frac{\delta T}{\delta t}$. Using a slow, but more accurate coarse integrator will lead to convergence in fewer iterations $k$, but at the same time it also makes $R$ smaller. Ultimately, this will degrade the obtained speedup as can be deduced from \eqref{ch5:eq:EstiSpeedBasicPara}, and by Amdahl's law it will also lower the upper bound on possible attainable speedup. Thus, $R$ \emph{cannot} be made arbitrarily large since the ratio is inversely proportional to the number of iterations $k$ needed for convergence. This poses a challenge in obtaining speedup and is a trade-off between time spent on the fundamentally sequential part of the algorithm and the number of iterations needed for convergence. It is particularly important to consider this trade-off in the choice of stopping strategy; a more thorough discussion on this topic is available in \cite{ch5:ASNP12} for the interested reader. Measurements on parallel efficiency are typically observed in the literature to be in the range of 20--50\%, depending on the problem and the number of time subdomains, which is also confirmed by our measurements using GPUs. Here we include a demonstration of the obtained speedup of parareal applied to the two-dimensional heat problem \eqref{ch5:eq:heateq}. In Figure \ref{ch5:fig:pararealRvsGPUs} the iterations needed for convergence using the forward Euler method for both fine and coarse integration are presented. $R$ is regulated by changing the time step size for the coarse integrator. In Figure \ref{ch5:fig:pararealGPUs} speedup and parallel efficiency measurements are presented. Notice, when using many GPUs it is advantageous to use a faster, less accurate coarse propagator, despite it requires an extra parareal iteration that increases the total computational complexity.
 
 \begin{figure}[!htb]
     \setlength\figureheight{0.32\textwidth}
@@ -555,14 +558,14 @@ If we in addition assume that the time spend on coarse propagation is negligible
     \label{ch5:fig:pararealRvsGPUs:b}
     }
     \end{center}
-    \caption[Parareal convergence properties as a function of $R$ and number GPUs used.]{Parareal convergence properties as a function of $R$ and number GPUs used. The error is measured as the relative difference between the purely sequential solution and the parareal solution.}\label{ch5:fig:pararealRvsGPUs}
+    \caption[Parareal convergence properties as a function of $R$ and number of GPUs used.]{Parareal convergence properties as a function of $R$ and number GPUs used. The error is measured as the relative difference between the purely sequential solution and the parareal solution.}\label{ch5:fig:pararealRvsGPUs}
 \end{figure}
 
 \begin{figure}[!htb]
     \setlength\figureheight{0.32\textwidth}
-    \setlength\figurewidth{0.35\textwidth}
+    \setlength\figurewidth{0.34\textwidth}
     \begin{center}
-    \subfigure[Measured parallel speed-up]{
+    \subfigure[Measured parallel speedup]{
     {\small\input{Chapters/chapter5/figures/pararealSpeedupvsRvsGPUs.tikz}}
     \label{ch5:fig:pararealGPUs:a}
     }
@@ -571,7 +574,7 @@ If we in addition assume that the time spend on coarse propagation is negligible
     \label{ch5:fig:pararealGPUs:b}
     }
     \end{center}
-    \caption[Parareal performance properties as a function of $R$ and number GPUs used.]{Parareal performance properties as a function of $R$ and number GPUs used. Notice how the obtained performance depends greatly on the choice of $R$ as a function of the number of GPUs. Test environment 2. }\label{ch5:fig:pararealGPUs}
+    \caption[Parareal performance properties as a function of $R$ and number of GPUs used.]{Parareal performance properties as a function of $R$ and number GPUs used. Notice how the obtained performance depends greatly on the choice of $R$ as a function of the number of GPUs. Executed on test environment 3.}\label{ch5:fig:pararealGPUs}
 \end{figure}
 
 %TODO: Do we make this into a subsubsection:
@@ -596,19 +599,19 @@ If we in addition assume that the time spend on coarse propagation is negligible
 
 \section{Conclusion and outlook}
 
-Massively parallel heterogeneous systems continue to enter the consumer market, and there has been no identification that this trend will stop for the next years to come. However, these parallel architectures require software vendors to adjust to new programming models and optimization strategies. Good software libraries are important tools for reducing time and complexity of adjusting to new architectures and they provide the user with an intuitive programming interface.
+Massively parallel heterogeneous systems continue to enter the consumer market, and there has been no identification that this trend will stop for years to come. However, these parallel architectures require software vendors to adjust to new programming models and optimization strategies. Good software libraries are important tools for reducing the time and complexity of adjusting to new architectures, and they provide the user with an intuitive programming interface.
 
-We presented ideas for a generic GPU-based library for fast assembling of PDE solvers. A high-level interface and some simple implementation concepts were created with the objective of enhancing developer productivity and to ensure performance portability. Two elementary PDE model problems were used to demonstrate assembling of both spatial and temporal approximation techniques. Results confirmed that numerical methods for solution of PDE systems are bandwidth limited on GPU systems. Therefore higher-order methods can be advantageous, because extra computational work can be performed during memory transfer idle times, leading to increased flop performance.
+We have presented ideas for a generic GPU-based library for fast assembling of PDE solvers. A high-level interface and simple implementation concepts were created with the objective of enhancing developer productivity and to ensure performance portability. Two elementary PDE model problems were used to demonstrate assembling of both spatial and temporal approximation techniques. Results confirmed that numerical methods for solution of PDE systems are bandwidth limited on GPU systems. Therefore higher-order methods can be advantageous, when extra computational work can be performed during memory transfer idle times, leading to increased flop performance.
 
-Decomposition strategies for spatial and temporal parallelization on multi-GPU systems has been presented, with reasonable performance speedups. Novel results for the parareal algorithm on multi-GPU systems was presented, and an example of parallel efficiency for the heat conduction problem was demonstrated. Furthermore, a domain decomposition technique that preserves algorithmic efficiency was presented for the Poisson problem, with good parallel scalability.
+Decomposition strategies for spatial and temporal parallelization on multi-GPU systems have been presented, with reasonable performance speedups. Novel results for the parareal algorithm on multi-GPU systems have been presented, and an example of parallel efficiency for the heat conduction problem has been demonstrated. Furthermore, a domain decomposition technique that preserves algorithmic efficiency has been presented for the Poisson problem.
 
-The library has already successfully been used for development of a fast tool intended for scientific applications within maritime engineering, see Chapter \ref{ch7} for details. We intend to further extend the library, as we explore new techniques, suitable for parallelization on heterogeneous systems, that fit the scope of our applications.
+The library has already successfully been used for development of a fast tool intended for scientific applications within maritime engineering; see Chapter \ref{ch7} for details. We intend to further extend the library, as we explore new techniques, suitable for parallelization on heterogeneous systems, that fit the scope of our applications.
 
-For more information about the library and our work within the GPUlab group at the Technical University of Denmark, we encourage you to visit \texttt{http://gpulab.imm.dtu.dk}.
+For more information about the library and our work within the GPUlab group at the Technical University of Denmark, we encourage you to visit \texttt{http://gpulab.imm.dtu.dk}
 
-\section*{Acknowledgements}
-This work was supported by grant no. 09-070032 %on "Desktop Scientific Computing on Consumer Graphics Cards"
-from the Danish Research Council for Technology and Production Sciences. A special thank goes to Professor Jan S. Hesthaven for supporting parts of this work. Scalability and performance tests was done in GPUlab at DTU Informatics, Technical University of Denmark and using the GPU-cluster at Center for Computing and Visualization, Brown University, USA. Nvidia Corporation is acknowledged for generous hardware donations to facilities of GPUlab.
+\section*{Acknowledgments}
+This work have been supported by grant no. 09-070032 %on "Desktop Scientific Computing on Consumer Graphics Cards"
+from the Danish Research Council for Technology and Production Sciences. A special thanks goes to Professor Jan S. Hesthaven for supporting parts of this work. Scalability and performance tests were done in the GPUlab at DTU Informatics, Technical University of Denmark and using the GPU-cluster at the Center for Computing and Visualization, Brown University, USA. The NVIDIA Corporation is acknowledged for generous hardware donations to facilities of the GPUlab.
 
 %-------------------------------------------------------------------------
 %\backmatter % bibliography, index and similar things, which are not numbered.
diff --git a/BookGPU/Chapters/chapter5/code/ex1.cu b/BookGPU/Chapters/chapter5/code/ex1.cu
index 13d4322..847d20a 100644
--- a/BookGPU/Chapters/chapter5/code/ex1.cu
+++ b/BookGPU/Chapters/chapter5/code/ex1.cu
@@ -1,29 +1,29 @@
-#include <gpulab/vector.h>
-
-__global__ void add(double* a, double const* b, int N)
-{
-	int i = blockDim.x*blockIdx.x + threadIdx.x;
-	if(i<N)
-		a[i] += b[i];
-}
-
-int main(int argc, char *argv[])
-{
-	int N = 1000;
-
-	// Plain CUDA example
-	double *a1, *b1;
-	cudaMalloc((void**)&a1, N*sizeof(double));
-	cudaMalloc((void**)&b1, N*sizeof(double));
-	cudaMemset(a1, 2.0, N);
-	cudaMemset(b1, 3.0, N);
-	int blocksize = 128;
-	add<<<(N+blocksize-1)/blocksize,blocksize>>>(a1, b1, N);
-
-	// gpulab example
-	gpulab::vector<double,gpulab::device_memory> a2(N, 2.0);
-	gpulab::vector<double,gpulab::device_memory> b2(N, 3.0);
-	a2.axpy(1.0, b2); // BLAS1: a2 = 1*b2 + a2
-	
-	return 0;
+#include <gpulab/vector.h>
+
+__global__ void add(double* a, double const* b, int N)
+{
+	int i = blockDim.x*blockIdx.x + threadIdx.x;
+	if(i<N)
+		a[i] += b[i];
+}
+
+int main(int argc, char *argv[])
+{
+	int N = 1000;
+
+	// Basic CUDA example
+	double *a1, *b1;
+	cudaMalloc((void**)&a1, N*sizeof(double));
+	cudaMalloc((void**)&b1, N*sizeof(double));
+	cudaMemset(a1, 2.0, N);
+	cudaMemset(b1, 3.0, N);
+	int blocksize = 128;
+	add<<<(N+blocksize-1)/blocksize,blocksize>>>(a1, b1, N);
+
+	// gpulab example
+	gpulab::vector<double,gpulab::device_memory> a2(N, 2.0);
+	gpulab::vector<double,gpulab::device_memory> b2(N, 3.0);
+	a2.axpy(1.0, b2); // BLAS1: a2 = 1*b2 + a2
+	
+	return 0;
 }
\ No newline at end of file
diff --git a/BookGPU/Chapters/chapter5/figures/AlphaPerformanceGTX590_N16777216.tikz b/BookGPU/Chapters/chapter5/figures/AlphaPerformanceGTX590_N16777216.tikz
index 008c5c3..0618990 100644
--- a/BookGPU/Chapters/chapter5/figures/AlphaPerformanceGTX590_N16777216.tikz
+++ b/BookGPU/Chapters/chapter5/figures/AlphaPerformanceGTX590_N16777216.tikz
@@ -18,10 +18,11 @@ scale only axis,
 xmin=0, xmax=5,
 xtick={1,2,3,4},
 xlabel={$\alpha$},
-xmajorgrids,
+%xmajorgrids,
 ymin=0, ymax=60,
 ylabel={GFlops},
 ymajorgrids,
+ytick={10,20,30,40,50,60},
 legend style={at={(0.03,0.97)},anchor=north west,nodes=right},
 area legend]
 \addplot[ybar,bar width=0.0457142857142857\figurewidth,bar shift=-0.0285714285714286\figurewidth,fill=black,draw=black] plot coordinates{ (1,28.1201594459724) (2,39.4770078207878) (3,46.729267222466) (4,51.1805955922042) };
diff --git a/BookGPU/Chapters/chapter5/figures/GPU2GPU.tikz b/BookGPU/Chapters/chapter5/figures/GPU2GPU.tikz
index 96ba97d..31a6fed 100644
--- a/BookGPU/Chapters/chapter5/figures/GPU2GPU.tikz
+++ b/BookGPU/Chapters/chapter5/figures/GPU2GPU.tikz
@@ -1,16 +1,21 @@
-\tikzstyle{iblock}=[rectangle, draw=black, rounded corners, top color=white, bottom color=black!50, drop shadow, text centered, anchor=north, text width=3cm]
-
-\begin{tikzpicture}[->,>=stealth',shorten >=1pt,thick]
-        
-\node (GPU1mem) [iblock] { Device memory };
-\node (GPU2mem) [iblock,right=2.0cm of GPU1mem] { Device memory };
-\node (CPU1mem) [iblock,below=2.0cm of GPU1mem] { Host memory };
-\node (CPU2mem) [iblock,right=2.0cm of CPU1mem] { Host memory };
-
-\draw[->,loop left]  (GPU1mem.west) to node {kernel} (GPU1mem.west);
-\draw[->,loop right] (GPU2mem.east) to node {kernel} (GPU2mem.east);  
-\draw[->] (GPU1mem.south) to node[auto] {PCIe} (CPU1mem.north);
-\draw[->] (CPU1mem.east)  to node[auto] {MPI}  (CPU2mem.west);
-\draw[->] (CPU2mem.north) to node[auto] {PCIe} (GPU2mem.south);
-
-\end{tikzpicture}
+\tikzstyle{iblock}=[rectangle, draw=black, rounded corners, top color=white, bottom color=black!50, drop shadow, text centered, anchor=north, text width=3cm]
+\newcommand*{\gridscale}{0.85\linewidth}
+\resizebox{\gridscale}{!}{
+
+\begin{tikzpicture}[->,>=stealth',shorten >=1pt,thick]
+        
+\node (GPU1mem) [iblock] { Device memory };
+\node (GPU2mem) [iblock,right=2.0cm of GPU1mem] { Device memory };
+\node (CPU1mem) [iblock,below=2.0cm of GPU1mem] { Host memory };
+\node (CPU2mem) [iblock,right=2.0cm of CPU1mem] { Host memory };
+
+\draw[->,loop left]  (GPU1mem.west) to node {kernel} (GPU1mem.west);
+\draw[->,loop right] (GPU2mem.east) to node {kernel} (GPU2mem.east);  
+\draw[->] (GPU1mem.south) to node[left] {PCIe} (CPU1mem.north);
+\draw[->] (CPU1mem.east)  to node[auto] {Network}  (CPU2mem.west);
+\draw[->] (CPU2mem.north) to node[right] {PCIe} (GPU2mem.south);
+
+\draw[->] (GPU1mem.south) edge[out=-90,in=-90,->] node[auto] {GPUDirect} (GPU2mem.south);
+
+\end{tikzpicture}
+}
diff --git a/BookGPU/Chapters/chapter5/figures/HeatSolution0.049307.tikz b/BookGPU/Chapters/chapter5/figures/HeatSolution0.049307.tikz
index 502409f..46c9045 100644
--- a/BookGPU/Chapters/chapter5/figures/HeatSolution0.049307.tikz
+++ b/BookGPU/Chapters/chapter5/figures/HeatSolution0.049307.tikz
@@ -17,13 +17,11 @@ height=\figureheight,
 scale only axis,
 xmin=0, xmax=1,
 xlabel={x},
-xmajorgrids,
+grid=both,
 ymin=0, ymax=1,
 ylabel={y},
-ymajorgrids,
-zmin=0, zmax=0.993180651701361,
-zlabel={u},
-zmajorgrids,
+zmin=0, zmax=1,
+%zlabel={u},
 axis lines*=left]
 
 \addplot3[%
diff --git a/BookGPU/Chapters/chapter5/figures/HeatSolution0.099723.tikz b/BookGPU/Chapters/chapter5/figures/HeatSolution0.099723.tikz
index 676ddad..ac2f9d0 100644
--- a/BookGPU/Chapters/chapter5/figures/HeatSolution0.099723.tikz
+++ b/BookGPU/Chapters/chapter5/figures/HeatSolution0.099723.tikz
@@ -17,13 +17,12 @@ height=\figureheight,
 scale only axis,
 xmin=0, xmax=1,
 xlabel={x},
-xmajorgrids,
+grid=both,
 ymin=0, ymax=1,
 ylabel={y},
 ymajorgrids,
-zmin=0, zmax=0.993180651701361,
-zlabel={u},
-zmajorgrids,
+zmin=0, zmax=1,
+%zlabel={u},
 axis lines*=left]
 
 \addplot3[%
diff --git a/BookGPU/Chapters/chapter5/figures/HeatSolution0.tikz b/BookGPU/Chapters/chapter5/figures/HeatSolution0.tikz
index 7b92444..e1c5b4b 100644
--- a/BookGPU/Chapters/chapter5/figures/HeatSolution0.tikz
+++ b/BookGPU/Chapters/chapter5/figures/HeatSolution0.tikz
@@ -17,13 +17,11 @@ height=\figureheight,
 scale only axis,
 xmin=0, xmax=1,
 xlabel={x},
-xmajorgrids,
+grid=both,
 ymin=0, ymax=1,
 ylabel={y},
-ymajorgrids,
-zmin=0, zmax=0.993180651701361,
+zmin=0, zmax=1.0,
 zlabel={u},
-zmajorgrids,
 axis lines*=left]
 
 \addplot3[%
diff --git a/BookGPU/Chapters/chapter5/figures/MultiGPUAlpha3TimingsTeslaM2050.pdf b/BookGPU/Chapters/chapter5/figures/MultiGPUAlpha3TimingsTeslaM2050.pdf
index ca4c5d2..653159a 100644
Binary files a/BookGPU/Chapters/chapter5/figures/MultiGPUAlpha3TimingsTeslaM2050.pdf and b/BookGPU/Chapters/chapter5/figures/MultiGPUAlpha3TimingsTeslaM2050.pdf differ
diff --git a/BookGPU/Chapters/chapter5/figures/MultiGPUAlpha3TimingsTeslaM2050.tikz b/BookGPU/Chapters/chapter5/figures/MultiGPUAlpha3TimingsTeslaM2050.tikz
index a78b9d5..666f733 100644
--- a/BookGPU/Chapters/chapter5/figures/MultiGPUAlpha3TimingsTeslaM2050.tikz
+++ b/BookGPU/Chapters/chapter5/figures/MultiGPUAlpha3TimingsTeslaM2050.tikz
@@ -1,5 +1,5 @@
-% This file was created by matlab2tikz v0.2.2.
-% Copyright (c) 2008--2012, Nico SchlÃ¶mer <nico.schloemer@gmail.com>
+% This file was created by matlab2tikz v0.4.0.
+% Copyright (c) 2008--2013, Nico SchlÃ¶mer <nico.schloemer@gmail.com>
 % All rights reserved.
 % 
 % The latest updates can be retrieved from
@@ -10,34 +10,42 @@
 % 
 \begin{tikzpicture}
 
-\begin{loglogaxis}[%
-view={0}{90},
+\begin{axis}[%
 width=\figurewidth,
 height=\figureheight,
 scale only axis,
-xmin=10000, xmax=100000000,
+xmode=log,
+xmin=10000,
+xmax=100000000,
 xminorticks=true,
 xlabel={N},
 xmajorgrids,
 xminorgrids,
-ymin=0.0001, ymax=0.1,
+ymode=log,
+ymin=1e-005,
+ymax=0.1,
 yminorticks=true,
 ylabel={Time [s]},
 ymajorgrids,
 yminorgrids,
-%legend style={at={(0.03,0.97)},anchor=north west,nodes=right}
-legend pos=outer north east]
+legend style={at={(0.03,0.97)},anchor=north west,draw=black,fill=white,legend cell align=left}
+]
 \addplot [
 color=darkgray!40!black,
 dotted,
 line width=1.0pt,
-mark=*,
+mark=x,
 mark options={solid,fill=lightgray!80!black}
 ]
-coordinates{
- (65536,0.000104689598083496)(262144,0.000240898132324219)(1048576,0.000788998603820801)(4194304,0.00280699729919434)(16777216,0.0109266042709351)(67108864,0.0435449838638306) 
+table[row sep=crcr]{
+65536 6.61134719848633e-005\\
+262144 0.000195407867431641\\
+1048576 0.000729703903198242\\
+4194304 0.00263791084289551\\
+16777216 0.0104172945022583\\
+67108864 0.0417473077774048\\
 };
-\addlegendentry{1 GPU};
+\addlegendentry{\tiny{1 GPU}};
 
 \addplot [
 color=darkgray!40!black,
@@ -46,10 +54,15 @@ line width=1.0pt,
 mark=*,
 mark options={solid,fill=lightgray!80!black}
 ]
-coordinates{
- (65536,0.00100100040435791)(262144,0.00106160640716553)(1048576,0.0015192985534668)(4194304,0.00300109386444092)(16777216,0.0078394889831543)(67108864,0.0255790948867798) 
+table[row sep=crcr]{
+65536 0.000160098075866699\\
+262144 0.000235605239868164\\
+1048576 0.000513696670532227\\
+4194304 0.00168628692626953\\
+16777216 0.00567271709442139\\
+67108864 0.0216208934783936\\
 };
-\addlegendentry{2 GPUs};
+\addlegendentry{\tiny{2 GPUs}};
 
 \addplot [
 color=darkgray!40!black,
@@ -58,10 +71,15 @@ line width=1.0pt,
 mark=triangle*,
 mark options={solid,fill=lightgray!80!black}
 ]
-coordinates{
- (65536,0.00090029239654541)(262144,0.001084303855896)(1048576,0.00136759281158447)(4194304,0.00224909782409668)(16777216,0.00520608425140381)(67108864,0.0148308992385864) 
+table[row sep=crcr]{
+65536 0.000244712829589844\\
+262144 0.000288915634155273\\
+1048576 0.000451016426086426\\
+4194304 0.00104920864105225\\
+16777216 0.00314109325408936\\
+67108864 0.0112285852432251\\
 };
-\addlegendentry{4 GPUs};
+\addlegendentry{\tiny{4 GPUs}};
 
 \addplot [
 color=darkgray!40!black,
@@ -70,10 +88,15 @@ line width=1.0pt,
 mark=square*,
 mark options={solid,fill=lightgray!80!black}
 ]
-coordinates{
- (65536,0.00088038444519043)(262144,0.00106220245361328)(1048576,0.00128121376037598)(4194304,0.00191099643707275)(16777216,0.00365688800811768)(67108864,0.00957400798797607) 
+table[row sep=crcr]{
+65536 0.000238609313964844\\
+262144 0.000275707244873047\\
+1048576 0.000345993041992188\\
+4194304 0.00065159797668457\\
+16777216 0.00187878608703613\\
+67108864 0.00595250129699707\\
 };
-\addlegendentry{8 GPUs};
+\addlegendentry{\tiny{8 GPUs}};
 
 \addplot [
 color=darkgray!40!black,
@@ -82,10 +105,15 @@ line width=1.0pt,
 mark=diamond*,
 mark options={solid,fill=lightgray!80!black}
 ]
-coordinates{
- (65536,0.000890207290649414)(262144,0.0010530948638916)(1048576,0.0012747049331665)(4194304,0.00172920227050781)(16777216,0.00289230346679688)(67108864,0.00648910999298096) 
+table[row sep=crcr]{
+65536 0.000242710113525391\\
+262144 0.000268697738647461\\
+1048576 0.000305795669555664\\
+4194304 0.000477385520935059\\
+16777216 0.0010857105255127\\
+67108864 0.0031994104385376\\
 };
-\addlegendentry{16 GPUs};
+\addlegendentry{\tiny{16 GPUs}};
 
 \addplot [
 color=darkgray!40!black,
@@ -94,10 +122,15 @@ line width=1.0pt,
 mark=triangle*,
 mark options={solid,,rotate=180,fill=lightgray!80!black}
 ]
-coordinates{
- (65536,0.000879597663879395)(262144,0.00104289054870605)(1048576,0.00122830867767334)(4194304,0.00163850784301758)(16777216,0.00254049301147461)(67108864,0.00491151809692383) 
+table[row sep=crcr]{
+65536 0.00025029182434082\\
+262144 0.000261116027832031\\
+1048576 0.000296306610107422\\
+4194304 0.000368380546569824\\
+16777216 0.000673604011535645\\
+67108864 0.00190730094909668\\
 };
-\addlegendentry{32 GPUs};
+\addlegendentry{\tiny{32 GPUs}};
 
-\end{loglogaxis}
-\end{tikzpicture}%
+\end{axis}
+\end{tikzpicture}%
\ No newline at end of file
diff --git a/BookGPU/Chapters/chapter5/figures/MultiGPUAlphaPerformanceTeslaM2050_N16777216.pdf b/BookGPU/Chapters/chapter5/figures/MultiGPUAlphaPerformanceTeslaM2050_N16777216.pdf
index 6cfd4cb..2c72db8 100644
Binary files a/BookGPU/Chapters/chapter5/figures/MultiGPUAlphaPerformanceTeslaM2050_N16777216.pdf and b/BookGPU/Chapters/chapter5/figures/MultiGPUAlphaPerformanceTeslaM2050_N16777216.pdf differ
diff --git a/BookGPU/Chapters/chapter5/figures/MultiGPUAlphaPerformanceTeslaM2050_N16777216.tikz b/BookGPU/Chapters/chapter5/figures/MultiGPUAlphaPerformanceTeslaM2050_N16777216.tikz
index e6776db..89ecd46 100644
--- a/BookGPU/Chapters/chapter5/figures/MultiGPUAlphaPerformanceTeslaM2050_N16777216.tikz
+++ b/BookGPU/Chapters/chapter5/figures/MultiGPUAlphaPerformanceTeslaM2050_N16777216.tikz
@@ -1,5 +1,5 @@
-% This file was created by matlab2tikz v0.2.2.
-% Copyright (c) 2008--2012, Nico SchlÃ¶mer <nico.schloemer@gmail.com>
+% This file was created by matlab2tikz v0.4.0.
+% Copyright (c) 2008--2013, Nico SchlÃ¶mer <nico.schloemer@gmail.com>
 % All rights reserved.
 % 
 % The latest updates can be retrieved from
@@ -10,31 +10,35 @@
 % 
 
 % defining custom colors
-\definecolor{mycolor1}{rgb}{0.19047619047619,0.19047619047619,0.19047619047619}
-\definecolor{mycolor2}{rgb}{0.396825396825397,0.396825396825397,0.396825396825397}
-\definecolor{mycolor3}{rgb}{0.603174603174603,0.603174603174603,0.603174603174603}
-\definecolor{mycolor4}{rgb}{0.80952380952381,0.80952380952381,0.80952380952381}
+\definecolor{mycolor1}{rgb}{0.19047619047619,0.19047619047619,0.19047619047619}%
+\definecolor{mycolor2}{rgb}{0.396825396825397,0.396825396825397,0.396825396825397}%
+\definecolor{mycolor3}{rgb}{0.603174603174603,0.603174603174603,0.603174603174603}%
+\definecolor{mycolor4}{rgb}{0.80952380952381,0.80952380952381,0.80952380952381}%
 
 \begin{tikzpicture}
 
 \begin{axis}[%
-view={0}{90},
 width=\figurewidth,
 height=\figureheight,
+area legend,
 scale only axis,
-xmin=0, xmax=5,
-xtick={1,2,3,4},
+xmin=0,
+xmax=5,
+xtick={1, 2, 3, 4},
 xlabel={$\alpha$},
 xmajorgrids,
-ymin=0, ymax=200,
+ymin=0,
+ymax=700,
 ylabel={GFlops},
 ymajorgrids,
-%legend style={at={(0.03,0.97)},anchor=north west,nodes=right},
-legend pos=outer north east,
-area legend]
-\addplot[ybar,bar width=0.0213333333333333\figurewidth,bar shift=-0.0666666666666667\figurewidth,fill=black,draw=black] plot coordinates{ (1,22.7580842315813) (2,31.7449314051055) (3,37.1798950457675) (4,40.253492018759) };
+legend style={at={(0.03,0.97)},anchor=north west,draw=black,fill=white,legend cell align=left}
+]
+\addplot[ybar,bar width=0.0213333333333333\figurewidth,bar shift=-0.0666666666666667\figurewidth,fill=black,draw=black] plot coordinates{(1,25.7269262023185)
+(2,34.5244303964177)
+(3,38.9976495252132)
+(4,42.0688704662039)};
 
-\addlegendentry{1 GPU};
+\addlegendentry{\tiny{1 GPU}};
 
 \addplot [
 color=black,
@@ -42,28 +46,44 @@ solid,
 line width=1.0pt,
 forget plot
 ]
-coordinates{
- (0,0)(5,0) 
+table[row sep=crcr]{
+0 0\\
+5 0\\
 };
-\addplot[ybar,bar width=0.0213333333333333\figurewidth,bar shift=-0.04\figurewidth,fill=mycolor1,draw=black] plot coordinates{ (1,32.3920897978954) (2,44.0505761891603) (3,51.8209797695948) (4,56.6033287946837) };
+\addplot[ybar,bar width=0.0213333333333333\figurewidth,bar shift=-0.04\figurewidth,fill=mycolor1,draw=black] plot coordinates{(1,45.7660023184681)
+(2,61.0707130321338)
+(3,71.6147118282191)
+(4,76.4971402283698)};
 
-\addlegendentry{2 GPUs};
+\addlegendentry{\tiny{2 GPUs}};
 
-\addplot[ybar,bar width=0.0213333333333333\figurewidth,bar shift=-0.0133333333333333\figurewidth,fill=mycolor2,draw=black] plot coordinates{ (1,49.0164694619377) (2,66.4549239202078) (3,78.0336968020553) (4,85.0178374521632) };
+\addplot[ybar,bar width=0.0213333333333333\figurewidth,bar shift=-0.0133333333333333\figurewidth,fill=mycolor2,draw=black] plot coordinates{(1,81.6048014543824)
+(2,110.999576570219)
+(3,129.333950678194)
+(4,141.772486940809)};
 
-\addlegendentry{4 GPUs};
+\addlegendentry{\tiny{4 GPUs}};
 
-\addplot[ybar,bar width=0.0213333333333333\figurewidth,bar shift=0.0133333333333333\figurewidth,fill=mycolor3,draw=black] plot coordinates{ (1,70.193327263964) (2,95.9835965533234) (3,111.091725833056) (4,121.135339367746) };
+\addplot[ybar,bar width=0.0213333333333333\figurewidth,bar shift=0.0133333333333333\figurewidth,fill=mycolor3,draw=black] plot coordinates{(1,133.626947231058)
+(2,183.23775202709)
+(3,216.230044922718)
+(4,243.078097897826)};
 
-\addlegendentry{8 GPUs};
+\addlegendentry{\tiny{8 GPUs}};
 
-\addplot[ybar,bar width=0.0213333333333333\figurewidth,bar shift=0.04\figurewidth,fill=mycolor4,draw=black] plot coordinates{ (1,88.712013536379) (2,120.532134464085) (3,140.458981798998) (4,152.964872416231) };
+\addplot[ybar,bar width=0.0213333333333333\figurewidth,bar shift=0.04\figurewidth,fill=mycolor4,draw=black] plot coordinates{(1,214.808744960503)
+(2,305.941179521759)
+(3,374.17892748913)
+(4,412.649357383607)};
 
-\addlegendentry{16 GPUs};
+\addlegendentry{\tiny{16 GPUs}};
 
-\addplot[ybar,bar width=0.0213333333333333\figurewidth,bar shift=0.0666666666666667\figurewidth,fill=white,draw=black] plot coordinates{ (1,96.2830194223254) (2,137.618031008295) (3,159.909906528023) (4,174.668140378932) };
+\addplot[ybar,bar width=0.0213333333333333\figurewidth,bar shift=0.0666666666666667\figurewidth,fill=white,draw=black] plot coordinates{(1,320.65759859086)
+(2,484.833340183305)
+(3,603.099139914345)
+(4,649.931163224828)};
 
-\addlegendentry{32 GPUs};
+\addlegendentry{\tiny{32 GPUs}};
 
 \end{axis}
-\end{tikzpicture}%
+\end{tikzpicture}%
\ No newline at end of file
diff --git a/BookGPU/Chapters/chapter5/figures/component_design.tikz b/BookGPU/Chapters/chapter5/figures/component_design.tikz
index 984ae7b..beb7103 100644
--- a/BookGPU/Chapters/chapter5/figures/component_design.tikz
+++ b/BookGPU/Chapters/chapter5/figures/component_design.tikz
@@ -1,97 +1,98 @@
-\begin{tikzpicture}
-
-\tikzstyle{abstract}=[rectangle, draw=black, rounded corners, fill=white, drop shadow, text centered, anchor=north, text width=6cm]
-        
-%nodes
-\node (Vector) [abstract, rectangle split, rectangle split parts=3]
-{
-\textbf{Vector}
-\nodepart{second}
-\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily]
-typedef value_type;
-typedef size_type;
-\end{lstlisting}
-\nodepart{third}
-\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily]
-           Vector(size_type);
-           Vector(Vector);
-void       axpy(value_type,Vector);
-void       axpby(value_type,Vector);
-void       copy(Vector);
-value_type dot(Vector);
-Vector*    duplicate();
-void       fill(value_type);
-value_type nrmi();
-value_type nrm2();
-void       scal(vale_type);
-size_type  size();
-\end{lstlisting}
-};
-
-\node (Matrix) [abstract, rectangle split, rectangle split parts=3, below right=0cm and 0.4cm of Vector.north east]
-{
-\textbf{Matrix}
-\nodepart{second}
-\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily]
-typedef vector_type;
-\end{lstlisting}
-\nodepart{third}
-\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily]
-void mult(vector_type);
-\end{lstlisting}
-};
-
-\node (Solver) [abstract, rectangle split, rectangle split parts=3, below=of Vector]
-{
-\textbf{EqSolver}
-\nodepart{second}
-\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily]
-typedef vector_type;
-typedef matrix_type;
-typedef monitor_type;
-typedef preconditioner_type;
-\end{lstlisting}
-\nodepart{third}
-\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily]
-     EqSolver(matrix_type
-             ,monitor_type);
-void solve(vector_type,vector_type);
-void set_preconditioner(preconditioner_type);
-\end{lstlisting}
-};
-
-\node (Preconditioner) [abstract, rectangle split, rectangle split parts=3, below=of Matrix]
-{
-\textbf{Preconditioner}
-\nodepart{second}
-\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily]
-typedef vector_type;
-typedef matrix_type;
-typedef monitor_type;
-\end{lstlisting}
-\nodepart{third}
-\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily]
-     Preconditioner(matrix_type
-                   ,monitor_type);
-void operator()(vector_type,vector_type)
-\end{lstlisting}
-};
-
-\node (TimeIntegrator) [abstract, rectangle split, rectangle split parts=2, below=of Preconditioner]
-{
-\textbf{TimeIntegrator}
-\nodepart{second}
-%\nodepart{third}
-\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily]
-template <typename rhs_type
-        , typename vector_type
-        , typename value_type>
-void operator()(rhs_type
-               ,vector_type
-               ,value_type
-               ,value_type
-               ,value_type);
-\end{lstlisting}
-};
-
-\end{tikzpicture}
+\begin{tikzpicture}
+
+\tikzstyle{abstract}=[rectangle, draw=black, rounded corners, fill=white, drop shadow, text centered, anchor=north, text width=5.5cm]
+        
+%nodes
+\node (Vector) [abstract, rectangle split, rectangle split parts=3]
+{
+\textbf{Vector}
+\nodepart{second}
+\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily,backgroundcolor=\color{white},numbers=none]
+typedef value_type;
+typedef size_type;
+\end{lstlisting}
+\nodepart{third}
+\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily,backgroundcolor=\color{white},numbers=none]
+           Vector(size_type);
+           Vector(Vector);
+void       axpy(value_type,Vector);
+void       axpby(value_type,Vector);
+void       copy(Vector);
+value_type dot(Vector);
+Vector*    duplicate();
+void       fill(value_type);
+value_type nrmi();
+value_type nrm2();
+void       scal(vale_type);
+size_type  size();
+\end{lstlisting}
+};
+
+\node (Matrix) [abstract, rectangle split, rectangle split parts=3, below right=0cm and 0.4cm of Vector.north east]
+{
+\textbf{Matrix}
+\nodepart{second}
+\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily,backgroundcolor=\color{white},numbers=none]
+typedef vector_type;
+\end{lstlisting}
+\nodepart{third}
+\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily,backgroundcolor=\color{white},numbers=none]
+void mult(vector_type,vector_type);
+\end{lstlisting}
+};
+
+\node (Solver) [abstract, rectangle split, rectangle split parts=3, below=0.4cm of Vector]
+{
+\textbf{EqSolver}
+\nodepart{second}
+\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily,backgroundcolor=\color{white},numbers=none]
+typedef vector_type;
+typedef matrix_type;
+typedef monitor_type;
+typedef preconditioner_type;
+\end{lstlisting}
+\nodepart{third}
+\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily,backgroundcolor=\color{white},numbers=none]
+     EqSolver(matrix_type
+             ,monitor_type);
+void solve(vector_type,vector_type);
+void set_preconditioner(preconditioner_type);
+\end{lstlisting}
+};
+
+\node (Preconditioner) [abstract, rectangle split, rectangle split parts=3, below=0.4cm of Matrix]
+{
+\textbf{Preconditioner}
+\nodepart{second}
+\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily,backgroundcolor=\color{white},numbers=none]
+typedef vector_type;
+typedef matrix_type;
+typedef monitor_type;
+\end{lstlisting}
+\nodepart{third}
+\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily,backgroundcolor=\color{white},numbers=none]
+     Preconditioner(matrix_type
+                   ,monitor_type);
+void operator()(vector_type
+               ,vector_type)
+\end{lstlisting}
+};
+
+\node (TimeIntegrator) [abstract, rectangle split, rectangle split parts=2, below=0.4cm of Preconditioner]
+{
+\textbf{TimeIntegrator}
+\nodepart{second}
+%\nodepart{third}
+\begin{lstlisting}[frame=none,basicstyle=\scriptsize\ttfamily,backgroundcolor=\color{white},numbers=none]
+template <typename rhs_type
+        , typename vector_type
+        , typename value_type>
+void operator()(rhs_type
+               ,vector_type
+               ,value_type
+               ,value_type
+               ,value_type);
+\end{lstlisting}
+};
+
+\end{tikzpicture}
diff --git a/BookGPU/Chapters/chapter5/figures/dd2d.tikz b/BookGPU/Chapters/chapter5/figures/dd2d.tikz
index 2b84097..b36cbf3 100644
--- a/BookGPU/Chapters/chapter5/figures/dd2d.tikz
+++ b/BookGPU/Chapters/chapter5/figures/dd2d.tikz
@@ -1,62 +1,62 @@
-\tikzstyle{ghostblock} = [draw,rectangle,minimum height=5.6cm, minimum width=1.6cm,color=gray!80,rounded corners=2pt]
-\newcommand*{\boxcolor}{gray!80}
-\newcommand*{\circolor}{black}
-\newcommand*{\gridscale}{1.0\linewidth}
-\resizebox{\gridscale}{!}{
-\begin{tikzpicture}[thick]
-\draw (0,0) grid (8,5);
-\foreach \c in {(0,0), (1,0), (2,0), (3,0), (4,0), (5,0), (6,0)}
-    \foreach \k in {(0,0), (0,1), (0,2), (0,3), (0,4), (0,5)}
-        \fill [fill=\circolor] \k + \c circle (0.12);
-\foreach \k in {0,1,2,3,4,5}
-{
-    \fill [fill=\boxcolor] (6.9,-0.1+\k) rectangle (7.1,0.1+\k);
-    \fill [fill=\boxcolor] (7.9,-0.1+\k) rectangle (8.1,0.1+\k);    
-}   
-\node (rect1l) [ghostblock] at  (7.5,2.5) {};
-\node (rect2l) [ghostblock] at (12.5,2.5) {};
-\draw (rect1l.north) edge[out=45,in=135,triangle 45-] (rect2l.north);
-
-
-
-\draw (10,0) grid (20,5);
-\foreach \c in {(12,0), (13,0), (14,0), (15,0), (16,0), (17,0), (18,0)}
-    \foreach \k in {(0,0), (0,1), (0,2), (0,3), (0,4), (0,5)}
-        \fill [fill=\circolor] \k + \c circle (0.12);
-\foreach \k in {0,1,2,3,4,5}
-{
-    \fill [fill=\boxcolor] (9.9,-0.1+\k) rectangle (10.1,0.1+\k);
-    \fill [fill=\boxcolor] (10.9,-0.1+\k) rectangle (11.1,0.1+\k);    
-}   
-\node (rect1r) [ghostblock] at  (5.5,2.5) {};
-\node (rect2r) [ghostblock] at (10.5,2.5) {};
-\draw (rect1r.south) edge[out=-45,in=-135,-triangle 45] (rect2r.south);
-\foreach \k in {0,1,2,3,4,5}
-{
-    \fill [fill=\boxcolor] (18.9,-0.1+\k) rectangle (19.1,0.1+\k);
-    \fill [fill=\boxcolor] (19.9,-0.1+\k) rectangle (20.1,0.1+\k);
-}   
-\node (rect1r2) [ghostblock] at (19.5,2.5) {};
-\node (rect2r2) [ghostblock] at (24.5,2.5) {};
-\draw (rect1r2.north) edge[out=45,in=135,triangle 45-] (rect2r2.north);
-
-
-
-\draw (22,0) grid (30,5);
-\foreach \c in {(24,0), (25,0), (26,0), (27,0), (28,0), (29,0), (30,0)}
-    \foreach \k in {(0,0), (0,1), (0,2), (0,3), (0,4), (0,5)}
-        \fill [fill=\circolor] \k + \c circle (0.12);
-\foreach \k in {0,1,2,3,4,5}
-{
-    \fill [fill=\boxcolor] (21.9,-0.1+\k) rectangle (22.1,0.1+\k);
-    \fill [fill=\boxcolor] (22.9,-0.1+\k) rectangle (23.1,0.1+\k);    
-}   
-\node (rect1r3) [ghostblock] at  (17.5,2.5) {};
-\node (rect2r3) [ghostblock] at (22.5,2.5) {};
-\draw (rect1r3.south) edge[out=-45,in=-135,-triangle 45] (rect2r3.south);
-
-
-
-\end{tikzpicture}
-}
-
+\tikzstyle{ghostblock} = [draw,rectangle,minimum height=5.6cm, minimum width=1.6cm,color=gray!80,rounded corners=2pt]
+\newcommand*{\boxcolor}{gray!80}
+\newcommand*{\circolor}{black}
+\newcommand*{\gridscale}{1.0\linewidth}
+\resizebox{\gridscale}{!}{
+\begin{tikzpicture}[thick]
+\draw (0,0) grid (8,5);
+\foreach \c in {(0,0), (1,0), (2,0), (3,0), (4,0), (5,0), (6,0)}
+    \foreach \k in {(0,0), (0,1), (0,2), (0,3), (0,4), (0,5)}
+        \fill [fill=\circolor] \k + \c circle (0.14);
+\foreach \k in {0,1,2,3,4,5}
+{
+    \fill [fill=\boxcolor] (6.85,-0.15+\k) rectangle (7.15,0.15+\k);
+    \fill [fill=\boxcolor] (7.85,-0.15+\k) rectangle (8.15,0.15+\k);    
+}   
+\node (rect1l) [ghostblock] at  (7.5,2.5) {};
+\node (rect2l) [ghostblock] at (12.5,2.5) {};
+\draw (rect1l.north) edge[out=45,in=135,triangle 45-] (rect2l.north);
+
+
+
+\draw (10,0) grid (20,5);
+\foreach \c in {(12,0), (13,0), (14,0), (15,0), (16,0), (17,0), (18,0)}
+    \foreach \k in {(0,0), (0,1), (0,2), (0,3), (0,4), (0,5)}
+        \fill [fill=\circolor] \k + \c circle (0.14);
+\foreach \k in {0,1,2,3,4,5}
+{
+    \fill [fill=\boxcolor] (9.85,-0.15+\k) rectangle (10.15,0.15+\k);
+    \fill [fill=\boxcolor] (10.85,-0.15+\k) rectangle (11.15,0.15+\k);    
+}   
+\node (rect1r) [ghostblock] at  (5.5,2.5) {};
+\node (rect2r) [ghostblock] at (10.5,2.5) {};
+\draw (rect1r.south) edge[out=-45,in=-135,-triangle 45] (rect2r.south);
+\foreach \k in {0,1,2,3,4,5}
+{
+    \fill [fill=\boxcolor] (18.85,-0.15+\k) rectangle (19.15,0.15+\k);
+    \fill [fill=\boxcolor] (19.85,-0.15+\k) rectangle (20.15,0.15+\k);
+}   
+\node (rect1r2) [ghostblock] at (19.5,2.5) {};
+\node (rect2r2) [ghostblock] at (24.5,2.5) {};
+\draw (rect1r2.north) edge[out=45,in=135,triangle 45-] (rect2r2.north);
+
+
+
+\draw (22,0) grid (30,5);
+\foreach \c in {(24,0), (25,0), (26,0), (27,0), (28,0), (29,0), (30,0)}
+    \foreach \k in {(0,0), (0,1), (0,2), (0,3), (0,4), (0,5)}
+        \fill [fill=\circolor] \k + \c circle (0.14);
+\foreach \k in {0,1,2,3,4,5}
+{
+    \fill [fill=\boxcolor] (21.85,-0.15+\k) rectangle (22.15,0.15+\k);
+    \fill [fill=\boxcolor] (22.85,-0.15+\k) rectangle (23.15,0.15+\k);    
+}   
+\node (rect1r3) [ghostblock] at  (17.5,2.5) {};
+\node (rect2r3) [ghostblock] at (22.5,2.5) {};
+\draw (rect1r3.south) edge[out=-45,in=-135,-triangle 45] (rect2r3.south);
+
+
+
+\end{tikzpicture}
+}
+
diff --git a/BookGPU/Chapters/chapter5/figures/pararealEfficiencyvsRvsGPUs.tikz b/BookGPU/Chapters/chapter5/figures/pararealEfficiencyvsRvsGPUs.tikz
index 160e286..5f17e19 100644
--- a/BookGPU/Chapters/chapter5/figures/pararealEfficiencyvsRvsGPUs.tikz
+++ b/BookGPU/Chapters/chapter5/figures/pararealEfficiencyvsRvsGPUs.tikz
@@ -16,12 +16,13 @@ width=\figurewidth,
 height=\figureheight,
 scale only axis,
 xmin=0, xmax=150,
-xlabel={R},
+xlabel={$R$},
 xmajorgrids,
+xtick={0, 50, 100, 150},
 ymin=0, ymax=20,
 ylabel={GPUs},
 ymajorgrids,
-zmin=0.2, zmax=1,
+zmin=0.0, zmax=1,
 zlabel={Efficiency},
 zmajorgrids,
 axis lines*=left]
diff --git a/BookGPU/Chapters/chapter5/figures/pararealK1ErrvsRvsGPUs.tikz b/BookGPU/Chapters/chapter5/figures/pararealK1ErrvsRvsGPUs.tikz
index 3917588..1c9d491 100644
--- a/BookGPU/Chapters/chapter5/figures/pararealK1ErrvsRvsGPUs.tikz
+++ b/BookGPU/Chapters/chapter5/figures/pararealK1ErrvsRvsGPUs.tikz
@@ -16,7 +16,7 @@ width=\figurewidth,
 height=\figureheight,
 scale only axis,
 xmin=0, xmax=150,
-xlabel={R},
+xlabel={$R$},
 xmajorgrids,
 ymin=0, ymax=20,
 ylabel={GPUs},
diff --git a/BookGPU/Chapters/chapter5/figures/pararealKvsRvsGPUs.tikz b/BookGPU/Chapters/chapter5/figures/pararealKvsRvsGPUs.tikz
index 5590371..224e835 100644
--- a/BookGPU/Chapters/chapter5/figures/pararealKvsRvsGPUs.tikz
+++ b/BookGPU/Chapters/chapter5/figures/pararealKvsRvsGPUs.tikz
@@ -16,7 +16,7 @@ width=\figurewidth,
 height=\figureheight,
 scale only axis,
 xmin=15, xmax=150,
-xlabel={R},
+xlabel={$R$},
 xmajorgrids,
 ymin=2, ymax=20,
 ylabel={GPUs},
diff --git a/BookGPU/Chapters/chapter5/figures/pararealSpeedupvsRvsGPUs.tikz b/BookGPU/Chapters/chapter5/figures/pararealSpeedupvsRvsGPUs.tikz
index 2021f93..521f6ea 100644
--- a/BookGPU/Chapters/chapter5/figures/pararealSpeedupvsRvsGPUs.tikz
+++ b/BookGPU/Chapters/chapter5/figures/pararealSpeedupvsRvsGPUs.tikz
@@ -16,8 +16,9 @@ width=\figurewidth,
 height=\figureheight,
 scale only axis,
 xmin=0, xmax=150,
-xlabel={R},
+xlabel={$R$},
 xmajorgrids,
+xtick={0, 50, 100, 150},
 ymin=0, ymax=20,
 ylabel={GPUs},
 ymajorgrids,
diff --git a/BookGPU/Chapters/chapter7/biblio7.bib b/BookGPU/Chapters/chapter7/biblio7.bib
index e48f2ce..22678b2 100644
--- a/BookGPU/Chapters/chapter7/biblio7.bib
+++ b/BookGPU/Chapters/chapter7/biblio7.bib
@@ -56,8 +56,8 @@ PAGES     = "197--208"
 
 @INPROCEEDINGS{ch7:Glimberg2011,
   AUTHOR =       {S. L. Glimberg and A. P. Engsig-Karup and M. G. Madsen},
-  TITLE =        {A Fast GPU-accelerated Mixed-precision Strategy for Fully Nonlinear Water Wave Computations},
-  BOOKTITLE =    {Numerical Mathematics and Advanced Applications 2011, Proceedings of ENUMATH 2011, the 9th European Conference on Numerical Mathematics and Advanced Applications, Leicester, September 2011},
+  TITLE =        "{A Fast GPU-accelerated Mixed-precision Strategy for Fully Nonlinear Water Wave Computations}",
+  BOOKTITLE =   {Proceedings of ENUMATH 2011, the 9th European Conference on Numerical Mathematics and Advanced Applications, Leicester, September},
   YEAR =         {2011},
   editor =       {A. Cangiani and R. L. Davidchack and E. Georgoulis and A.N. Gorban and J. Levesley and M. V. Tretyakov},
   publisher =    {Springer},
@@ -72,6 +72,13 @@ VOLUME    = "228",
 PAGES     = "2100-2118"
 }
 
+@ARTICLE{ch7:EE13,
+AUTHOR    = "Eskilsson, C. and Engsig-Karup, A. P.",
+TITLE     = "On devising Boussinesq-type models with bounded eigenspectra: One horizontal dimension",
+YEAR      = "2013",
+JOURNAL   = "Submitted to Journal of Computational Physics",
+}
+
 @ARTICLE{ch7:LiFleming1997,
 AUTHOR    = "Li, B. and Fleming, C. A.",
 TITLE     = "A three dimensional multigrid model for fully nonlinear water waves",
@@ -90,27 +97,36 @@ YEAR    = "1999",
 PAGES   = "26--53",
 }
 
-@BOOK{ch7:Lin2008,
-AUTHOR    = "Lin, P.",
-TITLE     = "Numerical Modeling of Water Waves",
-PUBLISHER = "Taylor \& Francis",
-YEAR      = "2008"
+@book{ch7:Lin2008,
+  title={Numerical Modeling of Water Waves},
+  author={Lin, P.},
+  isbn={9780203937754},
+  year={2008},
+  publisher={Taylor \& Francis}
 }
 
 @INPROCEEDINGS{ch7:LindbergEtAl2012b,
 AUTHOR = " Lindberg, O. and Bingham, H. B. and Engsig-Karup, A. P. and Madsen, P.",
 TITLE = "Towards Real Time Simulation of Ship-Ship Interaction",
 BOOKTITLE = "Proceedings of The 27th International Workshop on Water Waves and Floating Bodies (IWWWFB)",
+ADDRESS = "Kongens Lyngby, Denmark",
 YEAR = "2012"
 }
 
-@Article{ch7:DucrozetEtAl2011,
-    author = "Ducrozet, G. and Bingham, H. B. and Engsig-Karup, A. P. and Bonnefoy
-1, F. and Ferrant, P.",
-    title = "A comparative study of two fast nonlinear free-surface
-water wave models",
-    journal = "Int. J. Num. Methods in Fluids (E-published)",
-    year = "2011"    }
+@article {ch7:DucrozetEtAl2011,
+author = {Ducrozet, G. and Bingham, H. B. and Engsig-Karup, A.P. and Bonnefoy, F. and Ferrant, P.},
+title = {A comparative study of two fast nonlinear free-surface water wave models},
+journal = {International Journal for Numerical Methods in Fluids},
+volume = {69},
+number = {11},
+publisher = {John Wiley & Sons, Ltd},
+issn = {1097-0363},
+url = {http://dx.doi.org/10.1002/fld.2672},
+doi = {10.1002/fld.2672},
+pages = {1818--1834},
+keywords = {hydrodynamics, water waves, high-order finite differences, high-order spectral, OceanWave3D, numerical comparisons},
+year = {2012},
+}
 
 @INPROCEEDINGS{ch7:HenshawEtAll1996,
 AUTHOR    = "Henshaw, W. D.",
@@ -180,8 +196,8 @@ PAGES     = "{319--333}"
 
 @incollection {ch7:TsaiYue1996,
     AUTHOR = {Tsai, W. and Yue, D. K. P.},
-     TITLE = {Computation of nonlinear free-surface flows},
- BOOKTITLE = {Annual review of fluid mechanics, Vol.\ 28},
+     TITLE = {Computation of Nonlinear Free-surface Flows},
+ BOOKTITLE = {Annual Review of Fluid Mechanics, Vol.\ 28},
      PAGES = {249--278},
  PUBLISHER = {Annual Reviews},
    ADDRESS = {Palo Alto, CA},
@@ -242,7 +258,7 @@ PAGES     = "285--297"
 @incollection {ch7:Yeung1982,
     AUTHOR = {Yeung, R. W.},
      TITLE = {Numerical methods in free-surface flows},
- BOOKTITLE = {Annual review of fluid mechanics, Vol. 14},
+ BOOKTITLE = {Annual Review of Fluid Mechanics, Vol. 14},
      PAGES = {395--442},
  PUBLISHER = {Annual Reviews},
    ADDRESS = {Palo Alto, Calif.},
@@ -257,7 +273,7 @@ title = "Wave modelling - The state of the art",
 journal = "Progress in Oceanography",
 volume = "75",
 number = "4",
-pages = "603 - 674",
+pages = "603--674",
 year = "2007",
 note = "",
 issn = "0079-6611",
@@ -292,9 +308,9 @@ VOLUME    = "29",
 PAGES     = "425--443"
 }
 
-@BOOK{ch7:AuzingerStetter1982,
+@INCOLLECTION{ch7:AuzingerStetter1982,
 AUTHOR    = "Auzinger, W. and Stetter, H. J.",
-TITLE     = "Defect corrections and multigrid iterations",
+TITLE     = "{Defect Corrections and Multigrid Iterations}",
 BOOKTITLE = "Multigrid Methods",
 VOLUME = "960",
 PAGES     = "327--351",
@@ -302,9 +318,9 @@ PUBLISHER = "Springer-Verlag, New York.",
 YEAR      = "1982"
 }
 
-@BOOK{ch7:Hackbusch1982,
+@INCOLLECTION{ch7:Hackbusch1982,
 AUTHOR    = "Hackbusch, W.",
-TITLE     = "On multigrid iterations with defect correction. In: Hackbusch, W.; Trottenberg, U. (eds): Lecture Notes in Math.",
+TITLE     = "On Multigrid iterations with Defect Correction. In: Hackbusch, W.; Trottenberg, U. (eds): Lecture Notes in Math.",
 BOOKTITLE = "Multigrid Methods",
 VOLUME = "960",
 PAGES     = "461--473",
@@ -334,7 +350,7 @@ MRREVIEWER = {S. F. McCormick},
 
 @ARTICLE{ch7:LaytonEtAl2002,
 AUTHOR  = {Layton, W. and Lee, H. K. and Peterson, J.},
-TITLE   = {A defect-correction method for the incompressible Navier-Stokes equations},
+TITLE   = "{A defect-correction method for the incompressible Navier-Stokes equations}",
 JOURNAL = {Appl. Math. Comput.},
 VOLUME  = {129},
 NUMBER = {1},
@@ -347,7 +363,7 @@ title = "Exaflop/s: The why and the how",
 journal = "Comptes Rendus Mecanique",
 volume = "339",
 number = "23",
-pages = "70 - 77",
+pages = "70--77",
 year = "2011",
 issn = "1631-0721",
 doi = "10.1016/j.crme.2010.11.002",
@@ -361,21 +377,22 @@ keywords = "Exaflop"
 
 @misc{ ch7:lessismore,
   author = "Feldman, M.",
-  title = "Less is More: Exploiting Single Precision Math in HPC",
+  title = "{Less is More: Exploiting Single Precision Math in HPC}",
   year = "2006",
-  url = "http://archive.hpcwire.com/hpc/692906.html"
+  howpublished = "http://archive.hpcwire.com/hpc/692906.html"
 }
 
 @PHDTHESIS{ch7:ENG06,
 AUTHOR    = "Engsig-Karup, A. P.",
 TITLE     = "Unstructured Nodal {DG-FEM} solution of high-order Boussinesq-type equations",
-SCHOOL    = "PhD. Thesis. Department of Mechanical Engineering, Technical University of Denmark",
-YEAR      = "2006"
+SCHOOL    = "Department of Mechanical Engineering, Technical University of Denmark",
+YEAR      = "2006",
+howpublished = "http://orbit.dtu.dk/en/publications/id(52502078-1608-4799-8e55-41f60bb92db6).html"
 }
 
 @book{ch7:Whalin1971,
   title={The Limit of Applicability of Linear Wave Refraction Theory in a Convergence Zone},
-  author={Whalin, R. W. and United States. Army. Corps of Engineers and Waterways Experiment Station (U.S.)},
+  author={Whalin, R. W. and United States Army. Corps of Engineers and Waterways Experiment Station (U.S.)},
   series={Research report},
   url={http://books.google.dk/books?id=wwvWSgAACAAJ},
   year={1971},
@@ -405,7 +422,7 @@ year = "1978"
 @ARTICLE{ch7:MS98,
 AUTHOR    = "Madsen, P. A. and Sch{\"{a}}ffer, H. A.",
 TITLE     = "Higher order Boussinesq-type equations for surface gravity waves - derivation and analysis.",
-JOURNAL   = "In Advances in Coastal and Ocean Engineering",
+JOURNAL   = "Advances in Coastal and Ocean Engineering",
 VOLUME    = "356",
 YEAR      = "1998",
 PAGES     = "3123--3181"
@@ -431,10 +448,10 @@ published = {SIAM}
 
 @ARTICLE{ch7:GlimbergEtAl2012,
   AUTHOR =       {S. L. Glimberg and A. P. Engsig-Karup},
-  TITLE =        {On a Multi-GPU Implementation of a Free Surface Water Wave Model for Large-scale Simulations},
-  JOURNAL =      {Submitted to: Special Issue of the Journal Parallel Computing},
+  TITLE =        "{On a Multi-GPU Implementation of a Free Surface Water Wave Model for Large-scale Simulations}",
+  JOURNAL =      "{Submitted to a Special Issue of the Journal Parallel Computing}",
   YEAR =         {2012},
-  volume =       {7th Special Issue devoted to PMAA 2012},
+  volume =       {7th Special Issue devoted to PMAA},
 }
 
 @BOOK{ch5:Smith1996,
@@ -452,7 +469,7 @@ published = {SIAM}
 @article{ch7:MS07,
 	author = {M. Gander and S. Vandewalle},
 	title = {Analysis of the parareal time-parallel time-integration method},
-	journal = {SIAM Journal of scientific computing},
+	journal = {SIAM Journal of Scientific Computing},
 	year = {2007},
 	volume = {29},
 	number = {2},
@@ -462,7 +479,7 @@ published = {SIAM}
 @article{ch7:LMT01,
 	author = {J.-L. Lions and Y. Maday and G. Turinici},
 	title = {R\'{e}solution d'EDP par un sch\'{e}ma en temps parar\'{e}el},
-	journal = {C.R. Acad Sci. Paris S\'{e}r. I math},
+	journal = {C.R. Acad Sci. Paris S\'{e}r. I Math},
 	year = {2001},
 	volume = {332},
 	pages = {661-668},
@@ -491,7 +508,7 @@ published = {SIAM}
   title =        "Accuracy and Stability of Numerical Algorithms",
   publisher =    "Society for Industrial and Applied Mathematics",
   address =      "Philadelphia, PA, USA",
-  edition =      "Second",
+  edition =      "2nd",
   year =         "2002",
   pages =        "xxx+680",
   ISBN =         "0-89871-521-0"
@@ -510,8 +527,9 @@ doi = {10.1029/JZ070i018p04561}
 
 @BOOK{ch7:SJ01, 
 AUTHOR    = "Svendsen, I. A. and Jonsson, I. G.", 
-TITLE     = "Hydrodynamics of coastal regions", 
+TITLE     = "Hydrodynamics of Coastal Regions", 
 PUBLISHER = "Technical University of Denmark", 
+ADDRESS = "Kongens Lyngby",
 YEAR      = "2001" 
 }
 
@@ -520,7 +538,7 @@ title = "Velocity potential formulations of highly accurate Boussinesq-type mode
 journal = "Coastal Engineering",
 volume = "56",
 number = "4",
-pages = "467 - 478",
+pages = "467--478",
 year = "2009",
 note = "",
 issn = "0378-3839",
@@ -535,11 +553,20 @@ keywords = "Bragg scattering",
 keywords = "Wave reflection/transmission"
 }
 
+
+@INPROCEEDINGS{ch7:LindbergEtAl2012b,
+AUTHOR = " Lindberg, O. and Bingham, H. B. and Engsig-Karup, A. P. and Madsen, P.",
+TITLE = "Towards Real Time Simulation of Ship-Ship Interaction",
+BOOKTITLE = "Proceedings of The 27th International Workshop on Water Waves and Floating Bodies (IWWWFB)",
+ADDRESS = "Kongens Lyngby, Denmark",
+YEAR = "2012"
+}
 @INPROCEEDINGS{ch7:LindbergEtAlIWWWFB2012,
-   author    = {Lindberg, O. and Bingham, H. B. and Engsig-Karup, A. P. and Madsen, P. A.}, 
-   title     = {Toward Real Time Simulation of Ship-Ship Interaction}, 
-   booktitle = {The 27th International Workshop on Water Waves and Floating Bodies}, 
-   year      = {2012}
+AUTHOR = " Lindberg, O. and Bingham, H. B. and Engsig-Karup, A. P. and Madsen, P.",
+TITLE = "Towards Real Time Simulation of Ship-Ship Interaction",
+BOOKTITLE = "Proceedings of The 27th International Workshop on Water Waves and Floating Bodies (IWWWFB)",
+ADDRESS = "Kongens Lyngby, Denmark",
+YEAR = "2012"
 }
 
 @article{ch7:RavenJMST2010,
@@ -571,7 +598,7 @@ keywords = "Wave reflection/transmission"
 
 @ARTICLE{ch7:PT90,
 AUTHOR    = "Press, W. H. and Teukolsky, S. A.",
-TITLE     = "Savitzky-Golay smoothening filters",
+TITLE     = "{Savitzky-Golay smoothening filters}",
 JOURNAL   = "Computers in Physics",
 VOLUME    = "4",
 YEAR      = "1990",
diff --git a/BookGPU/Chapters/chapter7/ch7.tex b/BookGPU/Chapters/chapter7/ch7.tex
index a44cbb5..03703d4 100644
--- a/BookGPU/Chapters/chapter7/ch7.tex
+++ b/BookGPU/Chapters/chapter7/ch7.tex
@@ -1,5 +1,5 @@
 
-\chapterauthor{Allan P. Engsig-Karup, Stefan L. Glimberg, Allan S. Nielsen and Ole Lindberg}{Technical University of Denmark}
+\chapterauthor{Allan P. Engsig-Karup, Stefan L. Glimberg, Allan S. Nielsen, Ole Lindberg}{Technical University of Denmark}
 %\chapterauthor{Stefan L. Glimberg}{Technical University of Denmark}
 %\chapterauthor{Allan S. Nielsen}{Technical University of Denmark}
 %\chapterauthor{Ole Lindberg}{Technical University of Denmark}
@@ -10,39 +10,46 @@
 \begin{figure}[!htb]
 \centering
 \includegraphics[width=0.95\textwidth]{Chapters/chapter7/figures/figSeries60CB06Type7StedaySnapshot-eps-converted-to.pdf}
-%\caption{Snapshot of steady state wave field generated by a Series 60 ship hull.}
+\caption{Snapshot of steady state wave field generated by a Series 60 ship hull.}
 \end{figure}
 
 \newpage
-In this chapter, we present details of a heterogenous and massively parallel GPU library implementation in CUDA C/C++ of a  nonlinear free surface water wave model \cite{ch7:EngsigKarupEtAl2011}. We describe how flexible-order finite difference\index{finite difference} approximations to the partial differential equations of the model can be prototyped using library components provided in an in-house library. In this library hardware-specific implementation details are hidden via template-based components, as described in chapter \ref{ch5}. We provide details of the modelling basis and important unique numerical properties which has been made tuneable to create a powerful tool that can be tailored for specific purposes in engineering analysis. The model is based on unified potential flow theory, and can be applied in scientific applications related to maritime engineering. It can be applied for cost-efficient estimation of wave propagation and transformation of irregular multidirectional waves over variable depth, kinematics and structural wave loads over large areas and scales.
+In this chapter, we use our library for heterogenous and massively parallel GPU implementations. The library is written in Compute Unified Device Architecture (CUDA) C/C++ and a fully nonlinear and dispersive free surface water wave model \cite{ch7:EngsigKarupEtAl2011} is implemented. We describe how flexible-order finite difference\index{finite difference} (stencil) approximations to the partial differential equations of the model can be prototyped using library components provided in an in-house library. In this library hardware-specific implementation details are hidden via template-based components, as described in Chapter \ref{ch5}. We provide details of the modeling basis and important unique numerical properties which have been made tuneable to create a powerful and robust tool that can be tailored for specific purposes in engineering analysis. The model is based on unified potential flow theory and can be applied in scientific applications related to maritime engineering. It can be applied for cost-efficient estimation of broad banded wave propagation, transformation of irregular multidirectional waves over variable depth, kinematics and structural wave loads over large areas and scales.
 
-A main motivation of this work is to deliver exceptional performance to minimize calculation times, using modern parallel hardware technologies in combination with a proper choice of discretization methods and data-local algorithms. Data-local algorithms with optimal complexity, such that work and memory requirements grow (scale) linearly with problem size on any hardware system. For the wave model this is achieved by explicit time integration and iterative solution of a large non-symmetric and sparse linear $\sigma$-transformed Laplace problem. For the latter, we use an iterative Preconditioned Defect Correction (PDC) method, accelerated using a geometric multigrid preconditioning strategy. We use modern programming paradigms in the form of MPI and CUDA for development of a novel massively parallel wave modelling tool, executable on modern heterogenous many-core hardware.
+A main motivation of this work is to deliver exceptional performance to minimize calculation times, using modern parallel hardware technologies in combination with a proper choice of discretization methods and data-local algorithms with optimal complexity. This enable work and memory requirements to grow (scale) linearly with problem size on a suitable hardware system. For the wave model this is achieved by explicit time integration and iterative solution of a large nonsymmetric and sparse linear $\sigma$-transformed Laplace problem. For the latter, we use an iterative Preconditioned Defect Correction (PDC) method, accelerated using a geometric multigrid preconditioning strategy. We use modern programming paradigms in the form of Message Passing Interface (MPI) and CUDA for development of a novel massively parallel wave modelling tool, executable on modern heterogenous many-core hardware.
 
-One purpose of the developed numerical model is to perform hydrodynamic calculations in computationally intensive interactive real-time simulations. Realistic simulations are, with present technology and available computational resources, a tremendous challenge in this setting. Yet, our aim is to take a first step in this direction, and compute first-order accurate hydrodynamics for near-realistic simulations of unsteady ship-wave dynamics in a large ship simulator, used for training purposes in seakeeping operations. For this type of application, a mandatory ingredient for real-time and interactive simulation is a truly high-performance parallel implementation to ensure data processing in time for interactive visualization and responses. Details of the model properties, implementation, and promising novel combinations of techniques and algorithms for acceleration of performance are presented. Numerical experiments and benchmarks are provided to demonstrate the accuracy and efficiency of the model across recent generations of many-core CUDA-enabled hardware.
+One purpose of the developed numerical model is to ultimately perform hydrodynamic calculations in the time domain for practical analysis and simulation, e.g., to enable computationally intensive interactive real-time simulations. Realistic interactive simulations are, with present technology and available computational resources, a tremendous challenge in this setting. Yet, our aim is to take a first step in this direction and compute first-order accurate hydrodynamics for near-realistic simulations of unsteady ship-wave dynamics in a large ship simulator, used for training purposes in seakeeping operations. For this type of application, a mandatory ingredient for real-time and interactive simulation is a truly high-performance parallel implementation to ensure data processing in time for interactive visualization and responses. Details of the model properties, implementation, and promising novel combinations of techniques and algorithms for acceleration of performance are presented. Numerical experiments and benchmarks are provided to demonstrate the accuracy and efficiency of the model across recent generations of many-core CUDA-enabled hardware.
 
 \section{On hardware trends and challenges in scientific applications}
 
-During the last two decades we have seen how computer graphics hardware has been developed from fixed pipeline processors with no level of programmability, to flexible high-performance hardware platforms, suitable for general purpose scientific computations other than computer graphics. This trend has been disruptive high-performance computing on mass-produced commodity hardware and give new opportunities for computational science and engineering on work stations for a broad range of scientific applications. This emphasizes the increasingly important role of computers in simulation of real world dynamics \cite{ch7:Keyes201170}. In recent years, the Compute Unified Device Architecture (CUDA) programming model, based on the standard C/C++ programming language and introduced by Nvidia, has become popular as a proprietary and widely used standard in high performance communities. It is by design and supported functionality, easy and sufficient to be deployed for wide improvement of existing and new applications across science and engineering fields, that can benefit from the the use of heterogenous hardware.
+During the last two decades we have seen how computer graphics hardware has been developed from fixed pipeline processors with no level of programmability, to flexible high-performance hardware platforms, suitable for general purpose scientific computations other than computer graphics. This trend has contributed to a disruptive breakthrough in high-performance computing on mass-produced commodity hardware and fuelled new opportunities for computational science and engineering for a broad range of scientific as well as modern business applications. This emphasizes the increasingly important role of computers in simulation of real world dynamics \cite{ch7:Keyes201170}. In recent years, the CUDA programming model, based on the standard C/C++ programming language and introduced by NVIDIA Corporation worldwide, has become popular as a proprietary and widely used standard in high performance communities. It is, by design and supported functionality, easy and sufficient to be deployed for wide improvement of existing and new applications across science and engineering fields, that can benefit from the the use of heterogenous hardware.
 
-We should be careful about speculating about the future and extrapolate from current trends. The TOP 500 list\footnote{\url{http://www.top500.org.}} of supercomputers shows that there are some general noticeable hardware trends and give indication of what to expect in the near future. First, since 2005 we have seen how power constraints and resulting heat dissipation problems forced chip producers to increase the number of cores rather than clock frequency.  Multi-core processors have become the new standard and many-core processors are becoming available as a standard in commodity hardware, from personal laptops to super-computing clusters.
+We should be careful about speculating about the future and extrapolating from current trends. The TOP 500 list\footnote{\url{http://www.top500.org.}} of supercomputers shows that there are some general noticeable hardware trends and gives indication of what to expect in the near future. First, since 2005 we have seen how power constraints and resulting heat dissipation problems forced chip producers to increase the number of cores rather than clock frequency.  Multicore processors have become the new standard and many-core processors are becoming available as a standard in commodity hardware, from personal laptops to supercomputing clusters.
 
-This trend suggests, that there will be less fast low-latency memory available per core in the future, favoring data-locality. In addition, we have also seen how communication speed to computation speed ratio decreases, making it increasingly difficult to supply data to hungry floating point units. In addition, there will likely be increasing amounts of data to store as a result of increasing processing capabilities. The rapidly increasing floating point performance following Moore's law for transistor production has resulted in a significant memory gap which leaves most PDE-based scientific applications bandwidth bound rather than compute bound. This trend is driven by pure commercial needs and not the needs of high-performance computing. Roads to better performance includes standardization of software infrastructure, rethinking algorithms to better exploit memory hierarchies optimally to boost strong scaling properties, increase locality in algorithms and introduce as much concurrency and work as possible to both utilize and exploit the many cores. Also, software that can utilize many cores should be fault-tolerant to maximize time to solution for application users. We should also expect to see multiple layers of parallelism that will have to be exploited and possibly auto-tuned to optimally utilize hardware. This introduces new challenges in compilers, requires programming experts with hardware knowledge and introduces new trends in software developments to leverage productivity and utilize available computational resources in more optimal ways. We have observed a fundamental paradigm shift of underlying hardware design towards much more heterogeneity and parallelism.
+This trend suggests that there will be less fast low-latency memory available per core in the future, favoring data-locality in algorithms. In addition, we have also seen how communication speed to computation speed ratio decreases, making it increasingly difficult to supply data to hungry floating point units. In addition, there will likely be increasing amounts of data to store as a result of increasing processing capabilities. The rapidly increasing floating point performance following Moore's law for transistor production has resulted in a significant memory gap which leaves most scientific applications based on partial differential equations (PDEs) bandwidth bound rather than compute bound. This trend is driven by pure commercial needs and not the needs of high-performance computing. Roads to better performance include standardization of software infrastructure, rethinking algorithms to better exploit memory hierarchies optimally to boost strong scaling properties, increasing locality in algorithms, and introducing as much concurrency and work as possible to both utilize and exploit the many cores. Also, software that can utilize many cores should be fault-tolerant to maximize time to solution for application users. We should also expect to see multiple layers of parallelism that will have to be exploited and possibly autotuned to optimally utilize available hardware resources. This introduces new challenges in compilers, requires programming experts with hardware knowledge, and introduces new trends in software developments to leverage productivity and utilize available computational resources in more optimal ways. We have observed a fundamental paradigm shift of underlying hardware design towards much more heterogeneity and parallelism.
 
-A key problem is, that improvements in performance require porting legacy codes\footnote{In the worst case, a legacy code is an undocumented serial code developed by a developer no longer around and for a long time ago for execution on single core hardware.} to new hardware, and possibly changing algorithms which have been developed for the conventional single core processors decades ago. Without this, it may be impossible to utilize and scale algorithmic work optimally to achieve high performance on modern and emerging hardware. This problem is currently addressed with rapid progress by researchers and industry by development of new optimized libraries, that can utilize such new hardware at minimum effort. While we have seen significant improvements in such efforts, and today see much more rapid development of applications, there are still few applications running entirely on heterogenous hardware. However, increasing amounts of applications are utilizing accelerators to parts of their code to gain speedups albeit with less dramatic improvements of performance as one can potentially find by adapting most, if not all of the application to modern hardware.
+A key problem is that improvements in performance require porting legacy codes\footnote{In the worst case, a legacy code is an undocumented serial code developed a long time ago by a developer no longer around.} to new hardware, and possibly changing algorithms which have been developed for the conventional single core processors decades ago. Without this, it may be impossible to utilize and scale algorithmic work optimally to achieve high performance on modern and emerging hardware. This problem is currently addressed with rapid progress by researchers and industry by development of new optimized libraries that can utilize such new hardware. While we have seen significant improvements in such efforts, and today see much more rapid development of applications, there are still relatively few scientific applications running entirely on heterogenous hardware. 
+
+%The main justification for porting or developing application on such hardware is a significant performance band expected (significant) 
+
+%This will change with improvements in library software and tools provided by key vendors 
+%
+%However, increasing amounts of applications are utilizing accelerators to parts of their code to gain speedups albeit with less dramatic improvements of performance as one can potentially find by adapting most, if not all of the application to modern hardware.
+
+In this work, we explore some of these trends by developing, by bottom-up-design, a water wave model which can be utilized in maritime engineering and with the intended use on affordable office desktops as well as on more expensive modern compute clusters for engineering analysis purposes.  
 
-In this work, we explore some of these trends by developing, by bottom-up-design, a water wave model which can be utilized in maritime engineering and with intended use on affordable office desktops as well as on more expensive modern compute clusters for analysis purposes.  
 \section{On modeling paradigms for highly nonlinear and dispersive water waves}
 \label{ch7:sec:modernwavemodellingparadigms}
 
-We see development of new hardware technologies as a key driver for exploring new and revisiting existing approaches that can contribute to next-generation modelling techniques.
+We see the development of new or improved hardware technologies as a key driver for exploring new and revisiting existing approaches that can contribute to next-generation modeling techniques.
 
-For instance, the dominant wave modelling paradigm today for numerical simulation in coastal engineering tools is the use of Boussinesq-type \index{Boussinesq models} formulations for approximate solution of unified potential flow\index{potential flow} equations over varying bathymetry \cite{ch7:MS98}. The use of Boussinesq-type models in engineering tools was pioneered by Abott et. al. (1978) \cite{ch7:AbottEtAl1978,ch7:AbottEtAl1984} based on the original Boussinesq equations due to Peregrine (1967) \cite{ch7:Peregrine1967} for calculations of waves in a harbour area. New formulations for highly nonlinear and dispersive water waves, useful for description of wave propagation in the important application range from deep to shallow areas, have been subject of intense research for more than two decades. Such higher order formulations can be derived by first introducing an infinite Mclaurin series solution to the Laplace equation as described in \cite{ch7:AMS99}. This technique was later generalized to arbitrary expansion levels \cite{ch7:MBL02}. By analytical truncation of such series solutions, a polynomial variation in the vertical is assumed, and provides the basis for efficient higher order Boussinesq-type formulations \cite{ch7:MBS03,ch7:Bingham2009467} for fully nonlinear and dispersive water waves. It is attractive, since it is then possible to eliminate the vertical coordinate in the analytical formulation of the Laplace problem. The resulting approximate model typically contains higher order derivatives that require treatment in numerical models. Thus, this truncation procedure inherently limits the practical application range, however, can be significantly improved by optimization via Pad\'e approximations together with introduction of free parameters for extending the application range via optimization of accuracy in dispersion, kinematics and shoaling characteristics.
+For instance, the dominant wave modeling paradigm today for numerical simulation in coastal engineering tools is the use of Boussinesq-type \index{Boussinesq models} formulations for approximate solution of unified potential flow\index{potential flow} equations over varying bathymetry \cite{ch7:MS98}. The use of Boussinesq-type models in engineering tools was pioneered in 1978 by Abott et. al. \cite{ch7:AbottEtAl1978,ch7:AbottEtAl1984} based on the original Boussinesq equations due to Peregrine \cite{ch7:Peregrine1967} for calculations of waves in a harbor area. New formulations for highly nonlinear and dispersive water waves, useful for description of wave propagation in the important application range from deep to shallow areas, have been the subject of intense research for more than two decades. Such higher order formulations can be derived by first introducing an infinite Mclaurin series solution to the Laplace equation as described in \cite{ch7:AMS99}. This technique was later generalized to arbitrary expansion levels \cite{ch7:MBL02}. By analytical truncation of such series solutions, a polynomial variation in the vertical is assumed and provides the basis for efficient higher order Boussinesq-type formulations \cite{ch7:MBS03,ch7:Bingham2009467} for fully nonlinear and dispersive water waves. It is attractive, since it is then possible to eliminate the vertical coordinate in the analytical formulation of the Laplace problem. The resulting approximate model contains higher order derivatives to describe dispersion and these require careful treatment in numerical models. Thus, this truncation procedure inherently limits the practical application range, however, it can be significantly improved via Pad\'e approximations together with the introduction of free parameters for extending the finite application range by mathematical optimization to enhance accuracy in dispersion, kinematics and shoaling characteristics.
 
-Main challenges of Boussinesq-type models are accurate and large-scale simulation of waves propagating towards near-shore from deep to shallow waters through surf zones, while accounting for high-order dispersive and nonlinear effects \cite{ch7:Cavaleri2007603}. Within the last two decades, much research has focused on extending the application range through improved formulations in terms of both dispersion, shoaling, kinematic and nonlinear properties. The ultimate high-order Boussinesq-type model due to \cite{ch7:MBS03} was at the time considered a breakthrough in this direction, and since then new promising formulations have been proposed. For example, the methodology behind Boussinesq-type formulations can be extended via a multi-layer approach \cite{ch7:LynettEtAl2004a,ch7:LynettEtAl2004b,ch7:ChazelEtAl2010}, that makes it possible to achieve a similar range of application and levels of accuracy, but without higher derivatives in the formulation that can cause numerical difficulties.
+Main challenges of Boussinesq-type models are accurate and large-scale simulation of waves propagating towards near-shore from deep to shallow waters through surf zones, while accounting for high-order dispersive and nonlinear effects \cite{ch7:Cavaleri2007603}. Within the last two decades, much research has focused on extending the application range through improved formulations in terms of dispersion, shoaling, kinematic and nonlinear properties. The ultimate high-order Boussinesq-type model due to \cite{ch7:MBS03} was at the time considered a breakthrough in this direction, and since then promising new formulations have been proposed. For example, the methodology behind Boussinesq-type formulations can be extended via a multilayer approach \cite{ch7:LynettEtAl2004a,ch7:LynettEtAl2004b,ch7:ChazelEtAl2010} that makes it possible to achieve a similar range of application and levels of accuracy, but without higher derivatives in the formulation that can cause numerical difficulties.
 
-Boussinesq-type formulations for free surface waves are conventionally evaluated against the unified potential flow theory to evaluate limits to application range and accuracy limits. The use of unified potential theory as a basis for numerical models has traditionally been perceived too expensive \cite{ch7:Lin2008} to solve in comparison with the typically more efficient Boussinesq-type models. This may be true in a strict comparison between the models, especially with respect to applications towards the more restricted shallow regions. However, this is despite that a numerical unified potential flow model can be used for a larger range of practical scientific applications. A unified potential flow model has at most second order derivatives in the formulation. In a numerical setting it has good opportunities for balancing accuracy and work effort by appropriate tuning of discrete parameters. This comes without a need for changing the underlying wave model to extend application range towards deep waters. Thus, the main problem related to the practical use of a unified model in maritime applications is arguably an issue of numerical efficiency.
+Boussinesq-type formulations for free surface waves are conventionally evaluated against the unified potential flow theory to evaluate limits to application range and accuracy limits. The use of unified potential theory as a basis for numerical models has traditionally been perceived as too expensive \cite{ch7:Lin2008} to solve in comparison with the typically more efficient Boussinesq-type models. This may be true in a strict comparison between the models, especially with respect to applications towards the more restricted shallow regions. However, this is in spite of the fact that a numerical unified potential flow model can be used for a larger range of practical scientific applications. A unified potential flow model has at most second-order derivatives in the formulation. In a numerical setting it has good opportunities for balancing accuracy and work effort by appropriate tuning of discrete parameters. This comes without a need for changing the underlying wave model to extend application range towards deep waters. Thus, the main problem related to the practical use of a unified model in maritime applications is arguably an issue of numerical efficiency.
 
-To address this issue, we have recently proposed a proof-of-concept approach, that combines modern many-core hardware with appropriate numerical and parallel strategies to facilitate efficient, accurate and scalable solution of water wave problems~\cite{ch7:EngsigKarupEtAl2011}. The use of potential theory for unsteady water wave computations can be traced at least back to \cite{ch7:HausslingVanEseltine1975}, and the fully nonlinear potential equations have been solved using various numerical methods since then, e.g., see reviews \cite{ch7:Yeung1982,ch7:TsaiYue1996,ch7:DiasBridges2006,ch7:Lin2008}. In the context of the finite-difference method, an efficient and scalable second-order geometric multigrid approach was first proposed by Li \& Fleming (1997) \cite{ch7:LiFleming1997}. Since then, the numerical strategy has been significantly improved in several works \cite{ch7:BinghamZhang2007,ch7:EBL08} via more efficient discretization techniques, with the objective to develop an efficient general purpose strategy, that can be used for a broad range of practical maritime applications. Recently, a comparison with a High-Order Spectral (HOS) model \cite{ch7:DucrozetEtAl2011} was also reported to assess  accuracy and relative differences in efficiency on single-core hardware against a superior spectral modelling basis for a numerical wave tank setup in a structured domain with a flat sea bed.
+To address this issue, we have recently proposed a new approach in a proof-of-concept that combines modern many-core hardware with appropriate numerical and parallel strategies to facilitate efficient, accurate and scalable solution of water wave problems~\cite{ch7:EngsigKarupEtAl2011}. The use of potential theory for unsteady water wave computations can be traced at least back to 1975 \cite{ch7:HausslingVanEseltine1975}, and the fully nonlinear potential equations have been solved using various numerical methods since then, e.g., see reviews \cite{ch7:Yeung1982,ch7:TsaiYue1996,ch7:DiasBridges2006,ch7:Lin2008}. In the context of the finite-difference method, an efficient and scalable second-order geometric multigrid approach was first proposed by Li and Fleming in 1997 \cite{ch7:LiFleming1997}. Since then, the numerical strategy has been significantly improved in several works \cite{ch7:BinghamZhang2007,ch7:EBL08} that have led to more efficient and robust discretization techniques, with the objective of developing a general purpose strategy, that can be used for a broad range of practical maritime applications. Recently, a comparison with a High-Order Spectral (HOS) model \cite{ch7:DucrozetEtAl2011} was also reported to assess  accuracy and relative differences in efficiency on single-core hardware against a superior spectral modeling basis for a numerical wave tank setup in a structured domain with a flat sea bed.
 
 \section{Governing equations}
 \label{ch7:goveq}
@@ -53,7 +60,7 @@ Conservation of mass\index{mass conservation} for an infinitely small control vo
 \begin{align}
 \frac{\partial \rho}{\partial t} + \nabla \cdot (\rho {\bf u}) = 0,
 \end{align}
-where $\rho$ is fluid density, and ${\bf u}=(u,v,w)$ is the velocity field vector and $\nabla=(\partial/\partial x, \partial/\partial y, \partial/\partial z)$ a Cartesian gradient operator. If we assume that fluid density is constant, i.e. that the fluid is incompressible, the mass continuity equation simplifies to
+where $\rho$ is fluid density, ${\bf u}=(u,v,w)$ is the velocity field vector, and $\nabla=(\partial/\partial x, \partial/\partial y, \partial/\partial z)$ is a Cartesian gradient operator. If we assume that fluid density is constant, i.e., that the fluid is incompressible, the mass continuity equation simplifies to
 \begin{align}
 \label{ch7:conteq}
  \nabla \cdot {\bf u} = 0,
@@ -65,9 +72,9 @@ Conservation of momentum\index{momentum conservation} for an infinitely small co
 \label{ch7:NSeq}
 \rho \frac{D{\bf u}}{Dt} = -\nabla p + \mu \nabla^2{\bf u} + {\bf F},
 \end{align}
-where $p$ is pressure and ${\bf F}$ is the net force vector acting on the fluid volume, assumed to be of the form ${\bf F}=\rho{\bf g}$ with ${\bf g}=(0,0,-g_z)$ accounting for gravitational effects in the vertical direction. This implies that surface tension effects on the free surface is neglected. Exact solutions to the Navier-Stokes equations are in general difficult to obtain and this motivate the use of numerical methods for direct simulation of fluid flow and if necessary analytical simplifications to only account for physics of interest.
+where $p$ is pressure and ${\bf F}$ is the net force vector acting on the fluid volume, assumed to be of the form ${\bf F}=\rho{\bf g}$ with ${\bf g}=(0,0,-g_z)$ accounting for gravitational effects in the vertical direction. This implies that surface tension effects on the free surface are neglected. Exact solutions to the Navier-Stokes equations are in general difficult to obtain, and this motivates the use of numerical methods for direct simulation of fluid flow and if necessary analytical simplifications to account only for physics of interest.
 
-The material derivative for a co-moving coordinate system used in Lagrangian formulations
+The material derivative for a comoving coordinate system used in Lagrangian formulations
 \begin{align}
 \frac{D}{Dt}\equiv \frac{\partial }{\partial t} + ({\bf u}\cdot \nabla),
 \end{align}
@@ -84,14 +91,14 @@ and make use of the following relationship known from vector calculus
 \begin{align}
 \frac{1}{2}\nabla({\bf u}\cdot {\bf u}) = ({\bf u}\cdot \nabla){\bf u} + {\bf u}\times (\nabla \times {\bf u}),
 \end{align}
-we can rewrite \eqref{ch7:momentum} into
+we can rewrite \eqref{ch7:momentum} as
 \begin{align}
 \label{ch7:momentum2}
 \frac{D{\bf u}}{Dt} = \frac{\partial {\bf u}}{\partial t} +  \frac{1}{2}\nabla({\bf u}\cdot {\bf u}).
 \end{align}
-Since the Navier-Stokes equations \eqref{ch7:NSeq} can be interpreted as the application of Newton's second law to an infinitely small fluid volume, we can establish that the changes in momentum in an infinitely small control volume of a fluid is simply the sum of forces due to pressure gradients, dissipative forces, gravitational forces and possibly other forces acting inside the fluid volume.
+Since the Navier-Stokes equations \eqref{ch7:NSeq} can be derived by application of Newton's second law to an infinitely small fluid volume, we can establish the following. Changes in momentum in an infinitely small control volume of a fluid is simply the sum of forces due to pressure gradients, dissipative forces, gravitational forces, and possibly other forces acting inside the fluid volume.
 
-To accurately simulate propagation of long gravity waves and high-reynolds number flows in the context of maritime applications it is for many applications acceptable to assume that viscous forces are small in comparison with inertial forces. In this case, it is reasonable to assume that the fluid is inviscid and we can neglect the viscous terms in \eqref{ch7:NSeq}. Then we obtain the set of Euler equations
+To accurately simulate propagation of long gravity waves and high Reynolds number flows in the context of maritime applications it is for many applications acceptable to assume that viscous forces are small in comparison with inertial forces. In this case, it is reasonable to assume that the fluid is inviscid and we can neglect the viscous terms in \eqref{ch7:NSeq}. Then we obtain the set of Euler equations
 \begin{align}
 \label{ch7:momentum3}
 \frac{D{\bf u}}{Dt} = - \frac{1}{\rho}\nabla p + \frac{1}{\rho} {\bf F},
@@ -106,7 +113,7 @@ If we introduce a scalar velocity potential function $\phi$ that relates to the
 the number of unknowns can be lowered and we find that the scalar velocity potential function due to \eqref{ch7:conteq} must satisfy the Laplace equation
 \begin{align}
 \label{ch7:laplaceeq}
-\nabla^2\phi=0,
+\nabla^2\phi=0.
 \end{align}
 Solutions to this equation are completely determined by the boundary conditions\index{boundary condition}. Thus, it is possible, given appropriate boundary conditions, to determine the scalar velocity potential $\phi$ in all of the domain by solving the resulting Laplace problem. When the scalar velocity potential function is known, detailed information of the kinematics can immediately be obtained.
 
@@ -114,15 +121,15 @@ With the definition of the vector field \eqref{ch7:vectorfield}, we can collect
 \begin{align}
 \rho\left( \frac{\partial \nabla \phi}{\partial t} + \nabla \left(\frac{1}{2} \nabla\phi \cdot \nabla\phi\right)  \right) = -\nabla p - \nabla(\rho g z),
 \end{align}
-having assumed that the net force can be decomposed into pressure and gravity forces only. This set of equations can be rewritten into
+having assumed that the net force can be decomposed into pressure and gravity forces only. This set of equations can be rewritten as
 \begin{align}
 \nabla \left[ \rho \frac{\partial \phi}{\partial t} + \rho \frac{1}{2} \nabla\phi \cdot \nabla\phi + p + \rho g z \right] = 0,
 \end{align}
-and by integration in space we arrive at the unsteady Bernoulli's Equation
+and by integration in space we arrive at the unsteady Bernoulli's equation
 \begin{align}
 \rho \frac{\partial \phi}{\partial t} + \rho \frac{1}{2} \nabla\phi \cdot \nabla\phi + p + \rho g z = G(t),
 \end{align}
-where $G(t)$ is an arbitrary function of the integration that can be assumed to be zero as it only defines a reference value for the unphysical scalar velocity potential function. Bernoulli's equation is typically used as a dynamic condition for the free fluid surface, expressing that the fluid pressure at the free surface is equal to the pressure in the air above the free surface.
+where $G(t)$ is an arbitrary function of the integration that can be assumed to be zero as it defines only a reference value for the unphysical scalar velocity potential function. Bernoulli's equation is typically used as a dynamic condition for the free fluid surface, expressing that the fluid pressure at the free surface is equal to the pressure in the air above the free surface.
 
 In the following, we assume that the displacement of the free surface $z=\eta({\bf x},t)$ is described in a Cartesian coordinate system with the $xy$-plane located at the still water level and the positive $z$-axis pointing upwards. It is typical to assume a constant atmospheric pressure at the free surface by defining $p=0$ as reference. This leaves us with a dynamic boundary condition for the free surface velocity potential function stated as
 \begin{align}
@@ -132,7 +139,7 @@ At the free surface, we can determine a kinematic free surface condition by dete
 \begin{align}
 \frac{\partial \eta}{\partial t} = -\frac{\partial \phi}{\partial x} \frac{\partial \eta}{\partial x} - \frac{\partial \phi}{\partial y} \frac{\partial \eta}{\partial y} +  \frac{\partial \phi}{\partial z}, \quad z=\eta.
 \end{align}
-Spatial and temporal differentiation of the free surface variables are related by the chain rule
+Spatial and temporal differentiations of the free surface variables are related by the chain rule
 %
 \begin{align}
 \boldsymbol{\nabla}\tilde{\phi} &= (\boldsymbol{\nabla}\phi)_{z=\eta} + \left(\frac{\partial \phi}{\partial z}\right)_{z=\eta}\boldsymbol{\nabla}\eta,  \\
@@ -144,37 +151,37 @@ and can be used to transform the free surface problem to variables defined solel
 \begin{subequations}
 \begin{align}
 \frac{\partial}{\partial t} \eta &= -\boldsymbol{\nabla}\eta\cdot\boldsymbol{\nabla}\tilde{\phi}+\tilde{w}(1+\boldsymbol{\nabla}\eta\cdot\boldsymbol{\nabla}\eta), \label{ch7:FSeta} \\
-\frac{\partial}{\partial t} \tilde{\phi} &= -g\eta - \frac{1}{2}\left(\boldsymbol{\nabla}\tilde{\phi}\cdot\boldsymbol{\nabla}\tilde{\phi}-\tilde{w}^2(1+\boldsymbol{\nabla}\eta\cdot\boldsymbol{\nabla}\eta)\right).
+\frac{\partial}{\partial t} \tilde{\phi} &= -g\eta - \frac{1}{2}\left(\boldsymbol{\nabla}\tilde{\phi}\cdot\boldsymbol{\nabla}\tilde{\phi}-\tilde{w}^2(1+\boldsymbol{\nabla}\eta\cdot\boldsymbol{\nabla}\eta)\right),
 \label{ch7:FSphi}
 \end{align}
 \label{ch7:FSorigin}
 \end{subequations}
 %
-with the $\boldsymbol{\nabla}$-operator from here and forward defined as a horizontal gradient operator $\boldsymbol{\nabla}=(\partial_x,\partial_y)$ and tilde's are used for free surface variables. To solve the set of unsteady free surface equations \eqref{ch7:FSorigin}, we need a closure between the horizontal and vertical free surface velocity variables. This can be established by solving the Laplace equation \eqref{ch7:laplaceeq} in the interior domain together with suitable boundary conditions.
+with the $\boldsymbol{\nabla}$-operator from here conveniently re-defined as a horizontal gradient operator $\boldsymbol{\nabla}=(\partial_x,\partial_y)$ and tilde's used for free surface variables. To solve the set of unsteady free surface equations \eqref{ch7:FSorigin}, we need a closure between the horizontal and vertical free surface velocity variables. This can be established by solving the Laplace equation \eqref{ch7:laplaceeq} in the interior domain together with suitable boundary conditions.
 
 A kinematic bottom condition can be derived by assuming that the fluid particles follow a streamline along the solid sea bed. Consider the rate of change of such a streamline at still-water depth $z=-h({\bf x},t)$ and we find
 \begin{align}
 \label{ch7:kinbot}
 \frac{\partial z}{\partial t} = - \frac{\partial h}{\partial x}  \frac{\partial \phi}{\partial x} - \frac{\partial h}{\partial y}  \frac{\partial \phi}{\partial y} - \frac{\partial h}{\partial t}, \quad z=-h({\bf x},t).
 \end{align}
-We assume that the sea bed is static allowing us to neglect the last term. Thus, by specifying $\tilde\phi$ as a Dirichlet condition at the free surface together with a kinematic bottom boundary condition at the sea bed defines a Laplace problem
+We assume that the sea bed is static allowing us to neglect the last term. Thus, specifying $\tilde\phi$ as a Dirichlet condition at the free surface together with a kinematic bottom boundary condition at the sea bed defines a Laplace problem
 %
 \begin{subequations}
 \label{ch7:eq:laplaceproblem}
 \begin{align}
 \phi & =  \tilde{\phi}, \quad z = \eta({\bf x},t), \\
 \boldsymbol{\nabla}^2\phi + \partial_{zz}\phi & = 0, \quad -h\leq z<\eta({\bf x},t), \label{ch7:Laplace} \\
-\partial_z \phi + \boldsymbol{\nabla}h\cdot\boldsymbol{\nabla}\phi &= 0, \quad z=-h. \label{ch7:KB}
+\partial_z \phi + \boldsymbol{\nabla}h\cdot\boldsymbol{\nabla}\phi &= 0, \quad z=-h, \label{ch7:KB}
 \end{align}
 \end{subequations}
-where we have used that $\partial_t z \equiv \partial_z \phi$ to rewrite the first term of the kinematic bottom condition \eqref{ch7:kinbot}.
+where we have used $\partial_t z \equiv \partial_z \phi$ to rewrite the first term of the kinematic bottom condition \eqref{ch7:kinbot}.
 %
-The moving free surface makes the spatial fluid domain $\Omega$ vary in time. The main challenges in solving these equations numerically are to deal with the time-dependent fluid domain and nonlinearity of the equations.
+The moving free surface makes the spatial fluid domain $\Omega$ vary in time. The main challenges in solving these equations numerically are dealing with the time-dependent fluid domain and nonlinearity of the equations.
 %
 
 \subsection{Boundary conditions}\index{boundary condition}
 
-We consider three types of boundaries, namely, fully reflective boundaries, incident wave boundaries and absorbing boundaries. The fully reflective boundaries are handled through numerical approximations of the boundary conditions for solid walls and bottom surfaces stating that the velocity in the normal direction is zero
+We consider three types of boundaries, namely, fully reflective boundaries, incident wave boundaries, and absorbing boundaries. The fully reflective boundaries are handled through numerical approximations of the boundary conditions for solid walls and bottom surfaces stating that the velocity in the normal direction is zero
 \begin{align}
 {\bf n}\cdot \nabla \phi = 0, \quad {\bf x}\in\partial\Omega,
 \end{align}
@@ -183,25 +190,25 @@ where ${\bf n}=(n_x,n_y)$ is a two-dimensional normal vector pointing outwards f
 {\bf n} \cdot \nabla\eta = 0.
 \end{align}
 
-Incident wave and absorbing boundary conditions are imposed via an embedded penalty forcing technique as described in section \ref{ch7:wavegen}.
+Incident wave and absorbing boundary conditions are imposed via an embedded penalty forcing technique as described in Section \ref{ch7:wavegen}.
 
 \section{The numerical model}
 \label{ch7:sec:nummodel}
 
 The unified potential flow model is attractive as a basis due to the underlying analytical properties.
-From a numerical point of view, an efficient and scalable discretization strategy should be based on using a data-local method, e.g., a flexible-order finite difference method for discretely approximating the governing equations and imposing boundary conditions via fictitious ghost points techniques as described in \cite{ch7:BinghamZhang2007,ch7:EBL08}. Such an approach has several attractive features from a scientific computing perspective. For example, finite difference methods are among the simplest and most efficient methods due to the use of structured grids and data structures. This result in low implementation and computational complexity which maps efficiently to modern computer architectures. Formal accuracy and tuneable numerics are achieved by employing flexible-order finite difference\index{finite difference} (local stencil)\index{stencil} approximations. 
+From a numerical point of view, an efficient and scalable discretization strategy should be based on using a data-local method, e.g., a flexible-order finite difference method for discretely approximating the governing equations and imposing boundary conditions via fictitious ghost points techniques as described in \cite{ch7:BinghamZhang2007,ch7:EBL08}. Such an approach has several attractive features from a scientific computing perspective. For example, finite difference methods are among the simplest and most efficient methods due to the use of structured grids and data structures. This results in low implementation and computational complexity which maps efficiently to modern computer architectures. Formal accuracy and tuneable numerics are achieved by employing flexible-order finite difference\index{finite difference} (local stencil)\index{stencil} approximations. 
 
-We present scalability and performance tests based on the same two test environments outlined in chapter \ref{ch5} section \ref{ch5:sec:testenvironments}, plus a a third test environment based on the most recent hardware generation:
+We present scalability and performance tests based on the same two test environments outlined in Chapter \ref{ch5}, Section \ref{ch5:sec:testenvironments}, plus a third test environment based on the most recent hardware generation:
 \begin{description}
-\item[Test environment 3.] Desktop with dual-socket Sandy Bridge Intel Xeon E5-2670 (2.60 GHz) processors, 64GB RAM, 2x Nvidia Tesla K20 GPUs.
+\item[Test environment 4.] Desktop with dual-socket Sandy Bridge Intel Xeon E5-2670 (2.60 GHz) processors, 64GB RAM, 2x Nvidia Tesla K20 GPUs.
 \end{description}
 Performance results can be used to predict actual runtimes as described in \cite{ch7:EngsigKarupEtAl2011}, e.g., for estimation of whether a real-time constraint for a given problem size can be met.
 
 \subsection{Strategies for efficient solution of the Laplace problem}\index{Laplace problem}
 
-As explained in section \ref{ch7:sec:modernwavemodellingparadigms}, for the formulation of potential flow problems there are two widely used paradigms for solving the Laplace problem efficiently. The most widely used approach is of Boussinesq-type where essentially the three-dimensional formulation is reduced to a two-dimensional formulation. The main argument for this type of model reduction procedure is the resulting efficiency in the numerical models. The price paid is typically high-order derivatives in the approximate formulation and is justified by efficient solution of an approximate Laplace problem.
+As explained in Section \ref{ch7:sec:modernwavemodellingparadigms}, for the formulation of potential flow problems there are two widely used paradigms for solving the Laplace problem efficiently. The most widely used approach is the Boussinesq-type where essentially the three-dimensional formulation is reduced to a two-dimensional formulation. The main argument for this type of model reduction procedure is the resulting efficiency in the numerical models. The price paid is typically high-order derivatives in the approximate formulation and is justified by the efficient solution of an approximate Laplace problem.
 
-A second approach, is to transform the equations at PDE level to provide a basis for efficient direct solution of the discrete Laplace problem for the entire volume. This strategy is based on a paradigm where approximations are done by discrete approximations rather than analytical manipulations of the equation at PDE level. This approach introduces at a first look more complexity in the formulation, e.g. by the introduction of mixed derivatives, however, essentially does not limit the application range beyond those resulting from numerical approximations and properties hereof. Using this second approach, it is standard to introduce a $\sigma$-transformation in the vertical coordinate
+A second approach is to transform the partial differential equation mathematically to provide a basis for an efficient direct solution of the discrete Laplace problem for the entire volume. This strategy is based on a paradigm where approximations are done by discrete approximations rather than analytical manipulations of the equation. At a first look, this approach introduces more complexity in the formulation, e.g., by the introduction of mixed derivatives, however, essentially does not limit the application range beyond the numerical approximations and properties hereof. Using this second approach, it is standard to introduce a $\sigma$-transformation in the vertical coordinate
 \begin{align}
 \label{ch7:sigtrans}
 \sigma \equiv \frac{z+h(\boldsymbol{x})}{d(\boldsymbol{x},t)}, \quad 0\leq \sigma \leq 1,
@@ -240,13 +247,13 @@ h-\tfrac{\boldsymbol{\nabla} h\cdot\boldsymbol{\nabla} h}{d}\right)
 \end{align}
 \end{subequations}
 %
-All of these coefficients can be computed explicitly from the known two-dimensional free surface and bottom positions at given instants of time.
+All of these coefficients can be computed explicitly from the known two-dimensional free surface and bottom positions at given instances of time.
 
 The velocity field can be determined from a known $\Phi$ using the relation
 \begin{align}
 ({\bf u},w) = (\boldsymbol{\nabla}, \partial_z \sigma \partial_\sigma) \Phi.
 \end{align}
-The flow can be computed from the scalar velocity potential and used for estimating non-hydrostatic pressure and resulting wave loads. An exact expression for local pressure as a function of the vertical coordinate can be found by vertical integration of the vertical momentum equation to be of the form
+The flow can be computed from the scalar velocity potential and used for estimating nonhydrostatic pressure and resulting wave loads. An exact expression for local pressure as a function of the vertical coordinate can be found by vertical integration of the vertical momentum equation to be of the form
 \begin{align}
 p(z) = \rho g (\eta -z ) + \int_{z}^\eta \partial_t w dz + \frac{1}{2}(\tilde{u}^2-u(z)^2 + \tilde{v}^2 - v(z)^2 + \tilde{w}^2 - w(z)^2).
 \end{align}
@@ -259,21 +266,21 @@ where $S$ is a structural surface.
 
 \subsection{Finite difference approximations}\index{finite difference}
 
-The numerical scheme is implemented as a flexible-order finite difference collocation scheme where all finite difference approximations of derivatives are constructed from one-dimensional approximations in a standard way each having the maximum possible accuracy. In explicit numerical schemes, finite difference approximations can be implemented using a matrix-free technique to exploit that only a few different stencils are in fact needed. This can significantly reduce memory requirements of the implemented model by exploiting that the same small set of stencils can be reused. See chapter \ref{ch5} for more details about matrix free stencil operations supported in our in-house library for heterogenous and massively parallel computing using GPUs.
+The numerical scheme is implemented as a flexible-order finite difference collocation scheme where all finite difference approximations of derivatives are constructed from one-dimensional approximations in a standard way, each having the maximum possible accuracy. In explicit numerical schemes, finite difference approximations can be implemented using a matrix-free technique to exploit that only a few different stencils are in fact needed. This can significantly reduce memory requirements of the implemented model by exploiting that the same small set of stencils can be reused. See Chapter \ref{ch5} for more details about matrix-free stencil operations supported in our in-house library for heterogenous and massively parallel computing using GPUs.
 
 %\newpage
 \subsection{Time integration}\index{time integration}
 
-For users of scientific applications robustness is of paramount importance for the solution of time-dependent PDEs. This makes stability considerations relevant in the context of both explicit and iterative numerical methods often considered most suitable for massively parallel applications. In the following, we address aspects of explicit time integration schemes which is associated with a stability\index{stability} requirement on time steps.
+For users of scientific applications, robustness is of paramount importance for the solution of time-dependent PDEs. This makes stability considerations relevant in the context of both explicit and iterative numerical methods often considered most suitable for massively parallel applications. In the following, we address aspects of explicit time integration schemes which are associated with a stability\index{stability} requirement on time steps.
 
-A Method of Lines\index{method of lines} approach is used for the discretization of the wave model. The spatial discretization yields a system of ordinary differential equations which can be expressed as a semi-discrete system.
+A method of lines\index{method of lines} approach is used for the discretization of the wave model. The spatial discretization yields a system of ordinary differential equations which can be expressed as a semidiscrete system.
 %
-We use the classical fourth order Explicit Runge-Kutta Method (ERK4). This algorithm is suitable for massive parallel computations via a data-parallel implementation where the spatial discretization terms are processed. As a means to introduce more concurrency into the time integration we consider for the first time the 'Parareal' algorithm as described in section \ref{ch7:parareal}.
+We use the classical fourth-order Explicit Runge-Kutta Method (ERK4). This algorithm is suitable for massive parallel computations via a data-parallel implementation where the spatial discretization terms are processed. As a means to introduce more concurrency into the time integration, we consider the Parareal algorithm as described in Section \ref{ch7:parareal}.
 For explicit time-integration schemes a Courant-Levy-Friedrichs (CFL) condition defines a necessary restriction for temporal stability of the form
 \begin{align}
 \Delta t\leq \frac{C}{\max_{n} |\lambda_n(\mathcal{J}_h)|},
 \end{align}
-with $C\in\mathbb{R}_+$ a CFL constant typically of size $\mathcal{O}(1)$ dependent on chosen scheme, and $\mathcal{J}_h\in\mathbb{R}^{2m\times2m}$, where $m=N_xN_y$, is a discrete Jacobian\index{Jacobian} matrix obtained by local linearization in time of \eqref{ch7:FSorigin}. For ERK4, $C=2\sqrt{2}$ if all eigenvalues are purely imaginary.
+with $C\in\mathbb{R}_+$ a CFL constant typically of size $\mathcal{O}(1)$ dependent on chosen scheme, and $\mathcal{J}_h\in\mathbb{R}^{2m\times2m}$, where $m=N_xN_y$ is a discrete Jacobian\index{Jacobian} matrix obtained by local linearization in time of \eqref{ch7:FSorigin}. For ERK4, $C=2\sqrt{2}$ if all eigenvalues are purely imaginary.
 
 To gain insight into necessary conditions for stability, we employ a linear stability analysis based on the semi-discrete linear system
 %
@@ -297,17 +304,17 @@ From these equations we find for the discrete block Jacobian operator
 \frac{\partial \phi}{\partial z}\Big|_{z=\eta} = \mathcal{J}_{12,h} \tilde{\phi}, \quad  \mathcal{J}_{12,h} = \mathcal{D}_{bb} - \mathcal{D}_{bi} \mathcal{A}_{ii}^{-1} \mathcal{A}_{ib}.
 \end{align}
 %
-As shown in \cite{ch7:RobertsonSherwin1999} the eigenvalues of $\mathcal{J}_{12,h}$ is related to the eigenvalues of the discrete Jacobian $\mathcal{J}_h$ through the following relationship
+As shown in \cite{ch7:RobertsonSherwin1999} the eigenvalues of $\mathcal{J}_{12,h}$ are related to the eigenvalues of the discrete Jacobian $\mathcal{J}_h$ through the following relationship
 %
 \begin{align}
 \lambda(\mathcal{J}_h) = \pm i \sqrt{ \lambda(\mathcal{J}_{12,h}) g }.
 \end{align}
 %
-and is all imaginary confirming the hyperbolic (energy-conserving) nature of the potential flow formulation. Thus, for a given discretization of the linearized equations, it is possible to compute the eigenvalues of the discrete block operator to determine the eigenspectrum of the full operator. A discrete analysis of the eigenvalues is given in \cite[Section 4.1]{ch7:EBL08}, but it is not clearly pointed out that in fact the discrete eigenspectrum is compact (bounded) for a fixed polynomial order in the vertical, i.e. that for a constant depth $h$
+and are all imaginary confirming the hyperbolic (energy-conserving) nature of the potential flow formulation. Thus, for a given discretization of the linearized equations, it is possible to compute the eigenvalues of the discrete block operator to determine the eigenspectrum of the full operator. A discrete analysis of the eigenvalues is given in \cite[Section 4.1]{ch7:EBL08}, but it is not clearly pointed out that in fact the discrete eigenspectrum is compact (bounded) for a fixed polynomial order in the vertical, i.e., that for a constant depth $h$
 \begin{align}
 \max|\lambda(\mathcal{J}_h)| = \lim_{kh\to\infty}|\lambda(\mathcal{J}_h)|\leq C(N_z)\sqrt{\frac{g}{h}}.
 \end{align}
-Similar results were reported for the first time in the context of high-order Boussinesq-type equations in \cite{ch7:ENG06,ch7:EHBM06}. This is an important practical property of the discrete scheme as it is favourable to numerical stability. It implies that the linear model is not severely limited by the spatial resolution in the horizontal for a specific choice of the number  of collocation nodes ($N_z$) in the vertical. This suggests that the model is quite robust due to insensitivity in the choice of time step, with the implication that local grid adaptivity can be used for improving spatial accuracy. Interestingly, for the unified potential flow model we find that this also holds for nonlinear simulations. Large time steps can be chosen when using dense grids and high-order numerics without severely degrading overall numerical stability and efficiency. This is confirmed in numerical experiments and demonstrated in figure \ref{ch7:numexp}. However, for very steep nonlinear waves and very densely clustered non-uniform grids, stability is found to be compromised without filtering. A proper filtering strategy can be used to remedy stability problems without destroying accuracy. 
+Similar results were reported for the first time in the context of high-order Boussinesq-type equations in \cite{ch7:ENG06,ch7:EHBM06} and recently it has been shown \cite{ch7:EE13} that widely used implicitly-implicit Boussinesq-type equations can be re-formulated to have bounded eigenspectra using high-order discretisation methods. This is an important practical property of the discrete scheme as it is favorable to numerical stability. It implies that the linear model is not severely limited by the spatial resolution in the horizontal for a specific choice of the number  of collocation nodes ($N_z$) in the vertical. This suggests that the model is quite robust due to insensitivity in the choice of time step, with the implication that local grid adaptivity can be used for improving spatial accuracy. Interestingly, for the unified potential flow model we find that this also holds for nonlinear simulations. Large time steps can be chosen when using dense grids and high-order numerics without severely degrading overall numerical stability and efficiency. This is confirmed in numerical experiments and demonstrated in Figure \ref{ch7:numexp}. However, for very steep nonlinear waves and very densely clustered nonuniform grids, stability is found to be compromised without filtering. A proper filtering strategy, e.g., based on a super collocation technique \cite{ch7Kirby03de-aliasingon}, can be used to remedy stability problems without destroying accuracy. 
 %
 \begin{figure}[!htb]
 \centering
@@ -315,7 +322,7 @@ Similar results were reported for the first time in the context of high-order Bo
 % MainLaplace2D_ex03.m
 \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/scalingNx25-eps-converted-to.pdf}
 }
-\subfigure[High-order spatial discretisation and stable explicit time-stepping with large time steps for a nonlinear standing wave. Scaling based on $a=0$. ]{
+\subfigure[High-order spatial discretization and stable explicit time-stepping with large time steps for a nonlinear standing wave. Scaling based on $a=0$. ]{
 % MainLaplace2D_ex03.m
 \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/standingwaveglozman-eps-converted-to.pdf}
 }
@@ -327,7 +334,7 @@ Similar results were reported for the first time in the context of high-order Bo
 % MainLaplace2D_ex035_nonlinearLaplace.m
 \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/SFwaves_snapshots_clustered-eps-converted-to.pdf}
 }
-\caption[Numerical experiments to assess stability properties of numerical wave model.]{Numerical experiments to assess stability properties of numerical wave model. In three cases, computed snapshots are taken of the wave elevation over one wave period of time. In a) the grid distribution of nodes in a one-parameter mapping for the grid is illustrated. Results from changes in wave elevation are illustrated for b) a mildly nonlinear standing wave on a highly clustered grid, c) regular stream function wave of medium steepness in shallow water $(kh,H/L)=(0.5,0.0292)$ on a uniform grid ($N_x=80$) and d) for a nonuniform grid with a minimal grid spacing 20 times smaller(!). In every case the step size remains fixed at $\Delta t = T/160$ s corresponding to a Courant number $C_r=c\tfrac{\Delta t}{\Delta x}=0.5$ for the uniform grid. A 6'$th$ order scheme and explicit EKR4 time-stepping is used in each test case.}
+\caption[Numerical experiments to assess stability properties of numerical wave model.]{Numerical experiments to assess stability properties of numerical wave model. In three cases, computed snapshots are taken of the wave elevation over one wave period of time. In (a) the grid distribution of nodes in a one-parameter mapping for the grid is illustrated. Results from changes in wave elevation are illustrated for (b) a mildly nonlinear standing wave on a highly clustered grid, (c) regular stream function wave of medium steepness in shallow water $(kh,H/L)=(0.5,0.0292)$ on a uniform grid ($N_x=80$), and (d) a nonuniform grid with a minimal grid spacing 20 times smaller(!). In every case the step size remains fixed at $\Delta t = T/160$ s corresponding to a Courant number $C_r=c\tfrac{\Delta t}{\Delta x}=0.5$ for the uniform grid. A sixth order scheme and explicit EKR4 time-stepping are used in each test case.}
 \label{ch7:numexp}
 \end{figure}
 %\newpage
@@ -335,56 +342,58 @@ Similar results were reported for the first time in the context of high-order Bo
 \subsection{Wave generation and absorption}
 \label{ch7:wavegen}
 
-To simulate waves using a numerical model, a general purpose technique for both generating and absorbing waves inside the finite numerical domain is needed. It is preferable that the technique is suitable for easy integration in a software library component setup. One such technique is the line relaxation\index{relaxation} method attributed to \cite{ch7:LD83}. This is a simple ad hoc technique sufficiently accurate for engineering purposes. It modifies the computed solution every time step during simulation by a post-processing step where the relaxed solution becomes
+To simulate waves using a numerical model, a general purpose technique for both generating and absorbing waves inside the finite numerical domain is needed. It is preferable that the technique is suitable for easy integration in a software library component setup. One such technique is the line relaxation\index{relaxation} method attributed to \cite{ch7:LD83}. This is a simple ad hoc technique sufficiently accurate for engineering purposes. It modifies the computed solution every time step during simulation by a postprocessing step where the relaxed solution becomes
 %determined as
 \begin{align}
 \label{ch7:eq:discreteupdate}
 g^*(x_i,t) = \Gamma(x_i)g(x_i,t) + (1-\Gamma(x_i))g_e(x_i,t), \quad x_i\in\Omega_\Gamma.
 \end{align}
-Here $g(x,t)$ is one of the free surface variables $\tilde{\phi},\eta$ at an instant in time and $0\leq \Gamma(x)\leq 1$ is a single-valued function within the relaxation region $x_i\in\Omega_\Gamma$. The first term acts as a sponge layer which is responsible for effectively dissipating energy inside a specified relaxation zone\index{relaxation!zone} $\Omega_\Gamma$. The terms containing $g_e(x,t)$, where $g_e$ is an analytical solution (e.g. such as stream function wave theory \cite{ch7:Dean1965}) act as source terms in the relaxation zone. This makes it possible to generate arbitrary waves accurately in the computational domain in accordance with an analytical representation of incident waves.
+Here $g(x,t)$ is one of the free surface variables $\tilde{\phi},\eta$ at an instant in time, and $0\leq \Gamma(x)\leq 1$ is a single-valued function within the relaxation region $x_i\in\Omega_\Gamma$. The first term acts as a sponge layer which is responsible for effectively dissipating energy inside a specified relaxation zone\index{relaxation!zone} $\Omega_\Gamma$. The terms containing $g_e(x,t)$, where $g_e$ is an analytical solution (e.g., such as the stream function wave theory \cite{ch7:Dean1965}), act as source terms in the relaxation zone. This makes it possible to generate arbitrary waves accurately in the computational domain in accordance with an analytical representation of incident waves.
 
-We can interpret \eqref{ch7:eq:discreteupdate} as a discrete update of the solution at an isolated spatial point inside a relaxation zone. We introduce the notation $g^n=g(x_i,t_n)$ and $g^{*,n+1}=g^*(x_i,t_{n+1})$ and assume that $t_{n+1}=t_n+\tau$ is an instant in time. Then we can rewrite \eqref{ch7:eq:discreteupdate} to motivate an analytical modification to time-dependent equations that can provide similar modification (forcing) in simulation.
+We can interpret \eqref{ch7:eq:discreteupdate} as a discrete update of the solution at an isolated spatial point inside a relaxation zone. We introduce the notations $g^n=g(x_i,t_n)$ and $g^{*,n+1}=g^*(x_i,t_{n+1})$ and assume that $t_{n+1}=t_n+\tau$ is an instant in time. Then we can rewrite \eqref{ch7:eq:discreteupdate} to motivate an analytical modification to time-dependent equations that can provide similar modification (forcing) in simulation.
 
-Subtract $g^n$ in \eqref{ch7:eq:discreteupdate} and divide by a pseudo time step size $\tau$ to obtain the equivalent form
+Subtracting $g^n$ in \eqref{ch7:eq:discreteupdate} and dividing by a pseudo time step size $\tau$, we obtain the equivalent form
 
 %%
 \begin{align}
 \frac{g^{*,n+1}-g^n}{\tau} =\frac{(1-\Gamma)}{\tau} (g_e^n-g^n).
 \end{align}
 %
-The first term is similar to a first order accurate Forward Euler\index{forward Euler} approximation of a rate of change term. This motivates an {\em embedded penalty forcing technique} based on adding a correction term of the form
+The first term is similar to a first-order accurate Forward Euler\index{forward Euler} approximation of a rate of change term. This motivates an {\em embedded penalty forcing technique} based on adding a correction term of the form
 %
 \begin{align}\label{ch7:eq:penalty}
 \partial_t g = \mathcal{N}(g) + \frac{1-\Gamma(x)}{\tau} (g_e(t,x)-g(t,x)), \quad {\bf x}\in\Omega_\Gamma,
 \end{align}
 %
-where $\mathcal{N}$ represents a general nonlinear operator for the right hand side. The immediate advantage is that a time stepping scheme can easily be interchanged in a model implementation. The added terms is a source term resulting in forcing inside relaxation zones when $g_e(t,x)\neq g(t,x)$ and $\Gamma(x)\neq1$ and otherwise has no effect. The strength of the forcing is influenced by the arbitrary parameter $\tau\in\mathbb{R}_+$ which can be defined to match the time scale of the dynamics. We have found that  $\tau\approx\Delta t$ works well, however, it is possible that a more optimal choice exist. Remark, a too small $\tau$ may degrade numerical stability of the model.
+where $\mathcal{N}$ represents a general nonlinear operator for the right-hand side. The immediate advantage is that a time-stepping scheme can easily be interchanged in a model implementation. The added term is a source term resulting in forcing inside relaxation zones when $g_e(t,x)\neq g(t,x)$ and $\Gamma(x)\neq1$ and otherwise has no effect. The strength of the forcing is influenced by the arbitrary parameter $\tau\in\mathbb{R}_+$ which can be defined to match the time scale of the dynamics. We have found that  $\tau\approx\Delta t$ works well, however, it is possible that a more optimal choice exist. Note that a too small $\tau$ may degrade the numerical stability of the model.
 
-A simple validation of the zones is shown in figure \ref{ch7:figstandwave} where waves are generated at the left wall, and propagate to the right wall, where reflection occur leading formation of standing waves du to resulting interaction with the incident waves inside the numerical wave tank.
+A simple validation of the zones is shown in Figure \ref{ch7:figstandwave} where waves are generated at the left wall and propagate to the right wall, where reflection occurs leading to formation of standing waves due to the resulting interaction with the incident waves inside the numerical wave tank.
 
-The following relaxation functions proposed in \cite{ch7:ENG06} guarantees continuity across interface of relaxation zone and computational domain and are used in simulations for, respectively, sponge layers and wave generation zones
+The following relaxation functions proposed in \cite{ch7:ENG06} guarantee continuity across the interface of the relaxation zone and computational domain and are used in simulations for sponge layers and wave generation zones, respectively
 %
 \begin{align}
 % \label{relaxfunc1}
-\Gamma_{s}(x)  = 1-(1-x)^p, \quad \Gamma_{g}(x) = -2x^3+3x^2, \quad x\in[0,1].
+\Gamma_{s}(\hat{x})  = 1-(1-\hat{x})^p, \quad \Gamma_{g}(\hat{x}) = -2\hat{x}^3+3\hat{x}^2, \quad \hat{x}\in[0,1]
 \label{ch7:relaxfunc2}
 \end{align}
 %
-The profiles can be reversed by a change of coordinate, i.e. $\Gamma(1-x)$, and scaled to interval sizes of interest. The first function satisfies the condition that any derivative at the left boundary vanishes at $x=1$. The first derivatives of the second function vanish at both ends. The relaxation zones are positioned appropriately where waves are to be both/either generated and/or absorbed. The rule of thumb is that a relaxation used for absorption has a spatial length of at least two wave lengths. For absorption zones, we find that this technique is more efficient in velocity formulation of the free surface equations often used in Boussinesq-type formulations, e.g., see \cite{ch7:ENG06,ch7:EHBM06,ch7:EHBW08} in comparison with scalar potential formulations \eqref{ch7:FSorigin}. However, similar performance can be achieved by merely increasing the length of relaxation zones in such regions. Demonstrations of the technique are seen in figure \ref{ch7:figstandwave} where vertical dashed lines indicates interfaces between relaxation zones and the computational region. Incident waves propagate from left to right in both examples.
+where $\hat{x}\equiv x/x_L$ is coordinate normalised by the length $x_L$ of the zone in question.
+
+The profiles can be reversed by a change of coordinate, i.e., $\Gamma(1-\hat{x})$, and scaled to interval sizes of interest. The first function satisfies the condition that any derivative at the left boundary vanishes at $\hat{x}=1$. The first derivatives of the second function vanish at both ends. The relaxation zones are positioned appropriately where waves are to be both/either generated and/or absorbed. A practical rule of thumb is that a relaxation used for absorption has a spatial length of at least two wave lengths. For absorption zones, we find that this technique is more efficient in velocity formulation of the free surface equations often used in Boussinesq-type formulations, e.g., see \cite{ch7:ENG06,ch7:EHBM06,ch7:EHBW08} in comparison with scalar potential formulations \eqref{ch7:FSorigin}. However, similar performance can be achieved by merely increasing the length of relaxation zones in such regions. Demonstrations of the technique are seen in Figure \ref{ch7:figstandwave} where vertical dashed lines indicate interfaces between relaxation zones and the computational region. Incident waves propagate from left to right in both examples.
 %
 \begin{figure}[!htb]
 \centering
 \subfigure[Wave generation, reflection and absorption of small-amplitude waves.]{
 % Script : MainLaplace2D_ex03penalityLINEAR_REFLECTEDWAVES.m
 \includegraphics[width=0.98\textwidth]{Chapters/chapter7/figures/standingwavespenalty-eps-converted-to.pdf}
-% Nx = 480, 6th order, vertical clustering, Nz=6;
+% Nx = 480, sixth order, vertical clustering, Nz=6;
 }
 \subfigure[Wave generation and absorption of steep finite-amplitude waves.]{
 % Script : MainLaplace2D_ex03penalityNONLINEAR_GENERATEWAVES.m
 \includegraphics[width=0.98\textwidth]{Chapters/chapter7/figures/nonlinearwavespenalty-eps-converted-to.pdf}
 % Nx = 540, 6th order, vertical clustering, Nz=6;
 }
-\caption[Snapshots at intervals $T/8$ over one wave period in time.]{Snapshots at intervals $T/8$ over one wave period in time of computed a) small-amplitude $(kh,kH)=(0.63,0.005)$ and b) finite-amplitude $(kh,kH)=(1,0.41)$ stream function waves elevations having reached a steady state after transient startup. Combined wave generation and absorption zones in the western relaxation zone of both a) and b). In b) an absorption zone is positioned next to the eastern boundary and causes minor visible reflections. }
+\caption[Snapshots at intervals $T/8$ over one wave period in time.]{Snapshots at intervals $T/8$ over one wave period in time of computed (a) small-amplitude $(kh,kH)=(0.63,0.005)$ and (b) finite-amplitude $(kh,kH)=(1,0.41)$ stream function waves elevations having reached a steady state after transient startup. Combined wave generation and absorption zones occur in the left relaxation zone of both (a) and (b). In b) an absorption zone is positioned next to the right boundary and causes minor visible reflections. }
 \label{ch7:figstandwave}
 \end{figure}
 
@@ -396,55 +405,51 @@ For the solution of sparse linear systems
 \label{ch7:linsyscompact}
 \mathcal{A}{\bf \Phi} = {\bf b}, \quad \mathcal{A}\in\mathbb{R}^{n\times n}, \quad {\bf b}\in\mathbb{R}^{n},
 \end{align}
-it is attractive to use iterative methods for large system sizes $n=N_xN_yN_z$ and for parallel implementations. Acceleration of suitable iterative methods can be done, e.g., by instead solving a left-preconditioned\index{preconditioning!left-preconditioning} system of the form %typical %A key challenge is preconditioning for acceleration of convergence of iterative methods.
+it is attractive to use iterative methods for large system sizes $n=N_xN_yN_z$ and for parallel implementations. Acceleration of suitable iterative methods can be done, e.g., by solving a left-preconditioned\index{preconditioning!left-preconditioning} system of the form %typical %A key challenge is preconditioning for acceleration of convergence of iterative methods.
 \begin{align}
 \label{ch7:eq:linsys}
 \mathcal{M}^{-1} ( \mathcal{A} {\bf \Phi} = {\bf b}), \quad \mathcal{M}\in\mathbb{R}^{n \times n},
 \end{align}
-where $\mathcal{M}$ is a preconditioned with the property that $\mathcal{M}^{-1}\approx \mathcal{A}^{-1}$ can be computed at low cost.
+where $\mathcal{M}$ is a preconditioner with the property that $\mathcal{M}^{-1}\approx \mathcal{A}^{-1}$ can be computed at low cost.
 
-The bottleneck problem in a unified potential flow model is the solution of a discrete $\sigma$-transformed Laplace problem stated in the compact forms \eqref{ch7:linsyscompact} or \eqref{ch7:eq:linsys}. It is attractive to find an efficient iterative strategy where convergence\index{convergence} is understood via a convergence theory, have modest storage requirements, have minimal global communication requirements (in the form of global inner products) and is suitable for flexible-order\index{flexible order} discretizations. The class of geometric Multigrid Methods\index{multigrid} fulfils these requirements and have shown to be among the most efficient iterative strategies for a wide class of problems \cite{ch7:Trottenberg01}. In particular, the time required to solve a system of linear equations to a given accuracy level can be made to scale proportional to the number of unknowns.
+The bottleneck problem in a unified potential flow model is the solution of a discrete $\sigma$-transformed Laplace problem stated in the compact forms \eqref{ch7:linsyscompact} or \eqref{ch7:eq:linsys}. It is attractive to find an efficient iterative strategy where convergence\index{convergence} is understood via a convergence theory, has modest storage requirements, has minimal global communication requirements (in the form of global inner products), and is suitable for flexible-order\index{flexible order} discretizations. The class of geometric Multigrid Methods\index{multigrid} fulfills these requirements and has shown to be among the most efficient iterative strategies for a wide class of problems \cite{ch7:Trottenberg01}. In particular, the time required to solve a system of linear equations to a given accuracy level can be made to scale proportional to the number of unknowns.
 
-There are several known approaches to multigrid methods \cite{ch7:MR744926} for high-order discretizations. Among these, Defect Correction\index{defect correction} Methods (DCMs) \cite{ch7:Stetter1978,ch7:AuzingerStetter1982,ch7:Hackbusch1982,ch7:Auzinger1987,ch7:Trottenberg01} have been employed successfully, e.g., in Computational Fluid Dynamics \cite{ch7:LaytonEtAl2002}, in numerical simulations since the early 1970s.  
-The fundamental idea of DCMs is to combine the good stability properties of low-order discretizations with high-order accuracy discretizations for explicit residual evaluations. These iterative methods impose low storage requirements and have scalable work effort under suitable choices of preconditioning strategies and may be accelerated using a multigrid method based on low-order discretizations, while still achieving high-order accuracy.
+There are several known approaches to multigrid methods \cite{ch7:MR744926} for high-order discretizations. Among these, Defect Correction\index{defect correction} Methods (DCMs) \cite{ch7:Stetter1978,ch7:AuzingerStetter1982,ch7:Hackbusch1982,ch7:Auzinger1987,ch7:Trottenberg01} have been employed successfully, e.g., in computational fluid dynamics \cite{ch7:LaytonEtAl2002}, in numerical simulations since the early 1970s.  
+The fundamental idea of DCMs is to combine the good stability properties of low-order discretizations with high-order accuracy discretizations for explicit residual evaluations. These iterative methods impose low storage requirements, have scalable work effort under suitable choices of preconditioning strategies, and may be accelerated using a multigrid method based on low-order discretizations while still achieving high-order accuracy.
 Furthermore, it has been shown, that the rate of convergence of DCM combined with standard multigrid methods can achieve rates of convergence corresponding to the most efficient multigrid methods~\cite{ch7:Auzinger1987}.
 
-Therefore, for the efficient and scalable solution of the unified potential flow model, we have recently \cite{ch7:EngsigKarupEtAl2011} proposed a Preconditioned Defect Correction (PDC) Method for efficient iterative low-storage solution of high-order accurate discretization of the $\sigma$-transformed Laplace problem \eqref{ch7:TransformedLaplace}. The proposed strategy can be seen as a generalization of the multigrid strategy proposed by \cite{ch7:LiFleming1997}. The PDC method enables significant improvement of overall efficiency and accuracy with the preconditioning based on a second-order linearized version of the full coefficient matrix $\mathcal{A}$ as described in \cite{ch7:EBL08}.
+Therefore, for the efficient and scalable solution of the unified potential flow model, we have recently \cite{ch7:EngsigKarupEtAl2011} proposed a Preconditioned Defect Correction (PDC) method for efficient iterative low-storage solution of high-order accurate discretization of the $\sigma$-transformed Laplace problem \eqref{ch7:TransformedLaplace}. The proposed strategy can be seen as a generalization of the multigrid strategy proposed by \cite{ch7:LiFleming1997}. The PDC method enables significant improvement of overall efficiency and accuracy with the preconditioning based on a second-order linearized version of the full coefficient matrix $\mathcal{A}$ as described in \cite{ch7:EBL08}.
 
  Starting from some initial guess ${\bf \Phi}^{[0]}\in\mathbb{R}^n$, the PDC method for solving \eqref{ch7:eq:linsys} can be stated compactly as a three-step recurrence for $k=1,2,...$
 \begin{align}
 \label{ch7:eq:defectcorrectionprocedure}
 \Phi^{[k]} = \Phi^{[k-1]} + \delta^{[k-1]}, \;\; \delta^{[k-1]}=\mathcal{M}^{-1} {\bf r}^{[k-1]}, \;\; {\bf r}^{[k-1]}={\bf  b} - \mathcal{A}\Phi^{[k-1]},
 \end{align}
-where $\Phi^{[k]},{\boldsymbol \delta}^{[k]},{\bf r}^{[k]}\in\mathbb{R}^n$ are, respectively, the approximate solution, the defect (preconditioned residual) and the residual of \eqref{ch7:eq:linsys} at the $k$'th iteration. The algorithm can be speedup by using mixed-precision calculations on modern many-core hardware as demonstrated in \cite{ch7:Glimberg2011}.  
+where $\Phi^{[k]},{\boldsymbol \delta}^{[k]},{\bf r}^{[k]}\in\mathbb{R}^n$ are the approximate solution, the defect (preconditioned residual), and the residual of \eqref{ch7:eq:linsys} at the $k$th iteration, respectively. The algorithm can be speeded up by using mixed-precision calculations on modern many-core hardware as demonstrated in \cite{ch7:Glimberg2011}.  
 
 \subsection{Distributed computations via domain decomposition}\label{ch7:sec:dd}\index{domain decomposition}
 
-Numerical modelling of large ocean areas to account for nonlinear wave-wave interactions and wave-structure interactions require large degrees of spatial resolution, significant computational resources and parallel computations to be practical. The recent generations of programmable GPUs are heavily optimized for on-chip bandwidth performance but not capacity. This implies that for the solution of large-scale PDE problems, distributed computing on multiple GPU devices is required due to limited capacity in the global memory space of current GPUs. Via a combination of MPI and CUDA we have recently demonstrated how both small and large systems can be solved efficiently by heterogenous computations using a data domain decomposition technique in parallel \cite{ch7:GlimbergEtAl2012}. The idea is to distribute the computational tasks to multiple GPUs, to enable reduced computational times and increased problem sizes. A homogenous partitioning of the data is used to ensure that the load balance across the GPUs is uniform. Data distribution and message passing introduce a data transfer bottleneck in the form of the PCIe link and network interconnection. Thus, if the computational intensity of the local problem is not large enough to enable sufficient latency hiding of this bottleneck, the whole application are likely to be (severely) limited by the PCIe link or network bandwidth performance rather than the high on-chip bandwidth of the individual GPUs.
+Numerical modeling of large ocean areas to account for nonlinear wave-wave interactions and wave-structure interactions requires large degrees of spatial resolution, significant computational resources, and parallel computations to be practical. The recent generations of programmable GPUs are heavily optimized for on-chip bandwidth performance but not capacity. This implies that for the solution of large-scale PDE problems, distributed computing on multiple GPU devices is required due to limited capacity in the global memory space of current GPUs. Via a combination of MPI and CUDA we have recently demonstrated how both small and large systems can be solved efficiently by heterogenous computations using a data domain decomposition technique in parallel \cite{ch7:GlimbergEtAl2012}. The idea is to distribute the computational tasks to multiple GPUs, to enable reduced computational times and increased problem sizes. A homogenous partitioning of the data is used to ensure that the load balance across the GPUs is uniform. Data distribution and message passing introduce a data transfer bottleneck in the form of the Peripheral Component Interconnect express (PCIe) link and network interconnection. Thus, if the computational intensity of the local problem is not large enough to enable sufficient latency hiding of this bottleneck, the whole application is likely to be (severely) limited by the PCIe link or network bandwidth performance rather than the high on-chip bandwidth of the individual GPUs.
 
-The ratio between necessary data transfers and computational work for the proposed numerical model for free surface water waves is high enough to expect reasonable latency hiding. The data domain decomposition method consists of a logically structured division of the computational domain into multiple subdomains. Each of these subdomains are connected via fictitious ghost layers at the artificial boundaries of width corresponding to the half-width of the finite difference stencils employed. This results in a favourable volume-to-boundary ratio as the problem size increases, diminishing communication overhead for message passing. Information between subdomains are exchanged through ghost layers at every step of the iterative PDC method, in connection with the matrix-vector evaluation for the $\sigma$-transformed Laplace problem, and before relaxation steps in the multigrid method. A single global synchronization point occur at most once each iteration, if convergence is monitored, where a global reduction step (inner product) between all processor nodes takes place. The main advantage of this decomposition strategy is, that the decomposition into multiple subdomains is straightforward. However, it comes with the cost of extra data transfers to update the set of fictitious ghost layers.
+The ratio between necessary data transfers and computational work for the proposed numerical model for free surface water waves is high enough to expect reasonable latency hiding. The data domain decomposition method consists of a logically structured division of the computational domain into multiple subdomains. Each of these subdomains is connected via fictitious ghost layers at the artificial boundaries of width corresponding to the half-width of the finite difference stencils employed. This results in a favorable volume-to-boundary ratio as the problem size increases, and diminishing communication overhead for message passing. Information between subdomains is exchanged through ghost layers at every step of the iterative PDC method, in connection with the matrix-vector evaluation for the $\sigma$-transformed Laplace problem, and before relaxation steps in the multigrid method. A single global synchronization point occurs at most once each iteration, if convergence is monitored, where a global reduction step (inner product) between all processor nodes takes place. The main advantage of this decomposition strategy is that the decomposition into multiple subdomains is straightforward. However, it comes with the cost of extra data transfers to update the set of fictitious ghost layers.
 
 \begin{figure}[!htb]
     \setlength\figureheight{0.30\textwidth}
     \setlength\figurewidth{0.33\textwidth}
     \begin{center}
-    \subfigure[Test environment 3.]{
+    \subfigure[Test environment 4.]{
     {\scriptsize\input{Chapters/chapter7/figures/TeslaK20PerformanceScaling3D.tikz}}
     }
-    \subfigure[Test environment 3.]{
+    \subfigure[Test environment 4.]{
     {\scriptsize\input{Chapters/chapter7/figures/TeslaK20SpeedupGPUvsCPU3D.tikz}}
     }
     \end{center}
-    \caption[Performance timings per PDC iteration as a function of increasing problem size $N$, for single, mixed and double precision arithmetics.]{Performance timings per PDC iteration as a function of increasing problem size $N$, for single, mixed and double precision arithmetics. Three dimensional nonlinear waves, using $6^{th}$ order finite difference approximations, preconditioned with one multigrid V-cycle and one pre- and post- Red-black Gauss-Seidel smoothing. Speedup compared to fastest known serial implementation. Using Test environment 3. CPU timings represent starting point for our investigations and has been obtained using Fortran 90 code and is based on a single-core run on a Intel Core i7, 2.80GHz processor.}\label{ch7:fig:perftimings}
+    \caption[Performance timings per PDC iteration as a function of increasing problem size $N$, for single, mixed and double precision arithmetics.]{Performance timings per PDC iteration as a function of increasing problem size $N$, for single, mixed, and double precision arithmetics. Three-dimensional nonlinear waves, using sixth order finite difference approximations, preconditioned with one multigrid V-cycle and with one pre- and post- red-black Gauss-Seidel smoothing operation. Speedup compared to fastest known serial implementation. Using Test environment 4, CPU timings represent starting points for our investigations and have been obtained using the Fortran 90 code. These references results are based on a single-core (non-parallel) run on a Intel Core i7, 2.80GHz processor.}\label{ch7:fig:perftimings}
 \end{figure}
 
-The parallel domain decomposition solver has been validated against the sequential solvers with respect to algorithmic efficiency to establish that the code produce correct results. An analysis of the numerical efficiency have also been carried out on different GPU systems to identify comparative behaviors as both the problems sizes and number of compute nodes vary. For example, performance scalings on Test environment 1 and Test environment 2 are presented in figure \ref{ch7:fig:multigpuperformance}. The figure confirms that there is only a limited benefit from using multiple GPUs for small problem sizes, since the computational intensity is simply too low to efficiently hide the latency of message passing. A substantial speedup is achieved compared to the single GPU version, while being able to solve even larger systems.
+The parallel domain decomposition solver has been validated against the sequential solvers with respect to algorithmic efficiency to establish that the code produces correct results. An analysis of the numerical efficiency has also been carried out on different GPU systems to identify comparative behaviors as both the problems sizes and number of compute nodes vary. For example, performance scalings on Test environment 1 and Test environment 3 are presented in Figure \ref{ch7:fig:multigpuperformance}. The figure confirms that there is only a limited benefit from using multiple GPUs for small problem sizes, since the computational intensity is simply too low to efficiently hide the latency of message passing. A substantial speedup is achieved compared to the single GPU version, while being able to solve even larger systems.
 With the linear scaling of memory requirements and improved computational speed, the methodology based on multiple GPUs makes it possible to simulate water waves in very large numerical wave tanks with improved performance.
 
-
-
-
-
 \begin{figure}[!htb]
     \setlength\figureheight{0.4\textwidth}
     \setlength\figurewidth{0.68\textwidth}
@@ -452,28 +457,28 @@ With the linear scaling of memory requirements and improved computational speed,
     \subfigure[Test environment 1.]{
     {\scriptsize\input{Chapters/chapter7/figures/GTX590MultiGPUScaling3D.tikz}}
     }
-    \subfigure[Test environment 2.]{
+    \subfigure[Test environment 3.]{
     {\scriptsize\input{Chapters/chapter7/figures/TeslaM2050MultiGPUScaling3D.tikz}}
     }
     \end{center}
-    \caption[Domain decomposition performance on multi-GPU systems.]{Domain decomposition performance on multi-GPU systems. Performance timings per PDC iteration as a function of increasing problem sizes using single precision. Same setup as in figure \ref{ch7:fig:perftimings}.}
+    \caption[Domain decomposition performance on multi-GPU systems.]{Domain decomposition performance on multi-GPU systems. In single precision, performance timings per PDC iteration are a function of increasing problem sizes. Same setup as in Figure \ref{ch7:fig:perftimings}.}
     \label{ch7:fig:multigpuperformance}
 \end{figure}
 
-Future work involves extending the domain decomposition method to include support for more general curvilinear boundary-fitted meshes for representing the free surface plane and include bottom-mounted structures which is typically encountered in near-costal areas. This will enable opportunities to resolve wave-structure interactions such as those encountered in large harbour regions and isolated human-made structures in offshore regions, e.g., legs of oil rigs or offshore windmill foundations.
+Future work involves extending the domain decomposition method to include support for more general curvilinear boundary-fitted meshes for representing the free surface plane and to include bottom-mounted structures which are typically encountered in near-costal areas. This will enable opportunities to resolve wave-structure interactions such as those encountered in large harbor regions and isolated human-made structures in offshore regions, e.g., legs of oil rigs or offshore windmill foundations.
 
 \subsection{Assembling the wave model from library components}
 
-It is described in chapter \ref{ch5} how we have developed a heterogenous library has for fast prototyping of PDE solvers, utilizing the massively parallel architecture of CUDA-enabled GPUs. The objective is to provide a set of generic components within a single framework, such that software developers can assemble application specific solvers efficiently at a high abstraction level, requiring a minimum of CUDA specific kernel implementations and parameter tuning.
+It is described in Chapter \ref{ch5} how we have developed a heterogenous library has for fast prototyping of PDE solvers, utilizing the massively parallel architecture of CUDA-enabled GPUs. The objective is to provide a set of generic components within a single framework, such that software developers can assemble application-specific solvers efficiently at a high abstraction level, requiring a minimum of CUDA specific kernel implementations and parameter tuning.
 
-The CUDA-based numerical wave model has been developed based on all the numerical techniques described in preceding sections. These techniques are a part of the library implemented as generic components, which makes them useful for the numerical solutions of partial differential equation (PDE) systems. The components of the numeral model as described in section \ref{ch7:sec:nummodel} is an ERK4 time integrator, flexible-order finite difference approximations on regular grids, and an iterative multigrid PDC solver for the $\sigma$-transformed Laplace equation \eqref{ch7:TransformedLaplace}. Application developers can either use these components directly from the library or they are free to combine their own implementations with library components, if they need alternative strategies that are not present in the library.
+The CUDA-based numerical wave model has been developed based on all the numerical techniques described in preceding sections. These techniques are a part of the library implemented as generic components, which makes them useful for the numerical solutions of PDE systems. The components of the numeral model as described in Section \ref{ch7:sec:nummodel} include an ERK4 time integrator, flexible-order finite difference approximations on regular grids, and an iterative multigrid PDC solver for the $\sigma$-transformed Laplace equation \eqref{ch7:TransformedLaplace}. Application developers can use either these components directly from the library or are free to combine their own implementations with library components, if they need alternative strategies that are not present in the library.
 
-For the unified potential flow model the user will need to provide implementations of the following components; the right hand side operator for the semi-discrete free surface variables \eqref{ch7:FSorigin}, the matrix-vector operator for the discretized $\sigma$-transformed Laplace equation \eqref{ch7:TransformedLaplace}, a smoother for the multigrid relaxation step, and the potential flow solver itself, that reads initial data and advance the solution in time. In order to make the library as generic as possible, all components are template-based, which makes it possible to assemble the PDE solver by combining type definitions in the preamble of the application. An excerpt of the potential flow assembling is given in listing \ref{ch7:lst:solversetup}.
+For the unified potential flow model the user will need to provide implementations of the following components: the right-hand side operator for the semidiscrete free surface variables \eqref{ch7:FSorigin}, the matrix-vector operator for the discretized $\sigma$-transformed Laplace equation \eqref{ch7:TransformedLaplace}, a smoother for the multigrid relaxation step, and the potential flow solver itself, that reads initial data and advances the solution in time. In order to make the library as generic as possible, all components are template-based, which makes it possible to assemble the PDE solver by combining type definitions in the preamble of the application. An excerpt of the potential flow assembling is given in Listing \ref{ch7:lst:solversetup}.
 
-\lstset{caption={Generic assembling of the potential flow solver for fully nonlinear free surface water waves.}
+\lstset{label=ch7:lst:solversetup,caption={generic assembling of the potential flow solver for fully nonlinear free surface water waves}
 %,basicstyle=\scriptsize
 }
-\begin{lstlisting}[label=ch7:lst:solversetup]
+\begin{lstlisting}
 // Basics
 typedef double value_type;
 typedef gpulab::grid<value_type> vector_type;
@@ -503,12 +508,12 @@ typedef free_surface::potential_flow_solver_types<
 typedef free_surface::potential_flow_solver_3d<potential_flow_types> potential_flow_solver_type;
 \end{lstlisting}
 
-Hereafter, the potential flow solver is aware of all component types that should be used to solve the entire PDE system, and it will be easy for developers to exchange parts at later times. The \texttt{laplace\_sigma\_stencil\_3d} class implements both the matrix-vector and right hand side operator. The flexible-order finite difference kernel for the matrix-free matrix-vector product for the two-dimensional Laplace problem is presented in listing \ref{ch7:lst:fd2d}. Library macros and reusable kernel routines are used throughout the implementations to enhance developer productivity and hide hardware specific details. This kernel can be used both for matrix-vector products for the original system and for the preconditioning.
+Hereafter, the potential flow solver is aware of all component types that should be used to solve the entire PDE system, and it will be easy for developers to exchange parts at later times. The \texttt{laplace\_sigma\_stencil\_3d} class implements both the matrix-vector and right-hand side operator. The flexible-order finite difference kernel for the matrix-free matrix-vector product for the two-dimensional Laplace problem is presented in Listing \ref{ch7:lst:fd2d}. Library macros and reusable kernel routines are used throughout the implementations to enhance developer productivity and hide hardware specific details. This kernel can be used both for matrix-vector products for the original system and for the preconditioning.
 
-\lstset{caption={CUDA kernel implementation for the two dimensional finite difference approximation to the transformed Laplace equation.}
+\lstset{label=ch7:lst:fd2d,caption={CUDA kernel implementation for the two dimensional finite difference approximation to the transformed Laplace equation}
 %,basicstyle=\scriptsize\ttfamily
 }
-\begin{lstlisting}[label=ch7:lst:fd2d]
+\begin{lstlisting}
 template <typename value_type, typename size_type>
 __global__ void laplace_sigma_transformed(
       value_type* out            , value_type const* p
@@ -574,7 +579,7 @@ __global__ void laplace_sigma_transformed(
 		dpds  /= ds;
 		dpdss /= (ds*ds);
 
-		// Calculate dpdxds
+		// Calculate dp/dxds
 		value_type dpdxds = values<value_type>::zero();
 		for(size_type ss = 0; ss<rank; ++ss)
 		{
@@ -594,11 +599,11 @@ __global__ void laplace_sigma_transformed(
 }
 \end{lstlisting}
 
-In a similar template-based approach, the kernel for the right hand side operator of the two dimensional problem is implemented and listed in Listing \ref{ch7:lst:rhs2d}. The kernel computes the right hand side updates for both surface variables, $\eta$ and $\tilde{\phi}$, and applies an embedded penalty forcing \eqref{ch7:eq:penalty}, for all nodes within generation or absorption zones. The penalty forcing functions are computed based on linear or non-linear wave theory in a separate device function.
+In a similar template-based approach, the kernel for the right-hand side operator of the two-dimensional problem is implemented and given in Listing \ref{ch7:lst:rhs2d}. The kernel computes the right-hand side updates for both surface variables, $\eta$ and $\tilde{\phi}$, and applies an embedded penalty forcing, cf. \eqref{ch7:eq:penalty}, for all nodes within generation or absorption zones. The penalty forcing functions are computed based on linear or nonlinear wave theory in a separate device function.
 
-\lstset{caption={CUDA kernel implementation for the 2D right hand side.}%,basicstyle=\scriptsize\ttfamily
+\lstset{label=ch7:lst:rhs2d,caption={CUDA kernel implementation for the 2D right-hand side}%,basicstyle=\scriptsize\ttfamily
 }
-\begin{lstlisting}[label=ch7:lst:rhs2d]
+\begin{lstlisting}
 template <typename value_type, typename size_type>
 __global__ void rhs(value_type const* p    , value_type const* p_surf
                   , value_type const* eta  , value_type* dp_surf_dt
@@ -644,7 +649,7 @@ __global__ void rhs(value_type const* p    , value_type const* p_surf
 		}
 		dpdx /= dx;
 
-        // Update surface variables rhs
+        // Update right-hand side function
 		dp_surf_dt[j] = - g*eta[j] - 0.5*(dpdx*dpdx - w*w*(1.0 + (detax*detax)));
 		deta_dt[j]    = - detax*dpdx + w*(1.0 + (detax*detax));
 
@@ -664,12 +669,12 @@ __global__ void rhs(value_type const* p    , value_type const* p_surf
 
 \section{Properties of the numerical model}
 
-We now consider different properties of the numerical model in order to shed light on unique features and limits of the model with respect to maritime engineering applications. The presented results extend and complements earlier studies \cite{ch7:BinghamZhang2007,ch7:EBL08} for the same model. In particular, we seek to highlight that the properties are tuneable to the practical applications of interest through proper choice of discretization parameters and therefore also provide details of numerical properties.
+We now consider different properties of the numerical model in order to shed light on unique features and limits of the model with respect to maritime engineering applications. The presented results extend and complement earlier studies \cite{ch7:BinghamZhang2007,ch7:EBL08} for the same model. In particular, we seek to highlight that the properties are tunable to the practical applications of interest through proper choice of discretization parameters, and we therefore also provide details of numerical properties.
 
 \subsection{Dispersion and kinematic properties}\index{dispersion}\index{kinematic}
 \label{ch7:dispkin}
 
-The dispersion and kinematic properties of the unified model is determined by the tuneable discretization parameters and should in general be chosen for specific problems. For assessment of errors, we introduce the metrics proposed in \cite{ch7:MBS03}
+The dispersion and kinematic properties of the unified model are determined by the tunable discretization parameters and should in general be chosen for specific problems. For assessment of errors, we introduce the metrics proposed in \cite{ch7:MBS03}
 \begin{subequations}
 \begin{align}
 E_c(N_x,N_z,kh,h/L) & = \frac{c^2}{c_s^2},
@@ -678,7 +683,7 @@ E_c(N_x,N_z,kh,h/L) & = \frac{c^2}{c_s^2},
 E_m(N_x,N_z,kh,h/L) &= \frac{1}{h}\int_{-h}^\eta \left( \frac{\phi(z)-\phi_s(z)}{\phi_s(z)} \right)^2 dz,
 \end{align}
 \end{subequations}
-where $m$ is one of the scalar functions $\phi,u,w$ describing kinematics, $c$ is the numerical phase celerity of regular waves and $c_s=\sqrt{g\tanh(kh)/k}$ is the exact phase celerity according to linear Stokes Theory \cite{ch7:SJ01}.  Measurements of the error are taken in the vertical below the crest of a wave which is well-resolved in the horizontal direction.
+where $m$ is one of the scalar functions $\phi,u,w$ describing kinematics, $c$ is the numerical phase celerity of regular waves, and $c_s=\sqrt{g\tanh(kh)/k}$ is the exact phase celerity according to linear Stokes Theory \cite{ch7:SJ01}.  Measurements of the errors are taken in the vertical below the crest of a wave which is well resolved in the horizontal direction.
 
 % kinematicAnalysis.m
 \begin{figure}[!htb]
@@ -712,23 +717,25 @@ $N_z\in[6,12]$. Sixth order scheme.}
 \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/kinematicsW_Nx30-HL90-p6_Nonlinear-eps-converted-to.pdf}
 }
 \end{center}
-\caption[Assessment of kinematic error is presented in terms of the depth-averaged error.]{Assessment of kinematic error is presented in terms of the depth-averaged error determined by \eqref{ch7:errkin} for a) scalar velocity potential and b) vertical velocity for a small-amplitude (linear) wave, and c) scalar velocity potential and d) vertical velocity for a finite-amplitude (nonlinear) wave with wave height $H/L=90\%(H/L)_\textrm{max}$.
+\caption[Assessment of kinematic error is presented in terms of the depth-averaged error.]{Assessment of kinematic error is presented in terms of the depth-averaged error determined by \eqref{ch7:errkin} for, (a) scalar velocity potential, (b) vertical velocity for a small-amplitude (linear) wave, (c) scalar velocity potential, and (d) vertical velocity for a finite-amplitude (nonlinear) wave with wave height $H/L=90\%(H/L)_\textrm{max}$.
 $N_z\in[6,12]$. Sixth order scheme. Clustered vertical grid. }
 \label{ch7:figlinear2}
 \end{figure}
 
-The accuracy of the dispersion and kinematic properties are found to be excellent with small differences between the accuracy of computed profiles for  linear and nonlinear waves. Figure \ref{ch7:figlinear} shows curves from a discrete analysis of how the accuracy can be improved by increasing the number of nodes in the vertical for a) uniform grids and b) cosine-clustered grids. Similarly, figure \ref{ch7:figlinear2} shows how the accuracy of the kinematic properties of the model can be controlled by choosing an appropriate number of co-sine clustered vertical collocation points.
+The accuracy of the dispersion and kinematic properties are found to be excellent with small differences between the accuracy of computed profiles for  linear and nonlinear waves. Figure \ref{ch7:figlinear} shows curves from a discrete analysis of how the accuracy can be improved by increasing the number of nodes in the vertical for (a) uniform grids and (b) cosine-clustered grids. Similarly, Figure \ref{ch7:figlinear2} shows how the accuracy of the kinematic properties of the model can be controlled by choosing an appropriate number of cosine clustered vertical collocation points.
 
 %\section{Verification and Validation}
 \section{Numerical experiments}
 
-The numerical model detailed has been subject to careful verification and validation utilizing a range of standard benchmarks, cf. \cite{ch7:EBL08,ch7:EngsigKarupEtAl2011}. Here we exclusively focus on properties and performance of the numerical wave model. We provide several new results that highlights possibilities for acceleration of the wave model via simple and readily applicable techniques that works well on massively parallel hardware. Finally, we describe how we can extend the implementation of the wave model into a novel GPU implementation of a linear ship-wave model for fast hydrodynamics calculations.
+The numerical model detailed has been subject to careful verification and validation utilizing a range of standard benchmarks, cf. \cite{ch7:EBL08,ch7:EngsigKarupEtAl2011}. Here we exclusively focus on properties and performance of the numerical wave model. We provide several new results that highlight possibilities for acceleration of the wave model via simple and readily applicable techniques that work well on massively parallel hardware. Finally, we describe how we can extend the implementation of the wave model into a novel GPU implementation of a linear ship-wave model for fast hydrodynamics calculations.
 
 \subsection{On precision requirements in engineering applications}\index{precision}
 
-Practical engineering applications are widely used for analysis purposes and give support to decision making in engineering design. For engineering purposes the turn-around time for producing analysis results is of crucial importance as it affects cost-benefit of work efforts. The key interest is often just 'engineering accuracy' in computed end results which suggest that we can do with less precision in calculations. One may ask what is the precision requirements for engineering applications? In a recent study \cite{ch7:EngsigKarupEtAl2011,ch7:Glimberg2011}, it is shown that the PDC method when executed on GPUs can be utilized to efficiently solve water wave problems. This was done by trading accuracy for speed in parts of the PDC algorithm, e.g. by using either single or mixed-precision\index{precision!mixed} computations. Without preconditioning the PDC method reduces to a classical iterative refinement technique, which is known to be fault tolerant \cite{ch7:Higham:2002:ASN}.
+Practical engineering applications are widely used for analysis purposes and give support to decision making in engineering design. For engineering purposes the turn-around time for producing analysis results is of crucial importance as it affects cost-benefit of work efforts. The key interest is often just "engineering  accuracy" in computed end results which suggests that we can do with less precision in calculations. One may ask: {\em what are the precision requirements for engineering applications?} 
+
+In a recent study \cite{ch7:EngsigKarupEtAl2011,ch7:Glimberg2011}, it was shown that the PDC method when executed on GPUs can be utilized to efficiently solve water wave problems. This was done by trading accuracy for speed in parts of the PDC algorithm, e.g., by using either single, or mixed-precision\index{precision!mixed} computations. Without preconditioning the PDC method reduces to a classical iterative refinement technique, which is known to be fault tolerant \cite{ch7:Higham:2002:ASN}.
 
-Previously reported performance results for the wave model can be taken a step further. We seek to demonstrate how single precision computations can be used for engineering analysis without significantly affecting accuracy in final computational results. At the same time improvements in computational speed can be as much as a factor of two for large problems as a direct result of reduced data transfer, cf. figure \ref{ch7:fig:perftimings}. Therefore, in pursue of high performance, it is of interest to exploit the reduced data transfers associated with replacing double precision with single precision floating point calculations. In a well organized code this step can be taken with minimal programming effort.
+Previously reported performance results for the wave model can be taken a step further. We seek to demonstrate how single precision computations can be used for engineering analysis without significantly affecting accuracy in final computational results. At the same time improvements in computational speed can be almost a factor of two for large problems as a direct result of reduced data transfer, cf. Figure \ref{ch7:fig:perftimings}. Therefore, in pursue of high performance, it is of interest to exploit the reduced data transfers associated with replacing double-precision with single-precision floating point calculations. In a well-organized code this step can be taken with minimal programming effort.
 %
 \begin{figure}[!htb]
 \begin{center}
@@ -740,23 +747,23 @@ Previously reported performance results for the wave model can be taken a step f
 \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/PrecisionDOUBLE-eps-converted-to.pdf}
 }
 \end{center}
-\caption[Comparison between convergence histories for single and double precision computations using a PDC method for the solution of the transformed Laplace problem.]{Comparison between convergence histories for single and double precision computations using a PDC method for the solution of the transformed Laplace problem. Very steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0903)$. Discretizaiton based on $(N_x,N_z)=(15,9)$ with 6'$th$ order stencils.}
+\caption[Comparison between convergence histories for single- and double-precision computations using a PDC method for the solution of the transformed Laplace problem.]{Comparison between convergence histories for single- and double-precision computations using a PDC method for the solution of the transformed Laplace problem. Estimated errors are qualitatively very close to the algebraic errors. Very steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0903)$. Discretization based on $(N_x,N_z)=(15,9)$ with sixth order stencils.}
 \label{ch7:convhist}
 \end{figure}
 
-Most scientific applications \cite{ch7:lessismore} use double precision calculations to minimize accumulation of round-off errors, to employ higher precision for ill-conditioned problems, or to stabilize critical sections in the code that requires higher precision. The main reason is that roundoff-errors tend to accumulate slower when higher precision is used, thereby avoiding significant losses of accuracy due to round-off errors. Paradoxically, for many computational tasks, the need for high precision connected with the above-mentioned restrictions do not apply at all, or only apply to a portion of the tasks. In addition, on modern hardware there can be relative differences in peak floating point performance of up to about one order of magnitude in favor of single precision over double precision math calculations depending on choice of hardware architecture. However, for bandwidth bound applications, the key performance metric is not floating point performance, but rather bandwidth performance. In both cases, data transfer can be effectively halved by switching to single precision storage, in which case bandwidth performance increases and at the same time makes it possible to feed floating point units with data at effectively twice the speed. If maximizing performance is an ultimate goal, such considerations suggests that it can be possible to compute faster by using single over double precision arithmetics fi accuracy requirements can be fulfilled.
+Most scientific applications \cite{ch7:lessismore} use double-precision calculations to minimize accumulation of round-off errors, to employ higher precision for ill-conditioned problems, or to stabilize critical sections in the code that require higher precision. Round-off errors tend to accumulate more slowly when higher precision is used, thereby avoiding significant losses of accuracy due to round-off errors. Paradoxically, for many computational tasks, the need for high precision connected with the above-mentioned restrictions does not apply at all, or only applies to a portion of the tasks. In addition, on modern hardware there can be relative differences in peak floating point performance of up to about one order of magnitude in favor of single-precision over double-precision math calculations depending on choice of hardware architecture. However, for bandwidth-bound applications, the key performance metric is not floating-point performance, but rather bandwidth performance. In both cases, data transfer can be effectively halved by switching to single precision storage, in which case bandwidth performance increases and at the same time makes it possible to feed floating point units with data at effectively twice the speed. If maximizing performance is an ultimate goal, such considerations suggests that it can be possible to compute faster by using single- over double-precision arithmetics if accuracy requirements can be fulfilled.
 %
 
-As a simple illustrative numerical experiment, we can consider the iterative solution of \eqref{ch7:eq:linsys} using the PDC method using, respectively, single and double precision math. As a simple test case, we consider the solution of periodic stream function waves in two spatial dimensions. Computed convergence histories are presented in figure \ref{ch7:convhist} where it is clear that the main difference is in the attainable accuracy level achievable before stagnation. In both cases, the attainable accuracy, defined in terms of the absolute error for the exact stream function solution to the governing equations, is associated with accuracy of approximately $10^{-4}$ (solid line). In single precision math, the algebraic and estimated errors measured in the 2-norm can reach the level of machine precision. These results suggest that single precision math is sufficient for calculating accurate solutions at the chosen spatial resolution. We find that the iterative solution of the $\sigma$-transformed Laplace problem by the PDC method does not immediately lead to significant accumulation of roundoff-errors and further investigations are warranted for unsteady computations.
+As a simple illustrative numerical experiment, we can consider the iterative solution of \eqref{ch7:eq:linsys} using the PDC method using single- and double-precision math, respectively. As a simple test case, we consider the solution of periodic stream function waves in two spatial dimensions. Computed convergence histories are presented in Figure \ref{ch7:convhist} where it is clear that the main difference is the attainable accuracy level achievable before stagnation. In both cases, the attainable accuracy, defined in terms of the absolute error for the exact stream function solution to the governing equations, is associated with accuracy of approximately $10^{-4}$ (solid line). In single-precision math, the algebraic and estimated errors measured in the two-norm can reach the level of machine precision. These results suggest that single-precision math is sufficient for calculating accurate solutions at the chosen spatial resolution. We find that the iterative solution of the $\sigma$-transformed Laplace problem by the PDC method does not immediately lead to significant accumulation of round-off errors and further investigations are warranted for unsteady computations.
 
-Elaborating on this example, we examine the propagation of a regular stream function wave in time. We consider the errors in wave elevation as a function of time, respectively, with and without a filtering\index{filtering} strategy for single precision calculations in comparison with double precision calculations. With an objective to exert control on accumulation of round-off errors that appear as high-frequency noise, the idea is to employ an inexpensive stencil-based filtering strategy. For example, a central filter in one spatial dimension
+Elaborating on this example, we examine the propagation of a regular stream function wave in time. We consider the errors in wave elevation as a function of time with and without a filtering\index{filtering} strategy for single-precision calculations in comparison with double-precision calculations. With an objective to exert control on accumulation of round-off errors that appear as high-frequency noise, the idea is to employ an inexpensive stencil-based filtering strategy, for example, a central filter in one spatial dimension
 \begin{align}
 \label{ch7:filter}
 \mathcal{F}u(x_i) = \sum_{n=-\alpha}^{\alpha} c_n u(x_{i+n}),
 \end{align}
-where $c_n\in\mathbb{R}$ are the stencil coefficients and $\alpha\in\mathbb{Z}_+$ is the stencil half-width . An active filter can, e.g., be based on employing a Savitzky-Golay smoothening filter \cite{ch7:PT90}, e.g. the mild 7-point SG(6,10) filter, and applying it after every 10th time step to each of the collocation nodes defining the free surface variables. The same procedure can be used for stabilization of nonlinear simulations to remove high-frequency 'saw-tooth' instabilities as shown in \cite{ch7:EBL08}. This filtering technique can also remove high-frequency noise resulting from roundoff-errors in computations that would otherwise potentially pollute the computational results and in the worst case leave them useless. The effect of this type of filtering on the numerical efficiency of the model is insignificant.
+where $c_n\in\mathbb{R}$ are the stencil coefficients and $\alpha\in\mathbb{Z}_+$ is the stencil half-width. An active filter can for example be based on employing a Savitzky-Golay smoothening filter \cite{ch7:PT90}, e.g., the mild 7-point SG(6,10) filter, and applying it after every 10th time step to each of the collocation nodes defining the free surface variables. The same procedure can be used for stabilization of nonlinear simulations to remove high-frequency "saw-tooth" instabilities as shown in \cite{ch7:EBL08}. This filtering technique can also remove high-frequency noise resulting from round-off errors in computations that would otherwise potentially pollute the computational results and in the worst case leave them useless. The effect of this type of filtering on the numerical efficiency of the model is insignificant.
 
-Results from numerical experiments are presented in figure \ref{ch7:filtering} and most of the errors can be attributed to phase errors resulting from difference in exact versus numerical phase speed. In numerical experiments, we find that while results computed in double precision are not significantly affected by accumulation of round-off errors, the single precision results are. In figures \ref{ch7:filtering} a) and b), a direct solver based on sparse Gaussian Elimination within Matlab\footnote{\url{http://www.mathworks.com}.} is used to solve the linear system every stage and a comparison is made between single and unfiltered double precision calculations. It is shown in figure \ref{ch7:filtering} a) that without a filter, the single precision calculations result in 'blow-up' after which the solver fails just before 50 wave periods of calculation time. However, in figure \ref{ch7:filtering} b) it is demonstrated that invoking a smoothening filter, cf. \eqref{ch7:filter}, stabilizes the accumulation of round-off errors and the calculations continue and achieves reduced accuracy compared to the computed double precision results. Thus, it is confirmed that such a filter can be used to control and suppress high-frequency oscillations that results from accumulation of round-off errors. In contrast, replacing the direct solver with an iterative PDC method using the GPU-accelerated wave model appears to be much more attractive upon inspection of figures \ref{ch7:filtering} c) and d). The single precision results are found to be stable with and {\em without} the filter-based strategy for this problem. The calculations shows that single precision math leads to slightly faster error accumulation for this choice of resolution, however, with only small differences in error level during long time integration. This highlights that fault-tolerance of the iterative PDC method which contributes to securing robustness of the calculations.
+Results from numerical experiments are presented in Figure \ref{ch7:filtering}, and most of the errors can be attributed to phase errors resulting from difference in exact versus numerical phase speed. In numerical experiments, we find that while results computed in double-precision are not significantly affected by accumulation of round-off errors, the single-precision results are. In figures \ref{ch7:filtering} (a) and (b), a direct solver based on sparse Gaussian elimination within MATLAB\footnote{\url{http://www.mathworks.com}.} is used to solve the linear system at every stage and a comparison is made between single and unfiltered double-precision calculations. It is shown in Figure \ref{ch7:filtering} a) that without a filter, the single-precision calculations result in "blow-up" after which the solver fails just before 50 wave periods of calculation time. However, in Figure \ref{ch7:filtering} (b) it is demonstrated that invoking a smoothening filter, cf. \eqref{ch7:filter}, stabilizes the accumulation of round-off errors and the calculations continue and achieve reduced accuracy compared to the computed double-precision results. Thus, it is confirmed that such a filter can be used to control and suppress high-frequency oscillations that results from accumulation of round-off errors. In contrast, replacing the direct solver with an iterative PDC method using the GPU-accelerated wave model appears to be much more attractive upon inspection of Figures \ref{ch7:filtering} (c) and (d). The single-precision results are found to be stable with and {\em without} the filter-based strategy for this problem. The calculations show that single-precision math leads to slightly faster error accumulation for this choice of resolution, however, with only small differences in error level during long time integration. This highlights that fault-tolerance of the iterative PDC method contributes to securing robustness of the calculations.
 
 \begin{figure}[!htb]
 \begin{center}
@@ -774,20 +781,20 @@ Results from numerical experiments are presented in figure \ref{ch7:filtering} a
 \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/ComparisonDCWithFiltering-eps-converted-to.pdf}
 }
 \end{center}
-\caption[Comparison between accuracy as a function of time for double precision calculations vs. single precision with and without filtering.]{Comparison between accuracy as a function of time for double precision calculations vs. single precision with and without filtering. The double precision result are unfiltered in each comparison and shows to be less sensitive to roundoff-errors. Medium steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0502)$. Discretization is based on $(N_x,N_z)=(30,6)$, A courant number of $C_r=0.5$ and 6'$th$ order stencils.}
+\caption[Comparison of accuracy as a function of time for double-precision calculations vs. single-precision with and without filtering.]{Comparison between accuracy as a function of time for double-precision calculations vs. single-precision with and without filtering. The double-precision result is unfiltered in each comparison and shows to be less sensitive to round-off errors. Medium steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0502)$. Discretization is based on $(N_x,N_z)=(30,6)$, A courant number of $C_r=0.5$, and sixth order stencils.}
 \label{ch7:filtering}
 \end{figure}
 
-Last, we demonstrate using a classical benchmark for propagation of nonlinear waves over a semi-circular shoal that single precision math is likely to be sufficient for achieving engineering accuracy. The benchmark is based on Whalin's experiment~\cite{ch7:Whalin1971} which is often used in validation of dispersive water wave models for coastal engineering applications, e.g., see provious work \cite{ch7:EBL08}. Experimental results exists for incident waves with wave periods $T=1,2,3\,s$ and wave heights $H=0.0390, 0.0150, 0.0136\,m$. All three test cases have been discretized with a computational grid of size ($257 \times 41 \times 7$) to resolve the physical dimensions of $L_x=35\,m$, $L_y=6.096\,m$. The still water depth decreases in the direction of the incident waves as a semi-circular shoal from $0.4572\,m$ to $0.1524\,m$ with an illustration of a snapshot of the free surface given in figure \ref{ch7:fig:whalinsetup}. The time step $\Delta t$ is computed based on a constant Courant number of $Cr=0.8$, where $c$ is the incident wave speed and $\Delta x$ is the grid spacing. Waves are generated in the generation zone $0 \leq x/L \leq 1.5$, where $L$ is the wave length of incident waves, and absorbed again in the zone $35 - 2L \leq x \leq 35\, m$.
+Last, we demonstrate using a classical benchmark for propagation of nonlinear waves over a semicircular shoal that single-precision math is likely to be sufficient for achieving engineering accuracy. The benchmark is based on Whalin's experiment~\cite{ch7:Whalin1971} which is often used in validation of dispersive water wave models for coastal engineering applications, e.g., see previous work \cite{ch7:EBL08}. Experimental results exists for incident waves with wave periods $T=1,2,3\,$s and wave heights $H=0.0390, 0.0150, 0.0136\,$m. All three test cases have been discretized with a computational grid of size ($257 \times 41 \times 7$) to resolve the physical dimensions of $L_x=35\,$m, $L_y=6.096\,$m. The still water depth decreases in the direction of the incident waves as a semicircular shoal from $0.4572\,$m to $0.1524\,$m with an illustration of a snapshot of the free surface given in Figure \ref{ch7:fig:whalinsetup}. The time step $\Delta t$ is computed based on a constant Courant number of $Cr=c\Delta x/\Delta t=0.8$, where $c$ is the incident wave speed and $\Delta x$ is the grid spacing. Waves are generated in the generation zone $0 \leq x/L \leq 1.5$, where $L$ is the wave length of incident waves, and absorbed again in the zone $35 - 2L \leq x \leq 35\,$m.
 %
 
-A harmonic analysis of the wave spectrum at the shoal center line is computed and plotted in figure \ref{ch7:whalinresults} for comparison with the analogous results obtained from the experiments data. The three harmonic amplitudes are computed via a Fast Fourier Transform (FFT) method using the last three wave periods up till $t=50\, s$. There is a satisfactory agreement between the computed and experimental results and no noticeable loss in accuracy resulting from the use of single precision math.
+A harmonic analysis of the wave spectrum at the shoal center line is computed and plotted in Figure \ref{ch7:whalinresults} for comparison with the analogous results obtained from the experiments data. The three harmonic amplitudes are computed via a Fast Fourier Transform (FFT) method using the last three wave periods up to $t=50\,$s. There is a satisfactory agreement between the computed and experimental results and no noticeable loss in accuracy resulting from the use of single-precision math.
 %
 \begin{figure}[!htb]
     \setlength\figureheight{0.3\textwidth}
     \setlength\figurewidth{0.32\textwidth}
 %    \begin{center}
-    \subfigure[Whalin test case at $t=50s$, wave period $T=2s$, and grid dimensions ($257 \times 41 \times 7$)]{\label{ch7:fig:whalinsetup}
+    \subfigure[Whalin test case at $t=50$s, wave period $T=2$s, and grid dimensions ($257 \times 41 \times 7$)]{\label{ch7:fig:whalinsetup}
     \includegraphics[width=0.5\textwidth]{Chapters/chapter7/figures/WhalinWaveSol_t50_T2_single.pdf}
     }
     \subfigure[$T=1s$]{
@@ -800,22 +807,22 @@ A harmonic analysis of the wave spectrum at the shoal center line is computed an
     {\scriptsize\input{Chapters/chapter7/figures/WhalinWaveHarmonics_T3_single.tikz}}
     }
 %    \end{center}
-    \caption[Harmonic analysis for the experiment of Whalin for $T=1,2,3\,s$ respectively.]{Harmonic analysis for the experiment of Whalin for $T=1,2,3\,s$ respectively. Measured experimental and computed results (single precision) are in good agreement. Test environment 1.}\label{ch7:whalinresults}
+    \caption[Harmonic analysis for the experiment of Whalin for $T=1,2,3\,$s.]{Harmonic analysis for the experiment of Whalin for $T=1,2,3\,$s respectively. Measured experimental and computed results (single-precision) are in good agreement. Test environment 1.}\label{ch7:whalinresults}
 \end{figure}
 
-\subsection{Acceleration via parallelism in time using 'Parareal'}\label{ch7:parareal}\index{parareal}
+\subsection{Acceleration via parallelism in time using parareal}\label{ch7:parareal}\index{parareal}
 
 % Performance is no longer free  with new hardware
-With modern many-core architectures, performance is no longer intrinsic and free on new generations of hardware. Added performance with new hardware now comes with the requirement of sufficient parallelism in the application to be accelerated. Methods, tricks and techniques for extracting parallelism in scientific applications are thus becoming increasingly relevant to enable added numerical accuracy as well as minimization of time to solution in the pursuit of faster and better analysis for engineering applications.
+With modern many-core architectures, performance is no longer intrinsic and free on new generations of hardware. Added performance with new hardware now comes with the requirement of sufficient parallelism in the application to be accelerated. Methods, tricks, and techniques for extracting parallelism in scientific applications are thus becoming increasingly relevant to enable added numerical accuracy as well as minimization of time to solution in the pursuit of faster and better analysis for engineering applications.
 
 % Parareal as component
-The Parareal algorithm has been introduced as a component in our in-house GPU library as described in section \ref{ch5:parareal}. The parareal library component makes it possible to easily investigate potential opportunities for further acceleration of the water wave model on a heterogeneous system and to assess practical feasibility of this algorithmic strategy for various wave types. We omit a detailed review of the Parareal algorithm and refer to details given in section \ref{ch5:sec:parareal} together with recent reviews \cite{ch7:LMT01,ch7:MS07,ch7:ASNP12}.
+The parareal algorithm has been introduced as a component in our in-house GPU library as described in Section \ref{ch5:parareal}. The parareal library component makes it possible to easily investigate potential opportunities for further acceleration of the water wave model on a heterogeneous system and to assess practical feasibility of this algorithmic strategy for various wave types. We omit a detailed review of the parareal algorithm and refer to details given in Section \ref{ch5:sec:parareal} together with recent reviews \cite{ch7:LMT01,ch7:MS07,ch7:ASNP12}.
 
 % Explain key parameters shortly
-In section \ref{ch5:parareal} it is assumed that communication costs can be neglected and a simple model for the algorithmic work complexity is derived. It is found that there are four key discretization parameters for parareal that needs to be balanced appropriately in order to achieve high parallel efficiency. It is the number of coarse-grained time intervals $N$, the number of iterations $K$, the ratio between the computational cost of the coarse to the fine propagator $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}$ and the ratio between fine and coarse time step sizes $\delta t/\delta T$.
+In Section \ref{ch5:parareal} it is assumed that communication costs can be neglected and a simple model for the algorithmic work complexity is derived. It is found that there are four key discretization parameters for parareal that need to be balanced appropriately in order to achieve high parallel efficiency: the number of coarse-grained time intervals $N$, the number of iterations $K$, the ratio between the computational cost of the coarse to the fine propagator $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}$, and the ratio between fine and coarse time step sizes $\delta t/\delta T$.
 
 % How to obtain speed-up
-Ideally, the ratio $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}$ is small and convergence happens in $k=1$ iteration. This is seldom the case though, as it requires the coarse propagator to achieve accuracy close to that of the fine propagator while at the same time being substantially cheaper computationally, these two objectives obviously being conflicting. Obtaining the highest possible speed-up is a matter of trade-off, typically, the more GPUs used, the faster the coarse propagator should be. The performance of parareal is problem and discretization dependent and as such one would suspect that different wave parameters influence the suitability of the method. This was investigated in \cite{ch7:ASNP12} and indeed the performance does change with wave parameters. Typically the method work better for deep water waves with low to medium wave amplitude.
+Ideally, the ratio $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}$ is small and convergence happens in $k=1$ iteration. This is seldom the case though, as it requires the coarse propagator to achieve accuracy close to that of the fine propagator while at the same time being substantially cheaper computationally, these two objectives obviously being conflicting. Obtaining the highest possible speed-up is a matter of trade-off, typically, the more GPUs used, the faster the coarse propagator should be. The performance of parareal is problem- and discretization-dependent and as such one would suspect that different wave parameters influence the suitability of the method. This was investigated in \cite{ch7:ASNP12} and indeed the performance does change with wave parameters. Typically the method works better for deep water waves with low to medium wave amplitudes.
 %
 \begin{figure}[!htb]
     \begin{center}
@@ -830,13 +837,13 @@ Ideally, the ratio $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}$ is small an
  \includegraphics[width=0.47\textwidth]{Chapters/chapter7/figures/PararealSpeedupGTX590_conv.pdf}
     }
     \end{center}
-    \caption[Parareal absolute timings and parareal speedup.]{(a) Parareal absolute timings for an increasingly number of water waves traveling one wave length, each wave resolution is ($33\times 9$). (b) Parareal speedup for two to sixteen compute nodes compared to the purely sequential single GPU solver. Notice how insensitive the parareal scheme is to the size of the problem solved. Test environment 2.}\label{ch7:fig:DDPA_SPEEDUP}
+    \caption[Parareal absolute timings and parareal speedup.]{(a) Parareal absolute timings for an increasingly number of water waves traveling one wave length; each wave resolution is ($33\times 9$). (b) Parareal speedup for two to sixteen compute nodes compared to the purely sequential single GPU solver. Is is noticeable how insensitive the parareal scheme is to the size of the problem solved. Test environment 3.}\label{ch7:fig:DDPA_SPEEDUP}
 \end{figure}
 %
 
 % What did we do and what are the results
-We have performed a scalability study for parareal using 2D nonlinear stream function waves based on a discretization with $(N_x,N_z)=(33,9)$ collocation points, cf. figure \ref{ch7:fig:DDPA_SPEEDUP}. The study shows that moderate speedup are possible for this hyperbolic system. Using four GPU nodes a speedup of slightly more than two was achieved while using sixteen GPU nodes resulted in a speedup of slightly less than five. As noticed in figure \ref{ch7:fig:DDPA_SPEEDUP}, parallel efficiency decrease quite fast when using more GPUs. This limitation is due to the usage a fairly slow and accurate coarse propagator and linked to a known difficulty with parareal applied to hyperbolic systems. For hyperbolic systems, instabilities tend to arise when using a very inaccurate coarse propagator. This prevents using a large number of time sub-domains, as this by Amdahl's law also requires a very fast coarse propagator. The numbers are still impressive though, considering that it comes as additional speedup to an already efficient and fast code.
-Performance results for the Whalin test case have also been reported in figure \ref{ch7:fig:whalinparareal}. There is a natural limitation to how much we can increase $R$ (the ratio between the complexity of the fine and coarse propagators), because of stability issues with the coarse propagator. In this test case we simulate from $t=[0,1]s$, using up to $32$ GPUs. For low $R$ and only two GPUs, there is no speedup gain, but for configuration with eight or more GPUs and $R\geq6$, we are able to get more than $2$ times speedup. Though these hyperbolic systems are not optimal for performance tuning using the parareal method, results still confirm that reasonable speedups are in fact possible on heterogenous systems.
+We have performed a scalability study for parareal using 2D nonlinear stream function waves based on a discretization with $(N_x,N_z)=(33,9)$ collocation points, cf. Figure \ref{ch7:fig:DDPA_SPEEDUP}. The study shows that moderate speedup is possible for this hyperbolic system. Using four GPU nodes, a speedup of slightly more than two was achieved while using sixteen GPU nodes resulted in a speedup of slightly less than five. As noticed in Figure \ref{ch7:fig:DDPA_SPEEDUP}, parallel efficiency decreases quite fast when using more GPUs. This limitation is due to the usage of a fairly slow and accurate coarse propagator and is linked to a known difficulty with parareal applied to hyperbolic systems. For hyperbolic systems, instabilities tend to arise when using a very inaccurate coarse propagator. This prevents using a large number of time subdomains, as this by Amdahl's law also requires a very fast coarse propagator. The numbers are still impressive though, considering that the speedup due to parareal comes as additional speedup to an already efficient and fast code.
+Performance results for the Whalin test case are also shown in Figure \ref{ch7:fig:whalinparareal}. There is a natural limitation to how much we can increase $R$ (the ratio between the complexity of the fine and coarse propagators), because of stability issues with the coarse propagator. In this test case we simulate from $t=[0,1]$s, using up to $32$ GPUs. For low $R$ and only two GPUs, there is no speedup gain, but for the configuration with eight or more GPUs and $R\geq6$, we are able to get more than $2$ times speedup. Though these hyperbolic systems are not optimal for performance tuning using the parareal method, results still confirm that reasonable speedups are in fact possible on heterogenous systems.
 
 \begin{figure}[!htb]
     \setlength\figureheight{0.3\textwidth}
@@ -849,28 +856,28 @@ Performance results for the Whalin test case have also been reported in figure \
     {\small\input{Chapters/chapter7/figures/WhalinPararealEfficiency.tikz}}
     }
 %    \end{center}
-    \caption[Parallel time integration using the parareal method.]{Parallel time integration using the parareal method. $R$ is the ratio between the complexity of the fine and coarse propagators. Test environment 2.}\label{ch7:fig:whalinparareal}
+    \caption[Parallel time integration using the parareal method.]{Parallel time integration using the parareal method. $R$ is the ratio between the complexity of the fine and coarse propagators. Test environment 3.}\label{ch7:fig:whalinparareal}
 \end{figure}
 
 % Comparison with DD
-The parareal method is observed to be the most viable approach at speeding up small-scale problems due to the reduced communication and overhead involved. For sufficiently large problems, where sufficient work is available to hide the latency in data communication, we find the spatial domain decomposition method to be more favorable as it does not involve the addition of computational work and thereby allows for ideal speed-up, something usually out of reach for the parareal algorithm. An important thing to note here is that it is technically possible to extend the work and wrap the parareal method around the domain decomposition method, thereby obtaining a multiplication of the combined speed-up of both methods. This is of great interest in the sense that for any problem size, increasing the number of spatial sub-domains will eventually degrade speed-up due to the latency in communication of boundaries. By exploiting the latency robustness of parareal in conjunction with domain decomposition parallelism, it may be possible to go large scale for problems that would otherwise be to small to exploit a large number of GPUs. These investigations are subject to future work.
+The parareal method is observed to be the most viable approach at speeding up small-scale problems due to the reduced communication and overhead involved. For sufficiently large problems, where sufficient work is available to hide the latency in data communication, we find the spatial domain decomposition method to be more favorable as it does not involve the addition of computational work and thereby allows for ideal speedup, something usually out of reach for the parareal algorithm. An important thing to note here is that it is technically possible to extend the work and wrap the parareal method around the domain decomposition method, thereby obtaining a multiplication of the combined speedup of both methods. This is of great interest in the sense that for any problem size, increasing the number of spatial sub-domains will eventually degrade speedup due to the latency in communication of boundaries. By exploiting the latency robustness of parareal in conjunction with domain decomposition parallelism, it may be possible to go large scale for problems that would otherwise be too small to exploit a large number of GPUs. These investigations are subject to future work.
 
 % Final remarks
-Finally, we remark that the Parareal algorithm is also a fault tolerant algorithm. This property follows from the iterative nature of the algorithm and implies that a process can be lost during computations and regenerated without restarting the computations. This can be exploited to minimize total run time in case of such failures. 
+Finally, we remark that the parareal algorithm is also a fault-tolerant algorithm. This property follows from the iterative nature of the algorithm and implies that a process can be lost during computations and regenerated without restarting the computations. This can be exploited to minimize total run time in case of such failures. 
 
 %\newpage
 \subsection{Towards real-time interactive ship simulation}\index{real-time simulation}
 
-A fast GPU-accelerated ship hydrodynamic model is developed for real-time interactive ship simulation by modification of the unified potential flow model presented in section \ref{ch7:goveq}. The target scientific application is an interactive full mission marine simulator, where multiple ships controlled by naval officers can navigate in a near-realistic virtual marine environment. Full mission simulators are used for education and training of naval officers in critical manoeuvring operations and for evaluation of ship and marine infrastructure designs. To predict the motion of ships, a hydrodynamics model is required for prediction of forces by \eqref{ch7:forcecalc} which is affected by the kinematic properties of the model, cf. section \ref{ch7:dispkin}. The state-of-the-art for such a hydrodynamic model in todays real-time ship simulators is based on fast interpolation and proper scaling of experimental model data. The amount of experimental model data are limited with respect to hull forms and configurations, requiring the need for extrapolation that compromises the accuracy.
+A fast GPU-accelerated ship hydrodynamic model is developed for real-time interactive ship simulation by modification of the unified potential flow model presented in Section \ref{ch7:goveq}. The target scientific application is an interactive full mission marine simulator, where multiple ships controlled by naval officers can navigate in a near-realistic virtual marine environment. Full mission simulators are used for education and training of naval officers in critical manoeuvring operations and for evaluation of ship and marine infrastructure designs. To predict the motion of ships, a hydrodynamics model is required for prediction of forces by \eqref{ch7:forcecalc} which is affected by the kinematic properties of the model, cf. Section \ref{ch7:dispkin}. The state-of-the-art for such a hydrodynamic model in today's real-time ship simulators is based on fast interpolation and proper scaling of experimental model data. The amount of experimental model data is limited with respect to hull forms and configurations, requiring the need for extrapolation that compromises the accuracy.
 
-The objective of current and ongoing work is aimed at removing these limitations by replacing the existing hydrodynamic model and instead calculate at full-scale the flow field, wave field, ship-structure and ship-ship interaction forces in real-time using massive parallel computation technology. The potential flow (OceanWave3D) model presented in section \ref{ch7:goveq} is suitable as the modeling basis for this purpose since it is robust, accurate, efficient and scalable to arbitrary large domains. Furthermore, it can accurately account for dispersive waves in the range from shallow to deep waters in marine settings where the sea bed may be uneven.
+The objective of current and ongoing work is aimed at removing these limitations by replacing the existing hydrodynamic model and instead calculating at full-scale the flow field, wave field, ship-structure, and ship-ship interaction forces in real-time using massive parallel computation technology. The potential flow model (OceanWave3D) presented in Section \ref{ch7:goveq} is suitable as the modeling basis for this purpose since it is robust, accurate, efficient, and scalable to arbitrarily large domains. Furthermore, it can accurately account for dispersive waves in the range from shallow to deep waters in marine settings where the sea bed may be uneven.
 
-The inclusion of ships in the wave model requires an approximate representation of such ships in the model. This ship approximations have to be chosen carefully with consideration to the computational performance of the numerical model to enable interactive real-time computing on today's modern hardware. For a first proof-of-concept linear wave and ship models is used as the model basis. This implied that wave heights and ship draft are linearized around the mean sea level $z=0$ m.
+The inclusion of ships in the wave model requires an approximate representation of such ships in the model. These ship approximations have to be chosen carefully with consideration to the computational performance of the numerical model to enable interactive real-time computing on today's modern hardware. For a first simple proof-of-concept we develop a linear wave and ship model to be used as the model basis. This implies that wave heights and ship draft are assumed to be of small amplitude corresponding and derived by a linearization technique around the mean sea level $z=0$ m.
 
-The physical domain for the computation is bounded from below by the seabed and from above by the free surface of the sea or the hull of the ship. If the ship is navigating in open water the ship physical spatial domain is unbounded in the horizontal direction and in confined waters it is bounded by harbour structures, etc. The representation of the physical domain surrounding the ship is done by finite truncation in the horizontal directions. The resulting time-varying finite physical domain $\Omega(t)$ is a box fixed to follow the ship motion, with the ship in the middle of the top face and with Cartesian coordinate axes aligned with the horizontal components of the forward and sideward ship directions and the upward is the opposite of the direction of gravitational acceleration.
+The physical domain for the computation is bounded from below by the seabed and from above by the free surface of the sea or the hull of the ship. If the ship is navigating in open water, the ship's physical spatial domain is unbounded in the horizontal direction and in confined waters it is bounded by harbor structures, etc. The representation of the physical domain surrounding the ship is done by finite truncation in the horizontal directions. The resulting time-varying finite physical domain $\Omega(t)$ is a box fixed to follow the ship motion, with the ship in the middle of the top face and with Cartesian coordinate axes aligned with the horizontal components of the forward and sideward ship directions and the upward is the opposite of the direction of gravitational acceleration.
 
-A linearized model can be formulated in terms of kinematic and dynamic boundary conditions at the mean sea level \eqref{ch7:FSorigin} together with a Laplace problem subject to a variable depth kinematic boundary condition \eqref{ch7:eq:laplaceproblem}. The effects of ship hulls can be accounted for by splitting the scalar velocity potential function into steady $\phi_0$ and unsteady $\phi_1$ potentials such that $\phi = \phi_0 + \phi_1$ and with a quasi-static approximation of the pressure acting on the ship hull as suggested in \cite{ch7:LindbergEtAlIWWWFB2012}. This leads to a relatively simple ship model that enables a flexible and computationally efficient approximation of the ship geometry.
-The steady potential $\phi_0$ is calculated using a double body approximation \cite{ch7:RavenJMST2010} of the ship and the unsteady potential $\phi_1$ is calculated by a linear free surface flow model. The resulting double-body flat-ship problem becomes
+A linear model can be formulated in terms of kinematic and dynamic boundary conditions at the mean sea level \eqref{ch7:FSorigin} together with a Laplace problem subject to a variable depth kinematic boundary condition \eqref{ch7:eq:laplaceproblem}. The effects of ship hulls can be accounted for by splitting the scalar velocity potential function into steady $\phi_0$ and unsteady $\phi_1$ potentials such that $\phi = \phi_0 + \phi_1$ and with a quasi-static approximation of the pressure acting on the ship hull as suggested in \cite{ch7:LindbergEtAlIWWWFB2012}. This leads to a relatively simple ship model that enables a flexible and computationally efficient approximation of the ship geometry.
+The steady potential $\phi_0$ is calculated using a double-body approximation \cite{ch7:RavenJMST2010} of the ship and the unsteady potential $\phi_1$ is calculated by a linear free surface flow model. The resulting double-body flat-ship problem becomes
 \begin{subequations}
 \begin{align}
  \nabla^2 \phi_0 &= 0,  \quad -h \leq z \leq 0 \\
@@ -892,9 +899,9 @@ where $U$ is the velocity of the ship. The unsteady linear water problem is used
 + v_0 \frac{\partial \eta_1}{\partial y} &= \frac{\partial \phi_1}{\partial z}, \quad z=0,
  \end{align}
  \end{subequations}
-where the pressure on the ship hull $p_{ship}$ is calculated explicitly based on a quasi-static approximation which is determined by assuming $\partial_t\phi_1\approx0$ and rewriting \eqref{ch7:quasistatic}. In general, a ship hull is a complex surface in three-dimensional space, but its draft can be approximated by a single valued function of the horizontal coordinates $\eta_0 = \eta_0(x,y)$ and the no-flux condition on the ship hull is approximated by a flat-ship approximation. Radiation boundary conditions are approximated by a Sommerfelt absorbing boundary condition \cite{ch7:DgayguiJolySJAM1994} on the vertical sides of the physical domain to let waves escape the domain.
+where the pressure on the ship hull $p_{ship}$ is calculated explicitly based on a quasi-static approximation which is determined by assuming $\partial_t\phi_1\approx0$ and rewriting \eqref{ch7:quasistatic}. In general, a ship hull is a complex surface in three-dimensional space, but its draft can be approximated by a single-valued function of the horizontal coordinates $\eta_0 = \eta_0(x,y)$, and the no-flux condition on the ship hull is approximated by a flat-ship approximation. Radiation boundary conditions are approximated by a Sommerfelt absorbing boundary condition \cite{ch7:DgayguiJolySJAM1994} on the vertical sides of the physical domain to let waves escape the domain.
 
-The modified numerical model can still be based on flexible-order finite difference method as discussed in section \ref{ch7:sec:nummodel}. The computational bottleneck problem is the efficient solution of the Laplace problem twice which can be done efficiently by the GPU-accelerated iterative PDC method as explained in section \ref{ch7:PDCmethod}. A snapshot of the steady state wave field is provided the introduction to this chapter. Computed resistance curves for a Series 60 hull moving at forward speed corresponding to Froude number $F_n=0.316$ knots in calm water are compared to experimental data \cite{ch7:TodaEtAl1992} in figure \ref{ch7:fig:shiphydro} a). The computed Kelvin wave system is shown in figure \ref{ch7:fig:shiphydro} b). The computed results compare well with experiments at moderate ship Froudes numbers $F_n=U/\sqrt{gh}$ in the range 0.1-0.25 as expected for a linear model. The real-time constraint required to fulfil the interactive and visualization requirements can be met with the GPU-accelerated hydrodynamics model for problem sizes of approximately $10^6$ for ship Froudes numbers in the range 0.1-0.3. The modelling and real-time aspects will be addressed in more detail in ongoing work.
+The modified numerical model can still be based on flexible-order finite difference method as discussed in Section \ref{ch7:sec:nummodel}. The computational bottleneck problem is the efficient solution of the Laplace problem twice which can be done efficiently by the GPU-accelerated iterative PDC method as explained in section \ref{ch7:PDCmethod}. A snapshot of the steady state wave field is provided in the introduction to this chapter. Computed resistance curves for a Series 60 hull moving at forward speed corresponding to Froude number $F_n=0.316$ knots in calm water are compared to experimental data \cite{ch7:TodaEtAl1992} in Figure \ref{ch7:fig:shiphydro} (a). The computed Kelvin wave system is shown in Figure \ref{ch7:fig:shiphydro} (b). The computed results compare well with experiments at moderate ship Froudes numbers $F_n=U/\sqrt{gh}$ in the range 0.1-0.25 as expected for a linear model. The real-time constraint required to fulfill the interactive and visualization requirements can currently be met with the GPU-accelerated hydrodynamics model for problem sizes of approximately $10^6$ for ship Froude numbers in the range 0.1-0.3. The modeling and real-time aspects will be addressed in more detail in ongoing work.
 
 %
 \begin{figure}[!htb]
@@ -906,7 +913,7 @@ The modified numerical model can still be based on flexible-order finite differe
 \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/figSeries60CB06Type7kelvin-eps-converted-to.pdf}
 }
 \end{center}
-\caption{Computed results. Comparison with experiments for hydrodynamics force calculations confirming engineering accuracy for low Froudes numbers.}
+\caption{Computed results. Comparison with experiments for hydrodynamics force calculations confirming engineering accuracy for low Froude numbers.}
 \label{ch7:fig:shiphydro}
 \end{figure}
 %
@@ -914,14 +921,14 @@ The modified numerical model can still be based on flexible-order finite differe
 %\newpage
 \section{Conclusion and future work}
 
-We have presented implementation details together with several novel results on development of a new massively parallel and scalable tool for simulation of nonlinear free surface water waves on heterogenous hardware. The tool is based on the unified potential flow model referred to as OceanWave3D \cite{ch7:EBL08} which provide the basis for efficient and scalable simulation of water waves over uneven bottoms on arbitrary domain sizes. We have demonstrated in a few examples how we can accelerate performance by using single precision math without comprising accuracy. We have shown that performance can be accelerated by introducing concurrency in the time integration using the Parareal algorithm and for the first time in a heterogenous setup based on the use of multiple GPUs. Interestingly, we find that parallel computations using parareal may be more efficient than using conventional data-parallel distributed computations in a multi-GPU setup for moderate problem sizes. We have measured absolute performance and scalability on several of the most recent generations of Nvidia GPUs to detail the efficiency of the current code. This is useful to predict time to results as explained in \cite{ch7:EngsigKarupEtAl2011} and may be compared against other wave models.
+We have presented implementation details together with several novel results on development of a new massively parallel and scalable tool for simulation of nonlinear free surface water waves on heterogenous hardware. The tool is based on the unified potential flow model referred to as OceanWave3D \cite{ch7:EBL08} which provides the basis for efficient and scalable simulation of water waves over uneven bottoms on arbitrary domain sizes. We have demonstrated in a few examples how we can accelerate performance by using single-precision math without comprimising accuracy. We have shown that performance can be accelerated by introducing concurrency in the time integration using the parareal algorithm and for the first time in a heterogenous setup based on the use of multiple GPUs. Interestingly, we find that parallel computations using parareal may be more efficient than using conventional data-parallel distributed computations in a multi-GPU setup for moderate problem sizes. We have measured absolute performance and scalability using several of the most recent generations of NVIDIA GPUs to detail the efficiency of the current code. This is useful to predict time to results as explained in \cite{ch7:EngsigKarupEtAl2011} and may be compared against other wave models in fair comparisons. 
 
-Work in progress focus on extending the governing equations to account for lack of physics such as wave runup and wave breaking. Also, we plan to extend the domain decomposition method to unstructured grids of blocks that can be boundary-fitted to more general bottom-mounted structures to be able to address wave-structure problems, cf. \cite{ch7:EHBM06,ch7:EHBW08}. For example, this will provide the basis for simulations of wave transformations in large harbour areas or predict wave climates in near-coastal areas.
+Work in progress focuses on extending the governing equations to account for lack of physics such as wave runup and wave breaking. Also, we plan to extend the domain decomposition method to unstructured grids of blocks that can be boundary-fitted to more general bottom-mounted structures to be able to address wave-structure problems, cf. \cite{ch7:EHBM06,ch7:EHBW08}. For example, this will provide the basis for simulations of wave transformations in large harbor areas or predict wave climates in near-coastal areas.
 
-We anticipate that a tool based on the proposed parallel solution strategies will be useful for further advancement in fast and robust analysis techniques and large-scale simulation of free surface wave simulation (e.g. for use as an efficient far-field solver at large scales) and be a basis for next-generation wave models. We also expect that the tool can be useful for hybrid-solution strategies with local flow features possibly resolved by other models and for advancing state-of-the-art in fast physics-based wave-body simulations, e.g., ship-wave interactions in ship simulation where real-time constraints are imposed due to visualization. These subjects will be part of ongoing work addressing application aspects. 
+We anticipate that a tool based on the proposed parallel solution strategies will be useful for further advancement in fast and robust analysis techniques and large-scale simulation of free surface wave simulation (e.g., for use as an efficient far-field solver at large scales) and be a basis for next-generation wave models. We also expect that the tool can be useful for hybrid-solution strategies with local flow features possibly resolved by other models and for advancing state-of-the-art in fast physics-based wave-body simulations, e.g., ship-wave interactions in ship simulation where real-time constraints are imposed due to visualization. These subjects will be part of ongoing work addressing application aspects. 
 
-\section{Acknowledgement}
+\section{Acknowledgment}
 
-This work was supported by grant no. 09-070032 from the Danish Research Council for Technology and Production Sciences. A special thank goes to Professor Jan S. Hesthaven for supporting parts of this work. Scalability and performance tests was done in GPUlab at DTU Informatics, Technical University of Denmark and using the GPU-cluster at Center for Computing and Visualization, Brown University, USA. Nvidia Corporation is acknowledged for generous hardware donations to facilities of GPUlab.
+This work was supported by grant no. 09-070032 from the Danish Research Council for Technology and Production Sciences. A special thank goes to Professor Jan S. Hesthaven for supporting parts of this work. Scalability and performance tests was done in the GPUlab at DTU Informatics, Technical University of Denmark and using the GPU-cluster at Center for Computing and Visualization, Brown University, USA. NVIDIA Corporation is acknowledged for generous hardware donations to facilities of the GPUlab.
 
 \putbib[Chapters/chapter7/biblio7]