From: couturie <couturie@extinction>
Date: Tue, 23 Jul 2013 19:57:49 +0000 (+0200)
Subject: new
X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/e2f7ea69b2321fbf77291f35360751e460a99f44?ds=inline;hp=348825c45a586538a695ea2b2492c3357bb96b31

new
---

diff --git a/BookGPU/Chapters/chapter12/biblio12.bib b/BookGPU/Chapters/chapter12/biblio12.bib
index f7dd392..803d354 100644
--- a/BookGPU/Chapters/chapter12/biblio12.bib
+++ b/BookGPU/Chapters/chapter12/biblio12.bib
@@ -1,36 +1,35 @@
-@article{ch12:ref1,
-title = {Iterative methods for sparse linear systems},
+
+@book{ch12:ref1,
 author = {Saad, Y.},
-journal = {Society for Industrial and Applied Mathematics,  2nd edition},
-volume = {},
-number = {},
-pages = {},
+title = {Iterative Methods for Sparse Linear Systems},
+publisher = {Society for Industrial and Applied Mathematics},
 year = {2003},
+edition   = {Second}
 }
 
 @article{ch12:ref2,
 title = {Methods of conjugate gradients for solving linear systems},
-author = {Hestenes, M. R. and Stiefel, E.},
+author = {Hestenes, M.R. and Stiefel, E.},
 journal = {Journal of Research of the National Bureau of Standards},
 volume = {49},
 number = {6},
 pages = {409--436},
-year = {1952},
+year = {1952}
 }
 
 @article{ch12:ref3,
 title = {{GMRES}: a generalized minimal residual algorithm for solving nonsymmetric linear systems},
-author = {Saad, Y. and Schultz, M. H.},
+author = {Saad, Y. and Schultz, M.H.},
 journal = {SIAM Journal on Scientific and Statistical Computing},
 volume = {7},
 number = {3},
 pages = {856--869},
-year = {1986},
+year = {1986}
 }
 
 @article{ch12:ref4,
 title = {Solution of sparse indefinite systems of linear equations},
-author = {Paige, C. C. and Saunders, M. A.},
+author = {Paige, C.C. and Saunders, M.A.},
 journal = {SIAM Journal on Numerical Analysis},
 volume = {12},
 number = {4},
@@ -40,7 +39,7 @@ year = {1975},
 
 @article{ch12:ref5,
 title = {The principle of minimized iteration in the solution of the matrix eigenvalue problem},
-author = {Arnoldi, W. E.},
+author = {Arnoldi, W.E.},
 journal = {Quarterly of Applied Mathematics},
 volume = {9},
 number = {17},
@@ -49,8 +48,8 @@ year = {1951},
 }
 
 @article{ch12:ref6,
-title = {{CUDA} Toolkit 4.2 {CUBLAS} Library},
-author = {NVIDIA Corporation},
+title = {{CUDA} {T}oolkit 4.2 {CUBLAS} {L}ibrary},
+author = {{NVIDIA Corporation}},
 journal = {},
 volume = {},
 number = {},
@@ -59,14 +58,14 @@ note = {\url{http://developer.download.nvidia.com/compute/DevZone/docs/html/CUDA
 year = {2012},
 }
 
-@article{ch12:ref7,
-title = {Efficient sparse matrix-vector multiplication on {CUDA}},
-author = {Bell, N. and Garland, M.},
-journal = {NVIDIA Technical Report NVR-2008-004, NVIDIA Corporation},
-volume = {},
-number = {},
-pages = {},
-year = {2008},
+@techreport{ch12:ref7,
+    author = {Bell, N. and Garland, M.},
+    title = {Efficient Sparse Matrix-Vector Multiplication on {CUDA}},
+    month = dec,
+    year = 2008,
+    institution = {NVIDIA Corporation},
+    type = {NVIDIA Technical Report},
+    number = {NVR-2008-004},
 }
 
 @article{ch12:ref8,
@@ -81,7 +80,7 @@ year = {},
 }
 
 @article{ch12:ref9,
-title = {{NVIDIA} {CUDA} {C} programming guide},
+title = {{NVIDIA} {CUDA} {C} {P}rogramming {G}uide},
 author = {NVIDIA Corporation},
 journal = {},
 volume = {},
@@ -92,7 +91,7 @@ year = {2012},
 }
 
 @article{ch12:ref10,
-title = {The university of {F}lorida sparse matrix collection},
+title = {The {U}niversity of {F}lorida {S}parse {M}atrix {C}ollection},
 author = {Davis, T. and Hu, Y.},
 journal = {},
 volume = {},
@@ -115,7 +114,7 @@ year = {1999},
 
 @article{ch12:ref12,
 title = {{hMETIS}: A hypergraph partitioning package},
-author = {Karypis, George and Kumar, Vipin},
+author = {Karypis, G. and Kumar, V.},
 journal = {},
 volume = {},
 number = {},
@@ -126,7 +125,7 @@ year = {1998},
 
 @article{ch12:ref13,
 title = {{PaToH}: partitioning tool for hypergraphs},
-author = {Catalyurek, Umit V. and Aykanat, Cevdet},
+author = {Catalyurek, U.V. and Aykanat, C.},
 journal = {},
 volume = {},
 number = {},
@@ -137,7 +136,7 @@ year = {1999},
 
 @article{ch12:ref14,
 title = {Parallel hypergraph partitioning for scientific computing},
-author = {Devine, Karen D. and Boman, Erik G. and Heaphy, Robert T. and Bisseling, Rob H. and Catalyurek, Umit V.},
+author = {Devine, K.D. and Boman, E.G. and Heaphy, R.T. and Bisseling, R.H. and Catalyurek, U.V.},
 journal = {In Proceedings of the 20th international conference on Parallel and distributed processing, IPDPSâ06},
 volume = {},
 number = {},
diff --git a/BookGPU/Chapters/chapter12/ch12.tex b/BookGPU/Chapters/chapter12/ch12.tex
index 0b743eb..3843597 100755
--- a/BookGPU/Chapters/chapter12/ch12.tex
+++ b/BookGPU/Chapters/chapter12/ch12.tex
@@ -5,7 +5,7 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  
 %\chapterauthor{}{}
-\chapterauthor{Lilia Ziane Khodja, RaphaÃ«l Couturier and Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France}
+\chapterauthor{Lilia Ziane Khodja, Raphaël Couturier, and Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France}
 %\chapterauthor{RaphaÃ«l Couturier}{Femto-ST Institute, University of Franche-Comte, France}
 %\chapterauthor{Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France}
 
@@ -20,15 +20,15 @@
 Sparse linear systems are used to model many scientific and industrial problems,
 such as the environmental simulations or the industrial processing of the complex or
 non-Newtonian fluids. Moreover, the resolution of these problems often involves the
-solving of such linear systems which are considered as the most expensive process in
+solving of such linear systems that are considered the most expensive process in
 terms of execution time and memory space. Therefore, solving sparse linear systems
 must be as efficient as possible in order to deal with problems of ever increasing
 size.
 
 There are, in the jargon of numerical analysis, different methods of solving sparse
-linear systems that can be classified in two classes: the direct and iterative methods.
-However, the iterative methods are often more suitable than their counterpart, direct
-methods, to  solve these systems. Indeed, they are less memory consuming and easier
+linear systems that can be classified in two classes: direct and iterative methods.
+However, the iterative methods are often more suitable than their counterparts, direct
+methods, to  solve these systems. Indeed, they are less memory-consuming and easier
 to parallelize on parallel computers than direct methods. Different computing platforms,
 sequential and parallel computers, are used to solve sparse linear systems with iterative
 solutions. Nowadays, graphics processing units (GPUs) have become attractive to solve
@@ -38,8 +38,8 @@ traditional CPUs.
 In Section~\ref{ch12:sec:02}, we describe the general principle of two well-known iterative
 methods: the conjugate gradient method and the generalized minimal residual method. In Section~\ref{ch12:sec:03},
 we give the main key points of the parallel implementation of both methods on a cluster of
-GPUs. Finally, in Section~\ref{ch12:sec:04}, we present the experimental results obtained on a
-CPU cluster and on a GPU cluster, to solve large sparse linear systems.    
+GPUs. Finally, in Section~\ref{ch12:sec:04}, we present the experimental results, obtained on a
+CPU cluster and on a GPU cluster, of solving large sparse linear systems.    
 
 
 %%--------------------------%%
@@ -54,12 +54,12 @@ Ax=b,
 \label{ch12:eq:01}
 \end{equation}
 where $A\in\mathbb{R}^{n\times n}$ is a sparse nonsingular square matrix, $x\in\mathbb{R}^{n}$
-is the solution vector, $b\in\mathbb{R}^{n}$ is the right-hand side and $n\in\mathbb{N}$ is a
+is the solution vector, $b\in\mathbb{R}^{n}$ is the right-hand side, and $n\in\mathbb{N}$ is a
 large integer number. 
 
 The iterative methods\index{Iterative~method} for solving the large sparse linear system~(\ref{ch12:eq:01})
 proceed by successive iterations of a same block of elementary operations, during which an
-infinite number of approximate solutions $\{x_k\}_{k\geq 0}$ are computed. Indeed, from an
+infinite number of approximate solutions $\{x_k\}_{k\geq 0}$ is computed. Indeed, from an
 initial guess $x_0$, an iterative method determines at each iteration $k>0$ an approximate
 solution $x_k$ which, gradually, converges to the exact solution $x^{*}$ as follows:
 \begin{equation}
@@ -78,9 +78,9 @@ where $\varepsilon<1$ is the required convergence tolerance threshold\index{Conv
 
 Some of the most iterative methods that have proven their efficiency for solving large sparse
 linear systems are those called \textit{Krylov subspace methods}~\cite{ch12:ref1}\index{Iterative~method!Krylov~subspace}.
-In the present chapter, we describe two Krylov methods which are widely used: the conjugate
-gradient method (CG) and the generalized minimal residual method (GMRES). In practice, the
-Krylov subspace methods are usually used with preconditioners that allow to improve their
+In the present chapter, we describe two Krylov methods which are widely used: the CG method (conjugate
+gradient method) and the GMRES method (generalized minimal residual method). In practice, the
+Krylov subspace methods are usually used with preconditioners that allow the improvement of their
 convergence. So, in what follows, the CG and GMRES methods are used to solve the left-preconditioned\index{Sparse~linear~system!Preconditioned}
 sparse linear system:
 \begin{equation}
@@ -95,7 +95,7 @@ where $M$ is the preconditioning matrix.
 \subsection{CG method}
 \label{ch12:sec:02.01}
 The conjugate gradient method was initially developed by Hestenes and Stiefel in 1952~\cite{ch12:ref2}.
-It is one of the well known iterative method to solve large sparse linear systems. In addition, it
+It is one of the well-known iterative methods to solve large sparse linear systems. In addition, it
 can be adapted to solve nonlinear equations and optimization problems. However, it can only be applied
 to problems with positive definite symmetric matrices.
 
@@ -111,7 +111,7 @@ such that the Galerkin condition\index{Galerkin~condition} must be satisfied:
 r_k \bot \mathcal{K}_k(A,r_0),
 \label{ch12:eq:05}
 \end{equation}
-where $x_0$ is the initial guess, $r_k=b-Ax_k$ is the residual of the computed solution $x_k$ and $\mathcal{K}_k$
+where $x_0$ is the initial guess, $r_k=b-Ax_k$ is the residual of the computed solution $x_k$, and $\mathcal{K}_k$
 the Krylov subspace of order $k$: \[\mathcal{K}_k(A,r_0) \equiv\text{span}\{r_0, Ar_0, A^2r_0,\ldots, A^{k-1}r_0\}.\]
 In fact, CG is based on the construction of a sequence $\{p_k\}_{k\in\mathbb{N}}$ of direction vectors in $\mathcal{K}_k$
 which are pairwise $A$-conjugate ($A$-orthogonal):
@@ -142,9 +142,9 @@ p_0=r_0, & p_k=r_k+\beta_k p_{k-1}, & \beta_k\in\mathbb{R}.
 \label{ch12:eq:09}
 \end{equation}
 Moreover, the scalars $\{\alpha_k\}_{k>0}$ are chosen so as to minimize the $A$-norm error $\|x^{*}-x_k\|_A$
-over the Krylov subspace $\mathcal{K}_{k}$ and the scalars $\{\beta_k\}_{k>0}$ are chosen so as to ensure
+over the Krylov subspace $\mathcal{K}_{k}$, and the scalars $\{\beta_k\}_{k>0}$ are chosen so as to ensure
 that the direction vectors are pairwise $A$-conjugate. So, the assumption that matrix $A$ is symmetric and
-the recurrences~(\ref{ch12:eq:08}) and~(\ref{ch12:eq:09}) allow to deduce that:
+the recurrences~(\ref{ch12:eq:08}) and~(\ref{ch12:eq:09}) allow the deduction that:
 \begin{equation}
 \begin{array}{ll}
 \alpha_{k}=\frac{r^{T}_{k-1}r_{k-1}}{p_{k}^{T}Ap_{k}}, & \beta_{k}=\frac{r_{k}^{T}r_{k}}{r_{k-1}^{T}r_{k-1}}.
@@ -176,21 +176,21 @@ the recurrences~(\ref{ch12:eq:08}) and~(\ref{ch12:eq:09}) allow to deduce that:
       $k = k + 1$\;
     }
   }
-\caption{Left-preconditioned CG method}
+\caption{left-preconditioned CG method}
 \label{ch12:alg:01}
 \end{algorithm}
 
 Algorithm~\ref{ch12:alg:01} shows the main key points of the preconditioned CG method. It allows
-to solve the left-preconditioned\index{Sparse~linear~system!Preconditioned} sparse linear system~(\ref{ch12:eq:11}).
+the solving of the left-preconditioned\index{Sparse~linear~system!Preconditioned} sparse linear system~(\ref{ch12:eq:11}).
 In this algorithm, $\varepsilon$ is the convergence tolerance threshold, $maxiter$ is the maximum
-number of iterations and $(\cdot,\cdot)$ defines the dot product between two vectors in $\mathbb{R}^{n}$.
+number of iterations, and $(\cdot,\cdot)$ defines the dot product between two vectors in $\mathbb{R}^{n}$.
 At every iteration, a direction vector $p_k$ is determined, so that it is orthogonal to the preconditioned
 residual $z_k$ and to the direction vectors $\{p_i\}_{i<k}$ previously determined (from line~$8$ to
 line~$13$). Then, at lines~$16$ and~$17$, the iterate $x_k$ and the residual $r_k$ are computed using
 formulas~(\ref{ch12:eq:07}) and~(\ref{ch12:eq:08}), respectively. The CG method converges after, at
 most, $n$ iterations. In practice, the CG algorithm stops when the tolerance threshold\index{Convergence!Tolerance~threshold}
 $\varepsilon$ and/or the maximum number of iterations\index{Convergence!Maximum~number~of~iterations}
-$maxiter$ are reached.
+$maxiter$ is reached.
 
 
 %%****************%%
@@ -240,7 +240,7 @@ x_k = x_0 + V_k y, & y\in\mathbb{R}^{k}.
 \end{array}
 \label{ch12:eq:16}
 \end{equation}
-From both formulas~(\ref{ch12:eq:15}) and~(\ref{ch12:eq:16}) and $r_k=b-Ax_k$, we can deduce that:
+From both formulas~(\ref{ch12:eq:15}) and~(\ref{ch12:eq:16}) and $r_k=b-Ax_k$, we can deduce that
 \begin{equation}
 \begin{array}{lll}
   r_{k} & = & b - A (x_{0} + V_{k}y) \\
@@ -257,22 +257,23 @@ norm of the residual $r_k$. Consequently, a linear least-squares problem of size
 \underset{y\in\mathbb{R}^{k}}{min}\|r_{k}\|_{2}=\underset{y\in\mathbb{R}^{k}}{min}\|\beta e_{1}-\bar{H}_{k}y\|_{2}.
 \label{ch12:eq:18}
 \end{equation}
-The QR factorization of matrix $\bar{H}_k$ is used to compute the solution of this problem by using
-Givens rotations~\cite{ch12:ref1,ch12:ref3}, such that:
+The QR factorization of matrix $\bar{H}_k$ is used (the decomposition of the matrix $\bar{H}_k$ into $Q_k$ and $R_k$ matrices)
+to compute the solution of this problem by using
+Givens rotations~\cite{ch12:ref1,ch12:ref3}, such that
 \begin{equation}
 \begin{array}{lll}
 \bar{H}_{k}=Q_{k}R_{k}, & Q_{k}\in\mathbb{R}^{(k+1)\times (k+1)}, & R_{k}\in\mathbb{R}^{(k+1)\times k},
 \end{array}
 \label{ch12:eq:19}
 \end{equation}
-where $Q_kQ_k^T=I_k$ and $R_k$ is an upper triangular matrix.
+where $Q_k$ is an orthogonal matrix and $R_k$ is an upper triangular matrix.
 
 The GMRES method computes an approximate solution with a sufficient precision after, at most, $n$
 iterations ($n$ is the size of the sparse linear system to be solved). However, the GMRES algorithm
 must construct and store in the memory an orthonormal basis $V_k$ whose size is proportional to the
 number of iterations required to achieve the convergence. Then, to avoid a huge memory storage, the
 GMRES method must be restarted at each $m$ iterations, such that $m$ is very small ($m\ll n$), and
-with $x_m$ as the initial guess to the next iteration. This allows to limit the size of the basis
+with $x_m$ as the initial guess to the next iteration. This allows the limitation of the size of the basis
 $V$ to $m$ orthogonal vectors.
 
 \begin{algorithm}[!t]
@@ -292,7 +293,7 @@ $V$ to $m$ orthogonal vectors.
       $h_{j+1,j} = \|w_{j}\|_{2}$\;
       $v_{j+1} = w_{j}/h_{j+1,j}$\;
     }
-    Set $V_{m}=\{v_{j}\}_{1\leq j \leq m}$ and $\bar{H}_{m}=(h_{i,j})$ a $(m+1)\times m$ upper Hessenberg matrix\;
+    Set $V_{m}=\{v_{j}\}_{1\leq j \leq m}$ and $\bar{H}_{m}=(h_{i,j})$, an upper Hessenberg matrix of size $(m+1)\times m$\;
     Solve a least-squares problem of size $m$: $min_{y\in\mathrm{I\!R}^{m}}\|\beta e_{1}-\bar{H}_{m}y\|_{2}$\;
     $x_{m} = x_{0}+V_{m}y_{m}$\;
     $r_{m} = M^{-1}(b-Ax_{m})$\;
@@ -305,7 +306,7 @@ $V$ to $m$ orthogonal vectors.
       $k = k + 1$\;
     }
   }
-\caption{Left-preconditioned GMRES method with restarts}
+\caption{left-preconditioned GMRES method with restarts}
 \label{ch12:alg:02}
 \end{algorithm}
 
@@ -330,10 +331,10 @@ is reached.
 \label{ch12:sec:03}
 In this section, we present the parallel algorithms of both iterative CG\index{Iterative~method!CG}
 and GMRES\index{Iterative~method!GMRES} methods for GPU clusters. The implementation is performed on
-a GPU cluster composed of different computing nodes, such that each node is a CPU core managed by a
-MPI process and equipped with a GPU card. The parallelization of these algorithms is carried out by
+a GPU cluster composed of different computing nodes, such that each node is a CPU core managed by one
+MPI (message passing interface) process and equipped with a GPU card. The parallelization of these algorithms is carried out by
 using the MPI communication routines between the GPU computing nodes\index{Computing~node} and the
-CUDA programming environment inside each node. In what follows, the algorithms of the iterative methods
+CUDA (compute unified device architecture) programming environment inside each node. In what follows, the algorithms of the iterative methods
 are called iterative solvers.
 
 
@@ -342,24 +343,24 @@ are called iterative solvers.
 \subsection{Data partitioning}
 \label{ch12:sec:03.01}
 The parallel solving of the large sparse linear system~(\ref{ch12:eq:11}) requires a data partitioning
-between the computing nodes of the GPU cluster. Let $p$ denotes the number of the computing nodes on the
-GPU cluster. The partitioning operation consists in the decomposition of the vectors and matrices, involved
-in the iterative solver, in $p$ portions. Indeed, this operation allows to assign to each computing node
+between the computing nodes of the GPU cluster. Let $p$ denote the number of the computing nodes on the
+GPU cluster. The partitioning operation consists of the decomposition of the vectors and matrices, involved
+in the iterative solver, in $p$ portions. Indeed, this operation allows the assignment to each computing node
 $i$:
 \begin{itemize}
 \item a portion of size $\frac{n}{p}$ elements of each vector,
-\item a sparse rectangular sub-matrix $A_i$ of size $(\frac{n}{p},n)$ and,
-\item a square preconditioning sub-matrix $M_i$ of size $(\frac{n}{p},\frac{n}{p})$, 
+\item a sparse rectangular submatrix $A_i$ of size $(\frac{n}{p},n)$, and
+\item a square preconditioning submatrix $M_i$ of size $(\frac{n}{p},\frac{n}{p})$, 
 \end{itemize} 
 where $n$ is the size of the sparse linear system to be solved. In the first instance, we perform a naive
 row-wise partitioning (row-by-row decomposition) on the data of the sparse linear systems to be solved.
 Figure~\ref{ch12:fig:01} shows an example of a row-wise data partitioning between four computing nodes
-of a sparse linear system (sparse matrix $A$, solution vector $x$ and right-hand side $b$) of size $16$
+of a sparse linear system (sparse matrix $A$, solution vector $x$, and right-hand side $b$) of size $16$
 unknown values. 
 
 \begin{figure}
 \centerline{\includegraphics[scale=0.35]{Chapters/chapter12/figures/partition}}
-\caption{A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.}
+\caption{A data partitioning of the sparse matrix $A$, the solution vector $x$, and the right-hand side $b$ into four portions.}
 \label{ch12:fig:01}
 \end{figure}
 
@@ -371,17 +372,17 @@ unknown values.
 After the partitioning operation, all the data involved from this operation must be
 transferred from the CPU memories to the GPU memories, in order to be processed by
 GPUs. We use two functions of the CUBLAS\index{CUBLAS} library (CUDA Basic Linear
-Algebra Subroutines), developed by Nvidia~\cite{ch12:ref6}: \verb+cublasAlloc()+
+Algebra Subroutines) developed by NVIDIA~\cite{ch12:ref6}: \verb+cublasAlloc()+
 for the memory allocations on GPUs and \verb+cublasSetVector()+ for the memory
 copies from the CPUs to the GPUs.
 
-An efficient implementation of CG and GMRES solvers on a GPU cluster requires to
-determine all parts of their codes that can be executed in parallel and, thus, take
+An efficient implementation of CG and GMRES solvers on a GPU cluster requires the
+determining of all parts of their codes that can be executed in parallel and, thus, take
 advantage of the GPU acceleration. As many Krylov subspace methods, the CG and GMRES
 methods are mainly based on arithmetic operations dealing with vectors or matrices:
 sparse matrix-vector multiplications, scalar-vector multiplications, dot products,
-Euclidean norms, AXPY operations ($y\leftarrow ax+y$ where $x$ and $y$ are vectors
-and $a$ is a scalar) and so on. These vector operations are often easy to parallelize
+Euclidean norms, AXPY operations ($y\leftarrow ax+y$ where $x$ and $y$ are vectors and $a$ is a scalar),
+and so on. These vector operations are often easy to parallelize
 and they are more efficient on parallel computers when they work on large vectors.
 Therefore, all the vector operations used in CG and GMRES solvers must be executed
 by the GPUs as kernels.
@@ -389,41 +390,41 @@ by the GPUs as kernels.
 We use the kernels of the CUBLAS library to compute some vector operations of CG and
 GMRES solvers. The following kernels of CUBLAS (dealing with double floating point)
 are used: \verb+cublasDdot()+ for the dot products, \verb+cublasDnrm2()+ for the
-Euclidean norms and \verb+cublasDaxpy()+ for the AXPY operations. For the rest of
+Euclidean norms, and \verb+cublasDaxpy()+ for the AXPY operations ($y\leftarrow ax+y$, compute a scalar-vector product and add 
+the result to a vector). For the rest of
 the data-parallel operations, we code their kernels in CUDA. In the CG solver, we
-develop a kernel for the XPAY operation ($y\leftarrow x+ay$) used  line~$12$ in
+develop a kernel for the XPAY operation ($y\leftarrow x+ay$) used in line~$12$ in
 Algorithm~\ref{ch12:alg:01}. In the GMRES solver, we program a kernel for the scalar-vector
 multiplication (lines~$7$ and~$15$ in Algorithm~\ref{ch12:alg:02}), a kernel to
-solve the least-squares problem and a kernel to update the elements  of the solution
+solve the least-squares problem, and a kernel to update the elements of the solution
 vector $x$.
 
 The least-squares problem in the GMRES method is solved by performing a QR factorization
 on the Hessenberg matrix\index{Hessenberg~matrix} $\bar{H}_m$ with plane rotations and,
 then, solving the triangular system by backward substitutions to compute $y$. Consequently,
-solving the least-squares problem on the GPU is not interesting. Indeed, the triangular
+solving the least-squares problem on the GPU is not efficient. Indeed, the triangular
 solves are not easy to parallelize and inefficient on GPUs. However, the least-squares
 problem to solve in the GMRES method with restarts has, generally, a very small size $m$.
-Therefore, we develop an inexpensive kernel which must be executed in sequential by a
-single CUDA thread. 
+Therefore, we develop an inexpensive kernel which must be executed by a single CUDA thread. 
 
 The most important operation in CG\index{Iterative~method!CG} and GMRES\index{Iterative~method!GMRES}
-methods is the sparse matrix-vector multiplication (SpMV)\index{SpMV~multiplication},
+methods is the SpMV multiplication (sparse matrix-vector multiplication)\index{SpMV~multiplication},
 because it is often an expensive operation in terms of execution time and memory space.
-Moreover, it requires to take care of the storage format of the sparse matrix in the
+Moreover, it requires taking care of the storage format of the sparse matrix in the
 memory. Indeed, the naive storage, row-by-row or column-by-column, of a sparse matrix
 can cause a significant waste of memory space and execution time. In addition, the sparse
 nature of the matrix often leads to irregular memory accesses to read the matrix nonzero
-values. So, the computation of the SpMV multiplication on GPUs can involve non coalesced
+values. So, the computation of the SpMV multiplication on GPUs can involve noncoalesced
 accesses to the global memory, which slows down its performances even more. One of the
 most efficient compressed storage formats\index{Compressed~storage~format} of sparse
-matrices on GPUs is the HYB\index{Compressed~storage~format!HYB} format~\cite{ch12:ref7}.
+matrices on GPUs is the HYB (hybrid)\index{Compressed~storage~format!HYB} format~\cite{ch12:ref7}.
 It is a combination of ELLpack (ELL) and Coordinate (COO) formats. Indeed, it stores
 a typical number of nonzero values per row in ELL\index{Compressed~storage~format!ELL}
-format and remaining entries of exceptional rows in COO format. It combines the efficiency
+format and the remaining entries of exceptional rows in COO format. It combines the efficiency
 of ELL due to the regularity of its memory accesses and the flexibility of COO\index{Compressed~storage~format!COO}
 which is insensitive to the matrix structure. Consequently, we use the HYB kernel~\cite{ch12:ref8}
-developed by Nvidia to implement the SpMV multiplication of CG and GMRES methods on GPUs.
-Moreover, to avoid the non coalesced accesses to the high-latency global memory, we fill
+developed by NVIDIA to implement the SpMV multiplication of CG and GMRES methods on GPUs.
+Moreover, to avoid the noncoalesced accesses to the high-latency global memory, we fill
 the elements of the iterate vector $x$ in the cached texture memory.
 
 
@@ -440,54 +441,54 @@ the cluster. In what follows, two computing nodes sharing data are called neighb
 
 As already mentioned, the most important operation of CG and GMRES methods is the SpMV multiplication.
 In the parallel implementation of the iterative methods, each computing node $i$ performs the
-SpMV multiplication on its own sparse rectangular sub-matrix $A_i$. Locally, it has only sub-vectors
-of size $\frac{n}{p}$ corresponding to rows of its sub-matrix $A_i$. However, it also requires
-the vector elements of its neighbors, corresponding to the column indices on which its sub-matrix
+SpMV multiplication on its own sparse rectangular submatrix $A_i$. Locally, it has only subvectors
+of size $\frac{n}{p}$ corresponding to rows of its submatrix $A_i$. However, it also requires
+the vector elements of its neighbors, corresponding to the column indices on which its submatrix
 has nonzero values (see Figure~\ref{ch12:fig:01}). So, in addition to the local vectors, each
 node must also manage vector elements shared with neighbors and required to compute the SpMV
 multiplication. Therefore, the iterate vector $x$ managed by each computing node is composed
-of a local sub-vector $x^{local}$ of size $\frac{n}{p}$ and a sub-vector of shared elements $x^{shared}$.
+of a local subvector $x^{local}$ of size $\frac{n}{p}$ and a subvector of shared elements $x^{shared}$.
 In the same way, the vector used to construct the orthonormal basis of the Krylov subspace (vectors
-$p$ and $v$ in CG and GMRES methods, respectively) is composed of a local sub-vector and a shared
-sub-vector. 
+$p$ and $v$ in CG and GMRES methods, respectively) is composed of a local subvector and a shared
+subvector. 
 
 Therefore, before computing the SpMV multiplication\index{SpMV~multiplication}, the neighboring
 nodes\index{Neighboring~node} over the GPU cluster must exchange between them the shared vector
 elements necessary to compute this multiplication. First, each computing node determines, in its
-local sub-vector, the vector elements needed by other nodes. Then, the neighboring nodes exchange
+local subvector, the vector elements needed by other nodes. Then, the neighboring nodes exchange
 between them these shared vector elements. The data exchanges are implemented by using the MPI
 point-to-point communication routines: blocking\index{MPI~subroutines!Blocking} sends with \verb+MPI_Send()+
 and nonblocking\index{MPI~subroutines!Nonblocking} receives with \verb+MPI_Irecv()+. Figure~\ref{ch12:fig:02}
-shows an example of data exchanges between \textit{Node 1} and its neighbors \textit{Node 0}, \textit{Node 2}
+shows an example of data exchanges between \textit{Node 1} and its neighbors \textit{Node 0}, \textit{Node 2},
 and \textit{Node 3}. In this example, the iterate matrix $A$ split between these four computing
 nodes is that presented in Figure~\ref{ch12:fig:01}.
 
 \begin{figure}
 \centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/compress}}
-\caption{Data exchanges between \textit{Node 1} and its neighbors \textit{Node 0}, \textit{Node 2} and \textit{Node 3}.}
+\caption{Data exchanges between \textit{Node 1} and its neighbors \textit{Node 0}, \textit{Node 2}, and \textit{Node 3}.}
 \label{ch12:fig:02}
 \end{figure}
 
 After the synchronization operation, the computing nodes receive, from their respective neighbors,
-the shared elements in a sub-vector stored in a compressed format. However, in order to compute the
+the shared elements in a subvector stored in a compressed format. However, in order to compute the
 SpMV multiplication, the computing nodes operate on sparse global vectors (see Figure~\ref{ch12:fig:02}).
 In this case, the received vector elements must be copied to the corresponding indices in the global
 vector. So as not to need to perform this at each iteration, we propose to reorder the columns of
-each sub-matrix $\{A_i\}_{0\leq i<p}$, so that the shared sub-vectors could be used in their compressed
-storage formats. Figure~\ref{ch12:fig:03} shows a reordering of a sparse sub-matrix (sub-matrix of
+each submatrix $\{A_i\}_{0\leq i<p}$, so that the shared subvectors could be used in their compressed
+storage formats. Figure~\ref{ch12:fig:03} shows a reordering of a sparse submatrix (submatrix of
 \textit{Node 1}). 
 
 \begin{figure}
 \centerline{\includegraphics[scale=0.35]{Chapters/chapter12/figures/reorder}}
-\caption{Columns reordering of a sparse sub-matrix.}
+\caption{Columns reordering of a sparse submatrix.}
 \label{ch12:fig:03}
 \end{figure}
 
 A GPU cluster\index{GPU~cluster} is a parallel platform with a distributed memory. So, the synchronizations
-and communication data between GPU nodes are carried out by passing messages. However, GPUs can not communicate
-between them in a direct way. Then, CPUs via MPI processes are in charge of the synchronizations within the GPU
+and communication data between GPU nodes are carried out by passing messages. However, a GPU cannot exchange data
+with other GPUs in a direct way. Then, CPUs via MPI processes are in charge of the synchronizations within the GPU
 cluster. Consequently, the vector elements to be exchanged must be copied from the GPU memory to the CPU memory
-and vice-versa before and after the synchronization operation between CPUs. We have used the CUBLAS\index{CUBLAS}
+and vice versa before and after the synchronization operation between CPUs. We have used the CUBLAS\index{CUBLAS}
 communication subroutines to perform the data transfers between a CPU core and its GPU: \verb+cublasGetVector()+
 and \verb+cublasSetVector()+. Finally, in addition to the data exchanges, GPU nodes perform reduction operations
 to compute in parallel the dot products and Euclidean norms. This is implemented by using the MPI global communication\index{MPI~subroutines!Global}
@@ -502,7 +503,7 @@ to compute in parallel the dot products and Euclidean norms. This is implemented
 \label{ch12:sec:04}
 In this section, we present the performances of the parallel CG and GMRES linear solvers obtained
 on a cluster of $12$ GPUs. Indeed, this GPU cluster of tests is composed of six machines connected
-by $20$Gbps InfiniBand network. Each machine is a Quad-Core Xeon E5530 CPU running at $2.4$GHz and
+by a $20$Gbps InfiniBand network. Each machine is a Quad-Core Xeon E5530 CPU running at $2.4$GHz and
 providing $12$GB of RAM with a memory bandwidth of $25.6$GB/s. In addition, two Tesla C1060 GPUs are
 connected to each machine via a PCI-Express 16x Gen 2.0 interface with a throughput of $8$GB/s. A
 Tesla C1060 GPU contains $240$ cores running at $1.3$GHz and providing a global memory of $4$GB with
@@ -511,12 +512,12 @@ that we used in the experimental tests.
 
 Linux cluster version 2.6.39 OS is installed on CPUs. C programming language is used to code
 the parallel algorithms of both methods on the GPU cluster. CUDA version 4.0~\cite{ch12:ref9}
-is used to program GPUs, using CUBLAS library~\cite{ch12:ref6} to deal with vector operations
+is used to program GPUs, using the CUBLAS library~\cite{ch12:ref6} to deal with vector operations
 in GPUs and, finally, MPI routines of OpenMPI 1.3.3 are used to carry out the communications between
 CPU cores. Indeed, the experiments are done on a cluster of $12$ computing nodes, where each node
-is managed by a MPI process and it is composed of one CPU core and one GPU card.
+is managed by one MPI process and is composed of one CPU core and one GPU card.
 
-\begin{figure}[!h]
+\begin{figure}
 \centerline{\includegraphics[scale=0.25]{Chapters/chapter12/figures/cluster}}
 \caption{General scheme of the GPU cluster of tests composed of six machines, each with two GPUs.}
 \label{ch12:fig:04}
@@ -524,26 +525,34 @@ is managed by a MPI process and it is composed of one CPU core and one GPU card.
 
 All tests are made on double-precision floating point operations. The parameters of both linear
 solvers are initialized as follows: the residual tolerance threshold $\varepsilon=10^{-12}$, the
-maximum number of iterations $maxiter=500$, the right-hand side $b$ is filled with $1.0$ and the
+maximum number of iterations $maxiter=500$, the right-hand side $b$ is filled with $1.0$, and the
 initial guess $x_0$ is filled with $0.0$. In addition, we limited the Arnoldi process\index{Iterative~method!Arnoldi~process}
 used in the GMRES method to $16$ iterations ($m=16$). For the sake of simplicity, we have chosen
-the preconditioner $M$ as the main diagonal of the sparse matrix $A$. Indeed, it allows to easily
-compute the required inverse matrix $M^{-1}$ and it provides a relatively good preconditioning for
+the preconditioner $M$ as the main diagonal of the sparse matrix $A$. Indeed, it allows us to easily
+compute the required inverse matrix $M^{-1}$, and it provides a relatively good preconditioning for
 not too ill-conditioned matrices. In the GPU computing, the size of thread blocks is fixed to $512$
 threads. Finally, the performance results, presented hereafter, are obtained from the mean value
 over $10$ executions of the same parallel linear solver and for the same input data.
 
 \begin{figure}
 \centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/matrices}}
-\caption{Sketches of sparse matrices chosen from the Davis collection.}
+\caption{Sketches of sparse matrices chosen from the University of Florida collection.}
 \label{ch12:fig:05}
 \end{figure}
 
+To get more realistic results, we have tested the CG and GMRES algorithms on sparse matrices of the University of Florida
+collection~\cite{ch12:ref10}, which arise in a wide spectrum of real-world applications. We have chosen six
+symmetric sparse matrices and six nonsymmetric ones from this collection. In Figure~\ref{ch12:fig:05},
+we show the structures of these matrices, and in Table~\ref{ch12:tab:01}, we present their main characteristics,
+which are the number of rows, the total number of nonzero values, and the maximal bandwidth. In
+the present chapter, the bandwidth of a sparse matrix is defined as the number of matrix columns separating
+the first and the last nonzero value on a matrix row.
+
 \begin{table}
 \centering
 \begin{tabular}{|c|c|c|c|c|}
 \hline
-{\bf Matrix type}             & {\bf Matrix name} & {\bf \# rows} & {\bf \# nnz} & {\bf Bandwidth} \\ \hline \hline
+{\bf Matrix Type}             & {\bf Matrix Name} & {\bf \# Rows} & {\bf \# Nonzeros} & {\bf Bandwidth} \\ \hline \hline
 
 \multirow{6}{*}{Symmetric}    & 2cubes\_sphere    & $101,492$     & $1,647,264$  & $100,464$ \\
 
@@ -569,23 +578,15 @@ over $10$ executions of the same parallel linear solver and for the same input d
 
                               & torso3            & $259,156$     & $4,429,042$  & $216,854$  \\ \hline
 \end{tabular}
-\caption{Main characteristics of sparse matrices chosen from the Davis collection.}
+\caption{Main characteristics of sparse matrices chosen from the University of Florida collection.}
 \label{ch12:tab:01}
 \end{table}
 
-To get more realistic results, we have tested the CG and GMRES algorithms on sparse matrices of the Davis
-collection~\cite{ch12:ref10}, that arise in a wide spectrum of real-world applications. We have chosen six
-symmetric sparse matrices and six nonsymmetric ones from this collection. In Figure~\ref{ch12:fig:05},
-we show the structures of these matrices and in Table~\ref{ch12:tab:01} we present their main characteristics
-which are the number of rows, the total number of nonzero values (nnz) and the maximal bandwidth. In
-the present chapter, the bandwidth of a sparse matrix is defined as the number of matrix columns separating
-the first and the last nonzero value on a matrix row.
-
-\begin{table}
+\begin{table}[!h]
 \begin{center}
 \begin{tabular}{|c|c|c|c|c|c|c|} 
 \hline
-{\bf Matrix}     & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$  & $\mathbf{\# iter.}$ & $\mathbf{prec.}$     & $\mathbf{\Delta}$   \\ \hline \hline
+{\bf Matrix}     & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$  & $\mathbf{\#~Iter.}$ & $\mathbf{Prec.}$     & $\mathbf{\Delta}$   \\ \hline \hline
 
 2cubes\_sphere    & $0.132s$           & $0.069s$            & $1.93$        & $12$           & $1.14e$-$09$     & $3.47e$-$18$ \\
 
@@ -604,11 +605,11 @@ thermal2          & $1.172s$           & $0.622s$            & $1.88$        & $
 \end{center}
 \end{table}
 
-\begin{table}
+\begin{table}[!h]
 \begin{center}
 \begin{tabular}{|c|c|c|c|c|c|c|} 
 \hline
-{\bf Matrix}     & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$  & $\mathbf{\# iter.}$ & $\mathbf{prec.}$     & $\mathbf{\Delta}$   \\ \hline \hline
+{\bf Matrix}     & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$  & $\mathbf{\#~Iter.}$ & $\mathbf{Prec.}$     & $\mathbf{\Delta}$   \\ \hline \hline
 
 2cubes\_sphere    & $0.234s$           & $0.124s$            & $1.88$        & $21$           & $2.10e$-$14$     & $3.47e$-$18$ \\
 
@@ -640,13 +641,13 @@ torso3            & $4.242s$           & $2.030s$            & $2.09$        & $
 \end{table}
 
 Tables~\ref{ch12:tab:02} and~\ref{ch12:tab:03} show the performances of the parallel
-CG and GMRES solvers, respectively, for solving linear systems associated to the sparse
-matrices presented in Tables~\ref{ch12:tab:01}. They allow to compare the performances
+CG and~GMRES solvers, respectively, for solving linear systems associated to the sparse
+matrices presented in Table~\ref{ch12:tab:01}. They allow us to compare the performances
 obtained on a cluster of $24$ CPU cores and on a cluster of $12$ GPUs. However, Table~\ref{ch12:tab:02}
-only shows the performances of solving symmetric sparse linear systems, due to the inability
+shows the performances of solving only symmetric sparse linear systems, due to the inability
 of the CG method to solve the nonsymmetric systems. In both tables, the second and third
 columns give, respectively, the execution times in seconds obtained on $24$ CPU cores
-($Time_{gpu}$) and that obtained on $12$ GPUs ($Time_{gpu}$). Moreover, we take into account
+($Time_{cpu}$) and that obtained on $12$ GPUs ($Time_{gpu}$). Moreover, we take into account
 the relative gains $\tau$ of a solver implemented on the GPU cluster compared to the same
 solver implemented on the CPU cluster. The relative gains\index{Relative~gain}, presented
 in the fourth column, are computed as a ratio of the CPU execution time over the GPU
@@ -656,9 +657,9 @@ execution time:
 \label{ch12:eq:20}
 \end{equation}
 In addition, Tables~\ref{ch12:tab:02} and~\ref{ch12:tab:03} give the number of iterations
-($iter$), the precision $prec$ of the solution computed on the GPU cluster and the difference
+($iter$), the precision ($prec$) of the solution computed on the GPU cluster, and the difference
 $\Delta$ between the solution computed on the CPU cluster and that computed on the GPU cluster.
-Both parameters $prec$ and $\Delta$ allow to validate and verify the accuracy of the solution
+Both parameters $prec$ and $\Delta$ allow us to validate and verify the accuracy of the solution
 computed on the GPU cluster. We have computed them as follows:
 \begin{eqnarray}
 \Delta = max|x^{cpu}-x^{gpu}|,\\
@@ -670,35 +671,35 @@ $prec$ is the maximum element, in absolute value, of the residual vector $r^{gpu
 of the solution $x^{gpu}$. Thus, we can see that the solutions obtained on the GPU cluster
 were computed with a sufficient accuracy (about $10^{-10}$) and they are, more or less, equivalent
 to those computed on the CPU cluster with a small difference ranging from $10^{-10}$ to $10^{-26}$.
-However, we can notice from the relative gains $\tau$ that it is not interesting to use multiple
-GPUs for solving small sparse linear systems. In fact, a small sparse matrix does not allow to
+However, we can notice from the relative gains $\tau$ that it is not efficient to use multiple
+GPUs for solving small sparse linear systems. In fact, a small sparse matrix does not allow us to
 maximize utilization of GPU cores. In addition, the communications required to synchronize the
 computations over the cluster increase the idle times of GPUs and slow down  the parallel
 computations further.
 
 Consequently, in order to test the performances of the parallel solvers, we developed in C programming
-language a generator of large sparse matrices. This generator takes a matrix from the Davis collection~\cite{ch12:ref10}
-as an initial matrix to build large sparse matrices exceeding ten million of rows. It must be executed
-in parallel by the MPI processes of the computing nodes, so that each process could build its sparse
-sub-matrix. In the first experimental tests, we focused on sparse matrices having a banded structure,
+language a generator of large sparse matrices. This generator takes a matrix from the University of Florida collection~\cite{ch12:ref10}
+as an initial matrix to build large sparse matrices exceeding ten million rows. It must be executed
+in parallel by the MPI processes of the computing nodes, so that each process can build its sparse
+submatrix. In the first experimental tests, we focused on sparse matrices having a banded structure,
 because they are those arising the most in the majority of numerical problems. So to generate the global sparse matrix,
-each MPI process constructs its sub-matrix by performing several copies of an initial sparse matrix chosen
-from the Davis collection. Then, it puts all these copies on the main diagonal of the global matrix
+each MPI process constructs its submatrix by performing several copies of an initial sparse matrix chosen
+from the University of Florida collection. Then, it puts all these copies on the main diagonal of the global matrix
 (see Figure~\ref{ch12:fig:06}). Moreover, the empty spaces between two successive copies in the main
-diagonal are filled with sub-copies (left-copy and right-copy in Figure~\ref{ch12:fig:06}) of the same
+diagonal are filled with subcopies (left-copy and right-copy in Figure~\ref{ch12:fig:06}) of the same
 initial matrix.
 
-\begin{figure}[htbp]
+\begin{figure}
 \centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/generation}}
 \caption{Parallel generation of a large sparse matrix by four computing nodes.}
 \label{ch12:fig:06}
 \end{figure}
 
-\begin{table}[htbp]
+\begin{table}[!h]
 \centering
 \begin{tabular}{|c|c|c|c|}
 \hline
-{\bf Matrix type}             & {\bf Matrix name} & {\bf \# nnz} & {\bf Bandwidth} \\ \hline \hline
+{\bf Matrix Type}             & {\bf Matrix Name} & {\bf \# Nonzeros} & {\bf Bandwidth} \\ \hline \hline
 
 \multirow{6}{*}{Symmetric}    & 2cubes\_sphere    & $413,703,602$ & $198,836$     \\
 
@@ -725,15 +726,28 @@ initial matrix.
                               & torso3            & $433,795,264$ & $328,757$        \\ \hline
 \end{tabular}
 \vspace{0.5cm}
-\caption{Main characteristics of sparse banded matrices generated from those of the Davis collection.}
+\caption{Main characteristics of sparse banded matrices generated from those of the University of Florida collection.}
 \label{ch12:tab:04}
 \end{table}
 
-\begin{table}[htbp]
+We have used the parallel CG and GMRES algorithms for solving sparse linear systems of $25$
+million unknown values. The sparse matrices associated to these linear systems are generated
+from those presented in Table~\ref{ch12:tab:01}. Their main characteristics are given in Table~\ref{ch12:tab:04}.
+Tables~\ref{ch12:tab:05} and~\ref{ch12:tab:06} show the performances of the parallel CG and
+GMRES solvers, respectively, obtained on a cluster of $24$ CPU cores and on a cluster of $12$
+GPUs. Obviously, we can notice from these tables that solving large sparse linear systems on
+a GPU cluster is more efficient than on a CPU cluster (see relative gains $\tau$). We can also
+notice that the execution times of the CG method, whether in a CPU cluster or in a GPU cluster,
+are better than those of the GMRES method for solving large symmetric linear systems. In fact, the
+CG method is characterized by a better convergence\index{Convergence} rate and a shorter execution
+time of an iteration than those of the GMRES method. Moreover, an iteration of the parallel GMRES
+method requires more data exchanges between computing nodes compared to the parallel CG method.
+
+\begin{table}[!h]
 \begin{center}
 \begin{tabular}{|c|c|c|c|c|c|c|} 
 \hline
-{\bf Matrix}    & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{\# iter.}$ & $\mathbf{prec.}$ & $\mathbf{\Delta}$   \\ \hline \hline
+{\bf Matrix}    & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{\#~Iter.}$ & $\mathbf{Prec.}$ & $\mathbf{\Delta}$   \\ \hline \hline
 
 2cubes\_sphere  & $1.625s$             & $0.401s$              & $4.05$          & $14$                & $5.73e$-$11$     & $5.20e$-$18$ \\
 
@@ -753,11 +767,11 @@ on a cluster of 12 GPUs.}
 \end{center}
 \end{table}
 
-\begin{table}
+\begin{table}[!h]
 \begin{center}
 \begin{tabular}{|c|c|c|c|c|c|c|} 
 \hline
-{\bf Matrix}      & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{\# iter.}$ & $\mathbf{prec.}$ & $\mathbf{\Delta}$   \\ \hline \hline
+{\bf Matrix}      & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{\#~Iter.}$ & $\mathbf{Prec.}$ & $\mathbf{\Delta}$   \\ \hline \hline
 
 2cubes\_sphere    & $3.597s$             & $0.514s$              & $6.99$          & $21$                & $2.11e$-$14$     & $8.67e$-$18$ \\
 
@@ -787,22 +801,7 @@ torso3            & $31.463s$            & $3.681s$              & $8.55$
 on a cluster of 12 GPUs.}
 \label{ch12:tab:06}
 \end{center}
-\end{table}
-
-
-We have used the parallel CG and GMRES algorithms for solving sparse linear systems of $25$
-million unknown values. The sparse matrices associated to these linear systems are generated
-from those presented in Table~\ref{ch12:tab:01}. Their main characteristics are given in Table~\ref{ch12:tab:04}.
-Tables~\ref{ch12:tab:05} and~\ref{ch12:tab:06} shows the performances of the parallel CG and
-GMRES solvers, respectively, obtained on a cluster of $24$ CPU cores and on a cluster of $12$
-GPUs. Obviously, we can notice from these tables that solving large sparse linear systems on
-a GPU cluster is more efficient than on a CPU cluster (see relative gains $\tau$). We can also
-notice that the execution times of the CG method, whether in a CPU cluster or in a GPU cluster,
-are better than those of the GMRES method for solving large symmetric linear systems. In fact, the
-CG method is characterized by a better convergence\index{Convergence} rate and a shorter execution
-time of an iteration than those of the GMRES method. Moreover, an iteration of the parallel GMRES
-method requires more data exchanges between computing nodes compared to the parallel CG method.
- 
+\end{table} 
 
 %%--------------------------%%
 %%       SECTION 5          %%
@@ -812,7 +811,7 @@ method requires more data exchanges between computing nodes compared to the para
 In this chapter, we have aimed at harnessing the computing power of a
 cluster of GPUs for solving large sparse linear systems. For this, we
 have used two Krylov subspace iterative methods: the CG and GMRES methods.
-The first method is well-known for its efficiency to solve symmetric
+The first method is well known for its efficiency in solving symmetric
 linear systems and the second one is used, particularly, to solve
 nonsymmetric linear systems. 
 
@@ -820,28 +819,28 @@ We have presented the parallel implementation of both iterative methods
 on a GPU cluster. Particularly, the operations dealing with the vectors
 and/or matrices, of these methods, are parallelized between the different
 GPU computing nodes of the cluster. Indeed, the data-parallel vector operations
-are accelerated by GPUs and the communications required to synchronize the
+are accelerated by GPUs, and the communications required to synchronize the
 parallel computations are carried out by CPU cores. For this, we have used
-a heterogeneous CUDA/MPI programming to implement the parallel iterative
+heterogeneous CUDA/MPI programming to implement the parallel iterative
 algorithms.
 
 In the experimental tests, we have shown that using a GPU cluster is efficient
 for solving linear systems associated to very large sparse matrices. The experimental
-results, obtained in the present chapter, show that a cluster of $12$ GPUs is
+results, discussed in the present chapter, show that a cluster of $12$ GPUs is
 about $7$ times faster than a cluster of $24$ CPU cores for solving large sparse
-linear systems of $25$ million unknown values. This is due to the GPU ability to
+linear systems of $25$ million unknown values. This is due to the GPUs' ability to
 compute the data-parallel operations faster than the CPUs.
 
-In our future works, we plan to test the parallel algorithms of CG and GMRES methods, adapted
+In our future work, we plan to test the parallel algorithms of the CG and~GMRES methods, adapted
 to GPUs, for solving large linear systems associated to sparse matrices of different structures.
-For example, the matrices having large bandwidths, which can lead to many data dependencies
+For example, the matrices having large bandwidths can lead to many data dependencies
 between the computing nodes and, thus, degrade the performances of both algorithms. So in
 this case, it would be interesting to study the different data partitioning techniques, in
 order to minimize the dependencies between the computing nodes and thus to reduce the total
 communication volume. This may improve the performances of both algorithms implemented on
 a GPU cluster. Moreover, in the recent GPU hardware and software architectures, the GPU-Direct
 system with CUDA version 5.0 is used so that two GPUs located on the same node or on distant
-nodes can communicate between them directly without CPUs. This allows to improve the data
+nodes can communicate with each other directly without CPUs. This allows us to improve the data
 transfers between GPUs.          
    
   
diff --git a/BookGPU/Chapters/chapter12/ch12.tex~ b/BookGPU/Chapters/chapter12/ch12.tex~
deleted file mode 100755
index eaa4f9c..0000000
--- a/BookGPU/Chapters/chapter12/ch12.tex~
+++ /dev/null
@@ -1,1232 +0,0 @@
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%                          %%
-%%       CHAPTER 12         %%
-%%                          %%
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
- 
-%\chapterauthor{}{}
-\chapterauthor{Lilia Ziane Khodja}{Femto-ST Institute, University of Franche-Comte, France}
-\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte, France}
-\chapterauthor{Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France}
-
-\chapter{Solving sparse linear systems with GMRES and CG methods on GPU clusters}
-\label{ch12}
-
-%%--------------------------%%
-%%       SECTION 1          %%
-%%--------------------------%%
-\section{Introduction}
-\label{ch12:sec:01}
-The sparse linear systems are used to model many scientific and industrial problems,
-such as the environmental simulations or the industrial processing of the complex or
-non-Newtonian fluids. Moreover, the resolution of these problems often involves the
-solving of such linear systems which is considered as the most expensive process in
-terms of execution time and memory space. Therefore, solving sparse linear systems
-must be as efficient as possible in order to deal with problems of ever increasing
-size.
-
-There are, in the jargon of numerical analysis, different methods of solving sparse
-linear systems that can be classified in two classes: the direct and iterative methods.
-However, the iterative methods are often more suitable than their counterpart, direct
-methods, for solving these systems. Indeed, they are less memory consuming and easier
-to parallelize on parallel computers than direct methods. Different computing platforms,
-sequential and parallel computers, are used for solving sparse linear systems with iterative
-solutions. Nowadays, graphics processing units (GPUs) have become attractive for solving
-these systems, due to their computing power and their ability to compute faster than
-traditional CPUs.
-
-In Section~\ref{ch12:sec:02}, we describe the general principle of two well-known iterative
-methods: the conjugate gradient method and the generalized minimal residual method. In Section~\ref{ch12:sec:03},
-we give the main key points of the parallel implementation of both methods on a cluster of
-GPUs. Then, in Section~\ref{ch12:sec:04}, we present the experimental results obtained on a
-CPU cluster and on a GPU cluster, for solving sparse linear systems associated to matrices
-of different structures. Finally, in Section~\ref{ch12:sec:05}, we apply the hypergraph partitioning
-technique to reduce the total communication volume between the computing nodes and, thus,
-to improve the execution times of the parallel algorithms of both iterative methods.   
-
-
-%%--------------------------%%
-%%       SECTION 2          %%
-%%--------------------------%%
-\section{Krylov iterative methods}
-\label{ch12:sec:02}
-Let us consider the following system of $n$ linear equations\index{Sparse~linear~system}
-in $\mathbb{R}$: 
-\begin{equation}
-Ax=b,
-\label{ch12:eq:01}
-\end{equation}
-where $A\in\mathbb{R}^{n\times n}$ is a sparse nonsingular square matrix, $x\in\mathbb{R}^{n}$
-is the solution vector, $b\in\mathbb{R}^{n}$ is the right-hand side and $n\in\mathbb{N}$ is a
-large integer number. 
-
-The iterative methods\index{Iterative~method} for solving the large sparse linear system~(\ref{ch12:eq:01})
-proceed by successive iterations of a same block of elementary operations, during which an
-infinite number of approximate solutions $\{x_k\}_{k\geq 0}$ are computed. Indeed, from an
-initial guess $x_0$, an iterative method determines at each iteration $k>0$ an approximate
-solution $x_k$ which, gradually, converges to the exact solution $x^{*}$ as follows:
-\begin{equation}
-x^{*}=\lim\limits_{k\to\infty}x_{k}=A^{-1}b.
-\label{ch12:eq:02}
-\end{equation}
-The number of iterations necessary to reach the exact solution $x^{*}$ is not known beforehand
-and can be infinite. In practice, an iterative method often finds an approximate solution $\tilde{x}$
-after a fixed number of iterations and/or when a given convergence criterion\index{Convergence}
-is satisfied as follows:   
-\begin{equation}
-\|b-A\tilde{x}\| < \varepsilon,
-\label{ch12:eq:03}
-\end{equation}
-where $\varepsilon<1$ is the required convergence tolerance threshold\index{Convergence!Tolerance~threshold}. 
-
-Some of the most iterative methods that have proven their efficiency for solving large sparse
-linear systems are those called \textit{Krylov subspace methods}~\cite{ch12:ref1}\index{Iterative~method!Krylov~subspace}.
-In the present chapter, we describe two Krylov methods which are widely used: the conjugate
-gradient method (CG) and the generalized minimal residual method (GMRES). In practice, the
-Krylov subspace methods are usually used with preconditioners that allow to improve their
-convergence. So, in what follows, the CG and GMRES methods are used for solving the left-preconditioned\index{Sparse~linear~system!Preconditioned}
-sparse linear system:
-\begin{equation}
-M^{-1}Ax=M^{-1}b,
-\label{ch12:eq:11}
-\end{equation}
-where $M$ is the preconditioning matrix.
-
-
-%%****************%%
-%%****************%%
-\subsection{CG method}
-\label{ch12:sec:02.01}
-The conjugate gradient method is initially developed by Hestenes and Stiefel in 1952~\cite{ch12:ref2}.
-It is one of the well known iterative method for solving large sparse linear systems. In addition, it
-can be adapted for solving nonlinear equations and optimization problems. However, it can only be applied
-to problems with positive definite symmetric matrices.
-
-The main idea of the CG method\index{Iterative~method!CG} is the computation of a sequence of approximate
-solutions $\{x_k\}_{k\geq 0}$ in a Krylov subspace\index{Iterative~method!Krylov~subspace} of order $k$ as
-follows: 
-\begin{equation}
-x_k \in x_0 + \mathcal{K}_k(A,r_0),
-\label{ch12:eq:04}
-\end{equation}
-such that the Galerkin condition\index{Galerkin~condition} must be satisfied:
-\begin{equation}
-r_k \bot \mathcal{K}_k(A,r_0),
-\label{ch12:eq:05}
-\end{equation}
-where $x_0$ is the initial guess, $r_k=b-Ax_k$ is the residual of the computed solution $x_k$ and $\mathcal{K}_k$
-the Krylov subspace of order $k$: \[\mathcal{K}_k(A,r_0) \equiv\text{span}\{r_0, Ar_0, A^2r_0,\ldots, A^{k-1}r_0\}.\]
-In fact, CG is based on the construction of a sequence $\{p_k\}_{k\in\mathbb{N}}$ of direction vectors in $\mathcal{K}_k$
-which are pairwise $A$-conjugate ($A$-orthogonal):
-\begin{equation}
-\begin{array}{ll}
-p_i^T A p_j = 0, & i\neq j. 
-\end{array} 
-\label{ch12:eq:06}
-\end{equation}
-At each iteration $k$, an approximate solution $x_k$ is computed by recurrence as follows:  
-\begin{equation}
-\begin{array}{ll}
-x_k = x_{k-1} + \alpha_k p_k, & \alpha_k\in\mathbb{R}.
-\end{array} 
-\label{ch12:eq:07}
-\end{equation}
-Consequently, the residuals $r_k$ are computed in the same way:
-\begin{equation}
-r_k = r_{k-1} - \alpha_k A p_k. 
-\label{ch12:eq:08}
-\end{equation}
-In the case where all residuals are nonzero, the direction vectors $p_k$ can be determined so that
-the following recurrence holds:
-\begin{equation}
-\begin{array}{lll}
-p_0=r_0, & p_k=r_k+\beta_k p_{k-1}, & \beta_k\in\mathbb{R}.
-\end{array} 
-\label{ch12:eq:09}
-\end{equation}
-Moreover, the scalars $\{\alpha_k\}_{k>0}$ are chosen so as to minimize the $A$-norm error $\|x^{*}-x_k\|_A$
-over the Krylov subspace $\mathcal{K}_{k}$ and the scalars $\{\beta_k\}_{k>0}$ are chosen so as to ensure
-that the direction vectors are pairwise $A$-conjugate. So, the assumption that matrix $A$ is symmetric and
-the recurrences~(\ref{ch12:eq:08}) and~(\ref{ch12:eq:09}) allow to deduce that:
-\begin{equation}
-\begin{array}{ll}
-\alpha_{k}=\frac{r^{T}_{k-1}r_{k-1}}{p_{k}^{T}Ap_{k}}, & \beta_{k}=\frac{r_{k}^{T}r_{k}}{r_{k-1}^{T}r_{k-1}}.
-\end{array}
-\label{ch12:eq:10}
-\end{equation}
-
-\begin{algorithm}[!t]
-  Choose an initial guess $x_0$\;
-  $r_{0} = b - A x_{0}$\;
-  $convergence$ = false\;
-  $k = 1$\;
-  \Repeat{convergence}{
-    $z_{k} = M^{-1} r_{k-1}$\;
-    $\rho_{k} = (r_{k-1},z_{k})$\;
-    \eIf{$k = 1$}{
-      $p_{k} = z_{k}$\;
-    }{
-      $\beta_{k} = \rho_{k} / \rho_{k-1}$\;
-      $p_{k} = z_{k} + \beta_{k} \times p_{k-1}$\;
-    }
-    $q_{k} = A \times p_{k}$\;
-    $\alpha_{k} = \rho_{k} / (p_{k},q_{k})$\;
-    $x_{k} = x_{k-1} + \alpha_{k} \times p_{k}$\;
-    $r_{k} = r_{k-1} - \alpha_{k} \times q_{k}$\;
-    \eIf{$(\rho_{k} < \varepsilon)$ {\bf or} $(k \geq maxiter)$}{
-      $convergence$ = true\;
-    }{
-      $k = k + 1$\;
-    }
-  }
-\caption{Left-preconditioned CG method}
-\label{ch12:alg:01}
-\end{algorithm}
-
-Algorithm~\ref{ch12:alg:01} shows the main key points of the preconditioned CG method. It allows
-to solve the left-preconditioned\index{Sparse~linear~system!Preconditioned} sparse linear system~(\ref{ch12:eq:11}).
-In this algorithm, $\varepsilon$ is the convergence tolerance threshold, $maxiter$ is the maximum
-number of iterations and $(\cdot,\cdot)$ defines the dot product between two vectors in $\mathbb{R}^{n}$.
-At every iteration, a direction vector $p_k$ is determined, so that it is orthogonal to the preconditioned
-residual $z_k$ and to the direction vectors $\{p_i\}_{i<k}$ previously determined (from line~$8$ to
-line~$13$). Then, at lines~$16$ and~$17$, the iterate $x_k$ and the residual $r_k$ are computed using
-formulas~(\ref{ch12:eq:07}) and~(\ref{ch12:eq:08}), respectively. The CG method converges after, at
-most, $n$ iterations. In practice, the CG algorithm stops when the tolerance threshold\index{Convergence!Tolerance~threshold}
-$\varepsilon$ and/or the maximum number of iterations\index{Convergence!Maximum~number~of~iterations}
-$maxiter$ are reached.
-
-
-%%****************%%
-%%****************%%
-\subsection{GMRES method} 
-\label{ch12:sec:02.02}
-The iterative GMRES method is developed by Saad and Schultz in 1986~\cite{ch12:ref3} as a generalization
-of the minimum residual method MINRES~\cite{ch12:ref4}\index{Iterative~method!MINRES}. Indeed, GMRES can
-be applied for solving symmetric or nonsymmetric linear systems. 
-
-The main principle of the GMRES method\index{Iterative~method!GMRES} is to find an approximation minimizing
-at best the residual norm. In fact, GMRES computes a sequence of approximate solutions $\{x_k\}_{k>0}$ in
-a Krylov subspace\index{Iterative~method!Krylov~subspace} $\mathcal{K}_k$ as follows:
-\begin{equation}
-\begin{array}{ll}
-x_k \in x_0 + \mathcal{K}_k(A, v_1),& v_1=\frac{r_0}{\|r_0\|_2},
-\end{array}
-\label{ch12:eq:12}
-\end{equation} 
-so that the Petrov-Galerkin condition\index{Petrov-Galerkin~condition} is satisfied:
-\begin{equation}
-\begin{array}{ll}
-r_k \bot A \mathcal{K}_k(A, v_1).
-\end{array}
-\label{ch12:eq:13}
-\end{equation}
-GMRES uses the Arnoldi process~\cite{ch12:ref5}\index{Iterative~method!Arnoldi~process} to construct an
-orthonormal basis $V_k$ for the Krylov subspace $\mathcal{K}_k$ and an upper Hessenberg matrix\index{Hessenberg~matrix}
-$\bar{H}_k$ of order $(k+1)\times k$:
-\begin{equation}
-\begin{array}{ll}
-V_k = \{v_1, v_2,\ldots,v_k\}, & \forall k>1, v_k=A^{k-1}v_1,
-\end{array}
-\label{ch12:eq:14}
-\end{equation}
-and
-\begin{equation}
-V_k A = V_{k+1} \bar{H}_k.
-\label{ch12:eq:15}
-\end{equation}
-
-Then, at each iteration $k$, an approximate solution $x_k$ is computed in the Krylov subspace $\mathcal{K}_k$
-spanned by $V_k$ as follows:
-\begin{equation}
-\begin{array}{ll}
-x_k = x_0 + V_k y, & y\in\mathbb{R}^{k}.
-\end{array}
-\label{ch12:eq:16}
-\end{equation}
-From both formulas~(\ref{ch12:eq:15}) and~(\ref{ch12:eq:16}) and $r_k=b-Ax_k$, we can deduce that:
-\begin{equation}
-\begin{array}{lll}
-  r_{k} & = & b - A (x_{0} + V_{k}y) \\
-        & = & r_{0} - AV_{k}y \\
-        & = & \beta v_{1} - V_{k+1}\bar{H}_{k}y \\
-        & = & V_{k+1}(\beta e_{1} - \bar{H}_{k}y),
-\end{array}
-\label{ch12:eq:17}
-\end{equation}
-such that $\beta=\|r_0\|_2$ and $e_1=(1,0,\cdots,0)$ is the first vector of the canonical basis of
-$\mathbb{R}^k$. So, the vector $y$ is chosen in $\mathbb{R}^k$ so as to minimize at best the Euclidean
-norm of the residual $r_k$. Consequently, a linear least-squares problem of size $k$ is solved:
-\begin{equation}
-\underset{y\in\mathbb{R}^{k}}{min}\|r_{k}\|_{2}=\underset{y\in\mathbb{R}^{k}}{min}\|\beta e_{1}-\bar{H}_{k}y\|_{2}.
-\label{ch12:eq:18}
-\end{equation}
-The QR factorization of matrix $\bar{H}_k$ is used to compute the solution of this problem by using
-Givens rotations~\cite{ch12:ref1,ch12:ref3}, such that:
-\begin{equation}
-\begin{array}{lll}
-\bar{H}_{k}=Q_{k}R_{k}, & Q_{k}\in\mathbb{R}^{(k+1)\times (k+1)}, & R_{k}\in\mathbb{R}^{(k+1)\times k},
-\end{array}
-\label{ch12:eq:19}
-\end{equation}
-where $Q_kQ_k^T=I_{k+1}$ and $R_k$ is an upper triangular matrix.
-
-The GMRES method computes an approximate solution with a sufficient precision after, at most, $n$
-iterations ($n$ is the size of the sparse linear system to be solved). However, the GMRES algorithm
-must construct and store in the memory an orthonormal basis $V_k$ whose size is proportional to the
-number of iterations required to achieve the convergence. Then, to avoid a huge memory storage, the
-GMRES method must be restarted every $m$ iterations, such that $m$ is very small ($m\ll n$), and
-with $x_m$ as the initial guess for the next iteration. This limits the size of the basis
-$V$ to $m$ orthogonal vectors.
-
-\begin{algorithm}[!t]
-  Choose an initial guess $x_0$\;
-  $convergence$ = false\;
-  $k = 1$\;
-  $r_{0} = M^{-1}(b-Ax_{0})$\;
-  $\beta = \|r_{0}\|_{2}$\;
-  \While{$\neg convergence$}{
-    $v_{1} = r_{0}/\beta$\;
-    \For{$j=1$ \KwTo $m$}{ 
-      $w_{j} = M^{-1}Av_{j}$\;
-      \For{$i=1$ \KwTo $j$}{
-        $h_{i,j} = (w_{j},v_{i})$\;
-        $w_{j} = w_{j}-h_{i,j}v_{i}$\;
-      }
-      $h_{j+1,j} = \|w_{j}\|_{2}$\;
-      $v_{j+1} = w_{j}/h_{j+1,j}$\;
-    }
-    Set $V_{m}=\{v_{j}\}_{1\leq j \leq m}$ and $\bar{H}_{m}=(h_{i,j})$ a $(m+1)\times m$ upper Hessenberg matrix\;
-    Solve a least-squares problem of size $m$: $min_{y\in\mathrm{I\!R}^{m}}\|\beta e_{1}-\bar{H}_{m}y\|_{2}$\;
-    $x_{m} = x_{0}+V_{m}y_{m}$\;
-    $r_{m} = M^{-1}(b-Ax_{m})$\;
-    $\beta = \|r_{m}\|_{2}$\;   
-    \eIf{ $(\beta<\varepsilon)$ {\bf or} $(k\geq maxiter)$}{
-      $convergence$ = true\;
-    }{
-      $x_{0} = x_{m}$\;
-      $r_{0} = r_{m}$\;
-      $k = k + 1$\;
-    }
-  }
-\caption{Left-preconditioned GMRES method with restarts}
-\label{ch12:alg:02}
-\end{algorithm}
-
-Algorithm~\ref{ch12:alg:02} shows the main key points of the GMRES method with restarts.
-It solves the left-preconditioned\index{Sparse~linear~system!Preconditioned} sparse linear
-system~(\ref{ch12:eq:11}), such that $M$ is the preconditioning matrix. At each iteration
-$k$, GMRES uses the Arnoldi process\index{Iterative~method!Arnoldi~process} (defined from
-line~$7$ to line~$17$) to construct a basis $V_m$ of $m$ orthogonal vectors and an upper
-Hessenberg matrix\index{Hessenberg~matrix} $\bar{H}_m$ of size $(m+1)\times m$. Then, it
-solves the linear least-squares problem of size $m$ to find the vector $y\in\mathbb{R}^{m}$
-which minimizes at best the residual norm (line~$18$). Finally, it computes an approximate
-solution $x_m$ in the Krylov subspace spanned by $V_m$ (line~$19$). The GMRES algorithm is
-stopped when the residual norm is sufficiently small ($\|r_m\|_2<\varepsilon$) and/or the
-maximum number of iterations\index{Convergence!Maximum~number~of~iterations} ($maxiter$)
-is reached.
-
-
-%%--------------------------%%
-%%       SECTION 3          %%
-%%--------------------------%%
-\section{Parallel implementation on a GPU cluster}
-\label{ch12:sec:03}
-In this section, we present the parallel algorithms of both iterative CG\index{Iterative~method!CG}
-and GMRES\index{Iterative~method!GMRES} methods for GPU clusters. The implementation is performed on
-a GPU cluster composed of different computing nodes, such that each node is a CPU core managed by a
-MPI process and equipped with a GPU card. The parallelization of these algorithms is carried out by
-using the MPI communication routines between the GPU computing nodes\index{Computing~node} and the
-CUDA programming environment inside each node. In what follows, the algorithms of the iterative methods
-are called iterative solvers.
-
-
-%%****************%%
-%%****************%%
-\subsection{Data partitioning}
-\label{ch12:sec:03.01}
-The parallel solving of the large sparse linear system~(\ref{ch12:eq:11}) requires a data partitioning
-between the computing nodes of the GPU cluster. Let $p$ denote the number of the computing nodes on the
-GPU cluster. The partitioning operation consists in the decomposition of the vectors and matrices, involved
-in the iterative solver, in $p$ portions. Indeed, this operation allows to assign to each computing node
-$i$:
-\begin{itemize}
-\item a portion of size $\frac{n}{p}$ elements of each vector,
-\item a sparse rectangular sub-matrix $A_i$ of size $(\frac{n}{p},n)$ and,
-\item a square preconditioning sub-matrix $M_i$ of size $(\frac{n}{p},\frac{n}{p})$, 
-\end{itemize} 
-where $n$ is the size of the sparse linear system to be solved. In the first instance, we perform a naive
-row-wise partitioning (decomposition row-by-row) on the data of the sparse linear systems to be solved.
-Figure~\ref{ch12:fig:01} shows an example of a row-wise data partitioning between four computing nodes
-of a sparse linear system (sparse matrix $A$, solution vector $x$ and right-hand side $b$) of size $16$
-unknown values. 
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.35]{Chapters/chapter12/figures/partition}}
-\caption{A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.}
-\label{ch12:fig:01}
-\end{figure}
-
-
-%%****************%%
-%%****************%%
-\subsection{GPU computing}
-\label{ch12:sec:03.02}
-After the partitioning operation, all the data involved from this operation must be
-transferred from the CPU memories to the GPU memories, in order to be processed by
-GPUs. We use two functions of the CUBLAS\index{CUBLAS} library (CUDA Basic Linear
-Algebra Subroutines), developed by Nvidia~\cite{ch12:ref6}: \verb+cublasAlloc()+
-for the memory allocations on GPUs and \verb+cublasSetVector()+ for the memory
-copies from the CPUs to the GPUs.
-
-An efficient implementation of CG and GMRES solvers on a GPU cluster requires to
-determine all parts of their codes that can be executed in parallel and, thus, take
-advantage of the GPU acceleration. As many Krylov subspace methods, the CG and GMRES
-methods are mainly based on arithmetic operations dealing with vectors or matrices:
-sparse matrix-vector multiplications, scalar-vector multiplications, dot products,
-Euclidean norms, AXPY operations ($y\leftarrow ax+y$ where $x$ and $y$ are vectors
-and $a$ is a scalar) and so on. These vector operations are often easy to parallelize
-and they are more efficient on parallel computers when they work on large vectors.
-Therefore, all the vector operations used in CG and GMRES solvers must be executed
-by the GPUs as kernels.
-
-We use the kernels of the CUBLAS library to compute some vector operations of CG and
-GMRES solvers. The following kernels of CUBLAS (dealing with double floating point)
-are used: \verb+cublasDdot()+ for the dot products, \verb+cublasDnrm2()+ for the
-Euclidean norms and \verb+cublasDaxpy()+ for the AXPY operations. For the rest of
-the data-parallel operations, we code their kernels in CUDA. In the CG solver, we
-develop a kernel for the XPAY operation ($y\leftarrow x+ay$) used at line~$12$ in
-Algorithm~\ref{ch12:alg:01}. In the GMRES solver, we program a kernel for the scalar-vector
-multiplication (lines~$7$ and~$15$ in Algorithm~\ref{ch12:alg:02}), a kernel for
-solving the least-squares problem and a kernel for the elements updates of the solution
-vector $x$.
-
-The least-squares problem in the GMRES method is solved by performing a QR factorization
-on the Hessenberg matrix\index{Hessenberg~matrix} $\bar{H}_m$ with plane rotations and,
-then, solving the triangular system by backward substitutions to compute $y$. Consequently,
-solving the least-squares problem on the GPU is not interesting. Indeed, the triangular
-solves are not easy to parallelize and inefficient on GPUs. However, the least-squares
-problem to solve in the GMRES method with restarts has, generally, a very small size $m$.
-Therefore, we develop an inexpensive kernel which must be executed sequentially by a
-single CUDA thread. 
-
-The most important operation in CG\index{Iterative~method!CG} and GMRES\index{Iterative~method!GMRES}
-methods is the sparse matrix-vector multiplication (SpMV)\index{SpMV~multiplication},
-because it is often an expensive operation in terms of execution time and memory space.
-Moreover, it requires to take care of the storage format of the sparse matrix in the
-memory. Indeed, the naive storage, row-by-row or column-by-column, of a sparse matrix
-can cause a significant waste of memory space and execution time. In addition, the sparsity
-nature of the matrix often leads to irregular memory accesses to read the matrix nonzero
-values. So, the computation of the SpMV multiplication on GPUs can involve non coalesced
-accesses to the global memory, which slows down even more its performances. One of the
-most efficient compressed storage formats\index{Compressed~storage~format} of sparse
-matrices on GPUs is HYB\index{Compressed~storage~format!HYB} format~\cite{ch12:ref7}.
-It is a combination of ELLpack (ELL) and Coordinate (COO) formats. Indeed, it stores
-a typical number of nonzero values per row in ELL\index{Compressed~storage~format!ELL}
-format and remaining entries of exceptional rows in COO format. It combines the efficiency
-of ELL due to the regularity of its memory accesses and the flexibility of COO\index{Compressed~storage~format!COO}
-which is insensitive to the matrix structure. Consequently, we use the HYB kernel~\cite{ch12:ref8}
-developed by Nvidia to implement the SpMV multiplication of CG and GMRES methods on GPUs.
-Moreover, to avoid the non coalesced accesses to the high-latency global memory, we fill
-the elements of the iterate vector $x$ in the cached texture memory.
-
-
-%%****************%%
-%%****************%%
-\subsection{Data communications}
-\label{ch12:sec:03.03}
-All the computing nodes of the GPU cluster execute in parallel the same iterative solver
-(Algorithm~\ref{ch12:alg:01} or Algorithm~\ref{ch12:alg:02}) adapted to GPUs, but on their
-own portions of the sparse linear system\index{Sparse~linear~system}: $M^{-1}_iA_ix_i=M^{-1}_ib_i$,
-$0\leq i<p$. However, in order to solve the complete sparse linear system~(\ref{ch12:eq:11}),
-synchronizations must be performed between the local computations of the computing nodes over
-the cluster. In what follows, two computing nodes sharing data are called neighboring nodes\index{Neighboring~node}.
-
-As already mentioned, the most important operation of CG and GMRES methods is the SpMV multiplication.
-In the parallel implementation of the iterative methods, each computing node $i$ performs the
-SpMV multiplication on its own sparse rectangular sub-matrix $A_i$. Locally, it has only sub-vectors
-of size $\frac{n}{p}$ corresponding to rows of its sub-matrix $A_i$. However, it also requires
-the vector elements of its neighbors, corresponding to the column indices on which its sub-matrix
-has nonzero values (see Figure~\ref{ch12:fig:01}). So, in addition to the local vectors, each
-node must also manage vector elements shared with neighbors and required to compute the SpMV
-multiplication. Therefore, the iterate vector $x$ managed by each computing node is composed
-of a local sub-vector $x^{local}$ of size $\frac{n}{p}$ and a sub-vector of shared elements $x^{shared}$.
-In the same way, the vector used to construct the orthonormal basis of the Krylov subspace (vectors
-$p$ and $v$ in CG and GMRES methods, respectively) is composed of a local sub-vector and a shared
-sub-vector. 
-
-Therefore, before computing the SpMV multiplication\index{SpMV~multiplication}, the neighboring
-nodes\index{Neighboring~node} over the GPU cluster must exchange between them the shared vector
-elements necessary to compute this multiplication. First, each computing node determines, in its
-local sub-vector, the vector elements needed by other nodes. Then, the neighboring nodes exchange
-between them these shared vector elements. The data exchanges are implemented by using the MPI
-point-to-point communication routines: blocking\index{MPI~subroutines!Blocking} sends with \verb+MPI_Send()+
-and nonblocking\index{MPI~subroutines!Nonblocking} receives with \verb+MPI_Irecv()+. Figure~\ref{ch12:fig:02}
-shows an example of data exchanges between \textit{Node 1} and its neighbors \textit{Node 0}, \textit{Node 2}
-and \textit{Node 3}. In this example, the sparse matrix $A$ split between these four computing
-nodes is that presented in Figure~\ref{ch12:fig:01}.
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/compress}}
-\caption{Data exchanges between \textit{Node 1} and its neighbors \textit{Node 0}, \textit{Node 2} and \textit{Node 3}.}
-\label{ch12:fig:02}
-\end{figure}
-
-After the synchronization operation, the computing nodes receive, from their respective neighbors,
-the shared elements in a sub-vector stored in a compressed format. However, in order to compute the
-SpMV multiplication, the computing nodes operate on sparse global vectors (see Figure~\ref{ch12:fig:02}).
-In this case, the received vector elements must be copied to the corresponding indices in the global
-vector. So as not to need to perform this at each iteration, we propose to reorder the columns of
-each sub-matrix $\{A_i\}_{0\leq i<p}$, so that the shared sub-vectors could be used in their compressed
-storage formats. Figure~\ref{ch12:fig:03} shows a reordering of a sparse sub-matrix (sub-matrix of
-\textit{Node 1}). 
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.35]{Chapters/chapter12/figures/reorder}}
-\caption{Columns reordering of a sparse sub-matrix.}
-\label{ch12:fig:03}
-\end{figure}
-
-A GPU cluster\index{GPU~cluster} is a parallel platform with a distributed memory. So, the synchronizations
-and data communications between GPU nodes are carried out by passing messages. However, GPUs cannot communicate
-directly with each other. Hence, CPUs, via MPI processes, are in charge of the synchronizations within the GPU
-cluster. Consequently, the vector elements to be exchanged must be copied from the GPU memory to the CPU memory
-and vice-versa before and after the synchronization operation between CPUs. We have used the CUBLAS\index{CUBLAS}
-communication subroutines to perform the data transfers between a CPU core and its GPU: \verb+cublasGetVector()+
-and \verb+cublasSetVector()+. Finally, in addition to the data exchanges, GPU nodes perform reduction operations
-to compute in parallel the dot products and Euclidean norms. This is implemented by using the MPI global communication\index{MPI~subroutines!Global}
-\verb+MPI_Allreduce()+.
-
-
-
-%%--------------------------%%
-%%       SECTION 4          %%
-%%--------------------------%%
-\section{Experimental results}
-\label{ch12:sec:04}
-In this section, we present the performances of the parallel CG and GMRES linear solvers obtained
-on a cluster of $12$ GPUs. Indeed, this GPU cluster of tests is composed of six machines connected
-by $20$Gbps InfiniBand network. Each machine is a Quad-Core Xeon E5530 CPU running at $2.4$GHz and
-providing $12$GB of RAM with a memory bandwidth of $25.6$GB/s. In addition, two Tesla C1060 GPUs are
-connected to each machine via a PCI-Express 16x Gen 2.0 interface with a throughput of $8$GB/s. A
-Tesla C1060 GPU contains $240$ cores running at $1.3$GHz and providing a global memory of $4$GB with
-a memory bandwidth of $102$GB/s. Figure~\ref{ch12:fig:04} shows the general scheme of the GPU cluster\index{GPU~cluster}
-that we used in the experimental tests.
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.25]{Chapters/chapter12/figures/cluster}}
-\caption{General scheme of the GPU cluster of tests composed of six machines, each with two GPUs.}
-\label{ch12:fig:04}
-\end{figure}
-
-Linux cluster version 2.6.39 OS is installed on CPUs. C programming language is used for coding
-the parallel algorithms of both methods on the GPU cluster. CUDA version 4.0~\cite{ch12:ref9}
-is used for programming GPUs, using CUBLAS library~\cite{ch12:ref6} to deal with vector operations
-in GPUs and, finally, MPI routines of OpenMPI 1.3.3 are used to carry out the communications between
-CPU cores. Indeed, the experiments are done on a cluster of $12$ computing nodes, where each node
-is managed by a MPI process and it is composed of one CPU core and one GPU card.
-
-All tests are made on double-precision floating point operations. The parameters of both linear
-solvers are initialized as follows: the residual tolerance threshold $\varepsilon=10^{-12}$, the
-maximum number of iterations $maxiter=500$, the right-hand side $b$ is filled with $1.0$ and the
-initial guess $x_0$ is filled with $0.0$. In addition, we limited the Arnoldi process\index{Iterative~method!Arnoldi~process}
-used in the GMRES method to $16$ iterations ($m=16$). For the sake of simplicity, we have chosen
-the preconditioner $M$ as the main diagonal of the sparse matrix $A$. Indeed, it allows to easily
-compute the required inverse matrix $M^{-1}$ and it provides a relatively good preconditioning for
-not too ill-conditioned matrices. In the GPU computing, the size of thread blocks is fixed to $512$
-threads. Finally, the performance results, presented hereafter, are obtained from the mean value
-over $10$ executions of the same parallel linear solver and for the same input data.
-
-To get more realistic results, we tested the CG and GMRES algorithms on sparse matrices of the Davis's
-collection~\cite{ch12:ref10}, that arise in a wide spectrum of real-world applications. We chose six
-symmetric sparse matrices and six nonsymmetric ones from this collection. In Figure~\ref{ch12:fig:05},
-we show structures of these matrices and in Table~\ref{ch12:tab:01} we present their main characteristics
-which are the number of rows, the total number of nonzero values (nnz) and the maximal bandwidth. In
-the present chapter, the bandwidth of a sparse matrix is defined as the number of matrix columns separating
-the first and the last nonzero value on a matrix row.
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/matrices}}
-\caption{Sketches of sparse matrices chosen from the Davis's collection.}
-\label{ch12:fig:05}
-\end{figure}
-
-\begin{table}
-\centering
-\begin{tabular}{|c|c|c|c|c|}
-\hline
-{\bf Matrix type}             & {\bf Matrix name} & {\bf \# rows} & {\bf \# nnz} & {\bf Bandwidth} \\ \hline \hline
-
-\multirow{6}{*}{Symmetric}    & 2cubes\_sphere    & $101,492$     & $1,647,264$  & $100,464$ \\
-
-                              & ecology2          & $999,999$     & $4,995,991$  & $2,001$   \\ 
-
-                              & finan512          & $74,752$      & $596,992$    & $74,725$  \\ 
-
-                              & G3\_circuit       & $1,585,478$   & $7,660,826$  & $1,219,059$ \\
-            
-                              & shallow\_water2   & $81,920$      & $327,680$    & $58,710$ \\
-
-                              & thermal2          & $1,228,045$   & $8,580,313$  & $1,226,629$ \\ \hline \hline
-            
-\multirow{6}{*}{Nonsymmetric} & cage13            & $445,315$     & $7,479,343$  & $318,788$\\
-
-                              & crashbasis        & $160,000$     & $1,750,416$  & $120,202$ \\
-
-                              & FEM\_3D\_thermal2 & $147,900$     & $3,489,300$  & $117,827$ \\
-
-                              & language          & $399,130$     & $1,216,334$  & $398,622$\\
- 
-                              & poli\_large       & $15,575$      & $33,074$     & $15,575$ \\
-
-                              & torso3            & $259,156$     & $4,429,042$  & $216,854$  \\ \hline
-\end{tabular}
-\vspace{0.5cm}
-\caption{Main characteristics of sparse matrices chosen from the Davis's collection.}
-\label{ch12:tab:01}
-\end{table}
-
-\begin{table}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|} 
-\hline
-{\bf Matrix}     & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$  & $\mathbf{\# iter.}$ & $\mathbf{prec.}$     & $\mathbf{\Delta}$   \\ \hline \hline
-
-2cubes\_sphere    & $0.132s$           & $0.069s$            & $1.93$        & $12$           & $1.14e$-$09$     & $3.47e$-$18$ \\
-
-ecology2          & $0.026s$           & $0.017s$            & $1.52$        & $13$           & $5.06e$-$09$     & $8.33e$-$17$ \\
-
-finan512          & $0.053s$           & $0.036s$            & $1.49$        & $12$           & $3.52e$-$09$     & $1.66e$-$16$ \\
-
-G3\_circuit       & $0.704s$           & $0.466s$            & $1.51$        & $16$           & $4.16e$-$10$     & $4.44e$-$16$ \\
-
-shallow\_water2   & $0.017s$           & $0.010s$            & $1.68$        & $5$            & $2.24e$-$14$     & $3.88e$-$26$ \\
-
-thermal2          & $1.172s$           & $0.622s$            & $1.88$        & $15$           & $5.11e$-$09$     & $3.33e$-$16$ \\ \hline   
-\end{tabular}
-\caption{Performances of the parallel CG method on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.}
-\label{ch12:tab:02}
-\end{center}
-\end{table}
-
-\begin{table}[!h]
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|} 
-\hline
-{\bf Matrix}     & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$  & $\mathbf{\# iter.}$ & $\mathbf{prec.}$     & $\mathbf{\Delta}$   \\ \hline \hline
-
-2cubes\_sphere    & $0.234s$           & $0.124s$            & $1.88$        & $21$           & $2.10e$-$14$     & $3.47e$-$18$ \\
-
-ecology2          & $0.076s$           & $0.035s$            & $2.15$        & $21$           & $4.30e$-$13$     & $4.38e$-$15$ \\
-
-finan512          & $0.073s$           & $0.052s$            & $1.40$        & $17$           & $3.21e$-$12$     & $5.00e$-$16$ \\
-
-G3\_circuit       & $1.016s$           & $0.649s$            & $1.56$        & $22$           & $1.04e$-$12$     & $2.00e$-$15$ \\
-
-shallow\_water2   & $0.061s$           & $0.044s$            & $1.38$        & $17$           & $5.42e$-$22$     & $2.71e$-$25$ \\
-
-thermal2          & $1.666s$           & $0.880s$            & $1.89$        & $21$           & $6.58e$-$12$     & $2.77e$-$16$ \\ \hline \hline
-
-cage13            & $0.721s$           & $0.338s$            & $2.13$        & $26$           & $3.37e$-$11$     & $2.66e$-$15$ \\
-
-crashbasis        & $1.349s$           & $0.830s$            & $1.62$        & $121$          & $9.10e$-$12$     & $6.90e$-$12$ \\
-
-FEM\_3D\_thermal2 & $0.797s$           & $0.419s$            & $1.90$        & $64$           & $3.87e$-$09$     & $9.09e$-$13$ \\
-
-language          & $2.252s$           & $1.204s$            & $1.87$        & $90$           & $1.18e$-$10$     & $8.00e$-$11$ \\
-
-poli\_large       & $0.097s$           & $0.095s$            & $1.02$        & $69$           & $4.98e$-$11$     & $1.14e$-$12$ \\
-
-torso3            & $4.242s$           & $2.030s$            & $2.09$        & $175$          & $2.69e$-$10$     & $1.78e$-$14$ \\ \hline
-\end{tabular}
-\caption{Performances of the parallel GMRES method on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.}
-\label{ch12:tab:03}
-\end{center}
-\end{table}
-
-Tables~\ref{ch12:tab:02} and~\ref{ch12:tab:03} show the performances of the parallel
-CG and GMRES solvers, respectively, for solving linear systems associated to the sparse
-matrices presented in Table~\ref{ch12:tab:01}. They allow us to compare the performances
-obtained on a cluster of $24$ CPU cores and on a cluster of $12$ GPUs. However, Table~\ref{ch12:tab:02}
-shows only the performances of solving symmetric sparse linear systems, due to the inability
-of the CG method to solve the nonsymmetric systems. In both tables, the second and third
-columns give, respectively, the execution times in seconds obtained on $24$ CPU cores
-($Time_{cpu}$) and that obtained on $12$ GPUs ($Time_{gpu}$). Moreover, we take into account
-the relative gains $\tau$ of a solver implemented on the GPU cluster compared to the same
-solver implemented on the CPU cluster. The relative gains\index{Relative~gain}, presented
-in the fourth column, are computed as a ratio of the CPU execution time over the GPU
-execution time:
-\begin{equation}
-\tau = \frac{Time_{cpu}}{Time_{gpu}}.
-\label{ch12:eq:20}
-\end{equation}
-In addition, Tables~\ref{ch12:tab:02} and~\ref{ch12:tab:03} give the number of iterations
-($iter$), the precision $prec$ of the solution computed on the GPU cluster and the difference
-$\Delta$ between the solution computed on the CPU cluster and that computed on the GPU cluster.
-Both parameters $prec$ and $\Delta$ allow to validate and verify the accuracy of the solution
-computed on the GPU cluster. We have computed them as follows:
-\begin{eqnarray}
-\Delta = max|x^{cpu}-x^{gpu}|,\\
-prec = max|M^{-1}r^{gpu}|,
-\end{eqnarray}
-where $\Delta$ is the maximum vector element, in absolute value, of the difference between
-the two solutions $x^{cpu}$ and $x^{gpu}$ computed, respectively, on CPU and GPU clusters and
-$prec$ is the maximum element, in absolute value, of the residual vector $r^{gpu}\in\mathbb{R}^{n}$
-of the solution $x^{gpu}$. Thus, we can see that the solutions obtained on the GPU cluster
-were computed with a sufficient accuracy (about $10^{-10}$) and they are, more or less, equivalent
-to those computed on the CPU cluster with a small difference ranging from $10^{-10}$ to $10^{-26}$.
-However, we can notice from the relative gains $\tau$ that it is not interesting to use multiple
-GPUs for solving small sparse linear systems. In fact, a small sparse matrix does not allow
-maximizing the utilization of GPU cores. In addition, the communications required to synchronize the
-computations over the cluster increase the idle times of GPUs and slow down further the parallel
-computations.
-
-Consequently, in order to test the performances of the parallel solvers, we developed in C programming
-language a generator of large sparse matrices. This generator takes a matrix from the Davis's collection~\cite{ch12:ref10}
-as an initial matrix to construct large sparse matrices exceeding ten million rows. It must be executed
-in parallel by the MPI processes of the computing nodes, so that each process can construct its sparse
-sub-matrix. In the first experimental tests, we focus on sparse matrices having a banded structure,
-because they are those that arise in most numerical problems. So, to generate the global sparse matrix,
-each MPI process constructs its sub-matrix by performing several copies of an initial sparse matrix chosen
-from the Davis's collection. Then, it puts all these copies on the main diagonal of the global matrix
-(see Figure~\ref{ch12:fig:06}). Moreover, the empty spaces between two successive copies in the main
-diagonal are filled with sub-copies (left-copy and right-copy in Figure~\ref{ch12:fig:06}) of the same
-initial matrix.
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/generation}}
-\caption{Parallel generation of a large sparse matrix by four computing nodes.}
-\label{ch12:fig:06}
-\end{figure}
-
-\begin{table}[!h]
-\centering
-\begin{tabular}{|c|c|c|c|}
-\hline
-{\bf Matrix type}             & {\bf Matrix name} & {\bf \# nnz} & {\bf Bandwidth} \\ \hline \hline
-
-\multirow{6}{*}{Symmetric}    & 2cubes\_sphere    & $413,703,602$ & $198,836$     \\
-
-                              & ecology2          & $124,948,019$ & $2,002$          \\ 
-
-                              & finan512          & $278,175,945$ & $123,900$        \\ 
-
-                              & G3\_circuit       & $125,262,292$ & $1,891,887$      \\
-            
-                              & shallow\_water2   & $100,235,292$ & $62,806$      \\
-
-                              & thermal2          & $175,300,284$ & $2,421,285$ \\ \hline \hline
-            
-\multirow{6}{*}{Nonsymmetric} & cage13            & $435,770,480$ & $352,566$        \\
-
-                              & crashbasis        & $409,291,236$ & $200,203$        \\
-
-                              & FEM\_3D\_thermal2 & $595,266,787$ & $206,029$ \\
-
-                              & language          & $76,912,824$  & $398,626$ \\
-
-                              & poli\_large       & $53,322,580$  & $15,576$         \\
-
-                              & torso3            & $433,795,264$ & $328,757$        \\ \hline
-\end{tabular}
-\vspace{0.5cm}
-\caption{Main characteristics of sparse banded matrices generated from those of the Davis's collection.}
-\label{ch12:tab:04}
-\end{table}
-
-We have used the parallel CG and GMRES algorithms for solving sparse linear systems of $25$
-million unknown values. The sparse matrices associated to these linear systems are generated
-from those presented in Table~\ref{ch12:tab:01}. Their main characteristics are given in Table~\ref{ch12:tab:04}.
-Tables~\ref{ch12:tab:05} and~\ref{ch12:tab:06} show the performances of the parallel CG and
-GMRES solvers, respectively, obtained on a cluster of $24$ CPU cores and on a cluster of $12$
-GPUs. Obviously, we can notice from these tables that solving large sparse linear systems on
-a GPU cluster is more efficient than on a CPU cluster (see relative gains $\tau$). We can also
-notice that the execution times of the CG method, whether on a CPU cluster or on a GPU cluster,
-are better than those of the GMRES method for solving large symmetric linear systems. In fact, the
-CG method is characterized by a better convergence\index{Convergence} rate and a shorter execution
-time of an iteration than those of the GMRES method. Moreover, an iteration of the parallel GMRES
-method requires more data exchanges between computing nodes compared to the parallel CG method.
- 
-\begin{table}[!h]
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|} 
-\hline
-{\bf Matrix}    & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{\# iter.}$ & $\mathbf{prec.}$ & $\mathbf{\Delta}$   \\ \hline \hline
-
-2cubes\_sphere  & $1.625s$             & $0.401s$              & $4.05$          & $14$                & $5.73e$-$11$     & $5.20e$-$18$ \\
-
-ecology2        & $0.856s$             & $0.103s$              & $8.27$          & $15$                & $3.75e$-$10$     & $1.11e$-$16$ \\
-
-finan512        & $1.210s$             & $0.354s$              & $3.42$          & $14$                & $1.04e$-$10$     & $2.77e$-$16$ \\
-
-G3\_circuit     & $1.346s$             & $0.263s$              & $5.12$          & $17$                & $1.10e$-$10$     & $5.55e$-$16$ \\
-
-shallow\_water2 & $0.397s$             & $0.055s$              & $7.23$          & $7$                 & $3.43e$-$15$     & $5.17e$-$26$ \\
-
-thermal2        & $1.411s$             & $0.244s$              & $5.78$          & $16$                & $1.67e$-$09$     & $3.88e$-$16$ \\ \hline  
-\end{tabular}
-\caption{Performances of the parallel CG method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. 
-on a cluster of 12 GPUs.}
-\label{ch12:tab:05}
-\end{center}
-\end{table}
-
-\begin{table}[!h]
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|} 
-\hline
-{\bf Matrix}      & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{\# iter.}$ & $\mathbf{prec.}$ & $\mathbf{\Delta}$   \\ \hline \hline
-
-2cubes\_sphere    & $3.597s$             & $0.514s$              & $6.99$          & $21$                & $2.11e$-$14$     & $8.67e$-$18$ \\
-
-ecology2          & $2.549s$             & $0.288s$              & $8.83$          & $21$                & $4.88e$-$13$     & $2.08e$-$14$ \\
-
-finan512          & $2.660s$             & $0.377s$              & $7.05$          & $17$                & $3.22e$-$12$     & $8.82e$-$14$ \\
-
-G3\_circuit       & $3.139s$             & $0.480s$              & $6.53$          & $22$                & $1.04e$-$12$     & $5.00e$-$15$ \\
-
-shallow\_water2   & $2.195s$             & $0.253s$              & $8.68$          & $17$                & $5.54e$-$21$     & $7.92e$-$24$ \\
-
-thermal2          & $3.206s$             & $0.463s$              & $6.93$          & $21$                & $8.89e$-$12$     & $3.33e$-$16$ \\ \hline \hline
-
-cage13            & $5.560s$             & $0.663s$              & $8.39$          & $26$                & $3.29e$-$11$     & $1.59e$-$14$ \\
-
-crashbasis        & $25.802s$            & $3.511s$              & $7.35$          & $135$               & $6.81e$-$11$     & $4.61e$-$15$ \\
-
-FEM\_3D\_thermal2 & $13.281s$            & $1.572s$              & $8.45$          & $64$                & $3.88e$-$09$     & $1.82e$-$12$ \\
-
-language          & $12.553s$            & $1.760s$              & $7.13$          & $89$                & $2.11e$-$10$     & $1.60e$-$10$ \\
-
-poli\_large       & $8.515s$             & $1.053s$              & $8.09$          & $69$                & $5.05e$-$11$     & $6.59e$-$12$ \\
-
-torso3            & $31.463s$            & $3.681s$              & $8.55$          & $175$               & $2.69e$-$10$     & $2.66e$-$14$ \\ \hline
-\end{tabular}
-\caption{Performances of the parallel GMRES method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. 
-on a cluster of 12 GPUs.}
-\label{ch12:tab:06}
-\end{center}
-\end{table}
-
-
-%%--------------------------%%
-%%       SECTION 5          %%
-%%--------------------------%%
-\section{Hypergraph partitioning}
-\label{ch12:sec:05}
-In this section, we present the performances of both parallel CG and GMRES solvers for solving linear
-systems associated to sparse matrices having large bandwidths. Indeed, we are interested in sparse
-matrices having the nonzero values distributed along their bandwidths. 
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.22]{Chapters/chapter12/figures/generation_1}}
-\caption{Parallel generation of a large sparse five-bands matrix by four computing nodes.}
-\label{ch12:fig:07}
-\end{figure}
-
-\begin{table}[!h]
-\begin{center}
-\begin{tabular}{|c|c|c|c|} 
-\hline
-{\bf Matrix type}             & {\bf Matrix name} & {\bf \# nnz}  & {\bf Bandwidth} \\ \hline \hline
-
-\multirow{6}{*}{Symmetric}    & 2cubes\_sphere    & $829,082,728$ & $24,999,999$     \\
-
-                              & ecology2          & $254,892,056$ & $25,000,000$     \\ 
-
-                              & finan512          & $556,982,339$ & $24,999,973$     \\ 
-
-                              & G3\_circuit       & $257,982,646$ & $25,000,000$     \\
-            
-                              & shallow\_water2   & $200,798,268$ & $25,000,000$     \\
-
-                              & thermal2          & $359,340,179$ & $24,999,998$     \\ \hline \hline
-            
-\multirow{6}{*}{Nonsymmetric} & cage13            & $879,063,379$ & $24,999,998$     \\
-
-                              & crashbasis        & $820,373,286$ & $24,999,803$     \\
-
-                              & FEM\_3D\_thermal2 & $1,194,012,703$ & $24,999,998$     \\
-
-                              & language          & $155,261,826$ & $24,999,492$     \\
-
-                              & poli\_large       & $106,680,819$ & $25,000,000$    \\
-
-                              & torso3            & $872,029,998$ & $25,000,000$\\ \hline
-\end{tabular}
-\caption{Main characteristics of sparse five-bands matrices generated from those of the Davis's collection.}
-\label{ch12:tab:07}
-\end{center}
-\end{table}
-
-We have developed in C programming language a generator of large sparse matrices
-having five bands distributed along their bandwidths (see Figure~\ref{ch12:fig:07}).
-The principle of this generator is equivalent to that in Section~\ref{ch12:sec:04}.
-However, the copies performed on the initial matrix (chosen from the Davis's collection)
-are placed on the main diagonal and on four off-diagonals, two on the right and two
-on the left of the main diagonal. Figure~\ref{ch12:fig:07} shows an example of a
-generation of a sparse five-bands matrix by four computing nodes. Table~\ref{ch12:tab:07}
-shows the main characteristics of sparse five-bands matrices generated from those
-presented in Table~\ref{ch12:tab:01} and associated to linear systems of $25$ million
-unknown values.   
-
-\begin{table}[!h]
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|} 
-\hline
-{\bf Matrix}      & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{\# iter.}$ & $\mathbf{prec.}$ & $\mathbf{\Delta}$   \\ \hline \hline
-
-2cubes\_sphere    & $6.041s$     & $3.338s$      & $1.81$ & $30$ & $6.77e$-$11$ & $3.25e$-$19$ \\
-
-ecology2          & $1.404s$     & $1.301s$      & $1.08$ & $13$     & $5.22e$-$11$ & $2.17e$-$18$ \\
-
-finan512          & $1.822s$     & $1.299s$      & $1.40$ & $12$     & $3.52e$-$11$ & $3.47e$-$18$ \\
-
-G3\_circuit       & $2.331s$     & $2.129s$      & $1.09$ & $15$     & $1.36e$-$11$ & $5.20e$-$18$ \\
-
-shallow\_water2   & $0.541s$     & $0.504s$      & $1.07$ & $6$      & $2.12e$-$16$ & $5.05e$-$28$ \\
-
-thermal2          & $2.549s$     & $1.705s$      & $1.49$ & $14$     & $2.36e$-$10$ & $5.20e$-$18$ \\ \hline  
-\end{tabular}
-\caption{Performances of parallel CG solver for solving linear systems associated to sparse five-bands matrices
-on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs}
-\label{ch12:tab:08}
-\end{center}
-\end{table}
-
-\begin{table}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|} 
-\hline
-{\bf Matrix}      & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{\# iter.}$ & $\mathbf{prec.}$ & $\mathbf{\Delta}$   \\ \hline \hline
-
-2cubes\_sphere    & $15.963s$    & $7.250s$      & $2.20$  & $58$     & $6.23e$-$16$ & $3.25e$-$19$ \\
-
-ecology2          & $3.549s$     & $2.176s$      & $1.63$  & $21$     & $4.78e$-$15$ & $1.06e$-$15$ \\
-
-finan512          & $3.862s$     & $1.934s$      & $1.99$  & $17$     & $3.21e$-$14$ & $8.43e$-$17$ \\
-
-G3\_circuit       & $4.636s$     & $2.811s$      & $1.65$  & $22$     & $1.08e$-$14$ & $1.77e$-$16$ \\
-
-shallow\_water2   & $2.738s$     & $1.539s$      & $1.78$  & $17$     & $5.54e$-$23$ & $3.82e$-$26$ \\
-
-thermal2          & $5.017s$     & $2.587s$      & $1.94$  & $21$     & $8.25e$-$14$ & $4.34e$-$18$ \\ \hline \hline
-
-cage13            & $9.315s$     & $3.227s$      & $2.89$  & $26$     & $3.38e$-$13$ & $2.08e$-$16$ \\
-
-crashbasis        & $35.980s$    & $14.770s$     & $2.43$  & $127$    & $1.17e$-$12$ & $1.56e$-$17$ \\
-
-FEM\_3D\_thermal2 & $24.611s$    & $7.749s$      & $3.17$  & $64$     & $3.87e$-$11$ & $2.84e$-$14$ \\
-
-language          & $16.859s$    & $9.697s$      & $1.74$  & $89$     & $2.17e$-$12$ & $1.70e$-$12$ \\
-
-poli\_large       & $10.200s$    & $6.534s$      & $1.56$  & $69$     & $5.14e$-$13$ & $1.63e$-$13$ \\
-
-torso3            & $49.074s$    & $19.397s$     & $2.53$  & $175$    & $2.69e$-$12$ & $2.77e$-$16$ \\ \hline
-\end{tabular}
-\caption{Performances of parallel GMRES solver for solving linear systems associated to sparse five-bands matrices
-on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs}
-\label{ch12:tab:09}
-\end{center}
-\end{table}
-
-Tables~\ref{ch12:tab:08} and~\ref{ch12:tab:09} show the performances of the parallel
-CG and GMRES solvers, respectively, obtained on a cluster of $24$ CPU cores and on a
-cluster of $12$ GPUs. The linear systems solved in these tables are associated to the
-sparse five-bands matrices presented in Table~\ref{ch12:tab:07}. We can notice from
-both Tables~\ref{ch12:tab:08} and~\ref{ch12:tab:09} that using a GPU cluster is not
-efficient for solving this kind of sparse linear systems\index{Sparse~linear~system}.
-We can see that the execution times obtained on the GPU cluster are almost equivalent
-to those obtained on the CPU cluster (see the relative gains presented in column~$4$
-of each table). This is due to the large number of communications necessary to synchronize
-the computations over the cluster. Indeed, the naive partitioning, row-by-row or column-by-column,
-of sparse matrices having large bandwidths can link a computing node to many neighbors
-and then generate a large number of data dependencies between these computing nodes in
-the cluster. 
-
-Therefore, we have chosen to use a hypergraph partitioning method\index{Hypergraph},
-which is well-suited to numerous kinds of sparse matrices~\cite{ch12:ref11}. Indeed,
-it can well model the communications between the computing nodes, particularly in the
-case of nonsymmetric and irregular matrices, and it gives good reduction of the total
-communication volume. In contrast, it is an expensive operation in terms of execution
-time and memory space. 
-
-The sparse matrix $A$ of the linear system to be solved is modeled as a hypergraph
-$\mathcal{H}=(\mathcal{V},\mathcal{E})$\index{Hypergraph} as follows:
-\begin{itemize}
-\item each matrix row $\{i\}_{0\leq i<n}$ corresponds to a vertex $v_i\in\mathcal{V}$ and,
-\item each matrix column $\{j\}_{0\leq j<n}$ corresponds to a hyperedge $e_j\in\mathcal{E}$, where:
-\begin{equation}
-\forall a_{ij} \neq 0 \mbox{~is a nonzero value of matrix~} A \mbox{~:~} v_i \in pins[e_j],
-\end{equation} 
-\item $w_i$ is the weight of vertex $v_i$ and,
-\item $c_j$ is the cost of hyperedge $e_j$.
-\end{itemize}
-A $K$-way partitioning of a hypergraph $\mathcal{H}=(\mathcal{V},\mathcal{E})$ is
-defined as $\mathcal{P}=\{\mathcal{V}_1,\ldots,\mathcal{V}_K\}$ a set of pairwise
-disjoint non-empty subsets (or parts) of the vertex set $\mathcal{V}$, so that each
-subset is attributed to a computing node. Figure~\ref{ch12:fig:08} shows an example
-of the hypergraph model of a  $(9\times 9)$ sparse matrix in three parts. The circles
-and squares correspond, respectively, to the vertices and hyperedges of the hypergraph.
-The solid squares define the cut hyperedges connecting at least two different parts. 
-The connectivity $\lambda_j$ of a cut hyperedge $e_j$ denotes the number of different
-parts spanned by $e_j$.
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.5]{Chapters/chapter12/figures/hypergraph}}
-\caption{An example of the hypergraph partitioning of a sparse matrix decomposed between three computing nodes.}
-\label{ch12:fig:08}
-\end{figure}
-
-The cut hyperedges model the total communication volume between the different computing
-nodes in the cluster, necessary to perform the parallel SpMV multiplication\index{SpMV~multiplication}.
-Indeed, each hyperedge $e_j$ defines a set of atomic computations $b_i\leftarrow b_i+a_{ij}x_j$,
-$0\leq i,j<n$, of the SpMV multiplication $Ax=b$ that need the $j^{th}$ unknown value of
-solution vector $x$. Therefore, pins of hyperedge $e_j$, $pins[e_j]$, are the set of matrix
-rows sharing and requiring the same unknown value $x_j$. For example in Figure~\ref{ch12:fig:08},
-hyperedge $e_9$ whose pins are: $pins[e_9]=\{v_2,v_5,v_9\}$ represents the dependency of matrix
-rows $2$, $5$ and $9$ to unknown $x_9$ needed to perform in parallel the atomic operations:
-$b_2\leftarrow b_2+a_{29}x_9$, $b_5\leftarrow b_5+a_{59}x_9$ and $b_9\leftarrow b_9+a_{99}x_9$.
-However, unknown $x_9$ is the third entry of the sub-solution vector $x$ of part (or node) $3$.
-So the computing node $3$ must exchange this value with nodes $1$ and $2$, which leads to perform
-two communications.
-
-The hypergraph partitioning\index{Hypergraph} allows to reduce the total communication volume
-required to perform the parallel SpMV multiplication, while maintaining the load balancing between
-the computing nodes. In fact, it allows to minimize at best the following amount:
-\begin{equation}
-\mathcal{X}(\mathcal{P})=\sum_{e_{j}\in\mathcal{E}_{C}}c_{j}(\lambda_{j}-1),
-\end{equation}
-where $\mathcal{E}_{C}$ denotes the set of the cut hyperedges coming from the hypergraph partitioning
-$\mathcal{P}$ and $c_j$ and $\lambda_j$ are, respectively, the cost and the connectivity of cut hyperedge
-$e_j$. Moreover, it also ensures the load balancing between the $K$ parts as follows: 
-\begin{equation}
-  W_{k}\leq (1+\epsilon)W_{avg}, \hspace{0.2cm} (1\leq k\leq K) \hspace{0.2cm} \text{and} \hspace{0.2cm} (0<\epsilon<1),
-\end{equation} 
-where $W_{k}$ is the sum of all vertex weights ($w_{i}$) in part $\mathcal{V}_{k}$, $W_{avg}$ is the
-average weight of all $K$ parts and $\epsilon$ is the maximum allowed imbalanced ratio.
-
-The hypergraph partitioning is an NP-complete problem but software tools using heuristics have been developed,
-for example: hMETIS~\cite{ch12:ref12}, PaToH~\cite{ch12:ref13} and Zoltan~\cite{ch12:ref14}. Since our
-objective is solving large sparse linear systems, we use the parallel hypergraph partitioning which must
-be performed by at least two MPI processes. It allows to accelerate the data partitioning of large sparse
-matrices. For this, the hypergraph $\mathcal{H}$ must be partitioned in $p$ (number of MPI processes)
-sub-hypergraphs $\mathcal{H}_k=(\mathcal{V}_k,\mathcal{E}_k)$, $0\leq k<p$, and then we performed the
-parallel hypergraph partitioning method using some functions of the MPI library between the $p$ processes.
-
-Tables~\ref{ch12:tab:10} and~\ref{ch12:tab:11} show the performances of the parallel CG and GMRES solvers,
-respectively, using the hypergraph partitioning for solving large linear systems associated to the sparse
-five-bands matrices presented in Table~\ref{ch12:tab:07}. For these experimental tests, we have applied the
-parallel hypergraph partitioning~\cite{ch12:ref15} developed in Zoltan tool~\cite{ch12:ref14}. We have initialized
-the parameters of the partitioning operation as follows:
-\begin{itemize}
-\item the weight $w_{i}$ of each vertex $v_{i}\in\mathcal{V}$ is set to the number of nonzero values on matrix row $i$,
-\item for the sake of simplicity, the cost $c_{j}$ of each hyperedge $e_{j}\in\mathcal{E}$ is fixed to $1$,
-\item the maximum imbalanced load ratio $\epsilon$ is limited to $10\%$.\\
-\end{itemize}  
-
-\begin{table}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|} 
-\hline
-{\bf Matrix}    & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{Gains \%}$ \\ \hline \hline
-
-2cubes\_sphere  & $5.935s$             & $1.213s$              & $4.89$          & $63.66\%$ \\
-
-ecology2        & $1.093s$             & $0.136s$              & $8.00$          & $89.55\%$ \\
-
-finan512        & $1.762s$             & $0.475s$              & $3.71$          & $63.43\%$ \\
-
-G3\_circuit     & $2.095s$             & $0.558s$              & $3.76$          & $73.79\%$ \\
-
-shallow\_water2 & $0.498s$             & $0.068s$              & $7.31$          & $86.51\%$ \\
-
-thermal2        & $1.889s$             & $0.348s$              & $5.43$          & $79.59\%$ \\ \hline  
-\end{tabular}
-\caption{Performances of the parallel CG solver using hypergraph partitioning for solving linear systems associated to
-sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.}
-\label{ch12:tab:10}
-\end{center}
-\end{table}
-
-\begin{table}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|} 
-\hline
-{\bf Matrix}      & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{Gains \%}$ \\ \hline \hline
-
-2cubes\_sphere    & $16.430s$            & $2.840s$              & $5.78$          & $60.83\%$ \\
-
-ecology2          & $3.152s$             & $0.367s$              & $8.59$          & $83.13\%$ \\
-
-finan512          & $3.672s$             & $0.723s$              & $5.08$          & $62.62\%$ \\
-
-G3\_circuit       & $4.468s$             & $0.971s$              & $4.60$          & $65.46\%$ \\
-
-shallow\_water2   & $2.647s$             & $0.312s$              & $8.48$          & $79.73\%$ \\
-
-thermal2          & $4.190s$             & $0.666s$              & $6.29$          & $74.25\%$ \\ \hline \hline
-
-cage13            & $8.077s$             & $1.584s$              & $5.10$          & $50.91\%$ \\
-
-crashbasis        & $35.173s$            & $5.546s$              & $6.34$          & $62.43\%$ \\
-
-FEM\_3D\_thermal2 & $24.825s$            & $3.113s$              & $7.97$          & $59.83\%$ \\
-
-language          & $16.706s$            & $2.522s$              & $6.62$          & $73.99\%$ \\
-
-poli\_large       & $12.715s$            & $3.989s$              & $3.19$          & $38.95\%$ \\
-
-torso3            & $48.459s$            & $6.234s$              & $7.77$          & $67.86\%$ \\ \hline
-\end{tabular}
-\caption{Performances of the parallel GMRES solver using hypergraph partitioning for solving linear systems associated to
-sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.}
-\label{ch12:tab:11}
-\end{center}
-\end{table}
-
-We can notice from both Tables~\ref{ch12:tab:10} and~\ref{ch12:tab:11} that the
-hypergraph partitioning has improved the performances of both parallel CG and GMRES
-algorithms. The execution times on the GPU cluster of both parallel solvers are
-significantly improved compared to those obtained by using the partitioning row-by-row.
-For these examples of sparse matrices, the execution times of CG and GMRES solvers
-are reduced by about $76\%$ and $65\%$ respectively (see column~$5$ of each table)
-compared to those obtained in Tables~\ref{ch12:tab:08} and~\ref{ch12:tab:09}.
-
-In fact, the hypergraph partitioning\index{Hypergraph} applied to sparse matrices
-having large bandwidths allows to reduce the total communication volume necessary
-to synchronize the computations between the computing nodes in the GPU cluster.
-Table~\ref{ch12:tab:12} presents, for each sparse matrix, the total communication
-volume between $12$ GPU computing nodes obtained by using the partitioning row-by-row
-(column~$2$), the total communication volume obtained by using the hypergraph partitioning
-(column~$3$) and the execution times in minutes of the hypergraph partitioning
-operation performed by $12$ MPI processes (column~$4$). The total communication
-volume defines the total number of the vector elements exchanged by the computing
-nodes. Then, Table~\ref{ch12:tab:12} shows that the hypergraph partitioning method
-can split the sparse matrix so as to minimize the data dependencies between the
-computing nodes and thus to reduce the total communication volume.
-
-\begin{table}
-\begin{center}
-\begin{tabular}{|c|c|c|c|} 
-\hline
-\multirow{4}{*}{\bf Matrix}  & {\bf Total comms.}      & {\bf Total comms.}      & {\bf Execution} \\
-                             & {\bf volume without}    & {\bf volume with}       & {\bf time}   \\
-                             & {\bf hypergraph}        & {\bf hypergraph }       & {\bf of the parti.}  \\  
-                             & {\bf parti.}            & {\bf parti.}            & {\bf in minutes}\\ \hline \hline
-
-2cubes\_sphere               & $25,360,543$            & $240,679$               & $68.98$         \\
-
-ecology2                     & $26,044,002$            & $73,021$                & $4.92$          \\
-
-finan512                     & $26,087,431$            & $900,729$               & $33.72$         \\
-
-G3\_circuit                  & $31,912,003$            & $5,366,774$             & $11.63$         \\ 
-
-shallow\_water2              & $25,105,108$            & $60,899$                & $5.06$          \\ 
-
-thermal2                     & $30,012,846$            & $1,077,921$             & $17.88$         \\ \hline \hline
-
-cage13                       & $28,254,282$            & $3,845,440$             & $196.45$        \\
-
-crashbasis                   & $29,020,060$            & $2,401,876$             & $33.39$         \\
-
-FEM\_3D\_thermal2            & $25,263,767$            & $250,105$               & $49.89$         \\
-
-language                     & $27,291,486$            & $1,537,835$             & $9.07$          \\
-
-poli\_large                  & $25,053,554$            & $7,388,883$             & $5.92$          \\
-
-torso3                       & $25,682,514$            & $613,250$               & $61.51$         \\ \hline       
-\end{tabular}
-\caption{The total communication volume between 12 GPU computing nodes without and with the hypergraph partitioning method.}
-\label{ch12:tab:12}
-\end{center}
-\end{table}
-
-Nevertheless, as we can see from the fourth column of Table~\ref{ch12:tab:12},
-the hypergraph partitioning takes longer compared to the execution times of the
-resolutions. As previously mentioned, the hypergraph partitioning method is less
-efficient in terms of memory consumption and partitioning time than its graph
-counterpart, but the hypergraph well models the nonsymmetric and irregular problems.
-So for the applications which often use the same sparse matrices, we can perform
-the hypergraph partitioning on these matrices only once for each and then, we save
-the traces of these partitionings in files to be reused several times. Therefore,
-this allows to avoid the partitioning of the sparse matrices at each resolution
-of the linear systems.
-
-\begin{figure}[!h]
-\centering
-  \mbox{\subfigure[Sparse band matrices]{\includegraphics[scale=0.7]{Chapters/chapter12/figures/scale_band}\label{ch12:fig:09.01}}}
-\vfill 
-  \mbox{\subfigure[Sparse five-bands matrices]{\includegraphics[scale=0.7]{Chapters/chapter12/figures/scale_5band}\label{ch12:fig:09.02}}}
-\caption{Weak-scaling of the parallel CG and GMRES solvers on a GPU cluster for solving large sparse linear systems.}
-\label{ch12:fig:09}
-\end{figure}
-
-However, the most important performance parameter is the scalability of the parallel
-CG\index{Iterative~method!CG} and GMRES\index{Iterative~method!GMRES} solvers on a GPU
-cluster. Particularly, we have taken into account the weak-scaling of both parallel
-algorithms on a cluster of one to 12 GPU computing nodes. We have performed a set of
-experiments on both matrix structures: band matrices and five-bands matrices. The sparse
-matrices of tests are generated from the symmetric sparse matrix {\it thermal2} chosen
-from the Davis's collection. Figures~\ref{ch12:fig:09.01} and~\ref{ch12:fig:09.02}
-show the execution times of both parallel methods for solving large linear systems
-associated to band matrices and those associated to five-bands matrices, respectively.
-The size of a sparse sub-matrix per computing node, for each matrix structure, is fixed
-as follows:
-\begin{itemize}
-\item band matrix: $15$ million rows and $105,166,557$ nonzero values,
-\item five-bands matrix: $5$ million rows and $78,714,492$ nonzero values. 
-\end{itemize}
-We can see from these figures that both parallel solvers are quite scalable on a GPU
-cluster. Indeed, the execution times remain almost constant while the size of the
-sparse linear systems to be solved increases proportionally with the number of the
-GPU computing nodes. This means that the communication cost is relatively constant
-regardless of the number of computing nodes in the GPU cluster.
-
-
-
-%%--------------------------%%
-%%       SECTION 6          %%
-%%--------------------------%%
-\section{Conclusion}
-\label{ch12:sec:06}
-In this chapter, we have aimed at harnessing the computing power of a
-cluster of GPUs for solving large sparse linear systems. For this, we
-have used two Krylov subspace iterative methods: the CG and GMRES methods.
-The first method is well known for its efficiency for solving symmetric
-linear systems and the second one is used, particularly, for solving
-nonsymmetric linear systems. 
-
-We have presented the parallel implementation of both iterative methods
-on a GPU cluster. Particularly, the operations dealing with the vectors
-and/or matrices, of these methods, are parallelized between the different
-GPU computing nodes of the cluster. Indeed, the data-parallel vector operations
-are accelerated by GPUs and the communications required to synchronize the
-parallel computations are carried out by CPU cores. For this, we have used
-a heterogeneous CUDA/MPI programming to implement the parallel iterative
-algorithms.
-
-In the experimental tests, we have shown that using a GPU cluster is efficient
-for solving linear systems associated to very large sparse matrices. The experimental
-results, obtained in the present chapter, showed that a cluster of $12$ GPUs is
-about $7$ times faster than a cluster of $24$ CPU cores for solving large sparse
-linear systems of $25$ million unknown values. This is due to the GPU ability to
-compute the data-parallel operations faster than the CPUs. However, we have shown
-that solving linear systems associated to matrices having large bandwidths uses
-many communications to synchronize the computations of GPUs, which slow down the
-resolution even more. Moreover, there are two kinds of communications: between a
-CPU and its GPU and between CPUs of the computing nodes, such that the first ones
-are the slowest communications on a GPU cluster. So, we have proposed to use the
-hypergraph partitioning instead of the row-by-row partitioning. This allows to
-minimize the data dependencies between the GPU computing nodes and thus to reduce
-the total communication volume. The experimental results showed that using the
-hypergraph partitioning technique improves the execution times on average by $76\%$
-for the CG method and by $65\%$ for the GMRES method on a cluster of $12$ GPUs. 
-
-In the recent GPU hardware and software architectures, the GPU-Direct system with
-CUDA version 5.0 is used so that two GPUs located on the same node or on distant
-nodes can communicate between them directly without CPUs. This allows to improve
-the data transfers between GPUs.          
-
-\putbib[Chapters/chapter12/biblio12]
-
diff --git a/BookGPU/Chapters/chapter12/figures/cluster.eps b/BookGPU/Chapters/chapter12/figures/cluster.eps
index 75027da..b2ea0e4 100644
--- a/BookGPU/Chapters/chapter12/figures/cluster.eps
+++ b/BookGPU/Chapters/chapter12/figures/cluster.eps
@@ -1,7 +1,7 @@
 %!PS-Adobe-2.0 EPSF-2.0
 %%Title: cluster.fig
 %%Creator: fig2dev Version 3.2 Patchlevel 5c
-%%CreationDate: Sat Feb  9 22:21:11 2013
+%%CreationDate: Fri Jul 19 19:18:46 2013
 %%BoundingBox: 0 0 1391 723
 %Magnification: 1.0000
 %%EndComments
@@ -730,7 +730,7 @@ gs 1 -1 sc  90.0 rot (Node 11) col18 sh gr
 gs 1 -1 sc (Machine 6) col0 sh gr
 /Times-Roman-iso ff 539.75 scf sf
 9810 11475 m
-gs 1 -1 sc (Infiniband 20Gbps) col0 sh gr
+gs 1 -1 sc (Infiniband 20GB/s) col0 sh gr
 % here ends figure;
 pagefooter
 showpage
diff --git a/BookGPU/Chapters/chapter12/figures/cluster.fig b/BookGPU/Chapters/chapter12/figures/cluster.fig
index 32355b1..5e3547f 100644
--- a/BookGPU/Chapters/chapter12/figures/cluster.fig
+++ b/BookGPU/Chapters/chapter12/figures/cluster.fig
@@ -307,4 +307,4 @@ Single
 	0 0 2.00 225.00 225.00
 	0 0 2.00 225.00 225.00
 	 19350 9720 19350 10800
-4 0 0 50 -1 0 34 0.0000 4 525 4155 9810 11475 Infiniband 20Gbps\001
+4 0 0 50 -1 0 34 0.0000 4 450 4125 9810 11475 Infiniband 20GB/s\001
diff --git a/BookGPU/Chapters/chapter12/figures/cluster.fig.bak b/BookGPU/Chapters/chapter12/figures/cluster.fig.bak
index 11a3c6b..32355b1 100644
--- a/BookGPU/Chapters/chapter12/figures/cluster.fig.bak
+++ b/BookGPU/Chapters/chapter12/figures/cluster.fig.bak
@@ -286,8 +286,13 @@ Single
 4 0 18 50 -1 0 28 1.5708 4 315 1545 17100 6390 Node 11\001
 4 0 0 50 -1 0 30 0.0000 4 345 2100 18315 630 Machine 6\001
 -6
-2 1 2 2 0 7 50 -1 -1 3.000 0 0 -1 0 0 2
-	 13545 4950 16245 4950
+1 3 0 2 0 -1 50 -1 20 0.000 1 0.0000 13725 4950 90 90 13725 4950 13815 4950
+1 3 0 2 0 -1 50 -1 20 0.000 1 0.0000 15975 4950 90 90 15975 4950 16065 4950
+1 3 0 2 0 -1 50 -1 20 0.000 1 0.0000 15615 4950 90 90 15615 4950 15705 4950
+1 3 0 2 0 -1 50 -1 20 0.000 1 0.0000 15255 4950 90 90 15255 4950 15345 4950
+1 3 0 2 0 -1 50 -1 20 0.000 1 0.0000 14895 4950 90 90 14895 4950 14985 4950
+1 3 0 2 0 -1 50 -1 20 0.000 1 0.0000 14535 4950 90 90 14535 4950 14625 4950
+1 3 0 2 0 -1 50 -1 20 0.000 1 0.0000 14130 4950 90 90 14130 4950 14220 4950
 2 1 0 4 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
 	0 0 2.00 225.00 225.00
 	0 0 2.00 225.00 225.00
diff --git a/BookGPU/Chapters/chapter12/figures/cluster.pdf b/BookGPU/Chapters/chapter12/figures/cluster.pdf
index 6eb4739..8aaff8c 100644
Binary files a/BookGPU/Chapters/chapter12/figures/cluster.pdf and b/BookGPU/Chapters/chapter12/figures/cluster.pdf differ
diff --git a/BookGPU/Chapters/chapter12/figures/compress.eps b/BookGPU/Chapters/chapter12/figures/compress.eps
index 8d4d84e..62a4f5b 100644
--- a/BookGPU/Chapters/chapter12/figures/compress.eps
+++ b/BookGPU/Chapters/chapter12/figures/compress.eps
@@ -1,8 +1,8 @@
 %!PS-Adobe-2.0 EPSF-2.0
 %%Title: compress.fig
 %%Creator: fig2dev Version 3.2 Patchlevel 5c
-%%CreationDate: Fri Feb  8 16:15:28 2013
-%%BoundingBox: 0 0 964 614
+%%CreationDate: Thu Jul 18 18:50:59 2013
+%%BoundingBox: 0 0 961 614
 %Magnification: 1.0000
 %%EndComments
 %%BeginProlog
@@ -77,52 +77,12 @@ end
   bind def
 /shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul
   4 -2 roll mul srgb} bind def
-/reencdict 12 dict def /ReEncode { reencdict begin
-/newcodesandnames exch def /newfontname exch def /basefontname exch def
-/basefontdict basefontname findfont def /newfont basefontdict maxlength dict def
-basefontdict { exch dup /FID ne { dup /Encoding eq
-{ exch dup length array copy newfont 3 1 roll put }
-{ exch newfont 3 1 roll put } ifelse } { pop pop } ifelse } forall
-newfont /FontName newfontname put newcodesandnames aload pop
-128 1 255 { newfont /Encoding get exch /.notdef put } for
-newcodesandnames length 2 idiv { newfont /Encoding get 3 1 roll put } repeat
-newfontname newfont definefont pop end } def
-/isovec [
-8#055 /minus 8#200 /grave 8#201 /acute 8#202 /circumflex 8#203 /tilde
-8#204 /macron 8#205 /breve 8#206 /dotaccent 8#207 /dieresis
-8#210 /ring 8#211 /cedilla 8#212 /hungarumlaut 8#213 /ogonek 8#214 /caron
-8#220 /dotlessi 8#230 /oe 8#231 /OE
-8#240 /space 8#241 /exclamdown 8#242 /cent 8#243 /sterling
-8#244 /currency 8#245 /yen 8#246 /brokenbar 8#247 /section 8#250 /dieresis
-8#251 /copyright 8#252 /ordfeminine 8#253 /guillemotleft 8#254 /logicalnot
-8#255 /hyphen 8#256 /registered 8#257 /macron 8#260 /degree 8#261 /plusminus
-8#262 /twosuperior 8#263 /threesuperior 8#264 /acute 8#265 /mu 8#266 /paragraph
-8#267 /periodcentered 8#270 /cedilla 8#271 /onesuperior 8#272 /ordmasculine
-8#273 /guillemotright 8#274 /onequarter 8#275 /onehalf
-8#276 /threequarters 8#277 /questiondown 8#300 /Agrave 8#301 /Aacute
-8#302 /Acircumflex 8#303 /Atilde 8#304 /Adieresis 8#305 /Aring
-8#306 /AE 8#307 /Ccedilla 8#310 /Egrave 8#311 /Eacute
-8#312 /Ecircumflex 8#313 /Edieresis 8#314 /Igrave 8#315 /Iacute
-8#316 /Icircumflex 8#317 /Idieresis 8#320 /Eth 8#321 /Ntilde 8#322 /Ograve
-8#323 /Oacute 8#324 /Ocircumflex 8#325 /Otilde 8#326 /Odieresis 8#327 /multiply
-8#330 /Oslash 8#331 /Ugrave 8#332 /Uacute 8#333 /Ucircumflex
-8#334 /Udieresis 8#335 /Yacute 8#336 /Thorn 8#337 /germandbls 8#340 /agrave
-8#341 /aacute 8#342 /acircumflex 8#343 /atilde 8#344 /adieresis 8#345 /aring
-8#346 /ae 8#347 /ccedilla 8#350 /egrave 8#351 /eacute
-8#352 /ecircumflex 8#353 /edieresis 8#354 /igrave 8#355 /iacute
-8#356 /icircumflex 8#357 /idieresis 8#360 /eth 8#361 /ntilde 8#362 /ograve
-8#363 /oacute 8#364 /ocircumflex 8#365 /otilde 8#366 /odieresis 8#367 /divide
-8#370 /oslash 8#371 /ugrave 8#372 /uacute 8#373 /ucircumflex
-8#374 /udieresis 8#375 /yacute 8#376 /thorn 8#377 /ydieresis] def
-/Times-Roman /Times-Roman-iso isovec ReEncode
-/AvantGarde-Book /AvantGarde-Book-iso isovec ReEncode
-/Times-Italic /Times-Italic-iso isovec ReEncode
 /$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def
 /$F2psEnd {$F2psEnteredState restore end} def
 
 /pageheader {
 save
-newpath 0 614 moveto 0 0 lineto 964 0 lineto 964 614 lineto closepath clip newpath
+newpath 0 614 moveto 0 0 lineto 961 0 lineto 961 614 lineto closepath clip newpath
 171.0 1185.8 translate
 1 -1 scale
 $F2psBegin
@@ -299,339 +259,339 @@ n 7065 13455 m
 % arrowhead
 15.000 slw
 n 9280 12372 m 9386 12250 l 9225 12265 l  col0 s
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 2790 10755 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4140 9855 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4140 10755 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4590 10305 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5040 9855 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5040 10755 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5490 10305 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5490 11205 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5940 11205 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6390 9855 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6390 10305 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6390 10755 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 8235 10305 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 8235 11205 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 8685 10305 m
 gs 1 -1 sc (X) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 4905 14310 m
 gs 1 -1 sc (1) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 5355 14310 m
 gs 1 -1 sc (8) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 5805 14310 m
 gs 1 -1 sc (9) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 6120 14310 m
 gs 1 -1 sc (13) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 6615 14310 m
 gs 1 -1 sc (14) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4860 13905 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5310 13905 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5760 13905 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6210 13905 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6705 13905 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 3015 16155 m
 gs 1 -1 sc (X) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 2385 17910 m
 gs 1 -1 sc (0) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 2835 17910 m
 gs 1 -1 sc (1) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 3285 17910 m
 gs 1 -1 sc (2) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 3735 17910 m
 gs 1 -1 sc (3) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 2340 18405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 3240 18405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 2790 18405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 3735 18405 m
 gs 1 -1 sc (X) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 5040 17910 m
 gs 1 -1 sc (8) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 5535 17910 m
 gs 1 -1 sc (9) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 5895 17910 m
 gs 1 -1 sc (10) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 6390 17910 m
 gs 1 -1 sc (11) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5040 18405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5490 18405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5985 18405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6435 18405 m
 gs 1 -1 sc (X) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 7650 17910 m
 gs 1 -1 sc (12) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 8145 17910 m
 gs 1 -1 sc (13) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 8595 17910 m
 gs 1 -1 sc (14) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 9045 17910 m
 gs 1 -1 sc (15) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 8235 18405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 7785 18405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 8685 18405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 9090 18405 m
 gs 1 -1 sc (X) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 2385 11610 m
 gs 1 -1 sc (0) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 6435 11610 m
 gs 1 -1 sc (9) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 5985 11610 m
 gs 1 -1 sc (8) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 5535 11610 m
 gs 1 -1 sc (7) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 5085 11610 m
 gs 1 -1 sc (6) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 4635 11610 m
 gs 1 -1 sc (5) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 4185 11610 m
 gs 1 -1 sc (4) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 3735 11610 m
 gs 1 -1 sc (3) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 3285 11610 m
 gs 1 -1 sc (2) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 2790 11610 m
 gs 1 -1 sc (1) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 9045 11610 m
 gs 1 -1 sc (15) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 8595 11610 m
 gs 1 -1 sc (14) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 8145 11610 m
 gs 1 -1 sc (13) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 7695 11610 m
 gs 1 -1 sc (12) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 7290 11610 m
 gs 1 -1 sc (11) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 3060 16560 m
 gs 1 -1 sc (1) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 5535 16560 m
 gs 1 -1 sc (8) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 5985 16560 m
 gs 1 -1 sc (9) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 8100 16560 m
 gs 1 -1 sc (13) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 8595 16560 m
 gs 1 -1 sc (14) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
+/Times-Roman ff 381.00 scf sf
 6750 11610 m
 gs 1 -1 sc (10) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5985 16155 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 8190 16155 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 8685 16155 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5490 16155 m
 gs 1 -1 sc (X) col0 sh gr
-/Times-Italic-iso ff 381.00 scf sf
+/Times-Italic ff 381.00 scf sf
 -2250 11700 m
 gs 1 -1 sc (Node 1) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4140 12105 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4635 12105 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5040 12105 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5490 12105 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 2790 12105 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5985 12105 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 2340 12105 m
 gs 1 -1 sc (X) col32 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 3240 12105 m
 gs 1 -1 sc (X) col32 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 3690 12105 m
 gs 1 -1 sc (X) col32 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 7740 12105 m
 gs 1 -1 sc (X) col32 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 9135 12105 m
 gs 1 -1 sc (X) col32 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 7335 12105 m
 gs 1 -1 sc (X) col32 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6435 12105 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 8685 12105 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 8235 12105 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6840 12105 m
 gs 1 -1 sc (X) col32 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 4680 12420 m
 gs 1 -1 sc (local) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 4275 12645 m
-gs 1 -1 sc (Sub-vector ) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+gs 1 -1 sc (Subvector ) col0 sh gr
+/Times-Roman ff 317.50 scf sf
 -135 10350 m
-gs 1 -1 sc (Local sub-matrix) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+gs 1 -1 sc (Local submatrix) col0 sh gr
+/Times-Roman ff 317.50 scf sf
 270 12015 m
 gs 1 -1 sc (Global vector ) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 2205 13815 m
-gs 1 -1 sc (Shared sub-vector) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+gs 1 -1 sc (Shared subvector) col0 sh gr
+/Times-Roman ff 317.50 scf sf
 10575 12015 m
 gs 1 -1 sc (Sparse storage) col4 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 990 14580 m
 gs 1 -1 sc (Send vector elements ) col4 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 1800 14940 m
 gs 1 -1 sc (to Node 1) col4 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 8190 13815 m
 gs 1 -1 sc (Compressed storage ) col4 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 11115 12330 m
 gs 1 -1 sc (format) col4 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 9135 14130 m
 gs 1 -1 sc (format) col4 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 -675 17010 m
 gs 1 -1 sc (Determine vector elements) col4 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 0 17415 m
 gs 1 -1 sc (needed by Node 1) col4 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 -225 18315 m
-gs 1 -1 sc (Local sub-vectors) col0 sh gr
-/Times-Italic-iso ff 317.50 scf sf
+gs 1 -1 sc (Local subvectors) col0 sh gr
+/Times-Italic ff 317.50 scf sf
 2655 18810 m
 gs 1 -1 sc (Node 0) col0 sh gr
-/Times-Italic-iso ff 317.50 scf sf
+/Times-Italic ff 317.50 scf sf
 5355 18810 m
 gs 1 -1 sc (Node 2) col0 sh gr
-/Times-Italic-iso ff 317.50 scf sf
+/Times-Italic ff 317.50 scf sf
 8055 18810 m
 gs 1 -1 sc (Node 3) col0 sh gr
-/Times-Italic-iso ff 381.00 scf sf
+/Times-Italic ff 381.00 scf sf
 -2700 17055 m
 gs 1 -1 sc (Neighbors ) col0 sh gr
-/Times-Italic-iso ff 381.00 scf sf
+/Times-Italic ff 381.00 scf sf
 -2475 17505 m
-gs 1 -1 sc (0, 2 et 3) col0 sh gr
+gs 1 -1 sc (0, 2, and 3) col0 sh gr
 % here ends figure;
 pagefooter
 showpage
diff --git a/BookGPU/Chapters/chapter12/figures/compress.fig b/BookGPU/Chapters/chapter12/figures/compress.fig
index ac628da..5cc0187 100644
--- a/BookGPU/Chapters/chapter12/figures/compress.fig
+++ b/BookGPU/Chapters/chapter12/figures/compress.fig
@@ -17,76 +17,76 @@ Single
 	 5850 9450 5850 11250
 2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
 	 7650 9450 7650 11250
-4 0 0 50 -1 4 25 0.0000 4 315 255 2790 10755 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4140 9855 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4140 10755 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4590 10305 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5040 9855 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5040 10755 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5490 10305 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5490 11205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5940 11205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6390 9855 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6390 10305 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6390 10755 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8235 10305 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8235 11205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8685 10305 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2790 10755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4140 9855 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4140 10755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4590 10305 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5040 9855 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5040 10755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5490 10305 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5490 11205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5940 11205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6390 9855 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6390 10305 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6390 10755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8235 10305 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8235 11205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8685 10305 X\001
 -6
 6 4725 13455 7065 14355
 2 2 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 	 4770 13500 7020 13500 7020 13950 4770 13950 4770 13500
-4 0 0 50 -1 0 24 0.0000 4 270 210 4905 14310 1\001
-4 0 0 50 -1 0 24 0.0000 4 270 210 5355 14310 8\001
-4 0 0 50 -1 0 24 0.0000 4 285 210 5805 14310 9\001
-4 0 0 50 -1 0 24 0.0000 4 270 420 6120 14310 13\001
-4 0 0 50 -1 0 24 0.0000 4 270 420 6615 14310 14\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4860 13905 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5310 13905 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5760 13905 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6210 13905 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6705 13905 X\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 4905 14310 1\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 5355 14310 8\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 5805 14310 9\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 6120 14310 13\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 6615 14310 14\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4860 13905 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5310 13905 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5760 13905 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6210 13905 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6705 13905 X\001
 -6
 6 2880 15705 3420 16245
 2 2 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 	 2925 15750 3375 15750 3375 16200 2925 16200 2925 15750
-4 0 0 50 -1 4 25 0.0000 4 315 255 3015 16155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3015 16155 X\001
 -6
 6 2205 17640 4095 18495
 2 2 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 	 2250 18000 4050 18000 4050 18450 2250 18450 2250 18000
-4 0 0 50 -1 0 24 0.0000 4 270 210 2385 17910 0\001
-4 0 0 50 -1 0 24 0.0000 4 270 210 2835 17910 1\001
-4 0 0 50 -1 0 24 0.0000 4 270 210 3285 17910 2\001
-4 0 0 50 -1 0 24 0.0000 4 270 210 3735 17910 3\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2340 18405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3240 18405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2790 18405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3735 18405 X\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 2385 17910 0\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 2835 17910 1\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 3285 17910 2\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 3735 17910 3\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2340 18405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3240 18405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2790 18405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3735 18405 X\001
 -6
 6 4905 17640 6840 18495
 2 2 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 	 4950 18000 6750 18000 6750 18450 4950 18450 4950 18000
-4 0 0 50 -1 0 24 0.0000 4 270 210 5040 17910 8\001
-4 0 0 50 -1 0 24 0.0000 4 285 210 5535 17910 9\001
-4 0 0 50 -1 0 24 0.0000 4 270 420 5895 17910 10\001
-4 0 0 50 -1 0 24 0.0000 4 270 420 6390 17910 11\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5040 18405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5490 18405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5985 18405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 18405 X\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 5040 17910 8\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 5535 17910 9\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 5895 17910 10\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 6390 17910 11\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5040 18405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5490 18405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5985 18405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 18405 X\001
 -6
 6 7605 17595 9495 18495
 2 2 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 	 7650 18000 9450 18000 9450 18450 7650 18450 7650 18000
-4 0 0 50 -1 0 24 0.0000 4 270 420 7650 17910 12\001
-4 0 0 50 -1 0 24 0.0000 4 270 420 8145 17910 13\001
-4 0 0 50 -1 0 24 0.0000 4 270 420 8595 17910 14\001
-4 0 0 50 -1 0 24 0.0000 4 285 420 9045 17910 15\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8235 18405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7785 18405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8685 18405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 9090 18405 X\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 7650 17910 12\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 8145 17910 13\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 8595 17910 14\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 9045 17910 15\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8235 18405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7785 18405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8685 18405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 9090 18405 X\001
 -6
 2 2 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 	 5400 15750 6300 15750 6300 16200 5400 16200 5400 15750
@@ -136,64 +136,64 @@ Single
 2 1 1 1 0 32 50 -1 -1 4.000 0 0 -1 1 0 2
 	0 0 2.00 120.00 150.00
 	 7065 13455 9405 12240
-4 0 0 50 -1 0 24 0.0000 4 270 210 2385 11610 0\001
-4 0 0 50 -1 0 24 0.0000 4 285 210 6435 11610 9\001
-4 0 0 50 -1 0 24 0.0000 4 270 210 5985 11610 8\001
-4 0 0 50 -1 0 24 0.0000 4 270 210 5535 11610 7\001
-4 0 0 50 -1 0 24 0.0000 4 270 210 5085 11610 6\001
-4 0 0 50 -1 0 24 0.0000 4 285 210 4635 11610 5\001
-4 0 0 50 -1 0 24 0.0000 4 270 210 4185 11610 4\001
-4 0 0 50 -1 0 24 0.0000 4 270 210 3735 11610 3\001
-4 0 0 50 -1 0 24 0.0000 4 270 210 3285 11610 2\001
-4 0 0 50 -1 0 24 0.0000 4 270 210 2790 11610 1\001
-4 0 0 50 -1 0 24 0.0000 4 285 420 9045 11610 15\001
-4 0 0 50 -1 0 24 0.0000 4 270 420 8595 11610 14\001
-4 0 0 50 -1 0 24 0.0000 4 270 420 8145 11610 13\001
-4 0 0 50 -1 0 24 0.0000 4 270 420 7695 11610 12\001
-4 0 0 50 -1 0 24 0.0000 4 270 420 7290 11610 11\001
-4 0 0 50 -1 0 24 0.0000 4 270 210 3060 16560 1\001
-4 0 0 50 -1 0 24 0.0000 4 270 210 5535 16560 8\001
-4 0 0 50 -1 0 24 0.0000 4 285 210 5985 16560 9\001
-4 0 0 50 -1 0 24 0.0000 4 270 420 8100 16560 13\001
-4 0 0 50 -1 0 24 0.0000 4 270 420 8595 16560 14\001
-4 0 0 50 -1 0 24 0.0000 4 270 420 6750 11610 10\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5985 16155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8190 16155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8685 16155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5490 16155 X\001
-4 0 0 50 -1 1 24 0.0000 4 285 1185 -2250 11700 Node 1\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4140 12105 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4635 12105 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5040 12105 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5490 12105 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2790 12105 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5985 12105 X\001
-4 0 32 50 -1 4 25 0.0000 4 315 255 2340 12105 X\001
-4 0 32 50 -1 4 25 0.0000 4 315 255 3240 12105 X\001
-4 0 32 50 -1 4 25 0.0000 4 315 255 3690 12105 X\001
-4 0 32 50 -1 4 25 0.0000 4 315 255 7740 12105 X\001
-4 0 32 50 -1 4 25 0.0000 4 315 255 9135 12105 X\001
-4 0 32 50 -1 4 25 0.0000 4 315 255 7335 12105 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 12105 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8685 12105 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8235 12105 X\001
-4 0 32 50 -1 4 25 0.0000 4 315 255 6840 12105 X\001
-4 0 0 50 -1 0 20 0.0000 4 225 645 4680 12420 local\001
-4 0 0 50 -1 0 20 0.0000 4 225 1530 4275 12645 Sub-vector \001
-4 0 0 50 -1 0 20 0.0000 4 225 2280 -135 10350 Local sub-matrix\001
-4 0 0 50 -1 0 20 0.0000 4 225 1905 270 12015 Global vector \001
-4 0 0 50 -1 0 20 0.0000 4 225 2400 2205 13815 Shared sub-vector\001
-4 0 4 50 -1 0 20 0.0000 4 300 1935 10575 12015 Sparse storage\001
-4 0 4 50 -1 0 20 0.0000 4 225 2940 990 14580 Send vector elements \001
-4 0 4 50 -1 0 20 0.0000 4 225 1320 1800 14940 to Node 1\001
-4 0 4 50 -1 0 20 0.0000 4 300 2790 8190 13815 Compressed storage \001
-4 0 4 50 -1 0 20 0.0000 4 225 870 11115 12330 format\001
-4 0 4 50 -1 0 20 0.0000 4 225 870 9135 14130 format\001
-4 0 4 50 -1 0 20 0.0000 4 225 3585 -675 17010 Determine vector elements\001
-4 0 4 50 -1 0 20 0.0000 4 300 2430 0 17415 needed by Node 1\001
-4 0 0 50 -1 0 20 0.0000 4 225 2385 -225 18315 Local sub-vectors\001
-4 0 0 50 -1 1 20 0.0000 4 225 960 2655 18810 Node 0\001
-4 0 0 50 -1 1 20 0.0000 4 225 960 5355 18810 Node 2\001
-4 0 0 50 -1 1 20 0.0000 4 225 960 8055 18810 Node 3\001
-4 0 0 50 -1 1 24 0.0000 4 375 1845 -2700 17055 Neighbors \001
-4 0 0 50 -1 1 24 0.0000 4 315 1350 -2475 17505 0, 2 et 3\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 2385 11610 0\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 6435 11610 9\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 5985 11610 8\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 5535 11610 7\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 5085 11610 6\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 4635 11610 5\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 4185 11610 4\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 3735 11610 3\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 3285 11610 2\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 2790 11610 1\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 9045 11610 15\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 8595 11610 14\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 8145 11610 13\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 7695 11610 12\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 7290 11610 11\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 3060 16560 1\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 5535 16560 8\001
+4 0 0 50 -1 0 24 0.0000 4 270 195 5985 16560 9\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 8100 16560 13\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 8595 16560 14\001
+4 0 0 50 -1 0 24 0.0000 4 270 390 6750 11610 10\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5985 16155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8190 16155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8685 16155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5490 16155 X\001
+4 0 0 50 -1 1 24 0.0000 4 285 1110 -2250 11700 Node 1\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4140 12105 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4635 12105 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5040 12105 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5490 12105 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2790 12105 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5985 12105 X\001
+4 0 32 50 -1 4 25 0.0000 4 300 255 2340 12105 X\001
+4 0 32 50 -1 4 25 0.0000 4 300 255 3240 12105 X\001
+4 0 32 50 -1 4 25 0.0000 4 300 255 3690 12105 X\001
+4 0 32 50 -1 4 25 0.0000 4 300 255 7740 12105 X\001
+4 0 32 50 -1 4 25 0.0000 4 300 255 9135 12105 X\001
+4 0 32 50 -1 4 25 0.0000 4 300 255 7335 12105 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 12105 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8685 12105 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8235 12105 X\001
+4 0 32 50 -1 4 25 0.0000 4 300 255 6840 12105 X\001
+4 0 0 50 -1 0 20 0.0000 4 225 615 4680 12420 local\001
+4 0 0 50 -1 0 20 0.0000 4 225 1395 4275 12645 Subvector \001
+4 0 0 50 -1 0 20 0.0000 4 225 2130 -135 10350 Local submatrix\001
+4 0 0 50 -1 0 20 0.0000 4 225 1845 270 12015 Global vector \001
+4 0 0 50 -1 0 20 0.0000 4 225 2235 2205 13815 Shared subvector\001
+4 0 4 50 -1 0 20 0.0000 4 300 1890 10575 12015 Sparse storage\001
+4 0 4 50 -1 0 20 0.0000 4 225 2850 990 14580 Send vector elements \001
+4 0 4 50 -1 0 20 0.0000 4 225 1260 1800 14940 to Node 1\001
+4 0 4 50 -1 0 20 0.0000 4 300 2670 8190 13815 Compressed storage \001
+4 0 4 50 -1 0 20 0.0000 4 225 885 11115 12330 format\001
+4 0 4 50 -1 0 20 0.0000 4 225 885 9135 14130 format\001
+4 0 4 50 -1 0 20 0.0000 4 225 3495 -675 17010 Determine vector elements\001
+4 0 4 50 -1 0 20 0.0000 4 300 2265 0 17415 needed by Node 1\001
+4 0 0 50 -1 0 20 0.0000 4 225 2175 -225 18315 Local subvectors\001
+4 0 0 50 -1 1 20 0.0000 4 240 885 2655 18810 Node 0\001
+4 0 0 50 -1 1 20 0.0000 4 240 885 5355 18810 Node 2\001
+4 0 0 50 -1 1 20 0.0000 4 240 885 8055 18810 Node 3\001
+4 0 0 50 -1 1 24 0.0000 4 390 1725 -2700 17055 Neighbors \001
+4 0 0 50 -1 1 24 0.0000 4 345 1665 -2475 17505 0, 2, and 3\001
diff --git a/BookGPU/Chapters/chapter12/figures/compress.fig.bak b/BookGPU/Chapters/chapter12/figures/compress.fig.bak
index b8ecc58..ac628da 100644
--- a/BookGPU/Chapters/chapter12/figures/compress.fig.bak
+++ b/BookGPU/Chapters/chapter12/figures/compress.fig.bak
@@ -126,14 +126,16 @@ Single
 2 1 0 1 4 7 50 -1 -1 0.000 0 0 -1 0 1 2
 	1 1 2.00 90.00 150.00
 	 9540 11925 10530 11925
-2 1 1 1 0 32 50 -1 -1 4.000 0 0 -1 0 0 2
-	 4680 13455 2295 12240
-2 1 1 1 0 32 50 -1 -1 4.000 0 0 -1 0 0 2
-	 7065 13455 9405 12240
 2 1 0 1 0 32 50 -1 -1 0.000 0 0 -1 0 0 1
 	 9405 12240
 2 1 0 1 0 32 50 -1 -1 0.000 0 0 -1 0 0 1
 	 9405 12240
+2 1 1 1 0 32 50 -1 -1 4.000 0 0 -1 1 0 2
+	0 0 2.00 120.00 150.00
+	 4680 13455 2295 12240
+2 1 1 1 0 32 50 -1 -1 4.000 0 0 -1 1 0 2
+	0 0 2.00 120.00 150.00
+	 7065 13455 9405 12240
 4 0 0 50 -1 0 24 0.0000 4 270 210 2385 11610 0\001
 4 0 0 50 -1 0 24 0.0000 4 285 210 6435 11610 9\001
 4 0 0 50 -1 0 24 0.0000 4 270 210 5985 11610 8\001
diff --git a/BookGPU/Chapters/chapter12/figures/compress.pdf b/BookGPU/Chapters/chapter12/figures/compress.pdf
index 3163620..e53930c 100644
Binary files a/BookGPU/Chapters/chapter12/figures/compress.pdf and b/BookGPU/Chapters/chapter12/figures/compress.pdf differ
diff --git a/BookGPU/Chapters/chapter12/figures/partition.eps b/BookGPU/Chapters/chapter12/figures/partition.eps
index 280f271..a7e3121 100644
--- a/BookGPU/Chapters/chapter12/figures/partition.eps
+++ b/BookGPU/Chapters/chapter12/figures/partition.eps
@@ -1,8 +1,8 @@
 %!PS-Adobe-2.0 EPSF-2.0
 %%Title: partition.fig
 %%Creator: fig2dev Version 3.2 Patchlevel 5c
-%%CreationDate: Tue Feb  5 11:11:52 2013
-%%BoundingBox: 0 0 635 592
+%%CreationDate: Fri Jul 19 19:04:39 2013
+%%BoundingBox: 0 0 630 592
 %Magnification: 1.0000
 %%EndComments
 %%BeginProlog
@@ -81,7 +81,7 @@ end
 
 /pageheader {
 save
-newpath 0 592 moveto 0 0 lineto 635 0 lineto 635 592 lineto closepath clip newpath
+newpath 0 592 moveto 0 0 lineto 630 0 lineto 630 592 lineto closepath clip newpath
 -44.4 545.2 translate
 1 -1 scale
 $F2psBegin
@@ -476,13 +476,13 @@ gs 1 -1 sc (X) col0 sh gr
 gs 1 -1 sc (X) col0 sh gr
 /Times-Roman ff 317.50 scf sf
 4500 810 m
-gs 1 -1 sc (local x) col0 sh gr
+gs 1 -1 sc (local ) col0 sh gr
 /Times-Roman ff 317.50 scf sf
 2790 810 m
-gs 1 -1 sc (shared x) col0 sh gr
+gs 1 -1 sc (shared ) col0 sh gr
 /Times-Roman ff 317.50 scf sf
 6795 810 m
-gs 1 -1 sc (shared x) col0 sh gr
+gs 1 -1 sc (shared ) col0 sh gr
 /Times-Roman ff 381.00 scf sf
 4590 -450 m
 gs 1 -1 sc (Matrix bandwidth) col0 sh gr
@@ -500,13 +500,13 @@ gs 1 -1 sc (offset 2) col0 sh gr
 gs 1 -1 sc (offset 3) col0 sh gr
 /Times-Roman ff 317.50 scf sf
 9585 8550 m
-gs 1 -1 sc (Vector b) col0 sh gr
+gs 1 -1 sc (Vector ) col0 sh gr
 /Times-Roman ff 317.50 scf sf
 4905 8550 m
-gs 1 -1 sc (Sparse matrix A) col0 sh gr
+gs 1 -1 sc (Sparse matrix ) col0 sh gr
 /Times-Roman ff 317.50 scf sf
 990 180 m
-gs 1 -1 sc (Vector x) col0 sh gr
+gs 1 -1 sc (Vector ) col0 sh gr
 /Times-Roman ff 381.00 scf sf
 720 1935 m
 gs 1 -1 sc (Node 0) col0 sh gr
@@ -519,6 +519,24 @@ gs 1 -1 sc (Node 2) col0 sh gr
 /Times-Roman ff 381.00 scf sf
 720 7290 m
 gs 1 -1 sc (Node 3) col0 sh gr
+/Times-Italic ff 317.50 scf sf
+10530 8550 m
+gs 1 -1 sc (b) col0 sh gr
+/Times-Italic ff 317.50 scf sf
+6885 8550 m
+gs 1 -1 sc (A) col0 sh gr
+/Times-Italic ff 317.50 scf sf
+1980 180 m
+gs 1 -1 sc (x) col0 sh gr
+/Times-Italic ff 317.50 scf sf
+7740 810 m
+gs 1 -1 sc (x) col0 sh gr
+/Times-Italic ff 317.50 scf sf
+5175 810 m
+gs 1 -1 sc (x) col0 sh gr
+/Times-Italic ff 317.50 scf sf
+3690 810 m
+gs 1 -1 sc (x) col0 sh gr
 % here ends figure;
 pagefooter
 showpage
diff --git a/BookGPU/Chapters/chapter12/figures/partition.fig b/BookGPU/Chapters/chapter12/figures/partition.fig
index 06df2c4..e0fad5f 100644
--- a/BookGPU/Chapters/chapter12/figures/partition.fig
+++ b/BookGPU/Chapters/chapter12/figures/partition.fig
@@ -18,22 +18,22 @@ Single
 	 5850 315 5850 -135
 2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
 	 7650 315 7650 -135
-4 0 0 50 -1 4 25 0.0000 4 315 255 7785 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2385 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2835 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3285 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3735 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4185 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4635 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5085 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5535 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5985 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6885 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7335 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8235 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8685 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 9135 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7785 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2385 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2835 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3285 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3735 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4185 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4635 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5085 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5535 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5985 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6885 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7335 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8235 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8685 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 9135 225 X\001
 -6
 6 2205 855 9495 8145
 2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
@@ -52,56 +52,56 @@ Single
 	 7650 900 7650 8100
 2 2 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 	 2250 900 9450 900 9450 8100 2250 8100 2250 900
-4 0 0 50 -1 4 25 0.0000 4 315 255 2340 1260 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2835 1710 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3240 2160 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3735 2610 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4140 3060 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4635 3510 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5085 3960 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5490 4410 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5985 4905 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 5310 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6885 5805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7290 6210 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7740 6660 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8235 7155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8685 7560 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 9090 8055 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5490 2205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5940 2205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 2205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 3105 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8190 4410 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8190 3555 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8685 3555 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 3960 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8235 2205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8640 1755 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6885 5310 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5985 4410 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5490 3555 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2790 2205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3285 1305 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3690 1755 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2790 2655 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4185 3960 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5085 3105 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2790 4005 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3240 4905 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2790 4905 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 5805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8685 7110 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7785 7155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6840 7560 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7740 8055 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4635 5355 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4635 5805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5085 5355 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3285 7155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 3510 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7740 5760 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5040 6705 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2340 1260 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2835 1710 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3240 2160 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3735 2610 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4140 3060 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4635 3510 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5085 3960 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5490 4410 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5985 4905 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 5310 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6885 5805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7290 6210 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7740 6660 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8235 7155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8685 7560 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 9090 8055 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5490 2205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5940 2205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 2205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 3105 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8190 4410 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8190 3555 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8685 3555 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 3960 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8235 2205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8640 1755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6885 5310 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5985 4410 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5490 3555 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2790 2205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3285 1305 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3690 1755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2790 2655 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4185 3960 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5085 3105 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2790 4005 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3240 4905 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2790 4905 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 5805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8685 7110 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7785 7155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6840 7560 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7740 8055 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4635 5355 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4635 5805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5085 5355 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3285 7155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 3510 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7740 5760 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5040 6705 X\001
 -6
 6 9855 855 10395 8145
 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
@@ -114,22 +114,22 @@ Single
 	 9900 2700 10350 2700
 2 2 0 1 4 4 50 -1 38 0.000 0 0 -1 0 0 5
 	 9900 2700 10350 2700 10350 4500 9900 4500 9900 2700
-4 0 0 50 -1 4 25 0.0000 4 315 255 9990 1260 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 9990 1755 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 9990 2205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 9990 2655 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 3105 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 3555 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 4005 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 4455 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 4905 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 5355 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 5805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 6255 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 7155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 8055 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 6660 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 7605 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 9990 1260 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 9990 1755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 9990 2205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 9990 2655 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 3105 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 3555 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 4005 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 4455 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 4905 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 5355 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 5805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 6255 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 7155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 8055 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 6660 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 7605 X\001
 -6
 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
 	0 0 1.00 75.00 120.00
@@ -155,18 +155,24 @@ Single
 	0 0 1.00 75.00 120.00
 	0 0 1.00 75.00 120.00
 	 4050 540 5850 540
-4 0 0 50 -1 1 20 0.0000 0 225 900 4500 810 local x\001
-4 0 0 50 -1 1 20 0.0000 0 225 1125 2790 810 shared x\001
-4 0 0 50 -1 1 20 0.0000 0 225 1125 6795 810 shared x\001
-4 0 0 50 -1 1 24 0.0000 0 270 3000 4590 -450 Matrix bandwidth\001
-4 0 0 50 -1 1 18 0.0000 4 270 900 1215 990 offset 0\001
-4 0 4 50 -1 1 18 0.0000 4 255 900 1260 2790 offset 1\001
-4 0 0 50 -1 1 18 0.0000 4 255 900 1260 4590 offset 2\001
-4 0 0 50 -1 1 18 0.0000 4 270 900 1260 6390 offset 3\001
-4 0 0 50 -1 1 20 0.0000 0 225 1155 9585 8550 Vector b\001
-4 0 0 50 -1 1 20 0.0000 0 300 2160 4905 8550 Sparse matrix A\001
-4 0 0 50 -1 1 20 0.0000 0 225 1155 990 180 Vector x\001
-4 0 0 50 -1 1 24 0.0000 0 270 1215 720 1935 Node 0\001
-4 0 4 50 -1 1 24 0.0000 0 270 1215 720 3645 Node 1\001
-4 0 0 50 -1 1 24 0.0000 0 270 1215 720 5490 Node 2\001
-4 0 0 50 -1 1 24 0.0000 0 270 1215 720 7290 Node 3\001
+4 0 0 50 -1 1 20 0.0000 0 225 705 4500 810 local \001
+4 0 0 50 -1 1 20 0.0000 0 225 930 2790 810 shared \001
+4 0 0 50 -1 1 20 0.0000 0 225 930 6795 810 shared \001
+4 0 0 50 -1 1 24 0.0000 0 270 2850 4590 -450 Matrix bandwidth\001
+4 0 0 50 -1 1 18 0.0000 4 270 795 1215 990 offset 0\001
+4 0 4 50 -1 1 18 0.0000 4 270 795 1260 2790 offset 1\001
+4 0 0 50 -1 1 18 0.0000 4 270 795 1260 4590 offset 2\001
+4 0 0 50 -1 1 18 0.0000 4 270 795 1260 6390 offset 3\001
+4 0 0 50 -1 1 20 0.0000 0 225 975 9585 8550 Vector \001
+4 0 0 50 -1 1 20 0.0000 0 300 1920 4905 8550 Sparse matrix \001
+4 0 0 50 -1 1 20 0.0000 0 225 975 990 180 Vector \001
+4 0 0 50 -1 1 24 0.0000 0 270 1140 720 1935 Node 0\001
+4 0 4 50 -1 1 24 0.0000 0 270 1140 720 3645 Node 1\001
+4 0 0 50 -1 1 24 0.0000 0 270 1140 720 5490 Node 2\001
+4 0 0 50 -1 1 24 0.0000 0 270 1140 720 7290 Node 3\001
+4 0 0 50 -1 1 20 0.0000 4 240 150 10530 8550 b\001
+4 0 0 50 -1 1 20 0.0000 4 225 195 6885 8550 A\001
+4 0 0 50 -1 1 20 0.0000 4 150 135 1980 180 x\001
+4 0 0 50 -1 1 20 0.0000 4 150 135 7740 810 x\001
+4 0 0 50 -1 1 20 0.0000 4 150 135 5175 810 x\001
+4 0 0 50 -1 1 20 0.0000 4 150 135 3690 810 x\001
diff --git a/BookGPU/Chapters/chapter12/figures/partition.fig.bak b/BookGPU/Chapters/chapter12/figures/partition.fig.bak
index bdfab60..087cfa0 100644
--- a/BookGPU/Chapters/chapter12/figures/partition.fig.bak
+++ b/BookGPU/Chapters/chapter12/figures/partition.fig.bak
@@ -18,22 +18,22 @@ Single
 	 5850 315 5850 -135
 2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
 	 7650 315 7650 -135
-4 0 0 50 -1 4 25 0.0000 4 315 255 7785 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2385 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2835 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3285 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3735 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4185 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4635 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5085 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5535 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5985 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6885 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7335 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8235 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8685 225 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 9135 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7785 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2385 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2835 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3285 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3735 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4185 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4635 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5085 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5535 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5985 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6885 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7335 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8235 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8685 225 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 9135 225 X\001
 -6
 6 2205 855 9495 8145
 2 1 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
@@ -52,56 +52,56 @@ Single
 	 7650 900 7650 8100
 2 2 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 	 2250 900 9450 900 9450 8100 2250 8100 2250 900
-4 0 0 50 -1 4 25 0.0000 4 315 255 2340 1260 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2835 1710 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3240 2160 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3735 2610 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4140 3060 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4635 3510 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5085 3960 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5490 4410 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5985 4905 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 5310 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6885 5805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7290 6210 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7740 6660 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8235 7155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8685 7560 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 9090 8055 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5490 2205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5940 2205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 2205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 3105 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8190 4410 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8190 3555 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8685 3555 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 3960 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8235 2205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8640 1755 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6885 5310 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5985 4410 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5490 3555 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2790 2205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3285 1305 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3690 1755 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2790 2655 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4185 3960 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5085 3105 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2790 4005 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3240 4905 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2790 4905 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 5805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8685 7110 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7785 7155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6840 7560 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7740 8055 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4635 5355 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4635 5805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5085 5355 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3285 7155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6435 3510 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7740 5760 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5040 6705 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2340 1260 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2835 1710 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3240 2160 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3735 2610 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4140 3060 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4635 3510 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5085 3960 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5490 4410 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5985 4905 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 5310 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6885 5805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7290 6210 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7740 6660 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8235 7155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8685 7560 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 9090 8055 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5490 2205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5940 2205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 2205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 3105 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8190 4410 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8190 3555 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8685 3555 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 3960 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8235 2205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8640 1755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6885 5310 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5985 4410 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5490 3555 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2790 2205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3285 1305 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3690 1755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2790 2655 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4185 3960 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5085 3105 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2790 4005 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3240 4905 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2790 4905 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 5805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8685 7110 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7785 7155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6840 7560 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7740 8055 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4635 5355 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4635 5805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5085 5355 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3285 7155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6435 3510 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7740 5760 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5040 6705 X\001
 -6
 6 9855 855 10395 8145
 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
@@ -114,27 +114,23 @@ Single
 	 9900 2700 10350 2700
 2 2 0 1 4 4 50 -1 38 0.000 0 0 -1 0 0 5
 	 9900 2700 10350 2700 10350 4500 9900 4500 9900 2700
-4 0 0 50 -1 4 25 0.0000 4 315 255 9990 1260 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 9990 1755 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 9990 2205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 9990 2655 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 3105 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 3555 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 4005 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 4455 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 4905 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 5355 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 5805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 6255 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 7155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 8055 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 6660 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 10035 7605 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 9990 1260 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 9990 1755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 9990 2205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 9990 2655 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 3105 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 3555 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 4005 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 4455 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 4905 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 5355 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 5805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 6255 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 7155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 8055 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 6660 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 10035 7605 X\001
 -6
-2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
-	0 0 1.00 75.00 120.00
-	0 0 1.00 75.00 120.00
-	 4050 540 5850 540
 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
 	0 0 1.00 75.00 120.00
 	0 0 1.00 75.00 120.00
@@ -155,18 +151,25 @@ Single
 	 2115 900 2250 900
 2 1 0 2 4 7 50 -1 -1 0.000 0 0 -1 0 0 2
 	 2115 2700 2250 2700
-4 0 0 50 -1 1 20 0.0000 0 225 900 4500 810 x local\001
-4 0 0 50 -1 1 20 0.0000 0 300 1230 2790 810 x partag\351\001
-4 0 0 50 -1 1 20 0.0000 0 300 1230 6795 810 x partag\351\001
-4 0 0 50 -1 1 20 0.0000 0 225 1305 9495 8550 Vecteur b\001
-4 0 0 50 -1 1 20 0.0000 0 225 1305 765 180 Vecteur x\001
-4 0 0 50 -1 1 20 0.0000 0 225 2310 4770 8550 Matrice creuse A\001
-4 0 0 50 -1 1 24 0.0000 0 270 1425 450 5490 Noeud 2\001
-4 0 0 50 -1 1 24 0.0000 0 270 1425 450 7290 Noeud 3\001
-4 0 4 50 -1 1 24 0.0000 0 270 1425 495 3690 Noeud 1\001
-4 0 0 50 -1 1 24 0.0000 0 270 1425 450 1890 Noeud 0\001
-4 0 0 50 -1 1 24 0.0000 0 360 2895 4590 -450 Largeur de bande\001
-4 0 0 50 -1 1 18 0.0000 4 270 900 1215 990 offset 0\001
-4 0 4 50 -1 1 18 0.0000 4 255 900 1260 2790 offset 1\001
-4 0 0 50 -1 1 18 0.0000 4 255 900 1260 4590 offset 2\001
-4 0 0 50 -1 1 18 0.0000 4 270 900 1260 6390 offset 3\001
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
+	0 0 1.00 75.00 120.00
+	0 0 1.00 75.00 120.00
+	 4050 540 5850 540
+4 0 0 50 -1 1 20 0.0000 0 225 705 4500 810 local \001
+4 0 0 50 -1 1 20 0.0000 0 225 930 2790 810 shared \001
+4 0 0 50 -1 1 20 0.0000 0 225 930 6795 810 shared \001
+4 0 0 50 -1 1 24 0.0000 0 270 2850 4590 -450 Matrix bandwidth\001
+4 0 0 50 -1 1 18 0.0000 4 270 795 1215 990 offset 0\001
+4 0 4 50 -1 1 18 0.0000 4 270 795 1260 2790 offset 1\001
+4 0 0 50 -1 1 18 0.0000 4 270 795 1260 4590 offset 2\001
+4 0 0 50 -1 1 18 0.0000 4 270 795 1260 6390 offset 3\001
+4 0 0 50 -1 1 20 0.0000 0 225 975 9585 8550 Vector \001
+4 0 0 50 -1 1 20 0.0000 0 300 1920 4905 8550 Sparse matrix \001
+4 0 0 50 -1 1 20 0.0000 0 225 975 990 180 Vector \001
+4 0 0 50 -1 1 24 0.0000 0 270 1140 720 1935 Node 0\001
+4 0 4 50 -1 1 24 0.0000 0 270 1140 720 3645 Node 1\001
+4 0 0 50 -1 1 24 0.0000 0 270 1140 720 5490 Node 2\001
+4 0 0 50 -1 1 24 0.0000 0 270 1140 720 7290 Node 3\001
+4 0 0 50 -1 1 20 0.0000 4 240 150 10530 8550 b\001
+4 0 0 50 -1 1 20 0.0000 4 150 135 11520 4455 x\001
+4 0 0 50 -1 1 20 0.0000 4 225 195 6885 8550 A\001
diff --git a/BookGPU/Chapters/chapter12/figures/partition.pdf b/BookGPU/Chapters/chapter12/figures/partition.pdf
index b16aa06..bdd000c 100644
Binary files a/BookGPU/Chapters/chapter12/figures/partition.pdf and b/BookGPU/Chapters/chapter12/figures/partition.pdf differ
diff --git a/BookGPU/Chapters/chapter12/figures/reorder.eps b/BookGPU/Chapters/chapter12/figures/reorder.eps
index 845c100..ab31241 100644
--- a/BookGPU/Chapters/chapter12/figures/reorder.eps
+++ b/BookGPU/Chapters/chapter12/figures/reorder.eps
@@ -1,7 +1,7 @@
 %!PS-Adobe-2.0 EPSF-2.0
 %%Title: reorder.fig
 %%Creator: fig2dev Version 3.2 Patchlevel 5c
-%%CreationDate: Fri Feb  8 17:00:38 2013
+%%CreationDate: Thu Jul 18 18:51:59 2013
 %%BoundingBox: 0 0 638 514
 %Magnification: 1.0000
 %%EndComments
@@ -77,46 +77,6 @@ end
   bind def
 /shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul
   4 -2 roll mul srgb} bind def
-/reencdict 12 dict def /ReEncode { reencdict begin
-/newcodesandnames exch def /newfontname exch def /basefontname exch def
-/basefontdict basefontname findfont def /newfont basefontdict maxlength dict def
-basefontdict { exch dup /FID ne { dup /Encoding eq
-{ exch dup length array copy newfont 3 1 roll put }
-{ exch newfont 3 1 roll put } ifelse } { pop pop } ifelse } forall
-newfont /FontName newfontname put newcodesandnames aload pop
-128 1 255 { newfont /Encoding get exch /.notdef put } for
-newcodesandnames length 2 idiv { newfont /Encoding get 3 1 roll put } repeat
-newfontname newfont definefont pop end } def
-/isovec [
-8#055 /minus 8#200 /grave 8#201 /acute 8#202 /circumflex 8#203 /tilde
-8#204 /macron 8#205 /breve 8#206 /dotaccent 8#207 /dieresis
-8#210 /ring 8#211 /cedilla 8#212 /hungarumlaut 8#213 /ogonek 8#214 /caron
-8#220 /dotlessi 8#230 /oe 8#231 /OE
-8#240 /space 8#241 /exclamdown 8#242 /cent 8#243 /sterling
-8#244 /currency 8#245 /yen 8#246 /brokenbar 8#247 /section 8#250 /dieresis
-8#251 /copyright 8#252 /ordfeminine 8#253 /guillemotleft 8#254 /logicalnot
-8#255 /hyphen 8#256 /registered 8#257 /macron 8#260 /degree 8#261 /plusminus
-8#262 /twosuperior 8#263 /threesuperior 8#264 /acute 8#265 /mu 8#266 /paragraph
-8#267 /periodcentered 8#270 /cedilla 8#271 /onesuperior 8#272 /ordmasculine
-8#273 /guillemotright 8#274 /onequarter 8#275 /onehalf
-8#276 /threequarters 8#277 /questiondown 8#300 /Agrave 8#301 /Aacute
-8#302 /Acircumflex 8#303 /Atilde 8#304 /Adieresis 8#305 /Aring
-8#306 /AE 8#307 /Ccedilla 8#310 /Egrave 8#311 /Eacute
-8#312 /Ecircumflex 8#313 /Edieresis 8#314 /Igrave 8#315 /Iacute
-8#316 /Icircumflex 8#317 /Idieresis 8#320 /Eth 8#321 /Ntilde 8#322 /Ograve
-8#323 /Oacute 8#324 /Ocircumflex 8#325 /Otilde 8#326 /Odieresis 8#327 /multiply
-8#330 /Oslash 8#331 /Ugrave 8#332 /Uacute 8#333 /Ucircumflex
-8#334 /Udieresis 8#335 /Yacute 8#336 /Thorn 8#337 /germandbls 8#340 /agrave
-8#341 /aacute 8#342 /acircumflex 8#343 /atilde 8#344 /adieresis 8#345 /aring
-8#346 /ae 8#347 /ccedilla 8#350 /egrave 8#351 /eacute
-8#352 /ecircumflex 8#353 /edieresis 8#354 /igrave 8#355 /iacute
-8#356 /icircumflex 8#357 /idieresis 8#360 /eth 8#361 /ntilde 8#362 /ograve
-8#363 /oacute 8#364 /ocircumflex 8#365 /otilde 8#366 /odieresis 8#367 /divide
-8#370 /oslash 8#371 /ugrave 8#372 /uacute 8#373 /ucircumflex
-8#374 /udieresis 8#375 /yacute 8#376 /thorn 8#377 /ydieresis] def
-/Times-Roman /Times-Roman-iso isovec ReEncode
-/Times-Roman /Times-Roman-iso isovec ReEncode
-/AvantGarde-Book /AvantGarde-Book-iso isovec ReEncode
 /$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def
 /$F2psEnd {$F2psEnteredState restore end} def
 
@@ -219,280 +179,280 @@ n 5850 11970 m
 45.000 slw
 n 5760 13435 m 5850 13698 l 5940 13435 l 5850 13488 l 5760 13435 l 
  cp gs 0.00 setgray ef gr  col0 s
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 2790 11205 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4140 10305 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4140 11205 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4590 10755 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5085 10305 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5085 11205 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5490 10755 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5490 11655 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5940 11655 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6390 11205 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6390 10755 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6390 10305 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 8190 11655 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 8190 10755 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 8640 10755 m
 gs 1 -1 sc (X) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 2385 9810 m
 gs 1 -1 sc (0) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 2835 9810 m
 gs 1 -1 sc (1) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 3330 9810 m
 gs 1 -1 sc (2) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 4230 9810 m
 gs 1 -1 sc (4) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 3735 9810 m
 gs 1 -1 sc (3) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 5985 9810 m
 gs 1 -1 sc (8) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 6435 9810 m
 gs 1 -1 sc (9) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 5535 9810 m
 gs 1 -1 sc (7) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 5130 9810 m
 gs 1 -1 sc (6) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 6795 9810 m
 gs 1 -1 sc (10) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 4680 9810 m
 gs 1 -1 sc (5) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 7290 9810 m
 gs 1 -1 sc (11) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 8595 9810 m
 gs 1 -1 sc (14) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 9045 9810 m
 gs 1 -1 sc (15) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 8145 9810 m
 gs 1 -1 sc (13) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 7740 9810 m
 gs 1 -1 sc (12) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 2340 9405 m
 gs 1 -1 sc (X) col32 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 3285 9405 m
 gs 1 -1 sc (X) col32 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 3735 9405 m
 gs 1 -1 sc (X) col32 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6885 9405 m
 gs 1 -1 sc (X) col32 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 7335 9405 m
 gs 1 -1 sc (X) col32 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 9135 9405 m
 gs 1 -1 sc (X) col32 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 2790 9405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4185 9405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4590 9405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5040 9405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5490 9405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5940 9405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6390 9405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 8190 9405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 8685 9405 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 7785 9405 m
 gs 1 -1 sc (X) col32 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 3960 14805 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4410 14805 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4860 14805 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5310 14805 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5760 14805 m
 gs 1 -1 sc (X) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 4005 15210 m
 gs 1 -1 sc (4) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 4455 15210 m
 gs 1 -1 sc (5) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 4905 15210 m
 gs 1 -1 sc (6) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 5355 15210 m
 gs 1 -1 sc (7) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 5805 15210 m
 gs 1 -1 sc (1) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6210 14805 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6660 14805 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 7110 14805 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 7560 14805 m
 gs 1 -1 sc (X) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 6255 15210 m
 gs 1 -1 sc (8) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 6705 15210 m
 gs 1 -1 sc (9) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 7065 15210 m
 gs 1 -1 sc (13) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 7515 15210 m
 gs 1 -1 sc (14) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 3960 15705 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 3960 16605 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4860 15705 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4860 16605 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 4410 16155 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5310 16155 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5310 17055 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 5760 16605 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6210 17055 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6660 15705 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6660 16155 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 6660 16605 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 7110 16155 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 7110 17055 m
 gs 1 -1 sc (X) col0 sh gr
-/AvantGarde-Book-iso ff 396.88 scf sf
+/AvantGarde-Book ff 396.88 scf sf
 7560 16155 m
 gs 1 -1 sc (X) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 -630 9315 m
 gs 1 -1 sc (Sparse global vector) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 -315 10800 m
-gs 1 -1 sc (Sparse sub-matrix) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+gs 1 -1 sc (Sparse submatrix) col0 sh gr
+/Times-Roman ff 317.50 scf sf
 3690 12645 m
 gs 1 -1 sc (Reordering of) col4 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 3690 13050 m
-gs 1 -1 sc (the sub-matrix) col4 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+gs 1 -1 sc (the submatrix) col4 sh gr
+/Times-Roman ff 317.50 scf sf
 4095 14265 m
-gs 1 -1 sc (sub-vector) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+gs 1 -1 sc (subvector) col0 sh gr
+/Times-Roman ff 317.50 scf sf
 6210 14265 m
-gs 1 -1 sc (sub-vector) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+gs 1 -1 sc (subvector) col0 sh gr
+/Times-Roman ff 317.50 scf sf
 4455 13950 m
 gs 1 -1 sc (local) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 6480 13950 m
 gs 1 -1 sc (shared) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 1440 16110 m
 gs 1 -1 sc (Reordered sparse) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 1935 16425 m
-gs 1 -1 sc (sub-matrix) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+gs 1 -1 sc (submatrix) col0 sh gr
+/Times-Roman ff 317.50 scf sf
 1035 14850 m
 gs 1 -1 sc (storage format) col0 sh gr
-/Times-Roman-iso ff 317.50 scf sf
+/Times-Roman ff 317.50 scf sf
 -45 14490 m
 gs 1 -1 sc (Global vector in compressed ) col0 sh gr
 % here ends figure;
diff --git a/BookGPU/Chapters/chapter12/figures/reorder.fig b/BookGPU/Chapters/chapter12/figures/reorder.fig
index 92678e7..c6c57dc 100644
--- a/BookGPU/Chapters/chapter12/figures/reorder.fig
+++ b/BookGPU/Chapters/chapter12/figures/reorder.fig
@@ -22,21 +22,21 @@ Single
 -6
 2 2 1 1 4 4 50 -1 38 4.000 0 0 -1 0 0 5
 	 4050 9900 5850 9900 5850 11700 4050 11700 4050 9900
-4 0 0 50 -1 4 25 0.0000 4 315 255 2790 11205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4140 10305 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4140 11205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4590 10755 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5085 10305 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5085 11205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5490 10755 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5490 11655 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5940 11655 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6390 11205 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6390 10755 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6390 10305 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8190 11655 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8190 10755 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8640 10755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2790 11205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4140 10305 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4140 11205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4590 10755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5085 10305 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5085 11205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5490 10755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5490 11655 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5940 11655 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6390 11205 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6390 10755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6390 10305 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8190 11655 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8190 10755 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8640 10755 X\001
 -6
 2 1 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 2
 	 7650 9000 7650 9450
@@ -48,38 +48,38 @@ Single
 	 5850 9000 5850 9450
 2 2 1 1 4 4 50 -1 38 4.000 0 0 -1 0 0 5
 	 4050 9000 5850 9000 5850 9450 4050 9450 4050 9000
-4 0 0 50 -1 0 20 0.0000 4 225 165 2385 9810 0\001
-4 0 0 50 -1 0 20 0.0000 4 225 165 2835 9810 1\001
-4 0 0 50 -1 0 20 0.0000 4 225 165 3330 9810 2\001
-4 0 0 50 -1 0 20 0.0000 4 225 165 4230 9810 4\001
-4 0 0 50 -1 0 20 0.0000 4 225 165 3735 9810 3\001
-4 0 0 50 -1 0 20 0.0000 4 225 165 5985 9810 8\001
-4 0 0 50 -1 0 20 0.0000 4 240 165 6435 9810 9\001
-4 0 0 50 -1 0 20 0.0000 4 225 165 5535 9810 7\001
-4 0 0 50 -1 0 20 0.0000 4 225 165 5130 9810 6\001
-4 0 0 50 -1 0 20 0.0000 4 225 330 6795 9810 10\001
-4 0 0 50 -1 0 20 0.0000 4 240 165 4680 9810 5\001
-4 0 0 50 -1 0 20 0.0000 4 225 330 7290 9810 11\001
-4 0 0 50 -1 0 20 0.0000 4 225 330 8595 9810 14\001
-4 0 0 50 -1 0 20 0.0000 4 240 330 9045 9810 15\001
-4 0 0 50 -1 0 20 0.0000 4 225 330 8145 9810 13\001
-4 0 0 50 -1 0 20 0.0000 4 225 330 7740 9810 12\001
-4 0 32 50 -1 4 25 0.0000 4 315 255 2340 9405 X\001
-4 0 32 50 -1 4 25 0.0000 4 315 255 3285 9405 X\001
-4 0 32 50 -1 4 25 0.0000 4 315 255 3735 9405 X\001
-4 0 32 50 -1 4 25 0.0000 4 315 255 6885 9405 X\001
-4 0 32 50 -1 4 25 0.0000 4 315 255 7335 9405 X\001
-4 0 32 50 -1 4 25 0.0000 4 315 255 9135 9405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 2790 9405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4185 9405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4590 9405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5040 9405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5490 9405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5940 9405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6390 9405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8190 9405 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 8685 9405 X\001
-4 0 32 50 -1 4 25 0.0000 4 315 255 7785 9405 X\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 2385 9810 0\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 2835 9810 1\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 3330 9810 2\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 4230 9810 4\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 3735 9810 3\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 5985 9810 8\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 6435 9810 9\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 5535 9810 7\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 5130 9810 6\001
+4 0 0 50 -1 0 20 0.0000 4 225 300 6795 9810 10\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 4680 9810 5\001
+4 0 0 50 -1 0 20 0.0000 4 225 300 7290 9810 11\001
+4 0 0 50 -1 0 20 0.0000 4 225 300 8595 9810 14\001
+4 0 0 50 -1 0 20 0.0000 4 225 300 9045 9810 15\001
+4 0 0 50 -1 0 20 0.0000 4 225 300 8145 9810 13\001
+4 0 0 50 -1 0 20 0.0000 4 225 300 7740 9810 12\001
+4 0 32 50 -1 4 25 0.0000 4 300 255 2340 9405 X\001
+4 0 32 50 -1 4 25 0.0000 4 300 255 3285 9405 X\001
+4 0 32 50 -1 4 25 0.0000 4 300 255 3735 9405 X\001
+4 0 32 50 -1 4 25 0.0000 4 300 255 6885 9405 X\001
+4 0 32 50 -1 4 25 0.0000 4 300 255 7335 9405 X\001
+4 0 32 50 -1 4 25 0.0000 4 300 255 9135 9405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 2790 9405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4185 9405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4590 9405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5040 9405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5490 9405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5940 9405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6390 9405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8190 9405 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 8685 9405 X\001
+4 0 32 50 -1 4 25 0.0000 4 300 255 7785 9405 X\001
 -6
 6 3825 14355 7965 17145
 2 2 0 2 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
@@ -92,52 +92,52 @@ Single
 	 3870 15300 7920 15300 7920 17100 3870 17100 3870 15300
 2 2 1 1 4 4 50 -1 38 4.000 0 0 -1 0 0 5
 	 3870 15300 5670 15300 5670 17100 3870 17100 3870 15300
-4 0 0 50 -1 4 25 0.0000 4 315 255 3960 14805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4410 14805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4860 14805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5310 14805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5760 14805 X\001
-4 0 0 50 -1 0 20 0.0000 4 225 165 4005 15210 4\001
-4 0 0 50 -1 0 20 0.0000 4 240 165 4455 15210 5\001
-4 0 0 50 -1 0 20 0.0000 4 225 165 4905 15210 6\001
-4 0 0 50 -1 0 20 0.0000 4 225 165 5355 15210 7\001
-4 0 0 50 -1 0 20 0.0000 4 225 165 5805 15210 1\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6210 14805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6660 14805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7110 14805 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7560 14805 X\001
-4 0 0 50 -1 0 20 0.0000 4 225 165 6255 15210 8\001
-4 0 0 50 -1 0 20 0.0000 4 240 165 6705 15210 9\001
-4 0 0 50 -1 0 20 0.0000 4 225 330 7065 15210 13\001
-4 0 0 50 -1 0 20 0.0000 4 225 330 7515 15210 14\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3960 15705 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 3960 16605 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4860 15705 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4860 16605 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 4410 16155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5310 16155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5310 17055 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 5760 16605 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6210 17055 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6660 15705 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6660 16155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 6660 16605 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7110 16155 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7110 17055 X\001
-4 0 0 50 -1 4 25 0.0000 4 315 255 7560 16155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3960 14805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4410 14805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4860 14805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5310 14805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5760 14805 X\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 4005 15210 4\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 4455 15210 5\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 4905 15210 6\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 5355 15210 7\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 5805 15210 1\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6210 14805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6660 14805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7110 14805 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7560 14805 X\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 6255 15210 8\001
+4 0 0 50 -1 0 20 0.0000 4 225 150 6705 15210 9\001
+4 0 0 50 -1 0 20 0.0000 4 225 300 7065 15210 13\001
+4 0 0 50 -1 0 20 0.0000 4 225 300 7515 15210 14\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3960 15705 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 3960 16605 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4860 15705 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4860 16605 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 4410 16155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5310 16155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5310 17055 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 5760 16605 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6210 17055 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6660 15705 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6660 16155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 6660 16605 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7110 16155 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7110 17055 X\001
+4 0 0 50 -1 4 25 0.0000 4 300 255 7560 16155 X\001
 -6
 2 1 0 8 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
 	2 1 4.00 180.00 210.00
 	 5850 11970 5850 13770
-4 0 0 50 -1 1 20 0.0000 0 300 2715 -630 9315 Sparse global vector\001
-4 0 0 50 -1 1 20 0.0000 0 300 2400 -315 10800 Sparse sub-matrix\001
-4 0 4 50 -1 0 20 0.0000 4 300 1845 3690 12645 Reordering of\001
-4 0 4 50 -1 0 20 0.0000 4 225 1920 3690 13050 the sub-matrix\001
-4 0 0 50 -1 0 20 0.0000 4 225 1395 4095 14265 sub-vector\001
-4 0 0 50 -1 0 20 0.0000 4 225 1395 6210 14265 sub-vector\001
-4 0 0 50 -1 0 20 0.0000 4 225 645 4455 13950 local\001
-4 0 0 50 -1 0 20 0.0000 4 225 870 6480 13950 shared\001
-4 0 0 50 -1 1 20 0.0000 0 300 2310 1440 16110 Reordered sparse\001
-4 0 0 50 -1 0 20 0.0000 4 225 1425 1935 16425 sub-matrix\001
-4 0 0 50 -1 0 20 0.0000 4 300 1920 1035 14850 storage format\001
-4 0 0 50 -1 0 20 0.0000 4 300 3915 -45 14490 Global vector in compressed \001
+4 0 0 50 -1 1 20 0.0000 0 300 2625 -630 9315 Sparse global vector\001
+4 0 0 50 -1 1 20 0.0000 0 300 2265 -315 10800 Sparse submatrix\001
+4 0 4 50 -1 0 20 0.0000 4 300 1785 3690 12645 Reordering of\001
+4 0 4 50 -1 0 20 0.0000 4 225 1800 3690 13050 the submatrix\001
+4 0 0 50 -1 0 20 0.0000 4 225 1230 4095 14265 subvector\001
+4 0 0 50 -1 0 20 0.0000 4 225 1230 6210 14265 subvector\001
+4 0 0 50 -1 0 20 0.0000 4 225 615 4455 13950 local\001
+4 0 0 50 -1 0 20 0.0000 4 225 840 6480 13950 shared\001
+4 0 0 50 -1 1 20 0.0000 0 300 2205 1440 16110 Reordered sparse\001
+4 0 0 50 -1 0 20 0.0000 4 225 1305 1935 16425 submatrix\001
+4 0 0 50 -1 0 20 0.0000 4 300 1905 1035 14850 storage format\001
+4 0 0 50 -1 0 20 0.0000 4 300 3750 -45 14490 Global vector in compressed \001
diff --git a/BookGPU/Chapters/chapter12/figures/reorder.fig.bak b/BookGPU/Chapters/chapter12/figures/reorder.fig.bak
index a6cb5c7..92678e7 100644
--- a/BookGPU/Chapters/chapter12/figures/reorder.fig.bak
+++ b/BookGPU/Chapters/chapter12/figures/reorder.fig.bak
@@ -129,15 +129,15 @@ Single
 2 1 0 8 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
 	2 1 4.00 180.00 210.00
 	 5850 11970 5850 13770
-4 0 0 50 -1 1 20 0.0000 0 225 3570 -1440 10800 Sous-matrice locale creuse\001
-4 0 0 50 -1 1 20 0.0000 0 300 2790 -675 9315 Vecteur global creux\001
-4 0 0 50 -1 0 20 0.0000 4 225 645 4455 14310 local\001
-4 0 0 50 -1 0 20 0.0000 4 300 975 6255 14265 partag\351\001
-4 0 0 50 -1 0 20 0.0000 4 225 1725 3915 13995 Sous-vecteur\001
-4 0 0 50 -1 0 20 0.0000 4 225 1725 5895 13995 Sous-vecteur\001
-4 0 4 50 -1 0 20 0.0000 4 225 2430 3240 13050 de la sous-matrice\001
-4 0 4 50 -1 0 20 0.0000 4 300 2415 3240 12645 R\351organisation de\001
-4 0 0 50 -1 1 20 0.0000 0 225 2715 1080 16110 Sous-matrice locale \001
-4 0 0 50 -1 0 20 0.0000 4 225 2430 1170 16425 creuse r\351ordonn\351e\001
-4 0 0 50 -1 0 20 0.0000 4 300 4125 -315 14490 Vecteur global sous un format \001
-4 0 0 50 -1 0 20 0.0000 4 300 3075 135 14850 de stockage compress\351\001
+4 0 0 50 -1 1 20 0.0000 0 300 2715 -630 9315 Sparse global vector\001
+4 0 0 50 -1 1 20 0.0000 0 300 2400 -315 10800 Sparse sub-matrix\001
+4 0 4 50 -1 0 20 0.0000 4 300 1845 3690 12645 Reordering of\001
+4 0 4 50 -1 0 20 0.0000 4 225 1920 3690 13050 the sub-matrix\001
+4 0 0 50 -1 0 20 0.0000 4 225 1395 4095 14265 sub-vector\001
+4 0 0 50 -1 0 20 0.0000 4 225 1395 6210 14265 sub-vector\001
+4 0 0 50 -1 0 20 0.0000 4 225 645 4455 13950 local\001
+4 0 0 50 -1 0 20 0.0000 4 225 870 6480 13950 shared\001
+4 0 0 50 -1 1 20 0.0000 0 300 2310 1440 16110 Reordered sparse\001
+4 0 0 50 -1 0 20 0.0000 4 225 1425 1935 16425 sub-matrix\001
+4 0 0 50 -1 0 20 0.0000 4 300 1920 1035 14850 storage format\001
+4 0 0 50 -1 0 20 0.0000 4 300 3915 -45 14490 Global vector in compressed \001
diff --git a/BookGPU/Chapters/chapter12/figures/reorder.pdf b/BookGPU/Chapters/chapter12/figures/reorder.pdf
index 0e3ddcf..c7fe219 100644
Binary files a/BookGPU/Chapters/chapter12/figures/reorder.pdf and b/BookGPU/Chapters/chapter12/figures/reorder.pdf differ
diff --git a/BookGPU/Chapters/chapter13/biblio13.bib b/BookGPU/Chapters/chapter13/biblio13.bib
index 0903bfe..3d6e56c 100644
--- a/BookGPU/Chapters/chapter13/biblio13.bib
+++ b/BookGPU/Chapters/chapter13/biblio13.bib
@@ -1,27 +1,23 @@
 @article{ch13:ref1,
 title = {Asynchronous grid computation for {A}merican options derivatives},
-author = {Chau, M. and Couturier, R. and Bahi, J. M. and Spiteri, P.},
+author = {Chau, M. and Couturier, R. and Bahi, J.M. and Spiteri, P.},
 journal = {Advances in Engineering Software},
-volume = {***-***},
-number = {***-***},
-pages = {***-***},
-note = {Online version first},
-year = {2012},
+volume = {60--61},
+pages = {136--144},
+year = {2012}
 }
 
-@article{ch13:ref2,
-title = {Matrix iterative analysis},
-author = {Varga, R. S.},
-journal = {Prentice Hall},
-volume = {},
-number = {},
-pages = {},
-note = {},
-year = {},
+@book{ch13:ref2,
+      author       = "Varga, R.S.",
+      title        = "Matrix Iterative Analysis",
+      publisher    = "Springer",
+      address      = "Dordrecht",
+      series       = "Springer Series in Computational Mathematics",
+      year         = "2009",
 }
 
 @article{ch13:ref3,
- author = {Baudet, G. M.},
+ author = {Baudet, G.M.},
  title = {Asynchronous iterative methods for multiprocessors},
  journal = {Journal Assoc. Comput. Mach.},
  volume = {25},
@@ -30,27 +26,27 @@ year = {},
  year = {1978},
 }
 
-@article{ch13:ref4,
- author = {Bertsekas, D. P. and Tsitsiklis, J. N.},
- title = {Parallel and distributed computation, numerical methods},
- journal = {Prentice Hall Englewood Cliffs N. J. (1989)},
- volume = {},
- number = {},
- pages = {},
+@book{ch13:ref4,
+ author = {Bertsekas, D.P. and Tsitsiklis, J.N.},
+ title = {Parallel and Distributed Computation: Numerical Methods},
  year = {1989},
-}
+ publisher = {Prentice-Hall, Inc.},
+ address = {Upper Saddle River, NJ, USA},
+} 
 
 @book{ch13:ref5,
- author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.},
- title = {Parallel iterative algorithms: from sequential to grid computing},
- journal = {Chapman \& Hall/CRC, Numerical Analysis \& Scientific Computating 1 (2007)},
- year = {2007},
+    title = {Parallel Iterative Algorithms: From Sequential to Grid Computing},
+    author = {Bahi, J.M. and Contassot-Vivier, S. and Couturier, R.},
+    publisher = {Chapman \& Hall/CRC},
+    pages = {240},
+    series = {Numerical Analysis \& Scientific Computing Series},
+    year = {2007},
 }
 
 @article{ch13:ref6,
  author = {Miellou, J.-C. and Spiteri, P.},
  title = {Two criteria for the convergence of asynchronous iterations},
- journal = {in Computers and computing, P. Chenin et al. ed., Wiley Masson},
+ journal = {in P. Chenin et al. ed., Computers and computing, Wiley Masson},
  volume = {},
  number = {},
  pages = {91--95},
@@ -68,7 +64,7 @@ year = {},
 }
 
 @article{ch13:ref8,
-title = {{CUDA} Toolkit 4.2 {CUBLAS} Library},
+title = {{CUDA} {Toolkit} 4.2 {CUBLAS} {Library}},
 author = {NVIDIA Corporation},
 journal = {},
 volume = {},
@@ -78,18 +74,20 @@ note = {\url{http://developer.download.nvidia.com/compute/DevZone/docs/html/CUDA
 year = {2012},
 }
 
-@article{ch13:ref9,
+@inproceedings{ch13:ref9,
  author = {Micikevicius, P.},
  title = {{3D} finite difference computation on {GPUs} using {CUDA}},
- journal = {Proceedings of 2nd Workshop on General Purpose Processing on Graphics Processing Units},
- volume = {},
- number = {},
- pages = {79--84},
+ booktitle = {Proceedings of 2nd Workshop on General Purpose Processing on Graphics Processing Units},
+ series = {GPGPU-2},
  year = {2009},
-}
+ pages = {79--84},
+ numpages = {6},
+ publisher = {ACM},
+ address = {New York, NY, USA}
+} 
 
 @article{ch13:ref10,
- author = {Leist, A. and Playne, D. P. and Hawick, K. A.},
+ author = {Leist, A. and Playne, D.P. and Hawick, K.A.},
  title = {Exploiting graphical processing units for data-parallel scientific applications},
  journal = {Concurrency and Computation: Practice and Experience},
  volume = {21},
@@ -99,7 +97,7 @@ year = {2012},
 }
 
 @article{ch13:ref11,
- author = {Chau, M. and Couturier, R. and Bahi, J. M. and Spiteri, P.},
+ author = {Chau, M. and Couturier, R. and Bahi, J.M. and Spiteri, P.},
  title = {Parallel solution of the obstacle problem in grid environments},
  journal = {International Journal of High Performance Computing Applications},
  volume = {25},
@@ -109,9 +107,9 @@ year = {2012},
 }
 
 @article{ch13:ref12,
- author = {Nvidia},
+ author = {{NVIDIA}},
  title = {{NVIDIA} {CUDA} {C} {P}rogramming {G}uide},
- journal = {Version 4.2 (2012)},
+ journal = {Version 4.2},
  volume = {},
  number = {},
  pages = {},
@@ -119,7 +117,7 @@ year = {2012},
 }
 
 @article{ch13:ref13,
- author = {Evans, D. J.},
+ author = {Evans, D.J.},
  title = {Parallel {S}.{O}.{R}. iterative methods},
  journal = {Parallel Computing},
  volume = {1},
@@ -133,7 +131,7 @@ year = {2012},
  title = {Block red-black ordering method for parallel processing of {ICCG} solver},
  journal = {High Performance Computing},
  volume = {2327},
- number = {},
+ number = {0},
  pages = {},
  year = {2006},
 }
@@ -141,7 +139,7 @@ year = {2012},
 @article{ch13:ref15,
 title = {Iterative methods for sparse linear systems},
 author = {Saad, Y.},
-journal = {Society for Industrial and Applied Mathematics, 2nd edition},
+journal = {Society for Industrial and Applied Mathematics, second edition},
 volume = {},
 number = {},
 pages = {},
diff --git a/BookGPU/Chapters/chapter13/ch12.tex~ b/BookGPU/Chapters/chapter13/ch12.tex~
deleted file mode 100755
index 370c6c7..0000000
--- a/BookGPU/Chapters/chapter13/ch12.tex~
+++ /dev/null
@@ -1,1080 +0,0 @@
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%                          %%
-%%       CHAPTER 12         %%
-%%                          %%
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
- 
-\chapterauthor{}{}
-\chapter{Solving sparse linear systems with GMRES and CG methods on GPU clusters}
-
-%%--------------------------%%
-%%       SECTION 1          %%
-%%--------------------------%%
-\section{Introduction}
-\label{sec:01}
-The sparse linear systems are used to model many scientific and industrial problems, such as the environmental simulations or
-the industrial processing of the complex or non-Newtonian fluids. Moreover, the resolution of these problems often involves the
-solving of such linear systems which is considered as the most expensive process in terms of time execution and memory space.
-Therefore, solving sparse linear systems must be as efficient as possible in order to deal with problems of ever increasing size.
-
-There are, in the jargon of numerical analysis, different methods of solving sparse linear systems that we can classify in two
-classes: the direct and iterative methods. However, the iterative methods are often more suitable than their counterpart, direct
-methods, for solving large sparse linear systems. Indeed, they are less memory consuming and easier to parallelize on parallel
-computers than direct methods. Different computing platforms, sequential and parallel computers, are used for solving sparse
-linear systems with iterative solutions. Nowadays, graphics processing units (GPUs) have become attractive for solving these
-linear systems, due to their computing power and their ability to compute faster than traditional CPUs.
-
-In Section~\ref{sec:02}, we describe the general principle of two well-known iterative methods: the conjugate gradient method and
-the generalized minimal residual method. In Section~\ref{sec:03}, we give the main key points of the parallel implementation of both
-methods on a cluster of GPUs. Then, in Section~\ref{sec:04}, we present the experimental results obtained on a CPU cluster and on
-a GPU cluster, for solving sparse linear systems associated to matrices of different structures. Finally, in Section~\ref{sec:05},
-we apply the hypergraph partitioning technique to reduce the total communication volume between the computing nodes and, thus, to
-improve the execution times of the parallel algorithms of both iterative methods.   
-
-
-%%--------------------------%%
-%%       SECTION 2          %%
-%%--------------------------%%
-\section{Krylov iterative methods}
-\label{sec:02}
-Let us consider the following system of $n$ linear equations in $\mathbb{R}$: 
-\begin{equation}
-Ax=b,
-\label{eq:01}
-\end{equation}
-where $A\in\mathbb{R}^{n\times n}$ is a sparse nonsingular square matrix, $x\in\mathbb{R}^{n}$ is the solution vector,
-$b\in\mathbb{R}^{n}$ is the right-hand side and $n\in\mathbb{N}$ is a large integer number. 
-
-The iterative methods for solving the large sparse linear system~(\ref{eq:01}) proceed by successive iterations of a same
-block of elementary operations, during which an infinite number of approximate solutions $\{x_k\}_{k\geq 0}$ are computed.
-Indeed, from an initial guess $x_0$, an iterative method determines at each iteration $k>0$ an approximate solution $x_k$
-which, gradually, converges to the exact solution $x^{*}$ as follows:
-\begin{equation}
-x^{*}=\lim\limits_{k\to\infty}x_{k}=A^{-1}b.
-\label{eq:02}
-\end{equation}
-The number of iterations necessary to reach the exact solution $x^{*}$ is not known beforehand and can be infinite. In
-practice, an iterative method often finds an approximate solution $\tilde{x}$ after a fixed number of iterations and/or
-when a given convergence criterion is satisfied as follows:   
-\begin{equation}
-\|b-A\tilde{x}\| < \varepsilon,
-\label{eq:03}
-\end{equation}
-where $\varepsilon<1$ is the required convergence tolerance threshold. 
-
-Some of the most iterative methods that have proven their efficiency for solving large sparse linear systems are those
-called \textit{Krylov sub-space methods}~\cite{ref1}. In the present chapter, we describe two Krylov methods which are
-widely used: the conjugate gradient method (CG) and the generalized minimal residual method (GMRES). In practice, the
-Krylov sub-space methods are usually used with preconditioners that allow to improve their convergence. So, in what
-follows, the CG and GMRES methods are used for solving the left-preconditioned sparse linear system:
-\begin{equation}
-M^{-1}Ax=M^{-1}b,
-\label{eq:11}
-\end{equation}
-where $M$ is the preconditioning matrix.
-
-%%****************%%
-%%****************%%
-\subsection{CG method}
-\label{sec:02.01}
-The conjugate gradient method is initially developed by Hestenes and Stiefel in 1952~\cite{ref2}. It is one of the well
-known iterative method for solving large sparse linear systems. In addition, it can be adapted for solving nonlinear 
-equations and optimization problems. However, it can only be applied to problems with positive definite symmetric matrices.
-
-The main idea of the CG method is the computation of a sequence of approximate solutions $\{x_k\}_{k\geq 0}$ in a Krylov
-sub-space of order $k$ as follows: 
-\begin{equation}
-x_k \in x_0 + \mathcal{K}_k(A,r_0),
-\label{eq:04}
-\end{equation}
-such that the Galerkin condition must be satisfied:
-\begin{equation}
-r_k \bot \mathcal{K}_k(A,r_0),
-\label{eq:05}
-\end{equation}
-where $x_0$ is the initial guess, $r_k=b-Ax_k$ is the residual of the computed solution $x_k$ and $\mathcal{K}_k$ the Krylov
-sub-space of order $k$: \[\mathcal{K}_k(A,r_0) \equiv\text{span}\{r_0, Ar_0, A^2r_0,\ldots, A^{k-1}r_0\}.\]
-In fact, CG is based on the construction of a sequence $\{p_k\}_{k\in\mathbb{N}}$ of direction vectors in $\mathcal{K}_k$
-which are pairwise $A$-conjugate ($A$-orthogonal):
-\begin{equation}
-\begin{array}{ll}
-p_i^T A p_j = 0, & i\neq j. 
-\end{array} 
-\label{eq:06}
-\end{equation}
-At each iteration $k$, an approximate solution $x_k$ is computed by recurrence as follows:  
-\begin{equation}
-\begin{array}{ll}
-x_k = x_{k-1} + \alpha_k p_k, & \alpha_k\in\mathbb{R}.
-\end{array} 
-\label{eq:07}
-\end{equation}
-Consequently, the residuals $r_k$ are computed in the same way:
-\begin{equation}
-r_k = r_{k-1} - \alpha_k A p_k. 
-\label{eq:08}
-\end{equation}
-In the case where all residuals are nonzero, the direction vectors $p_k$ can be determined so that the following recurrence 
-holds:
-\begin{equation}
-\begin{array}{lll}
-p_0=r_0, & p_k=r_k+\beta_k p_{k-1}, & \beta_k\in\mathbb{R}.
-\end{array} 
-\label{eq:09}
-\end{equation}
-Moreover, the scalars $\{\alpha_k\}_{k>0}$ are chosen so as to minimize the $A$-norm error $\|x^{*}-x_k\|_A$ over the Krylov
-sub-space $\mathcal{K}_{k}$ and the scalars $\{\beta_k\}_{k>0}$ are chosen so as to ensure that the direction vectors are
-pairwise $A$-conjugate. So, the assumption that matrix $A$ is symmetric and the recurrences~(\ref{eq:08}) and~(\ref{eq:09})
-allow to deduce that:
-\begin{equation}
-\begin{array}{ll}
-\alpha_{k}=\frac{r^{T}_{k-1}r_{k-1}}{p_{k}^{T}Ap_{k}}, & \beta_{k}=\frac{r_{k}^{T}r_{k}}{r_{k-1}^{T}r_{k-1}}.
-\end{array}
-\label{eq:10}
-\end{equation}
-
-Algorithm~\ref{alg:01} shows the main key points of the preconditioned CG method. It allows to solve the left-preconditioned
-sparse linear system~(\ref{eq:11}). In this algorithm, $\varepsilon$ is the convergence tolerance threshold, $maxiter$ is the maximum
-number of iterations and $(\cdot,\cdot)$ defines the dot product between two vectors in $\mathbb{R}^{n}$. At every iteration, a direction
-vector $p_k$ is determined, so that it is orthogonal to the preconditioned residual $z_k$ and to the direction vectors $\{p_i\}_{i<k}$
-previously determined (from line~$8$ to line~$13$). Then, at lines~$16$ and~$17$ , the iterate $x_k$ and the residual $r_k$ are computed
-using formulas~(\ref{eq:07}) and~(\ref{eq:08}), respectively. The CG method converges after, at most, $n$ iterations. In practice, the CG
-algorithm stops when the tolerance threshold $\varepsilon$ and/or the maximum number of iterations $maxiter$ are reached.
-
-\begin{algorithm}
-  \SetLine
-  \linesnumbered
-  Choose an initial guess $x_0$\;
-  $r_{0} = b - A x_{0}$\;
-  $convergence$ = false\;
-  $k = 1$\;
-  \Repeat{convergence}{
-    $z_{k} = M^{-1} r_{k-1}$\;
-    $\rho_{k} = (r_{k-1},z_{k})$\;
-    \eIf{$k = 1$}{
-      $p_{k} = z_{k}$\;
-    }{
-      $\beta_{k} = \rho_{k} / \rho_{k-1}$\;
-      $p_{k} = z_{k} + \beta_{k} \times p_{k-1}$\;
-    }
-    $q_{k} = A \times p_{k}$\;
-    $\alpha_{k} = \rho_{k} / (p_{k},q_{k})$\;
-    $x_{k} = x_{k-1} + \alpha_{k} \times p_{k}$\;
-    $r_{k} = r_{k-1} - \alpha_{k} \times q_{k}$\;
-    \eIf{$(\rho_{k} < \varepsilon)$ {\bf or} $(k \geq maxiter)$}{
-      $convergence$ = true\;
-    }{
-      $k = k + 1$\;
-    }
-  }
-\caption{Left-preconditioned CG method}
-\label{alg:01}
-\end{algorithm}
-
-%%****************%%
-%%****************%%
-\subsection{GMRES method} 
-\label{sec:02.02}
-The iterative method GMRES is developed by Saad and Schultz in 1986~\cite{ref3} as a generalization of the minimum residual method
-MINRES~\cite{ref4}. Indeed, GMRES can be applied for solving symmetric or asymmetric linear systems. 
-
-The main principle of the GMRES method is to find an approximation minimizing at best the residual norm. In fact, GMRES
-computes a sequence of approximate solutions $\{x_k\}_{k>0}$ in a Krylov sub-space $\mathcal{K}_k$ as follows:
-\begin{equation}
-\begin{array}{ll}
-x_k \in x_0 + \mathcal{K}_k(A, v_1),& v_1=\frac{r_0}{\|r_0\|_2},
-\end{array}
-\label{eq:12}
-\end{equation} 
-so that the Petrov-Galerkin condition is satisfied:
-\begin{equation}
-\begin{array}{ll}
-r_k \bot A \mathcal{K}_k(A, v_1).
-\end{array}
-\label{eq:13}
-\end{equation}
-GMRES uses the Arnoldi process~\cite{ref5} to construct an orthonormal basis $V_k$ for the Krylov sub-space $\mathcal{K}_k$
-and an upper Hessenberg matrix $\bar{H}_k$ of order $(k+1)\times k$:
-\begin{equation}
-\begin{array}{ll}
-V_k = \{v_1, v_2,\ldots,v_k\}, & \forall k>1, v_k=A^{k-1}v_1,
-\end{array}
-\label{eq:14}
-\end{equation}
-and
-\begin{equation}
-V_k A = V_{k+1} \bar{H}_k.
-\label{eq:15}
-\end{equation}
-
-Then, at each iteration $k$, an approximate solution $x_k$ is computed in the Krylov sub-space $\mathcal{K}_k$ spanned by $V_k$
-as follows:
-\begin{equation}
-\begin{array}{ll}
-x_k = x_0 + V_k y, & y\in\mathbb{R}^{k}.
-\end{array}
-\label{eq:16}
-\end{equation}
-From both formulas~(\ref{eq:15}) and~(\ref{eq:16}) and $r_k=b-Ax_k$, we can deduce that:
-\begin{equation}
-\begin{array}{lll}
-  r_{k} & = & b - A (x_{0} + V_{k}y) \\
-        & = & r_{0} - AV_{k}y \\
-        & = & \beta v_{1} - V_{k+1}\bar{H}_{k}y \\
-        & = & V_{k+1}(\beta e_{1} - \bar{H}_{k}y),
-\end{array}
-\label{eq:17}
-\end{equation}
-such that $\beta=\|r_0\|_2$ and $e_1=(1,0,\cdots,0)$ is the first vector of the canonical basis of $\mathbb{R}^k$. So,
-the vector $y$ is chosen in $\mathbb{R}^k$ so as to minimize at best the Euclidean norm of the residual $r_k$. Consequently,
-a linear least-squares problem of size $k$ is solved:
-\begin{equation}
-\underset{y\in\mathbb{R}^{k}}{min}\|r_{k}\|_{2}=\underset{y\in\mathbb{R}^{k}}{min}\|\beta e_{1}-\bar{H}_{k}y\|_{2}.
-\label{eq:18}
-\end{equation}
-The QR factorization of matrix $\bar{H}_k$ is used to compute the solution of this problem by using Givens rotations~\cite{ref1,ref3},
-such that:
-\begin{equation}
-\begin{array}{lll}
-\bar{H}_{k}=Q_{k}R_{k}, & Q_{k}\in\mathbb{R}^{(k+1)\times (k+1)}, & R_{k}\in\mathbb{R}^{(k+1)\times k},
-\end{array}
-\label{eq:19}
-\end{equation}
-where $Q_kQ_k^T=I_k$ and $R_k$ is an upper triangular matrix.
-
-The GMRES method computes an approximate solution with a sufficient precision after, at most, $n$ iterations ($n$ is the size of the 
-sparse linear system to be solved). However, the GMRES algorithm must construct and store in the memory an orthonormal basis $V_k$ whose
-size is proportional to the number of iterations required to achieve the convergence. Then, to avoid a huge memory storage, the GMRES
-method must be restarted at each $m$ iterations, such that $m$ is very small ($m\ll n$), and with $x_m$ as the initial guess to the
-next iteration. This allows to limit the size of the basis $V$ to $m$ orthogonal vectors.
-
-Algorithm~\ref{alg:02} shows the main key points of the GMRES method with restarts. It solves the left-preconditioned sparse linear
-system~(\ref{eq:11}), such that $M$ is the preconditioning matrix. At each iteration $k$, GMRES uses the Arnoldi process (defined
-from line~$7$ to line~$17$) to construct a basis $V_m$ of $m$ orthogonal vectors and an upper Hessenberg matrix $\bar{H}_m$ of size
-$(m+1)\times m$. Then, it solves the linear least-squares problem of size $m$ to find the vector $y\in\mathbb{R}^{m}$ which minimizes
-at best the residual norm (line~$18$). Finally, it computes an approximate solution $x_m$ in the Krylov sub-space spanned by $V_m$ 
-(line~$19$). The GMRES algorithm is stopped when the residual norm is sufficiently small ($\|r_m\|_2<\varepsilon$) and/or the maximum
-number of iterations ($maxiter$) is reached.
-
-\begin{algorithm}
-  \SetLine
-  \linesnumbered
-  Choose an initial guess $x_0$\;
-  $convergence$ = false\;
-  $k = 1$\;
-  $r_{0} = M^{-1}(b-Ax_{0})$\;
-  $\beta = \|r_{0}\|_{2}$\;
-  \While{$\neg convergence$}{
-    $v_{1} = r_{0}/\beta$\;
-    \For{$j=1$ \KwTo $m$}{ 
-      $w_{j} = M^{-1}Av_{j}$\;
-      \For{$i=1$ \KwTo $j$}{
-        $h_{i,j} = (w_{j},v_{i})$\;
-        $w_{j} = w_{j}-h_{i,j}v_{i}$\;
-      }
-      $h_{j+1,j} = \|w_{j}\|_{2}$\;
-      $v_{j+1} = w_{j}/h_{j+1,j}$\;
-    }
-    Set $V_{m}=\{v_{j}\}_{1\leq j \leq m}$ and $\bar{H}_{m}=(h_{i,j})$ a $(m+1)\times m$ upper Hessenberg matrix\;
-    Solve a least-squares problem of size $m$: $\min_{y\in\mathbb{R}^{m}}\|\beta e_{1}-\bar{H}_{m}y\|_{2}$\;
-    $x_{m} = x_{0}+V_{m}y_{m}$\;
-    $r_{m} = M^{-1}(b-Ax_{m})$\;
-    $\beta = \|r_{m}\|_{2}$\;   
-    \eIf{ $(\beta<\varepsilon)$ {\bf or} $(k\geq maxiter)$}{
-      $convergence$ = true\;
-    }{
-      $x_{0} = x_{m}$\;
-      $r_{0} = r_{m}$\;
-      $k = k + 1$\;
-    }
-  }
-\caption{Left-preconditioned GMRES method with restarts}
-\label{alg:02}
-\end{algorithm}
-
-
-%%--------------------------%%
-%%       SECTION 3          %%
-%%--------------------------%%
-\section{Parallel implementation on a GPU cluster}
-\label{sec:03}
-In this section, we present the parallel algorithms of both iterative CG and GMRES methods for GPU clusters.
-The implementation is performed on a GPU cluster composed of different computing nodes, such that each node
-is a CPU core managed by a MPI process and equipped with a GPU card. The parallelization of these algorithms
-is carried out by using the MPI communication routines between the GPU computing nodes and the CUDA programming
-environment inside each node. In what follows, the algorithms of the iterative methods are called iterative
-solvers.
-
-%%****************%%
-%%****************%%
-\subsection{Data partitioning}
-\label{sec:03.01}
-The parallel solving of the large sparse linear system~(\ref{eq:11}) requires a data partitioning between the computing
-nodes of the GPU cluster. Let $p$ denote the number of computing nodes on the GPU cluster. The partitioning operation
-consists in the decomposition of the vectors and matrices, involved in the iterative solver, in $p$ portions. Indeed, this
-operation allows to assign to each computing node $i$:
-\begin{itemize*}
-\item a portion of size $\frac{n}{p}$ elements of each vector,
-\item a sparse rectangular sub-matrix $A_i$ of size $(\frac{n}{p},n)$ and,
-\item a square preconditioning sub-matrix $M_i$ of size $(\frac{n}{p},\frac{n}{p})$, 
-\end{itemize*} 
-where $n$ is the size of the sparse linear system to be solved. In the first instance, we perform a naive row-wise partitioning
-(decomposition row-by-row) on the data of the sparse linear systems to be solved. Figure~\ref{fig:01} shows an example of a row-wise
-data partitioning between four computing nodes of a sparse linear system (sparse matrix $A$, solution vector $x$ and right-hand
-side $b$) of size $16$ unknown values. 
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.35]{Chapters/chapter12/figures/partition}}
-\caption{A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.}
-\label{fig:01}
-\end{figure}
-
-%%****************%%
-%%****************%%
-\subsection{GPU computing}
-\label{sec:03.02}
-After the partitioning operation, all the data involved from this operation must be transferred from the CPU memories to the GPU
-memories, in order to be processed by GPUs. We use two functions of the CUBLAS library (CUDA Basic Linear Algebra Subroutines),
-developed by Nvidia~\cite{ref6}: \verb+cublasAlloc()+ for the memory allocations on GPUs and \verb+cublasSetVector()+ for the
-memory copies from the CPUs to the GPUs.
-
-An efficient implementation of CG and GMRES solvers on a GPU cluster requires to determine all parts of their codes that can be
-executed in parallel and, thus, take advantage of the GPU acceleration. As many Krylov sub-space methods, the CG and GMRES methods
-are mainly based on arithmetic operations dealing with vectors or matrices: sparse matrix-vector multiplications, scalar-vector
-multiplications, dot products, Euclidean norms, AXPY operations ($y\leftarrow ax+y$ where $x$ and $y$ are vectors and $a$ is a
-scalar) and so on. These vector operations are often easy to parallelize and they are more efficient on parallel computers when
-they work on large vectors. Therefore, all the vector operations used in CG and GMRES solvers must be executed by the GPUs as kernels.
-
-We use the kernels of the CUBLAS library to compute some vector operations of CG and GMRES solvers. The following kernels of CUBLAS
-(dealing with double floating point) are used: \verb+cublasDdot()+ for the dot products, \verb+cublasDnrm2()+ for the Euclidean
-norms and \verb+cublasDaxpy()+ for the AXPY operations. For the rest of the data-parallel operations, we code their kernels in CUDA.
-In the CG solver, we develop a kernel for the XPAY operation ($y\leftarrow x+ay$) used at line~$12$ in Algorithm~\ref{alg:01}. In the
-GMRES solver, we program a kernel for the scalar-vector multiplication (lines~$7$ and~$15$ in Algorithm~\ref{alg:02}), a kernel for
-solving the least-squares problem and a kernel for the elements updates of the solution vector $x$.
-
-The least-squares problem in the GMRES method is solved by performing a QR factorization on the Hessenberg matrix $\bar{H}_m$ with
-plane rotations and, then, solving the triangular system by backward substitutions to compute $y$. Consequently, solving the least-squares
-problem on the GPU is not interesting. Indeed, the triangular solves are not easy to parallelize and inefficient on GPUs. However,
-the least-squares problem to solve in the GMRES method with restarts has, generally, a very small size $m$. Therefore, we develop
-an inexpensive kernel which must be executed in sequential by a single CUDA thread. 
-
-The most important operation in CG and GMRES methods is the sparse matrix-vector multiplication (SpMV), because it is often an
-expensive operation in terms of execution time and memory space. Moreover, it requires to take care of the storage format of the
-sparse matrix in the memory. Indeed, the naive storage, row-by-row or column-by-column, of a sparse matrix can cause a significant
-waste of memory space and execution time. In addition, the sparsity nature of the matrix often leads to irregular memory accesses
-to read the matrix nonzero values. So, the computation of the SpMV multiplication on GPUs can involve non coalesced accesses to
-the global memory, which slows down even more its performances. One of the most efficient compressed storage formats of sparse
-matrices on GPUs is HYB format~\cite{ref7}. It is a combination of ELLpack (ELL) and Coordinate (COO) formats. Indeed, it stores
-a typical number of nonzero values per row in ELL format and remaining entries of exceptional rows in COO format. It combines
-the efficiency of ELL due to the regularity of its memory accesses and the flexibility of COO which is insensitive to the matrix
-structure. Consequently, we use the HYB kernel~\cite{ref8} developed by Nvidia to implement the SpMV multiplication of CG and
-GMRES methods on GPUs. Moreover, to avoid the non coalesced accesses to the high-latency global memory, we fill the elements of
-the iterate vector $x$ in the cached texture memory.
-
-%%****************%%
-%%****************%%
-\subsection{Data communications}
-\label{sec:03.03}
-All the computing nodes of the GPU cluster execute in parallel the same iterative solver (Algorithm~\ref{alg:01} or Algorithm~\ref{alg:02})
-adapted to GPUs, but on their own portions of the sparse linear system: $M^{-1}_iA_ix_i=M^{-1}_ib_i$, $0\leq i<p$. However in order to solve
-the complete sparse linear system~(\ref{eq:11}), synchronizations must be performed between the local computations of the computing nodes
-over the cluster. In what follows, two computing nodes sharing data are called neighboring nodes.
-
-As already mentioned, the most important operation of CG and GMRES methods is the SpMV multiplication. In the parallel implementation of
-the iterative methods, each computing node $i$ performs the SpMV multiplication on its own sparse rectangular sub-matrix $A_i$. Locally, it
-has only sub-vectors of size $\frac{n}{p}$ corresponding to rows of its sub-matrix $A_i$. However, it also requires the vector elements
-of its neighbors, corresponding to the column indices on which its sub-matrix has nonzero values (see Figure~\ref{fig:01}). So, in addition
-to the local vectors, each node must also manage vector elements shared with neighbors and required to compute the SpMV multiplication.
-Therefore, the iterate vector $x$ managed by each computing node is composed of a local sub-vector $x^{local}$ of size $\frac{n}{p}$ and a
-sub-vector of shared elements $x^{shared}$. In the same way, the vector used to construct the orthonormal basis of the Krylov sub-space 
-(vectors $p$ and $v$ in CG and GMRES methods, respectively) is composed of a local sub-vector and a shared sub-vector. 
-
-Therefore, before computing the SpMV multiplication, the neighboring nodes over the GPU cluster must exchange between them the shared 
-vector elements necessary to compute this multiplication. First, each computing node determines, in its local sub-vector, the vector
-elements needed by other nodes. Then, the neighboring nodes exchange between them these shared vector elements. The data exchanges 
-are implemented by using the MPI point-to-point communication routines: blocking sends with \verb+MPI_Send()+ and nonblocking receives
-with \verb+MPI_Irecv()+. Figure~\ref{fig:02} shows an example of data exchanges between \textit{Node 1} and its neighbors \textit{Node 0},
-\textit{Node 2} and \textit{Node 3}. In this example, the sparse matrix $A$ split between these four computing nodes is that presented
-in Figure~\ref{fig:01}.
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/compress}}
-\caption{Data exchanges between \textit{Node 1} and its neighbors \textit{Node 0}, \textit{Node 2} and \textit{Node 3}.}
-\label{fig:02}
-\end{figure}
-
-After the synchronization operation, the computing nodes receive, from their respective neighbors, the shared elements in a sub-vector
-stored in a compressed format. However, in order to compute the SpMV multiplication, the computing nodes operate on sparse global vectors
-(see Figure~\ref{fig:02}). In this case, the received vector elements must be copied to the corresponding indices in the global vector.
-So as not to need to perform this at each iteration, we propose to reorder the columns of each sub-matrix $\{A_i\}_{0\leq i<p}$, so that
-the shared sub-vectors could be used in their compressed storage formats. Figure~\ref{fig:03} shows a reordering of a sparse sub-matrix
-(sub-matrix of \textit{Node 1}). Furthermore, we use the texture memory to cache the global vector. This allows to avoid the non coalesced
-accesses to the GPU global memory. 
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/reorder}}
-\caption{Columns reordering of a sparse sub-matrix.}
-\label{fig:03}
-\end{figure}
-
-A GPU cluster is a parallel platform with a distributed memory. So, the synchronizations and communication data between GPU nodes are
-carried out by passing messages. However, GPUs cannot communicate directly with each other. Therefore, CPUs, via MPI processes, are in charge
-of the synchronizations within the GPU cluster. Consequently, the vector elements to be exchanged must be copied from the GPU memory
-to the CPU memory and vice-versa before and after the synchronization operation between CPUs. We have used the CUBLAS communication subroutines
-to perform the data transfers between a CPU core and its GPU: \verb+cublasGetVector()+ and \verb+cublasSetVector()+. Finally, in addition
-to the data exchanges, GPU nodes perform reduction operations to compute in parallel the dot products and Euclidean norms. This is implemented
-by using the MPI global communication \verb+MPI_Allreduce()+.
-
-
-%%--------------------------%%
-%%       SECTION 4          %%
-%%--------------------------%%
-\section{Experimental results}
-\label{sec:04}
-In this section, we present the performances of the parallel CG and GMRES linear solvers obtained on a cluster of $12$ GPUs. Indeed, this GPU
-cluster of tests is composed of six machines connected by $20$Gbps InfiniBand network. Each machine is a Quad-Core Xeon E5530 CPU running at
-$2.4$GHz and providing $12$GB of RAM with a memory bandwidth of $25.6$GB/s. In addition, two Tesla C1060 GPUs are connected to each machine via
-a PCI-Express 16x Gen 2.0 interface with a throughput of $8$GB/s. A Tesla C1060 GPU contains $240$ cores running at $1.3$GHz and providing a
-global memory of $4$GB with a memory bandwidth of $102$GB/s. Figure~\ref{fig:04} shows the general scheme of the GPU cluster that we used in
-the experimental tests.
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.25]{Chapters/chapter12/figures/cluster}}
-\caption{General scheme of the GPU cluster of tests composed of six machines, each with two GPUs.}
-\label{fig:04}
-\end{figure}
-
-Linux cluster version 2.6.39 OS is installed on CPUs. C programming language is used for coding the parallel algorithms of both methods on the
-GPU cluster. CUDA version 4.0~\cite{ref9} is used for programming GPUs, using CUBLAS library~\cite{ref6} to deal with vector operations in GPUs
-and, finally, MPI routines of OpenMPI 1.3.3 are used to carry out the communications between CPU cores. Indeed, the experiments are done on a
-cluster of $12$ computing nodes, where each node is managed by a MPI process and it is composed of one CPU core and one GPU card.
-
-All tests are made on double-precision floating point operations. The parameters of both linear solvers are initialized as follows: the residual
-tolerance threshold $\varepsilon=10^{-12}$, the maximum number of iterations $maxiter=500$, the right-hand side $b$ is filled with $1.0$ and the
-initial guess $x_0$ is filled with $0.0$. In addition, we limited the Arnoldi process used in the GMRES method to $16$ iterations ($m=16$). For
-the sake of simplicity, we have chosen the preconditioner $M$ as the main diagonal of the sparse matrix $A$. Indeed, it allows to easily compute
-the required inverse matrix $M^{-1}$ and it provides a relatively good preconditioning for not too ill-conditioned matrices. In the GPU computing,
-the size of thread blocks is fixed to $512$ threads. Finally, the performance results, presented hereafter, are obtained from the mean value over
-$10$ executions of the same parallel linear solver and for the same input data.
-
-To get more realistic results, we tested the CG and GMRES algorithms on sparse matrices of the Davis's collection~\cite{ref10}, that arise in a wide
-spectrum of real-world applications. We chose six symmetric sparse matrices and six nonsymmetric ones from this collection. In Figure~\ref{fig:05},
-we show structures of these matrices and in Table~\ref{tab:01} we present their main characteristics which are the number of rows, the total number
-of nonzero values (nnz) and the maximal bandwidth. In the present chapter, the bandwidth of a sparse matrix is defined as the number of matrix columns
-separating the first and the last nonzero value on a matrix row.
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/matrices}}
-\caption{Sketches of sparse matrices chosen from the Davis's collection.}
-\label{fig:05}
-\end{figure}
-
-\begin{table}[!h]
-\centering
-\begin{tabular}{|c|c|c|c|c|}
-\hline
-{\bf Matrix type}             & {\bf Matrix name} & {\bf \# rows} & {\bf \# nnz} & {\bf Bandwidth} \\ \hline \hline
-
-\multirow{6}{*}{Symmetric}    & 2cubes\_sphere    & $101,492$     & $1,647,264$  & $100,464$ \\
-
-                              & ecology2          & $999,999$     & $4,995,991$  & $2,001$   \\ 
-
-                              & finan512          & $74,752$      & $596,992$    & $74,725$  \\ 
-
-                              & G3\_circuit       & $1,585,478$   & $7,660,826$  & $1,219,059$ \\
-            
-                              & shallow\_water2   & $81,920$      & $327,680$    & $58,710$ \\
-
-                              & thermal2          & $1,228,045$   & $8,580,313$  & $1,226,629$ \\ \hline \hline
-            
-\multirow{6}{*}{Nonsymmetric} & cage13            & $445,315$     & $7,479,343$  & $318,788$\\
-
-                              & crashbasis        & $160,000$     & $1,750,416$  & $120,202$ \\
-
-                              & FEM\_3D\_thermal2 & $147,900$     & $3,489,300$  & $117,827$ \\
-
-                              & language          & $399,130$     & $1,216,334$  & $398,622$\\
- 
-                              & poli\_large       & $15,575$      & $33,074$     & $15,575$ \\
-
-                              & torso3            & $259,156$     & $4,429,042$  & $216,854$  \\ \hline
-\end{tabular}
-\vspace{0.5cm}
-\caption{Main characteristics of sparse matrices chosen from the Davis's collection.}
-\label{tab:01}
-\end{table}
-
-\begin{table}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|} 
-\hline
-{\bf Matrix}     & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$  & $\mathbf{\# iter.}$ & $\mathbf{prec.}$     & $\mathbf{\Delta}$   \\ \hline \hline
-
-2cubes\_sphere    & $0.132s$           & $0.069s$            & $1.93$        & $12$           & $1.14e$-$09$     & $3.47e$-$18$ \\
-
-ecology2          & $0.026s$           & $0.017s$            & $1.52$        & $13$           & $5.06e$-$09$     & $8.33e$-$17$ \\
-
-finan512          & $0.053s$           & $0.036s$            & $1.49$        & $12$           & $3.52e$-$09$     & $1.66e$-$16$ \\
-
-G3\_circuit       & $0.704s$           & $0.466s$            & $1.51$        & $16$           & $4.16e$-$10$     & $4.44e$-$16$ \\
-
-shallow\_water2   & $0.017s$           & $0.010s$            & $1.68$        & $5$            & $2.24e$-$14$     & $3.88e$-$26$ \\
-
-thermal2          & $1.172s$           & $0.622s$            & $1.88$        & $15$           & $5.11e$-$09$     & $3.33e$-$16$ \\ \hline   
-\end{tabular}
-\caption{Performances of the parallel CG method on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.}
-\label{tab:02}
-\end{center}
-\end{table}
-
-\begin{table}[!h]
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|} 
-\hline
-{\bf Matrix}     & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$  & $\mathbf{\# iter.}$ & $\mathbf{prec.}$     & $\mathbf{\Delta}$   \\ \hline \hline
-
-2cubes\_sphere    & $0.234s$           & $0.124s$            & $1.88$        & $21$           & $2.10e$-$14$     & $3.47e$-$18$ \\
-
-ecology2          & $0.076s$           & $0.035s$            & $2.15$        & $21$           & $4.30e$-$13$     & $4.38e$-$15$ \\
-
-finan512          & $0.073s$           & $0.052s$            & $1.40$        & $17$           & $3.21e$-$12$     & $5.00e$-$16$ \\
-
-G3\_circuit       & $1.016s$           & $0.649s$            & $1.56$        & $22$           & $1.04e$-$12$     & $2.00e$-$15$ \\
-
-shallow\_water2   & $0.061s$           & $0.044s$            & $1.38$        & $17$           & $5.42e$-$22$     & $2.71e$-$25$ \\
-
-thermal2          & $1.666s$           & $0.880s$            & $1.89$        & $21$           & $6.58e$-$12$     & $2.77e$-$16$ \\ \hline \hline
-
-cage13            & $0.721s$           & $0.338s$            & $2.13$        & $26$           & $3.37e$-$11$     & $2.66e$-$15$ \\
-
-crashbasis        & $1.349s$           & $0.830s$            & $1.62$        & $121$          & $9.10e$-$12$     & $6.90e$-$12$ \\
-
-FEM\_3D\_thermal2 & $0.797s$           & $0.419s$            & $1.90$        & $64$           & $3.87e$-$09$     & $9.09e$-$13$ \\
-
-language          & $2.252s$           & $1.204s$            & $1.87$        & $90$           & $1.18e$-$10$     & $8.00e$-$11$ \\
-
-poli\_large       & $0.097s$           & $0.095s$            & $1.02$        & $69$           & $4.98e$-$11$     & $1.14e$-$12$ \\
-
-torso3            & $4.242s$           & $2.030s$            & $2.09$        & $175$          & $2.69e$-$10$     & $1.78e$-$14$ \\ \hline
-\end{tabular}
-\caption{Performances of the parallel GMRES method on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.}
-\label{tab:03}
-\end{center}
-\end{table}
-
-Tables~\ref{tab:02} and~\ref{tab:03} show the performances of the parallel CG and GMRES solvers, respectively, for solving linear systems associated to
-the sparse matrices presented in Table~\ref{tab:01}. They allow us to compare the performances obtained on a cluster of $24$ CPU cores and on a cluster
-of $12$ GPUs. However, Table~\ref{tab:02} shows only the solving performances of symmetric sparse linear systems, due to the inability of the CG method
-to solve the nonsymmetric systems. In both tables, the second and third columns give, respectively, the execution times in seconds obtained on $24$ CPU
-cores ($Time_{cpu}$) and that obtained on $12$ GPUs ($Time_{gpu}$). Moreover, we take into account the relative gains $\tau$ of a solver implemented on the
-GPU cluster compared to the same solver implemented on the CPU cluster. The relative gains, presented in the fourth column, are computed as a ratio of
-the CPU execution time over the GPU execution time:
-\begin{equation}
-\tau = \frac{Time_{cpu}}{Time_{gpu}}.
-\label{eq:20}
-\end{equation}
-In addition, Tables~\ref{tab:02} and~\ref{tab:03} give the number of iterations ($iter$), the precision $prec$ of the solution computed on the GPU cluster
-and the difference $\Delta$ between the solution computed on the CPU cluster and that computed on the GPU cluster. Both parameters $prec$ and $\Delta$
-allow to validate and verify the accuracy of the solution computed on the GPU cluster. We have computed them as follows:
-\begin{eqnarray}
-\Delta = \max|x^{cpu}-x^{gpu}|,\\
-prec = \max|M^{-1}r^{gpu}|,
-\end{eqnarray}
-where $\Delta$ is the maximum vector element, in absolute value, of the difference between the two solutions $x^{cpu}$ and $x^{gpu}$ computed, respectively,
-on CPU and GPU cluster and $prec$ is the maximum element, in absolute value, of the residual vector $r^{gpu}\in\mathbb{R}^{n}$ of the solution $x^{gpu}$.
-Thus, we can see that the solutions obtained on the GPU cluster were computed with a sufficient accuracy (about $10^{-10}$) and they are, more or less,
-equivalent to those computed on the CPU cluster with a small difference ranging from $10^{-10}$ to $10^{-26}$. However, we can notice from the relative
-gains $\tau$ that it is not interesting to use multiple GPUs for solving small sparse linear systems. In fact, a small sparse matrix does not allow us to maximize
-utilization of GPU cores. In addition, the communications required to synchronize the computations over the cluster increase the idle times of GPUs and
-slow down further the parallel computations.
-
-Consequently, in order to test the performances of the parallel solvers, we developed in C programming language a generator of large sparse matrices. 
-This generator takes a matrix from the Davis's collection~\cite{ref10} as an initial matrix to construct large sparse matrices exceeding ten million
-of rows. It must be executed in parallel by the MPI processes of the computing nodes, so that each process could construct its sparse sub-matrix. In
-first experimental tests, we focus on sparse matrices having a banded structure, because they are those that arise in most numerical problems.
-So to generate the global sparse matrix, each MPI process constructs its sub-matrix by performing several copies of an initial sparse matrix chosen
-from the Davis's collection. Then, it puts all these copies on the main diagonal of the global matrix (see Figure~\ref{fig:06}). Moreover, the empty
-spaces between two successive copies in the main diagonal are filled with sub-copies (left-copy and right-copy in Figure~\ref{fig:06}) of the same
-initial matrix.
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/generation}}
-\caption{Parallel generation of a large sparse matrix by four computing nodes.}
-\label{fig:06}
-\end{figure}
-
-We have used the parallel CG and GMRES algorithms for solving sparse linear systems of $25$ million of unknown values. The sparse matrices associated 
-to these linear systems are generated from those presented in Table~\ref{tab:01}. Their main characteristics are given in Table~\ref{tab:04}. Tables~\ref{tab:05}
-and~\ref{tab:06} show the performances of the parallel CG and GMRES solvers, respectively, obtained on a cluster of $24$ CPU cores and on a cluster 
-of $12$ GPUs. Obviously, we can notice from these tables that solving large sparse linear systems on a GPU cluster is more efficient than on a CPU 
-cluster (see relative gains $\tau$). We can also notice that the execution times of the CG method, whether on a CPU cluster or on a GPU cluster, are
-better than those of the GMRES method for solving large symmetric linear systems. In fact, the CG method is characterized by a better convergence rate 
-and a shorter execution time of an iteration than those of the GMRES method. Moreover, an iteration of the parallel GMRES method requires more data
-exchanges between computing nodes compared to the parallel CG method.
- 
-\begin{table}
-\centering
-\begin{tabular}{|c|c|c|c|}
-\hline
-{\bf Matrix type}             & {\bf Matrix name} & {\bf \# nnz} & {\bf Bandwidth} \\ \hline \hline
-
-\multirow{6}{*}{Symmetric}    & 2cubes\_sphere    & $413,703,602$ & $198,836$     \\
-
-                              & ecology2          & $124,948,019$ & $2,002$          \\ 
-
-                              & finan512          & $278,175,945$ & $123,900$        \\ 
-
-                              & G3\_circuit       & $125,262,292$ & $1,891,887$      \\
-            
-                              & shallow\_water2   & $100,235,292$ & $62,806$      \\
-
-                              & thermal2          & $175,300,284$ & $2,421,285$ \\ \hline \hline
-            
-\multirow{6}{*}{Nonsymmetric} & cage13            & $435,770,480$ & $352,566$        \\
-
-                              & crashbasis        & $409,291,236$ & $200,203$        \\
-
-                              & FEM\_3D\_thermal2 & $595,266,787$ & $206,029$ \\
-
-                              & language          & $76,912,824$  & $398,626$ \\
-
-                              & poli\_large       & $53,322,580$  & $15,576$         \\
-
-                              & torso3            & $433,795,264$ & $328,757$        \\ \hline
-\end{tabular}
-\vspace{0.5cm}
-\caption{Main characteristics of sparse banded matrices generated from those of the Davis's collection.}
-\label{tab:04}
-\end{table}
- 
-\begin{table}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|} 
-\hline
-{\bf Matrix}    & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{\# iter.}$ & $\mathbf{prec.}$ & $\mathbf{\Delta}$   \\ \hline \hline
-
-2cubes\_sphere  & $1.625s$             & $0.401s$              & $4.05$          & $14$                & $5.73e$-$11$     & $5.20e$-$18$ \\
-
-ecology2        & $0.856s$             & $0.103s$              & $8.27$          & $15$                & $3.75e$-$10$     & $1.11e$-$16$ \\
-
-finan512        & $1.210s$             & $0.354s$              & $3.42$          & $14$                & $1.04e$-$10$     & $2.77e$-$16$ \\
-
-G3\_circuit     & $1.346s$             & $0.263s$              & $5.12$          & $17$                & $1.10e$-$10$     & $5.55e$-$16$ \\
-
-shallow\_water2 & $0.397s$             & $0.055s$              & $7.23$          & $7$                 & $3.43e$-$15$     & $5.17e$-$26$ \\
-
-thermal2        & $1.411s$             & $0.244s$              & $5.78$          & $16$                & $1.67e$-$09$     & $3.88e$-$16$ \\ \hline  
-\end{tabular}
-\caption{Performances of the parallel CG method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. 
-on a cluster of 12 GPUs.}
-\label{tab:05}
-\end{center}
-\end{table}
-
-\begin{table}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|} 
-\hline
-{\bf Matrix}      & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{\# iter.}$ & $\mathbf{prec.}$ & $\mathbf{\Delta}$   \\ \hline \hline
-
-2cubes\_sphere    & $3.597s$             & $0.514s$              & $6.99$          & $21$                & $2.11e$-$14$     & $8.67e$-$18$ \\
-
-ecology2          & $2.549s$             & $0.288s$              & $8.83$          & $21$                & $4.88e$-$13$     & $2.08e$-$14$ \\
-
-finan512          & $2.660s$             & $0.377s$              & $7.05$          & $17$                & $3.22e$-$12$     & $8.82e$-$14$ \\
-
-G3\_circuit       & $3.139s$             & $0.480s$              & $6.53$          & $22$                & $1.04e$-$12$     & $5.00e$-$15$ \\
-
-shallow\_water2   & $2.195s$             & $0.253s$              & $8.68$          & $17$                & $5.54e$-$21$     & $7.92e$-$24$ \\
-
-thermal2          & $3.206s$             & $0.463s$              & $6.93$          & $21$                & $8.89e$-$12$     & $3.33e$-$16$ \\ \hline \hline
-
-cage13            & $5.560s$             & $0.663s$              & $8.39$          & $26$                & $3.29e$-$11$     & $1.59e$-$14$ \\
-
-crashbasis        & $25.802s$            & $3.511s$              & $7.35$          & $135$               & $6.81e$-$11$     & $4.61e$-$15$ \\
-
-FEM\_3D\_thermal2 & $13.281s$            & $1.572s$              & $8.45$          & $64$                & $3.88e$-$09$     & $1.82e$-$12$ \\
-
-language          & $12.553s$            & $1.760s$              & $7.13$          & $89$                & $2.11e$-$10$     & $1.60e$-$10$ \\
-
-poli\_large       & $8.515s$             & $1.053s$              & $8.09$          & $69$                & $5.05e$-$11$     & $6.59e$-$12$ \\
-
-torso3            & $31.463s$            & $3.681s$              & $8.55$          & $175$               & $2.69e$-$10$     & $2.66e$-$14$ \\ \hline
-\end{tabular}
-\caption{Performances of the parallel GMRES method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. 
-on a cluster of 12 GPUs.}
-\label{tab:06}
-\end{center}
-\end{table}
-
-
-%%--------------------------%%
-%%       SECTION 5          %%
-%%--------------------------%%
-\section{Hypergraph partitioning}
-\label{sec:05}
-In this section, we present the performances of both parallel CG and GMRES solvers for solving linear systems associated to sparse matrices having
-large bandwidths. Indeed, we are interested in sparse matrices having the nonzero values distributed along their bandwidths. 
-
-We have developed in C programming language a generator of large sparse matrices having five bands distributed along their bandwidths (see Figure~\ref{fig:07}).
-The principle of this generator is equivalent to that in Section~\ref{sec:04}. However, the copies performed on the initial matrix (chosen from the
-Davis's collection) are placed on the main diagonal and on four off-diagonals, two on the right and two on the left of the main diagonal. Figure~\ref{fig:07}
-shows an example of a generation of a sparse five-bands matrix by four computing nodes. Table~\ref{tab:07} shows the main characteristics of sparse
-five-bands matrices generated from those presented in Table~\ref{tab:01} and associated to linear systems of $25$ million of unknown values.   
-
-\begin{figure}[!h]
-\centerline{\includegraphics[scale=0.23]{Chapters/chapter12/figures/generation_1}}
-\caption{Parallel generation of a large sparse five-bands matrix by four computing nodes.}
-\label{fig:07}
-\end{figure}
-
-\begin{table}
-\begin{center}
-\begin{tabular}{|c|c|c|c|} 
-\hline
-{\bf Matrix type}             & {\bf Matrix name} & {\bf \# nnz}  & {\bf Bandwidth} \\ \hline \hline
-
-\multirow{6}{*}{Symmetric}    & 2cubes\_sphere    & $829,082,728$ & $24,999,999$     \\
-
-                              & ecology2          & $254,892,056$ & $25,000,000$     \\ 
-
-                              & finan512          & $556,982,339$ & $24,999,973$     \\ 
-
-                              & G3\_circuit       & $257,982,646$ & $25,000,000$     \\
-            
-                              & shallow\_water2   & $200,798,268$ & $25,000,000$     \\
-
-                              & thermal2          & $359,340,179$ & $24,999,998$     \\ \hline \hline
-            
-\multirow{6}{*}{Nonsymmetric} & cage13            & $879,063,379$ & $24,999,998$     \\
-
-                              & crashbasis        & $820,373,286$ & $24,999,803$     \\
-
-                              & FEM\_3D\_thermal2 & $1,194,012,703$ & $24,999,998$     \\
-
-                              & language          & $155,261,826$ & $24,999,492$     \\
-
-                              & poli\_large       & $106,680,819$ & $25,000,000$    \\
-
-                              & torso3            & $872,029,998$ & $25,000,000$\\ \hline
-\end{tabular}
-\caption{Main characteristics of sparse five-bands matrices generated from those of the Davis's collection.}
-\label{tab:07}
-\end{center}
-\end{table}
-
-
-\begin{table}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|} 
-\hline
-{\bf Matrix}      & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{\# iter.}$ & $\mathbf{prec.}$ & $\mathbf{\Delta}$   \\ \hline \hline
-
-2cubes\_sphere    & $6.041s$     & $3.338s$      & $1.81$ & $30$ & $6.77e$-$11$ & $3.25e$-$19$ \\
-
-ecology2          & $1.404s$     & $1.301s$      & $1.08$ & $13$     & $5.22e$-$11$ & $2.17e$-$18$ \\
-
-finan512          & $1.822s$     & $1.299s$      & $1.40$ & $12$     & $3.52e$-$11$ & $3.47e$-$18$ \\
-
-G3\_circuit       & $2.331s$     & $2.129s$      & $1.09$ & $15$     & $1.36e$-$11$ & $5.20e$-$18$ \\
-
-shallow\_water2   & $0.541s$     & $0.504s$      & $1.07$ & $6$      & $2.12e$-$16$ & $5.05e$-$28$ \\
-
-thermal2          & $2.549s$     & $1.705s$      & $1.49$ & $14$     & $2.36e$-$10$ & $5.20e$-$18$ \\ \hline  
-\end{tabular}
-\caption{Performances of parallel CG solver for solving linear systems associated to sparse five-bands matrices
-on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs}
-\label{tab:08}
-\end{center}
-\end{table}
-
-\begin{table}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|c|} 
-\hline
-{\bf Matrix}      & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{\# iter.}$ & $\mathbf{prec.}$ & $\mathbf{\Delta}$   \\ \hline \hline
-
-2cubes\_sphere    & $15.963s$    & $7.250s$      & $2.20$  & $58$     & $6.23e$-$16$ & $3.25e$-$19$ \\
-
-ecology2          & $3.549s$     & $2.176s$      & $1.63$  & $21$     & $4.78e$-$15$ & $1.06e$-$15$ \\
-
-finan512          & $3.862s$     & $1.934s$      & $1.99$  & $17$     & $3.21e$-$14$ & $8.43e$-$17$ \\
-
-G3\_circuit       & $4.636s$     & $2.811s$      & $1.65$  & $22$     & $1.08e$-$14$ & $1.77e$-$16$ \\
-
-shallow\_water2   & $2.738s$     & $1.539s$      & $1.78$  & $17$     & $5.54e$-$23$ & $3.82e$-$26$ \\
-
-thermal2          & $5.017s$     & $2.587s$      & $1.94$  & $21$     & $8.25e$-$14$ & $4.34e$-$18$ \\ \hline \hline
-
-cage13            & $9.315s$     & $3.227s$      & $2.89$  & $26$     & $3.38e$-$13$ & $2.08e$-$16$ \\
-
-crashbasis        & $35.980s$    & $14.770s$     & $2.43$  & $127$    & $1.17e$-$12$ & $1.56e$-$17$ \\
-
-FEM\_3D\_thermal2 & $24.611s$    & $7.749s$      & $3.17$  & $64$     & $3.87e$-$11$ & $2.84e$-$14$ \\
-
-language          & $16.859s$    & $9.697s$      & $1.74$  & $89$     & $2.17e$-$12$ & $1.70e$-$12$ \\
-
-poli\_large       & $10.200s$    & $6.534s$      & $1.56$  & $69$     & $5.14e$-$13$ & $1.63e$-$13$ \\
-
-torso3            & $49.074s$    & $19.397s$     & $2.53$  & $175$    & $2.69e$-$12$ & $2.77e$-$16$ \\ \hline
-\end{tabular}
-\caption{Performances of parallel GMRES solver for solving linear systems associated to sparse five-bands matrices
-on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs}
-\label{tab:09}
-\end{center}
-\end{table}
-
-Tables~\ref{tab:08} and~\ref{tab:09} shows the performaces of the parallel CG and GMRES solvers, respectively, obtained on
-a cluster of $24$ CPU cores and on a cluster of $12$ GPUs. The linear systems solved in these tables are associated to the
-sparse five-bands matrices presented on Table~\ref{tab:07}. We can notice from both Tables~\ref{tab:08} and~\ref{tab:09} that 
-using a GPU cluster is not efficient for solving these kind of sparse linear systems. We can see that the execution times obtained
-on the GPU cluster are almost equivalent to those obtained on the CPU cluster (see the relative gains presented in column~$4$
-of each table). This is due to the large number of communications necessary to synchronize the computations over the cluster.
-Indeed, the naive partitioning, row-by-row or column-by-column, of sparse matrices having large bandwidths can link a computing
-node to many neighbors and then generate a large number of data dependencies between these computing nodes in the cluster. 
-
-Therefore, we have chosen to use a hypergraph partitioning method, which is well-suited to numerous kinds of sparse matrices~\cite{ref11}.
-Indeed, it can well model the communications between the computing nodes, particularly in the case of nonsymmetric and irregular
-matrices, and it gives good reduction of the total communication volume. In contrast, it is an expensive operation in terms of 
-execution time and memory space. 
-
-The sparse matrix $A$ of the linear system to be solved is modeled as a hypergraph $\mathcal{H}=(\mathcal{V},\mathcal{E})$ as
-follows:
-\begin{itemize*}
-\item each matrix row $\{i\}_{0\leq i<n}$ corresponds to a vertex $v_i\in\mathcal{V}$ and,
-\item each matrix column $\{j\}_{0\leq j<n}$ corresponds to a hyperedge $e_j\in\mathcal{E}$, where:
-\begin{equation}
-\forall a_{ij} \neq 0 \mbox{~is a nonzero value of matrix~} A \mbox{~:~} v_i \in pins[e_j],
-\end{equation} 
-\item $w_i$ is the weight of vertex $v_i$ and,
-\item $c_j$ is the cost of hyperedge $e_j$.
-\end{itemize*}
-A $K$-way partitioning of a hypergraph $\mathcal{H}=(\mathcal{V},\mathcal{E})$ is defined as $\mathcal{P}=\{\mathcal{V}_1,\ldots,\mathcal{V}_K\}$
-a set of pairwise disjoint non-empty subsets (or parts) of the vertex set $\mathcal{V}$, so that each subset is attributed to a computing node.
-Figure~\ref{fig:08} shows an example of the hypergraph model of a  $(9\times 9)$ sparse matrix in three parts. The circles and squares correspond,
-respectively, to the vertices and hyperedges of the hypergraph. The solid squares define the cut hyperedges connecting at least two different parts. 
-The connectivity $\lambda_j$ of a cut hyperedge $e_j$ denotes the number of different parts spanned by $e_j$.
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.5]{Chapters/chapter12/figures/hypergraph}}
-\caption{An example of the hypergraph partitioning of a sparse matrix decomposed between three computing nodes.}
-\label{fig:08}
-\end{figure}
-
-The cut hyperedges model the total communication volume between the different computing nodes in the cluster, necessary to perform the parallel SpMV
-multiplication. Indeed, each hyperedge $e_j$ defines a set of atomic computations $b_i\leftarrow b_i+a_{ij}x_j$, $0\leq i,j<n$, of the SpMV multiplication
-$Ax=b$ that need the $j^{th}$ unknown value of solution vector $x$. Therefore, pins of hyperedge $e_j$, $pins[e_j]$, are the set of matrix rows sharing
-and requiring the same unknown value $x_j$. For example in Figure~\ref{fig:08}, hyperedge $e_9$ whose pins are: $pins[e_9]=\{v_2,v_5,v_9\}$ represents the
-dependency of matrix rows $2$, $5$ and $9$ to unknown $x_9$ needed to perform in parallel the atomic operations: $b_2\leftarrow b_2+a_{29}x_9$,
-$b_5\leftarrow b_5+a_{59}x_9$ and $b_9\leftarrow b_9+a_{99}x_9$. However, unknown $x_9$ is the third entry of the sub-solution vector $x$ of part (or node) $3$.
-So the computing node $3$ must exchange this value with nodes $1$ and $2$, which leads to perform two communications.
-
-The hypergraph partitioning allows to reduce the total communication volume required to perform the parallel SpMV multiplication, while maintaining the 
-load balancing between the computing nodes. In fact, it allows to minimize at best the following amount:
-\begin{equation}
-\mathcal{X}(\mathcal{P})=\sum_{e_{j}\in\mathcal{E}_{C}}c_{j}(\lambda_{j}-1),
-\end{equation}
-where $\mathcal{E}_{C}$ denotes the set of the cut hyperedges coming from the hypergraph partitioning $\mathcal{P}$ and $c_j$ and $\lambda_j$ are, respectively,
-the cost and the connectivity of cut hyperedge $e_j$. Moreover, it also ensures the load balancing between the $K$ parts as follows: 
-\begin{equation}
-  W_{k}\leq (1+\epsilon)W_{avg}, \hspace{0.2cm} (1\leq k\leq K) \hspace{0.2cm} \text{and} \hspace{0.2cm} (0<\epsilon<1),
-\end{equation} 
-where $W_{k}$ is the sum of all vertex weights ($w_{i}$) in part $\mathcal{V}_{k}$, $W_{avg}$ is the average weight of all $K$ parts and $\epsilon$ is the 
-maximum allowed imbalanced ratio.
-
-The hypergraph partitioning is a NP-complete problem but software tools using heuristics are developed, for example: hMETIS~\cite{ref12}, PaToH~\cite{ref13}
-and Zoltan~\cite{ref14}. Since our objective is solving large sparse linear systems, we use the parallel hypergraph partitioning which must be performed by
-at least two MPI processes. It allows us to accelerate the data partitioning of large sparse matrices. For this, the hypergraph $\mathcal{H}$ must be partitioned
-in $p$ (number of MPI processes) sub-hypergraphs $\mathcal{H}_k=(\mathcal{V}_k,\mathcal{E}_k)$, $0\leq k<p$, and then we performed the parallel hypergraph
-partitioning method using some functions of the MPI library between the $p$ processes.
-
-Tables~\ref{tab:10} and~\ref{tab:11} shows the performances of the parallel CG and GMRES solvers, respectively, using the hypergraph partitioning for solving
-large linear systems associated to the sparse five-bands matrices presented in Table~\ref{tab:07}. For these experimental tests, we have applied the parallel
-hypergraph partitioning~\cite{ref15} developed in Zoltan tool~\ref{ref14}. We have initialized the parameters of the partitioning as follows:
-\begin{itemize*}
-\item the weight $w_{i}$ of each vertex $v_{j}\in\mathcal{V}$ is set to the number of nonzero values on matrix row $i$,
-\item for the sake of simplicity, the cost $c_{j}$ of each hyperedge $e_{j}\in\mathcal{E}$ is fixed to $1$,
-\item the maximum imbalanced load ratio $\epsilon$ is limited to $10\%$.\\
-\end{itemize*}  
-
-\begin{table}[!h]
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|} 
-\hline
-{\bf Matrix}    & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{Gains \%}$ \\ \hline \hline
-
-2cubes\_sphere  & $5.935s$             & $1.213s$              & $4.89$          & $63.66\%$ \\
-
-ecology2        & $1.093s$             & $0.136s$              & $8.00$          & $89.55\%$ \\
-
-finan512        & $1.762s$             & $0.475s$              & $3.71$          & $63.43\%$ \\
-
-G3\_circuit     & $2.095s$             & $0.558s$              & $3.76$          & $73.79\%$ \\
-
-shallow\_water2 & $0.498s$             & $0.068s$              & $7.31$          & $86.51\%$ \\
-
-thermal2        & $1.889s$             & $0.348s$              & $5.43$          & $79.59\%$ \\ \hline  
-\end{tabular}
-\caption{Performances of the parallel CG solver using hypergraph partitioning for solving linear systems associated to
-sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPU.}
-\label{tab:10}
-\end{center}
-\end{table}
-
-\begin{table}[!h]
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|} 
-\hline
-{\bf Matrix}      & $\mathbf{Time_{cpu}}$ & $\mathbf{Time_{gpu}}$ & $\mathbf{\tau}$ & $\mathbf{Gains \%}$ \\ \hline \hline
-
-2cubes\_sphere    & $16.430s$            & $2.840s$              & $5.78$          & $60.83\%$ \\
-
-ecology2          & $3.152s$             & $0.367s$              & $8.59$          & $83.13\%$ \\
-
-finan512          & $3.672s$             & $0.723s$              & $5.08$          & $62.62\%$ \\
-
-G3\_circuit       & $4.468s$             & $0.971s$              & $4.60$          & $65.46\%$ \\
-
-shallow\_water2   & $2.647s$             & $0.312s$              & $8.48$          & $79.73\%$ \\
-
-thermal2          & $4.190s$             & $0.666s$              & $6.29$          & $74.25\%$ \\ \hline \hline
-
-cage13            & $8.077s$             & $1.584s$              & $5.10$          & $50.91\%$ \\
-
-crashbasis        & $35.173s$            & $5.546s$              & $6.34$          & $62.43\%$ \\
-
-FEM\_3D\_thermal2 & $24.825s$            & $3.113s$              & $7.97$          & $59.83\%$ \\
-
-language          & $16.706s$            & $2.522s$              & $6.62$          & $73.99\%$ \\
-
-poli\_large       & $12.715s$            & $3.989s$              & $3.19$          & $38.95\%$ \\
-
-torso3            & $48.459s$            & $6.234s$              & $7.77$          & $67.86\%$ \\ \hline
-\end{tabular}
-\caption{Performances of the parallel GMRES solver using hypergraph partitioning for solving linear systems associated to
-sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPU.}
-\label{tab:11}
-\end{center}
-\end{table}
-
-We can notice from both Tables~\ref{tab:10} and~\ref{tab:11} that the hypergraph partitioning has improved the performances of both
-parallel CG and GMRES algorithms for solving large linear systems associated to matrices having large bandwidths. The execution times
-on the GPU cluster of both parallel solvers are significantly improved compared to those obtained by using the partitioning row-by-row.
-For these examples of sparse matrices, the execution times of CG and GMRES solvers are reduced on average about $76\%$ and $65\%$ respectively
-(see column~$5$ of each table) compared to those obtained in Tables~\ref{tab:08} and~\ref{tab:09}.
-
-In fact, the hypergraph partitioning applied to sparse matrices having large bandwidths allows to reduce the total communication volume
-necessary to synchronize the computations between the computing nodes in the GPU cluster. Table~\ref{tab:12} presents, for each sparse
-matrix, the total communication volume between $12$ GPU computing nodes obtained by using the partitioning row-by-row (column~$2$), the
-total communication volume obtained by using the hypergraph partitioning (column~$3$) and the execution times in minutes of the hypergraph
-partitioning operation performed by $12$ MPI processes (column~$4$). The total communication volume defines the total number of the vector
-elements exchanged by the computing nodes. Then, Table~\ref{tab:12} shows that the hypergraph partitioning method can split the sparse 
-matrix so as to minimize the data dependencies between the computing nodes and thus to reduce the total communication volume.
-
-
-\begin{table}[!h]
-\begin{center}
-\begin{tabular}{|c|c|c|c|} 
-\hline
-\multirow{4}{*}{\bf Matrix}  & {\bf Total comms.}      & {\bf Total comms.}      & {\bf Execution} \\
-                             & {\bf volume without}    & {\bf volume with}       & {\bf trime}  \\
-                             & {\bf hypergraph}        & {\bf hypergraph }       & {\bf of the parti.}  \\  
-                             & {\bf parti.}            & {\bf parti.}            & {\bf in minutes}\\ \hline \hline
-
-2cubes\_sphere               & $25,360,543$            & $240,679$               & $68.98$         \\
-
-ecology2                     & $26,044,002$            & $73,021$                & $4.92$          \\
-
-finan512                     & $26,087,431$            & $900,729$               & $33.72$         \\
-
-G3\_circuit                  & $31,912,003$            & $5,366,774$             & $11.63$         \\ 
-
-shallow\_water2              & $25,105,108$            & $60,899$                & $5.06$          \\ 
-
-thermal2                     & $30,012,846$            & $1,077,921$             & $17.88$         \\ \hline \hline
-
-cage13                       & $28,254,282$            & $3,845,440$             & $196.45$        \\
-
-crashbasis                   & $29,020,060$            & $2,401,876$             & $33.39$         \\
-
-FEM\_3D\_thermal2            & $25,263,767$            & $250,105$               & $49.89$         \\
-
-language                     & $27,291,486$            & $1,537,835$             & $9.07$          \\
-
-poli\_large                  & $25,053,554$            & $7,388,883$             & $5.92$          \\
-
-torso3                       & $25,682,514$            & $613,250$               & $61.51$         \\ \hline       
-\end{tabular}
-\caption{The total communication volume between 12 GPU computing nodes without and with the hypergraph partitioning method.}
-\label{tab:12}
-\end{center}
-\end{table}
-
-Nevertheless, as we can see from the fourth column of Table~\ref{tab:12}, the hypergraph partitioning takes longer compared
-to the execution times of the resolutions. As previously mentioned, the hypergraph partitioning method is less efficient in
-terms of memory consumption and partitioning time than its graph counterpart, but the hypergraph well models the nonsymmetric
-and irregular problems. So for the applications which often use the same sparse matrices, we can perform the hypergraph partitioning
-on these matrices only once for each and then, we save the traces of these partitionings in files to be reused several times.
-Therefore, this allows us to avoid the partitioning of the sparse matrices at each resolution of the linear systems.
-
-However, the most important performance parameter is the scalability of the parallel CG and GMRES solvers on a GPU cluster.
-Particularly, we have taken into account the weak-scaling of both parallel algorithms on a cluster of one to 12 GPU computing
-nodes. We have performed a set of experimental tests on both matrix structures: band matrices and five-bands matrices. The
-sparse matrices of tests are generated from the symmetric sparse matrix {\it thermal2} chosen from the Davis's collection.
-Figures~\ref{fig:09}-$(a)$ and~\ref{fig:09}-$(b)$ show the execution times of both parallel methods for solving large linear
-systems associated to band matrices and those associated to five-bands matrices, respectively. The size of a sparse sub-matrix
-per computing node, for each matrix structure, is fixed as follows:
-\begin{itemize*}
-\item band matrix: $15$ million of rows and $105,166,557$ of nonzero values,
-\item five-bands matrix: $5$ million of rows and $78,714,492$ of nonzero values. 
-\end{itemize*}
-We can see from these figures that both parallel solvers are quite scalable on a GPU cluster. Indeed, the execution times remains
-almost constant while the size of the size of the sparse linear systems to be solved increases proportionally with the number of 
-the GPU computing nodes. This means that the communication cost is relatively constant regardless of the number the computing nodes
-in the GPU cluster.
-
-\begin{figure}
-\centering
-\begin{tabular}{c}
-\includegraphics[scale=0.7]{Chapters/chapter12/figures/scale_band} \\
-\small{(a) Sparse band matrices} \\
-\\
-\includegraphics[scale=0.7]{Chapters/chapter12/figures/scale_5band} \\
-\small{(b) Sparse five-bands matrices}
-\end{tabular}
-\caption{Weak-scaling of the parallel CG and GMRES solvers on a GPU cluster for solving large sparse linear systems.}
-\label{fig:09}
-\end{figure}
-
-%%--------------------------%%
-%%       SECTION 6          %%
-%%--------------------------%%
-\section{Conclusion}
-\label{sec:06}
-In this chapter, we have aimed at harnessing the computing power of a cluster of GPUs for solving large sparse linear systems.
-For this, we have used two Krylov sub-space iterative methods: the CG and GMRES methods. The first method is well-known to its
-efficiency for solving symmetric linear systems and the second one is used, particularly, for solving nonsymmetric linear systems. 
-
-We have presentend the parallel implementation of both iterative methods on a GPU cluster. Particularly, the operations dealing with
-the vectors and/or matrices, of these methods, are parallelized between the different GPU computing nodes of the cluster. Indeed,
-the data-parallel vector operations are accelerated by GPUs and the communications required to synchronize the parallel computations
-are carried out by CPU cores. For this, we have used a heterogeneous CUDA/MPI programming to implement the parallel iterative
-algorithms.
-
-In the experimental tests, we have shown that using a GPU cluster is efficient for solving linear systems associated to very large
-sparse matrices. The experimental results, obtained in the present chapter, showed that a cluster of $12$ GPUs is about $7$
-times faster than a cluster of $24$ CPU cores for solving large sparse linear systems of $25$ million unknown values. This is due
-to the GPU ability to compute the data-parallel operations faster than the CPUs. However, we have shown that solving linear systems
-associated to matrices having large bandwidths uses many communications to synchronize the computations of GPUs, which slow down 
-even more the resolution. Moreover, there two kinds of communications: between a CPU and its GPU and between CPUs of the computing
-nodes, such that the first ones are the slowest communications on a GPU cluster. So, we have proposed to use the hypergraph partitioning
-instead of the row-by-row partitioning. This allows to minimize the data dependencies between the GPU computing nodes and thus to
-reduce the total communication volume. The experimental results showed that using the hypergraph partitioning technique improve the
-execution times on average of $76\%$ to the CG method and of $65\%$ to the GMRES method on a cluster of $12$ GPUs. 
-
-In the recent GPU hardware and software architectures, the GPU-Direct system with CUDA version 5.0 is used so that two GPUs located on
-the same node or on distant nodes can communicate between them directly without CPUs. This allows to improve the data transfers between
-GPUs.          
-
-\putbib[Chapters/chapter12/biblio12]
-
diff --git a/BookGPU/Chapters/chapter13/ch13.tex b/BookGPU/Chapters/chapter13/ch13.tex
index 941f085..b803de9 100755
--- a/BookGPU/Chapters/chapter13/ch13.tex
+++ b/BookGPU/Chapters/chapter13/ch13.tex
@@ -5,7 +5,7 @@
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  
 %\chapterauthor{}{}
-\chapterauthor{Lilia Ziane Khodja, RaphaÃ«l Couturier and Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France}
+\chapterauthor{Lilia Ziane Khodja, Raphaël Couturier, and Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France}
 \chapterauthor{Ming Chau}{Advanced Solutions Accelerator, Castelnau Le Lez, France}
 %\chapterauthor{RaphaÃ«l Couturier}{Femto-ST Institute, University of Franche-Comte, France}
 \chapterauthor{Pierre SpitÃ©ri}{ENSEEIHT-IRIT, Toulouse, France}
@@ -23,23 +23,23 @@
 %%--------------------------%%
 \section{Introduction}
 \label{ch13:sec:01}
-The obstacle problem is one kind of free boundary problems. It allows to model,
+The obstacle problem is one kind of free boundary problem. It allows us to model,
 for example, an elastic membrane covering a solid obstacle. In this case, the
 objective is to find an equilibrium position of this membrane constrained to be
 above the obstacle and which tends to minimize its surface and/or its energy.
-The study of such problems occurs in many applications, for example: fluid mechanics,
-bio-mathematics (tumor growth process) or financial mathematics (American or
+The study of such problems occurs in many applications, for example, fluid mechanics,
+biomathematics (tumor growth process), or financial mathematics (American or
 European option pricing).
 
 In this chapter, we focus on solutions of large obstacle problems defined in a
-three-dimensional domain. Particularly, the present study consists in solving
+three-dimensional domain. Particularly, the present study consists of solving
 large nonlinear systems derived from the spatial discretization of these problems.
 Owing to the great size of such systems, in order to reduce computation times,
 we proceed by solving them by parallel synchronous or asynchronous iterative
 algorithms. Moreover, we aim at harnessing the computing power of GPUs to accelerate
 computations of these parallel algorithms. For this, we use an iterative method
-involving a projection on a convex set, which is: the projected Richardson method.
-We choose this method among other iterative methods because it is easy to implement
+involving a projection on a convex set, which is the projected Richardson method.
+We chose this method among other iterative methods because it is easy to implement
 on parallel computers and easy to adapt to GPU architectures.
 
 In Section~\ref{ch13:sec:02}, we present the mathematical model of obstacle problems
@@ -48,7 +48,7 @@ projected Richardson method. Next, in Section~\ref{ch13:sec:04}, we give the mai
 points of the parallel implementation of both synchronous and asynchronous algorithms
 of the projected Richardson method on a GPU cluster. In Section~\ref{ch13:sec:05}, we
 present the performances of both parallel algorithms obtained from simulations carried
-out on a CPU and GPU clusters. Finally, in Section~\ref{ch13:sec:06}, we use the read-black
+out on GPU clusters. Finally, in Section~\ref{ch13:sec:06}, we use the red-black
 ordering technique to improve the convergence and, thus, the execution times of the parallel
 projected Richardson algorithms on the GPU cluster. 
 
@@ -68,7 +68,7 @@ three-dimensional domain. This model is based on that presented in~\cite{ch13:re
 \subsection{Mathematical model}
 \label{ch13:sec:02.01}
 An obstacle problem\index{Obstacle~problem}, arising for example in mechanics or financial
-derivatives, consists in solving a time dependent nonlinear equation\index{Nonlinear}:
+derivatives, consists of solving a time-dependent nonlinear equation\index{Nonlinear}:
 \begin{equation}
 \left\{
 \begin{array}{l}
@@ -80,17 +80,17 @@ u(0,x,y,z)=u_0(x,y,z),\\
 \right.
 \label{ch13:eq:01}
 \end{equation}
-where $u_0$ is the initial condition, $c\geq 0$, $b$ and $\eta$ are physical parameters,
-$T$ is the final time, $u=u(t,x,y,z)$ is an element of the solution vector $U$ to compute,
-$f$ is the right-hand side that could represent, for example, the external forces, B.C.
-describes the boundary conditions on the boundary $\partial\Omega$ of the domain $\Omega$,
-$\phi$ models a constraint imposed to $u$, $\Delta$ is the Laplacian operator, $\nabla$
-is the gradient operator, a.e.w. means almost every where and ``.'' defines the products
-between two scalars, a scalar and a vector or a matrix and a vector. In practice the boundary
+where $u_0$ is the initial condition; $c\geq 0$, $b$, and $\eta$ are physical parameters;
+$T$ is the final time; $u=u(t,x,y,z)$ is an element of the solution vector $U$ to compute;
+$f$ is the right-hand side that could represent, for example, the external forces; B.C.
+describes the boundary conditions on the boundary $\partial\Omega$ of the domain $\Omega$;
+$\phi$ models a constraint imposed on $u$; $\Delta$ is the Laplacian operator; $\nabla$
+is the gradient operator; a.e.w. means almost everywhere, and ``.'' defines the products
+between two scalars, a scalar and a vector, or a matrix and a vector. In practice the boundary
 condition, generally considered, is the Dirichlet condition (where $u$ is fixed on $\partial\Omega$)
 or the Neumann condition (where the normal derivative of $u$ is fixed on $\partial\Omega$). 
 
-The time dependent equation~(\ref{ch13:eq:01}) is numerically solved by considering an
+The time-dependent equation~(\ref{ch13:eq:01}) is numerically solved by considering an
 implicit or a semi-implicit time marching, where at each time step $k$ a stationary nonlinear
 problem\index{Nonlinear} is solved:
 \begin{equation}
@@ -118,7 +118,7 @@ the operator is self-adjoint or not plays an important role in the choice
 of the appropriate algorithm for solving nonlinear systems derived from the
 discretization of the obstacle problem\index{Obstacle~problem}. Nevertheless,
 since the convection coefficients arising in the operator~(\ref{ch13:eq:02})
-are constant, we can formulate the same problem by an self-adjoint operator
+are constant, we can formulate the same problem by a self-adjoint operator
 by performing a classical change of variables. Then, we can replace the stationary
 convection-diffusion problem:
 \begin{equation}
@@ -130,19 +130,19 @@ by the following stationary diffusion operator:
 -\eta.\Delta u+(\frac{\|b\|^{2}_{2}}{4\eta}+c+\delta).u=e^{-a}g=f,
 \label{ch13:eq:04}
 \end{equation}
-where $b=\{b_{1},b_{2},b_{3}\}$, $\|b\|_{2}$ denotes the Euclidean norm of $b$ and
+where $b=\{b_{1},b_{2},b_{3}\}$, $\|b\|_{2}$ denotes the Euclidean norm of $b$, and
 $v=e^{-a}.u$ represents the general change of variables such that $a=\frac{b^{t}(x,y,z)}{2\eta}$.
 Consequently, the numerical resolution of the diffusion problem (the self-adjoint
-operator~(\ref{ch13:eq:04})) is done by optimization algorithms, in contrast, that
+operator~(\ref{ch13:eq:04})) is done by optimization algorithms, in contrast to that
 of the convection-diffusion problem (non self-adjoint operator~(\ref{ch13:eq:03}))
-is done by relaxation algorithms. In the case of our studied algorithm, the convergence\index{Convergence}
-is ensured by M-matrix property then, the performance is linked to the magnitude of
+which is done by relaxation algorithms. In the case of our studied algorithm, the convergence\index{Convergence}
+is ensured by the M-matrix property; then, the performance is linked to the magnitude of
 the spectral radius of the iteration matrix, which is independent of the condition
 number.
 
 Next, the three-dimensional domain $\Omega\subset\mathbb{R}^{3}$ is set to $\Omega=\lbrack 0,1\rbrack^{3}$
-and discretized with an uniform Cartesian mesh constituted by $M=m^3$ discretization
-points, where $m$ related to the spatial discretization step by $h=\frac{1}{m+1}$. This
+and discretized with a uniform Cartesian mesh constituted by $M=m^3$ discretization
+points, where $m$ is related to the spatial discretization step by $h=\frac{1}{m+1}$. This
 is carried out by using a classical order 2 finite difference approximation of the Laplacian.
 So, the complete discretization of both stationary boundary value problems~(\ref{ch13:eq:03})
 and~(\ref{ch13:eq:04}) leads to the solution of a large discrete complementary problem
@@ -160,7 +160,7 @@ of the following form, when both Dirichlet or Neumann boundary conditions are us
 where $A$ is a matrix obtained after the spatial discretization by a finite difference
 method, $G$ is derived from the Euler first order implicit time marching scheme and from
 the discretized right-hand side of the obstacle problem, $\delta$ is the inverse of the
-time step $k$ and $I$ is the identity matrix. The matrix $A$ is symmetric when the self-adjoint
+time step $k$, and $I$ is the identity matrix. The matrix $A$ is symmetric when the self-adjoint
 operator is considered and nonsymmetric otherwise.
 
 According to the chosen discretization scheme of the Laplacian, $A$ is an M-matrix (irreducibly
@@ -179,7 +179,7 @@ In this chapter, we aim at harnessing the computing power of GPU clusters for so
 large nonlinear systems\index{Nonlinear}. Then, we choose to use the projected Richardson
 iterative method\index{Iterative~method!Projected~Richardson} for solving the diffusion
 problem~(\ref{ch13:eq:04}). Indeed, this method is based on the iterations of the Jacobi
-method\index{Iterative~method!Jacobi} which are easy to parallelize on parallel computers
+method\index{Iterative~method!Jacobi}, which are easy to parallelize on parallel computers
 and easy to adapt to GPU architectures. Then, according to the boundary value problem
 formulation with a self-adjoint operator~(\ref{ch13:eq:04}), we can consider here the
 equivalent optimization problem and the fixed point mapping associated to its solution.
@@ -198,7 +198,7 @@ U^{*} = F(U^{*}), \\
 \end{equation}
 where $U\mapsto F(U)$ is an application from $E$ to $E$.
 
-Let $K$ be a closed convex set defined by:
+Let $K$ be a closed convex set defined by
 \begin{equation}
 K = \{U | U \geq \Phi \mbox{~everywhere in~} E\},
 \label{ch13:eq:07}
@@ -214,16 +214,16 @@ is formulated as the following constrained optimization problem:
 \right.
 \label{ch13:eq:08}
 \end{equation}
-where the cost function is given by:
+where the cost function is given by
 \begin{equation}
 J(U) = \frac{1}{2}\scalprod{\mathcal{A}.U}{U} - \scalprod{G}{U},
 \label{ch13:eq:09}
 \end{equation}
 in which $\scalprod{.}{.}$ denotes the scalar product in $E$, $\mathcal{A}=A+\delta I$
-is a symmetric positive definite, $A$ is the discretization matrix associated with the
+is a symmetric positive definite matrix, and $A$ is the discretization matrix associated with the
 self-adjoint operator~(\ref{ch13:eq:04}) after change of variables.
 
-For any $U\in E$, let $P_K(U)$ be the projection of $U$ on $K$. For any $\gamma\in\mathbb{R}$,
+For any $U\in E$, let $P_K(U)$ be the projection of $U$ on $K$. For any $\gamma\in\mathbb{R}$,
 $\gamma>0$, the fixed point mapping $F_{\gamma}$ of the projected Richardson method\index{Iterative~method!Projected~Richardson}
 is defined as follows:
 \begin{equation}
@@ -251,8 +251,8 @@ F_{\gamma}(U) & = & (F_{1,\gamma}(U),\ldots,F_{\alpha,\gamma}(U)). \\
 \end{array}
 \label{ch13:eq:11}
 \end{equation}
-Assume that the convex set $K=\displaystyle\prod_{i=1}^{\alpha}K_{i}$, such that $\forall i\in\{1,\ldots,\alpha\},K_i\subset E_i$
-and $K_i$ is a closed convex set. Let also $G=(G_1,\ldots,G_{\alpha})\in E$ and, for any
+Assume that the convex set $K=\displaystyle\prod_{i=1}^{\alpha}K_{i}$, such that $\forall i\in\{1,\ldots,\alpha\},K_i\subset E_i$,
+and $K_i$ is a closed convex set. Let also $G=(G_1,\ldots,G_{\alpha})\in E$; for any
 $U\in E$, $P_K(U)=(P_{K_1}(U_1),\ldots,P_{K_{\alpha}}(U_{\alpha}))$ is the projection of $U$
 on $K$ where $\forall i\in\{1,\ldots,\alpha\},P_{K_i}$ is the projector from $E_i$ onto
 $K_i$. So, the fixed point mapping of the projected Richardson method~(\ref{ch13:eq:10})\index{Iterative~method!Projected~Richardson}
@@ -267,7 +267,7 @@ $\mathcal{A}_{i,j}$ denote block matrices of $\mathcal{A}$.
 The parallel asynchronous iterations of the projected Richardson method for solving the
 obstacle problem~(\ref{ch13:eq:08}) are defined as follows: let $U^0\in E,U^0=(U^0_1,\ldots,U^0_\alpha)$
 be the initial solution, then for all $p\in\mathbb{N}$, the iterate $U^{p+1}=(U^{p+1}_1,\ldots,U^{p+1}_{\alpha})$
-is recursively defined by:
+is recursively defined by
 \begin{equation}
 U_i^{p+1} = 
 \left\{
@@ -283,7 +283,7 @@ where
 \left\{
 \begin{array}{l}
 \forall p\in\mathbb{N}, s(p)\subset\{1,\ldots,\alpha\}\mbox{~and~} s(p)\ne\emptyset, \\
-\forall i\in\{1,\ldots,\alpha\},\{p \ | \ i \in s(p)\}\mbox{~is denombrable},
+\forall i\in\{1,\ldots,\alpha\},\{p \ | \ i \in s(p)\}\mbox{~is countably infinite},
 \end{array}
 \right.
 \label{ch13:eq:14}
@@ -300,27 +300,27 @@ and $\forall j\in\{1,\ldots,\alpha\}$,
 \end{equation}
 
 The previous asynchronous scheme\index{Asynchronous} of the projected Richardson
-method models computations that are carried out in parallel without order nor
+method models computations that are carried out in parallel without order or
 synchronization (according to the behavior of the parallel iterative method) and
 describes a subdomain method without overlapping. It is a general model that takes
 into account all possible situations of parallel computations and nonblocking message
-passing. So, the synchronous iterative scheme\index{Synchronous} is defined by:
+passing. So, the synchronous iterative scheme\index{Synchronous} is defined by
 \begin{equation}
 \forall j\in\{1,\ldots,\alpha\} \mbox{,~} \forall p\in\mathbb{N} \mbox{,~} \rho_j(p)=p.
 \label{ch13:eq:16}
 \end{equation}
 The values of $s(p)$ and $\rho_j(p)$ are defined dynamically and not explicitly by
 the parallel asynchronous or synchronous execution of the algorithm. Particularly,
-it enables one to consider distributed computations whereby processors compute at
+they allow us to consider distributed computations whereby processors compute at
 their own pace according to their intrinsic characteristics and computational load.
 The parallelism between the processors is well described by the set $s(p)$ which
 contains at each step $p$ the index of the components relaxed by each processor on
 a parallel way while the use of delayed components in~(\ref{ch13:eq:13}) permits one
 to model nondeterministic behavior and does not imply inefficiency of the considered
 distributed scheme of computation. Note that, according to~\cite{ch13:ref7}, theoretically,
-each component of the vector must be relaxed an infinity of time. The choice of the
-relaxed components to be used in the computational process may be guided by any criterion
-and, in particular, a natural criterion is to pick-up the most recently available
+each component of the vector must be relaxed an infinite number of times. The choice of the
+relaxed components to be used in the computational process may be guided by any criterion,
+and in particular, a natural criterion is to pick up the most recently available
 values of the components computed by the processors. Furthermore, the asynchronous
 iterations are implemented by means of nonblocking MPI communication subroutines\index{MPI~subroutines!Nonblocking}
 (asynchronous communications).
@@ -329,7 +329,7 @@ The important property ensuring the convergence of the parallel projected Richar
 method, both synchronous and asynchronous algorithms, is the fact that $\mathcal{A}$
 is an M-matrix. Moreover, the convergence\index{Convergence} proceeds from a result
 of~\cite{ch13:ref6}. Indeed, there exists a value $\gamma_0>0$, such that $\forall\gamma\in ]0,\gamma_0[$,
-the parallel iterations~(\ref{ch13:eq:13}), (\ref{ch13:eq:14}) and~(\ref{ch13:eq:15}),
+the parallel iterations~(\ref{ch13:eq:13}), (\ref{ch13:eq:14}), and~(\ref{ch13:eq:15}),
 associated to the fixed point mapping $F_\gamma$~(\ref{ch13:eq:12}), converge to the
 unique solution $U^{*}$ of the discretized problem. 
 
@@ -343,64 +343,64 @@ In this section, we give the main key points of the parallel implementation of t
 projected Richardson method, both synchronous and asynchronous versions, on a GPU
 cluster, for solving the nonlinear systems derived from the discretization of large
 obstacle problems. More precisely, each nonlinear system is solved iteratively using
-the whole cluster. We use a heterogeneous CUDA/MPI programming. Indeed, the communication
+the whole cluster. We use heterogeneous CUDA and MPI programming. Indeed, the communication
 of data, at each iteration between the GPU computing nodes, can be either synchronous
 or asynchronous using the MPI communication subroutines, whereas inside each GPU node,
 a CUDA parallelization is performed.
 
-\begin{figure}[!h]
-\centerline{\includegraphics[scale=0.30]{Chapters/chapter13/figures/splitCPU}}
-\caption{Data partitioning of a problem to be solved among $S=3\times 4$ computing nodes.}
-\label{ch13:fig:01}
-\end{figure}
-
 Let $S$ denote the number of computing nodes\index{Computing~node} on the GPU cluster,
 where a computing node is composed of CPU core holding one MPI process and a GPU card.
 So, before starting computations, the obstacle problem of size $(NX\times NY\times NZ)$
-is split into $S$ parallelepipedic sub-problems, each for a node (MPI process, GPU), as
+is split into $S$ parallelepipedic subproblems, each for a node (MPI process, GPU), as
 is shown in Figure~\ref{ch13:fig:01}. Indeed, the $NY$ and $NZ$ dimensions (according
-to the $y$ and $z$ axises) of the three-dimensional problem are, respectively, split
+to the $y$ and $z$ axes) of the three-dimensional problem are split, respectively,
 into $Sy$ and $Sz$ parts, such that $S=Sy\times Sz$. In this case, each computing node
-has at most four neighboring nodes. This kind of the data partitioning reduces the data
+has at most four neighboring nodes. This kind of data partitioning reduces the data
 exchanges at subdomain boundaries compared to a naive $z$-axis-wise partitioning.
 
-\begin{algorithm}[!t]
-Initialization of the parameters of the sub-problem\;
-Allocate and fill the data in the global memory GPU\;
-\For{$i=1$ {\bf to} $NbSteps$}{
-   $G = \frac{1}{k}.U^0 + F$\;
-   Solve($A$, $U^0$, $G$, $U$, $\varepsilon$, $MaxRelax$)\;
-   $U^0 = U$\;
-}
-Copy the solution $U$ back from GPU memory\;
-\caption{Parallel solving of the obstacle problem on a GPU cluster}
-\label{ch13:alg:01}
-\end{algorithm}
+\begin{figure}
+\centerline{\includegraphics[scale=0.30]{Chapters/chapter13/figures/splitCPU}}
+\caption{Data partitioning of a problem to be solved among $S=3\times 4$ computing nodes.}
+\label{ch13:fig:01}
+\end{figure}
 
-All the computing nodes of the GPU cluster execute in parallel the same Algorithm~\ref{ch13:alg:01}
-but on different three-dimensional sub-problems of size $(NX\times ny\times nz)$.
+All the computing nodes of the GPU cluster execute in parallel Algorithm~\ref{ch13:alg:01}
+on three-dimensional subproblems of size $(NX\times ny\times nz)$.
 This algorithm gives the main key points for solving an obstacle problem\index{Obstacle~problem}
 defined in a three-dimensional domain, where $A$ is the discretization matrix, $G$
-is the right-hand side and $U$ is the solution vector. After the initialization step,
+is the right-hand side, and $U$ is the solution vector. After the initialization step,
 all the data generated from the partitioning operation are copied from the CPU memories
-to the GPU global memories, to be processed on the GPUs. Next, the algorithm uses $NbSteps$
+to the GPU global memories to be processed on the GPUs. Next, the algorithm uses $NbSteps$
 time steps to solve the global obstacle problem. In fact, it uses a parallel algorithm
-adapted to GPUs of the projected Richardson iterative method for solving the nonlinear
+adapted to GPUs from the projected Richardson iterative method for solving the nonlinear
 systems\index{Nonlinear} of the obstacle problem. This function is defined by {\it Solve()}
 in Algorithm~\ref{ch13:alg:01}. At every time step, the initial guess $U^0$ for the iterative
 algorithm is set to the solution found at the previous time step. Moreover, the right-hand
 side $G$ is computed as follows: \[G = \frac{1}{k}.U^{prev} + F\] where $k$ is the time step,
-$U^{prev}$ is the solution computed in the previous time step and each element $f(x, y, z)$
+$U^{prev}$ is the solution computed in the previous time step, and each element $f(x, y, z)$
 of the vector $F$ is computed as follows:
 \begin{equation}
 f(x,y,z)=\cos(2\pi x)\cdot\cos(4\pi y)\cdot\cos(6\pi z).
 \label{ch13:eq:18}
 \end{equation}
 Finally, the solution $U$ of the obstacle problem is copied back from the GPU global
-memories to the CPU memories. We use the communication subroutines of the CUBLAS library~\cite{ch13:ref8}\index{CUBLAS}
-(CUDA Basic Linear Algebra Subroutines) for the memory allocations in the GPU (\verb+cublasAlloc()+)
+memories to the CPU memories. We use the communication subroutines of the CUBLAS 
+(CUDA Basic Linear Algebra Subroutines) library~\cite{ch13:ref8}\index{CUBLAS} for the memory allocations in the GPU (\verb+cublasAlloc()+)
 and the data transfers between the CPU and its GPU: \verb+cublasSetVector()+ and \verb+cublasGetVector()+. 
 
+\begin{algorithm}[t]
+Initialization of the parameters of the subproblem\;
+Allocate and fill the data in the global memory GPU\;
+\For{$i=1$ {\bf to} $NbSteps$}{
+   $G = \frac{1}{k}.U^0 + F$\;
+   Solve($A$, $U^0$, $G$, $U$, $\varepsilon$, $MaxRelax$)\;
+   $U^0 = U$\;
+}
+Copy the solution $U$ back from GPU memory\;
+\caption{parallel solving of the obstacle problem on a GPU cluster}
+\label{ch13:alg:01}
+\end{algorithm}
+
 \begin{algorithm}[!t]
   $p = 0$\;
   $conv = false$\;
@@ -414,17 +414,17 @@ and the data transfers between the CPU and its GPU: \verb+cublasSetVector()+ and
     $p = p + 1$\;
     $conv$ = Convergence($error$, $p$, $\varepsilon$, $MaxRelax$)\;
   }
-\caption{Parallel iterative solving of the nonlinear systems on a GPU cluster ($Solve()$ function)}
+\caption{parallel iterative solving of the nonlinear systems on a GPU cluster ($Solve()$ function)}
 \label{ch13:alg:02}
 \end{algorithm}
 
-As many other iterative methods, the algorithm of the projected Richardson
+Like many other iterative methods, the algorithm of the projected Richardson
 method\index{Iterative~method!Projected~Richardson} is based on algebraic
 functions operating on vectors and/or matrices, which are more efficient on
 parallel computers when they work on large vectors. Its parallel implementation
 on the GPU cluster is carried out so that the GPUs execute the vector operations
 as kernels and the CPUs execute the serial codes, supervise the kernel executions
-and the data exchanges with the neighboring nodes\index{Neighboring~node} and
+and the data exchanges with the neighboring nodes\index{Neighboring~node}, and
 supply the GPUs with data. Algorithm~\ref{ch13:alg:02} shows the main key points
 of the parallel iterative algorithm (function $Solve()$ in Algorithm~\ref{ch13:alg:01}).
 All the vector operations inside the main loop ({\bf repeat} ... {\bf until})
@@ -432,7 +432,7 @@ are executed by the GPU. We use the following functions of the CUBLAS library\in
 \begin{itemize}
 \item \verb+cublasDaxpy()+ to compute the difference between the solution vectors $U^{p}$ and $U^{p+1}$ computed in two successive relaxations
 $p$ and $p+1$ (line~$7$ in Algorithm~\ref{ch13:alg:02}),
-\item \verb+cublasDnrm2()+ to perform the Euclidean norm (line~$8$) and,
+\item \verb+cublasDnrm2()+ to perform the Euclidean norm (line~$8$), and
 \item \verb+cublasDcpy()+ for the data copy of a vector to another one in the GPU memory (lines~$3$ and~$9$).
 \end{itemize}
 
@@ -441,41 +441,41 @@ depend on the resources of the GPU multiprocessor and the resource requirements
 of the kernel. So, if $block$ defines the size of a thread block, which must
 not exceed the maximum size of a thread block, then the number of thread blocks
 in the grid, denoted by $grid$, can be computed according to the size of the
-local sub-problem as follows: \[grid = \frac{(NX\times ny\times nz)+block-1}{block}.\]
+local subproblem as follows: \[grid = \frac{(NX\times ny\times nz)+block-1}{block}.\]
 However, when solving very large problems, the size of the thread grid can exceed
-the maximum number of thread blocks that can be executed on the GPUs (up-to $65.535$
-thread blocks) and, thus, the kernel will fail to launch. Therefore, for each kernel,
-we decompose the three-dimensional sub-problem into $nz$ two-dimensional slices of size
+the maximum number of thread blocks that can be executed on the GPUs (up to $65,535$
+thread blocks), and thus, the kernel will fail to launch. Therefore, for each kernel,
+we decompose the three-dimensional subproblem into $nz$ two-dimensional slices of size
 ($NX\times ny$), as is shown in Figure~\ref{ch13:fig:02}. All slices of the same kernel
-are executed using {\bf for} loop by $NX\times ny$ parallel threads organized in a
+are executed using a {\bf for} loop by $NX\times ny$ parallel threads organized in a
 two-dimensional grid of two-dimensional thread blocks, as is shown in Listing~\ref{ch13:list:01}.
 Each thread is in charge of $nz$ discretization points (one from each slice), accessed
 in the GPU memory with a constant stride $(NX\times ny)$.
 
 \begin{figure}
 \centerline{\includegraphics[scale=0.30]{Chapters/chapter13/figures/splitGPU}}
-\caption{Decomposition of a sub-problem in a GPU into $nz$ slices.}
+\caption{Decomposition of a subproblem in a GPU into $nz$ slices.}
 \label{ch13:fig:02}
 \end{figure}
 
 \begin{center}
-\lstinputlisting[label=ch13:list:01,caption=Skeleton codes of a GPU kernel and a CPU function]{Chapters/chapter13/ex1.cu}
+\lstinputlisting[label=ch13:list:01,caption=skeleton codes of a GPU kernel and a CPU function]{Chapters/chapter13/ex1.cu}
 \end{center}
 The function $Determine\_Bordering\_Vector\_Elements()$ (line~$5$ in Algorithm~\ref{ch13:alg:02})
 determines the values of the vector elements shared at the boundaries with neighboring computing
-nodes. Its main operations are defined as follows:
+nodes. Its main operations are as follows:
 \begin{enumerate}
 \item define the values associated to the bordering points needed by the neighbors,
 \item copy the values associated to the bordering points from the GPU to the CPU,
-\item exchange the values associated to the bordering points between the neighboring CPUs,
-\item copy the received values associated to the bordering points from the CPU to the GPU,
+\item exchange the values associated to the bordering points between the neighboring CPUs, and
+\item copy the received values associated to the bordering points from the CPU to the GPU.
 \end{enumerate}
 The first operation of this function is implemented as kernels to be performed by the GPU:
 \begin{itemize}
-\item a kernel executed by $NX\times nz$ threads to define the values associated to the bordering vector elements along $y$-axis and,
-\item a kernel executed by $NX\times ny$ threads to define the values associated to the bordering vector elements along $z$-axis.  
+\item a kernel executed by $NX\times nz$ threads to define the values associated to the bordering vector elements along the $y$-axis, and
+\item a kernel executed by $NX\times ny$ threads to define the values associated to the bordering vector elements along the $z$-axis.  
 \end{itemize}
-As mentioned before, we develop the \emph{synchronous} and \emph{asynchronous}
+As mentioned previously, we develop the \emph{synchronous} and \emph{asynchronous}
 algorithms of the projected Richardson method. Obviously, in this scope, the
 synchronous\index{Synchronous} or asynchronous\index{Asynchronous} communications
 refer to the communications between the CPU cores (MPI processes) on the GPU cluster,
@@ -487,11 +487,11 @@ and \verb+cublasGetVectorAsync()+ in the asynchronous algorithm. Moreover, we
 use the communication routines of the MPI library to carry out the data exchanges
 between the neighboring nodes. We use the following communication routines: \verb+MPI_Isend()+
 and \verb+MPI_Irecv()+ to perform nonblocking\index{MPI~subroutines!Nonblocking}
-sends and receptions, respectively. For the synchronous algorithm, we use the MPI
+sends and receives, respectively. For the synchronous algorithm, we use the MPI
 routine \verb+MPI_Waitall()+ which puts the MPI process of a computing node in
-blocking status until all data exchanges with neighboring nodes (sends and receptions)
+blocking status until all data exchanges with neighboring nodes (sends and receives)
 are completed. In contrast, for the asynchronous algorithms, we use the MPI routine
-\verb+MPI_Test()+ which tests the completion of a data exchange (send or reception)
+\verb+MPI_Test()+ which tests the completion of a data exchange (send or receive)
 without putting the MPI process in blocking status\index{MPI~subroutines!Blocking}.   
 
 The function $Compute\_New\_Vector\_Elements()$ (line~$6$ in Algorithm~\ref{ch13:alg:02})
@@ -510,7 +510,7 @@ u^{p+1}(x,y,z) =& \frac{1}{Center}(g(x,y,z) - (Center\cdot u^{p}(x,y,z) + \\
 \end{equation}
 where $u^{p}(x,y,z)$ is an element of the iterate vector $U$ computed at the
 iteration $p$ and $g(x,y,z)$ is a vector element of the right-hand side $G$.
-The scalars $Center$, $West$, $East$, $South$, $North$, $Rear$ and $Front$
+The scalars $Center$, $West$, $East$, $South$, $North$, $Rear$, and $Front$
 define constant coefficients of the block matrix $A$. Figure~\ref{ch13:fig:03}
 shows the positions of these coefficients in a three-dimensional domain.  
 
@@ -533,6 +533,7 @@ of the projected Richardson method, which are: the matrix-vector multiplication
 (\verb+MV_Multiplication()+) and the vector elements updates (\verb+Vector_Updates()+).
 The codes of these kernels are based on that presented in Listing~\ref{ch13:list:01}.
 
+\pagebreak
 \lstinputlisting[label=ch13:list:02,caption=GPU kernels of the projected Richardson method]{Chapters/chapter13/ex2.cu}
 
 \begin{figure}
@@ -544,37 +545,38 @@ The codes of these kernels are based on that presented in Listing~\ref{ch13:list
 Each kernel is executed by $NX\times ny$ GPU threads so that $nz$ slices
 of $(NX\times ny)$ vector elements are computed in a {\bf for} loop. In
 this case, each thread is in charge of one vector element from each slice
-(in total $nz$ vector elements along $z$-axis). We can notice from the
+(in total $nz$ vector elements along the $z$-axis). We can notice from the
 formula~(\ref{ch13:eq:17}) that the computation of a vector element $u^{p+1}(x,y,z)$,
 by a thread at iteration $p+1$, requires seven vector elements computed
 at the previous iteration $p$: two vector elements in each dimension plus
-the vector element at the intersection of the three axises $x$, $y$ and $z$
+the vector element at the intersection of the three axes $x$, $y$, and $z$
 (see Figure~\ref{ch13:fig:04}). So, to reduce the memory accesses to the
 high-latency global memory, the vector elements of the current slice can
 be stored in the low-latency shared memories of thread blocks, as is described
 in~\cite{ch13:ref9}. Nevertheless, the fact that the computation of a vector
-element requires only two elements in each dimension does not allow to maximize
+element requires only two elements in each dimension does not allow us to maximize
 the data reuse from the shared memories. The computation of a slice involves
 in total $(bx+2)\times(by+2)$ accesses to the global memory per thread block,
 to fill the required vector elements in the shared memory where $bx$ and $by$
 are the dimensions of a thread block. Then, in order to optimize the memory
 accesses on GPUs, the elements of the iterate vector $U$ are filled in the
-cache texture memory (see~\cite{ch13:ref10}). In new GPU generations as Fermi
+cache texture memory (see~\cite{ch13:ref10}). In new GPU generations such as Fermi
 or Kepler, the global memory accesses are always cached in L1 and L2 caches.
-For example, for a given kernel, we can favour the use of the L1 cache to that
+For example, for a given kernel, we can favor the use of the L1 cache over that
 of the shared memory by using the function \verb+cudaFuncSetCacheConfig(Kernel,cudaFuncCachePreferL1)+.
 So, the initial access to the global memory loads the vector elements required
 by the threads of a block into the cache memory (texture or L1/L2 caches). Then,
 all the following memory accesses read from this cache memory. In Listing~\ref{ch13:list:02},
 the function \verb+fetch_double(v,i)+ is used to read from the texture memory
-the $i^{th}$ element of the double-precision vector \verb+v+ (see Listing~\ref{ch13:list:03}).
+the $i$th element of the double-precision vector \verb+v+ (see Listing~\ref{ch13:list:03}).
 Moreover, the seven constant coefficients of matrix $A$ can be stored in the
 constant memory but, since they are reused $nz$ times by each thread, it is more
-interesting to fill them on the low-latency registers of each thread.    
+efficient to fill them on the low-latency registers of each thread.    
 
-\lstinputlisting[label=ch13:list:03,caption=Memory access to the cache texture memory]{Chapters/chapter13/ex3.cu}
+\pagebreak
+\lstinputlisting[label=ch13:list:03,caption=memory access to the cache texture memory]{Chapters/chapter13/ex3.cu}
 
-The function $Convergence()$ (line~$11$ in Algorithm~\ref{ch13:alg:02}) allows
+The function $Convergence()$ (line~$11$ in Algorithm~\ref{ch13:alg:02}) allows us
 to detect the convergence of the parallel iterative algorithm and is based on
 the tolerance threshold\index{Convergence!Tolerance~threshold} $\varepsilon$
 and the maximum number of relaxations\index{Convergence!Maximum~number~of~relaxations}
@@ -598,13 +600,13 @@ conv \leftarrow true;
 $$
 where the function $AllReduce()$ uses the MPI global reduction subroutine\index{MPI~subroutines!Global}
 \verb+MPI_Allreduce()+ to compute the maximal value, $maxerror$, among the local
-absolute errors, $error$, of all computing nodes and $p$ (in Algorithm~\ref{ch13:alg:02})
+absolute errors, $error$, of all computing nodes, and $p$ (in Algorithm~\ref{ch13:alg:02})
 is used as a counter of the local relaxations carried out by a computing node. In
 the asynchronous\index{Asynchronous} algorithms, the global convergence is detected
 when all computing nodes locally converge. For this, we use a token ring architecture
 around which a boolean token travels, in one direction, from a computing node to another.
 Starting from node $0$, the boolean token is set to $true$ by node $i$ if the local
-convergence is reached or to $false$ otherwise and, then, it is sent to node $i+1$.
+convergence is reached or to $false$ otherwise, and then, it is sent to node $i+1$.
 Finally, the global convergence is detected when node $0$ receives from its neighbor
 node $S-1$, in the ring architecture, a token set to $true$. In this case, node $0$
 sends a stop message (end of parallel solving) to all computing nodes in the cluster.
@@ -615,54 +617,48 @@ sends a stop message (end of parallel solving) to all computing nodes in the clu
 %%--------------------------%%
 \section{Experimental tests on a GPU cluster}
 \label{ch13:sec:05}
-The GPU cluster\index{GPU~cluster} of tests, that we used in this chapter, is an $20Gbps$
+The GPU cluster\index{GPU~cluster} of tests that we used in this chapter is a $20$Gbps
 Infiniband network of six machines. Each machine is a Quad-Core Xeon E5530 CPU running at
 $2.4$GHz. It provides a RAM memory of $12$GB with a memory bandwidth of $25.6$GB/s and it
-is equipped with two Nvidia Tesla C1060 GPUs. A Tesla GPU contains in total $240$ cores
+is equipped with two NVIDIA Tesla C1060 GPUs. A Tesla GPU contains in total $240$ cores
 running at $1.3$GHz. It provides $4$GB of global memory with a memory bandwidth of $102$GB/s,
 accessible by all its cores and also by the CPU through the PCI-Express 16x Gen 2.0 interface
 with a throughput of $8$GB/s. Hence, the memory copy operations between the GPU and the CPU
 are about $12$ times slower than those of the Tesla GPU memory. We have performed our simulations
-on a cluster of $24$ CPU cores and on a cluster of $12$ GPUs. Figure~\ref{ch13:fig:05} describes
+on a cluster of $24$ CPU cores and on a cluster of $12$ GPUs. Figure~\ref{ch12:fig:04} describes
 the components of the GPU cluster of tests.
 
 Linux cluster version 2.6.39 OS is installed on CPUs. C programming language is used for
 coding the parallel algorithms of the methods on both GPU cluster and CPU cluster. CUDA
 version 4.0~\cite{ch13:ref12} is used for programming GPUs, using CUBLAS library~\cite{ch13:ref8}
-to deal with vector operations in GPUs and, finally, MPI functions of OpenMPI 1.3.3 are
+to deal with vector operations in GPUs, and finally, MPI functions of OpenMPI 1.3.3 are
 used to carry out the synchronous and asynchronous communications between CPU cores. Indeed,
-in our experiments, a computing node is managed by a MPI process and it is composed of
+in our experiments, a computing node is managed by one MPI process and it is composed of
 one CPU core and one GPU card.
  
 All experimental results of the parallel projected Richardson algorithms are obtained
 from simulations made in double precision data. The obstacle problems to be solved are
 defined in constant three-dimensional domain $\Omega\subset\mathbb{R}^{3}$. The numerical
-values of the parameters of the obstacle problems are: $\eta=0.2$, $c=1.1$, $f$ is computed
-by formula~(\ref{ch13:eq:18}) and final time $T=0.02$. Moreover, three time steps ($NbSteps=3$)
+values of the parameters of the obstacle problems are $\eta=0.2$, $c=1.1$, $f$ is computed
+by formula~(\ref{ch13:eq:18}), and final time $T=0.02$. Moreover, three time steps ($NbSteps=3$)
 are computed with $k=0.0066$. As the discretization matrix is constant along the time
 steps, the convergence properties of the iterative algorithms do not change. Thus, the
 performance characteristics obtained with three time steps will still be valid for more
 time steps. The initial function $u(0,x,y,z)$ of the obstacle problem~(\ref{ch13:eq:01})
 is set to $0$, with a constraint $u\geq\phi=0$. The relaxation parameter $\gamma$ used
 by the projected Richardson method is computed automatically thanks to the diagonal entries
-of the discretization matrix. The formula and its proof can be found in~\cite{ch13:ref11},
-Section~2.3. The convergence tolerance threshold $\varepsilon$ is set to $1e$-$04$ and the
+of the discretization matrix. The formula and its proof can be found in~\cite{ch13:ref11}.
+The convergence tolerance threshold $\varepsilon$ is set to $1e$-$04$ and the
 maximum number of relaxations is limited to $10^{6}$ relaxations. Finally, the number of
 threads per block is set to $256$ threads, which gives, in general, good performances for
 most GPU applications. We have performed some tests for the execution configurations and
-we have noticed that the best configuration of the $256$ threads per block is an organization
+have noticed that the best configuration of the $256$ threads per block is an organization
 into two dimensions of sizes $(64,4)$. 
 
-\begin{figure}
-\centerline{\includegraphics[scale=0.25]{Chapters/chapter13/figures/cluster}}
-\caption{GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs.}
-\label{ch13:fig:05}
-\end{figure}
-
 The performance measures that we took into account are the execution times and the number
 of relaxations performed by the parallel iterative algorithms, both synchronous and asynchronous
 versions, on the GPU and CPU clusters. These algorithms are used for solving nonlinear systems
-derived from the discretization of obstacle problems of sizes $256^{3}$, $512^{3}$, $768^{3}$
+derived from the discretization of obstacle problems of sizes $256^{3}$, $512^{3}$, $768^{3}$,
 and $800^{3}$. In Table~\ref{ch13:tab:01} and Table~\ref{ch13:tab:02}, we show the performances
 of the parallel synchronous and asynchronous algorithms of the projected Richardson method
 implemented, respectively, on a cluster of $24$ CPU cores and on a cluster of $12$ GPUs. In
@@ -670,14 +666,14 @@ these tables, the execution time defines the time spent by the slowest computing
 number of relaxations is computed as the summation of those carried out by all computing nodes.
 
 In the sixth column of Table~\ref{ch13:tab:01} and in the eighth column of Table~\ref{ch13:tab:02},
-we give the gains in $\%$ obtained by using an asynchronous algorithm compared to a synchronous
+we give the gains in percentage obtained by using an asynchronous algorithm compared to a synchronous
 one. We can notice that the asynchronous version on CPU and GPU clusters is slightly faster than
 the synchronous one for both methods. Indeed, the cluster of tests is composed of local and homogeneous
 nodes communicating via low-latency connections. So, in the case of distant and/or heterogeneous
-nodes (or even with geographically distant clusters) the asynchronous version would be faster than
+nodes (or even with geographically distant clusters), the asynchronous version would be faster than
 the synchronous one. However, the gains obtained on the GPU cluster are better than those obtained
 on the CPU cluster. In fact, the computation times are reduced by accelerating the computations on
-GPUs while the communication times still unchanged.
+GPUs while the communication times remain unchanged.
 
 \begin{table}
 \centering
@@ -685,7 +681,7 @@ GPUs while the communication times still unchanged.
 \hline
 \multirow{2}{*}{\bf Pb. size} & \multicolumn{2}{c|}{\bf Synchronous} & \multicolumn{2}{c|}{\bf Asynchronous} & \multirow{2}{*}{\bf Gain\%} \\ \cline{2-5}
 
-                              & $\mathbf{T_{cpu}}$ & {\bf \#relax.}  & $\mathbf{T_{cpu}}$ & {\bf \#relax.}  &  \\ \hline \hline
+                              & $\mathbf{T_{cpu}}$ & {\bf \# Relax.}  & $\mathbf{T_{cpu}}$ & {\bf \# Relax.}  &  \\ \hline \hline
 
 $256^{3}$                     & $575.22$           & $198,288$        & $539.25$          & $198,613$        & $6.25$ \\ \hline \hline
 
@@ -706,7 +702,7 @@ $800^{3}$                     & $222,108.09$       & $1,769,232$      & $188,790
 \hline
 \multirow{2}{*}{\bf Pb. size} & \multicolumn{3}{c|}{\bf Synchronous}                 & \multicolumn{3}{c|}{\bf Asynchronous}                & \multirow{2}{*}{\bf Gain\%}  \\ \cline{2-7}
 
-                             & $\mathbf{T_{gpu}}$ & {\bf \#relax.}  & $\mathbf{\tau}$ & $\mathbf{T_{gpu}}$ & {\bf \#relax.}  & $\mathbf{\tau}$ & \\  \hline \hline
+                             & $\mathbf{T_{gpu}}$ & {\bf \# Relax.}  & $\mathbf{\tau}$ & $\mathbf{T_{gpu}}$ & {\bf \# Relax.}  & $\mathbf{\tau}$ & \\  \hline \hline
 
 $256^{3}$                    & $29.67$            & $100,692$       &  $19.39$        & $18.00$           & $94,215$         & $29.96$    & $39.33$ \\\hline \hline
 
@@ -728,11 +724,11 @@ $\tau$ as a ratio between the execution time $T_{cpu}$ spent on the CPU cluster
 that $T_{gpu}$ spent on the GPU cluster: \[\tau=\frac{T_{cpu}}{T_{gpu}}.\] We can see
 from these ratios that solving large obstacle problems is faster on the GPU cluster
 than on the CPU cluster. Indeed, the GPUs are more efficient than their counterpart
-CPUs to execute large data-parallel operations. In addition, the projected Richardson
-method is implemented as a fixed point-based iteration and uses the Jacobi vector updates
-that allow a well thread-parallelization on GPUs, such that each GPU thread is in charge
+CPUs at executing large data-parallel operations. In addition, the projected Richardson
+method is implemented as a fixed point based iteration and uses the Jacobi vector updates
+that allow a well-suited thread-parallelization on GPUs, such that each GPU thread is in charge
 of one vector component at a time without being dependent on other vector components
-computed by other threads. Then, this allow to exploit at best the high performance
+computed by other threads. Then, this allows us to exploit at best the high performance
 computing of the GPUs by using all the GPU resources and avoiding the idle cores.
 
 Finally, the number of relaxations performed by the parallel synchronous algorithm
@@ -747,11 +743,11 @@ consequently it also depends on the number of computing nodes.
 %%--------------------------%%
 %%       SECTION 6          %%
 %%--------------------------%%
-\section{Red-Black ordering technique}
+\section{Red-black ordering technique}
 \label{ch13:sec:06}
-As is well-known, the Jacobi method\index{Iterative~method!Jacobi} is characterized
+As is well known, the Jacobi method\index{Iterative~method!Jacobi} is characterized
 by a slow convergence\index{Convergence} rate compared to some iterative methods\index{Iterative~method}
-(for example Gauss-Seidel method\index{Iterative~method!Gauss-Seidel}). So, in this
+(for example, Gauss-Seidel method\index{Iterative~method!Gauss-Seidel}). So, in this
 section, we present some solutions to reduce the execution time and the number of
 relaxations and, more specifically, to speed up the convergence of the parallel
 projected Richardson method on the GPU cluster. We propose to use the point red-black
@@ -762,30 +758,30 @@ apply it to the projected Richardson method as a compromise between the Jacobi
 and Gauss-Seidel iterative methods. 
 
 The general principle of the red-black technique is as follows. Let $t$ be the
-summation of the integer $x$-, $y$- and $z$-coordinates of a vector element $u(x,y,z)$
+summation of the integer $x$-, $y$-, and $z$-coordinates of a vector element $u(x,y,z)$
 on a three-dimensional domain: $t=x+y+z$. As is shown in Figure~\ref{ch13:fig:06.01},
-the red-black ordering technique consists in the parallel computing of the red
-vector elements having even value $t$ by using the values of the black ones then,
+the red-black ordering technique consists of the parallel computing of the red
+vector elements having even value $t$ by using the values of the black ones, then
 the parallel computing of the black vector elements having odd values $t$ by using
 the new values of the red ones.
 
 This technique can be implemented on the GPU in two different manners:
 \begin{itemize}
-\item among all launched threads ($NX\times ny$ threads), only one thread out of two computes its red or black vector element at a time or,
-\item all launched threads (on average half of $NX\times ny$ threads) compute the red vector elements first and, then, the black ones.
+\item among all launched threads ($NX\times ny$ threads), only one thread out of two computes its red or black vector element at a time or
+\item all launched threads (on average half of $NX\times ny$ threads) compute the red vector elements first, and then the black ones.
 \end{itemize}
 However, in both solutions, for each memory transaction, only half of the memory
 segment addressed by a half-warp is used. So, the computation of the red and black
-vector elements leads to use twice the initial number of memory transactions. Then,
+vector elements leads to using twice the initial number of memory transactions. Then,
 we apply the point red-black ordering\index{Iterative~method!Red-Black~ordering}
 accordingly to the $y$-coordinate, as is shown in Figure~\ref{ch13:fig:06.02}. In
 this case, the vector elements having even $y$-coordinate are computed in parallel
-using the values of those having odd $y$-coordinate and then vice-versa. Moreover,
+using the values of those having odd $y$-coordinate and then vice versa. Moreover,
 in the GPU implementation of the parallel projected Richardson method (Section~\ref{ch13:sec:04}),
-we have shown that a sub-problem of size $(NX\times ny\times nz)$ is decomposed into
+we have shown that a subproblem of size $(NX\times ny\times nz)$ is decomposed into
 $nz$ grids of size $(NX\times ny)$. Then, each kernel is executed in parallel by
 $NX\times ny$ GPU threads, so that each thread is in charge of $nz$ vector elements
-along $z$-axis (one vector element in each grid of the sub-problem). So, we propose
+along the $z$-axis (one vector element in each grid of the subproblem). So, we propose
 to use the new values of the vector elements computed in grid $i$ to compute those
 of the vector elements in grid $i+1$. Listing~\ref{ch13:list:04} describes the kernel
 of the matrix-vector multiplication and the kernel of the vector elements updates of
@@ -793,11 +789,12 @@ the parallel projected Richardson method using the red-black ordering technique.
 
 \begin{figure}
 \centering
-  \mbox{\subfigure[Red-Black ordering on x, y and z axises]{\includegraphics[width=2.3in]{Chapters/chapter13/figures/rouge-noir}\label{ch13:fig:06.01}}\quad
-        \subfigure[Red-Black ordering on y axis]{\includegraphics[width=2.3in]{Chapters/chapter13/figures/rouge-noir-y}\label{ch13:fig:06.02}}}
-\caption{Red-Black ordering for computing the iterate vector elements in a three-dimensional space.}
+  \mbox{\subfigure[Red-black ordering on x, y, and z axes]{\includegraphics[width=2.3in]{Chapters/chapter13/figures/rouge-noir}\label{ch13:fig:06.01}}\quad
+        \subfigure[Red-black ordering on y axis]{\includegraphics[width=2.3in]{Chapters/chapter13/figures/rouge-noir-y}\label{ch13:fig:06.02}}}
+\caption{Red-black ordering for computing the iterate vector elements in a three-dimensional space.}
 \end{figure}
 
+\pagebreak
 \lstinputlisting[label=ch13:list:04,caption=GPU kernels of the projected Richardson method using the red-black technique]{Chapters/chapter13/ex4.cu}
 
 Finally, we exploit the concurrent executions between the host functions and the GPU
@@ -814,8 +811,8 @@ neighboring CPUs and this in both synchronous and asynchronous cases.
 
 In Table~\ref{ch13:tab:03}, we report the execution times and the number of relaxations
 performed on a cluster of $12$ GPUs by the parallel projected Richardson algorithms; it
-can be noted that the performances of the projected Richardson are improved by using the
-point read-black ordering. We compare the performances of the parallel projected Richardson
+can be noted that the performances of the projected Richardson algorithm are improved by using the
+point red-black ordering. We compare the performances of the parallel projected Richardson
 method with and without this later ordering (Tables~\ref{ch13:tab:02} and~\ref{ch13:tab:03}).
 We can notice that both parallel synchronous and asynchronous algorithms are faster when
 they use the red-black ordering. Indeed, we can see in Table~\ref{ch13:tab:03} that the
@@ -828,7 +825,7 @@ shown in Table~\ref{ch13:tab:02}.
 \hline
 \multirow{2}{*}{\bf Pb. size} & \multicolumn{2}{c|}{\bf Synchronous} & \multicolumn{2}{c|}{\bf Asynchronous} & \multirow{2}{*}{\bf Gain\%}  \\ \cline{2-5}
 
-                              & $\mathbf{T_{gpu}}$ & {\bf \#relax.}   & $\mathbf{T_{gpu}}$ & {\bf \#relax.}   &           \\  \hline \hline
+                              & $\mathbf{T_{gpu}}$ & {\bf \# Relax.}   & $\mathbf{T_{gpu}}$ & {\bf \# Relax.}   &           \\  \hline \hline
 
 $256^{3}$                     & $18.37$            & $71,988$         & $12.58$           & $67,638$         & $31.52$  \\ \hline \hline
 
@@ -839,7 +836,7 @@ $768^{3}$                     & $2,773.65$         & $590,652$        & $2,222.2
 $800^{3}$                     & $2,748.23$         & $638,916$        & $2,502.61$        & $592,525$        & $8.92$ \\ \hline 
 \end{tabular}
 \vspace{0.5cm}
-\caption{Execution times in seconds of the parallel projected Richardson method using read-black ordering technique implemented on a cluster of 12 GPUs.}
+\caption{Execution times in seconds of the parallel projected Richardson method using the red-black ordering technique implemented on a cluster of 12 GPUs.}
 \label{ch13:tab:03}
 \end{table}
 
@@ -854,15 +851,15 @@ the communication time of the parallel projected Richardson algorithms on a GPU
 cluster. The experimental tests are carried out on a cluster composed of one to
 ten Tesla GPUs. We have focused on the weak scaling of both parallel, synchronous
 and asynchronous, algorithms using the red-black ordering technique. For this, we
-have fixed the size of a sub-problem to $256^{3}$ per computing node (a CPU core
+have fixed the size of a subproblem to $256^{3}$ per computing node (a CPU core
 and a GPU). Then, Figure~\ref{ch13:fig:07} shows the number of relaxations performed,
 on average, per second by a computing node. We can see from this figure that the
 efficiency of the asynchronous algorithm is almost stable, while that of the synchronous
-algorithm decreases (down to $81\%$ in this example) with the increasing of the
+algorithm decreases (down to $81\%$ in this example) with the increase in the
 number of computing nodes on the cluster. This is due to the fact that the ratio
 between the time of the computation over that of the communication is reduced when
 the computations are performed on GPUs. Indeed, GPUs compute faster than CPUs and
-communications are more time consuming. In this context, asynchronous algorithms
+communications are more time-consuming. In this context, asynchronous algorithms
 are more scalable than synchronous ones. So, with large scale GPU clusters, synchronous\index{Synchronous}
 algorithms might be more penalized by communications, as can be deduced from Figure~\ref{ch13:fig:07}.
 That is why we think that asynchronous\index{Asynchronous} iterative algorithms
@@ -880,7 +877,7 @@ spatial discretization of three-dimensional obstacle problems. For this, we have
 both synchronous and asynchronous algorithms of the Richardson iterative method using a projection
 on a convex set. Indeed, this method uses point-based iterations of the Jacobi method that
 are very easy to parallelize on parallel computers. We have shown that its adapted parallel
-algorithms to GPU architectures allows to exploit at best the computing power of the GPUs and
+algorithms to GPU architectures allow us to exploit at best the computing power of the GPUs and
 to accelerate the resolution of large nonlinear systems. Consequently, the experimental results
 have shown that solving nonlinear systems of large obstacle problems with this method is about
 fifty times faster on a cluster of $12$ GPUs than on a cluster of $24$ CPU cores. Moreover,
@@ -890,7 +887,7 @@ performed on the cluster of $12$ GPUs are reduced on average of $32\%$.
 
 Afterwards, the experiments have shown that the asynchronous version is slightly more efficient
 than the synchronous one. In fact, the computations are accelerated by using GPUs while the communication
-times still unchanged. In addition, we have studied the weak-scaling in the synchronous and asynchronous
+times remain unchanged. In addition, we have studied the weak scaling in the synchronous and asynchronous
 cases, which has confirmed that the ratio between the computations and the communications are reduced
 when using a cluster of GPUs. We highlight that asynchronous iterative algorithms are more scalable
 than synchronous ones. Therefore, we can conclude that asynchronous iterations are well suited to
diff --git a/BookGPU/Chapters/chapter13/ch13.tex~ b/BookGPU/Chapters/chapter13/ch13.tex~
deleted file mode 100755
index 65a9980..0000000
--- a/BookGPU/Chapters/chapter13/ch13.tex~
+++ /dev/null
@@ -1,710 +0,0 @@
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%                          %%
-%%       CHAPTER 13         %%
-%%                          %%
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
- 
-\chapterauthor{}{}
-\newcommand{\scalprod}[2]%
-{\ensuremath{\langle #1 \, , #2 \rangle}}
-\chapter{Solving sparse nonlinear systems of obstacle problems on GPU clusters}
-
-%%--------------------------%%
-%%       SECTION 1          %%
-%%--------------------------%%
-\section{Introduction}
-\label{sec:01}
-The obstacle problem is one kind of free boundary problems. It allows to model, for example, an elastic membrane covering a solid obstacle.
-In this case, the objective is to find an equilibrium position of this membrane constrained to be above the obstacle and which tends to minimize
-its surface and/or its energy. The study of such problems occurs in many applications, for example: fluid mechanics, bio-mathematics (tumour growth
-process) or financial mathematics (American or European option pricing).
-
-In this chapter, we focus on solutions of large obstacle problems defined in a three-dimensional domain. Particularly, the present study consists
-in solving large nonlinear systems derived from the spatial discretization of these problems. Owing to the great size of such systems, in order to
-reduce computation times, we proceed by solving them by parallel synchronous or asynchronous iterative algorithms. Moreover, we aim at harnessing
-the computing power of GPUs to accelerate computations of these parallel algorithms. For this, we use an iterative method involving a projection
-on a convex set, which is: the projected Richardson method. We choose this method among other iterative methods because it is easy to implement on
-parallel computers and easy to adapt to GPU architectures.
-
-In Section~\ref{sec:02}, we present the mathematical model of obstacle problems then, in Section~\ref{sec:03}, we describe the general principle of 
-the parallel projected Richardson method. Next, in Section~\ref{sec:04}, we give the main key points of the parallel implementation of both synchronous
-and asynchronous algorithms of the projected Richardson method on a GPU cluster. In Section~\ref{sec:05}, we present the performances of both parallel
-algorithms obtained from simulations carried out on a CPU and GPU clusters. Finally, in Section~\ref{sec:06}, we use the read-black ordering technique
-to improve the convergence and, thus, the execution times of the parallel projected Richardson algorithms on the GPU cluster. 
-
-
-%%--------------------------%%
-%%       SECTION 2          %%
-%%--------------------------%%
-\section{Obstacle problems}
-\label{sec:02}
-In this section, we present the mathematical model of obstacle problems defined in a three-dimensional domain.
-This model is based on that presented in~\cite{ref1}.
-
-%%*******************
-%%*******************
-\subsection{Mathematical model}
-\label{sec:02.01}
-An obstacle problem, arising for example in mechanics or financial derivatives, consists in solving a time dependent
-nonlinear equation:
-\begin{equation}
-\left\{
-\begin{array}{l}
-\frac{\partial u}{\partial t}+b^t.\nabla u-\eta.\Delta u+c.u-f\geq 0\mbox{,~}u\geq\phi\mbox{,~a.e.w. in~}\lbrack 0,T\rbrack\times\Omega\mbox{,~}\eta>0,\\
-(\frac{\partial u}{\partial t}+b^t.\nabla u-\eta.\Delta u+c.u-f)(u-\phi)=0\mbox{,~a.e.w. in~}\lbrack 0,T\rbrack\times\Omega,\\
-u(0,x,y,z)=u_0(x,y,z),\\
-\mbox{B.C. on~}u(t,x,y,z)\mbox{~defined on~}\partial\Omega,
-\end{array}
-\right.
-\label{eq:01}
-\end{equation}
-where $u_0$ is the initial condition, $c\geq 0$, $b$ and $\eta$ are physical parameters, $T$ is the final time, $u=u(t,x,y,z)$
-is an element of the solution vector $U$ to compute, $f$ is the right-hand right that could represent, for example, the external
-forces, B.C. describes the boundary conditions on the boundary $\partial\Omega$ of the domain $\Omega$, $\phi$ models a constraint 
-imposed to $u$, $\Delta$ is the Laplacian operator, $\nabla$ is the gradient operator, a.e.w. means almost every where and ``.''
-defines the products between two scalars, a scalar and a vector or a matrix and a vector. In practice the boundary condition,
-generally considered, is the Dirichlet condition (where $u$ is fixed on $\partial\Omega$) or the Neumann condition (where the
-normal derivative of $u$ is fixed on $\partial\Omega$). 
-
-The time dependent equation~(\ref{eq:01}) is numerically solved by considering an implicit or a semi-implicit time marching,
-where at each time step $k$ a stationary nonlinear problem is solved:
-\begin{equation}
-\left\{
-\begin{array}{l}
-b^t.\nabla u-\eta.\Delta u+(c+\delta).u-g\geq 0\mbox{,~}u\geq\phi\mbox{,~a.e.w. in~}\lbrack 0,T\rbrack\times\Omega\mbox{,~}\eta>0, \\
-(b^t.\nabla u-\eta.\Delta u+(c+\delta).u- g)(u-\phi)=0\mbox{,~a.e.w. in~}\lbrack 0,T\rbrack\times\Omega, \\
-\mbox{B.C. on~}u(t,x,y,z)\mbox{~defined on~}\partial\Omega,
-\end{array}
-\right.
-\label{eq:02}
-\end{equation}
-where $\delta=\frac{1}{k}$ is the inverse of the time step $k$, $g=f+\delta u^{prev}$ and $u^{prev}$ is the solution computed at the
-previous time step. 
-
-
-%%*******************
-%%*******************
-\subsection{Discretization}
-\label{sec:02.02}
-First, we note that the spatial discretization of the previous stationary problem~(\ref{eq:02}) does not provide a symmetric matrix,
-because the convection-diffusion operator is not self-adjoint. Moreover, the fact that the operator is self-adjoint or not plays an
-important role in the choice of the appropriate algorithm for solving nonlinear systems derived from the discretization of the obstacle
-problem. Nevertheless, since the convection coefficients arising in the operator~(\ref{eq:02}) are constant, we can formulate the same 
-problem by an self-adjoint operator by performing a classical change of variables. Then, we can replace the stationary convection-diffusion
-problem:
-\begin{equation}
-b^{t}.\nabla v-\eta.\Delta v+(c+\delta).v=g\mbox{,~a.e.w. in~}\lbrack 0,T\rbrack\times\Omega\mbox{,~}c\geq 0\mbox{,~}\delta\geq~0,
-\label{eq:03}
-\end{equation}
-by the following stationary diffusion operator:
-\begin{equation}
--\eta.\Delta u+(\frac{\|b\|^{2}_{2}}{4\eta}+c+\delta).u=e^{-a}g=f,
-\label{eq:04}
-\end{equation}
-where $b=\{b_{1},b_{2},b_{3}\}$, $\|b\|_{2}$ denotes the Euclidean norm of $b$ and $v=e^{-a}.u$ represents the general change of variables
-such that $a=\frac{b^{t}(x,y,z)}{2\eta}$. Consequently, the numerical resolution of the diffusion problem (the self-adjoint operator~(\ref{eq:04}))
-is done by optimization algorithms, in contrast, that of the convection-diffusion problem (non self-adjoint operator~(\ref{eq:03})) is
-done by relaxation algorithms. In the case of our studied algorithm, the convergence is ensured by M-matrix property then, the performance
-is linked to the magnitude of the spectral radius of the iteration matrix, which is independent of the condition number.
-
-Next, the three-dimensional domain $\Omega\subset\mathbb{R}^{3}$ is set to $\Omega=\lbrack 0,1\rbrack^{3}$ and discretized with an uniform 
-Cartesian mesh constituted by $M=m^3$ discretization points, where $m$ related to the spatial discretization step by $h=\frac{1}{m+1}$. This
-is carried out by using a classical order 2 finite difference approximation of the Laplacian. So, the complete discretization of both stationary
-boundary value problems~(\ref{eq:03}) and~(\ref{eq:04}) leads to the solution of a large discrete complementary problem of the following
-form, when both Dirichlet or Neumann boundary conditions are used:
-\begin{equation}
-\left\{
-\begin{array}{l}
-\mbox{Find~}U^{*}\in\mathbb{R}^{M}\mbox{~such~that} \\
-(A+\delta I)U^{*}-G\geq 0\mbox{,~}U^{*}\geq\bar{\Phi},\\
-((A+\delta I)U^{*}-G)^{T}(U^{*}-\bar{\Phi})=0,\\
-\end{array}
-\right.
-\label{eq:05}
-\end{equation}
-where $A$ is a matrix obtained after the spatial discretization by a finite difference method, $G$ is derived from the Euler first order implicit time
-marching scheme and from the discretized right-hand side of the obstacle problem, $\delta$ is the inverse of the time step $k$ and $I$ is the identity
-matrix. The matrix $A$ is symmetric when the self-adjoint operator is considered and nonsymmetric otherwise.
-
-According to the chosen discretization scheme of the Laplacian, $A$ is an M-matrix (irreducibly diagonal dominant, see~\cite{ref2}) and, consequently,
-the matrix $(A+\delta I)$ is also an M-matrix. This property is important to the convergence of iterative methods.
-
-
-%%--------------------------%%
-%%       SECTION 3          %%
-%%--------------------------%%
-\section{Parallel iterative method}
-\label{sec:03}
-Owing to the large size of the previous discrete complementary problem~(\ref{eq:05}), we will solve it by parallel synchronous or asynchronous iterative
-algorithms (see~\cite{ref3,ref4,ref5}). In this chapter, we aim at harnessing the computing power of GPU clusters for solving these large nonlinear systems.
-Then, we choose to use the projected Richardson iterative method for solving the diffusion problem~(\ref{eq:04}). Indeed, this method is based on the iterations
-of the Jacobi method which are easy to parallelize on parallel computers and easy to adapt to GPU architectures. Then, according to the boundary value problem
-formulation with a self-adjoint operator~(\ref{eq:04}), we can consider here the equivalent optimization problem and the fixed point mapping associated to
-its solution.
-
-Assume that $E=\mathbb{R}^{M}$ is a Hilbert space, in which $\scalprod{.}{.}$ is the scalar product and $\|.\|$ its associated norm. So, the general fixed
-point problem to be solved is defined as follows:
-\begin{equation}
-\left\{
-\begin{array}{l}
-\mbox{Find~} U^{*} \in E \mbox{~such that} \\
-U^{*} = F(U^{*}), \\
-\end{array}
-\right.
-\label{eq:06}
-\end{equation}
-where $U\mapsto F(U)$ is an application from $E$ to $E$.
-
-Let $K$ be a closed convex set defined by:
-\begin{equation}
-K = \{U | U \geq \Phi \mbox{~everywhere in~} E\},
-\label{eq:07}
-\end{equation}
-where $\Phi$ is the discrete obstacle function. In fact, the obstacle problem~(\ref{eq:05}) is formulated as the following constrained optimization problem:
-\begin{equation}
-\left\{
-\begin{array}{l}
-\mbox{Find~} U^{*} \in K \mbox{~such that} \\
-\forall V \in K, J(U^{*}) \leq J(V), \\
-\end{array}
-\right.
-\label{eq:08}
-\end{equation}
-where the cost function is given by:
-\begin{equation}
-J(U) = \frac{1}{2}\scalprod{\mathcal{A}.U}{U} - \scalprod{G}{U},
-\label{eq:09}
-\end{equation}
-in which $\scalprod{.}{.}$ denotes the scalar product in $E$, $\mathcal{A}=A+\delta I$ is a symmetric positive definite, $A$ is the discretization matrix
-associated with the self-adjoint operator~(\ref{eq:04}) after change of variables.
-
-For any $U\in E$, let $P_K(U)$ be the projection of $U$ on $K$. For any $\gamma\in\mathbb{R}$, $\gamma>0$, the fixed point mapping $F_{\gamma}$ of the projected
-Richardson method is defined as follows:
-\begin{equation}
-U^{*} = F_{\gamma}(U^{*}) = P_K(U^{*} - \gamma(\mathcal{A}.U^{*} - G)).
-\label{eq:10}
-\end{equation}
-In order to reduce the computation time, the large optimization problem is solved in a numerical way by using a parallel asynchronous algorithm of the projected
-Richardson method on the convex set $K$. Particularly, we will consider an asynchronous parallel adaptation of the projected Richardson method~\cite{ref6}.
-
-Let $\alpha\in\mathbb{N}$ be a positive integer. We consider that the space $E=\displaystyle\prod_{i=1}^{\alpha} E_i$ is a product of $\alpha$ subspaces $E_i$
-where $i\in\{1,\ldots,\alpha\}$. Note that $E_i=\mathbb{R}^{m_i}$, where $\displaystyle\sum_{i=1}^{\alpha} m_{i}=M$, is also a Hilbert space in which $\scalprod{.}{.}_i$
-denotes the scalar product and $|.|_i$ the associated norm, for all $i\in\{1,\ldots,\alpha\}$. Then, for all $u,v\in E$, $\scalprod{u}{v}=\displaystyle\sum_{i=1}^{\alpha}\scalprod{u_i}{v_i}_i$
-is the scalar product on $E$.
-
-Let $U\in E$, we consider the following decomposition of $U$ and the corresponding decomposition of $F_\gamma$ into $\alpha$ blocks:
-\begin{equation}
-\begin{array}{rcl}
-U    & = & (U_1,\ldots,U_{\alpha}), \\
-F_{\gamma}(U) & = & (F_{1,\gamma}(U),\ldots,F_{\alpha,\gamma}(U)). \\
-\end{array}
-\label{eq:11}
-\end{equation}
-Assume that the convex set $K=\displaystyle\prod_{i=1}^{\alpha}K_{i}$, such that $\forall i\in\{1,\ldots,\alpha\},K_i\subset E_i$ and $K_i$ is a closed convex set.
-Let also $G=(G_1,\ldots,G_{\alpha})\in E$ and, for any $U\in E$, $P_K(U)=(P_{K_1}(U_1),\ldots,P_{K_{\alpha}}(U_{\alpha}))$ is the projection of $U$ on $K$ where $\forall i\in\{1,\ldots,\alpha\},P_{K_i}$
-is the projector from $E_i$ onto $K_i$. So, the fixed point mapping of the projected Richardson method~(\ref{eq:10}) can be written in the following way:
-\begin{equation}
-\forall U\in E\mbox{,~}\forall i\in\{1,\ldots,\alpha\}\mbox{,~}F_{i,\gamma}(U) = P_{K_i}(U_i - \gamma(\mathcal{A}_i.U - G_i)).
-\label{eq:12}
-\end{equation}
-Note that $\displaystyle\mathcal{A}_i.U= \sum_{j=1}^{\alpha}\mathcal{A}_{i,j}.U_j$, where $\mathcal{A}_{i,j}$ denote block matrices of $\mathcal{A}$.
-
-The parallel asynchronous iterations of the projected Richardson method for solving the obstacle problem~(\ref{eq:08}) are defined as follows: let $U^0\in E,U^0=(U^0_1,\ldots,U^0_\alpha)$ be
-the initial solution, then for all $p\in\mathbb{N}$, the iterate $U^{p+1}=(U^{p+1}_1,\ldots,U^{p+1}_{\alpha})$ is recursively defined by:
-\begin{equation}
-U_i^{p+1} = 
-\left\{
-\begin{array}{l}
-F_{i,\gamma}(U_1^{\rho_1(p)}, \ldots, U_{\alpha}^{\rho_{\alpha}(p)}) \mbox{~if~} i\in s(p), \\
-U_i^p \mbox{~otherwise}, \\
-\end{array}
-\right.
-\label{eq:13}
-\end{equation}
-where
-\begin{equation}
-\left\{
-\begin{array}{l}
-\forall p\in\mathbb{N}, s(p)\subset\{1,\ldots,\alpha\}\mbox{~and~} s(p)\ne\emptyset, \\
-\forall i\in\{1,\ldots,\alpha\},\{p \ | \ i \in s(p)\}\mbox{~is denombrable},
-\end{array}
-\right.
-\label{eq:14}
-\end{equation}
-and $\forall j\in\{1,\ldots,\alpha\}$,
-\begin{equation}
-\left\{
-\begin{array}{l}
-\forall p\in\mathbb{N}, \rho_j(p)\in\mathbb{N}, 0\leq\rho_j(p)\leq p\mbox{~and~}\rho_j(p)=p\mbox{~if~} j\in s(p),\\
-\displaystyle\lim_{p\to\infty}\rho_j(p) = +\infty.\\
-\end{array}
-\right.
-\label{eq:15}
-\end{equation}
-
-The previous asynchronous scheme of the projected Richardson method models computations that are carried out in parallel
-without order nor synchronization (according to the behavior of the parallel iterative method) and describes a subdomain
-method without overlapping. It is a general model that takes into account all possible situations of parallel computations
-and non-blocking message passing. So, the synchronous iterative scheme is defined by:
-\begin{equation}
-\forall j\in\{1,\ldots,\alpha\} \mbox{,~} \forall p\in\mathbb{N} \mbox{,~} \rho_j(p)=p.
-\label{eq:16}
-\end{equation}
-The values of $s(p)$ and $\rho_j(p)$ are defined dynamically and not explicitly by the parallel asynchronous or synchronous
-execution of the algorithm. Particularly, it enables one to consider distributed computations whereby processors compute at
-their own pace according to their intrinsic characteristics and computational load. The parallelism between the processors is
-well described by the set $s(p)$ which contains at each step $p$ the index of the components relaxed by each processor on a
-parallel way while the use of delayed components in~(\ref{eq:13}) permits one to model nondeterministic behavior and does not
-imply inefficiency of the considered distributed scheme of computation. Note that, according to~\cite{ref7}, theoretically,
-each component of the vector must be relaxed an infinity of time. The choice of the relaxed components to be used in the
-computational process may be guided by any criterion and, in particular, a natural criterion is to pick-up the most recently
-available values of the components computed by the processors. Furthermore, the asynchronous iterations are implemented by
-means of non-blocking MPI communication subroutines (asynchronous communications).
-
-The important property ensuring the convergence of the parallel projected Richardson method, both  synchronous and asynchronous
-algorithms, is the fact that $\mathcal{A}$ is an M-matrix. Moreover, the convergence proceeds from a result of~\cite{ref6}.
-Indeed, there exists a value $\gamma_0>0$, such that $\forall\gamma\in ]0,\gamma_0[$, the parallel iterations~(\ref{eq:13}), 
-(\ref{eq:14}) and~(\ref{eq:15}), associated to the fixed point mapping $F_\gamma$~(\ref{eq:12}), converge to the unique solution
-$U^{*}$ of the discretized problem. 
-
-
-%%--------------------------%%
-%%       SECTION 4          %%
-%%--------------------------%%
-\section{Parallel implementation on a GPU cluster}
-\label{sec:04}
-In this section, we give the main key points of the parallel implementation of the projected Richardson method, both synchronous
-and asynchronous versions, on a GPU cluster, for solving the nonlinear systems derived from the discretization of large obstacle
-problems. More precisely, each nonlinear system is solved iteratively using the whole cluster. We use a heteregeneous CUDA/MPI
-programming. Indeed, the communication of data, at each iteration between the GPU computing nodes, can be either synchronous
-or asynchronous using the MPI communication subroutines, whereas inside each GPU node, a CUDA parallelization is performed.
-
-\begin{figure}[!h]
-\centerline{\includegraphics[scale=0.30]{Chapters/chapter13/figures/splitCPU}}
-\caption{Data partitioning of a problem to be solved among $S=3\times 4$ computing nodes.}
-\label{fig:01}
-\end{figure}
-
-Let $S$ denote the number of computing nodes on the GPU cluster, where a computing node is composed of CPU core holding one MPI
-process and a GPU card. So, before starting computations, the obstacle problem of size $(NX\times NY\times NZ)$ is split into $S$
-parallelepipedic sub-problems, each for a node (MPI process, GPU), as is shown in Figure~\ref{fig:01}. Indeed, the $NY$ and $NZ$
-dimensions (according to the $y$ and $z$ axes) of the three-dimensional problem are, respectively, split into $Sy$ and $Sz$ parts,
-such that $S=Sy\times Sz$. In this case, each computing node has at most four neighboring nodes. This kind of data partitioning
-reduces the data exchanges at subdomain boundaries compared to a naive $z$-axis-wise partitioning.
-
-\begin{algorithm}[!t]
-\SetLine
-\linesnumbered
-Initialization of the parameters of the sub-problem\;
-Allocate and fill the data in the global memory GPU\;
-\For{$i=1$ {\bf to} $NbSteps$}{
-   $G = \frac{1}{k}.U^0 + F$\;
-   Solve($A$, $U^0$, $G$, $U$, $\varepsilon$, $MaxRelax$)\;
-   $U^0 = U$\;
-}
-Copy the solution $U$ back from GPU memory\;
-\caption{Parallel solving of the obstacle problem on a GPU cluster}
-\label{alg:01}
-\end{algorithm}
-
-All the computing nodes of the GPU cluster execute in parallel the same Algorithm~\ref{alg:01} but on different three-dimensional
-sub-problems of size $(NX\times ny\times nz)$. This algorithm gives the main key points for solving an obstacle problem defined in
-a three-dimensional domain, where $A$ is the discretization matrix, $G$ is the right-hand side and $U$ is the solution vector. After
-the initialization step, all the data generated from the partitioning operation are copied from the CPU memories to the GPU global
-memories, to be processed on the GPUs. Next, the algorithm uses $NbSteps$ time steps to solve the global obstacle problem. In fact,
-it uses a parallel algorithm adapted to GPUs of the projected Richardson iterative method for solving the nonlinear systems of the
-obstacle problem. This function is defined by {\it Solve()} in Algorithm~\ref{alg:01}. At every time step, the initial guess $U^0$
-for the iterative algorithm is set to the solution found at the previous time step. Moreover, the right-hand side $G$ is computed
-as follows: \[G = \frac{1}{k}.U^{prev} + F\] where $k$ is the time step, $U^{prev}$ is the solution computed in the previous time
-step and each element $f(x, y, z)$ of the vector $F$ is computed as follows:
-\begin{equation}
-f(x,y,z)=\cos(2\pi x)\cdot\cos(4\pi y)\cdot\cos(6\pi z).
-\label{eq:18}
-\end{equation}
-Finally, the solution $U$ of the obstacle problem is copied back from the GPU global memories to the CPU memories. We use the
-communication subroutines of the CUBLAS library~\cite{ref8} (CUDA Basic Linear Algebra Subroutines) for the memory allocations in
-the GPU (\verb+cublasAlloc()+) and the data transfers between the CPU and its GPU: \verb+cublasSetVector()+ and \verb+cublasGetVector()+. 
-
-\begin{algorithm}[!t]
-  \SetLine
-  \linesnumbered
-  $p = 0$\;
-  $conv = false$\;
-  $U = U^{0}$\;
-  \Repeat{$(conv=true)$}{
-    Determine\_Bordering\_Vector\_Elements($U$)\;
-    Compute\_New\_Vector\_Elements($A$, $G$, $U$)\;
-    $tmp = U^{0} - U$\;
-    $error = \|tmp\|_{2}$\;
-    $U^{0} = U$\;
-    $p = p + 1$\;
-    $conv$ = Convergence($error$, $p$, $\varepsilon$, $MaxRelax$)\;
-  }
-\caption{Parallel iterative solving of the nonlinear systems on a GPU cluster ($Solve()$ function)}
-\label{alg:02}
-\end{algorithm}
-
-Like many other iterative methods, the algorithm of the projected Richardson method is based on algebraic functions operating on vectors
-and/or matrices, which are more efficient on parallel computers when they work on large vectors. Its parallel implementation on the GPU
-cluster is carried out so that the GPUs execute the vector operations as kernels and the CPUs execute the serial codes, supervise the
-kernel executions and the data exchanges with the neighboring nodes and supply the GPUs with data. Algorithm~\ref{alg:02} shows the
-main key points of the parallel iterative algorithm (function $Solve()$ in Algorithm~\ref{alg:01}). All the vector operations inside
-the main loop ({\bf repeat} ... {\bf until}) are executed by the GPU. We use the following functions of the CUBLAS library:
-\begin{itemize*}
-\item \verb+cublasDaxpy()+ to compute the difference between the solution vectors $U^{p}$ and $U^{p+1}$ computed in two successive relaxations
-$p$ and $p+1$ (line~$7$ in Algorithm~\ref{alg:02}),
-\item \verb+cublasDnrm2()+ to perform the Euclidean norm (line~$8$) and,
-\item \verb+cublasDcopy()+ for the data copy of a vector to another one in the GPU memory (lines~$3$ and~$9$).
-\end{itemize*}
-
-The dimensions of the grid and blocks of threads that execute a given kernel depend on the resources of the GPU multiprocessor and the
-resource requirements of the kernel. So, if $block$ defines the size of a thread block, which must not exceed the maximum size of a thread
-block, then the number of thread blocks in the grid, denoted by $grid$, can be computed according to the size of the local sub-problem
-as follows: \[grid = \frac{(NX\times ny\times nz)+block-1}{block}.\] However, when solving very large problems, the size of the thread
-grid can exceed the maximum number of thread blocks that can be executed on the GPUs (up to $65,535$ thread blocks) and, thus, the kernel
-will fail to launch. Therefore, for each kernel, we decompose the three-dimensional sub-problem into $nz$ two-dimensional slices of size
-($NX\times ny$), as is shown in Figure~\ref{fig:02}. All slices of the same kernel are executed using {\bf for} loop by $NX\times ny$ parallel
-threads organized in a two-dimensional grid of two-dimensional thread blocks, as is shown in Listing~\ref{list:01}. Each thread is in charge
-of $nz$ discretization points (one from each slice), accessed in the GPU memory with a constant stride $(NX\times ny)$.
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.30]{Chapters/chapter13/figures/splitGPU}}
-\caption{Decomposition of a sub-problem in a GPU into $nz$ slices.}
-\label{fig:02}
-\end{figure}
-
-\begin{center}
-\lstinputlisting[label=list:01,caption=Skeleton codes of a GPU kernel and a CPU function]{Chapters/chapter13/ex1.cu}
-\end{center}
-The function $Determine\_Bordering\_Vector\_Elements()$ (line~$5$ in Algorithm~\ref{alg:02}) determines the values of the vector
-elements shared at the boundaries with neighboring computing nodes. Its main operations are defined as follows:
-\begin{enumerate*}
-\item define the values associated to the bordering points needed by the neighbors,
-\item copy the values associated to the bordering points from the GPU to the CPU,
-\item exchange the values associated to the bordering points between the neighboring CPUs,
-\item copy the received values associated to the bordering points from the CPU to the GPU.
-\end{enumerate*}
-The first operation of this function is implemented as kernels to be performed by the GPU:
-\begin{itemize*}
-\item a kernel executed by $NX\times nz$ threads to define the values associated to the bordering vector elements along $y$-axis and,
-\item a kernel executed by $NX\times ny$ threads to define the values associated to the bordering vector elements along $z$-axis.  
-\end{itemize*}
-As mentioned before, we develop the \emph{synchronous} and \emph{asynchronous} algorithms of the projected Richardson method. Obviously,
-in this scope, the synchronous or asynchronous communications refer to the communications between the CPU cores (MPI processes) on the
-GPU cluster, in order to exchange the vector elements associated to subdomain boundaries. For the memory copies between a CPU core and
-its GPU, we use the synchronous communication routines of the CUBLAS library: \verb+cublasSetVector()+ and \verb+cublasGetVector()+
-in the synchronous algorithm and the asynchronous ones: \verb+cublasSetVectorAsync()+ and \verb+cublasGetVectorAsync()+ in the
-asynchronous algorithm. Moreover, we use the communication routines of the MPI library to carry out the data exchanges between the neighboring
-nodes. We use the following communication routines: \verb+MPI_Isend()+ and \verb+MPI_Irecv()+ to perform non-blocking sends and receptions,
-respectively. For the synchronous algorithm, we use the MPI routine \verb+MPI_Waitall()+ which puts the MPI process of a computing node
-in blocking status until all data exchanges with neighboring nodes (sends and receptions) are completed. In contrast, for the asynchronous
-algorithms, we use the MPI routine \verb+MPI_Test()+ which tests the completion of a data exchange (send or reception) without putting the
-MPI process in blocking status.   
-
-The function $Compute\_New\_Vector\_Elements()$ (line~$6$ in Algorithm~\ref{alg:02}) computes, at each iteration, the new elements
-of the iterate vector $U$. Its general code is presented in Listing~\ref{list:01} (CPU function). The iterations of the projected
-Richardson method, based on those of the Jacobi method, are defined as follows: 
-\begin{equation}
-\begin{array}{ll}
-u^{p+1}(x,y,z) =& \frac{1}{Center}(g(x,y,z) - (Center\cdot u^{p}(x,y,z) + \\
-& West\cdot u^{p}(x-h,y,z) + East\cdot u^{p}(x+h,y,z) + \\
-& South\cdot u^{p}(x,y-h,z) + North\cdot u^{p}(x,y+h,z) + \\
-& Rear\cdot u^{p}(x,y,z-h) + Front\cdot u^{p}(x,y,z+h))),  
-\end{array}
-\label{eq:17}
-\end{equation}
-where $u^{p}(x,y,z)$ is an element of the iterate vector $U$ computed at the iteration $p$ and $g(x,y,z)$ is a vector element of the
-right-hand side $G$. The scalars $Center$, $West$, $East$, $South$, $North$, $Rear$ and $Front$ define constant coefficients of the
-block matrix $A$. Figure~\ref{fig:03} shows the positions of these coefficients in a three-dimensional domain.  
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.35]{Chapters/chapter13/figures/matrix}}
-\caption{Matrix constant coefficients in a three-dimensional domain.}
-\label{fig:03}
-\end{figure}
-
-The kernel implementations of the projected Richardson method on GPUs use a perfect fine-grain multithreading parallelism. Since the 
-projected Richardson algorithm is implemented as a fixed point method, each kernel is executed by a large number of GPU threads such
-that each thread is in charge of the computation of one element of the iterate vector $U$. Moreover, this method uses the vector elements
-updates of the Jacobi method, which means that each thread $i$ computes the new value of its element $u_{i}^{p+1}$ independently of the
-new values $u_{j}^{p+1}$, where $j\neq i$, of those computed in parallel by other threads at the same iteration $p+1$. Listing~\ref{list:02}
-shows the GPU implementations of the main kernels of the projected Richardson method, which are: the matrix-vector multiplication
-(\verb+MV_Multiplication()+) and the vector elements updates (\verb+Vector_Updates()+). The codes of these kernels are based on
-that presented in Listing~\ref{list:01}.
-
-\lstinputlisting[label=list:02,caption=GPU kernels of the projected Richardson method]{Chapters/chapter13/ex2.cu}
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.3]{Chapters/chapter13/figures/points3D}}
-\caption{Computation of a vector element with the projected Richardson method.}
-\label{fig:04}
-\end{figure}
-
-Each kernel is executed by $NX\times ny$ GPU threads so that $nz$ slices of $(NX\times ny)$ vector elements are computed in
-a {\bf for} loop. In this case, each thread is in charge of one vector element from each slice (in total $nz$ vector elements
-along $z$-axis). We can notice from the formula~(\ref{eq:17}) that the computation of a vector element $u^{p+1}(x,y,z)$, by
-a thread at iteration $p+1$, requires seven vector elements computed at the previous iteration $p$: two vector elements in
-each dimension plus the vector element at the intersection of the three axes $x$, $y$ and $z$ (see Figure~\ref{fig:04}). 
-So, to reduce the memory accesses to the high-latency global memory, the vector elements of the current slice can be stored
-in the low-latency shared memories of thread blocks, as is described in~\cite{ref9}. Nevertheless, the fact that the computation
-of a vector element requires only two elements in each dimension does not allow to maximize the data reuse from the shared memories.
-The computation of a slice involves in total $(bx+2)\times(by+2)$ accesses to the global memory per thread block, to fill the
-required vector elements in the shared memory where $bx$ and $by$ are the dimensions of a thread block. Then, in order to optimize
-the memory accesses on GPUs, the elements of the iterate vector $U$ are filled in the cache texture memory (see~\cite{ref10}).
-In new GPU generations as Fermi or Kepler, the global memory accesses are always cached in L1 and L2 caches. For example, for
-a given kernel, we can favour the use of the L1 cache to that of the shared memory by using the function \verb+cudaFuncSetCacheConfig(Kernel,cudaFuncCachePreferL1)+.
-So, the initial access to the global memory loads the vector elements required by the threads of a block into the cache memory
-(texture or L1/L2 caches). Then, all the following memory accesses read from this cache memory. In Listing~\ref{list:02}, the
-function \verb+fetch_double(v,i)+ is used to read from the texture memory the $i^{th}$ element of the double-precision vector
-\verb+v+ (see Listing~\ref{list:03}). Moreover, the seven constant coefficients of matrix $A$ can be stored in the constant memory
-but, since they are reused $nz$ times by each thread, it is more interesting to fill them on the low-latency registers of each thread.    
-
-\lstinputlisting[label=list:03,caption=Memory access to the cache texture memory]{Chapters/chapter13/ex3.cu}
-
-The function $Convergence()$ (line~$11$ in Algorithm~\ref{alg:02}) detects the convergence of the parallel iterative algorithm
-and is based on the tolerance threshold $\varepsilon$ and the maximum number of relaxations $MaxRelax$. We take into account the number 
-of relaxations since that of iterations cannot be computed in the asynchronous case. Indeed, a relaxation is the update~(\ref{eq:13}) of
-a local iterate vector $U_i$ according to $F_i$. Then, counting the number of relaxations is possible in both synchronous and asynchronous
-cases. On the other hand, an iteration is the update of at least all vector components with $F_i$.
-
-In the synchronous algorithm, the global convergence is detected when the maximal value of the absolute error, $error$, is sufficiently small
-and/or the maximum number of relaxations, $MaxRelax$, is reached, as follows:
-$$
-\begin{array}{l}
-error=\|U^{p}-U^{p+1}\|_{2}; \\
-AllReduce(error,\hspace{0.1cm}maxerror,\hspace{0.1cm}MAX); \\
-\text{if}((maxerror<\varepsilon)\hspace{0.2cm}\text{or}\hspace{0.2cm}(p\geq MaxRelax)) \\
-conv \leftarrow true;
-\end{array}
-$$
-where the function $AllReduce()$ uses the MPI reduction subroutine \verb+MPI_Allreduce()+ to compute the maximal value, $maxerror$, among the
-local absolute errors, $error$, of all computing nodes and $p$ (in Algorithm~\ref{alg:02}) is used as a counter of the local relaxations carried
-out by a computing node. In the asynchronous algorithms, the global convergence is detected when all computing nodes locally converge. For this,
-we use a token ring architecture around which a boolean token travels, in one direction, from a computing node to another. Starting from node $0$,
-the boolean token is set to $true$ by node $i$ if the local convergence is reached or to $false$ otherwise and, then, it is sent to node $i+1$.
-Finally, the global convergence is detected when node $0$ receives from its neighbor node $S-1$, in the ring architecture, a token set to $true$.
-In this case, node $0$ sends a stop message (end of parallel solving) to all computing nodes in the cluster.
-
-
-%%--------------------------%%
-%%       SECTION 5          %%
-%%--------------------------%%
-\section{Experimental tests on a GPU cluster}
-\label{sec:05}
-The GPU cluster of tests that we used in this chapter is a $20$Gbps InfiniBand network of six machines. Each machine is a Quad-Core Xeon
-E5530 CPU running at $2.4$GHz. It provides a RAM memory of $12$GB with a memory bandwidth of $25.6$GB/s and it is equipped with two Nvidia
-Tesla C1060 GPUs. A Tesla GPU contains in total $240$ cores running at $1.3$GHz. It provides $4$GB of global memory with a memory bandwidth
-of $102$GB/s, accessible by all its cores and also by the CPU through the PCI-Express 16x Gen 2.0 interface with a throughput of $8$GB/s.
-Hence, the memory copy operations between the GPU and the CPU are about $12$ times slower than those of the Tesla GPU memory. We have performed
-our simulations on a cluster of $24$ CPU cores and on a cluster of $12$ GPUs. Figure~\ref{fig:05} describes the components of the GPU cluster
-of tests.
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.25]{Chapters/chapter13/figures/cluster}}
-\caption{GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs).}
-\label{fig:05}
-\end{figure}
-
-Linux cluster version 2.6.39 OS is installed on CPUs. C programming language is used for coding the parallel algorithms of the methods on both
-GPU cluster and CPU cluster. CUDA version 4.0~\cite{ref12} is used for programming GPUs, using CUBLAS library~\cite{ref8} to deal with vector
-operations in GPUs and, finally, MPI functions of OpenMPI 1.3.3 are used to carry out the synchronous and asynchronous communications between
-CPU cores. Indeed, in our experiments, a computing node is managed by an MPI process and it is composed of one CPU core and one GPU card.
- 
-All experimental results of the parallel projected Richardson algorithms are obtained from simulations made in double precision data. The obstacle
-problems to be solved are defined in constant three-dimensional domain $\Omega\subset\mathbb{R}^{3}$. The numerical values of the parameters of the
-obstacle problems are: $\eta=0.2$, $c=1.1$, $f$ is computed by formula~(\ref{eq:18}) and final time $T=0.02$. Moreover, three time steps ($NbSteps=3$)
-are computed with $k=0.0066$. As the discretization matrix is constant along the time steps, the convergence properties of the iterative algorithms
-do not change. Thus, the performance characteristics obtained with three time steps will still be valid for more time steps. The initial function
-$u(0,x,y,z)$ of the obstacle problem~(\ref{eq:01}) is set to $0$, with a constraint $u\geq\phi=0$. The relaxation parameter $\gamma$ used by the
-projected Richardson method is computed automatically thanks to the diagonal entries of the discretization matrix. The formula and its proof can be
-found in~\cite{ref11}, Section~2.3. The convergence tolerance threshold $\varepsilon$ is set to $1e$-$04$ and the maximum number of relaxations is
-limited to $10^{6}$ relaxations. Finally, the number of threads per block is set to $256$ threads, which gives, in general, good performances for
-most GPU applications. We have performed some tests for the execution configurations and we have noticed that the best configuration of the $256$
-threads per block is an organization into two dimensions of sizes $(64,4)$. 
-
-\begin{table}[!h]
-\centering
-\begin{tabular}{|c|c|c|c|c|c|}
-\hline
-\multirow{2}{*}{\bf Pb. size} & \multicolumn{2}{c|}{\bf Synchronous} & \multicolumn{2}{c|}{\bf Asynchronous} & \multirow{2}{*}{\bf Gain\%} \\ \cline{2-5}
-
-                              & $\mathbf{T_{cpu}}$ & {\bf \#relax.}  & $\mathbf{T_{cpu}}$ & {\bf \#relax.}  &  \\ \hline \hline
-
-$256^{3}$                     & $575.22$           & $198,288$        & $539.25$          & $198,613$        & $6.25$ \\ \hline \hline
-
-$512^{3}$                     & $19,250.25$        & $750,912$        & $18,237.14$       & $769,611$        & $5.26$ \\ \hline \hline 
-
-$768^{3}$                     & $206,159.44$       & $1,635,264$      & $183,582.60$      & $1,577,004$      & $10.95$ \\ \hline \hline
-
-$800^{3}$                     & $222,108.09$       & $1,769,232$      & $188,790.04$      & $1,701,735$      & $15.00$ \\ \hline
-\end{tabular}
-\vspace{0.5cm}
-\caption{Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 24 CPU cores.}
-\label{tab:01}
-\end{table}
-
-\begin{table}[!h]
-\centering
-\begin{tabular}{|c|c|c|c|c|c|c|c|}
-\hline
-\multirow{2}{*}{\bf Pb. size} & \multicolumn{3}{c|}{\bf Synchronous}                   & \multicolumn{3}{c|}{\bf Asynchronous}                   & \multirow{2}{*}{\bf Gain\%}  \\ \cline{2-7}
-
-                              & $\mathbf{T_{gpu}}$ & {\bf \#relax.}  & $\mathbf{\tau}$ & $\mathbf{T_{gpu}}$ & {\bf \#relax.}  & $\mathbf{\tau}$ &           \\  \hline \hline
-
-$256^{3}$                     & $29.67$            & $100,692$        &  $19.39$        & $18.00$           & $94,215$         & $29.96$         & $39.33$  \\ \hline \hline
-
-$512^{3}$                     & $521.83$           & $381,300$        & $36.89$         & $425.15$          & $347,279$        & $42.89$         & $18.53$ \\ \hline \hline 
-
-$768^{3}$                     & $4,112.68$         & $831,144$        & $50.13$         & $3,313.87$        & $750,232$        & $55.40$         & $19.42$ \\ \hline \hline 
-
-$800^{3}$                     & $3,950.87$         & $899,088$        & $56.22$         & $3,636.57$        & $834,900$        & $51.91$         & $7.95$ \\ \hline 
-\end{tabular}
-\vspace{0.5cm}
-\caption{Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.}
-\label{tab:02}
-\end{table}
-
-The performance measures that we took into account are the execution times and the number of relaxations performed by the parallel iterative algorithms,
-both synchronous and asynchronous versions, on the GPU and CPU clusters. These algorithms are used for solving nonlinear systems derived from the discretization
-of obstacle problems of sizes $256^{3}$, $512^{3}$, $768^{3}$ and $800^{3}$. In Table~\ref{tab:01} and Table~\ref{tab:02}, we show the performances
-of the parallel synchronous and asynchronous algorithms of the projected Richardson method implemented, respectively, on a cluster of $24$ CPU cores
-and on a cluster of $12$ GPUs. In these tables, the execution time defines the time spent by the slowest computing node and the number of relaxations
-is computed as the summation of those carried out by all computing nodes.
-
-In the sixth column of Table~\ref{tab:01} and in the eighth column of Table~\ref{tab:02}, we give the gains in $\%$ obtained by using an
-asynchronous algorithm compared to a synchronous one. We can notice that the asynchronous version on CPU and GPU clusters is slightly faster
-than the synchronous one for both methods. Indeed, the cluster of tests is composed of local and homogeneous nodes communicating via low-latency
-connections. So, in the case of distant and/or heterogeneous nodes (or even with geographically distant clusters) the asynchronous version
-would be faster than the synchronous one. However, the gains obtained on the GPU cluster are better than those obtained on the CPU cluster.
-In fact, the computation times are reduced by accelerating the computations on GPUs while the communication times remain unchanged.  
-
-The fourth and seventh columns of Table~\ref{tab:02} show the relative gains obtained by executing the parallel algorithms on the cluster
-of $12$ GPUs instead of on the cluster of $24$ CPU cores. We compute the relative gain $\tau$ as a ratio between the execution time $T_{cpu}$
-spent on the CPU cluster over that $T_{gpu}$ spent on the GPU cluster: \[\tau=\frac{T_{cpu}}{T_{gpu}}.\] We can see from these ratios that
-solving large obstacle problems is faster on the GPU cluster than on the CPU cluster. Indeed, the GPUs are more efficient than their
-counterpart CPUs to execute large data-parallel operations. In addition, the projected Richardson method is implemented as a fixed point-based
-iteration and uses the Jacobi vector updates that allow an efficient thread-parallelization on GPUs, such that each GPU thread is in charge
-of one vector component at a time without being dependent on other vector components computed by other threads. Then, this allows us to exploit
-at best the high performance computing of the GPUs by using all the GPU resources and avoiding the idle cores.
-
-Finally, the number of relaxations performed by the parallel synchronous algorithm is different in the CPU and GPU versions, because the number
-of computing nodes involved in the GPU cluster and in the CPU cluster is different. In the CPU case, $24$ computing nodes ($24$ CPU cores) are
-considered, whereas in the GPU case, $12$ computing nodes ($12$ GPUs) are considered. As the number of relaxations depends on the domain decomposition,
-consequently it also depends on the number of computing nodes.
-
-
-%%--------------------------%%
-%%       SECTION 6          %%
-%%--------------------------%%
-\section{Red-Black ordering technique}
-\label{sec:06}
-As is well-known, the Jacobi method is characterized by a slow convergence rate compared to some iterative methods (for example Gauss-Seidel method).
-So, in this section, we present some solutions to reduce the execution time and the number of relaxations and, more specifically, to speed up the
-convergence of the parallel projected Richardson method on the GPU cluster. We propose to use the point red-black ordering technique to accelerate
-the convergence. This technique is often used to increase the parallelism of iterative methods for solving linear systems~\cite{ref13,ref14,ref15}.
-We apply it to the projected Richardson method as a compromise between the Jacobi and Gauss-Seidel iterative methods. 
-
-The general principle of the red-black technique is as follows. Let $t$ be the summation of the integer $x$-, $y$- and $z$-coordinates of a vector
-element $u(x,y,z)$ on a three-dimensional domain: $t=x+y+z$. As is shown in Figure~\ref{fig:06.01}, the red-black ordering technique consists in the
-parallel computing of the red vector elements having even value $t$ by using the values of the black ones then, the parallel computing of the black
-vector elements having odd values $t$ by using the new values of the red ones.
-
-\begin{figure}
-\centering
-  \mbox{\subfigure[Red-black ordering on x, y and z axes]{\includegraphics[width=2.3in]{Chapters/chapter13/figures/rouge-noir}\label{fig:06.01}}\quad
-        \subfigure[Red-black ordering on y axis]{\includegraphics[width=2.3in]{Chapters/chapter13/figures/rouge-noir-y}\label{fig:06.02}}}
-\caption{Red-black ordering for computing the iterate vector elements in a three-dimensional space.}
-\end{figure}
-
-This technique can be implemented on the GPU in two different manners:
-\begin{itemize*}
-\item among all launched threads ($NX\times ny$ threads), only one thread out of two computes its red or black vector element at a time or,
-\item all launched threads (on average half of $NX\times ny$ threads) compute the red vector elements first and, then, the black ones.
-\end{itemize*}
-However, in both solutions, for each memory transaction, only half of the memory segment addressed by a half-warp is used. So, the computation of the
-red and black vector elements leads to using twice the initial number of memory transactions. Then, we apply the point red-black ordering according to
-the $y$-coordinate, as is shown in Figure~\ref{fig:06.02}. In this case, the vector elements having even $y$-coordinate are computed in parallel using
-the values of those having odd $y$-coordinate and then vice-versa. Moreover, in the GPU implementation of the parallel projected Richardson method (Section~\ref{sec:04}),
-we have shown that a sub-problem of size $(NX\times ny\times nz)$ is decomposed into $nz$ grids of size $(NX\times ny)$. Then, each kernel is executed
-in parallel by $NX\times ny$ GPU threads, so that each thread is in charge of $nz$ vector elements along $z$-axis (one vector element in each grid of
-the sub-problem). So, we propose to use the new values of the vector elements computed in grid $i$ to compute those of the vector elements in grid $i+1$.
-Listing~\ref{list:04} describes the kernel of the matrix-vector multiplication and the kernel of the vector elements updates of the parallel projected
-Richardson method using the red-black ordering technique.
-
-\lstinputlisting[label=list:04,caption=GPU kernels of the projected Richardson method using the red-black technique]{Chapters/chapter13/ex4.cu}
-
-Finally, we exploit the concurrent executions between the host functions and the GPU kernels provided by the GPU hardware and software. In fact, the kernel
-launches are asynchronous (when this environment variable is not disabled on the GPUs), such that the control is returned to the host (MPI process) before
-the GPU has completed the requested task (kernel)~\cite{ref12}. Therefore, all the kernels necessary to update the local vector elements, $u(x,y,z)$ where
-$0<y<(ny-1)$ and $0<z<(nz-1)$, are executed first. Then, the values associated to the bordering vector elements are exchanged between the neighbors. Finally,
-the values of the vector elements associated to the bordering vector elements are updated. In this case, the computation of the local vector elements is
-performed concurrently with the data exchanges between neighboring CPUs and this in both synchronous and asynchronous cases.
-
-\begin{table}[!h]
-\centering
-\begin{tabular}{|c|c|c|c|c|c|}
-\hline
-\multirow{2}{*}{\bf Pb. size} & \multicolumn{2}{c|}{\bf Synchronous} & \multicolumn{2}{c|}{\bf Asynchronous} & \multirow{2}{*}{\bf Gain\%}  \\ \cline{2-5}
-
-                              & $\mathbf{T_{gpu}}$ & {\bf \#relax.}   & $\mathbf{T_{gpu}}$ & {\bf \#relax.}   &           \\  \hline \hline
-
-$256^{3}$                     & $18.37$            & $71,988$         & $12.58$           & $67,638$         & $31.52$  \\ \hline \hline
-
-$512^{3}$                     & $349.23$           & $271,188$        & $289.41$          & $246,036$        & $17.13$ \\ \hline \hline 
-
-$768^{3}$                     & $2,773.65$         & $590,652$        & $2,222.22$        & $532,806$        & $19.88$ \\ \hline \hline 
-
-$800^{3}$                     & $2,748.23$         & $638,916$        & $2,502.61$        & $592,525$        & $8.92$ \\ \hline 
-\end{tabular}
-\vspace{0.5cm}
-\caption{Execution times in seconds of the parallel projected Richardson method using red-black ordering technique implemented on a cluster of 12 GPUs.}
-\label{tab:03}
-\end{table}
-
-In Table~\ref{tab:03}, we report the execution times and the number of relaxations performed on a cluster of $12$ GPUs by the parallel projected Richardson
-algorithms; it can be noted that the performances of the projected Richardson are improved by using the point red-black ordering. We compare the performances
-of the parallel projected Richardson method with and without this latter ordering (Tables~\ref{tab:02} and~\ref{tab:03}). We can notice that both parallel synchronous
-and asynchronous algorithms are faster when they use the red-black ordering. Indeed, we can see in Table~\ref{tab:03} that the execution times of these algorithms
-are reduced, on average, by $32\%$ compared to those shown in Table~\ref{tab:02}.
-
-\begin{figure}
-\centerline{\includegraphics[scale=0.9]{Chapters/chapter13/figures/scale}}
-\caption{Weak scaling of both synchronous and asynchronous algorithms of the projected Richardson method using red-black ordering technique.}
-\label{fig:07}
-\end{figure}
-
-In Figure~\ref{fig:07}, we study the ratio between the computation time and the communication time of the parallel projected Richardson algorithms on a GPU cluster.
-The experimental tests are carried out on a cluster composed of one to ten Tesla GPUs. We have focused on the weak scaling of both parallel, synchronous and asynchronous,
-algorithms using the red-black ordering technique. For this, we have fixed the size of a sub-problem to $256^{3}$ per computing node (a CPU core and a GPU). Then,
-Figure~\ref{fig:07} shows the number of relaxations performed, on average, per second by a computing node. We can see from this figure that the efficiency of the
-asynchronous algorithm is almost stable, while that of the synchronous algorithm decreases (down to $81\%$ in this example) with the increasing of the number of
-computing nodes on the cluster. This is due to the fact that the ratio between the time of the computation over that of the communication is reduced when the computations
-are performed on GPUs. Indeed, GPUs compute faster than CPUs and communications are more time consuming. In this context, asynchronous algorithms are more scalable
-than synchronous ones. So, with large scale GPU clusters, synchronous algorithms might be more penalized by communications, as can be deduced from Figure~\ref{fig:07}.
-That is why we think that asynchronous iterative algorithms are all the more interesting in this case.
-
-
-%%--------------------------%%
-%%       SECTION 7          %%
-%%--------------------------%%
-\section{Conclusion}
-\label{sec:07}
-Our main contribution, in this chapter, is the parallel implementation of an asynchronous iterative method on GPU clusters for solving large scale nonlinear
-systems derived from the spatial discretization of three-dimensional obstacle problems. For this, we have implemented both synchronous and asynchronous algorithms of the
-Richardson iterative method using a projection on a convex set. Indeed, this method uses point-based iterations of the Jacobi method that are very easy to parallelize on
-parallel computers. We have shown that its parallel algorithms adapted to GPU architectures allow us to exploit at best the computing power of the GPUs and to accelerate the
-resolution of large nonlinear systems. Consequently, the experimental results have shown that solving nonlinear systems of large obstacle problems with this method is about
-fifty times faster on a cluster of $12$ GPUs than on a cluster of $24$ CPU cores. Moreover, we have applied to this projected Richardson method the red-black ordering technique
-which allows it to improve its convergence rate. Thus, the execution times of both parallel algorithms performed on the cluster of $12$ GPUs are reduced on average by $32\%$.
-
-Afterwards, the experiments have shown that the asynchronous version is slightly more efficient than the synchronous one. In fact, the computations are accelerated by using GPUs
-while the communication times remain unchanged. In addition, we have studied the weak-scaling in the synchronous and asynchronous cases, which has confirmed that the ratio between
-the computations and the communications is reduced when using a cluster of GPUs. We highlight that asynchronous iterative algorithms are more scalable than synchronous ones.
-Therefore, we can conclude that asynchronous iterations are well suited to tackle scalability issues on GPU clusters.
-
-In future works, we plan to perform experiments on large scale GPU clusters and on geographically distant GPU clusters, because we expect that asynchronous versions would
-be faster and more scalable on such architectures. Furthermore, we want to study the performance behavior and the scalability of other numerical algorithms which support,
-if possible, the model of asynchronous iterations.
-
-\putbib[Chapters/chapter13/biblio13]
-
diff --git a/BookGPU/Chapters/chapter13/ex1.cu b/BookGPU/Chapters/chapter13/ex1.cu
index 546d8ef..d4997d6 100644
--- a/BookGPU/Chapters/chapter13/ex1.cu
+++ b/BookGPU/Chapters/chapter13/ex1.cu
@@ -15,7 +15,7 @@ __global__ void kernel(..., int n, int nx, int ny, int slices, int stride, ...)
 /* CPU function */
 void Function(...)
 {
-	int n = NX * ny * nz; //size of the sub-problem
+	int n = NX * ny * nz; //size of the subproblem
 	int slices = nz;
 	int stride = NX * ny;
 	int bx = 64, by = 4;
diff --git a/BookGPU/Chapters/chapter13/figures/splitGPU.eps b/BookGPU/Chapters/chapter13/figures/splitGPU.eps
index 20657e8..eb3cb78 100644
--- a/BookGPU/Chapters/chapter13/figures/splitGPU.eps
+++ b/BookGPU/Chapters/chapter13/figures/splitGPU.eps
@@ -1,8 +1,8 @@
 %!PS-Adobe-2.0 EPSF-2.0
 %%Title: splitGPU.fig
 %%Creator: fig2dev Version 3.2 Patchlevel 5c
-%%CreationDate: Tue Jan 22 15:07:57 2013
-%%BoundingBox: 0 0 1110 416
+%%CreationDate: Fri Jul 19 20:12:51 2013
+%%BoundingBox: 0 0 1103 419
 %Magnification: 1.0000
 %%EndComments
 %%BeginProlog
@@ -120,8 +120,8 @@ newfontname newfont definefont pop end } def
 
 /pageheader {
 save
-newpath 0 416 moveto 0 0 lineto 1110 0 lineto 1110 416 lineto closepath clip newpath
--55.3 581.0 translate
+newpath 0 419 moveto 0 0 lineto 1103 0 lineto 1103 419 lineto closepath clip newpath
+-55.3 584.0 translate
 1 -1 scale
 $F2psBegin
 10 setmiterlimit
@@ -336,9 +336,6 @@ gs 1 -1 sc (Thread block) col0 sh gr
 1080 6840 m
 gs 1 -1 sc (node11) col0 sh gr
 /Times-Roman-iso ff 381.00 scf sf
-5490 9135 m
-gs 1 -1 sc  45.0 rot (Slice \(nz-1\)) col0 sh gr
-/Times-Roman-iso ff 381.00 scf sf
 8550 8730 m
 gs 1 -1 sc  45.0 rot (Slice \(3\)) col0 sh gr
 /Times-Roman-iso ff 381.00 scf sf
@@ -359,6 +356,12 @@ gs 1 -1 sc (NX) col0 sh gr
 /Times-Italic-iso ff 476.25 scf sf
 5625 3600 m
 gs 1 -1 sc (ny) col0 sh gr
+/Times-Italic-iso ff 381.00 scf sf
+6120 8505 m
+gs 1 -1 sc  45.0 rot (nz) col0 sh gr
+/Times-Roman-iso ff 381.00 scf sf
+5445 9180 m
+gs 1 -1 sc  45.0 rot (Slice \(    -1\)) col0 sh gr
 % here ends figure;
 pagefooter
 showpage
diff --git a/BookGPU/Chapters/chapter13/figures/splitGPU.pdf b/BookGPU/Chapters/chapter13/figures/splitGPU.pdf
index 76853d1..a551922 100644
Binary files a/BookGPU/Chapters/chapter13/figures/splitGPU.pdf and b/BookGPU/Chapters/chapter13/figures/splitGPU.pdf differ