From: couturie <couturie@carcariass.(none)> Date: Tue, 26 Feb 2013 21:51:13 +0000 (+0100) Subject: new X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/bc21a4e60ae5e4ef525ce6933905824c6597401d?ds=sidebyside;hp=-c new --- bc21a4e60ae5e4ef525ce6933905824c6597401d diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index f4f8323..b58372d 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -45,7 +45,8 @@ \include{Chapters/chapter1/preamble} \include{Chapters/chapter5/preamble} - +\newcommand{\scalprod}[2]% +{\ensuremath{\langle #1 \, , #2 \rangle}} \makeatletter diff --git a/BookGPU/Chapters/chapter12/biblio12.bib b/BookGPU/Chapters/chapter12/biblio12.bib index cfb5778..d3cac8f 100644 --- a/BookGPU/Chapters/chapter12/biblio12.bib +++ b/BookGPU/Chapters/chapter12/biblio12.bib @@ -1,6 +1,6 @@ -@article{ref1, +@article{ch12:ref1, title = {Iterative methods for sparse linear systems}, -author = {Saad, Y.}, +author = {Saad, Yousef}, journal = {Society for Industrial and Applied Mathematics, 2nd edition}, volume = {}, number = {}, @@ -8,9 +8,9 @@ pages = {}, year = {2003}, } -@article{ref2, +@article{ch12:ref2, title = {Methods of conjugate gradients for solving linear systems}, -author = {Hestenes, M. R. and Stiefel, E.}, +author = {Hestenes, Magnus R. and Stiefel, Eduard}, journal = {Journal of Research of the National Bureau of Standards}, volume = {49}, number = {6}, @@ -18,9 +18,9 @@ pages = {409--436}, year = {1952}, } -@article{ref3, +@article{ch12:ref3, title = {{GMRES}: a generalized minimal residual algorithm for solving nonsymmetric linear systems}, -author = {Saad, Y. and Schultz, M. H.}, +author = {Saad, Yousef and Schultz, Martin H.}, journal = {SIAM Journal on Scientific and Statistical Computing}, volume = {7}, number = {3}, @@ -28,9 +28,9 @@ pages = {856--869}, year = {1986}, } -@article{ref4, +@article{ch12:ref4, title = {Solution of sparse indefinite systems of linear equations}, -author = {Paige, C. C. and Saunders, M. A.}, +author = {Paige, Chris C. and Saunders, Michael A.}, journal = {SIAM Journal on Numerical Analysis}, volume = {12}, number = {4}, @@ -38,9 +38,9 @@ pages = {617--629}, year = {1975}, } -@article{ref5, +@article{ch12:ref5, title = {The principle of minimized iteration in the solution of the matrix eigenvalue problem}, -author = {Arnoldi, W. E.}, +author = {Arnoldi, Walter E.}, journal = {Quarterly of Applied Mathematics}, volume = {9}, number = {17}, @@ -48,7 +48,7 @@ pages = {17--29}, year = {1951}, } -@article{ref6, +@article{ch12:ref6, title = {{CUDA} Toolkit 4.2 {CUBLAS} Library}, author = {NVIDIA Corporation}, journal = {}, @@ -59,9 +59,9 @@ note = {\url{http://developer.download.nvidia.com/compute/DevZone/docs/html/CUDA year = {2012}, } -@article{ref7, +@article{ch12:ref7, title = {Efficient sparse matrix-vector multiplication on {CUDA}}, -author = {Bell, N.
and Garland, M.}, +author = {Bell, Nathan and Garland, Michael}, journal = {NVIDIA Technical Report NVR-2008-004, NVIDIA Corporation}, volume = {}, number = {}, @@ -69,7 +69,7 @@ pages = {}, year = {2008}, } -@article{ref8, +@article{ch12:ref8, title = {{CUSP} {L}ibrary}, author = {}, journal = {}, @@ -80,7 +80,7 @@ note = {\url{http://code.google.com/p/cusp-library/}}, year = {}, } -@article{ref9, +@article{ch12:ref9, title = {{NVIDIA} {CUDA} {C} programming guide}, author = {NVIDIA Corporation}, journal = {}, @@ -91,7 +91,7 @@ note = {Version 4.2}, year = {2012}, } -@article{ref10, +@article{ch12:ref10, title = {The university of {F}lorida sparse matrix collection}, author = {Davis, T. and Hu, Y.}, journal = {}, @@ -102,9 +102,9 @@ note = {\url{http://www.cise.ufl.edu/research/sparse/matrices/list_by_id.html}}, year = {1997}, } -@article{ref11, +@article{ch12:ref11, title = {Hypergraph partitioning based decomposition for parallel sparse matrix-vector multiplication}, -author = {Catalyurek, U. and Aykanat, C.}, +author = {Catalyurek, Umit V. and Aykanat, Cevdet}, journal = {{IEEE} {T}ransactions on {P}arallel and {D}istributed {S}ystems}, volume = {10}, number = {7}, @@ -113,9 +113,9 @@ note = {}, year = {1999}, } -@article{ref12, +@article{ch12:ref12, title = {{hMETIS}: A hypergraph partitioning package}, -author = {Karypis, G. and Kumar, V.}, +author = {Karypis, George and Kumar, Vipin}, journal = {}, volume = {}, number = {}, @@ -124,9 +124,9 @@ note = {}, year = {1998}, } -@article{ref13, +@article{ch12:ref13, title = {{PaToH}: partitioning tool for hypergraphs}, -author = {Catalyurek, U. and Aykanat, C.}, +author = {Catalyurek, Umit V. and Aykanat, Cevdet}, journal = {}, volume = {}, number = {}, @@ -135,9 +135,9 @@ note = {}, year = {1999}, } -@article{ref14, +@article{ch12:ref14, title = {Parallel hypergraph partitioning for scientific computing}, -author = {Devine, K.D. and Boman, E.G. and Heaphy, R.T. and Bisseling, R.H and Catalyurek, U.V.}, +author = {Devine, Karen D. and Boman, Erik G. and Heaphy, Robert T. and Bisseling, Rob H. 
and Catalyurek, Umit V.}, journal = {In Proceedings of the 20th international conference on Parallel and distributed processing, IPDPSâ06}, volume = {}, number = {}, @@ -146,7 +146,7 @@ note = {}, year = {2006}, } -@article{ref15, +@article{ch12:ref15, title = {{PHG} - parallel hypergraph and graph partitioning with {Z}oltan}, author = {}, journal = {}, diff --git a/BookGPU/Chapters/chapter12/ch12.aux b/BookGPU/Chapters/chapter12/ch12.aux index 0cc343d..26d263a 100644 --- a/BookGPU/Chapters/chapter12/ch12.aux +++ b/BookGPU/Chapters/chapter12/ch12.aux @@ -1,97 +1,106 @@ \relax -\@writefile{toc}{\author{}{}} +\@writefile{toc}{\author{Lilia Ziane Khodja}{}} +\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} +\@writefile{toc}{\author{Jacques Bahi}{}} \@writefile{loa}{\addvspace {10\p@ }} \@writefile{toc}{\contentsline {chapter}{\numberline {11}Solving sparse linear systems with GMRES and CG methods on GPU clusters}{249}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} +\newlabel{ch12}{{11}{249}} \@writefile{toc}{\contentsline {section}{\numberline {11.1}Introduction}{249}} -\newlabel{sec:01}{{11.1}{249}} +\newlabel{ch12:sec:01}{{11.1}{249}} \@writefile{toc}{\contentsline {section}{\numberline {11.2}Krylov iterative methods}{250}} -\newlabel{sec:02}{{11.2}{250}} -\newlabel{eq:01}{{11.1}{250}} -\newlabel{eq:02}{{11.2}{250}} -\newlabel{eq:03}{{11.3}{250}} -\newlabel{eq:11}{{11.4}{251}} +\newlabel{ch12:sec:02}{{11.2}{250}} +\newlabel{ch12:eq:01}{{11.1}{250}} +\newlabel{ch12:eq:02}{{11.2}{250}} +\newlabel{ch12:eq:03}{{11.3}{250}} +\newlabel{ch12:eq:11}{{11.4}{251}} \@writefile{toc}{\contentsline {subsection}{\numberline {11.2.1}CG method}{251}} -\newlabel{sec:02.01}{{11.2.1}{251}} -\newlabel{eq:04}{{11.5}{251}} -\newlabel{eq:05}{{11.6}{251}} -\newlabel{eq:06}{{11.7}{251}} -\newlabel{eq:07}{{11.8}{251}} -\newlabel{eq:08}{{11.9}{251}} -\newlabel{eq:09}{{11.10}{251}} +\newlabel{ch12:sec:02.01}{{11.2.1}{251}} +\newlabel{ch12:eq:04}{{11.5}{251}} +\newlabel{ch12:eq:05}{{11.6}{251}} +\newlabel{ch12:eq:06}{{11.7}{251}} +\newlabel{ch12:eq:07}{{11.8}{251}} +\newlabel{ch12:eq:08}{{11.9}{251}} +\newlabel{ch12:eq:09}{{11.10}{251}} \@writefile{loa}{\contentsline {algocf}{\numberline {9}{\ignorespaces Left-preconditioned CG method\relax }}{252}} -\newlabel{alg:01}{{9}{252}} -\newlabel{eq:10}{{11.11}{252}} +\newlabel{ch12:alg:01}{{9}{252}} +\newlabel{ch12:eq:10}{{11.11}{252}} \@writefile{toc}{\contentsline {subsection}{\numberline {11.2.2}GMRES method}{253}} -\newlabel{sec:02.02}{{11.2.2}{253}} -\newlabel{eq:12}{{11.12}{253}} -\newlabel{eq:13}{{11.13}{253}} -\newlabel{eq:14}{{11.14}{253}} -\newlabel{eq:15}{{11.15}{253}} -\newlabel{eq:16}{{11.16}{253}} -\newlabel{eq:17}{{11.17}{253}} -\newlabel{eq:18}{{11.18}{253}} -\newlabel{eq:19}{{11.19}{253}} +\newlabel{ch12:sec:02.02}{{11.2.2}{253}} +\newlabel{ch12:eq:12}{{11.12}{253}} +\newlabel{ch12:eq:13}{{11.13}{253}} +\newlabel{ch12:eq:14}{{11.14}{253}} +\newlabel{ch12:eq:15}{{11.15}{253}} +\newlabel{ch12:eq:16}{{11.16}{253}} +\newlabel{ch12:eq:17}{{11.17}{253}} +\newlabel{ch12:eq:18}{{11.18}{253}} +\newlabel{ch12:eq:19}{{11.19}{253}} \@writefile{loa}{\contentsline {algocf}{\numberline {10}{\ignorespaces Left-preconditioned GMRES method with restarts\relax }}{254}} -\newlabel{alg:02}{{10}{254}} +\newlabel{ch12:alg:02}{{10}{254}} \@writefile{toc}{\contentsline {section}{\numberline {11.3}Parallel implementation on a GPU cluster}{255}} -\newlabel{sec:03}{{11.3}{255}} +\newlabel{ch12:sec:03}{{11.3}{255}} \@writefile{toc}{\contentsline 
{subsection}{\numberline {11.3.1}Data partitioning}{255}} -\newlabel{sec:03.01}{{11.3.1}{255}} +\newlabel{ch12:sec:03.01}{{11.3.1}{255}} \@writefile{lof}{\contentsline {figure}{\numberline {11.1}{\ignorespaces A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.\relax }}{256}} -\newlabel{fig:01}{{11.1}{256}} +\newlabel{ch12:fig:01}{{11.1}{256}} \@writefile{toc}{\contentsline {subsection}{\numberline {11.3.2}GPU computing}{256}} -\newlabel{sec:03.02}{{11.3.2}{256}} +\newlabel{ch12:sec:03.02}{{11.3.2}{256}} \@writefile{toc}{\contentsline {subsection}{\numberline {11.3.3}Data communications}{257}} -\newlabel{sec:03.03}{{11.3.3}{257}} +\newlabel{ch12:sec:03.03}{{11.3.3}{257}} \@writefile{lof}{\contentsline {figure}{\numberline {11.2}{\ignorespaces Data exchanges between \textit {Node 1} and its neighbors \textit {Node 0}, \textit {Node 2} and \textit {Node 3}.\relax }}{258}} -\newlabel{fig:02}{{11.2}{258}} +\newlabel{ch12:fig:02}{{11.2}{258}} \@writefile{lof}{\contentsline {figure}{\numberline {11.3}{\ignorespaces Columns reordering of a sparse sub-matrix.\relax }}{259}} -\newlabel{fig:03}{{11.3}{259}} +\newlabel{ch12:fig:03}{{11.3}{259}} \@writefile{lof}{\contentsline {figure}{\numberline {11.4}{\ignorespaces General scheme of the GPU cluster of tests composed of six machines, each with two GPUs.\relax }}{260}} -\newlabel{fig:04}{{11.4}{260}} +\newlabel{ch12:fig:04}{{11.4}{260}} \@writefile{toc}{\contentsline {section}{\numberline {11.4}Experimental results}{260}} -\newlabel{sec:04}{{11.4}{260}} +\newlabel{ch12:sec:04}{{11.4}{260}} \@writefile{lof}{\contentsline {figure}{\numberline {11.5}{\ignorespaces Sketches of sparse matrices chosen from the Davis's collection.\relax }}{261}} -\newlabel{fig:05}{{11.5}{261}} +\newlabel{ch12:fig:05}{{11.5}{261}} \@writefile{lot}{\contentsline {table}{\numberline {11.1}{\ignorespaces Main characteristics of sparse matrices chosen from the Davis's collection.\relax }}{262}} -\newlabel{tab:01}{{11.1}{262}} +\newlabel{ch12:tab:01}{{11.1}{262}} \@writefile{lot}{\contentsline {table}{\numberline {11.2}{\ignorespaces Performances of the parallel CG method on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{262}} -\newlabel{tab:02}{{11.2}{262}} +\newlabel{ch12:tab:02}{{11.2}{262}} \@writefile{lot}{\contentsline {table}{\numberline {11.3}{\ignorespaces Performances of the parallel GMRES method on a cluster 24 CPU cores vs. on cluster of 12 GPUs.\relax }}{263}} -\newlabel{tab:03}{{11.3}{263}} -\newlabel{eq:20}{{11.20}{263}} +\newlabel{ch12:tab:03}{{11.3}{263}} +\newlabel{ch12:eq:20}{{11.20}{263}} \@writefile{lof}{\contentsline {figure}{\numberline {11.6}{\ignorespaces Parallel generation of a large sparse matrix by four computing nodes.\relax }}{264}} -\newlabel{fig:06}{{11.6}{264}} +\newlabel{ch12:fig:06}{{11.6}{264}} \@writefile{lot}{\contentsline {table}{\numberline {11.4}{\ignorespaces Main characteristics of sparse banded matrices generated from those of the Davis's collection.\relax }}{265}} -\newlabel{tab:04}{{11.4}{265}} +\newlabel{ch12:tab:04}{{11.4}{265}} \@writefile{lot}{\contentsline {table}{\numberline {11.5}{\ignorespaces Performances of the parallel CG method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. 
on a cluster of 12 GPUs.\relax }}{265}} -\newlabel{tab:05}{{11.5}{265}} +\newlabel{ch12:tab:05}{{11.5}{265}} \@writefile{toc}{\contentsline {section}{\numberline {11.5}Hypergraph partitioning}{265}} -\newlabel{sec:05}{{11.5}{265}} +\newlabel{ch12:sec:05}{{11.5}{265}} \@writefile{lot}{\contentsline {table}{\numberline {11.6}{\ignorespaces Performances of the parallel GMRES method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{266}} -\newlabel{tab:06}{{11.6}{266}} +\newlabel{ch12:tab:06}{{11.6}{266}} \@writefile{lot}{\contentsline {table}{\numberline {11.7}{\ignorespaces Main characteristics of sparse five-bands matrices generated from those of the Davis's collection.\relax }}{266}} -\newlabel{tab:07}{{11.7}{266}} +\newlabel{ch12:tab:07}{{11.7}{266}} \@writefile{lof}{\contentsline {figure}{\numberline {11.7}{\ignorespaces Parallel generation of a large sparse five-bands matrix by four computing nodes.\relax }}{267}} -\newlabel{fig:07}{{11.7}{267}} +\newlabel{ch12:fig:07}{{11.7}{267}} \@writefile{lot}{\contentsline {table}{\numberline {11.8}{\ignorespaces Performances of parallel CG solver for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs\relax }}{267}} -\newlabel{tab:08}{{11.8}{267}} +\newlabel{ch12:tab:08}{{11.8}{267}} \@writefile{lot}{\contentsline {table}{\numberline {11.9}{\ignorespaces Performances of parallel GMRES solver for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs\relax }}{268}} -\newlabel{tab:09}{{11.9}{268}} +\newlabel{ch12:tab:09}{{11.9}{268}} \@writefile{lof}{\contentsline {figure}{\numberline {11.8}{\ignorespaces An example of the hypergraph partitioning of a sparse matrix decomposed between three computing nodes.\relax }}{269}} -\newlabel{fig:08}{{11.8}{269}} +\newlabel{ch12:fig:08}{{11.8}{269}} \@writefile{lot}{\contentsline {table}{\numberline {11.10}{\ignorespaces Performances of the parallel CG solver using hypergraph partitioning for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPU.\relax }}{270}} -\newlabel{tab:10}{{11.10}{270}} +\newlabel{ch12:tab:10}{{11.10}{270}} \@writefile{lot}{\contentsline {table}{\numberline {11.11}{\ignorespaces Performances of the parallel GMRES solver using hypergraph partitioning for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. 
on a cluster of 12 GPU.\relax }}{271}} -\newlabel{tab:11}{{11.11}{271}} +\newlabel{ch12:tab:11}{{11.11}{271}} \@writefile{lot}{\contentsline {table}{\numberline {11.12}{\ignorespaces The total communication volume between 12 GPU computing nodes without and with the hypergraph partitioning method.\relax }}{272}} -\newlabel{tab:12}{{11.12}{272}} +\newlabel{ch12:tab:12}{{11.12}{272}} +\newlabel{ch12:fig:09.01}{{11.9(a)}{273}} +\newlabel{sub@ch12:fig:09.01}{{(a)}{273}} +\newlabel{ch12:fig:09.02}{{11.9(b)}{273}} +\newlabel{sub@ch12:fig:09.02}{{(b)}{273}} \@writefile{lof}{\contentsline {figure}{\numberline {11.9}{\ignorespaces Weak-scaling of the parallel CG and GMRES solvers on a GPU cluster for solving large sparse linear systems.\relax }}{273}} -\newlabel{fig:09}{{11.9}{273}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Sparse band matrices}}}{273}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Sparse five-bands matrices}}}{273}} +\newlabel{ch12:fig:09}{{11.9}{273}} \@writefile{toc}{\contentsline {section}{\numberline {11.6}Conclusion}{273}} -\newlabel{sec:06}{{11.6}{273}} +\newlabel{ch12:sec:06}{{11.6}{273}} \@writefile{toc}{\contentsline {section}{Bibliography}{274}} \@setckpt{Chapters/chapter12/ch12}{ \setcounter{page}{276} diff --git a/BookGPU/Chapters/chapter12/ch12.tex b/BookGPU/Chapters/chapter12/ch12.tex index 7cd99f4..eaa4f9c 100755 --- a/BookGPU/Chapters/chapter12/ch12.tex +++ b/BookGPU/Chapters/chapter12/ch12.tex @@ -4,138 +4,158 @@ %% %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\chapterauthor{}{} +%\chapterauthor{}{} +\chapterauthor{Lilia Ziane Khodja}{Femto-ST Institute, University of Franche-Comte, France} +\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte, France} +\chapterauthor{Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France} + \chapter{Solving sparse linear systems with GMRES and CG methods on GPU clusters} +\label{ch12} %%--------------------------%% %% SECTION 1 %% %%--------------------------%% \section{Introduction} -\label{sec:01} -The sparse linear systems are used to model many scientific and industrial problems, such as the environmental simulations or -the industrial processing of the complex or non-Newtonian fluids. Moreover, the resolution of these problems often involves the -solving of such linear systems which is considered as the most expensive process in terms of time execution and memory space. -Therefore, solving sparse linear systems must be as efficient as possible in order to deal with problems of ever increasing size. - -There are, in the jargon of numerical analysis, different methods of solving sparse linear systems that we can classify in two -classes: the direct and iterative methods. However, the iterative methods are often more suitable than their counterpart, direct -methods, for solving large sparse linear systems. Indeed, they are less memory consuming and easier to parallelize on parallel -computers than direct methods. Different computing platforms, sequential and parallel computers, are used for solving sparse -linear systems with iterative solutions. Nowadays, graphics processing units (GPUs) have become attractive for solving these -linear systems, due to their computing power and their ability to compute faster than traditional CPUs. - -In Section~\ref{sec:02}, we describe the general principle of two well-known iterative methods: the conjugate gradient method and -the generalized minimal residual method. 
In Section~\ref{sec:03}, we give the main key points of the parallel implementation of both -methods on a cluster of GPUs. Then, in Section~\ref{sec:04}, we present the experimental results obtained on a CPU cluster and on -a GPU cluster, for solving sparse linear systems associated to matrices of different structures. Finally, in Section~\ref{sec:05}, -we apply the hypergraph partitioning technique to reduce the total communication volume between the computing nodes and, thus, to -improve the execution times of the parallel algorithms of both iterative methods. +\label{ch12:sec:01} +The sparse linear systems are used to model many scientific and industrial problems, +such as the environmental simulations or the industrial processing of the complex or +non-Newtonian fluids. Moreover, the resolution of these problems often involves the +solving of such linear systems which is considered as the most expensive process in +terms of execution time and memory space. Therefore, solving sparse linear systems +must be as efficient as possible in order to deal with problems of ever increasing +size. + +There are, in the jargon of numerical analysis, different methods of solving sparse +linear systems that can be classified in two classes: the direct and iterative methods. +However, the iterative methods are often more suitable than their counterpart, direct +methods, for solving these systems. Indeed, they are less memory consuming and easier +to parallelize on parallel computers than direct methods. Different computing platforms, +sequential and parallel computers, are used for solving sparse linear systems with iterative +solutions. Nowadays, graphics processing units (GPUs) have become attractive for solving +these systems, due to their computing power and their ability to compute faster than +traditional CPUs. + +In Section~\ref{ch12:sec:02}, we describe the general principle of two well-known iterative +methods: the conjugate gradient method and the generalized minimal residual method. In Section~\ref{ch12:sec:03}, +we give the main key points of the parallel implementation of both methods on a cluster of +GPUs. Then, in Section~\ref{ch12:sec:04}, we present the experimental results obtained on a +CPU cluster and on a GPU cluster, for solving sparse linear systems associated to matrices +of different structures. Finally, in Section~\ref{ch12:sec:05}, we apply the hypergraph partitioning +technique to reduce the total communication volume between the computing nodes and, thus, +to improve the execution times of the parallel algorithms of both iterative methods. %%--------------------------%% %% SECTION 2 %% %%--------------------------%% \section{Krylov iterative methods} -\label{sec:02} -Let us consider the following system of $n$ linear equations in $\mathbb{R}$: +\label{ch12:sec:02} +Let us consider the following system of $n$ linear equations\index{Sparse~linear~system} +in $\mathbb{R}$: \begin{equation} Ax=b, -\label{eq:01} +\label{ch12:eq:01} \end{equation} -where $A\in\mathbb{R}^{n\times n}$ is a sparse nonsingular square matrix, $x\in\mathbb{R}^{n}$ is the solution vector, -$b\in\mathbb{R}^{n}$ is the right-hand side and $n\in\mathbb{N}$ is a large integer number. - -The iterative methods for solving the large sparse linear system~(\ref{eq:01}) proceed by successive iterations of a same -block of elementary operations, during which an infinite number of approximate solutions $\{x_k\}_{k\geq 0}$ are computed. 
-Indeed, from an initial guess $x_0$, an iterative method determines at each iteration $k>0$ an approximate solution $x_k$ -which, gradually, converges to the exact solution $x^{*}$ as follows: +where $A\in\mathbb{R}^{n\times n}$ is a sparse nonsingular square matrix, $x\in\mathbb{R}^{n}$ +is the solution vector, $b\in\mathbb{R}^{n}$ is the right-hand side and $n\in\mathbb{N}$ is a +large integer number. + +The iterative methods\index{Iterative~method} for solving the large sparse linear system~(\ref{ch12:eq:01}) +proceed by successive iterations of a same block of elementary operations, during which an +infinite number of approximate solutions $\{x_k\}_{k\geq 0}$ are computed. Indeed, from an +initial guess $x_0$, an iterative method determines at each iteration $k>0$ an approximate +solution $x_k$ which, gradually, converges to the exact solution $x^{*}$ as follows: \begin{equation} x^{*}=\lim\limits_{k\to\infty}x_{k}=A^{-1}b. -\label{eq:02} +\label{ch12:eq:02} \end{equation} -The number of iterations necessary to reach the exact solution $x^{*}$ is not known beforehand and can be infinite. In -practice, an iterative method often finds an approximate solution $\tilde{x}$ after a fixed number of iterations and/or -when a given convergence criterion is satisfied as follows: +The number of iterations necessary to reach the exact solution $x^{*}$ is not known beforehand +and can be infinite. In practice, an iterative method often finds an approximate solution $\tilde{x}$ +after a fixed number of iterations and/or when a given convergence criterion\index{Convergence} +is satisfied as follows: \begin{equation} \|b-A\tilde{x}\| < \varepsilon, -\label{eq:03} +\label{ch12:eq:03} \end{equation} -where $\varepsilon<1$ is the required convergence tolerance threshold. - -Some of the most iterative methods that have proven their efficiency for solving large sparse linear systems are those -called \textit{Krylov sub-space methods}~\cite{ref1}. In the present chapter, we describe two Krylov methods which are -widely used: the conjugate gradient method (CG) and the generalized minimal residual method (GMRES). In practice, the -Krylov sub-space methods are usually used with preconditioners that allow to improve their convergence. So, in what -follows, the CG and GMRES methods are used for solving the left-preconditioned sparse linear system: +where $\varepsilon<1$ is the required convergence tolerance threshold\index{Convergence!Tolerance~threshold}. + +Some of the most iterative methods that have proven their efficiency for solving large sparse +linear systems are those called \textit{Krylov subspace methods}~\cite{ch12:ref1}\index{Iterative~method!Krylov~subspace}. +In the present chapter, we describe two Krylov methods which are widely used: the conjugate +gradient method (CG) and the generalized minimal residual method (GMRES). In practice, the +Krylov subspace methods are usually used with preconditioners that allow to improve their +convergence. So, in what follows, the CG and GMRES methods are used for solving the left-preconditioned\index{Sparse~linear~system!Preconditioned} +sparse linear system: \begin{equation} M^{-1}Ax=M^{-1}b, -\label{eq:11} +\label{ch12:eq:11} \end{equation} where $M$ is the preconditioning matrix. + %%****************%% %%****************%% \subsection{CG method} -\label{sec:02.01} -The conjugate gradient method is initially developed by Hestenes and Stiefel in 1952~\cite{ref2}. It is one of the well -known iterative method for solving large sparse linear systems. 
In addition, it can be adapted for solving nonlinear -equations and optimization problems. However, it can only be applied to problems with positive definite symmetric matrices. - -The main idea of the CG method is the computation of a sequence of approximate solutions $\{x_k\}_{k\geq 0}$ in a Krylov -sub-space of order $k$ as follows: +\label{ch12:sec:02.01} +The conjugate gradient method is initially developed by Hestenes and Stiefel in 1952~\cite{ch12:ref2}. +It is one of the well known iterative method for solving large sparse linear systems. In addition, it +can be adapted for solving nonlinear equations and optimization problems. However, it can only be applied +to problems with positive definite symmetric matrices. + +The main idea of the CG method\index{Iterative~method!CG} is the computation of a sequence of approximate +solutions $\{x_k\}_{k\geq 0}$ in a Krylov subspace\index{Iterative~method!Krylov~subspace} of order $k$ as +follows: \begin{equation} x_k \in x_0 + \mathcal{K}_k(A,r_0), -\label{eq:04} +\label{ch12:eq:04} \end{equation} -such that the Galerkin condition must be satisfied: +such that the Galerkin condition\index{Galerkin~condition} must be satisfied: \begin{equation} r_k \bot \mathcal{K}_k(A,r_0), -\label{eq:05} +\label{ch12:eq:05} \end{equation} -where $x_0$ is the initial guess, $r_k=b-Ax_k$ is the residual of the computed solution $x_k$ and $\mathcal{K}_k$ the Krylov -sub-space of order $k$: \[\mathcal{K}_k(A,r_0) \equiv\text{span}\{r_0, Ar_0, A^2r_0,\ldots, A^{k-1}r_0\}.\] +where $x_0$ is the initial guess, $r_k=b-Ax_k$ is the residual of the computed solution $x_k$ and $\mathcal{K}_k$ +the Krylov subspace of order $k$: \[\mathcal{K}_k(A,r_0) \equiv\text{span}\{r_0, Ar_0, A^2r_0,\ldots, A^{k-1}r_0\}.\] In fact, CG is based on the construction of a sequence $\{p_k\}_{k\in\mathbb{N}}$ of direction vectors in $\mathcal{K}_k$ which are pairwise $A$-conjugate ($A$-orthogonal): \begin{equation} \begin{array}{ll} p_i^T A p_j = 0, & i\neq j. \end{array} -\label{eq:06} +\label{ch12:eq:06} \end{equation} At each iteration $k$, an approximate solution $x_k$ is computed by recurrence as follows: \begin{equation} \begin{array}{ll} x_k = x_{k-1} + \alpha_k p_k, & \alpha_k\in\mathbb{R}. \end{array} -\label{eq:07} +\label{ch12:eq:07} \end{equation} Consequently, the residuals $r_k$ are computed in the same way: \begin{equation} r_k = r_{k-1} - \alpha_k A p_k. -\label{eq:08} +\label{ch12:eq:08} \end{equation} -In the case where all residuals are nonzero, the direction vectors $p_k$ can be determined so that the following recurrence -holds: +In the case where all residuals are nonzero, the direction vectors $p_k$ can be determined so that +the following recurrence holds: \begin{equation} \begin{array}{lll} p_0=r_0, & p_k=r_k+\beta_k p_{k-1}, & \beta_k\in\mathbb{R}. \end{array} -\label{eq:09} +\label{ch12:eq:09} \end{equation} -Moreover, the scalars $\{\alpha_k\}_{k>0}$ are chosen so as to minimize the $A$-norm error $\|x^{*}-x_k\|_A$ over the Krylov -sub-space $\mathcal{K}_{k}$ and the scalars $\{\beta_k\}_{k>0}$ are chosen so as to ensure that the direction vectors are -pairwise $A$-conjugate. 
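To make these recurrences concrete, the following sketch (in C, without the preconditioner, and with an assumed \verb+spmv()+ routine for the product $Ap_k$) chains them into one CG iteration; the scalars $\alpha_k$ and $\beta_k$ are taken from the formulas derived just below.

\begin{verbatim}
/* One unpreconditioned CG iteration chaining the recurrences for x_k, r_k
 * and p_k; a minimal sketch, spmv() is assumed to compute the sparse
 * matrix-vector product. Returns the new value of r^T r. */
void spmv(int n, const void *A, const double *p, double *Ap); /* assumed */

static double dot(int n, const double *u, const double *v)
{
    double s = 0.0;
    for (int i = 0; i < n; i++) s += u[i] * v[i];
    return s;
}

double cg_iteration(int n, const void *A, double *x, double *r,
                    double *p, double *Ap, double rr_old)
{
    spmv(n, A, p, Ap);                        /* Ap = A p            */
    double alpha = rr_old / dot(n, p, Ap);    /* alpha_k             */
    for (int i = 0; i < n; i++) {
        x[i] += alpha * p[i];                 /* x = x + alpha p     */
        r[i] -= alpha * Ap[i];                /* r = r - alpha A p   */
    }
    double rr_new = dot(n, r, r);
    double beta = rr_new / rr_old;            /* beta_k              */
    for (int i = 0; i < n; i++)
        p[i] = r[i] + beta * p[i];            /* next direction      */
    return rr_new;
}
\end{verbatim}

Algorithm~\ref{ch12:alg:01} adds the preconditioning solve $z_k=M^{-1}r_k$ to this basic pattern.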
So, the assumption that matrix $A$ is symmetric and the recurrences~(\ref{eq:08}) and~(\ref{eq:09}) -allow to deduce that: +Moreover, the scalars $\{\alpha_k\}_{k>0}$ are chosen so as to minimize the $A$-norm error $\|x^{*}-x_k\|_A$ +over the Krylov subspace $\mathcal{K}_{k}$ and the scalars $\{\beta_k\}_{k>0}$ are chosen so as to ensure +that the direction vectors are pairwise $A$-conjugate. So, the assumption that matrix $A$ is symmetric and +the recurrences~(\ref{ch12:eq:08}) and~(\ref{ch12:eq:09}) allow to deduce that: \begin{equation} \begin{array}{ll} \alpha_{k}=\frac{r^{T}_{k-1}r_{k-1}}{p_{k}^{T}Ap_{k}}, & \beta_{k}=\frac{r_{k}^{T}r_{k}}{r_{k-1}^{T}r_{k-1}}. \end{array} -\label{eq:10} +\label{ch12:eq:10} \end{equation} \begin{algorithm}[!t] - %\SetLine - %\linesnumbered Choose an initial guess $x_0$\; $r_{0} = b - A x_{0}$\; $convergence$ = false\; @@ -160,62 +180,70 @@ allow to deduce that: } } \caption{Left-preconditioned CG method} -\label{alg:01} +\label{ch12:alg:01} \end{algorithm} -Algorithm~\ref{alg:01} shows the main key points of the preconditioned CG method. It allows to solve the left-preconditioned -sparse linear system~(\ref{eq:11}). In this algorithm, $\varepsilon$ is the convergence tolerance threshold, $maxiter$ is the maximum -number of iterations and $(\cdot,\cdot)$ defines the dot product between two vectors in $\mathbb{R}^{n}$. At every iteration, a direction -vector $p_k$ is determined, so that it is orthogonal to the preconditioned residual $z_k$ and to the direction vectors $\{p_i\}_{i<k}$ -previously determined (from line~$8$ to line~$13$). Then, at lines~$16$ and~$17$ , the iterate $x_k$ and the residual $r_k$ are computed -using formulas~(\ref{eq:07}) and~(\ref{eq:08}), respectively. The CG method converges after, at most, $n$ iterations. In practice, the CG -algorithm stops when the tolerance threshold $\varepsilon$ and/or the maximum number of iterations $maxiter$ are reached. +Algorithm~\ref{ch12:alg:01} shows the main key points of the preconditioned CG method. It allows +to solve the left-preconditioned\index{Sparse~linear~system!Preconditioned} sparse linear system~(\ref{ch12:eq:11}). +In this algorithm, $\varepsilon$ is the convergence tolerance threshold, $maxiter$ is the maximum +number of iterations and $(\cdot,\cdot)$ defines the dot product between two vectors in $\mathbb{R}^{n}$. +At every iteration, a direction vector $p_k$ is determined, so that it is orthogonal to the preconditioned +residual $z_k$ and to the direction vectors $\{p_i\}_{i<k}$ previously determined (from line~$8$ to +line~$13$). Then, at lines~$16$ and~$17$, the iterate $x_k$ and the residual $r_k$ are computed using +formulas~(\ref{ch12:eq:07}) and~(\ref{ch12:eq:08}), respectively. The CG method converges after, at +most, $n$ iterations. In practice, the CG algorithm stops when the tolerance threshold\index{Convergence!Tolerance~threshold} +$\varepsilon$ and/or the maximum number of iterations\index{Convergence!Maximum~number~of~iterations} +$maxiter$ are reached. + %%****************%% %%****************%% \subsection{GMRES method} -\label{sec:02.02} -The iterative GMRES method is developed by Saad and Schultz in 1986~\cite{ref3} as a generalization of the minimum residual method -MINRES~\cite{ref4}. Indeed, GMRES can be applied for solving symmetric or nonsymmetric linear systems. - -The main principle of the GMRES method is to find an approximation minimizing at best the residual norm. 
In fact, GMRES -computes a sequence of approximate solutions $\{x_k\}_{k>0}$ in a Krylov sub-space $\mathcal{K}_k$ as follows: +\label{ch12:sec:02.02} +The iterative GMRES method is developed by Saad and Schultz in 1986~\cite{ch12:ref3} as a generalization +of the minimum residual method MINRES~\cite{ch12:ref4}\index{Iterative~method!MINRES}. Indeed, GMRES can +be applied for solving symmetric or nonsymmetric linear systems. + +The main principle of the GMRES method\index{Iterative~method!GMRES} is to find an approximation minimizing +at best the residual norm. In fact, GMRES computes a sequence of approximate solutions $\{x_k\}_{k>0}$ in +a Krylov subspace\index{Iterative~method!Krylov~subspace} $\mathcal{K}_k$ as follows: \begin{equation} \begin{array}{ll} x_k \in x_0 + \mathcal{K}_k(A, v_1),& v_1=\frac{r_0}{\|r_0\|_2}, \end{array} -\label{eq:12} +\label{ch12:eq:12} \end{equation} -so that the Petrov-Galerkin condition is satisfied: +so that the Petrov-Galerkin condition\index{Petrov-Galerkin~condition} is satisfied: \begin{equation} \begin{array}{ll} r_k \bot A \mathcal{K}_k(A, v_1). \end{array} -\label{eq:13} +\label{ch12:eq:13} \end{equation} -GMRES uses the Arnoldi process~\cite{ref5} to construct an orthonormal basis $V_k$ for the Krylov sub-space $\mathcal{K}_k$ -and an upper Hessenberg matrix $\bar{H}_k$ of order $(k+1)\times k$: +GMRES uses the Arnoldi process~\cite{ch12:ref5}\index{Iterative~method!Arnoldi~process} to construct an +orthonormal basis $V_k$ for the Krylov subspace $\mathcal{K}_k$ and an upper Hessenberg matrix\index{Hessenberg~matrix} +$\bar{H}_k$ of order $(k+1)\times k$: \begin{equation} \begin{array}{ll} V_k = \{v_1, v_2,\ldots,v_k\}, & \forall k>1, v_k=A^{k-1}v_1, \end{array} -\label{eq:14} +\label{ch12:eq:14} \end{equation} and \begin{equation} V_k A = V_{k+1} \bar{H}_k. -\label{eq:15} +\label{ch12:eq:15} \end{equation} -Then, at each iteration $k$, an approximate solution $x_k$ is computed in the Krylov sub-space $\mathcal{K}_k$ spanned by $V_k$ -as follows: +Then, at each iteration $k$, an approximate solution $x_k$ is computed in the Krylov subspace $\mathcal{K}_k$ +spanned by $V_k$ as follows: \begin{equation} \begin{array}{ll} x_k = x_0 + V_k y, & y\in\mathbb{R}^{k}. \end{array} -\label{eq:16} +\label{ch12:eq:16} \end{equation} -From both formulas~(\ref{eq:15}) and~(\ref{eq:16}) and $r_k=b-Ax_k$, we can deduce that: +From both formulas~(\ref{ch12:eq:15}) and~(\ref{ch12:eq:16}) and $r_k=b-Ax_k$, we can deduce that: \begin{equation} \begin{array}{lll} r_{k} & = & b - A (x_{0} + V_{k}y) \\ @@ -223,34 +251,34 @@ From both formulas~(\ref{eq:15}) and~(\ref{eq:16}) and $r_k=b-Ax_k$, we can dedu & = & \beta v_{1} - V_{k+1}\bar{H}_{k}y \\ & = & V_{k+1}(\beta e_{1} - \bar{H}_{k}y), \end{array} -\label{eq:17} +\label{ch12:eq:17} \end{equation} -such that $\beta=\|r_0\|_2$ and $e_1=(1,0,\cdots,0)$ is the first vector of the canonical basis of $\mathbb{R}^k$. So, -the vector $y$ is chosen in $\mathbb{R}^k$ so as to minimize at best the Euclidean norm of the residual $r_k$. Consequently, -a linear least-squares problem of size $k$ is solved: +such that $\beta=\|r_0\|_2$ and $e_1=(1,0,\cdots,0)$ is the first vector of the canonical basis of +$\mathbb{R}^k$. So, the vector $y$ is chosen in $\mathbb{R}^k$ so as to minimize at best the Euclidean +norm of the residual $r_k$. Consequently, a linear least-squares problem of size $k$ is solved: \begin{equation} \underset{y\in\mathbb{R}^{k}}{min}\|r_{k}\|_{2}=\underset{y\in\mathbb{R}^{k}}{min}\|\beta e_{1}-\bar{H}_{k}y\|_{2}. 
-\label{eq:18} +\label{ch12:eq:18} \end{equation} -The QR factorization of matrix $\bar{H}_k$ is used to compute the solution of this problem by using Givens rotations~\cite{ref1,ref3}, -such that: +The QR factorization of matrix $\bar{H}_k$ is used to compute the solution of this problem by using +Givens rotations~\cite{ch12:ref1,ch12:ref3}, such that: \begin{equation} \begin{array}{lll} \bar{H}_{k}=Q_{k}R_{k}, & Q_{k}\in\mathbb{R}^{(k+1)\times (k+1)}, & R_{k}\in\mathbb{R}^{(k+1)\times k}, \end{array} -\label{eq:19} +\label{ch12:eq:19} \end{equation} where $Q_kQ_k^T=I_k$ and $R_k$ is an upper triangular matrix. -The GMRES method computes an approximate solution with a sufficient precision after, at most, $n$ iterations ($n$ is the size of the -sparse linear system to be solved). However, the GMRES algorithm must construct and store in the memory an orthonormal basis $V_k$ whose -size is proportional to the number of iterations required to achieve the convergence. Then, to avoid a huge memory storage, the GMRES -method must be restarted at each $m$ iterations, such that $m$ is very small ($m\ll n$), and with $x_m$ as the initial guess to the -next iteration. This allows to limit the size of the basis $V$ to $m$ orthogonal vectors. +The GMRES method computes an approximate solution with a sufficient precision after, at most, $n$ +iterations ($n$ is the size of the sparse linear system to be solved). However, the GMRES algorithm +must construct and store in the memory an orthonormal basis $V_k$ whose size is proportional to the +number of iterations required to achieve the convergence. Then, to avoid a huge memory storage, the +GMRES method must be restarted at each $m$ iterations, such that $m$ is very small ($m\ll n$), and +with $x_m$ as the initial guess to the next iteration. This allows to limit the size of the basis +$V$ to $m$ orthogonal vectors. \begin{algorithm}[!t] - %\SetLine - %\linesnumbered Choose an initial guess $x_0$\; $convergence$ = false\; $k = 1$\; @@ -281,190 +309,245 @@ next iteration. This allows to limit the size of the basis $V$ to $m$ orthogonal } } \caption{Left-preconditioned GMRES method with restarts} -\label{alg:02} +\label{ch12:alg:02} \end{algorithm} -Algorithm~\ref{alg:02} shows the main key points of the GMRES method with restarts. It solves the left-preconditioned sparse linear -system~(\ref{eq:11}), such that $M$ is the preconditioning matrix. At each iteration $k$, GMRES uses the Arnoldi process (defined -from line~$7$ to line~$17$) to construct a basis $V_m$ of $m$ orthogonal vectors and an upper Hessenberg matrix $\bar{H}_m$ of size -$(m+1)\times m$. Then, it solves the linear least-squares problem of size $m$ to find the vector $y\in\mathbb{R}^{m}$ which minimizes -at best the residual norm (line~$18$). Finally, it computes an approximate solution $x_m$ in the Krylov sub-space spanned by $V_m$ -(line~$19$). The GMRES algorithm is stopped when the residual norm is sufficiently small ($\|r_m\|_2<\varepsilon$) and/or the maximum -number of iterations ($maxiter$) is reached. +Algorithm~\ref{ch12:alg:02} shows the main key points of the GMRES method with restarts. +It solves the left-preconditioned\index{Sparse~linear~system!Preconditioned} sparse linear +system~(\ref{ch12:eq:11}), such that $M$ is the preconditioning matrix. 
At each iteration +$k$, GMRES uses the Arnoldi process\index{Iterative~method!Arnoldi~process} (defined from +line~$7$ to line~$17$) to construct a basis $V_m$ of $m$ orthogonal vectors and an upper +Hessenberg matrix\index{Hessenberg~matrix} $\bar{H}_m$ of size $(m+1)\times m$. Then, it +solves the linear least-squares problem of size $m$ to find the vector $y\in\mathbb{R}^{m}$ +which minimizes at best the residual norm (line~$18$). Finally, it computes an approximate +solution $x_m$ in the Krylov subspace spanned by $V_m$ (line~$19$). The GMRES algorithm is +stopped when the residual norm is sufficiently small ($\|r_m\|_2<\varepsilon$) and/or the +maximum number of iterations\index{Convergence!Maximum~number~of~iterations} ($maxiter$) +is reached. + %%--------------------------%% %% SECTION 3 %% %%--------------------------%% \section{Parallel implementation on a GPU cluster} -\label{sec:03} -In this section, we present the parallel algorithms of both iterative CG and GMRES methods for GPU clusters. -The implementation is performed on a GPU cluster composed of different computing nodes, such that each node -is a CPU core managed by a MPI process and equipped with a GPU card. The parallelization of these algorithms -is carried out by using the MPI communication routines between the GPU computing nodes and the CUDA programming -environment inside each node. In what follows, the algorithms of the iterative methods are called iterative -solvers. +\label{ch12:sec:03} +In this section, we present the parallel algorithms of both iterative CG\index{Iterative~method!CG} +and GMRES\index{Iterative~method!GMRES} methods for GPU clusters. The implementation is performed on +a GPU cluster composed of different computing nodes, such that each node is a CPU core managed by a +MPI process and equipped with a GPU card. The parallelization of these algorithms is carried out by +using the MPI communication routines between the GPU computing nodes\index{Computing~node} and the +CUDA programming environment inside each node. In what follows, the algorithms of the iterative methods +are called iterative solvers. + %%****************%% %%****************%% \subsection{Data partitioning} -\label{sec:03.01} -The parallel solving of the large sparse linear system~(\ref{eq:11}) requires a data partitioning between the computing -nodes of the GPU cluster. Let $p$ denotes the number of the computing nodes on the GPU cluster. The partitioning operation -consists in the decomposition of the vectors and matrices, involved in the iterative solver, in $p$ portions. Indeed, this -operation allows to assign to each computing node $i$: +\label{ch12:sec:03.01} +The parallel solving of the large sparse linear system~(\ref{ch12:eq:11}) requires a data partitioning +between the computing nodes of the GPU cluster. Let $p$ denotes the number of the computing nodes on the +GPU cluster. The partitioning operation consists in the decomposition of the vectors and matrices, involved +in the iterative solver, in $p$ portions. Indeed, this operation allows to assign to each computing node +$i$: \begin{itemize} \item a portion of size $\frac{n}{p}$ elements of each vector, \item a sparse rectangular sub-matrix $A_i$ of size $(\frac{n}{p},n)$ and, \item a square preconditioning sub-matrix $M_i$ of size $(\frac{n}{p},\frac{n}{p})$, \end{itemize} -where $n$ is the size of the sparse linear system to be solved. 
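As a simple illustration of this decomposition, the short C sketch below computes the block of consecutive rows owned by a node $i$ when the $n$ rows are split into $p$ contiguous portions; the handling of a size $n$ that is not a multiple of $p$ is our own assumption, the text simply writes $\frac{n}{p}$.

\begin{verbatim}
/* Rows owned by computing node i when the n rows of the linear system
 * are split into p contiguous blocks; remainder handling is illustrative. */
#include <stdio.h>

static void row_range(int n, int p, int i, int *first, int *last)
{
    int base = n / p, rem = n % p;
    *first = i * base + (i < rem ? i : rem);
    *last  = *first + base + (i < rem ? 1 : 0) - 1;   /* inclusive */
}

int main(void)
{
    int first, last;
    for (int i = 0; i < 4; i++) {   /* four nodes, 16 unknowns, as in the
                                       example of Figure~11.1 */
        row_range(16, 4, i, &first, &last);
        printf("node %d: rows %d to %d\n", i, first, last);
    }
    return 0;
}
\end{verbatim}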
In the first instance, we perform a naive row-wise partitioning -(decomposition row-by-row) on the data of the sparse linear systems to be solved. Figure~\ref{fig:01} shows an example of a row-wise -data partitioning between four computing nodes of a sparse linear system (sparse matrix $A$, solution vector $x$ and right-hand -side $b$) of size $16$ unknown values. +where $n$ is the size of the sparse linear system to be solved. In the first instance, we perform a naive +row-wise partitioning (decomposition row-by-row) on the data of the sparse linear systems to be solved. +Figure~\ref{ch12:fig:01} shows an example of a row-wise data partitioning between four computing nodes +of a sparse linear system (sparse matrix $A$, solution vector $x$ and right-hand side $b$) of size $16$ +unknown values. \begin{figure} \centerline{\includegraphics[scale=0.35]{Chapters/chapter12/figures/partition}} \caption{A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.} -\label{fig:01} +\label{ch12:fig:01} \end{figure} + %%****************%% %%****************%% \subsection{GPU computing} -\label{sec:03.02} -After the partitioning operation, all the data involved from this operation must be transferred from the CPU memories to the GPU -memories, in order to be processed by GPUs. We use two functions of the CUBLAS library (CUDA Basic Linear Algebra Subroutines), -developed by Nvidia~\cite{ref6}: \verb+cublasAlloc()+ for the memory allocations on GPUs and \verb+cublasSetVector()+ for the -memory copies from the CPUs to the GPUs. - -An efficient implementation of CG and GMRES solvers on a GPU cluster requires to determine all parts of their codes that can be -executed in parallel and, thus, take advantage of the GPU acceleration. As many Krylov sub-space methods, the CG and GMRES methods -are mainly based on arithmetic operations dealing with vectors or matrices: sparse matrix-vector multiplications, scalar-vector -multiplications, dot products, Euclidean norms, AXPY operations ($y\leftarrow ax+y$ where $x$ and $y$ are vectors and $a$ is a -scalar) and so on. These vector operations are often easy to parallelize and they are more efficient on parallel computers when -they work on large vectors. Therefore, all the vector operations used in CG and GMRES solvers must be executed by the GPUs as kernels. - -We use the kernels of the CUBLAS library to compute some vector operations of CG and GMRES solvers. The following kernels of CUBLAS -(dealing with double floating point) are used: \verb+cublasDdot()+ for the dot products, \verb+cublasDnrm2()+ for the Euclidean -norms and \verb+cublasDaxpy()+ for the AXPY operations. For the rest of the data-parallel operations, we code their kernels in CUDA. -In the CG solver, we develop a kernel for the XPAY operation ($y\leftarrow x+ay$) used at line~$12$ in Algorithm~\ref{alg:01}. In the -GMRES solver, we program a kernel for the scalar-vector multiplication (lines~$7$ and~$15$ in Algorithm~\ref{alg:02}), a kernel for -solving the least-squares problem and a kernel for the elements updates of the solution vector $x$. - -The least-squares problem in the GMRES method is solved by performing a QR factorization on the Hessenberg matrix $\bar{H}_m$ with -plane rotations and, then, solving the triangular system by backward substitutions to compute $y$. Consequently, solving the least-squares -problem on the GPU is not interesting. Indeed, the triangular solves are not easy to parallelize and inefficient on GPUs. 
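To fix ideas, the QR factorization with plane rotations mentioned here can be written compactly as follows; this plain C sketch (the array layout and names are ours) applies Givens rotations to the $(m+1)\times m$ matrix $\bar{H}_m$ and to the right-hand side $\beta e_1$, then performs the backward substitution that yields $y$.

\begin{verbatim}
/* Solve min_y || beta*e1 - Hbar*y ||_2 for a small (m+1) x m upper
 * Hessenberg matrix stored row-major in h[(m+1)*m]; illustrative sketch,
 * assumes no breakdown (the rotation radius r stays positive). */
#include <math.h>

void hessenberg_lsq(int m, double *h, double beta, double *g, double *y)
{
    for (int i = 0; i <= m; i++) g[i] = 0.0;
    g[0] = beta;
    for (int j = 0; j < m; j++) {                       /* one rotation per column */
        double a = h[j * m + j], b = h[(j + 1) * m + j];
        double r = sqrt(a * a + b * b);
        double c = a / r, s = b / r;
        for (int k = j; k < m; k++) {                   /* rotate rows j and j+1 */
            double hjk = h[j * m + k], hj1k = h[(j + 1) * m + k];
            h[j * m + k]       =  c * hjk + s * hj1k;
            h[(j + 1) * m + k] = -s * hjk + c * hj1k;
        }
        double gj = g[j], gj1 = g[j + 1];
        g[j]     =  c * gj + s * gj1;
        g[j + 1] = -s * gj + c * gj1;
    }
    /* |g[m]| now equals the minimal residual norm */
    for (int i = m - 1; i >= 0; i--) {                  /* backward substitution */
        y[i] = g[i];
        for (int k = i + 1; k < m; k++) y[i] -= h[i * m + k] * y[k];
        y[i] /= h[i * m + i];
    }
}
\end{verbatim}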
However, -the least-squares problem to solve in the GMRES method with restarts has, generally, a very small size $m$. Therefore, we develop -an inexpensive kernel which must be executed in sequential by a single CUDA thread. - -The most important operation in CG and GMRES methods is the sparse matrix-vector multiplication (SpMV), because it is often an -expensive operation in terms of execution time and memory space. Moreover, it requires to take care of the storage format of the -sparse matrix in the memory. Indeed, the naive storage, row-by-row or column-by-column, of a sparse matrix can cause a significant -waste of memory space and execution time. In addition, the sparsity nature of the matrix often leads to irregular memory accesses -to read the matrix nonzero values. So, the computation of the SpMV multiplication on GPUs can involve non coalesced accesses to -the global memory, which slows down even more its performances. One of the most efficient compressed storage formats of sparse -matrices on GPUs is HYB format~\cite{ref7}. It is a combination of ELLpack (ELL) and Coordinate (COO) formats. Indeed, it stores -a typical number of nonzero values per row in ELL format and remaining entries of exceptional rows in COO format. It combines -the efficiency of ELL due to the regularity of its memory accesses and the flexibility of COO which is insensitive to the matrix -structure. Consequently, we use the HYB kernel~\cite{ref8} developed by Nvidia to implement the SpMV multiplication of CG and -GMRES methods on GPUs. Moreover, to avoid the non coalesced accesses to the high-latency global memory, we fill the elements of -the iterate vector $x$ in the cached texture memory. +\label{ch12:sec:03.02} +After the partitioning operation, all the data involved from this operation must be +transferred from the CPU memories to the GPU memories, in order to be processed by +GPUs. We use two functions of the CUBLAS\index{CUBLAS} library (CUDA Basic Linear +Algebra Subroutines), developed by Nvidia~\cite{ch12:ref6}: \verb+cublasAlloc()+ +for the memory allocations on GPUs and \verb+cublasSetVector()+ for the memory +copies from the CPUs to the GPUs. + +An efficient implementation of CG and GMRES solvers on a GPU cluster requires to +determine all parts of their codes that can be executed in parallel and, thus, take +advantage of the GPU acceleration. As many Krylov subspace methods, the CG and GMRES +methods are mainly based on arithmetic operations dealing with vectors or matrices: +sparse matrix-vector multiplications, scalar-vector multiplications, dot products, +Euclidean norms, AXPY operations ($y\leftarrow ax+y$ where $x$ and $y$ are vectors +and $a$ is a scalar) and so on. These vector operations are often easy to parallelize +and they are more efficient on parallel computers when they work on large vectors. +Therefore, all the vector operations used in CG and GMRES solvers must be executed +by the GPUs as kernels. + +We use the kernels of the CUBLAS library to compute some vector operations of CG and +GMRES solvers. The following kernels of CUBLAS (dealing with double floating point) +are used: \verb+cublasDdot()+ for the dot products, \verb+cublasDnrm2()+ for the +Euclidean norms and \verb+cublasDaxpy()+ for the AXPY operations. For the rest of +the data-parallel operations, we code their kernels in CUDA. In the CG solver, we +develop a kernel for the XPAY operation ($y\leftarrow x+ay$) used at line~$12$ in +Algorithm~\ref{ch12:alg:01}. 
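Such an element-wise kernel is straightforward; the sketch below shows what the XPAY kernel may look like (the kernel name and the one-thread-per-element mapping are our assumptions).

\begin{verbatim}
/* Possible CUDA kernel for the XPAY operation y <- x + a*y,
 * with one thread per vector element. */
__global__ void xpay(int n, double a, const double *x, double *y)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        y[i] = x[i] + a * y[i];
}
\end{verbatim}

It would typically be launched on $\lceil n/512\rceil$ blocks of $512$ threads, the block size used in the experiments of Section~\ref{ch12:sec:04}.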
In the GMRES solver, we program a kernel for the scalar-vector +multiplication (lines~$7$ and~$15$ in Algorithm~\ref{ch12:alg:02}), a kernel for +solving the least-squares problem and a kernel for the elements updates of the solution +vector $x$. + +The least-squares problem in the GMRES method is solved by performing a QR factorization +on the Hessenberg matrix\index{Hessenberg~matrix} $\bar{H}_m$ with plane rotations and, +then, solving the triangular system by backward substitutions to compute $y$. Consequently, +solving the least-squares problem on the GPU is not interesting. Indeed, the triangular +solves are not easy to parallelize and inefficient on GPUs. However, the least-squares +problem to solve in the GMRES method with restarts has, generally, a very small size $m$. +Therefore, we develop an inexpensive kernel which must be executed in sequential by a +single CUDA thread. + +The most important operation in CG\index{Iterative~method!CG} and GMRES\index{Iterative~method!GMRES} +methods is the sparse matrix-vector multiplication (SpMV)\index{SpMV~multiplication}, +because it is often an expensive operation in terms of execution time and memory space. +Moreover, it requires to take care of the storage format of the sparse matrix in the +memory. Indeed, the naive storage, row-by-row or column-by-column, of a sparse matrix +can cause a significant waste of memory space and execution time. In addition, the sparsity +nature of the matrix often leads to irregular memory accesses to read the matrix nonzero +values. So, the computation of the SpMV multiplication on GPUs can involve non coalesced +accesses to the global memory, which slows down even more its performances. One of the +most efficient compressed storage formats\index{Compressed~storage~format} of sparse +matrices on GPUs is HYB\index{Compressed~storage~format!HYB} format~\cite{ch12:ref7}. +It is a combination of ELLpack (ELL) and Coordinate (COO) formats. Indeed, it stores +a typical number of nonzero values per row in ELL\index{Compressed~storage~format!ELL} +format and remaining entries of exceptional rows in COO format. It combines the efficiency +of ELL due to the regularity of its memory accesses and the flexibility of COO\index{Compressed~storage~format!COO} +which is insensitive to the matrix structure. Consequently, we use the HYB kernel~\cite{ch12:ref8} +developed by Nvidia to implement the SpMV multiplication of CG and GMRES methods on GPUs. +Moreover, to avoid the non coalesced accesses to the high-latency global memory, we fill +the elements of the iterate vector $x$ in the cached texture memory. + %%****************%% %%****************%% \subsection{Data communications} -\label{sec:03.03} -All the computing nodes of the GPU cluster execute in parallel the same iterative solver (Algorithm~\ref{alg:01} or Algorithm~\ref{alg:02}) -adapted to GPUs, but on their own portions of the sparse linear system: $M^{-1}_iA_ix_i=M^{-1}_ib_i$, $0\leq i<p$. However in order to solve -the complete sparse linear system~(\ref{eq:11}), synchronizations must be performed between the local computations of the computing nodes -over the cluster. In what follows, two computing nodes sharing data are called neighboring nodes. - -As already mentioned, the most important operation of CG and GMRES methods is the SpMV multiplication. In the parallel implementation of -the iterative methods, each computing node $i$ performs the SpMV multiplication on its own sparse rectangular sub-matrix $A_i$. 
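To illustrate why the ELL part of the HYB format leads to the regular, coalesced accesses mentioned above, a basic ELL SpMV kernel (one thread per matrix row, padded arrays stored column-major) could look as follows; it is a simplified sketch and not the HYB kernel of Nvidia used in our solvers.

\begin{verbatim}
/* Simplified ELL SpMV kernel: one thread per row; the nonzero values and
 * their column indices are padded to num_cols_per_row entries per row and
 * stored column-major, so consecutive threads read consecutive words. */
__global__ void spmv_ell(int num_rows, int num_cols_per_row,
                         const int *indices, const double *data,
                         const double *x, double *y)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < num_rows) {
        double sum = 0.0;
        for (int n = 0; n < num_cols_per_row; n++) {
            int    col = indices[n * num_rows + row];
            double val = data[n * num_rows + row];
            if (val != 0.0)                 /* skip padding entries */
                sum += val * x[col];
        }
        y[row] = sum;
    }
}
\end{verbatim}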
Locally, it -has only sub-vectors of size $\frac{n}{p}$ corresponding to rows of its sub-matrix $A_i$. However, it also requires the vector elements -of its neighbors, corresponding to the column indices on which its sub-matrix has nonzero values (see Figure~\ref{fig:01}). So, in addition -to the local vectors, each node must also manage vector elements shared with neighbors and required to compute the SpMV multiplication. -Therefore, the iterate vector $x$ managed by each computing node is composed of a local sub-vector $x^{local}$ of size $\frac{n}{p}$ and a -sub-vector of shared elements $x^{shared}$. In the same way, the vector used to construct the orthonormal basis of the Krylov sub-space -(vectors $p$ and $v$ in CG and GMRES methods, respectively) is composed of a local sub-vector and a shared sub-vector. - -Therefore, before computing the SpMV multiplication, the neighboring nodes over the GPU cluster must exchange between them the shared -vector elements necessary to compute this multiplication. First, each computing node determines, in its local sub-vector, the vector -elements needed by other nodes. Then, the neighboring nodes exchange between them these shared vector elements. The data exchanges -are implemented by using the MPI point-to-point communication routines: blocking sends with \verb+MPI_Send()+ and nonblocking receives -with \verb+MPI_Irecv()+. Figure~\ref{fig:02} shows an example of data exchanges between \textit{Node 1} and its neighbors \textit{Node 0}, -\textit{Node 2} and \textit{Node 3}. In this example, the iterate matrix $A$ split between these four computing nodes is that presented -in Figure~\ref{fig:01}. +\label{ch12:sec:03.03} +All the computing nodes of the GPU cluster execute in parallel the same iterative solver +(Algorithm~\ref{ch12:alg:01} or Algorithm~\ref{ch12:alg:02}) adapted to GPUs, but on their +own portions of the sparse linear system\index{Sparse~linear~system}: $M^{-1}_iA_ix_i=M^{-1}_ib_i$, +$0\leq i<p$. However, in order to solve the complete sparse linear system~(\ref{ch12:eq:11}), +synchronizations must be performed between the local computations of the computing nodes over +the cluster. In what follows, two computing nodes sharing data are called neighboring nodes\index{Neighboring~node}. + +As already mentioned, the most important operation of CG and GMRES methods is the SpMV multiplication. +In the parallel implementation of the iterative methods, each computing node $i$ performs the +SpMV multiplication on its own sparse rectangular sub-matrix $A_i$. Locally, it has only sub-vectors +of size $\frac{n}{p}$ corresponding to rows of its sub-matrix $A_i$. However, it also requires +the vector elements of its neighbors, corresponding to the column indices on which its sub-matrix +has nonzero values (see Figure~\ref{ch12:fig:01}). So, in addition to the local vectors, each +node must also manage vector elements shared with neighbors and required to compute the SpMV +multiplication. Therefore, the iterate vector $x$ managed by each computing node is composed +of a local sub-vector $x^{local}$ of size $\frac{n}{p}$ and a sub-vector of shared elements $x^{shared}$. +In the same way, the vector used to construct the orthonormal basis of the Krylov subspace (vectors +$p$ and $v$ in CG and GMRES methods, respectively) is composed of a local sub-vector and a shared +sub-vector. 
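In other words, the vector handled by a node can be viewed as the following structure (the structure and field names are purely illustrative):

\begin{verbatim}
/* Illustrative layout of the iterate vector on one computing node: a local
 * part of size n/p owned by the node and a compressed part gathering the
 * elements shared with (received from) its neighboring nodes. */
typedef struct {
    int     nloc;         /* number of local elements (n/p)             */
    double *local;        /* x^{local}: elements owned by this node     */
    int     nshared;      /* number of elements needed from neighbors   */
    double *shared;       /* x^{shared}: received elements (compressed) */
    int    *shared_cols;  /* global column indices of these elements    */
} dist_vector;
\end{verbatim}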
+ +Therefore, before computing the SpMV multiplication\index{SpMV~multiplication}, the neighboring +nodes\index{Neighboring~node} over the GPU cluster must exchange between them the shared vector +elements necessary to compute this multiplication. First, each computing node determines, in its +local sub-vector, the vector elements needed by other nodes. Then, the neighboring nodes exchange +between them these shared vector elements. The data exchanges are implemented by using the MPI +point-to-point communication routines: blocking\index{MPI~subroutines!Blocking} sends with \verb+MPI_Send()+ +and nonblocking\index{MPI~subroutines!Nonblocking} receives with \verb+MPI_Irecv()+. Figure~\ref{ch12:fig:02} +shows an example of data exchanges between \textit{Node 1} and its neighbors \textit{Node 0}, \textit{Node 2} +and \textit{Node 3}. In this example, the iterate matrix $A$ split between these four computing +nodes is that presented in Figure~\ref{ch12:fig:01}. \begin{figure} \centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/compress}} \caption{Data exchanges between \textit{Node 1} and its neighbors \textit{Node 0}, \textit{Node 2} and \textit{Node 3}.} -\label{fig:02} +\label{ch12:fig:02} \end{figure} -After the synchronization operation, the computing nodes receive, from their respective neighbors, the shared elements in a sub-vector -stored in a compressed format. However, in order to compute the SpMV multiplication, the computing nodes operate on sparse global vectors -(see Figure~\ref{fig:02}). In this case, the received vector elements must be copied to the corresponding indices in the global vector. -So as not to need to perform this at each iteration, we propose to reorder the columns of each sub-matrix $\{A_i\}_{0\leq i<p}$, so that -the shared sub-vectors could be used in their compressed storage formats. Figure~\ref{fig:03} shows a reordering of a sparse sub-matrix -(sub-matrix of \textit{Node 1}). +After the synchronization operation, the computing nodes receive, from their respective neighbors, +the shared elements in a sub-vector stored in a compressed format. However, in order to compute the +SpMV multiplication, the computing nodes operate on sparse global vectors (see Figure~\ref{ch12:fig:02}). +In this case, the received vector elements must be copied to the corresponding indices in the global +vector. So as not to need to perform this at each iteration, we propose to reorder the columns of +each sub-matrix $\{A_i\}_{0\leq i<p}$, so that the shared sub-vectors could be used in their compressed +storage formats. Figure~\ref{ch12:fig:03} shows a reordering of a sparse sub-matrix (sub-matrix of +\textit{Node 1}). \begin{figure} -\centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/reorder}} +\centerline{\includegraphics[scale=0.35]{Chapters/chapter12/figures/reorder}} \caption{Columns reordering of a sparse sub-matrix.} -\label{fig:03} +\label{ch12:fig:03} \end{figure} -A GPU cluster is a parallel platform with a distributed memory. So, the synchronizations and communication data between GPU nodes are -carried out by passing messages. However, GPUs can not communicate between them in direct way. Then, CPUs via MPI processes are in charge -of the synchronizations within the GPU cluster. Consequently, the vector elements to be exchanged must be copied from the GPU memory -to the CPU memory and vice-versa before and after the synchronization operation between CPUs. 
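A possible sequence of operations for one such synchronization is sketched below, using the CUBLAS copy routines and the MPI point-to-point routines cited above; the buffer names, neighbor list and counts are illustrative assumptions, and the elements to send are assumed to be already packed contiguously on the GPU.

\begin{verbatim}
/* Sketch of the exchange of shared vector elements before a SpMV: copy the
 * locally owned elements needed by neighbors from GPU to CPU, exchange them
 * with MPI, then copy the received values back to the GPU. */
#include <mpi.h>
#include "cublas.h"

void exchange_shared(int nb_neigh, const int neigh[],
                     const int send_count[], double *d_send, double *h_send,
                     const int recv_count[], double *h_recv, double *d_recv)
{
    MPI_Request req[32];               /* assumes at most 32 neighbors */
    int s_off = 0, r_off = 0;

    /* shared elements to send: GPU -> CPU */
    for (int k = 0; k < nb_neigh; k++) s_off += send_count[k];
    cublasGetVector(s_off, sizeof(double), d_send, 1, h_send, 1);

    /* nonblocking receives, then blocking sends, as in the text */
    for (int k = 0; k < nb_neigh; k++) {
        MPI_Irecv(h_recv + r_off, recv_count[k], MPI_DOUBLE, neigh[k], 0,
                  MPI_COMM_WORLD, &req[k]);
        r_off += recv_count[k];
    }
    s_off = 0;
    for (int k = 0; k < nb_neigh; k++) {
        MPI_Send(h_send + s_off, send_count[k], MPI_DOUBLE, neigh[k], 0,
                 MPI_COMM_WORLD);
        s_off += send_count[k];
    }
    MPI_Waitall(nb_neigh, req, MPI_STATUSES_IGNORE);

    /* received shared elements: CPU -> GPU */
    cublasSetVector(r_off, sizeof(double), h_recv, 1, d_recv, 1);
}
\end{verbatim}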
We have used the CBLAS communication subroutines
-to perform the data transfers between a CPU core and its GPU: \verb+cublasGetVector()+ and \verb+cublasSetVector()+. Finally, in addition
-to the data exchanges, GPU nodes perform reduction operations to compute in parallel the dot products and Euclidean norms. This is implemented
-by using the MPI global communication \verb+MPI_Allreduce()+.
+A GPU cluster\index{GPU~cluster} is a parallel platform with a distributed memory. So, the synchronizations
+and the data exchanges between GPU nodes are carried out by message passing. However, GPUs cannot communicate
+with each other directly. Therefore, the CPUs, via MPI processes, are in charge of the synchronizations within the GPU
+cluster. Consequently, the vector elements to be exchanged must be copied from the GPU memory to the CPU memory
+and vice-versa before and after the synchronization operation between CPUs. We have used the CUBLAS\index{CUBLAS}
+communication subroutines to perform the data transfers between a CPU core and its GPU: \verb+cublasGetVector()+
+and \verb+cublasSetVector()+. Finally, in addition to the data exchanges, GPU nodes perform reduction operations
+to compute in parallel the dot products and Euclidean norms. This is implemented by using the MPI global communication\index{MPI~subroutines!Global}
+\verb+MPI_Allreduce()+.
+
 %%--------------------------%%
 %% SECTION 4 %%
 %%--------------------------%%
 \section{Experimental results}
-\label{sec:04}
-In this section, we present the performances of the parallel CG and GMRES linear solvers obtained on a cluster of $12$ GPUs. Indeed, this GPU
-cluster of tests is composed of six machines connected by $20$Gbps InfiniBand network. Each machine is a Quad-Core Xeon E5530 CPU running at
-$2.4$GHz and providing $12$GB of RAM with a memory bandwidth of $25.6$GB/s. In addition, two Tesla C1060 GPUs are connected to each machine via
-a PCI-Express 16x Gen 2.0 interface with a throughput of $8$GB/s. A Tesla C1060 GPU contains $240$ cores running at $1.3$GHz and providing a
-global memory of $4$GB with a memory bandwidth of $102$GB/s. Figure~\ref{fig:04} shows the general scheme of the GPU cluster that we used in
-the experimental tests.
+\label{ch12:sec:04}
+In this section, we present the performances of the parallel CG and GMRES linear solvers obtained
+on a cluster of $12$ GPUs. This test GPU cluster is composed of six machines connected by a
+$20$Gbps InfiniBand network. Each machine is a Quad-Core Xeon E5530 CPU running at $2.4$GHz and
+providing $12$GB of RAM with a memory bandwidth of $25.6$GB/s. In addition, two Tesla C1060 GPUs are
+connected to each machine via a PCI-Express 16x Gen 2.0 interface with a throughput of $8$GB/s. A
+Tesla C1060 GPU contains $240$ cores running at $1.3$GHz and providing a global memory of $4$GB with
+a memory bandwidth of $102$GB/s. Figure~\ref{ch12:fig:04} shows the general scheme of the GPU cluster\index{GPU~cluster}
+that we used in the experimental tests.

 \begin{figure}
 \centerline{\includegraphics[scale=0.25]{Chapters/chapter12/figures/cluster}}
 \caption{General scheme of the GPU cluster of tests composed of six machines, each with two GPUs.}
-\label{fig:04}
+\label{ch12:fig:04}
 \end{figure}

-Linux cluster version 2.6.39 OS is installed on CPUs. C programming language is used for coding the parallel algorithms of both methods on the
-GPU cluster. 
CUDA version 4.0~\cite{ref9} is used for programming GPUs, using CUBLAS library~\cite{ref6} to deal with vector operations in GPUs -and, finally, MPI routines of OpenMPI 1.3.3 are used to carry out the communications between CPU cores. Indeed, the experiments are done on a -cluster of $12$ computing nodes, where each node is managed by a MPI process and it is composed of one CPU core and one GPU card. - -All tests are made on double-precision floating point operations. The parameters of both linear solvers are initialized as follows: the residual -tolerance threshold $\varepsilon=10^{-12}$, the maximum number of iterations $maxiter=500$, the right-hand side $b$ is filled with $1.0$ and the -initial guess $x_0$ is filled with $0.0$. In addition, we limited the Arnoldi process used in the GMRES method to $16$ iterations ($m=16$). For -the sake of simplicity, we have chosen the preconditioner $M$ as the main diagonal of the sparse matrix $A$. Indeed, it allows to easily compute -the required inverse matrix $M^{-1}$ and it provides a relatively good preconditioning for not too ill-conditioned matrices. In the GPU computing, -the size of thread blocks is fixed to $512$ threads. Finally, the performance results, presented hereafter, are obtained from the mean value over -$10$ executions of the same parallel linear solver and for the same input data. - -To get more realistic results, we tested the CG and GMRES algorithms on sparse matrices of the Davis's collection~\cite{ref10}, that arise in a wide -spectrum of real-world applications. We chose six symmetric sparse matrices and six nonsymmetric ones from this collection. In Figure~\ref{fig:05}, -we show structures of these matrices and in Table~\ref{tab:01} we present their main characteristics which are the number of rows, the total number -of nonzero values (nnz) and the maximal bandwidth. In the present chapter, the bandwidth of a sparse matrix is defined as the number of matrix columns -separating the first and the last nonzero value on a matrix row. +Linux cluster version 2.6.39 OS is installed on CPUs. C programming language is used for coding +the parallel algorithms of both methods on the GPU cluster. CUDA version 4.0~\cite{ch12:ref9} +is used for programming GPUs, using CUBLAS library~\cite{ch12:ref6} to deal with vector operations +in GPUs and, finally, MPI routines of OpenMPI 1.3.3 are used to carry out the communications between +CPU cores. Indeed, the experiments are done on a cluster of $12$ computing nodes, where each node +is managed by a MPI process and it is composed of one CPU core and one GPU card. + +All tests are made on double-precision floating point operations. The parameters of both linear +solvers are initialized as follows: the residual tolerance threshold $\varepsilon=10^{-12}$, the +maximum number of iterations $maxiter=500$, the right-hand side $b$ is filled with $1.0$ and the +initial guess $x_0$ is filled with $0.0$. In addition, we limited the Arnoldi process\index{Iterative~method!Arnoldi~process} +used in the GMRES method to $16$ iterations ($m=16$). For the sake of simplicity, we have chosen +the preconditioner $M$ as the main diagonal of the sparse matrix $A$. Indeed, it allows to easily +compute the required inverse matrix $M^{-1}$ and it provides a relatively good preconditioning for +not too ill-conditioned matrices. In the GPU computing, the size of thread blocks is fixed to $512$ +threads. 
Finally, the performance results, presented hereafter, are obtained from the mean value
+over $10$ executions of the same parallel linear solver and for the same input data.
+
+To get more realistic results, we tested the CG and GMRES algorithms on sparse matrices of the Davis's
+collection~\cite{ch12:ref10}, which arise in a wide spectrum of real-world applications. We chose six
+symmetric sparse matrices and six nonsymmetric ones from this collection. In Figure~\ref{ch12:fig:05},
+we show the structures of these matrices, and in Table~\ref{ch12:tab:01} we present their main characteristics,
+which are the number of rows, the total number of nonzero values (nnz) and the maximal bandwidth. In
+the present chapter, the bandwidth of a sparse matrix is defined as the number of matrix columns separating
+the first and the last nonzero value on a matrix row.

 \begin{figure}
 \centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/matrices}}
 \caption{Sketches of sparse matrices chosen from the Davis's collection.}
-\label{fig:05}
+\label{ch12:fig:05}
 \end{figure}

 \begin{table}
@@ -499,7 +582,7 @@ separating the first and the last nonzero value on a matrix row.
 \end{tabular}
 \vspace{0.5cm}
 \caption{Main characteristics of sparse matrices chosen from the Davis's collection.}
-\label{tab:01}
+\label{ch12:tab:01}
 \end{table}

 \begin{table}
@@ -521,7 +604,7 @@ shallow\_water2 & $0.017s$ & $0.010s$ & $1.68$ & $
 thermal2 & $1.172s$ & $0.622s$ & $1.88$ & $15$ & $5.11e$-$09$ & $3.33e$-$16$ \\ \hline
 \end{tabular}
 \caption{Performances of the parallel CG method on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.}
-\label{tab:02}
+\label{ch12:tab:02}
 \end{center}
 \end{table}

@@ -556,49 +639,63 @@ poli\_large & $0.097s$ & $0.095s$ & $1.02$ & $
 torso3 & $4.242s$ & $2.030s$ & $2.09$ & $175$ & $2.69e$-$10$ & $1.78e$-$14$ \\ \hline
 \end{tabular}
 \caption{Performances of the parallel GMRES method on a cluster 24 CPU cores vs. on cluster of 12 GPUs.}
-\label{tab:03}
+\label{ch12:tab:03}
 \end{center}
 \end{table}

-Tables~\ref{tab:02} and~\ref{tab:03} shows the performances of the parallel CG and GMRES solvers, respectively, for solving linear systems associated to
-the sparse matrices presented in Tables~\ref{tab:01}. They allow to compare the performances obtained on a cluster of $24$ CPU cores and on a cluster
-of $12$ GPUs. However, Table~\ref{tab:02} shows only the performances of solving symmetric sparse linear systems, due to the inability of the CG method
-to solve the nonsymmetric systems. In both tables, the second and third columns give, respectively, the execution times in seconds obtained on $24$ CPU
-cores ($Time_{gpu}$) and that obtained on $12$ GPUs ($Time_{gpu}$). Moreover, we take into account the relative gains $\tau$ of a solver implemented on the
-GPU cluster compared to the same solver implemented on the CPU cluster. The relative gains, presented in the fourth column, are computed as a ratio of
-the CPU execution time over the GPU execution time:
+Tables~\ref{ch12:tab:02} and~\ref{ch12:tab:03} show the performances of the parallel
+CG and GMRES solvers, respectively, for solving linear systems associated to the sparse
+matrices presented in Table~\ref{ch12:tab:01}. They allow us to compare the performances
+obtained on a cluster of $24$ CPU cores and on a cluster of $12$ GPUs. However, Table~\ref{ch12:tab:02}
+shows only the performances of solving symmetric sparse linear systems, due to the inability
+of the CG method to solve the nonsymmetric systems. 
In both tables, the second and third
+columns give, respectively, the execution times in seconds obtained on $24$ CPU cores
+($Time_{cpu}$) and those obtained on $12$ GPUs ($Time_{gpu}$). Moreover, we take into account
+the relative gains $\tau$ of a solver implemented on the GPU cluster compared to the same
+solver implemented on the CPU cluster. The relative gains\index{Relative~gain}, presented
+in the fourth column, are computed as the ratio of the CPU execution time over the GPU
+execution time:
 \begin{equation}
   \tau = \frac{Time_{cpu}}{Time_{gpu}}.
-\label{eq:20}
+\label{ch12:eq:20}
 \end{equation}
-In addition, Tables~\ref{tab:02} and~\ref{tab:03} give the number of iterations ($iter$), the precision $prec$ of the solution computed on the GPU cluster
-and the difference $\Delta$ between the solution computed on the CPU cluster and that computed on the GPU cluster. Both parameters $prec$ and $\Delta$
-allow to validate and verify the accuracy of the solution computed on the GPU cluster. We have computed them as follows:
+In addition, Tables~\ref{ch12:tab:02} and~\ref{ch12:tab:03} give the number of iterations
+($iter$), the precision $prec$ of the solution computed on the GPU cluster and the difference
+$\Delta$ between the solution computed on the CPU cluster and that computed on the GPU cluster.
+Both parameters $prec$ and $\Delta$ allow us to validate and verify the accuracy of the solution
+computed on the GPU cluster. We have computed them as follows:
 \begin{eqnarray}
 \Delta = max|x^{cpu}-x^{gpu}|,\\
 prec = max|M^{-1}r^{gpu}|,
 \end{eqnarray}
-where $\Delta$ is the maximum vector element, in absolute value, of the difference between the two solutions $x^{cpu}$ and $x^{gpu}$ computed, respectively,
-on CPU and GPU clusters and $prec$ is the maximum element, in absolute value, of the residual vector $r^{gpu}\in\mathbb{R}^{n}$ of the solution $x^{gpu}$.
-Thus, we can see that the solutions obtained on the GPU cluster were computed with a sufficient accuracy (about $10^{-10}$) and they are, more or less,
-equivalent to those computed on the CPU cluster with a small difference ranging from $10^{-10}$ to $10^{-26}$. However, we can notice from the relative
-gains $\tau$ that is not interesting to use multiple GPUs for solving small sparse linear systems. in fact, a small sparse matrix does not allow to maximize
-utilization of GPU cores. In addition, the communications required to synchronize the computations over the cluster increase the idle times of GPUs and
-slow down further the parallel computations.
-
-Consequently, in order to test the performances of the parallel solvers, we developed in C programming language a generator of large sparse matrices.
-This generator takes a matrix from the Davis's collection~\cite{ref10} as an initial matrix to construct large sparse matrices exceeding ten million
-of rows. It must be executed in parallel by the MPI processes of the computing nodes, so that each process could construct its sparse sub-matrix. In
-first experimental tests, we are focused on sparse matrices having a banded structure, because they are those arise in the most of numerical problems.
-So to generate the global sparse matrix, each MPI process constructs its sub-matrix by performing several copies of an initial sparse matrix chosen
-from the Davis's collection. Then, it puts all these copies on the main diagonal of the global matrix (see Figure~\ref{fig:06}). 
Moreover, the empty
-spaces between two successive copies in the main diagonal are filled with sub-copies (left-copy and right-copy in Figure~\ref{fig:06}) of the same
+where $\Delta$ is the maximum vector element, in absolute value, of the difference between
+the two solutions $x^{cpu}$ and $x^{gpu}$ computed, respectively, on CPU and GPU clusters and
+$prec$ is the maximum element, in absolute value, of the residual vector $r^{gpu}\in\mathbb{R}^{n}$
+of the solution $x^{gpu}$. Thus, we can see that the solutions obtained on the GPU cluster
+were computed with a sufficient accuracy (about $10^{-10}$) and they are, more or less, equivalent
+to those computed on the CPU cluster with a small difference ranging from $10^{-10}$ to $10^{-26}$.
+However, we can notice from the relative gains $\tau$ that it is not interesting to use multiple
+GPUs for solving small sparse linear systems. In fact, a small sparse matrix does not allow the
+GPU cores to be fully utilized. In addition, the communications required to synchronize the
+computations over the cluster increase the idle times of GPUs and slow down further the parallel
+computations.
+
+Consequently, in order to test the performances of the parallel solvers, we developed a generator
+of large sparse matrices in the C programming language. This generator takes a matrix from the Davis's collection~\cite{ch12:ref10}
+as an initial matrix to construct large sparse matrices exceeding ten million rows. It must be executed
+in parallel by the MPI processes of the computing nodes, so that each process can construct its own sparse
+sub-matrix. In these first experimental tests, we focused on sparse matrices having a banded structure,
+because they arise in most numerical problems. So, to generate the global sparse matrix,
+each MPI process constructs its sub-matrix by performing several copies of an initial sparse matrix chosen
+from the Davis's collection. Then, it puts all these copies on the main diagonal of the global matrix
+(see Figure~\ref{ch12:fig:06}). Moreover, the empty spaces between two successive copies in the main
+diagonal are filled with sub-copies (left-copy and right-copy in Figure~\ref{ch12:fig:06}) of the same
 initial matrix.

 \begin{figure}
 \centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/generation}}
 \caption{Parallel generation of a large sparse matrix by four computing nodes.}
-\label{fig:06}
+\label{ch12:fig:06}
 \end{figure}

 \begin{table}[!h]
@@ -633,17 +730,21 @@ initial matrix.
 \end{tabular}
 \vspace{0.5cm}
 \caption{Main characteristics of sparse banded matrices generated from those of the Davis's collection.}
-\label{tab:04}
+\label{ch12:tab:04}
 \end{table}

-We have used the parallel CG and GMRES algorithms for solving sparse linear systems of $25$ million unknown values. The sparse matrices associated
-to these linear systems are generated from those presented in Table~\ref{tab:01}. Their main characteristics are given in Table~\ref{tab:04}. Tables~\ref{tab:05}
-and~\ref{tab:06} shows the performances of the parallel CG and GMRES solvers, respectively, obtained on a cluster of $24$ CPU cores and on a cluster
-of $12$ GPUs. Obviously, we can notice from these tables that solving large sparse linear systems on a GPU cluster is more efficient than on a CPU
-cluster (see relative gains $\tau$). We can also notice that the execution times of the CG method, whether in a CPU cluster or on a GPU cluster, are
-better than those the GMRES method for solving large symmetric linear systems. 
In fact, the CG method is characterized by a better convergence rate
-and a shorter execution time of an iteration than those of the GMRES method. Moreover, an iteration of the parallel GMRES method requires more data
-exchanges between computing nodes compared to the parallel CG method.
+We have used the parallel CG and GMRES algorithms for solving sparse linear systems of $25$
+million unknown values. The sparse matrices associated to these linear systems are generated
+from those presented in Table~\ref{ch12:tab:01}. Their main characteristics are given in Table~\ref{ch12:tab:04}.
+Tables~\ref{ch12:tab:05} and~\ref{ch12:tab:06} show the performances of the parallel CG and
+GMRES solvers, respectively, obtained on a cluster of $24$ CPU cores and on a cluster of $12$
+GPUs. Obviously, we can notice from these tables that solving large sparse linear systems on
+a GPU cluster is more efficient than on a CPU cluster (see relative gains $\tau$). We can also
+notice that the execution times of the CG method, whether on a CPU cluster or on a GPU cluster,
+are better than those of the GMRES method for solving large symmetric linear systems. In fact, the
+CG method is characterized by a better convergence\index{Convergence} rate and a shorter execution
+time of an iteration than those of the GMRES method. Moreover, an iteration of the parallel GMRES
+method requires more data exchanges between computing nodes compared to the parallel CG method.

 \begin{table}[!h]
 \begin{center}
@@ -665,7 +766,7 @@ thermal2 & $1.411s$ & $0.244s$ & $5.78$
 \end{tabular}
 \caption{Performances of the parallel CG method for solving linear systems associated to sparse banded matrices
on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.}
-\label{tab:05}
+\label{ch12:tab:05}
 \end{center}
 \end{table}

@@ -701,7 +802,7 @@ torso3 & $31.463s$ & $3.681s$ & $8.55$
 \end{tabular}
 \caption{Performances of the parallel GMRES method for solving linear systems associated to sparse banded matrices
on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.}
-\label{tab:06}
+\label{ch12:tab:06}
 \end{center}
 \end{table}

@@ -710,14 +811,15 @@
 %%--------------------------%%
 %% SECTION 5 %%
 %%--------------------------%%
 \section{Hypergraph partitioning}
-\label{sec:05}
-In this section, we present the performances of both parallel CG and GMRES solvers for solving linear systems associated to sparse matrices having
-large bandwidths. Indeed, we are interested on sparse matrices having the nonzero values distributed along their bandwidths.
+\label{ch12:sec:05}
+In this section, we present the performances of both parallel CG and GMRES solvers for solving linear
+systems associated to sparse matrices having large bandwidths. Indeed, we are interested in sparse
+matrices having the nonzero values distributed along their bandwidths.

 \begin{figure}
 \centerline{\includegraphics[scale=0.22]{Chapters/chapter12/figures/generation_1}}
 \caption{Parallel generation of a large sparse five-bands matrix by four computing nodes.}
-\label{fig:07}
+\label{ch12:fig:07}
 \end{figure}

 \begin{table}[!h]
@@ -751,15 +853,20 @@ large bandwidths. 
Indeed, we are interested on sparse matrices having the nonzer & torso3 & $872,029,998$ & $25,000,000$\\ \hline \end{tabular} \caption{Main characteristics of sparse five-bands matrices generated from those of the Davis's collection.} -\label{tab:07} +\label{ch12:tab:07} \end{center} \end{table} -We have developed in C programming language a generator of large sparse matrices having five bands distributed along their bandwidths (see Figure~\ref{fig:07}). -The principle of this generator is equivalent to that in Section~\ref{sec:04}. However, the copies performed on the initial matrix (chosen from the -Davis's collection) are placed on the main diagonal and on four off-diagonals, two on the right and two on the left of the main diagonal. Figure~\ref{fig:07} -shows an example of a generation of a sparse five-bands matrix by four computing nodes. Table~\ref{tab:07} shows the main characteristics of sparse -five-bands matrices generated from those presented in Table~\ref{tab:01} and associated to linear systems of $25$ million unknown values. +We have developed in C programming language a generator of large sparse matrices +having five bands distributed along their bandwidths (see Figure~\ref{ch12:fig:07}). +The principle of this generator is equivalent to that in Section~\ref{ch12:sec:04}. +However, the copies performed on the initial matrix (chosen from the Davis's collection) +are placed on the main diagonal and on four off-diagonals, two on the right and two +on the left of the main diagonal. Figure~\ref{ch12:fig:07} shows an example of a +generation of a sparse five-bands matrix by four computing nodes. Table~\ref{ch12:tab:07} +shows the main characteristics of sparse five-bands matrices generated from those +presented in Table~\ref{ch12:tab:01} and associated to linear systems of $25$ million +unknown values. \begin{table}[!h] \begin{center} @@ -781,7 +888,7 @@ thermal2 & $2.549s$ & $1.705s$ & $1.49$ & $14$ & $2.36e$-$ \end{tabular} \caption{Performances of parallel CG solver for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs} -\label{tab:08} +\label{ch12:tab:08} \end{center} \end{table} @@ -817,26 +924,33 @@ torso3 & $49.074s$ & $19.397s$ & $2.53$ & $175$ & $2.69e$- \end{tabular} \caption{Performances of parallel GMRES solver for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs} -\label{tab:09} +\label{ch12:tab:09} \end{center} \end{table} -Tables~\ref{tab:08} and~\ref{tab:09} shows the performaces of the parallel CG and GMRES solvers, respectively, obtained on -a cluster of $24$ CPU cores and on a cluster of $12$ GPUs. The linear systems solved in these tables are associated to the -sparse five-bands matrices presented on Table~\ref{tab:07}. We can notice from both Tables~\ref{tab:08} and~\ref{tab:09} that -using a GPU cluster is not efficient for solving these kind of sparse linear systems. We can see that the execution times obtained -on the GPU cluster are almost equivalent to those obtained on the CPU cluster (see the relative gains presented in column~$4$ -of each table). This is due to the large number of communications necessary to synchronize the computations over the cluster. -Indeed, the naive partitioning, row-by-row or column-by-column, of sparse matrices having large bandwidths can link a computing -node to many neighbors and then generate a large number of data dependencies between these computing nodes in the cluster. 
-

-Therefore, we have chosen to use a hypergraph partitioning method, which is well-suited to numerous kinds of sparse matrices~\cite{ref11}.
-Indeed, it can well model the communications between the computing nodes, particularly in the case of nonsymmetric and irregular
-matrices, and it gives good reduction of the total communication volume. In contrast, it is an expensive operation in terms of
-execution time and memory space.
-
-The sparse matrix $A$ of the linear system to be solved is modeled as a hypergraph $\mathcal{H}=(\mathcal{V},\mathcal{E})$ as
-follows:
+Tables~\ref{ch12:tab:08} and~\ref{ch12:tab:09} show the performances of the parallel
+CG and GMRES solvers, respectively, obtained on a cluster of $24$ CPU cores and on a
+cluster of $12$ GPUs. The linear systems solved in these tables are associated to the
+sparse five-bands matrices presented in Table~\ref{ch12:tab:07}. We can notice from
+both Tables~\ref{ch12:tab:08} and~\ref{ch12:tab:09} that using a GPU cluster is not
+efficient for solving these kinds of sparse linear systems\index{Sparse~linear~system}.
+We can see that the execution times obtained on the GPU cluster are almost equivalent
+to those obtained on the CPU cluster (see the relative gains presented in column~$4$
+of each table). This is due to the large number of communications necessary to synchronize
+the computations over the cluster. Indeed, the naive partitioning, row-by-row or column-by-column,
+of sparse matrices having large bandwidths can link a computing node to many neighbors
+and then generate a large number of data dependencies between these computing nodes in
+the cluster.
+
+Therefore, we have chosen to use a hypergraph partitioning method\index{Hypergraph},
+which is well-suited to numerous kinds of sparse matrices~\cite{ch12:ref11}. Indeed,
+it models well the communications between the computing nodes, particularly in the
+case of nonsymmetric and irregular matrices, and it gives a good reduction of the total
+communication volume. In contrast, it is an expensive operation in terms of execution
+time and memory space.
+
+The sparse matrix $A$ of the linear system to be solved is modeled as a hypergraph
+$\mathcal{H}=(\mathcal{V},\mathcal{E})$\index{Hypergraph} as follows:
 \begin{itemize}
 \item each matrix row $\{i\}_{0\leq i<n}$ corresponds to a vertex $v_i\in\mathcal{V}$ and,
 \item each matrix column $\{j\}_{0\leq j<n}$ corresponds to a hyperedge $e_j\in\mathcal{E}$, where:
@@ -846,55 +960,70 @@ follows:
 \item $w_i$ is the weight of vertex $v_i$ and,
 \item $c_j$ is the cost of hyperedge $e_j$.
 \end{itemize}
-A $K$-way partitioning of a hypergraph $\mathcal{H}=(\mathcal{V},\mathcal{E})$ is defined as $\mathcal{P}=\{\mathcal{V}_1,\ldots,\mathcal{V}_K\}$
-a set of pairwise disjoint non-empty subsets (or parts) of the vertex set $\mathcal{V}$, so that each subset is attributed to a computing node.
-Figure~\ref{fig:08} shows an example of the hypergraph model of a $(9\times 9)$ sparse matrix in three parts. The circles and squares correspond,
-respectively, to the vertices and hyperedges of the hypergraph. The solid squares define the cut hyperedges connecting at least two different parts.
-The connectivity $\lambda_j$ of a cut hyperedge $e_j$ denotes the number of different parts spanned by $e_j$. 
+A $K$-way partitioning of a hypergraph $\mathcal{H}=(\mathcal{V},\mathcal{E})$ is +defined as $\mathcal{P}=\{\mathcal{V}_1,\ldots,\mathcal{V}_K\}$ a set of pairwise +disjoint non-empty subsets (or parts) of the vertex set $\mathcal{V}$, so that each +subset is attributed to a computing node. Figure~\ref{ch12:fig:08} shows an example +of the hypergraph model of a $(9\times 9)$ sparse matrix in three parts. The circles +and squares correspond, respectively, to the vertices and hyperedges of the hypergraph. +The solid squares define the cut hyperedges connecting at least two different parts. +The connectivity $\lambda_j$ of a cut hyperedge $e_j$ denotes the number of different +parts spanned by $e_j$. \begin{figure} \centerline{\includegraphics[scale=0.5]{Chapters/chapter12/figures/hypergraph}} \caption{An example of the hypergraph partitioning of a sparse matrix decomposed between three computing nodes.} -\label{fig:08} +\label{ch12:fig:08} \end{figure} -The cut hyperedges model the total communication volume between the different computing nodes in the cluster, necessary to perform the parallel SpMV -multiplication. Indeed, each hyperedge $e_j$ defines a set of atomic computations $b_i\leftarrow b_i+a_{ij}x_j$, $0\leq i,j<n$, of the SpMV multiplication -$Ax=b$ that need the $j^{th}$ unknown value of solution vector $x$. Therefore, pins of hyperedge $e_j$, $pins[e_j]$, are the set of matrix rows sharing -and requiring the same unknown value $x_j$. For example in Figure~\ref{fig:08}, hyperedge $e_9$ whose pins are: $pins[e_9]=\{v_2,v_5,v_9\}$ represents the -dependency of matrix rows $2$, $5$ and $9$ to unknown $x_9$ needed to perform in parallel the atomic operations: $b_2\leftarrow b_2+a_{29}x_9$, -$b_5\leftarrow b_5+a_{59}x_9$ and $b_9\leftarrow b_9+a_{99}x_9$. However, unknown $x_9$ is the third entry of the sub-solution vector $x$ of part (or node) $3$. -So the computing node $3$ must exchange this value with nodes $1$ and $2$, which leads to perform two communications. - -The hypergraph partitioning allows to reduce the total communication volume required to perform the parallel SpMV multiplication, while maintaining the -load balancing between the computing nodes. In fact, it allows to minimize at best the following amount: +The cut hyperedges model the total communication volume between the different computing +nodes in the cluster, necessary to perform the parallel SpMV multiplication\index{SpMV~multiplication}. +Indeed, each hyperedge $e_j$ defines a set of atomic computations $b_i\leftarrow b_i+a_{ij}x_j$, +$0\leq i,j<n$, of the SpMV multiplication $Ax=b$ that need the $j^{th}$ unknown value of +solution vector $x$. Therefore, pins of hyperedge $e_j$, $pins[e_j]$, are the set of matrix +rows sharing and requiring the same unknown value $x_j$. For example in Figure~\ref{ch12:fig:08}, +hyperedge $e_9$ whose pins are: $pins[e_9]=\{v_2,v_5,v_9\}$ represents the dependency of matrix +rows $2$, $5$ and $9$ to unknown $x_9$ needed to perform in parallel the atomic operations: +$b_2\leftarrow b_2+a_{29}x_9$, $b_5\leftarrow b_5+a_{59}x_9$ and $b_9\leftarrow b_9+a_{99}x_9$. +However, unknown $x_9$ is the third entry of the sub-solution vector $x$ of part (or node) $3$. +So the computing node $3$ must exchange this value with nodes $1$ and $2$, which leads to perform +two communications. 
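+
+As an illustration of this model (and not the code used in our experiments), the
+following C sketch computes the pins of each hyperedge $e_j$, that is, the set of
+matrix rows having a nonzero value in column $j$, from a sparse matrix assumed to
+be stored in CSR format (the array names \verb+rowptr+ and \verb+colind+ are hypothetical):
+\begin{lstlisting}[language=C]
+#include <stdlib.h>
+
+/* Illustrative sketch only: pins[j] receives the rows i such that a_ij != 0,
+   i.e. the vertices connected by hyperedge e_j (one hyperedge per column). */
+void hyperedge_pins(int n, const int *rowptr, const int *colind,
+                    int **pins, int *npins)
+{
+  int *fill = calloc(n, sizeof(int));
+  for (int j = 0; j < n; j++) npins[j] = 0;
+  for (int i = 0; i < n; i++)                     /* count the pins of each column */
+    for (int k = rowptr[i]; k < rowptr[i+1]; k++)
+      npins[colind[k]]++;
+  for (int j = 0; j < n; j++) pins[j] = malloc(npins[j] * sizeof(int));
+  for (int i = 0; i < n; i++)                     /* row i requires the unknown x_j */
+    for (int k = rowptr[i]; k < rowptr[i+1]; k++) {
+      int j = colind[k];
+      pins[j][fill[j]++] = i;
+    }
+  free(fill);
+}
+\end{lstlisting}
+Up to the numbering convention, this procedure would recover the pins used in the
+example above, e.g. $pins[e_9]=\{v_2,v_5,v_9\}$.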
+ +The hypergraph partitioning\index{Hypergraph} allows to reduce the total communication volume +required to perform the parallel SpMV multiplication, while maintaining the load balancing between +the computing nodes. In fact, it allows to minimize at best the following amount: \begin{equation} \mathcal{X}(\mathcal{P})=\sum_{e_{j}\in\mathcal{E}_{C}}c_{j}(\lambda_{j}-1), \end{equation} -where $\mathcal{E}_{C}$ denotes the set of the cut hyperedges coming from the hypergraph partitioning $\mathcal{P}$ and $c_j$ and $\lambda_j$ are, respectively, -the cost and the connectivity of cut hyperedge $e_j$. Moreover, it also ensures the load balancing between the $K$ parts as follows: +where $\mathcal{E}_{C}$ denotes the set of the cut hyperedges coming from the hypergraph partitioning +$\mathcal{P}$ and $c_j$ and $\lambda_j$ are, respectively, the cost and the connectivity of cut hyperedge +$e_j$. Moreover, it also ensures the load balancing between the $K$ parts as follows: \begin{equation} W_{k}\leq (1+\epsilon)W_{avg}, \hspace{0.2cm} (1\leq k\leq K) \hspace{0.2cm} \text{and} \hspace{0.2cm} (0<\epsilon<1), \end{equation} -where $W_{k}$ is the sum of all vertex weights ($w_{i}$) in part $\mathcal{V}_{k}$, $W_{avg}$ is the average weight of all $K$ parts and $\epsilon$ is the -maximum allowed imbalanced ratio. - -The hypergraph partitioning is a NP-complete problem but software tools using heuristics are developed, for example: hMETIS~\cite{ref12}, PaToH~\cite{ref13} -and Zoltan~\cite{ref14}. Since our objective is solving large sparse linear systems, we use the parallel hypergraph partitioning which must be performed by -at least two MPI processes. It allows to accelerate the data partitioning of large sparse matrices. For this, the hypergraph $\mathcal{H}$ must be partitioned -in $p$ (number of MPI processes) sub-hypergraphs $\mathcal{H}_k=(\mathcal{V}_k,\mathcal{E}_k)$, $0\leq k<p$, and then we performed the parallel hypergraph -partitioning method using some functions of the MPI library between the $p$ processes. - -Tables~\ref{tab:10} and~\ref{tab:11} shows the performances of the parallel CG and GMRES solvers, respectively, using the hypergraph partitioning for solving -large linear systems associated to the sparse five-bands matrices presented in Table~\ref{tab:07}. For these experimental tests, we have applied the parallel -hypergraph partitioning~\cite{ref15} developed in Zoltan tool~\cite{ref14}. We have initialized the parameters of the partitioning operation as follows: +where $W_{k}$ is the sum of all vertex weights ($w_{i}$) in part $\mathcal{V}_{k}$, $W_{avg}$ is the +average weight of all $K$ parts and $\epsilon$ is the maximum allowed imbalanced ratio. + +The hypergraph partitioning is a NP-complete problem but software tools using heuristics are developed, +for example: hMETIS~\cite{ch12:ref12}, PaToH~\cite{ch12:ref13} and Zoltan~\cite{ch12:ref14}. Since our +objective is solving large sparse linear systems, we use the parallel hypergraph partitioning which must +be performed by at least two MPI processes. It allows to accelerate the data partitioning of large sparse +matrices. For this, the hypergraph $\mathcal{H}$ must be partitioned in $p$ (number of MPI processes) +sub-hypergraphs $\mathcal{H}_k=(\mathcal{V}_k,\mathcal{E}_k)$, $0\leq k<p$, and then we performed the +parallel hypergraph partitioning method using some functions of the MPI library between the $p$ processes. 
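+
+The following C sketch (given only as an illustration, under the assumption that the
+matrix is accessible by columns through hypothetical CSC arrays \verb+colptr+ and
+\verb+rowind+) shows how the connectivity metric $\mathcal{X}(\mathcal{P})$ of a $K$-way
+partition can be evaluated, \verb+part[i]+ being the part that owns matrix row $i$:
+\begin{lstlisting}[language=C]
+#include <stdlib.h>
+
+/* Illustrative sketch only: X(P) = sum over cut hyperedges of c_j * (lambda_j - 1). */
+long cut_metric(int n, int K, const int *colptr, const int *rowind,
+                const int *part, const int *cost)
+{
+  long metric = 0;
+  char *seen = calloc(K, 1);                 /* parts already counted for e_j       */
+  for (int j = 0; j < n; j++) {              /* one hyperedge e_j per matrix column */
+    int lambda = 0;                          /* connectivity of hyperedge e_j       */
+    for (int k = colptr[j]; k < colptr[j+1]; k++) {
+      int p = part[rowind[k]];
+      if (!seen[p]) { seen[p] = 1; lambda++; }
+    }
+    if (lambda > 1) metric += (long)cost[j] * (lambda - 1);
+    for (int k = colptr[j]; k < colptr[j+1]; k++)
+      seen[part[rowind[k]]] = 0;             /* reset the marks for the next column */
+  }
+  free(seen);
+  return metric;
+}
+\end{lstlisting}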
+
+Tables~\ref{ch12:tab:10} and~\ref{ch12:tab:11} show the performances of the parallel CG and GMRES solvers,
+respectively, using the hypergraph partitioning for solving large linear systems associated to the sparse
+five-bands matrices presented in Table~\ref{ch12:tab:07}. For these experimental tests, we have applied the
+parallel hypergraph partitioning~\cite{ch12:ref15} developed in the Zoltan tool~\cite{ch12:ref14}. We have initialized
+the parameters of the partitioning operation as follows:
 \begin{itemize}
 \item the weight $w_{i}$ of each vertex $v_{i}\in\mathcal{V}$ is set to the number of nonzero values on matrix row $i$,
 \item for the sake of simplicity, the cost $c_{j}$ of each hyperedge $e_{j}\in\mathcal{E}$ is fixed to $1$,
 \item the maximum allowed load imbalance ratio $\epsilon$ is limited to $10\%$.\\
 \end{itemize}

-\begin{table}[!h]
+\begin{table}
 \begin{center}
 \begin{tabular}{|c|c|c|c|c|}
 \hline
@@ -914,11 +1043,11 @@ thermal2 & $1.889s$ & $0.348s$ & $5.43$
 \end{tabular}
 \caption{Performances of the parallel CG solver using hypergraph partitioning for solving linear systems
associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPU.}
-\label{tab:10}
+\label{ch12:tab:10}
 \end{center}
 \end{table}

-\begin{table}[!h]
+\begin{table}
 \begin{center}
 \begin{tabular}{|c|c|c|c|c|}
 \hline
@@ -950,23 +1079,30 @@ torso3 & $48.459s$ & $6.234s$ & $7.77$
 \end{tabular}
 \caption{Performances of the parallel GMRES solver using hypergraph partitioning for solving linear systems
associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPU.}
-\label{tab:11}
+\label{ch12:tab:11}
 \end{center}
 \end{table}

-We can notice from both Tables~\ref{tab:10} and~\ref{tab:11} that the hypergraph partitioning has improved the performances of both
-parallel CG and GMRES algorithms. The execution times
-on the GPU cluster of both parallel solvers are significantly improved compared to those obtained by using the partitioning row-by-row.
-For these examples of sparse matrices, the execution times of CG and GMRES solvers are reduced about $76\%$ and $65\%$ respectively
-(see column~$5$ of each table) compared to those obtained in Tables~\ref{tab:08} and~\ref{tab:09}.
-
-In fact, the hypergraph partitioning applied to sparse matrices having large bandwidths allows to reduce the total communication volume
-necessary to synchronize the computations between the computing nodes in the GPU cluster. Table~\ref{tab:12} presents, for each sparse
-matrix, the total communication volume between $12$ GPU computing nodes obtained by using the partitioning row-by-row (column~$2$), the
-total communication volume obtained by using the hypergraph partitioning (column~$3$) and the execution times in minutes of the hypergraph
-partitioning operation performed by $12$ MPI processes (column~$4$). The total communication volume defines the total number of the vector
-elements exchanged by the computing nodes. Then, Table~\ref{tab:12} shows that the hypergraph partitioning method can split the sparse
-matrix so as to minimize the data dependencies between the computing nodes and thus to reduce the total communication volume.
+We can notice from both Tables~\ref{ch12:tab:10} and~\ref{ch12:tab:11} that the
+hypergraph partitioning has improved the performances of both parallel CG and GMRES
+algorithms. The execution times on the GPU cluster of both parallel solvers are
+significantly improved compared to those obtained by using the partitioning row-by-row. 
+For these examples of sparse matrices, the execution times of CG and GMRES solvers +are reduced about $76\%$ and $65\%$ respectively (see column~$5$ of each table) +compared to those obtained in Tables~\ref{ch12:tab:08} and~\ref{ch12:tab:09}. + +In fact, the hypergraph partitioning\index{Hypergraph} applied to sparse matrices +having large bandwidths allows to reduce the total communication volume necessary +to synchronize the computations between the computing nodes in the GPU cluster. +Table~\ref{ch12:tab:12} presents, for each sparse matrix, the total communication +volume between $12$ GPU computing nodes obtained by using the partitioning row-by-row +(column~$2$), the total communication volume obtained by using the hypergraph partitioning +(column~$3$) and the execution times in minutes of the hypergraph partitioning +operation performed by $12$ MPI processes (column~$4$). The total communication +volume defines the total number of the vector elements exchanged by the computing +nodes. Then, Table~\ref{ch12:tab:12} shows that the hypergraph partitioning method +can split the sparse matrix so as to minimize the data dependencies between the +computing nodes and thus to reduce the total communication volume. \begin{table} \begin{center} @@ -1002,45 +1138,50 @@ poli\_large & $25,053,554$ & $7,388,883$ torso3 & $25,682,514$ & $613,250$ & $61.51$ \\ \hline \end{tabular} \caption{The total communication volume between 12 GPU computing nodes without and with the hypergraph partitioning method.} -\label{tab:12} +\label{ch12:tab:12} \end{center} \end{table} -Nevertheless, as we can see from the fourth column of Table~\ref{tab:12}, the hypergraph partitioning takes longer compared -to the execution times of the resolutions. As previously mentioned, the hypergraph partitioning method is less efficient in -terms of memory consumption and partitioning time than its graph counterpart, but the hypergraph well models the nonsymmetric -and irregular problems. So for the applications which often use the same sparse matrices, we can perform the hypergraph partitioning -on these matrices only once for each and then, we save the traces of these partitionings in files to be reused several times. -Therefore, this allows to avoid the partitioning of the sparse matrices at each resolution of the linear systems. - -\begin{figure}[!t] +Nevertheless, as we can see from the fourth column of Table~\ref{ch12:tab:12}, +the hypergraph partitioning takes longer compared to the execution times of the +resolutions. As previously mentioned, the hypergraph partitioning method is less +efficient in terms of memory consumption and partitioning time than its graph +counterpart, but the hypergraph well models the nonsymmetric and irregular problems. +So for the applications which often use the same sparse matrices, we can perform +the hypergraph partitioning on these matrices only once for each and then, we save +the traces of these partitionings in files to be reused several times. Therefore, +this allows to avoid the partitioning of the sparse matrices at each resolution +of the linear systems. 
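+
+As a minimal sketch of this reuse mechanism (the file format and function names below
+are hypothetical, not those of our implementation), the part assigned to each matrix
+row by the hypergraph partitioning can simply be dumped to a file once and read back
+at each subsequent resolution of a linear system based on the same sparse matrix:
+\begin{lstlisting}[language=C]
+#include <stdio.h>
+
+/* Illustrative sketch only: save/reload the partitioning trace (part owning each row). */
+int save_partition(const char *filename, int n, const int *part)
+{
+  FILE *f = fopen(filename, "wb");
+  if (!f) return -1;
+  fwrite(&n, sizeof(int), 1, f);
+  fwrite(part, sizeof(int), n, f);
+  fclose(f);
+  return 0;
+}
+
+int load_partition(const char *filename, int n, int *part)
+{
+  FILE *f = fopen(filename, "rb");
+  int m = 0;
+  if (!f) return -1;
+  if (fread(&m, sizeof(int), 1, f) != 1 || m != n ||
+      fread(part, sizeof(int), n, f) != (size_t)n) { fclose(f); return -1; }
+  fclose(f);
+  return 0;
+}
+\end{lstlisting}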
+
+\begin{figure}[!h]
 \centering
-\begin{tabular}{c}
-\includegraphics[scale=0.7]{Chapters/chapter12/figures/scale_band} \\
-\small{(a) Sparse band matrices} \\
-\\
-\includegraphics[scale=0.7]{Chapters/chapter12/figures/scale_5band} \\
-\small{(b) Sparse five-bands matrices}
-\end{tabular}
+ \mbox{\subfigure[Sparse band matrices]{\includegraphics[scale=0.7]{Chapters/chapter12/figures/scale_band}\label{ch12:fig:09.01}}}
+\vfill
+ \mbox{\subfigure[Sparse five-bands matrices]{\includegraphics[scale=0.7]{Chapters/chapter12/figures/scale_5band}\label{ch12:fig:09.02}}}
 \caption{Weak-scaling of the parallel CG and GMRES solvers on a GPU cluster for solving large sparse linear systems.}
-\label{fig:09}
+\label{ch12:fig:09}
 \end{figure}

-However, the most important performance parameter is the scalability of the parallel CG and GMRES solvers on a GPU cluster.
-Particularly, we have taken into account the weak-scaling of both parallel algorithms on a cluster of one to 12 GPU computing
-nodes. We have performed a set of experiments on both matrix structures: band matrices and five-bands matrices. The
-sparse matrices of tests are generated from the symmetric sparse matrix {\it thermal2} chosen from the Davis's collection.
-Figures~\ref{fig:09}-$(a)$ and~\ref{fig:09}-$(b)$ show the execution times of both parallel methods for solving large linear
-systems associated to band matrices and those associated to five-bands matrices, respectively. The size of a sparse sub-matrix
-per computing node, for each matrix structure, is fixed as follows:
+However, the most important performance parameter is the scalability of the parallel
+CG\index{Iterative~method!CG} and GMRES\index{Iterative~method!GMRES} solvers on a GPU
+cluster. In particular, we have taken into account the weak-scaling of both parallel
+algorithms on a cluster of one to 12 GPU computing nodes. We have performed a set of
+experiments on both matrix structures: band matrices and five-bands matrices. The sparse
+matrices used in these tests are generated from the symmetric sparse matrix {\it thermal2} chosen
+from the Davis's collection. Figures~\ref{ch12:fig:09.01} and~\ref{ch12:fig:09.02}
+show the execution times of both parallel methods for solving large linear systems
+associated to band matrices and those associated to five-bands matrices, respectively.
+The size of a sparse sub-matrix per computing node, for each matrix structure, is fixed
+as follows:
 \begin{itemize}
 \item band matrix: $15$ million rows and $105,166,557$ nonzero values,
 \item five-bands matrix: $5$ million rows and $78,714,492$ nonzero values.
 \end{itemize}
-We can see from these figures that both parallel solvers are quite scalable on a GPU cluster. Indeed, the execution times remains
-almost constant while the size of the sparse linear systems to be solved increases proportionally with the number of
-the GPU computing nodes. This means that the communication cost is relatively constant regardless of the number the computing nodes
-in the GPU cluster.
+We can see from these figures that both parallel solvers are quite scalable on a GPU
+cluster. Indeed, the execution times remain almost constant while the size of the
+sparse linear systems to be solved increases proportionally with the number of the
+GPU computing nodes. This means that the communication cost is relatively constant
+regardless of the number of computing nodes in the GPU cluster.

@@ -1048,31 +1189,44 @@ in the GPU cluster. 
%%--------------------------%%
 %% SECTION 6 %%
 %%--------------------------%%
 \section{Conclusion}
-\label{sec:06}
-In this chapter, we have aimed at harnessing the computing power of a cluster of GPUs for solving large sparse linear systems.
-For this, we have used two Krylov sub-space iterative methods: the CG and GMRES methods. The first method is well-known to its
-efficiency for solving symmetric linear systems and the second one is used, particularly, for solving nonsymmetric linear systems.
-
-We have presentend the parallel implementation of both iterative methods on a GPU cluster. Particularly, the operations dealing with
-the vectors and/or matrices, of these methods, are parallelized between the different GPU computing nodes of the cluster. Indeed,
-the data-parallel vector operations are accelerated by GPUs and the communications required to synchronize the parallel computations
-are carried out by CPU cores. For this, we have used a heterogeneous CUDA/MPI programming to implement the parallel iterative
+\label{ch12:sec:06}
+In this chapter, we have aimed at harnessing the computing power of a
+cluster of GPUs for solving large sparse linear systems. For this, we
+have used two Krylov subspace iterative methods: the CG and GMRES methods.
+The first method is well known for its efficiency in solving symmetric
+linear systems and the second one is used, particularly, for solving
+nonsymmetric linear systems.
+
+We have presented the parallel implementation of both iterative methods
+on a GPU cluster. In particular, the operations of these methods dealing
+with the vectors and/or matrices are parallelized between the different
+GPU computing nodes of the cluster. Indeed, the data-parallel vector operations
+are accelerated by GPUs and the communications required to synchronize the
+parallel computations are carried out by CPU cores. For this, we have used
+heterogeneous CUDA/MPI programming to implement the parallel iterative
 algorithms.

-In the experimental tests, we have shown that using a GPU cluster is efficient for solving linear systems associated to very large
-sparse matrices. The experimental results, obtained in the present chapter, showed that a cluster of $12$ GPUs is about $7$
-times faster than a cluster of $24$ CPU cores for solving large sparse linear systems of $25$ million unknown values. This is due
-to the GPU ability to compute the data-parallel operations faster than the CPUs. However, we have shown that solving linear systems
-associated to matrices having large bandwidths uses many communications to synchronize the computations of GPUs, which slow down
-even more the resolution. Moreover, there are two kinds of communications: between a CPU and its GPU and between CPUs of the computing
-nodes, such that the first ones are the slowest communications on a GPU cluster. So, we have proposed to use the hypergraph partitioning
-instead of the row-by-row partitioning. This allows to minimize the data dependencies between the GPU computing nodes and thus to
-reduce the total communication volume. The experimental results showed that using the hypergraph partitioning technique improve the
-execution times on average of $76\%$ to the CG method and of $65\%$ to the GMRES method on a cluster of $12$ GPUs.
-
-In the recent GPU hardware and software architectures, the GPU-Direct system with CUDA version 5.0 is used so that two GPUs located on
-the same node or on distant nodes can communicate between them directly without CPUs. This allows to improve the data transfers between
-GPUs. 
+In the experimental tests, we have shown that using a GPU cluster is efficient +for solving linear systems associated to very large sparse matrices. The experimental +results, obtained in the present chapter, showed that a cluster of $12$ GPUs is +about $7$ times faster than a cluster of $24$ CPU cores for solving large sparse +linear systems of $25$ million unknown values. This is due to the GPU ability to +compute the data-parallel operations faster than the CPUs. However, we have shown +that solving linear systems associated to matrices having large bandwidths uses +many communications to synchronize the computations of GPUs, which slow down even +more the resolution. Moreover, there are two kinds of communications: between a +CPU and its GPU and between CPUs of the computing nodes, such that the first ones +are the slowest communications on a GPU cluster. So, we have proposed to use the +hypergraph partitioning instead of the row-by-row partitioning. This allows to +minimize the data dependencies between the GPU computing nodes and thus to reduce +the total communication volume. The experimental results showed that using the +hypergraph partitioning technique improve the execution times on average of $76\%$ +to the CG method and of $65\%$ to the GMRES method on a cluster of $12$ GPUs. + +In the recent GPU hardware and software architectures, the GPU-Direct system with +CUDA version 5.0 is used so that two GPUs located on the same node or on distant +nodes can communicate between them directly without CPUs. This allows to improve +the data transfers between GPUs. \putbib[Chapters/chapter12/biblio12] diff --git a/BookGPU/Chapters/chapter13/biblio13.bib b/BookGPU/Chapters/chapter13/biblio13.bib index b9781bf..6faf02a 100644 --- a/BookGPU/Chapters/chapter13/biblio13.bib +++ b/BookGPU/Chapters/chapter13/biblio13.bib @@ -1,6 +1,6 @@ -@article{ref1, +@article{ch13:ref1, title = {Asynchronous grid computation for {A}merican options derivatives}, -author = {Chau, M. and Couturier, R. and Bahi, J. M. and Spiteri, P.}, +author = {Chau, Ming and Couturier, Rapha\"el and Bahi, Jacques M. and Spiteri, Pierre}, journal = {Advances in Engineering Software}, volume = {}, number = {0}, @@ -9,9 +9,9 @@ note = {}, year = {2012}, } -@article{ref2, +@article{ch13:ref2, title = {Matrix iterative analysis}, -author = {Varga, R. S.}, +author = {Varga, Richard S.}, journal = {Prentice Hall}, volume = {}, number = {}, @@ -20,8 +20,8 @@ note = {}, year = {}, } -@article{ref3, - author = {Baudet, G. M.}, +@article{ch13:ref3, + author = {Baudet, G\'erard M.}, title = {Asynchronous iterative methods for multiprocessors}, journal = {Journal Assoc. Comput. Mach.}, volume = {25}, @@ -30,8 +30,8 @@ year = {}, year = {1978}, } -@article{ref4, - author = {Bertsekas, D. P. and Tsitsiklis, J. N.}, +@article{ch13:ref4, + author = {Bertsekas, Dimitri P. and Tsitsiklis, John N.}, title = {Parallel and distributed computation, numerical methods}, journal = {Prentice Hall Englewood Cliffs N. J. (1989)}, volume = {}, @@ -40,8 +40,8 @@ year = {}, year = {1989}, } -@article{ref5, - author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.}, +@article{ch13:ref5, + author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"el}, title = {Parallel iterative algorithms: from sequential to grid computing}, journal = {Chapman \& Hall/CRC, Numerical Analysis \& Scientific Computating 1 (2007)}, volume = {}, @@ -50,8 +50,8 @@ year = {}, year = {2007}, } -@article{ref6, - author = {Miellou, J.-C. 
and Spiteri, P.}, +@article{ch13:ref6, + author = {Miellou, Jean-Claude and Spiteri, Pierre}, title = {Two criteria for the convergence of asynchronous iterations}, journal = {in Computers and computing, P. Chenin et al. ed., Wiley Masson}, volume = {}, @@ -60,8 +60,8 @@ year = {}, year = {1985}, } -@article{ref7, - author = {Miellou, J.-C.}, +@article{ch13:ref7, + author = {Miellou, Jean-Claude}, title = {Algorithmes de relaxation chaotique \`a retards}, journal = {RAIRO Analyse num\'erique}, volume = {R1}, @@ -70,7 +70,7 @@ year = {}, year = {1975}, } -@article{ref8, +@article{ch13:ref8, title = {{CUDA} Toolkit 4.2 {CUBLAS} Library}, author = {NVIDIA Corporation}, journal = {}, @@ -81,8 +81,8 @@ note = {\url{http://developer.download.nvidia.com/compute/DevZone/docs/html/CUDA year = {2012}, } -@article{ref9, - author = {Micikevicius, P.}, +@article{ch13:ref9, + author = {Micikevicius, Paulius}, title = {{3D} finite difference computation on {GPUs} using {CUDA}}, journal = {Proceedings of 2nd Workshop on General Purpose Processing on Graphics Processing Units}, volume = {}, @@ -91,8 +91,8 @@ year = {2012}, year = {2009}, } -@article{ref10, - author = {Leist, A. and Playne, D. P. and Hawick, K. A.}, +@article{ch13:ref10, + author = {Leist, Aron and Playne, Daniel P. and Hawick, Ken A.}, title = {Exploiting graphical processing units for data-parallel scientific applications}, journal = {Concurrency and Computation: Practice and Experience}, volume = {21}, @@ -101,8 +101,8 @@ year = {2012}, year = {2009}, } -@article{ref11, - author = {Chau, M. and Couturier, R. and Bahi, J. M. and Spiteri, P.}, +@article{ch13:ref11, + author = {Chau, Ming and Couturier, Rapha\"el and Bahi, Jacques M. and Spiteri, Pierre}, title = {Parallel solution of the obstacle problem in grid environments}, journal = {International Journal of High Performance Computing Applications}, volume = {25}, @@ -111,7 +111,7 @@ year = {2012}, year = {2011}, } -@article{ref12, +@article{ch13:ref12, author = {Nvidia}, title = {{NVIDIA} {CUDA} {C} {P}rogramming {G}uide}, journal = {Version 4.2 (2012)}, @@ -121,8 +121,8 @@ year = {2012}, year = {2012}, } -@article{ref13, - author = {Evans, D. J.}, +@article{ch13:ref13, + author = {Evans, David J.}, title = {Parallel {S}.{O}.{R}. iterative methods}, journal = {Parallel Computing}, volume = {1}, @@ -131,8 +131,8 @@ year = {2012}, year = {1984}, } -@article{ref14, - author = {Iwashita, T. 
and Shimasaki, M.}, +@article{ch13:ref14, + author = {Iwashita, Takeshi and Shimasaki, Masaaki}, title = {Block red-black ordering method for parallel processing of {ICCG} solver}, journal = {High Performance Computing}, volume = {2327}, @@ -141,9 +141,9 @@ year = {2012}, year = {2006}, } -@article{ref15, +@article{ch13:ref15, title = {Iterative methods for sparse linear systems}, -author = {Saad, Y.}, +author = {Saad, Yousef}, journal = {Society for Industrial and Applied Mathematics, 2nd edition}, volume = {}, number = {}, diff --git a/BookGPU/Chapters/chapter13/ch13.aux b/BookGPU/Chapters/chapter13/ch13.aux index 140e1c4..9460acd 100644 --- a/BookGPU/Chapters/chapter13/ch13.aux +++ b/BookGPU/Chapters/chapter13/ch13.aux @@ -1,82 +1,87 @@ \relax -\@writefile{toc}{\author{}{}} +\@writefile{toc}{\author{Lilia Ziane Khodja}{}} +\@writefile{toc}{\author{Ming Chau}{}} +\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} +\@writefile{toc}{\author{Pierre Spit\IeC {\'e}ri}{}} +\@writefile{toc}{\author{Jacques Bahi}{}} \@writefile{loa}{\addvspace {10\p@ }} \@writefile{toc}{\contentsline {chapter}{\numberline {12}Solving sparse nonlinear systems of obstacle problems on GPU clusters}{277}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} +\newlabel{ch13}{{12}{277}} \@writefile{toc}{\contentsline {section}{\numberline {12.1}Introduction}{277}} -\newlabel{sec:01}{{12.1}{277}} +\newlabel{ch13:sec:01}{{12.1}{277}} \@writefile{toc}{\contentsline {section}{\numberline {12.2}Obstacle problems}{278}} -\newlabel{sec:02}{{12.2}{278}} +\newlabel{ch13:sec:02}{{12.2}{278}} \@writefile{toc}{\contentsline {subsection}{\numberline {12.2.1}Mathematical model}{278}} -\newlabel{sec:02.01}{{12.2.1}{278}} -\newlabel{eq:01}{{12.1}{278}} -\newlabel{eq:02}{{12.2}{279}} +\newlabel{ch13:sec:02.01}{{12.2.1}{278}} +\newlabel{ch13:eq:01}{{12.1}{278}} +\newlabel{ch13:eq:02}{{12.2}{278}} \@writefile{toc}{\contentsline {subsection}{\numberline {12.2.2}Discretization}{279}} -\newlabel{sec:02.02}{{12.2.2}{279}} -\newlabel{eq:03}{{12.3}{279}} -\newlabel{eq:04}{{12.4}{279}} -\newlabel{eq:05}{{12.5}{280}} +\newlabel{ch13:sec:02.02}{{12.2.2}{279}} +\newlabel{ch13:eq:03}{{12.3}{279}} +\newlabel{ch13:eq:04}{{12.4}{279}} +\newlabel{ch13:eq:05}{{12.5}{279}} \@writefile{toc}{\contentsline {section}{\numberline {12.3}Parallel iterative method}{280}} -\newlabel{sec:03}{{12.3}{280}} -\newlabel{eq:06}{{12.6}{280}} -\newlabel{eq:07}{{12.7}{280}} -\newlabel{eq:08}{{12.8}{281}} -\newlabel{eq:09}{{12.9}{281}} -\newlabel{eq:10}{{12.10}{281}} -\newlabel{eq:11}{{12.11}{281}} -\newlabel{eq:12}{{12.12}{281}} -\newlabel{eq:13}{{12.13}{282}} -\newlabel{eq:14}{{12.14}{282}} -\newlabel{eq:15}{{12.15}{282}} -\newlabel{eq:16}{{12.16}{282}} +\newlabel{ch13:sec:03}{{12.3}{280}} +\newlabel{ch13:eq:06}{{12.6}{280}} +\newlabel{ch13:eq:07}{{12.7}{280}} +\newlabel{ch13:eq:08}{{12.8}{280}} +\newlabel{ch13:eq:09}{{12.9}{280}} +\newlabel{ch13:eq:10}{{12.10}{281}} +\newlabel{ch13:eq:11}{{12.11}{281}} +\newlabel{ch13:eq:12}{{12.12}{281}} +\newlabel{ch13:eq:13}{{12.13}{282}} +\newlabel{ch13:eq:14}{{12.14}{282}} +\newlabel{ch13:eq:15}{{12.15}{282}} +\newlabel{ch13:eq:16}{{12.16}{282}} \@writefile{toc}{\contentsline {section}{\numberline {12.4}Parallel implementation on a GPU cluster}{283}} -\newlabel{sec:04}{{12.4}{283}} +\newlabel{ch13:sec:04}{{12.4}{283}} \@writefile{lof}{\contentsline {figure}{\numberline {12.1}{\ignorespaces Data partitioning of a problem to be solved among $S=3\times 4$ computing nodes.\relax }}{283}} 
-\newlabel{fig:01}{{12.1}{283}} +\newlabel{ch13:fig:01}{{12.1}{283}} \@writefile{loa}{\contentsline {algocf}{\numberline {11}{\ignorespaces Parallel solving of the obstacle problem on a GPU cluster\relax }}{284}} -\newlabel{alg:01}{{11}{284}} -\newlabel{eq:18}{{12.17}{284}} +\newlabel{ch13:alg:01}{{11}{284}} +\newlabel{ch13:eq:18}{{12.17}{284}} \@writefile{loa}{\contentsline {algocf}{\numberline {12}{\ignorespaces Parallel iterative solving of the nonlinear systems on a GPU cluster ($Solve()$ function)\relax }}{285}} -\newlabel{alg:02}{{12}{285}} +\newlabel{ch13:alg:02}{{12}{285}} \@writefile{lof}{\contentsline {figure}{\numberline {12.2}{\ignorespaces Decomposition of a sub-problem in a GPU into $nz$ slices.\relax }}{286}} -\newlabel{fig:02}{{12.2}{286}} -\newlabel{list:01}{{12.1}{286}} +\newlabel{ch13:fig:02}{{12.2}{286}} +\newlabel{ch13:list:01}{{12.1}{286}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {12.1}Skeleton codes of a GPU kernel and a CPU function}{286}} \@writefile{lof}{\contentsline {figure}{\numberline {12.3}{\ignorespaces Matrix constant coefficients in a three-dimensional domain.\relax }}{288}} -\newlabel{fig:03}{{12.3}{288}} -\newlabel{eq:17}{{12.18}{288}} -\newlabel{list:02}{{12.2}{289}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.2}GPU kernels of the projected Richardson method}{289}} +\newlabel{ch13:fig:03}{{12.3}{288}} +\newlabel{ch13:eq:17}{{12.18}{288}} +\newlabel{ch13:list:02}{{12.2}{288}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.2}GPU kernels of the projected Richardson method}{288}} \@writefile{lof}{\contentsline {figure}{\numberline {12.4}{\ignorespaces Computation of a vector element with the projected Richardson method.\relax }}{290}} -\newlabel{fig:04}{{12.4}{290}} -\newlabel{list:03}{{12.3}{290}} +\newlabel{ch13:fig:04}{{12.4}{290}} +\newlabel{ch13:list:03}{{12.3}{290}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {12.3}Memory access to the cache texture memory}{290}} \@writefile{toc}{\contentsline {section}{\numberline {12.5}Experimental tests on a GPU cluster}{291}} -\newlabel{sec:05}{{12.5}{291}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.5}{\ignorespaces GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs.\relax }}{292}} -\newlabel{fig:05}{{12.5}{292}} +\newlabel{ch13:sec:05}{{12.5}{291}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.5}{\ignorespaces GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs.\relax }}{293}} +\newlabel{ch13:fig:05}{{12.5}{293}} \@writefile{lot}{\contentsline {table}{\numberline {12.1}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 24 CPU cores.\relax }}{293}} -\newlabel{tab:01}{{12.1}{293}} -\@writefile{lot}{\contentsline {table}{\numberline {12.2}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.\relax }}{293}} -\newlabel{tab:02}{{12.2}{293}} +\newlabel{ch13:tab:01}{{12.1}{293}} +\@writefile{lot}{\contentsline {table}{\numberline {12.2}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.\relax }}{294}} +\newlabel{ch13:tab:02}{{12.2}{294}} \@writefile{toc}{\contentsline {section}{\numberline {12.6}Red-Black ordering technique}{294}} -\newlabel{sec:06}{{12.6}{294}} -\newlabel{fig:06.01}{{12.6(a)}{295}} -\newlabel{sub@fig:06.01}{{(a)}{295}} 
-\newlabel{fig:06.02}{{12.6(b)}{295}} -\newlabel{sub@fig:06.02}{{(b)}{295}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.6}{\ignorespaces Red-black ordering for computing the iterate vector elements in a three-dimensional space.\relax }}{295}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Red-black ordering on x, y and z axises}}}{295}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Red-black ordering on y axis}}}{295}} -\newlabel{list:04}{{12.4}{296}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.4}GPU kernels of the projected Richardson method using the red-black technique}{296}} +\newlabel{ch13:sec:06}{{12.6}{294}} +\newlabel{ch13:list:04}{{12.4}{295}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.4}GPU kernels of the projected Richardson method using the red-black technique}{295}} +\newlabel{ch13:fig:06.01}{{12.6(a)}{296}} +\newlabel{sub@ch13:fig:06.01}{{(a)}{296}} +\newlabel{ch13:fig:06.02}{{12.6(b)}{296}} +\newlabel{sub@ch13:fig:06.02}{{(b)}{296}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.6}{\ignorespaces Red-Black ordering for computing the iterate vector elements in a three-dimensional space.\relax }}{296}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Red-Black ordering on x, y and z axises}}}{296}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Red-Black ordering on y axis}}}{296}} \@writefile{lot}{\contentsline {table}{\numberline {12.3}{\ignorespaces Execution times in seconds of the parallel projected Richardson method using read-black ordering technique implemented on a cluster of 12 GPUs.\relax }}{297}} -\newlabel{tab:03}{{12.3}{297}} +\newlabel{ch13:tab:03}{{12.3}{297}} \@writefile{lof}{\contentsline {figure}{\numberline {12.7}{\ignorespaces Weak scaling of both synchronous and asynchronous algorithms of the projected Richardson method using red-black ordering technique.\relax }}{298}} -\newlabel{fig:07}{{12.7}{298}} -\@writefile{toc}{\contentsline {section}{\numberline {12.7}Conclusion}{299}} -\newlabel{sec:07}{{12.7}{299}} +\newlabel{ch13:fig:07}{{12.7}{298}} +\@writefile{toc}{\contentsline {section}{\numberline {12.7}Conclusion}{298}} +\newlabel{ch13:sec:07}{{12.7}{298}} \@writefile{toc}{\contentsline {section}{Bibliography}{299}} \@setckpt{Chapters/chapter13/ch13}{ \setcounter{page}{301} diff --git a/BookGPU/Chapters/chapter13/ch13.tex b/BookGPU/Chapters/chapter13/ch13.tex index 635181d..7a1aa42 100755 --- a/BookGPU/Chapters/chapter13/ch13.tex +++ b/BookGPU/Chapters/chapter13/ch13.tex @@ -4,49 +4,71 @@ %% %% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\chapterauthor{}{} -\newcommand{\scalprod}[2]% -{\ensuremath{\langle #1 \, , #2 \rangle}} +%\chapterauthor{}{} +\chapterauthor{Lilia Ziane Khodja}{Femto-ST Institute, University of Franche-Comte, France} +\chapterauthor{Ming Chau}{Advanced Solutions Accelerator, Castelnau Le Lez, France} +\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte, France} +\chapterauthor{Pierre Spitéri}{ENSEEIHT-IRIT, Toulouse, France} +\chapterauthor{Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France} + + \chapter{Solving sparse nonlinear systems of obstacle problems on GPU clusters} +\label{ch13} + + + %%--------------------------%% %% SECTION 1 %% %%--------------------------%% \section{Introduction} -\label{sec:01} -The obstacle problem is one kind of free boundary problems. 
It allows to model, for example, an elastic membrane covering a solid obstacle.
-In this case, the objective is to find an equilibrium position of this membrane constrained to be above the obstacle and which tends to minimize
-its surface and/or its energy. The study of such problems occurs in many applications, for example: fluid mechanics, bio-mathematics (tumour growth
-process) or financial mathematics (American or European option pricing).
-
-In this chapter, we focus on solutions of large obstacle problems defined in a three-dimensional domain. Particularly, the present study consists
-in solving large nonlinear systems derived from the spatial discretization of these problems. Owing to the great size of such systems, in order to
-reduce computation times, we proceed by solving them by parallel synchronous or asynchronous iterative algorithms. Moreover, we aim at harnessing
-the computing power of GPUs to accelerate computations of these parallel algorithms. For this, we use an iterative method involving a projection
-on a convex set, which is: the projected Richardson method. We choose this method among other iterative methods because it is easy to implement on
-parallel computers and easy to adapt to GPU architectures.
-
-In Section~\ref{sec:02}, we present the mathematical model of obstacle problems then, in Section~\ref{sec:03}, we describe the general principle of
-the parallel projected Richardson method. Next, in Section~\ref{sec:04}, we give the main key points of the parallel implementation of both synchronous
-and asynchronous algorithms of the projected Richardson method on a GPU cluster. In Section~\ref{sec:05}, we present the performances of both parallel
-algorithms obtained from simulations carried out on a CPU and GPU clusters. Finally, in Section~\ref{sec:06}, we use the read-black ordering technique
-to improve the convergence and, thus, the execution times of the parallel projected Richardson algorithms on the GPU cluster.
+\label{ch13:sec:01}
+The obstacle problem is a class of free boundary problem. It allows us to model,
+for example, an elastic membrane covering a solid obstacle. In this case, the
+objective is to find an equilibrium position of this membrane, constrained to lie
+above the obstacle, which tends to minimize its surface and/or its energy.
+The study of such problems arises in many applications, for example fluid mechanics,
+bio-mathematics (tumor growth processes) or financial mathematics (American or
+European option pricing).
+
+In this chapter, we focus on the solution of large obstacle problems defined in a
+three-dimensional domain. In particular, the present study consists in solving the
+large nonlinear systems derived from the spatial discretization of these problems.
+Owing to the great size of such systems, we solve them by parallel synchronous or
+asynchronous iterative algorithms in order to reduce the computation times. Moreover,
+we aim at harnessing the computing power of GPUs to accelerate the computations of
+these parallel algorithms. For this, we use an iterative method involving a projection
+on a convex set, namely the projected Richardson method. We choose this method among
+other iterative methods because it is easy to implement on parallel computers and
+easy to adapt to GPU architectures.
+
+In Section~\ref{ch13:sec:02}, we present the mathematical model of obstacle problems;
+then, in Section~\ref{ch13:sec:03}, we describe the general principle of the parallel
+projected Richardson method.
+Next, in Section~\ref{ch13:sec:04}, we give the main key points of the parallel
+implementation of both synchronous and asynchronous algorithms of the projected
+Richardson method on a GPU cluster. In Section~\ref{ch13:sec:05}, we present the
+performance of both parallel algorithms obtained from simulations carried out on
+CPU and GPU clusters. Finally, in Section~\ref{ch13:sec:06}, we use the red-black
+ordering technique to improve the convergence and, thus, the execution times of the
+parallel projected Richardson algorithms on the GPU cluster.
+
 %%--------------------------%%
 %%        SECTION 2         %%
 %%--------------------------%%
 \section{Obstacle problems}
-\label{sec:02}
-In this section, we present the mathematical model of obstacle problems defined in a three-dimensional domain.
-This model is based on that presented in~\cite{ref1}.
+\label{ch13:sec:02}
+In this section, we present the mathematical model of obstacle problems defined in a
+three-dimensional domain. This model is based on that presented in~\cite{ch13:ref1}.
+
 %%*******************
 %%*******************
 \subsection{Mathematical model}
-\label{sec:02.01}
-An obstacle problem, arising for example in mechanics or financial derivatives, consists in solving a time dependent
-nonlinear equation:
+\label{ch13:sec:02.01}
+An obstacle problem\index{Obstacle~problem}, arising for example in mechanics or financial
+derivatives, consists in solving a time-dependent nonlinear equation\index{Nonlinear}:
 \begin{equation}
 \left\{
 \begin{array}{l}
@@ -56,18 +78,21 @@ u(0,x,y,z)=u_0(x,y,z),\\
 \mbox{B.C. on~}u(t,x,y,z)\mbox{~defined on~}\partial\Omega,
 \end{array}
 \right.
-\label{eq:01}
+\label{ch13:eq:01}
 \end{equation}
-where $u_0$ is the initial condition, $c\geq 0$, $b$ and $\eta$ are physical parameters, $T$ is the final time, $u=u(t,x,y,z)$
-is an element of the solution vector $U$ to compute, $f$ is the right-hand right that could represent, for example, the external
-forces, B.C. describes the boundary conditions on the boundary $\partial\Omega$ of the domain $\Omega$, $\phi$ models a constraint
-imposed to $u$, $\Delta$ is the Laplacian operator, $\nabla$ is the gradient operator, a.e.w. means almost every where and ``.''
-defines the products between two scalars, a scalar and a vector or a matrix and a vector. In practice the boundary condition,
-generally considered, is the Dirichlet condition (where $u$ is fixed on $\partial\Omega$) or the Neumann condition (where the
-normal derivative of $u$ is fixed on $\partial\Omega$).
-
-The time dependent equation~(\ref{eq:01}) is numerically solved by considering an implicit or a semi-implicit time marching,
-where at each time step $k$ a stationary nonlinear problem is solved:
+where $u_0$ is the initial condition, $c\geq 0$, $b$ and $\eta$ are physical parameters,
+$T$ is the final time, $u=u(t,x,y,z)$ is an element of the solution vector $U$ to compute,
+$f$ is the right-hand side that could represent, for example, the external forces, B.C.
+describes the boundary conditions on the boundary $\partial\Omega$ of the domain $\Omega$,
+$\phi$ models a constraint imposed on $u$, $\Delta$ is the Laplacian operator, $\nabla$
+is the gradient operator, a.e.w. means almost everywhere and ``.'' defines the products
+between two scalars, a scalar and a vector, or a matrix and a vector.
In practice, the boundary condition generally considered is the Dirichlet condition
+(where $u$ is fixed on $\partial\Omega$) or the Neumann condition (where the normal
+derivative of $u$ is fixed on $\partial\Omega$).
+
+The time-dependent equation~(\ref{ch13:eq:01}) is numerically solved by considering an
+implicit or a semi-implicit time-marching scheme, where at each time step $k$ a stationary
+nonlinear problem\index{Nonlinear} is solved:
 \begin{equation}
 \left\{
 \begin{array}{l}
@@ -76,42 +101,52 @@ b^t.\nabla u-\eta.\Delta u+(c+\delta).u-g\geq 0\mbox{,~}u\geq\phi\mbox{,~a.e.w.
 \mbox{B.C. on~}u(t,x,y,z)\mbox{~defined on~}\partial\Omega,
 \end{array}
 \right.
-\label{eq:02}
+\label{ch13:eq:02}
 \end{equation}
-where $\delta=\frac{1}{k}$ is the inverse of the time step $k$, $g=f+\delta u^{prev}$ and $u^{prev}$ is the solution computed at the
-previous time step.
+where $\delta=\frac{1}{k}$ is the inverse of the time step $k$, $g=f+\delta u^{prev}$ and
+$u^{prev}$ is the solution computed at the previous time step.
 
 %%*******************
 %%*******************
 \subsection{Discretization}
-\label{sec:02.02}
-First, we note that the spatial discretization of the previous stationary problem~(\ref{eq:02}) does not provide a symmetric matrix,
-because the convection-diffusion operator is not self-adjoint. Moreover, the fact that the operator is self-adjoint or not plays an
-important role in the choice of the appropriate algorithm for solving nonlinear systems derived from the discretization of the obstacle
-problem. Nevertheless, since the convection coefficients arising in the operator~(\ref{eq:02}) are constant, we can formulate the same
-problem by an self-adjoint operator by performing a classical change of variables. Then, we can replace the stationary convection-diffusion
-problem:
+\label{ch13:sec:02.02}
+First, we note that the spatial discretization of the previous stationary
+problem~(\ref{ch13:eq:02}) does not provide a symmetric matrix, because the
+convection-diffusion operator is not self-adjoint. Moreover, whether or not
+the operator is self-adjoint plays an important role in the choice of the
+appropriate algorithm for solving nonlinear systems derived from the
+discretization of the obstacle problem\index{Obstacle~problem}. Nevertheless,
+since the convection coefficients arising in the operator~(\ref{ch13:eq:02})
+are constant, we can formulate the same problem with a self-adjoint operator
+by performing a classical change of variables. Then, we can replace the stationary
+convection-diffusion problem:
 \begin{equation}
 b^{t}.\nabla v-\eta.\Delta v+(c+\delta).v=g\mbox{,~a.e.w. in~}\lbrack 0,T\rbrack\times\Omega\mbox{,~}c\geq 0\mbox{,~}\delta\geq~0,
-\label{eq:03}
+\label{ch13:eq:03}
 \end{equation}
 by the following stationary diffusion operator:
 \begin{equation}
-\eta.\Delta u+(\frac{\|b\|^{2}_{2}}{4\eta}+c+\delta).u=e^{-a}g=f,
-\label{eq:04}
+\label{ch13:eq:04}
 \end{equation}
-where $b=\{b_{1},b_{2},b_{3}\}$, $\|b\|_{2}$ denotes the Euclidean norm of $b$ and $v=e^{-a}.u$ represents the general change of variables
-such that $a=\frac{b^{t}(x,y,z)}{2\eta}$. Consequently, the numerical resolution of the diffusion problem (the self-adjoint operator~(\ref{eq:04}))
-is done by optimization algorithms, in contrast, that of the convection-diffusion problem (non self-adjoint operator~(\ref{eq:03}))
-is done by relaxation algorithms.
In the case of our studied algorithm, the convergence is ensured by M-matrix property then, the performance
-is linked to the magnitude of the spectral radius of the iteration matrix, which is independent of the condition number.
-
-Next, the three-dimensional domain $\Omega\subset\mathbb{R}^{3}$ is set to $\Omega=\lbrack 0,1\rbrack^{3}$ and discretized with an uniform
-Cartesian mesh constituted by $M=m^3$ discretization points, where $m$ related to the spatial discretization step by $h=\frac{1}{m+1}$. This
-is carried out by using a classical order 2 finite difference approximation of the Laplacian. So, the complete discretization of both stationary
-boundary value problems~(\ref{eq:03}) and~(\ref{eq:04}) leads to the solution of a large discrete complementary problem of the following
-form, when both Dirichlet or Neumann boundary conditions are used:
+where $b=\{b_{1},b_{2},b_{3}\}$, $\|b\|_{2}$ denotes the Euclidean norm of $b$ and
+$v=e^{-a}.u$ represents the general change of variables such that $a=\frac{b^{t}(x,y,z)}{2\eta}$.
+Consequently, the numerical resolution of the diffusion problem (the self-adjoint
+operator~(\ref{ch13:eq:04})) is done by optimization algorithms, whereas that
+of the convection-diffusion problem (the non-self-adjoint operator~(\ref{ch13:eq:03}))
+is done by relaxation algorithms. In the case of the algorithm studied here, the convergence\index{Convergence}
+is ensured by the M-matrix property; the performance is then linked to the magnitude of
+the spectral radius of the iteration matrix, which is independent of the condition
+number.
+
+Next, the three-dimensional domain $\Omega\subset\mathbb{R}^{3}$ is set to $\Omega=\lbrack 0,1\rbrack^{3}$
+and discretized with a uniform Cartesian mesh consisting of $M=m^3$ discretization
+points, where $m$ is related to the spatial discretization step by $h=\frac{1}{m+1}$. This
+is carried out by using a classical second-order finite difference approximation of the Laplacian.
+So, the complete discretization of both stationary boundary value problems~(\ref{ch13:eq:03})
+and~(\ref{ch13:eq:04}) leads to the solution of a large discrete complementary problem
+of the following form, when either Dirichlet or Neumann boundary conditions are used:
 \begin{equation}
 \left\{
 \begin{array}{l}
@@ -120,30 +155,38 @@ form, when both Dirichlet or Neumann boundary conditions are used:
 ((A+\delta I)U^{*}-G)^{T}(U^{*}-\bar{\Phi})=0,\\
 \end{array}
 \right.
-\label{eq:05}
+\label{ch13:eq:05}
 \end{equation}
-where $A$ is a matrix obtained after the spatial discretization by a finite difference method, $G$ is derived from the Euler first order implicit time
-marching scheme and from the discretized right-hand side of the obstacle problem, $\delta$ is the inverse of the time step $k$ and $I$ is the identity
-matrix. The matrix $A$ is symmetric when the self-adjoint operator is considered and nonsymmetric otherwise.
+where $A$ is a matrix obtained after the spatial discretization by a finite difference
+method, $G$ is derived from the Euler first-order implicit time-marching scheme and from
+the discretized right-hand side of the obstacle problem, $\delta$ is the inverse of the
+time step $k$ and $I$ is the identity matrix. The matrix $A$ is symmetric when the self-adjoint
+operator is considered and nonsymmetric otherwise.
 
-According to the chosen discretization scheme of the Laplacian, $A$ is an M-matrix (irreducibly diagonal dominant, see~\cite{ref2}) and, consequently,
-the matrix $(A+\delta I)$ is also an M-matrix.
This property is important to the convergence of iterative methods. +According to the chosen discretization scheme of the Laplacian, $A$ is an M-matrix (irreducibly +diagonal dominant, see~\cite{ch13:ref2}) and, consequently, the matrix $(A+\delta I)$ is also +an M-matrix. This property is important to the convergence of iterative methods. %%--------------------------%% %% SECTION 3 %% %%--------------------------%% \section{Parallel iterative method} -\label{sec:03} -Owing to the large size of the previous discrete complementary problem~(\ref{eq:05}), we will solve it by parallel synchronous or asynchronous iterative -algorithms (see~\cite{ref3,ref4,ref5}). In this chapter, we aim at harnessing the computing power of GPU clusters for solving these large nonlinear systems. -Then, we choose to use the projected Richardson iterative method for solving the diffusion problem~(\ref{eq:04}). Indeed, this method is based on the iterations -of the Jacobi method which are easy to parallelize on parallel computers and easy to adapt to GPU architectures. Then, according to the boundary value problem -formulation with a self-adjoint operator~(\ref{eq:04}), we can consider here the equivalent optimization problem and the fixed point mapping associated to -its solution. - -Assume that $E=\mathbb{R}^{M}$ is a Hilbert space, in which $\scalprod{.}{.}$ is the scalar product and $\|.\|$ its associated norm. So, the general fixed -point problem to be solved is defined as follows: +\label{ch13:sec:03} +Owing to the large size of the previous discrete complementary problem~(\ref{ch13:eq:05}), +we will solve it by parallel synchronous or asynchronous iterative algorithms (see~\cite{ch13:ref3,ch13:ref4,ch13:ref5}). +In this chapter, we aim at harnessing the computing power of GPU clusters for solving these +large nonlinear systems\index{Nonlinear}. Then, we choose to use the projected Richardson +iterative method\index{Iterative~method!Projected~Richardson} for solving the diffusion +problem~(\ref{ch13:eq:04}). Indeed, this method is based on the iterations of the Jacobi +method\index{Iterative~method!Jacobi} which are easy to parallelize on parallel computers +and easy to adapt to GPU architectures. Then, according to the boundary value problem +formulation with a self-adjoint operator~(\ref{ch13:eq:04}), we can consider here the +equivalent optimization problem and the fixed point mapping associated to its solution. + +Assume that $E=\mathbb{R}^{M}$ is a Hilbert space\index{Hilbert~space}, in which $\scalprod{.}{.}$ +is the scalar product and $\|.\|$ its associated norm. So, the general fixed point problem +to be solved is defined as follows: \begin{equation} \left\{ \begin{array}{l} @@ -151,16 +194,17 @@ point problem to be solved is defined as follows: U^{*} = F(U^{*}), \\ \end{array} \right. -\label{eq:06} +\label{ch13:eq:06} \end{equation} where $U\mapsto F(U)$ is an application from $E$ to $E$. Let $K$ be a closed convex set defined by: \begin{equation} K = \{U | U \geq \Phi \mbox{~everywhere in~} E\}, -\label{eq:07} +\label{ch13:eq:07} \end{equation} -where $\Phi$ is the discrete obstacle function. In fact, the obstacle problem~(\ref{eq:05}) is formulated as the following constrained optimization problem: +where $\Phi$ is the discrete obstacle function. In fact, the obstacle problem~(\ref{ch13:eq:05})\index{Obstacle~problem} +is formulated as the following constrained optimization problem: \begin{equation} \left\{ \begin{array}{l} @@ -168,49 +212,62 @@ where $\Phi$ is the discrete obstacle function. 
In fact, the obstacle problem~(\ \forall V \in K, J(U^{*}) \leq J(V), \\ \end{array} \right. -\label{eq:08} +\label{ch13:eq:08} \end{equation} where the cost function is given by: \begin{equation} J(U) = \frac{1}{2}\scalprod{\mathcal{A}.U}{U} - \scalprod{G}{U}, -\label{eq:09} +\label{ch13:eq:09} \end{equation} -in which $\scalprod{.}{.}$ denotes the scalar product in $E$, $\mathcal{A}=A+\delta I$ is a symmetric positive definite, $A$ is the discretization matrix -associated with the self-adjoint operator~(\ref{eq:04}) after change of variables. +in which $\scalprod{.}{.}$ denotes the scalar product in $E$, $\mathcal{A}=A+\delta I$ +is a symmetric positive definite, $A$ is the discretization matrix associated with the +self-adjoint operator~(\ref{ch13:eq:04}) after change of variables. -For any $U\in E$, let $P_K(U)$ be the projection of $U$ on $K$. For any $\gamma\in\mathbb{R}$, $\gamma>0$, the fixed point mapping $F_{\gamma}$ of the projected -Richardson method is defined as follows: +For any $U\in E$, let $P_K(U)$ be the projection of $U$ on $K$. For any $\gamma\in\mathbb{R}$, +$\gamma>0$, the fixed point mapping $F_{\gamma}$ of the projected Richardson method\index{Iterative~method!Projected~Richardson} +is defined as follows: \begin{equation} U^{*} = F_{\gamma}(U^{*}) = P_K(U^{*} - \gamma(\mathcal{A}.U^{*} - G)). -\label{eq:10} +\label{ch13:eq:10} \end{equation} -In order to reduce the computation time, the large optimization problem is solved in a numerical way by using a parallel asynchronous algorithm of the projected -Richardson method on the convex set $K$. Particularly, we will consider an asynchronous parallel adaptation of the projected Richardson method~\cite{ref6}. - -Let $\alpha\in\mathbb{N}$ be a positive integer. We consider that the space $E=\displaystyle\prod_{i=1}^{\alpha} E_i$ is a product of $\alpha$ subspaces $E_i$ -where $i\in\{1,\ldots,\alpha\}$. Note that $E_i=\mathbb{R}^{m_i}$, where $\displaystyle\sum_{i=1}^{\alpha} m_{i}=M$, is also a Hilbert space in which $\scalprod{.}{.}_i$ -denotes the scalar product and $|.|_i$ the associated norm, for all $i\in\{1,\ldots,\alpha\}$. Then, for all $u,v\in E$, $\scalprod{u}{v}=\displaystyle\sum_{i=1}^{\alpha}\scalprod{u_i}{v_i}_i$ +In order to reduce the computation time, the large optimization problem is solved in a +numerical way by using a parallel asynchronous algorithm of the projected Richardson method +on the convex set $K$. Particularly, we will consider an asynchronous parallel adaptation +of the projected Richardson method~\cite{ch13:ref6}. + +Let $\alpha\in\mathbb{N}$ be a positive integer. We consider that the space $E=\displaystyle\prod_{i=1}^{\alpha} E_i$ +is a product of $\alpha$ subspaces $E_i$ where $i\in\{1,\ldots,\alpha\}$. Note that $E_i=\mathbb{R}^{m_i}$, +where $\displaystyle\sum_{i=1}^{\alpha} m_{i}=M$, is also a Hilbert space\index{Hilbert~space} +in which $\scalprod{.}{.}_i$ denotes the scalar product and $|.|_i$ the associated norm, for +all $i\in\{1,\ldots,\alpha\}$. Then, for all $u,v\in E$, $\scalprod{u}{v}=\displaystyle\sum_{i=1}^{\alpha}\scalprod{u_i}{v_i}_i$ is the scalar product on $E$. -Let $U\in E$, we consider the following decomposition of $U$ and the corresponding decomposition of $F_\gamma$ into $\alpha$ blocks: +Let $U\in E$, we consider the following decomposition of $U$ and the corresponding decomposition +of $F_\gamma$ into $\alpha$ blocks: \begin{equation} \begin{array}{rcl} U & = & (U_1,\ldots,U_{\alpha}), \\ F_{\gamma}(U) & = & (F_{1,\gamma}(U),\ldots,F_{\alpha,\gamma}(U)). 
\\ \end{array} -\label{eq:11} +\label{ch13:eq:11} \end{equation} -Assume that the convex set $K=\displaystyle\prod_{i=1}^{\alpha}K_{i}$, such that $\forall i\in\{1,\ldots,\alpha\},K_i\subset E_i$ and $K_i$ is a closed convex set. -Let also $G=(G_1,\ldots,G_{\alpha})\in E$ and, for any $U\in E$, $P_K(U)=(P_{K_1}(U_1),\ldots,P_{K_{\alpha}}(U_{\alpha}))$ is the projection of $U$ on $K$ where $\forall i\in\{1,\ldots,\alpha\},P_{K_i}$ -is the projector from $E_i$ onto $K_i$. So, the fixed point mapping of the projected Richardson method~(\ref{eq:10}) can be written in the following way: +Assume that the convex set $K=\displaystyle\prod_{i=1}^{\alpha}K_{i}$, such that $\forall i\in\{1,\ldots,\alpha\},K_i\subset E_i$ +and $K_i$ is a closed convex set. Let also $G=(G_1,\ldots,G_{\alpha})\in E$ and, for any +$U\in E$, $P_K(U)=(P_{K_1}(U_1),\ldots,P_{K_{\alpha}}(U_{\alpha}))$ is the projection of $U$ +on $K$ where $\forall i\in\{1,\ldots,\alpha\},P_{K_i}$ is the projector from $E_i$ onto +$K_i$. So, the fixed point mapping of the projected Richardson method~(\ref{ch13:eq:10})\index{Iterative~method!Projected~Richardson} +can be written in the following way: \begin{equation} \forall U\in E\mbox{,~}\forall i\in\{1,\ldots,\alpha\}\mbox{,~}F_{i,\gamma}(U) = P_{K_i}(U_i - \gamma(\mathcal{A}_i.U - G_i)). -\label{eq:12} +\label{ch13:eq:12} \end{equation} -Note that $\displaystyle\mathcal{A}_i.U= \sum_{j=1}^{\alpha}\mathcal{A}_{i,j}.U_j$, where $\mathcal{A}_{i,j}$ denote block matrices of $\mathcal{A}$. +Note that $\displaystyle\mathcal{A}_i.U= \sum_{j=1}^{\alpha}\mathcal{A}_{i,j}.U_j$, where +$\mathcal{A}_{i,j}$ denote block matrices of $\mathcal{A}$. -The parallel asynchronous iterations of the projected Richardson method for solving the obstacle problem~(\ref{eq:08}) are defined as follows: let $U^0\in E,U^0=(U^0_1,\ldots,U^0_\alpha)$ be -the initial solution, then for all $p\in\mathbb{N}$, the iterate $U^{p+1}=(U^{p+1}_1,\ldots,U^{p+1}_{\alpha})$ is recursively defined by: +The parallel asynchronous iterations of the projected Richardson method for solving the +obstacle problem~(\ref{ch13:eq:08}) are defined as follows: let $U^0\in E,U^0=(U^0_1,\ldots,U^0_\alpha)$ +be the initial solution, then for all $p\in\mathbb{N}$, the iterate $U^{p+1}=(U^{p+1}_1,\ldots,U^{p+1}_{\alpha})$ +is recursively defined by: \begin{equation} U_i^{p+1} = \left\{ @@ -219,7 +276,7 @@ F_{i,\gamma}(U_1^{\rho_1(p)}, \ldots, U_{\alpha}^{\rho_{\alpha}(p)}) \mbox{~if~} U_i^p \mbox{~otherwise}, \\ \end{array} \right. -\label{eq:13} +\label{ch13:eq:13} \end{equation} where \begin{equation} @@ -229,7 +286,7 @@ where \forall i\in\{1,\ldots,\alpha\},\{p \ | \ i \in s(p)\}\mbox{~is denombrable}, \end{array} \right. -\label{eq:14} +\label{ch13:eq:14} \end{equation} and $\forall j\in\{1,\ldots,\alpha\}$, \begin{equation} @@ -239,62 +296,75 @@ and $\forall j\in\{1,\ldots,\alpha\}$, \displaystyle\lim_{p\to\infty}\rho_j(p) = +\infty.\\ \end{array} \right. -\label{eq:15} +\label{ch13:eq:15} \end{equation} -The previous asynchronous scheme of the projected Richardson method models computations that are carried out in parallel -without order nor synchronization (according to the behavior of the parallel iterative method) and describes a subdomain -method without overlapping. It is a general model that takes into account all possible situations of parallel computations -and non-blocking message passing. 
So, the synchronous iterative scheme is defined by:
+The previous asynchronous scheme\index{Asynchronous} of the projected Richardson
+method models computations that are carried out in parallel without order or
+synchronization (according to the behavior of the parallel iterative method) and
+describes a subdomain method without overlapping. It is a general model that takes
+into account all possible situations of parallel computations and nonblocking message
+passing. So, the synchronous iterative scheme\index{Synchronous} is defined by:
 \begin{equation}
 \forall j\in\{1,\ldots,\alpha\} \mbox{,~} \forall p\in\mathbb{N} \mbox{,~} \rho_j(p)=p.
-\label{eq:16}
+\label{ch13:eq:16}
 \end{equation}
-The values of $s(p)$ and $\rho_j(p)$ are defined dynamically and not explicitly by the parallel asynchronous or synchronous
-execution of the algorithm. Particularly, it enables one to consider distributed computations whereby processors compute at
-their own pace according to their intrinsic characteristics and computational load. The parallelism between the processors is
-well described by the set $s(p)$ which contains at each step $p$ the index of the components relaxed by each processor on a
-parallel way while the use of delayed components in~(\ref{eq:13}) permits one to model nondeterministic behavior and does not
-imply inefficiency of the considered distributed scheme of computation. Note that, according to~\cite{ref7}, theoretically,
-each component of the vector must be relaxed an infinity of time. The choice of the relaxed components to be used in the
-computational process may be guided by any criterion and, in particular, a natural criterion is to pick-up the most recently
-available values of the components computed by the processors. Furthermore, the asynchronous iterations are implemented by
-means of non-blocking MPI communication subroutines (asynchronous communications).
-
-The important property ensuring the convergence of the parallel projected Richardson method, both synchronous and asynchronous
-algorithms, is the fact that $\mathcal{A}$ is an M-matrix. Moreover, the convergence proceeds from a result of~\cite{ref6}.
-Indeed, there exists a value $\gamma_0>0$, such that $\forall\gamma\in ]0,\gamma_0[$, the parallel iterations~(\ref{eq:13}),
-(\ref{eq:14}) and~(\ref{eq:15}), associated to the fixed point mapping $F_\gamma$~(\ref{eq:12}), converge to the unique solution
-$U^{*}$ of the discretized problem.
+The values of $s(p)$ and $\rho_j(p)$ are defined dynamically and not explicitly by
+the parallel asynchronous or synchronous execution of the algorithm. In particular,
+this enables one to consider distributed computations whereby processors compute at
+their own pace according to their intrinsic characteristics and computational load.
+The parallelism between the processors is well described by the set $s(p)$, which
+contains at each step $p$ the indices of the components relaxed by each processor in
+a parallel way, while the use of delayed components in~(\ref{ch13:eq:13}) permits one
+to model nondeterministic behavior and does not imply inefficiency of the considered
+distributed scheme of computation. Note that, according to~\cite{ch13:ref7}, theoretically,
+each component of the vector must be relaxed an infinite number of times. The choice of the
+relaxed components to be used in the computational process may be guided by any criterion
+and, in particular, a natural criterion is to pick up the most recently available
+values of the components computed by the processors.
Furthermore, the asynchronous +iterations are implemented by means of nonblocking MPI communication subroutines\index{MPI~subroutines!Nonblocking} +(asynchronous communications). + +The important property ensuring the convergence of the parallel projected Richardson +method, both synchronous and asynchronous algorithms, is the fact that $\mathcal{A}$ +is an M-matrix. Moreover, the convergence\index{Convergence} proceeds from a result +of~\cite{ch13:ref6}. Indeed, there exists a value $\gamma_0>0$, such that $\forall\gamma\in ]0,\gamma_0[$, +the parallel iterations~(\ref{ch13:eq:13}), (\ref{ch13:eq:14}) and~(\ref{ch13:eq:15}), +associated to the fixed point mapping $F_\gamma$~(\ref{ch13:eq:12}), converge to the +unique solution $U^{*}$ of the discretized problem. %%--------------------------%% %% SECTION 4 %% %%--------------------------%% \section{Parallel implementation on a GPU cluster} -\label{sec:04} -In this section, we give the main key points of the parallel implementation of the projected Richardson method, both synchronous -and asynchronous versions, on a GPU cluster, for solving the nonlinear systems derived from the discretization of large obstacle -problems. More precisely, each nonlinear system is solved iteratively using the whole cluster. We use a heteregeneous CUDA/MPI -programming. Indeed, the communication of data, at each iteration between the GPU computing nodes, can be either synchronous -or asynchronous using the MPI communication subroutines, whereas inside each GPU node, a CUDA parallelization is performed. +\label{ch13:sec:04} +In this section, we give the main key points of the parallel implementation of the +projected Richardson method, both synchronous and asynchronous versions, on a GPU +cluster, for solving the nonlinear systems derived from the discretization of large +obstacle problems. More precisely, each nonlinear system is solved iteratively using +the whole cluster. We use a heterogeneous CUDA/MPI programming. Indeed, the communication +of data, at each iteration between the GPU computing nodes, can be either synchronous +or asynchronous using the MPI communication subroutines, whereas inside each GPU node, +a CUDA parallelization is performed. \begin{figure}[!h] \centerline{\includegraphics[scale=0.30]{Chapters/chapter13/figures/splitCPU}} \caption{Data partitioning of a problem to be solved among $S=3\times 4$ computing nodes.} -\label{fig:01} +\label{ch13:fig:01} \end{figure} -Let $S$ denote the number of computing nodes on the GPU cluster, where a computing node is composed of CPU core holding one MPI -process and a GPU card. So, before starting computations, the obstacle problem of size $(NX\times NY\times NZ)$ is split into $S$ -parallelepipedic sub-problems, each for a node (MPI process, GPU), as is shown in Figure~\ref{fig:01}. Indeed, the $NY$ and $NZ$ -dimensions (according to the $y$ and $z$ axises) of the three-dimensional problem are, respectively, split into $Sy$ and $Sz$ parts, -such that $S=Sy\times Sz$. In this case, each computing node has at most four neighboring nodes. This kind of the data partitioning -reduces the data exchanges at subdomain boundaries compared to a naive $z$-axis-wise partitioning. +Let $S$ denote the number of computing nodes\index{Computing~node} on the GPU cluster, +where a computing node is composed of CPU core holding one MPI process and a GPU card. 
+So, before starting computations, the obstacle problem of size $(NX\times NY\times NZ)$ +is split into $S$ parallelepipedic sub-problems, each for a node (MPI process, GPU), as +is shown in Figure~\ref{ch13:fig:01}. Indeed, the $NY$ and $NZ$ dimensions (according +to the $y$ and $z$ axises) of the three-dimensional problem are, respectively, split +into $Sy$ and $Sz$ parts, such that $S=Sy\times Sz$. In this case, each computing node +has at most four neighboring nodes. This kind of the data partitioning reduces the data +exchanges at subdomain boundaries compared to a naive $z$-axis-wise partitioning. \begin{algorithm}[!t] -%\SetLine -%\linesnumbered Initialization of the parameters of the sub-problem\; Allocate and fill the data in the global memory GPU\; \For{$i=1$ {\bf to} $NbSteps$}{ @@ -304,30 +374,34 @@ Allocate and fill the data in the global memory GPU\; } Copy the solution $U$ back from GPU memory\; \caption{Parallel solving of the obstacle problem on a GPU cluster} -\label{alg:01} +\label{ch13:alg:01} \end{algorithm} -All the computing nodes of the GPU cluster execute in parallel the same Algorithm~\ref{alg:01} but on different three-dimensional -sub-problems of size $(NX\times ny\times nz)$. This algorithm gives the main key points for solving an obstacle problem defined in -a three-dimensional domain, where $A$ is the discretization matrix, $G$ is the right-hand side and $U$ is the solution vector. After -the initialization step, all the data generated from the partitioning operation are copied from the CPU memories to the GPU global -memories, to be processed on the GPUs. Next, the algorithm uses $NbSteps$ time steps to solve the global obstacle problem. In fact, -it uses a parallel algorithm adapted to GPUs of the projected Richardson iterative method for solving the nonlinear systems of the -obstacle problem. This function is defined by {\it Solve()} in Algorithm~\ref{alg:01}. At every time step, the initial guess $U^0$ -for the iterative algorithm is set to the solution found at the previous time step. Moreover, the right-hand side $G$ is computed -as follows: \[G = \frac{1}{k}.U^{prev} + F\] where $k$ is the time step, $U^{prev}$ is the solution computed in the previous time -step and each element $f(x, y, z)$ of the vector $F$ is computed as follows: +All the computing nodes of the GPU cluster execute in parallel the same Algorithm~\ref{ch13:alg:01} +but on different three-dimensional sub-problems of size $(NX\times ny\times nz)$. +This algorithm gives the main key points for solving an obstacle problem\index{Obstacle~problem} +defined in a three-dimensional domain, where $A$ is the discretization matrix, $G$ +is the right-hand side and $U$ is the solution vector. After the initialization step, +all the data generated from the partitioning operation are copied from the CPU memories +to the GPU global memories, to be processed on the GPUs. Next, the algorithm uses $NbSteps$ +time steps to solve the global obstacle problem. In fact, it uses a parallel algorithm +adapted to GPUs of the projected Richardson iterative method for solving the nonlinear +systems\index{Nonlinear} of the obstacle problem. This function is defined by {\it Solve()} +in Algorithm~\ref{ch13:alg:01}. At every time step, the initial guess $U^0$ for the iterative +algorithm is set to the solution found at the previous time step. 
Moreover, the right-hand +side $G$ is computed as follows: \[G = \frac{1}{k}.U^{prev} + F\] where $k$ is the time step, +$U^{prev}$ is the solution computed in the previous time step and each element $f(x, y, z)$ +of the vector $F$ is computed as follows: \begin{equation} f(x,y,z)=\cos(2\pi x)\cdot\cos(4\pi y)\cdot\cos(6\pi z). -\label{eq:18} +\label{ch13:eq:18} \end{equation} -Finally, the solution $U$ of the obstacle problem is copied back from the GPU global memories to the CPU memories. We use the -communication subroutines of the CUBLAS library~\cite{ref8} (CUDA Basic Linear Algebra Subroutines) for the memory allocations in -the GPU (\verb+cublasAlloc()+) and the data transfers between the CPU and its GPU: \verb+cublasSetVector()+ and \verb+cublasGetVector()+. +Finally, the solution $U$ of the obstacle problem is copied back from the GPU global +memories to the CPU memories. We use the communication subroutines of the CUBLAS library~\cite{ch13:ref8}\index{CUBLAS} +(CUDA Basic Linear Algebra Subroutines) for the memory allocations in the GPU (\verb+cublasAlloc()+) +and the data transfers between the CPU and its GPU: \verb+cublasSetVector()+ and \verb+cublasGetVector()+. \begin{algorithm}[!t] -% \SetLine -% \linesnumbered $p = 0$\; $conv = false$\; $U = U^{0}$\; @@ -341,43 +415,55 @@ the GPU (\verb+cublasAlloc()+) and the data transfers between the CPU and its GP $conv$ = Convergence($error$, $p$, $\varepsilon$, $MaxRelax$)\; } \caption{Parallel iterative solving of the nonlinear systems on a GPU cluster ($Solve()$ function)} -\label{alg:02} +\label{ch13:alg:02} \end{algorithm} -As many other iterative methods, the algorithm of the projected Richardson method is based on algebraic functions operating on vectors -and/or matrices, which are more efficient on parallel computers when they work on large vectors. Its parallel implementation on the GPU -cluster is carried out so that the GPUs execute the vector operations as kernels and the CPUs execute the serial codes, supervise the -kernel executions and the data exchanges with the neighboring nodes and supply the GPUs with data. Algorithm~\ref{alg:02} shows the -main key points of the parallel iterative algorithm (function $Solve()$ in Algorithm~\ref{alg:01}). All the vector operations inside -the main loop ({\bf repeat} ... {\bf until}) are executed by the GPU. We use the following functions of the CUBLAS library: +As many other iterative methods, the algorithm of the projected Richardson +method\index{Iterative~method!Projected~Richardson} is based on algebraic +functions operating on vectors and/or matrices, which are more efficient on +parallel computers when they work on large vectors. Its parallel implementation +on the GPU cluster is carried out so that the GPUs execute the vector operations +as kernels and the CPUs execute the serial codes, supervise the kernel executions +and the data exchanges with the neighboring nodes\index{Neighboring~node} and +supply the GPUs with data. Algorithm~\ref{ch13:alg:02} shows the main key points +of the parallel iterative algorithm (function $Solve()$ in Algorithm~\ref{ch13:alg:01}). +All the vector operations inside the main loop ({\bf repeat} ... {\bf until}) +are executed by the GPU. 
We use the following functions of the CUBLAS library\index{CUBLAS}:
 \begin{itemize}
 \item \verb+cublasDaxpy()+ to compute the difference between the solution vectors $U^{p}$ and $U^{p+1}$ computed in two successive relaxations
-$p$ and $p+1$ (line~$7$ in Algorithm~\ref{alg:02}),
+$p$ and $p+1$ (line~$7$ in Algorithm~\ref{ch13:alg:02}),
 \item \verb+cublasDnrm2()+ to perform the Euclidean norm (line~$8$) and,
 \item \verb+cublasDcpy()+ for the data copy of a vector to another one in the GPU memory (lines~$3$ and~$9$).
 \end{itemize}
-The dimensions of the grid and blocks of threads that execute a given kernel depend on the resources of the GPU multiprocessor and the
-resource requirements of the kernel. So, if $block$ defines the size of a thread block, which must not exceed the maximum size of a thread
-block, then the number of thread blocks in the grid, denoted by $grid$, can be computed according to the size of the local sub-problem
-as follows: \[grid = \frac{(NX\times ny\times nz)+block-1}{block}.\] However, when solving very large problems, the size of the thread
-grid can exceed the maximum number of thread blocks that can be executed on the GPUs (up-to $65.535$ thread blocks) and, thus, the kernel
-will fail to launch. Therefore, for each kernel, we decompose the three-dimensional sub-problem into $nz$ two-dimensional slices of size
-($NX\times ny$), as is shown in Figure~\ref{fig:02}. All slices of the same kernel are executed using {\bf for} loop by $NX\times ny$ parallel
-threads organized in a two-dimensional grid of two-dimensional thread blocks, as is shown in Listing~\ref{list:01}. Each thread is in charge
-of $nz$ discretization points (one from each slice), accessed in the GPU memory with a constant stride $(NX\times ny)$.
+The dimensions of the grid and blocks of threads that execute a given kernel
+depend on the resources of the GPU multiprocessor and the resource requirements
+of the kernel. So, if $block$ defines the size of a thread block, which must
+not exceed the maximum size of a thread block, then the number of thread blocks
+in the grid, denoted by $grid$, can be computed according to the size of the
+local sub-problem as follows: \[grid = \frac{(NX\times ny\times nz)+block-1}{block}.\]
+However, when solving very large problems, the size of the thread grid can exceed
+the maximum number of thread blocks that can be executed on the GPUs (up to $65,535$
+thread blocks) and, thus, the kernel will fail to launch. Therefore, for each kernel,
+we decompose the three-dimensional sub-problem into $nz$ two-dimensional slices of size
+($NX\times ny$), as is shown in Figure~\ref{ch13:fig:02}. All slices of the same kernel
+are executed, using a {\bf for} loop, by $NX\times ny$ parallel threads organized in a
+two-dimensional grid of two-dimensional thread blocks, as is shown in Listing~\ref{ch13:list:01}.
+Each thread is in charge of $nz$ discretization points (one from each slice), accessed
+in the GPU memory with a constant stride $(NX\times ny)$.
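+
+As a complement to Listing~\ref{ch13:list:01}, the following sketch illustrates how
+the two-dimensional grid of thread blocks covering the $NX\times ny$ plane can be set
+up, and how each thread can loop over the $nz$ slices with the constant stride
+$NX\times ny$ mentioned above. It is only a minimal illustration of this thread
+mapping: the kernel name, the host function and the $16\times 16$ block size are
+hypothetical choices, not the code actually used in this chapter.
+\begin{lstlisting}[caption=Illustrative sketch of the slice-based thread mapping (hypothetical names)]
+// Sketch only: each thread handles one (x,y) point of every slice.
+__global__ void Slice_Kernel_Sketch(double *u, int NX, int ny, int nz)
+{
+  int tx = blockIdx.x * blockDim.x + threadIdx.x;  // x index in the slice
+  int ty = blockIdx.y * blockDim.y + threadIdx.y;  // y index in the slice
+  if (tx < NX && ty < ny) {
+    int id = ty * NX + tx;            // index of the point in the first slice
+    for (int k = 0; k < nz; k++) {    // loop over the nz slices
+      // ... update u[id] here ...
+      id += NX * ny;                  // constant stride to the next slice
+    }
+  }
+}
+
+// Host side: two-dimensional grid of two-dimensional thread blocks.
+void Launch_Slice_Kernel(double *u, int NX, int ny, int nz)
+{
+  dim3 block(16, 16);                              // e.g. 256 threads per block
+  dim3 grid((NX + block.x - 1) / block.x,
+            (ny + block.y - 1) / block.y);         // covers the NX x ny plane
+  Slice_Kernel_Sketch<<<grid, block>>>(u, NX, ny, nz);
+}
+\end{lstlisting}
+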
\begin{figure} \centerline{\includegraphics[scale=0.30]{Chapters/chapter13/figures/splitGPU}} \caption{Decomposition of a sub-problem in a GPU into $nz$ slices.} -\label{fig:02} +\label{ch13:fig:02} \end{figure} \begin{center} -\lstinputlisting[label=list:01,caption=Skeleton codes of a GPU kernel and a CPU function]{Chapters/chapter13/ex1.cu} +\lstinputlisting[label=ch13:list:01,caption=Skeleton codes of a GPU kernel and a CPU function]{Chapters/chapter13/ex1.cu} \end{center} -The function $Determine\_Bordering\_Vector\_Elements()$ (line~$5$ in Algorithm~\ref{alg:02}) determines the values of the vector -elements shared at the boundaries with neighboring computing nodes. Its main operations are defined as follows: +The function $Determine\_Bordering\_Vector\_Elements()$ (line~$5$ in Algorithm~\ref{ch13:alg:02}) +determines the values of the vector elements shared at the boundaries with neighboring computing +nodes. Its main operations are defined as follows: \begin{enumerate} \item define the values associated to the bordering points needed by the neighbors, \item copy the values associated to the bordering points from the GPU to the CPU, @@ -389,21 +475,30 @@ The first operation of this function is implemented as kernels to be performed b \item a kernel executed by $NX\times nz$ threads to define the values associated to the bordering vector elements along $y$-axis and, \item a kernel executed by $NX\times ny$ threads to define the values associated to the bordering vector elements along $z$-axis. \end{itemize} -As mentioned before, we develop the \emph{synchronous} and \emph{asynchronous} algorithms of the projected Richardson method. Obviously, -in this scope, the synchronous or asynchronous communications refer to the communications between the CPU cores (MPI processes) on the -GPU cluster, in order to exchange the vector elements associated to subdomain boundaries. For the memory copies between a CPU core and -its GPU, we use the synchronous communication routines of the CUBLAS library: \verb+cublasSetVector()+ and \verb+cublasGetVector()+ -in the synchronous algorithm and the asynchronous ones: \verb+cublasSetVectorAsync()+ and \verb+cublasGetVectorAsync()+ in the -asynchronous algorithm. Moreover, we use the communication routines of the MPI library to carry out the data exchanges between the neighboring -nodes. We use the following communication routines: \verb+MPI_Isend()+ and \verb+MPI_Irecv()+ to perform non-blocking sends and receptions, -respectively. For the synchronous algorithm, we use the MPI routine \verb+MPI_Waitall()+ which puts the MPI process of a computing node -in blocking status until all data exchanges with neighboring nodes (sends and receptions) are completed. In contrast, for the asynchronous -algorithms, we use the MPI routine \verb+MPI_Test()+ which tests the completion of a data exchange (send or reception) without putting the -MPI process in blocking status. - -The function $Compute\_New\_Vector\_Elements()$ (line~$6$ in Algorithm~\ref{alg:02}) computes, at each iteration, the new elements -of the iterate vector $U$. Its general code is presented in Listing~\ref{list:01} (CPU function). The iterations of the projected -Richardson method, based on those of the Jacobi method, are defined as follows: +As mentioned before, we develop the \emph{synchronous} and \emph{asynchronous} +algorithms of the projected Richardson method. 
Obviously, in this scope, the +synchronous\index{Synchronous} or asynchronous\index{Asynchronous} communications +refer to the communications between the CPU cores (MPI processes) on the GPU cluster, +in order to exchange the vector elements associated to subdomain boundaries. For +the memory copies between a CPU core and its GPU, we use the synchronous communication +routines of the CUBLAS library\index{CUBLAS}: \verb+cublasSetVector()+ and \verb+cublasGetVector()+ +in the synchronous algorithm and the asynchronous ones: \verb+cublasSetVectorAsync()+ +and \verb+cublasGetVectorAsync()+ in the asynchronous algorithm. Moreover, we +use the communication routines of the MPI library to carry out the data exchanges +between the neighboring nodes. We use the following communication routines: \verb+MPI_Isend()+ +and \verb+MPI_Irecv()+ to perform nonblocking\index{MPI~subroutines!Nonblocking} +sends and receptions, respectively. For the synchronous algorithm, we use the MPI +routine \verb+MPI_Waitall()+ which puts the MPI process of a computing node in +blocking status until all data exchanges with neighboring nodes (sends and receptions) +are completed. In contrast, for the asynchronous algorithms, we use the MPI routine +\verb+MPI_Test()+ which tests the completion of a data exchange (send or reception) +without putting the MPI process in blocking status\index{MPI~subroutines!Blocking}. + +The function $Compute\_New\_Vector\_Elements()$ (line~$6$ in Algorithm~\ref{ch13:alg:02}) +computes, at each iteration, the new elements of the iterate vector $U$. Its general code +is presented in Listing~\ref{ch13:list:01} (CPU function). The iterations of the projected +Richardson method\index{Iterative~method!Projected~Richardson}, based on those of the Jacobi +method\index{Iterative~method!Jacobi}, are defined as follows: \begin{equation} \begin{array}{ll} u^{p+1}(x,y,z) =& \frac{1}{Center}(g(x,y,z) - (Center\cdot u^{p}(x,y,z) + \\ @@ -411,64 +506,88 @@ u^{p+1}(x,y,z) =& \frac{1}{Center}(g(x,y,z) - (Center\cdot u^{p}(x,y,z) + \\ & South\cdot u^{p}(x,y-h,z) + North\cdot u^{p}(x,y+h,z) + \\ & Rear\cdot u^{p}(x,y,z-h) + Front\cdot u^{p}(x,y,z+h))), \end{array} -\label{eq:17} +\label{ch13:eq:17} \end{equation} -where $u^{p}(x,y,z)$ is an element of the iterate vector $U$ computed at the iteration $p$ and $g(x,y,z)$ is a vector element of the -right-hand side $G$. The scalars $Center$, $West$, $East$, $South$, $North$, $Rear$ and $Front$ define constant coefficients of the -block matrix $A$. Figure~\ref{fig:03} shows the positions of these coefficients in a three-dimensional domain. +where $u^{p}(x,y,z)$ is an element of the iterate vector $U$ computed at the +iteration $p$ and $g(x,y,z)$ is a vector element of the right-hand side $G$. +The scalars $Center$, $West$, $East$, $South$, $North$, $Rear$ and $Front$ +define constant coefficients of the block matrix $A$. Figure~\ref{ch13:fig:03} +shows the positions of these coefficients in a three-dimensional domain. \begin{figure} \centerline{\includegraphics[scale=0.35]{Chapters/chapter13/figures/matrix}} \caption{Matrix constant coefficients in a three-dimensional domain.} -\label{fig:03} +\label{ch13:fig:03} \end{figure} -The kernel implementations of the projected Richardson method on GPUs uses a perfect fine-grain multithreading parallelism. 
Since the -projected Richardson algorithm is implemented as a fixed point method, each kernel is executed by a large number of GPU threads such -that each thread is in charge of the computation of one element of the iterate vector $U$. Moreover, this method uses the vector elements -updates of the Jacobi method, which means that each thread $i$ computes the new value of its element $u_{i}^{p+1}$ independently of the -new values $u_{j}^{p+1}$, where $j\neq i$, of those computed in parallel by other threads at the same iteration $p+1$. Listing~\ref{list:02} -shows the GPU implementations of the main kernels of the projected Richardson method, which are: the matrix-vector multiplication -(\verb+MV_Multiplication()+) and the vector elements updates (\verb+Vector_Updates()+). The codes of these kernels are based on -that presented in Listing~\ref{list:01}. - -\lstinputlisting[label=list:02,caption=GPU kernels of the projected Richardson method]{Chapters/chapter13/ex2.cu} +The kernel implementations of the projected Richardson method on GPUs uses a +perfect fine-grain multithreading parallelism. Since the projected Richardson +algorithm is implemented as a fixed point method, each kernel is executed by +a large number of GPU threads such that each thread is in charge of the computation +of one element of the iterate vector $U$. Moreover, this method uses the vector +elements updates of the Jacobi method, which means that each thread $i$ computes +the new value of its element $u_{i}^{p+1}$ independently of the new values $u_{j}^{p+1}$, +where $j\neq i$, of those computed in parallel by other threads at the same iteration +$p+1$. Listing~\ref{ch13:list:02} shows the GPU implementations of the main kernels +of the projected Richardson method, which are: the matrix-vector multiplication +(\verb+MV_Multiplication()+) and the vector elements updates (\verb+Vector_Updates()+). +The codes of these kernels are based on that presented in Listing~\ref{ch13:list:01}. + +\lstinputlisting[label=ch13:list:02,caption=GPU kernels of the projected Richardson method]{Chapters/chapter13/ex2.cu} \begin{figure} \centerline{\includegraphics[scale=0.3]{Chapters/chapter13/figures/points3D}} \caption{Computation of a vector element with the projected Richardson method.} -\label{fig:04} +\label{ch13:fig:04} \end{figure} -Each kernel is executed by $NX\times ny$ GPU threads so that $nz$ slices of $(NX\times ny)$ vector elements are computed in -a {\bf for} loop. In this case, each thread is in charge of one vector element from each slice (in total $nz$ vector elements -along $z$-axis). We can notice from the formula~(\ref{eq:17}) that the computation of a vector element $u^{p+1}(x,y,z)$, by -a thread at iteration $p+1$, requires seven vector elements computed at the previous iteration $p$: two vector elements in -each dimension plus the vector element at the intersection of the three axises $x$, $y$ and $z$ (see Figure~\ref{fig:04}). -So, to reduce the memory accesses to the high-latency global memory, the vector elements of the current slice can be stored -in the low-latency shared memories of thread blocks, as is described in~\cite{ref9}. Nevertheless, the fact that the computation -of a vector element requires only two elements in each dimension does not allow to maximize the data reuse from the shared memories. 
-The computation of a slice involves in total $(bx+2)\times(by+2)$ accesses to the global memory per thread block, to fill the
-required vector elements in the shared memory where $bx$ and $by$ are the dimensions of a thread block. Then, in order to optimize
-the memory accesses on GPUs, the elements of the iterate vector $U$ are filled in the cache texture memory (see~\cite{ref10}).
-In new GPU generations as Fermi or Kepler, the global memory accesses are always cached in L1 and L2 caches. For example, for
-a given kernel, we can favour the use of the L1 cache to that of the shared memory by using the function \verb+cudaFuncSetCacheConfig(Kernel,cudaFuncCachePreferL1)+.
-So, the initial access to the global memory loads the vector elements required by the threads of a block into the cache memory
-(texture or L1/L2 caches). Then, all the following memory accesses read from this cache memory. In Listing~\ref{list:02}, the
-function \verb+fetch_double(v,i)+ is used to read from the texture memory the $i^{th}$ element of the double-precision vector
-\verb+v+ (see Listing~\ref{list:03}). Moreover, the seven constant coefficients of matrix $A$ can be stored in the constant memory
-but, since they are reused $nz$ times by each thread, it is more interesting to fill them on the low-latency registers of each thread.
-
-\lstinputlisting[label=list:03,caption=Memory access to the cache texture memory]{Chapters/chapter13/ex3.cu}
-
-The function $Convergence()$ (line~$11$ in Algorithm~\ref{alg:02}) allows to detect the convergence of the parallel iterative algorithm
-and is based on the tolerance threshold $\varepsilon$ and the maximum number of relaxations $MaxRelax$. We take into account the number
-of relaxations since that of iterations cannot be computed in the asynchronous case. Indeed, a relaxation is the update~(\ref{eq:13}) of
-a local iterate vector $U_i$ according to $F_i$. Then, counting the number of relaxations is possible in both synchronous and asynchronous
-cases. On the other hand, an iteration is the update of at least all vector components with $F_i$.
-
-In the synchronous algorithm, the global convergence is detected when the maximal value of the absolute error, $error$, is sufficiently small
-and/or the maximum number of relaxations, $MaxRelax$, is reached, as follows:
+Each kernel is executed by $NX\times ny$ GPU threads so that $nz$ slices
+of $(NX\times ny)$ vector elements are computed in a {\bf for} loop. In
+this case, each thread is in charge of one vector element from each slice
+(in total $nz$ vector elements along the $z$-axis). We can see from
+formula~(\ref{ch13:eq:17}) that the computation of a vector element $u^{p+1}(x,y,z)$,
+by a thread at iteration $p+1$, requires seven vector elements computed
+at the previous iteration $p$: two vector elements in each dimension plus
+the vector element at the intersection of the three axes $x$, $y$ and $z$
+(see Figure~\ref{ch13:fig:04}). So, to reduce the accesses to the
+high-latency global memory, the vector elements of the current slice can
+be stored in the low-latency shared memories of thread blocks, as is described
+in~\cite{ch13:ref9}. Nevertheless, since the computation of a vector element
+requires only two elements in each dimension, the data reuse achievable from
+the shared memories remains limited.
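To make the thread-per-element scheme concrete, the sketch below folds the matrix-vector product and the vector update of formula~(\ref{ch13:eq:17}) into a single, simplified kernel; the array names, the boundary handling and the inline projection onto $u\geq\phi=0$ are assumptions of this illustration, not the chapter's actual kernels of Listing~\ref{ch13:list:02}.
\begin{lstlisting}[language=C]
// Sketch only: one thread per (x,y) column, sweeping the nz slices along z.
__global__ void jacobi_update(const double *u_old, double *u_new,
                              const double *g, double center, double west,
                              double east, double south, double north,
                              double rear, double front,
                              int nx, int ny, int nz)
{
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= nx || y >= ny) return;

  for (int z = 0; z < nz; z++) {
    int i = (z * ny + y) * nx + x;                 // linear index of u(x,y,z)
    // Seven-point stencil of formula (17); boundaries are simplified here.
    double sum = center * u_old[i];
    if (x > 0)      sum += west  * u_old[i - 1];
    if (x < nx - 1) sum += east  * u_old[i + 1];
    if (y > 0)      sum += south * u_old[i - nx];
    if (y < ny - 1) sum += north * u_old[i + nx];
    if (z > 0)      sum += rear  * u_old[i - nx * ny];
    if (z < nz - 1) sum += front * u_old[i + nx * ny];
    double val = (g[i] - sum) / center;            // relaxation of formula (17)
    u_new[i] = fmax(val, 0.0);                     // projection, here with phi = 0
  }
}
\end{lstlisting}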
+The computation of a slice involves in total $(bx+2)\times(by+2)$ accesses to
+the global memory per thread block, to fill the required vector elements in the
+shared memory, where $bx$ and $by$ are the dimensions of a thread block. Then,
+in order to optimize the memory accesses on GPUs, the elements of the iterate
+vector $U$ are placed in the cached texture memory (see~\cite{ch13:ref10}). In
+newer GPU generations, such as Fermi or Kepler, the global memory accesses are
+always cached in the L1 and L2 caches. For example, for a given kernel, we can
+favor the use of the L1 cache over that of the shared memory by calling the function
+\verb+cudaFuncSetCacheConfig(Kernel,cudaFuncCachePreferL1)+.
+So, the initial access to the global memory loads the vector elements required
+by the threads of a block into the cache memory (texture or L1/L2 caches). Then,
+all the following memory accesses read from this cache memory. In Listing~\ref{ch13:list:02},
+the function \verb+fetch_double(v,i)+ is used to read from the texture memory
+the $i^{th}$ element of the double-precision vector \verb+v+ (see Listing~\ref{ch13:list:03}).
+Moreover, the seven constant coefficients of matrix $A$ could be stored in the
+constant memory but, since they are reused $nz$ times by each thread, it is more
+efficient to keep them in the low-latency registers of each thread.
+
+\lstinputlisting[label=ch13:list:03,caption=Memory access to the cache texture memory]{Chapters/chapter13/ex3.cu}
+
+The function $Convergence()$ (line~$11$ in Algorithm~\ref{ch13:alg:02}) detects
+the convergence of the parallel iterative algorithm and is based on
+the tolerance threshold\index{Convergence!Tolerance~threshold} $\varepsilon$
+and the maximum number of relaxations\index{Convergence!Maximum~number~of~relaxations}
+$MaxRelax$. We take into account the number of relaxations, rather than the number
+of iterations, because the latter cannot be computed in the asynchronous case.
+Indeed, a relaxation is the update~(\ref{ch13:eq:13}) of a local iterate vector
+$U_i$ according to $F_i$; counting the number of relaxations is therefore possible
+in both the synchronous and asynchronous cases. On the other hand, an iteration is
+the update of at least all vector components with $F_i$.
+
+In the synchronous\index{Synchronous} algorithm, the global convergence is detected
+when the maximal value of the absolute error, $error$, is sufficiently small and/or
+the maximum number of relaxations, $MaxRelax$, is reached, as follows:
 $$
 \begin{array}{l}
 error=\|U^{p}-U^{p+1}\|_{2}; \\
@@ -477,52 +596,90 @@ AllReduce(error,\hspace{0.1cm}maxerror,\hspace{0.1cm}MAX); \\
 conv \leftarrow true;
 \end{array}
 $$
-where the function $AllReduce()$ uses the MPI reduction subroutine \verb+MPI_Allreduce()+ to compute the maximal value, $maxerror$, among the
-local absolute errors, $error$, of all computing nodes and $p$ (in Algorithm~\ref{alg:02}) is used as a counter of the local relaxations carried
-out by a computing node. In the asynchronous algorithms, the global convergence is detected when all computing nodes locally converge. For this,
-we use a token ring architecture around which a boolean token travels, in one direction, from a computing node to another. Starting from node $0$,
-the boolean token is set to $true$ by node $i$ if the local convergence is reached or to $false$ otherwise and, then, it is sent to node $i+1$.
-Finally, the global convergence is detected when node $0$ receives from its neighbor node $S-1$, in the ring architecture, a token set to $true$.
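A minimal sketch of the synchronous convergence test written in the pseudo-code above could look as follows; the names $error$, $maxerror$ and $p$ follow the pseudo-code, while the function itself and its arguments are assumptions of this illustration, not the chapter's code.
\begin{lstlisting}[language=C]
/* Sketch only: the local error is assumed to be already computed. */
#include <mpi.h>

int convergence(double error, int p, double epsilon, int max_relax)
{
  double maxerror;
  /* AllReduce(error, maxerror, MAX): global maximum of the local errors. */
  MPI_Allreduce(&error, &maxerror, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  /* Convergence when the global error is small enough and/or the maximum
     number of relaxations is reached. */
  return (maxerror < epsilon) || (p >= max_relax);
}
\end{lstlisting}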
-In this case, node $0$ sends a stop message (end of parallel solving) to all computing nodes in the cluster. +where the function $AllReduce()$ uses the MPI global reduction subroutine\index{MPI~subroutines!Global} +\verb+MPI_Allreduce()+ to compute the maximal value, $maxerror$, among the local +absolute errors, $error$, of all computing nodes and $p$ (in Algorithm~\ref{ch13:alg:02}) +is used as a counter of the local relaxations carried out by a computing node. In +the asynchronous\index{Asynchronous} algorithms, the global convergence is detected +when all computing nodes locally converge. For this, we use a token ring architecture +around which a boolean token travels, in one direction, from a computing node to another. +Starting from node $0$, the boolean token is set to $true$ by node $i$ if the local +convergence is reached or to $false$ otherwise and, then, it is sent to node $i+1$. +Finally, the global convergence is detected when node $0$ receives from its neighbor +node $S-1$, in the ring architecture, a token set to $true$. In this case, node $0$ +sends a stop message (end of parallel solving) to all computing nodes in the cluster. %%--------------------------%% %% SECTION 5 %% %%--------------------------%% \section{Experimental tests on a GPU cluster} -\label{sec:05} -The GPU cluster of tests, that we used in this chapter, is an $20Gbps$ Infiniband network of six machines. Each machine is a Quad-Core Xeon -E5530 CPU running at $2.4$GHz. It provides a RAM memory of $12$GB with a memory bandwidth of $25.6$GB/s and it is equipped with two Nvidia -Tesla C1060 GPUs. A Tesla GPU contains in total $240$ cores running at $1.3$GHz. It provides $4$GB of global memory with a memory bandwidth -of $102$GB/s, accessible by all its cores and also by the CPU through the PCI-Express 16x Gen 2.0 interface with a throughput of $8$GB/s. -Hence, the memory copy operations between the GPU and the CPU are about $12$ times slower than those of the Tesla GPU memory. We have performed -our simulations on a cluster of $24$ CPU cores and on a cluster of $12$ GPUs. Figure~\ref{fig:05} describes the components of the GPU cluster -of tests. +\label{ch13:sec:05} +The GPU cluster\index{GPU~cluster} of tests, that we used in this chapter, is an $20Gbps$ +Infiniband network of six machines. Each machine is a Quad-Core Xeon E5530 CPU running at +$2.4$GHz. It provides a RAM memory of $12$GB with a memory bandwidth of $25.6$GB/s and it +is equipped with two Nvidia Tesla C1060 GPUs. A Tesla GPU contains in total $240$ cores +running at $1.3$GHz. It provides $4$GB of global memory with a memory bandwidth of $102$GB/s, +accessible by all its cores and also by the CPU through the PCI-Express 16x Gen 2.0 interface +with a throughput of $8$GB/s. Hence, the memory copy operations between the GPU and the CPU +are about $12$ times slower than those of the Tesla GPU memory. We have performed our simulations +on a cluster of $24$ CPU cores and on a cluster of $12$ GPUs. Figure~\ref{ch13:fig:05} describes +the components of the GPU cluster of tests. + +Linux cluster version 2.6.39 OS is installed on CPUs. C programming language is used for +coding the parallel algorithms of the methods on both GPU cluster and CPU cluster. CUDA +version 4.0~\cite{ch13:ref12} is used for programming GPUs, using CUBLAS library~\cite{ch13:ref8} +to deal with vector operations in GPUs and, finally, MPI functions of OpenMPI 1.3.3 are +used to carry out the synchronous and asynchronous communications between CPU cores. 
Indeed, +in our experiments, a computing node is managed by a MPI process and it is composed of +one CPU core and one GPU card. + +All experimental results of the parallel projected Richardson algorithms are obtained +from simulations made in double precision data. The obstacle problems to be solved are +defined in constant three-dimensional domain $\Omega\subset\mathbb{R}^{3}$. The numerical +values of the parameters of the obstacle problems are: $\eta=0.2$, $c=1.1$, $f$ is computed +by formula~(\ref{ch13:eq:18}) and final time $T=0.02$. Moreover, three time steps ($NbSteps=3$) +are computed with $k=0.0066$. As the discretization matrix is constant along the time +steps, the convergence properties of the iterative algorithms do not change. Thus, the +performance characteristics obtained with three time steps will still be valid for more +time steps. The initial function $u(0,x,y,z)$ of the obstacle problem~(\ref{ch13:eq:01}) +is set to $0$, with a constraint $u\geq\phi=0$. The relaxation parameter $\gamma$ used +by the projected Richardson method is computed automatically thanks to the diagonal entries +of the discretization matrix. The formula and its proof can be found in~\cite{ch13:ref11}, +Section~2.3. The convergence tolerance threshold $\varepsilon$ is set to $1e$-$04$ and the +maximum number of relaxations is limited to $10^{6}$ relaxations. Finally, the number of +threads per block is set to $256$ threads, which gives, in general, good performances for +most GPU applications. We have performed some tests for the execution configurations and +we have noticed that the best configuration of the $256$ threads per block is an organization +into two dimensions of sizes $(64,4)$. \begin{figure} \centerline{\includegraphics[scale=0.25]{Chapters/chapter13/figures/cluster}} \caption{GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs.} -\label{fig:05} +\label{ch13:fig:05} \end{figure} -Linux cluster version 2.6.39 OS is installed on CPUs. C programming language is used for coding the parallel algorithms of the methods on both -GPU cluster and CPU cluster. CUDA version 4.0~\cite{ref12} is used for programming GPUs, using CUBLAS library~\cite{ref8} to deal with vector -operations in GPUs and, finally, MPI functions of OpenMPI 1.3.3 are used to carry out the synchronous and asynchronous communications between -CPU cores. Indeed, in our experiments, a computing node is managed by a MPI process and it is composed of one CPU core and one GPU card. - -All experimental results of the parallel projected Richardson algorithms are obtained from simulations made in double precision data. The obstacle -problems to be solved are defined in constant three-dimensional domain $\Omega\subset\mathbb{R}^{3}$. The numerical values of the parameters of the -obstacle problems are: $\eta=0.2$, $c=1.1$, $f$ is computed by formula~(\ref{eq:18}) and final time $T=0.02$. Moreover, three time steps ($NbSteps=3$) -are computed with $k=0.0066$. As the discretization matrix is constant along the time steps, the convergence properties of the iterative algorithms -do not change. Thus, the performance characteristics obtained with three time steps will still be valid for more time steps. The initial function -$u(0,x,y,z)$ of the obstacle problem~(\ref{eq:01}) is set to $0$, with a constraint $u\geq\phi=0$. The relaxation parameter $\gamma$ used by the -projected Richardson method is computed automatically thanks to the diagonal entries of the discretization matrix. 
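As a small illustration of the execution configuration retained above, a kernel would typically be launched with $256$-thread blocks organized as $(64,4)$; the kernel name and the grid computation below are assumptions of this sketch.
\begin{lstlisting}[language=C]
// Sketch only: the kernel name and grid computation are assumed.
#include <cuda_runtime.h>

__global__ void update_kernel(double *u) { /* one vector element per thread */ }

void launch_update(double *d_u, int nx, int ny)
{
  dim3 block(64, 4);                        /* 64 x 4 = 256 threads per block */
  dim3 grid((nx + block.x - 1) / block.x,   /* enough blocks for nx x ny threads */
            (ny + block.y - 1) / block.y);
  update_kernel<<<grid, block>>>(d_u);
}
\end{lstlisting}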
The formula and its proof can be -found in~\cite{ref11}, Section~2.3. The convergence tolerance threshold $\varepsilon$ is set to $1e$-$04$ and the maximum number of relaxations is -limited to $10^{6}$ relaxations. Finally, the number of threads per block is set to $256$ threads, which gives, in general, good performances for -most GPU applications. We have performed some tests for the execution configurations and we have noticed that the best configuration of the $256$ -threads per block is an organization into two dimensions of sizes $(64,4)$. - -\begin{table}[!h] +The performance measures that we took into account are the execution times and the number +of relaxations performed by the parallel iterative algorithms, both synchronous and asynchronous +versions, on the GPU and CPU clusters. These algorithms are used for solving nonlinear systems +derived from the discretization of obstacle problems of sizes $256^{3}$, $512^{3}$, $768^{3}$ +and $800^{3}$. In Table~\ref{ch13:tab:01} and Table~\ref{ch13:tab:02}, we show the performances +of the parallel synchronous and asynchronous algorithms of the projected Richardson method +implemented, respectively, on a cluster of $24$ CPU cores and on a cluster of $12$ GPUs. In +these tables, the execution time defines the time spent by the slowest computing node and the +number of relaxations is computed as the summation of those carried out by all computing nodes. + +In the sixth column of Table~\ref{ch13:tab:01} and in the eighth column of Table~\ref{ch13:tab:02}, +we give the gains in $\%$ obtained by using an asynchronous algorithm compared to a synchronous +one. We can notice that the asynchronous version on CPU and GPU clusters is slightly faster than +the synchronous one for both methods. Indeed, the cluster of tests is composed of local and homogeneous +nodes communicating via low-latency connections. So, in the case of distant and/or heterogeneous +nodes (or even with geographically distant clusters) the asynchronous version would be faster than +the synchronous one. However, the gains obtained on the GPU cluster are better than those obtained +on the CPU cluster. In fact, the computation times are reduced by accelerating the computations on +GPUs while the communication times still unchanged. + +\begin{table} \centering \begin{tabular}{|c|c|c|c|c|c|} \hline @@ -540,105 +697,130 @@ $800^{3}$ & $222,108.09$ & $1,769,232$ & $188,790 \end{tabular} \vspace{0.5cm} \caption{Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 24 CPU cores.} -\label{tab:01} +\label{ch13:tab:01} \end{table} -\begin{table}[!h] +\begin{table} \centering \begin{tabular}{|c|c|c|c|c|c|c|c|} \hline -\multirow{2}{*}{\bf Pb. size} & \multicolumn{3}{c|}{\bf Synchronous} & \multicolumn{3}{c|}{\bf Asynchronous} & \multirow{2}{*}{\bf Gain\%} \\ \cline{2-7} +\multirow{2}{*}{\bf Pb. 
size} & \multicolumn{3}{c|}{\bf Synchronous} & \multicolumn{3}{c|}{\bf Asynchronous} & \multirow{2}{*}{\bf Gain\%} \\ \cline{2-7} - & $\mathbf{T_{gpu}}$ & {\bf \#relax.} & $\mathbf{\tau}$ & $\mathbf{T_{gpu}}$ & {\bf \#relax.} & $\mathbf{\tau}$ & \\ \hline \hline + & $\mathbf{T_{gpu}}$ & {\bf \#relax.} & $\mathbf{\tau}$ & $\mathbf{T_{gpu}}$ & {\bf \#relax.} & $\mathbf{\tau}$ & \\ \hline \hline -$256^{3}$ & $29.67$ & $100,692$ & $19.39$ & $18.00$ & $94,215$ & $29.96$ & $39.33$ \\ \hline \hline +$256^{3}$ & $29.67$ & $100,692$ & $19.39$ & $18.00$ & $94,215$ & $29.96$ & $39.33$ \\\hline \hline -$512^{3}$ & $521.83$ & $381,300$ & $36.89$ & $425.15$ & $347,279$ & $42.89$ & $18.53$ \\ \hline \hline +$512^{3}$ & $521.83$ & $381,300$ & $36.89$ & $425.15$ & $347,279$ & $42.89$ & $18.53$ \\\hline \hline -$768^{3}$ & $4,112.68$ & $831,144$ & $50.13$ & $3,313.87$ & $750,232$ & $55.40$ & $19.42$ \\ \hline \hline +$768^{3}$ & $4,112.68$ & $831,144$ & $50.13$ & $3,313.87$ & $750,232$ & $55.40$ & $19.42$ \\ \hline \hline -$800^{3}$ & $3,950.87$ & $899,088$ & $56.22$ & $3,636.57$ & $834,900$ & $51.91$ & $7.95$ \\ \hline +$800^{3}$ & $3,950.87$ & $899,088$ & $56.22$ & $3,636.57$ & $834,900$ & $51.91$ & $7.95$ \\ \hline \end{tabular} \vspace{0.5cm} \caption{Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.} -\label{tab:02} +\label{ch13:tab:02} \end{table} -The performance measures that we took into account are the execution times and the number of relaxations performed by the parallel iterative algorithms, -both synchronous and asynchronous versions, on the GPU and CPU clusters. These algorithms are used for solving nonlinear systems derived from the discretization -of obstacle problems of sizes $256^{3}$, $512^{3}$, $768^{3}$ and $800^{3}$. In Table~\ref{tab:01} and Table~\ref{tab:02}, we show the performances -of the parallel synchronous and asynchronous algorithms of the projected Richardson method implemented, respectively, on a cluster of $24$ CPU cores -and on a cluster of $12$ GPUs. In these tables, the execution time defines the time spent by the slowest computing node and the number of relaxations -is computed as the summation of those carried out by all computing nodes. - -In the sixth column of Table~\ref{tab:01} and in the eighth column of Table~\ref{tab:02}, we give the gains in $\%$ obtained by using an -asynchronous algorithm compared to a synchronous one. We can notice that the asynchronous version on CPU and GPU clusters is slightly faster -than the synchronous one for both methods. Indeed, the cluster of tests is composed of local and homogeneous nodes communicating via low-latency -connections. So, in the case of distant and/or heterogeneous nodes (or even with geographically distant clusters) the asynchronous version -would be faster than the synchronous one. However, the gains obtained on the GPU cluster are better than those obtained on the CPU cluster. -In fact, the computation times are reduced by accelerating the computations on GPUs while the communication times still unchanged. - -The fourth and seventh columns of Table~\ref{tab:02} show the relative gains obtained by executing the parallel algorithms on the cluster -of $12$ GPUs instead on the cluster of $24$ CPU cores. 
We compute the relative gain $\tau$ as a ratio between the execution time $T_{cpu}$ -spent on the CPU cluster over that $T_{gpu}$ spent on the GPU cluster: \[\tau=\frac{T_{cpu}}{T_{gpu}}.\] We can see from these ratios that -solving large obstacle problems is faster on the GPU cluster than on the CPU cluster. Indeed, the GPUs are more efficient than their -counterpart CPUs to execute large data-parallel operations. In addition, the projected Richardson method is implemented as a fixed point-based -iteration and uses the Jacobi vector updates that allow a well thread-parallelization on GPUs, such that each GPU thread is in charge -of one vector component at a time without being dependent on other vector components computed by other threads. Then, this allow to exploit -at best the high performance computing of the GPUs by using all the GPU resources and avoiding the idle cores. - -Finally, the number of relaxations performed by the parallel synchronous algorithm is different in the CPU and GPU versions, because the number -of computing nodes involved in the GPU cluster and in the CPU cluster is different. In the CPU case, $24$ computing nodes ($24$ CPU cores) are -considered, whereas in the GPU case, $12$ computing nodes ($12$ GPUs) are considered. As the number of relaxations depends on the domain decomposition, +The fourth and seventh columns of Table~\ref{ch13:tab:02} show the relative gains +obtained by executing the parallel algorithms on the cluster of $12$ GPUs instead +on the cluster of $24$ CPU cores. We compute the relative gain\index{Relative~gain} +$\tau$ as a ratio between the execution time $T_{cpu}$ spent on the CPU cluster over +that $T_{gpu}$ spent on the GPU cluster: \[\tau=\frac{T_{cpu}}{T_{gpu}}.\] We can see +from these ratios that solving large obstacle problems is faster on the GPU cluster +than on the CPU cluster. Indeed, the GPUs are more efficient than their counterpart +CPUs to execute large data-parallel operations. In addition, the projected Richardson +method is implemented as a fixed point-based iteration and uses the Jacobi vector updates +that allow a well thread-parallelization on GPUs, such that each GPU thread is in charge +of one vector component at a time without being dependent on other vector components +computed by other threads. Then, this allow to exploit at best the high performance +computing of the GPUs by using all the GPU resources and avoiding the idle cores. + +Finally, the number of relaxations performed by the parallel synchronous algorithm +is different in the CPU and GPU versions, because the number of computing nodes involved +in the GPU cluster and in the CPU cluster is different. In the CPU case, $24$ computing +nodes ($24$ CPU cores) are considered, whereas in the GPU case, $12$ computing nodes +($12$ GPUs) are considered. As the number of relaxations depends on the domain decomposition, consequently it also depends on the number of computing nodes. + %%--------------------------%% %% SECTION 6 %% %%--------------------------%% \section{Red-Black ordering technique} -\label{sec:06} -As is well-known, the Jacobi method is characterized by a slow convergence rate compared to some iterative methods (for example Gauss-Seidel method). -So, in this section, we present some solutions to reduce the execution time and the number of relaxations and, more specifically, to speed up the -convergence of the parallel projected Richardson method on the GPU cluster. 
We propose to use the point red-black ordering technique to accelerate -the convergence. This technique is often used to increase the parallelism of iterative methods for solving linear systems~\cite{ref13,ref14,ref15}. -We apply it to the projected Richardson method as a compromise between the Jacobi and Gauss-Seidel iterative methods. - -The general principle of the red-black technique is as follows. Let $t$ be the summation of the integer $x$-, $y$- and $z$-coordinates of a vector -element $u(x,y,z)$ on a three-dimensional domain: $t=x+y+z$. As is shown in Figure~\ref{fig:06.01}, the red-black ordering technique consists in the -parallel computing of the red vector elements having even value $t$ by using the values of the black ones then, the parallel computing of the black -vector elements having odd values $t$ by using the new values of the red ones. - -\begin{figure} -\centering - \mbox{\subfigure[Red-black ordering on x, y and z axises]{\includegraphics[width=2.3in]{Chapters/chapter13/figures/rouge-noir}\label{fig:06.01}}\quad - \subfigure[Red-black ordering on y axis]{\includegraphics[width=2.3in]{Chapters/chapter13/figures/rouge-noir-y}\label{fig:06.02}}} -\caption{Red-black ordering for computing the iterate vector elements in a three-dimensional space.} -\end{figure} +\label{ch13:sec:06} +As is well-known, the Jacobi method\index{Iterative~method!Jacobi} is characterized +by a slow convergence\index{Convergence} rate compared to some iterative methods\index{Iterative~method} +(for example Gauss-Seidel method\index{Iterative~method!Gauss-Seidel}). So, in this +section, we present some solutions to reduce the execution time and the number of +relaxations and, more specifically, to speed up the convergence of the parallel +projected Richardson method on the GPU cluster. We propose to use the point red-black +ordering technique\index{Iterative~method!Red-Black~ordering} to accelerate the +convergence. This technique is often used to increase the parallelism of iterative +methods for solving linear systems~\cite{ch13:ref13,ch13:ref14,ch13:ref15}. We +apply it to the projected Richardson method as a compromise between the Jacobi +and Gauss-Seidel iterative methods. + +The general principle of the red-black technique is as follows. Let $t$ be the +summation of the integer $x$-, $y$- and $z$-coordinates of a vector element $u(x,y,z)$ +on a three-dimensional domain: $t=x+y+z$. As is shown in Figure~\ref{ch13:fig:06.01}, +the red-black ordering technique consists in the parallel computing of the red +vector elements having even value $t$ by using the values of the black ones then, +the parallel computing of the black vector elements having odd values $t$ by using +the new values of the red ones. This technique can be implemented on the GPU in two different manners: \begin{itemize} \item among all launched threads ($NX\times ny$ threads), only one thread out of two computes its red or black vector element at a time or, \item all launched threads (on average half of $NX\times ny$ threads) compute the red vector elements first and, then, the black ones. \end{itemize} -However, in both solutions, for each memory transaction, only half of the memory segment addressed by a half-warp is used. So, the computation of the -red and black vector elements leads to use twice the initial number of memory transactions. Then, we apply the point red-black ordering accordingly to -the $y$-coordinate, as is shown in Figure~\ref{fig:06.02}. 
In this case, the vector elements having even $y$-coordinate are computed in parallel using -the values of those having odd $y$-coordinate and then vice-versa. Moreover, in the GPU implementation of the parallel projected Richardson method (Section~\ref{sec:04}), -we have shown that a sub-problem of size $(NX\times ny\times nz)$ is decomposed into $nz$ grids of size $(NX\times ny)$. Then, each kernel is executed -in parallel by $NX\times ny$ GPU threads, so that each thread is in charge of $nz$ vector elements along $z$-axis (one vector element in each grid of -the sub-problem). So, we propose to use the new values of the vector elements computed in grid $i$ to compute those of the vector elements in grid $i+1$. -Listing~\ref{list:04} describes the kernel of the matrix-vector multiplication and the kernel of the vector elements updates of the parallel projected -Richardson method using the red-black ordering technique. - -\lstinputlisting[label=list:04,caption=GPU kernels of the projected Richardson method using the red-black technique]{Chapters/chapter13/ex4.cu} - -Finally, we exploit the concurrent executions between the host functions and the GPU kernels provided by the GPU hardware and software. In fact, the kernel -launches are asynchronous (when this environment variable is not disabled on the GPUs), such that the control is returned to the host (MPI process) before -the GPU has completed the requested task (kernel)~\cite{ref12}. Therefore, all the kernels necessary to update the local vector elements, $u(x,y,z)$ where -$0<y<(ny-1)$ and $0<z<(nz-1)$, are executed first. Then, the values associated to the bordering vector elements are exchanged between the neighbors. Finally, -the values of the vector elements associated to the bordering vector elements are updated. In this case, the computation of the local vector elements is -performed concurrently with the data exchanges between neighboring CPUs and this in both synchronous and asynchronous cases. +However, in both solutions, for each memory transaction, only half of the memory +segment addressed by a half-warp is used. So, the computation of the red and black +vector elements leads to use twice the initial number of memory transactions. Then, +we apply the point red-black ordering\index{Iterative~method!Red-Black~ordering} +accordingly to the $y$-coordinate, as is shown in Figure~\ref{ch13:fig:06.02}. In +this case, the vector elements having even $y$-coordinate are computed in parallel +using the values of those having odd $y$-coordinate and then vice-versa. Moreover, +in the GPU implementation of the parallel projected Richardson method (Section~\ref{ch13:sec:04}), +we have shown that a sub-problem of size $(NX\times ny\times nz)$ is decomposed into +$nz$ grids of size $(NX\times ny)$. Then, each kernel is executed in parallel by +$NX\times ny$ GPU threads, so that each thread is in charge of $nz$ vector elements +along $z$-axis (one vector element in each grid of the sub-problem). So, we propose +to use the new values of the vector elements computed in grid $i$ to compute those +of the vector elements in grid $i+1$. Listing~\ref{ch13:list:04} describes the kernel +of the matrix-vector multiplication and the kernel of the vector elements updates of +the parallel projected Richardson method using the red-black ordering technique. 
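A simplified sketch of such a red-black pass over the $y$-coordinate is given below; it is an illustration under assumed names, not the chapter's Listing~\ref{ch13:list:04}. The host launches it once for the even $y$ values and once for the odd ones.
\begin{lstlisting}[language=C]
// Sketch only: color = 0 updates the even-y elements, color = 1 the odd-y ones.
__global__ void redblack_update(double *u, const double *g, double center,
                                double west, double east, double south,
                                double north, double rear, double front,
                                int nx, int ny, int nz, int color)
{
  int x = blockIdx.x * blockDim.x + threadIdx.x;
  int y = blockIdx.y * blockDim.y + threadIdx.y;
  if (x >= nx || y >= ny) return;
  if ((y & 1) != color) return;       // only one y-parity is updated per pass

  for (int z = 0; z < nz; z++) {
    int i = (z * ny + y) * nx + x;
    double sum = center * u[i];
    if (x > 0)      sum += west  * u[i - 1];
    if (x < nx - 1) sum += east  * u[i + 1];
    if (y > 0)      sum += south * u[i - nx];      // opposite color: newest values
    if (y < ny - 1) sum += north * u[i + nx];
    if (z > 0)      sum += rear  * u[i - nx * ny]; // grid z-1 already updated
    if (z < nz - 1) sum += front * u[i + nx * ny];
    u[i] = fmax((g[i] - sum) / center, 0.0);       // in-place update and projection
  }
}
// Host side: launch with color = 0 (red pass), then with color = 1 (black pass).
\end{lstlisting}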
+
+\begin{figure}
+\centering
+ \mbox{\subfigure[Red-Black ordering on the x, y and z axes]{\includegraphics[width=2.3in]{Chapters/chapter13/figures/rouge-noir}\label{ch13:fig:06.01}}\quad
+ \subfigure[Red-Black ordering on the y axis]{\includegraphics[width=2.3in]{Chapters/chapter13/figures/rouge-noir-y}\label{ch13:fig:06.02}}}
+\caption{Red-Black ordering for computing the iterate vector elements in a three-dimensional space.}
+\end{figure}
+
+\lstinputlisting[label=ch13:list:04,caption=GPU kernels of the projected Richardson method using the red-black technique]{Chapters/chapter13/ex4.cu}
+
+Finally, we exploit the concurrent executions between the host functions and the GPU
+kernels provided by the GPU hardware and software. In fact, the kernel launches are
+asynchronous (unless this behavior is explicitly disabled through an environment variable),
+such that the control is returned to the host (MPI process) before the GPU has completed
+the requested task (kernel)~\cite{ch13:ref12}. Therefore, all the kernels necessary to
+update the local vector elements, $u(x,y,z)$ where $0<y<(ny-1)$ and $0<z<(nz-1)$,
+are executed first. Then, the values associated with the bordering vector elements
+are exchanged between the neighbors. Finally, the vector elements associated with
+these borders are updated. In this case, the computation of the local vector elements
+is performed concurrently with the data exchanges between neighboring CPUs, in both
+the synchronous and asynchronous cases.
+
+In Table~\ref{ch13:tab:03}, we report the execution times and the number of relaxations
+performed on a cluster of $12$ GPUs by the parallel projected Richardson algorithms; it
+can be noted that the performance of the projected Richardson method is improved by using
+the point red-black ordering. We compare the performances of the parallel projected Richardson
+method with and without this latter ordering (Tables~\ref{ch13:tab:02} and~\ref{ch13:tab:03}).
+We can notice that both parallel synchronous and asynchronous algorithms are faster when
+they use the red-black ordering. Indeed, we can see in Table~\ref{ch13:tab:03} that the
+execution times of these algorithms are reduced, on average, by $32\%$ compared to those
+shown in Table~\ref{ch13:tab:02}.
 
 \begin{table}[!h]
 \centering
@@ -658,53 +840,66 @@ $800^{3}$ & $2,748.23$ & $638,916$ & $2,502.6
 \end{tabular}
 \vspace{0.5cm}
 \caption{Execution times in seconds of the parallel projected Richardson method using read-black ordering technique implemented on a cluster of 12 GPUs.}
-\label{tab:03}
+\label{ch13:tab:03}
 \end{table}
 
-In Table~\ref{tab:03}, we report the execution times and the number of relaxations performed on a cluster of $12$ GPUs by the parallel projected Richardson
-algorithms; it can be noted that the performances of the projected Richardson are improved by using the point read-black ordering. We compare the performances
-of the parallel projected Richardson method with and without this later ordering (Tables~\ref{tab:02} and~\ref{tab:03}). We can notice that both parallel synchronous
-and asynchronous algorithms are faster when they use the red-black ordering. Indeed, we can see in Table~\ref{tab:03} that the execution times of these algorithms
-are reduced, on average, by $32\%$ compared to those shown in Table~\ref{tab:02}.
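The overlap of computation and communication described above can be sketched as follows; all names are assumptions of this illustration, the real kernels being those of Listing~\ref{ch13:list:04}.
\begin{lstlisting}[language=C]
/* Sketch only: kernel and function names are assumed. */
#include <cuda_runtime.h>

__global__ void update_interior(double *u) { /* interior vector elements */ }
__global__ void update_borders(double *u)  { /* elements next to the borders */ }
void exchange_boundaries(void) { /* MPI_Isend()/MPI_Irecv(), then MPI_Waitall()
                                    or MPI_Test(), as sketched earlier */ }

void relaxation(double *d_u, dim3 grid, dim3 block)
{
  /* The launch is asynchronous: control returns to the MPI process at once. */
  update_interior<<<grid, block>>>(d_u);

  /* Overlap: exchange the boundary values while the GPU updates the interior. */
  exchange_boundaries();

  /* Wait for the kernel, then update the elements that use the received values. */
  cudaDeviceSynchronize();
  update_borders<<<grid, block>>>(d_u);
}
\end{lstlisting}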
- \begin{figure} -\centerline{\includegraphics[scale=0.9]{Chapters/chapter13/figures/scale}} +\centerline{\includegraphics[scale=0.7]{Chapters/chapter13/figures/scale}} \caption{Weak scaling of both synchronous and asynchronous algorithms of the projected Richardson method using red-black ordering technique.} -\label{fig:07} +\label{ch13:fig:07} \end{figure} -In Figure~\ref{fig:07}, we study the ratio between the computation time and the communication time of the parallel projected Richardson algorithms on a GPU cluster. -The experimental tests are carried out on a cluster composed of one to ten Tesla GPUs. We have focused on the weak scaling of both parallel, synchronous and asynchronous, -algorithms using the red-black ordering technique. For this, we have fixed the size of a sub-problem to $256^{3}$ per computing node (a CPU core and a GPU). Then, -Figure~\ref{fig:07} shows the number of relaxations performed, on average, per second by a computing node. We can see from this figure that the efficiency of the -asynchronous algorithm is almost stable, while that of the synchronous algorithm decreases (down to $81\%$ in this example) with the increasing of the number of -computing nodes on the cluster. This is due to the fact that the ratio between the time of the computation over that of the communication is reduced when the computations -are performed on GPUs. Indeed, GPUs compute faster than CPUs and communications are more time consuming. In this context, asynchronous algorithms are more scalable -than synchronous ones. So, with large scale GPU clusters, synchronous algorithms might be more penalized by communications, as can be deduced from Figure~\ref{fig:07}. -That is why we think that asynchronous iterative algorithms are all the more interesting in this case. +In Figure~\ref{ch13:fig:07}, we study the ratio between the computation time and +the communication time of the parallel projected Richardson algorithms on a GPU +cluster. The experimental tests are carried out on a cluster composed of one to +ten Tesla GPUs. We have focused on the weak scaling of both parallel, synchronous +and asynchronous, algorithms using the red-black ordering technique. For this, we +have fixed the size of a sub-problem to $256^{3}$ per computing node (a CPU core +and a GPU). Then, Figure~\ref{ch13:fig:07} shows the number of relaxations performed, +on average, per second by a computing node. We can see from this figure that the +efficiency of the asynchronous algorithm is almost stable, while that of the synchronous +algorithm decreases (down to $81\%$ in this example) with the increasing of the +number of computing nodes on the cluster. This is due to the fact that the ratio +between the time of the computation over that of the communication is reduced when +the computations are performed on GPUs. Indeed, GPUs compute faster than CPUs and +communications are more time consuming. In this context, asynchronous algorithms +are more scalable than synchronous ones. So, with large scale GPU clusters, synchronous\index{Synchronous} +algorithms might be more penalized by communications, as can be deduced from Figure~\ref{ch13:fig:07}. +That is why we think that asynchronous\index{Asynchronous} iterative algorithms +are all the more interesting in this case. 
%%--------------------------%% %% SECTION 7 %% %%--------------------------%% \section{Conclusion} -\label{sec:07} -Our main contribution, in this chapter, is the parallel implementation of an asynchronous iterative method on GPU clusters for solving large scale nonlinear -systems derived from the spatial discretization of three-dimensional obstacle problems. For this, we have implemented both synchronous and asynchronous algorithms of the -Richardson iterative method using a projection on a convex set. Indeed, this method uses point-based iterations of the Jacobi method that are very easy to parallelize on -parallel computers. We have shown that its adapted parallel algorithms to GPU architectures allows to exploit at best the computing power of the GPUs and to accelerate the -resolution of large nonlinear systems. Consequently, the experimental results have shown that solving nonlinear systems of large obstacle problems with this method is about -fifty times faster on a cluster of $12$ GPUs than on a cluster of $24$ CPU cores. Moreover, we have applied to this projected Richardson method the red-black ordering technique -which allows it to improve its convergence rate. Thus, the execution times of both parallel algorithms performed on the cluster of $12$ GPUs are reduced on average of $32\%$. - -Afterwards, the experiments have shown that the asynchronous version is slightly more efficient than the synchronous one. In fact, the computations are accelerated by using GPUs -while the communication times still unchanged. In addition, we have studied the weak-scaling in the synchronous and asynchronous cases, which has confirmed that the ratio between -the computations and the communications are reduced when using a cluster of GPUs. We highlight that asynchronous iterative algorithms are more scalable than synchronous ones. -Therefore, we can conclude that asynchronous iterations are well suited to tackle scalability issues on GPU clusters. - -In future works, we plan to perform experiments on large scale GPU clusters and on geographically distant GPU clusters, because we expect that asynchronous versions would -be faster and more scalable on such architectures. Furthermore, we want to study the performance behavior and the scalability of other numerical algorithms which support, -if possible, the model of asynchronous iterations. +\label{ch13:sec:07} +Our main contribution, in this chapter, is the parallel implementation of an asynchronous +iterative method on GPU clusters for solving large scale nonlinear systems derived from the +spatial discretization of three-dimensional obstacle problems. For this, we have implemented +both synchronous and asynchronous algorithms of the Richardson iterative method using a projection +on a convex set. Indeed, this method uses point-based iterations of the Jacobi method that +are very easy to parallelize on parallel computers. We have shown that its adapted parallel +algorithms to GPU architectures allows to exploit at best the computing power of the GPUs and +to accelerate the resolution of large nonlinear systems. Consequently, the experimental results +have shown that solving nonlinear systems of large obstacle problems with this method is about +fifty times faster on a cluster of $12$ GPUs than on a cluster of $24$ CPU cores. Moreover, +we have applied to this projected Richardson method the red-black ordering technique which +allows it to improve its convergence rate. 
Thus, the execution times of both parallel algorithms +performed on the cluster of $12$ GPUs are reduced on average of $32\%$. + +Afterwards, the experiments have shown that the asynchronous version is slightly more efficient +than the synchronous one. In fact, the computations are accelerated by using GPUs while the communication +times still unchanged. In addition, we have studied the weak-scaling in the synchronous and asynchronous +cases, which has confirmed that the ratio between the computations and the communications are reduced +when using a cluster of GPUs. We highlight that asynchronous iterative algorithms are more scalable +than synchronous ones. Therefore, we can conclude that asynchronous iterations are well suited to +tackle scalability issues on GPU clusters. + +In future works, we plan to perform experiments on large scale GPU clusters and on geographically +distant GPU clusters, because we expect that asynchronous versions would be faster and more scalable +on such architectures. Furthermore, we want to study the performance behavior and the scalability of +other numerical algorithms which support, if possible, the model of asynchronous iterations. \putbib[Chapters/chapter13/biblio13] diff --git a/BookGPU/Chapters/chapter13/figures/cluster.eps b/BookGPU/Chapters/chapter13/figures/cluster.eps index d143f51..75027da 100644 --- a/BookGPU/Chapters/chapter13/figures/cluster.eps +++ b/BookGPU/Chapters/chapter13/figures/cluster.eps @@ -1,8 +1,8 @@ %!PS-Adobe-2.0 EPSF-2.0 %%Title: cluster.fig %%Creator: fig2dev Version 3.2 Patchlevel 5c -%%CreationDate: Mon Jan 14 10:39:57 2013 -%%BoundingBox: 0 0 1391 706 +%%CreationDate: Sat Feb 9 22:21:11 2013 +%%BoundingBox: 0 0 1391 723 %Magnification: 1.0000 %%EndComments %%BeginProlog @@ -127,13 +127,26 @@ newfontname newfont definefont pop end } def 8#374 /udieresis 8#375 /yacute 8#376 /thorn 8#377 /ydieresis] def /Times-Roman /Times-Roman-iso isovec ReEncode /Times-Bold /Times-Bold-iso isovec ReEncode + /DrawEllipse { + /endangle exch def + /startangle exch def + /yrad exch def + /xrad exch def + /y exch def + /x exch def + /savematrix mtrx currentmatrix def + x y tr xrad yrad sc 0 0 1 startangle endangle arc + closepath + savematrix setmatrix + } def + /$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def /$F2psEnd {$F2psEnteredState restore end} def /pageheader { save -newpath 0 706 moveto 0 0 lineto 1391 0 lineto 1391 706 lineto closepath clip newpath --29.1 723.8 translate +newpath 0 723 moveto 0 0 lineto 1391 0 lineto 1391 723 lineto closepath clip newpath +-29.1 739.8 translate 1 -1 scale $F2psBegin 10 setmiterlimit @@ -447,61 +460,76 @@ n 18270 6750 m 17280 6750 l n 18012 6825 m 18222 6750 l 18012 6675 l col18 s % arrowhead n 17355 4443 m 17280 4233 l 17205 4443 l col18 s -% Polyline +% Ellipse 15.000 slw - [15 45] 45 sd -n 13545 4950 m - 16245 4950 l gs col0 s gr [] 0 sd +n 13725 4950 90 90 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr + +% Ellipse +n 15975 4950 90 90 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr + +% Ellipse +n 15615 4950 90 90 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr + +% Ellipse +n 15255 4950 90 90 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr + +% Ellipse +n 14895 4950 90 90 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr + +% Ellipse +n 14535 4950 90 90 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr + +% Ellipse +n 14130 4950 90 90 0 360 DrawEllipse gs 0.00 setgray ef gr gs col0 s gr + % Polyline 45.000 slw gs clippath -19462 9963 m 19462 9705 l 19237 9705 
l 19237 9963 l 19237 9963 l 19350 9738 l 19462 9963 l cp -3712 9963 m 3712 9705 l 3487 9705 l 3487 9963 l 3487 9963 l 3600 9738 l 3712 9963 l cp +10237 10557 m 10237 10815 l 10462 10815 l 10462 10557 l 10462 10557 l 10350 10782 l 10237 10557 l cp +10462 9963 m 10462 9705 l 10237 9705 l 10237 9963 l 10237 9963 l 10350 9738 l 10462 9963 l cp eoclip -n 3600 9720 m 3600 10800 l 19350 10800 l - 19350 9720 l gs col0 s gr gr +n 10350 9720 m + 10350 10800 l gs col0 s gr gr % arrowhead 15.000 slw -n 3712 9963 m 3600 9738 l 3487 9963 l col0 s +n 10462 9963 m 10350 9738 l 10237 9963 l col0 s % arrowhead -n 19462 9963 m 19350 9738 l 19237 9963 l col0 s +n 10237 10557 m 10350 10782 l 10462 10557 l col0 s % Polyline -7.500 slw +45.000 slw gs clippath 3487 10557 m 3487 10815 l 3712 10815 l 3712 10557 l 3712 10557 l 3600 10782 l 3487 10557 l cp +3712 9963 m 3712 9705 l 3487 9705 l 3487 9963 l 3487 9963 l 3600 9738 l 3712 9963 l cp eoclip -n 3600 10350 m +n 3600 9720 m 3600 10800 l gs col0 s gr gr % arrowhead 15.000 slw +n 3712 9963 m 3600 9738 l 3487 9963 l col0 s +% arrowhead n 3487 10557 m 3600 10782 l 3712 10557 l col0 s % Polyline -7.500 slw +45.000 slw +n 2805 10800 m 2700 10800 2700 11595 105 arcto 4 {pop} repeat + 2700 11700 20145 11700 105 arcto 4 {pop} repeat + 20250 11700 20250 10905 105 arcto 4 {pop} repeat + 20250 10800 2805 10800 105 arcto 4 {pop} repeat + cp gs col0 s gr +% Polyline gs clippath 19237 10557 m 19237 10815 l 19462 10815 l 19462 10557 l 19462 10557 l 19350 10782 l 19237 10557 l cp +19462 9963 m 19462 9705 l 19237 9705 l 19237 9963 l 19237 9963 l 19350 9738 l 19462 9963 l cp eoclip -n 19350 10350 m +n 19350 9720 m 19350 10800 l gs col0 s gr gr % arrowhead 15.000 slw -n 19237 10557 m 19350 10782 l 19462 10557 l col0 s -% Polyline -45.000 slw -gs clippath -10237 10557 m 10237 10815 l 10462 10815 l 10462 10557 l 10462 10557 l 10350 10782 l 10237 10557 l cp -10462 9963 m 10462 9705 l 10237 9705 l 10237 9963 l 10237 9963 l 10350 9738 l 10462 9963 l cp -eoclip -n 10350 9720 m - 10350 10800 l gs col0 s gr gr - -% arrowhead -15.000 slw -n 10462 9963 m 10350 9738 l 10237 9963 l col0 s +n 19462 9963 m 19350 9738 l 19237 9963 l col0 s % arrowhead -n 10237 10557 m 10350 10782 l 10462 10557 l col0 s +n 19237 10557 m 19350 10782 l 19462 10557 l col0 s /Times-Bold-iso ff 317.50 scf sf 4365 2205 m gs 1 -1 sc (240 cores) col0 sh gr @@ -702,7 +730,7 @@ gs 1 -1 sc 90.0 rot (Node 11) col18 sh gr gs 1 -1 sc (Machine 6) col0 sh gr /Times-Roman-iso ff 539.75 scf sf 9810 11475 m -gs 1 -1 sc (Infiniband 20GB/s) col0 sh gr +gs 1 -1 sc (Infiniband 20Gbps) col0 sh gr % here ends figure; pagefooter showpage diff --git a/BookGPU/Chapters/chapter13/figures/cluster.pdf b/BookGPU/Chapters/chapter13/figures/cluster.pdf index dc6db85..6eb4739 100644 Binary files a/BookGPU/Chapters/chapter13/figures/cluster.pdf and b/BookGPU/Chapters/chapter13/figures/cluster.pdf differ