From: couturie Date: Mon, 25 Feb 2013 10:09:53 +0000 (+0100) Subject: ch12 X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/59263d81c5f07e8eff22c9091a0847e79b4fbf2c?hp=a9f4a14a06edf88832d058ec85346ff66bd171b4 ch12 --- diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index 0b7f5cf..76dc8f6 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -138,18 +138,19 @@ \setcounter{page}{1} \part{This is a Part} -%%\include{Chapters/chapter1/ch1} -%% \include{Chapters/chapter2/ch2} -%% \include{Chapters/chapter3/ch3} -%% \include{Chapters/chapter5/ch5} -%% \include{Chapters/chapter6/ch6} -%% \include{Chapters/chapter7/ch7} -%% \include{Chapters/chapter8/ch8} -%% \include{Chapters/chapter9/ch9} -%% \include{Chapters/chapter11/ch11} -%% \include{Chapters/chapter14/ch14} -%% \include{Chapters/chapter15/ch15} -%% \include{Chapters/chapter16/ch16} +\include{Chapters/chapter1/ch1} +\include{Chapters/chapter2/ch2} +\include{Chapters/chapter3/ch3} +\include{Chapters/chapter5/ch5} +\include{Chapters/chapter6/ch6} +\include{Chapters/chapter7/ch7} +\include{Chapters/chapter8/ch8} +\include{Chapters/chapter9/ch9} +\include{Chapters/chapter11/ch11} +\include{Chapters/chapter12/ch12} +\include{Chapters/chapter14/ch14} +\include{Chapters/chapter15/ch15} +\include{Chapters/chapter16/ch16} \include{Chapters/chapter18/ch18} \bibliographystyle{hep} diff --git a/BookGPU/Chapters/chapter12/biblio12.bib b/BookGPU/Chapters/chapter12/biblio12.bib new file mode 100644 index 0000000..8cc4ee7 --- /dev/null +++ b/BookGPU/Chapters/chapter12/biblio12.bib @@ -0,0 +1,163 @@ +@article{ref1, +title = {Iterative methods for sparse linear systems}, +author = {Saad, Yousef}, +journal = {Society for Industrial and Applied Mathematics, 2nd edition}, +volume = {}, +number = {}, +pages = {}, +year = {2003}, +} + +@article{ref2, +title = {Methods of conjugate gradients for solving linear systems}, +author = {Hestenes, Maqnus R. and Stiefel, Eduard}, +journal = {Journal of Research of the National Bureau of Standards}, +volume = {49}, +number = {6}, +pages = {409--436}, +year = {1952}, +} + +@article{ref3, +title = {{GMRES}: a generalized minimal residual algorithm for solving nonsymmetric linear systems}, +author = {Saad, Yousef and Schultz, Martin H.}, +journal = {SIAM Journal on Scientific and Statistical Computing}, +volume = {7}, +number = {3}, +pages = {856--869}, +year = {1986}, +} + +@article{ref4, +title = {Solution of sparse indefinite systems of linear equations}, +author = {Paige, Chris C. and Saunders, Michael A.}, +journal = {SIAM Journal on Numerical Analysis}, +volume = {12}, +number = {4}, +pages = {617--629}, +year = {1975}, +} + +@article{ref5, +title = {The principle of minimized iteration in the solution of the matrix eigenvalue problem}, +author = {Arnoldi, Walter E.}, +journal = {Quarterly of Applied Mathematics}, +volume = {9}, +number = {17}, +pages = {17--29}, +year = {1951}, +} + +@article{ref6, +title = {{CUDA} Toolkit 4.2 {CUBLAS} Library}, +author = {NVIDIA Corporation}, +journal = {}, +volume = {}, +number = {}, +pages = {}, +note = {\url{http://developer.download.nvidia.com/compute/DevZone/docs/html/CUDALibraries/doc/CUBLAS_Library.pdf}}, +year = {2012}, +} + +@article{ref7, +title = {Efficient sparse matrix-vector multiplication on {CUDA}}, +author = {Bell, Nathan and Garland, Michael}, +journal = {NVIDIA Technical Report NVR-2008-004, NVIDIA Corporation}, +volume = {}, +number = {}, +pages = {}, +year = {2008}, +} + +@article{ref8, +title = {{CUSP} {L}ibrary}, +author = {}, +journal = {}, +volume = {}, +number = {}, +pages = {}, +note = {\url{http://code.google.com/p/cusp-library/}}, +year = {}, +} + +@article{ref9, +title = {{NVIDIA} {CUDA} {C} programming guide}, +author = {NVIDIA Corporation}, +journal = {}, +volume = {}, +number = {}, +pages = {}, +note = {Version 4.2}, +year = {2012}, +} + +@article{ref10, +title = {The university of {F}lorida sparse matrix collection}, +author = {Davis, T. and Hu, Y.}, +journal = {}, +volume = {}, +number = {}, +pages = {}, +note = {\url{http://www.cise.ufl.edu/research/sparse/matrices/list_by_id.html}}, +year = {1997}, +} + +@article{ref11, +title = {Hypergraph partitioning based decomposition for parallel sparse matrix-vector multiplication}, +author = {Catalyurek, U. and Aykanat, C.}, +journal = {{IEEE} {T}ransactions on {P}arallel and {D}istributed {S}ystems}, +volume = {10}, +number = {7}, +pages = {673--693}, +note = {}, +year = {1999}, +} + +@article{ref12, +title = {{hMETIS}: A hypergraph partitioning package}, +author = {Karypis, G. and Kumar, V.}, +journal = {}, +volume = {}, +number = {}, +pages = {}, +note = {}, +year = {1998}, +} + +@article{ref13, +title = {{PaToH}: partitioning tool for hypergraphs}, +author = {Catalyurek, U. and Aykanat, C.}, +journal = {}, +volume = {}, +number = {}, +pages = {}, +note = {}, +year = {1999}, +} + +@article{ref14, +title = {Parallel hypergraph partitioning for scientific computing}, +author = {Devine, K.D. and Boman, E.G. and Heaphy, R.T. and Bisseling, R.H and Catalyurek, U.V.}, +journal = {In Proceedings of the 20th international conference on Parallel and distributed processing, IPDPS’06}, +volume = {}, +number = {}, +pages = {124–124}, +note = {}, +year = {2006}, +} + +@article{ref15, +title = {{PHG} - parallel hypergraph and graph partitioning with {Z}oltan}, +author = {}, +journal = {}, +volume = {}, +number = {}, +pages = {}, +note = {\url{http://www.cs.sandia.gov/Zoltan/ug_html/ug_alg_phg.html}}, +year = {}, +} + + + + + diff --git a/BookGPU/Chapters/chapter12/bibunits.sty b/BookGPU/Chapters/chapter12/bibunits.sty new file mode 100644 index 0000000..e5e7bb9 --- /dev/null +++ b/BookGPU/Chapters/chapter12/bibunits.sty @@ -0,0 +1,314 @@ +%% +%% This is file `bibunits.sty', +%% generated with the docstrip utility. +%% +%% The original source files were: +%% +%% bibunits.dtx (with options: `package') +%% +%% IMPORTANT NOTICE: +%% +%% For the copyright see the source file. +%% +%% Any modified versions of this file must be renamed +%% with new filenames distinct from bibunits.sty. +%% +%% For distribution of the original source see the terms +%% for copying and modification in the file bibunits.dtx. +%% +%% This generated file may be distributed as long as the +%% original source files, as listed above, are part of the +%% same distribution. (The sources need not necessarily be +%% in the same archive or directory.) +%% Package `bibunits' to use with LaTeX2e. +%% Copyright (C) 1999, 2000 by Thorsten Hansen. All rights reserved. +%% +\NeedsTeXFormat{LaTeX2e} +\ProvidesPackage{bibunits} + [2000/10/10 v2.2 Multiple bibliographies in one document.] +\newif\iflabelstoglobalaux \labelstoglobalauxfalse +\DeclareOption{labelstoglobalaux}{\labelstoglobalauxtrue} +\newif\ifglobalcitecopy +\globalcitecopyfalse +\DeclareOption{globalcitecopy}{\globalcitecopytrue} +\ProcessOptions +\newwrite\@bibunitaux +\newcount\@bibunitauxcnt \@bibunitauxcnt=0 +\def\@bibunitname{bu\the\@bibunitauxcnt} +\newif\if@starredversion +\let\std@cite\cite +\DeclareRobustCommand\bu@cite{% + \@ifstar + {\@starredversiontrue\std@cite}% + {\@starredversionfalse\std@cite}% +} +\AtBeginDocument{% + \@ifpackageloaded{natbib}% + {% + \NAT@set@cites \let\std@@citex\@citex + \def\bu@@citex[#1][#2]#3{% + \@startbibunitorrelax + \leavevmode + \begingroup\let\@auxout\@bibunitaux\std@@citex[#1][#2]{#3}\endgroup + \ifglobalcitecopy + \std@nocite{#3}% + \fi + }% + }% + {% + \@ifpackageloaded{overcite}% + {% + \let\std@@citew\@citew + \def\bu@@citew#1{% + \@startbibunitorrelax + \leavevmode + {\let\@auxout\@bibunitaux \std@@citew{#1}}% + \ifglobalcitecopy + \@nocite{#1}% + \else + \if@starredversion + \@nocite{#1}% + \fi + \fi + } + \let\std@@citex\@citex + \def\bu@@citex[#1]#2{% + \@startbibunitorrelax + \leavevmode + {\let\@auxout\@bibunitaux \std@@citex[#1]{#2}}% + \ifglobalcitecopy + \@nocite{#2}% + \else + \if@starredversion + \@nocite{#2}% + \fi + \fi + } + }% + {% + \@ifpackageloaded{jurabib}% + {% + \let\std@@citex\@citex + \def\bu@@citex[#1][#2]#3{% + \@startbibunitorrelax + \leavevmode + {\let\@auxout\@bibunitaux \std@@citex[#1][#2]{#3}}% + \ifglobalcitecopy + \std@nocite{#3}% + \else + \if@starredversion + \std@nocite{#3}% + \fi + \fi + }% + }% + {% + \let\std@@citex\@citex + \def\bu@@citex[#1]#2{% + \@startbibunitorrelax + \leavevmode + {\let\@auxout\@bibunitaux \std@@citex[#1]{#2}}% + \ifglobalcitecopy + \std@nocite{#2}% + \else + \if@starredversion + \std@nocite{#2}% + \fi + \fi + }% + }% + }% + }% +}% +\let\std@nocite\nocite +\def\bu@nocite{% + \@ifstar + {\@starredversiontrue\@bu@nocite}% + {\@starredversionfalse\@bu@nocite}% +} +\def\@bu@nocite#1{% + \@startbibunitorrelax + {\let\@auxout\@bibunitaux \std@nocite{#1}}% + \ifglobalcitecopy + \std@nocite{#1}% + \else + \if@starredversion + \std@nocite{#1}% + \fi + \fi +} +\def\bu@bibdata{} +\AtBeginDocument{% + \iflabelstoglobalaux + \else + \let\orig@bibliography\bibliography + \def\bibliography#1{% + \if@filesw + \immediate\openout\@bibunitaux bu.aux + \immediate\write\@mainaux{\string\@input{bu.aux}}% + \fi + \orig@bibliography{#1}% + \if@filesw + \immediate\closeout\@bibunitaux + \fi + }% + \fi + \let\std@bibliography\bibliography +} +\def\bu@bibliography{% + \@ifstar + {\@starredversiontrue\@bu@bibliography}% + {\@starredversionfalse\@bu@bibliography}% +} +\def\@bu@bibliography#1{% + \if@filesw + \immediate\write\@auxout{\string\gdef\string\bu@bibdata{#1}}% + \fi + \gdef\bu@bibdata{#1}% + \if@starredversion + \else + \std@bibliography{#1}% + \fi +} +\def\bu@bibstyle{} +\let\std@bibliographystyle\bibliographystyle +\def\bu@bibliographystyle{% + \@ifstar + {\@starredversiontrue\@bu@bibliographystyle}% + {\@starredversionfalse\@bu@bibliographystyle}% +} +\def\@bu@bibliographystyle#1{% + \if@filesw + \immediate\write\@auxout{\string\gdef\string\bu@bibstyle{#1}}% + \fi + \gdef\bu@bibstyle{#1}% + \if@starredversion + \else + \std@bibliographystyle{#1}% + \fi +} +\def\bibunit{% + \global\let\cite\bu@cite + \global\let\@citex\bu@@citex + \global\let\@citew\bu@@citew + \global\let\nocite\bu@nocite + \global\let\@startbibunitorrelax\@startbibunit + \global\let\@finishbibunit\relax + \@ifnextchar[{\@bibunitx}{\@bibunitx[\bu@bibstyle]}% +} +\def\@bibunitx[#1]{\gdef\@localbibstyle{#1}} +\def\endbibunit{% + \global\let\cite\std@cite + \global\let\@citex\std@@citex + \global\let\@citew\std@@citew + \global\let\nocite\std@nocite + \@finishbibunit + \@input{bu.aux}% +} +\def\@startbibunit{% + \global\let\@startbibunitorrelax\relax + \global\let\@finishbibunit\@finishstartedbibunit + \global\advance\@bibunitauxcnt 1 + \if@filesw + {\endlinechar-1 + \@input{\@bibunitname.aux}}% + \immediate\openout\@bibunitaux\@bibunitname.aux + \immediate\write\@bibunitaux{\string\bibstyle{\@localbibstyle}}% + \fi +} +\let\@finishbibunit\relax +\def\@finishstartedbibunit{% + \if@filesw + \immediate\closeout\@bibunitaux + \fi +} +\let\old@bibunit\@gobble +\def\@bibunit{\endbibunit\bibunit\old@bibunit} +\def\@endbibunit{} +\def\bibliographyunit{% + \@endbibunit + \@ifnextchar[{\@bibliographyunit}{% + \global\let\old@bibunit\@gobble + \global\let\bibliography\std@bibliography + \global\let\bibliographystyle\std@bibliographystyle + \endbibunit + \gdef\@endbibunit{}}% +} +\def\@bibliographyunit[#1]{% + \global\let\bibliography\bu@bibliography + \global\let\bibliographystyle\bu@bibliographystyle + \global\let\old@bibunit#1 + \global\let#1\@bibunit + \gdef\@endbibunit{\global\let#1\old@bibunit}% +} +\def\putbib{\@ifnextchar[{\@putbib}{\@putbib[\bu@bibdata]}} +\def\@putbib[#1]{% + \@startbibunitorrelax + \if@filesw + \immediate\write\@bibunitaux{\string\bibdata{#1}}% + \fi + \@input@{\@bibunitname.bbl}% +} +\AtBeginDocument{% + \iflabelstoglobalaux + \else + \let\std@bibitem\@bibitem + \let\std@lbibitem\@lbibitem + \def\@bibitem#1{% + \let\temp@auxout\@auxout + \let\@auxout\@bibunitaux + \std@bibitem{#1}% + \let\@auxout\temp@auxout + } + \def\@lbibitem[#1]#2{% + \let\temp@auxout\@auxout + \let\@auxout\@bibunitaux + \std@lbibitem[#1]{#2}% + \let\@auxout\temp@auxout + } + \fi +} +\def\remequivalent#1\from#2{% + \let\given=#1% + \ifx#2\empty + \else + \edef#2{\expandafter\plugh#2\plugh}% + \fi +} +\def\plugh\do#1#2{% + \ifx#1\given + \else + \noexpand\do\noexpand#1% + \fi + \ifx#2\plugh + \hgulp\fi\plugh#2% +} +\def\hgulp\fi\plugh\plugh{\fi} +\remequivalent\bibcite\from\@preamblecmds +\AtBeginDocument{% + \@ifpackageloaded{natbib}% + {\renewcommand\bibcite[2]{\global\@namedef{b@#1\@extra@binfo}{#2}}}% + {\renewcommand\bibcite[2]{\global\@namedef{b@#1}{#2}}}% +} +\AtBeginDocument{% + \@ifundefined{bbl@redefine}% + {}% + {% + \bbl@redefine\@input#1{% + \@safe@activestrue\org@@input{#1}\@safe@activesfalse}% + \@ifpackageloaded{natbib}% + {% + \bbl@redefine\std@@citex[#1][#2]#3{% + \@safe@activestrue\org@std@@citex[#1][#2]{#3}\@safe@activesfalse}% + \bbl@redefine\bu@@citex[#1][#2]#3{% + \@safe@activestrue\org@bu@@citex[#1][#2]{#3}\@safe@activesfalse}% + }% + {% natbib not loaded + \bbl@redefine\std@@citex[#1]#2{% + \@safe@activestrue\org@std@@citex[#1]{#2}\@safe@activesfalse}% + }% + }% +} +\endinput +%% +%% End of file `bibunits.sty'. diff --git a/BookGPU/Chapters/chapter12/ch12.aux b/BookGPU/Chapters/chapter12/ch12.aux new file mode 100644 index 0000000..0cc343d --- /dev/null +++ b/BookGPU/Chapters/chapter12/ch12.aux @@ -0,0 +1,133 @@ +\relax +\@writefile{toc}{\author{}{}} +\@writefile{loa}{\addvspace {10\p@ }} +\@writefile{toc}{\contentsline {chapter}{\numberline {11}Solving sparse linear systems with GMRES and CG methods on GPU clusters}{249}} +\@writefile{lof}{\addvspace {10\p@ }} +\@writefile{lot}{\addvspace {10\p@ }} +\@writefile{toc}{\contentsline {section}{\numberline {11.1}Introduction}{249}} +\newlabel{sec:01}{{11.1}{249}} +\@writefile{toc}{\contentsline {section}{\numberline {11.2}Krylov iterative methods}{250}} +\newlabel{sec:02}{{11.2}{250}} +\newlabel{eq:01}{{11.1}{250}} +\newlabel{eq:02}{{11.2}{250}} +\newlabel{eq:03}{{11.3}{250}} +\newlabel{eq:11}{{11.4}{251}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.2.1}CG method}{251}} +\newlabel{sec:02.01}{{11.2.1}{251}} +\newlabel{eq:04}{{11.5}{251}} +\newlabel{eq:05}{{11.6}{251}} +\newlabel{eq:06}{{11.7}{251}} +\newlabel{eq:07}{{11.8}{251}} +\newlabel{eq:08}{{11.9}{251}} +\newlabel{eq:09}{{11.10}{251}} +\@writefile{loa}{\contentsline {algocf}{\numberline {9}{\ignorespaces Left-preconditioned CG method\relax }}{252}} +\newlabel{alg:01}{{9}{252}} +\newlabel{eq:10}{{11.11}{252}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.2.2}GMRES method}{253}} +\newlabel{sec:02.02}{{11.2.2}{253}} +\newlabel{eq:12}{{11.12}{253}} +\newlabel{eq:13}{{11.13}{253}} +\newlabel{eq:14}{{11.14}{253}} +\newlabel{eq:15}{{11.15}{253}} +\newlabel{eq:16}{{11.16}{253}} +\newlabel{eq:17}{{11.17}{253}} +\newlabel{eq:18}{{11.18}{253}} +\newlabel{eq:19}{{11.19}{253}} +\@writefile{loa}{\contentsline {algocf}{\numberline {10}{\ignorespaces Left-preconditioned GMRES method with restarts\relax }}{254}} +\newlabel{alg:02}{{10}{254}} +\@writefile{toc}{\contentsline {section}{\numberline {11.3}Parallel implementation on a GPU cluster}{255}} +\newlabel{sec:03}{{11.3}{255}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.1}Data partitioning}{255}} +\newlabel{sec:03.01}{{11.3.1}{255}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.1}{\ignorespaces A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.\relax }}{256}} +\newlabel{fig:01}{{11.1}{256}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.2}GPU computing}{256}} +\newlabel{sec:03.02}{{11.3.2}{256}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.3}Data communications}{257}} +\newlabel{sec:03.03}{{11.3.3}{257}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.2}{\ignorespaces Data exchanges between \textit {Node 1} and its neighbors \textit {Node 0}, \textit {Node 2} and \textit {Node 3}.\relax }}{258}} +\newlabel{fig:02}{{11.2}{258}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.3}{\ignorespaces Columns reordering of a sparse sub-matrix.\relax }}{259}} +\newlabel{fig:03}{{11.3}{259}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.4}{\ignorespaces General scheme of the GPU cluster of tests composed of six machines, each with two GPUs.\relax }}{260}} +\newlabel{fig:04}{{11.4}{260}} +\@writefile{toc}{\contentsline {section}{\numberline {11.4}Experimental results}{260}} +\newlabel{sec:04}{{11.4}{260}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.5}{\ignorespaces Sketches of sparse matrices chosen from the Davis's collection.\relax }}{261}} +\newlabel{fig:05}{{11.5}{261}} +\@writefile{lot}{\contentsline {table}{\numberline {11.1}{\ignorespaces Main characteristics of sparse matrices chosen from the Davis's collection.\relax }}{262}} +\newlabel{tab:01}{{11.1}{262}} +\@writefile{lot}{\contentsline {table}{\numberline {11.2}{\ignorespaces Performances of the parallel CG method on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{262}} +\newlabel{tab:02}{{11.2}{262}} +\@writefile{lot}{\contentsline {table}{\numberline {11.3}{\ignorespaces Performances of the parallel GMRES method on a cluster 24 CPU cores vs. on cluster of 12 GPUs.\relax }}{263}} +\newlabel{tab:03}{{11.3}{263}} +\newlabel{eq:20}{{11.20}{263}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.6}{\ignorespaces Parallel generation of a large sparse matrix by four computing nodes.\relax }}{264}} +\newlabel{fig:06}{{11.6}{264}} +\@writefile{lot}{\contentsline {table}{\numberline {11.4}{\ignorespaces Main characteristics of sparse banded matrices generated from those of the Davis's collection.\relax }}{265}} +\newlabel{tab:04}{{11.4}{265}} +\@writefile{lot}{\contentsline {table}{\numberline {11.5}{\ignorespaces Performances of the parallel CG method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{265}} +\newlabel{tab:05}{{11.5}{265}} +\@writefile{toc}{\contentsline {section}{\numberline {11.5}Hypergraph partitioning}{265}} +\newlabel{sec:05}{{11.5}{265}} +\@writefile{lot}{\contentsline {table}{\numberline {11.6}{\ignorespaces Performances of the parallel GMRES method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{266}} +\newlabel{tab:06}{{11.6}{266}} +\@writefile{lot}{\contentsline {table}{\numberline {11.7}{\ignorespaces Main characteristics of sparse five-bands matrices generated from those of the Davis's collection.\relax }}{266}} +\newlabel{tab:07}{{11.7}{266}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.7}{\ignorespaces Parallel generation of a large sparse five-bands matrix by four computing nodes.\relax }}{267}} +\newlabel{fig:07}{{11.7}{267}} +\@writefile{lot}{\contentsline {table}{\numberline {11.8}{\ignorespaces Performances of parallel CG solver for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs\relax }}{267}} +\newlabel{tab:08}{{11.8}{267}} +\@writefile{lot}{\contentsline {table}{\numberline {11.9}{\ignorespaces Performances of parallel GMRES solver for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs\relax }}{268}} +\newlabel{tab:09}{{11.9}{268}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.8}{\ignorespaces An example of the hypergraph partitioning of a sparse matrix decomposed between three computing nodes.\relax }}{269}} +\newlabel{fig:08}{{11.8}{269}} +\@writefile{lot}{\contentsline {table}{\numberline {11.10}{\ignorespaces Performances of the parallel CG solver using hypergraph partitioning for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPU.\relax }}{270}} +\newlabel{tab:10}{{11.10}{270}} +\@writefile{lot}{\contentsline {table}{\numberline {11.11}{\ignorespaces Performances of the parallel GMRES solver using hypergraph partitioning for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPU.\relax }}{271}} +\newlabel{tab:11}{{11.11}{271}} +\@writefile{lot}{\contentsline {table}{\numberline {11.12}{\ignorespaces The total communication volume between 12 GPU computing nodes without and with the hypergraph partitioning method.\relax }}{272}} +\newlabel{tab:12}{{11.12}{272}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.9}{\ignorespaces Weak-scaling of the parallel CG and GMRES solvers on a GPU cluster for solving large sparse linear systems.\relax }}{273}} +\newlabel{fig:09}{{11.9}{273}} +\@writefile{toc}{\contentsline {section}{\numberline {11.6}Conclusion}{273}} +\newlabel{sec:06}{{11.6}{273}} +\@writefile{toc}{\contentsline {section}{Bibliography}{274}} +\@setckpt{Chapters/chapter12/ch12}{ +\setcounter{page}{276} +\setcounter{equation}{25} +\setcounter{enumi}{4} +\setcounter{enumii}{0} +\setcounter{enumiii}{0} +\setcounter{enumiv}{15} +\setcounter{footnote}{0} +\setcounter{mpfootnote}{0} +\setcounter{part}{1} +\setcounter{chapter}{11} +\setcounter{section}{6} +\setcounter{subsection}{0} +\setcounter{subsubsection}{0} +\setcounter{paragraph}{0} +\setcounter{subparagraph}{0} +\setcounter{figure}{9} +\setcounter{table}{12} +\setcounter{numauthors}{0} +\setcounter{parentequation}{46} +\setcounter{subfigure}{0} +\setcounter{lofdepth}{1} +\setcounter{subtable}{0} +\setcounter{lotdepth}{1} +\setcounter{lstnumber}{50} +\setcounter{ContinuedFloat}{0} +\setcounter{AlgoLine}{29} +\setcounter{algocfline}{10} +\setcounter{algocfproc}{10} +\setcounter{algocf}{10} +\setcounter{proposition}{1} +\setcounter{theorem}{0} +\setcounter{exercise}{0} +\setcounter{example}{0} +\setcounter{definition}{0} +\setcounter{proof}{1} +\setcounter{lstlisting}{0} +} diff --git a/BookGPU/Chapters/chapter12/ch12.tex b/BookGPU/Chapters/chapter12/ch12.tex new file mode 100755 index 0000000..7cd99f4 --- /dev/null +++ b/BookGPU/Chapters/chapter12/ch12.tex @@ -0,0 +1,1078 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% %% +%% CHAPTER 12 %% +%% %% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +\chapterauthor{}{} +\chapter{Solving sparse linear systems with GMRES and CG methods on GPU clusters} + +%%--------------------------%% +%% SECTION 1 %% +%%--------------------------%% +\section{Introduction} +\label{sec:01} +The sparse linear systems are used to model many scientific and industrial problems, such as the environmental simulations or +the industrial processing of the complex or non-Newtonian fluids. Moreover, the resolution of these problems often involves the +solving of such linear systems which is considered as the most expensive process in terms of time execution and memory space. +Therefore, solving sparse linear systems must be as efficient as possible in order to deal with problems of ever increasing size. + +There are, in the jargon of numerical analysis, different methods of solving sparse linear systems that we can classify in two +classes: the direct and iterative methods. However, the iterative methods are often more suitable than their counterpart, direct +methods, for solving large sparse linear systems. Indeed, they are less memory consuming and easier to parallelize on parallel +computers than direct methods. Different computing platforms, sequential and parallel computers, are used for solving sparse +linear systems with iterative solutions. Nowadays, graphics processing units (GPUs) have become attractive for solving these +linear systems, due to their computing power and their ability to compute faster than traditional CPUs. + +In Section~\ref{sec:02}, we describe the general principle of two well-known iterative methods: the conjugate gradient method and +the generalized minimal residual method. In Section~\ref{sec:03}, we give the main key points of the parallel implementation of both +methods on a cluster of GPUs. Then, in Section~\ref{sec:04}, we present the experimental results obtained on a CPU cluster and on +a GPU cluster, for solving sparse linear systems associated to matrices of different structures. Finally, in Section~\ref{sec:05}, +we apply the hypergraph partitioning technique to reduce the total communication volume between the computing nodes and, thus, to +improve the execution times of the parallel algorithms of both iterative methods. + + +%%--------------------------%% +%% SECTION 2 %% +%%--------------------------%% +\section{Krylov iterative methods} +\label{sec:02} +Let us consider the following system of $n$ linear equations in $\mathbb{R}$: +\begin{equation} +Ax=b, +\label{eq:01} +\end{equation} +where $A\in\mathbb{R}^{n\times n}$ is a sparse nonsingular square matrix, $x\in\mathbb{R}^{n}$ is the solution vector, +$b\in\mathbb{R}^{n}$ is the right-hand side and $n\in\mathbb{N}$ is a large integer number. + +The iterative methods for solving the large sparse linear system~(\ref{eq:01}) proceed by successive iterations of a same +block of elementary operations, during which an infinite number of approximate solutions $\{x_k\}_{k\geq 0}$ are computed. +Indeed, from an initial guess $x_0$, an iterative method determines at each iteration $k>0$ an approximate solution $x_k$ +which, gradually, converges to the exact solution $x^{*}$ as follows: +\begin{equation} +x^{*}=\lim\limits_{k\to\infty}x_{k}=A^{-1}b. +\label{eq:02} +\end{equation} +The number of iterations necessary to reach the exact solution $x^{*}$ is not known beforehand and can be infinite. In +practice, an iterative method often finds an approximate solution $\tilde{x}$ after a fixed number of iterations and/or +when a given convergence criterion is satisfied as follows: +\begin{equation} +\|b-A\tilde{x}\| < \varepsilon, +\label{eq:03} +\end{equation} +where $\varepsilon<1$ is the required convergence tolerance threshold. + +Some of the most iterative methods that have proven their efficiency for solving large sparse linear systems are those +called \textit{Krylov sub-space methods}~\cite{ref1}. In the present chapter, we describe two Krylov methods which are +widely used: the conjugate gradient method (CG) and the generalized minimal residual method (GMRES). In practice, the +Krylov sub-space methods are usually used with preconditioners that allow to improve their convergence. So, in what +follows, the CG and GMRES methods are used for solving the left-preconditioned sparse linear system: +\begin{equation} +M^{-1}Ax=M^{-1}b, +\label{eq:11} +\end{equation} +where $M$ is the preconditioning matrix. + +%%****************%% +%%****************%% +\subsection{CG method} +\label{sec:02.01} +The conjugate gradient method is initially developed by Hestenes and Stiefel in 1952~\cite{ref2}. It is one of the well +known iterative method for solving large sparse linear systems. In addition, it can be adapted for solving nonlinear +equations and optimization problems. However, it can only be applied to problems with positive definite symmetric matrices. + +The main idea of the CG method is the computation of a sequence of approximate solutions $\{x_k\}_{k\geq 0}$ in a Krylov +sub-space of order $k$ as follows: +\begin{equation} +x_k \in x_0 + \mathcal{K}_k(A,r_0), +\label{eq:04} +\end{equation} +such that the Galerkin condition must be satisfied: +\begin{equation} +r_k \bot \mathcal{K}_k(A,r_0), +\label{eq:05} +\end{equation} +where $x_0$ is the initial guess, $r_k=b-Ax_k$ is the residual of the computed solution $x_k$ and $\mathcal{K}_k$ the Krylov +sub-space of order $k$: \[\mathcal{K}_k(A,r_0) \equiv\text{span}\{r_0, Ar_0, A^2r_0,\ldots, A^{k-1}r_0\}.\] +In fact, CG is based on the construction of a sequence $\{p_k\}_{k\in\mathbb{N}}$ of direction vectors in $\mathcal{K}_k$ +which are pairwise $A$-conjugate ($A$-orthogonal): +\begin{equation} +\begin{array}{ll} +p_i^T A p_j = 0, & i\neq j. +\end{array} +\label{eq:06} +\end{equation} +At each iteration $k$, an approximate solution $x_k$ is computed by recurrence as follows: +\begin{equation} +\begin{array}{ll} +x_k = x_{k-1} + \alpha_k p_k, & \alpha_k\in\mathbb{R}. +\end{array} +\label{eq:07} +\end{equation} +Consequently, the residuals $r_k$ are computed in the same way: +\begin{equation} +r_k = r_{k-1} - \alpha_k A p_k. +\label{eq:08} +\end{equation} +In the case where all residuals are nonzero, the direction vectors $p_k$ can be determined so that the following recurrence +holds: +\begin{equation} +\begin{array}{lll} +p_0=r_0, & p_k=r_k+\beta_k p_{k-1}, & \beta_k\in\mathbb{R}. +\end{array} +\label{eq:09} +\end{equation} +Moreover, the scalars $\{\alpha_k\}_{k>0}$ are chosen so as to minimize the $A$-norm error $\|x^{*}-x_k\|_A$ over the Krylov +sub-space $\mathcal{K}_{k}$ and the scalars $\{\beta_k\}_{k>0}$ are chosen so as to ensure that the direction vectors are +pairwise $A$-conjugate. So, the assumption that matrix $A$ is symmetric and the recurrences~(\ref{eq:08}) and~(\ref{eq:09}) +allow to deduce that: +\begin{equation} +\begin{array}{ll} +\alpha_{k}=\frac{r^{T}_{k-1}r_{k-1}}{p_{k}^{T}Ap_{k}}, & \beta_{k}=\frac{r_{k}^{T}r_{k}}{r_{k-1}^{T}r_{k-1}}. +\end{array} +\label{eq:10} +\end{equation} + +\begin{algorithm}[!t] + %\SetLine + %\linesnumbered + Choose an initial guess $x_0$\; + $r_{0} = b - A x_{0}$\; + $convergence$ = false\; + $k = 1$\; + \Repeat{convergence}{ + $z_{k} = M^{-1} r_{k-1}$\; + $\rho_{k} = (r_{k-1},z_{k})$\; + \eIf{$k = 1$}{ + $p_{k} = z_{k}$\; + }{ + $\beta_{k} = \rho_{k} / \rho_{k-1}$\; + $p_{k} = z_{k} + \beta_{k} \times p_{k-1}$\; + } + $q_{k} = A \times p_{k}$\; + $\alpha_{k} = \rho_{k} / (p_{k},q_{k})$\; + $x_{k} = x_{k-1} + \alpha_{k} \times p_{k}$\; + $r_{k} = r_{k-1} - \alpha_{k} \times q_{k}$\; + \eIf{$(\rho_{k} < \varepsilon)$ {\bf or} $(k \geq maxiter)$}{ + $convergence$ = true\; + }{ + $k = k + 1$\; + } + } +\caption{Left-preconditioned CG method} +\label{alg:01} +\end{algorithm} + +Algorithm~\ref{alg:01} shows the main key points of the preconditioned CG method. It allows to solve the left-preconditioned +sparse linear system~(\ref{eq:11}). In this algorithm, $\varepsilon$ is the convergence tolerance threshold, $maxiter$ is the maximum +number of iterations and $(\cdot,\cdot)$ defines the dot product between two vectors in $\mathbb{R}^{n}$. At every iteration, a direction +vector $p_k$ is determined, so that it is orthogonal to the preconditioned residual $z_k$ and to the direction vectors $\{p_i\}_{i0}$ in a Krylov sub-space $\mathcal{K}_k$ as follows: +\begin{equation} +\begin{array}{ll} +x_k \in x_0 + \mathcal{K}_k(A, v_1),& v_1=\frac{r_0}{\|r_0\|_2}, +\end{array} +\label{eq:12} +\end{equation} +so that the Petrov-Galerkin condition is satisfied: +\begin{equation} +\begin{array}{ll} +r_k \bot A \mathcal{K}_k(A, v_1). +\end{array} +\label{eq:13} +\end{equation} +GMRES uses the Arnoldi process~\cite{ref5} to construct an orthonormal basis $V_k$ for the Krylov sub-space $\mathcal{K}_k$ +and an upper Hessenberg matrix $\bar{H}_k$ of order $(k+1)\times k$: +\begin{equation} +\begin{array}{ll} +V_k = \{v_1, v_2,\ldots,v_k\}, & \forall k>1, v_k=A^{k-1}v_1, +\end{array} +\label{eq:14} +\end{equation} +and +\begin{equation} +V_k A = V_{k+1} \bar{H}_k. +\label{eq:15} +\end{equation} + +Then, at each iteration $k$, an approximate solution $x_k$ is computed in the Krylov sub-space $\mathcal{K}_k$ spanned by $V_k$ +as follows: +\begin{equation} +\begin{array}{ll} +x_k = x_0 + V_k y, & y\in\mathbb{R}^{k}. +\end{array} +\label{eq:16} +\end{equation} +From both formulas~(\ref{eq:15}) and~(\ref{eq:16}) and $r_k=b-Ax_k$, we can deduce that: +\begin{equation} +\begin{array}{lll} + r_{k} & = & b - A (x_{0} + V_{k}y) \\ + & = & r_{0} - AV_{k}y \\ + & = & \beta v_{1} - V_{k+1}\bar{H}_{k}y \\ + & = & V_{k+1}(\beta e_{1} - \bar{H}_{k}y), +\end{array} +\label{eq:17} +\end{equation} +such that $\beta=\|r_0\|_2$ and $e_1=(1,0,\cdots,0)$ is the first vector of the canonical basis of $\mathbb{R}^k$. So, +the vector $y$ is chosen in $\mathbb{R}^k$ so as to minimize at best the Euclidean norm of the residual $r_k$. Consequently, +a linear least-squares problem of size $k$ is solved: +\begin{equation} +\underset{y\in\mathbb{R}^{k}}{min}\|r_{k}\|_{2}=\underset{y\in\mathbb{R}^{k}}{min}\|\beta e_{1}-\bar{H}_{k}y\|_{2}. +\label{eq:18} +\end{equation} +The QR factorization of matrix $\bar{H}_k$ is used to compute the solution of this problem by using Givens rotations~\cite{ref1,ref3}, +such that: +\begin{equation} +\begin{array}{lll} +\bar{H}_{k}=Q_{k}R_{k}, & Q_{k}\in\mathbb{R}^{(k+1)\times (k+1)}, & R_{k}\in\mathbb{R}^{(k+1)\times k}, +\end{array} +\label{eq:19} +\end{equation} +where $Q_kQ_k^T=I_k$ and $R_k$ is an upper triangular matrix. + +The GMRES method computes an approximate solution with a sufficient precision after, at most, $n$ iterations ($n$ is the size of the +sparse linear system to be solved). However, the GMRES algorithm must construct and store in the memory an orthonormal basis $V_k$ whose +size is proportional to the number of iterations required to achieve the convergence. Then, to avoid a huge memory storage, the GMRES +method must be restarted at each $m$ iterations, such that $m$ is very small ($m\ll n$), and with $x_m$ as the initial guess to the +next iteration. This allows to limit the size of the basis $V$ to $m$ orthogonal vectors. + +\begin{algorithm}[!t] + %\SetLine + %\linesnumbered + Choose an initial guess $x_0$\; + $convergence$ = false\; + $k = 1$\; + $r_{0} = M^{-1}(b-Ax_{0})$\; + $\beta = \|r_{0}\|_{2}$\; + \While{$\neg convergence$}{ + $v_{1} = r_{0}/\beta$\; + \For{$j=1$ \KwTo $m$}{ + $w_{j} = M^{-1}Av_{j}$\; + \For{$i=1$ \KwTo $j$}{ + $h_{i,j} = (w_{j},v_{i})$\; + $w_{j} = w_{j}-h_{i,j}v_{i}$\; + } + $h_{j+1,j} = \|w_{j}\|_{2}$\; + $v_{j+1} = w_{j}/h_{j+1,j}$\; + } + Set $V_{m}=\{v_{j}\}_{1\leq j \leq m}$ and $\bar{H}_{m}=(h_{i,j})$ a $(m+1)\times m$ upper Hessenberg matrix\; + Solve a least-squares problem of size $m$: $min_{y\in\mathrm{I\!R}^{m}}\|\beta e_{1}-\bar{H}_{m}y\|_{2}$\; + $x_{m} = x_{0}+V_{m}y_{m}$\; + $r_{m} = M^{-1}(b-Ax_{m})$\; + $\beta = \|r_{m}\|_{2}$\; + \eIf{ $(\beta<\varepsilon)$ {\bf or} $(k\geq maxiter)$}{ + $convergence$ = true\; + }{ + $x_{0} = x_{m}$\; + $r_{0} = r_{m}$\; + $k = k + 1$\; + } + } +\caption{Left-preconditioned GMRES method with restarts} +\label{alg:02} +\end{algorithm} + +Algorithm~\ref{alg:02} shows the main key points of the GMRES method with restarts. It solves the left-preconditioned sparse linear +system~(\ref{eq:11}), such that $M$ is the preconditioning matrix. At each iteration $k$, GMRES uses the Arnoldi process (defined +from line~$7$ to line~$17$) to construct a basis $V_m$ of $m$ orthogonal vectors and an upper Hessenberg matrix $\bar{H}_m$ of size +$(m+1)\times m$. Then, it solves the linear least-squares problem of size $m$ to find the vector $y\in\mathbb{R}^{m}$ which minimizes +at best the residual norm (line~$18$). Finally, it computes an approximate solution $x_m$ in the Krylov sub-space spanned by $V_m$ +(line~$19$). The GMRES algorithm is stopped when the residual norm is sufficiently small ($\|r_m\|_2<\varepsilon$) and/or the maximum +number of iterations ($maxiter$) is reached. + +%%--------------------------%% +%% SECTION 3 %% +%%--------------------------%% +\section{Parallel implementation on a GPU cluster} +\label{sec:03} +In this section, we present the parallel algorithms of both iterative CG and GMRES methods for GPU clusters. +The implementation is performed on a GPU cluster composed of different computing nodes, such that each node +is a CPU core managed by a MPI process and equipped with a GPU card. The parallelization of these algorithms +is carried out by using the MPI communication routines between the GPU computing nodes and the CUDA programming +environment inside each node. In what follows, the algorithms of the iterative methods are called iterative +solvers. + +%%****************%% +%%****************%% +\subsection{Data partitioning} +\label{sec:03.01} +The parallel solving of the large sparse linear system~(\ref{eq:11}) requires a data partitioning between the computing +nodes of the GPU cluster. Let $p$ denotes the number of the computing nodes on the GPU cluster. The partitioning operation +consists in the decomposition of the vectors and matrices, involved in the iterative solver, in $p$ portions. Indeed, this +operation allows to assign to each computing node $i$: +\begin{itemize} +\item a portion of size $\frac{n}{p}$ elements of each vector, +\item a sparse rectangular sub-matrix $A_i$ of size $(\frac{n}{p},n)$ and, +\item a square preconditioning sub-matrix $M_i$ of size $(\frac{n}{p},\frac{n}{p})$, +\end{itemize} +where $n$ is the size of the sparse linear system to be solved. In the first instance, we perform a naive row-wise partitioning +(decomposition row-by-row) on the data of the sparse linear systems to be solved. Figure~\ref{fig:01} shows an example of a row-wise +data partitioning between four computing nodes of a sparse linear system (sparse matrix $A$, solution vector $x$ and right-hand +side $b$) of size $16$ unknown values. + +\begin{figure} +\centerline{\includegraphics[scale=0.35]{Chapters/chapter12/figures/partition}} +\caption{A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.} +\label{fig:01} +\end{figure} + +%%****************%% +%%****************%% +\subsection{GPU computing} +\label{sec:03.02} +After the partitioning operation, all the data involved from this operation must be transferred from the CPU memories to the GPU +memories, in order to be processed by GPUs. We use two functions of the CUBLAS library (CUDA Basic Linear Algebra Subroutines), +developed by Nvidia~\cite{ref6}: \verb+cublasAlloc()+ for the memory allocations on GPUs and \verb+cublasSetVector()+ for the +memory copies from the CPUs to the GPUs. + +An efficient implementation of CG and GMRES solvers on a GPU cluster requires to determine all parts of their codes that can be +executed in parallel and, thus, take advantage of the GPU acceleration. As many Krylov sub-space methods, the CG and GMRES methods +are mainly based on arithmetic operations dealing with vectors or matrices: sparse matrix-vector multiplications, scalar-vector +multiplications, dot products, Euclidean norms, AXPY operations ($y\leftarrow ax+y$ where $x$ and $y$ are vectors and $a$ is a +scalar) and so on. These vector operations are often easy to parallelize and they are more efficient on parallel computers when +they work on large vectors. Therefore, all the vector operations used in CG and GMRES solvers must be executed by the GPUs as kernels. + +We use the kernels of the CUBLAS library to compute some vector operations of CG and GMRES solvers. The following kernels of CUBLAS +(dealing with double floating point) are used: \verb+cublasDdot()+ for the dot products, \verb+cublasDnrm2()+ for the Euclidean +norms and \verb+cublasDaxpy()+ for the AXPY operations. For the rest of the data-parallel operations, we code their kernels in CUDA. +In the CG solver, we develop a kernel for the XPAY operation ($y\leftarrow x+ay$) used at line~$12$ in Algorithm~\ref{alg:01}. In the +GMRES solver, we program a kernel for the scalar-vector multiplication (lines~$7$ and~$15$ in Algorithm~\ref{alg:02}), a kernel for +solving the least-squares problem and a kernel for the elements updates of the solution vector $x$. + +The least-squares problem in the GMRES method is solved by performing a QR factorization on the Hessenberg matrix $\bar{H}_m$ with +plane rotations and, then, solving the triangular system by backward substitutions to compute $y$. Consequently, solving the least-squares +problem on the GPU is not interesting. Indeed, the triangular solves are not easy to parallelize and inefficient on GPUs. However, +the least-squares problem to solve in the GMRES method with restarts has, generally, a very small size $m$. Therefore, we develop +an inexpensive kernel which must be executed in sequential by a single CUDA thread. + +The most important operation in CG and GMRES methods is the sparse matrix-vector multiplication (SpMV), because it is often an +expensive operation in terms of execution time and memory space. Moreover, it requires to take care of the storage format of the +sparse matrix in the memory. Indeed, the naive storage, row-by-row or column-by-column, of a sparse matrix can cause a significant +waste of memory space and execution time. In addition, the sparsity nature of the matrix often leads to irregular memory accesses +to read the matrix nonzero values. So, the computation of the SpMV multiplication on GPUs can involve non coalesced accesses to +the global memory, which slows down even more its performances. One of the most efficient compressed storage formats of sparse +matrices on GPUs is HYB format~\cite{ref7}. It is a combination of ELLpack (ELL) and Coordinate (COO) formats. Indeed, it stores +a typical number of nonzero values per row in ELL format and remaining entries of exceptional rows in COO format. It combines +the efficiency of ELL due to the regularity of its memory accesses and the flexibility of COO which is insensitive to the matrix +structure. Consequently, we use the HYB kernel~\cite{ref8} developed by Nvidia to implement the SpMV multiplication of CG and +GMRES methods on GPUs. Moreover, to avoid the non coalesced accesses to the high-latency global memory, we fill the elements of +the iterate vector $x$ in the cached texture memory. + +%%****************%% +%%****************%% +\subsection{Data communications} +\label{sec:03.03} +All the computing nodes of the GPU cluster execute in parallel the same iterative solver (Algorithm~\ref{alg:01} or Algorithm~\ref{alg:02}) +adapted to GPUs, but on their own portions of the sparse linear system: $M^{-1}_iA_ix_i=M^{-1}_ib_i$, $0\leq i0$ an approximate solution $x_k$ +which, gradually, converges to the exact solution $x^{*}$ as follows: +\begin{equation} +x^{*}=\lim\limits_{k\to\infty}x_{k}=A^{-1}b. +\label{eq:02} +\end{equation} +The number of iterations necessary to reach the exact solution $x^{*}$ is not known beforehand and can be infinite. In +practice, an iterative method often finds an approximate solution $\tilde{x}$ after a fixed number of iterations and/or +when a given convergence criterion is satisfied as follows: +\begin{equation} +\|b-A\tilde{x}\| < \varepsilon, +\label{eq:03} +\end{equation} +where $\varepsilon<1$ is the required convergence tolerance threshold. + +Some of the most iterative methods that have proven their efficiency for solving large sparse linear systems are those +called \textit{Krylov sub-space methods}~\cite{ref1}. In the present chapter, we describe two Krylov methods which are +widely used: the conjugate gradient method (CG) and the generalized minimal residual method (GMRES). In practice, the +Krylov sub-space methods are usually used with preconditioners that allow to improve their convergence. So, in what +follows, the CG and GMRES methods are used for solving the left-preconditioned sparse linear system: +\begin{equation} +M^{-1}Ax=M^{-1}b, +\label{eq:11} +\end{equation} +where $M$ is the preconditioning matrix. + +%%****************%% +%%****************%% +\subsection{CG method} +\label{sec:02.01} +The conjugate gradient method is initially developed by Hestenes and Stiefel in 1952~\cite{ref2}. It is one of the well +known iterative method for solving large sparse linear systems. In addition, it can be adapted for solving nonlinear +equations and optimization problems. However, it can only be applied to problems with positive definite symmetric matrices. + +The main idea of the CG method is the computation of a sequence of approximate solutions $\{x_k\}_{k\geq 0}$ in a Krylov +sub-space of order $k$ as follows: +\begin{equation} +x_k \in x_0 + \mathcal{K}_k(A,r_0), +\label{eq:04} +\end{equation} +such that the Galerkin condition must be satisfied: +\begin{equation} +r_k \bot \mathcal{K}_k(A,r_0), +\label{eq:05} +\end{equation} +where $x_0$ is the initial guess, $r_k=b-Ax_k$ is the residual of the computed solution $x_k$ and $\mathcal{K}_k$ the Krylov +sub-space of order $k$: \[\mathcal{K}_k(A,r_0) \equiv\text{span}\{r_0, Ar_0, A^2r_0,\ldots, A^{k-1}r_0\}.\] +In fact, CG is based on the construction of a sequence $\{p_k\}_{k\in\mathbb{N}}$ of direction vectors in $\mathcal{K}_k$ +which are pairwise $A$-conjugate ($A$-orthogonal): +\begin{equation} +\begin{array}{ll} +p_i^T A p_j = 0, & i\neq j. +\end{array} +\label{eq:06} +\end{equation} +At each iteration $k$, an approximate solution $x_k$ is computed by recurrence as follows: +\begin{equation} +\begin{array}{ll} +x_k = x_{k-1} + \alpha_k p_k, & \alpha_k\in\mathbb{R}. +\end{array} +\label{eq:07} +\end{equation} +Consequently, the residuals $r_k$ are computed in the same way: +\begin{equation} +r_k = r_{k-1} - \alpha_k A p_k. +\label{eq:08} +\end{equation} +In the case where all residuals are nonzero, the direction vectors $p_k$ can be determined so that the following recurrence +holds: +\begin{equation} +\begin{array}{lll} +p_0=r_0, & p_k=r_k+\beta_k p_{k-1}, & \beta_k\in\mathbb{R}. +\end{array} +\label{eq:09} +\end{equation} +Moreover, the scalars $\{\alpha_k\}_{k>0}$ are chosen so as to minimize the $A$-norm error $\|x^{*}-x_k\|_A$ over the Krylov +sub-space $\mathcal{K}_{k}$ and the scalars $\{\beta_k\}_{k>0}$ are chosen so as to ensure that the direction vectors are +pairwise $A$-conjugate. So, the assumption that matrix $A$ is symmetric and the recurrences~(\ref{eq:08}) and~(\ref{eq:09}) +allow to deduce that: +\begin{equation} +\begin{array}{ll} +\alpha_{k}=\frac{r^{T}_{k-1}r_{k-1}}{p_{k}^{T}Ap_{k}}, & \beta_{k}=\frac{r_{k}^{T}r_{k}}{r_{k-1}^{T}r_{k-1}}. +\end{array} +\label{eq:10} +\end{equation} + +\begin{algorithm}[!t] + \SetLine + \linesnumbered + Choose an initial guess $x_0$\; + $r_{0} = b - A x_{0}$\; + $convergence$ = false\; + $k = 1$\; + \Repeat{convergence}{ + $z_{k} = M^{-1} r_{k-1}$\; + $\rho_{k} = (r_{k-1},z_{k})$\; + \eIf{$k = 1$}{ + $p_{k} = z_{k}$\; + }{ + $\beta_{k} = \rho_{k} / \rho_{k-1}$\; + $p_{k} = z_{k} + \beta_{k} \times p_{k-1}$\; + } + $q_{k} = A \times p_{k}$\; + $\alpha_{k} = \rho_{k} / (p_{k},q_{k})$\; + $x_{k} = x_{k-1} + \alpha_{k} \times p_{k}$\; + $r_{k} = r_{k-1} - \alpha_{k} \times q_{k}$\; + \eIf{$(\rho_{k} < \varepsilon)$ {\bf or} $(k \geq maxiter)$}{ + $convergence$ = true\; + }{ + $k = k + 1$\; + } + } +\caption{Left-preconditioned CG method} +\label{alg:01} +\end{algorithm} + +Algorithm~\ref{alg:01} shows the main key points of the preconditioned CG method. It allows to solve the left-preconditioned +sparse linear system~(\ref{eq:11}). In this algorithm, $\varepsilon$ is the convergence tolerance threshold, $maxiter$ is the maximum +number of iterations and $(\cdot,\cdot)$ defines the dot product between two vectors in $\mathbb{R}^{n}$. At every iteration, a direction +vector $p_k$ is determined, so that it is orthogonal to the preconditioned residual $z_k$ and to the direction vectors $\{p_i\}_{i0}$ in a Krylov sub-space $\mathcal{K}_k$ as follows: +\begin{equation} +\begin{array}{ll} +x_k \in x_0 + \mathcal{K}_k(A, v_1),& v_1=\frac{r_0}{\|r_0\|_2}, +\end{array} +\label{eq:12} +\end{equation} +so that the Petrov-Galerkin condition is satisfied: +\begin{equation} +\begin{array}{ll} +r_k \bot A \mathcal{K}_k(A, v_1). +\end{array} +\label{eq:13} +\end{equation} +GMRES uses the Arnoldi process~\cite{ref5} to construct an orthonormal basis $V_k$ for the Krylov sub-space $\mathcal{K}_k$ +and an upper Hessenberg matrix $\bar{H}_k$ of order $(k+1)\times k$: +\begin{equation} +\begin{array}{ll} +V_k = \{v_1, v_2,\ldots,v_k\}, & \forall k>1, v_k=A^{k-1}v_1, +\end{array} +\label{eq:14} +\end{equation} +and +\begin{equation} +V_k A = V_{k+1} \bar{H}_k. +\label{eq:15} +\end{equation} + +Then, at each iteration $k$, an approximate solution $x_k$ is computed in the Krylov sub-space $\mathcal{K}_k$ spanned by $V_k$ +as follows: +\begin{equation} +\begin{array}{ll} +x_k = x_0 + V_k y, & y\in\mathbb{R}^{k}. +\end{array} +\label{eq:16} +\end{equation} +From both formulas~(\ref{eq:15}) and~(\ref{eq:16}) and $r_k=b-Ax_k$, we can deduce that: +\begin{equation} +\begin{array}{lll} + r_{k} & = & b - A (x_{0} + V_{k}y) \\ + & = & r_{0} - AV_{k}y \\ + & = & \beta v_{1} - V_{k+1}\bar{H}_{k}y \\ + & = & V_{k+1}(\beta e_{1} - \bar{H}_{k}y), +\end{array} +\label{eq:17} +\end{equation} +such that $\beta=\|r_0\|_2$ and $e_1=(1,0,\cdots,0)$ is the first vector of the canonical basis of $\mathbb{R}^k$. So, +the vector $y$ is chosen in $\mathbb{R}^k$ so as to minimize at best the Euclidean norm of the residual $r_k$. Consequently, +a linear least-squares problem of size $k$ is solved: +\begin{equation} +\underset{y\in\mathbb{R}^{k}}{min}\|r_{k}\|_{2}=\underset{y\in\mathbb{R}^{k}}{min}\|\beta e_{1}-\bar{H}_{k}y\|_{2}. +\label{eq:18} +\end{equation} +The QR factorization of matrix $\bar{H}_k$ is used to compute the solution of this problem by using Givens rotations~\cite{ref1,ref3}, +such that: +\begin{equation} +\begin{array}{lll} +\bar{H}_{k}=Q_{k}R_{k}, & Q_{k}\in\mathbb{R}^{(k+1)\times (k+1)}, & R_{k}\in\mathbb{R}^{(k+1)\times k}, +\end{array} +\label{eq:19} +\end{equation} +where $Q_kQ_k^T=I_k$ and $R_k$ is an upper triangular matrix. + +The GMRES method computes an approximate solution with a sufficient precision after, at most, $n$ iterations ($n$ is the size of the +sparse linear system to be solved). However, the GMRES algorithm must construct and store in the memory an orthonormal basis $V_k$ whose +size is proportional to the number of iterations required to achieve the convergence. Then, to avoid a huge memory storage, the GMRES +method must be restarted at each $m$ iterations, such that $m$ is very small ($m\ll n$), and with $x_m$ as the initial guess to the +next iteration. This allows to limit the size of the basis $V$ to $m$ orthogonal vectors. + +\begin{algorithm}[!t] + \SetLine + \linesnumbered + Choose an initial guess $x_0$\; + $convergence$ = false\; + $k = 1$\; + $r_{0} = M^{-1}(b-Ax_{0})$\; + $\beta = \|r_{0}\|_{2}$\; + \While{$\neg convergence$}{ + $v_{1} = r_{0}/\beta$\; + \For{$j=1$ \KwTo $m$}{ + $w_{j} = M^{-1}Av_{j}$\; + \For{$i=1$ \KwTo $j$}{ + $h_{i,j} = (w_{j},v_{i})$\; + $w_{j} = w_{j}-h_{i,j}v_{i}$\; + } + $h_{j+1,j} = \|w_{j}\|_{2}$\; + $v_{j+1} = w_{j}/h_{j+1,j}$\; + } + Set $V_{m}=\{v_{j}\}_{1\leq j \leq m}$ and $\bar{H}_{m}=(h_{i,j})$ a $(m+1)\times m$ upper Hessenberg matrix\; + Solve a least-squares problem of size $m$: $min_{y\in\mathrm{I\!R}^{m}}\|\beta e_{1}-\bar{H}_{m}y\|_{2}$\; + $x_{m} = x_{0}+V_{m}y_{m}$\; + $r_{m} = M^{-1}(b-Ax_{m})$\; + $\beta = \|r_{m}\|_{2}$\; + \eIf{ $(\beta<\varepsilon)$ {\bf or} $(k\geq maxiter)$}{ + $convergence$ = true\; + }{ + $x_{0} = x_{m}$\; + $r_{0} = r_{m}$\; + $k = k + 1$\; + } + } +\caption{Left-preconditioned GMRES method with restarts} +\label{alg:02} +\end{algorithm} + +Algorithm~\ref{alg:02} shows the main key points of the GMRES method with restarts. It solves the left-preconditioned sparse linear +system~(\ref{eq:11}), such that $M$ is the preconditioning matrix. At each iteration $k$, GMRES uses the Arnoldi process (defined +from line~$7$ to line~$17$) to construct a basis $V_m$ of $m$ orthogonal vectors and an upper Hessenberg matrix $\bar{H}_m$ of size +$(m+1)\times m$. Then, it solves the linear least-squares problem of size $m$ to find the vector $y\in\mathbb{R}^{m}$ which minimizes +at best the residual norm (line~$18$). Finally, it computes an approximate solution $x_m$ in the Krylov sub-space spanned by $V_m$ +(line~$19$). The GMRES algorithm is stopped when the residual norm is sufficiently small ($\|r_m\|_2<\varepsilon$) and/or the maximum +number of iterations ($maxiter$) is reached. + +%%--------------------------%% +%% SECTION 3 %% +%%--------------------------%% +\section{Parallel implementation on a GPU cluster} +\label{sec:03} +In this section, we present the parallel algorithms of both iterative CG and GMRES methods for GPU clusters. +The implementation is performed on a GPU cluster composed of different computing nodes, such that each node +is a CPU core managed by a MPI process and equipped with a GPU card. The parallelization of these algorithms +is carried out by using the MPI communication routines between the GPU computing nodes and the CUDA programming +environment inside each node. In what follows, the algorithms of the iterative methods are called iterative +solvers. + +%%****************%% +%%****************%% +\subsection{Data partitioning} +\label{sec:03.01} +The parallel solving of the large sparse linear system~(\ref{eq:11}) requires a data partitioning between the computing +nodes of the GPU cluster. Let $p$ denotes the number of the computing nodes on the GPU cluster. The partitioning operation +consists in the decomposition of the vectors and matrices, involved in the iterative solver, in $p$ portions. Indeed, this +operation allows to assign to each computing node $i$: +\begin{itemize*} +\item a portion of size $\frac{n}{p}$ elements of each vector, +\item a sparse rectangular sub-matrix $A_i$ of size $(\frac{n}{p},n)$ and, +\item a square preconditioning sub-matrix $M_i$ of size $(\frac{n}{p},\frac{n}{p})$, +\end{itemize*} +where $n$ is the size of the sparse linear system to be solved. In the first instance, we perform a naive row-wise partitioning +(decomposition row-by-row) on the data of the sparse linear systems to be solved. Figure~\ref{fig:01} shows an example of a row-wise +data partitioning between four computing nodes of a sparse linear system (sparse matrix $A$, solution vector $x$ and right-hand +side $b$) of size $16$ unknown values. + +\begin{figure} +\centerline{\includegraphics[scale=0.35]{Chapters/chapter12/figures/partition}} +\caption{A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.} +\label{fig:01} +\end{figure} + +%%****************%% +%%****************%% +\subsection{GPU computing} +\label{sec:03.02} +After the partitioning operation, all the data involved from this operation must be transferred from the CPU memories to the GPU +memories, in order to be processed by GPUs. We use two functions of the CUBLAS library (CUDA Basic Linear Algebra Subroutines), +developed by Nvidia~\cite{ref6}: \verb+cublasAlloc()+ for the memory allocations on GPUs and \verb+cublasSetVector()+ for the +memory copies from the CPUs to the GPUs. + +An efficient implementation of CG and GMRES solvers on a GPU cluster requires to determine all parts of their codes that can be +executed in parallel and, thus, take advantage of the GPU acceleration. As many Krylov sub-space methods, the CG and GMRES methods +are mainly based on arithmetic operations dealing with vectors or matrices: sparse matrix-vector multiplications, scalar-vector +multiplications, dot products, Euclidean norms, AXPY operations ($y\leftarrow ax+y$ where $x$ and $y$ are vectors and $a$ is a +scalar) and so on. These vector operations are often easy to parallelize and they are more efficient on parallel computers when +they work on large vectors. Therefore, all the vector operations used in CG and GMRES solvers must be executed by the GPUs as kernels. + +We use the kernels of the CUBLAS library to compute some vector operations of CG and GMRES solvers. The following kernels of CUBLAS +(dealing with double floating point) are used: \verb+cublasDdot()+ for the dot products, \verb+cublasDnrm2()+ for the Euclidean +norms and \verb+cublasDaxpy()+ for the AXPY operations. For the rest of the data-parallel operations, we code their kernels in CUDA. +In the CG solver, we develop a kernel for the XPAY operation ($y\leftarrow x+ay$) used at line~$12$ in Algorithm~\ref{alg:01}. In the +GMRES solver, we program a kernel for the scalar-vector multiplication (lines~$7$ and~$15$ in Algorithm~\ref{alg:02}), a kernel for +solving the least-squares problem and a kernel for the elements updates of the solution vector $x$. + +The least-squares problem in the GMRES method is solved by performing a QR factorization on the Hessenberg matrix $\bar{H}_m$ with +plane rotations and, then, solving the triangular system by backward substitutions to compute $y$. Consequently, solving the least-squares +problem on the GPU is not interesting. Indeed, the triangular solves are not easy to parallelize and inefficient on GPUs. However, +the least-squares problem to solve in the GMRES method with restarts has, generally, a very small size $m$. Therefore, we develop +an inexpensive kernel which must be executed in sequential by a single CUDA thread. + +The most important operation in CG and GMRES methods is the sparse matrix-vector multiplication (SpMV), because it is often an +expensive operation in terms of execution time and memory space. Moreover, it requires to take care of the storage format of the +sparse matrix in the memory. Indeed, the naive storage, row-by-row or column-by-column, of a sparse matrix can cause a significant +waste of memory space and execution time. In addition, the sparsity nature of the matrix often leads to irregular memory accesses +to read the matrix nonzero values. So, the computation of the SpMV multiplication on GPUs can involve non coalesced accesses to +the global memory, which slows down even more its performances. One of the most efficient compressed storage formats of sparse +matrices on GPUs is HYB format~\cite{ref7}. It is a combination of ELLpack (ELL) and Coordinate (COO) formats. Indeed, it stores +a typical number of nonzero values per row in ELL format and remaining entries of exceptional rows in COO format. It combines +the efficiency of ELL due to the regularity of its memory accesses and the flexibility of COO which is insensitive to the matrix +structure. Consequently, we use the HYB kernel~\cite{ref8} developed by Nvidia to implement the SpMV multiplication of CG and +GMRES methods on GPUs. Moreover, to avoid the non coalesced accesses to the high-latency global memory, we fill the elements of +the iterate vector $x$ in the cached texture memory. + +%%****************%% +%%****************%% +\subsection{Data communications} +\label{sec:03.03} +All the computing nodes of the GPU cluster execute in parallel the same iterative solver (Algorithm~\ref{alg:01} or Algorithm~\ref{alg:02}) +adapted to GPUs, but on their own portions of the sparse linear system: $M^{-1}_iA_ix_i=M^{-1}_ib_i$, $0\leq i