From: Christophe Guyeux
Date: Sat, 11 Oct 2014 08:55:44 +0000 (+0200)
Subject: dfmqslk
X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/GMRES2stage.git/commitdiff_plain/296102e5b791e8d44dcc357426e5590f466a54e9?hp=64f1ca2222ea5046f340e606f8978e8a9a4dcd1e

dfmqslk

Merge branch 'master' of ssh://info.iut-bm.univ-fcomte.fr/GMRES2stage
---

diff --git a/biblio.bib b/biblio.bib
index 0e9f3ed..da3ffc3 100644
--- a/biblio.bib
+++ b/biblio.bib
@@ -62,3 +62,67 @@
 howpublished = {\url{http://www.mcs.anl.gov/petsc}},
 year = {2014}
 }
+
+
+@misc{Dav97,
+  author = {Davis, T. and Hu, Y.},
+  title = {The {U}niversity of {F}lorida Sparse Matrix Collection},
+  year = {1997},
+  note = {Digest, \url{http://www.cise.ufl.edu/research/sparse/matrices/}},
+}
+
+
+@article{Saad:1993,
+  author = {Saad, Youcef},
+  title = {A Flexible Inner-outer Preconditioned GMRES Algorithm},
+  journal = {SIAM J. Sci. Comput.},
+  issue_date = {March 1993},
+  volume = {14},
+  number = {2},
+  month = mar,
+  year = {1993},
+  issn = {1064-8275},
+  pages = {461--469},
+  numpages = {9},
+  url = {http://dx.doi.org/10.1137/0914028},
+  doi = {10.1137/0914028},
+  acmid = {160089},
+  publisher = {Society for Industrial and Applied Mathematics},
+  address = {Philadelphia, PA, USA},
+  keywords = {GMRES, Krylov subspace methods, non-Hermitian systems, preconditioned conjugate gradient, variable preconditioners},
+}
+
+@incollection{bahicontascoutu,
+  title = {{P}arallel iterative algorithms: from sequential to grid computing},
+  author = {Bahi, J.M. and Contassot-Vivier, S. and Couturier, R.},
+  booktitle = {Numerical Analysis and Scientific Computing},
+  publisher = {Chapman \& Hall/CRC},
+  year = {2008},
+}
+
+@article{Nichols:1973:CTS,
+  author = {Nichols, Nancy K.},
+  title = {On the Convergence of Two-Stage Iterative Processes for Solving Linear Equations},
+  journal = {SIAM Journal on Numerical Analysis},
+  volume = {10},
+  number = {3},
+  pages = {460--469},
+  month = jun,
+  year = {1973},
+  issn = {0036-1429},
+  fjournal = {SIAM Journal on Numerical Analysis},
+  journal-url = {http://epubs.siam.org/sinum},
+}
\ No newline at end of file
diff --git a/nb_iter_sec_ex15_juqueen.pdf b/nb_iter_sec_ex15_juqueen.pdf
index 168f10c..748cda4 100644
Binary files a/nb_iter_sec_ex15_juqueen.pdf and b/nb_iter_sec_ex15_juqueen.pdf differ
diff --git a/nb_iter_sec_ex15_juqueen.plot b/nb_iter_sec_ex15_juqueen.plot
index 96a4191..775b019 100755
--- a/nb_iter_sec_ex15_juqueen.plot
+++ b/nb_iter_sec_ex15_juqueen.plot
@@ -1,4 +1,4 @@
-# Analysis description
+# Analysis description
 set encoding iso_8859_1
 set terminal x11
 set size 1,0.5
diff --git a/nb_iter_sec_ex15_juqueen.txt b/nb_iter_sec_ex15_juqueen.txt
new file mode 100644
index 0000000..c1ffe98
--- /dev/null
+++ b/nb_iter_sec_ex15_juqueen.txt
@@ -0,0 +1,5 @@
+# cores  mg gm  mg cgls  mg lsqr  sor gm  sor cgls  sor lsqr
+  2048 45.13 41.41 42.01 76.55 70.43 69.37
+  4096 44.76 41.03 40.46 76.95 66.97 64.98
+  8192 43.93 38.50 36.91 75.84 61.11 57.85
+ 16384 44.68 35.03 32.54 75.81 51.83 47.44
diff --git a/paper.tex b/paper.tex
index 4d0e239..8764073 100644
--- a/paper.tex
+++ b/paper.tex
@@ -439,7 +439,7 @@ GMRES.
 \end{abstract}
 
 \begin{IEEEkeywords}
-Iterative Krylov methods; sparse linear systems; residual minimization; PETSc; %à voir...
+Iterative Krylov methods; sparse linear systems; two-stage iteration; least-squares residual minimization; PETSc
 \end{IEEEkeywords}
 
 
@@ -547,38 +547,42 @@ Iterative Krylov methods; sparse linear systems; residual minimization; PETSc; %
 % You must have at least 2 lines in the paragraph with the drop letter
 % (should never be an issue)
-Iterative methods have recently become more attractive than direct ones to solve very large
-sparse linear systems. They are more efficient in a parallel
-context, supporting thousands of cores, and they require less memory and arithmetic
-operations than direct methods. This is why new iterative methods are frequently
-proposed or adapted by researchers, and the increasing need to solve very large sparse
-linear systems has triggered the development of such efficient iterative techniques
-suitable for parallel processing.
-
-Most of the successful iterative methods currently available are based on so-called ``Krylov
-subspaces''. They consist in forming a basis of successive matrix
-powers multiplied by an initial vector, which can be for instance the residual. These methods use vectors orthogonality of the Krylov subspace basis in order to solve linear
-systems. The most known iterative Krylov subspace methods are conjugate
-gradient and GMRES ones (Generalized Minimal RESidual).
-
-
-However, iterative methods suffer from scalability problems on parallel
-computing platforms with many processors, due to their need of reduction
-operations, and to collective communications to achieve matrix-vector
+Iterative methods have recently become more attractive than direct ones for
+solving very large sparse linear systems~\cite{Saad2003}. They are more
+efficient in a parallel context, supporting thousands of cores, and they
+require less memory and fewer arithmetic operations than direct
+methods~\cite{bahicontascoutu}.
+This is why new iterative methods are frequently proposed or adapted by
+researchers, and the increasing need to solve very large sparse linear systems
+has triggered the development of such efficient iterative techniques suitable
+for parallel processing.
+
+Most of the successful iterative methods currently available are based on
+so-called ``Krylov subspaces''. They consist in forming a basis of successive
+matrix powers multiplied by an initial vector, which can be, for instance, the
+residual. These methods exploit the orthogonality of the Krylov subspace basis
+vectors in order to solve linear systems. The best-known iterative Krylov
+subspace methods are the conjugate gradient method and GMRES (Generalized
+Minimal RESidual).
+
+
+However, iterative methods suffer from scalability problems on parallel
+computing platforms with many processors, due to their need for reduction
+operations and for collective communications to achieve matrix-vector
 multiplications. The communications on large clusters with thousands of cores
-and large sizes of messages can significantly affect the performances of these
-iterative methods. As a consequence, Krylov subspace iteration methods are often used
-with preconditioners in practice, to increase their convergence and accelerate their
-performances. However, most of the good preconditioners are not scalable on
-large clusters.
-
-In this research work, a two-stage algorithm based on two nested iterations
-called inner-outer iterations is proposed. This algorithm consists in solving the sparse
-linear system iteratively with a small number of inner iterations, and restarting
-the outer step with a new solution minimizing some error functions over some
-previous residuals. This algorithm is iterative and easy to parallelize on large
-clusters. Furthermore, the minimization technique improves its convergence and
-performances.
+and large message sizes can significantly degrade the performance of these
+iterative methods. As a consequence, Krylov subspace iteration methods are
+often used with preconditioners in practice, to improve their convergence and
+performance. However, most of the good preconditioners are not scalable on
+large clusters.
+
+In this research work, a two-stage algorithm based on two nested iterations,
+called inner-outer iterations, is proposed. This algorithm consists in solving
+the sparse linear system iteratively with a small number of inner iterations,
+and restarting the outer step with a new solution that minimizes an error
+function over the previous residuals. For further information on two-stage
+iteration methods, interested readers are invited to
+consult~\cite{Nichols:1973:CTS}. Two-stage algorithms are easy to parallelize
+on large clusters. Furthermore, the least-squares minimization technique
+improves their convergence and performance.
 
 The present article is organized as follows. Related works are presented in
 Section~\ref{sec:02}. Section~\ref{sec:03} details the two-stage algorithm using
@@ -618,15 +622,14 @@ It can be summarized as follows: the inner solver is a Krylov based one.
 In order to accelerate its convergence, the outer solver periodically applies a
 least-squares minimization on the residuals computed by the inner one.
 %Tsolver which does not required to be changed.
-At each outer iteration, the sparse linear system $Ax=b$ is partially
-solved using only $m$
-iterations of an iterative method, this latter being initialized with the
-last obtained approximation.
-GMRES method~\cite{Saad86}, or any of its variants, can potentially be used as
-inner solver. The current approximation of the Krylov method is then stored inside a $n \times s$ matrix
-$S$, which is composed by the $s$ last solutions that have been computed during
-the inner iterations phase.
-In the remainder, the $i$-th column vector of $S$ will be denoted by $S_i$.
+At each outer iteration, the sparse linear system $Ax=b$ is partially solved
+using only $m$ iterations of an iterative method, the latter being initialized
+with the last obtained approximation. The GMRES method~\cite{Saad86}, or any of
+its variants, can potentially be used as the inner solver. The current
+approximation of the Krylov method is then stored inside an $n \times s$ matrix
+$S$, which is composed of the $s$ last solutions that have been computed during
+the inner iterations phase. In the remainder, the $i$-th column vector of $S$
+will be denoted by $S_i$.
 
 $\|r_n\| \leq \left( 1-\frac{\lambda_{\mathrm{min}}^2(1/2(A^T + A))}{ \lambda_{\mathrm{max}}(A^T A)} \right)^{n/2} \|r_0\|,$
 In the general case, where $A$ is not positive definite, we have
@@ -658,9 +661,8 @@ appropriate than a single direct method in a parallel context.
 \Output $x$ (solution vector)\vspace{0.2cm}
 \State Set the initial guess $x_0$
 \For {$k=1,2,3,\ldots$ until convergence (error$<\epsilon_{tsirm}$)} \label{algo:conv}
-  \State $x_k=Solve(A,b,x_{k-1},max\_iter_{kryl})$ \label{algo:solve}
-  \State retrieve error
-  \State $S_{k \mod s}=x_k$ \label{algo:store}
+  \State $[x_k,error]=Solve(A,b,x_{k-1},max\_iter_{kryl})$ \label{algo:solve}
+  \State $S_{k \mod s}=x_k$ \label{algo:store} \Comment{update column ($k \mod s$) of $S$}
   \If {$k \mod s=0$ {\bf and} error$>\epsilon_{kryl}$}
     \State $R=AS$ \Comment{compute dense matrix} \label{algo:matrix_mul}
     \State $\alpha=Least\_Squares(R,b,max\_iter_{ls})$ \label{algo:}
@@ -671,19 +673,22 @@ appropriate than a single direct method in a parallel context.
 \label{algo:01}
 \end{algorithm}
 
-Algorithm~\ref{algo:01} summarizes the principle of the proposed method. The outer
-iteration is inside the \emph{for} loop. Line~\ref{algo:solve}, the Krylov method is
-called for a maximum of $max\_iter_{kryl}$ iterations. In practice, we suggest to set this parameter
-equal to the restart number in the GMRES-like method. Moreover, a tolerance
-threshold must be specified for the solver. In practice, this threshold must be
-much smaller than the convergence threshold of the TSIRM algorithm (\emph{i.e.},
-$\epsilon_{tsirm}$). Line~\ref{algo:store}, $S_{k \mod s}=x^k$ consists in copying the
-solution $x_k$ into the column $k \mod s$ of $S$.
-After the
-minimization, the matrix $S$ is reused with the new values of the residuals. To
-solve the minimization problem, an iterative method is used. Two parameters are
-required for that: the maximum number of iterations and the threshold to stop the
-method.
+Algorithm~\ref{algo:01} summarizes the principle of the proposed method. The
+outer iteration is inside the \emph{for} loop. At line~\ref{algo:solve}, the
+Krylov method is called for a maximum of $max\_iter_{kryl}$ iterations. In
+practice, we suggest setting this parameter equal to the restart number of the
+GMRES-like method. Moreover, a tolerance threshold must be specified for the
+solver. In practice, this threshold must be much smaller than the convergence
+threshold of the TSIRM algorithm (\emph{i.e.}, $\epsilon_{tsirm}$).
+We also consider that, after the call to the $Solve$ function, we obtain the
+vector $x_k$ together with the error, which is defined by $\|Ax_k-b\|_2$.
+
+At line~\ref{algo:store}, $S_{k \mod s}=x_k$ consists in copying the solution
+$x_k$ into the column $k \mod s$ of $S$. After the minimization, the matrix $S$
+is reused with the new values of the residuals. To solve the minimization
+problem, an iterative method is used. Two parameters are required for that: the
+maximum number of iterations and the threshold to stop the method.
 
 Let us summarize the most important parameters of TSIRM:
 \begin{itemize}
@@ -805,10 +810,12 @@ than the one of the GMRES method.
 
 In order to see the influence of our algorithm with only one processor, we first
-show a comparison with the standard version of GMRES and our algorithm. In
-Table~\ref{tab:01}, we show the matrices we have used and some of them
-characteristics. For all the matrices, the name, the field, the number of rows
-and the number of nonzero elements are given.
+show a comparison between GMRES or FGMRES and our algorithm. In
+Table~\ref{tab:01}, we show the matrices we have used and some of their
+characteristics. Those matrices are chosen from the Davis collection of the
+University of Florida~\cite{Dav97}; they arise in real-world applications. For
+all the matrices, the name, the field, the number of rows, and the number of
+nonzero elements are given.
 
 \begin{table}[htbp]
 \begin{center}
@@ -841,13 +848,14 @@ Core(TM) i7-3630QM CPU @ 2.40GHz with the version 3.5.1 of PETSc.
 
 In Table~\ref{tab:02}, some experiments comparing the solving of the linear
 systems obtained with the previous matrices with a GMRES variant and with our
 two-stage algorithm are given. In the second column, it can be noticed that either
-gmres or fgmres is used to solve the linear system. According to the matrices,
-different preconditioner is used. With TSIRM, the same solver and the same
-preconditionner are used. This Table shows that TSIRM can drastically reduce the
-number of iterations to reach the convergence when the number of iterations for
-the normal GMRES is more or less greater than 500. In fact this also depends on
-tow parameters: the number of iterations to stop GMRES and the number of
-iterations to perform the minimization.
+GMRES or FGMRES (Flexible GMRES)~\cite{Saad:1993} is used to solve the linear
+system. Depending on the matrix, a different preconditioner is used. With
+TSIRM, the same solver and the same preconditioner are used. This table shows
+that TSIRM can drastically reduce the number of iterations needed to reach
+convergence when the number of iterations of the normal GMRES is roughly
+greater than 500. In fact, this also depends on two parameters: the number of
+iterations after which GMRES is stopped and the number of iterations used to
+perform the minimization.
 
 
 \begin{table}[htbp]
@@ -877,7 +885,7 @@ torso3 & fgmres / sor & 37.70 & 565 & 34.97 & 510 \\
 
 
 
-In order to perform larger experiments, we have tested some example applications
+In order to perform larger experiments, we have tested some example applications
 of PETSc. Those applications are available in the ksp part which is suited for
 scalable linear equations solvers:
 \begin{itemize}
@@ -890,11 +898,23 @@ scalable linear equations solvers:
    finite elements. For this example, the user can define the scaling of
    material coefficient in embedded circle called $\alpha$.
 \end{itemize}
-For more technical details on these applications, interested readers are invited
-to read the codes available in the PETSc sources. Those problems have been
-chosen because they are scalable with many cores which is not the case of other problems that we have tested.
+For more technical details on these applications, interested readers are
+invited to read the codes available in the PETSc sources. Those problems have
+been chosen because they are scalable with many cores, which is not the case
+for the other problems that we have tested.
+
+In the following, larger experiments are described on two large-scale
+architectures: Curie and Juqueen. Both architectures are supercomputers,
+composed of 80,640 cores for Curie and 458,752 cores for Juqueen. Those
+machines are hosted by GENCI in France and by the Jülich Supercomputing Centre
+in Germany, respectively. They belong, with other similar architectures, to
+the PRACE initiative (Partnership for Advanced Computing in Europe), which
+aims at providing high-performance supercomputing architectures to enhance
+research in Europe. The Curie architecture is composed of Intel E5-2680
+processors at 2.7 GHz with 2 GB of memory per core. The Juqueen architecture
+is composed of IBM PowerPC A2 processors at 1.6 GHz with 1 GB of memory per
+core. Both architectures are equipped with a dedicated high-speed network.
 
-In the following larger experiments are described on two large scale architectures: Curie and Juqeen... {\bf description...}\\
 
 {\bf Description of preconditioners}\\
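
The TSIRM loop added to paper.tex above (Algorithm 1: a few GMRES iterations per outer step, the s last iterates stored in the columns of S, and a periodic least-squares minimization of ||A S alpha - b||_2 giving the new iterate x = S alpha) can be made concrete with a short sketch. The following Python/SciPy version is only an illustration, not the authors' PETSc implementation: SciPy's gmres stands in for the inner Krylov solver, SciPy's lsqr for the least-squares stage (one of the two minimizers, CGLS and LSQR, timed in nb_iter_sec_ex15_juqueen.txt), and the default parameter values are assumptions chosen for the example.

# Minimal TSIRM sketch (illustrative only; the paper's implementation uses PETSc).
import numpy as np
import scipy.sparse as sp
import scipy.sparse.linalg as spla

def tsirm(A, b, s=10, max_iter_kryl=30, eps_kryl=1e-12,
          eps_tsirm=1e-8, max_iter_ls=20, max_outer=1000):
    n = A.shape[0]
    x = np.zeros(n)                        # initial guess x_0
    S = np.zeros((n, s))                   # the s last inner solutions
    error = np.linalg.norm(A @ x - b)
    for k in range(1, max_outer + 1):
        if error < eps_tsirm:              # outer convergence test
            break
        # Inner stage: one restart cycle (max_iter_kryl iterations) of GMRES,
        # initialized with the last approximation ("rtol" needs SciPy >= 1.12;
        # older versions call it "tol").
        x, _ = spla.gmres(A, b, x0=x, rtol=eps_kryl,
                          restart=max_iter_kryl, maxiter=1)
        error = np.linalg.norm(A @ x - b)  # the error returned by Solve()
        S[:, k % s] = x                    # update column (k mod s) of S
        if k % s == 0 and error > eps_kryl:
            R = A @ S                      # dense n x s matrix
            alpha = spla.lsqr(R, b, iter_lim=max_iter_ls)[0]
            x = S @ alpha                  # minimizer of ||A S alpha - b||_2
    return x, error

# Toy usage on a 1D Poisson matrix.
if __name__ == "__main__":
    n = 1000
    A = sp.diags([-1.0, 2.0, -1.0], [-1, 0, 1], shape=(n, n), format="csr")
    b = np.ones(n)
    x, err = tsirm(A, b)
    print("final residual:", err)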
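
SciPy has no built-in CGLS, the other minimizer compared in the experiments, so the following is a standard textbook CGLS sketch (conjugate gradient applied implicitly to the normal equations R^T R alpha = R^T b, without ever forming R^T R). It could replace the lsqr call in the sketch above; the function name and signature are hypothetical.

# CGLS sketch for the least-squares stage: minimizes ||R alpha - b||_2.
import numpy as np

def cgls(R, b, max_iter=20, tol=1e-12):
    alpha = np.zeros(R.shape[1])
    r = b - R @ alpha                 # residual of the least-squares system
    s = R.T @ r                       # negative gradient of 1/2*||R alpha - b||^2
    p = s.copy()
    gamma = s @ s
    for _ in range(max_iter):
        q = R @ p
        step = gamma / (q @ q)        # exact line search along p
        alpha += step * p
        r -= step * q
        s = R.T @ r
        gamma_new = s @ s
        if np.sqrt(gamma_new) < tol:  # stop when the normal-equation residual is tiny
            break
        p = s + (gamma_new / gamma) * p
        gamma = gamma_new
    return alpha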