new

[GMRES2stage.git] / paper.tex
diff --git a/paper.tex b/paper.tex

index c66f8c71ae1ec2704b0db181df987640f4a3d97f..e4421cdb96b850d5b2d29ca171cbcfd35f608b0a 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -367,31 +367,29 @@
  %
  % paper title
  % can use linebreaks \\ within to get better formatting as desired
  %
  % paper title
  % can use linebreaks \\ within to get better formatting as desired
-\title{A Krylov two-stage algorithm to solve large sparse linear systems}
+\title{TSARM: A Two-Stage Algorithm with least-squares Residual Minimization to solve large sparse linear systems}
  %où
  %\title{A two-stage algorithm with error minimization to solve large sparse linear systems}
  %où
  %\title{???}
  
  
  %où
  %\title{A two-stage algorithm with error minimization to solve large sparse linear systems}
  %où
  %\title{???}
  
  
+
+
+
  % author names and affiliations
  % use a multiple column layout for up to two different
  % affiliations
  
  % author names and affiliations
  % use a multiple column layout for up to two different
  % affiliations
  
-\author{\IEEEauthorblockN{Rapha\"el Couturier}
-\IEEEauthorblockA{Femto-ST Institute - DISC Department\\
-Universit\'e de Franche-Comt\'e, IUT de Belfort-Montb\'eliard\\
-19 avenue de Mar\'echal Juin, BP 527 \\
-90016 Belfort Cedex, France\\
-Email: raphael.couturier@univ-fcomte.fr}
-\and
-\IEEEauthorblockN{Lilia Ziane Khodja}
-\IEEEauthorblockA{Centre de Recherche INRIA Bordeaux Sud-Ouest\\
-200 avenue de la Vieille Tour\\
-33405 Talence Cedex, France\\
+\author{\IEEEauthorblockN{Rapha\"el Couturier\IEEEauthorrefmark{1}, Lilia Ziane Khodja\IEEEauthorrefmark{2} and Christophe Guyeux\IEEEauthorrefmark{1}}
+\IEEEauthorblockA{\IEEEauthorrefmark{1} Femto-ST Institute, University of Franche-Comt\'e, France\\
+Email: \{raphael.couturier,christophe.guyeux\}@univ-fcomte.fr}
+\IEEEauthorblockA{\IEEEauthorrefmark{2} INRIA Bordeaux Sud-Ouest, France\\
  Email: lilia.ziane@inria.fr}
  }
  
  Email: lilia.ziane@inria.fr}
  }
  
+
+
  % conference papers do not typically use \thanks and this command
  % is locked out in conference mode. If really needed, such as for
  % the acknowledgment of grants, issue a \IEEEoverridecommandlockouts
  % conference papers do not typically use \thanks and this command
  % is locked out in conference mode. If really needed, such as for
  % the acknowledgment of grants, issue a \IEEEoverridecommandlockouts
@@ -428,7 +426,16 @@ Email: lilia.ziane@inria.fr}
  
  
  \begin{abstract}
  
  
  \begin{abstract}
-%The abstract goes here. DO NOT USE SPECIAL CHARACTERS, SYMBOLS, OR MATH IN YOUR TITLE OR ABSTRACT.
+In this paper we propose a two-stage iterative method which accelerates the
+convergence of Krylov iterative methods, typically those of GMRES variants. The
+principle of our approach is to build an external iteration over the Krylov
+method and to save the current residual frequently (for example, at each
+restart of GMRES). Then, after a given number of outer iterations, a minimization
+step is applied on the matrix composed of the saved residuals in order to compute
+a better solution and to perform a new iteration if necessary. We prove that our
+method has the same convergence property as the inner method used. Some
+experiments using up to 16,384 cores show that, compared to GMRES, our algorithm
+can be around 7 times faster.
  \end{abstract}
  
  \begin{IEEEkeywords}
  \end{abstract}
  
  \begin{IEEEkeywords}
@@ -539,6 +546,7 @@ Iterative Krylov methods; sparse linear systems; error minimization; PETSc; %à
  % no \IEEEPARstart
  % You must have at least 2 lines in the paragraph with the drop letter
  % (should never be an issue)
  % no \IEEEPARstart
  % You must have at least 2 lines in the paragraph with the drop letter
  % (should never be an issue)
+{\bf RAPH : EST ce qu'on parle de Krylov pour dire que les résidus constituent une base de Krylov... J'hésite... Tof t'en penses quoi?}
Iterative  methods have become  more attractive  than direct  ones to  solve very
large sparse linear  systems. They are more effective in  a parallel context and
require less memory  and arithmetic operations than direct  methods. A number of
Iterative  methods have become  more attractive  than direct  ones to  solve very
large sparse linear  systems. They are more effective in  a parallel context and
require less memory  and arithmetic operations than direct  methods. A number of
@@ -637,8 +645,8 @@ direct method in the parallel context.
    \Input $A$ (sparse matrix), $b$ (right-hand side)
    \Output $x$ (solution vector)\vspace{0.2cm}
    \State Set the initial guess $x^0$
    \Input $A$ (sparse matrix), $b$ (right-hand side)
    \Output $x$ (solution vector)\vspace{0.2cm}
    \State Set the initial guess $x^0$
-  \For {$k=1,2,3,\ldots$ until convergence}
-    \State Solve iteratively $Ax^k=b$
+  \For {$k=1,2,3,\ldots$ until convergence} \label{algo:conv}
+    \State Solve iteratively $Ax^k=b$  \label{algo:solve}
      \State $S_{k \bmod s}=x^k$ 
      \If {$k$ mod $s=0$ {\bf and} not convergence}
        \State Compute dense matrix $R=AS$
      \State $S_{k \bmod s}=x^k$ 
      \If {$k$ mod $s=0$ {\bf and} not convergence}
        \State Compute dense matrix $R=AS$
@@ -690,12 +698,28 @@ torso3             & 2D/3D problem & 259,156 & 4,429,042 \\
  \end{center}
  \end{table}
  
  \end{center}
  \end{table}
  
-In  table~\ref{tab:02}, some  experiments comparing  the sovling  of  the linear
+The following parameters have been chosen for our experiments. As by default
+the restart of GMRES is performed every 30 iterations, we have chosen to stop
+the GMRES every 30 iterations (line~\ref{algo:solve} in
+Algorithm~\ref{algo:01}). The parameter $s$ is set to 8. CGLS is chosen to solve the
+least-squares problem. Two conditions are used to stop CGLS: either the
+precision is under $10^{-40}$ or the number of iterations is greater than $20$. The
+external precision is set to $10^{-10}$ (line~\ref{algo:conv} in
+Algorithm~\ref{algo:01}). Those experiments have been performed on an Intel(R)
+Core(TM) i7-3630QM CPU @ 2.40GHz with the version 3.5.1 of PETSc.
+
+
+In  Table~\ref{tab:02}, some  experiments comparing  the solving  of  the linear
  systems obtained with the previous matrices  with a GMRES variant and with our 2
stage algorithm are  given. In the second column, it can  be noticed that either
gmres or fgmres is used to  solve the linear system.  According to the matrices,
  systems obtained with the previous matrices  with a GMRES variant and with our 2
stage algorithm are  given. In the second column, it can  be noticed that either
gmres or fgmres is used to  solve the linear system.  According to the matrices,
-different preconditioner is used.  With the 2  stage algorithm, the same
-solver and the same preconditionner is used.
+a different preconditioner is used. With the 2 stage algorithm, the same solver
+and the same preconditioner are used. This table shows that the 2 stage
+algorithm can drastically reduce the number of iterations to reach
+convergence when the number of iterations for the normal GMRES is more or less
+greater than 500. In fact this also depends on two parameters: the number of
+iterations to stop GMRES and the number of iterations to perform the
+minimization.
  
  
  \begin{table}
  
  
  \begin{table}
@@ -703,7 +727,8 @@ solver and the same preconditionner is used.
  \begin{tabular}{|c|c|r|r|r|r|} 
  \hline
  
  \begin{tabular}{|c|c|r|r|r|r|} 
  \hline
  
- \multirow{2}{*}{Matrix name}  & Solver /   & \multicolumn{2}{c|}{gmres variant} & \multicolumn{2}{c|}{2 stage} \\
+ \multirow{2}{*}{Matrix name}  & Solver /   & \multicolumn{2}{c|}{gmres variant} & \multicolumn{2}{c|}{2 stage CGLS} \\ 
+\cline{3-6}
         &  precond             & Time  & \# Iter.  & Time  & \# Iter.  \\\hline \hline
  
  crashbasis         & gmres / none             &  15.65     & 518  &  14.12 & 450  \\
         &  precond             & Time  & \# Iter.  & Time  & \# Iter.  \\\hline \hline
  
  crashbasis         & gmres / none             &  15.65     & 518  &  14.12 & 450  \\
@@ -711,24 +736,63 @@ parabolic\_fem     & gmres / ilu           & 1009.94   & 7573 & 401.52 & 2970 \\
  epb3               & fgmres / sor             &  8.67     & 600  &  8.21 & 540  \\
  atmosmodj          &  fgmres / sor & 104.23  & 451 & 88.97 & 366  \\
  bfwa398            & gmres / none  & 1.42 & 9612 & 0.28 & 1650 \\
  epb3               & fgmres / sor             &  8.67     & 600  &  8.21 & 540  \\
  atmosmodj          &  fgmres / sor & 104.23  & 451 & 88.97 & 366  \\
  bfwa398            & gmres / none  & 1.42 & 9612 & 0.28 & 1650 \\
-torso3             & fgmres/sor  & 565  & 37.70 & 34.97 & 510 \\
+torso3             & fgmres / sor  & 37.70 & 565 & 34.97 & 510 \\
  \hline
  
  \end{tabular}
  \hline
  
  \end{tabular}
-\caption{Comparison of GMRES and 2 stage GMRES algorithms in sequential with some matrices, time is expressed in seconds.}
+\caption{Comparison of (F)GMRES and 2 stage (F)GMRES algorithms in sequential with some matrices, time is expressed in seconds.}
  \label{tab:02}
  \end{center}
  \end{table}
  
  
  \label{tab:02}
  \end{center}
  \end{table}
  
  
-Param : retart 30 iters
-cols = 8
-iter cgls = 20
-cgls prec = 1e-40
-prec = 1e-10
-Intel(R) Core(TM) i7-3630QM CPU @ 2.40GHz
  
  
  
  
+Larger experiments ....
+
+\begin{table*}
+\begin{center}
+\begin{tabular}{|r|r|r|r|r|r|r|r|r|} 
+\hline
+
+  nb. cores & precond   & \multicolumn{2}{c|}{gmres variant} & \multicolumn{2}{c|}{2 stage CGLS} &  \multicolumn{2}{c|}{2 stage LSQR} & best gain \\ 
+\cline{3-8}
+             &                       & Time  & \# Iter.  & Time  & \# Iter. & Time  & \# Iter. & \\\hline \hline
+  2,048      & mg                    & 403.49   & 18,210    & 73.89  & 3,060   & 77.84  & 3,270  & 5.46 \\
+  2,048      & sor                   & 745.37   & 57,060    & 87.31  & 6,150   & 104.21 & 7,230  & 8.53 \\
+  4,096      & mg                    & 562.25   & 25,170    & 97.23  & 3,990   & 89.71  & 3,630  & 6.27 \\
+  4,096      & sor                   & 912.12   & 70,194    & 145.57 & 9,750   & 168.97 & 10,980 & 6.26 \\
+  8,192      & mg                    & 917.02   & 40,290    & 148.81 & 5,730   & 143.03 & 5,280  & 6.41 \\
+  8,192      & sor                   & 1,404.53 & 106,530   & 212.55 & 12,990  & 180.97 & 10,470 & 7.76 \\
+  16,384     & mg                    & 1,430.56 & 63,930    & 237.17 & 8,310   & 244.26 & 7,950  & 6.03 \\
+  16,384     & sor                   & 2,852.14 & 216,240   & 418.46 & 21,690  & 505.26 & 23,970 & 6.82 \\
+\hline
+
+\end{tabular}
+\caption{Comparison of FGMRES and 2 stage FGMRES algorithms for ex15 of PETSc with 25,000 components per core on Juqueen (threshold 1e-3, restart=30, s=12). Time is expressed in seconds.}
+\label{tab:03}
+\end{center}
+\end{table*}
+
+
+\begin{table*}
+\begin{center}
+\begin{tabular}{|r|r|r|r|r|r|r|r|r|} 
+\hline
+
+  nb. cores & threshold   & \multicolumn{2}{c|}{gmres variant} & \multicolumn{2}{c|}{2 stage CGLS} &  \multicolumn{2}{c|}{2 stage LSQR} & best gain \\ 
+\cline{3-8}
+             &                       & Time  & \# Iter.  & Time  & \# Iter. & Time  & \# Iter. & \\\hline \hline
+  8,192      & 6e-5                  & 149.54 & 17,280  & 28.68  &  3,810  & 29.05  & 3,990  & 5.21 \\
+  8,192      & 5e-5                  & 792.11 & 109,590 & 76.83  &  10,470  & 65.20  & 9,030  & 12.14 \\
+  16,384     & 4e-5                  & 718.61 & 86,400 & 98.98  &  10,830  & 131.86  & 14,790  & 7.26 \\
+\hline
+
+\end{tabular}
+\caption{Comparison of FGMRES and 2 stage FGMRES algorithms for ex54 of PETSc (both with the MG preconditioner) with 25,000 components per core on Curie (restart=30, s=12). Time is expressed in seconds.}
+\label{tab:04}
+\end{center}
+\end{table*}
  %%%*********************************************************
  %%%*********************************************************
  
  %%%*********************************************************
  %%%*********************************************************
  
@@ -743,6 +807,12 @@ Intel(R) Core(TM) i7-3630QM CPU @ 2.40GHz
  %%%*********************************************************
  
  
  %%%*********************************************************
  
  
+Future plans: \\
+- study other kinds of matrices, problems, inner solvers\\
+- adaptive number of outer iterations to minimize\\
+- other methods to minimize the residuals?\\
+- implement our solver inside PETSc
+
  
  % conference papers do not normally have an appendix
  
  
  % conference papers do not normally have an appendix
  
@@ -752,10 +822,10 @@ Intel(R) Core(TM) i7-3630QM CPU @ 2.40GHz
  %%%*********************************************************
  %%%*********************************************************
  \section*{Acknowledgment}
  %%%*********************************************************
  %%%*********************************************************
  \section*{Acknowledgment}
-%The authors would like to thank...
-%more thanks here
-%%%*********************************************************
-%%%*********************************************************
+This paper is partially funded by the Labex ACTION program (contract
+ANR-11-LABX-01-01). We acknowledge PRACE for awarding us access to the resources
+Curie and Juqueen, based respectively in France and Germany.
+
  
  
  % trigger a \newpage just before the given reference
  
  
  % trigger a \newpage just before the given reference