new

[GMRES2stage.git] / paper.tex
diff --git a/paper.tex b/paper.tex

index 4f1da9bb181cbd6d6eeeceed136bd45adc874f86..4f9f60e64abbceb1842ada84f84d53e2a1026fbb 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -241,7 +241,7 @@
  % quality.
  
  
-%\usepackage{eqparbox}
+\usepackage{eqparbox}
  % Also of notable interest is Scott Pakin's eqparbox package for creating
  % (automatically sized) equal width boxes - aka "natural width parboxes".
  % Available at:
@@ -369,7 +369,7 @@
  %
  % paper title
  % can use linebreaks \\ within to get better formatting as desired
-\title{TSARM: A Two-Stage Algorithm with least-square Residual Minimization to solve large sparse linear systems}
+\title{TSIRM: A Two-Stage Iteration with least-square Residual Minimization algorithm to solve large sparse linear systems}
  %où
  %\title{A two-stage algorithm with error minimization to solve large sparse linear systems}
  %où
@@ -646,12 +646,12 @@ appropriate than a single direct method in a parallel context.
  
  
  \begin{algorithm}[t]
-\caption{TSARM}
+\caption{TSIRM}
  \begin{algorithmic}[1]
    \Input $A$ (sparse matrix), $b$ (right-hand side)
    \Output $x$ (solution vector)\vspace{0.2cm}
    \State Set the initial guess $x^0$
-  \For {$k=1,2,3,\ldots$ until convergence (error$<\epsilon_{tsarm}$)} \label{algo:conv}
+  \For {$k=1,2,3,\ldots$ until convergence (error$<\epsilon_{tsirm}$)} \label{algo:conv}
      \State  $x^k=Solve(A,b,x^{k-1},max\_iter_{kryl})$   \label{algo:solve}
      \State retrieve error
      \State $S_{k \mod s}=x^k$ \label{algo:store}
@@ -670,17 +670,17 @@ iteration is  inside the for  loop. Line~\ref{algo:solve}, the Krylov  method is
  called for a  maximum of $max\_iter_{kryl}$ iterations.  In practice, we  suggest setting this parameter
  equal to  the restart  number of the  GMRES-like method. Moreover,  a tolerance
  threshold must be specified for the  solver. In practice, this threshold must be
-much  smaller  than the  convergence  threshold  of  the TSARM  algorithm  (\emph{i.e.}
-$\epsilon_{tsarm}$).  Line~\ref{algo:store}, $S_{k~ mod~ s}=x^k$ consists in copying the
-solution  $x_k$  into the  column  $k~  mod~ s$ of  the  matrix  $S$. After  the
+much  smaller  than the  convergence  threshold  of  the TSIRM  algorithm  (\emph{i.e.}
+$\epsilon_{tsirm}$).  Line~\ref{algo:store}, $S_{k \bmod s}=x^k$ consists in copying the
+solution  $x^k$  into the  column  $k \bmod s$ of  the  matrix  $S$. After  the
  minimization, the matrix $S$ is reused with the new values of the residuals.  To
  solve the minimization problem, an  iterative method is used. Two parameters are
  required for that: the maximum number of iterations and the threshold to stop the
  method.
  
-Let us summarize the most important parameters of TSARM:
+Let us summarize the most important parameters of TSIRM:
  \begin{itemize}
-\item $\epsilon_{tsarm}$: the threshold to stop the TSARM method;
+\item $\epsilon_{tsirm}$: the threshold to stop the TSIRM method;
  \item $max\_iter_{kryl}$: the maximum number of iterations for the Krylov method;
  \item $s$: the number of outer iterations before applying the minimization step;
  \item $max\_iter_{ls}$: the maximum number of iterations for the iterative least-square method;
@@ -688,7 +688,7 @@ Let us summarize the most important parameters of TSARM:
  \end{itemize}
  
  
-The  parallelisation  of  TSARM  relies   on  the  parallelization  of  all  its
+The  parallelization  of  TSIRM  relies   on  the  parallelization  of  all  its
  parts. More  precisely, except  the least-square step,  all the other  parts are
  straightforward to  carry out in parallel. In  order to develop a  parallel version of
  our   code,   we   have   chosen  to   use   PETSc~\cite{petsc-web-page}.    For
@@ -759,7 +759,7 @@ Table~\ref{tab:01},  we  show  the  matrices  we  have used  and  some  of  them
  characteristics. For all  the matrices, the name, the field,  the number of rows
  and the number of nonzero elements are given.
  
-\begin{table*}
+\begin{table*}[htbp]
  \begin{center}
  \begin{tabular}{|c|c|r|r|r|} 
  \hline
@@ -783,7 +783,7 @@ the restart  of GMRES is performed every  30 iterations, we have  chosen to stop
  the GMRES every 30 iterations, $max\_iter_{kryl}=30$).  $s$ is set to 8. CGLS is
  chosen  to minimize  the least-squares  problem with  the  following parameters:
  $\epsilon_{ls}=1e-40$ and $max\_iter_{ls}=20$.  The external precision is set to
-$\epsilon_{tsarm}=1e-10$.  Those  experiments have been performed  on a Intel(R)
+$\epsilon_{tsirm}=1e-10$.  Those  experiments have been performed  on an Intel(R)
  Core(TM) i7-3630QM CPU @ 2.40GHz with the version 3.5.1 of PETSc.
  
  
@@ -791,20 +791,20 @@ In  Table~\ref{tab:02}, some  experiments comparing  the solving  of  the linear
  systems obtained with the previous matrices  with a GMRES variant and with our
  two-stage algorithm are  given. In the second column, it can  be noticed that either
  gmres or fgmres is used to  solve the linear system.  According to the matrices,
-different  preconditioner is used.   With TSARM,  the same  solver and  the same
-preconditionner is used.  This Table shows that TSARM can drastically reduce the
+a different  preconditioner is used.   With TSIRM,  the same  solver and  the same
+preconditioner are used.  This Table shows that TSIRM can drastically reduce the
  number of iterations to reach the  convergence when the number of iterations for
  the normal GMRES is more or less  greater than 500. In fact this also depends on
  two  parameters: the  number  of iterations  to  stop GMRES  and  the number  of
  iterations to perform the minimization.
  
  
-\begin{table}
+\begin{table}[htbp]
  \begin{center}
  \begin{tabular}{|c|c|r|r|r|r|} 
  \hline
  
- \multirow{2}{*}{Matrix name}  & Solver /   & \multicolumn{2}{c|}{GMRES} & \multicolumn{2}{c|}{TSARM CGLS} \\ 
+ \multirow{2}{*}{Matrix name}  & Solver /   & \multicolumn{2}{c|}{GMRES} & \multicolumn{2}{c|}{TSIRM CGLS} \\ 
  \cline{3-6}
         &  precond             & Time  & \# Iter.  & Time  & \# Iter.  \\\hline \hline
  
@@ -849,12 +849,12 @@ In the following larger experiments are described on two large scale architectur
  
  {\bf Description of preconditioners}
  
-\begin{table*}
+\begin{table*}[htbp]
  \begin{center}
  \begin{tabular}{|r|r|r|r|r|r|r|r|r|} 
  \hline
  
-  nb. cores & precond   & \multicolumn{2}{c|}{FGMRES} & \multicolumn{2}{c|}{TSARM CGLS} &  \multicolumn{2}{c|}{TSARM LSQR} & best gain \\ 
+  nb. cores & precond   & \multicolumn{2}{c|}{FGMRES} & \multicolumn{2}{c|}{TSIRM CGLS} &  \multicolumn{2}{c|}{TSIRM LSQR} & best gain \\ 
  \cline{3-8}
               &                       & Time  & \# Iter.  & Time  & \# Iter. & Time  & \# Iter. & \\\hline \hline
    2,048      & mg                    & 403.49   & 18,210    & 73.89  & 3,060   & 77.84  & 3,270  & 5.46 \\
@@ -868,7 +868,7 @@ In the following larger experiments are described on two large scale architectur
  \hline
  
  \end{tabular}
-\caption{Comparison of FGMRES and TSARM with FGMRES for example ex15 of PETSc with two preconditioner (mg and sor) with 25,000 components per core on Juqueen (threshold 1e-3, restart=30, s=12),  time is expressed in seconds.}
+\caption{Comparison of FGMRES and TSIRM with FGMRES for example ex15 of PETSc with two preconditioners (mg and sor) with 25,000 components per core on Juqueen (threshold 1e-3, restart=30, s=12),  time is expressed in seconds.}
  \label{tab:03}
  \end{center}
  \end{table*}
@@ -881,12 +881,16 @@ problems)  per processor is  fixed to  25,000. This  number can  seem relatively
  small. In fact, for  some applications that need a lot of  memory, the number of
  components per processor requires sometimes to be small.
  
-In this Table, we  can notice that TSARM is always faster  than FGMRES. The last
-column shows the ratio between FGMRES and the best version of TSARM according to
-the minimization procedure: CGLS or LSQR.
+In this Table, we  can notice that TSIRM is always faster  than FGMRES. The last
+column shows the ratio between FGMRES and the best version of TSIRM according to
+the minimization  procedure: CGLS or  LSQR. Even if  we consider  the worst
+case  between CGLS  and LSQR,  it is  clear that  TSIRM is  always  faster than
+FGMRES. For this example, the  multigrid preconditioner is faster than SOR. The
+gain  between   TSIRM  and  FGMRES  is   more  or  less  similar   for  the  two
+preconditioners.
  
  
-\begin{figure}
+\begin{figure}[htbp]
  \centering
    \includegraphics[width=0.45\textwidth]{nb_iter_sec_ex15_juqueen}
  \caption{Number of iterations per second with ex15 and the same parameters as in Table~\ref{tab:03}}
@@ -897,12 +901,12 @@ the minimization procedure: CGLS or LSQR.
  
  
  
-\begin{table*}
+\begin{table*}[htbp]
  \begin{center}
  \begin{tabular}{|r|r|r|r|r|r|r|r|r|} 
  \hline
  
-  nb. cores & threshold   & \multicolumn{2}{c|}{GMRES} & \multicolumn{2}{c|}{TSARM CGLS} &  \multicolumn{2}{c|}{TSARM LSQR} & best gain \\ 
+  nb. cores & threshold   & \multicolumn{2}{c|}{GMRES} & \multicolumn{2}{c|}{TSIRM CGLS} &  \multicolumn{2}{c|}{TSIRM LSQR} & best gain \\ 
  \cline{3-8}
               &                       & Time  & \# Iter.  & Time  & \# Iter. & Time  & \# Iter. & \\\hline \hline
    2,048      & 8e-5                  & 108.88 & 16,560  & 23.06  &  3,630  & 22.79  & 3,630   & 4.77 \\
@@ -924,12 +928,12 @@ the minimization procedure: CGLS or LSQR.
  
  
  
-\begin{table*}
+\begin{table*}[htbp]
  \begin{center}
  \begin{tabular}{|r|r|r|r|r|r|r|r|r|r|r|} 
  \hline
  
-  nb. cores   & \multicolumn{2}{c|}{GMRES} & \multicolumn{2}{c|}{TSARM CGLS} &  \multicolumn{2}{c|}{TSARM LSQR} & best gain & \multicolumn{3}{c|}{efficiency} \\ 
+  nb. cores   & \multicolumn{2}{c|}{GMRES} & \multicolumn{2}{c|}{TSIRM CGLS} &  \multicolumn{2}{c|}{TSIRM LSQR} & best gain & \multicolumn{3}{c|}{efficiency} \\ 
  \cline{2-7} \cline{9-11}
                      & Time  & \# Iter.  & Time  & \# Iter. & Time  & \# Iter. &   & GMRES & TS CGLS & TS LSQR\\\hline \hline
     512              & 3,969.69 & 33,120 & 709.57 & 5,790  & 622.76 & 5,070  & 6.37  &   1    &    1    &     1     \\