new

[GMRES2stage.git] / paper.tex
diff --git a/paper.tex b/paper.tex

index 9f8ded7a800afc4de8d6c0a1716518f70059d553..15a45f01b040b1c1e94e37ab157aab8906124fd3 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -241,7 +241,7 @@
  % quality.
  
  
-%\usepackage{eqparbox}
+\usepackage{eqparbox}
  % Also of notable interest is Scott Pakin's eqparbox package for creating
  % (automatically sized) equal width boxes - aka "natural width parboxes".
  % Available at:
@@ -369,7 +369,7 @@
  %
  % paper title
  % can use linebreaks \\ within to get better formatting as desired
-\title{TSARM: A Two-Stage Algorithm with least-square Residual Minimization to solve large sparse linear systems}
+\title{TSIRM: A Two-Stage Iteration with least-square Residual Minimization algorithm to solve large sparse linear systems}
  %où
  %\title{A two-stage algorithm with error minimization to solve large sparse linear systems}
  %où
@@ -646,12 +646,12 @@ appropriate than a single direct method in a parallel context.
  
  
  \begin{algorithm}[t]
-\caption{TSARM}
+\caption{TSIRM}
  \begin{algorithmic}[1]
    \Input $A$ (sparse matrix), $b$ (right-hand side)
    \Output $x$ (solution vector)\vspace{0.2cm}
    \State Set the initial guess $x^0$
-  \For {$k=1,2,3,\ldots$ until convergence (error$<\epsilon_{tsarm}$)} \label{algo:conv}
+  \For {$k=1,2,3,\ldots$ until convergence (error$<\epsilon_{tsirm}$)} \label{algo:conv}
      \State  $x^k=Solve(A,b,x^{k-1},max\_iter_{kryl})$   \label{algo:solve}
      \State retrieve error
      \State $S_{k \mod s}=x^k$ \label{algo:store}
@@ -670,17 +670,17 @@ iteration is  inside the for  loop. Line~\ref{algo:solve}, the Krylov  method is
  called for a  maximum of $max\_iter_{kryl}$ iterations.  In practice, we  suggest setting this parameter
  equal to  the restart  number of the  GMRES-like method. Moreover,  a tolerance
  threshold must be specified for the  solver. In practice, this threshold must be
-much  smaller  than the  convergence  threshold  of  the TSARM  algorithm  (\emph{i.e.}
-$\epsilon_{tsarm}$).  Line~\ref{algo:store}, $S_{k~ mod~ s}=x^k$ consists in copying the
+much  smaller  than the  convergence  threshold  of  the TSIRM  algorithm  (\emph{i.e.}
+$\epsilon_{tsirm}$).  Line~\ref{algo:store}, $S_{k~ mod~ s}=x^k$ consists in copying the
  solution  $x^k$  into the  column  $k~ mod~ s$ of  the  matrix  $S$. After  the
  minimization, the matrix $S$ is reused with the new values of the residuals.  To
  solve the minimization problem, an  iterative method is used. Two parameters are
  required for that: the maximum number of iterations and the threshold to stop the
  method.
  
-Let us summarize the most important parameters of TSARM:
+Let us summarize the most important parameters of TSIRM:
  \begin{itemize}
-\item $\epsilon_{tsarm}$: the threshold to stop the TSARM method;
+\item $\epsilon_{tsirm}$: the threshold to stop the TSIRM method;
  \item $max\_iter_{kryl}$: the maximum number of iterations for the Krylov method;
  \item $s$: the number of outer iterations before applying the minimization step;
  \item $max\_iter_{ls}$: the maximum number of iterations for the iterative least-square method;
@@ -688,7 +688,7 @@ Let us summarize the most important parameters of TSARM:
  \end{itemize}
  
  
-The  parallelisation  of  TSARM  relies   on  the  parallelization  of  all  its
+The  parallelization  of  TSIRM  relies   on  the  parallelization  of  all  its
  parts. More  precisely, except  the least-square step,  all the other  parts are
  straightforward to  carry out in parallel. In  order to develop a  parallel version of
  our   code,   we   have   chosen  to   use   PETSc~\cite{petsc-web-page}.    For
@@ -759,7 +759,7 @@ Table~\ref{tab:01},  we  show  the  matrices  we  have used  and  some  of  them
  characteristics. For all  the matrices, the name, the field,  the number of rows
  and the number of nonzero elements are given.
  
-\begin{table*}
+\begin{table*}[htbp]
  \begin{center}
  \begin{tabular}{|c|c|r|r|r|} 
  \hline
@@ -783,7 +783,7 @@ the restart  of GMRES is performed every  30 iterations, we have  chosen to stop
  the GMRES every 30 iterations, $max\_iter_{kryl}=30$).  $s$ is set to 8. CGLS is
  chosen  to minimize  the least-squares  problem with  the  following parameters:
  $\epsilon_{ls}=1e-40$ and $max\_iter_{ls}=20$.  The external precision is set to
-$\epsilon_{tsarm}=1e-10$.  Those  experiments have been performed  on a Intel(R)
+$\epsilon_{tsirm}=1e-10$.  Those  experiments have been performed  on an Intel(R)
  Core(TM) i7-3630QM CPU @ 2.40GHz with the version 3.5.1 of PETSc.
  
  
@@ -791,20 +791,20 @@ In  Table~\ref{tab:02}, some  experiments comparing  the solving  of  the linear
  systems obtained with the previous matrices  with a GMRES variant and with our
  two-stage algorithm are  given. In the second column, it can  be noticed that either
  gmres or fgmres is used to  solve the linear system.  According to the matrices,
-different  preconditioner is used.   With TSARM,  the same  solver and  the same
-preconditionner is used.  This Table shows that TSARM can drastically reduce the
+a different  preconditioner is used.   With TSIRM,  the same  solver and  the same
+preconditioner is used.  This Table shows that TSIRM can drastically reduce the
  number of iterations to reach the  convergence when the number of iterations for
  the normal GMRES is more or less  greater than 500. In fact this also depends on
  two  parameters: the  number  of iterations  to  stop GMRES  and  the number  of
  iterations to perform the minimization.
  
  
-\begin{table}
+\begin{table}[htbp]
  \begin{center}
  \begin{tabular}{|c|c|r|r|r|r|} 
  \hline
  
- \multirow{2}{*}{Matrix name}  & Solver /   & \multicolumn{2}{c|}{GMRES} & \multicolumn{2}{c|}{TSARM CGLS} \\ 
+ \multirow{2}{*}{Matrix name}  & Solver /   & \multicolumn{2}{c|}{GMRES} & \multicolumn{2}{c|}{TSIRM CGLS} \\ 
  \cline{3-6}
         &  precond             & Time  & \# Iter.  & Time  & \# Iter.  \\\hline \hline
  
@@ -849,12 +849,12 @@ In the following larger experiments are described on two large scale architectur
  
  {\bf Description of preconditioners}
  
-\begin{table*}
+\begin{table*}[htbp]
  \begin{center}
  \begin{tabular}{|r|r|r|r|r|r|r|r|r|} 
  \hline
  
-  nb. cores & precond   & \multicolumn{2}{c|}{FGMRES} & \multicolumn{2}{c|}{TSARM CGLS} &  \multicolumn{2}{c|}{TSARM LSQR} & best gain \\ 
+  nb. cores & precond   & \multicolumn{2}{c|}{FGMRES} & \multicolumn{2}{c|}{TSIRM CGLS} &  \multicolumn{2}{c|}{TSIRM LSQR} & best gain \\ 
  \cline{3-8}
               &                       & Time  & \# Iter.  & Time  & \# Iter. & Time  & \# Iter. & \\\hline \hline
    2,048      & mg                    & 403.49   & 18,210    & 73.89  & 3,060   & 77.84  & 3,270  & 5.46 \\
@@ -868,7 +868,7 @@ In the following larger experiments are described on two large scale architectur
  \hline
  
  \end{tabular}
-\caption{Comparison of FGMRES and TSARM with FGMRES for example ex15 of PETSc with two preconditioner (mg and sor) with 25,000 components per core on Juqueen (threshold 1e-3, restart=30, s=12),  time is expressed in seconds.}
+\caption{Comparison of FGMRES and TSIRM with FGMRES for example ex15 of PETSc with two preconditioners (mg and sor) with 25,000 components per core on Juqueen (threshold 1e-3, restart=30, s=12),  time is expressed in seconds.}
  \label{tab:03}
  \end{center}
  \end{table*}
@@ -877,16 +877,30 @@ Table~\ref{tab:03} shows  the execution  times and the  number of  iterations of
  example ex15  of PETSc on the  Juqueen architecture. Different  numbers of cores
  are  studied ranging  from  2,048  up to 16,383.   Two  preconditioners have  been
  tested.   For those experiments,  the number  of components  (or unknown  of the
-problems)  per processor is  fixed to  25,000. This  number can  seem relatively
-small. In fact, for  some applications that need a lot of  memory, the number of
-components per processor requires sometimes to be small.
-
-In this Table, we  can notice that TSARM is always faster  than FGMRES. The last
-column shows the ratio between FGMRES and the best version of TSARM according to
-the minimization procedure: CGLS or LSQR.
-
-
-\begin{figure}
+problems)  per processor  is fixed  to 25,000,  also called  weak  scaling. This
+number can seem relatively small. In fact, for some applications that need a lot
+of  memory, the  number of  components per  processor requires  sometimes  to be
+small.
+
+In this Table, we  can notice that TSIRM is always faster  than FGMRES. The last
+column shows the ratio between FGMRES and the best version of TSIRM according to
+the minimization  procedure: CGLS or  LSQR. Even if  we have computed  the worst
+case  between CGLS  and LSQR,  it is  clear that  TSIRM is  always  faster than
+FGMRES. For this example, the  multigrid preconditioner is faster than SOR. The
+gain  between   TSIRM  and  FGMRES  is   more  or  less  similar   for  the  two
+preconditioners.
+
+In  Figure~\ref{fig:01}, the number  of iterations  per second  corresponding to
+Table~\ref{tab:03} is displayed.  It should  be noticed that for TSIRM, only the
+iterations of  the Krylov solver are  taken into account. Iterations  of CGLS or
+LSQR are  not recorded but they are  time-consuming. It can be  noticed that the
+number of iterations  per second of FGMRES is constant  whereas it decreases with
+TSIRM with both preconditioners. This can  be explained by the fact that when the
+number of cores  increases, the time for the minimization  step also increases but
+it  is  also  more efficient  to reduce the number of iterations.
+
+
+\begin{figure}[htbp]
  \centering
    \includegraphics[width=0.45\textwidth]{nb_iter_sec_ex15_juqueen}
  \caption{Number of iterations per second with ex15 and the same parameters as in Table~\ref{tab:03}}
@@ -897,12 +911,12 @@ the minimization procedure: CGLS or LSQR.
  
  
  
-\begin{table*}
+\begin{table*}[htbp]
  \begin{center}
  \begin{tabular}{|r|r|r|r|r|r|r|r|r|} 
  \hline
  
-  nb. cores & threshold   & \multicolumn{2}{c|}{GMRES} & \multicolumn{2}{c|}{TSARM CGLS} &  \multicolumn{2}{c|}{TSARM LSQR} & best gain \\ 
+  nb. cores & threshold   & \multicolumn{2}{c|}{GMRES} & \multicolumn{2}{c|}{TSIRM CGLS} &  \multicolumn{2}{c|}{TSIRM LSQR} & best gain \\ 
  \cline{3-8}
               &                       & Time  & \# Iter.  & Time  & \# Iter. & Time  & \# Iter. & \\\hline \hline
    2,048      & 8e-5                  & 108.88 & 16,560  & 23.06  &  3,630  & 22.79  & 3,630   & 4.77 \\
@@ -924,12 +938,12 @@ the minimization procedure: CGLS or LSQR.
  
  
  
-\begin{table*}
+\begin{table*}[htbp]
  \begin{center}
  \begin{tabular}{|r|r|r|r|r|r|r|r|r|r|r|} 
  \hline
  
-  nb. cores   & \multicolumn{2}{c|}{GMRES} & \multicolumn{2}{c|}{TSARM CGLS} &  \multicolumn{2}{c|}{TSARM LSQR} & best gain & \multicolumn{3}{c|}{efficiency} \\ 
+  nb. cores   & \multicolumn{2}{c|}{GMRES} & \multicolumn{2}{c|}{TSIRM CGLS} &  \multicolumn{2}{c|}{TSIRM LSQR} & best gain & \multicolumn{3}{c|}{efficiency} \\ 
  \cline{2-7} \cline{9-11}
                      & Time  & \# Iter.  & Time  & \# Iter. & Time  & \# Iter. &   & GMRES & TS CGLS & TS LSQR\\\hline \hline
     512              & 3,969.69 & 33,120 & 709.57 & 5,790  & 622.76 & 5,070  & 6.37  &   1    &    1    &     1     \\