modif

[GMRES2stage.git] / paper.tex
diff --git a/paper.tex b/paper.tex

index f6f363c38f605809e5bb683772782874db4f726f..e626ba05e614f730fb273627854e9d11c06ad6dc 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -354,6 +354,7 @@
  \usepackage{amsmath}
  \usepackage{amssymb}
  \usepackage{multirow}
  \usepackage{amsmath}
  \usepackage{amssymb}
  \usepackage{multirow}
+\usepackage{graphicx}
  
  \algnewcommand\algorithmicinput{\textbf{Input:}}
  \algnewcommand\Input{\item[\algorithmicinput]}
  
  \algnewcommand\algorithmicinput{\textbf{Input:}}
  \algnewcommand\Input{\item[\algorithmicinput]}
@@ -431,9 +432,9 @@ convergence of Krylov iterative methods,  typically those of GMRES variants. The
  principle of  our approach  is to  build an external  iteration over  the Krylov
  method  and to  save  the current  residual  frequently (for  example, for  each
  restart of GMRES). Then after a given number of outer iterations, a minimization
  principle of  our approach  is to  build an external  iteration over  the Krylov
  method  and to  save  the current  residual  frequently (for  example, for  each
  restart of GMRES). Then after a given number of outer iterations, a minimization
-step is applied on the matrix composed of the save residuals in order to compute
-a  better solution and  make a  new iteration  if necessary.  We prove  that our
-method  has the  same  convergence property  than  the inner  method used.  Some
+step  is applied  on the  matrix composed  of the  saved residuals  in  order to
+compute a better solution and make  a new iteration if necessary.  We prove that
+our method has  the same convergence property as the  inner method used.  Some
  experiments using up  to 16,394 cores show that compared  to GMRES our algorithm
  can be around 7 times faster.
  \end{abstract}
  experiments using up  to 16,394 cores show that compared  to GMRES our algorithm
  can be around 7 times faster.
  \end{abstract}
@@ -583,8 +584,7 @@ performances.
  The present paper is organized  as follows. In Section~\ref{sec:02} some related
  works are presented. Section~\ref{sec:03} presents our two-stage algorithm using
  a  least-square  residual  minimization.   Section~\ref{sec:04}  describes  some
  The present paper is organized  as follows. In Section~\ref{sec:02} some related
  works are presented. Section~\ref{sec:03} presents our two-stage algorithm using
  a  least-square  residual  minimization.   Section~\ref{sec:04}  describes  some
-convergence  results  on this  method.   In Section~\ref{sec:05},  parallization
-details  of  TSARM  are  given.  Section~\ref{sec:06}  shows  some  experimental
+convergence  results  on this  method.   Section~\ref{sec:05}  shows  some  experimental
  results  obtained on large  clusters of  our algorithm  using routines  of PETSc
  toolkit.  Finally Section~\ref{sec:06} concludes and gives some perspectives.
  %%%*********************************************************
  results  obtained on large  clusters of  our algorithm  using routines  of PETSc
  toolkit.  Finally Section~\ref{sec:06} concludes and gives some perspectives.
  %%%*********************************************************
@@ -644,11 +644,11 @@ appropriate than a direct method in a parallel context.
    \Input $A$ (sparse matrix), $b$ (right-hand side)
    \Output $x$ (solution vector)\vspace{0.2cm}
    \State Set the initial guess $x^0$
    \Input $A$ (sparse matrix), $b$ (right-hand side)
    \Output $x$ (solution vector)\vspace{0.2cm}
    \State Set the initial guess $x^0$
-  \For {$k=1,2,3,\ldots$ until convergence (error$<\epsilon_{kryl}$)} \label{algo:conv}
+  \For {$k=1,2,3,\ldots$ until convergence (error$<\epsilon_{tsarm}$)} \label{algo:conv}
      \State  $x^k=Solve(A,b,x^{k-1},max\_iter_{kryl})$   \label{algo:solve}
      \State retrieve error
      \State $S_{k~mod~s}=x^k$ \label{algo:store}
      \State  $x^k=Solve(A,b,x^{k-1},max\_iter_{kryl})$   \label{algo:solve}
      \State retrieve error
      \State $S_{k~mod~s}=x^k$ \label{algo:store}
-    \If {$k$ mod $s=0$ {\bf and} error$>\epsilon_{kryl}$}
+    \If {$k$ mod $s=0$ {\bf and} error$>\epsilon_{tsarm}$}
        \State $R=AS$ \Comment{compute dense matrix} \label{algo:matrix_mul}
        \State Solve least-squares problem $\underset{\alpha\in\mathbb{R}^{s}}{min}\|b-R\alpha\|_2$ \label{algo:}
        \State $x^k=S\alpha$  \Comment{compute new solution}
        \State $R=AS$ \Comment{compute dense matrix} \label{algo:matrix_mul}
        \State Solve least-squares problem $\underset{\alpha\in\mathbb{R}^{s}}{min}\|b-R\alpha\|_2$ \label{algo:}
        \State $x^k=S\alpha$  \Comment{compute new solution}
@@ -664,7 +664,7 @@ called for a  maximum of $max\_iter_{kryl}$ iterations.  In practice, we  sugges
  equals to  the restart  number of the  GMRES-like method. Moreover,  a tolerance
  threshold must be specified for the  solver. In practice, this threshold must be
  much  smaller  than the  convergence  threshold  of  the TSARM  algorithm  (i.e.
  equals to  the restart  number of the  GMRES-like method. Moreover,  a tolerance
  threshold must be specified for the  solver. In practice, this threshold must be
  much  smaller  than the  convergence  threshold  of  the TSARM  algorithm  (i.e.
-$\epsilon$).  Line~\ref{algo:store}, $S_{k~ mod~ s}=x^k$ consists in copying the
+$\epsilon_{tsarm}$).  Line~\ref{algo:store}, $S_{k~ mod~ s}=x^k$ consists in copying the
  solution  $x^k$  into the  column  $k~  mod~ s$ of  the  matrix  $S$. After  the
  minimization, the matrix $S$ is reused with the new values of the residuals.  To
  solve the minimization problem, an  iterative method is used. Two parameters are
  solution  $x^k$  into the  column  $k~  mod~ s$ of  the  matrix  $S$. After  the
  minimization, the matrix $S$ is reused with the new values of the residuals.  To
  solve the minimization problem, an  iterative method is used. Two parameters are
@@ -673,25 +673,13 @@ method.
  
  To summarize, the important parameters of TSARM are:
  \begin{itemize}
  
  To summarize, the important parameters of TSARM are:
  \begin{itemize}
-\item $\epsilon_{kryl}$ the threshold to stop the method of the krylov method
+\item $\epsilon_{tsarm}$ the threshold to stop the TSARM method
  \item $max\_iter_{kryl}$ the maximum number of iterations for the krylov method
  \item $s$ the number of outer iterations before applying the minimization step
  \item $max\_iter_{ls}$ the maximum number of iterations for the iterative least-square method
  \item $\epsilon_{ls}$ the threshold to stop the least-square method
  \end{itemize}
  
  \item $max\_iter_{kryl}$ the maximum number of iterations for the krylov method
  \item $s$ the number of outer iterations before applying the minimization step
  \item $max\_iter_{ls}$ the maximum number of iterations for the iterative least-square method
  \item $\epsilon_{ls}$ the threshold to stop the least-square method
  \end{itemize}
  
-%%%*********************************************************
-%%%*********************************************************
-
-\section{Convergence results}
-\label{sec:04}
-
-
-
-%%%*********************************************************
-%%%*********************************************************
-\section{Parallelization}
-\label{sec:05}
  
  The  parallelisation  of  TSARM  relies   on  the  parallelization  of  all  its
  parts. More  precisely, except  the least-square step,  all the other  parts are
  
  The  parallelisation  of  TSARM  relies   on  the  parallelization  of  all  its
  parts. More  precisely, except  the least-square step,  all the other  parts are
@@ -733,10 +721,21 @@ In each iteration  of CGLS, there is two  matrix-vector multiplications and some
  classical operations:  dots, norm, multiplication  and addition on  vectors. All
  these operations are easy to implement in PETSc or similar environment.
  
  classical operations:  dots, norm, multiplication  and addition on  vectors. All
  these operations are easy to implement in PETSc or similar environment.
  
+
+
+%%%*********************************************************
+%%%*********************************************************
+
+\section{Convergence results}
+\label{sec:04}
+
+
+
+
  %%%*********************************************************
  %%%*********************************************************
  \section{Experiments using PETSc}
  %%%*********************************************************
  %%%*********************************************************
  \section{Experiments using PETSc}
-\label{sec:06}
+\label{sec:05}
  
  
  In order to see the influence of our algorithm with only one processor, we first
  
  
  In order to see the influence of our algorithm with only one processor, we first
@@ -766,13 +765,12 @@ torso3             & 2D/3D problem & 259,156 & 4,429,042 \\
  
  The following  parameters have been chosen  for our experiments.   As by default
  the restart  of GMRES is performed every  30 iterations, we have  chosen to stop
  
  The following  parameters have been chosen  for our experiments.   As by default
  the restart  of GMRES is performed every  30 iterations, we have  chosen to stop
-the     GMRES    every     30    iterations     (line     \ref{algo:solve}    in
-Algorithm~\ref{algo:01}).   $s$ is  set to  8. CGLS  is chosen  to  minimize the
-least-squares  problem.  Two  conditions  are  used to  stop  CGLS,  either  the
-precision is under $1e-40$ or the  number of iterations is greater to $20$.  The
-external   precision    is   set    to   $1e-10$   (line    \ref{algo:conv}   in
-Algorithm~\ref{algo:01}).  Those  experiments have been performed  on a Intel(R)
-Core(TM) i7-3630QM CPU @ 2.40GHz with the version 3.5.1 of PETSc.
+the GMRES every 30 iterations ($max\_iter_{kryl}=30$).  $s$ is set to 8. CGLS is
+chosen  to minimize  the least-squares  problem with  the  following parameters:
+$\epsilon_{ls}=1e-40$ and $max\_iter_{ls}=20$.  The external precision is set to
+$1e-10$  (i.e., $\epsilon_{tsarm}=1e-10$).   Those experiments
+have been  performed on  an Intel(R)  Core(TM) i7-3630QM CPU  @ 2.40GHz  with the
+version 3.5.1 of PETSc.
  
  
  In  Table~\ref{tab:02}, some  experiments comparing  the solving  of  the linear
  
  
  In  Table~\ref{tab:02}, some  experiments comparing  the solving  of  the linear
@@ -814,9 +812,27 @@ torso3             & fgmres / sor  & 37.70 & 565 & 34.97 & 510 \\
  
  
  
  
  
  
-Larger experiments ....\\
  
  
-Describe the problems ex15 and ex54
+In   the  following,   we  describe   the  PETSc   applications  we   have
+experimented with. Those applications  are available in the ksp  part, which is suited
+for scalable linear equation solvers:
+\begin{itemize}
+\item ex15  is an example  which solves in  parallel an operator using  a finite
+  difference  scheme.   The  diagonal  is  equal  to  4  and  the  4  extra-diagonals
+  representing the neighbors in each direction  are equal to $-1$. This example is
+  used  in many  physical phenomena,  for  example, heat  and fluid  flow, wave
+  propagation, etc.
+\item ex54 is another example based on a 2D problem discretized with quadrilateral
+  finite elements. For this example, the user can define the scaling of the material
+  coefficient in the embedded circle, which is called $\alpha$.
+\end{itemize}
+For more technical details on  these applications, interested readers are invited
+to  read the  codes available  in the  PETSc sources.   Those problems  have been
+chosen because they  are scalable with many cores. We  have tested other problems
+but they are not scalable with many cores.
+
+
+
  
  \begin{table*}
  \begin{center}
  
  \begin{table*}
  \begin{center}
@@ -843,6 +859,17 @@ Describe the problems ex15 and ex54
  \end{table*}
  
  
  \end{table*}
  
  
+\begin{figure}
+\centering
+  \includegraphics[width=0.45\textwidth]{nb_iter_sec_ex15_juqueen}
+\caption{Number of iterations per second with ex15 and the same parameters as in Table~\ref{tab:03}}
+\label{fig:01}
+\end{figure}
+
+
+
+
+
  \begin{table*}
  \begin{center}
  \begin{tabular}{|r|r|r|r|r|r|r|r|r|} 
  \begin{table*}
  \begin{center}
  \begin{tabular}{|r|r|r|r|r|r|r|r|r|} 
@@ -856,7 +883,7 @@ Describe the problems ex15 and ex54
    4,096      & 7e-5                  & 160.59 & 22,530  & 35.15  &  5,130  & 29.21  & 4,350   & 5.49 \\
    4,096      & 6e-5                  & 249.27 & 35,520  & 52.13  &  7,950  & 39.24  & 5,790   & 6.35 \\
    8,192      & 6e-5                  & 149.54 & 17,280  & 28.68  &  3,810  & 29.05  & 3,990  & 5.21 \\
    4,096      & 7e-5                  & 160.59 & 22,530  & 35.15  &  5,130  & 29.21  & 4,350   & 5.49 \\
    4,096      & 6e-5                  & 249.27 & 35,520  & 52.13  &  7,950  & 39.24  & 5,790   & 6.35 \\
    8,192      & 6e-5                  & 149.54 & 17,280  & 28.68  &  3,810  & 29.05  & 3,990  & 5.21 \\
-  8,192      & 5e-5                  & 792.11 & 109,590 & 76.83  &  10,470  & 65.20  & 9,030  & 12.14 \\
+  8,192      & 5e-5                  & 785.04 & 109,590 & 76.07  &  10,470  & 69.42 & 9,030  & 11.30 \\
    16,384     & 4e-5                  & 718.61 & 86,400 & 98.98  &  10,830  & 131.86  & 14,790  & 7.26 \\
  \hline
  
    16,384     & 4e-5                  & 718.61 & 86,400 & 98.98  &  10,830  & 131.86  & 14,790  & 7.26 \\
  \hline
  
@@ -872,17 +899,17 @@ Describe the problems ex15 and ex54
  
  \begin{table*}
  \begin{center}
  
  \begin{table*}
  \begin{center}
-\begin{tabular}{|r|r|r|r|r|r|r|r|r|r|} 
+\begin{tabular}{|r|r|r|r|r|r|r|r|r|r|r|} 
  \hline
  
  \hline
  
-  nb. cores   & \multicolumn{2}{c|}{GMRES} & \multicolumn{2}{c|}{TSARM CGLS} &  \multicolumn{2}{c|}{TSARM LSQR} & \multicolumn{3}{c|}{efficiency} \\ 
-\cline{2-10}
-                    & Time  & \# Iter.  & Time  & \# Iter. & Time  & \# Iter. & GMRES & TS CGLS & TS LSQR\\\hline \hline
-   512              & 3,969.69 & 33,120 & 709.57 & 5,790  & 622.76 & 5,070  &    1    &    1    &     1     \\
-   1024             & 1,530.06  & 25,860 & 290.95 & 4,830  & 307.71 & 5,070 &   1.30  &    1.21  &   1.01     \\
-   2048             & 919.62    & 31,470 & 237.52 & 8,040  & 194.22 & 6,510 &  1.08   &    .75   &   .80\\
-   4096             & 405.60    & 28,380 & 111.67 & 7,590  & 91.72  & 6,510 &  1.22   &  .79     &   .84 \\
-   8192             & 785.04   & 109,590 & 76.07  & 10,470 & 69.42 & 9,030  &    .32  &   .58    &  .56 \\
+  nb. cores   & \multicolumn{2}{c|}{GMRES} & \multicolumn{2}{c|}{TSARM CGLS} &  \multicolumn{2}{c|}{TSARM LSQR} & best gain & \multicolumn{3}{c|}{efficiency} \\ 
+\cline{2-7} \cline{9-11}
+                    & Time  & \# Iter.  & Time  & \# Iter. & Time  & \# Iter. &   & GMRES & TS CGLS & TS LSQR\\\hline \hline
+   512              & 3,969.69 & 33,120 & 709.57 & 5,790  & 622.76 & 5,070  & 6.37  &   1    &    1    &     1     \\
+   1024             & 1,530.06  & 25,860 & 290.95 & 4,830  & 307.71 & 5,070 & 5.25  &  1.30  &    1.21  &   1.01     \\
+   2048             & 919.62    & 31,470 & 237.52 & 8,040  & 194.22 & 6,510 & 4.73  & 1.08   &    .75   &   .80\\
+   4096             & 405.60    & 28,380 & 111.67 & 7,590  & 91.72  & 6,510 & 4.42  & 1.22   &  .79     &   .84 \\
+   8192             & 785.04   & 109,590 & 76.07  & 10,470 & 69.42 & 9,030  & 11.30 &   .32  &   .58    &  .56 \\
  
  \hline
  
  
  \hline
  
@@ -900,7 +927,7 @@ Describe the problems ex15 and ex54
  %%%*********************************************************
  %%%*********************************************************
  \section{Conclusion}
  %%%*********************************************************
  %%%*********************************************************
  \section{Conclusion}
-\label{sec:07}
+\label{sec:06}
  %The conclusion goes here. this is more of the conclusion
  %%%*********************************************************
  %%%*********************************************************
  %The conclusion goes here. this is more of the conclusion
  %%%*********************************************************
  %%%*********************************************************