new

[Krylov_multi.git] / krylov_multi.tex
diff --git a/krylov_multi.tex b/krylov_multi.tex

index 1010f0775f13dc0ca215b7fe3e5c99b3dcae10ea..dca87e4b28d49a3ac2c44eb530e6ffd38bf8edb8 100644 (file)
--- a/krylov_multi.tex
+++ b/krylov_multi.tex
@@ -5,9 +5,20 @@
  \usepackage{graphicx}
  \usepackage{algorithm}
  \usepackage{algpseudocode}
  \usepackage{graphicx}
  \usepackage{algorithm}
  \usepackage{algpseudocode}
+\usepackage{multirow}
  
  
+\algnewcommand\algorithmicinput{\textbf{Input:}}
+\algnewcommand\Input{\item[\algorithmicinput]}
+
+\algnewcommand\algorithmicoutput{\textbf{Output:}}
+\algnewcommand\Output{\item[\algorithmicoutput]}
+
+\newcommand{\Time}[1]{\mathit{Time}_\mathit{#1}}
+\newcommand{\Prec}{\mathit{prec}}
+\newcommand{\Ratio}{\mathit{Ratio}}
  
  \title{A scalable multisplitting algorithm for solving large sparse linear systems} 
  
  \title{A scalable multisplitting algorithm for solving large sparse linear systems} 
+\date{}
  
  
  
  
  
  
@@ -22,7 +33,7 @@
  
  
  \begin{abstract}
  
  
  \begin{abstract}
-In  this  paper we  revist  the  krylov  multisplitting algorithm  presented  in
+In  this paper  we  revisit  the krylov  multisplitting  algorithm presented  in
  \cite{huang1993krylov}  which  uses  a  scalar  method to  minimize  the  krylov
  iterations computed by a multisplitting algorithm. Our new algorithm is based on
  a  parallel multisplitting  algorithm  with few  blocks  of large  size using  a
  \cite{huang1993krylov}  which  uses  a  scalar  method to  minimize  the  krylov
  iterations computed by a multisplitting algorithm. Our new algorithm is based on
  a  parallel multisplitting  algorithm  with few  blocks  of large  size using  a
@@ -55,8 +66,13 @@ solvers.   However, most  of the  good preconditionners  are not  sclalable when
  thousands of cores are used.
  
  
  thousands of cores are used.
  
  
-A completer...
-On ne peut pas parler de tout...\\
+Traditionnal iterative  solvers have  global synchronizations that  penalize the
+scalability.   Two  possible solutions  consists  either  in using  asynchronous
+iterative  methods~\cite{ref18} or  to  use multisplitting  algorithms. In  this
+paper, we will  reconsider the use of a multisplitting  method. In opposition to
+traditionnal  multisplitting  method  that  suffer  from  slow  convergence,  as
+proposed  in~\cite{huang1993krylov},  the  use  of a  minimization  process  can
+drastically improve the convergence.
  
  
  
  
  
  
@@ -195,9 +211,9 @@ is solved  independently by a cluster of  processors and communication
  are required  to update the  right-hand side vectors $Y_l$,  such that
  the  vectors  $X_i$  represent   the  data  dependencies  between  the
  clusters. In this work,  we use the parallel GMRES method~\cite{ref34}
  are required  to update the  right-hand side vectors $Y_l$,  such that
  the  vectors  $X_i$  represent   the  data  dependencies  between  the
  clusters. In this work,  we use the parallel GMRES method~\cite{ref34}
-as     an     inner    iteration     method     for    solving     the
+as     an     inner      iteration     method     to     solve     the
  sub-systems~(\ref{sec03:eq03}).  It  is a well-known  iterative method
  sub-systems~(\ref{sec03:eq03}).  It  is a well-known  iterative method
-which  gives good performances  for solving  sparse linear  systems in
+which  gives good performances  to solve  sparse linear  systems in
  parallel on a cluster of processors.
  
  It should be noted that  the convergence of the inner iterative solver
  parallel on a cluster of processors.
  
  It should be noted that  the convergence of the inner iterative solver
@@ -223,7 +239,10 @@ S=\{x^1,x^2,\ldots,x^s\},~s\leq n,
  \label{sec03:eq04}
  \end{equation}
  where   for  $j\in\{1,\ldots,s\}$,  $x^j=[X_1^j,\ldots,X_L^j]$   is  a
  \label{sec03:eq04}
  \end{equation}
  where   for  $j\in\{1,\ldots,s\}$,  $x^j=[X_1^j,\ldots,X_L^j]$   is  a
-solution of the  global linear system. The advantage  of such a  Krylov subspace is  that we need  neither an orthogonal basis  nor synchronizations between  the different clusters to generate this basis.
+solution of the  global linear system. The advantage  of such a Krylov
+subspace   is  that   we  need   neither  an   orthogonal   basis  nor
+synchronizations  between  the  different  clusters to  generate  this
+basis.
  
  The  multisplitting   method  is  periodically   restarted  every  $s$
  iterations  with   a  new  initial   guess  $\tilde{x}=S\alpha$  which
  
  The  multisplitting   method  is  periodically   restarted  every  $s$
  iterations  with   a  new  initial   guess  $\tilde{x}=S\alpha$  which
@@ -245,17 +264,19 @@ which is associated with the least squares problem
  \text{minimize}~\|b-R\alpha\|_2,
  \label{sec03:eq07}
  \end{equation}  
  \text{minimize}~\|b-R\alpha\|_2,
  \label{sec03:eq07}
  \end{equation}  
-where  $R^T$  denotes the  transpose  of  the  matrix $R$.  Since  $R$
-(i.e.  $AS$) and  $b$  are  split among  $L$  clusters, the  symmetric
-positive    definite    system~(\ref{sec03:eq06})    is   solved    in
-parallel. Thus, an  iterative method would be more  appropriate than a
-direct  one for  solving this  system. We  use the  parallel conjugate
-gradient method for the normal equations CGNR~\cite{S96,refCGNR}.
+where $R^T$ denotes the transpose  of the matrix $R$.  Since $R$ (i.e.
+$AS$) and  $b$ are  split among $L$  clusters, the  symmetric positive
+definite  system~(\ref{sec03:eq06}) is  solved in  parallel.  Thus, an
+iterative method would be more  appropriate than a direct one to solve
+this system.  We use  the parallel conjugate  gradient method  for the
+normal equations CGNR~\cite{S96,refCGNR}.
  
  \begin{algorithm}[!t]
  \caption{A two-stage linear solver with inner iteration GMRES method}
  \begin{algorithmic}[1]
  
  \begin{algorithm}[!t]
  \caption{A two-stage linear solver with inner iteration GMRES method}
  \begin{algorithmic}[1]
-\State Load $A_l$, $B_l$, initial guess $x^0$
+\Input $A_l$ (local sparse matrix), $B_l$ (local right-hand side), $x^0$ (initial guess)
+\Output $X_l$ (local solution vector)\vspace{0.2cm}
+\State Load $A_l$, $B_l$, $x^0$
  \State Initialize the minimizer $\tilde{x}^0=x^0$
  \For {$k=1,2,3,\ldots$ until the global convergence}
  \State Restart with $x^0=\tilde{x}^{k-1}$: \textbf{for} $j=1,2,\ldots,s$ \textbf{do}
  \State Initialize the minimizer $\tilde{x}^0=x^0$
  \For {$k=1,2,3,\ldots$ until the global convergence}
  \State Restart with $x^0=\tilde{x}^{k-1}$: \textbf{for} $j=1,2,\ldots,s$ \textbf{do}
@@ -288,16 +309,16 @@ gradient method for the normal equations CGNR~\cite{S96,refCGNR}.
  \label{algo:01}
  \end{algorithm}
  
  \label{algo:01}
  \end{algorithm}
  
-The main  key points  of the multisplitting  method for  solving large
-sparse  linear  systems are  given  in Algorithm~\ref{algo:01}.   This
+The  main key points  of the  multisplitting method  to solve  a large
+sparse  linear  system  are  given in  Algorithm~\ref{algo:01}.   This
  algorithm is based on a two-stage method with a minimization using the
  GMRES iterative method as an  inner solver. It is executed in parallel
  by  each cluster  of processors.   The matrices  and vectors  with the
  subscript  $l$ represent  the local  data for  the cluster  $l$, where
  $l\in\{1,\ldots,L\}$. The two-stage solver uses two different parallel
  algorithm is based on a two-stage method with a minimization using the
  GMRES iterative method as an  inner solver. It is executed in parallel
  by  each cluster  of processors.   The matrices  and vectors  with the
  subscript  $l$ represent  the local  data for  the cluster  $l$, where
  $l\in\{1,\ldots,L\}$. The two-stage solver uses two different parallel
-iterative algorithms: the GMRES method for solving each splitting on a
+iterative algorithms:  the GMRES method  to solve each splitting  on a
  cluster of processors, and the CGNR method executed in parallel by all
  cluster of processors, and the CGNR method executed in parallel by all
-clusters for  minimizing the function  error over the  Krylov subspace
+clusters  to minimize  the  function error  over  the Krylov  subspace
  spanned by  $S$.  The  algorithm requires two  global synchronizations
  between the $L$  clusters. The first one is  performed at line~$12$ in
  Algorithm~\ref{algo:01}  to exchange  the local  values of  the vector
  spanned by  $S$.  The  algorithm requires two  global synchronizations
  between the $L$  clusters. The first one is  performed at line~$12$ in
  Algorithm~\ref{algo:01}  to exchange  the local  values of  the vector
@@ -311,6 +332,101 @@ synchronizations   by   using   the   MPI   collective   communication
  subroutines.
  
  
  subroutines.
  
  
+\section{Experiments}
+
+In order  to illustrate  the interest  of our algorithm.   We have  compared our
+algorithm  with  the  GMRES  method  which  a very  well  used  method  in  many
+situations.  We have chosen to focus on only one problem which is very simple to
+implement: a 3 dimension Poisson problem.
+
+\begin{equation}
+\left\{
+                \begin{array}{ll}
+                  \nabla u&=f \mbox{~in~} \omega\\
+                  u &=0 \mbox{~on~}  \Gamma=\partial \omega
+                \end{array}
+              \right.
+\end{equation}
+
+After discretization, with a finite  difference scheme, a seven point stencil is
+used. It  is well-known that the  spectral radius of  matrices representing such
+problems are very close to 1.  Moreover, the larger the number of discretization
+points is,  the closer to 1  the spectral radius  is.  Hence, to solve  a matrix
+obtained for  a 3D Poisson  problem, the number  of iterations is high.  Using a
+preconditioner  it  is   possible  to  reduce  the  number   of  iterations  but
+preconditioners are not scalable when using many cores.
+
+Doing many experiments  with many cores is  not easy and require to  access to a
+supercomputer  with several  hours for  developping  a code  and then  improving
+it. In the following we presented  some experiments we could achieved out on the
+Hector architecture,  the previous UK's  high-end computing resource,  funded by
+the UK Research Councils, which has been stopped in the early 2014.
+
+In the experiments  we report the size of the 3D  poisson considered
+
+
+The first column  shows the size of the  problem The size is chosen  in order to
+have approximately 50,000 components per core.  The second column represents the
+number of  cores used. In parenthesis,  there is the decomposition  used for the
+Krylov multisplitting. The  third column and the sixth  column respectively show
+the execution time for the GMRES  and the Kyrlow multisplitting code. The fourth
+and  the   seventh  column   describes  the  number   of  iterations.   For  the
+multisplitting  code, the  total number  of inner  iterations is  represented in
+parenthesis.
+
+ We  also give  the other parameters:  the restart  for the GRMES method....
+
+\begin{table}[p]
+\begin{center}
+\begin{tabular}{|c|c||c|c|c||c|c|c||c|} 
+\hline
+\multirow{2}{*}{Pb size}&\multirow{2}{*}{Nb. cores} &  \multicolumn{3}{c||}{GMRES} &  \multicolumn{3}{c||}{Krylov Multisplitting} & \multirow{2}{*}{Ratio}\\
+ \cline{3-8}
+           &                   &  Time (s) & nb Iter. & $\Delta$  &   Time (s)& nb Iter. & $\Delta$ & \\
+\hline
+
+$590^3$ & 4096 (2x2048)        &  433.1    & 55,494    & 4.92e-7  &  74.1    & 1,101(8,211) & 6.62e-08  & 5.84   \\
+\hline
+$743^3$ & 8192 (2x4096)        & 704.4     & 87,822    & 4.80e-07 &  151.2   & 3,061(14,914) & 5.87e-08 & 4.65    \\
+\hline
+$743^3$ & 8192 (4x2048)        & 704.4     & 87,822    & 4.80e-07 &  110.3   & 1,531(12,721) & 1.47e-07& 6.39  \\
+\hline
+
+\end{tabular}
+\caption{Results without preconditioner}
+\label{tab1}
+\end{center}
+\end{table}
+
+
+\begin{table}[p]
+\begin{center}
+\begin{tabular}{|c|c||c|c|c||c|c|c||c|} 
+\hline
+\multirow{2}{*}{Pb size}&\multirow{2}{*}{Nb. cores} &  \multicolumn{3}{c||}{GMRES} &  \multicolumn{3}{c||}{Krylov Multisplitting} & \multirow{2}{*}{Ratio}\\
+ \cline{3-8}
+           &                   &  Time (s) & nb Iter. & $\Delta$  &   Time (s)& nb Iter. & $\Delta$ & \\
+\hline
+
+$590^3$ & 4096 (2x2048)        &  433.0    & 55,494    & 4.92e-7  &  80.4    & 1,091(9,545) & 7.64e-08  & 5.39   \\
+\hline
+$743^3$ & 8192 (2x4096)        & 704.4     & 87,822    & 4.80e-07 &  110.2   & 1,401(12,379) & 1.11e-07 & 6.39    \\
+\hline
+$743^3$ & 8192 (4x2048)        & 704.4     & 87,822    & 4.80e-07 &  139.8   & 1,891(15,960) & 1.60e-07& 5.03  \\
+\hline
+
+\end{tabular}
+\caption{Results with preconditioner}
+\label{tab2}
+\end{center}
+\end{table}
+
+\section{Conclusion and perspectives}
+
+Other applications (=> other matrices)\\
+Larger experiments\\
+Async\\
+Overlapping
  
  
  %%%%%%%%%%%%%%%%%%%%%%%%
  
  
  %%%%%%%%%%%%%%%%%%%%%%%%