\usepackage{graphicx}
\usepackage{algorithm}
\usepackage{algpseudocode}
+\usepackage{multirow}
+\algnewcommand\algorithmicinput{\textbf{Input:}}
+\algnewcommand\Input{\item[\algorithmicinput]}
+
+\algnewcommand\algorithmicoutput{\textbf{Output:}}
+\algnewcommand\Output{\item[\algorithmicoutput]}
+
+\newcommand{\Time}[1]{\mathit{Time}_\mathit{#1}}
+\newcommand{\Prec}{\mathit{prec}}
+\newcommand{\Ratio}{\mathit{Ratio}}
\title{A scalable multisplitting algorithm for solving large sparse linear systems}
+\date{}
\begin{abstract}
-In this paper we revist the krylov multisplitting algorithm presented in
+In this paper we revisit the krylov multisplitting algorithm presented in
\cite{huang1993krylov} which uses a scalar method to minimize the krylov
iterations computed by a multisplitting algorithm. Our new algorithm is based on
a parallel multisplitting algorithm with few blocks of large size using a
thousands of cores are used.
-A completer...
-On ne peut pas parler de tout...\\
+Traditionnal iterative solvers have global synchronizations that penalize the
+scalability. Two possible solutions consists either in using asynchronous
+iterative methods~\cite{ref18} or to use multisplitting algorithms. In this
+paper, we will reconsider the use of a multisplitting method. In opposition to
+traditionnal multisplitting method that suffer from slow convergence, as
+proposed in~\cite{huang1993krylov}, the use of a minimization process can
+drastically improve the convergence.
are required to update the right-hand side vectors $Y_l$, such that
the vectors $X_i$ represent the data dependencies between the
clusters. In this work, we use the parallel GMRES method~\cite{ref34}
-as an inner iteration method for solving the
+as an inner iteration method to solve the
sub-systems~(\ref{sec03:eq03}). It is a well-known iterative method
-which gives good performances for solving sparse linear systems in
+which gives good performances to solve sparse linear systems in
parallel on a cluster of processors.
It should be noted that the convergence of the inner iterative solver
\label{sec03:eq04}
\end{equation}
where for $j\in\{1,\ldots,s\}$, $x^j=[X_1^j,\ldots,X_L^j]$ is a
-solution of the global linear system.%The advantage such a method is that the Krylov subspace does not need to be spanned by an orthogonal basis.
-The advantage of such a Krylov subspace is that we need neither an
-orthogonal basis nor synchronizations between the different clusters
-to generate this basis.
+solution of the global linear system. The advantage of such a Krylov
+subspace is that we need neither an orthogonal basis nor
+synchronizations between the different clusters to generate this
+basis.
The multisplitting method is periodically restarted every $s$
iterations with a new initial guess $\tilde{x}=S\alpha$ which
\text{minimize}~\|b-R\alpha\|_2,
\label{sec03:eq07}
\end{equation}
-where $R^T$ denotes the transpose of the matrix $R$. Since $R$
-(i.e. $AS$) and $b$ are split among $L$ clusters, the symmetric
-positive definite system~(\ref{sec03:eq06}) is solved in
-parallel. Thus, an iterative method would be more appropriate than a
-direct one for solving this system. We use the parallel conjugate
-gradient method for the normal equations CGNR~\cite{S96,refCGNR}.
+where $R^T$ denotes the transpose of the matrix $R$. Since $R$ (i.e.
+$AS$) and $b$ are split among $L$ clusters, the symmetric positive
+definite system~(\ref{sec03:eq06}) is solved in parallel. Thus, an
+iterative method would be more appropriate than a direct one to solve
+this system. We use the parallel conjugate gradient method for the
+normal equations CGNR~\cite{S96,refCGNR}.
\begin{algorithm}[!t]
\caption{A two-stage linear solver with inner iteration GMRES method}
\begin{algorithmic}[1]
-\State Load $A_l$, $B_l$, initial guess $x^0$
+\Input $A_l$ (local sparse matrix), $B_l$ (local right-hand side), $x^0$ (initial guess)
+\Output $X_l$ (local solution vector)\vspace{0.2cm}
+\State Load $A_l$, $B_l$, $x^0$
\State Initialize the minimizer $\tilde{x}^0=x^0$
\For {$k=1,2,3,\ldots$ until the global convergence}
\State Restart with $x^0=\tilde{x}^{k-1}$: \textbf{for} $j=1,2,\ldots,s$ \textbf{do}
\State\hspace{0.5cm} Inner iteration solver: \Call{InnerSolver}{$x^0$, $j$}
-\State\hspace{0.5cm} Construct the basis $S$: add the column vector $X_l^j$ to the matrix $S_l$
+\State\hspace{0.5cm} Construct the basis $S$: add the column vector $X_l^j$ to the matrix $S_l^k$
\State\hspace{0.5cm} Exchange the local solution vector $X_l^j$ with the neighboring clusters
-\State\hspace{0.5cm} Compute the dense matrix $R$: $R_l^j=\sum^L_{i=1}A_{li}X_i^j$
+\State\hspace{0.5cm} Compute the dense matrix $R$: $R_l^{k,j}=\sum^L_{i=1}A_{li}X_i^j$
\State\textbf{end for}
-\State Minimization $\|b-R\alpha\|_2$: \Call{UpdateMinimizer}{$S_l$, $R$, $b$}
+\State Minimization $\|b-R\alpha\|_2$: \Call{UpdateMinimizer}{$S_l$, $R$, $b$, $k$}
\State Local solution of the linear system $Ax=b$: $X_l^k=\tilde{X}_l^k$
\State Exchange the local minimizer $\tilde{X}_l^k$ with the neighboring clusters
\EndFor
\Function {InnerSolver}{$x^0$, $j$}
\State Compute the local right-hand side: $Y_l = B_l - \sum^L_{i=1,i\neq l}A_{li}X_i^0$
-\State Solving the local splitting $A_{ll}X_l^j=Y_l$ with the parallel GMRES method, such that $X_l^0$ is the initial guess.
+\State Solving the local splitting $A_{ll}X_l^j=Y_l$ using the parallel GMRES method, such that $X_l^0$ is the initial guess
\State \Return $X_l^j$
\EndFunction
\Statex
\Function {UpdateMinimizer}{$S_l$, $R$, $b$, $k$}
-\State Solving the normal equations $R^TR\alpha=R^Tb$ in parallel by $L$ clusters using the parallel CGNR method
-\State Compute the local minimizer: $\tilde{X}_l^k=S_l\alpha$
+\State Solving the normal equations $(R^k)^TR^k\alpha^k=(R^k)^Tb$ in parallel by $L$ clusters using the parallel CGNR method
+\State Compute the local minimizer: $\tilde{X}_l^k=S_l^k\alpha^k$
\State \Return $\tilde{X}_l^k$
\EndFunction
\end{algorithmic}
\label{algo:01}
\end{algorithm}
-The main key points of the multisplitting method for solving large
-sparse linear systems are given in Algorithm~\ref{algo:01}. This
+The main key points of the multisplitting method to solve a large
+sparse linear system are given in Algorithm~\ref{algo:01}. This
algorithm is based on a two-stage method with a minimization using the
GMRES iterative method as an inner solver. It is executed in parallel
by each cluster of processors. The matrices and vectors with the
subscript $l$ represent the local data for the cluster $l$, where
$l\in\{1,\ldots,L\}$. The two-stage solver uses two different parallel
-iterative algorithms: the GMRES method for solving each splitting on a
+iterative algorithms: the GMRES method to solve each splitting on a
cluster of processors, and the CGNR method executed in parallel by all
-clusters for minimizing the function error over the Krylov subspace
+clusters to minimize the function error over the Krylov subspace
spanned by $S$. The algorithm requires two global synchronizations
between the $L$ clusters. The first one is performed at line~$12$ in
Algorithm~\ref{algo:01} to exchange the local values of the vector
subroutines.
+\section{Experiments}
+
+In order to illustrate the interest of our algorithm. We have compared our
+algorithm with the GMRES method which a very well used method in many
+situations. We have chosen to focus on only one problem which is very simple to
+implement: a 3 dimension Poisson problem.
+
+\begin{equation}
+\left\{
+ \begin{array}{ll}
+ \nabla u&=f \mbox{~in~} \omega\\
+ u &=0 \mbox{~on~} \Gamma=\partial \omega
+ \end{array}
+ \right.
+\end{equation}
+
+After discretization, with a finite difference scheme, a seven point stencil is
+used. It is well-known that the spectral radius of matrices representing such
+problems are very close to 1. Moreover, the larger the number of discretization
+points is, the closer to 1 the spectral radius is. Hence, to solve a matrix
+obtained for a 3D Poisson problem, the number of iterations is high. Using a
+preconditioner it is possible to reduce the number of iterations but
+preconditioners are not scalable when using many cores.
+
+Doing many experiments with many cores is not easy and require to access to a
+supercomputer with several hours for developping a code and then improving
+it. In the following we presented some experiments we could achieved out on the
+Hector architecture, the previous UK's high-end computing resource, funded by
+the UK Research Councils, which has been stopped in the early 2014.
+
+In the experiments we report the size of the 3D poisson considered
+
+
+The first column shows the size of the problem The size is chosen in order to
+have approximately 50,000 components per core. The second column represents the
+number of cores used. In parenthesis, there is the decomposition used for the
+Krylov multisplitting. The third column and the sixth column respectively show
+the execution time for the GMRES and the Kyrlow multisplitting code. The fourth
+and the seventh column describes the number of iterations. For the
+multisplitting code, the total number of inner iterations is represented in
+parenthesis.
+
+ We also give the other parameters: the restart for the GRMES method....
+
+\begin{table}[p]
+\begin{center}
+\begin{tabular}{|c|c||c|c|c||c|c|c||c|}
+\hline
+\multirow{2}{*}{Pb size}&\multirow{2}{*}{Nb. cores} & \multicolumn{3}{c||}{GMRES} & \multicolumn{3}{c||}{Krylov Multisplitting} & \multirow{2}{*}{Ratio}\\
+ \cline{3-8}
+ & & Time (s) & nb Iter. & $\Delta$ & Time (s)& nb Iter. & $\Delta$ & \\
+\hline
+
+$590^3$ & 4096 (2x2048) & 433.1 & 55,494 & 4.92e-7 & 74.1 & 1,101(8,211) & 6.62e-08 & 5.84 \\
+\hline
+$743^3$ & 8192 (2x4096) & 704.4 & 87,822 & 4.80e-07 & 151.2 & 3,061(14,914) & 5.87e-08 & 4.65 \\
+\hline
+$743^3$ & 8192 (4x2048) & 704.4 & 87,822 & 4.80e-07 & 110.3 & 1,531(12,721) & 1.47e-07& 6.39 \\
+\hline
+
+\end{tabular}
+\caption{Results without preconditioner}
+\label{tab1}
+\end{center}
+\end{table}
+
+
+\begin{table}[p]
+\begin{center}
+\begin{tabular}{|c|c||c|c|c||c|c|c||c|}
+\hline
+\multirow{2}{*}{Pb size}&\multirow{2}{*}{Nb. cores} & \multicolumn{3}{c||}{GMRES} & \multicolumn{3}{c||}{Krylov Multisplitting} & \multirow{2}{*}{Ratio}\\
+ \cline{3-8}
+ & & Time (s) & nb Iter. & $\Delta$ & Time (s)& nb Iter. & $\Delta$ & \\
+\hline
+
+$590^3$ & 4096 (2x2048) & 433.0 & 55,494 & 4.92e-7 & 80.4 & 1,091(9,545) & 7.64e-08 & 5.39 \\
+\hline
+$743^3$ & 8192 (2x4096) & 704.4 & 87,822 & 4.80e-07 & 110.2 & 1,401(12,379) & 1.11e-07 & 6.39 \\
+\hline
+$743^3$ & 8192 (4x2048) & 704.4 & 87,822 & 4.80e-07 & 139.8 & 1,891(15,960) & 1.60e-07& 5.03 \\
+\hline
+
+\end{tabular}
+\caption{Results with preconditioner}
+\label{tab2}
+\end{center}
+\end{table}
+
+\section{Conclusion and perspectives}
+
+Other applications (=> other matrices)\\
+Larger experiments\\
+Async\\
+Overlapping
%%%%%%%%%%%%%%%%%%%%%%%%