ec 04: tw-stage methods, biblio

[rce2015.git] / paper.tex
diff --git a/paper.tex b/paper.tex

index 0ec97f38115c85d7e6531c1aafd749a88c2c1996..375f7c165ebaa882362f387b927d5aa4a52642e7 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -1,4 +1,3 @@
-%\documentclass[conference]{IEEEtran}
  \documentclass[times]{cpeauth}
  
  \usepackage{moreverb}
@@ -53,7 +52,8 @@
  \algnewcommand\algorithmicoutput{\textbf{Output:}}
  \algnewcommand\Output{\item[\algorithmicoutput]}
  
-\newcommand{\MI}{\mathit{MaxIter}}
+\newcommand{\TOLG}{\mathit{tol_{gmres}}}
+\newcommand{\MIG}{\mathit{maxit_{gmres}}}
  
  \usepackage{array}
  \usepackage{color, colortbl}
@@ -85,6 +85,8 @@
      Email: \email{{raphael.couturier,arnaud.giersch,david.laiymani,charles.ramamonjisoa}@univ-fcomte.fr}
  }
  
+%% Lilia Ziane Khodja: Department of Aerospace \& Mechanical Engineering\\ Non Linear Computational Mechanics\\ University of Liege\\ Liege, Belgium. Email: l.zianekhodja@ulg.ac.be
+
  \begin{abstract}
  ABSTRACT
  \end{abstract}
@@ -99,7 +101,78 @@ ABSTRACT
  
  \section{SimGrid}
  
-\section{Simulation of the multisplitting method}
+%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\section{Two-stage splitting methods}
+\label{sec:04}
+\subsection{Multisplitting methods for sparse linear systems}
+\label{sec:04.01}
+Let us consider the following sparse linear system of $n$ equations in $\mathbb{R}$
+\begin{equation}
+Ax=b,
+\label{eq:01}
+\end{equation}
+where $A$ is a sparse square and nonsingular matrix, $b$ is the right-hand side and $x$ is the solution of the system. The multisplitting methods solve the linear system~(\ref{eq:01}) iteratively as follows
+\begin{equation}
+x^{k+1}=\displaystyle\sum^L_{\ell=1} E_\ell M^{-1}_\ell (N_\ell x^k + b),~k=0,1,2,\ldots
+\label{eq:02}
+\end{equation}
+where a collection of $L$ triplets $(M_\ell, N_\ell, E_\ell)$ defines the multisplitting of matrix $A$~\cite{O'leary85,White86}, such that the different splittings are defined as $A=M_\ell-N_\ell$, where the $M_\ell$ are nonsingular matrices, the $E_\ell$ are diagonal nonnegative weighting matrices satisfying $\sum_\ell E_\ell=I$, and $I$ is the identity matrix. The iterations of the multisplitting methods can naturally be computed in parallel such that each processor or cluster of processors is responsible for solving one splitting as a linear sub-system
+\begin{equation}
+M_\ell y_\ell = c_\ell^k,\mbox{~such that~} c_\ell^k = N_\ell x^k + b,
+\label{eq:03}
+\end{equation}
+then the weighting matrices $E_\ell$ are used to compute the solution of the global system~(\ref{eq:01})
+\begin{equation}
+x^{k+1}=\displaystyle\sum^L_{\ell=1} E_\ell y_\ell.
+\label{eq:04}
+\end{equation}
+The convergence of the multisplitting methods, based on synchronous or asynchronous iterations, has been studied by many authors, for example~\cite{O'leary85,bahi97,Bai99,bahi07}. %It is dependent on the condition
+%\begin{equation}
+%\rho(\displaystyle\sum_{\ell=1}^L E_\ell M^{-1}_\ell N_\ell) < 1,
+%\label{eq:05}
+%\end{equation}
+%where $\rho$ is the spectral radius of the square matrix. 
+The multisplitting methods are convergent: 
+\begin{itemize}
+\item if $A^{-1}>0$ and the splittings of matrix $A$ are weak regular (i.e.\ $M^{-1}\geq 0$ and $M^{-1}N\geq 0$) when the iterations are synchronous, or
+\item if $A$ is an M-matrix and its splittings are regular (i.e.\ $M^{-1}\geq 0$ and $N\geq 0$) when the iterations are asynchronous.
+\end{itemize}
+The solutions of the different linear sub-systems~(\ref{eq:03}) arising from the multisplitting of matrix $A$ can be either computed exactly with a direct method or approximated with an iterative method. In the latter case, the multisplitting methods are called \emph{inner-outer iterative methods} or \emph{two-stage multisplitting methods}. This kind of method uses two nested iterations: the outer iteration and the inner iteration (that of the iterative method).
+
+In this paper we focus on two-stage multisplitting methods, in both their synchronous and asynchronous versions, where the well-known iterative method GMRES (\emph{Generalized Minimal RESidual})~\cite{saad86} is used as the inner iteration. Furthermore, our work in this paper is restricted to the block Jacobi splitting method. This multisplitting approach consists in partitioning the matrix $A$ into $L$ horizontal band matrices of order $\frac{n}{L}\times n$ without overlapping (i.e.\ the weighting matrices $E_\ell$ have only zero and one entries). In this case, the iteration of the multisplitting method presented by~(\ref{eq:03}) and~(\ref{eq:04}) can be rewritten in the following form
+\begin{equation}
+A_{\ell\ell} x_\ell^{k+1} = b_\ell - \displaystyle\sum^{L}_{\substack{m=1\\m\neq\ell}}{A_{\ell m}x^k_m},\mbox{~for~}\ell=1,\ldots,L\mbox{~and~}k=0,1,2,\ldots
+\label{eq:05}
+\end{equation}
+where $x_\ell$ are the sub-vectors of the solution $x$, $b_\ell$ are the sub-vectors of the right-hand side $b$, and $A_{\ell\ell}$ and $A_{\ell m}$ are the diagonal and off-diagonal blocks of matrix $A$ respectively. At each outer iteration $k$, until convergence, each sub-system arising from the block Jacobi multisplitting
+\begin{equation}
+A_{\ell\ell} x_\ell = c_\ell,
+\label{eq:06}
+\end{equation}
+is solved iteratively using the GMRES method, independently from the other sub-systems, by a cluster of processors. The right-hand sides $c_\ell=b_\ell-\sum_{m\neq\ell}A_{\ell m}x_m$ are computed using the shared vectors $x_m$. Algorithm~\ref{alg:01} shows the key points of the block Jacobi two-stage method executed by a cluster of processors. In line~\ref{solve}, the linear sub-system~(\ref{eq:06}) is solved in parallel using the GMRES method, where $\MIG$ and $\TOLG$ are the maximum number of iterations and the tolerance threshold respectively.
+
+\begin{algorithm}[t]
+\caption{Block Jacobi two-stage method}
+\begin{algorithmic}[1]
+  \Input $A_\ell$ (sparse matrix), $b_\ell$ (right-hand side)
+  \Output $x_\ell$ (solution vector)\vspace{0.2cm}
+  \State Set the initial guess $x^0$
+  \For {$k=1,2,3,\ldots$ until convergence}
+    \State $c_\ell=b_\ell-\sum_{m\neq\ell}A_{\ell m}x_m^{k-1}$
+    \State $x^k_\ell=\mathit{Solve}(A_{\ell\ell},c_\ell,x^{k-1}_\ell,\MIG,\TOLG)$ \label{solve}
+    \State Send $x_\ell^k$ to neighboring clusters
+    \State Receive $\{x_m^k\}_{m\neq\ell}$ from neighboring clusters
+  \EndFor
+\end{algorithmic}
+\label{alg:01}
+\end{algorithm}
+
+\subsection{Simulation of two-stage methods using SimGrid framework}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%
+%%%%%%%%%%%%%%%%%%%%%%%%%
  
  \section{Experimental, Results and Comments}
  
@@ -115,8 +188,7 @@ have been chosen for the study in the paper.
  
  \textbf{Step 2} : Collect the software materials needed for the 
  experimentation. In our case, we have three variants algorithms for the 
-resolution of three 3D-Poisson problem: (1) using the classical GMRES 
-\textit{(Generalized Minimal RESidual Method)} alias Algo-1 in this 
+resolution of three 3D-Poisson problem: (1) using the classical GMRES alias Algo-1 in this 
  paper, (2) using the multisplitting method alias Algo-2 and (3) an 
  enhanced version of the multisplitting method as Algo-3. In addition, 
  SIMGRID simulator has been chosen to simulate the behaviors of the 
@@ -226,14 +298,14 @@ The results in figure 1 show the non-variation of the number of
  iterations of classical GMRES for a given input matrix size; it is not 
  the case for the multisplitting method. 
  
-%%\begin{wrapfigure}{l}{60mm}
+%\begin{wrapfigure}{l}{60mm}
  \begin{figure} [ht!]
  \centering
-\includegraphics[width=60mm]{Cluster_x_Nodes_NX=150_and_NX=170.jpg}
-\caption{Cluster x Nodes NX=150 and NX=170 \label{overflow}}
+\includegraphics[width=60mm]{cluster_x_nodes_nx_150_and_nx_170.pdf}
+\caption{Cluster x Nodes NX=150 and NX=170} 
+%\label{overflow}}
  \end{figure}
-%%\end{wrapfigure}
-
+%\end{wrapfigure}
  
  Unless the 8x8 cluster, the time 
  execution difference between the two algorithms is important when 
@@ -260,11 +332,14 @@ matrix size.
  %\RCE{idem pour tous les tableaux de donnees}
  
  
+%\begin{wrapfigure}{l}{60mm}
  \begin{figure} [ht!]
  \centering
-\includegraphics[width=60mm]{Cluster_x_Nodes_N1_x_N2.jpg}
-\caption{Cluster x Nodes N1 x N2\label{overflow}}
+\includegraphics[width=60mm]{cluster_x_nodes_n1_x_n2.pdf}
+\caption{Cluster x Nodes N1 x N2}
+%\label{overflow}}
  \end{figure}
+%\end{wrapfigure}
  
  The experiments compare the behavior of the algorithms running first on 
  speed inter- cluster network (N1) and a less performant network (N2). 
@@ -291,8 +366,9 @@ Table 3 : Network latency impact
  
  \begin{figure} [ht!]
  \centering
-\includegraphics[width=60mm]{Network_latency_impact_on_execution_time.jpg}
-\caption{Network latency impact on execution time\label{overflow}}
+\includegraphics[width=60mm]{network_latency_impact_on_execution_time.pdf}
+\caption{Network latency impact on execution time}
+%\label{overflow}}
  \end{figure}
  
  
@@ -322,8 +398,9 @@ Table 4 : Network bandwidth impact
  
  \begin{figure} [ht!]
  \centering
-\includegraphics[width=60mm]{Network_bandwith_impact_on_execution_time.jpg}
-\caption{Network bandwith impact on execution time\label{overflow}}
+\includegraphics[width=60mm]{network_bandwith_impact_on_execution_time.pdf}
+\caption{Network bandwidth impact on execution time}
+%\label{overflow}
  \end{figure}
  
  
@@ -350,8 +427,9 @@ Table 5 : Input matrix size impact
  
  \begin{figure} [ht!]
  \centering
-\includegraphics[width=60mm]{Pb_size_impact_on_execution_time.jpg}
-\caption{Pb size impact on execution time\label{overflow}}
+\includegraphics[width=60mm]{pb_size_impact_on_execution_time.pdf}
+\caption{Problem size impact on execution time}
+%\label{overflow}}
  \end{figure}
  
  In this experimentation, the input matrix size has been set from 
@@ -384,8 +462,9 @@ Table 6 : CPU Power impact
  
  \begin{figure} [ht!]
  \centering
-\includegraphics[width=60mm]{CPU_Power_impact_on_execution_time.jpg}
-\caption{CPU Power impact on execution time\label{overflow}}
+\includegraphics[width=60mm]{cpu_power_impact_on_execution_time.pdf}
+\caption{CPU Power impact on execution time}
+%\label{overflow}}
  \end{figure}
  
  Using the SIMGRID simulator flexibility, we have tried to determine the 
@@ -524,8 +603,8 @@ The authors would like to thank\dots{}
  % number - used to balance the columns on the last page
  % adjust value as needed - may need to be readjusted if
  % the document is modified later
-\bibliographystyle{IEEEtran}
-\bibliography{hpccBib}
+\bibliographystyle{plain}
+\bibliography{biblio}
  
  \end{document}