asynchronous tow-stage algorithm

[rce2015.git] / paper.tex
diff --git a/paper.tex b/paper.tex

index 375f7c165ebaa882362f387b927d5aa4a52642e7..071f020fd22cf0f1d8373230176c519a833dd166 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -54,6 +54,8 @@
  
  \newcommand{\TOLG}{\mathit{tol_{gmres}}}
  \newcommand{\MIG}{\mathit{maxit_{gmres}}}
  
  \newcommand{\TOLG}{\mathit{tol_{gmres}}}
  \newcommand{\MIG}{\mathit{maxit_{gmres}}}
+\newcommand{\TOLM}{\mathit{tol_{multi}}}
+\newcommand{\MIM}{\mathit{maxit_{multi}}}
  
  \usepackage{array}
  \usepackage{color, colortbl}
  
  \usepackage{array}
  \usepackage{color, colortbl}
@@ -151,24 +153,37 @@ where $x_\ell$ are sub-vectors of the solution $x$, $b_\ell$ are the sub-vectors
  A_{\ell\ell} x_\ell = c_\ell,
  \label{eq:06}
  \end{equation}
  A_{\ell\ell} x_\ell = c_\ell,
  \label{eq:06}
  \end{equation}
-is solved iteratively using GMRES method and independently from other sub-systems by a cluster of processors. The right-hand sides $c_\ell=b_\ell-\sum_{m\neq\ell}A_{\ell m}x_m$ are computed using the shared vectors $x_m$. Algorithm~\ref{alg:01} shows the main key points of the block Jacobi two-stage method executed by a cluster of processors. In line~\ref{solve}, the linear sub-system~(\ref{eq:06}) is solved in parallel using GMRES method where $\MIG$ and $\TOLG$ are the maximum number of iterations and the tolerance threshold respectively.  
+is solved iteratively using GMRES method and independently from other sub-systems by a cluster of processors. The right-hand sides $c_\ell=b_\ell-\sum_{m\neq\ell}A_{\ell m}x_m$ are computed using the shared vectors $x_m$. Algorithm~\ref{alg:01} shows the main key points of our block Jacobi two-stage method executed by a cluster of processors. In line~\ref{solve}, the linear sub-system~(\ref{eq:06}) is solved in parallel using GMRES method where $\MIG$ and $\TOLG$ are the maximum number of inner iterations and the tolerance threshold of GMRES respectively.  
  
  \begin{algorithm}[t]
  
  \begin{algorithm}[t]
-\caption{Block Jacobi two-stage method}
+\caption{Block Jacobi two-stage multisplitting method}
  \begin{algorithmic}[1]
    \Input $A_\ell$ (sparse matrix), $b_\ell$ (right-hand side)
    \Output $x_\ell$ (solution vector)\vspace{0.2cm}
    \State Set the initial guess $x^0$
    \For {$k=1,2,3,\ldots$ until convergence}
      \State $c_\ell=b_\ell-\sum_{m\neq\ell}A_{\ell m}x_m^{k-1}$
  \begin{algorithmic}[1]
    \Input $A_\ell$ (sparse matrix), $b_\ell$ (right-hand side)
    \Output $x_\ell$ (solution vector)\vspace{0.2cm}
    \State Set the initial guess $x^0$
    \For {$k=1,2,3,\ldots$ until convergence}
      \State $c_\ell=b_\ell-\sum_{m\neq\ell}A_{\ell m}x_m^{k-1}$
-    \State $x^k_\ell=Solve(A_{\ell\ell},c_\ell,x^{k-1}_\ell,\MIG,\TOLG)$ \label{solve}
-    \State Send $x_\ell^k$ to neighboring clusters
-    \State Receive $\{x_m^k\}_{m\neq\ell}$ from neighboring clusters
+    \State $x^k_\ell=Solve(A_{\ell\ell},c_\ell,x^{k-1}_\ell,\MIG,\TOLG)$\label{solve}
+    \State Send $x_\ell^k$ to neighboring clusters\label{send}
+    \State Receive $\{x_m^k\}_{m\neq\ell}$ from neighboring clusters\label{recv}
    \EndFor
  \end{algorithmic}
  \label{alg:01}
  \end{algorithm}
  
    \EndFor
  \end{algorithmic}
  \label{alg:01}
  \end{algorithm}
  
+Multisplitting methods are more advantageous for large distributed computing platforms composed of hundreds or even thousands of processors interconnected by high latency networks. In this context, the parallel asynchronous model is preferred to the synchronous one to reduce overall execution times of the algorithms, even if it generally requires more iterations to converge. The asynchronous model allows the communications to be overlapped by computations which suppresses the idle times resulting from the synchronizations. So in asynchronous mode, our two-stage algorithm uses asynchronous outer iterations and asynchronous communications between clusters. The communications (i.e. lines~\ref{send} and~\ref{recv} in Algorithm~\ref{alg:01}) are performed by message passing using MPI non-blocking communication routines. The convergence of the asynchronous iterations is detected when all clusters have locally converged
+\begin{equation}
+k\geq\MIM\mbox{~or~}\|x_\ell^{k+1}-x_\ell^k\|_{\infty }\leq\TOLM,
+\label{eq:07}
+\end{equation}    
+where $\MIM$ is the maximum number of outer iterations and $\TOLM$ is the tolerance threshold of the two-stage algorithm. The procedure of the convergence detection is implemented as follows. All clusters are interconnected by a virtual unidirectional ring network around which a Boolean token circulates from a cluster to another.  
+
+
+
+
+
+
+
  \subsection{Simulation of two-stage methods using SimGrid framework}
  
  %%%%%%%%%%%%%%%%%%%%%%%%%
  \subsection{Simulation of two-stage methods using SimGrid framework}
  
  %%%%%%%%%%%%%%%%%%%%%%%%%
@@ -599,11 +614,7 @@ CONCLUSION
  The authors would like to thank\dots{}
  
  
  The authors would like to thank\dots{}
  
  
-% trigger a \newpage just before the given reference
-% number - used to balance the columns on the last page
-% adjust value as needed - may need to be readjusted if
-% the document is modified later
-\bibliographystyle{plain}
+\bibliographystyle{wileyj}
  \bibliography{biblio}
  
  \end{document}
  \bibliography{biblio}
  
  \end{document}