X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/rce2015.git/blobdiff_plain/599f8f6cf156ba20bf3ce06d2890246efdf0dc5d..11c3be9a07960a703ef923ef37d0dfa94e346ba7:/paper.tex?ds=sidebyside diff --git a/paper.tex b/paper.tex index 26eb7d6..05a0948 100644 --- a/paper.tex +++ b/paper.tex @@ -54,6 +54,8 @@ \newcommand{\TOLG}{\mathit{tol_{gmres}}} \newcommand{\MIG}{\mathit{maxit_{gmres}}} +\newcommand{\TOLM}{\mathit{tol_{multi}}} +\newcommand{\MIM}{\mathit{maxit_{multi}}} \usepackage{array} \usepackage{color, colortbl} @@ -151,24 +153,37 @@ where $x_\ell$ are sub-vectors of the solution $x$, $b_\ell$ are the sub-vectors A_{\ell\ell} x_\ell = c_\ell, \label{eq:06} \end{equation} -is solved iteratively using GMRES method and independently from other sub-systems by a cluster of processors. The right-hand sides $c_\ell=b_\ell-\sum_{m\neq\ell}A_{\ell m}x_m$ are computed using the shared vectors $x_m$. Algorithm~\ref{alg:01} shows the main key points of the block Jacobi two-stage method executed by a cluster of processors. In line~\ref{solve}, the linear sub-system~(\ref{eq:06}) is solved in parallel using GMRES method where $\MIG$ and $\TOLG$ are the maximum number of iterations and the tolerance threshold respectively. +is solved iteratively using the GMRES method and independently of the other sub-systems by a cluster of processors. The right-hand sides $c_\ell=b_\ell-\sum_{m\neq\ell}A_{\ell m}x_m$ are computed using the shared vectors $x_m$. Algorithm~\ref{alg:01} shows the key points of our block Jacobi two-stage method executed by a cluster of processors. In line~\ref{solve}, the linear sub-system~(\ref{eq:06}) is solved in parallel using the GMRES method, where $\MIG$ and $\TOLG$ are the maximum number of inner iterations and the tolerance threshold of GMRES, respectively.
\begin{algorithm}[t] -\caption{Block Jacobi two-stage method} +\caption{Block Jacobi two-stage multisplitting method} \begin{algorithmic}[1] \Input $A_\ell$ (sparse matrix), $b_\ell$ (right-hand side) \Output $x_\ell$ (solution vector)\vspace{0.2cm} \State Set the initial guess $x^0$ \For {$k=1,2,3,\ldots$ until convergence} \State $c_\ell=b_\ell-\sum_{m\neq\ell}A_{\ell m}x_m^{k-1}$ - \State $x^k_\ell=Solve(A_{\ell\ell},c_\ell,x^{k-1}_\ell,\MIG,\TOLG)$ \label{solve} - \State Send $x_\ell^k$ to neighboring clusters - \State Receive $\{x_m^k\}_{m\neq\ell}$ from neighboring clusters + \State $x^k_\ell=Solve(A_{\ell\ell},c_\ell,x^{k-1}_\ell,\MIG,\TOLG)$\label{solve} + \State Send $x_\ell^k$ to neighboring clusters\label{send} + \State Receive $\{x_m^k\}_{m\neq\ell}$ from neighboring clusters\label{recv} \EndFor \end{algorithmic} \label{alg:01} \end{algorithm} +Multisplitting methods are more advantageous for large distributed computing platforms composed of hundreds or even thousands of processors interconnected by high-latency networks. In this context, the parallel asynchronous model is preferred to the synchronous one to reduce the overall execution times of the algorithms, even if it generally requires more iterations to converge. The asynchronous model allows the communications to be overlapped with computations, which suppresses the idle times resulting from the synchronizations. Thus, in asynchronous mode, our two-stage algorithm uses asynchronous outer iterations and asynchronous communications between clusters. The communications (i.e.\ lines~\ref{send} and~\ref{recv} in Algorithm~\ref{alg:01}) are performed by message passing using MPI non-blocking communication routines.
The convergence of the asynchronous iterations is detected when all clusters have locally converged: +\begin{equation} +k\geq\MIM\mbox{~or~}\|x_\ell^{k+1}-x_\ell^k\|_{\infty }\leq\TOLM, +\label{eq:07} +\end{equation} +where $\MIM$ is the maximum number of outer iterations and $\TOLM$ is the tolerance threshold of the two-stage algorithm. The convergence detection procedure is implemented as follows. All clusters are interconnected by a virtual unidirectional ring network around which a Boolean token circulates from one cluster to another. + + + + + + + \subsection{Simulation of two-stage methods using SimGrid framework} %%%%%%%%%%%%%%%%%%%%%%%%% @@ -540,27 +555,27 @@ internet. \centering \caption{Relative gain of the multisplitting algorithm compared with the classical GMRES} - \label{tab.cluster.2x50} + \label{"Table 7"} \begin{mytable}{6} \hline - bw - & 5 & 5 & 5 & 5 & 5 & 50 \\ + bandwidth (Mbit/s) + & 5 & 5 & 5 & 5 & 5 \\ \hline - lat - & 0.02 & 0.02 & 0.02 & 0.02 & 0.02 & 0.02 \\ + latency (ms) + & 20 & 20 & 20 & 20 & 20 \\ \hline - power - & 1 & 1 & 1 & 1.5 & 1.5 & 1.5 \\ + power (GFlops) + & 1 & 1 & 1 & 1.5 & 1.5 \\ \hline - size - & 62 & 62 & 62 & 100 & 100 & 110 \\ + size (N) + & 62 & 62 & 62 & 100 & 100 \\ \hline - Prec/Eprec - & \np{E-5} & \np{E-8} & \np{E-9} & \np{E-11} & \np{E-11} & \np{E-11} \\ + Precision + & \np{E-5} & \np{E-8} & \np{E-9} & \np{E-11} & \np{E-11} \\ \hline - speedup - & 0.396 & 0.392 & 0.396 & 0.391 & 0.393 & 0.395 \\ + Relative gain + & 2.52 & 2.55 & 2.52 & 2.57 & 2.54 \\ \hline \end{mytable} @@ -568,23 +583,23 @@ the classical GMRES} \begin{mytable}{6} \hline - bw - & 50 & 50 & 50 & 50 & 10 & 10 \\ + bandwidth (Mbit/s) + & 50 & 50 & 50 & 50 & 50 \\ \hline - lat - & 0.02 & 0.02 & 0.02 & 0.02 & 0.03 & 0.01 \\ + latency (ms) + & 20 & 20 & 20 & 20 & 20 \\ \hline - power - & 1.5 & 1.5 & 1.5 & 1.5 & 1 & 1.5 \\ + power (GFlops) + & 1.5 & 1.5 & 1 & 1.5 & 1.5 \\ \hline - size - & 120 & 130 & 140 & 150 & 171 & 171 \\ + size (N) + & 110
& 120 & 130 & 140 & 150 \\ \hline - Prec/Eprec - & \np{E-11} & \np{E-11} & \np{E-11} & \np{E-11} & \np{E-5} & \np{E-5} \\ + Precision + & \np{E-11} & \np{E-11} & \np{E-11} & \np{E-11} & \np{E-11}\\ \hline - speedup - & 0.398 & 0.388 & 0.393 & 0.394 & 0.63 & 0.778 \\ + Relative gain + & 2.53 & 2.51 & 2.58 & 2.55 & 2.54 \\ \hline \end{mytable} \end{table}