paper.tex

   1 \documentclass[times]{cpeauth}
   2
   3 \usepackage{moreverb}
   4
   5 %\usepackage[dvips,colorlinks,bookmarksopen,bookmarksnumbered,citecolor=red,urlcolor=red]{hyperref}
   6
   7 %\newcommand\BibTeX{{\rmfamily B\kern-.05em \textsc{i\kern-.025em b}\kern-.08em
   8 %T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}}
   9
  10 \def\volumeyear{2015}
  11
  12 \usepackage{graphicx}
  13 \usepackage{wrapfig}
  14 \usepackage{grffile}
  15
  16 \usepackage[T1]{fontenc}
  17 \usepackage[utf8]{inputenc}
  18 \usepackage{amsfonts,amssymb}
  19 \usepackage{amsmath}
  20 \usepackage{algorithm}
  21 \usepackage{algpseudocode}
  22 %\usepackage{amsthm}
  23 \usepackage{graphicx}
  24 \usepackage[american]{babel}
  25 % Extension pour les liens intra-documents (tagged PDF)
  26 % et l'affichage correct des URL (commande \url{http://example.com})
  27 %\usepackage{hyperref}
  28
  29 \usepackage{url}
  30 \DeclareUrlCommand\email{\urlstyle{same}}
  31
  32 \usepackage[autolanguage,np]{numprint}
  33 \AtBeginDocument{%
  34   \renewcommand*\npunitcommand[1]{\text{#1}}
  35   \npthousandthpartsep{}}
  36
  37 \usepackage{xspace}
  38 \usepackage[textsize=footnotesize]{todonotes}
  39
  40 \newcommand{\AG}[2][inline]{%
  41   \todo[color=green!50,#1]{\sffamily\textbf{AG:} #2}\xspace}
  42 \newcommand{\RC}[2][inline]{%
  43   \todo[color=red!10,#1]{\sffamily\textbf{RC:} #2}\xspace}
  44 \newcommand{\LZK}[2][inline]{%
  45   \todo[color=blue!10,#1]{\sffamily\textbf{LZK:} #2}\xspace}
  46 \newcommand{\RCE}[2][inline]{%
  47   \todo[color=yellow!10,#1]{\sffamily\textbf{RCE:} #2}\xspace}
  48
  49 \algnewcommand\algorithmicinput{\textbf{Input:}}
  50 \algnewcommand\Input{\item[\algorithmicinput]}
  51
  52 \algnewcommand\algorithmicoutput{\textbf{Output:}}
  53 \algnewcommand\Output{\item[\algorithmicoutput]}
  54
  55 \newcommand{\TOLG}{\mathit{tol_{gmres}}}
  56 \newcommand{\MIG}{\mathit{maxit_{gmres}}}
  57 \newcommand{\TOLM}{\mathit{tol_{multi}}}
  58 \newcommand{\MIM}{\mathit{maxit_{multi}}}
  59
  60 \usepackage{array}
  61 \usepackage{color, colortbl}
  62 \newcolumntype{M}[1]{>{\centering\arraybackslash}m{#1}}
  63 \newcolumntype{Z}[1]{>{\raggedleft}m{#1}}
  64
  65 \newcolumntype{g}{>{\columncolor{Gray}}c}
  66 \definecolor{Gray}{gray}{0.9}
  67
  68
  69
  70 \begin{document}
  71 \RCE{Titre a confirmer.}
  72 \title{Comparative performance analysis of simulated grid-enabled numerical iterative algorithms}
  73 %\itshape{\journalnamelc}\footnotemark[2]}
  74
  75 \author{    Charles Emile Ramamonjisoa and
  76     David Laiymani and
  77     Arnaud Giersch and
  78     Lilia Ziane Khodja and
  79     Raphaël Couturier
  80 }
  81
  82 \address{
  83         \centering
  84     Femto-ST Institute - DISC Department\\
  85     Université de Franche-Comté\\
  86     Belfort\\
  87     Email: \email{{raphael.couturier,arnaud.giersch,david.laiymani,charles.ramamonjisoa}@univ-fcomte.fr}
  88 }
  89
  90 %% Lilia Ziane Khodja: Department of Aerospace \& Mechanical Engineering\\ Non Linear Computational Mechanics\\ University of Liege\\ Liege, Belgium. Email: l.zianekhodja@ulg.ac.be
  91
  92 \begin{abstract}
  93 ABSTRACT
  94 \end{abstract}
  95
  96 \keywords{Algorithm; distributed; iterative; asynchronous; simulation; simgrid; performance}
  97
  98 \maketitle
  99
 100 \section{Introduction}
 101
 102 \section{The asynchronous iteration model}
 103
 104 \section{SimGrid}
 105
 106 %%%%%%%%%%%%%%%%%%%%%%%%%
 107 %%%%%%%%%%%%%%%%%%%%%%%%%
 108
 109 \section{Two-stage multisplitting methods}
 110 \label{sec:04}
 111 \subsection{Synchronous and asynchronous two-stage methods for sparse linear systems}
 112 \label{sec:04.01}
 113 In this paper we focus on two-stage multisplitting methods in both their synchronous and asynchronous versions~\cite{Frommer92,Szyld92,Bru95}. These iterative methods are based on multisplitting methods~\cite{O'leary85,White86,Alefeld97} and use two nested iterations: the outer iteration and the inner iteration. Let us consider the following sparse linear system of $n$ equations in $\mathbb{R}$
 114 \begin{equation}
 115 Ax=b,
 116 \label{eq:01}
 117 \end{equation}
 118 where $A$ is a sparse square and nonsingular matrix, $b$ is the right-hand side and $x$ is the solution of the system. Our work in this paper is restricted to the block Jacobi splitting method. This approach of multisplitting consists in partitioning the matrix $A$ into $L$ horizontal band matrices of order $\frac{n}{L}\times n$ without overlapping (i.e. sub-vectors $\{x_\ell\}_{1\leq\ell\leq L}$ are disjoint). The two-stage multisplitting methods solve the linear system~(\ref{eq:01}) iteratively as follows
 119 \begin{equation}
 120 x_\ell^{k+1} = A_{\ell\ell}^{-1}(b_\ell - \displaystyle\sum^{L}_{\substack{m=1\\m\neq\ell}}{A_{\ell m}x^k_m}),\mbox{~for~}\ell=1,\ldots,L\mbox{~and~}k=1,2,3,\ldots
 121 \label{eq:02}
 122 \end{equation}
 123 where $x_\ell$ are sub-vectors of the solution $x$, $b_\ell$ are the sub-vectors of the right-hand side $b$, and $A_{\ell\ell}$ and $A_{\ell m}$ are diagonal and off-diagonal blocks of matrix $A$ respectively. The iterations of these methods can naturally be computed in parallel such that each processor or cluster of processors is responsible for solving one splitting as a linear sub-system
 124 \begin{equation}
 125 A_{\ell\ell} x_\ell = c_\ell,\mbox{~for~}\ell=1,\ldots,L,
 126 \label{eq:03}
 127 \end{equation}
 128 where the right-hand sides $c_\ell=b_\ell-\sum_{m\neq\ell}A_{\ell m}x_m$ are computed using the shared vectors $x_m$. In this paper, we use the well-known iterative method GMRES (\emph{Generalized Minimal RESidual})~\cite{saad86} as an inner iteration to approximate the solutions of the different splittings arising from the block Jacobi multisplitting of matrix $A$. Algorithm~\ref{alg:01} shows the main key points of our block Jacobi two-stage method executed by a cluster of processors. In line~\ref{solve}, the linear sub-system~(\ref{eq:03}) is solved in parallel using the GMRES method, where $\MIG$ and $\TOLG$ are the maximum number of inner iterations and the tolerance threshold of GMRES, respectively.
 129
 130 \begin{algorithm}[t]
 131 \caption{Block Jacobi two-stage multisplitting method}
 132 \begin{algorithmic}[1]
 133   \Input $A_\ell$ (sparse matrix), $b_\ell$ (right-hand side)
 134   \Output $x_\ell$ (solution vector)\vspace{0.2cm}
 135   \State Set the initial guess $x^0$
 136   \For {$k=1,2,3,\ldots$ until convergence}
 137     \State $c_\ell=b_\ell-\sum_{m\neq\ell}A_{\ell m}x_m^{k-1}$
 138     \State $x^k_\ell=Solve(A_{\ell\ell},c_\ell,x^{k-1}_\ell,\MIG,\TOLG)$\label{solve}
 139     \State Send $x_\ell^k$ to neighboring clusters\label{send}
 140     \State Receive $\{x_m^k\}_{m\neq\ell}$ from neighboring clusters\label{recv}
 141   \EndFor
 142 \end{algorithmic}
 143 \label{alg:01}
 144 \end{algorithm}
 145
 146 The convergence of the two-stage multisplitting methods, based on synchronous or asynchronous iterations, has been studied by many authors, for example~\cite{Szyld92,Bru95,Bai99,bahi07}. The multisplitting methods are convergent:
 147 \begin{itemize}
 148 \item if $A^{-1}>0$ and the splittings of matrix $A$ are weak regular when the iterations are synchronous, or
 149 \item if $A$ is an M-matrix and its splittings are regular when the iterations are asynchronous.
 150 \end{itemize}
 151
 152 In this paper, we propose two algorithms of two-stage multisplitting methods. The first algorithm is based on the asynchronous model, which allows the communications to be overlapped by computations and reduces the idle times resulting from the synchronizations. So in the asynchronous mode, our two-stage algorithm uses asynchronous outer iterations and asynchronous communications between clusters. The communications (i.e.\ lines~\ref{send} and~\ref{recv} in Algorithm~\ref{alg:01}) are performed by message passing using MPI non-blocking communication routines. The convergence of the asynchronous iterations is detected when all clusters have locally converged
 153 \begin{equation}
 154 k\geq\MIM\mbox{~or~}\|x_\ell^{k+1}-x_\ell^k\|_{\infty }\leq\TOLM,
 155 \label{eq:04}
 156 \end{equation}
 157 where $\MIM$ is the maximum number of outer iterations and $\TOLM$ is the tolerance threshold of the two-stage algorithm.
 158
 159
 160
 161
 162
 163
 164
 165
 166 \subsection{Simulation of two-stage methods using SimGrid framework}
 167
 168 %%%%%%%%%%%%%%%%%%%%%%%%%
 169 %%%%%%%%%%%%%%%%%%%%%%%%%
 170
 171 \section{Experimental Results and Comments}
 172
 173
 174 \textbf{V.1. Setup study and Methodology}
 175
 176 To conduct our study, we have put in place the following methodology
 177 which can be reused with any grid-enabled applications.
 178
 179 \textbf{Step 1} : Choose with the end users the class of algorithms or
 180 the application to be tested. Numerical parallel iterative algorithms
 181 have been chosen for the study in the paper.
 182
 183 \textbf{Step 2}: Collect the software materials needed for the
 184 experimentation. In our case, we have three algorithm variants for the
 185 resolution of the 3D Poisson problem: (1) using the classical GMRES, alias Algo-1 in this
 186 paper, (2) using the multisplitting method, alias Algo-2, and (3) an
 187 enhanced version of the multisplitting method, alias Algo-3. In addition,
 188 the SimGrid simulator has been chosen to simulate the behavior of the
 189 distributed applications. SimGrid is running on the Mesocentre
 190 datacenter at the University of Franche-Comté $[$10$]$ but also in a virtual
 191 machine on a laptop.
 192
 193 \textbf{Step 3}: Fix the criteria which will be used for the future
 194 comparison and analysis of the results. In the scope of this study, we retain
 195 on the one hand the algorithm execution mode (synchronous and asynchronous)
 196 and on the other hand the execution time and the number of iterations of
 197 the application before obtaining the convergence.
 198
 199 \textbf{Step 4}: Set up the different grid testbed environments
 200 which will be simulated in the simulator tool to run the program. The
 201 following architectures have been configured in SimGrid: 2x16 -- that is, a
 202 grid containing 2 clusters with 16 hosts (processors/cores) each --, 4x8,
 203 4x16, 8x8 and 2x50. The network has been designed to operate with a
 204 bandwidth equal to 10Gbits/s (resp. 1Gbits/s) and a latency of 8E-6
 205 microseconds (resp. 5E-5) for the intra-cluster links (resp.
 206 inter-cluster backbone links).
 207
 208 \textbf{Step 5}: Conduct extensive and comprehensive testing
 209 within these configurations by varying the key parameters, especially
 210 the CPU power capacity, the network parameters and also the size of the
 211 input matrix. Note that some parameters, like some program input arguments,
 212 should be invariant to allow the comparison.
 213
 214 \textbf{Step 6} : Collect and analyze the output results.
 215
 216 \textbf{ V.2. Factors impacting distributed applications performance in
 217 a grid environment}
 218
 219 From our previous experience of running distributed applications in a
 220 computational grid, many factors are identified to have an impact on the
 221 program behavior and performance in this specific environment.
 222 First of all, the architecture of the grid itself can obviously
 223 influence the performance results of the program. The performance gain
 224 might theoretically be important when the number of clusters and/or the
 225 number of nodes (processors/cores) in each individual cluster increases.
 226
 227 Another important factor impacting the overall performance of the
 228 application is the network configuration. Two main network parameters
 229 can drastically modify the program output results: (i) the network
 230 bandwidth (bw=bits/s), also known as ``the data-carrying capacity''
 231 $[$13$]$ of the network, is defined as the maximum amount of data that can pass
 232 from one point to another in a unit of time; (ii) the network latency
 233 (lat: microsecond) is defined as the delay between the time the source starts
 234 sending the data and the time the destination has finished
 235 receiving it. Beyond the network characteristics, another impacting factor
 236 is the application-dependent volume of data exchanged between the nodes
 237 in the cluster and between distant clusters. Large volumes of data can be
 238 in transit between the clusters and nodes during the code
 239 execution.
 240
 241 In a grid environment, it is common to distinguish on the one hand the
 242 ``intra-network'', which refers to the links between nodes within a
 243 cluster, and on the other hand the ``inter-network'', which is the
 244 backbone link between clusters. By design, these two networks perform
 245 at different speeds. The intra-network generally works like a high-speed
 246 local network with a high bandwidth and a very low latency. In
 247 contrast, the inter-network connects clusters, sometimes via heterogeneous
 248 network components through the Internet, at a lower speed. The network
 249 between distant clusters might be a bottleneck for the global
 250 performance of the application.
 251
 252 \textbf{V.3 Comparing GMRES and Multisplitting algorithms in
 253 synchronous mode}
 254
 255 In the scope of this paper, our first objective is to demonstrate that
 256 Algo-2 (multisplitting method) shows a better performance in a grid
 257 architecture compared with Algo-1 (classical GMRES), both running in
 258 \textbf{\textit{synchronous mode}}. Better algorithm performance
 259 should mean a smaller number of iterations and a shorter execution time
 260 before reaching the convergence. For a systematic study, the experiments
 261 should show that, for various grid parameter values, the
 262 simulator confirms the targeted outcomes, particularly for poor and
 263 slow networks, focusing on the impact of the communication performance
 264 on the chosen class of algorithm $[$12$]$.
 265
 266 The following paragraphs present the test conditions, the output results
 267 and our comments.
 268
 269
 270 \textit{3.a Executing the algorithms on various computational grid
 271 architectures, scaling up the input matrix size}
 272
 273
 274 % environment
 275 \begin{footnotesize}
 276 \begin{tabular}{r c }
 277  \hline
 278  Grid & 2x16, 4x8, 4x16 and 8x8\\ %\hline
 279  Network & N2 : bw=1Gbs-lat=5E-05 \\ %\hline
 280  Input matrix size & N$_{x}$ =150 x 150 x 150 and\\ %\hline
 281  - & N$_{x}$ =170 x 170 x 170    \\ \hline
 282  \end{tabular}
 283 \end{footnotesize}
 284
 285
 286  Table 1 : Clusters x Nodes with NX=150 or NX=170
 287
 288 \RCE{J'ai voulu mettre les tableaux des données mais je pense que c'est inutile et ça va surcharger}
 289
 290
 291 The results in figure 1 show that the number of iterations of the
 292 classical GMRES does not vary for a given input matrix size, which is not
 293 the case for the multisplitting method.
 294
 295 %\begin{wrapfigure}{l}{60mm}
 296 \begin{figure} [ht!]
 297 \centering
 298 \includegraphics[width=60mm]{cluster_x_nodes_nx_150_and_nx_170.pdf}
 299 \caption{Cluster x Nodes NX=150 and NX=170}
 300 %\label{overflow}}
 301 \end{figure}
 302 %\end{wrapfigure}
 303
 304 Except for the 8x8 cluster, the difference in execution
 305 time between the two algorithms is significant when
 306 comparing different grid architectures, even with the same number of
 307 processors (like 2x16 and 4x8 = 32 processors for example). The
 308 experiment shows the low sensitivity of the multisplitting method
 309 (compared with the classical GMRES) when scaling up to a larger input
 310 matrix size.
 311
 312 \textit{3.b Running on various computational grid architecture}
 313
 314 % environment
 315 \begin{footnotesize}
 316 \begin{tabular}{r c }
 317  \hline
 318  Grid & 2x16, 4x8\\ %\hline
 319  Network & N1 : bw=10Gbs-lat=8E-06 \\ %\hline
 320  - & N2 : bw=1Gbs-lat=5E-05 \\
 321  Input matrix size & N$_{x}$ =150 x 150 x 150\\ \hline \\
 322  \end{tabular}
 323 \end{footnotesize}
 324
 325 %Table 2 : Clusters x Nodes - Networks N1 x N2
 326 %\RCE{idem pour tous les tableaux de donnees}
 327
 328
 329 %\begin{wrapfigure}{l}{60mm}
 330 \begin{figure} [ht!]
 331 \centering
 332 \includegraphics[width=60mm]{cluster_x_nodes_n1_x_n2.pdf}
 333 \caption{Cluster x Nodes N1 x N2}
 334 %\label{overflow}}
 335 \end{figure}
 336 %\end{wrapfigure}
 337
 338 The experiments compare the behavior of the algorithms running first on
 339 a fast inter-cluster network (N1) and then on a less performant network (N2).
 340 Figure 2 shows that end users can reduce the execution time
 341 of both algorithms by using a grid architecture like 4x16 or 8x8: the
 342 performance is increased by a factor of 2. The results also show that
 343 when the network speed drops, the difference between the execution
 344 times can reach more than 25\%.
 345
 346 \textit{3.c Network latency impact on performance}
 347
 348 % environment
 349 \begin{footnotesize}
 350 \begin{tabular}{r c }
 351  \hline
 352  Grid & 2x16\\ %\hline
 353  Network & N1 : bw=1Gbs \\ %\hline
 354  Input matrix size & N$_{x}$ =150 x 150 x 150\\ \hline\\
 355  \end{tabular}
 356 \end{footnotesize}
 357
 358 Table 3 : Network latency impact
 359
 360
 361 \begin{figure} [ht!]
 362 \centering
 363 \includegraphics[width=60mm]{network_latency_impact_on_execution_time.pdf}
 364 \caption{Network latency impact on execution time}
 365 %\label{overflow}}
 366 \end{figure}
 367
 368
 369 According to the results in table 3 and figure 3, a degradation of the network
 370 latency from $8\times10^{-6}$ to $6\times10^{-5}$ implies an absolute
 371 increase of more than 75\% (resp. 82\%) in the execution time of the classical
 372 GMRES (resp. multisplitting) algorithm. In addition, it appears that the
 373 multisplitting method is more tolerant of the network latency variation, with
 374 a lower rate of increase. Consequently, in the worst case
 375 (lat=$6\times10^{-5}$), the execution time for GMRES is almost double the time for
 376 the multisplitting, even though the performance was of the same order
 377 of magnitude with a latency of $8\times10^{-6}$.
 378
 379 \textit{3.d Network bandwidth impacts on performance}
 380
 381 % environment
 382 \begin{footnotesize}
 383 \begin{tabular}{r c }
 384  \hline
 385  Grid & 2x16\\ %\hline
 386  Network & N1 : bw=1Gbs - lat=5E-05 \\ %\hline
 387  Input matrix size & N$_{x}$ =150 x 150 x 150\\ \hline
 388  \end{tabular}
 389 \end{footnotesize}
 390
 391 Table 4 : Network bandwidth impact
 392
 393 \begin{figure} [ht!]
 394 \centering
 395 \includegraphics[width=60mm]{network_bandwith_impact_on_execution_time.pdf}
 396 \caption{Network bandwidth impact on execution time}
 397 %\label{overflow}
 398 \end{figure}
 399
 400
 401
 402 The results of increasing the network bandwidth show an improvement
 403 of the performance, with a reduced execution time for both
 404 algorithms. However, and again in this case, the multisplitting method
 405 presents a better performance in the considered bandwidth interval, with
 406 a gain of 40\%, whereas it is only around 24\% for the classical GMRES.
 407
 408 \textit{3.e Input matrix size impacts on performance}
 409
 410 % environment
 411 \begin{footnotesize}
 412 \begin{tabular}{r c }
 413  \hline
 414  Grid & 4x8\\ %\hline
 415  Network & N2 : bw=1Gbs - lat=5E-05 \\ %\hline
 416  Input matrix size & N$_{x}$ = From 40 to 200\\ \hline
 417  \end{tabular}
 418 \end{footnotesize}
 419
 420 Table 5 : Input matrix size impact
 421
 422 \begin{figure} [ht!]
 423 \centering
 424 \includegraphics[width=60mm]{pb_size_impact_on_execution_time.pdf}
 425 \caption{Problem size impact on execution time}
 426 %\label{overflow}}
 427 \end{figure}
 428
 429 In this experimentation, the input matrix size has been set from
 430 Nx=Ny=Nz=40 to 200 side elements, that is, from $40^{3} = 64{,}000$ to
 431 $200^{3} = 8{,}000{,}000$ points. Obviously, as shown in figure 5,
 432 the execution time for the convergence of the algorithms increases with the
 433 input matrix size. But the interesting results here concern (i) the
 434 drastic increase (300 times) of the number of iterations needed before
 435 the convergence for the classical GMRES algorithm when the matrix size
 436 goes beyond Nx=150; (ii) the classical GMRES execution time, which is also almost
 437 double the convergence time of the multisplitting method from Nx=140
 438 onwards. These findings may greatly help end users to set up
 439 the best and the optimal targeted environment for the application
 440 deployment when focusing on the problem size scale up. Note that the
 441 same test has been done with the grid 2x16, leading to the same conclusion.
 442
 443 \textit{3.f CPU Power impact on performance}
 444
 445 % environment
 446 \begin{footnotesize}
 447 \begin{tabular}{r c }
 448  \hline
 449  Grid & 2x16\\ %\hline
 450  Network & N2 : bw=1Gbs - lat=5E-05 \\ %\hline
 451  Input matrix size & N$_{x}$ = 150 x 150 x 150\\ \hline
 452  \end{tabular}
 453 \end{footnotesize}
 454
 455 Table 6 : CPU Power impact
 456
 457 \begin{figure} [ht!]
 458 \centering
 459 \includegraphics[width=60mm]{cpu_power_impact_on_execution_time.pdf}
 460 \caption{CPU Power impact on execution time}
 461 %\label{overflow}}
 462 \end{figure}
 463
 464 Using the flexibility of the SimGrid simulator, we have tried to determine the
 465 impact on the performance of the algorithms of varying the CPU power of the
 466 cluster nodes from 1 to 19 GFlops. The outputs depicted in figure 6
 467 confirm the performance gain, around 95\% for both methods,
 468 after adding more powerful CPUs. Note that the execution time axis in the
 469 figure is in logarithmic scale.
 470
 471  \textbf{V.4 Comparing GMRES in native synchronous mode and
 472 Multisplitting algorithms in asynchronous mode}
 473
 474 The previous paragraphs highlighted the interest of simulating the
 475 behavior of the application before any deployment in a real environment.
 476 We have focused the study on analyzing the performance by varying the
 477 key factors impacting the results. In the same line, the study compared
 478 the performance of the two proposed methods in \textbf{synchronous
 479 mode}. In this section, with the same methodology, the goal is to
 480 demonstrate the efficiency of the multisplitting method in
 481 \textbf{asynchronous mode} compared with the classical GMRES staying in the
 482 synchronous mode.
 483
 484 Note that the interest of using the asynchronous mode for data exchange
 485 is mainly, in contrast to the synchronous mode, that the current
 486 computation does not wait after a communication operation like sending
 487 some data between nodes. Each processor can continue its local
 488 calculation without waiting for the end of the communication. Thus, the
 489 asynchronous mode may theoretically reduce the overall execution time and can
 490 improve the algorithm performance.
 491
 492 As stated above, the SimGrid simulator tool has been used to prove the
 493 efficiency of the multisplitting in asynchronous mode and to find the
 494 best combination of the grid resources (CPU, network, input matrix size,
 495 \ldots) to get the highest ``relative gain'' in comparison with the
 496 classical GMRES time.
 497
 498
 499 The test conditions are summarized in the table below :
 500
 501 % environment
 502 \begin{footnotesize}
 503 \begin{tabular}{r c }
 504  \hline
 505  Grid & 2x50 totaling 100 processors\\ %\hline
 506  Processors & 1 GFlops to 1.5 GFlops\\
 507    Intra-Network & bw=1.25 Gbits - lat=5E-05 \\ %\hline
 508    Inter-Network & bw=5 Mbits - lat=2E-02\\
 509  Input matrix size & N$_{x}$ = From 62 to 150\\ %\hline
 510  Residual error precision & 10$^{-5}$ to 10$^{-9}$\\ \hline
 511  \end{tabular}
 512 \end{footnotesize}
 513
 514 Again, comprehensive and extensive tests have been conducted, varying the
 515 CPU power and the network parameters (bandwidth and latency) in the
 516 simulator tool with different problem sizes. The relative gains greater
 517 than 1 between the two algorithms have been captured after each step of
 518 the test. Table I below records the best grid configurations
 519 allowing a multisplitting method time more than 2.5 times lower than
 520 the classical GMRES execution and convergence time. The finding of this
 521 experimentation is the tolerance of the multisplitting method to the
 522 low-speed networks that we usually encounter with distant clusters through the
 523 Internet.
 524
 525 % use the same column width for the following three tables
 526 \newlength{\mytablew}\settowidth{\mytablew}{\footnotesize\np{E-11}}
 527 \newenvironment{mytable}[1]{% #1: number of columns for data
 528   \renewcommand{\arraystretch}{1.3}%
 529   \begin{tabular}{|>{\bfseries}r%
 530                   |*{#1}{>{\centering\arraybackslash}p{\mytablew}|}}}{%
 531     \end{tabular}}
 532
 533 \begin{table}[!t]
 534   \centering
 535   \caption{Relative gain of the multisplitting algorithm compared with
 536 the classical GMRES}
 537   \label{"Table 7"}
 538
 539   \begin{mytable}{5}
 540     \hline
 541     bandwidth (Mbit/s)
 542     & 5         & 5         & 5         & 5         & 5 \\
 543     \hline
 544     latency (ms)
 545     & 20      & 20      & 20      & 20      & 20 \\
 546     \hline
 547     power (GFlops)
 548     & 1         & 1         & 1         & 1.5       & 1.5 \\
 549     \hline
 550     size (N)
 551     & 62        & 62        & 62        & 100       & 100 \\
 552     \hline
 553     Precision
 554     & \np{E-5}  & \np{E-8}  & \np{E-9}  & \np{E-11} & \np{E-11} \\
 555     \hline
 556     Relative gain
 557     & 2.52     & 2.55     & 2.52     & 2.57     & 2.54 \\
 558     \hline
 559   \end{mytable}
 560
 561   \smallskip
 562
 563   \begin{mytable}{5}
 564     \hline
 565     bandwidth (Mbit/s)
 566     & 50        & 50        & 50        & 50        & 50 \\
 567     \hline
 568     latency (ms)
 569     & 20      & 20      & 20      & 20      & 20 \\
 570     \hline
 571     power (GFlops)
 572     & 1.5         & 1.5         & 1         & 1.5       & 1.5 \\
 573     \hline
 574     size (N)
 575     & 110       & 120       & 130       & 140       & 150 \\
 576     \hline
 577     Precision
 578     & \np{E-11} & \np{E-11} & \np{E-11} & \np{E-11} & \np{E-11}\\
 579     \hline
 580     Relative gain
 581     & 2.53     & 2.51     & 2.58     & 2.55     & 2.54 \\
 582     \hline
 583   \end{mytable}
 584 \end{table}
 585
 586 \section{Conclusion}
 587 CONCLUSION
 588
 589
 590 \section*{Acknowledgment}
 591
 592
 593 The authors would like to thank\dots{}
 594
 595
 596 \bibliographystyle{wileyj}
 597 \bibliography{biblio}
 598
 599 \end{document}
 600
 601 %%% Local Variables:
 602 %%% mode: latex
 603 %%% TeX-master: t
 604 %%% fill-column: 80
 605 %%% ispell-local-dictionary: "american"
 606 %%% End: