paper.tex

   1 \documentclass[times]{cpeauth}
   2
   3 \usepackage{moreverb}
   4
   5 %\usepackage[dvips,colorlinks,bookmarksopen,bookmarksnumbered,citecolor=red,urlcolor=red]{hyperref}
   6
   7 %\newcommand\BibTeX{{\rmfamily B\kern-.05em \textsc{i\kern-.025em b}\kern-.08em
   8 %T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}}
   9
  10 \def\volumeyear{2015}
  11
  12 \usepackage{graphicx}
  13 \usepackage{wrapfig}
  14 \usepackage{grffile}
  15
  16 \usepackage[T1]{fontenc}
  17 \usepackage[utf8]{inputenc}
  18 \usepackage{amsfonts,amssymb}
  19 \usepackage{amsmath}
  20 \usepackage{algorithm}
  21 \usepackage{algpseudocode}
  22 %\usepackage{amsthm}
  23 \usepackage{graphicx}
  24 \usepackage[american]{babel}
  25 % Package for intra-document links (tagged PDF)
  26 % and correct display of URLs (command \url{http://example.com})
  27 %\usepackage{hyperref}
  28
  29 \usepackage{url}
  30 \DeclareUrlCommand\email{\urlstyle{same}} % \email: url-like command typeset with \urlstyle{same}
  31
  32 \usepackage[autolanguage,np]{numprint}
  33 \AtBeginDocument{% numprint tweaks deferred until \begin{document}
  34   \renewcommand*\npunitcommand[1]{\text{#1}} % wrap the unit argument in \text
  35   \npthousandthpartsep{}} % empty separator for the thousandth part
  36
  37 \usepackage{xspace}
  38 \usepackage[textsize=footnotesize]{todonotes}
  39
  40 \newcommand{\AG}[2][inline]{% reviewer note tagged "AG:" (green!50); optional arg 1 = extra \todo options (default: inline), arg 2 = note text
  41   \todo[color=green!50,#1]{\sffamily\textbf{AG:} #2}\xspace}
  42 \newcommand{\RC}[2][inline]{% same interface as \AG, tagged "RC:", color red!10
  43   \todo[color=red!10,#1]{\sffamily\textbf{RC:} #2}\xspace}
  44 \newcommand{\LZK}[2][inline]{% same interface as \AG, tagged "LZK:", color blue!10
  45   \todo[color=blue!10,#1]{\sffamily\textbf{LZK:} #2}\xspace}
  46 \newcommand{\RCE}[2][inline]{% same interface as \AG, tagged "RCE:", color yellow!10
  47   \todo[color=yellow!10,#1]{\sffamily\textbf{RCE:} #2}\xspace}
  48
  49 \algnewcommand\algorithmicinput{\textbf{Input:}} % bold label text used by \Input
  50 \algnewcommand\Input{\item[\algorithmicinput]} % \Input: item whose label is \algorithmicinput
  51
  52 \algnewcommand\algorithmicoutput{\textbf{Output:}} % bold label text used by \Output
  53 \algnewcommand\Output{\item[\algorithmicoutput]} % \Output: item whose label is \algorithmicoutput
  54
  55 \newcommand{\MI}{\mathit{MaxIter}} % math-mode identifier "MaxIter" set as one italic word
  56
  57 \usepackage{array}
  58 \usepackage{color, colortbl}
  59 \newcolumntype{M}[1]{>{\centering\arraybackslash}m{#1}} % M{w}: centered m-column of width w
  60 \newcolumntype{Z}[1]{>{\raggedleft\arraybackslash}m{#1}} % Z{w}: ragged-left m-column of width w; \arraybackslash restores \\ as row terminator so Z can be the last column
  61
  62 \newcolumntype{g}{>{\columncolor{Gray}}c} % g: centered column with the Gray background defined on the next line
  63 \definecolor{Gray}{gray}{0.9} % Gray: gray color model, level 0.9
  64
  65
  66
  67 \begin{document}
  68 \RCE{Titre a confirmer.}
  69 \title{Comparative performance analysis of simulated grid-enabled numerical iterative algorithms}
  70 %\itshape{\journalnamelc}\footnotemark[2]}
  71
  72 \author{    Charles Emile Ramamonjisoa and
  73     David Laiymani and
  74     Arnaud Giersch and
  75     Lilia Ziane Khodja and
  76     Raphaël Couturier
  77 }
  78
  79 \address{
  80         \centering
  81     Femto-ST Institute - DISC Department\\
  82     Université de Franche-Comté\\
  83     Belfort\\
  84     Email: \email{{raphael.couturier,arnaud.giersch,david.laiymani,charles.ramamonjisoa}@univ-fcomte.fr}
  85 }
  86
  87 \begin{abstract}
  88 ABSTRACT
  89 \end{abstract}
  90
  91 \keywords{Algorithm; distributed; iterative; asynchronous; simulation; simgrid; performance}
  92
  93 \maketitle
  94
  95 \section{Introduction}
  96
  97 \section{The asynchronous iteration model}
  98
  99 \section{SimGrid}
 100
 101 %%%%%%%%%%%%%%%%%%%%%%%%%
 102 %%%%%%%%%%%%%%%%%%%%%%%%%
 103
 104 \section{Two-stage splitting methods}
 105 \label{sec:04}
 106 \subsection{Multisplitting methods for sparse linear systems}
 107 \label{sec:04.01}
 108 Let us consider the following sparse linear system of $n$ equations in $\mathbb{R}$:
 109 \begin{equation}
 110 Ax=b,
 111 \label{eq:01}
 112 \end{equation}
 113 where $A$ is a sparse, square and nonsingular matrix, $b$ is the right-hand side and $x$ is the solution of the system. The multisplitting methods solve the linear system~\eqref{eq:01} iteratively as follows:
 114 \begin{equation}
 115 x^{k+1}=\displaystyle\sum^L_{\ell=1} E_\ell M^{-1}_\ell (N_\ell x^k + b),~k=1,2,3,\ldots
 116 \label{eq:02}
 117 \end{equation}
 118 where a collection of $L$ triplets $(M_\ell, N_\ell, E_\ell)$ defines the multisplitting of matrix $A$, such that: the different splittings are defined as $A=M_\ell-N_\ell$ where $M_\ell$ are nonsingular matrices, and $E_\ell$ are diagonal nonnegative weighting matrices satisfying $\sum_\ell E_\ell=I$, where $I$ is the identity matrix.
 119
 120 \subsection{Simulation of two-stage methods using SimGrid framework}
 121
 122 %%%%%%%%%%%%%%%%%%%%%%%%%
 123 %%%%%%%%%%%%%%%%%%%%%%%%%
 124
 125 \section{Experimental Results and Comments}
 126
 127
 128 \textbf{V.1. Setup study and Methodology}
 129
 130 To conduct our study, we have put in place the following methodology
 131 which can be reused with any grid-enabled applications.
 132
 133 \textbf{Step 1} : Choose with the end users the class of algorithms or
 134 the application to be tested. Numerical parallel iterative algorithms
 135 have been chosen for the study in the paper.
 136
 137 \textbf{Step 2} : Collect the software materials needed for the
 138 experimentation. In our case, we have three algorithm variants for the
 139 resolution of a 3D Poisson problem: (1) using the classical GMRES
 140 \textit{(Generalized Minimal RESidual Method)}, alias Algo-1 in this
 141 paper, (2) using the multisplitting method, alias Algo-2, and (3) an
 142 enhanced version of the multisplitting method, alias Algo-3. In addition,
 143 the SIMGRID simulator has been chosen to simulate the behavior of the
 144 distributed applications. SIMGRID runs on the Mesocentre
 145 datacenter of the University of Franche-Comté $[$10$]$ but also in a virtual
 146 machine on a laptop.
 147
 148 \textbf{Step 3} : Fix the criteria which will be used for the future
 149 results comparison and analysis. In the scope of this study, we retain
 150 on the one hand the algorithm execution mode (synchronous and asynchronous)
 151 and on the other hand the execution time and the number of iterations of
 152 the application before reaching convergence.
 153
 154 \textbf{Step 4} : Set up the different grid testbed environments
 155 which will be simulated in the simulator tool to run the program. The
 156 following architectures have been configured in SIMGRID: 2x16 --- that is, a
 157 grid containing 2 clusters with 16 hosts (processors/cores) each ---, 4x8,
 158 4x16, 8x8 and 2x50. The network has been designed to operate with a
 159 bandwidth equal to 10~Gbits/s (resp. 1~Gbits/s) and a latency of 8E-6
 160 microseconds (resp. 5E-5) for the intra-cluster links (resp.
 161 inter-cluster backbone links).
 162
 163 \textbf{Step 5}: Conduct extensive and comprehensive testing
 164 within these configurations by varying the key parameters, especially
 165 the CPU power capacity, the network parameters and also the size of the
 166 input matrix. Note that some parameters, like some program input arguments,
 167 should remain invariant to allow the comparison.
 168
 169 \textbf{Step 6} : Collect and analyze the output results.
 170
 171 \textbf{ V.2. Factors impacting distributed applications performance in
 172 a grid environment}
 173
 174 From our previous experience on running distributed application in a
 175 computational grid, many factors are identified to have an impact on the
 176 program behavior and performance on this specific environment. Mainly,
 177 first of all, the architecture of the grid itself can obviously
 178 influence the performance results of the program. The performance gain
 179 might be important theoretically when the number of clusters and/or the
 180 number of nodes (processors/cores) in each individual cluster increase.
 181
 182 Another important factor impacting the overall performance of the
 183 application is the network configuration. Two main network parameters
 184 can drastically modify the program output results: (i) the network
 185 bandwidth (bw=bits/s), also known as ``the data-carrying capacity''
 186 $[$13$]$ of the network, defined as the maximum amount of data that can pass
 187 from one point to another in a unit of time; (ii) the network latency
 188 (lat : microsecond), defined as the delay between the time a source starts
 189 sending the data and the time the destination has finished
 190 receiving it. Beyond the network characteristics, another impacting factor
 191 is the application-dependent volume of data exchanged between the nodes
 192 in the cluster and between distant clusters. Large volumes of data can be
 193 in transit between the clusters and nodes during the code
 194 execution.
 195
 196  In a grid environment, it is common to distinguish, on the one hand, the
 197 ``intra-network'', which refers to the links between nodes within a
 198 cluster and, on the other hand, the ``inter-network'', which is the
 199 backbone link between clusters. By design, these two networks operate
 200 at different speeds. The intra-network generally works like a high-speed
 201 local network with a high bandwidth and very low latency. In
 202 contrast, the inter-network connects clusters, sometimes via heterogeneous
 203 network components through the Internet, at a lower speed. The network
 204 between distant clusters might be a bottleneck for the global
 205 performance of the application.
 206
 207 \textbf{V.3 Comparing GMRES and Multisplitting algorithms in
 208 synchronous mode}
 209
 210 In the scope of this paper, our first objective is to demonstrate that
 211 Algo-2 (Multisplitting method) shows a better performance in grid
 212 architecture compared with Algo-1 (Classical GMRES), both running in
 213 \textbf{\textit{synchronous mode}}. Better algorithm performance
 214 should mean a smaller number of iterations and a shorter execution time
 215 before reaching the convergence. For a systematic study, the experiments
 216 should figure out that, for various grid parameter values, the
 217 simulator will confirm the targeted outcomes, particularly for poor and
 218 slow networks, focusing on the impact of the communication performance
 219 on the chosen class of algorithm $[$12$]$.
 220
 221 The following paragraphs present the test conditions, the output results
 222 and our comments.
 223
 224
 225 \textit{3.a Executing the algorithms on various computational grid
 226 architecture scaling up the input matrix size}
 227 \\
 228
 229 % environment
 230 \begin{footnotesize}
 231 \begin{tabular}{r c }
 232  \hline
 233  Grid & 2x16, 4x8, 4x16 and 8x8\\ %\hline
 234  Network & N2 : bw=1Gbs-lat=5E-05 \\ %\hline
 235  Input matrix size & N$_{x}$ =150 x 150 x 150 and\\ %\hline
 236  - & N$_{x}$ =170 x 170 x 170    \\ \hline
 237  \end{tabular}
 238 \end{footnotesize}
 239
 240
 241  Table 1 : Clusters x Nodes with NX=150 or NX=170
 242
 243 \RCE{J'ai voulu mettre les tableaux des données mais je pense que c'est inutile et ça va surcharger}
 244
 245
 246 The results in figure 1 show the non-variation of the number of
 247 iterations of classical GMRES for a given input matrix size; it is not
 248 the case for the multisplitting method.
 249
 250 %\begin{wrapfigure}{l}{60mm}
 251 \begin{figure} [ht!]
 252 \centering
 253 \includegraphics[width=60mm]{cluster_x_nodes_nx_150_and_nx_170.pdf}
 254 \caption{Cluster x Nodes NX=150 and NX=170}
 255 %\label{overflow}}
 256 \end{figure}
 257 %\end{wrapfigure}
 258
 259 Except for the 8x8 cluster, the execution time
 260 difference between the two algorithms is significant when
 261 comparing different grid architectures, even with the same number of
 262 processors (like 2x16 and 4x8 = 32 processors for example). The
 263 experiment shows the low sensitivity of the multisplitting method
 264 (compared with the classical GMRES) when scaling up to larger input
 265 matrix sizes.
 266
 267 \textit{3.b Running on various computational grid architecture}
 268
 269 % environment
 270 \begin{footnotesize}
 271 \begin{tabular}{r c }
 272  \hline
 273  Grid & 2x16, 4x8\\ %\hline
 274  Network & N1 : bw=10Gbs-lat=8E-06 \\ %\hline
 275  - & N2 : bw=1Gbs-lat=5E-05 \\
 276  Input matrix size & N$_{x}$ =150 x 150 x 150\\ \hline \\
 277  \end{tabular}
 278 \end{footnotesize}
 279
 280 %Table 2 : Clusters x Nodes - Networks N1 x N2
 281 %\RCE{idem pour tous les tableaux de donnees}
 282
 283
 284 %\begin{wrapfigure}{l}{60mm}
 285 \begin{figure} [ht!]
 286 \centering
 287 \includegraphics[width=60mm]{cluster_x_nodes_n1_x_n2.pdf}
 288 \caption{Cluster x Nodes N1 x N2}
 289 %\label{overflow}}
 290 \end{figure}
 291 %\end{wrapfigure}
 292
 293 The experiments compare the behavior of the algorithms running first on
 294 a fast inter-cluster network (N1) and then on a less performant network (N2).
 295 Figure 2 shows that end users will reduce the execution time
 296 for both algorithms by using a grid architecture like 4x16 or 8x8: the
 297 performance was increased by a factor of 2. The results also show that
 298 when the network speed drops down, the difference between the execution
 299 times can reach more than 25\%.
 300
 301 \textit{3.c Network latency impacts on performance}
 302
 303 % environment
 304 \begin{footnotesize}
 305 \begin{tabular}{r c }
 306  \hline
 307  Grid & 2x16\\ %\hline
 308  Network & N1 : bw=1Gbs \\ %\hline
 309  Input matrix size & N$_{x}$ =150 x 150 x 150\\ \hline\\
 310  \end{tabular}
 311 \end{footnotesize}
 312
 313 Table 3 : Network latency impact
 314
 315
 316 \begin{figure} [ht!]
 317 \centering
 318 \includegraphics[width=60mm]{network_latency_impact_on_execution_time.pdf}
 319 \caption{Network latency impact on execution time}
 320 %\label{overflow}}
 321 \end{figure}
 322
 323
 324 According to the results in Table 3 and Figure 3, a degradation of the network
 325 latency from $8\times10^{-6}$ to $6\times10^{-5}$ implies an absolute
 326 increase of more than 75\% (resp. 82\%) in the execution time for the classical
 327 GMRES (resp. multisplitting) algorithm. In addition, it appears that the
 328 multisplitting method better tolerates the network latency variation, with
 329 a lower rate of increase. Consequently, in the worst case (lat=$6\times10^{-5}$),
 330 the execution time for GMRES is almost double the time for
 331 the multisplitting, even though the performance was of the same order
 332 of magnitude with a latency of $8\times10^{-6}$.
 333
 334 \textit{3.d Network bandwidth impacts on performance}
 335
 336 % environment
 337 \begin{footnotesize}
 338 \begin{tabular}{r c }
 339  \hline
 340  Grid & 2x16\\ %\hline
 341  Network & N1 : bw=1Gbs - lat=5E-05 \\ %\hline
 342  Input matrix size & N$_{x}$ =150 x 150 x 150\\ \hline
 343  \end{tabular}
 344 \end{footnotesize}
 345
 346 Table 4 : Network bandwidth impact
 347
 348 \begin{figure} [ht!]
 349 \centering
 350 \includegraphics[width=60mm]{network_bandwith_impact_on_execution_time.pdf}
 351 \caption{Network bandwidth impact on execution time}
 352 %\label{overflow}
 353 \end{figure}
 354
 355
 356
 357 The results of increasing the network bandwidth show an improvement
 358 of the performance, with a reduced execution time for both
 359 algorithms. However, and again in this case, the multisplitting method
 360 presents a better performance in the considered bandwidth interval, with
 361 a gain of 40\%, compared with only around 24\% for the classical GMRES.
 362
 363 \textit{3.e Input matrix size impacts on performance}
 364
 365 % environment
 366 \begin{footnotesize}
 367 \begin{tabular}{r c }
 368  \hline
 369  Grid & 4x8\\ %\hline
 370  Network & N2 : bw=1Gbs - lat=5E-05 \\ %\hline
 371  Input matrix size & N$_{x}$ = From 40 to 200\\ \hline
 372  \end{tabular}
 373 \end{footnotesize}
 374
 375 Table 5 : Input matrix size impact
 376
 377 \begin{figure} [ht!]
 378 \centering
 379 \includegraphics[width=60mm]{pb_size_impact_on_execution_time.pdf}
 380 \caption{Pb size impact on execution time}
 381 %\label{overflow}}
 382 \end{figure}
 383
 384 In this experimentation, the input matrix size has been set from
 385 Nx=Ny=Nz=40 to 200 side elements, that is from 40$^{3}$ = \np{64000} to
 386 200$^{3}$ = \np{8000000} points. Obviously, as shown in Figure 5,
 387 the execution time for the algorithms convergence increases with the
 388 input matrix size. But the interesting results here concern (i) the
 389 drastic increase (300 times) of the number of iterations needed before
 390 the convergence for the classical GMRES algorithm when the matrix size
 391 goes beyond Nx=150; (ii) the classical GMRES execution time, which is also almost
 392 double from Nx=140 compared with the convergence time of the
 393 multisplitting method. These findings may greatly help end users to set up
 394 the best and the optimal targeted environment for the application
 395 deployment when focusing on the problem size scale up. Note that the
 396 same test has been done with the grid 2x16, leading to the same conclusion.
 397
 398 \textit{3.f CPU Power impact on performance}
 399
 400 % environment
 401 \begin{footnotesize}
 402 \begin{tabular}{r c }
 403  \hline
 404  Grid & 2x16\\ %\hline
 405  Network & N2 : bw=1Gbs - lat=5E-05 \\ %\hline
 406  Input matrix size & N$_{x}$ = 150 x 150 x 150\\ \hline
 407  \end{tabular}
 408 \end{footnotesize}
 409
 410 Table 6 : CPU Power impact
 411
 412 \begin{figure} [ht!]
 413 \centering
 414 \includegraphics[width=60mm]{cpu_power_impact_on_execution_time.pdf}
 415 \caption{CPU Power impact on execution time}
 416 %\label{overflow}}
 417 \end{figure}
 418
 419 Using the SIMGRID simulator flexibility, we have tried to determine the
 420 impact on the algorithms performance of varying the CPU power of the
 421 cluster nodes from 1 to 19 GFlops. The outputs depicted in Figure 6
 422 confirm the performance gain, around 95\% for both methods,
 423 after adding more powerful CPUs. Note that the execution time axis in the
 424 figure is in logarithmic scale.
 425
 426  \textbf{V.4 Comparing GMRES in native synchronous mode and
 427 Multisplitting algorithms in asynchronous mode}
 428
 429 The previous paragraphs highlighted the benefits of simulating the
 430 behavior of the application before any deployment in a real environment.
 431 We have focused the study on analyzing the performance by varying the
 432 key factors impacting the results. In the same line, the study compares
 433 the performance of the two proposed methods in \textbf{synchronous
 434 mode}. In this section, with the same previous methodology, the goal is to
 435 demonstrate the efficiency of the multisplitting method in
 436 \textbf{asynchronous mode} compared with the classical GMRES staying in the
 437 synchronous mode.
 438
 439 Note that the interest of using the asynchronous mode for data exchange
 440 is mainly that, in contrast to the synchronous mode, the current
 441 computation does not wait after a communication operation like sending
 442 some data between nodes. Each processor can continue its local
 443 calculation without waiting for the end of the communication. Thus, the
 444 asynchronous mode may theoretically reduce the overall execution time and can
 445 improve the algorithm performance.
 446
 447 As stated above, the SIMGRID simulator tool has been used to prove the
 448 efficiency of the multisplitting in asynchronous mode and to find the
 449 best combination of the grid resources (CPU, Network, input matrix size,
 450 \ldots ) to get the highest ``relative gain'' in comparison with the
 451 classical GMRES time.
 452
 453
 454 The test conditions are summarized in the table below :
 455
 456 % environment
 457 \begin{footnotesize}
 458 \begin{tabular}{r c }
 459  \hline
 460  Grid & 2x50 totaling 100 processors\\ %\hline
 461  Processors & 1 GFlops to 1.5 GFlops\\
 462    Intra-Network & bw=1.25 Gbits - lat=5E-05 \\ %\hline
 463    Inter-Network & bw=5 Mbits - lat=2E-02\\
 464  Input matrix size & N$_{x}$ = From 62 to 150\\ %\hline
 465  Residual error precision & 10$^{-5}$ to 10$^{-9}$\\ \hline
 466  \end{tabular}
 467 \end{footnotesize}
 468
 469 Again, comprehensive and extensive tests have been conducted varying the
 470 CPU power and the network parameters (bandwidth and latency) in the
 471 simulator tool with different problem sizes. The relative gains greater
 472 than 1 between the two algorithms have been captured after each step of
 473 the test. Table~\ref{tab.cluster.2x50} records the best grid configurations
 474 allowing a multisplitting method time more than 2.5 times lower than
 475 the classical GMRES execution and convergence time. The finding of this
 476 experimentation is the tolerance of the multisplitting method to the
 477 low-speed network that we usually encounter with distant clusters through the
 478 Internet.
 479
 480 % use the same column width for all the mytable tabulars below
 481 \newlength{\mytablew}\settowidth{\mytablew}{\footnotesize\np{E-11}} % \mytablew = width of \np{E-11} set in \footnotesize
 482 \newenvironment{mytable}[1]{% #1: number of columns for data
 483   \renewcommand{\arraystretch}{1.3}% scale row spacing by 1.3 inside the environment
 484   \begin{tabular}{|>{\bfseries}r% first column: bold, right-aligned row label
 485                   |*{#1}{>{\centering\arraybackslash}p{\mytablew}|}}}{% then #1 centered data columns, each \mytablew wide
 486     \end{tabular}}
 487
 488 \begin{table}[!t]
 489   \centering
 490   \caption{Relative gain of the multisplitting algorithm compared with
 491 the classical GMRES}
 492   \label{tab.cluster.2x50}
 493
 494   \begin{mytable}{6}
 495     \hline
 496     bw
 497     & 5         & 5         & 5         & 5         & 5         & 50 \\
 498     \hline
 499     lat
 500     & 0.02      & 0.02      & 0.02      & 0.02      & 0.02      & 0.02 \\
 501     \hline
 502     power
 503     & 1         & 1         & 1         & 1.5       & 1.5       & 1.5 \\
 504     \hline
 505     size
 506     & 62        & 62        & 62        & 100       & 100       & 110 \\
 507     \hline
 508     Prec/Eprec
 509     & \np{E-5}  & \np{E-8}  & \np{E-9}  & \np{E-11} & \np{E-11} & \np{E-11} \\
 510     \hline
 511     speedup
 512     & 0.396     & 0.392     & 0.396     & 0.391     & 0.393     & 0.395 \\
 513     \hline
 514   \end{mytable}
 515
 516   \smallskip
 517
 518   \begin{mytable}{6}
 519     \hline
 520     bw
 521     & 50        & 50        & 50        & 50        & 10        & 10 \\
 522     \hline
 523     lat
 524     & 0.02      & 0.02      & 0.02      & 0.02      & 0.03      & 0.01 \\
 525     \hline
 526     power
 527     & 1.5       & 1.5       & 1.5       & 1.5       & 1         & 1.5 \\
 528     \hline
 529     size
 530     & 120       & 130       & 140       & 150       & 171       & 171 \\
 531     \hline
 532     Prec/Eprec
 533     & \np{E-11} & \np{E-11} & \np{E-11} & \np{E-11} & \np{E-5}  & \np{E-5} \\
 534     \hline
 535     speedup
 536     & 0.398     & 0.388     & 0.393     & 0.394     & 0.63      & 0.778 \\
 537     \hline
 538   \end{mytable}
 539 \end{table}
 540
 541 \section{Conclusion}
 542 CONCLUSION
 543
 544
 545 \section*{Acknowledgment}
 546
 547
 548 The authors would like to thank\dots{}
 549
 550
 551 % trigger a \newpage just before the given reference
 552 % number - used to balance the columns on the last page
 553 % adjust value as needed - may need to be readjusted if
 554 % the document is modified later
 555 \bibliographystyle{IEEEtran}
 556 \bibliography{hpccBib}
 557
 558 \end{document}
 559
 560 %%% Local Variables:
 561 %%% mode: latex
 562 %%% TeX-master: t
 563 %%% fill-column: 80
 564 %%% ispell-local-dictionary: "american"
 565 %%% End: