Some remarks.

[hpcc2014.git] / hpcc.tex
diff --git a/hpcc.tex b/hpcc.tex

index 5fbeca1a90c34ec0b69a85df8ada7a635d721d99..d37619469442bee9c8f51c604bb526d60a114ab8 100644 (file)
--- a/hpcc.tex
+++ b/hpcc.tex
@@ -1,577 +1,691 @@
-
-%% bare_conf.tex
-%% V1.3
-%% 2007/01/11
-%% by Michael Shell
-%% See:
-%% http://www.michaelshell.org/
-%% for current contact information.
-%%
-%% This is a skeleton file demonstrating the use of IEEEtran.cls
-%% (requires IEEEtran.cls version 1.7 or later) with an IEEE conference paper.
-%%
-%% Support sites:
-%% http://www.michaelshell.org/tex/ieeetran/
-%% http://www.ctan.org/tex-archive/macros/latex/contrib/IEEEtran/
-%% and
-%% http://www.ieee.org/
-
-%%*************************************************************************
-%% Legal Notice:
-%% This code is offered as-is without any warranty either expressed or
-%% implied; without even the implied warranty of MERCHANTABILITY or
-%% FITNESS FOR A PARTICULAR PURPOSE! 
-%% User assumes all risk.
-%% In no event shall IEEE or any contributor to this code be liable for
-%% any damages or losses, including, but not limited to, incidental,
-%% consequential, or any other damages, resulting from the use or misuse
-%% of any information contained here.
-%%
-%% All comments are the opinions of their respective authors and are not
-%% necessarily endorsed by the IEEE.
-%%
-%% This work is distributed under the LaTeX Project Public License (LPPL)
-%% ( http://www.latex-project.org/ ) version 1.3, and may be freely used,
-%% distributed and modified. A copy of the LPPL, version 1.3, is included
-%% in the base LaTeX documentation of all distributions of LaTeX released
-%% 2003/12/01 or later.
-%% Retain all contribution notices and credits.
-%% ** Modified files should be clearly indicated as such, including  **
-%% ** renaming them and changing author support contact information. **
-%%
-%% File list of work: IEEEtran.cls, IEEEtran_HOWTO.pdf, bare_adv.tex,
-%%                    bare_conf.tex, bare_jrnl.tex, bare_jrnl_compsoc.tex
-%%*************************************************************************
-
-% *** Authors should verify (and, if needed, correct) their LaTeX system  ***
-% *** with the testflow diagnostic prior to trusting their LaTeX platform ***
-% *** with production work. IEEE's font choices can trigger bugs that do  ***
-% *** not appear when using other class files.                            ***
-% The testflow support page is at:
-% http://www.michaelshell.org/tex/testflow/
-
-
-
-% Note that the a4paper option is mainly intended so that authors in
-% countries using A4 can easily print to A4 and see how their papers will
-% look in print - the typesetting of the document will not typically be
-% affected with changes in paper size (but the bottom and side margins will).
-% Use the testflow package mentioned above to verify correct handling of
-% both paper sizes by the user's LaTeX system.
-%
-% Also note that the "draftcls" or "draftclsnofoot", not "draft", option
-% should be used if it is desired that the figures are to be displayed in
-% draft mode.
-%
  \documentclass[conference]{IEEEtran}
  \documentclass[conference]{IEEEtran}
-% Add the compsoc option for Computer Society conferences.
-%
-% If IEEEtran.cls has not been installed into the LaTeX system files,
-% manually specify the path to it like:
-% \documentclass[conference]{../sty/IEEEtran}
-
-
-
-
-
-% Some very useful LaTeX packages include:
-% (uncomment the ones you want to load)
-
-
-% *** CITATION PACKAGES ***
-%
-%\usepackage{cite}
-% cite.sty was written by Donald Arseneau
-% V1.6 and later of IEEEtran pre-defines the format of the cite.sty package
-% \cite{} output to follow that of IEEE. Loading the cite package will
-% result in citation numbers being automatically sorted and properly
-% "compressed/ranged". e.g., [1], [9], [2], [7], [5], [6] without using
-% cite.sty will become [1], [2], [5]--[7], [9] using cite.sty. cite.sty's
-% \cite will automatically add leading space, if needed. Use cite.sty's
-% noadjust option (cite.sty V3.8 and later) if you want to turn this off.
-% cite.sty is already installed on most LaTeX systems. Be sure and use
-% version 4.0 (2003-05-27) and later if using hyperref.sty. cite.sty does
-% not currently provide for hyperlinked citations.
-% The latest version can be obtained at:
-% http://www.ctan.org/tex-archive/macros/latex/contrib/cite/
-% The documentation is contained in the cite.sty file itself.
-
-
-
-
-
-
-% *** GRAPHICS RELATED PACKAGES ***
-%
-\ifCLASSINFOpdf
-  % \usepackage[pdftex]{graphicx}
-  % declare the path(s) where your graphic files are
-  % \graphicspath{{../pdf/}{../jpeg/}}
-  % and their extensions so you won't have to specify these with
-  % every instance of \includegraphics
-  % \DeclareGraphicsExtensions{.pdf,.jpeg,.png}
-\else
-  % or other class option (dvipsone, dvipdf, if not using dvips). graphicx
-  % will default to the driver specified in the system graphics.cfg if no
-  % driver is specified.
-  % \usepackage[dvips]{graphicx}
-  % declare the path(s) where your graphic files are
-  % \graphicspath{{../eps/}}
-  % and their extensions so you won't have to specify these with
-  % every instance of \includegraphics
-  % \DeclareGraphicsExtensions{.eps}
-\fi
-% graphicx was written by David Carlisle and Sebastian Rahtz. It is
-% required if you want graphics, photos, etc. graphicx.sty is already
-% installed on most LaTeX systems. The latest version and documentation can
-% be obtained at: 
-% http://www.ctan.org/tex-archive/macros/latex/required/graphics/
-% Another good source of documentation is "Using Imported Graphics in
-% LaTeX2e" by Keith Reckdahl which can be found as epslatex.ps or
-% epslatex.pdf at: http://www.ctan.org/tex-archive/info/
-%
-% latex, and pdflatex in dvi mode, support graphics in encapsulated
-% postscript (.eps) format. pdflatex in pdf mode supports graphics
-% in .pdf, .jpeg, .png and .mps (metapost) formats. Users should ensure
-% that all non-photo figures use a vector format (.eps, .pdf, .mps) and
-% not a bitmapped formats (.jpeg, .png). IEEE frowns on bitmapped formats
-% which can result in "jaggedy"/blurry rendering of lines and letters as
-% well as large increases in file sizes.
-%
-% You can find documentation about the pdfTeX application at:
-% http://www.tug.org/applications/pdftex
-
-
-
-
-
-% *** MATH PACKAGES ***
-%
-%\usepackage[cmex10]{amsmath}
-% A popular package from the American Mathematical Society that provides
-% many useful and powerful commands for dealing with mathematics. If using
-% it, be sure to load this package with the cmex10 option to ensure that
-% only type 1 fonts will utilized at all point sizes. Without this option,
-% it is possible that some math symbols, particularly those within
-% footnotes, will be rendered in bitmap form which will result in a
-% document that can not be IEEE Xplore compliant!
-%
-% Also, note that the amsmath package sets \interdisplaylinepenalty to 10000
-% thus preventing page breaks from occurring within multiline equations. Use:
-%\interdisplaylinepenalty=2500
-% after loading amsmath to restore such page breaks as IEEEtran.cls normally
-% does. amsmath.sty is already installed on most LaTeX systems. The latest
-% version and documentation can be obtained at:
-% http://www.ctan.org/tex-archive/macros/latex/required/amslatex/math/
-
-
-
-
-
-% *** SPECIALIZED LIST PACKAGES ***
-%
-%\usepackage{algorithmic}
-% algorithmic.sty was written by Peter Williams and Rogerio Brito.
-% This package provides an algorithmic environment fo describing algorithms.
-% You can use the algorithmic environment in-text or within a figure
-% environment to provide for a floating algorithm. Do NOT use the algorithm
-% floating environment provided by algorithm.sty (by the same authors) or
-% algorithm2e.sty (by Christophe Fiorio) as IEEE does not use dedicated
-% algorithm float types and packages that provide these will not provide
-% correct IEEE style captions. The latest version and documentation of
-% algorithmic.sty can be obtained at:
-% http://www.ctan.org/tex-archive/macros/latex/contrib/algorithms/
-% There is also a support site at:
-% http://algorithms.berlios.de/index.html
-% Also of interest may be the (relatively newer and more customizable)
-% algorithmicx.sty package by Szasz Janos:
-% http://www.ctan.org/tex-archive/macros/latex/contrib/algorithmicx/
-
-
-
-
-% *** ALIGNMENT PACKAGES ***
-%
-%\usepackage{array}
-% Frank Mittelbach's and David Carlisle's array.sty patches and improves
-% the standard LaTeX2e array and tabular environments to provide better
-% appearance and additional user controls. As the default LaTeX2e table
-% generation code is lacking to the point of almost being broken with
-% respect to the quality of the end results, all users are strongly
-% advised to use an enhanced (at the very least that provided by array.sty)
-% set of table tools. array.sty is already installed on most systems. The
-% latest version and documentation can be obtained at:
-% http://www.ctan.org/tex-archive/macros/latex/required/tools/
-
-
-%\usepackage{mdwmath}
-%\usepackage{mdwtab}
-% Also highly recommended is Mark Wooding's extremely powerful MDW tools,
-% especially mdwmath.sty and mdwtab.sty which are used to format equations
-% and tables, respectively. The MDWtools set is already installed on most
-% LaTeX systems. The lastest version and documentation is available at:
-% http://www.ctan.org/tex-archive/macros/latex/contrib/mdwtools/
-
-
-% IEEEtran contains the IEEEeqnarray family of commands that can be used to
-% generate multiline equations as well as matrices, tables, etc., of high
-% quality.
-
-
-%\usepackage{eqparbox}
-% Also of notable interest is Scott Pakin's eqparbox package for creating
-% (automatically sized) equal width boxes - aka "natural width parboxes".
-% Available at:
-% http://www.ctan.org/tex-archive/macros/latex/contrib/eqparbox/
-
-
-
-
-
-% *** SUBFIGURE PACKAGES ***
-%\usepackage[tight,footnotesize]{subfigure}
-% subfigure.sty was written by Steven Douglas Cochran. This package makes it
-% easy to put subfigures in your figures. e.g., "Figure 1a and 1b". For IEEE
-% work, it is a good idea to load it with the tight package option to reduce
-% the amount of white space around the subfigures. subfigure.sty is already
-% installed on most LaTeX systems. The latest version and documentation can
-% be obtained at:
-% http://www.ctan.org/tex-archive/obsolete/macros/latex/contrib/subfigure/
-% subfigure.sty has been superceeded by subfig.sty.
-
-
-
-%\usepackage[caption=false]{caption}
-%\usepackage[font=footnotesize]{subfig}
-% subfig.sty, also written by Steven Douglas Cochran, is the modern
-% replacement for subfigure.sty. However, subfig.sty requires and
-% automatically loads Axel Sommerfeldt's caption.sty which will override
-% IEEEtran.cls handling of captions and this will result in nonIEEE style
-% figure/table captions. To prevent this problem, be sure and preload
-% caption.sty with its "caption=false" package option. This is will preserve
-% IEEEtran.cls handing of captions. Version 1.3 (2005/06/28) and later 
-% (recommended due to many improvements over 1.2) of subfig.sty supports
-% the caption=false option directly:
-%\usepackage[caption=false,font=footnotesize]{subfig}
-%
-% The latest version and documentation can be obtained at:
-% http://www.ctan.org/tex-archive/macros/latex/contrib/subfig/
-% The latest version and documentation of caption.sty can be obtained at:
-% http://www.ctan.org/tex-archive/macros/latex/contrib/caption/
-
-
-
-
-% *** FLOAT PACKAGES ***
-%
-%\usepackage{fixltx2e}
-% fixltx2e, the successor to the earlier fix2col.sty, was written by
-% Frank Mittelbach and David Carlisle. This package corrects a few problems
-% in the LaTeX2e kernel, the most notable of which is that in current
-% LaTeX2e releases, the ordering of single and double column floats is not
-% guaranteed to be preserved. Thus, an unpatched LaTeX2e can allow a
-% single column figure to be placed prior to an earlier double column
-% figure. The latest version and documentation can be found at:
-% http://www.ctan.org/tex-archive/macros/latex/base/
-
-
-
-%\usepackage{stfloats}
-% stfloats.sty was written by Sigitas Tolusis. This package gives LaTeX2e
-% the ability to do double column floats at the bottom of the page as well
-% as the top. (e.g., "\begin{figure*}[!b]" is not normally possible in
-% LaTeX2e). It also provides a command:
-%\fnbelowfloat
-% to enable the placement of footnotes below bottom floats (the standard
-% LaTeX2e kernel puts them above bottom floats). This is an invasive package
-% which rewrites many portions of the LaTeX2e float routines. It may not work
-% with other packages that modify the LaTeX2e float routines. The latest
-% version and documentation can be obtained at:
-% http://www.ctan.org/tex-archive/macros/latex/contrib/sttools/
-% Documentation is contained in the stfloats.sty comments as well as in the
-% presfull.pdf file. Do not use the stfloats baselinefloat ability as IEEE
-% does not allow \baselineskip to stretch. Authors submitting work to the
-% IEEE should note that IEEE rarely uses double column equations and
-% that authors should try to avoid such use. Do not be tempted to use the
-% cuted.sty or midfloat.sty packages (also by Sigitas Tolusis) as IEEE does
-% not format its papers in such ways.
-
-
-
-
-
-% *** PDF, URL AND HYPERLINK PACKAGES ***
-%
-%\usepackage{url}
-% url.sty was written by Donald Arseneau. It provides better support for
-% handling and breaking URLs. url.sty is already installed on most LaTeX
-% systems. The latest version can be obtained at:
-% http://www.ctan.org/tex-archive/macros/latex/contrib/misc/
-% Read the url.sty source comments for usage information. Basically,
-% \url{my_url_here}.
-
-% *** Do not adjust lengths that control margins, column widths, etc. ***
-% *** Do not use packages that alter fonts (such as pslatex).         ***
-% There should be no need to do such things with IEEEtran.cls V1.6 and later.
-% (Unless specifically asked to do so by the journal or conference you plan
-% to submit to, of course. )
-
-
  
  \usepackage[T1]{fontenc}
  
  \usepackage[T1]{fontenc}
-\usepackage{ucs}
-%\usepackage[utf8x]{inputenc}
-\usepackage{lmodern}
-\usepackage{color}
-%% Jolis entetes %%
-\usepackage[Glenn]{fncychap}
-%\usepackage{amsmath}
+\usepackage[utf8]{inputenc}
+\usepackage{amsfonts,amssymb}
+\usepackage{amsmath}
+%\usepackage{algorithm}
+\usepackage{algpseudocode}
  %\usepackage{amsthm}
  %\usepackage{amsthm}
-%\usepackage{amsfonts}
-%\usepackage{graphicx}
-%\usepackage{xspace}
-% Definition des marges
-\usepackage{vmargin}
-\setpapersize[portrait]{A4}
-\usepackage[francais]{babel}
-% Extension pour les graphiques EPS
-%\usepackage[dvips]{graphicx}
-\usepackage[pdftex,final]{graphicx}
+\usepackage{graphicx}
+\usepackage[american]{babel}
  % Extension pour les liens intra-documents (tagged PDF)
  % et l'affichage correct des URL (commande \url{http://example.com})
  % Extension pour les liens intra-documents (tagged PDF)
  % et l'affichage correct des URL (commande \url{http://example.com})
-\usepackage{hyperref}
+%\usepackage{hyperref}
+
+\usepackage{url}
+\DeclareUrlCommand\email{\urlstyle{same}}
  
  
-\ifCLASSINFOpdf
-   \usepackage[pdftex]{graphicx}
-   \DeclareGraphicsExtensions{.pdf,.jpeg,.png}
-\else
-\fi
+\usepackage[autolanguage,np]{numprint}
+\AtBeginDocument{%
+  \renewcommand*\npunitcommand[1]{\text{#1}}
+  \npthousandthpartsep{}}
  
  
+\usepackage{xspace}
+\usepackage[textsize=footnotesize]{todonotes}
+\newcommand{\AG}[2][inline]{%
+  \todo[color=green!50,#1]{\sffamily\textbf{AG:} #2}\xspace}
+\newcommand{\DL}[2][inline]{%
+  \todo[color=yellow!50,#1]{\sffamily\textbf{DL:} #2}\xspace}
+\newcommand{\LZK}[2][inline]{%
+  \todo[color=blue!10,#1]{\sffamily\textbf{LZK:} #2}\xspace}
+\newcommand{\RC}[2][inline]{%
+  \todo[color=red!10,#1]{\sffamily\textbf{RC:} #2}\xspace}
+\newcommand{\CER}[2][inline]{%
+  \todo[color=pink!10,#1]{\sffamily\textbf{CER:} #2}\xspace}
  
  
+\algnewcommand\algorithmicinput{\textbf{Input:}}
+\algnewcommand\Input{\item[\algorithmicinput]}
  
  
-% correct bad hyphenation here
-\hyphenation{op-tical net-works semi-conduc-tor}
+\algnewcommand\algorithmicoutput{\textbf{Output:}}
+\algnewcommand\Output{\item[\algorithmicoutput]}
  
  
+\newcommand{\MI}{\mathit{MaxIter}}
  
  \begin{document}
  
  \begin{document}
-%
-% paper title
-% can use linebreaks \\ within to get better formatting as desired
-\title{Simulation of Asynchronous Iterative Numerical Algorithms Using SimGrid}
  
  
+\title{Simulation of Asynchronous Iterative Numerical Algorithms Using SimGrid}
  
  
-% author names and affiliations
-% use a multiple column layout for up to three different
-% affiliations
-\author{\IEEEauthorblockN{Raphaël Couturier and Arnaud Giersch and David Laiymani and Charles-Emile Ramamonjisoa}
-\IEEEauthorblockA{Femto-ST Institute - DISC Department\\
-Université de Franche-Comté\\
-Belfort\\
-Email: raphael.couturier@univ-fcomte.fr}
-%\and
-%\IEEEauthorblockN{Arnaud Giersch}
-%\IEEEauthorblockA{Twentieth Century Fox\\
-%Springfield, USA\\
-%Email: homer@thesimpsons.com}
-%\and
-%\IEEEauthorblockN{James Kirk\\ and Montgomery Scott}
-%\IEEEauthorblockA{Starfleet Academy\\
-%San Francisco, California 96678-2391\\
-%Telephone: (800) 555--1212\\
-%Fax: (888) 555--1212
+\author{%
+  \IEEEauthorblockN{%
+    Charles Emile Ramamonjisoa\IEEEauthorrefmark{1},
+    David Laiymani\IEEEauthorrefmark{1},
+    Arnaud Giersch\IEEEauthorrefmark{1},
+    Lilia Ziane Khodja\IEEEauthorrefmark{2} and
+    Raphaël Couturier\IEEEauthorrefmark{1}
+  }
+  \IEEEauthorblockA{\IEEEauthorrefmark{1}%
+    Femto-ST Institute -- DISC Department\\
+    Université de Franche-Comté,
+    IUT de Belfort-Montbéliard\\
+    19 avenue du Maréchal Juin, BP 527, 90016 Belfort cedex, France\\
+    Email: \email{{charles.ramamonjisoa,david.laiymani,arnaud.giersch,raphael.couturier}@univ-fcomte.fr}
+  }
+  \IEEEauthorblockA{\IEEEauthorrefmark{2}%
+    Inria Bordeaux Sud-Ouest\\
+    200 avenue de la Vieille Tour, 33405 Talence cedex, France \\
+    Email: \email{lilia.ziane@inria.fr}
+  }
  }
  
  }
  
-
-
-% make the title area
  \maketitle
  
  \maketitle
  
-
+\RC{Ordre des auteurs pas définitif.}
  \begin{abstract}
  \begin{abstract}
-%\boldmath
-The abstract goes here.
+\AG{L'abstract est AMHA incompréhensible et ne donne pas envie de lire la suite.}
+In recent years, the scalability of large-scale implementation in a 
+distributed environment of algorithms becoming more and more complex has 
+always been hampered by the limits of physical computing resources 
+capacity. One solution is to run the program in a virtual environment 
+simulating a real interconnected computers architecture. The results are 
+convincing and useful solutions are obtained with far fewer resources 
+than in a real platform. However, challenges remain for the convergence 
+and efficiency of a class of algorithms that concern us here, namely 
+numerical parallel iterative algorithms executed in asynchronous mode, 
+especially at large scale. Actually, such algorithms require a 
+balance and a compromise between computation and communication time 
+during the execution. Two important factors determine the success of the 
+experimentation: the convergence of the iterative algorithm on a large 
+scale and the execution time reduction in asynchronous mode. Once again, 
+from the current work, a simulated environment like SimGrid provides
+accurate results which are difficult or even impossible to obtain in a 
+physical platform by exploiting the flexibility of the simulator on the 
+computing units clusters and the network structure design. Our 
+experimental outputs showed a saving of up to \np[\%]{40} for the algorithm
+execution time in asynchronous mode compared to the synchronous one with 
+a residual precision up to \np{E-11}. Such successful results open
+perspectives on experimentations for running the algorithm on a 
+simulated large scale growing environment and with larger problem size. 
+
+% no keywords for IEEE conferences
+% Keywords: Algorithm distributed iterative asynchronous simulation SimGrid
  \end{abstract}
  \end{abstract}
-% IEEEtran.cls defaults to using nonbold math in the Abstract.
-% This preserves the distinction between vectors and scalars. However,
-% if the conference you are submitting to favors bold math in the abstract,
-% then you can use LaTeX's standard command \boldmath at the very start
-% of the abstract to achieve this. Many IEEE journals/conferences frown on
-% math in the abstract anyway.
-
-% no keywords
-
-
-
-
-% For peer review papers, you can put extra information on the cover
-% page as needed:
-% \ifCLASSOPTIONpeerreview
-% \begin{center} \bfseries EDICS Category: 3-BBND \end{center}
-% \fi
-%
-% For peerreview papers, this IEEEtran command inserts a page break and
-% creates the second title. It will be ignored for other modes.
-\IEEEpeerreviewmaketitle
-
-
  
  \section{Introduction}
  
  
  \section{Introduction}
  
-Présenter un bref état de l'art sur la simulation d'algos parallèles. Présenter rapidement les algos itératifs asynchrones et leurs avantages. Parler de leurs inconvénients en particulier la difficulté de déploiement à grande échelle donc il serait bien de simuler. Dire qu'à notre connaissance il n'existe pas de simulation de ce type d'algo.
-Présenter les travaux et les résultats obtenus. Annoncer le plan.
+Parallel computing and high performance computing (HPC) are becoming  more and more imperative for solving various
+problems raised by  researchers on various scientific disciplines but also by industrial in  the field. Indeed, the
+increasing complexity of these requested  applications combined with a continuous increase of their sizes lead to  write
+distributed and parallel algorithms requiring significant hardware  resources (grid computing, clusters, broadband
+network, etc.) but also a non-negligible CPU execution time. We consider in this paper a class of highly efficient
+parallel algorithms called \emph{numerical iterative algorithms} executed in a distributed environment. As their name
+suggests, these algorithms solve a given problem by successive iterations ($X_{n +1} = f(X_{n})$) from an initial value
+$X_{0}$ to find an approximate value $X^*$ of the solution with a very low residual error. Several well-known methods
+demonstrate the convergence of these algorithms~\cite{BT89,Bahi07}.
+
+Parallelization of such algorithms generally involves the division of the problem into several \emph{blocks} that will
+be solved in parallel on multiple processing units. The latter will communicate their intermediate results before a new
+iteration starts and until the approximate solution is reached. These parallel  computations can be performed either in
+\emph{synchronous} mode where a new iteration begins only when all nodes communications are completed,
+or in \emph{asynchronous} mode where processors can continue independently with few or no synchronization points. For
+instance in the \textit{Asynchronous Iterations~-- Asynchronous Communications (AIAC)} model~\cite{bcvc06:ij}, local
+computations do not need to wait for required data. Processors can then perform their iterations with the data present
+at that time. Even if the number of iterations required before the convergence is generally greater than for the
+synchronous case, AIAC algorithms can significantly reduce overall execution times by suppressing idle times due to
+synchronizations especially in a grid computing context (see~\cite{Bahi07} for more details).
+
+Parallel numerical applications (synchronous or asynchronous) may have different
+configuration and deployment requirements.  Quantifying their resource
+allocation policies and application scheduling algorithms in grid computing
+environments under varying load, CPU power and network speeds is very costly,
+very labor intensive and very time
+consuming~\cite{Calheiros:2011:CTM:1951445.1951450}.  The case of AIAC
+algorithms is even more problematic since they are very sensitive to the
+execution environment context. For instance, variations in the network bandwidth
+(intra and inter-clusters), in the number and the power of nodes, in the number
+of clusters\dots{} can lead to very different number of iterations and so to
+very different execution times. Then, it appears that the use of simulation
+tools to explore various platform scenarios and to run large numbers of
+experiments quickly can be very promising. In this way, the use of a simulation
+environment to execute parallel iterative algorithms found some interests in
+reducing the high cost of access to computing resources: (1) for the
+applications development life cycle and in code debugging (2) and in production
+to get results in a reasonable execution time with a simulated infrastructure
+not accessible with physical resources. Indeed, the launch of distributed
+iterative asynchronous algorithms to solve a given problem on a large-scale
+simulated environment challenges to find optimal configurations giving the best
+results with the lowest residual error and the best execution time.
+
+To our knowledge, there is no existing work on the large-scale simulation of a
+real AIAC application. The aim of this paper is twofold. First we give a first
+approach of the simulation of AIAC algorithms using a simulation tool (i.e. the
+SimGrid toolkit~\cite{SimGrid}). Second, we confirm the effectiveness of
+asynchronous mode algorithms by comparing their performance with the synchronous
+mode. More precisely, we have implemented a program for solving large
+linear systems of equations by the numerical method GMRES (Generalized
+Minimal Residual)~\cite{ref1}. We show that, with minor modifications of the
+initial MPI code, the SimGrid toolkit allows us to perform a test campaign of a
+real AIAC application on different computing architectures. The simulated
+results we obtained are in line with real results exposed in ??\AG[]{ref?}.
+SimGrid has allowed us to launch the application from a modest computing
+infrastructure by simulating different distributed architectures composed by
+clusters nodes interconnected by variable speed networks.  With selected
+parameters on the network platforms (bandwidth, latency of inter cluster
+network) and on the clusters architecture (number, capacity calculation power)
+in the simulated environment, the experimental results have demonstrated not
+only the algorithm convergence within a reasonable time compared with the
+physical environment performance, but also a time saving of up to \np[\%]{40} in
+asynchronous mode.
+\AG{Il faudrait revoir la phrase précédente (couper en deux?).  Là, on peut
+  avoir l'impression que le gain de \np[\%]{40} est entre une exécution réelle
+  et une exécution simulée!}
+
+This article is structured as follows: after this introduction, the next  section will give a brief description of
+iterative asynchronous model.  Then, the simulation framework SimGrid is presented with the settings to create various
+distributed architectures. The algorithm of  the multisplitting method used by GMRES written with MPI primitives and
+its adaptation to SimGrid with SMPI (Simulated MPI) is detailed in the next section. Finally, the results of the experiments
+carried out will be presented before some concluding remarks and future work.
   
   
-\section{The asynchronous iteration model}
+\section{Motivations and scientific context}
+
+As exposed in the introduction, parallel iterative methods are now widely used in many scientific domains. They can be
+classified in three main classes depending on how iterations and communications are managed (for more details readers
+can refer to~\cite{bcvc06:ij}). In the \textit{Synchronous Iterations~-- Synchronous Communications (SISC)} model data
+are exchanged at the end of each iteration. All the processors must begin the same iteration at the same time and
+important idle times on processors are generated. The \textit{Synchronous Iterations~-- Asynchronous Communications
+(SIAC)} model can be compared to the previous one except that data required on another processor are sent asynchronously
+i.e.  without stopping current computations. This technique allows communications to be partially overlapped by computations
+but unfortunately, the overlapping is only partial and important idle times remain.  It is clear that, in a grid
+computing context, where the number of computational nodes is large, heterogeneous and widely distributed, the idle
+times generated by synchronizations are very penalizing. One way to overcome this problem is to use the
+\textit{Asynchronous Iterations~-- Asynchronous Communications (AIAC)} model. Here, local computations do not need to
+wait for required data. Processors can then perform their iterations with the data present at that time. Figure~\ref{fig:aiac}
+illustrates this model where the gray blocks represent the computation phases, the white spaces the idle
+times and the arrows the communications.
+\AG{There are no ``white spaces'' on the figure.}
+With this algorithmic model, the number of iterations required before the
+convergence is generally greater than for the two former classes. But, and as detailed in~\cite{bcvc06:ij}, AIAC
+algorithms can significantly reduce overall execution times by suppressing idle times due to synchronizations especially
+in a grid computing context.
+
+\begin{figure}[!t]
+  \centering
+    \includegraphics[width=8cm]{AIAC.pdf}
+  \caption{The Asynchronous Iterations~-- Asynchronous Communications model}
+  \label{fig:aiac}
+\end{figure}
+
+
+It is very challenging to develop efficient applications for large scale,
+heterogeneous and distributed platforms such as computing grids. Researchers and
+engineers have to develop techniques for maximizing application performance of
+these multi-cluster platforms, by redesigning the applications and/or by using
+novel algorithms that can account for the composite and heterogeneous nature of
+the platform. Unfortunately, the deployment of such applications on these very
+large scale systems is very costly, labor intensive and time consuming. In this
+context, it appears that the use of simulation tools to explore various platform
+scenarios at will and to run enormous numbers of experiments quickly can be very
+promising. Several works\dots{}
+
+\AG{Several works\dots{} what?\\
+  Le paragraphe suivant se trouve déjà dans l'intro ?}
+In the context of AIAC algorithms, the use of simulation tools is even more
+relevant. Indeed, this class of applications is very sensitive to the execution
+environment context. For instance, variations in the network bandwidth (intra
+and inter-clusters), in the number and the power of nodes, in the number of
+clusters\dots{} can lead to very different number of iterations and so to very
+different execution times.
+
+
  
  
-Décrire le modèle asynchrone. Je m'en charge (DL)
  
  \section{SimGrid}
  
  
  \section{SimGrid}
  
-Décrire SimGrid (Arnaud)
+SimGrid~\cite{SimGrid,casanova+legrand+quinson.2008.simgrid} is a simulation
+framework to study the behavior of large-scale distributed systems.  As its name
+says, it emanates from the grid computing community, but is nowadays used to
+study grids, clouds, HPC or peer-to-peer systems.  The early versions of SimGrid
+date from 1999, but it's still actively developed and distributed as an open
+source software.  Today, it's one of the major generic tools in the field of
+simulation for large-scale distributed systems.
+
+SimGrid provides several programming interfaces: MSG to simulate Concurrent
+Sequential Processes, SimDAG to simulate DAGs of (parallel) tasks, and SMPI to
+run real applications written in MPI~\cite{MPI}.  Apart from the native C
+interface, SimGrid provides bindings for the C++, Java, Lua and Ruby programming
+languages.  SMPI is the interface that has been used for the work exposed in
+this paper.  The SMPI interface implements about \np[\%]{80} of the MPI 2.0
+standard~\cite{bedaride:hal-00919507}, and supports applications written in C or
+Fortran, with little or no modifications.
+
+With SimGrid, the execution of a distributed application is simulated on a
+single machine.  The application code is really executed, but some operations
+like the communications are intercepted to be simulated according to the
+characteristics of the simulated execution platform.  The description of this
+target platform is given as an input for the execution, by means of an XML
+file.  It describes the properties of the platform, such as the computing nodes
+with their computing power, the interconnection links with their bandwidth and
+latency, and the routing strategy.  The simulated running time of the
+application is computed according to these properties.
+
+%%% TODO: add some words+refs about SimGrid's accuracy and scalability.}
+
+\AG{Faut-il ajouter quelque-chose ?} 
+\CER{Comme tu as décrit la plateforme d'exécution, on peut ajouter éventuellement le fichier XML contenant des hosts dans les clusters formant la grille
+  \AG{Bof.}}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\section{Simulation of the multisplitting method}
+%Décrire le problème (algo) traité ainsi que le processus d'adaptation à SimGrid.
+Let $Ax=b$ be a large sparse system of $n$ linear equations in $\mathbb{R}$, where $A$ is a sparse square and nonsingular matrix, $x$ is the solution vector and $b$ is the right-hand side vector. We use a multisplitting method based on the block Jacobi splitting to solve this linear system on a large scale platform composed of $L$ clusters of processors~\cite{o1985multi}. In this case, we apply a row-by-row splitting without overlapping  
+\begin{equation*}
+  \left(\begin{array}{ccc}
+      A_{11} & \cdots & A_{1L} \\
+      \vdots & \ddots & \vdots\\
+      A_{L1} & \cdots & A_{LL}
+    \end{array} \right)
+  \times
+  \left(\begin{array}{c}
+      X_1 \\
+      \vdots\\
+      X_L
+    \end{array} \right)
+  =
+  \left(\begin{array}{c}
+      B_1 \\
+      \vdots\\
+      B_L
+    \end{array} \right)
+\end{equation*}
+in such a way that successive rows of matrix $A$ and both vectors $x$ and $b$ are assigned to one cluster, where for all $l,m\in\{1,\ldots,L\}$ $A_{lm}$ is a rectangular block of $A$ of size $n_l\times n_m$, $X_l$ and $B_l$ are sub-vectors of $x$ and $b$, respectively, of size $n_l$ each and $\sum_{l} n_l=\sum_{m} n_m=n$.
+
+The multisplitting method proceeds by iteration to solve in parallel the linear system on $L$ clusters of processors, in such a way that each sub-system
+\begin{equation}
+  \label{eq:4.1}
+  \left\{
+    \begin{array}{l}
+      A_{ll}X_l = Y_l \text{, such that}\\
+      Y_l = B_l - \displaystyle\sum_{\substack{m=1\\ m\neq l}}^{L}A_{lm}X_m
+    \end{array}
+  \right.
+\end{equation}
+is solved independently by a cluster and communications are required to update the right-hand side sub-vector $Y_l$, such that the sub-vectors $X_m$ represent the data dependencies between the clusters. As each sub-system (\ref{eq:4.1}) is solved in parallel by a cluster of processors, our multisplitting method uses an iterative method as an inner solver which is easier to parallelize and more scalable than a direct method. In this work, we use the parallel algorithm of the GMRES method~\cite{ref1}, which is one of the most widely used iterative methods.
+
+\begin{figure}[!t]
+  %%% IEEE instructions forbid to use an algorithm environment here, use figure
+  %%% instead
+\begin{algorithmic}[1]
+\Input $A_l$ (sparse sub-matrix), $B_l$ (right-hand side sub-vector)
+\Output $X_l$ (solution sub-vector)\vspace{0.2cm}
+\State Load $A_l$, $B_l$
+\State Set the initial guess $x^0$
+\For {$k=0,1,2,\ldots$ until the global convergence}
+\State Restart outer iteration with $x^0=x^k$
+\State Inner iteration: \Call{InnerSolver}{$x^0$, $k+1$}
+\State\label{algo:01:send} Send shared elements of $X_l^{k+1}$ to neighboring clusters
+\State\label{algo:01:recv} Receive shared elements in $\{X_m^{k+1}\}_{m\neq l}$
+\EndFor
+
+\Statex
+
+\Function {InnerSolver}{$x^0$, $k$}
+\State Compute local right-hand side $Y_l$:
+       \begin{equation*}
+         Y_l = B_l - \sum\nolimits^L_{\substack{m=1\\ m\neq l}}A_{lm}X_m^0
+       \end{equation*}
+\State Solving sub-system $A_{ll}X_l^k=Y_l$ with the parallel GMRES method
+\State \Return $X_l^k$
+\EndFunction
+\end{algorithmic}
+\caption{A multisplitting solver with GMRES method}
+\label{algo:01}
+\end{figure}
+
+Algorithm on Figure~\ref{algo:01} shows the main key points of the multisplitting method to solve a large sparse linear system. This algorithm is based on an outer-inner iteration method where the parallel synchronous GMRES method is used to solve the inner iteration. It is executed in parallel by each cluster of processors. For all $l,m\in\{1,\ldots,L\}$, the matrices and vectors with the subscript $l$ represent the local data for cluster $l$, while $\{A_{lm}\}_{m\neq l}$ are off-diagonal matrices of sparse matrix $A$ and $\{X_m\}_{m\neq l}$ contain vector elements of solution $x$ shared with neighboring clusters. At every outer iteration $k$, asynchronous communications are performed between processors of the local cluster and those of distant clusters (lines~\ref{algo:01:send} and~\ref{algo:01:recv} in Figure~\ref{algo:01}). The shared vector elements of the solution $x$ are exchanged by message passing using MPI non-blocking communication routines.
+
+\begin{figure}[!t]
+\centering
+  \includegraphics[width=60mm,keepaspectratio]{clustering}
+\caption{Example of three clusters of processors interconnected by a virtual unidirectional ring network.}
+\label{fig:4.1}
+\end{figure}
+
+The global convergence of the asynchronous multisplitting solver is detected
+when the clusters of processors have all converged locally. We implemented the
+global convergence detection process as follows. On each cluster a master
+processor is designated (for example the processor with rank 1) and masters of
+all clusters are interconnected by a virtual unidirectional ring network (see
+Figure~\ref{fig:4.1}). During the resolution, a Boolean token circulates around
+the virtual ring from a master processor to another until the global convergence
+is achieved. So starting from the cluster with rank 1, each master processor $i$
+sets the token to \textit{True} if the local convergence is achieved or to
+\textit{False} otherwise, and sends it to master processor $i+1$. Finally, the
+global convergence is detected when the master of cluster 1 receives from the
+master of cluster $L$ a token set to \textit{True}. In this case, the master of
+cluster 1 broadcasts a stop message to masters of other clusters. In this work,
+the local convergence on each cluster $l$ is detected when the following
+condition is satisfied
+\begin{equation*}
+  (k\geq \MI) \text{ or } (\|X_l^k - X_l^{k+1}\|_{\infty}\leq\epsilon)
+\end{equation*}
+where $\MI$ is the maximum number of outer iterations and $\epsilon$ is the
+tolerance threshold of the error computed between two successive local solutions
+$X_l^k$ and $X_l^{k+1}$.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+We did not encounter major blocking problems when adapting the multisplitting algorithm previously described to a simulation environment like SimGrid, apart from some code
+debugging. Indeed, apart from the review of the program sequence for asynchronous exchanges between processors within a cluster or between clusters, the algorithm was executed successfully with SMPI and provided outputs identical to those obtained with direct execution under MPI. In synchronous
+mode, the execution of the program raised no particular issue, but in asynchronous mode, a review of the sequence of MPI\_Isend, MPI\_Irecv and MPI\_Waitall instructions,
+together with the addition of the primitive MPI\_Test, was needed to avoid a memory fault due to an infinite loop resulting from the non-convergence of the algorithm.
+\CER{On voulait en fait montrer la simplicité de l'adaptation de l'algo a SimGrid. Les problèmes rencontrés décrits dans ce paragraphe concerne surtout le mode async}\LZK{OK. J'aurais préféré avoir un peu plus de détails sur l'adaptation de la version async}
+Note here that the use of the SMPI functions that optimize memory footprint and CPU usage is not recommended, knowing that one wants to get real results by simulation.
+As mentioned, upon this adaptation, the algorithm is executed in the simulated environment as in real life, after the following minor changes. First, all declared
+global variables have been moved to local variables for each subroutine. In fact, global variables generate side effects arising from the concurrent access of 
+shared memory used by threads simulating each computing unit in the SimGrid architecture. Second, the alignment of certain types of variables such as ``long int'' had
+also to be reviewed. Finally, some compilation errors on MPI\_Waitall and MPI\_Finalize primitives have been fixed with the latest version of SimGrid.
+In total, the initial MPI program running on the simulation environment SMPI gave after a very simple adaptation the same results as those obtained in a real 
+environment. We have tested in synchronous mode with a simulated platform starting from a modest 2 or 3 clusters grid to a larger configuration like simulating 
+Grid5000 with more than 1500 hosts with 5000 cores~\cite{bolze2006grid}.
  
  
-\section{Simulation of the multi-splitting method}
  
  
-Décrire le problème (algo) traité ainsi que le processus d'adaptation à SimGrid.
  
  \section{Experimental results}
  
  
  \section{Experimental results}
  
-\section{Conclusion}
+When the \textit{real} application runs in the simulation environment and produces the expected results, varying the input
+parameters and the program arguments allows us to compare outputs from the code execution. We have noticed from this
+study that the results depend on the following parameters:  
+\begin{itemize}
+\item At the network level, we found that the most critical values are the
+  bandwidth (bw) and the network latency (lat).
+\item Hosts power (GFlops) can also influence the results.
+\item Finally, when submitting job batches for execution, the argument values
+  passed to the program, like the maximum number of iterations or the
+  \textit{external} precision, are critical. They make it possible not only to
+  ensure the convergence of the algorithm but also to reach the main objective
+  of the simulation experiments, namely an execution time in asynchronous mode
+  lower than in synchronous mode (i.e., a speed-up less than 1).
+\end{itemize}
+\LZK{Propositions pour remplacer le terme ``speedup'': acceleration ratio ou relative gain}
+
+A priori, obtaining a speedup less than 1 would be difficult in a local area
+network configuration where the synchronous mode will take advantage of the
+rapid exchange of information on such high-speed links. Thus, the methodology
+adopted was to launch the application on a clustered network. In this last
+configuration, degrading the inter-cluster network performance will
+\textit{penalize} the synchronous mode, making it possible to get a speedup
+lower than 1. This action simulates the case of clusters linked with a
+long-distance network like the Internet.
+
+In this paper, we solve the 3D Poisson problem whose mathematical model is
+\begin{equation}
+\left\{
+\begin{array}{l}
+\nabla^2 u = f \text{~in~} \Omega \\
+u =0 \text{~on~} \Gamma =\partial\Omega
+\end{array}
+\right.
+\label{eq:02}
+\end{equation}
+where $\nabla^2$ is the Laplace operator, $f$ and $u$ are real-valued functions, and $\Omega=[0,1]^3$. The spatial discretization with a finite difference scheme reduces problem~(\ref{eq:02}) to a system of sparse linear equations. The general iteration scheme of our multisplitting method in a 3D domain using a seven point stencil could be written as 
+\begin{equation}
+\begin{array}{ll}
+u^{k+1}(x,y,z)= & u^k(x,y,z) - \frac{1}{6}\times\\
+               & (u^k(x-1,y,z) + u^k(x+1,y,z) + \\
+               & u^k(x,y-1,z) + u^k(x,y+1,z) + \\
+               & u^k(x,y,z-1) + u^k(x,y,z+1)),
+\end{array}
+\label{eq:03}
+\end{equation} 
+where the iteration matrix $A$ of size $N_x\times N_y\times N_z$ of the discretized linear system is sparse, symmetric and positive definite. 
+
+The parallel solving of the 3D Poisson problem with our multisplitting method requires a data partitioning of the problem between clusters and between processors within a cluster. We have chosen the 3D partitioning instead of the row-by-row partitioning in order to reduce the data exchanges at sub-domain boundaries. Figure~\ref{fig:4.2} shows an example of the data partitioning of the 3D Poisson problem between two clusters of processors, where each sub-problem is assigned to a processor. In this context, a processor has at most six neighbors within a cluster or in distant clusters with which it shares data at sub-domain boundaries. 
+
+\begin{figure}[!t]
+\centering
+  \includegraphics[width=80mm,keepaspectratio]{partition}
+\caption{Example of the 3D data partitioning between two clusters of processors.}
+\label{fig:4.2}
+\end{figure}
+
+
+As a first step, the algorithm was run on a network consisting of two clusters
+containing 50 hosts each, totaling 100 hosts. Various combinations of the above
+factors have provided the results shown in Table~\ref{tab.cluster.2x50} with a
+matrix size ranging from $N_x = N_y = N_z = \text{62}$ to 171 elements or from
+$\text{62}^\text{3} = \text{\np{238328}}$ to $\text{171}^\text{3} =
+\text{\np{5000211}}$ entries.
+
+% use the same column width for the following three tables
+\newlength{\mytablew}\settowidth{\mytablew}{\footnotesize\np{E-11}}
+\newenvironment{mytable}[1]{% #1: number of columns for data
+  \renewcommand{\arraystretch}{1.3}%
+  \begin{tabular}{|>{\bfseries}r%
+                  |*{#1}{>{\centering\arraybackslash}p{\mytablew}|}}}{%
+    \end{tabular}}
+
+\begin{table}[!t]
+  \centering
+  \caption{2 clusters, each with 50 nodes}
+  \label{tab.cluster.2x50}
+
+  \begin{mytable}{6}
+    \hline
+    bw
+    & 5         & 5         & 5         & 5         & 5         & 50 \\
+    \hline
+    lat
+    & 0.02      & 0.02      & 0.02      & 0.02      & 0.02      & 0.02 \\
+    \hline
+    power
+    & 1         & 1         & 1         & 1.5       & 1.5       & 1.5 \\
+    \hline
+    size
+    & 62        & 62        & 62        & 100       & 100       & 110 \\
+    \hline
+    Prec/Eprec
+    & \np{E-5}  & \np{E-8}  & \np{E-9}  & \np{E-11} & \np{E-11} & \np{E-11} \\
+    \hline
+    speedup
+    & 0.396     & 0.392     & 0.396     & 0.391     & 0.393     & 0.395 \\
+    \hline
+  \end{mytable}
+
+  \smallskip
+
+  \begin{mytable}{6}
+    \hline
+    bw
+    & 50        & 50        & 50        & 50        & 10        & 10 \\
+    \hline
+    lat
+    & 0.02      & 0.02      & 0.02      & 0.02      & 0.03      & 0.01 \\
+    \hline
+    power
+    & 1.5       & 1.5       & 1.5       & 1.5       & 1         & 1.5 \\
+    \hline
+    size
+    & 120       & 130       & 140       & 150       & 171       & 171 \\
+    \hline
+    Prec/Eprec
+    & \np{E-11} & \np{E-11} & \np{E-11} & \np{E-11} & \np{E-5}  & \np{E-5} \\
+    \hline
+    speedup
+    & 0.398     & 0.388     & 0.393     & 0.394     & 0.63      & 0.778 \\
+    \hline
+  \end{mytable}
+\end{table}
+  
+Then we have changed the network configuration using three clusters containing
+respectively 33, 33 and 34 hosts, that is, again one hundred hosts for all the
+clusters. In the same way as above, a judicious choice of key parameters has
+made it possible to get the results in Table~\ref{tab.cluster.3x33}, which shows
+speedups less than 1 with a matrix size from 62 to 100 elements.
+
+\begin{table}[!t]
+  \centering
+  \caption{3 clusters, each with 33 nodes}
+  \label{tab.cluster.3x33}
+
+  \begin{mytable}{6}
+    \hline
+    bw
+    & 10       & 5        & 4        & 3        & 2        & 6 \\
+    \hline
+    lat
+    & 0.01     & 0.02     & 0.02     & 0.02     & 0.02     & 0.02 \\
+    \hline
+    power
+    & 1        & 1        & 1        & 1        & 1        & 1 \\
+    \hline
+    size
+    & 62       & 100      & 100      & 100      & 100      & 171 \\
+    \hline
+    Prec/Eprec
+    & \np{E-5} & \np{E-5} & \np{E-5} & \np{E-5} & \np{E-5} & \np{E-5} \\
+    \hline
+    speedup
+    & 0.997    & 0.99     & 0.93     & 0.84     & 0.78     & 0.99 \\
+    \hline
+  \end{mytable}
+\end{table}
+
+In a final step, the results of an attempt to scale up the three-cluster
+configuration by increasing it to two hundred hosts have been recorded in
+Table~\ref{tab.cluster.3x67}.
+
+\begin{table}[!t]
+  \centering
+  \caption{3 clusters, each with 66 nodes}
+  \label{tab.cluster.3x67}
+
+  \begin{mytable}{1}
+    \hline
+    bw         & 1 \\
+    \hline
+    lat        & 0.02 \\
+    \hline
+    power      & 1 \\
+    \hline
+    size       & 62 \\
+    \hline
+    Prec/Eprec & \np{E-5} \\
+    \hline
+    speedup    & 0.9 \\
+    \hline
+  \end{mytable}
+\end{table}
+
+Note that the program was run with the following parameters:
+
+\paragraph*{SMPI parameters}
+
+\begin{itemize}
+       \item HOSTFILE: Hosts file description.
+       \item PLATFORM: file description of the platform architecture: clusters (CPU power,
+\dots{}), intra cluster network description, inter cluster network (bandwidth bw,
+latency lat, \dots{}).
+\end{itemize}
+
+
+\paragraph*{Arguments of the program}
+
+\begin{itemize}
+       \item Description of the cluster architecture;
+       \item Maximum number of internal and external iterations;
+       \item Internal and external precisions;
+       \item Matrix size $N_x$, $N_y$ and $N_z$;
+%<<<<<<< HEAD
+       \item Matrix diagonal value: \np{6.0};
+       \item Matrix Off-diagonal value: \np{-1.0};
+%=======
+%>>>>>>> 5fb6769d88c1720b6480a28521119ef010462fa6
+       \item Execution Mode: synchronous or asynchronous.
+\end{itemize}
+
+\paragraph*{Interpretations and comments}
+
+After analyzing the outputs, generally, for the configuration with two or three
+clusters including one hundred hosts (Tables~\ref{tab.cluster.2x50}
+and~\ref{tab.cluster.3x33}), some combinations of the used parameters affecting
+the results have given a speedup less than 1, showing the effectiveness of the
+asynchronous performance compared to the synchronous mode.
+
+In the case of a two clusters configuration, Table~\ref{tab.cluster.2x50} shows
+that with a degraded inter-cluster network set to \np[Mbit/s]{5} of
+bandwidth, a latency on the order of a hundredth of a millisecond and a system
+power of one GFlops, an efficiency of about \np[\%]{40} in asynchronous mode is
+obtained for a matrix size of 62 elements. It is noticed that the result remains
+stable even if we vary the external precision from \np{E-5} to \np{E-9}. When
+increasing the problem size up to 100 elements, it was necessary to increase the
+CPU power by \np[\%]{50} to \np[GFlops]{1.5} for a convergence of the algorithm
+with the same order of asynchronous mode efficiency.  Maintaining such a system
+power but, this time, increasing the inter-cluster network throughput up to
+\np[Mbit/s]{50}, an efficiency of about \np[\%]{40} is obtained with a
+high external precision of \np{E-11} for a matrix size from 110 to 150 side
+elements.
+
+For the 3 clusters architecture including a total of 100 hosts,
+Table~\ref{tab.cluster.3x33} shows that it was difficult to have a combination
+which gives an efficiency of the asynchronous mode below \np[\%]{80}. Indeed,
+for a matrix size of 62 elements, equality between the performance of the two
+modes (synchronous and asynchronous) is achieved with an inter-cluster bandwidth
+of \np[Mbit/s]{10} and a latency of \np[ms]{E-2}. To reach an efficiency of
+\np[\%]{78} with a matrix size of 100 points, it was necessary to degrade the
+inter-cluster network bandwidth from 5 to \np[Mbit/s]{2}.
+
+A last attempt was made for a configuration of three clusters but more powerful
+with 200 nodes in total. The convergence with a speedup of \np[\%]{90} was
+obtained with a bandwidth of \np[Mbit/s]{1} as shown in
+Table~\ref{tab.cluster.3x67}.
+
+\LZK{Dans le papier, on compare les deux versions synchrone et asycnhrone du multisplitting. Y a t il des résultats pour comparer gmres parallèle classique avec multisplitting asynchrone? Ca permettra de montrer l'intérêt du multisplitting asynchrone sur des clusters distants}
  
  
+\section{Conclusion}
+The experimental results on executing a parallel iterative algorithm in 
+asynchronous mode on an environment simulating a large scale of virtual 
+computers organized with interconnected clusters have been presented. 
+Our work has demonstrated that using such a simulation tool allows us to
+reach the following three objectives: 
+
+\begin{enumerate}
+\item to have a flexible and configurable execution platform, avoiding the
+hard exercise of accessing very limited but highly solicited physical
+resources;
+\item to ensure the algorithm convergence within a reasonable time and
+number of iterations;
+\item and finally and more importantly, to find the correct combination 
+of the cluster and network specifications permitting to save time in 
+executing the algorithm in asynchronous mode.
+\end{enumerate}
+Our results have shown that in certain conditions, asynchronous mode is
+up to \np[\%]{40} faster than executing the algorithm in synchronous mode,
+which is not negligible for solving complex practical problems of
+ever-increasing size.
+
+ Several studies have already addressed the execution time performance of
+this class of algorithms. The work presented in this paper has
+demonstrated an original solution to optimize the use of a simulation 
+tool to run efficiently an iterative parallel algorithm in asynchronous 
+mode in a grid architecture. 
+
+\LZK{Perspectives???}
  
  
-% An example of a floating figure using the graphicx package.
-% Note that \label must occur AFTER (or within) \caption.
-% For figures, \caption should occur after the \includegraphics.
-% Note that IEEEtran v1.7 and later has special internal code that
-% is designed to preserve the operation of \label within \caption
-% even when the captionsoff option is in effect. However, because
-% of issues like this, it may be the safest practice to put all your
-% \label just after \caption rather than within \caption{}.
-%
-% Reminder: the "draftcls" or "draftclsnofoot", not "draft", class
-% option should be used if it is desired that the figures are to be
-% displayed while in draft mode.
-%
-%\begin{figure}[!t]
-%\centering
-%\includegraphics[width=2.5in]{myfigure}
-% where an .eps filename suffix will be assumed under latex, 
-% and a .pdf suffix will be assumed for pdflatex; or what has been declared
-% via \DeclareGraphicsExtensions.
-%\caption{Simulation Results}
-%\label{fig_sim}
-%\end{figure}
-
-% Note that IEEE typically puts floats only at the top, even when this
-% results in a large percentage of a column being occupied by floats.
-
-
-% An example of a double column floating figure using two subfigures.
-% (The subfig.sty package must be loaded for this to work.)
-% The subfigure \label commands are set within each subfloat command, the
-% \label for the overall figure must come after \caption.
-% \hfil must be used as a separator to get equal spacing.
-% The subfigure.sty package works much the same way, except \subfigure is
-% used instead of \subfloat.
-%
-%\begin{figure*}[!t]
-%\centerline{\subfloat[Case I]\includegraphics[width=2.5in]{subfigcase1}%
-%\label{fig_first_case}}
-%\hfil
-%\subfloat[Case II]{\includegraphics[width=2.5in]{subfigcase2}%
-%\label{fig_second_case}}}
-%\caption{Simulation results}
-%\label{fig_sim}
-%\end{figure*}
-%
-% Note that often IEEE papers with subfigures do not employ subfigure
-% captions (using the optional argument to \subfloat), but instead will
-% reference/describe all of them (a), (b), etc., within the main caption.
-
-
-% An example of a floating table. Note that, for IEEE style tables, the 
-% \caption command should come BEFORE the table. Table text will default to
-% \footnotesize as IEEE normally uses this smaller font for tables.
-% The \label must come after \caption as always.
-%
-%\begin{table}[!t]
-%% increase table row spacing, adjust to taste
-%\renewcommand{\arraystretch}{1.3}
-% if using array.sty, it might be a good idea to tweak the value of
-% \extrarowheight as needed to properly center the text within the cells
-%\caption{An Example of a Table}
-%\label{table_example}
-%\centering
-%% Some packages, such as MDW tools, offer better commands for making tables
-%% than the plain LaTeX2e tabular which is used here.
-%\begin{tabular}{|c||c|}
-%\hline
-%One & Two\\
-%\hline
-%Three & Four\\
-%\hline
-%\end{tabular}
-%\end{table}
-
-
-% Note that IEEE does not put floats in the very first column - or typically
-% anywhere on the first page for that matter. Also, in-text middle ("here")
-% positioning is not used. Most IEEE journals/conferences use top floats
-% exclusively. Note that, LaTeX2e, unlike IEEE journals/conferences, places
-% footnotes above bottom floats. This can be corrected via the \fnbelowfloat
-% command of the stfloats package.
-
-
-
-
-
-
-
-% conference papers do not normally have an appendix
-
-
-% use section* for acknowledgement
  \section*{Acknowledgment}
  
  \section*{Acknowledgment}
  
-
-The authors would like to thank...
-
-
-
-
+This work is partially funded by the Labex ACTION program (contract ANR-11-LABX-01-01).
+\todo[inline]{The authors would like to thank\dots{}}
  
  % trigger a \newpage just before the given reference
  % number - used to balance the columns on the last page
  % adjust value as needed - may need to be readjusted if
  % the document is modified later
  
  % trigger a \newpage just before the given reference
  % number - used to balance the columns on the last page
  % adjust value as needed - may need to be readjusted if
  % the document is modified later
-%\IEEEtriggeratref{8}
-% The "triggered" command can be changed if desired:
-%\IEEEtriggercmd{\enlargethispage{-5in}}
-
-% references section
-
-% can use a bibliography generated by BibTeX as a .bbl file
-% BibTeX documentation can be easily obtained at:
-% http://www.ctan.org/tex-archive/biblio/bibtex/contrib/doc/
-% The IEEEtran BibTeX style support page is at:
-% http://www.michaelshell.org/tex/ieeetran/bibtex/
  \bibliographystyle{IEEEtran}
  \bibliographystyle{IEEEtran}
-% argument is your BibTeX string definitions and bibliography database(s)
-\bibliography{bib/hpccBib}
-%
-% <OR> manually copy in the resultant .bbl file
-% set second argument of \begin to the number of references
-% (used to reserve space for the reference number labels box)
-%\begin{thebibliography}{1}
-%
-%\bibitem{IEEEhowto:kopka}
-%H.~Kopka and P.~W. Daly, \emph{A Guide to \LaTeX}, 3rd~ed.\hskip 1em plus
-%  0.5em minus 0.4em\relax Harlow, England: Addison-Wesley, 1999.
-%
-%\end{thebibliography}
+\bibliography{IEEEabrv,hpccBib}
  
  
  
  
  
  
-
-% that's all folks
  \end{document}
  
  \end{document}
  
-
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: t
+%%% fill-column: 80
+%%% ispell-local-dictionary: "american"
+%%% End:
+
+% LocalWords:  Ramamonjisoa Laiymani Arnaud Giersch Ziane Khodja Raphaël Femto
+% LocalWords:  Université Franche Comté IUT Montbéliard Maréchal Juin Inria Sud
+% LocalWords:  Ouest Vieille Talence cedex scalability experimentations HPC MPI
+% LocalWords:  Parallelization AIAC GMRES multi SMPI SISC SIAC SimDAG DAGs Lua
+% LocalWords:  Fortran GFlops priori Mbit de du fcomte multisplitting scalable
+% LocalWords:  SimGrid Belfort parallelize Labex ANR LABX IEEEabrv hpccBib