\usepackage[pdftex,final]{graphicx}
% Extension for intra-document links (tagged PDF)
% and correct display of URLs (the \url{http://example.com} command)
-\usepackage{hyperref}
+%\usepackage{hyperref}
% Extension so that more headings appear in the table of contents
% (table of contents, bibliography, index).
%\usepackage{tocbibind}
\begin{IEEEkeywords}
Mapping algorithms; Distributed clusters; Parallel iterative
-asynchronous algorithms; Heterogeneous distributed architectures
+asynchronous algorithms; Heterogeneous distributed architectures.
\end{IEEEkeywords}
In order to efficiently use this massive distributed computation
power, numerous numerical algorithms have been modified. These
algorithms can be broadly classified into two categories. First,
-\textit{Direct methods}, which give the exact solution of the problem
+\textit{direct methods}, which give the exact solution of the problem
using a finite number of operations (e.g. Cholesky,
-LU\ldots. These methods cannot be applied to all kinds of
-numerical problems. In general, they are not well adapted to very
+LU\dots). These methods cannot be applied to all kinds of
+numerical problems. Generally, they are not well adapted to very
large problems. Second, \textit{iterative methods}, which repeat the same
instructions until a desired approximation of the solution is reached
-- we say that the algorithm has converged. Iterative algorithms
constitute the only known approach to solving some kinds of problems
-and they are easier to parallelize than direct methods. The Jacobi or
+and are easier to parallelize than direct methods. The Jacobi or
Conjugate Gradient algorithms are examples of such iterative
methods.
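As an illustration, the Jacobi method mentioned above fits in a few lines (a minimal, self-contained sketch; the matrix, tolerance and helper names below are our own toy choices, and convergence is only guaranteed here because the matrix is strictly diagonally dominant):

```python
import numpy as np

def jacobi(A, b, tol=1e-10, max_iter=10_000):
    """Iteratively solve A x = b; assumes A is strictly diagonally dominant."""
    D = np.diag(A)             # diagonal of A
    R = A - np.diagflat(D)     # off-diagonal remainder
    x = np.zeros_like(b, dtype=float)
    for _ in range(max_iter):
        x_new = (b - R @ x) / D                # one Jacobi sweep
        if np.linalg.norm(x_new - x) < tol:    # stop when the iterate stabilizes
            return x_new
        x = x_new
    return x

A = np.array([[4.0, 1.0], [1.0, 3.0]])   # toy diagonally dominant system
b = np.array([6.0, 7.0])                 # its exact solution is (1, 2)
x = jacobi(A, b)
```

Each sweep reuses only the previous iterate, which is precisely what makes Jacobi easy to parallelize.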
% \myitemize{5}
In the rest of this paper we only focus on iterative methods. To
parallelize this kind of algorithm, two classes of parallel iterative
-models can be described. In \textit{The synchronous iteration model}
+models can be described. In the \textit{synchronous iteration model},
after each iteration a node sends its results to its neighbors and
-waits for the reception of all dependency messages from its neighbors
+waits for the reception of all dependency messages from them
to start the next iteration. This results in large idle times and is
equivalent to a global synchronization of nodes after each
iteration. These synchronizations can strongly penalize the overall
the computation will be blocked.
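The per-iteration barrier just described can be simulated in a few lines (a schematic, single-process sketch with a toy system of our own; in a real deployment the two updates would run on distinct nodes and the "exchange" would be message passing):

```python
# Two "nodes" each own one unknown of 5x - y = 9, -x + 4y = 2 and, after
# every iteration, exchange their values before either may continue --
# the global synchronization of the synchronous iteration model.
def synchronous_run(tol=1e-12, max_iter=1000):
    x, y = 0.0, 0.0
    for it in range(max_iter):
        x_new = (9 + y) / 5.0        # node 0's local computation
        y_new = (2 + x) / 4.0        # node 1's local computation
        # barrier: both "messages" must have arrived before iterating again
        if abs(x_new - x) + abs(y_new - y) < tol:
            return x_new, y_new, it
        x, y = x_new, y_new
    return x, y, max_iter

x, y, iters = synchronous_run()      # converges to (2, 1)
```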
\begin{figure}[h!]
- \vspace{0.1cm}
\centering
\includegraphics[width=7.4cm]{images/IACA}
\caption{Two processors computing in the Asynchronous Iteration - Asynchronous Communication (AIAC) model}
\label{fig:AIAC}
\end{figure}
+\vspace{-0.2cm}
-
-In the\textit{The asynchronous iteration model} a node sends its
+In the \textit{asynchronous iteration model} a node sends its
results to its neighbors and starts immediately the next iteration
with the last received data. These data may come from previous
iterations, because the most recent data has not arrived in time or
architectures, in the rest of this study we only focus on the
asynchronous iteration model. In order to efficiently use such
algorithms on distributed cluster architectures, it is essential to
-map the tasks of the application to the best sub-sets of nodes of the
+map the application's tasks to the best sub-sets of nodes of the
target architecture. This mapping procedure must take into account
parameters such as network heterogeneity, computing nodes
heterogeneity and tasks heterogeneity in order to minimize the overall
% \label{fig:jaceP2P-v2}
% \end{figure}
-\myitemize{5}
-\item The first entity is the ``super-node'' %(represented by a big
+\myitemize{3}
+%\begin{itemize}
+\item The first entity is the ``super-node''. %(represented by a big
%circle in Figure \ref{fig:jaceP2P-v2}).
Super-nodes form a circular network and store, in registers, the
identifiers of all the computing nodes that are connected to the
of time, it declares that this computing node is dead and deletes
its identifier from the register.
-\item The second entity is the ``spawner''
+\item The second entity is the ``spawner''.
%(represented by a square in
% Figure \ref{fig:jaceP2P-v2}).
-When a user wants to execute a
- parallel application that requires $N$ computing nodes, he or she
- launches a spawner. The spawner contacts a super-node to reserve the
- $N$ computing nodes plus some extra nodes. When the spawner receives
- the list of nodes from the super-node, it transforms the extra
- nodes into spawners (for fault tolerance and scalability reasons)
- and stores the identifiers of the rest of the nodes in its own
- register.
+ When a user wants to execute a parallel application that requires
+ $N$ computing nodes, he or she launches a spawner. The spawner contacts
+ a super-node to reserve the $N$ computing nodes plus some extra
+ nodes. When it receives the list of nodes from the super-node, it
+ transforms the extra nodes into spawners (for fault tolerance and
+ scalability reasons) and stores the identifiers of the rest of the
+ nodes in its own register.
% Once the extra nodes are transformed into spawners, they
% form a circular network and they receive the register containing the
% identifiers of the computing nodes.
% of the dead node and continues the computing task from that
% checkpoint.
-\item The third entity is the ``daemon'', or the computing node,
- (represented in Figure \ref{fig:jaceP2P-v2} by a hashed small circle
- if it is idle and by a white small circle if it is executing an
- application). Once launched, it connects to a super-node and waits
- for a task to execute. Once they begin executing an application they
- form a circular network which is only used in the failure detection
+\item The third entity is the ``daemon'', or the computing node.
+% (represented in Figure \ref{fig:jaceP2P-v2} by a hashed small circle
+% if it is idle and by a white small circle if it is executing an
+% application).
+ Once launched, a daemon connects to a super-node and waits for a
+ task to execute. Once daemons begin executing an application, they
+ form a circular network which is only used in the failure detection
mechanism. Each daemon can communicate directly with the daemons
whose identifiers are in its register. At the end of a task, the
daemons reconnect to a super-node.
has an asynchronous messaging mechanism and to resist daemons'
failures, it implements a distributed backup mechanism called
uncoordinated distributed checkpointing. For more details on the
-JaceP2P-V2 platform, readers can refer to \cite{jaceP2P-v2}.
+JaceP2P-V2 platform, interested readers can refer to \cite{jaceP2P-v2}.
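The spawner's reservation step described above can be summarized by the following sketch (the classes and method names are ours and purely illustrative; JaceP2P-V2's actual Java API differs):

```python
class Node:
    """Toy stand-in for a JaceP2P-V2 daemon."""
    def __init__(self, ident):
        self.identifier = ident
        self.role = "daemon"
    def become_spawner(self):        # extras are converted for fault tolerance
        self.role = "spawner"

class SuperNode:
    """Toy stand-in for a super-node's register of connected daemons."""
    def __init__(self, count):
        self.register = [Node(i) for i in range(count)]
    def reserve(self, k):            # hand out k nodes from the register
        taken, self.register = self.register[:k], self.register[k:]
        return taken

def launch_application(super_node, n, n_extra=2):
    """Reserve n computing nodes plus extras, as the spawner does."""
    nodes = super_node.reserve(n + n_extra)
    extras, workers = nodes[:n_extra], nodes[n_extra:]
    for node in extras:
        node.become_spawner()        # fault tolerance / scalability
    return [w.identifier for w in workers]   # the spawner's own register
```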
% This method allows daemons to
% save their data on neighboring daemons without any user
% on distributed architectures.
+\vspace*{0.2cm}
\subsubsection*{Benefits of mapping}
\label{sec:benef}
-In the JaceP2P-V2 environment, presented in the previous section,
-there is no effective mapping solution. Indeed, when a user wants to
-launch an application, the spawner emits a request to the super-node,
-which is in charge of available daemons. Basically, the super-node
-returns the amount of requested computing nodes by choosing in its own
-list.
+In the previously described JaceP2P-V2 environment there is no
+effective mapping solution. Indeed, when a user wants to launch an
+application, the spawner emits a request to the super-node, which is
+in charge of available daemons. Basically, the super-node returns the
+requested number of computing nodes, chosen from its own list.
%,
%if there are sufficient daemons connected on it, or in other
%super-nodes lists, in addition of its one.
In this method, the super-node only cares about the amount of
-requested nodes, it returns in general nodes in the order of their
+requested nodes and generally returns them in the order of their
connection to the platform -- there is no specific selection.
Distributed architectures such as distributed clusters,
%as can be seen
cluster, in order to improve communications. But, as we use very large
problems, it is nearly impossible to find clusters containing as many
computing nodes as requested. So we have to dispatch tasks over
-several clusters. That implies a nedd to deal with heterogeneity in clusters
+several clusters. That implies a need to deal with heterogeneity in
cluster computing power and in network performance. We should make a
trade-off between both components in order to take the best part
of each one to improve the overall performance.
%benefits of mapping solutions on the applications execution time.
In order to check if a task mapping algorithm would provide
-performance improvement in JaceP2P-V2 environment, we have evaluated
+performance improvement in the JaceP2P-V2 environment, we have evaluated
the contributions of a simple mapping algorithm, which is described in
section \ref{sec:sma}. These experiments used the NPB Kernel CG
application described in section \ref{sec:cg}, with two problem sizes
\caption{Effects of a simple tasks
mapping algorithm on application's execution time}
\label{tab:benef}
+ \vspace{-0.5cm}
\end{table}
As can be seen in Table \ref{tab:benef}, the effects of a
considered simultaneously executable and communications can take place
at any time during the computation. That is why vertices and edges are
labeled with weights
-describing computational and communication costs.\\
-
+describing computational and communication costs.
+
+\vspace*{0.2cm}
\subsubsection{Architecture modeling}
\label{sec:pbmodelarchi}
of undirected edges. The vertices represent the computing nodes and
the edges represent the links between them. An edge $L_i \in L$ is an
unordered pair $(N_x,N_y) \in N$, representing a communication link
-between nodes $x$ and $y$. Let be $|C|$ the number of clusters in the
+between nodes $N_x$ and $N_y$. Let $|C|$ be the number of clusters in the
architecture containing computing nodes. A function $WN : N
\rightarrow R^+$ gives the computational power of nodes and $WL : L
\rightarrow R^+$ gives the communication latency of links. We define
have the same computational power with a low communication latency, a
cluster of these nodes can be defined. All participating clusters,
including computing nodes, are in the same architecture level and
-communicate through the architecture network.\\
+communicate through the architecture network.
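The model above can be made concrete with a small example (toy data of our own; $WN$ and $WL$ become plain dictionaries, and the clustering rule sketched here simply groups nodes of equal power joined by low-latency links):

```python
WN = {"N1": 2.0, "N2": 2.0, "N3": 1.0, "N4": 1.0}   # computational power of nodes
WL = {frozenset({"N1", "N2"}): 0.1,                 # intra-cluster link: low latency
      frozenset({"N3", "N4"}): 0.1,
      frozenset({"N2", "N3"}): 5.0}                 # inter-cluster link: high latency

def clusters(wn, wl, latency_threshold=1.0):
    """Group nodes sharing the same power and joined by low-latency links."""
    groups = []
    for node, p in wn.items():
        for g in groups:
            link = frozenset({g[-1], node})
            if wn[g[0]] == p and wl.get(link, float("inf")) <= latency_threshold:
                g.append(node)
                break
        else:
            groups.append([node])
    return groups

C = clusters(WN, WL)     # two clusters: {N1, N2} and {N3, N4}, so |C| = 2
```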
+\vspace*{0.2cm}
\subsubsection{Mapping functions}
\label{sec:pbmodelmapping}
%In the previous section we have determined that we need to use TIG
%model based mapping algorithms to address our problem.
In the literature of the TIG mapping, we can find many algorithms,
-which can be divided into two categories. In the \textit{Edge-cuts
- optimization} class of algorithms, yhe aim is to minimize the use of the
-penalizing links between clusters. As tasks are depending on
-neighbors, which are called dependencies, the goal is to choose nodes
-where distance, in term of network, is small, to improve
-communications between tasks. Here we can cite Metis\cite{metis},
-Chaco\cite{chaco} and PaGrid\cite{pagrid} which are libraries
-containing such kind of algorithms. The main drawback of edge-cuts
-algorithms is that they do not tackle the computing nodes
+which can be divided into two categories. First, in the
+\textit{Edge-cuts optimization} class of algorithms, the aim is to
+minimize the use of the penalizing links between clusters. As each
+task depends on its neighbors (its dependencies), the goal is to
+choose nodes whose network distance is small, in order to improve
+communications between tasks. Here we can cite Metis\cite{metis},
+Chaco\cite{chaco} and PaGrid\cite{pagrid}, which are libraries
+providing such algorithms. The main drawback of
+edge-cuts algorithms is that they do not tackle the computing nodes
heterogeneity issues. They only focus on communication overheads.
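The quantity these edge-cuts algorithms minimize can be written down directly (toy TIG of our own; edges carry communication weights and the mapping assigns tasks to clusters):

```python
def edge_cut(edges, mapping):
    """Total weight of dependency edges crossing cluster boundaries."""
    return sum(w for (a, b), w in edges.items() if mapping[a] != mapping[b])

edges = {("t1", "t2"): 3.0, ("t2", "t3"): 1.0, ("t1", "t3"): 2.0}
mapping = {"t1": 0, "t2": 0, "t3": 1}   # t3 is mapped to another cluster
cut = edge_cut(edges, mapping)          # cut edges (t2,t3) and (t1,t3): 1.0 + 2.0
```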
% The Figure \ref{fig:edge} shows
% that the optimization is on edges of communications, which are
% \caption{The edge-cuts optimization}
% \label{fig:edge}
% \end{figure}
-In the \textit{Execution time optimization} class of algorithms the
-aim is to minimize the whole execution time of the application. They
-look for nodes which can provide the small execution time of tasks
-using their computational power. Here we can cite
+Then, in the \textit{Execution time optimization} class of algorithms,
+the aim is to minimize the whole execution time of the
+application. They look for nodes which can provide the smallest
+execution time of tasks using their computational power. Here we can cite
FastMap\cite{fastmap} and MiniMax\cite{minimax} as examples of such
algorithms. QM\cite{qm_these} is also an algorithm of this category,
but it aims to find for each task the node which can provide the best
The two classes of algorithms may fit our goals, because in our
model we have both the computational power of nodes and communication
-costs may influence the applications performance. We can also cite
-partitioning tools like Scotch \cite{scotch} which aims at privileging
-the load balancing of their partitioning schemes. Nevertheless, to the
-best of our knowledge, none of the existing algorithms take into
-consideration the specificities of the AIAC model (see next section).
+costs which may influence the application's performance. We can also
+cite partitioning tools like Scotch \cite{scotch}, which aims at
+privileging the load balancing of its partitioning
+schemes. Nevertheless, to the best of our knowledge, none of the
+existing algorithms takes into consideration the specificities of the
+AIAC model (see next section).
%specifically address the AIAC mapping problem.
%As the existing mapping algorithms are not designed to fit
%with the AIAC mapping problem.
dependencies, the faster it converges. Moreover, as the JaceP2P-V2
environment is fault tolerant and tasks save checkpoints on their
neighbors, it is more efficient to save on near nodes than on far
-ones.
+ones. In the synchronous model, both heterogeneity and locality must
+be taken into account in a balanced way. In the asynchronous model,
+since no synchronization occurs, the heterogeneity issue is less
+important.
%As our problem is on task mapping, only considering the dependency
So, in this algorithm all nodes are first sorted in descending order
according to their computation power, and all tasks are mapped on
these nodes according to their identifier (they are also marked as
-``moveable'', that means that each task can be moved from a node to
+``moveable''; it means that each task can be moved from a node to
another). As in the original QM algorithm, AIAC QM keeps track of the
\textit{number of rounds} $r$ ($r > 0$) during which all tasks have
been searched for a better node. This allows reducing at each round the
dependencies of $t_i$ are mapped on. This is one of the major
modifications to the original QM algorithm. It introduces a small part
of ``edge-cuts'' optimization. In the original version, it tries to
-map the task $t_i$ on the same node of one as its dependencies. As
+map the task $t_i$ on the same node of one of its dependencies. As
explained in \ref{sec:specAIACmapping}, this is not an acceptable
solution in our case. Instead, the algorithm now tries to map task
$t_i$ on nodes which are near the ones its dependencies are mapped
The complexity of the AIAC QM algorithm is about $O(n^2 \cdot
t \cdot \ln(r))$. This complexity is the same as that of the original algorithm
(details are given in \cite{qm_these}, with an increase of a factor
-$n$, corresponding to the edge-cuts part.
+$n$, corresponding to the edge-cuts part).
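A loose sketch of this skeleton follows (our simplification, not the authors' implementation: the scoring rule and toy data are illustrative, and the real AIAC QM refines its search over the rounds):

```python
def aiac_qm_sketch(tasks, deps, power, distance, rounds=3):
    """Sort nodes by decreasing power, place tasks in identifier order,
    then repeatedly try to move each (moveable) task onto a fast node
    close to the nodes its dependencies are mapped on."""
    nodes = sorted(power, key=power.get, reverse=True)      # fastest first
    mapping = {t: nodes[i % len(nodes)] for i, t in enumerate(tasks)}
    for _ in range(rounds):
        for t in tasks:                                     # every task is moveable
            neighbors = [mapping[d] for d in deps.get(t, [])]
            if not neighbors:
                continue
            def score(n):   # illustrative trade-off: power minus distance penalty
                return power[n] - sum(distance[n][m] for m in neighbors)
            best = max(nodes, key=score)
            if score(best) > score(mapping[t]):
                mapping[t] = best
    return mapping

power = {"a": 4.0, "b": 1.0}                  # node "a" is the fastest
distance = {"a": {"a": 0, "b": 10}, "b": {"a": 10, "b": 0}}
mapping = aiac_qm_sketch(["t1", "t2"], {"t1": ["t2"], "t2": ["t1"]},
                         power, distance)
```

With these toy weights the distance penalty dominates the power gap, so the two dependent tasks end up gathered on the same node.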
\section{Experimentation}
\label{sec:expe}
Each
of these parts is assigned to a processor -- so the size of data
depends on the matrix size but also on the number of participating
-nodes. In this way, a processor is in charge of computing its $X Sub$
-part by solving the following subsystem: $ASub \times XSub = BSub -
-DepLeft \times XLeft - DepRight \times XRight$
+nodes. In this way, a processor is in charge of computing its {\small $XSub$}
+part by solving the following subsystem: {\small $ASub \times XSub = BSub -
+DepLeft \times XLeft - DepRight \times XRight$}.
-After solving $XSub$, the result must be sent to other
+After solving {\small $XSub$}, the result must be sent to other
processors which depend on it.
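One local step of this scheme can be sketched as follows (toy block sizes and values of our own; in our benchmark the local solve is performed by CG rather than a direct solver):

```python
import numpy as np

ASub = np.array([[4.0, 1.0], [1.0, 3.0]])   # local diagonal block
DepLeft = np.array([[0.5], [0.0]])          # coupling with the left neighbor
DepRight = np.array([[0.0], [0.2]])         # coupling with the right neighbor
BSub = np.array([6.0, 7.0])
XLeft = np.array([1.0])                     # last values received from the
XRight = np.array([2.0])                    # neighboring processors

# local subsystem: ASub . XSub = BSub - DepLeft . XLeft - DepRight . XRight
rhs = BSub - DepLeft @ XLeft - DepRight @ XRight
XSub = np.linalg.solve(ASub, rhs)     # then XSub is sent to dependent processors
```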
% \begin{figure}[h!]
For more details about this method, interested readers are invited to
see \cite{book_raph}. In our benchmark, the sequential solver part of
the multisplitting method is the Conjugate Gradient, using the
-MTJ\cite{mtj} library. Its implementation is multi-threaded, so it
-benefits from multi-core processors.
+MTJ\cite{mtj} library. Its implementation is multi-threaded, to
+benefit from multi-core processors.
We point out here that this benchmark is a typical AIAC
application. In our study, we consider that the computational costs of
%on distributed clusters architectures, with a gain over $30\%$ on
%execution, as experiments described in section \ref{sec:benef}.
+\vspace*{0.2cm}
+
\subsubsection{Edge-cuts optimization}
\label{sec:edgcutalgo}
-\subsubsection{About heterogeneity}
-\label{sec:xphetero}
+%\subsubsection{About heterogeneity}
+%\label{sec:xphetero}
-The first experiments concern the study of the impact of the
+%The first experiments concern the study of the impact of the
+Our experiments concern the study of the impact of the
heterogeneity of the computing nodes on the mapping
results. Heterogeneity is an important factor in high performance
computing in the grid, all the more so when using the asynchronous
\caption{Gains in time of the execution of the class E of the CG
application on Arc1.1 using 64 nodes}
\label{tab:exph1E}
- \vspace*{-0.3cm}
+ \vspace*{-0.4cm}
\end{table}
%\vspace{-0.5cm}
\caption{Gains in time of the execution of the class F of the CG
application on Arc1.2 using 128 nodes}
\label{tab:exph1F}
- \vspace*{-0.3cm}
+ \vspace*{-0.4cm}
\end{table}
At first, we can see that the Simple Mapping algorithm, though it is
\caption{Gains in time of the execution of the class E of the CG
application on Arc2.1 using 64 nodes}
\label{tab:exph2E}
- \vspace*{-0.3cm}
+ \vspace*{-0.4cm}
\end{table}
\renewcommand{\arraystretch}{1.5}
\caption{Gains in time of the execution of the class F of the CG
application on Arc2.2 using 128 nodes}
\label{tab:exph2F}
- \vspace*{-0.3cm}
+ \vspace*{-0.4cm}
\end{table}
To begin with, these experiments confirm that a mapping algorithm is
in general on the same cluster for communication efficiency reasons),
an important part of the application is lost and we cannot restart
this part; so the whole application fails. A trade-off should be made
-by having some saving nodes in external clusters.
+by having some saving nodes in external clusters.\\
% \subsubsection*{Acknowledgements}