SG (wip).

[hpcc2014.git] / hpcc.tex
diff --git a/hpcc.tex b/hpcc.tex

index 4322e6e47d8548cca765374aec8b0629fc03a9e1..ebbdd4d41d4943acaa829dfe6d483dde45ab4502 100644 (file)
--- a/hpcc.tex
+++ b/hpcc.tex
@@ -40,11 +40,6 @@
  
  \newcommand{\MI}{\mathit{MaxIter}}
  
-\usepackage{array}
-\usepackage{color, colortbl}
-\newcolumntype{M}[1]{>{\centering\arraybackslash}m{#1}}
-\newcolumntype{Z}[1]{>{\raggedleft}m{#1}}
-
  \begin{document}
  
  \title{Simulation of Asynchronous Iterative Numerical Algorithms Using SimGrid}
@@ -179,7 +174,7 @@ convergence is generally greater than for the two former classes. But, and as de
  algorithms can significantly reduce overall execution times by suppressing idle times due to synchronizations especially
  in a grid computing context.
  
-\begin{figure}[htbp]
+\begin{figure}[!t]
    \centering
      \includegraphics[width=8cm]{AIAC.pdf}
    \caption{The Asynchronous Iterations - Asynchronous Communications model } 
@@ -208,16 +203,18 @@ iterations and so to very different execution times.
  SimGrid~\cite{casanova+legrand+quinson.2008.simgrid,SimGrid} is a simulation
  framework to study the behavior of large-scale distributed systems.  As its name
  says, it emanates from the grid computing community, but is nowadays used to
-study grids, clouds, HPC or peer-to-peer systems.
-%- open source, developped since 1999, one of the major solution in the field
-%
+study grids, clouds, HPC or peer-to-peer systems.  The early versions of SimGrid
+date from 1999, but it is still actively developed and distributed as open
+source software.  Today, it is one of the major generic tools in the field of
+simulation for large-scale distributed systems.
+
  SimGrid provides several programming interfaces: MSG to simulate Concurrent
  Sequential Processes, SimDAG to simulate DAGs of (parallel) tasks, and SMPI to
  run real applications written in MPI~\cite{MPI}.  Apart from the native C
  interface, SimGrid provides bindings for the C++, Java, Lua and Ruby programming
  languages.  The SMPI interface supports applications written in C or Fortran,
-with little or no modifications.
-%- implements most of MPI-2 \cite{ref} standard [CHECK]
+with little or no modifications.  SMPI implements about \np[\%]{80} of the MPI
+2.0 standard~\cite{bedaride:hal-00919507}.
  
  %%% explain simulation
  %- simulated processes folded in one real process
@@ -231,8 +228,6 @@ with little or no modifications.
  
  %%% validation + refs
  
-\AG{Décrire SimGrid~\cite{casanova+legrand+quinson.2008.simgrid,SimGrid} (Arnaud)}
-
  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  \section{Simulation of the multisplitting method}
  %Décrire le problème (algo) traité ainsi que le processus d'adaptation à SimGrid.
@@ -269,7 +264,7 @@ Y_l = B_l - \displaystyle\sum_{\substack{m=1\\ m\neq l}}^{L}A_{lm}X_m
  \end{equation}
  is solved independently by a cluster and communications are required to update the right-hand side sub-vector $Y_l$, such that the sub-vectors $X_m$ represent the data dependencies between the clusters. As each sub-system (\ref{eq:4.1}) is solved in parallel by a cluster of processors, our multisplitting method uses an iterative method as an inner solver which is easier to parallelize and more scalable than a direct method. In this work, we use the parallel algorithm of the GMRES method~\cite{ref1}, which is one of the most widely used iterative methods. 
  
-\begin{figure}
+\begin{figure}[!t]
    %%% IEEE instructions forbid to use an algorithm environment here, use figure
    %%% instead
  \begin{algorithmic}[1]
@@ -310,7 +305,7 @@ clusters (lines $6$ and $7$ in Figure~\ref{algo:01}). The shared vector
  elements of the solution $x$ are exchanged by message passing using MPI
  non-blocking communication routines.
  
-\begin{figure}
+\begin{figure}[!t]
  \centering
    \includegraphics[width=60mm,keepaspectratio]{clustering}
  \caption{Example of three clusters of processors interconnected by a virtual unidirectional ring network.}
@@ -363,24 +358,58 @@ Table~\ref{tab.cluster.2x50} with a matrix size ranging from $N_x = N_y = N_z =
  62 \text{ to } 171$ elements or from $62^{3} = \np{238328}$ to $171^{3} =
  \np{5211000}$ entries.
  
-\begin{table}
+\begin{table}[!t]
    \centering
    \caption{2 clusters, each with 50 nodes}
    \label{tab.cluster.2x50}
-
- \tiny
- 
-\begin{tabular}{|Z{0.55cm}|Z{0.25cm}|Z{0.25cm}|M{0.25cm}|Z{0.25cm}|M{0.25cm}|M{0.25cm}|M{0.25cm}|M{0.25cm}|M{0.25cm}|M{0.25cm}|M{0.25cm}|M{0.25cm}|M{0.25cm}|} 
- \hline 
- \bf bw & 5 &5 & 5 & 5 & 5 & 50 & 50 & 50 & 50 & 50 & 10 & 10\\ 
- \hline
- \bf lat & 0.02 & 0.02 & 0.02 & 0.02 & 0.02 & 0.02 & 0.02 & 0.02 & 0.02 & 0.02 & 0.03 & 0.01\\ 
- \hline 
- \bf power & 1 & 1 & 1 & 1.5 & 1.5 & 1.5 & 1.5 & 1.5 & 1.5 & 1.5 & 1 & 1.5\\ \hline    \bf size & 62 & 62 & 62 & 100 & 100 & 110 & 120& 130 & 140 & 150 & 171 & 171\\ \hline
- \bf Prec/Eprec & 10$^{-5}$ & 10$^{-8}$ & 10$^{-9}$ & 10$^{-11}$ & 10$^{-11}$ & 10$^{-11}$ & 10$^{-11}$ & 10$^{-11}$ & 10$^{-11}$ & 10$^{-11}$ & 10$^{-5}$ & 10$^{-5}$\\ \hline 
- \bf speedup & 0.396 & 0.392 & 0.396 & 0.391 & 0.393 & 0.395 & 0.398 & 0.388 & 0.393 & 0.394 & 0.63 & 0.778\\ \hline 
- \end{tabular}
-\end{table} 
+  \renewcommand{\arraystretch}{1.3}
+
+  \begin{tabular}{|>{\bfseries}r|*{6}{c|}}
+    \hline
+    bw
+    & 5         & 5         & 5         & 5         & 5         & 50 \\
+    \hline
+    lat
+    & 0.02      & 0.02      & 0.02      & 0.02      & 0.02      & 0.02 \\
+    \hline
+    power
+    & 1         & 1         & 1         & 1.5       & 1.5       & 1.5 \\
+    \hline
+    size
+    & 62        & 62        & 62        & 100       & 100       & 110 \\
+    \hline
+    Prec/Eprec
+    & \np{E-5}  & \np{E-8}  & \np{E-9}  & \np{E-11} & \np{E-11} & \np{E-11} \\
+    \hline
+    speedup
+    & 0.396     & 0.392     & 0.396     & 0.391     & 0.393     & 0.395 \\
+    \hline
+  \end{tabular}
+
+  \smallskip
+
+  \begin{tabular}{|>{\bfseries}r|*{6}{c|}}
+    \hline
+    bw
+    & 50        & 50        & 50        & 50        & 10        & 10 \\
+    \hline
+    lat
+    & 0.02      & 0.02      & 0.02      & 0.02      & 0.03      & 0.01 \\
+    \hline
+    power
+    & 1.5       & 1.5       & 1.5       & 1.5       & 1         & 1.5 \\
+    \hline
+    size
+    & 120       & 130       & 140       & 150       & 171       & 171 \\
+    \hline
+    Prec/Eprec
+    & \np{E-11} & \np{E-11} & \np{E-11} & \np{E-11} & \np{E-5}  & \np{E-5} \\
+    \hline
+    speedup
+    & 0.398     & 0.388     & 0.393     & 0.394     & 0.63      & 0.778 \\
+    \hline
+  \end{tabular}
+\end{table}
    
  Then we have changed the network configuration using three clusters containing
  respectively 33, 33 and 34 hosts, or again by one hundred hosts for all the
@@ -388,52 +417,62 @@ clusters. In the same way as above, a judicious choice of key parameters has
  permitted to get the results in Table~\ref{tab.cluster.3x33} which shows the
  speedups less than 1 with a matrix size from 62 to 100 elements.
  
-\begin{table}
+\begin{table}[!t]
    \centering
    \caption{3 clusters, each with 33 nodes}
    \label{tab.cluster.3x33}
-
- \tiny
- 
-\begin{tabular}{|Z{0.55cm}|Z{0.25cm}|Z{0.25cm}|M{0.25cm}|Z{0.25cm}|M{0.25cm}|M{0.25cm}|} 
- \hline 
- \bf bw & 10 &5 & 4 & 3 & 2 & 6\\ \hline
- \bf lat & 0.01 & 0.02 & 0.02 & 0.02 & 0.02 & 0.02\\ 
- \hline 
- \bf power & 1 & 1 & 1 & 1 & 1 & 1\\ \hline    
- \bf size & 62 & 100 & 100 & 100 & 100 & 171\\ \hline
- \bf Prec/Eprec & 10$^{-5}$ & 10$^{-5}$ & 10$^{-5}$ & 10$^{-5}$ & 10$^{-5}$ & 10$^{-5}$\\ \hline 
- \bf speedup & 0.997 & 0.99 & 0.93 & 0.84 & 0.78 & 0.99\\ 
- \hline 
- \end{tabular}
-\end{table} 
+  \renewcommand{\arraystretch}{1.3}
+
+  \begin{tabular}{|>{\bfseries}r|*{6}{c|}}
+    \hline
+    bw
+    & 10       & 5        & 4        & 3        & 2        & 6 \\
+    \hline
+    lat
+    & 0.01     & 0.02     & 0.02     & 0.02     & 0.02     & 0.02 \\
+    \hline
+    power
+    & 1        & 1        & 1        & 1        & 1        & 1 \\
+    \hline
+    size
+    & 62       & 100      & 100      & 100      & 100      & 171 \\
+    \hline
+    Prec/Eprec
+    & \np{E-5} & \np{E-5} & \np{E-5} & \np{E-5} & \np{E-5} & \np{E-5} \\
+    \hline
+    speedup
+    & 0.997    & 0.99     & 0.93     & 0.84     & 0.78     & 0.99 \\
+    \hline
+  \end{tabular}
+\end{table}
  
  
  In a final step, results of an execution attempt to scale up the three clustered
  configuration but increasing by two hundred hosts have been recorded in
  Table~\ref{tab.cluster.3x67}.
  
-\begin{table}
+\begin{table}[!t]
    \centering
    \caption{3 clusters, each with 66 nodes}
    \label{tab.cluster.3x67}
-
- \tiny
-\begin{tabular}{|M{0.55cm}|M{0.25cm}|} 
- \hline 
- \bf bw & 1\\ \hline
- \bf lat & 0.02\\ 
- \hline 
- \bf power & 1\\ 
- \hline    
- \bf size & 62\\ 
- \hline
- \bf Prec/Eprec & 10$^{-5}$\\ 
- \hline 
- \bf speedup & 0.9\\ 
- \hline 
+  \renewcommand{\arraystretch}{1.3}
+
+  \begin{tabular}{|>{\bfseries}r|c|}
+    \hline
+    bw         & 1 \\
+    \hline
+    lat        & 0.02 \\
+    \hline
+    power      & 1 \\
+    \hline
+    size       & 62 \\
+    \hline
+    Prec/Eprec & \np{E-5} \\
+    \hline
+    speedup    & 0.9 \\
+    \hline
   \end{tabular}
-\end{table} 
+\end{table}
  
  Note that the program was run with the following parameters: