SG (wip).

[hpcc2014.git] / hpcc.tex
diff --git a/hpcc.tex b/hpcc.tex

index 4322e6e47d8548cca765374aec8b0629fc03a9e1..ebbdd4d41d4943acaa829dfe6d483dde45ab4502 100644 (file)
--- a/hpcc.tex
+++ b/hpcc.tex
@@ -40,11 +40,6 @@
  
  \newcommand{\MI}{\mathit{MaxIter}}
  
  
  \newcommand{\MI}{\mathit{MaxIter}}
  
-\usepackage{array}
-\usepackage{color, colortbl}
-\newcolumntype{M}[1]{>{\centering\arraybackslash}m{#1}}
-\newcolumntype{Z}[1]{>{\raggedleft}m{#1}}
-
  \begin{document}
  
  \title{Simulation of Asynchronous Iterative Numerical Algorithms Using SimGrid}
  \begin{document}
  
  \title{Simulation of Asynchronous Iterative Numerical Algorithms Using SimGrid}
@@ -179,7 +174,7 @@ convergence is generally greater than for the two former classes. But, and as de
  algorithms can significantly reduce overall execution times by suppressing idle times due to synchronizations especially
  in a grid computing context.
  
  algorithms can significantly reduce overall execution times by suppressing idle times due to synchronizations especially
  in a grid computing context.
  
-\begin{figure}[htbp]
+\begin{figure}[!t]
    \centering
      \includegraphics[width=8cm]{AIAC.pdf}
    \caption{The Asynchronous Iterations - Asynchronous Communications model } 
    \centering
      \includegraphics[width=8cm]{AIAC.pdf}
    \caption{The Asynchronous Iterations - Asynchronous Communications model } 
@@ -208,16 +203,18 @@ iterations and so to very different execution times.
  SimGrid~\cite{casanova+legrand+quinson.2008.simgrid,SimGrid} is a simulation
  framework to study the behavior of large-scale distributed systems.  As its name
  says, it emanates from the grid computing community, but is nowadays used to
  SimGrid~\cite{casanova+legrand+quinson.2008.simgrid,SimGrid} is a simulation
  framework to study the behavior of large-scale distributed systems.  As its name
  says, it emanates from the grid computing community, but is nowadays used to
-study grids, clouds, HPC or peer-to-peer systems.
-%- open source, developped since 1999, one of the major solution in the field
-%
+study grids, clouds, HPC or peer-to-peer systems.  The early versions of SimGrid
+date from 1999, but it is still actively developed and distributed as open
+source software.  Today, it is one of the major generic tools in the field of
+simulation for large-scale distributed systems.
+
  SimGrid provides several programming interfaces: MSG to simulate Concurrent
  Sequential Processes, SimDAG to simulate DAGs of (parallel) tasks, and SMPI to
  run real applications written in MPI~\cite{MPI}.  Apart from the native C
  interface, SimGrid provides bindings for the C++, Java, Lua and Ruby programming
  languages.  The SMPI interface supports applications written in C or Fortran,
  SimGrid provides several programming interfaces: MSG to simulate Concurrent
  Sequential Processes, SimDAG to simulate DAGs of (parallel) tasks, and SMPI to
  run real applications written in MPI~\cite{MPI}.  Apart from the native C
  interface, SimGrid provides bindings for the C++, Java, Lua and Ruby programming
  languages.  The SMPI interface supports applications written in C or Fortran,
-with little or no modifications.
-%- implements most of MPI-2 \cite{ref} standard [CHECK]
+with little or no modifications.  SMPI implements about \np[\%]{80} of the MPI
+2.0 standard~\cite{bedaride:hal-00919507}.
  
  %%% explain simulation
  %- simulated processes folded in one real process
  
  %%% explain simulation
  %- simulated processes folded in one real process
@@ -231,8 +228,6 @@ with little or no modifications.
  
  %%% validation + refs
  
  
  %%% validation + refs
  
-\AG{Décrire SimGrid~\cite{casanova+legrand+quinson.2008.simgrid,SimGrid} (Arnaud)}
-
  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  \section{Simulation of the multisplitting method}
  %Décrire le problème (algo) traité ainsi que le processus d'adaptation à SimGrid.
  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  \section{Simulation of the multisplitting method}
  %Décrire le problème (algo) traité ainsi que le processus d'adaptation à SimGrid.
@@ -269,7 +264,7 @@ Y_l = B_l - \displaystyle\sum_{\substack{m=1\\ m\neq l}}^{L}A_{lm}X_m
  \end{equation}
  is solved independently by a cluster and communications are required to update the right-hand side sub-vector $Y_l$, such that the sub-vectors $X_m$ represent the data dependencies between the clusters. As each sub-system (\ref{eq:4.1}) is solved in parallel by a cluster of processors, our multisplitting method uses an iterative method as an inner solver which is easier to parallelize and more scalable than a direct method. In this work, we use the parallel algorithm of the GMRES method~\cite{ref1}, which is one of the iterative methods most widely used by researchers. 
  
  \end{equation}
  is solved independently by a cluster and communications are required to update the right-hand side sub-vector $Y_l$, such that the sub-vectors $X_m$ represent the data dependencies between the clusters. As each sub-system (\ref{eq:4.1}) is solved in parallel by a cluster of processors, our multisplitting method uses an iterative method as an inner solver which is easier to parallelize and more scalable than a direct method. In this work, we use the parallel algorithm of the GMRES method~\cite{ref1}, which is one of the iterative methods most widely used by researchers. 
  
-\begin{figure}
+\begin{figure}[!t]
    %%% IEEE instructions forbid to use an algorithm environment here, use figure
    %%% instead
  \begin{algorithmic}[1]
    %%% IEEE instructions forbid to use an algorithm environment here, use figure
    %%% instead
  \begin{algorithmic}[1]
@@ -310,7 +305,7 @@ clusters (lines $6$ and $7$ in Figure~\ref{algo:01}). The shared vector
  elements of the solution $x$ are exchanged by message passing using MPI
  non-blocking communication routines.
  
  elements of the solution $x$ are exchanged by message passing using MPI
  non-blocking communication routines.
  
-\begin{figure}
+\begin{figure}[!t]
  \centering
    \includegraphics[width=60mm,keepaspectratio]{clustering}
  \caption{Example of three clusters of processors interconnected by a virtual unidirectional ring network.}
  \centering
    \includegraphics[width=60mm,keepaspectratio]{clustering}
  \caption{Example of three clusters of processors interconnected by a virtual unidirectional ring network.}
@@ -363,24 +358,58 @@ Table~\ref{tab.cluster.2x50} with a matrix size ranging from $N_x = N_y = N_z =
  62 \text{ to } 171$ elements or from $62^{3} = \np{238328}$ to $171^{3} =
  \np{5211000}$ entries.
  
  62 \text{ to } 171$ elements or from $62^{3} = \np{238328}$ to $171^{3} =
  \np{5211000}$ entries.
  
-\begin{table}
+\begin{table}[!t]
    \centering
    \caption{2 clusters, each with 50 nodes}
    \label{tab.cluster.2x50}
    \centering
    \caption{2 clusters, each with 50 nodes}
    \label{tab.cluster.2x50}
-
- \tiny
- 
-\begin{tabular}{|Z{0.55cm}|Z{0.25cm}|Z{0.25cm}|M{0.25cm}|Z{0.25cm}|M{0.25cm}|M{0.25cm}|M{0.25cm}|M{0.25cm}|M{0.25cm}|M{0.25cm}|M{0.25cm}|M{0.25cm}|M{0.25cm}|} 
- \hline 
- \bf bw & 5 &5 & 5 & 5 & 5 & 50 & 50 & 50 & 50 & 50 & 10 & 10\\ 
- \hline
- \bf lat & 0.02 & 0.02 & 0.02 & 0.02 & 0.02 & 0.02 & 0.02 & 0.02 & 0.02 & 0.02 & 0.03 & 0.01\\ 
- \hline 
- \bf power & 1 & 1 & 1 & 1.5 & 1.5 & 1.5 & 1.5 & 1.5 & 1.5 & 1.5 & 1 & 1.5\\ \hline    \bf size & 62 & 62 & 62 & 100 & 100 & 110 & 120& 130 & 140 & 150 & 171 & 171\\ \hline
- \bf Prec/Eprec & 10$^{-5}$ & 10$^{-8}$ & 10$^{-9}$ & 10$^{-11}$ & 10$^{-11}$ & 10$^{-11}$ & 10$^{-11}$ & 10$^{-11}$ & 10$^{-11}$ & 10$^{-11}$ & 10$^{-5}$ & 10$^{-5}$\\ \hline 
- \bf speedup & 0.396 & 0.392 & 0.396 & 0.391 & 0.393 & 0.395 & 0.398 & 0.388 & 0.393 & 0.394 & 0.63 & 0.778\\ \hline 
- \end{tabular}
-\end{table} 
+  \renewcommand{\arraystretch}{1.3}
+
+  \begin{tabular}{|>{\bfseries}r|*{6}{c|}}
+    \hline
+    bw
+    & 5         & 5         & 5         & 5         & 5         & 50 \\
+    \hline
+    lat
+    & 0.02      & 0.02      & 0.02      & 0.02      & 0.02      & 0.02 \\
+    \hline
+    power
+    & 1         & 1         & 1         & 1.5       & 1.5       & 1.5 \\
+    \hline
+    size
+    & 62        & 62        & 62        & 100       & 100       & 110 \\
+    \hline
+    Prec/Eprec
+    & \np{E-5}  & \np{E-8}  & \np{E-9}  & \np{E-11} & \np{E-11} & \np{E-11} \\
+    \hline
+    speedup
+    & 0.396     & 0.392     & 0.396     & 0.391     & 0.393     & 0.395 \\
+    \hline
+  \end{tabular}
+
+  \smallskip
+
+  \begin{tabular}{|>{\bfseries}r|*{6}{c|}}
+    \hline
+    bw
+    & 50        & 50        & 50        & 50        & 10        & 10 \\
+    \hline
+    lat
+    & 0.02      & 0.02      & 0.02      & 0.02      & 0.03      & 0.01 \\
+    \hline
+    power
+    & 1.5       & 1.5       & 1.5       & 1.5       & 1         & 1.5 \\
+    \hline
+    size
+    & 120       & 130       & 140       & 150       & 171       & 171 \\
+    \hline
+    Prec/Eprec
+    & \np{E-11} & \np{E-11} & \np{E-11} & \np{E-11} & \np{E-5}  & \np{E-5} \\
+    \hline
+    speedup
+    & 0.398     & 0.388     & 0.393     & 0.394     & 0.63      & 0.778 \\
+    \hline
+  \end{tabular}
+\end{table}
    
  Then we have changed the network configuration using three clusters containing
  respectively 33, 33 and 34 hosts, or again by one hundred hosts for all the
    
  Then we have changed the network configuration using three clusters containing
  respectively 33, 33 and 34 hosts, or again by one hundred hosts for all the
@@ -388,52 +417,62 @@ clusters. In the same way as above, a judicious choice of key parameters has
  permitted to get the results in Table~\ref{tab.cluster.3x33} which shows the
  speedups less than 1 with a matrix size from 62 to 100 elements.
  
  permitted to get the results in Table~\ref{tab.cluster.3x33} which shows the
  speedups less than 1 with a matrix size from 62 to 100 elements.
  
-\begin{table}
+\begin{table}[!t]
    \centering
    \caption{3 clusters, each with 33 nodes}
    \label{tab.cluster.3x33}
    \centering
    \caption{3 clusters, each with 33 nodes}
    \label{tab.cluster.3x33}
-
- \tiny
- 
-\begin{tabular}{|Z{0.55cm}|Z{0.25cm}|Z{0.25cm}|M{0.25cm}|Z{0.25cm}|M{0.25cm}|M{0.25cm}|} 
- \hline 
- \bf bw & 10 &5 & 4 & 3 & 2 & 6\\ \hline
- \bf lat & 0.01 & 0.02 & 0.02 & 0.02 & 0.02 & 0.02\\ 
- \hline 
- \bf power & 1 & 1 & 1 & 1 & 1 & 1\\ \hline    
- \bf size & 62 & 100 & 100 & 100 & 100 & 171\\ \hline
- \bf Prec/Eprec & 10$^{-5}$ & 10$^{-5}$ & 10$^{-5}$ & 10$^{-5}$ & 10$^{-5}$ & 10$^{-5}$\\ \hline 
- \bf speedup & 0.997 & 0.99 & 0.93 & 0.84 & 0.78 & 0.99\\ 
- \hline 
- \end{tabular}
-\end{table} 
+  \renewcommand{\arraystretch}{1.3}
+
+  \begin{tabular}{|>{\bfseries}r|*{6}{c|}}
+    \hline
+    bw
+    & 10       & 5        & 4        & 3        & 2        & 6 \\
+    \hline
+    lat
+    & 0.01     & 0.02     & 0.02     & 0.02     & 0.02     & 0.02 \\
+    \hline
+    power
+    & 1        & 1        & 1        & 1        & 1        & 1 \\
+    \hline
+    size
+    & 62       & 100      & 100      & 100      & 100      & 171 \\
+    \hline
+    Prec/Eprec
+    & \np{E-5} & \np{E-5} & \np{E-5} & \np{E-5} & \np{E-5} & \np{E-5} \\
+    \hline
+    speedup
+    & 0.997    & 0.99     & 0.93     & 0.84     & 0.78     & 0.99 \\
+    \hline
+  \end{tabular}
+\end{table}
  
  
  In a final step, results of an execution attempt to scale up the three clustered
  configuration but increasing by two hundred hosts have been recorded in
  Table~\ref{tab.cluster.3x67}.
  
  
  
  In a final step, results of an execution attempt to scale up the three clustered
  configuration but increasing by two hundred hosts have been recorded in
  Table~\ref{tab.cluster.3x67}.
  
-\begin{table}
+\begin{table}[!t]
    \centering
    \caption{3 clusters, each with 66 nodes}
    \label{tab.cluster.3x67}
    \centering
    \caption{3 clusters, each with 66 nodes}
    \label{tab.cluster.3x67}
-
- \tiny
-\begin{tabular}{|M{0.55cm}|M{0.25cm}|} 
- \hline 
- \bf bw & 1\\ \hline
- \bf lat & 0.02\\ 
- \hline 
- \bf power & 1\\ 
- \hline    
- \bf size & 62\\ 
- \hline
- \bf Prec/Eprec & 10$^{-5}$\\ 
- \hline 
- \bf speedup & 0.9\\ 
- \hline 
+  \renewcommand{\arraystretch}{1.3}
+
+  \begin{tabular}{|>{\bfseries}r|c|}
+    \hline
+    bw         & 1 \\
+    \hline
+    lat        & 0.02 \\
+    \hline
+    power      & 1 \\
+    \hline
+    size       & 62 \\
+    \hline
+    Prec/Eprec & \np{E-5} \\
+    \hline
+    speedup    & 0.9 \\
+    \hline
   \end{tabular}
   \end{tabular}
-\end{table} 
+\end{table}
  
  Note that the program was run with the following parameters:
  
  
  Note that the program was run with the following parameters: