modif dans les explications des expés

[rce2015.git] / paper.tex
diff --git a/paper.tex b/paper.tex

index 7e740ec629335c9856704f5b7828be1e7a5f85e0..81afcf517c38d633ee85667120bd5421be900c69 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -321,7 +321,7 @@ A_{\ell\ell} x_\ell = c_\ell,\mbox{~for~}\ell=1,\ldots,L,
  \end{equation}
  where right-hand sides $c_\ell=b_\ell-\sum_{m\neq\ell}A_{\ell m}x_m$ are computed using the shared vectors $x_m$. In this paper, we use the well-known iterative method GMRES~\cite{saad86} as an inner iteration to approximate the solutions of the different splittings arising from the block Jacobi multisplitting of matrix $A$. The algorithm in Figure~\ref{alg:01} shows the main key points of our block Jacobi two-stage method executed by a cluster of processors. In line~\ref{solve}, the linear sub-system~(\ref{eq:03}) is solved in parallel using GMRES method where $\MIG$ and $\TOLG$ are the maximum number of inner iterations and the tolerance threshold for GMRES respectively. The convergence of the two-stage multisplitting methods, based on synchronous or asynchronous iterations, has been studied by many authors for example~\cite{Bru95,bahi07}.
  
  \end{equation}
  where right-hand sides $c_\ell=b_\ell-\sum_{m\neq\ell}A_{\ell m}x_m$ are computed using the shared vectors $x_m$. In this paper, we use the well-known iterative method GMRES~\cite{saad86} as an inner iteration to approximate the solutions of the different splittings arising from the block Jacobi multisplitting of matrix $A$. The algorithm in Figure~\ref{alg:01} shows the main key points of our block Jacobi two-stage method executed by a cluster of processors. In line~\ref{solve}, the linear sub-system~(\ref{eq:03}) is solved in parallel using GMRES method where $\MIG$ and $\TOLG$ are the maximum number of inner iterations and the tolerance threshold for GMRES respectively. The convergence of the two-stage multisplitting methods, based on synchronous or asynchronous iterations, has been studied by many authors for example~\cite{Bru95,bahi07}.
  
-\begin{figure}[t]
+\begin{figure}[htbp]
  %\begin{algorithm}[t]
  %\caption{Block Jacobi two-stage multisplitting method}
  \begin{algorithmic}[1]
  %\begin{algorithm}[t]
  %\caption{Block Jacobi two-stage multisplitting method}
  \begin{algorithmic}[1]
@@ -359,7 +359,7 @@ At each $s$ outer iterations, the algorithm computes a new approximation $\tilde
  \end{equation}
  The algorithm in Figure~\ref{alg:02} includes the procedure of the residual minimization and the outer iteration is restarted with a new approximation $\tilde{x}$ at every $s$ iterations. The least-squares problem~(\ref{eq:06}) is solved in parallel by all clusters using CGLS method~\cite{Hestenes52} such that $\MIC$ is the maximum number of iterations and $\TOLC$ is the tolerance threshold for this method (line~\ref{cgls} in Figure~\ref{alg:02}).
  
  \end{equation}
  The algorithm in Figure~\ref{alg:02} includes the procedure of the residual minimization and the outer iteration is restarted with a new approximation $\tilde{x}$ at every $s$ iterations. The least-squares problem~(\ref{eq:06}) is solved in parallel by all clusters using CGLS method~\cite{Hestenes52} such that $\MIC$ is the maximum number of iterations and $\TOLC$ is the tolerance threshold for this method (line~\ref{cgls} in Figure~\ref{alg:02}).
  
-\begin{figure}[t]
+\begin{figure}[htbp]
  %\begin{algorithm}[t]
  %\caption{Krylov two-stage method using block Jacobi multisplitting}
  \begin{algorithmic}[1]
  %\begin{algorithm}[t]
  %\caption{Krylov two-stage method using block Jacobi multisplitting}
  \begin{algorithmic}[1]
@@ -407,10 +407,10 @@ in which  several clusters are  geographically distant,  so there are  intra and
  inter-cluster communications. In the following, these parameters are described:
  
  \begin{itemize}
  inter-cluster communications. In the following, these parameters are described:
  
  \begin{itemize}
-       \item hostfile: hosts description file.
+       \item hostfile: hosts description file,
         \item platform: file describing the platform architecture: clusters (CPU power,
  \dots{}), intra cluster network description, inter cluster network (bandwidth $bw$,
         \item platform: file describing the platform architecture: clusters (CPU power,
  \dots{}), intra cluster network description, inter cluster network (bandwidth $bw$,
-latency $lat$, \dots{}).
+latency $lat$, \dots{}),
         \item archi   : grid computational description (number of clusters, number of
  nodes/processors in each cluster).
  \end{itemize}
         \item archi   : grid computational description (number of clusters, number of
  nodes/processors in each cluster).
  \end{itemize}
@@ -485,7 +485,7 @@ results comparison and analysis. In the scope of this study, we retain
  on the  one hand the algorithm execution mode (synchronous and asynchronous)
  and on the other hand the execution time and the number of iterations to reach the convergence. \\
  
  on the  one hand the algorithm execution mode (synchronous and asynchronous)
  and on the other hand the execution time and the number of iterations to reach the convergence. \\
  
-\textbf{Step 4  }: Set up the  different grid testbed environments  that will be
+\textbf{Step 4}: Set up the  different grid testbed environments  that will be
  simulated in the  simulator tool to run the program.  The following architectures
  have been configured in SimGrid : 2$\times$16, 4$\times$8, 4$\times$16, 8$\times$8 and 2$\times$50. The first number
  represents the number  of clusters in the grid and  the second number represents
  simulated in the  simulator tool to run the program.  The following architectures
  have been configured in SimGrid : 2$\times$16, 4$\times$8, 4$\times$16, 8$\times$8 and 2$\times$50. The first number
  represents the number  of clusters in the grid and  the second number represents
@@ -494,8 +494,8 @@ designed to  operate with a bandwidth  equals to 10Gbits (resp.  1Gbits/s) and a
  latency of 8.10$^{-6}$ seconds (resp.  5.10$^{-5}$) for the intra-clusters links
  (resp.  inter-clusters backbone links).  \\
  
  latency of 8.10$^{-6}$ seconds (resp.  5.10$^{-5}$) for the intra-clusters links
  (resp.  inter-clusters backbone links).  \\
  
-\LZK{Il me semble que le bw et lat des deux réseaux varient dans les expés d'une simu à l'autre. On vire la dernière phrase?}
-\RC{il me semble qu'on peut laisser ca}
+%\LZK{Il me semble que le bw et lat des deux réseaux varient dans les expés d'une simu à l'autre. On vire la dernière phrase?}
+%\RC{il me semble qu'on peut laisser ca}
  
  \textbf{Step 5}: Conduct an extensive and comprehensive testings
  within these configurations by varying the key parameters, especially
  
  \textbf{Step 5}: Conduct an extensive and comprehensive testings
  within these configurations by varying the key parameters, especially
@@ -591,7 +591,7 @@ the Krylov two-stage algorithm.
  %\RC{Les légendes ne sont pas explicites...}
  %\RCE{Corrige}
  
  %\RC{Les légendes ne sont pas explicites...}
  %\RCE{Corrige}
  
-\begin{figure} [ht!]
+\begin{figure} [htbp]
    \begin{center}
      \includegraphics[width=100mm]{cluster_x_nodes_nx_150_and_nx_170.pdf}
    \end{center}
    \begin{center}
      \includegraphics[width=100mm]{cluster_x_nodes_nx_150_and_nx_170.pdf}
    \end{center}
@@ -641,12 +641,12 @@ the  network speed  drops down (variation of 12.5\%), the  difference between  t
  
  
  %\begin{wrapfigure}{l}{100mm}
  
  
  %\begin{wrapfigure}{l}{100mm}
-\begin{figure} [ht!]
+\begin{figure} [htbp]
  \centering
  \includegraphics[width=100mm]{cluster_x_nodes_n1_x_n2.pdf}
  \centering
  \includegraphics[width=100mm]{cluster_x_nodes_n1_x_n2.pdf}
-\caption{Various grid configurations with networks N1 vs N2
-\AG{\np{8E-6}, \np{5E-6} au lieu de 8E-6, 5E-6}}
-\RCE{Corrige}
+\caption{Various grid configurations with networks N1 vs N2}
+%\AG{\np{8E-6}, \np{5E-6} au lieu de 8E-6, 5E-6}}
+%\RCE{Corrige}
  \label{fig:02}
  \end{figure}
  %\end{wrapfigure}
  \label{fig:02}
  \end{figure}
  %\end{wrapfigure}
@@ -667,21 +667,22 @@ the  network speed  drops down (variation of 12.5\%), the  difference between  t
  \label{tab:03}
  \end{table}
  
  \label{tab:03}
  \end{table}
  
-\begin{figure} [ht!]
+\begin{figure} [htbp]
  \centering
  \includegraphics[width=100mm]{network_latency_impact_on_execution_time.pdf}
  \centering
  \includegraphics[width=100mm]{network_latency_impact_on_execution_time.pdf}
-\caption{Network latency impacts on execution time
-\AG{\np{E-6}}}
+\caption{Network latency impacts on execution time}
+%\AG{\np{E-6}}}
  \label{fig:03}
  \end{figure}
  
  \label{fig:03}
  \end{figure}
  
-According to  the results of  Figure~\ref{fig:03}, a degradation of  the network
-latency from  $8.10^{-6}$ to  $6.10^{-5}$ implies an  absolute time  increase of
-more  than $75\%$  (resp.  $82\%$)  of the  execution  for  the classical  GMRES
-(resp.  Krylov multisplitting)  algorithm which means that the GMRES seems tolerate more the network latency variation with a less  rate increase  of  the  execution time. However, the execution time factor between the two algorithms varies from 2.2 to 1.5 times with a network latency decreasing from $8.10^{-6}$ to  $6.10^{-5}$.
+In Table~\ref{tab:03}, parameters  for the influence of the  network latency are
+reported.  According to the results of Figure~\ref{fig:03}, a degradation of the
+network  latency  from  $8.10^{-6}$  to $6.10^{-5}$  implies  an  absolute  time
+increase of more than $75\%$ (resp.   $82\%$) of the execution for the classical
+GMRES  (resp.   Krylov  multisplitting)  algorithm. The  execution  time  factor
+between the two algorithms  varies from 2.2 to 1.5 times  with a network latency
+increasing from $8.10^{-6}$ to $6.10^{-5}$.
  
  
-\RC{Les  2  précédentes phrases  me  semblent en contradiction....}  
-\RCE{Reformule}
  
  \subsubsection{Network bandwidth impacts on performance}
  \ \\
  
  \subsubsection{Network bandwidth impacts on performance}
  \ \\
@@ -694,18 +695,19 @@ more  than $75\%$  (resp.  $82\%$)  of the  execution  for  the classical  GMRES
                            & $lat$= 5.10$^{-5}$ second \\
   Input matrix size & $N_{x} \times N_{y} \times N_{z} =150 \times 150 \times 150$\\ \hline \\
   \end{tabular}
                            & $lat$= 5.10$^{-5}$ second \\
   Input matrix size & $N_{x} \times N_{y} \times N_{z} =150 \times 150 \times 150$\\ \hline \\
   \end{tabular}
-\caption{Test conditions: Network bandwidth impacts\RC{Qu'est ce qui varie ici? Il n'y a pas de variation dans le tableau}}
-\RCE{C est le bw}
+\caption{Test conditions: Network bandwidth impacts}
+%  \RC{Qu'est ce qui varie ici? Il n'y a pas de variation dans le tableau}
+%\RCE{C est le bw}
  \label{tab:04}
  \end{table}
  
  
  \label{tab:04}
  \end{table}
  
  
-\begin{figure} [ht!]
+\begin{figure} [htbp]
  \centering
  \includegraphics[width=100mm]{network_bandwith_impact_on_execution_time.pdf}
  \centering
  \includegraphics[width=100mm]{network_bandwith_impact_on_execution_time.pdf}
-\caption{Network bandwith impacts on execution time
-\AG{``Execution time'' avec un 't' minuscule}. Idem autres figures.}
-\RCE{Corrige}
+\caption{Network bandwidth impacts on execution time}
+%\AG{``Execution time'' avec un 't' minuscule}. Idem autres figures.}
+%\RCE{Corrige}
  \label{fig:04}
  \end{figure}
  
  \label{fig:04}
  \end{figure}
  
@@ -730,7 +732,7 @@ of $40\%$ which is only around $24\%$ for the classical GMRES.
  \end{table}
  
  
  \end{table}
  
  
-\begin{figure} [ht!]
+\begin{figure} [htbp]
  \centering
  \includegraphics[width=100mm]{pb_size_impact_on_execution_time.pdf}
  \caption{Problem size impacts on execution time}
  \centering
  \includegraphics[width=100mm]{pb_size_impact_on_execution_time.pdf}
  \caption{Problem size impacts on execution time}
@@ -759,7 +761,7 @@ grid 2 $\times$ 16 leading to the same conclusion.
  
  \subsubsection{CPU Power impacts on performance}
  
  
  \subsubsection{CPU Power impacts on performance}
  
-\begin{table} [ht!]
+\begin{table} [htbp]
  \centering
  \begin{tabular}{r c }
   \hline
  \centering
  \begin{tabular}{r c }
   \hline
@@ -811,18 +813,19 @@ synchronization  with   the  other   processors.  Thus,  the   asynchronous  may
  theoretically reduce  the overall execution  time and can improve  the algorithm
  performance.
  
  theoretically reduce  the overall execution  time and can improve  the algorithm
  performance.
  
-\RC{la phrase suivante est bizarre, je ne comprends pas pourquoi elle vient ici}
-\RCE{C est la description du dernier test sync/async avec l'introduction de la notion de relative gain}
-In this section, Simgrid simulator tool has been successfully used to show
-the efficiency of  the multisplitting in asynchronous mode and  to find the best
-combination of the grid resources (CPU,  Network, input matrix size, \ldots ) to
-get    the   highest    \textit{"relative    gain"}   (exec\_time$_{GMRES}$    /
-exec\_time$_{multisplitting}$) in comparison with the classical GMRES time.
+In this section,  the SimGrid simulator is  used to compare the  behavior of the
+multisplitting in  asynchronous mode  with GMRES  in synchronous  mode.  Several
+benchmarks have  been performed with various combinations of the  grid resources
+(CPU, Network, input  matrix size, \ldots ). The test  conditions are summarized
+in  Table~\ref{tab:07}. In  order to  compare  the execution  times, this  table
+reports the  relative gain between both  algorithms. It is defined  by the ratio
+between  the   execution  time  of   GMRES  and   the  execution  time   of  the
+multisplitting.  The  ratio  is  greater  than  one  because  the  asynchronous
+multisplitting version is faster than GMRES.
  
  
  
  
-The test conditions are summarized in the table~\ref{tab:07}: \\
  
  
-\begin{table} [ht!]
+\begin{table} [htbp]
  \centering
  \begin{tabular}{r c }
   \hline
  \centering
  \begin{tabular}{r c }
   \hline
@@ -872,7 +875,7 @@ geographically distant clusters through the internet.
      power (GFlops)
      & 1    & 1    & 1    & 1.5       & 1.5  & 1.5         & 1.5         & 1         & 1.5       & 1.5 \\
      \hline
      power (GFlops)
      & 1    & 1    & 1    & 1.5       & 1.5  & 1.5         & 1.5         & 1         & 1.5       & 1.5 \\
      \hline
-    size (N)
+    size ($N^3$)
      & 62  & 62   & 62        & 100       & 100 & 110       & 120       & 130       & 140       & 150 \\
      \hline
      Precision
      & 62  & 62   & 62        & 100       & 100 & 110       & 120       & 130       & 140       & 150 \\
      \hline
      Precision