new

[book_gpu.git] / BookGPU / Chapters / chapter6 / PartieAsync.tex
diff --git a/BookGPU/Chapters/chapter6/PartieAsync.tex b/BookGPU/Chapters/chapter6/PartieAsync.tex

index 9c8de06f9f26509534e71029fa88a7b85b0d9b04..3f4a5394d1774fe52f4086dc4955e9590a8aba50 100644 (file)
--- a/BookGPU/Chapters/chapter6/PartieAsync.tex
+++ b/BookGPU/Chapters/chapter6/PartieAsync.tex
@@ -2,19 +2,19 @@
  computation/communication overlapping}
  \label{ch6:part2}
  
  computation/communication overlapping}
  \label{ch6:part2}
  
-In the previous part, we have seen how to efficiently implement overlap of
-computations (CPU and GPU) with communications (GPU transfers and inter-node
+In the previous section, we have seen how to efficiently implement overlap of
+computations (CPU and GPU) with communications (GPU transfers and internode
  communications).  However, we have previously shown that for some parallel
  iterative algorithms, it is sometimes even more efficient to use an asynchronous
  communications).  However, we have previously shown that for some parallel
  iterative algorithms, it is sometimes even more efficient to use an asynchronous
-scheme of iterations\index{iterations!asynchronous} \cite{HPCS2002,ParCo05,Para10}.  In that case, the nodes do
-not wait for each others but they perform their iterations using the last
+scheme of iterations\index{asynchronous iterations} \cite{HPCS2002,ParCo05,Para10}.  In that case, the nodes do
+not wait for each other but they perform their iterations using the last
  external data they have received from the other nodes, even if this
  data was produced \emph{before} the previous iteration on the other nodes.
  
  Formally, if we denote by $f=(f_1,...,f_n)$ the function representing the
  iterative process and by $x^t=(x_1^t,...,x_n^t)$ the values of the $n$ elements of
  the system at iteration $t$, we pass from a synchronous iterative scheme of the
  external data they have received from the other nodes, even if this
  data was produced \emph{before} the previous iteration on the other nodes.
  
  Formally, if we denote by $f=(f_1,...,f_n)$ the function representing the
  iterative process and by $x^t=(x_1^t,...,x_n^t)$ the values of the $n$ elements of
  the system at iteration $t$, we pass from a synchronous iterative scheme of the
-form:
+form given in \Alg{algo:ch6p2sync}
  %% \begin{algorithm}[H]
  %%   \caption{Synchronous iterative scheme}\label{algo:ch6p2sync}
  %%   \begin{Algo}
  %% \begin{algorithm}[H]
  %%   \caption{Synchronous iterative scheme}\label{algo:ch6p2sync}
  %%   \begin{Algo}
@@ -27,7 +27,7 @@ form:
  %%   \end{Algo}
  %% \end{algorithm}
  \begin{algorithm}[H]
  %%   \end{Algo}
  %% \end{algorithm}
  \begin{algorithm}[H]
-  \caption{Synchronous iterative scheme}\label{algo:ch6p2sync}
+  \caption{synchronous iterative scheme}\label{algo:ch6p2sync}
    $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\;
    \For{ $t=0,1,...$} {
      \For{ $i=1,...,n$}{
    $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\;
    \For{ $t=0,1,...$} {
      \For{ $i=1,...,n$}{
@@ -36,9 +36,8 @@ form:
    }
  \end{algorithm}
  
    }
  \end{algorithm}
  
-
  \noindent
  \noindent
-to an asynchronous iterative scheme of the form:\\
+to an asynchronous iterative scheme of the form given in \Alg{algo:ch6p2async}.\\
  %% \begin{algorithm}[H]
  %%   \caption{Asynchronous iterative scheme}\label{algo:ch6p2async}
  %%   \begin{Algo}
  %% \begin{algorithm}[H]
  %%   \caption{Asynchronous iterative scheme}\label{algo:ch6p2async}
  %%   \begin{Algo}
@@ -56,7 +55,7 @@ to an asynchronous iterative scheme of the form:\\
  %%   \end{Algo}
  %% \end{algorithm}
  \begin{algorithm}[H]
  %%   \end{Algo}
  %% \end{algorithm}
  \begin{algorithm}[H]
-  \caption{Asynchronous iterative scheme}\label{algo:ch6p2async}
+  \caption{asynchronous iterative scheme}\label{algo:ch6p2async}
      $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\;
      \For {$t=0,1,...$} {
        \For{ $i=1,...,n$} {
      $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\;
      \For {$t=0,1,...$} {
        \For{ $i=1,...,n$} {
@@ -69,8 +68,8 @@ to an asynchronous iterative scheme of the form:\\
        }
      }
  \end{algorithm}
        }
      }
  \end{algorithm}
-where $s_j^i(t)$ is the iteration number of the production of the value $x_j$ of
-element $j$  that is  used on element  $i$ at iteration  $t$ (see for example~\cite{BT89,
+In this scheme, $s_j^i(t)$ is the iteration number of the production of the value $x_j$ of
+element $j$  that is  used on element  $i$ at iteration  $t$ (see, for example,~\cite{BT89,
    FS2000} for further details).
  Such schemes  are called AIAC\index{AIAC} for  \emph{Asynchronous Iterations and
    Asynchronous Communications}.  They combine  two aspects that are respectively
    FS2000} for further details).
  Such schemes  are called AIAC\index{AIAC} for  \emph{Asynchronous Iterations and
    Asynchronous Communications}.  They combine  two aspects that are respectively
@@ -87,7 +86,7 @@ which new data arrives on each node. Typically,
  if a node receives newer data only every four or five local iterations, it is
  strongly probable that the evolution of its local iterative process will be
  slower than if it receives data at every iteration. The key point here is that
  if a node receives newer data only every four or five local iterations, it is
  strongly probable that the evolution of its local iterative process will be
  slower than if it receives data at every iteration. The key point here is that
-this frequency does not only depend on the hardware configuration of the
+not only does this frequency  depend on the hardware configuration of the
  parallel system but it also depends on the software that is used to
  implement the algorithmic scheme.
  
  parallel system but it also depends on the software that is used to
  implement the algorithmic scheme.
  
@@ -95,11 +94,11 @@ The impact of the programming environments used to implement asynchronous
  algorithms has already been investigated in~\cite{SuperCo05}.  Although the
  features required to efficiently implement asynchronous schemes have not
  changed, the available programming environments and computing hardware have
  algorithms has already been investigated in~\cite{SuperCo05}.  Although the
  features required to efficiently implement asynchronous schemes have not
  changed, the available programming environments and computing hardware have
-evolved, in particular now GPUs are available.  So, there is a need to reconsider the
+evolved, in particular now that GPUs are available.  So, there is a need to reconsider the
  implementation schemes of AIAC according to the new de facto standards for parallel
  programming (communications and threads) as well as the integration of the GPUs.
  One of the main objective here is to obtain a maximal overlap between the
  implementation schemes of AIAC according to the new de facto standards for parallel
  programming (communications and threads) as well as the integration of the GPUs.
  One of the main objective here is to obtain a maximal overlap between the
-activities of the three types of devices that are the CPU, the GPU and the
+activities of the three types of devices: the CPU, the GPU, and the
  network. Moreover, another objective is to present what we think
  is the best compromise between the simplicity of the implementation and its
  maintainability on one side and its
  network. Moreover, another objective is to present what we think
  is the best compromise between the simplicity of the implementation and its
  maintainability on one side and its
@@ -112,16 +111,16 @@ with full overlap.  Between these two extremes, we propose a synchronization
  mechanism on top of our asynchronous scheme that can be used either statically or
  dynamically during the application execution.
  
  mechanism on top of our asynchronous scheme that can be used either statically or
  dynamically during the application execution.
  
-Although there exist several programming environments for inter-node
-communications, multi-threading and GPU programming, a few of them have
-become \emph{de facto standards}, either due to their good stability, their ease
-of use and/or their wide adoption by the scientific community.
-Therefore, as in the previous section all the schemes presented in the following use MPI~\cite{MPI},
-OpenMP~\cite{openMP} and CUDA~\cite{CUDA}.  However, there is no loss of
-generality as those schemes may easily be implemented with other libraries.
+Although there exist several programming environments for internode
+communications, multithreading, and GPU programming, a few of them have
+become \emph{de facto standards}, due to their good stability, their ease
+of use, and/or their wide adoption by the scientific community.
+Therefore, as in the previous section, all the schemes presented in the following use MPI~\cite{MPI},
+OpenMP~\cite{openMP}, and CUDA~\cite{CUDA}.  However, there is no loss of
+generality as these schemes may easily be implemented with other libraries.
  
  Finally, in order to stay as clear as possible, only the parts of code and
  
  Finally, in order to stay as clear as possible, only the parts of code and
-variables related to the control of parallelism (communications, threads,...)
+variables related to the control of parallelism (communications, threads, etc.)
  are presented in our schemes. The inner organization of data is not detailed as
  it depends on the application.  We only consider that we have two data arrays
  (previous version and current version) and communication buffers.  However, in
  are presented in our schemes. The inner organization of data is not detailed as
  it depends on the application.  We only consider that we have two data arrays
  (previous version and current version) and communication buffers.  However, in
@@ -134,17 +133,17 @@ avoid data copies.
  The first step toward our complete scheme is to implement a basic asynchronous
  scheme that includes an actual overlap of the communications with the
  computations\index{overlap!computation and communication}. In order to ensure that the communications are actually performed
  The first step toward our complete scheme is to implement a basic asynchronous
  scheme that includes an actual overlap of the communications with the
  computations\index{overlap!computation and communication}. In order to ensure that the communications are actually performed
-in parallel of the computations, it is necessary to use different threads.  It
+in parallel with the computations, it is necessary to use different threads.  It
  is important to remember that asynchronous communications provided in
  is important to remember that asynchronous communications provided in
-communication libraries like MPI are not systematically performed in parallel of
+communication libraries such as MPI are not systematically performed in parallel with
  the computations~\cite{ChVCV13,Hoefler08a}.  So, the logical and classical way
  to implement such an overlap is to use three threads: one for
  the computations~\cite{ChVCV13,Hoefler08a}.  So, the logical and classical way
  to implement such an overlap is to use three threads: one for
-computing, one for sending and one for receiving. Moreover, since
-the communication is performed by threads, blocking synchronous communications\index{MPI!communication!blocking}\index{MPI!communication!synchronous}
+computing, one for sending, and one for receiving. Moreover, since
+the communication is performed by threads, blocking synchronous communications\index{MPI!blocking}\index{MPI!synchronous}
  can be used without deteriorating the overall performance.
  
  In this basic version, the termination\index{termination} of the global process is performed
  can be used without deteriorating the overall performance.
  
  In this basic version, the termination\index{termination} of the global process is performed
-individually on each node according to their own termination. This can be guided by either a
+individually on each node according to its own termination. This can be guided by either a
  number of iterations or a local convergence detection\index{convergence detection}. The important step at
  the end of the process is to perform the receptions of all pending
  communications in order to ensure the termination of the two communication
  number of iterations or a local convergence detection\index{convergence detection}. The important step at
  the end of the process is to perform the receptions of all pending
  communications in order to ensure the termination of the two communication
@@ -155,7 +154,7 @@ So, the global organization of this scheme is set up in \Lst{algo:ch6p2BasicAsyn
  % \begin{algorithm}[H]
  %   \caption{Initialization of the basic asynchronous scheme.}
  %   \label{algo:ch6p2BasicAsync}
  % \begin{algorithm}[H]
  %   \caption{Initialization of the basic asynchronous scheme.}
  %   \label{algo:ch6p2BasicAsync}
-\begin{Listing}{algo:ch6p2BasicAsync}{Initialization of the basic asynchronous scheme}
+\begin{Listing}{algo:ch6p2BasicAsync}{initialization of the basic asynchronous scheme}
  // Variables declaration and initialization
  // Controls the sendings from the computing thread
  omp_lock_t lockSend; 
  // Variables declaration and initialization
  // Controls the sendings from the computing thread
  omp_lock_t lockSend; 
@@ -225,7 +224,7 @@ MPI_Finalize();
  In this scheme, the \texttt{lockRec} mutex\index{OpenMP!mutex} is not mandatory.
  It is only used to ensure that data dependencies are actually exchanged at the
  first iteration of the process.  Data initialization and distribution
  In this scheme, the \texttt{lockRec} mutex\index{OpenMP!mutex} is not mandatory.
  It is only used to ensure that data dependencies are actually exchanged at the
  first iteration of the process.  Data initialization and distribution
-(lines~16-17) are not detailed here because they are directly related to the
+(lines~20--21) are not detailed here because they are directly related to the
  application. The important point is that, in most cases, they should be done
  before the iterative process.  The computing function is given in
  \Lst{algo:ch6p2BasicAsyncComp}.
  application. The important point is that, in most cases, they should be done
  before the iterative process.  The computing function is given in
  \Lst{algo:ch6p2BasicAsyncComp}.
@@ -233,7 +232,7 @@ before the iterative process.  The computing function is given in
  %\begin{algorithm}[H]
  %  \caption{Computing function in the basic asynchronous scheme.}
  %  \label{algo:ch6p2BasicAsyncComp}
  %\begin{algorithm}[H]
  %  \caption{Computing function in the basic asynchronous scheme.}
  %  \label{algo:ch6p2BasicAsyncComp}
-\begin{Listing}{algo:ch6p2BasicAsyncComp}{Computing function in the basic asynchronous scheme}
+\begin{Listing}{algo:ch6p2BasicAsyncComp}{computing function in the basic asynchronous scheme}
  // Variables declaration and initialization
  int iter = 1;      // Number of the current iteration
  double difference; // Variation of one element between two iterations
  // Variables declaration and initialization
  int iter = 1;      // Number of the current iteration
  double difference; // Variation of one element between two iterations
@@ -241,10 +240,10 @@ double residual;   // Residual of the current iteration
  
  // Computation loop
  while(!Finished){
  
  // Computation loop
  while(!Finished){
-  // Sendings of data dependencies if there is no previous sending
+  // Sending of data dependencies if there is no previous sending
    // in progress
    if(!SendsInProgress){
    // in progress
    if(!SendsInProgress){
-    // Potential copy of data to be sent in additional buffers
+    // Potential copy of data to be sent into additional buffers
      ...
      // Change of sending state
      SendsInProgress = 1;
      ...
      // Change of sending state
      SendsInProgress = 1;
@@ -288,30 +287,29 @@ while(!Finished){
  \end{Listing}
  %\end{algorithm}
  
  \end{Listing}
  %\end{algorithm}
  
-As mentioned above, it can be seen in line~18 of \Lst{algo:ch6p2BasicAsyncComp}
+As mentioned above, it can be seen in lines~19--21 of \Lst{algo:ch6p2BasicAsyncComp}
  that the \texttt{lockRec} mutex is used only at the first iteration to wait for
  the initial data dependencies before the computations. The residual\index{residual}, initialized
  that the \texttt{lockRec} mutex is used only at the first iteration to wait for
  the initial data dependencies before the computations. The residual\index{residual}, initialized
-in line~23 and computed in lines~34-37, is defined by the maximal difference
+in line~24 and computed in lines~35--38, is defined by the maximal difference
  between the elements from two consecutive iterations. It is classically used to
  detect the local convergence of the process on each node.  In the more
  complete schemes presented in the sequel, a global termination detection that
  takes the states of all the nodes into account will be exhibited.
  
  between the elements from two consecutive iterations. It is classically used to
  detect the local convergence of the process on each node.  In the more
  complete schemes presented in the sequel, a global termination detection that
  takes the states of all the nodes into account will be exhibited.
  
-Finally, the local convergence is tested and updated when necessary. In line~44,
+Finally, the local convergence is tested and updated when necessary. In line~45,
  the \texttt{lockSend} mutex is unlocked to allow the sending function to send
  final messages to the dependency nodes.  Those messages are required to keep the
  reception function alive until all the final messages have been received.
  the \texttt{lockSend} mutex is unlocked to allow the sending function to send
  final messages to the dependency nodes.  Those messages are required to keep the
  reception function alive until all the final messages have been received.
-Otherwise, a node could stop its reception function whereas other nodes are
+Otherwise, a node could stop its reception function while other nodes are
  still trying to communicate with it.  Moreover, a local sending of a final
  still trying to communicate with it.  Moreover, a local sending of a final
-message to the node itself is required (line~45) to ensure that the reception
+message to the node itself is required (line~46) to ensure that the reception
  function will not stay blocked in a message probing
  function will not stay blocked in a message probing
-(see~\Lst{algo:ch6p2BasicAsyncReceptions}, line~11). This may happen if the node
-receives the final messages from its dependencies \emph{before} being itself in
+(see~\Lst{algo:ch6p2BasicAsyncReceptions}, line~12). This may happen if the node
+receives the final messages from its dependencies \emph{before} reaching its own
  local convergence.
  
  All the messages but this final local one are performed in the sending function
  described in \Lst{algo:ch6p2BasicAsyncSendings}.
  local convergence.
  
  All the messages but this final local one are performed in the sending function
  described in \Lst{algo:ch6p2BasicAsyncSendings}.
-
  The main loop is only conditioned by the end of the computing process (line~4).
  At each iteration, the thread waits for the permission from the computing thread
  (according to the \texttt{lockSend} mutex).  Then, data are sent with
  The main loop is only conditioned by the end of the computing process (line~4).
  At each iteration, the thread waits for the permission from the computing thread
  (according to the \texttt{lockSend} mutex).  Then, data are sent with
@@ -324,7 +322,7 @@ the main loop, the final messages are sent to the dependencies of the node.
  %\begin{algorithm}[H]
  %  \caption{Sending function in the basic asynchronous scheme.}
  %  \label{algo:ch6p2BasicAsyncSendings}
  %\begin{algorithm}[H]
  %  \caption{Sending function in the basic asynchronous scheme.}
  %  \label{algo:ch6p2BasicAsyncSendings}
-\begin{Listing}{algo:ch6p2BasicAsyncSendings}{Sending function in the basic asynchronous scheme}
+\begin{Listing}{algo:ch6p2BasicAsyncSendings}{sending function in the basic asynchronous scheme}
  // Variables declaration and initialization
  ...
  
  // Variables declaration and initialization
  ...
  
@@ -351,10 +349,10 @@ The last function, detailed in \Lst{algo:ch6p2BasicAsyncReceptions}, does all th
  %  \label{algo:ch6p2BasicAsyncReceptions}
  \begin{Listing}{algo:ch6p2BasicAsyncReceptions}{Reception function in the basic asynchronous scheme}
  // Variables declaration and initialization
  %  \label{algo:ch6p2BasicAsyncReceptions}
  \begin{Listing}{algo:ch6p2BasicAsyncReceptions}{Reception function in the basic asynchronous scheme}
  // Variables declaration and initialization
-char countReceipts = 0; // Boolean indicating whether receptions are 
-// counted or not
+char countReceipts = 1; // Boolean indicating whether receptions are 
+                        // counted or not
  int nbEndMsg = 0;       // Number of end messages received
  int nbEndMsg = 0;       // Number of end messages received
-int arrived = 0;        // Boolean indicating if a message is arrived
+int arrived = 0;        // Boolean indicating if a message has arrived
  int srcNd;              // Source node of the message
  int size;               // Message size
  
  int srcNd;              // Source node of the message
  int size;               // Message size
  
@@ -386,7 +384,7 @@ while(!Finished){
  }
  
  // Reception of pending messages and counting of end messages
  }
  
  // Reception of pending messages and counting of end messages
-do{ // Loop over the remaining incoming/waited messages
+do{ // Loop over the remaining incoming/end messages
    MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
    MPI_Get_count(&status, MPI_CHAR, &size);
    // Actual reception in dummy buffer
    MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
    MPI_Get_count(&status, MPI_CHAR, &size);
    // Actual reception in dummy buffer
@@ -399,24 +397,26 @@ do{ // Loop over the remaining incoming/waited messages
  \end{Listing}
  %\end{algorithm}
  
  \end{Listing}
  %\end{algorithm}
  
-As in the sending function, the main loop of receptions is done while the
-iterative process is not \texttt{Finished}.  In line~11, the thread waits until
-a message arrives on the node.  Then, it performs the actual reception and the
-corresponding subsequent actions (potential data copies for data messages and
-counting for end messages). Lines 20-23 check that all data dependencies have
-been received before unlocking the \texttt{lockRec} mutex. As mentioned before,
-they are not mandatory and are included only to ensure that all data
-dependencies are received at the first iteration. Lines 25-28 are required to
-manage end messages that arrive on the node \emph{before} it reaches its own
-termination process. As the nodes are \emph{not} synchronized, this may happen.
-Finally, lines 34-43 perform the receptions of all pending communications,
-including the remaining end messages (at least the one from the node itself).
+As  in the  sending function,  the main  loop of  receptions is  done  while the
+iterative process is not \texttt{Finished}.   In line~12, the thread waits until
+a message arrives  on the node.  Then, it performs the  actual reception and the
+corresponding subsequent  actions (potential data  copies for data  messages and
+counting for  end messages). Lines~23--26  check, only at the  first iteration of
+computations, that all data dependencies have been received before unlocking the
+\texttt{lockRec} mutex. %As mentioned previously,  they are not mandatory and are included only  to
+Although this is not mandatory, it ensures that  all data dependencies  are
+received before starting the computations. % at  the first iteration.
+Lines~28--31 are required to  manage end messages that  arrive on the
+node \emph{before}  it reaches  its own termination  process.  As the  nodes are
+\emph{not}  synchronized, this  may happen.   Finally, lines~37--46  perform the
+receptions of  all pending communications, including the  remaining end messages
+(at least the one from the node itself).
  
  \medskip
  
  \medskip
-So, with those algorithms, we obtain a quite simple and efficient asynchronous
+So, with these algorithms, we obtain a quite simple and efficient asynchronous
  iterative scheme. It is interesting to notice that GPU computing can be easily
  included in the computing thread.  This will be fully addressed in
  iterative scheme. It is interesting to notice that GPU computing can be easily
  included in the computing thread.  This will be fully addressed in
-paragraph~\ref{ch6:p2GPUAsync}.  However, before presenting the complete
+Section~\ref{ch6:p2GPUAsync}.  However, before presenting the complete
  asynchronous scheme with GPU computing, we have to detail how our initial scheme
  can be made synchronous.
  
  asynchronous scheme with GPU computing, we have to detail how our initial scheme
  can be made synchronous.
  
@@ -438,11 +438,11 @@ threshold.
  % code, which tends to improve  the implementation and maintenance costs when both
  % versions are required.
  % The  second one  is  that
  % code, which tends to improve  the implementation and maintenance costs when both
  % versions are required.
  % The  second one  is  that
-In our context, the interest of being able to dynamically change the operating
-mode (sync/async) during the process execution, is that this strongly simplifies
+In our context,  being able to dynamically change the operating
+mode (sync/async) during the process execution  strongly simplifies
  the global convergence detection.  In fact, our past experience in the design
  and implementation of global convergence detection in asynchronous
  the global convergence detection.  In fact, our past experience in the design
  and implementation of global convergence detection in asynchronous
-algorithms~\cite{SuperCo05, BCC07, Vecpar08a}, have led us to the conclusion
+algorithms~\cite{SuperCo05, BCC07, Vecpar08a} has led us to the conclusion
  that although a decentralized detection scheme is possible and may be more
  efficient in some situations, its much higher complexity is an
  obstacle to actual use in practice, especially in industrial contexts where
  that although a decentralized detection scheme is possible and may be more
  efficient in some situations, its much higher complexity is an
  obstacle to actual use in practice, especially in industrial contexts where
@@ -454,13 +454,13 @@ consists in dynamically changing the operating mode between asynchronous and syn
  during the execution of the process in order to check the global convergence.
  This is why we need to synchronize our asynchronous scheme.
  
  during the execution of the process in order to check the global convergence.
  This is why we need to synchronize our asynchronous scheme.
  
-In each  algorithm of the  initial scheme, we  only exhibit the  additional code
+In each  algorithm of the  initial scheme, we  only give the  additional code
  required to change the operating mode.
  
  %\begin{algorithm}[H]
  %  \caption{Initialization of the synchronized scheme.}
  %  \label{algo:ch6p2Sync}
  required to change the operating mode.
  
  %\begin{algorithm}[H]
  %  \caption{Initialization of the synchronized scheme.}
  %  \label{algo:ch6p2Sync}
-\begin{Listing}{algo:ch6p2Sync}{Initialization of the synchronized scheme}
+\begin{Listing}{algo:ch6p2Sync}{initialization of the synchronized scheme}
  // Variables declarations and initialization
  ...
  // Controls the synchronous exchange of local states 
  // Variables declarations and initialization
  ...
  // Controls the synchronous exchange of local states 
@@ -516,9 +516,9 @@ receptions of all state messages coming from the other nodes.  As shown
  in \Lst{algo:ch6p2SyncComp}, those messages contain only a boolean indicating
  for each node if it is in local convergence\index{convergence!local}. So, once all the states are
  received on a node, it is possible to determine if all the nodes are in local
  in \Lst{algo:ch6p2SyncComp}, those messages contain only a boolean indicating
  for each node if it is in local convergence\index{convergence!local}. So, once all the states are
  received on a node, it is possible to determine if all the nodes are in local
-convergence, and thus to detect the global convergence.  The \texttt{lockIter}
+convergence and, thus, to detect the global convergence.  The \texttt{lockIter}
  mutex is used to synchronize all the nodes at the end of each iteration.  There
  mutex is used to synchronize all the nodes at the end of each iteration.  There
-are also two new variables that respectively represent the local state of the
+are also two new variables that  represent the local state of the
  node (\texttt{localCV}) according to the iterative process (convergence) and the
  number of other nodes that are in local convergence (\texttt{nbOtherCVs}).
  
  node (\texttt{localCV}) according to the iterative process (convergence) and the
  number of other nodes that are in local convergence (\texttt{nbOtherCVs}).
  
@@ -528,18 +528,18 @@ in \Lst{algo:ch6p2SyncComp}.
  %\begin{algorithm}[H]
  %  \caption{Computing function in the synchronized scheme.}
  %  \label{algo:ch6p2SyncComp}
  %\begin{algorithm}[H]
  %  \caption{Computing function in the synchronized scheme.}
  %  \label{algo:ch6p2SyncComp}
-\begin{Listing}{algo:ch6p2SyncComp}{Computing function in the synchronized scheme}
+\begin{Listing}{algo:ch6p2SyncComp}{computing function in the synchronized scheme}
  // Variables declarations and initialization
  ...
  
  // Computation loop
  while(!Finished){
  // Variables declarations and initialization
  ...
  
  // Computation loop
  while(!Finished){
-  // Sendings of data dependencies at @\emph{each}@ iteration
+  // Sending of data dependencies at @\color{white}\emph{\textbf{each}}@ iteration
    // Potential copy of data to be sent in additional buffers
    ...
    omp_unset_lock(&lockSend);
  
    // Potential copy of data to be sent in additional buffers
    ...
    omp_unset_lock(&lockSend);
  
-  // Blocking receptions at @\emph{each}@ iteration
+  // Blocking receptions at @\color{white}\emph{\textbf{each}}@ iteration
    omp_set_lock(&lockRec);
    
    // Local computation 
    omp_set_lock(&lockRec);
    
    // Local computation 
@@ -594,39 +594,39 @@ Most of the added code is related to the waiting for specific communications.
  Between lines~6 and~7, the use of the flag \texttt{SendsInProgress} is no longer
  needed since the sends are performed at each iteration.  In line~12, the
  thread waits for the data receptions from its dependencies. In
  Between lines~6 and~7, the use of the flag \texttt{SendsInProgress} is no longer
  needed since the sends are performed at each iteration.  In line~12, the
  thread waits for the data receptions from its dependencies. In
-lines~26-34, the local states are determined and exchanged among all nodes.  A
+lines~27--34, the local states are determined and exchanged among all nodes.  A
  new message tag (\texttt{tagState}) is required for identifying those messages.
  In line~37, the global termination state is determined.  When it is reached,
  new message tag (\texttt{tagState}) is required for identifying those messages.
  In line~37, the global termination state is determined.  When it is reached,
-lines~38-42 change the \texttt{Finished} boolean to stop the iterative process,
+lines~39--42 change the \texttt{Finished} boolean to stop the iterative process
  and send the end messages.  Otherwise each node resets its local state
  and send the end messages.  Otherwise each node resets its local state
-information about the other nodes and a global barrier is done between all the
+information about the other nodes and a global barrier is added between all the
  nodes at the end of each iteration with another new tag (\texttt{tagIter}).
  That barrier is needed to ensure that data messages from successive iterations
  are actually received during the \emph{same} iteration on the destination nodes.
  Nevertheless, it is not useful at the termination of the global process as it is
  replaced by the global exchange of end messages.
  
  nodes at the end of each iteration with another new tag (\texttt{tagIter}).
  That barrier is needed to ensure that data messages from successive iterations
  are actually received during the \emph{same} iteration on the destination nodes.
  Nevertheless, it is not useful at the termination of the global process as it is
  replaced by the global exchange of end messages.
  
-There is no big modification induced by the synchronization in the sending
-function. The only change could be the suppression of line~11 that is not useful
-in this case.  Apart from that, the function stays the same as
-in \Lst{algo:ch6p2BasicAsyncSendings}.
+There  is no  big modification  induced by  the synchronization  in  the sending
+function.     The     function    stays     almost     the     same    as     in
+\Lst{algo:ch6p2BasicAsyncSendings}. The only change  could be the suppression of
+line~11 that is not useful in this case.
  
  In the reception function, given in \Lst{algo:ch6p2SyncReceptions}, there are
  
  In the reception function, given in \Lst{algo:ch6p2SyncReceptions}, there are
-mainly two insertions (in lines~19-30 and 31-40), corresponding to the
+mainly two insertions (in lines~19--31 and 32--42), corresponding to the
  additional types of messages to receive.  There is also the insertion of three
  variables that are used for the receptions of the new message types.  In
  additional types of messages to receive.  There is also the insertion of three
  variables that are used for the receptions of the new message types.  In
-lines~24-29 and 34-39 are located messages counting and mutex unlocking
+lines~24--30 and 35--41 are located messages counting and mutex unlocking
  mechanisms that are used to block the computing thread at the corresponding steps of its
  execution. They are similar to the mechanism used for managing the end messages
  at the end of the entire process.  Line~23 directly updates the
  number of other nodes that are in local convergence by adding the
  received state of the source node. This is possible due to the encoding that is used to
  mechanisms that are used to block the computing thread at the corresponding steps of its
  execution. They are similar to the mechanism used for managing the end messages
  at the end of the entire process.  Line~23 directly updates the
  number of other nodes that are in local convergence by adding the
  received state of the source node. This is possible due to the encoding that is used to
-represent the local convergence (1) and the non-convergence (0).
+represent the local convergence (1) and the non convergence (0).
  
  %\begin{algorithm}[H]
  %  \caption{Reception function in the synchronized scheme.}
  %  \label{algo:ch6p2SyncReceptions}
  
  %\begin{algorithm}[H]
  %  \caption{Reception function in the synchronized scheme.}
  %  \label{algo:ch6p2SyncReceptions}
-\begin{Listing}{algo:ch6p2SyncReceptions}{Reception function in the synchronized scheme}
+\begin{Listing}{algo:ch6p2SyncReceptions}{reception function in the synchronized scheme}
  // Variables declarations and initialization
  ...
  int nbStateMsg = 0; // Number of local state messages received
  // Variables declarations and initialization
  ...
  int nbStateMsg = 0; // Number of local state messages received
@@ -648,7 +648,7 @@ while(!Finished){
        case tagState: // Management of local state messages
         // Actual reception of the message
         MPI_Recv(&recvdState, 1, MPI_CHAR, status.MPI_SOURCE, tagState, MPI_COMM_WORLD, &status); 
        case tagState: // Management of local state messages
         // Actual reception of the message
         MPI_Recv(&recvdState, 1, MPI_CHAR, status.MPI_SOURCE, tagState, MPI_COMM_WORLD, &status); 
-    // Updates of numbers of stabilized nodes and received state msgs 
+       // Updates of numbers of stabilized nodes and received state msgs 
         nbOtherCVs += recvdState;
         nbStateMsg++;
         // Unlocking of the computing thread when states of all other 
         nbOtherCVs += recvdState;
         nbStateMsg++;
         // Unlocking of the computing thread when states of all other 
@@ -661,7 +661,7 @@ while(!Finished){
        case tagIter: // Management of "end of iteration" messages
         // Actual reception of the message in dummy buffer
         MPI_Recv(dummyBuffer, 1, MPI_CHAR, status.MPI_SOURCE, tagIter, MPI_COMM_WORLD, &status); 
        case tagIter: // Management of "end of iteration" messages
         // Actual reception of the message in dummy buffer
         MPI_Recv(dummyBuffer, 1, MPI_CHAR, status.MPI_SOURCE, tagIter, MPI_COMM_WORLD, &status); 
-       nbIterMsg++; // Update of the nb of iteration messages
+       nbIterMsg++; // Update of the number of iteration messages
         // Unlocking of the computing thread when iteration messages 
         // are received from all other nodes       
         if(nbIterMsg == nbP - 1){
         // Unlocking of the computing thread when iteration messages 
         // are received from all other nodes       
         if(nbIterMsg == nbP - 1){
@@ -674,7 +674,7 @@ while(!Finished){
  }
  
  // Reception of pending messages and counting of end messages
  }
  
  // Reception of pending messages and counting of end messages
-do{ // Loop over the remaining incoming/waited messages
+do{ // Loop over the remaining incoming/end messages
    ...
  }while(arrived == 1 || nbEndMsg < nbDeps + 1);
  \end{Listing}
    ...
  }while(arrived == 1 || nbEndMsg < nbDeps + 1);
  \end{Listing}
@@ -683,10 +683,10 @@ do{ // Loop over the remaining incoming/waited messages
  Now that we can synchronize our asynchronous scheme, the final step is to
  dynamically alternate the two operating modes in order to regularly check the
  global convergence of the iterative process. This is detailed in the following
  Now that we can synchronize our asynchronous scheme, the final step is to
  dynamically alternate the two operating modes in order to regularly check the
  global convergence of the iterative process. This is detailed in the following
-paragraph together with the inclusion of GPU computing in the final asynchronous
+section together with the inclusion of GPU computing in the final asynchronous
  scheme.
  
  scheme.
  
-\subsection{Asynchronous scheme using MPI, OpenMP and CUDA}
+\subsection{Asynchronous scheme using MPI, OpenMP, and CUDA}
  \label{ch6:p2GPUAsync}
  
  As mentioned above, the strategy proposed to obtain a good compromise between
  \label{ch6:p2GPUAsync}
  
  As mentioned above, the strategy proposed to obtain a good compromise between
@@ -701,34 +701,34 @@ convergence.
  
  The last problem is to decide \emph{when} to switch from the
  asynchronous to the synchronous mode.  Here again, for the sake of simplicity, any
  
  The last problem is to decide \emph{when} to switch from the
  asynchronous to the synchronous mode.  Here again, for the sake of simplicity, any
-asynchronous mechanism for \emph{detecting} such instant is avoided and we
+asynchronous mechanism for \emph{detecting} such moment is avoided, and we
  prefer to use a mechanism that is local to each node. Obviously, that local system must
  rely neither on the number of local iterations done nor on the local
  convergence.  The former would slow down the fastest nodes according to the
  slowest ones.  The latter would provoke too much synchronization because the
  prefer to use a mechanism that is local to each node. Obviously, that local system must
  rely neither on the number of local iterations done nor on the local
  convergence.  The former would slow down the fastest nodes according to the
  slowest ones.  The latter would provoke too much synchronization because the
-residuals on all nodes commonly do not evolve in the same way and, in most
+residuals on all nodes generally do not evolve in the same way, and in most
  cases, there is a convergence wave phenomenon throughout the elements.  So, a
  good solution is to insert a local timer mechanism on each node with a given
  initial duration. Then, that duration may be modified during the execution
  according to the successive results of the synchronous sections.
  
  Another problem induced by entering synchronous mode from the asynchronous one
  cases, there is a convergence wave phenomenon throughout the elements.  So, a
  good solution is to insert a local timer mechanism on each node with a given
  initial duration. Then, that duration may be modified during the execution
  according to the successive results of the synchronous sections.
  
  Another problem induced by entering synchronous mode from the asynchronous one
-is the possibility to receive some data messages
+is the possibility of receiving some data messages
  from previous asynchronous iterations during synchronous iterations. This could lead to deadlocks. In order
  from previous asynchronous iterations during synchronous iterations. This could lead to deadlocks. In order
-to avoid this, a wait of the end of previous send is added to the
+to avoid this, a wait for the end of previous send is added to the
  transition between the two modes.  This is implemented by replacing the variable
  transition between the two modes.  This is implemented by replacing the variable
-\texttt{SendsInProgress} by a mutex \texttt{lockSendsDone} which is unlocked
+\texttt{SendsInProgress} with a mutex \texttt{lockSendsDone} which is unlocked
  once all the messages have been sent in the sending function.  Moreover, it is
  also necessary to stamp data messages\index{message!stamping} (by the function \texttt{stampData}) with
  once all the messages have been sent in the sending function.  Moreover, it is
  also necessary to stamp data messages\index{message!stamping} (by the function \texttt{stampData}) with
-a Boolean indicating whether they have been sent during a synchronous or
+a boolean indicating whether they have been sent during a synchronous or
  asynchronous iteration.  Then, the \texttt{lockRec} mutex is unlocked only
  after to the complete reception of data messages from synchronous
  iterations.  The message ordering of point-to-point communications in MPI
  asynchronous iteration.  Then, the \texttt{lockRec} mutex is unlocked only
  after to the complete reception of data messages from synchronous
  iterations.  The message ordering of point-to-point communications in MPI
-together with the barrier at the end of each iteration ensure two important
+and the barrier at the end of each iteration ensure two important
  properties of this mechanism. First, data messages from previous
  asynchronous iterations will be received but not taken into account during
  properties of this mechanism. First, data messages from previous
  asynchronous iterations will be received but not taken into account during
-synchronous sections. Then, a data message from a synchronous
-iteration cannot be received in another synchronous iteration.  In the
+synchronous sections. Then, a data message from a given synchronous
+iteration cannot be received during another synchronous iteration.  In the
  asynchronous sections, no additional mechanism is needed as there are no such
  constraints concerning the data receptions.
  
  asynchronous sections, no additional mechanism is needed as there are no such
  constraints concerning the data receptions.
  
@@ -737,25 +737,25 @@ Finally, the required modifications of the previous scheme
  are mainly related to the computing thread.  Small additions or modifications
  are also required in the main process and the other threads.
  
  are mainly related to the computing thread.  Small additions or modifications
  are also required in the main process and the other threads.
  
-In the main process, two new variables are added to store respectively the main
+In the main process, two new variables are added to store the main
  operating mode of the iterative process (\texttt{mainMode}) and the duration of
  asynchronous sections (\texttt{asyncDuration}). Those variables are
  operating mode of the iterative process (\texttt{mainMode}) and the duration of
  asynchronous sections (\texttt{asyncDuration}). Those variables are
-initialized by the programmer. The \texttt{lockSendsDone} mutex is also declared,
-initialized (locked) and destroyed with the other mutex in this process.
+initialized by the programmer. The mutex \texttt{lockSendsDone} is also declared,
+initialized (locked), and destroyed with the other mutex in this process.
  
  In the computing function, shown in \Lst{algo:ch6p2AsyncSyncComp}, the
  
  In the computing function, shown in \Lst{algo:ch6p2AsyncSyncComp}, the
-modifications consist of the insertion of the timer mechanism and of the conditions
+modifications consist of the insertion of the timer mechanism and the tests
  to differentiate the actions to be done in each mode.  Some additional variables
  are also required to store the current operating mode in action during the
  execution (\texttt{curMode}), the starting time of the current asynchronous
  to differentiate the actions to be done in each mode.  Some additional variables
  are also required to store the current operating mode in action during the
  execution (\texttt{curMode}), the starting time of the current asynchronous
-section (\texttt{asyncStart}) and the number of successive synchronous
+section (\texttt{asyncStart}), and the number of successive synchronous
  iterations done (\texttt{nbSyncIter}).
  
  %\begin{algorithm}[H]
  %  \caption{Computing function in the final asynchronous scheme.}% without GPU computing.}
  %  \label{algo:ch6p2AsyncSyncComp}
  %\pagebreak
  iterations done (\texttt{nbSyncIter}).
  
  %\begin{algorithm}[H]
  %  \caption{Computing function in the final asynchronous scheme.}% without GPU computing.}
  %  \label{algo:ch6p2AsyncSyncComp}
  %\pagebreak
-\begin{Listing}{algo:ch6p2AsyncSyncComp}{Computing function in the final asynchronous scheme}% without GPU computing.}
+\begin{Listing}{algo:ch6p2AsyncSyncComp}{computing function in the final asynchronous scheme}% without GPU computing.}
  // Variables declarations and initialization
  ...
  OpMode curMode = SYNC;// Current operating mode (always begin in sync)
  // Variables declarations and initialization
  ...
  OpMode curMode = SYNC;// Current operating mode (always begin in sync)
@@ -788,7 +788,7 @@ while(!Finished){
      }
    }
  
      }
    }
  
-  // Sendings of data dependencies
+  // Sending of data dependencies
    if(curMode == SYNC || !SendsInProgress){
      ... 
    }
    if(curMode == SYNC || !SendsInProgress){
      ... 
    }
@@ -799,18 +799,18 @@ while(!Finished){
    }  
  
    // Local computation 
    }  
  
    // Local computation 
-  // (init of residual, arrays swapping and iteration computation)
+  // (init of residual, arrays swapping, and iteration computation)
    ...
  
    ...
  
-  // Checking of convergences (local & global) only in sync mode
+  // Checking convergences (local & global) only in sync mode
    if(curMode == SYNC){
      // Local convergence checking (residual under threshold)
      ...
      // Blocking global exchange of local states of the nodes
      ...
      // Determination of global convergence (all nodes in local CV)
    if(curMode == SYNC){
      // Local convergence checking (residual under threshold)
      ...
      // Blocking global exchange of local states of the nodes
      ...
      // Determination of global convergence (all nodes in local CV)
-    //    Stop of the iterative process and sending of end messages
-    // or Re-initialization of state information and iteration barrier
+    //    Stopping the iterative process and sending end messages
+    // or reinitialization of state information and iteration barrier
      ...
      }
    }
      ...
      }
    }
@@ -822,24 +822,24 @@ while(!Finished){
  %\end{algorithm}
  
  In the sending function, the only modification is the replacement in line~11 of
  %\end{algorithm}
  
  In the sending function, the only modification is the replacement in line~11 of
-the assignment of variable \texttt{SendsInProgress} by the unlocking of
+the assignment of variable \texttt{SendsInProgress} with the unlocking of
  \texttt{lockSendsDone}.  Finally, in the reception function, the only
  \texttt{lockSendsDone}.  Finally, in the reception function, the only
-modification is the insertion before line~19
+modification is the insertion before line~21
  of \Lst{algo:ch6p2BasicAsyncReceptions} of the extraction of the stamp from the
  message and its counting among the receipts only if the stamp is \texttt{SYNC}.
  
  The final step to get our complete scheme using GPU is to insert the GPU
  management in the computing thread.  The first possibility, detailed
  in \Lst{algo:ch6p2syncGPU}, is to simply replace the
  of \Lst{algo:ch6p2BasicAsyncReceptions} of the extraction of the stamp from the
  message and its counting among the receipts only if the stamp is \texttt{SYNC}.
  
  The final step to get our complete scheme using GPU is to insert the GPU
  management in the computing thread.  The first possibility, detailed
  in \Lst{algo:ch6p2syncGPU}, is to simply replace the
-CPU kernel (lines~41-43 in \Lst{algo:ch6p2AsyncSyncComp}) by a blocking GPU kernel call.  This includes data
+CPU kernel (lines~42--44 in \Lst{algo:ch6p2AsyncSyncComp}) by a blocking GPU kernel call.  This includes data
  transfers from the node RAM to the GPU RAM, the launching of the GPU kernel, the
  transfers from the node RAM to the GPU RAM, the launching of the GPU kernel, the
-waiting for kernel completion and the results transfers from GPU RAM to
+waiting for kernel completion, and the results transfers from GPU RAM to
  node RAM.
  
  %\begin{algorithm}[H]
  %  \caption{Computing function in the final asynchronous scheme.}
  %  \label{algo:ch6p2syncGPU}
  node RAM.
  
  %\begin{algorithm}[H]
  %  \caption{Computing function in the final asynchronous scheme.}
  %  \label{algo:ch6p2syncGPU}
-\begin{Listing}{algo:ch6p2syncGPU}{Computing function in the final asynchronous scheme}
+\begin{Listing}{algo:ch6p2syncGPU}{computing function in the final asynchronous scheme}
  // Variables declarations and initialization
  ...
  dim3 Dg, Db; // CUDA kernel grids
  // Variables declarations and initialization
  ...
  dim3 Dg, Db; // CUDA kernel grids
@@ -847,13 +847,13 @@ dim3 Dg, Db; // CUDA kernel grids
  // Computation loop
  while(!Finished){
    // Determination of the dynamic operating mode, sendings of data
  // Computation loop
  while(!Finished){
    // Determination of the dynamic operating mode, sendings of data
-  // dependencies and blocking data receptions in sync mode
+  // dependencies, and blocking data receptions in sync mode
    ...
    // Local GPU computation
    // Data transfers from node RAM to GPU
    CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer");
    ... // There may be several data transfers: typically A and b in 
    ...
    // Local GPU computation
    // Data transfers from node RAM to GPU
    CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer");
    ... // There may be several data transfers: typically A and b in 
-      // linear problems
+      // linear problems of the form A.x = b
    // GPU grid definition
    Db.x = BLOCK_SIZE_X; // BLOCK_SIZE_# are kernel design dependent
    Db.y = BLOCK_SIZE_Y;
    // GPU grid definition
    Db.x = BLOCK_SIZE_X; // BLOCK_SIZE_# are kernel design dependent
    Db.y = BLOCK_SIZE_Y;
@@ -879,44 +879,44 @@ while(!Finished){
  %\end{algorithm}
  
  This scheme provides asynchronism through a cluster of GPUs as well as a
  %\end{algorithm}
  
  This scheme provides asynchronism through a cluster of GPUs as well as a
-complete overlap of communications with GPU computations (similarly
-to~\Sec{ch6:part1}).  However, the autonomy of GPU devices according to their
+complete overlap of communications with GPU computations (similar
+to the one described in~\Sec{ch6:part1}).  However, the autonomy of GPU devices according to their
  host can be further exploited in order to perform some computations on the CPU
  while the GPU kernel is running. The nature of computations that can be done by
  the CPU may vary depending on the application. For example, when processing data
  host can be further exploited in order to perform some computations on the CPU
  while the GPU kernel is running. The nature of computations that can be done by
  the CPU may vary depending on the application. For example, when processing data
-streams (pipelines), pre-processing of next data item and/or post-processing of
-previous result can be done on the CPU while the GPU is processing the current
+streams (pipelines), pre-processing of the next data item and/or post-processing
+of the previous result can be done on the CPU while the GPU is processing the current
  data item.  In other cases, the CPU can perform \emph{auxiliary}
  data item.  In other cases, the CPU can perform \emph{auxiliary}
-computations\index{computation!auxiliary}
+computations\index{computation auxiliary}
  that are not absolutely required to obtain the result but that may accelerate
  the entire iterative process.  Another possibility would be to distribute the
  main computations between the GPU and CPU. However, this
  that are not absolutely required to obtain the result but that may accelerate
  the entire iterative process.  Another possibility would be to distribute the
  main computations between the GPU and CPU. However, this
-usually leads to poor performance increases. This is mainly due to data
+usually leads to poor performance increases mainly due to data
  dependencies that often require additional transfers between CPU and GPU.
  
  So, if we consider that the application enables such overlap of
  computations, its implementation is straightforward as it consists in inserting
  dependencies that often require additional transfers between CPU and GPU.
  
  So, if we consider that the application enables such overlap of
  computations, its implementation is straightforward as it consists in inserting
-the additional CPU computations between lines~23 and~24
-in \Lst{algo:ch6p2syncGPU}.  Nevertheless, such scheme is fully efficient only
+the additional CPU computations between lines~25 and~26
+in \Lst{algo:ch6p2syncGPU}.  Nevertheless, such a scheme is fully efficient only
  if the computation times on both sides are similar.
  
  In some cases, especially with auxiliary computations, another interesting
  solution is to add a fourth CPU thread to perform them. This suppresses the
  duration constraint over those optional computations as they are performed in
  if the computation times on both sides are similar.
  
  In some cases, especially with auxiliary computations, another interesting
  solution is to add a fourth CPU thread to perform them. This suppresses the
  duration constraint over those optional computations as they are performed in
-parallel of the main iterative process, without blocking it.  Moreover, this
+parallel with the main iterative process, without blocking it.  Moreover, this
  scheme stays coherent with current architectures as most nodes include four CPU
  cores.  The algorithmic scheme of such context of complete overlap of
  CPU/GPU computations and communications is described in
  scheme stays coherent with current architectures as most nodes include four CPU
  cores.  The algorithmic scheme of such context of complete overlap of
  CPU/GPU computations and communications is described in
-Listings~\ref{algo:ch6p2FullOverAsyncMain},~\ref{algo:ch6p2FullOverAsyncComp1}
-and~\ref{algo:ch6p2FullOverAsyncComp2}, where we suppose that  auxiliary
+Listings~\ref{algo:ch6p2FullOverAsyncMain},~\ref{algo:ch6p2FullOverAsyncComp1},
+and~\ref{algo:ch6p2FullOverAsyncComp2}, where we assume that  auxiliary
  computations use intermediate results of the main computation process from any previous iteration. This may be
  different according to the application.
  
  %\begin{algorithm}[H]
  %  \caption{Initialization of the main process of complete overlap with asynchronism.}
  % \label{algo:ch6p2FullOverAsyncMain}
  computations use intermediate results of the main computation process from any previous iteration. This may be
  different according to the application.
  
  %\begin{algorithm}[H]
  %  \caption{Initialization of the main process of complete overlap with asynchronism.}
  % \label{algo:ch6p2FullOverAsyncMain}
-\pagebreak
-\begin{Listing}{algo:ch6p2FullOverAsyncMain}{Initialization of the main process of complete overlap with asynchronism}
+%\pagebreak
+\begin{Listing}{algo:ch6p2FullOverAsyncMain}{initialization of the main process of complete overlap with asynchronism}
  // Variables declarations and initialization
  ...
  omp_lock_t lockAux;   // Informs main thread about new aux results
  // Variables declarations and initialization
  ...
  omp_lock_t lockAux;   // Informs main thread about new aux results
@@ -924,7 +924,7 @@ omp_lock_t lockRes;   // Informs aux thread about new results
  omp_lock_t lockWrite; // Controls exclusion of results access
  ... auxRes ... ;      // Results of auxiliary computations 
  
  omp_lock_t lockWrite; // Controls exclusion of results access
  ... auxRes ... ;      // Results of auxiliary computations 
  
-// Parameters reading, MPI initialization, data initialization and
+// Parameters reading, MPI initialization, and data initialization and
  // distribution
  ...
  // OpenMP initialization
  // distribution
  ...
  // OpenMP initialization
@@ -977,32 +977,32 @@ MPI_Finalize();
  %\begin{algorithm}[H]
  %  \caption{Computing function in the final asynchronous scheme with CPU/GPU overlap.}
  %  \label{algo:ch6p2FullOverAsyncComp1}
  %\begin{algorithm}[H]
  %  \caption{Computing function in the final asynchronous scheme with CPU/GPU overlap.}
  %  \label{algo:ch6p2FullOverAsyncComp1}
-\pagebreak
-\begin{Listing}{algo:ch6p2FullOverAsyncComp1}{Computing function in the final asynchronous scheme with CPU/GPU overlap}
+%\pagebreak
+\begin{Listing}{algo:ch6p2FullOverAsyncComp1}{computing function in the final asynchronous scheme with CPU/GPU overlap}
  // Variables declarations and initialization
  ...
  dim3 Dg, Db; // CUDA kernel grids
  
  // Computation loop
  while(!Finished){
  // Variables declarations and initialization
  ...
  dim3 Dg, Db; // CUDA kernel grids
  
  // Computation loop
  while(!Finished){
-  // Determination of the dynamic operating mode, sendings of data 
-  // dependencies and blocking data receptions in sync mode
+  // Determination of the dynamic operating mode, sending of data 
+  // dependencies, and blocking data receptions in sync mode
    ...
    // Local GPU computation
    ...
    // Local GPU computation
-  // Data transfers from node RAM to GPU, GPU grid definition and init 
-  // of shared mem
+  // Data transfers from node RAM to GPU, GPU grid definition,  
+  // and init of shared memory
    CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer");
    ...
    // Kernel call
    gpuKernelName<<<Dg,Db>>>(... @\emph{kernel parameters}@ ...);
    CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer");
    ...
    // Kernel call
    gpuKernelName<<<Dg,Db>>>(... @\emph{kernel parameters}@ ...);
-  // Potential pre/post-treatments in pipeline like computations
+  // Potential pre-/post-treatments in pipeline-like computations
    ...
    // Waiting for kernel completion
    cudaDeviceSynchronize(); 
    // Results transfer from GPU to node RAM
    omp_set_lock(&lockWrite); // Wait for write access to resultsInRam
    CHECK_CUDA_SUCCESS(cudaMemcpyFromSymbol(resultsInRam, resultsOnGPU, resultsSize, 0, cudaMemcpyDeviceToHost), "Results transfer");
    ...
    // Waiting for kernel completion
    cudaDeviceSynchronize(); 
    // Results transfer from GPU to node RAM
    omp_set_lock(&lockWrite); // Wait for write access to resultsInRam
    CHECK_CUDA_SUCCESS(cudaMemcpyFromSymbol(resultsInRam, resultsOnGPU, resultsSize, 0, cudaMemcpyDeviceToHost), "Results transfer");
-  // Potential post-treatments in non-pipeline computations
+  // Potential post-treatments in non pipeline computations
    ...
    omp_unset_lock(&lockWrite); // Give back read access to aux thread
    omp_test_lock(&lockRes);
    ...
    omp_unset_lock(&lockWrite); // Give back read access to aux thread
    omp_test_lock(&lockRes);
@@ -1020,13 +1020,13 @@ while(!Finished){
      ...
      // Determination of global convergence (all nodes in local CV)
      if(cvLocale == 1 && nbCVLocales == nbP-1){
      ...
      // Determination of global convergence (all nodes in local CV)
      if(cvLocale == 1 && nbCVLocales == nbP-1){
-      // Stop of the iterative process and sending of end messages
+      // Stopping the iterative process and sending end messages
        ...
        ...
-      // Unlocking of aux thread for termination
+      // Unlocking aux thread for termination
        omp_test_lock(&lockRes);
        omp_unset_lock(&lockRes);
      }else{
        omp_test_lock(&lockRes);
        omp_unset_lock(&lockRes);
      }else{
-      // Re-initialization of state information and iteration barrier
+      // Reinitialization of state information and iteration barrier
        ...
      }
    }
        ...
      }
    }
@@ -1037,8 +1037,8 @@ while(!Finished){
  %\begin{algorithm}[H]
  %  \caption{Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap.}
  %  \label{algo:ch6p2FullOverAsyncComp2}
  %\begin{algorithm}[H]
  %  \caption{Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap.}
  %  \label{algo:ch6p2FullOverAsyncComp2}
-\pagebreak
-\begin{Listing}{algo:ch6p2FullOverAsyncComp2}{Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}
+%\pagebreak
+\begin{Listing}{algo:ch6p2FullOverAsyncComp2}{auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}
  // Variables declarations and initialization
  ... auxInput ... // Local array for input data
  
  // Variables declarations and initialization
  ... auxInput ... // Local array for input data
  
@@ -1066,23 +1066,23 @@ while(!Finished){
  %\end{algorithm}
  
  As can be seen in \Lst{algo:ch6p2FullOverAsyncMain}, there are three additional
  %\end{algorithm}
  
  As can be seen in \Lst{algo:ch6p2FullOverAsyncMain}, there are three additional
-mutex (\texttt{lockAux}, \texttt{lockRes} and \texttt{lockWrite}) that are used
-respectively to inform the main computation thread that new auxiliary results
-are available (lines~20-21 in \Lst{algo:ch6p2FullOverAsyncComp2} and line~29 in
+mutex (\texttt{lockAux}, \texttt{lockRes}, and \texttt{lockWrite}) that are used
+to inform the main computation thread that new auxiliary results
+are available (lines~20--21 in \Lst{algo:ch6p2FullOverAsyncComp2} and line~31 in
  \Lst{algo:ch6p2FullOverAsyncComp1}), to inform the auxiliary thread that new
  \Lst{algo:ch6p2FullOverAsyncComp1}), to inform the auxiliary thread that new
-results from the main thread are available (lines~25-26
+results from the main thread are available (lines~27--28
  in \Lst{algo:ch6p2FullOverAsyncComp1} and line~7
  in \Lst{algo:ch6p2FullOverAsyncComp2}), and to perform exclusive accesses to the
  in \Lst{algo:ch6p2FullOverAsyncComp1} and line~7
  in \Lst{algo:ch6p2FullOverAsyncComp2}), and to perform exclusive accesses to the
-results from those two threads (lines~20,~24
-in \Lst{algo:ch6p2FullOverAsyncComp1} and 9,~13
+results from those two threads (lines~22 and 26
+in \Lst{algo:ch6p2FullOverAsyncComp1} and 9 and 13
  in \Lst{algo:ch6p2FullOverAsyncComp2}).  Also, an additional array
  (\texttt{auxRes}) is required to store the results of the auxiliary computations
  as well as a local array for the input of the auxiliary function
  (\texttt{auxInput}). That last function has the same general organization as the
  in \Lst{algo:ch6p2FullOverAsyncComp2}).  Also, an additional array
  (\texttt{auxRes}) is required to store the results of the auxiliary computations
  as well as a local array for the input of the auxiliary function
  (\texttt{auxInput}). That last function has the same general organization as the
-send/receive ones, that is a global loop conditioned by the end of the global
+send/receive ones, that is, a global loop conditioned by the end of the global
  process.  At each iteration in this function, the thread waits for the
  availability of new results produced by the main computation thread. This avoids
  process.  At each iteration in this function, the thread waits for the
  availability of new results produced by the main computation thread. This avoids
-to perform the same computations several times with the same input data.
+ performing the same computations several times with the same input data.
  Then, input data of auxiliary computations
  % (as supposed here, they often
  % correspond to the results of the main computations, but may sometimes be
  Then, input data of auxiliary computations
  % (as supposed here, they often
  % correspond to the results of the main computations, but may sometimes be
@@ -1091,24 +1091,26 @@ is copied with a mutual exclusion mechanism. Finally, auxiliary
  computations are performed.  When they are completed, the associated mutex is
  unlocked to signal the availability of those auxiliary results to the main
  computing thread.  The main thread regularly checks this availability at the end
  computations are performed.  When they are completed, the associated mutex is
  unlocked to signal the availability of those auxiliary results to the main
  computing thread.  The main thread regularly checks this availability at the end
-of its iterations and takes them into account whenever this is possible.
+of its iterations and takes them into account whenever possible.
  
  
-Finally, we  obtain an algorithmic  scheme allowing maximal  overlap between
-CPU and  GPU computations as well  as communications. It is  worth noticing that
-such scheme is also usable for systems without GPUs but 4-cores nodes.
+Finally, we  obtain an algorithmic  scheme allowing maximal overlap  between CPU
+and GPU computations  as well as communications. It is  worth noticing that such
+scheme is also efficiently usable for systems without GPUs but with nodes having
+at least four cores. In such contexts, each thread in \Lst{algo:ch6p2FullOverAsyncMain} can
+be executed on distinct cores.
  
  \subsection{Experimental validation}
  \label{sec:ch6p2expes}
  
  As in~\Sec{ch6:part1},  we validate the  feasibility of our  asynchronous scheme
  with  some experiments  performed with  a representative  example  of scientific
  
  \subsection{Experimental validation}
  \label{sec:ch6p2expes}
  
  As in~\Sec{ch6:part1},  we validate the  feasibility of our  asynchronous scheme
  with  some experiments  performed with  a representative  example  of scientific
-application.      It     is    a     three-dimensional     version    of     the
-advection-diffusion-reaction   process\index{PDE example}  that   models  the   evolution   of  the
+application.  This     three-dimensional     version    of     the
+advection-diffusion-reaction   process\index{PDE example}  models  the   evolution   of  the
  concentrations of  two chemical  species in shallow  waters. As this  process is
  dynamic in time,  the simulation is performed for a  given number of consecutive
  time steps.  This  implies two nested loops in the  iterative process, the outer
  one for the time  steps and the inner one for solving  the problem at each time.
  concentrations of  two chemical  species in shallow  waters. As this  process is
  dynamic in time,  the simulation is performed for a  given number of consecutive
  time steps.  This  implies two nested loops in the  iterative process, the outer
  one for the time  steps and the inner one for solving  the problem at each time.
-Full details about this PDE  problem can be found in~\cite{ChapNRJ2011}. That
+Full details about this PDE  problem can be found in~\cite{ChapNRJ2011}. This
  two-stage  iterative process  implies a  few adaptations  of the  general scheme
  presented above in order to include the  outer iterations over  the time steps,  but the
  inner iterative process closely follows the same scheme.
  two-stage  iterative process  implies a  few adaptations  of the  general scheme
  presented above in order to include the  outer iterations over  the time steps,  but the
  inner iterative process closely follows the same scheme.
@@ -1116,7 +1118,7 @@ inner iterative process closely follows the same scheme.
  We show two  series of experiments performed with 16 nodes  of the first cluster
  described  in~\Sec{ch6:p1expes}.  The  first one  deals with  the  comparison of
  synchronous and asynchronous computations.  The second one is related to the use
  We show two  series of experiments performed with 16 nodes  of the first cluster
  described  in~\Sec{ch6:p1expes}.  The  first one  deals with  the  comparison of
  synchronous and asynchronous computations.  The second one is related to the use
-of auxiliary computations. In the context of our PDE application, they consist in
+of auxiliary computations. In the context of our PDE application, they consist of
  the update of the Jacobian of the system.
  
  \subsubsection*{Synchronous and asynchronous computations}
  the update of the Jacobian of the system.
  
  \subsubsection*{Synchronous and asynchronous computations}
@@ -1138,10 +1140,10 @@ corresponds in fact to $30\times30\times30\times2$ values.
    \label{fig:ch6p2syncasync}
  \end{figure}
  
    \label{fig:ch6p2syncasync}
  \end{figure}
  
-The results obtained show that  the asynchronous version is sensibly faster than
+The results obtained show that  the asynchronous version is significantly faster than
  the synchronous one  for smaller problem sizes, then it  becomes similar or even
  a bit slower  for  larger problem  sizes.   A  closer  comparison of computation  and
  the synchronous one  for smaller problem sizes, then it  becomes similar or even
  a bit slower  for  larger problem  sizes.   A  closer  comparison of computation  and
-communication times  in each execution  confirms that this behavior  is consistent.
+communication times  of each execution  confirms that this behavior  is consistent.
  The  asynchronous version is  interesting if communication time
  is similar or  larger than computation time.  In our  example, this is the
  case up  to a problem  size between 50 and 60.   Then, computations  become longer
  The  asynchronous version is  interesting if communication time
  is similar or  larger than computation time.  In our  example, this is the
  case up  to a problem  size between 50 and 60.   Then, computations  become longer
@@ -1161,18 +1163,18 @@ the system.  In such cases, it is  necessary to compute the vector $\Delta x$ in
  $F'\times \Delta  x=-F$ to update $x$ with  $\Delta x$. There are  two levels of
  iterations, the  inner level to get a  stabilized version of $x$,  and the outer
  level to compute $x$ at the successive time steps in the simulation process.  In
  $F'\times \Delta  x=-F$ to update $x$ with  $\Delta x$. There are  two levels of
  iterations, the  inner level to get a  stabilized version of $x$,  and the outer
  level to compute $x$ at the successive time steps in the simulation process.  In
-this context,  classical algorithms either compute  $F'$ only at the  first iteration
+this context,  classic algorithms either compute  $F'$ at only the  first iteration
  of each time step or at some iterations but not all because the  computation of $F'$ is done
  in the main iterative process and it has a relatively high computing cost.
  
  However, with the scheme presented above, it is possible to continuously compute
  of each time step or at some iterations but not all because the  computation of $F'$ is done
  in the main iterative process and it has a relatively high computing cost.
  
  However, with the scheme presented above, it is possible to continuously compute
-new  versions  of  $F'$  in  parallel  to the  main  iterative  process  without
+new  versions  of  $F'$  in  parallel with the  main  iterative  process  without
  penalizing  it. Hence,  $F'$ is  updated  as often  as possible  and taken  into
  account in  the main computations  when it is  relevant. So, the  Newton process
  should be  accelerated a little bit.
  
  We  compare the  performance obtained  with overlapped  Jacobian  updatings and
  penalizing  it. Hence,  $F'$ is  updated  as often  as possible  and taken  into
  account in  the main computations  when it is  relevant. So, the  Newton process
  should be  accelerated a little bit.
  
  We  compare the  performance obtained  with overlapped  Jacobian  updatings and
-non-overlapped ones for several problem sizes, see~\Fig{fig:ch6p2aux}.
+non overlapped ones for several problem sizes (see~\Fig{fig:ch6p2aux}).
  \begin{figure}[h]
    \centering
    \includegraphics[width=.75\columnwidth]{Chapters/chapter6/curves/recouvs.pdf}
  \begin{figure}[h]
    \centering
    \includegraphics[width=.75\columnwidth]{Chapters/chapter6/curves/recouvs.pdf}
@@ -1182,31 +1184,31 @@ non-overlapped ones for several problem sizes, see~\Fig{fig:ch6p2aux}.
  \end{figure}
  
  The  overlap is  clearly efficient  as  the computation  times with  overlapping
  \end{figure}
  
  The  overlap is  clearly efficient  as  the computation  times with  overlapping
-Jacobian updatings  are much better  than without overlap.  Moreover,  the ratio
-between the  two versions tend  to increase with  the problem size, which  is as
+Jacobian updatings  are much better  than the ones without overlap.  Moreover,  the ratio
+between the  two versions tends  to increase with  the problem size, which  is as
  expected. Also, we have tested the application without auxiliary computations at
  all, that is, the  Jacobian is computed only once at the  beginning of each time
  step of the  simulation. The results for this last version  are quite similar to
  the overlapped auxiliary computations, and  even better for small problem sizes.
  expected. Also, we have tested the application without auxiliary computations at
  all, that is, the  Jacobian is computed only once at the  beginning of each time
  step of the  simulation. The results for this last version  are quite similar to
  the overlapped auxiliary computations, and  even better for small problem sizes.
-The fact that no sensible gain can be seen on this range of problem sizes is due
+The fact that no significant gain can be seen on this range of problem sizes is due
  to the limited number of Jacobian updates taken into  account in the main
  computation.  This happens when  the Jacobian update is as long as
  several  iterations of  the main  process. So,  the benefit  is reduced  in this
  particular case.
  
  to the limited number of Jacobian updates taken into  account in the main
  computation.  This happens when  the Jacobian update is as long as
  several  iterations of  the main  process. So,  the benefit  is reduced  in this
  particular case.
  
-Those results show two things; first, auxiliary computations do not induce great
-overhead in  the whole  process.  Second, for  this particular  application the
+Those results show two things. First, auxiliary computations do not induce great
+overhead  in the  whole process.   Second, for  this particular  application the
  choice of updating the Jacobian  matrix as auxiliary computations does not speed
  up the iterative process.  This does  not question the parallel scheme in itself
  choice of updating the Jacobian  matrix as auxiliary computations does not speed
  up the iterative process.  This does  not question the parallel scheme in itself
-but   merely  points  out   the  difficulty   to  identify   relevant  auxiliary
+but  merely  points  out   the  difficulty  of  identifying  relevant  auxiliary
  computations.  Indeed, this identification depends on the considered application
  and requires a profound specialized analysis.
  
  Another  interesting choice  could be  the  computation of  load estimation  for
  dynamic load  balancing, especially in decentralized  diffusion strategies where
  computations.  Indeed, this identification depends on the considered application
  and requires a profound specialized analysis.
  
  Another  interesting choice  could be  the  computation of  load estimation  for
  dynamic load  balancing, especially in decentralized  diffusion strategies where
-loads are  transferred between  neighboring nodes~\cite{BCVG11}.  In  such case,
+loads are  transferred between  neighboring nodes~\cite{BCVG11}.  In  such a case,
  the load evaluation and the comparison  with other nodes can be done in parallel
  the load evaluation and the comparison  with other nodes can be done in parallel
-of the  main computations without perturbing  them.
+with the  main computations without perturbing  them.
  
  %%% Local Variables:
  %%% mode: latex
  
  %%% Local Variables:
  %%% mode: latex