ch18

[book_gpu.git] / BookGPU / Chapters / chapter6 / PartieAsync.tex
diff --git a/BookGPU/Chapters/chapter6/PartieAsync.tex b/BookGPU/Chapters/chapter6/PartieAsync.tex

index 44eadd5609ee4032d59adf459dfb4654dcc60360..0b209267be0013c67a2b663860795a3daf305315 100644 (file)
--- a/BookGPU/Chapters/chapter6/PartieAsync.tex
+++ b/BookGPU/Chapters/chapter6/PartieAsync.tex
@@ -15,34 +15,59 @@ Formally, if we denote by $f=(f_1,...,f_n)$ the function representing the
  iterative process and by $x^t=(x_1^t,...,x_n^t)$ the values of the $n$ elements of
  the system at iteration $t$, we pass from a synchronous iterative scheme of the
  form:
+%% \begin{algorithm}[H]
+%%   \caption{Synchronous iterative scheme}\label{algo:ch6p2sync}
+%%   \begin{Algo}
+%%     $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\
+%%     \textbf{for} $t=0,1,...$\\
+%%     \>\textbf{for} $i=1,...,n$\\
+%%     \>\>$x_{i}^{t+1}=f_{i}(x_{1}^t,...,x_i^t,...,x_{n}^t)$\\
+%%     \>\textbf{endfor}\\
+%%     \textbf{endfor}
+%%   \end{Algo}
+%% \end{algorithm}
  \begin{algorithm}[H]
    \caption{Synchronous iterative scheme}\label{algo:ch6p2sync}
-  \begin{Algo}
-    $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\
-    \textbf{for} $t=0,1,...$\\
-    \>\textbf{for} $i=1,...,n$\\
-    \>\>$x_{i}^{t+1}=f_{i}(x_{1}^t,...,x_i^t,...,x_{n}^t)$\\
-    \>\textbf{endfor}\\
-    \textbf{endfor}
-  \end{Algo}
+  $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\;
+  \For{ $t=0,1,...$} {
+    \For{ $i=1,...,n$}{
+      $x_{i}^{t+1}=f_{i}(x_{1}^t,...,x_i^t,...,x_{n}^t)$\;
+    }
+  }
  \end{algorithm}
+
+
  \noindent
-to an asynchronous iterative scheme of the form:
+to an asynchronous iterative scheme of the form:\\
+%% \begin{algorithm}[H]
+%%   \caption{Asynchronous iterative scheme}\label{algo:ch6p2async}
+%%   \begin{Algo}
+%%     $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\
+%%     \textbf{for} $t=0,1,...$\\
+%%     \>\textbf{for} $i=1,...,n$\\
+%%     \>\>$x_{i}^{t+1}=\left\{
+%%       \begin{array}[h]{ll}
+%%         x_i^t & \text{if } i \text{ is \emph{not} updated at iteration } t\\
+%%         f_i(x_1^{s_1^i(t)},...,x_n^{s_n^i(t)}) & \text{if } i \text{ is updated at iteration } t
+%%       \end{array}
+%%     \right.$\\
+%%     \>\textbf{endfor}\\
+%%     \textbf{endfor}
+%%   \end{Algo}
+%% \end{algorithm}
  \begin{algorithm}[H]
    \caption{Asynchronous iterative scheme}\label{algo:ch6p2async}
-  \begin{Algo}
-    $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\
-    \textbf{for} $t=0,1,...$\\
-    \>\textbf{for} $i=1,...,n$\\
-    \>\>$x_{i}^{t+1}=\left\{
+    $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\;
+    \For {$t=0,1,...$} {
+      \For{ $i=1,...,n$} {
+        $x_{i}^{t+1}=\left\{
        \begin{array}[h]{ll}
          x_i^t & \text{if } i \text{ is \emph{not} updated at iteration } t\\
          f_i(x_1^{s_1^i(t)},...,x_n^{s_n^i(t)}) & \text{if } i \text{ is updated at iteration } t
        \end{array}
-    \right.$\\
-    \>\textbf{endfor}\\
-    \textbf{endfor}
-  \end{Algo}
+    \right.$
+      }
+    }
  \end{algorithm}
  where $s_j^i(t)$ is the iteration number of the production of the value $x_j$ of
  element $j$  that is  used on element  $i$ at iteration  $t$ (see for example~\cite{BT89,
@@ -132,11 +157,15 @@ So, the global organization of this scheme is set up in \Lst{algo:ch6p2BasicAsyn
  %   \label{algo:ch6p2BasicAsync}
  \begin{Listing}{algo:ch6p2BasicAsync}{Initialization of the basic asynchronous scheme}
  // Variables declaration and initialization
-omp_lock_t lockSend; // Controls the sendings from the computing thread
-omp_lock_t lockRec;  // Ensures the initial reception of external data
-char Finished = 0;   // Boolean indicating the end of the process
-char SendsInProgress = 0; // Boolean indicating if previous data sendings are still in progress
-double Threshold;    // Threshold of the residual for convergence detection
+// Controls the sendings from the computing thread
+omp_lock_t lockSend; 
+// Ensures the initial reception of external data
+omp_lock_t lockRec;  
+char Finished = 0; // Boolean indicating the end of the process
+// Boolean indicating if previous data sendings are still in progress
+char SendsInProgress = 0; 
+// Threshold of the residual for convergence detection
+double Threshold; 
  
  // Parameters reading
  ...
@@ -152,9 +181,10 @@ MPI_Comm_rank(MPI_COMM_WORLD, &numP);
  // OpenMP initialization (mainly declarations and setting up of locks)
  omp_set_num_threads(3);
  omp_init_lock(&lockSend);
-omp_set_lock(&lockSend); // Initially locked, unlocked to start sendings
+omp_set_lock(&lockSend);//Initially locked, unlocked to start sendings
  omp_init_lock(&lockRec);
-omp_set_lock(&lockRec);  // Initially locked, unlocked when initial data are received
+//Initially locked, unlocked when initial data are received
+omp_set_lock(&lockRec);
  
  #pragma omp parallel
  {
@@ -211,7 +241,8 @@ double residual;   // Residual of the current iteration
  
  // Computation loop
  while(!Finished){
-  // Sendings of data dependencies if there is no previous sending in progress
+  // Sendings of data dependencies if there is no previous sending
+  // in progress
    if(!SendsInProgress){
      // Potential copy of data to be sent in additional buffers
      ...
@@ -320,7 +351,8 @@ The last function, detailed in \Lst{algo:ch6p2BasicAsyncReceptions}, does all th
  %  \label{algo:ch6p2BasicAsyncReceptions}
  \begin{Listing}{algo:ch6p2BasicAsyncReceptions}{Reception function in the basic asynchronous scheme}
  // Variables declaration and initialization
-char countReceipts = 0; // Boolean indicating whether receptions are counted or not
+char countReceipts = 0; // Boolean indicating whether receptions are 
+// counted or not
  int nbEndMsg = 0;       // Number of end messages received
  int arrived = 0;        // Boolean indicating if a message has arrived
  int srcNd;              // Source node of the message
@@ -334,10 +366,12 @@ while(!Finished){
      // Management of data messages
      switch(status.MPI_TAG){
        case tagCom: // Management of data messages
-       srcNd = status.MPI_SOURCE; // Get the source node of the message
+       // Get the source node of the message
+       srcNd = status.MPI_SOURCE; 
         // Actual data reception in the corresponding buffer
         MPI_Recv(dataBufferOf(srcNd), nbDataOf(srcNd), dataTypeOf(srcNd), srcNd, tagCom, MPI_COMM_WORLD, &status); 
-       // Unlocking of the computing thread when data are received from all dependencies
+       // Unlocking of the computing thread when data are received 
+       // from all dependencies
         if(countReceipts == 1 && ... @\emph{receptions from ALL dependencies}@ ...){
           omp_unset_lock(&lockRec);
           countReceipts = 0; // No more counting after first iteration
@@ -429,10 +463,14 @@ required to change the operating mode.
  \begin{Listing}{algo:ch6p2Sync}{Initialization of the synchronized scheme}
  // Variables declarations and initialization
  ...
-omp_lock_t lockStates; // Controls the synchronous exchange of local states 
-omp_lock_t lockIter;   // Controls the synchronization at the end of each iteration
-char localCV = 0;      // Boolean indicating whether the local stabilization is reached or not
-int nbOtherCVs = 0;    // Number of other nodes being in local stabilization
+// Controls the synchronous exchange of local states 
+omp_lock_t lockStates; 
+// Controls the synchronization at the end of each iteration
+omp_lock_t lockIter;   
+//Boolean indicating whether the local stabilization is reached or not
+char localCV = 0;      
+// Number of other nodes being in local stabilization
+int nbOtherCVs = 0;    
  
  // Parameters reading
  ...
@@ -443,9 +481,12 @@ int nbOtherCVs = 0;    // Number of other nodes being in local stabilization
  // OpenMP initialization (mainly declarations and setting up of locks)
  ...
  omp_init_lock(&lockStates);
-omp_set_lock(&lockStates); // Initially locked, unlocked when all state messages are received
+// Initially locked, unlocked when all state messages are received
+omp_set_lock(&lockStates); 
  omp_init_lock(&lockIter);
-omp_set_lock(&lockIter);   // Initially locked, unlocked when all "end of iteration" messages are received
+// Initially locked, unlocked when all "end of iteration" messages are
+// received
+omp_set_lock(&lockIter);   
  
  // Threads launching
  #pragma omp parallel
@@ -523,7 +564,7 @@ while(!Finished){
    // Waiting for the state messages receptions from the other nodes
    omp_set_lock(&lockStates);
    
-  // Determination of global convergence (if all nodes are in local CV)
+  //Determination of global convergence (if all nodes are in local CV)
    if(localCV + nbOtherCVs == nbP){
      // Entering global CV state
      Finished = 1;
@@ -607,10 +648,11 @@ while(!Finished){
        case tagState: // Management of local state messages
         // Actual reception of the message
         MPI_Recv(&recvdState, 1, MPI_CHAR, status.MPI_SOURCE, tagState, MPI_COMM_WORLD, &status); 
-       // Updates of numbers of stabilized nodes and received state msgs 
+    // Updates of numbers of stabilized nodes and received state msgs 
         nbOtherCVs += recvdState;
         nbStateMsg++;
-       // Unlocking of the computing thread when states of all other nodes are received
+       // Unlocking of the computing thread when states of all other 
+       // nodes are received
         if(nbStateMsg == nbP-1){
           nbStateMsg = 0;
           omp_unset_lock(&lockStates);
@@ -620,7 +662,8 @@ while(!Finished){
         // Actual reception of the message in dummy buffer
         MPI_Recv(dummyBuffer, 1, MPI_CHAR, status.MPI_SOURCE, tagIter, MPI_COMM_WORLD, &status); 
         nbIterMsg++; // Update of the nb of iteration messages
-       // Unlocking of the computing thread when iteration messages are received from all other nodes       
+       // Unlocking of the computing thread when iteration messages 
+       // are received from all other nodes       
         if(nbIterMsg == nbP - 1){
           nbIterMsg = 0;
           omp_unset_lock(&lockIter);
@@ -711,13 +754,13 @@ iterations done (\texttt{nbSyncIter}).
  %\begin{algorithm}[H]
  %  \caption{Computing function in the final asynchronous scheme.}% without GPU computing.}
  %  \label{algo:ch6p2AsyncSyncComp}
-%\pagebreak
+\pagebreak
  \begin{Listing}{algo:ch6p2AsyncSyncComp}{Computing function in the final asynchronous scheme}% without GPU computing.}
  // Variables declarations and initialization
  ...
-OpMode curMode = SYNC; // Current operating mode (always begin in sync)
-double asyncStart;     // Starting time of the current async section
-int nbSyncIter = 0;    // Number of sync iterations done in async mode
+OpMode curMode = SYNC;// Current operating mode (always begin in sync)
+double asyncStart;    // Starting time of the current async section
+int nbSyncIter = 0;   // Number of sync iterations done in async mode
  
  // Computation loop
  while(!Finished){
@@ -726,14 +769,15 @@ while(!Finished){
      // Entering synchronous mode when asyncDuration is reached
  @%    // (additional conditions can be specified if needed) 
  @    if(MPI_Wtime() - asyncStart >= asyncDuration){
-      // Waiting for the end of previous sends before starting sync mode
+    // Waiting for the end of previous sends before starting sync mode
        omp_set_lock(&lockSendsDone);
        curMode = SYNC;              // Entering synchronous mode
        stampData(dataToSend, SYNC); // Mark data to send with sync flag
        nbSyncIter = 0;
      }
    }else{
-    // In main async mode, going back to async mode when the max number of sync iterations are done
+   // In main async mode, going back to async mode when the max number 
+   // of sync iterations is reached
      if(mainMode == ASYNC){
        nbSyncIter++; // Update of the number of sync iterations done
        if(nbSyncIter == 2){
@@ -802,12 +846,14 @@ dim3 Dg, Db; // CUDA kernel grids
  
  // Computation loop
  while(!Finished){
-  // Determination of the dynamic operating mode, sendings of data dependencies and blocking data receptions in sync mode
+  // Determination of the dynamic operating mode, sendings of data
+  // dependencies and blocking data receptions in sync mode
    ...
    // Local GPU computation
    // Data transfers from node RAM to GPU
    CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer");
-  ... // There may be several data transfers: typically A and b in linear problems
+  ... // There may be several data transfers: typically A and b in 
+      // linear problems
    // GPU grid definition
    Db.x = BLOCK_SIZE_X; // BLOCK_SIZE_# are kernel design dependent
    Db.y = BLOCK_SIZE_Y;
@@ -869,7 +915,7 @@ different according to the application.
  %\begin{algorithm}[H]
  %  \caption{Initialization of the main process of complete overlap with asynchronism.}
  % \label{algo:ch6p2FullOverAsyncMain}
-\pagebreak
+%\pagebreak
  \begin{Listing}{algo:ch6p2FullOverAsyncMain}{Initialization of the main process of complete overlap with asynchronism}
  // Variables declarations and initialization
  ...
@@ -878,12 +924,13 @@ omp_lock_t lockRes;   // Informs aux thread about new results
  omp_lock_t lockWrite; // Controls exclusion of results access
  ... auxRes ... ;      // Results of auxiliary computations 
  
-// Parameters reading, MPI initialization, data initialization and distribution
+// Parameters reading, MPI initialization, data initialization and
+// distribution
  ...
  // OpenMP initialization
  ...
  omp_init_lock(&lockAux);
-omp_set_lock(&lockAux);  // Unlocked when new aux results are available
+omp_set_lock(&lockAux);//Unlocked when new aux results are available
  omp_init_lock(&lockRes);
  omp_set_lock(&lockRes);  // Unlocked when new results are available
  omp_init_lock(&lockWrite);
@@ -930,7 +977,7 @@ MPI_Finalize();
  %\begin{algorithm}[H]
  %  \caption{Computing function in the final asynchronous scheme with CPU/GPU overlap.}
  %  \label{algo:ch6p2FullOverAsyncComp1}
-\pagebreak
+%\pagebreak
  \begin{Listing}{algo:ch6p2FullOverAsyncComp1}{Computing function in the final asynchronous scheme with CPU/GPU overlap}
  // Variables declarations and initialization
  ...
@@ -938,10 +985,12 @@ dim3 Dg, Db; // CUDA kernel grids
  
  // Computation loop
  while(!Finished){
-  // Determination of the dynamic operating mode, sendings of data dependencies and blocking data receptions in sync mode
+  // Determination of the dynamic operating mode, sendings of data 
+  // dependencies and blocking data receptions in sync mode
    ...
    // Local GPU computation
-  // Data transfers from node RAM to GPU, GPU grid definition and init of shared mem
+  // Data transfers from node RAM to GPU, GPU grid definition and init 
+  // of shared mem
    CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer");
    ...
    // Kernel call
@@ -988,7 +1037,7 @@ while(!Finished){
  %\begin{algorithm}[H]
  %  \caption{Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap.}
  %  \label{algo:ch6p2FullOverAsyncComp2}
-\pagebreak
+%\pagebreak
  \begin{Listing}{algo:ch6p2FullOverAsyncComp2}{Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}
  // Variables declarations and initialization
  ... auxInput ... // Local array for input data
@@ -1002,13 +1051,13 @@ while(!Finished){
      for(ind=0; ind<resultsSize; ++ind){
        auxInput[ind] = resultsInRam[ind];
      }
-    omp_unset_lock(&lockWrite); // Give back write access to main thread
+    omp_unset_lock(&lockWrite);//Give back write access to main thread
      // Auxiliary computations with possible interruption at the end
      for(ind=0; ind<auxSize && !Finished; ++ind){
        // Computation of auxRes array according to auxInput
        ...
      }
-    // Informs main thread that new aux results are available in auxData
+// Informs main thread that new aux results are available in auxRes
      omp_test_lock(&lockAux); // Ensures mutex is locked when unlocking
      omp_unset_lock(&lockAux);
    }