X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/blobdiff_plain/44b8f845847505b81dc0f1199c49e67a495ed7a0..1ebb106491ad04e5627daf016c8ff77bdcb26ffa:/BookGPU/Chapters/chapter6/PartieAsync.tex?ds=sidebyside diff --git a/BookGPU/Chapters/chapter6/PartieAsync.tex b/BookGPU/Chapters/chapter6/PartieAsync.tex index 44eadd5..0b20926 100644 --- a/BookGPU/Chapters/chapter6/PartieAsync.tex +++ b/BookGPU/Chapters/chapter6/PartieAsync.tex @@ -15,34 +15,59 @@ Formally, if we denote by $f=(f_1,...,f_n)$ the function representing the iterative process and by $x^t=(x_1^t,...,x_n^t)$ the values of the $n$ elements of the system at iteration $t$, we pass from a synchronous iterative scheme of the form: +%% \begin{algorithm}[H] +%% \caption{Synchronous iterative scheme}\label{algo:ch6p2sync} +%% \begin{Algo} +%% $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\ +%% \textbf{for} $t=0,1,...$\\ +%% \>\textbf{for} $i=1,...,n$\\ +%% \>\>$x_{i}^{t+1}=f_{i}(x_{1}^t,...,x_i^t,...,x_{n}^t)$\\ +%% \>\textbf{endfor}\\ +%% \textbf{endfor} +%% \end{Algo} +%% \end{algorithm} \begin{algorithm}[H] \caption{Synchronous iterative scheme}\label{algo:ch6p2sync} - \begin{Algo} - $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\ - \textbf{for} $t=0,1,...$\\ - \>\textbf{for} $i=1,...,n$\\ - \>\>$x_{i}^{t+1}=f_{i}(x_{1}^t,...,x_i^t,...,x_{n}^t)$\\ - \>\textbf{endfor}\\ - \textbf{endfor} - \end{Algo} + $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\; + \For{ $t=0,1,...$} { + \For{ $i=1,...,n$}{ + $x_{i}^{t+1}=f_{i}(x_{1}^t,...,x_i^t,...,x_{n}^t)$\; + } + } \end{algorithm} + + \noindent -to an asynchronous iterative scheme of the form: +to an asynchronous iterative scheme of the form:\\ +%% \begin{algorithm}[H] +%% \caption{Asynchronous iterative scheme}\label{algo:ch6p2async} +%% \begin{Algo} +%% $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\ +%% \textbf{for} $t=0,1,...$\\ +%% \>\textbf{for} $i=1,...,n$\\ +%% \>\>$x_{i}^{t+1}=\left\{ +%% \begin{array}[h]{ll} +%% x_i^t & \text{if } i \text{ is \emph{not} updated at iteration } i\\ +%% 
f_i(x_1^{s_1^i(t)},...,x_n^{s_n^i(t)}) & \text{if } i \text{ is updated at iteration } i +%% \end{array} +%% \right.$\\ +%% \>\textbf{endfor}\\ +%% \textbf{endfor} +%% \end{Algo} +%% \end{algorithm} \begin{algorithm}[H] \caption{Asynchronous iterative scheme}\label{algo:ch6p2async} - \begin{Algo} - $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\\ - \textbf{for} $t=0,1,...$\\ - \>\textbf{for} $i=1,...,n$\\ - \>\>$x_{i}^{t+1}=\left\{ + $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\; + \For {$t=0,1,...$} { + \For{ $i=1,...,n$} { + $x_{i}^{t+1}=\left\{ \begin{array}[h]{ll} x_i^t & \text{if } i \text{ is \emph{not} updated at iteration } t\\ f_i(x_1^{s_1^i(t)},...,x_n^{s_n^i(t)}) & \text{if } i \text{ is updated at iteration } t \end{array} - \right.$\\ - \>\textbf{endfor}\\ - \textbf{endfor} - \end{Algo} + \right.$ + } + } \end{algorithm} where $s_j^i(t)$ is the iteration number of the production of the value $x_j$ of element $j$ that is used on element $i$ at iteration $t$ (see for example~\cite{BT89, @@ -132,11 +157,15 @@ So, the global organization of this scheme is set up in \Lst{algo:ch6p2BasicAsyn % \label{algo:ch6p2BasicAsync} \begin{Listing}{algo:ch6p2BasicAsync}{Initialization of the basic asynchronous scheme} // Variables declaration and initialization -omp_lock_t lockSend; // Controls the sendings from the computing thread -omp_lock_t lockRec; // Ensures the initial reception of external data -char Finished = 0; // Boolean indicating the end of the process -char SendsInProgress = 0; // Boolean indicating if previous data sendings are still in progress -double Threshold; // Threshold of the residual for convergence detection +// Controls the sendings from the computing thread +omp_lock_t lockSend; +// Ensures the initial reception of external data +omp_lock_t lockRec; +char Finished = 0; // Boolean indicating the end of the process +// Boolean indicating if previous data sendings are still in progress +char SendsInProgress = 0; +// Threshold of the residual for convergence detection 
+double Threshold; // Parameters reading ... @@ -152,9 +181,10 @@ MPI_Comm_rank(MPI_COMM_WORLD, &numP); // OpenMP initialization (mainly declarations and setting up of locks) omp_set_num_threads(3); omp_init_lock(&lockSend); -omp_set_lock(&lockSend); // Initially locked, unlocked to start sendings +omp_set_lock(&lockSend);//Initially locked, unlocked to start sendings omp_init_lock(&lockRec); -omp_set_lock(&lockRec); // Initially locked, unlocked when initial data are received +//Initially locked, unlocked when initial data are received +omp_set_lock(&lockRec); #pragma omp parallel { @@ -211,7 +241,8 @@ double residual; // Residual of the current iteration // Computation loop while(!Finished){ - // Sendings of data dependencies if there is no previous sending in progress + // Sendings of data dependencies if there is no previous sending + // in progress if(!SendsInProgress){ // Potential copy of data to be sent in additional buffers ... @@ -320,7 +351,8 @@ The last function, detailed in \Lst{algo:ch6p2BasicAsyncReceptions}, does all th % \label{algo:ch6p2BasicAsyncReceptions} \begin{Listing}{algo:ch6p2BasicAsyncReceptions}{Reception function in the basic asynchronous scheme} // Variables declaration and initialization -char countReceipts = 0; // Boolean indicating whether receptions are counted or not +char countReceipts = 0; // Boolean indicating whether receptions are +// counted or not int nbEndMsg = 0; // Number of end messages received int arrived = 0; // Boolean indicating if a message is arrived int srcNd; // Source node of the message @@ -334,10 +366,12 @@ while(!Finished){ // Management of data messages switch(status.MPI_TAG){ case tagCom: // Management of data messages - srcNd = status.MPI_SOURCE; // Get the source node of the message + // Get the source node of the message + srcNd = status.MPI_SOURCE; // Actual data reception in the corresponding buffer MPI_Recv(dataBufferOf(srcNd), nbDataOf(srcNd), dataTypeOf(srcNd), srcNd, tagCom, MPI_COMM_WORLD, 
&status); - // Unlocking of the computing thread when data are received from all dependencies + // Unlocking of the computing thread when data are received + // from all dependencies if(countReceipts == 1 && ... @\emph{receptions from ALL dependencies}@ ...){ omp_unset_lock(&lockRec); countReceipts = 0; // No more counting after first iteration @@ -429,10 +463,14 @@ required to change the operating mode. \begin{Listing}{algo:ch6p2Sync}{Initialization of the synchronized scheme} // Variables declarations and initialization ... -omp_lock_t lockStates; // Controls the synchronous exchange of local states -omp_lock_t lockIter; // Controls the synchronization at the end of each iteration -char localCV = 0; // Boolean indicating whether the local stabilization is reached or not -int nbOtherCVs = 0; // Number of other nodes being in local stabilization +// Controls the synchronous exchange of local states +omp_lock_t lockStates; +// Controls the synchronization at the end of each iteration +omp_lock_t lockIter; +//Boolean indicating whether the local stabilization is reached or not +char localCV = 0; +// Number of other nodes being in local stabilization +int nbOtherCVs = 0; // Parameters reading ... @@ -443,9 +481,12 @@ int nbOtherCVs = 0; // Number of other nodes being in local stabilization // OpenMP initialization (mainly declarations and setting up of locks) ... 
omp_init_lock(&lockStates); -omp_set_lock(&lockStates); // Initially locked, unlocked when all state messages are received +// Initially locked, unlocked when all state messages are received +omp_set_lock(&lockStates); omp_init_lock(&lockIter); -omp_set_lock(&lockIter); // Initially locked, unlocked when all "end of iteration" messages are received +// Initially locked, unlocked when all "end of iteration" messages are +// received +omp_set_lock(&lockIter); // Threads launching #pragma omp parallel @@ -523,7 +564,7 @@ while(!Finished){ // Waiting for the state messages receptions from the other nodes omp_set_lock(&lockStates); - // Determination of global convergence (if all nodes are in local CV) + //Determination of global convergence (if all nodes are in local CV) if(localCV + nbOtherCVs == nbP){ // Entering global CV state Finished = 1; @@ -607,10 +648,11 @@ while(!Finished){ case tagState: // Management of local state messages // Actual reception of the message MPI_Recv(&recvdState, 1, MPI_CHAR, status.MPI_SOURCE, tagState, MPI_COMM_WORLD, &status); - // Updates of numbers of stabilized nodes and received state msgs + // Updates of numbers of stabilized nodes and received state msgs nbOtherCVs += recvdState; nbStateMsg++; - // Unlocking of the computing thread when states of all other nodes are received + // Unlocking of the computing thread when states of all other + // nodes are received if(nbStateMsg == nbP-1){ nbStateMsg = 0; omp_unset_lock(&lockStates); @@ -620,7 +662,8 @@ while(!Finished){ // Actual reception of the message in dummy buffer MPI_Recv(dummyBuffer, 1, MPI_CHAR, status.MPI_SOURCE, tagIter, MPI_COMM_WORLD, &status); nbIterMsg++; // Update of the nb of iteration messages - // Unlocking of the computing thread when iteration messages are received from all other nodes + // Unlocking of the computing thread when iteration messages + // are received from all other nodes if(nbIterMsg == nbP - 1){ nbIterMsg = 0; omp_unset_lock(&lockIter); @@ -711,13 
+754,13 @@ iterations done (\texttt{nbSyncIter}). %\begin{algorithm}[H] % \caption{Computing function in the final asynchronous scheme.}% without GPU computing.} % \label{algo:ch6p2AsyncSyncComp} -%\pagebreak +\pagebreak \begin{Listing}{algo:ch6p2AsyncSyncComp}{Computing function in the final asynchronous scheme}% without GPU computing.} // Variables declarations and initialization ... -OpMode curMode = SYNC; // Current operating mode (always begin in sync) -double asyncStart; // Starting time of the current async section -int nbSyncIter = 0; // Number of sync iterations done in async mode +OpMode curMode = SYNC;// Current operating mode (always begin in sync) +double asyncStart; // Starting time of the current async section +int nbSyncIter = 0; // Number of sync iterations done in async mode // Computation loop while(!Finished){ @@ -726,14 +769,15 @@ while(!Finished){ // Entering synchronous mode when asyncDuration is reached @% // (additional conditions can be specified if needed) @ if(MPI_Wtime() - asyncStart >= asyncDuration){ - // Waiting for the end of previous sends before starting sync mode + // Waiting for the end of previous sends before starting sync mode omp_set_lock(&lockSendsDone); curMode = SYNC; // Entering synchronous mode stampData(dataToSend, SYNC); // Mark data to send with sync flag nbSyncIter = 0; } }else{ - // In main async mode, going back to async mode when the max number of sync iterations are done + // In main async mode, going back to async mode when the max number + // of sync iterations are done if(mainMode == ASYNC){ nbSyncIter++; // Update of the number of sync iterations done if(nbSyncIter == 2){ @@ -802,12 +846,14 @@ dim3 Dg, Db; // CUDA kernel grids // Computation loop while(!Finished){ - // Determination of the dynamic operating mode, sendings of data dependencies and blocking data receptions in sync mode + // Determination of the dynamic operating mode, sendings of data + // dependencies and blocking data receptions in sync mode 
... // Local GPU computation // Data transfers from node RAM to GPU CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer"); - ... // There may be several data transfers: typically A and b in linear problems + ... // There may be several data transfers: typically A and b in + // linear problems // GPU grid definition Db.x = BLOCK_SIZE_X; // BLOCK_SIZE_# are kernel design dependent Db.y = BLOCK_SIZE_Y; @@ -869,7 +915,7 @@ different according to the application. %\begin{algorithm}[H] % \caption{Initialization of the main process of complete overlap with asynchronism.} % \label{algo:ch6p2FullOverAsyncMain} -\pagebreak +%\pagebreak \begin{Listing}{algo:ch6p2FullOverAsyncMain}{Initialization of the main process of complete overlap with asynchronism} // Variables declarations and initialization ... @@ -878,12 +924,13 @@ omp_lock_t lockRes; // Informs aux thread about new results omp_lock_t lockWrite; // Controls exclusion of results access ... auxRes ... ; // Results of auxiliary computations -// Parameters reading, MPI initialization, data initialization and distribution +// Parameters reading, MPI initialization, data initialization and +// distribution ... // OpenMP initialization ... omp_init_lock(&lockAux); -omp_set_lock(&lockAux); // Unlocked when new aux results are available +omp_set_lock(&lockAux);//Unlocked when new aux results are available omp_init_lock(&lockRes); omp_set_lock(&lockRes); // Unlocked when new results are available omp_init_lock(&lockWrite); @@ -930,7 +977,7 @@ MPI_Finalize(); %\begin{algorithm}[H] % \caption{Computing function in the final asynchronous scheme with CPU/GPU overlap.} % \label{algo:ch6p2FullOverAsyncComp1} -\pagebreak +%\pagebreak \begin{Listing}{algo:ch6p2FullOverAsyncComp1}{Computing function in the final asynchronous scheme with CPU/GPU overlap} // Variables declarations and initialization ... 
@@ -938,10 +985,12 @@ dim3 Dg, Db; // CUDA kernel grids // Computation loop while(!Finished){ - // Determination of the dynamic operating mode, sendings of data dependencies and blocking data receptions in sync mode + // Determination of the dynamic operating mode, sendings of data + // dependencies and blocking data receptions in sync mode ... // Local GPU computation - // Data transfers from node RAM to GPU, GPU grid definition and init of shared mem + // Data transfers from node RAM to GPU, GPU grid definition and init + // of shared mem CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer"); ... // Kernel call @@ -988,7 +1037,7 @@ while(!Finished){ %\begin{algorithm}[H] % \caption{Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap.} % \label{algo:ch6p2FullOverAsyncComp2} -\pagebreak +%\pagebreak \begin{Listing}{algo:ch6p2FullOverAsyncComp2}{Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap} // Variables declarations and initialization ... auxInput ... // Local array for input data @@ -1002,13 +1051,13 @@ while(!Finished){ for(ind=0; ind