X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/blobdiff_plain/637049bfdd22c413d65ad0548d2d18a70a1fa6be..4c07e9cc681b9a8ed54d9a1b48e9d8b00af63126:/BookGPU/Chapters/chapter6/PartieAsync.tex diff --git a/BookGPU/Chapters/chapter6/PartieAsync.tex b/BookGPU/Chapters/chapter6/PartieAsync.tex index 4a2b6d2..0b20926 100644 --- a/BookGPU/Chapters/chapter6/PartieAsync.tex +++ b/BookGPU/Chapters/chapter6/PartieAsync.tex @@ -157,11 +157,15 @@ So, the global organization of this scheme is set up in \Lst{algo:ch6p2BasicAsyn % \label{algo:ch6p2BasicAsync} \begin{Listing}{algo:ch6p2BasicAsync}{Initialization of the basic asynchronous scheme} // Variables declaration and initialization -omp_lock_t lockSend; // Controls the sendings from the computing thread -omp_lock_t lockRec; // Ensures the initial reception of external data -char Finished = 0; // Boolean indicating the end of the process -char SendsInProgress = 0; // Boolean indicating if previous data sendings are still in progress -double Threshold; // Threshold of the residual for convergence detection +// Controls the sendings from the computing thread +omp_lock_t lockSend; +// Ensures the initial reception of external data +omp_lock_t lockRec; +char Finished = 0; // Boolean indicating the end of the process +// Boolean indicating if previous data sendings are still in progress +char SendsInProgress = 0; +// Threshold of the residual for convergence detection +double Threshold; // Parameters reading ... @@ -177,9 +181,10 @@ MPI_Comm_rank(MPI_COMM_WORLD, &numP); // OpenMP initialization (mainly declarations and setting up of locks) omp_set_num_threads(3); omp_init_lock(&lockSend); -omp_set_lock(&lockSend); // Initially locked, unlocked to start sendings +omp_set_lock(&lockSend);//Initially locked, unlocked to start sendings omp_init_lock(&lockRec); -omp_set_lock(&lockRec); // Initially locked, unlocked when initial data are received +//Initially locked, unlocked when initial data are received +omp_set_lock(&lockRec); #pragma omp parallel { @@ -236,7 +241,8 @@ double residual; // Residual of the current iteration // Computation loop while(!Finished){ - // Sendings of data dependencies if there is no previous sending in progress + // Sendings of data dependencies if there is no previous sending + // in progress if(!SendsInProgress){ // Potential copy of data to be sent in additional buffers ... @@ -345,7 +351,8 @@ The last function, detailed in \Lst{algo:ch6p2BasicAsyncReceptions}, does all th % \label{algo:ch6p2BasicAsyncReceptions} \begin{Listing}{algo:ch6p2BasicAsyncReceptions}{Reception function in the basic asynchronous scheme} // Variables declaration and initialization -char countReceipts = 0; // Boolean indicating whether receptions are counted or not +char countReceipts = 0; // Boolean indicating whether receptions are +// counted or not int nbEndMsg = 0; // Number of end messages received int arrived = 0; // Boolean indicating if a message is arrived int srcNd; // Source node of the message @@ -359,10 +366,12 @@ while(!Finished){ // Management of data messages switch(status.MPI_TAG){ case tagCom: // Management of data messages - srcNd = status.MPI_SOURCE; // Get the source node of the message + // Get the source node of the message + srcNd = status.MPI_SOURCE; // Actual data reception in the corresponding buffer MPI_Recv(dataBufferOf(srcNd), nbDataOf(srcNd), dataTypeOf(srcNd), srcNd, tagCom, MPI_COMM_WORLD, &status); - // Unlocking of the computing thread when data are received from all dependencies + // Unlocking of the computing thread when data are received + // from all dependencies if(countReceipts == 1 && ... @\emph{receptions from ALL dependencies}@ ...){ omp_unset_lock(&lockRec); countReceipts = 0; // No more counting after first iteration @@ -454,10 +463,14 @@ required to change the operating mode. \begin{Listing}{algo:ch6p2Sync}{Initialization of the synchronized scheme} // Variables declarations and initialization ... -omp_lock_t lockStates; // Controls the synchronous exchange of local states -omp_lock_t lockIter; // Controls the synchronization at the end of each iteration -char localCV = 0; // Boolean indicating whether the local stabilization is reached or not -int nbOtherCVs = 0; // Number of other nodes being in local stabilization +// Controls the synchronous exchange of local states +omp_lock_t lockStates; +// Controls the synchronization at the end of each iteration +omp_lock_t lockIter; +//Boolean indicating whether the local stabilization is reached or not +char localCV = 0; +// Number of other nodes being in local stabilization +int nbOtherCVs = 0; // Parameters reading ... @@ -468,9 +481,12 @@ int nbOtherCVs = 0; // Number of other nodes being in local stabilization // OpenMP initialization (mainly declarations and setting up of locks) ... omp_init_lock(&lockStates); -omp_set_lock(&lockStates); // Initially locked, unlocked when all state messages are received +// Initially locked, unlocked when all state messages are received +omp_set_lock(&lockStates); omp_init_lock(&lockIter); -omp_set_lock(&lockIter); // Initially locked, unlocked when all "end of iteration" messages are received +// Initially locked, unlocked when all "end of iteration" messages are +// received +omp_set_lock(&lockIter); // Threads launching #pragma omp parallel @@ -548,7 +564,7 @@ while(!Finished){ // Waiting for the state messages receptions from the other nodes omp_set_lock(&lockStates); - // Determination of global convergence (if all nodes are in local CV) + //Determination of global convergence (if all nodes are in local CV) if(localCV + nbOtherCVs == nbP){ // Entering global CV state Finished = 1; @@ -632,10 +648,11 @@ while(!Finished){ case tagState: // Management of local state messages // Actual reception of the message MPI_Recv(&recvdState, 1, MPI_CHAR, status.MPI_SOURCE, tagState, MPI_COMM_WORLD, &status); - // Updates of numbers of stabilized nodes and received state msgs + // Updates of numbers of stabilized nodes and received state msgs nbOtherCVs += recvdState; nbStateMsg++; - // Unlocking of the computing thread when states of all other nodes are received + // Unlocking of the computing thread when states of all other + // nodes are received if(nbStateMsg == nbP-1){ nbStateMsg = 0; omp_unset_lock(&lockStates); @@ -645,7 +662,8 @@ while(!Finished){ // Actual reception of the message in dummy buffer MPI_Recv(dummyBuffer, 1, MPI_CHAR, status.MPI_SOURCE, tagIter, MPI_COMM_WORLD, &status); nbIterMsg++; // Update of the nb of iteration messages - // Unlocking of the computing thread when iteration messages are received from all other nodes + // Unlocking of the computing thread when iteration messages + // are received from all other nodes if(nbIterMsg == nbP - 1){ nbIterMsg = 0; omp_unset_lock(&lockIter); @@ -736,13 +754,13 @@ iterations done (\texttt{nbSyncIter}). %\begin{algorithm}[H] % \caption{Computing function in the final asynchronous scheme.}% without GPU computing.} % \label{algo:ch6p2AsyncSyncComp} -%\pagebreak +\pagebreak \begin{Listing}{algo:ch6p2AsyncSyncComp}{Computing function in the final asynchronous scheme}% without GPU computing.} // Variables declarations and initialization ... -OpMode curMode = SYNC; // Current operating mode (always begin in sync) -double asyncStart; // Starting time of the current async section -int nbSyncIter = 0; // Number of sync iterations done in async mode +OpMode curMode = SYNC;// Current operating mode (always begin in sync) +double asyncStart; // Starting time of the current async section +int nbSyncIter = 0; // Number of sync iterations done in async mode // Computation loop while(!Finished){ @@ -751,14 +769,15 @@ while(!Finished){ // Entering synchronous mode when asyncDuration is reached @% // (additional conditions can be specified if needed) @ if(MPI_Wtime() - asyncStart >= asyncDuration){ - // Waiting for the end of previous sends before starting sync mode + // Waiting for the end of previous sends before starting sync mode omp_set_lock(&lockSendsDone); curMode = SYNC; // Entering synchronous mode stampData(dataToSend, SYNC); // Mark data to send with sync flag nbSyncIter = 0; } }else{ - // In main async mode, going back to async mode when the max number of sync iterations are done + // In main async mode, going back to async mode when the max number + // of sync iterations are done if(mainMode == ASYNC){ nbSyncIter++; // Update of the number of sync iterations done if(nbSyncIter == 2){ @@ -827,12 +846,14 @@ dim3 Dg, Db; // CUDA kernel grids // Computation loop while(!Finished){ - // Determination of the dynamic operating mode, sendings of data dependencies and blocking data receptions in sync mode + // Determination of the dynamic operating mode, sendings of data + // dependencies and blocking data receptions in sync mode ... // Local GPU computation // Data transfers from node RAM to GPU CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer"); - ... // There may be several data transfers: typically A and b in linear problems + ... // There may be several data transfers: typically A and b in + // linear problems // GPU grid definition Db.x = BLOCK_SIZE_X; // BLOCK_SIZE_# are kernel design dependent Db.y = BLOCK_SIZE_Y; @@ -894,7 +915,7 @@ different according to the application. %\begin{algorithm}[H] % \caption{Initialization of the main process of complete overlap with asynchronism.} % \label{algo:ch6p2FullOverAsyncMain} -\pagebreak +%\pagebreak \begin{Listing}{algo:ch6p2FullOverAsyncMain}{Initialization of the main process of complete overlap with asynchronism} // Variables declarations and initialization ... @@ -903,12 +924,13 @@ omp_lock_t lockRes; // Informs aux thread about new results omp_lock_t lockWrite; // Controls exclusion of results access ... auxRes ... ; // Results of auxiliary computations -// Parameters reading, MPI initialization, data initialization and distribution +// Parameters reading, MPI initialization, data initialization and +// distribution ... // OpenMP initialization ... omp_init_lock(&lockAux); -omp_set_lock(&lockAux); // Unlocked when new aux results are available +omp_set_lock(&lockAux);//Unlocked when new aux results are available omp_init_lock(&lockRes); omp_set_lock(&lockRes); // Unlocked when new results are available omp_init_lock(&lockWrite); @@ -955,7 +977,7 @@ MPI_Finalize(); %\begin{algorithm}[H] % \caption{Computing function in the final asynchronous scheme with CPU/GPU overlap.} % \label{algo:ch6p2FullOverAsyncComp1} -\pagebreak +%\pagebreak \begin{Listing}{algo:ch6p2FullOverAsyncComp1}{Computing function in the final asynchronous scheme with CPU/GPU overlap} // Variables declarations and initialization ... @@ -963,10 +985,12 @@ dim3 Dg, Db; // CUDA kernel grids // Computation loop while(!Finished){ - // Determination of the dynamic operating mode, sendings of data dependencies and blocking data receptions in sync mode + // Determination of the dynamic operating mode, sendings of data + // dependencies and blocking data receptions in sync mode ... // Local GPU computation - // Data transfers from node RAM to GPU, GPU grid definition and init of shared mem + // Data transfers from node RAM to GPU, GPU grid definition and init + // of shared mem CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer"); ... // Kernel call @@ -1013,7 +1037,7 @@ while(!Finished){ %\begin{algorithm}[H] % \caption{Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap.} % \label{algo:ch6p2FullOverAsyncComp2} -\pagebreak +%\pagebreak \begin{Listing}{algo:ch6p2FullOverAsyncComp2}{Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap} // Variables declarations and initialization ... auxInput ... // Local array for input data @@ -1027,13 +1051,13 @@ while(!Finished){ for(ind=0; ind