iterative process and by $x^t=(x_1^t,...,x_n^t)$ the values of the $n$ elements of
the system at iteration $t$, we pass from a synchronous iterative scheme of the
form:
\begin{algorithm}[H]
\caption{Synchronous iterative scheme}\label{algo:ch6p2sync}
  $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\;
  \For{$t=0,1,...$}{
    \For{$i=1,...,n$}{
      $x_{i}^{t+1}=f_{i}(x_{1}^t,...,x_i^t,...,x_{n}^t)$\;
    }
  }
\end{algorithm}

\noindent
to an asynchronous iterative scheme of the form:\\
\begin{algorithm}[H]
\caption{Asynchronous iterative scheme}\label{algo:ch6p2async}
  $x^{0}=(x_{1}^{0},...,x_{n}^{0})$\;
  \For{$t=0,1,...$}{
    \For{$i=1,...,n$}{
      $x_{i}^{t+1}=\left\{
        \begin{array}[h]{ll}
          x_i^t & \text{if } i \text{ is \emph{not} updated at iteration } t\\
          f_i(x_1^{s_1^i(t)},...,x_n^{s_n^i(t)}) & \text{if } i \text{ is updated at iteration } t
        \end{array}
      \right.$
    }
  }
\end{algorithm}
where $s_j^i(t)$ is the iteration number at which the value $x_j$ of
element $j$ used on element $i$ at iteration $t$ was produced (see for example~\cite{BT89,
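Note that the synchronous scheme corresponds to the particular case in
which every element is updated at every iteration with $s_j^i(t)=t$ for
all $i$ and $j$, whereas $s_j^i(t)<t$ models the use of outdated values
of $x_j$ on element $i$.

For concreteness, the synchronous scheme translates into the following
minimal C sketch; the function \texttt{f}, the sizes \texttt{N} and
\texttt{T}, and the double buffering are illustrative choices, not part
of the scheme itself.

\begin{Listing}{algo:ch6p2SyncSketch}{Illustrative C sketch of the synchronous scheme}
#include <string.h>

#define N 8   // Number of elements (illustrative)
#define T 100 // Number of iterations (illustrative)

// Placeholder for the componentwise update function f_i(x_1,...,x_n)
extern double f(int i, const double x[N]);

void synchronousIterations(double x[N]){ // x holds x^0 on entry
  double xNew[N];
  for(int t=0; t<T; t++){   // for t = 0,1,...
    for(int i=0; i<N; i++){ // every element is updated at every
      xNew[i] = f(i, x);    // iteration, from the values of x^t only
    }
    memcpy(x, xNew, sizeof(xNew)); // x^{t+1} replaces x^t
  }
}
\end{Listing}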
\begin{Listing}{algo:ch6p2BasicAsync}{Initialization of the basic asynchronous scheme}
// Variables declaration and initialization
// Controls the sends from the computing thread
omp_lock_t lockSend;
// Ensures the initial reception of external data
omp_lock_t lockRec;
char Finished = 0; // Boolean indicating the end of the process
// Boolean indicating whether previous data sends are still in progress
char SendsInProgress = 0;
// Threshold on the residual for convergence detection
double Threshold;
// Parameters reading
...
// OpenMP initialization (mainly declarations and setting up of locks)
omp_set_num_threads(3);
omp_init_lock(&lockSend);
omp_set_lock(&lockSend); // Initially locked, unlocked to start sends
omp_init_lock(&lockRec);
// Initially locked, unlocked when initial data are received
omp_set_lock(&lockRec);
#pragma omp parallel
{
// Computation loop
while(!Finished){
  // Sending of data dependencies if no previous send is in progress
if(!SendsInProgress){
// Potential copy of data to be sent in additional buffers
...
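The call to \texttt{omp\_set\_num\_threads(3)} above creates the three
threads of the scheme. A plausible dispatch of the roles on the thread
numbers is sketched below; \texttt{computation}, \texttt{sendings}, and
\texttt{receptions} are placeholder names, as the actual function
decomposition is not shown in this excerpt.

\begin{Listing}{algo:ch6p2ThreadsSketch}{Sketch of a possible dispatch of the three threads}
#pragma omp parallel
{
  switch(omp_get_thread_num()){
    case 0:
      computation(); // iterative computation loop (waits on lockRec)
      break;
    case 1:
      sendings();    // sends of the data dependencies (uses lockSend)
      break;
    case 2:
      receptions();  // message receptions, unlocking of other threads
      break;
  }
}
\end{Listing}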
\begin{Listing}{algo:ch6p2BasicAsyncReceptions}{Reception function in the basic asynchronous scheme}
// Variables declaration and initialization
// Boolean indicating whether receptions are counted or not
char countReceipts = 0;
int nbEndMsg = 0; // Number of end messages received
int arrived = 0; // Boolean indicating if a message has arrived
int srcNd; // Source node of the message
// Message management depending on the tag
switch(status.MPI_TAG){
case tagCom: // Management of data messages
    // Get the source node of the message
    srcNd = status.MPI_SOURCE;
// Actual data reception in the corresponding buffer
MPI_Recv(dataBufferOf(srcNd), nbDataOf(srcNd), dataTypeOf(srcNd), srcNd, tagCom, MPI_COMM_WORLD, &status);
    // Unlocking of the computing thread when data are received
    // from all dependencies
if(countReceipts == 1 && ... @\emph{receptions from ALL dependencies}@ ...){
omp_unset_lock(&lockRec);
countReceipts = 0; // No more counting after first iteration
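For completeness, this switch is presumably driven by a message-probing
loop of roughly the following shape, consistent with the
\texttt{arrived} flag declared at the top of the function; this is a
sketch, not necessarily the exact code of the scheme.

\begin{Listing}{algo:ch6p2ProbeSketch}{Sketch of the message-probing loop around the switch}
while(!Finished){
  // Non-blocking test for an incoming message from any source, any tag
  MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &arrived,
             &status);
  if(arrived){
    switch(status.MPI_TAG){
      // ... management of the different message types, as above ...
    }
  }
}
\end{Listing}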
\begin{Listing}{algo:ch6p2Sync}{Initialization of the synchronized scheme}
// Variables declarations and initialization
...
// Controls the synchronous exchange of local states
omp_lock_t lockStates;
// Controls the synchronization at the end of each iteration
omp_lock_t lockIter;
// Boolean indicating whether the local stabilization is reached or not
char localCV = 0;
// Number of other nodes in local stabilization
int nbOtherCVs = 0;
// Parameters reading
...
// OpenMP initialization (mainly declarations and setting up of locks)
...
omp_init_lock(&lockStates);
// Initially locked, unlocked when all state messages are received
omp_set_lock(&lockStates);
omp_init_lock(&lockIter);
// Initially locked, unlocked when all "end of iteration" messages
// are received
omp_set_lock(&lockIter);
// Threads launching
#pragma omp parallel
// Waiting for the state messages receptions from the other nodes
omp_set_lock(&lockStates);
  // Determination of global convergence (all nodes in local CV?)
if(localCV + nbOtherCVs == nbP){
// Entering global CV state
Finished = 1;
case tagState: // Management of local state messages
// Actual reception of the message
MPI_Recv(&recvdState, 1, MPI_CHAR, status.MPI_SOURCE, tagState, MPI_COMM_WORLD, &status);
    // Update of the numbers of stabilized nodes and received state msgs
nbOtherCVs += recvdState;
nbStateMsg++;
    // Unlocking of the computing thread when the states of all other
    // nodes are received
if(nbStateMsg == nbP-1){
nbStateMsg = 0;
omp_unset_lock(&lockStates);
// Actual reception of the message in dummy buffer
MPI_Recv(dummyBuffer, 1, MPI_CHAR, status.MPI_SOURCE, tagIter, MPI_COMM_WORLD, &status);
nbIterMsg++; // Update of the nb of iteration messages
    // Unlocking of the computing thread when iteration messages
    // are received from all other nodes
if(nbIterMsg == nbP - 1){
nbIterMsg = 0;
omp_unset_lock(&lockIter);
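Seen from the computing thread, these two unlockings correspond to two
blocking waits per iteration. Assembling the pieces shown above gives
the following sketch of that side of the handshake;
\texttt{sendStateToAll} and \texttt{sendIterEndToAll} are placeholder
names for the emissions of the \texttt{tagState} and \texttt{tagIter}
messages.

\begin{Listing}{algo:ch6p2SyncHandshakeSketch}{Sketch of the two synchronization points in the computing thread}
while(!Finished){
  // ... local iteration, computation of localCV ...
  sendStateToAll(localCV);         // tagState messages to other nodes
  omp_set_lock(&lockStates);       // wait for the nbP-1 state messages
  if(localCV + nbOtherCVs == nbP){ // global convergence test
    Finished = 1;
  }
  nbOtherCVs = 0;                  // reset for the next iteration
  sendIterEndToAll();              // tagIter messages to other nodes
  omp_set_lock(&lockIter);         // wait for the nbP-1 iter messages
}
\end{Listing}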
\pagebreak
\begin{Listing}{algo:ch6p2AsyncSyncComp}{Computing function in the final asynchronous scheme}
// Variables declarations and initialization
...
OpMode curMode = SYNC; // Current operating mode (starts in sync)
double asyncStart;     // Starting time of the current async section
int nbSyncIter = 0;    // Number of sync iterations done in async mode
// Computation loop
while(!Finished){
// Entering synchronous mode when asyncDuration is reached
  // (additional conditions can be specified if needed)
  if(MPI_Wtime() - asyncStart >= asyncDuration){
    // Waiting for the end of previous sends before starting sync mode
omp_set_lock(&lockSendsDone);
curMode = SYNC; // Entering synchronous mode
stampData(dataToSend, SYNC); // Mark data to send with sync flag
nbSyncIter = 0;
}
}else{
    // In main async mode, going back to async mode once the max
    // number of sync iterations has been done
if(mainMode == ASYNC){
nbSyncIter++; // Update of the number of sync iterations done
if(nbSyncIter == 2){
// Computation loop
while(!Finished){
  // Determination of the dynamic operating mode, sending of data
  // dependencies, and blocking data receptions in sync mode
...
// Local GPU computation
// Data transfers from node RAM to GPU
CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer");
  ... // There may be several data transfers: typically A and b in
      // linear problems
// GPU grid definition
Db.x = BLOCK_SIZE_X; // BLOCK_SIZE_# are kernel design dependent
Db.y = BLOCK_SIZE_Y;
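The grid dimensions \texttt{Dg} are then typically deduced from the
local problem size and the block sizes before the kernel is launched. In
the following sketch, \texttt{nx}, \texttt{ny}, \texttt{computeKernel},
and \texttt{resultsOnGPU} are illustrative names, not identifiers from
the scheme itself.

\begin{Listing}{algo:ch6p2GridSketch}{Sketch of a typical grid definition and kernel call}
Db.z = 1;
Dg.x = (nx + Db.x - 1) / Db.x; // enough blocks to cover nx columns
Dg.y = (ny + Db.y - 1) / Db.y; // and ny rows of the local subdomain
Dg.z = 1;
// Kernel call (illustrative kernel name and argument)
computeKernel<<<Dg, Db>>>(resultsOnGPU);
\end{Listing}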
\begin{Listing}{algo:ch6p2FullOverAsyncMain}{Initialization of the main process of complete overlap with asynchronism}
// Variables declarations and initialization
...
omp_lock_t lockWrite; // Controls exclusive access to the results
... auxRes ... ; // Results of auxiliary computations
// Parameters reading, MPI initialization, data initialization and
// distribution
...
// OpenMP initialization
...
omp_init_lock(&lockAux);
omp_set_lock(&lockAux); // Unlocked when new aux results are available
omp_init_lock(&lockRes);
omp_set_lock(&lockRes); // Unlocked when new results are available
omp_init_lock(&lockWrite);
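These locks implement a simple producer/consumer signaling between the
main thread and the auxiliary one: the main thread unlocks
\texttt{lockRes} when fresh results have been written, and the auxiliary
thread blocks on it before reading them. The corresponding waiting side,
not shown in this excerpt, presumably has the following shape.

\begin{Listing}{algo:ch6p2AuxWaitSketch}{Sketch of the auxiliary thread's wait for new results}
while(!Finished){
  omp_set_lock(&lockRes);   // blocks until the main thread unlocks it
  omp_set_lock(&lockWrite); // exclusive access to the results
  // ... copy of the results into auxInput (see the auxiliary
  // computing function below) ...
  omp_unset_lock(&lockWrite);
  // ... auxiliary computations updating auxRes ...
}
\end{Listing}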
\begin{Listing}{algo:ch6p2FullOverAsyncComp1}{Computing function in the final asynchronous scheme with CPU/GPU overlap}
// Variables declarations and initialization
...
// Computation loop
while(!Finished){
  // Determination of the dynamic operating mode, sending of data
  // dependencies, and blocking data receptions in sync mode
...
// Local GPU computation
  // Data transfers from node RAM to GPU, GPU grid definition, and
  // initialization of shared memory
CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer");
...
// Kernel call
\begin{Listing}{algo:ch6p2FullOverAsyncComp2}{Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}
// Variables declarations and initialization
... auxInput ... // Local array for input data
for(ind=0; ind<resultsSize; ++ind){
auxInput[ind] = resultsInRam[ind];
}
  omp_unset_lock(&lockWrite); // Give back write access to main thread
// Auxiliary computations with possible interruption at the end
for(ind=0; ind<auxSize && !Finished; ++ind){
// Computation of auxRes array according to auxInput
...
}
  // Informs main thread that new aux results are available in auxRes
omp_test_lock(&lockAux); // Ensures mutex is locked when unlocking
omp_unset_lock(&lockAux);
}
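A remark on the last two lines: calling \texttt{omp\_unset\_lock} on a
lock that is not in the locked state is not allowed by OpenMP, and the
auxiliary thread cannot know whether the main thread has already
consumed the previous signal. The preceding \texttt{omp\_test\_lock},
which acquires the lock if and only if it is currently unlocked,
therefore guarantees that the unlocking that follows is always valid,
whether or not the previous signal has already been consumed.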