\makeindex
-%\includeonly{Chapters/chapter15/ch15}
+%\includeonly{Chapters/chapter5/ch5}
\begin{document}
\include{Chapters/chapter3/ch3}
\include{Chapters/chapter4/ch4}
\part{Software development}
-\include{Chapters/chapter5/ch5}
+\include{Chapters/chapter5/ch5}
\include{Chapters/chapter6/ch6}
\part{Optimization}
\include{Chapters/chapter8/ch8}
% Encoding: Cp1252
@INCOLLECTION{ch5:Bajrovic2011,
- author = {Bajrovic, Enes and Traff, Jesper},
+ author = {Bajrovic, E. and Traff, J.},
title = {Using MPI Derived Datatypes in Numerical Libraries},
booktitle = {Recent Advances in the Message Passing Interface},
publisher = {Springer Berlin / Heidelberg},
year = {2011},
- editor = {Cotronis, Yiannis and Danalis, Anthony and Nikolopoulos, Dimitrios
- and Dongarra, Jack},
+ editor = {Cotronis, Y. and Danalis, A. and Nikolopoulos, D.
+ and Dongarra, J.},
volume = {6960},
series = {Lecture Notes in Computer Science},
pages = {29-38},
}
@ARTICLE{ch5:Bell2011,
- author = {Nathan Bell and Jared Hoberock},
+ author = {N. Bell and J. Hoberock},
title = {Thrust: A Productivity-Oriented Library for CUDA},
journal = {in GPU Computing Gems, Jade Edition, Edited by Wen-mei W. Hwu},
year = {2011},
title = {Design Patterns - Elements of Reusable Object-Oriented Software},
publisher = {Addison-Wesley Professional Computing Series},
year = {1995},
- author = {Erich Gamma and Richard Helm and Ralph Johnson and John Vlissides},
+ author = {E. Gamma and R. Helm and R. Johnson and J. Vlissides},
owner = {slgl},
timestamp = {2012.09.10}
}
Interface, {\rm 2nd edition}},
publisher = {MIT Press},
year = {1999},
- author = {William Gropp and Ewing Lusk and Anthony Skjellum},
+ author = {W. Gropp and E. Lusk and A. Skjellum},
address = {Cambridge, MA},
area = {M},
areaseq = {0}
}
@INCOLLECTION{ch5:Hoefler2011,
- author = {Hoefler, Torsten and Snir, Marc},
+ author = {Hoefler, T. and Snir, M.},
title = {Writing Parallel Libraries with MPI - Common Practice, Issues, and
Extensions},
booktitle = {Recent Advances in the Message Passing Interface},
publisher = {Springer Berlin / Heidelberg},
year = {2011},
- editor = {Cotronis, Yiannis and Danalis, Anthony and Nikolopoulos, Dimitrios
- and Dongarra, Jack},
+ editor = {Cotronis, Y. and Danalis, A. and Nikolopoulos, D.
+ and Dongarra, J.},
volume = {6960},
series = {Lecture Notes in Computer Science},
pages = {345-355},
- steady-state and time-dependent problems},
publisher = {SIAM},
year = {2007},
- author = {Randall J. LeVeque},
+ author = {R. J. LeVeque},
pages = {I-XV, 1-341},
bibsource = {DBLP, http://dblp.uni-trier.de},
ee = {http://www.ec-securehost.com/SIAM/OT98.html},
}
@TECHREPORT{ch5:Skjellum1994,
- author = {Anthony Skjellum and Nathan E. Doss and Purushotham V. Bangaloret},
+ author = {A. Skjellum and N. E. Doss and P. V. Bangaloret},
title = {Writing Libraries in MPI},
institution = {Department of Computer Science and NSF Engineering Research Center
for Computational Fiels Simulation. Mississippi State University},
title = {C++ Templates: The Complete Guide},
publisher = {Addison-Wesley Professional},
year = {2002},
- author = {David Vandevoorde and Nicolai M. Josuttis},
+ author = {D. Vandevoorde and N. M. Josuttis},
pages = {552},
edition = {1st},
month = {November},
}
@article{ch5:Korson1992,
- author = {Korson, Tim and McGregor, John D.},
+ author = {Korson, T. and McGregor, J. D.},
title = {Technical criteria for the specification and evaluation of object-oriented libraries},
journal = {Softw. Eng. J.},
issue_date = {March 1992},
}
@INPROCEEDINGS{ch5:Glimberg2011,
- AUTHOR = {Stefan L. Glimberg and Allan P. Engsig-Karup and Morten G. Madsen},
+ AUTHOR = {S. L. Glimberg and A. P. Engsig-Karup and M. G. Madsen},
TITLE = {A Fast GPU-accelerated Mixed-precision Strategy for Fully Nonlinear Water Wave Computations},
BOOKTITLE = {Numerical Mathematics and Advanced Applications 2011, Proceedings of ENUMATH 2011, the 9th European Conference on Numerical Mathematics and Advanced Applications, Leicester, September 2011},
YEAR = {2011},
@book{ch5:chorin1993,
title={A Mathematical Introduction to Fluid Mechanics},
- author={Chorin, A.J. and Marsden, J.E.},
+ author={Chorin, A. J. and Marsden, J. E.},
isbn={9780387979182},
lccn={98115991},
series={Texts in Applied Mathematics},
}
@book{ch5:Saad2003,
- author = {Saad, Yousef},
+ author = {Saad, Y.},
title = {Iterative Methods for Sparse Linear Systems},
year = {2003},
isbn = {0898715342},
@book{ch5:Kelley1995,
title={Iterative Methods for Linear and Nonlinear Equations},
- author={Kelley, C.T.},
+ author={Kelley, C. T.},
isbn={9780898713527},
lccn={lc95032249},
series={Frontiers in Applied Mathematics Series},
}
@techreport{ch5:YMTR08,
- author = {Yvon Maday},
+ author = {Y. Maday},
title = {The parareal in time algorithm},
institution = {Universite Pierr\'{e} et Marie Curie},
year = {2008},
}
@mastersthesis{ch5:ASNP12,
- author = {Allan S. Nielsen},
+ author = {A. S. Nielsen},
title = {Feasibility study of the Parareal algorithm},
school = {Technical University of Denmark, Department of Informatics and Mathematical Modeling},
year = {2012},
}
@TECHREPORT{ch5:ScientificGrandChallenges2010,
- author = {David L. Brown and Paul Messina et. al},
+ author = {D. L. Brown and P. Messina et. al},
title = {Scientific Grand Challenges, Crosscutting technologies for computing at the exascale},
institution = {U.S. Department of Energy},
year = {2010},
}
@article{ch5:Keyes2011,
-author = {David E. Keyes},
+author = {D. E. Keyes},
title = {{Exaflop/s: The why and the how}},
journal = {Comptes Rendus Mecanique},
volume = {339},
}
@inproceedings{ch5:Bell2009,
- author = {Bell, Nathan and Garland, Michael},
+ author = {Bell, N. and Garland, M.},
title = {Implementing sparse matrix-vector multiplication on throughput-oriented processors},
booktitle = {SC '09: Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis},
year = {2009},
}
@book{ch5:Kirk2010,
- author = {Kirk, David B. and Hwu, Wen-mei W.},
+ author = {Kirk, D. B. and Hwu, W.-M. W.},
title = {Programming Massively Parallel Processors: A Hands-on Approach},
year = {2010},
isbn = {0123814723, 9780123814722},
@book{ch5:Trottenberg2001,
title={Multigrid},
- author={Trottenberg, U. and Oosterlee, C.W. and Sch{\"u}ller, A.},
+ author={Trottenberg, U. and Oosterlee, C. W. and Sch{\"u}ller, A.},
isbn={9780127010700},
lccn={00103940},
url={http://books.google.dk/books?id=9ysyNPZoR24C},
u(x,y,t_0) = \sin(\pi x)\,\sin(\pi y), & \qquad (x,y) \in \Omega.
\end{align}
An illustrative example of the numerical solution to the heat problem, using \eqref{ch5:eq:heatinit} as the initial condition is given in Figure \ref{ch5:fig:heatsolution}.
-\begin{figure}[!htb]
+\begin{figure}[!htbp]
\begin{center}
\setlength\figurewidth{0.3\textwidth} %
- \setlength\figureheight{0.32\textwidth} %
+ \setlength\figureheight{0.3\textwidth} %
\subfigure[$t=0.00s$]%{\input{Chapters/chapter5/figures/HeatSolution0.tikz}}
-{\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/HeatSolution0_conv.pdf}}
+{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_conv.pdf}}
\subfigure[$t=0.05s$]%{\input{Chapters/chapter5/figures/HeatSolution0.049307.tikz}}
-{\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/HeatSolution0_049307_conv.pdf}}
+{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_049307_conv.pdf}}
%\subfigure[$t=0.10s$]{\input{Chapters/chapter5/figures/HeatSolution0.099723.tikz}}
-{\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/HeatSolution0_099723_conv.pdf}}
+{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_099723_conv.pdf}}
\end{center}
\caption{Discrete solution at times $t=0s$ and $t=0.05s$, using \eqref{ch5:eq:heatinit} as initial condition and a small $20\times20$ numerical grid.}\label{ch5:fig:heatsolution}
\end{figure}
% \label{algo:ch6p2BasicAsync}
\begin{Listing}{algo:ch6p2BasicAsync}{Initialization of the basic asynchronous scheme}
// Variables declaration and initialization
-omp_lock_t lockSend; // Controls the sendings from the computing thread
-omp_lock_t lockRec; // Ensures the initial reception of external data
-char Finished = 0; // Boolean indicating the end of the process
-char SendsInProgress = 0; // Boolean indicating if previous data sendings are still in progress
-double Threshold; // Threshold of the residual for convergence detection
+// Controls the sendings from the computing thread
+omp_lock_t lockSend;
+// Ensures the initial reception of external data
+omp_lock_t lockRec;
+char Finished = 0; // Boolean indicating the end of the process
+// Boolean indicating if previous data sendings are still in progress
+char SendsInProgress = 0;
+// Threshold of the residual for convergence detection
+double Threshold;
// Parameters reading
...
// OpenMP initialization (mainly declarations and setting up of locks)
omp_set_num_threads(3);
omp_init_lock(&lockSend);
-omp_set_lock(&lockSend); // Initially locked, unlocked to start sendings
+omp_set_lock(&lockSend);//Initially locked, unlocked to start sendings
omp_init_lock(&lockRec);
-omp_set_lock(&lockRec); // Initially locked, unlocked when initial data are received
+//Initially locked, unlocked when initial data are received
+omp_set_lock(&lockRec);
#pragma omp parallel
{
// Computation loop
while(!Finished){
- // Sendings of data dependencies if there is no previous sending in progress
+ // Sendings of data dependencies if there is no previous sending
+ // in progress
if(!SendsInProgress){
// Potential copy of data to be sent in additional buffers
...
% \label{algo:ch6p2BasicAsyncReceptions}
\begin{Listing}{algo:ch6p2BasicAsyncReceptions}{Reception function in the basic asynchronous scheme}
// Variables declaration and initialization
-char countReceipts = 0; // Boolean indicating whether receptions are counted or not
+char countReceipts = 0; // Boolean indicating whether receptions are
+// counted or not
int nbEndMsg = 0; // Number of end messages received
int arrived = 0; // Boolean indicating if a message is arrived
int srcNd; // Source node of the message
// Management of data messages
switch(status.MPI_TAG){
case tagCom: // Management of data messages
- srcNd = status.MPI_SOURCE; // Get the source node of the message
+ // Get the source node of the message
+ srcNd = status.MPI_SOURCE;
// Actual data reception in the corresponding buffer
MPI_Recv(dataBufferOf(srcNd), nbDataOf(srcNd), dataTypeOf(srcNd), srcNd, tagCom, MPI_COMM_WORLD, &status);
- // Unlocking of the computing thread when data are received from all dependencies
+ // Unlocking of the computing thread when data are received
+ // from all dependencies
if(countReceipts == 1 && ... @\emph{receptions from ALL dependencies}@ ...){
omp_unset_lock(&lockRec);
countReceipts = 0; // No more counting after first iteration
\begin{Listing}{algo:ch6p2Sync}{Initialization of the synchronized scheme}
// Variables declarations and initialization
...
-omp_lock_t lockStates; // Controls the synchronous exchange of local states
-omp_lock_t lockIter; // Controls the synchronization at the end of each iteration
-char localCV = 0; // Boolean indicating whether the local stabilization is reached or not
-int nbOtherCVs = 0; // Number of other nodes being in local stabilization
+// Controls the synchronous exchange of local states
+omp_lock_t lockStates;
+// Controls the synchronization at the end of each iteration
+omp_lock_t lockIter;
+//Boolean indicating whether the local stabilization is reached or not
+char localCV = 0;
+// Number of other nodes being in local stabilization
+int nbOtherCVs = 0;
// Parameters reading
...
// OpenMP initialization (mainly declarations and setting up of locks)
...
omp_init_lock(&lockStates);
-omp_set_lock(&lockStates); // Initially locked, unlocked when all state messages are received
+// Initially locked, unlocked when all state messages are received
+omp_set_lock(&lockStates);
omp_init_lock(&lockIter);
-omp_set_lock(&lockIter); // Initially locked, unlocked when all "end of iteration" messages are received
+// Initially locked, unlocked when all "end of iteration" messages are
+// received
+omp_set_lock(&lockIter);
// Threads launching
#pragma omp parallel
// Waiting for the state messages receptions from the other nodes
omp_set_lock(&lockStates);
- // Determination of global convergence (if all nodes are in local CV)
+ //Determination of global convergence (if all nodes are in local CV)
if(localCV + nbOtherCVs == nbP){
// Entering global CV state
Finished = 1;
case tagState: // Management of local state messages
// Actual reception of the message
MPI_Recv(&recvdState, 1, MPI_CHAR, status.MPI_SOURCE, tagState, MPI_COMM_WORLD, &status);
- // Updates of numbers of stabilized nodes and received state msgs
+ // Updates of numbers of stabilized nodes and received state msgs
nbOtherCVs += recvdState;
nbStateMsg++;
- // Unlocking of the computing thread when states of all other nodes are received
+ // Unlocking of the computing thread when states of all other
+ // nodes are received
if(nbStateMsg == nbP-1){
nbStateMsg = 0;
omp_unset_lock(&lockStates);
// Actual reception of the message in dummy buffer
MPI_Recv(dummyBuffer, 1, MPI_CHAR, status.MPI_SOURCE, tagIter, MPI_COMM_WORLD, &status);
nbIterMsg++; // Update of the nb of iteration messages
- // Unlocking of the computing thread when iteration messages are received from all other nodes
+ // Unlocking of the computing thread when iteration messages
+ // are received from all other nodes
if(nbIterMsg == nbP - 1){
nbIterMsg = 0;
omp_unset_lock(&lockIter);
\begin{Listing}{algo:ch6p2AsyncSyncComp}{Computing function in the final asynchronous scheme}% without GPU computing.}
// Variables declarations and initialization
...
-OpMode curMode = SYNC; // Current operating mode (always begin in sync)
-double asyncStart; // Starting time of the current async section
-int nbSyncIter = 0; // Number of sync iterations done in async mode
+OpMode curMode = SYNC;// Current operating mode (always begin in sync)
+double asyncStart; // Starting time of the current async section
+int nbSyncIter = 0; // Number of sync iterations done in async mode
// Computation loop
while(!Finished){
// Entering synchronous mode when asyncDuration is reached
@% // (additional conditions can be specified if needed)
@ if(MPI_Wtime() - asyncStart >= asyncDuration){
- // Waiting for the end of previous sends before starting sync mode
+ // Waiting for the end of previous sends before starting sync mode
omp_set_lock(&lockSendsDone);
curMode = SYNC; // Entering synchronous mode
stampData(dataToSend, SYNC); // Mark data to send with sync flag
nbSyncIter = 0;
}
}else{
- // In main async mode, going back to async mode when the max number of sync iterations are done
+ // In main async mode, going back to async mode when the max number
+ // of sync iterations are done
if(mainMode == ASYNC){
nbSyncIter++; // Update of the number of sync iterations done
if(nbSyncIter == 2){
// Computation loop
while(!Finished){
- // Determination of the dynamic operating mode, sendings of data dependencies and blocking data receptions in sync mode
+ // Determination of the dynamic operating mode, sendings of data
+ // dependencies and blocking data receptions in sync mode
...
// Local GPU computation
// Data transfers from node RAM to GPU
CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer");
- ... // There may be several data transfers: typically A and b in linear problems
+ ... // There may be several data transfers: typically A and b in
+ // linear problems
// GPU grid definition
Db.x = BLOCK_SIZE_X; // BLOCK_SIZE_# are kernel design dependent
Db.y = BLOCK_SIZE_Y;
omp_lock_t lockWrite; // Controls exclusion of results access
... auxRes ... ; // Results of auxiliary computations
-// Parameters reading, MPI initialization, data initialization and distribution
+// Parameters reading, MPI initialization, data initialization and
+// distribution
...
// OpenMP initialization
...
omp_init_lock(&lockAux);
-omp_set_lock(&lockAux); // Unlocked when new aux results are available
+omp_set_lock(&lockAux);//Unlocked when new aux results are available
omp_init_lock(&lockRes);
omp_set_lock(&lockRes); // Unlocked when new results are available
omp_init_lock(&lockWrite);
// Computation loop
while(!Finished){
- // Determination of the dynamic operating mode, sendings of data dependencies and blocking data receptions in sync mode
+ // Determination of the dynamic operating mode, sendings of data
+ // dependencies and blocking data receptions in sync mode
...
// Local GPU computation
- // Data transfers from node RAM to GPU, GPU grid definition and init of shared mem
+ // Data transfers from node RAM to GPU, GPU grid definition and init
+ // of shared mem
CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer");
...
// Kernel call
for(ind=0; ind<resultsSize; ++ind){
auxInput[ind] = resultsInRam[ind];
}
- omp_unset_lock(&lockWrite); // Give back write access to main thread
+ omp_unset_lock(&lockWrite);//Give back write access to main thread
// Auxiliary computations with possible interruption at the end
for(ind=0; ind<auxSize && !Finished; ++ind){
// Computation of auxRes array according to auxInput
...
}
- // Informs main thread that new aux results are available in auxData
+// Informs main thread that new aux results are available in auxData
omp_test_lock(&lockAux); // Ensures mutex is locked when unlocking
omp_unset_lock(&lockAux);
}
// Computation loop (using the GPU)
for (int i = 0; i < NbIter; i++) {
- cudaMemcpy(gpuInputTabAdr, cpuInputTabAdr, // Data transfer:
- sizeof(float)*N, // CPU --> GPU (sync. op)
+ cudaMemcpy(gpuInputTabAdr, cpuInputTabAdr, // Data transfer:
+ sizeof(float)*N, // CPU --> GPU (sync. op)
cudaMemcpyHostToDevice);
- gpuKernel_k1<<<Dg,Db>>>(); // GPU comp. (async. op)
- MPI_Sendrecv_replace(cpuInputTabAdr, // MPI comms. (sync. op)
+ gpuKernel_k1<<<Dg,Db>>>(); // GPU comp. (async. op)
+ MPI_Sendrecv_replace(cpuInputTabAdr, // MPI comms. (sync. op)
N,MPI_FLOAT,
dest, 0, src, 0, ...);
// IF there is (now) a result to transfer from the GPU to the CPU:
- cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer:
- sizeof(float), // GPU --> CPU (sync. op)
+ cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer:
+ sizeof(float), // GPU --> CPU (sync. op)
cudaMemcpyDeviceToHost);
}
...
// - Thread 0: achieves MPI communications
if (omp_get_thread_num() == 0) {
- MPI_Sendrecv(current, // MPI comms. (sync. op)
+ MPI_Sendrecv(current, // MPI comms. (sync. op)
N, MPI_FLOAT, dest, 0,
future,
N, MPI_FLOAT, dest, 0, ...);
// - Thread 1: achieves the GPU sequence (GPU computations and
// CPU/GPU data transfers)
} else if (omp_get_thread_num() == 1) {
- cudaMemcpy(gpuInputTabAdr, current, // Data transfer:
- sizeof(float)*N, // CPU --> GPU (sync. op)
+ cudaMemcpy(gpuInputTabAdr, current, // Data transfer:
+ sizeof(float)*N, // CPU --> GPU (sync. op)
cudaMemcpyHostToDevice);
- gpuKernel_k1<<<Dg,Db>>>(); // GPU comp. (async. op)
- // IF there is (now) a result to transfer from the GPU to the CPU:
- cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer:
- sizeof(float), // GPU --> CPU (sync. op)
+ gpuKernel_k1<<<Dg,Db>>>(); // GPU comp. (async. op)
+ // IF there is (now) a result to transfer from the GPU to the CPU:
+ cudaMemcpy(cpuResTabAdr + i, gpuResAdr,// Data transfer:
+ sizeof(float), // GPU --> CPU (sync. op)
cudaMemcpyDeviceToHost);
}
for (int i = 0; i < NbIter; i++) {
// - Thread 0: achieves MPI communications
if (omp_get_thread_num() == 0) {
- MPI_Sendrecv(current, // MPI comms. (sync. op)
+ MPI_Sendrecv(current, // MPI comms. (sync. op)
N, MPI_FLOAT, dest, 0,
future,
N, MPI_FLOAT, dest, 0, ...);
- // - Thread 1: achieves the streamed GPU sequence (GPU computations
- // and CPU/GPU data transfers)
+ // - Thread 1: achieves the streamed GPU sequence (GPU
+ // computations and CPU/GPU data transfers)
} else if (omp_get_thread_num() == 1) {
- for (int s = 0; s < NbS; s++) { // Start all data transfers:
+ for (int s = 0; s < NbS; s++) { // Start all data transfers:
cudaMemcpyAsync(gpuInputTabAdr + s*stride, // CPU --> GPU
current + s*stride, // (async. ops)
sizeof(float)*stride,
cudaMemcpyHostToDevice,
TabS[s]);
}
- for (int s = 0; s < NbS; s++) { // Start all GPU comps. (async.)
+ for (int s = 0; s < NbS; s++) { // Start all GPU comps. (async.)
gpuKernel_k1<<<Dg, Db, 0, TabS[s]>>>(gpuInputTabAdr + s*stride);
}
- cudaThreadSynchronize(); // Wait all threads are ended
- // IF there is (now) a result to transfer from the GPU to the CPU:
- cudaMemcpy(cpuResTabAdr, // Data transfers:
- gpuResAdr, // GPU --> CPU (sync. op)
+ cudaThreadSynchronize(); // Wait all threads are ended
+ // IF there is (now) a result to transfer from the GPU to the CPU:
+ cudaMemcpy(cpuResTabAdr, // Data transfers:
+ gpuResAdr, // GPU --> CPU (sync. op)
sizeof(float),
cudaMemcpyDeviceToHost);
}
// - Thread 0: achieves MPI communications
if (omp_get_thread_num() == 0) {
if (i < NbIter) {
- MPI_Sendrecv(cpuCurrent, // MPI comms. (sync. op)
+ MPI_Sendrecv(cpuCurrent, // MPI comms. (sync. op)
N, MPI_FLOAT, dest, 0,
cpuFuture,
N, MPI_FLOAT, dest, 0, ...);
// - Thread 1: achieves the CPU/GPU data transfers
} else if (omp_get_thread_num() == 1) {
if (i < NbIter) {
- cudaMemcpy(gpuFuture, cpuCurrent, // Data transfer:
- sizeof(float)*N, // CPU --> GPU (sync. op)
+ cudaMemcpy(gpuFuture, cpuCurrent, // Data transfer:
+ sizeof(float)*N, // CPU --> GPU (sync. op)
cudaMemcpyHostToDevice);
}
- // - Thread 2: achieves the GPU computations and the result transfer
+ // - Thread 2: achieves the GPU computations and the result transfer
} else if (omp_get_thread_num() == 2) {
if (i > 0) {
- gpuKernel_k1<<<Dg,Db>>>(gpuCurrent); // GPU comp. (async. op)
+ gpuKernel_k1<<<Dg,Db>>>(gpuCurrent);// GPU comp. (async. op)
// IF there is (now) a result to transfer from GPU to CPU:
- cudaMemcpy(cpuResTabAdr + (i-1), // Data transfer:
- gpuResAdr, sizeof(float), // GPU --> CPU (sync. op)
+ cudaMemcpy(cpuResTabAdr + (i-1), // Data transfer:
+ gpuResAdr, sizeof(float),// GPU --> CPU (sync. op)
cudaMemcpyDeviceToHost);
}
}
@Book{BT89,
- author = {D.P.~Bertsekas and J.N.~Tsitsiklis},
+ author = {D. P. Bertsekas and J. N. Tsitsiklis},
ALTeditor = {},
title = {Parallel and Distributed Computation},
publisher = {Prentice Hall},
}
@InBook{ChVCV13,
- author = {St\'{e}phane Vialle and Sylvain Contassot-Vivier},
+ author = {S. Vialle and S. Contassot-Vivier},
editor = {Magoul\'{e}s, Fr\'{e}d\'{e}ric},
title = {Patterns for parallel programming on {G}{P}{U}s},
chapter = {Optimization methodology for Parallel Programming of Homogeneous or Hybrid Clusters},
}
@Book{BCC07,
- author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l},
+ author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.},
title = {Parallel Iterative Algorithms: from sequential to grid computing},
publisher = {Chapman \& Hall/CRC},
year = {2007},
}
@InProceedings{HPCS2002,
- author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l},
+ author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.},
title = {Asynchronism for Iterative Algorithms in a Global Computing Environment},
booktitle = {The 16th Annual International Symposium on High Performance
Computing Systems and Applications (HPCS'2002)},
}
@InProceedings{Vecpar08a,
- author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l},
+ author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.},
title = {An efficient and robust decentralized algorithm for detecting the global
convergence in asynchronous iterative algorithms},
booktitle = {8th International Meeting on High Performance Computing for Computational Science, VECPAR'08},
}
@article{ParCo05,
- author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l},
+ author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.},
title = {Evaluation of the Asynchronous Iterative Algorithms in the Context of Distant Heterogeneous Clusters},
journal = {Parallel Computing},
volume = {31},
}
@InProceedings{ECost10,
- author = {Contassot-Vivier, Sylvain and Vialle, St\'{e}phane and Jost, Thomas},
+ author = {Contassot-Vivier, S. and Vialle, S. and Jost, T.},
title = {Optimizing computing and energy performances on {GPU} clusters: experimentation on a {PDE} solver},
booktitle = {COST Action IC0804 on Large Scale Distributed Systems,1st Year},
OPTpages = {},
}
@Article{SuperCo05,
- author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l},
+ author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.},
title = {Performance comparison of parallel programming environments for implementing {AIAC} algorithms},
journal = {Journal of Supercomputing. Special Issue on Performance Modelling and Evaluation of Parallel and Distributed Systems},
year = {2006},
}
@InProceedings{Para10,
- author = {Contassot-Vivier, Sylvain and Jost, Thomas and Vialle, St\'{e}phane},
+ author = {Contassot-Vivier, S. and Jost, T. and Vialle, S.},
title = {Impact of asynchronism on {GPU} accelerated parallel iterative computations},
booktitle = {PARA 2010 conference: State of the Art in Scientific and Parallel Computing},
OPTpages = {--},
}
@InProceedings{ECost10,
- author = {Contassot-Vivier, Sylvain and Vialle, St\'{e}phane and Jost, Thomas},
+ author = {Contassot-Vivier, S. and Vialle, S. and Jost, T.},
title = {Optimizing computing and energy performances on {GPU} clusters: experimentation on a {PDE} solver},
booktitle = {COST Action IC0804 on Large Scale Distributed Systems,1st Year},
OPTpages = {},
}
@InCollection{JCVV10,
- author = {Thomas Jost and Sylvain Contassot-Vivier and St\'{e}phane Vialle},
+ author = {T. Jost and S. Contassot-Vivier and S. Vialle},
title = {An efficient multi-algorithm sparse linear solver for {GPU}s},
booktitle = {Parallel Computing : From Multicores and GPU's to Petascale},
pages = {546--553},
}
@InProceedings{ParCo09,
- author = {Thomas Jost and Sylvain Contassot-Vivier and St\'{e}phane Vialle},
+ author = {T. Jost and S. Contassot-Vivier and S. Vialle},
title = {An efficient multi-algorithms sparse linear solver for {GPU}s},
booktitle = {EuroGPU mini-symposium of the International Conference on Parallel Computing, ParCo'2009},
pages = {546--553},
}
@InProceedings{BCVG11,
- author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Giersch, Arnaud},
+ author = {Bahi, J. M. and Contassot-Vivier, S. and Giersch, A.},
title = {Load Balancing in Dynamic Networks by Bounded Delays Asynchronous Diffusion},
booktitle = {VECPAR 2010},
pages = {352--365},
}
@article{Hoefler08a,
-author = {Torsten Hoefler and Andrew Lumsdaine},
+author = {T. Hoefler and A. Lumsdaine},
title = {Overlapping Communication and Computation with High Level Communication Routines},
journal ={Cluster Computing and the Grid, IEEE International Symposium on},
OPTvolume = {0},
}
@Article{Valiant:BSP,
- author = {Valiant, Leslie G.},
+ author = {Valiant, L. G.},
title = {A bridging model for parallel computation},
journal = {Communications of the ACM},
year = 1990,
}
@inproceedings{gustedt:hal-00639289,
- AUTHOR = {Gustedt, Jens and Jeanvoine, Emmanuel},
+ AUTHOR = {Gustedt, J. and Jeanvoine, E.},
TITLE = {{Relaxed Synchronization with Ordered Read-Write Locks}},
BOOKTITLE = {{Euro-Par 2011: Parallel Processing Workshops}},
YEAR = {2012},
}
@article{clauss:2010:inria-00330024:1,
- AUTHOR = {Clauss, Pierre-Nicolas and Gustedt, Jens},
+ AUTHOR = {Clauss, P.-N. and Gustedt, J.},
TITLE = {{Iterative Computations with Ordered Read-Write Locks}},
JOURNAL = {{Journal of Parallel and Distributed Computing}},
PUBLISHER = {Elsevier},
TITLE = {The par{X}{X}{L} Environment: Scalable Fine Grained Development for Large Coarse Grained Platforms},
X-PAYS = {IT},
X-INTERNATIONAL-AUDIENCE = {yes},
- AUTHOR = {Gustedt, Jens AND Vialle, Stéphane AND De Vivo, Amelia},
+ AUTHOR = {Gustedt, J. AND Vialle, S. AND De Vivo, A.},
BOOKTITLE = {PARA 06},
LONG-BOOKTITLE = {PARA 06: Worshop on state-of-the-art in scientific and parallel computing },
EDITOR = {Bo K{\aa}gstr{\"o}m and others},
\newcommand{\Equ}[1]{(\ref{#1})}
\def\Reals{\mathbb{R}}
-\definecolor{shadecolor}{rgb}{0.95,0.95,0.95}
%\newenvironment{Algo}{\vspace{-1em}\begin{center}\begin{minipage}[h]{0.95\columnwidth}\begin{shaded}\begin{tabbing}%
% \hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\= \kill} %
% { \end{tabbing}\vspace{-1em}\end{shaded}\end{minipage}\end{center}\vspace{-1em}}
MM & $m \times (m-1)$ & $m \times (m-1)$ \\
\hline
\end{tabular}
-\vspace{0.5cm}
\caption{The different data structures of the $LB$ algorithm and their associated complexities in memory size and numbers of accesses. The parameters $n$, $m$ and $n'$ designate respectively the total number of jobs, the total number of machines and the number of remaining jobs to be scheduled for the sub-problems the lower bound is being computed.}
\label{ch8:tabMemComplex}
\end{table}
In order to achieve further performances, we also take care of adequately use the global memory by judiciously configuring the L1 cache which greatly enables improving performance over direct access to global memory. Indeed, the GPU device we are using in our experiments is based on the NVIDIA Fermi architecture which introduced two new hierarchies of memories (L1 $/$ L2 cache)
compared to older architectures.
-\begin{table*}
+\begin{table}
\centering
\footnotesize
\begin{tabular}{|r|r|r|r|r|r|}
$20 \times 20$ & 3.800 (3.8KB) & 3.800 (7.6KB) & 400 (0.4KB) & 20 (0.04KB) & 380 (0.76KB) \\
\hline
\end{tabular}
-\vspace{0.5cm}
\caption{The sizes of each data structure for the different experimented problem instances. The sizes are given in number of elements and in bytes (between brackets).}
\label{ch8:tabMemSizes}
-\end{table*}
+\end{table}
\vspace{0.2cm}
Table \ref{ch8:instance_time} gives, for each instance according to its number of jobs and its number of machines, the used resolution time with a sequential B\&B. For example, the sequential resolution time of each instance defined with $20$ jobs and $20$ machines is approximately 10 minutes. Of course, the computation time of the lower bound of a sub-problem defined with $20$ jobs and $20$ machines is on average greater than the computation time of the lower bound of a sub-problem defined with $50$ jobs and $20$ machines. Therefore, as shown in this table, the sequential resolution time increases with the size of the instance in order to be sure that the number of sub-problems explored is significant for all instances.
-\begin{table*}
+\begin{table}
\setlength{\tabcolsep}{0.2cm}
\renewcommand{\arraystretch}{1.2}
\centering
Sequential resolution time (minutes) & 10 & 50 & 150 & 300 \\
\hline
\end{tabular}
-\vspace{0.3cm}
\caption{The sequential resolution time of each instance according to its number of jobs and machines}
\label{ch8:instance_time}
-\end{table*}
+\end{table}
\subsection{Performance impact of GPU-based parallelism}
The results show also that the parallel efficiency decreases with the size of the problem instance. For a fixed number of machines (here 20 machines) and a fixed pool size, the obtained speedup decline accordingly with the number of jobs. For instance for a pool size of 262144, the acceleration factor obtained with 200 jobs (13.4) while it is (40.50) for the instances with 20 jobs. This behavior is mainly due to the overhead induced by the transfer of the pool of resulting sub-problems between the CPU and the GPU. For example, for the instances with 200 jobs the size of the pool to exchange between the CPU and the GPU is ten times bigger than the size of the pool for the instances with 20 jobs.
-\begin{table*}
+\begin{table}
\setlength{\tabcolsep}{0.2cm}
\renewcommand{\arraystretch}{1.2}
\centering
% \hline
% \hline
\end{tabular}
-\vspace{0.3cm}
\caption{Speedups for different problem instances and pool sizes with the GPU-PTE-BB approach.}
\label{ch8:ParaGPU1}
-\end{table*}
+\end{table}
The results obtained with the GPU-PEB-BB approach (see Table \ref{ch8:ParaGPU2}) show that evaluating in parallel the bounds of a selected pool, allow to significantly speedup the execution of the B\&B. Indeed, an acceleration factor up to 71.69 is obtained for the 200 $\times$ 20 problem instances using a pool of 262144 sub-problems. The results show also that the parallel efficiency grows with the size of the problem instance. For a fixed number of machines (here 20 machines) and a fixed pool size, the obtained speedup grows accordingly with the number of jobs. For instance for a pool size of 262144, the acceleration factor obtained with 200 jobs (71.69) is almost the double of the one obtained with 20 jobs (38.40).
As far the pool size tuning is considered, we could notice that this parameter depends strongly on the problem instance being solved. Indeed, while the best acceleration is obtained with a pool size of 8192 sub-problems for the instances 50 $\times$ 20 and 20 $\times$ 20, the best speedups are obtained with a pool size of 262144 sub-problems with the instances 200 $\times$ 20 and 100 $\times$ 20.\\
-\begin{table*}
+\begin{table}
\setlength{\tabcolsep}{0.2cm}
\renewcommand{\arraystretch}{1.2}
\centering
% \hline
% \hline
\end{tabular}
-\vspace{0.3cm}
\caption{Speedups for different problem instances and pool sizes with the GPU-PEB-BB approach.}
\label{ch8:ParaGPU2}
-\end{table*}
+\end{table}
Compared to the parallel tree exploration-based GPU-accelerated B\&B approach, the parallel evaluation of bounds approach is by far much more efficient wherever the instance is. For example, while the GPU-PEB-BB approach reaches speedup of $\times$71.69 for the instance with 200 jobs on 20 machines, a speedup of a $\times$13.4 is measured with the parallel tree exploration-based approach which corresponds to an acceleration of $\times$5.56 . Moreover, on the contrary to the GPU-PEB-BB approach, in the GPU-PTE-BB the speedups decrease when the problem instance becomes higher. Remember here that while in the GPU-PEB-BB approach all threads evaluate only one node each whatever the permutation size is. In the GPU-PTE-BB, each thread branches all the children of its assigned parent node. Therefore, the bigger the size of the permutation is, the bigger the amount of work performed by each thread is and the bigger the difference between the workload is. Indeed, let us suppose that for the instance with $200$ jobs, the thread $0$ handles a node from the level $2$ of the tree and the thread $100$ handles a node from the level $170$ of the tree. In this case, the thread $0$ generates and evaluates $198$ nodes while the thread $100$ decomposes and bounds only $30$ nodes. The problem in this example is that the kernel execution would last until the thread $0$ finishes its work while the other threads might have ended their works and stayed idle.
The objective of this section is to demonstrate that the thread divergence reduction mechanisms we propose has an impact on the performance of the GPU accelerated B\&B and to evaluate how this impact is significant.
In the following, the reported results are obtained with the GPU-accelerated B\&B based on the parallel evaluation of bounds.
-\begin{table*}[!h]
+\begin{table}[!h]
\setlength{\tabcolsep}{0.2cm}
\renewcommand{\arraystretch}{1.2}
\centering
% \hline
% \hline
\end{tabular}
-\vspace{0.3cm}
\caption{Speedups for different instances and pool sizes using thread divergence management.}
\label{ch8:ParaDivergence}
-\end{table*}
+\end{table}
Table~\ref{ch8:ParaDivergence} shows the experimental results obtained using the sorting process and the refactoring approach presented in Section \ref{ch8:ThreadDivergence}. Results show that the proposed optimizations emphasize the GPU acceleration reported in Table~\ref{ch8:ParaGPU2} and obtained without thread divergence reduction. For example, for the instances of 200 jobs over 20 machines and a pool size of 262144, the average reported speedup is 77.46 while the average acceleration factor obtained without thread divergence management for the same instances and the same pool size is 71.69 which corresponds to an improvement of 7.68\%. Such considerable but not outstanding improvement is predictable, as claimed in \cite{ch8:Han}, since the factorized part of the branches in the FSP lower bound is very small.
Table~\ref{ch8:PTM-on-SM} reports the speedups obtained for the first experimented scenario where only the matrix $PTM$ is put on the shared memory. Results show that the speedup grows on average with the growing of the pool size in the same way as in Table~\ref{ch8:ParaDivergence}. For the largest problem instance and pool size, putting the PTM matrix on the shared memory improves the speedups up to ($14\%$) compared to those obtained when $PTM$ is on global memory reaching an acceleration of $\times 90.51$ for the problem instances $200 \times 20$ and a pool size of $262144$ sub-problems .
-\begin{table*}
+\begin{table}
\centering
\footnotesize
\begin{tabular}{|r|r|r|r|r|r|r|r|}
% \hline
% \hline
\end{tabular}
-\vspace{0.3cm}
\caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. $PTM$ is placed in shared memory and all others are placed in global memory.}
\label{ch8:PTM-on-SM}
-\end{table*}
+\end{table}
Table~\ref{ch8:JM-on-SM} reports the behavior of the speedup averaged on the different problem instances (sizes) as a function of the pool size for the scenario where the Johnson's matrix is put on the shared memory. Results show that putting the $JM$ matrix on the shared matrix improves more the performances comparing to the first scenario where $PTM$ is put on the shared memory. Indeed, according to Table~\ref{ch8:tabMemComplex}, matrix $JM$ is accessed more frequently than matrix $PTM$. Putting $JM$ matrix on the shared memory allows accelerations up to $\times 97.83$ for the problem instances $200 \times 20$.
-\begin{table*}
+\begin{table}
\centering
\footnotesize
\begin{tabular}{|r|r|r|r|r|r|r|r|}
% \hline
% \hline
\end{tabular}
-\vspace{0.3cm}
\caption{Speedup for different FSP instances and pool sizes obtained with data access optimization.
$JM$ is placed in shared memory and all others are placed in global memory.}
\label{ch8:JM-on-SM}
-\end{table*}
+\end{table}
Table~\ref{ch8:JM-PTM-on-SM} reports the behavior of the average speedup for the different problem instances (sizes) with $20$ machines for the data placement scenario where both $PTM$ and $JM$ are put on shared memory. According to the underlying Table, the scenarios~(3) ($JM$ together or without $PTM$ in shared memory) is clearly better than the scenarii~(1)and~(2) (respectively $PTM$ in shared memory and $JM$ in shared memory) whatever is the problem instance (size).
-\begin{table*}
+\begin{table}
\centering
\footnotesize
\begin{tabular}{|r|r|r|r|r|r|r|r|}
% \hline
% \hline
\end{tabular}
-\vspace{0.3cm}
- \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization.
-$PTM$ and $JM$ are placed together in shared memory and all others are placed in global memory.}
+ \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. $PTM$ and $JM$ are placed together in shared memory and all others are placed in global memory.}
\label{ch8:JM-PTM-on-SM}
-\end{table*}
+\end{table}
By carefully analyzing each of the scenarii of data placement on the memory hierarchies of the GPU, the recommendation is to put in the shared memory the Johnson's and the processing time matrices ($JM$ and $PTM$) if they fit in together. Otherwise, the whole or a part of the Johnson's matrix has to be put in priority in the shared memory. The other data structures are mapped to the global memory.