From: couturie Date: Sun, 5 May 2013 17:00:57 +0000 (+0200) Subject: new X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/6e645292c9053655e4e44bbebf1c1eb751a554a6 new --- diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index 1ca9c1a..e011a89 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -147,7 +147,7 @@ \makeindex -%\includeonly{Chapters/chapter15/ch15} +%\includeonly{Chapters/chapter5/ch5} \begin{document} @@ -184,7 +184,7 @@ \include{Chapters/chapter3/ch3} \include{Chapters/chapter4/ch4} \part{Software development} -\include{Chapters/chapter5/ch5} +\include{Chapters/chapter5/ch5} \include{Chapters/chapter6/ch6} \part{Optimization} \include{Chapters/chapter8/ch8} diff --git a/BookGPU/Chapters/chapter5/biblio5.bib b/BookGPU/Chapters/chapter5/biblio5.bib index 8cff215..04267a1 100644 --- a/BookGPU/Chapters/chapter5/biblio5.bib +++ b/BookGPU/Chapters/chapter5/biblio5.bib @@ -2,13 +2,13 @@ % Encoding: Cp1252 @INCOLLECTION{ch5:Bajrovic2011, - author = {Bajrovic, Enes and Traff, Jesper}, + author = {Bajrovic, E. and Traff, J.}, title = {Using MPI Derived Datatypes in Numerical Libraries}, booktitle = {Recent Advances in the Message Passing Interface}, publisher = {Springer Berlin / Heidelberg}, year = {2011}, - editor = {Cotronis, Yiannis and Danalis, Anthony and Nikolopoulos, Dimitrios - and Dongarra, Jack}, + editor = {Cotronis, Y. and Danalis, A. and Nikolopoulos, D. + and Dongarra, J.}, volume = {6960}, series = {Lecture Notes in Computer Science}, pages = {29-38}, @@ -23,7 +23,7 @@ } @ARTICLE{ch5:Bell2011, - author = {Nathan Bell and Jared Hoberock}, + author = {N. Bell and J. Hoberock}, title = {Thrust: A Productivity-Oriented Library for CUDA}, journal = {in GPU Computing Gems, Jade Edition, Edited by Wen-mei W. Hwu}, year = {2011}, @@ -48,7 +48,7 @@ title = {Design Patterns - Elements of Reusable Object-Oriented Software}, publisher = {Addison-Wesley Professional Computing Series}, year = {1995}, - author = {Erich Gamma and Richard Helm and Ralph Johnson and John Vlissides}, + author = {E. Gamma and R. Helm and R. Johnson and J. Vlissides}, owner = {slgl}, timestamp = {2012.09.10} } @@ -58,7 +58,7 @@ Interface, {\rm 2nd edition}}, publisher = {MIT Press}, year = {1999}, - author = {William Gropp and Ewing Lusk and Anthony Skjellum}, + author = {W. Gropp and E. Lusk and A. Skjellum}, address = {Cambridge, MA}, area = {M}, areaseq = {0} @@ -75,14 +75,14 @@ } @INCOLLECTION{ch5:Hoefler2011, - author = {Hoefler, Torsten and Snir, Marc}, + author = {Hoefler, T. and Snir, M.}, title = {Writing Parallel Libraries with MPI - Common Practice, Issues, and Extensions}, booktitle = {Recent Advances in the Message Passing Interface}, publisher = {Springer Berlin / Heidelberg}, year = {2011}, - editor = {Cotronis, Yiannis and Danalis, Anthony and Nikolopoulos, Dimitrios - and Dongarra, Jack}, + editor = {Cotronis, Y. and Danalis, A. and Nikolopoulos, D. + and Dongarra, J.}, volume = {6960}, series = {Lecture Notes in Computer Science}, pages = {345-355}, @@ -100,7 +100,7 @@ - steady-state and time-dependent problems}, publisher = {SIAM}, year = {2007}, - author = {Randall J. LeVeque}, + author = {R. J. LeVeque}, pages = {I-XV, 1-341}, bibsource = {DBLP, http://dblp.uni-trier.de}, ee = {http://www.ec-securehost.com/SIAM/OT98.html}, @@ -108,7 +108,7 @@ } @TECHREPORT{ch5:Skjellum1994, - author = {Anthony Skjellum and Nathan E. Doss and Purushotham V. Bangaloret}, + author = {A. Skjellum and N. E. Doss and P. V. Bangaloret}, title = {Writing Libraries in MPI}, institution = {Department of Computer Science and NSF Engineering Research Center for Computational Fiels Simulation. Mississippi State University}, @@ -133,7 +133,7 @@ title = {C++ Templates: The Complete Guide}, publisher = {Addison-Wesley Professional}, year = {2002}, - author = {David Vandevoorde and Nicolai M. Josuttis}, + author = {D. Vandevoorde and N. M. Josuttis}, pages = {552}, edition = {1st}, month = {November}, @@ -143,7 +143,7 @@ } @article{ch5:Korson1992, - author = {Korson, Tim and McGregor, John D.}, + author = {Korson, T. and McGregor, J. D.}, title = {Technical criteria for the specification and evaluation of object-oriented libraries}, journal = {Softw. Eng. J.}, issue_date = {March 1992}, @@ -168,7 +168,7 @@ } @INPROCEEDINGS{ch5:Glimberg2011, - AUTHOR = {Stefan L. Glimberg and Allan P. Engsig-Karup and Morten G. Madsen}, + AUTHOR = {S. L. Glimberg and A. P. Engsig-Karup and M. G. Madsen}, TITLE = {A Fast GPU-accelerated Mixed-precision Strategy for Fully Nonlinear Water Wave Computations}, BOOKTITLE = {Numerical Mathematics and Advanced Applications 2011, Proceedings of ENUMATH 2011, the 9th European Conference on Numerical Mathematics and Advanced Applications, Leicester, September 2011}, YEAR = {2011}, @@ -234,7 +234,7 @@ Since real world applications are naturally parallel and hardware is naturally p @book{ch5:chorin1993, title={A Mathematical Introduction to Fluid Mechanics}, - author={Chorin, A.J. and Marsden, J.E.}, + author={Chorin, A. J. and Marsden, J. E.}, isbn={9780387979182}, lccn={98115991}, series={Texts in Applied Mathematics}, @@ -244,7 +244,7 @@ Since real world applications are naturally parallel and hardware is naturally p } @book{ch5:Saad2003, - author = {Saad, Yousef}, + author = {Saad, Y.}, title = {Iterative Methods for Sparse Linear Systems}, year = {2003}, isbn = {0898715342}, @@ -255,7 +255,7 @@ Since real world applications are naturally parallel and hardware is naturally p @book{ch5:Kelley1995, title={Iterative Methods for Linear and Nonlinear Equations}, - author={Kelley, C.T.}, + author={Kelley, C. T.}, isbn={9780898713527}, lccn={lc95032249}, series={Frontiers in Applied Mathematics Series}, @@ -265,7 +265,7 @@ Since real world applications are naturally parallel and hardware is naturally p } @techreport{ch5:YMTR08, - author = {Yvon Maday}, + author = {Y. Maday}, title = {The parareal in time algorithm}, institution = {Universite Pierr\'{e} et Marie Curie}, year = {2008}, @@ -302,7 +302,7 @@ Since real world applications are naturally parallel and hardware is naturally p } @mastersthesis{ch5:ASNP12, - author = {Allan S. Nielsen}, + author = {A. S. Nielsen}, title = {Feasibility study of the Parareal algorithm}, school = {Technical University of Denmark, Department of Informatics and Mathematical Modeling}, year = {2012}, @@ -327,7 +327,7 @@ Since real world applications are naturally parallel and hardware is naturally p } @TECHREPORT{ch5:ScientificGrandChallenges2010, - author = {David L. Brown and Paul Messina et. al}, + author = {D. L. Brown and P. Messina et. al}, title = {Scientific Grand Challenges, Crosscutting technologies for computing at the exascale}, institution = {U.S. Department of Energy}, year = {2010}, @@ -336,7 +336,7 @@ Since real world applications are naturally parallel and hardware is naturally p } @article{ch5:Keyes2011, -author = {David E. Keyes}, +author = {D. E. Keyes}, title = {{Exaflop/s: The why and the how}}, journal = {Comptes Rendus Mecanique}, volume = {339}, @@ -381,7 +381,7 @@ masid = {49649121} } @inproceedings{ch5:Bell2009, - author = {Bell, Nathan and Garland, Michael}, + author = {Bell, N. and Garland, M.}, title = {Implementing sparse matrix-vector multiplication on throughput-oriented processors}, booktitle = {SC '09: Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis}, year = {2009}, @@ -394,7 +394,7 @@ masid = {49649121} } @book{ch5:Kirk2010, - author = {Kirk, David B. and Hwu, Wen-mei W.}, + author = {Kirk, D. B. and Hwu, W.-M. W.}, title = {Programming Massively Parallel Processors: A Hands-on Approach}, year = {2010}, isbn = {0123814723, 9780123814722}, @@ -405,7 +405,7 @@ masid = {49649121} @book{ch5:Trottenberg2001, title={Multigrid}, - author={Trottenberg, U. and Oosterlee, C.W. and Sch{\"u}ller, A.}, + author={Trottenberg, U. and Oosterlee, C. W. and Sch{\"u}ller, A.}, isbn={9780127010700}, lccn={00103940}, url={http://books.google.dk/books?id=9ysyNPZoR24C}, diff --git a/BookGPU/Chapters/chapter5/ch5.tex b/BookGPU/Chapters/chapter5/ch5.tex index 1c5da65..1d3d957 100644 --- a/BookGPU/Chapters/chapter5/ch5.tex +++ b/BookGPU/Chapters/chapter5/ch5.tex @@ -174,16 +174,16 @@ where $u(x,y,t)$ is the unknown heat distribution, $\kappa$ is a heat conductivi u(x,y,t_0) = \sin(\pi x)\,\sin(\pi y), & \qquad (x,y) \in \Omega. \end{align} An illustrative example of the numerical solution to the heat problem, using \eqref{ch5:eq:heatinit} as the initial condition is given in Figure \ref{ch5:fig:heatsolution}. -\begin{figure}[!htb] +\begin{figure}[!htbp] \begin{center} \setlength\figurewidth{0.3\textwidth} % - \setlength\figureheight{0.32\textwidth} % + \setlength\figureheight{0.3\textwidth} % \subfigure[$t=0.00s$]%{\input{Chapters/chapter5/figures/HeatSolution0.tikz}} -{\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/HeatSolution0_conv.pdf}} +{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_conv.pdf}} \subfigure[$t=0.05s$]%{\input{Chapters/chapter5/figures/HeatSolution0.049307.tikz}} -{\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/HeatSolution0_049307_conv.pdf}} +{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_049307_conv.pdf}} %\subfigure[$t=0.10s$]{\input{Chapters/chapter5/figures/HeatSolution0.099723.tikz}} -{\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/HeatSolution0_099723_conv.pdf}} +{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_099723_conv.pdf}} \end{center} \caption{Discrete solution at times $t=0s$ and $t=0.05s$, using \eqref{ch5:eq:heatinit} as initial condition and a small $20\times20$ numerical grid.}\label{ch5:fig:heatsolution} \end{figure} diff --git a/BookGPU/Chapters/chapter6/PartieAsync.tex b/BookGPU/Chapters/chapter6/PartieAsync.tex index 4a2b6d2..9c8de06 100644 --- a/BookGPU/Chapters/chapter6/PartieAsync.tex +++ b/BookGPU/Chapters/chapter6/PartieAsync.tex @@ -157,11 +157,15 @@ So, the global organization of this scheme is set up in \Lst{algo:ch6p2BasicAsyn % \label{algo:ch6p2BasicAsync} \begin{Listing}{algo:ch6p2BasicAsync}{Initialization of the basic asynchronous scheme} // Variables declaration and initialization -omp_lock_t lockSend; // Controls the sendings from the computing thread -omp_lock_t lockRec; // Ensures the initial reception of external data -char Finished = 0; // Boolean indicating the end of the process -char SendsInProgress = 0; // Boolean indicating if previous data sendings are still in progress -double Threshold; // Threshold of the residual for convergence detection +// Controls the sendings from the computing thread +omp_lock_t lockSend; +// Ensures the initial reception of external data +omp_lock_t lockRec; +char Finished = 0; // Boolean indicating the end of the process +// Boolean indicating if previous data sendings are still in progress +char SendsInProgress = 0; +// Threshold of the residual for convergence detection +double Threshold; // Parameters reading ... @@ -177,9 +181,10 @@ MPI_Comm_rank(MPI_COMM_WORLD, &numP); // OpenMP initialization (mainly declarations and setting up of locks) omp_set_num_threads(3); omp_init_lock(&lockSend); -omp_set_lock(&lockSend); // Initially locked, unlocked to start sendings +omp_set_lock(&lockSend);//Initially locked, unlocked to start sendings omp_init_lock(&lockRec); -omp_set_lock(&lockRec); // Initially locked, unlocked when initial data are received +//Initially locked, unlocked when initial data are received +omp_set_lock(&lockRec); #pragma omp parallel { @@ -236,7 +241,8 @@ double residual; // Residual of the current iteration // Computation loop while(!Finished){ - // Sendings of data dependencies if there is no previous sending in progress + // Sendings of data dependencies if there is no previous sending + // in progress if(!SendsInProgress){ // Potential copy of data to be sent in additional buffers ... @@ -345,7 +351,8 @@ The last function, detailed in \Lst{algo:ch6p2BasicAsyncReceptions}, does all th % \label{algo:ch6p2BasicAsyncReceptions} \begin{Listing}{algo:ch6p2BasicAsyncReceptions}{Reception function in the basic asynchronous scheme} // Variables declaration and initialization -char countReceipts = 0; // Boolean indicating whether receptions are counted or not +char countReceipts = 0; // Boolean indicating whether receptions are +// counted or not int nbEndMsg = 0; // Number of end messages received int arrived = 0; // Boolean indicating if a message is arrived int srcNd; // Source node of the message @@ -359,10 +366,12 @@ while(!Finished){ // Management of data messages switch(status.MPI_TAG){ case tagCom: // Management of data messages - srcNd = status.MPI_SOURCE; // Get the source node of the message + // Get the source node of the message + srcNd = status.MPI_SOURCE; // Actual data reception in the corresponding buffer MPI_Recv(dataBufferOf(srcNd), nbDataOf(srcNd), dataTypeOf(srcNd), srcNd, tagCom, MPI_COMM_WORLD, &status); - // Unlocking of the computing thread when data are received from all dependencies + // Unlocking of the computing thread when data are received + // from all dependencies if(countReceipts == 1 && ... @\emph{receptions from ALL dependencies}@ ...){ omp_unset_lock(&lockRec); countReceipts = 0; // No more counting after first iteration @@ -454,10 +463,14 @@ required to change the operating mode. \begin{Listing}{algo:ch6p2Sync}{Initialization of the synchronized scheme} // Variables declarations and initialization ... -omp_lock_t lockStates; // Controls the synchronous exchange of local states -omp_lock_t lockIter; // Controls the synchronization at the end of each iteration -char localCV = 0; // Boolean indicating whether the local stabilization is reached or not -int nbOtherCVs = 0; // Number of other nodes being in local stabilization +// Controls the synchronous exchange of local states +omp_lock_t lockStates; +// Controls the synchronization at the end of each iteration +omp_lock_t lockIter; +//Boolean indicating whether the local stabilization is reached or not +char localCV = 0; +// Number of other nodes being in local stabilization +int nbOtherCVs = 0; // Parameters reading ... @@ -468,9 +481,12 @@ int nbOtherCVs = 0; // Number of other nodes being in local stabilization // OpenMP initialization (mainly declarations and setting up of locks) ... omp_init_lock(&lockStates); -omp_set_lock(&lockStates); // Initially locked, unlocked when all state messages are received +// Initially locked, unlocked when all state messages are received +omp_set_lock(&lockStates); omp_init_lock(&lockIter); -omp_set_lock(&lockIter); // Initially locked, unlocked when all "end of iteration" messages are received +// Initially locked, unlocked when all "end of iteration" messages are +// received +omp_set_lock(&lockIter); // Threads launching #pragma omp parallel @@ -548,7 +564,7 @@ while(!Finished){ // Waiting for the state messages receptions from the other nodes omp_set_lock(&lockStates); - // Determination of global convergence (if all nodes are in local CV) + //Determination of global convergence (if all nodes are in local CV) if(localCV + nbOtherCVs == nbP){ // Entering global CV state Finished = 1; @@ -632,10 +648,11 @@ while(!Finished){ case tagState: // Management of local state messages // Actual reception of the message MPI_Recv(&recvdState, 1, MPI_CHAR, status.MPI_SOURCE, tagState, MPI_COMM_WORLD, &status); - // Updates of numbers of stabilized nodes and received state msgs + // Updates of numbers of stabilized nodes and received state msgs nbOtherCVs += recvdState; nbStateMsg++; - // Unlocking of the computing thread when states of all other nodes are received + // Unlocking of the computing thread when states of all other + // nodes are received if(nbStateMsg == nbP-1){ nbStateMsg = 0; omp_unset_lock(&lockStates); @@ -645,7 +662,8 @@ while(!Finished){ // Actual reception of the message in dummy buffer MPI_Recv(dummyBuffer, 1, MPI_CHAR, status.MPI_SOURCE, tagIter, MPI_COMM_WORLD, &status); nbIterMsg++; // Update of the nb of iteration messages - // Unlocking of the computing thread when iteration messages are received from all other nodes + // Unlocking of the computing thread when iteration messages + // are received from all other nodes if(nbIterMsg == nbP - 1){ nbIterMsg = 0; omp_unset_lock(&lockIter); @@ -740,9 +758,9 @@ iterations done (\texttt{nbSyncIter}). \begin{Listing}{algo:ch6p2AsyncSyncComp}{Computing function in the final asynchronous scheme}% without GPU computing.} // Variables declarations and initialization ... -OpMode curMode = SYNC; // Current operating mode (always begin in sync) -double asyncStart; // Starting time of the current async section -int nbSyncIter = 0; // Number of sync iterations done in async mode +OpMode curMode = SYNC;// Current operating mode (always begin in sync) +double asyncStart; // Starting time of the current async section +int nbSyncIter = 0; // Number of sync iterations done in async mode // Computation loop while(!Finished){ @@ -751,14 +769,15 @@ while(!Finished){ // Entering synchronous mode when asyncDuration is reached @% // (additional conditions can be specified if needed) @ if(MPI_Wtime() - asyncStart >= asyncDuration){ - // Waiting for the end of previous sends before starting sync mode + // Waiting for the end of previous sends before starting sync mode omp_set_lock(&lockSendsDone); curMode = SYNC; // Entering synchronous mode stampData(dataToSend, SYNC); // Mark data to send with sync flag nbSyncIter = 0; } }else{ - // In main async mode, going back to async mode when the max number of sync iterations are done + // In main async mode, going back to async mode when the max number + // of sync iterations are done if(mainMode == ASYNC){ nbSyncIter++; // Update of the number of sync iterations done if(nbSyncIter == 2){ @@ -827,12 +846,14 @@ dim3 Dg, Db; // CUDA kernel grids // Computation loop while(!Finished){ - // Determination of the dynamic operating mode, sendings of data dependencies and blocking data receptions in sync mode + // Determination of the dynamic operating mode, sendings of data + // dependencies and blocking data receptions in sync mode ... // Local GPU computation // Data transfers from node RAM to GPU CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer"); - ... // There may be several data transfers: typically A and b in linear problems + ... // There may be several data transfers: typically A and b in + // linear problems // GPU grid definition Db.x = BLOCK_SIZE_X; // BLOCK_SIZE_# are kernel design dependent Db.y = BLOCK_SIZE_Y; @@ -903,12 +924,13 @@ omp_lock_t lockRes; // Informs aux thread about new results omp_lock_t lockWrite; // Controls exclusion of results access ... auxRes ... ; // Results of auxiliary computations -// Parameters reading, MPI initialization, data initialization and distribution +// Parameters reading, MPI initialization, data initialization and +// distribution ... // OpenMP initialization ... omp_init_lock(&lockAux); -omp_set_lock(&lockAux); // Unlocked when new aux results are available +omp_set_lock(&lockAux);//Unlocked when new aux results are available omp_init_lock(&lockRes); omp_set_lock(&lockRes); // Unlocked when new results are available omp_init_lock(&lockWrite); @@ -963,10 +985,12 @@ dim3 Dg, Db; // CUDA kernel grids // Computation loop while(!Finished){ - // Determination of the dynamic operating mode, sendings of data dependencies and blocking data receptions in sync mode + // Determination of the dynamic operating mode, sendings of data + // dependencies and blocking data receptions in sync mode ... // Local GPU computation - // Data transfers from node RAM to GPU, GPU grid definition and init of shared mem + // Data transfers from node RAM to GPU, GPU grid definition and init + // of shared mem CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer"); ... // Kernel call @@ -1027,13 +1051,13 @@ while(!Finished){ for(ind=0; ind GPU (sync. op) + cudaMemcpy(gpuInputTabAdr, cpuInputTabAdr, // Data transfer: + sizeof(float)*N, // CPU --> GPU (sync. op) cudaMemcpyHostToDevice); - gpuKernel_k1<<>>(); // GPU comp. (async. op) - MPI_Sendrecv_replace(cpuInputTabAdr, // MPI comms. (sync. op) + gpuKernel_k1<<>>(); // GPU comp. (async. op) + MPI_Sendrecv_replace(cpuInputTabAdr, // MPI comms. (sync. op) N,MPI_FLOAT, dest, 0, src, 0, ...); // IF there is (now) a result to transfer from the GPU to the CPU: - cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer: - sizeof(float), // GPU --> CPU (sync. op) + cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer: + sizeof(float), // GPU --> CPU (sync. op) cudaMemcpyDeviceToHost); } ... @@ -258,7 +258,7 @@ omp_set_num_threads(2); // - Thread 0: achieves MPI communications if (omp_get_thread_num() == 0) { - MPI_Sendrecv(current, // MPI comms. (sync. op) + MPI_Sendrecv(current, // MPI comms. (sync. op) N, MPI_FLOAT, dest, 0, future, N, MPI_FLOAT, dest, 0, ...); @@ -266,13 +266,13 @@ omp_set_num_threads(2); // - Thread 1: achieves the GPU sequence (GPU computations and // CPU/GPU data transfers) } else if (omp_get_thread_num() == 1) { - cudaMemcpy(gpuInputTabAdr, current, // Data transfer: - sizeof(float)*N, // CPU --> GPU (sync. op) + cudaMemcpy(gpuInputTabAdr, current, // Data transfer: + sizeof(float)*N, // CPU --> GPU (sync. op) cudaMemcpyHostToDevice); - gpuKernel_k1<<>>(); // GPU comp. (async. op) - // IF there is (now) a result to transfer from the GPU to the CPU: - cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer: - sizeof(float), // GPU --> CPU (sync. op) + gpuKernel_k1<<>>(); // GPU comp. (async. op) + // IF there is (now) a result to transfer from the GPU to the CPU: + cudaMemcpy(cpuResTabAdr + i, gpuResAdr,// Data transfer: + sizeof(float), // GPU --> CPU (sync. op) cudaMemcpyDeviceToHost); } @@ -393,27 +393,27 @@ omp_set_num_threads(2); for (int i = 0; i < NbIter; i++) { // - Thread 0: achieves MPI communications if (omp_get_thread_num() == 0) { - MPI_Sendrecv(current, // MPI comms. (sync. op) + MPI_Sendrecv(current, // MPI comms. (sync. op) N, MPI_FLOAT, dest, 0, future, N, MPI_FLOAT, dest, 0, ...); - // - Thread 1: achieves the streamed GPU sequence (GPU computations - // and CPU/GPU data transfers) + // - Thread 1: achieves the streamed GPU sequence (GPU + // computations and CPU/GPU data transfers) } else if (omp_get_thread_num() == 1) { - for (int s = 0; s < NbS; s++) { // Start all data transfers: + for (int s = 0; s < NbS; s++) { // Start all data transfers: cudaMemcpyAsync(gpuInputTabAdr + s*stride, // CPU --> GPU current + s*stride, // (async. ops) sizeof(float)*stride, cudaMemcpyHostToDevice, TabS[s]); } - for (int s = 0; s < NbS; s++) { // Start all GPU comps. (async.) + for (int s = 0; s < NbS; s++) { // Start all GPU comps. (async.) gpuKernel_k1<<>>(gpuInputTabAdr + s*stride); } - cudaThreadSynchronize(); // Wait all threads are ended - // IF there is (now) a result to transfer from the GPU to the CPU: - cudaMemcpy(cpuResTabAdr, // Data transfers: - gpuResAdr, // GPU --> CPU (sync. op) + cudaThreadSynchronize(); // Wait all threads are ended + // IF there is (now) a result to transfer from the GPU to the CPU: + cudaMemcpy(cpuResTabAdr, // Data transfers: + gpuResAdr, // GPU --> CPU (sync. op) sizeof(float), cudaMemcpyDeviceToHost); } @@ -539,7 +539,7 @@ omp_set_num_threads(3); // - Thread 0: achieves MPI communications if (omp_get_thread_num() == 0) { if (i < NbIter) { - MPI_Sendrecv(cpuCurrent, // MPI comms. (sync. op) + MPI_Sendrecv(cpuCurrent, // MPI comms. (sync. op) N, MPI_FLOAT, dest, 0, cpuFuture, N, MPI_FLOAT, dest, 0, ...); @@ -547,17 +547,17 @@ omp_set_num_threads(3); // - Thread 1: achieves the CPU/GPU data transfers } else if (omp_get_thread_num() == 1) { if (i < NbIter) { - cudaMemcpy(gpuFuture, cpuCurrent, // Data transfer: - sizeof(float)*N, // CPU --> GPU (sync. op) + cudaMemcpy(gpuFuture, cpuCurrent, // Data transfer: + sizeof(float)*N, // CPU --> GPU (sync. op) cudaMemcpyHostToDevice); } - // - Thread 2: achieves the GPU computations and the result transfer + // - Thread 2: achieves the GPU computations and the result transfer } else if (omp_get_thread_num() == 2) { if (i > 0) { - gpuKernel_k1<<>>(gpuCurrent); // GPU comp. (async. op) + gpuKernel_k1<<>>(gpuCurrent);// GPU comp. (async. op) // IF there is (now) a result to transfer from GPU to CPU: - cudaMemcpy(cpuResTabAdr + (i-1), // Data transfer: - gpuResAdr, sizeof(float), // GPU --> CPU (sync. op) + cudaMemcpy(cpuResTabAdr + (i-1), // Data transfer: + gpuResAdr, sizeof(float),// GPU --> CPU (sync. op) cudaMemcpyDeviceToHost); } } diff --git a/BookGPU/Chapters/chapter6/biblio6.bib b/BookGPU/Chapters/chapter6/biblio6.bib index 65dfda7..641c3cf 100644 --- a/BookGPU/Chapters/chapter6/biblio6.bib +++ b/BookGPU/Chapters/chapter6/biblio6.bib @@ -17,7 +17,7 @@ @Book{BT89, - author = {D.P.~Bertsekas and J.N.~Tsitsiklis}, + author = {D. P. Bertsekas and J. N. Tsitsiklis}, ALTeditor = {}, title = {Parallel and Distributed Computation}, publisher = {Prentice Hall}, @@ -47,7 +47,7 @@ } @InBook{ChVCV13, - author = {St\'{e}phane Vialle and Sylvain Contassot-Vivier}, + author = {S. Vialle and S. Contassot-Vivier}, editor = {Magoul\'{e}s, Fr\'{e}d\'{e}ric}, title = {Patterns for parallel programming on {G}{P}{U}s}, chapter = {Optimization methodology for Parallel Programming of Homogeneous or Hybrid Clusters}, @@ -67,7 +67,7 @@ } @Book{BCC07, - author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l}, + author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.}, title = {Parallel Iterative Algorithms: from sequential to grid computing}, publisher = {Chapman \& Hall/CRC}, year = {2007}, @@ -76,7 +76,7 @@ } @InProceedings{HPCS2002, - author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l}, + author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.}, title = {Asynchronism for Iterative Algorithms in a Global Computing Environment}, booktitle = {The 16th Annual International Symposium on High Performance Computing Systems and Applications (HPCS'2002)}, @@ -87,7 +87,7 @@ Computing Systems and Applications (HPCS'2002)}, } @InProceedings{Vecpar08a, - author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l}, + author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.}, title = {An efficient and robust decentralized algorithm for detecting the global convergence in asynchronous iterative algorithms}, booktitle = {8th International Meeting on High Performance Computing for Computational Science, VECPAR'08}, @@ -98,7 +98,7 @@ convergence in asynchronous iterative algorithms}, } @article{ParCo05, - author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l}, + author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.}, title = {Evaluation of the Asynchronous Iterative Algorithms in the Context of Distant Heterogeneous Clusters}, journal = {Parallel Computing}, volume = {31}, @@ -108,7 +108,7 @@ convergence in asynchronous iterative algorithms}, } @InProceedings{ECost10, - author = {Contassot-Vivier, Sylvain and Vialle, St\'{e}phane and Jost, Thomas}, + author = {Contassot-Vivier, S. and Vialle, S. and Jost, T.}, title = {Optimizing computing and energy performances on {GPU} clusters: experimentation on a {PDE} solver}, booktitle = {COST Action IC0804 on Large Scale Distributed Systems,1st Year}, OPTpages = {}, @@ -119,7 +119,7 @@ convergence in asynchronous iterative algorithms}, } @Article{SuperCo05, - author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l}, + author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.}, title = {Performance comparison of parallel programming environments for implementing {AIAC} algorithms}, journal = {Journal of Supercomputing. Special Issue on Performance Modelling and Evaluation of Parallel and Distributed Systems}, year = {2006}, @@ -129,7 +129,7 @@ convergence in asynchronous iterative algorithms}, } @InProceedings{Para10, - author = {Contassot-Vivier, Sylvain and Jost, Thomas and Vialle, St\'{e}phane}, + author = {Contassot-Vivier, S. and Jost, T. and Vialle, S.}, title = {Impact of asynchronism on {GPU} accelerated parallel iterative computations}, booktitle = {PARA 2010 conference: State of the Art in Scientific and Parallel Computing}, OPTpages = {--}, @@ -139,7 +139,7 @@ convergence in asynchronous iterative algorithms}, } @InProceedings{ECost10, - author = {Contassot-Vivier, Sylvain and Vialle, St\'{e}phane and Jost, Thomas}, + author = {Contassot-Vivier, S. and Vialle, S. and Jost, T.}, title = {Optimizing computing and energy performances on {GPU} clusters: experimentation on a {PDE} solver}, booktitle = {COST Action IC0804 on Large Scale Distributed Systems,1st Year}, OPTpages = {}, @@ -150,7 +150,7 @@ convergence in asynchronous iterative algorithms}, } @InCollection{JCVV10, - author = {Thomas Jost and Sylvain Contassot-Vivier and St\'{e}phane Vialle}, + author = {T. Jost and S. Contassot-Vivier and S. Vialle}, title = {An efficient multi-algorithm sparse linear solver for {GPU}s}, booktitle = {Parallel Computing : From Multicores and GPU's to Petascale}, pages = {546--553}, @@ -170,7 +170,7 @@ convergence in asynchronous iterative algorithms}, } @InProceedings{ParCo09, - author = {Thomas Jost and Sylvain Contassot-Vivier and St\'{e}phane Vialle}, + author = {T. Jost and S. Contassot-Vivier and S. Vialle}, title = {An efficient multi-algorithms sparse linear solver for {GPU}s}, booktitle = {EuroGPU mini-symposium of the International Conference on Parallel Computing, ParCo'2009}, pages = {546--553}, @@ -180,7 +180,7 @@ convergence in asynchronous iterative algorithms}, } @InProceedings{BCVG11, - author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Giersch, Arnaud}, + author = {Bahi, J. M. and Contassot-Vivier, S. and Giersch, A.}, title = {Load Balancing in Dynamic Networks by Bounded Delays Asynchronous Diffusion}, booktitle = {VECPAR 2010}, pages = {352--365}, @@ -217,7 +217,7 @@ convergence in asynchronous iterative algorithms}, } @article{Hoefler08a, -author = {Torsten Hoefler and Andrew Lumsdaine}, +author = {T. Hoefler and A. Lumsdaine}, title = {Overlapping Communication and Computation with High Level Communication Routines}, journal ={Cluster Computing and the Grid, IEEE International Symposium on}, OPTvolume = {0}, @@ -230,7 +230,7 @@ address = {Los Alamitos, CA, USA}, } @Article{Valiant:BSP, - author = {Valiant, Leslie G.}, + author = {Valiant, L. G.}, title = {A bridging model for parallel computation}, journal = {Communications of the ACM}, year = 1990, @@ -240,7 +240,7 @@ address = {Los Alamitos, CA, USA}, } @inproceedings{gustedt:hal-00639289, - AUTHOR = {Gustedt, Jens and Jeanvoine, Emmanuel}, + AUTHOR = {Gustedt, J. and Jeanvoine, E.}, TITLE = {{Relaxed Synchronization with Ordered Read-Write Locks}}, BOOKTITLE = {{Euro-Par 2011: Parallel Processing Workshops}}, YEAR = {2012}, @@ -258,7 +258,7 @@ address = {Los Alamitos, CA, USA}, } @article{clauss:2010:inria-00330024:1, - AUTHOR = {Clauss, Pierre-Nicolas and Gustedt, Jens}, + AUTHOR = {Clauss, P.-N. and Gustedt, J.}, TITLE = {{Iterative Computations with Ordered Read-Write Locks}}, JOURNAL = {{Journal of Parallel and Distributed Computing}}, PUBLISHER = {Elsevier}, @@ -277,7 +277,7 @@ address = {Los Alamitos, CA, USA}, TITLE = {The par{X}{X}{L} Environment: Scalable Fine Grained Development for Large Coarse Grained Platforms}, X-PAYS = {IT}, X-INTERNATIONAL-AUDIENCE = {yes}, - AUTHOR = {Gustedt, Jens AND Vialle, Stéphane AND De Vivo, Amelia}, + AUTHOR = {Gustedt, J. AND Vialle, S. AND De Vivo, A.}, BOOKTITLE = {PARA 06}, LONG-BOOKTITLE = {PARA 06: Worshop on state-of-the-art in scientific and parallel computing }, EDITOR = {Bo K{\aa}gstr{\"o}m and others}, diff --git a/BookGPU/Chapters/chapter6/ch6.tex b/BookGPU/Chapters/chapter6/ch6.tex index 0cf6f84..676da7b 100755 --- a/BookGPU/Chapters/chapter6/ch6.tex +++ b/BookGPU/Chapters/chapter6/ch6.tex @@ -9,7 +9,6 @@ \newcommand{\Equ}[1]{(\ref{#1})} \def\Reals{\mathbb{R}} -\definecolor{shadecolor}{rgb}{0.95,0.95,0.95} %\newenvironment{Algo}{\vspace{-1em}\begin{center}\begin{minipage}[h]{0.95\columnwidth}\begin{shaded}\begin{tabbing}% % \hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\= \kill} % % { \end{tabbing}\vspace{-1em}\end{shaded}\end{minipage}\end{center}\vspace{-1em}} diff --git a/BookGPU/Chapters/chapter8/ch8.tex b/BookGPU/Chapters/chapter8/ch8.tex index 101a552..a2529b1 100644 --- a/BookGPU/Chapters/chapter8/ch8.tex +++ b/BookGPU/Chapters/chapter8/ch8.tex @@ -499,7 +499,6 @@ To reduce the computation time cost of the term $\min\limits_{(i,j)\in \jmath^2, MM & $m \times (m-1)$ & $m \times (m-1)$ \\ \hline \end{tabular} -\vspace{0.5cm} \caption{The different data structures of the $LB$ algorithm and their associated complexities in memory size and numbers of accesses. The parameters $n$, $m$ and $n'$ designate respectively the total number of jobs, the total number of machines and the number of remaining jobs to be scheduled for the sub-problems the lower bound is being computed.} \label{ch8:tabMemComplex} \end{table} @@ -518,7 +517,7 @@ The focus is put on the shared memory which is a key enabler for many high-perfo In order to achieve further performances, we also take care of adequately use the global memory by judiciously configuring the L1 cache which greatly enables improving performance over direct access to global memory. Indeed, the GPU device we are using in our experiments is based on the NVIDIA Fermi architecture which introduced two new hierarchies of memories (L1 $/$ L2 cache) compared to older architectures. -\begin{table*} +\begin{table} \centering \footnotesize \begin{tabular}{|r|r|r|r|r|r|} @@ -535,10 +534,9 @@ $50 \times 20$ & 9.500 (9.5KB) & 9.500 (19KB) & 1.000 (1KB) & 20 (0.04KB) & 380 $20 \times 20$ & 3.800 (3.8KB) & 3.800 (7.6KB) & 400 (0.4KB) & 20 (0.04KB) & 380 (0.76KB) \\ \hline \end{tabular} -\vspace{0.5cm} \caption{The sizes of each data structure for the different experimented problem instances. The sizes are given in number of elements and in bytes (between brackets).} \label{ch8:tabMemSizes} -\end{table*} +\end{table} \vspace{0.2cm} @@ -610,7 +608,7 @@ Using the approach defined in \cite{ch8:Mezmaz_2007}, it is possible to obtain a Table \ref{ch8:instance_time} gives, for each instance according to its number of jobs and its number of machines, the used resolution time with a sequential B\&B. For example, the sequential resolution time of each instance defined with $20$ jobs and $20$ machines is approximately 10 minutes. Of course, the computation time of the lower bound of a sub-problem defined with $20$ jobs and $20$ machines is on average greater than the computation time of the lower bound of a sub-problem defined with $50$ jobs and $20$ machines. Therefore, as shown in this table, the sequential resolution time increases with the size of the instance in order to be sure that the number of sub-problems explored is significant for all instances. -\begin{table*} +\begin{table} \setlength{\tabcolsep}{0.2cm} \renewcommand{\arraystretch}{1.2} \centering @@ -622,10 +620,9 @@ Instance (No. of jobs x No. of machines) & 20$\times$20 & 50$\times$20 & 100$\ti Sequential resolution time (minutes) & 10 & 50 & 150 & 300 \\ \hline \end{tabular} -\vspace{0.3cm} \caption{The sequential resolution time of each instance according to its number of jobs and machines} \label{ch8:instance_time} -\end{table*} +\end{table} \subsection{Performance impact of GPU-based parallelism} @@ -637,7 +634,7 @@ The results obtained with the GPU-PTE-BB approach (see Table \ref{ch8:ParaGPU1}) The results show also that the parallel efficiency decreases with the size of the problem instance. For a fixed number of machines (here 20 machines) and a fixed pool size, the obtained speedup decline accordingly with the number of jobs. For instance for a pool size of 262144, the acceleration factor obtained with 200 jobs (13.4) while it is (40.50) for the instances with 20 jobs. This behavior is mainly due to the overhead induced by the transfer of the pool of resulting sub-problems between the CPU and the GPU. For example, for the instances with 200 jobs the size of the pool to exchange between the CPU and the GPU is ten times bigger than the size of the pool for the instances with 20 jobs. -\begin{table*} +\begin{table} \setlength{\tabcolsep}{0.2cm} \renewcommand{\arraystretch}{1.2} \centering @@ -662,16 +659,15 @@ $20 \times $20 & 6.43 & 11.43 & 20.14 & 27.78 & 30.12 & 35.74 & 40.50\\ % \hline % \hline \end{tabular} -\vspace{0.3cm} \caption{Speedups for different problem instances and pool sizes with the GPU-PTE-BB approach.} \label{ch8:ParaGPU1} -\end{table*} +\end{table} The results obtained with the GPU-PEB-BB approach (see Table \ref{ch8:ParaGPU2}) show that evaluating in parallel the bounds of a selected pool, allow to significantly speedup the execution of the B\&B. Indeed, an acceleration factor up to 71.69 is obtained for the 200 $\times$ 20 problem instances using a pool of 262144 sub-problems. The results show also that the parallel efficiency grows with the size of the problem instance. For a fixed number of machines (here 20 machines) and a fixed pool size, the obtained speedup grows accordingly with the number of jobs. For instance for a pool size of 262144, the acceleration factor obtained with 200 jobs (71.69) is almost the double of the one obtained with 20 jobs (38.40). As far the pool size tuning is considered, we could notice that this parameter depends strongly on the problem instance being solved. Indeed, while the best acceleration is obtained with a pool size of 8192 sub-problems for the instances 50 $\times$ 20 and 20 $\times$ 20, the best speedups are obtained with a pool size of 262144 sub-problems with the instances 200 $\times$ 20 and 100 $\times$ 20.\\ -\begin{table*} +\begin{table} \setlength{\tabcolsep}{0.2cm} \renewcommand{\arraystretch}{1.2} \centering @@ -696,10 +692,9 @@ $20 \times $20 & 38.74 & \textbf{46.47} & 45.37 & 41.92 & 39.55 & 38.90 & 38.40\ % \hline % \hline \end{tabular} -\vspace{0.3cm} \caption{Speedups for different problem instances and pool sizes with the GPU-PEB-BB approach.} \label{ch8:ParaGPU2} -\end{table*} +\end{table} Compared to the parallel tree exploration-based GPU-accelerated B\&B approach, the parallel evaluation of bounds approach is by far much more efficient wherever the instance is. For example, while the GPU-PEB-BB approach reaches speedup of $\times$71.69 for the instance with 200 jobs on 20 machines, a speedup of a $\times$13.4 is measured with the parallel tree exploration-based approach which corresponds to an acceleration of $\times$5.56 . Moreover, on the contrary to the GPU-PEB-BB approach, in the GPU-PTE-BB the speedups decrease when the problem instance becomes higher. Remember here that while in the GPU-PEB-BB approach all threads evaluate only one node each whatever the permutation size is. In the GPU-PTE-BB, each thread branches all the children of its assigned parent node. Therefore, the bigger the size of the permutation is, the bigger the amount of work performed by each thread is and the bigger the difference between the workload is. Indeed, let us suppose that for the instance with $200$ jobs, the thread $0$ handles a node from the level $2$ of the tree and the thread $100$ handles a node from the level $170$ of the tree. In this case, the thread $0$ generates and evaluates $198$ nodes while the thread $100$ decomposes and bounds only $30$ nodes. The problem in this example is that the kernel execution would last until the thread $0$ finishes its work while the other threads might have ended their works and stayed idle. @@ -708,7 +703,7 @@ Compared to the parallel tree exploration-based GPU-accelerated B\&B approach, t The objective of this section is to demonstrate that the thread divergence reduction mechanisms we propose has an impact on the performance of the GPU accelerated B\&B and to evaluate how this impact is significant. In the following, the reported results are obtained with the GPU-accelerated B\&B based on the parallel evaluation of bounds. -\begin{table*}[!h] +\begin{table}[!h] \setlength{\tabcolsep}{0.2cm} \renewcommand{\arraystretch}{1.2} \centering @@ -734,10 +729,9 @@ $20 \times $20 & 41.71 & \textbf{50.28} & 49.19 & 45.90 & 42.03 & 41.80 & 41.65\ % \hline % \hline \end{tabular} -\vspace{0.3cm} \caption{Speedups for different instances and pool sizes using thread divergence management.} \label{ch8:ParaDivergence} -\end{table*} +\end{table} Table~\ref{ch8:ParaDivergence} shows the experimental results obtained using the sorting process and the refactoring approach presented in Section \ref{ch8:ThreadDivergence}. Results show that the proposed optimizations emphasize the GPU acceleration reported in Table~\ref{ch8:ParaGPU2} and obtained without thread divergence reduction. For example, for the instances of 200 jobs over 20 machines and a pool size of 262144, the average reported speedup is 77.46 while the average acceleration factor obtained without thread divergence management for the same instances and the same pool size is 71.69 which corresponds to an improvement of 7.68\%. Such considerable but not outstanding improvement is predictable, as claimed in \cite{ch8:Han}, since the factorized part of the branches in the FSP lower bound is very small. @@ -747,7 +741,7 @@ The objective of the experimental study presented in this section is to find the Table~\ref{ch8:PTM-on-SM} reports the speedups obtained for the first experimented scenario where only the matrix $PTM$ is put on the shared memory. Results show that the speedup grows on average with the growing of the pool size in the same way as in Table~\ref{ch8:ParaDivergence}. For the largest problem instance and pool size, putting the PTM matrix on the shared memory improves the speedups up to ($14\%$) compared to those obtained when $PTM$ is on global memory reaching an acceleration of $\times 90.51$ for the problem instances $200 \times 20$ and a pool size of $262144$ sub-problems . -\begin{table*} +\begin{table} \centering \footnotesize \begin{tabular}{|r|r|r|r|r|r|r|r|} @@ -771,14 +765,13 @@ $20 \times $20 & 41.94 & \textbf{60.10} & 48.28 & 39.86 & 39.61 & 38.93 & 37.79 % \hline % \hline \end{tabular} -\vspace{0.3cm} \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. $PTM$ is placed in shared memory and all others are placed in global memory.} \label{ch8:PTM-on-SM} -\end{table*} +\end{table} Table~\ref{ch8:JM-on-SM} reports the behavior of the speedup averaged on the different problem instances (sizes) as a function of the pool size for the scenario where the Johnson's matrix is put on the shared memory. Results show that putting the $JM$ matrix on the shared matrix improves more the performances comparing to the first scenario where $PTM$ is put on the shared memory. Indeed, according to Table~\ref{ch8:tabMemComplex}, matrix $JM$ is accessed more frequently than matrix $PTM$. Putting $JM$ matrix on the shared memory allows accelerations up to $\times 97.83$ for the problem instances $200 \times 20$. -\begin{table*} +\begin{table} \centering \footnotesize \begin{tabular}{|r|r|r|r|r|r|r|r|} @@ -802,15 +795,14 @@ $20 \times $20 & 49.00 & \textbf{60.25} & 55.50 & 45.88 & 44.47 & 43.11 & 42.82 % \hline % \hline \end{tabular} -\vspace{0.3cm} \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. $JM$ is placed in shared memory and all others are placed in global memory.} \label{ch8:JM-on-SM} -\end{table*} +\end{table} Table~\ref{ch8:JM-PTM-on-SM} reports the behavior of the average speedup for the different problem instances (sizes) with $20$ machines for the data placement scenario where both $PTM$ and $JM$ are put on shared memory. According to the underlying Table, the scenarios~(3) ($JM$ together or without $PTM$ in shared memory) is clearly better than the scenarii~(1)and~(2) (respectively $PTM$ in shared memory and $JM$ in shared memory) whatever is the problem instance (size). -\begin{table*} +\begin{table} \centering \footnotesize \begin{tabular}{|r|r|r|r|r|r|r|r|} @@ -834,11 +826,9 @@ $20 \times $20 & 53.64 & \textbf{61.47} & 59.55 & 51.39 & 47.40 & 46.53 & 46.37\ % \hline % \hline \end{tabular} -\vspace{0.3cm} - \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. -$PTM$ and $JM$ are placed together in shared memory and all others are placed in global memory.} + \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. $PTM$ and $JM$ are placed together in shared memory and all others are placed in global memory.} \label{ch8:JM-PTM-on-SM} -\end{table*} +\end{table} By carefully analyzing each of the scenarii of data placement on the memory hierarchies of the GPU, the recommendation is to put in the shared memory the Johnson's and the processing time matrices ($JM$ and $PTM$) if they fit in together. Otherwise, the whole or a part of the Johnson's matrix has to be put in priority in the shared memory. The other data structures are mapped to the global memory.