-\@writefile{toc}{\contentsline {section}{\numberline {3.1}Introduction}{24}}
-\newlabel{ch6:intro}{{3.1}{24}}
-\@writefile{toc}{\contentsline {section}{\numberline {3.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{24}}
-\newlabel{ch6:part1}{{3.2}{24}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.1}Synchronous parallel algorithms on GPU clusters}{24}}
-\@writefile{lof}{\contentsline {figure}{\numberline {3.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{26}}
-\newlabel{fig:ch6p1overlapnative}{{3.1}{26}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.2}Native overlap of CPU communications and GPU computations}{26}}
-\newlabel{algo:ch6p1overlapnative}{{3.1}{27}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{27}}
-\@writefile{lof}{\contentsline {figure}{\numberline {3.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{28}}
-\newlabel{fig:ch6p1overlapseqsequence}{{3.2}{28}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.3}Overlapping with sequences of transfers and computations}{28}}
-\newlabel{algo:ch6p1overlapseqsequence}{{3.2}{29}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{29}}
-\@writefile{lof}{\contentsline {figure}{\numberline {3.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{30}}
-\newlabel{fig:ch6p1overlapstreamsequence}{{3.3}{30}}
-\newlabel{algo:ch6p1overlapstreamsequence}{{3.3}{31}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{31}}
-\@writefile{lof}{\contentsline {figure}{\numberline {3.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{33}}
-\newlabel{fig:ch6p1overlapinterleaved}{{3.4}{33}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.4}Interleaved communications-transfers-computations overlapping}{33}}
-\newlabel{algo:ch6p1overlapinterleaved}{{3.4}{34}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{34}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.5}Experimental validation}{36}}
-\newlabel{ch6:p1expes}{{3.2.5}{36}}
-\newlabel{ch6:p1block-cyclic}{{3.2.5}{36}}
-\@writefile{lof}{\contentsline {figure}{\numberline {3.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{37}}
-\newlabel{fig:ch6p1syncexpematrixprod}{{3.5}{37}}
-\@writefile{toc}{\contentsline {section}{\numberline {3.3}General scheme of asynchronous parallel code with computation/communication overlapping}{38}}
-\newlabel{ch6:part2}{{3.3}{38}}
-\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces Synchronous iterative scheme\relax }}{38}}
-\newlabel{algo:ch6p2sync}{{1}{38}}
-\@writefile{loa}{\contentsline {algorithm}{\numberline {2}{\ignorespaces Asynchronous iterative scheme\relax }}{38}}
-\newlabel{algo:ch6p2async}{{2}{38}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {3.3.1}A basic asynchronous scheme}{40}}
-\newlabel{ch6:p2BasicAsync}{{3.3.1}{40}}
-\newlabel{algo:ch6p2BasicAsync}{{3.5}{40}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.5}Initialization of the basic asynchronous scheme}{40}}
-\newlabel{algo:ch6p2BasicAsyncComp}{{3.6}{41}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.6}Computing function in the basic asynchronous scheme}{41}}
-\newlabel{algo:ch6p2BasicAsyncSendings}{{3.7}{42}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.7}Sending function in the basic asynchronous scheme}{42}}
-\newlabel{algo:ch6p2BasicAsyncReceptions}{{3.8}{43}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.8}Reception function in the basic asynchronous scheme}{43}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {3.3.2}Synchronization of the asynchronous scheme}{44}}
-\newlabel{ch6:p2SsyncOverAsync}{{3.3.2}{44}}
-\newlabel{algo:ch6p2Sync}{{3.9}{45}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.9}Initialization of the synchronized scheme}{45}}
-\newlabel{algo:ch6p2SyncComp}{{3.10}{46}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.10}Computing function in the synchronized scheme}{46}}
-\newlabel{algo:ch6p2SyncReceptions}{{3.11}{47}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.11}Reception function in the synchronized scheme}{47}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {3.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{48}}
-\newlabel{ch6:p2GPUAsync}{{3.3.3}{48}}
-\newlabel{algo:ch6p2AsyncSyncComp}{{3.12}{50}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.12}Computing function in the final asynchronous scheme}{50}}
-\newlabel{algo:ch6p2syncGPU}{{3.13}{51}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.13}Computing function in the final asynchronous scheme}{51}}
-\newlabel{algo:ch6p2FullOverAsyncMain}{{3.14}{53}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.14}Initialization of the main process of complete overlap with asynchronism}{53}}
-\newlabel{algo:ch6p2FullOverAsyncComp1}{{3.15}{54}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{54}}
-\newlabel{algo:ch6p2FullOverAsyncComp2}{{3.16}{55}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{55}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {3.3.4}Experimental validation}{56}}
-\newlabel{sec:ch6p2expes}{{3.3.4}{56}}
-\@writefile{lof}{\contentsline {figure}{\numberline {3.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{57}}
-\newlabel{fig:ch6p2syncasync}{{3.6}{57}}
-\@writefile{lof}{\contentsline {figure}{\numberline {3.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{58}}
-\newlabel{fig:ch6p2aux}{{3.7}{58}}
-\@writefile{toc}{\contentsline {section}{\numberline {3.4}Perspective: A unifying programming model}{59}}
-\newlabel{sec:ch6p3unify}{{3.4}{59}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {3.4.1}Resources}{59}}
-\newlabel{sec:ch6p3resources}{{3.4.1}{59}}
-\newlabel{algo:ch6p3ORWLresources}{{3.17}{60}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{60}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {3.4.2}Control}{60}}
-\newlabel{sec:ch6p3ORWLcontrol}{{3.4.2}{60}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {3.4.3}Example: block-cyclic matrix multiplication (MM)}{61}}
-\newlabel{sec:ch6p3ORWLMM}{{3.4.3}{61}}
-\newlabel{algo:ch6p3ORWLBCCMM}{{3.18}{61}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.18}Block-cyclic matrix multiplication, high level per task view}{61}}
-\newlabel{algo:ch6p3ORWLlcopy}{{3.19}{62}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.19}An iterative local copy operation}{62}}
-\newlabel{algo:ch6p3ORWLrcopy}{{3.20}{62}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{62}}
-\newlabel{algo:ch6p3ORWLtrans}{{3.21}{62}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{62}}
-\newlabel{algo:ch6p3ORWLdecl}{{3.22}{63}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.22}Dynamic declaration of handles to represent the resources}{63}}
-\newlabel{algo:ch6p3ORWLinit}{{3.23}{64}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.23}Dynamic initialization of access mode and priorities}{64}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {3.4.4}Tasks and operations}{64}}
-\newlabel{sec:ch6p3tasks}{{3.4.4}{64}}
-\@writefile{toc}{\contentsline {section}{\numberline {3.5}Conclusion}{65}}
-\newlabel{ch6:conclu}{{3.5}{65}}
-\@writefile{toc}{\contentsline {section}{\numberline {3.6}Glossary}{65}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{66}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.1}Introduction}{88}}
+\newlabel{ch6:intro}{{6.1}{88}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{88}}
+\newlabel{ch6:part1}{{6.2}{88}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.1}Synchronous parallel algorithms on GPU clusters}{88}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{90}}
+\newlabel{fig:ch6p1overlapnative}{{6.1}{90}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.2}Native overlap of CPU communications and GPU computations}{90}}
+\newlabel{algo:ch6p1overlapnative}{{6.1}{91}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{91}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{92}}
+\newlabel{fig:ch6p1overlapseqsequence}{{6.2}{92}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{92}}
+\newlabel{algo:ch6p1overlapseqsequence}{{6.2}{93}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{93}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{95}}
+\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{95}}
+\newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{95}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{95}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{97}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{98}}
+\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{98}}
+\newlabel{algo:ch6p1overlapinterleaved}{{6.4}{98}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{98}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{100}}
+\newlabel{ch6:p1expes}{{6.2.5}{100}}
+\newlabel{ch6:p1block-cyclic}{{6.2.5}{100}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{101}}
+\newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{101}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{102}}
+\newlabel{ch6:part2}{{6.3}{102}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{102}}
+\newlabel{algo:ch6p2sync}{{3}{102}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{103}}
+\newlabel{algo:ch6p2async}{{4}{103}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{104}}
+\newlabel{ch6:p2BasicAsync}{{6.3.1}{104}}
+\newlabel{algo:ch6p2BasicAsync}{{6.5}{104}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{104}}
+\newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{105}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{105}}
+\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{107}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{107}}
+\newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{107}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{107}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{109}}
+\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{109}}
+\newlabel{algo:ch6p2Sync}{{6.9}{109}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{109}}
+\newlabel{algo:ch6p2SyncComp}{{6.10}{110}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{110}}
+\newlabel{algo:ch6p2SyncReceptions}{{6.11}{112}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{112}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{113}}
+\newlabel{ch6:p2GPUAsync}{{6.3.3}{113}}
+\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{114}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{114}}
+\newlabel{algo:ch6p2syncGPU}{{6.13}{115}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{115}}
+\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{118}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{118}}
+\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{119}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{119}}
+\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{120}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{120}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{121}}
+\newlabel{sec:ch6p2expes}{{6.3.4}{121}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{122}}
+\newlabel{fig:ch6p2syncasync}{{6.6}{122}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{123}}
+\newlabel{fig:ch6p2aux}{{6.7}{123}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{124}}
+\newlabel{sec:ch6p3unify}{{6.4}{124}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{124}}
+\newlabel{sec:ch6p3resources}{{6.4.1}{124}}
+\newlabel{algo:ch6p3ORWLresources}{{6.17}{125}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{125}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{125}}
+\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{125}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{126}}
+\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{126}}
+\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{126}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{126}}
+\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{127}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{127}}
+\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{127}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{127}}
+\newlabel{algo:ch6p3ORWLtrans}{{6.21}{127}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{127}}
+\newlabel{algo:ch6p3ORWLdecl}{{6.22}{128}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{128}}
+\newlabel{algo:ch6p3ORWLinit}{{6.23}{128}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{128}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{129}}
+\newlabel{sec:ch6p3tasks}{{6.4.4}{129}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{130}}
+\newlabel{ch6:conclu}{{6.5}{130}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{130}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{131}}