\@writefile{toc}{\author{Stephane Vialle}{}}
\@writefile{toc}{\author{Jens Gustedt}{}}
\@writefile{loa}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {chapter}{\numberline {6}Development methodologies for GPU and cluster of GPUs}{89}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {7}Development methodologies for GPU and cluster of GPUs}{107}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {section}{\numberline {6.1}Introduction}{90}}
-\newlabel{ch6:intro}{{6.1}{90}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{90}}
-\newlabel{ch6:part1}{{6.2}{90}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.1}Synchronous parallel algorithms on GPU clusters}{90}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{92}}
-\newlabel{fig:ch6p1overlapnative}{{6.1}{92}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.2}Native overlap of CPU communications and GPU computations}{92}}
-\newlabel{algo:ch6p1overlapnative}{{6.1}{93}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{93}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{94}}
-\newlabel{fig:ch6p1overlapseqsequence}{{6.2}{94}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{94}}
-\newlabel{algo:ch6p1overlapseqsequence}{{6.2}{95}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{95}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{97}}
-\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{97}}
-\newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{97}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{97}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{99}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{100}}
-\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{100}}
-\newlabel{algo:ch6p1overlapinterleaved}{{6.4}{100}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{100}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{102}}
-\newlabel{ch6:p1expes}{{6.2.5}{102}}
-\newlabel{ch6:p1block-cyclic}{{6.2.5}{102}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{103}}
-\newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{103}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{104}}
-\newlabel{ch6:part2}{{6.3}{104}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{104}}
-\newlabel{algo:ch6p2sync}{{3}{104}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{105}}
-\newlabel{algo:ch6p2async}{{4}{105}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{106}}
-\newlabel{ch6:p2BasicAsync}{{6.3.1}{106}}
-\newlabel{algo:ch6p2BasicAsync}{{6.5}{106}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{106}}
-\newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{107}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{107}}
-\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{109}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{109}}
-\newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{109}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{109}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{111}}
-\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{111}}
-\newlabel{algo:ch6p2Sync}{{6.9}{111}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{111}}
-\newlabel{algo:ch6p2SyncComp}{{6.10}{112}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{112}}
-\newlabel{algo:ch6p2SyncReceptions}{{6.11}{114}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{114}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{115}}
-\newlabel{ch6:p2GPUAsync}{{6.3.3}{115}}
-\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{116}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{116}}
-\newlabel{algo:ch6p2syncGPU}{{6.13}{117}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{117}}
-\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{120}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{120}}
-\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{121}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{121}}
-\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{122}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{122}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{123}}
-\newlabel{sec:ch6p2expes}{{6.3.4}{123}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{124}}
-\newlabel{fig:ch6p2syncasync}{{6.6}{124}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{125}}
-\newlabel{fig:ch6p2aux}{{6.7}{125}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{126}}
-\newlabel{sec:ch6p3unify}{{6.4}{126}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{126}}
-\newlabel{sec:ch6p3resources}{{6.4.1}{126}}
-\newlabel{algo:ch6p3ORWLresources}{{6.17}{127}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{127}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{127}}
-\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{127}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{128}}
-\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{128}}
-\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{128}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{128}}
-\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{129}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{129}}
-\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{129}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{129}}
-\newlabel{algo:ch6p3ORWLtrans}{{6.21}{129}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{129}}
-\newlabel{algo:ch6p3ORWLdecl}{{6.22}{130}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{130}}
-\newlabel{algo:ch6p3ORWLinit}{{6.23}{130}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{130}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{131}}
-\newlabel{sec:ch6p3tasks}{{6.4.4}{131}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{132}}
-\newlabel{ch6:conclu}{{6.5}{132}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{132}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{133}}
+\@writefile{toc}{\contentsline {section}{\numberline {7.1}Introduction}{108}}
+\newlabel{ch6:intro}{{7.1}{108}}
+\@writefile{toc}{\contentsline {section}{\numberline {7.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{108}}
+\newlabel{ch6:part1}{{7.2}{108}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {7.2.1}Synchronous parallel algorithms on GPU clusters}{108}}
+\@writefile{lof}{\contentsline {figure}{\numberline {7.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{110}}
+\newlabel{fig:ch6p1overlapnative}{{7.1}{110}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {7.2.2}Native overlap of CPU communications and GPU computations}{110}}
+\newlabel{algo:ch6p1overlapnative}{{7.1}{111}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{111}}
+\@writefile{lof}{\contentsline {figure}{\numberline {7.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{112}}
+\newlabel{fig:ch6p1overlapseqsequence}{{7.2}{112}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {7.2.3}Overlapping with sequences of transfers and computations}{112}}
+\newlabel{algo:ch6p1overlapseqsequence}{{7.2}{113}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{113}}
+\@writefile{lof}{\contentsline {figure}{\numberline {7.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{115}}
+\newlabel{fig:ch6p1overlapstreamsequence}{{7.3}{115}}
+\newlabel{algo:ch6p1overlapstreamsequence}{{7.3}{115}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{115}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {7.2.4}Interleaved communications-transfers-computations overlapping}{117}}
+\@writefile{lof}{\contentsline {figure}{\numberline {7.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{118}}
+\newlabel{fig:ch6p1overlapinterleaved}{{7.4}{118}}
+\newlabel{algo:ch6p1overlapinterleaved}{{7.4}{118}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{118}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {7.2.5}Experimental validation}{120}}
+\newlabel{ch6:p1expes}{{7.2.5}{120}}
+\newlabel{ch6:p1block-cyclic}{{7.2.5}{120}}
+\@writefile{lof}{\contentsline {figure}{\numberline {7.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{121}}
+\newlabel{fig:ch6p1syncexpematrixprod}{{7.5}{121}}
+\@writefile{toc}{\contentsline {section}{\numberline {7.3}General scheme of asynchronous parallel code with computation/communication overlapping}{122}}
+\newlabel{ch6:part2}{{7.3}{122}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Synchronous iterative scheme\relax }}{122}}
+\newlabel{algo:ch6p2sync}{{4}{122}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {5}{\ignorespaces Asynchronous iterative scheme\relax }}{123}}
+\newlabel{algo:ch6p2async}{{5}{123}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {7.3.1}A basic asynchronous scheme}{124}}
+\newlabel{ch6:p2BasicAsync}{{7.3.1}{124}}
+\newlabel{algo:ch6p2BasicAsync}{{7.5}{124}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.5}Initialization of the basic asynchronous scheme}{124}}
+\newlabel{algo:ch6p2BasicAsyncComp}{{7.6}{125}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.6}Computing function in the basic asynchronous scheme}{125}}
+\newlabel{algo:ch6p2BasicAsyncSendings}{{7.7}{127}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.7}Sending function in the basic asynchronous scheme}{127}}
+\newlabel{algo:ch6p2BasicAsyncReceptions}{{7.8}{127}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.8}Reception function in the basic asynchronous scheme}{127}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {7.3.2}Synchronization of the asynchronous scheme}{129}}
+\newlabel{ch6:p2SsyncOverAsync}{{7.3.2}{129}}
+\newlabel{algo:ch6p2Sync}{{7.9}{129}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.9}Initialization of the synchronized scheme}{129}}
+\newlabel{algo:ch6p2SyncComp}{{7.10}{130}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.10}Computing function in the synchronized scheme}{130}}
+\newlabel{algo:ch6p2SyncReceptions}{{7.11}{132}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.11}Reception function in the synchronized scheme}{132}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {7.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{133}}
+\newlabel{ch6:p2GPUAsync}{{7.3.3}{133}}
+\newlabel{algo:ch6p2AsyncSyncComp}{{7.12}{134}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.12}Computing function in the final asynchronous scheme}{134}}
+\newlabel{algo:ch6p2syncGPU}{{7.13}{135}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.13}Computing function in the final asynchronous scheme}{135}}
+\newlabel{algo:ch6p2FullOverAsyncMain}{{7.14}{138}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.14}Initialization of the main process of complete overlap with asynchronism}{138}}
+\newlabel{algo:ch6p2FullOverAsyncComp1}{{7.15}{139}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{139}}
+\newlabel{algo:ch6p2FullOverAsyncComp2}{{7.16}{140}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{140}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {7.3.4}Experimental validation}{141}}
+\newlabel{sec:ch6p2expes}{{7.3.4}{141}}
+\@writefile{lof}{\contentsline {figure}{\numberline {7.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{142}}
+\newlabel{fig:ch6p2syncasync}{{7.6}{142}}
+\@writefile{lof}{\contentsline {figure}{\numberline {7.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{143}}
+\newlabel{fig:ch6p2aux}{{7.7}{143}}
+\@writefile{toc}{\contentsline {section}{\numberline {7.4}Perspective: A unifying programming model}{144}}
+\newlabel{sec:ch6p3unify}{{7.4}{144}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {7.4.1}Resources}{144}}
+\newlabel{sec:ch6p3resources}{{7.4.1}{144}}
+\newlabel{algo:ch6p3ORWLresources}{{7.17}{145}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{145}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {7.4.2}Control}{145}}
+\newlabel{sec:ch6p3ORWLcontrol}{{7.4.2}{145}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {7.4.3}Example: block-cyclic matrix multiplication (MM)}{146}}
+\newlabel{sec:ch6p3ORWLMM}{{7.4.3}{146}}
+\newlabel{algo:ch6p3ORWLBCCMM}{{7.18}{146}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.18}Block-cyclic matrix multiplication, high level per task view}{146}}
+\newlabel{algo:ch6p3ORWLlcopy}{{7.19}{147}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.19}An iterative local copy operation}{147}}
+\newlabel{algo:ch6p3ORWLrcopy}{{7.20}{147}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{147}}
+\newlabel{algo:ch6p3ORWLtrans}{{7.21}{147}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{147}}
+\newlabel{algo:ch6p3ORWLdecl}{{7.22}{148}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.22}Dynamic declaration of handles to represent the resources}{148}}
+\newlabel{algo:ch6p3ORWLinit}{{7.23}{148}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {7.23}Dynamic initialization of access mode and priorities}{148}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {7.4.4}Tasks and operations}{149}}
+\newlabel{sec:ch6p3tasks}{{7.4.4}{149}}
+\@writefile{toc}{\contentsline {section}{\numberline {7.5}Conclusion}{150}}
+\newlabel{ch6:conclu}{{7.5}{150}}
+\@writefile{toc}{\contentsline {section}{\numberline {7.6}Glossary}{150}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{151}}
\@setckpt{Chapters/chapter6/ch6}{
-\setcounter{page}{135}
+\setcounter{page}{153}
\setcounter{equation}{0}
\setcounter{enumi}{4}
\setcounter{enumii}{0}
\setcounter{footnote}{0}
\setcounter{mpfootnote}{0}
\setcounter{part}{3}
-\setcounter{chapter}{6}
+\setcounter{chapter}{7}
\setcounter{section}{6}
\setcounter{subsection}{0}
\setcounter{subsubsection}{0}
\setcounter{lstnumber}{17}
\setcounter{ContinuedFloat}{0}
\setcounter{AlgoLine}{6}
-\setcounter{algocfline}{4}
-\setcounter{algocfproc}{4}
-\setcounter{algocf}{4}
+\setcounter{algocfline}{5}
+\setcounter{algocfproc}{5}
+\setcounter{algocf}{5}
\setcounter{nprt@mantissa@digitsbefore}{0}
\setcounter{nprt@mantissa@digitsafter}{0}
\setcounter{nprt@exponent@digitsbefore}{0}