\@writefile{toc}{\author{Stephane Vialle}{}}
\@writefile{toc}{\author{Jens Gustedt}{}}
\@writefile{loa}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {chapter}{\numberline {5}Development methodologies for GPU and cluster of GPUs}{49}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {6}Development methodologies for GPU and cluster of GPUs}{81}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {section}{\numberline {5.1}Introduction}{50}}
-\newlabel{ch6:intro}{{5.1}{50}}
-\@writefile{toc}{\contentsline {section}{\numberline {5.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{50}}
-\newlabel{ch6:part1}{{5.2}{50}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.1}Synchronous parallel algorithms on GPU clusters}{50}}
-\@writefile{lof}{\contentsline {figure}{\numberline {5.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{52}}
-\newlabel{fig:ch6p1overlapnative}{{5.1}{52}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.2}Native overlap of CPU communications and GPU computations}{52}}
-\newlabel{algo:ch6p1overlapnative}{{5.1}{53}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{53}}
-\@writefile{lof}{\contentsline {figure}{\numberline {5.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{54}}
-\newlabel{fig:ch6p1overlapseqsequence}{{5.2}{54}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.3}Overlapping with sequences of transfers and computations}{54}}
-\newlabel{algo:ch6p1overlapseqsequence}{{5.2}{55}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{55}}
-\@writefile{lof}{\contentsline {figure}{\numberline {5.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{56}}
-\newlabel{fig:ch6p1overlapstreamsequence}{{5.3}{56}}
-\newlabel{algo:ch6p1overlapstreamsequence}{{5.3}{57}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{57}}
-\@writefile{lof}{\contentsline {figure}{\numberline {5.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{59}}
-\newlabel{fig:ch6p1overlapinterleaved}{{5.4}{59}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.4}Interleaved communications-transfers-computations overlapping}{59}}
-\newlabel{algo:ch6p1overlapinterleaved}{{5.4}{60}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{60}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.5}Experimental validation}{62}}
-\newlabel{ch6:p1expes}{{5.2.5}{62}}
-\newlabel{ch6:p1block-cyclic}{{5.2.5}{62}}
-\@writefile{lof}{\contentsline {figure}{\numberline {5.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{63}}
-\newlabel{fig:ch6p1syncexpematrixprod}{{5.5}{63}}
-\@writefile{toc}{\contentsline {section}{\numberline {5.3}General scheme of asynchronous parallel code with computation/communication overlapping}{64}}
-\newlabel{ch6:part2}{{5.3}{64}}
-\@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{64}}
-\newlabel{algo:ch6p2sync}{{3}{64}}
-\@writefile{loa}{\contentsline {algorithm}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{64}}
-\newlabel{algo:ch6p2async}{{4}{64}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {5.3.1}A basic asynchronous scheme}{66}}
-\newlabel{ch6:p2BasicAsync}{{5.3.1}{66}}
-\newlabel{algo:ch6p2BasicAsync}{{5.5}{66}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.5}Initialization of the basic asynchronous scheme}{66}}
-\newlabel{algo:ch6p2BasicAsyncComp}{{5.6}{67}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.6}Computing function in the basic asynchronous scheme}{67}}
-\newlabel{algo:ch6p2BasicAsyncSendings}{{5.7}{68}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.7}Sending function in the basic asynchronous scheme}{68}}
-\newlabel{algo:ch6p2BasicAsyncReceptions}{{5.8}{69}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.8}Reception function in the basic asynchronous scheme}{69}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {5.3.2}Synchronization of the asynchronous scheme}{70}}
-\newlabel{ch6:p2SsyncOverAsync}{{5.3.2}{70}}
-\newlabel{algo:ch6p2Sync}{{5.9}{71}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.9}Initialization of the synchronized scheme}{71}}
-\newlabel{algo:ch6p2SyncComp}{{5.10}{72}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.10}Computing function in the synchronized scheme}{72}}
-\newlabel{algo:ch6p2SyncReceptions}{{5.11}{73}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.11}Reception function in the synchronized scheme}{73}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {5.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{74}}
-\newlabel{ch6:p2GPUAsync}{{5.3.3}{74}}
-\newlabel{algo:ch6p2AsyncSyncComp}{{5.12}{76}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.12}Computing function in the final asynchronous scheme}{76}}
-\newlabel{algo:ch6p2syncGPU}{{5.13}{77}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.13}Computing function in the final asynchronous scheme}{77}}
-\newlabel{algo:ch6p2FullOverAsyncMain}{{5.14}{79}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.14}Initialization of the main process of complete overlap with asynchronism}{79}}
-\newlabel{algo:ch6p2FullOverAsyncComp1}{{5.15}{80}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{80}}
-\newlabel{algo:ch6p2FullOverAsyncComp2}{{5.16}{81}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{81}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {5.3.4}Experimental validation}{82}}
-\newlabel{sec:ch6p2expes}{{5.3.4}{82}}
-\@writefile{lof}{\contentsline {figure}{\numberline {5.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{83}}
-\newlabel{fig:ch6p2syncasync}{{5.6}{83}}
-\@writefile{lof}{\contentsline {figure}{\numberline {5.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{84}}
-\newlabel{fig:ch6p2aux}{{5.7}{84}}
-\@writefile{toc}{\contentsline {section}{\numberline {5.4}Perspective: A unifying programming model}{85}}
-\newlabel{sec:ch6p3unify}{{5.4}{85}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {5.4.1}Resources}{85}}
-\newlabel{sec:ch6p3resources}{{5.4.1}{85}}
-\newlabel{algo:ch6p3ORWLresources}{{5.17}{86}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{86}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {5.4.2}Control}{86}}
-\newlabel{sec:ch6p3ORWLcontrol}{{5.4.2}{86}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {5.4.3}Example: block-cyclic matrix multiplication (MM)}{87}}
-\newlabel{sec:ch6p3ORWLMM}{{5.4.3}{87}}
-\newlabel{algo:ch6p3ORWLBCCMM}{{5.18}{87}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.18}Block-cyclic matrix multiplication, high level per task view}{87}}
-\newlabel{algo:ch6p3ORWLlcopy}{{5.19}{88}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.19}An iterative local copy operation}{88}}
-\newlabel{algo:ch6p3ORWLrcopy}{{5.20}{88}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{88}}
-\newlabel{algo:ch6p3ORWLtrans}{{5.21}{88}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{88}}
-\newlabel{algo:ch6p3ORWLdecl}{{5.22}{89}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.22}Dynamic declaration of handles to represent the resources}{89}}
-\newlabel{algo:ch6p3ORWLinit}{{5.23}{90}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.23}Dynamic initialization of access mode and priorities}{90}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {5.4.4}Tasks and operations}{90}}
-\newlabel{sec:ch6p3tasks}{{5.4.4}{90}}
-\@writefile{toc}{\contentsline {section}{\numberline {5.5}Conclusion}{91}}
-\newlabel{ch6:conclu}{{5.5}{91}}
-\@writefile{toc}{\contentsline {section}{\numberline {5.6}Glossary}{91}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{92}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.1}Introduction}{82}}
+\newlabel{ch6:intro}{{6.1}{82}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{82}}
+\newlabel{ch6:part1}{{6.2}{82}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.1}Synchronous parallel algorithms on GPU clusters}{82}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{84}}
+\newlabel{fig:ch6p1overlapnative}{{6.1}{84}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.2}Native overlap of CPU communications and GPU computations}{84}}
+\newlabel{algo:ch6p1overlapnative}{{6.1}{85}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{85}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{86}}
+\newlabel{fig:ch6p1overlapseqsequence}{{6.2}{86}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{86}}
+\newlabel{algo:ch6p1overlapseqsequence}{{6.2}{87}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{87}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{88}}
+\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{88}}
+\newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{89}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{89}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{91}}
+\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{91}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{91}}
+\newlabel{algo:ch6p1overlapinterleaved}{{6.4}{92}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{92}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{94}}
+\newlabel{ch6:p1expes}{{6.2.5}{94}}
+\newlabel{ch6:p1block-cyclic}{{6.2.5}{94}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{95}}
+\newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{95}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{96}}
+\newlabel{ch6:part2}{{6.3}{96}}
+\@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{96}}
+\newlabel{algo:ch6p2sync}{{3}{96}}
+\@writefile{loa}{\contentsline {algorithm}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{96}}
+\newlabel{algo:ch6p2async}{{4}{96}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{98}}
+\newlabel{ch6:p2BasicAsync}{{6.3.1}{98}}
+\newlabel{algo:ch6p2BasicAsync}{{6.5}{98}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{98}}
+\newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{99}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{99}}
+\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{100}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{100}}
+\newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{101}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{101}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{102}}
+\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{102}}
+\newlabel{algo:ch6p2Sync}{{6.9}{103}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{103}}
+\newlabel{algo:ch6p2SyncComp}{{6.10}{104}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{104}}
+\newlabel{algo:ch6p2SyncReceptions}{{6.11}{105}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{105}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{106}}
+\newlabel{ch6:p2GPUAsync}{{6.3.3}{106}}
+\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{108}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{108}}
+\newlabel{algo:ch6p2syncGPU}{{6.13}{109}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{109}}
+\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{111}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{111}}
+\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{112}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{112}}
+\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{113}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{113}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{114}}
+\newlabel{sec:ch6p2expes}{{6.3.4}{114}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{115}}
+\newlabel{fig:ch6p2syncasync}{{6.6}{115}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{116}}
+\newlabel{fig:ch6p2aux}{{6.7}{116}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{117}}
+\newlabel{sec:ch6p3unify}{{6.4}{117}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{117}}
+\newlabel{sec:ch6p3resources}{{6.4.1}{117}}
+\newlabel{algo:ch6p3ORWLresources}{{6.17}{118}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{118}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{118}}
+\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{118}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{119}}
+\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{119}}
+\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{119}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{119}}
+\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{120}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{120}}
+\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{120}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{120}}
+\newlabel{algo:ch6p3ORWLtrans}{{6.21}{120}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{120}}
+\newlabel{algo:ch6p3ORWLdecl}{{6.22}{121}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{121}}
+\newlabel{algo:ch6p3ORWLinit}{{6.23}{122}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{122}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{122}}
+\newlabel{sec:ch6p3tasks}{{6.4.4}{122}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{123}}
+\newlabel{ch6:conclu}{{6.5}{123}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{123}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{124}}
\@setckpt{Chapters/chapter6/ch6}{
-\setcounter{page}{94}
+\setcounter{page}{126}
\setcounter{equation}{0}
\setcounter{enumi}{4}
\setcounter{enumii}{0}
\setcounter{footnote}{0}
\setcounter{mpfootnote}{0}
\setcounter{part}{1}
-\setcounter{chapter}{5}
+\setcounter{chapter}{6}
\setcounter{section}{6}
\setcounter{subsection}{0}
\setcounter{subsubsection}{0}
\setcounter{figure}{7}
\setcounter{table}{0}
\setcounter{numauthors}{0}
-\setcounter{parentequation}{0}
+\setcounter{parentequation}{8}
\setcounter{subfigure}{0}
\setcounter{lofdepth}{1}
\setcounter{subtable}{0}
\setcounter{algocfproc}{0}
\setcounter{algocf}{0}
\setcounter{proposition}{0}
+\setcounter{theorem}{0}
+\setcounter{exercise}{0}
+\setcounter{example}{0}
+\setcounter{definition}{0}
\setcounter{proof}{0}
\setcounter{lstlisting}{23}
}