\relax \@writefile{toc}{\author{Sylvain Contassot-Vivier}{}} \@writefile{toc}{\author{Stephane Vialle}{}} \@writefile{toc}{\author{Jens Gustedt}{}} \@writefile{loa}{\addvspace {10\p@ }} \@writefile{toc}{\contentsline {chapter}{\numberline {7}Development methodologies for GPU and cluster of GPUs}{107}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} \@writefile{toc}{\contentsline {section}{\numberline {7.1}Introduction}{108}} \newlabel{ch6:intro}{{7.1}{108}} \@writefile{toc}{\contentsline {section}{\numberline {7.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{108}} \newlabel{ch6:part1}{{7.2}{108}} \@writefile{toc}{\contentsline {subsection}{\numberline {7.2.1}Synchronous parallel algorithms on GPU clusters}{108}} \@writefile{lof}{\contentsline {figure}{\numberline {7.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{110}} \newlabel{fig:ch6p1overlapnative}{{7.1}{110}} \@writefile{toc}{\contentsline {subsection}{\numberline {7.2.2}Native overlap of CPU communications and GPU computations}{110}} \newlabel{algo:ch6p1overlapnative}{{7.1}{111}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{111}} \@writefile{lof}{\contentsline {figure}{\numberline {7.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{112}} \newlabel{fig:ch6p1overlapseqsequence}{{7.2}{112}} \@writefile{toc}{\contentsline {subsection}{\numberline {7.2.3}Overlapping with sequences of transfers and computations}{112}} \newlabel{algo:ch6p1overlapseqsequence}{{7.2}{113}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{113}} \@writefile{lof}{\contentsline {figure}{\numberline {7.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{115}} \newlabel{fig:ch6p1overlapstreamsequence}{{7.3}{115}} \newlabel{algo:ch6p1overlapstreamsequence}{{7.3}{115}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{115}} \@writefile{toc}{\contentsline {subsection}{\numberline {7.2.4}Interleaved communications-transfers-computations overlapping}{117}} \@writefile{lof}{\contentsline {figure}{\numberline {7.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{118}} \newlabel{fig:ch6p1overlapinterleaved}{{7.4}{118}} \newlabel{algo:ch6p1overlapinterleaved}{{7.4}{118}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{118}} \@writefile{toc}{\contentsline {subsection}{\numberline {7.2.5}Experimental validation}{120}} \newlabel{ch6:p1expes}{{7.2.5}{120}} \newlabel{ch6:p1block-cyclic}{{7.2.5}{120}} \@writefile{lof}{\contentsline {figure}{\numberline {7.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{121}} \newlabel{fig:ch6p1syncexpematrixprod}{{7.5}{121}} \@writefile{toc}{\contentsline {section}{\numberline {7.3}General scheme of asynchronous parallel code with computation/communication overlapping}{122}} \newlabel{ch6:part2}{{7.3}{122}} \@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Synchronous iterative scheme\relax }}{122}} \newlabel{algo:ch6p2sync}{{4}{122}} \@writefile{loa}{\contentsline {algocf}{\numberline {5}{\ignorespaces Asynchronous iterative scheme\relax }}{123}} \newlabel{algo:ch6p2async}{{5}{123}} \@writefile{toc}{\contentsline {subsection}{\numberline {7.3.1}A basic asynchronous scheme}{124}} \newlabel{ch6:p2BasicAsync}{{7.3.1}{124}} \newlabel{algo:ch6p2BasicAsync}{{7.5}{124}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.5}Initialization of the basic asynchronous scheme}{124}} \newlabel{algo:ch6p2BasicAsyncComp}{{7.6}{125}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.6}Computing function in the basic asynchronous scheme}{125}} \newlabel{algo:ch6p2BasicAsyncSendings}{{7.7}{127}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.7}Sending function in the basic asynchronous scheme}{127}} \newlabel{algo:ch6p2BasicAsyncReceptions}{{7.8}{127}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.8}Reception function in the basic asynchronous scheme}{127}} \@writefile{toc}{\contentsline {subsection}{\numberline {7.3.2}Synchronization of the asynchronous scheme}{129}} \newlabel{ch6:p2SsyncOverAsync}{{7.3.2}{129}} \newlabel{algo:ch6p2Sync}{{7.9}{129}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.9}Initialization of the synchronized scheme}{129}} \newlabel{algo:ch6p2SyncComp}{{7.10}{130}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.10}Computing function in the synchronized scheme}{130}} \newlabel{algo:ch6p2SyncReceptions}{{7.11}{132}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.11}Reception function in the synchronized scheme}{132}} \@writefile{toc}{\contentsline {subsection}{\numberline {7.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{133}} \newlabel{ch6:p2GPUAsync}{{7.3.3}{133}} \newlabel{algo:ch6p2AsyncSyncComp}{{7.12}{134}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.12}Computing function in the final asynchronous scheme}{134}} \newlabel{algo:ch6p2syncGPU}{{7.13}{135}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.13}Computing function in the final asynchronous scheme}{135}} \newlabel{algo:ch6p2FullOverAsyncMain}{{7.14}{138}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.14}Initialization of the main process of complete overlap with asynchronism}{138}} \newlabel{algo:ch6p2FullOverAsyncComp1}{{7.15}{139}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{139}} \newlabel{algo:ch6p2FullOverAsyncComp2}{{7.16}{140}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{140}} \@writefile{toc}{\contentsline {subsection}{\numberline {7.3.4}Experimental validation}{141}} \newlabel{sec:ch6p2expes}{{7.3.4}{141}} \@writefile{lof}{\contentsline {figure}{\numberline {7.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{142}} \newlabel{fig:ch6p2syncasync}{{7.6}{142}} \@writefile{lof}{\contentsline {figure}{\numberline {7.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{143}} \newlabel{fig:ch6p2aux}{{7.7}{143}} \@writefile{toc}{\contentsline {section}{\numberline {7.4}Perspective: A unifying programming model}{144}} \newlabel{sec:ch6p3unify}{{7.4}{144}} \@writefile{toc}{\contentsline {subsection}{\numberline {7.4.1}Resources}{144}} \newlabel{sec:ch6p3resources}{{7.4.1}{144}} \newlabel{algo:ch6p3ORWLresources}{{7.17}{145}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{145}} \@writefile{toc}{\contentsline {subsection}{\numberline {7.4.2}Control}{145}} \newlabel{sec:ch6p3ORWLcontrol}{{7.4.2}{145}} \@writefile{toc}{\contentsline {subsection}{\numberline {7.4.3}Example: block-cyclic matrix multiplication (MM)}{146}} \newlabel{sec:ch6p3ORWLMM}{{7.4.3}{146}} \newlabel{algo:ch6p3ORWLBCCMM}{{7.18}{146}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.18}Block-cyclic matrix multiplication, high level per task view}{146}} \newlabel{algo:ch6p3ORWLlcopy}{{7.19}{147}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.19}An iterative local copy operation}{147}} \newlabel{algo:ch6p3ORWLrcopy}{{7.20}{147}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{147}} \newlabel{algo:ch6p3ORWLtrans}{{7.21}{147}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{147}} \newlabel{algo:ch6p3ORWLdecl}{{7.22}{148}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.22}Dynamic declaration of handles to represent the resources}{148}} \newlabel{algo:ch6p3ORWLinit}{{7.23}{148}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {7.23}Dynamic initialization of access mode and priorities}{148}} \@writefile{toc}{\contentsline {subsection}{\numberline {7.4.4}Tasks and operations}{149}} \newlabel{sec:ch6p3tasks}{{7.4.4}{149}} \@writefile{toc}{\contentsline {section}{\numberline {7.5}Conclusion}{150}} \newlabel{ch6:conclu}{{7.5}{150}} \@writefile{toc}{\contentsline {section}{\numberline {7.6}Glossary}{150}} \@writefile{toc}{\contentsline {section}{Bibliography}{151}} \@setckpt{Chapters/chapter6/ch6}{ \setcounter{page}{153} \setcounter{equation}{0} \setcounter{enumi}{4} \setcounter{enumii}{0} \setcounter{enumiii}{0} \setcounter{enumiv}{21} \setcounter{footnote}{0} \setcounter{mpfootnote}{0} \setcounter{part}{3} \setcounter{chapter}{7} \setcounter{section}{6} \setcounter{subsection}{0} \setcounter{subsubsection}{0} \setcounter{paragraph}{0} \setcounter{subparagraph}{0} \setcounter{figure}{7} \setcounter{table}{0} \setcounter{numauthors}{0} \setcounter{parentequation}{8} \setcounter{subfigure}{0} \setcounter{lofdepth}{1} \setcounter{subtable}{0} \setcounter{lotdepth}{1} \setcounter{lstnumber}{17} \setcounter{ContinuedFloat}{0} \setcounter{AlgoLine}{6} \setcounter{algocfline}{5} \setcounter{algocfproc}{5} \setcounter{algocf}{5} \setcounter{nprt@mantissa@digitsbefore}{0} \setcounter{nprt@mantissa@digitsafter}{0} \setcounter{nprt@exponent@digitsbefore}{0} \setcounter{nprt@exponent@digitsafter}{0} \setcounter{nprt@digitsfirstblock}{0} \setcounter{nprt@blockcnt}{0} \setcounter{nprt@cntprint}{0} \setcounter{proposition}{0} \setcounter{theorem}{0} \setcounter{exercise}{0} \setcounter{example}{0} \setcounter{definition}{0} \setcounter{proof}{0} \setcounter{lstlisting}{23} }