\relax \@writefile{toc}{\author{Sylvain Contassot-Vivier}{}} \@writefile{toc}{\author{Stephane Vialle}{}} \@writefile{toc}{\author{Jens Gustedt}{}} \@writefile{loa}{\addvspace {10\p@ }} \@writefile{toc}{\contentsline {chapter}{\numberline {6}Development methodologies for GPU and cluster of GPUs}{81}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} \@writefile{toc}{\contentsline {section}{\numberline {6.1}Introduction}{82}} \newlabel{ch6:intro}{{6.1}{82}} \@writefile{toc}{\contentsline {section}{\numberline {6.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{82}} \newlabel{ch6:part1}{{6.2}{82}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.2.1}Synchronous parallel algorithms on GPU clusters}{82}} \@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{84}} \newlabel{fig:ch6p1overlapnative}{{6.1}{84}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.2.2}Native overlap of CPU communications and GPU computations}{84}} \newlabel{algo:ch6p1overlapnative}{{6.1}{85}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{85}} \@writefile{lof}{\contentsline {figure}{\numberline {6.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{86}} \newlabel{fig:ch6p1overlapseqsequence}{{6.2}{86}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{86}} \newlabel{algo:ch6p1overlapseqsequence}{{6.2}{87}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{87}} \@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{88}} \newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{88}} \newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{89}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{89}} \@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{91}} \newlabel{fig:ch6p1overlapinterleaved}{{6.4}{91}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{91}} \newlabel{algo:ch6p1overlapinterleaved}{{6.4}{92}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{92}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{94}} \newlabel{ch6:p1expes}{{6.2.5}{94}} \newlabel{ch6:p1block-cyclic}{{6.2.5}{94}} \@writefile{lof}{\contentsline {figure}{\numberline {6.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{95}} \newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{95}} \@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{96}} \newlabel{ch6:part2}{{6.3}{96}} \@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{96}} \newlabel{algo:ch6p2sync}{{3}{96}} \@writefile{loa}{\contentsline {algorithm}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{96}} \newlabel{algo:ch6p2async}{{4}{96}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{98}} \newlabel{ch6:p2BasicAsync}{{6.3.1}{98}} \newlabel{algo:ch6p2BasicAsync}{{6.5}{98}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{98}} \newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{99}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{99}} \newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{100}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{100}} \newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{101}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{101}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{102}} \newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{102}} \newlabel{algo:ch6p2Sync}{{6.9}{103}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{103}} \newlabel{algo:ch6p2SyncComp}{{6.10}{104}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{104}} \newlabel{algo:ch6p2SyncReceptions}{{6.11}{105}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{105}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{106}} \newlabel{ch6:p2GPUAsync}{{6.3.3}{106}} \newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{108}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{108}} \newlabel{algo:ch6p2syncGPU}{{6.13}{109}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{109}} \newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{111}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{111}} \newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{112}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{112}} \newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{113}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{113}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{114}} \newlabel{sec:ch6p2expes}{{6.3.4}{114}} \@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{115}} \newlabel{fig:ch6p2syncasync}{{6.6}{115}} \@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{116}} \newlabel{fig:ch6p2aux}{{6.7}{116}} \@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{117}} \newlabel{sec:ch6p3unify}{{6.4}{117}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{117}} \newlabel{sec:ch6p3resources}{{6.4.1}{117}} \newlabel{algo:ch6p3ORWLresources}{{6.17}{118}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{118}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{118}} \newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{118}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{119}} \newlabel{sec:ch6p3ORWLMM}{{6.4.3}{119}} \newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{119}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{119}} \newlabel{algo:ch6p3ORWLlcopy}{{6.19}{120}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{120}} \newlabel{algo:ch6p3ORWLrcopy}{{6.20}{120}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{120}} \newlabel{algo:ch6p3ORWLtrans}{{6.21}{120}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{120}} \newlabel{algo:ch6p3ORWLdecl}{{6.22}{121}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{121}} \newlabel{algo:ch6p3ORWLinit}{{6.23}{122}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{122}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{122}} \newlabel{sec:ch6p3tasks}{{6.4.4}{122}} \@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{123}} \newlabel{ch6:conclu}{{6.5}{123}} \@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{123}} \@writefile{toc}{\contentsline {section}{Bibliography}{124}} \@setckpt{Chapters/chapter6/ch6}{ \setcounter{page}{126} \setcounter{equation}{0} \setcounter{enumi}{4} \setcounter{enumii}{0} \setcounter{enumiii}{0} \setcounter{enumiv}{21} \setcounter{footnote}{0} \setcounter{mpfootnote}{0} \setcounter{part}{1} \setcounter{chapter}{6} \setcounter{section}{6} \setcounter{subsection}{0} \setcounter{subsubsection}{0} \setcounter{paragraph}{0} \setcounter{subparagraph}{0} \setcounter{figure}{7} \setcounter{table}{0} \setcounter{numauthors}{0} \setcounter{parentequation}{8} \setcounter{subfigure}{0} \setcounter{lofdepth}{1} \setcounter{subtable}{0} \setcounter{lotdepth}{1} \setcounter{lstnumber}{17} \setcounter{ContinuedFloat}{0} \setcounter{float@type}{16} \setcounter{algorithm}{4} \setcounter{ALC@unique}{0} \setcounter{ALC@line}{0} \setcounter{ALC@rem}{0} \setcounter{ALC@depth}{0} \setcounter{AlgoLine}{0} \setcounter{algocfline}{0} \setcounter{algocfproc}{0} \setcounter{algocf}{0} \setcounter{proposition}{0} \setcounter{theorem}{0} \setcounter{exercise}{0} \setcounter{example}{0} \setcounter{definition}{0} \setcounter{proof}{0} \setcounter{lstlisting}{23} }