X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/blobdiff_plain/44b8f845847505b81dc0f1199c49e67a495ed7a0..b0cfcc742771497c83313352b59170ead2f99f40:/BookGPU/Chapters/chapter6/ch6.aux diff --git a/BookGPU/Chapters/chapter6/ch6.aux b/BookGPU/Chapters/chapter6/ch6.aux index 973672c..eec3fee 100644 --- a/BookGPU/Chapters/chapter6/ch6.aux +++ b/BookGPU/Chapters/chapter6/ch6.aux @@ -3,110 +3,110 @@ \@writefile{toc}{\author{Stephane Vialle}{}} \@writefile{toc}{\author{Jens Gustedt}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {3}Development methodologies for GPU and cluster of GPUs}{23}} +\@writefile{toc}{\contentsline {chapter}{\numberline {5}Development methodologies for GPU and cluster of GPUs}{49}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {section}{\numberline {3.1}Introduction}{24}} -\newlabel{ch6:intro}{{3.1}{24}} -\@writefile{toc}{\contentsline {section}{\numberline {3.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{24}} -\newlabel{ch6:part1}{{3.2}{24}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.1}Synchronous parallel algorithms on GPU clusters}{24}} -\@writefile{lof}{\contentsline {figure}{\numberline {3.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{26}} -\newlabel{fig:ch6p1overlapnative}{{3.1}{26}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.2}Native overlap of CPU communications and GPU computations}{26}} -\newlabel{algo:ch6p1overlapnative}{{3.1}{27}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{27}} -\@writefile{lof}{\contentsline {figure}{\numberline {3.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{28}} -\newlabel{fig:ch6p1overlapseqsequence}{{3.2}{28}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.3}Overlapping with sequences of transfers and computations}{28}} -\newlabel{algo:ch6p1overlapseqsequence}{{3.2}{29}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{29}} -\@writefile{lof}{\contentsline {figure}{\numberline {3.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{30}} -\newlabel{fig:ch6p1overlapstreamsequence}{{3.3}{30}} -\newlabel{algo:ch6p1overlapstreamsequence}{{3.3}{31}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{31}} -\@writefile{lof}{\contentsline {figure}{\numberline {3.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{33}} -\newlabel{fig:ch6p1overlapinterleaved}{{3.4}{33}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.4}Interleaved communications-transfers-computations overlapping}{33}} -\newlabel{algo:ch6p1overlapinterleaved}{{3.4}{34}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{34}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.2.5}Experimental validation}{36}} -\newlabel{ch6:p1expes}{{3.2.5}{36}} -\newlabel{ch6:p1block-cyclic}{{3.2.5}{36}} -\@writefile{lof}{\contentsline {figure}{\numberline {3.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{37}} -\newlabel{fig:ch6p1syncexpematrixprod}{{3.5}{37}} -\@writefile{toc}{\contentsline {section}{\numberline {3.3}General scheme of asynchronous parallel code with computation/communication overlapping}{38}} -\newlabel{ch6:part2}{{3.3}{38}} -\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces Synchronous iterative scheme\relax }}{38}} -\newlabel{algo:ch6p2sync}{{1}{38}} -\@writefile{loa}{\contentsline {algorithm}{\numberline {2}{\ignorespaces Asynchronous iterative scheme\relax }}{38}} -\newlabel{algo:ch6p2async}{{2}{38}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.3.1}A basic asynchronous scheme}{40}} -\newlabel{ch6:p2BasicAsync}{{3.3.1}{40}} -\newlabel{algo:ch6p2BasicAsync}{{3.5}{40}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.5}Initialization of the basic asynchronous scheme}{40}} -\newlabel{algo:ch6p2BasicAsyncComp}{{3.6}{41}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.6}Computing function in the basic asynchronous scheme}{41}} -\newlabel{algo:ch6p2BasicAsyncSendings}{{3.7}{42}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.7}Sending function in the basic asynchronous scheme}{42}} -\newlabel{algo:ch6p2BasicAsyncReceptions}{{3.8}{43}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.8}Reception function in the basic asynchronous scheme}{43}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.3.2}Synchronization of the asynchronous scheme}{44}} -\newlabel{ch6:p2SsyncOverAsync}{{3.3.2}{44}} -\newlabel{algo:ch6p2Sync}{{3.9}{45}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.9}Initialization of the synchronized scheme}{45}} -\newlabel{algo:ch6p2SyncComp}{{3.10}{46}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.10}Computing function in the synchronized scheme}{46}} -\newlabel{algo:ch6p2SyncReceptions}{{3.11}{47}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.11}Reception function in the synchronized scheme}{47}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{48}} -\newlabel{ch6:p2GPUAsync}{{3.3.3}{48}} -\newlabel{algo:ch6p2AsyncSyncComp}{{3.12}{50}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.12}Computing function in the final asynchronous scheme}{50}} -\newlabel{algo:ch6p2syncGPU}{{3.13}{51}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.13}Computing function in the final asynchronous scheme}{51}} -\newlabel{algo:ch6p2FullOverAsyncMain}{{3.14}{53}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.14}Initialization of the main process of complete overlap with asynchronism}{53}} -\newlabel{algo:ch6p2FullOverAsyncComp1}{{3.15}{54}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{54}} -\newlabel{algo:ch6p2FullOverAsyncComp2}{{3.16}{55}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{55}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.3.4}Experimental validation}{56}} -\newlabel{sec:ch6p2expes}{{3.3.4}{56}} -\@writefile{lof}{\contentsline {figure}{\numberline {3.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{57}} -\newlabel{fig:ch6p2syncasync}{{3.6}{57}} -\@writefile{lof}{\contentsline {figure}{\numberline {3.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{58}} -\newlabel{fig:ch6p2aux}{{3.7}{58}} -\@writefile{toc}{\contentsline {section}{\numberline {3.4}Perspective: A unifying programming model}{59}} -\newlabel{sec:ch6p3unify}{{3.4}{59}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.4.1}Resources}{59}} -\newlabel{sec:ch6p3resources}{{3.4.1}{59}} -\newlabel{algo:ch6p3ORWLresources}{{3.17}{60}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{60}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.4.2}Control}{60}} -\newlabel{sec:ch6p3ORWLcontrol}{{3.4.2}{60}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.4.3}Example: block-cyclic matrix multiplication (MM)}{61}} -\newlabel{sec:ch6p3ORWLMM}{{3.4.3}{61}} -\newlabel{algo:ch6p3ORWLBCCMM}{{3.18}{61}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.18}Block-cyclic matrix multiplication, high level per task view}{61}} -\newlabel{algo:ch6p3ORWLlcopy}{{3.19}{62}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.19}An iterative local copy operation}{62}} -\newlabel{algo:ch6p3ORWLrcopy}{{3.20}{62}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{62}} -\newlabel{algo:ch6p3ORWLtrans}{{3.21}{62}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{62}} -\newlabel{algo:ch6p3ORWLdecl}{{3.22}{63}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.22}Dynamic declaration of handles to represent the resources}{63}} -\newlabel{algo:ch6p3ORWLinit}{{3.23}{64}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.23}Dynamic initialization of access mode and priorities}{64}} -\@writefile{toc}{\contentsline {subsection}{\numberline {3.4.4}Tasks and operations}{64}} -\newlabel{sec:ch6p3tasks}{{3.4.4}{64}} -\@writefile{toc}{\contentsline {section}{\numberline {3.5}Conclusion}{65}} -\newlabel{ch6:conclu}{{3.5}{65}} -\@writefile{toc}{\contentsline {section}{\numberline {3.6}Glossary}{65}} -\@writefile{toc}{\contentsline {section}{Bibliography}{66}} +\@writefile{toc}{\contentsline {section}{\numberline {5.1}Introduction}{50}} +\newlabel{ch6:intro}{{5.1}{50}} +\@writefile{toc}{\contentsline {section}{\numberline {5.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{50}} +\newlabel{ch6:part1}{{5.2}{50}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.1}Synchronous parallel algorithms on GPU clusters}{50}} +\@writefile{lof}{\contentsline {figure}{\numberline {5.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{52}} +\newlabel{fig:ch6p1overlapnative}{{5.1}{52}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.2}Native overlap of CPU communications and GPU computations}{52}} +\newlabel{algo:ch6p1overlapnative}{{5.1}{53}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{53}} +\@writefile{lof}{\contentsline {figure}{\numberline {5.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{54}} +\newlabel{fig:ch6p1overlapseqsequence}{{5.2}{54}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.3}Overlapping with sequences of transfers and computations}{54}} +\newlabel{algo:ch6p1overlapseqsequence}{{5.2}{55}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{55}} +\@writefile{lof}{\contentsline {figure}{\numberline {5.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{56}} +\newlabel{fig:ch6p1overlapstreamsequence}{{5.3}{56}} +\newlabel{algo:ch6p1overlapstreamsequence}{{5.3}{57}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{57}} +\@writefile{lof}{\contentsline {figure}{\numberline {5.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{59}} +\newlabel{fig:ch6p1overlapinterleaved}{{5.4}{59}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.4}Interleaved communications-transfers-computations overlapping}{59}} +\newlabel{algo:ch6p1overlapinterleaved}{{5.4}{60}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{60}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.2.5}Experimental validation}{62}} +\newlabel{ch6:p1expes}{{5.2.5}{62}} +\newlabel{ch6:p1block-cyclic}{{5.2.5}{62}} +\@writefile{lof}{\contentsline {figure}{\numberline {5.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{63}} +\newlabel{fig:ch6p1syncexpematrixprod}{{5.5}{63}} +\@writefile{toc}{\contentsline {section}{\numberline {5.3}General scheme of asynchronous parallel code with computation/communication overlapping}{64}} +\newlabel{ch6:part2}{{5.3}{64}} +\@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{64}} +\newlabel{algo:ch6p2sync}{{3}{64}} +\@writefile{loa}{\contentsline {algorithm}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{64}} +\newlabel{algo:ch6p2async}{{4}{64}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.3.1}A basic asynchronous scheme}{66}} +\newlabel{ch6:p2BasicAsync}{{5.3.1}{66}} +\newlabel{algo:ch6p2BasicAsync}{{5.5}{66}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.5}Initialization of the basic asynchronous scheme}{66}} +\newlabel{algo:ch6p2BasicAsyncComp}{{5.6}{67}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.6}Computing function in the basic asynchronous scheme}{67}} +\newlabel{algo:ch6p2BasicAsyncSendings}{{5.7}{68}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.7}Sending function in the basic asynchronous scheme}{68}} +\newlabel{algo:ch6p2BasicAsyncReceptions}{{5.8}{69}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.8}Reception function in the basic asynchronous scheme}{69}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.3.2}Synchronization of the asynchronous scheme}{70}} +\newlabel{ch6:p2SsyncOverAsync}{{5.3.2}{70}} +\newlabel{algo:ch6p2Sync}{{5.9}{71}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.9}Initialization of the synchronized scheme}{71}} +\newlabel{algo:ch6p2SyncComp}{{5.10}{72}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.10}Computing function in the synchronized scheme}{72}} +\newlabel{algo:ch6p2SyncReceptions}{{5.11}{73}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.11}Reception function in the synchronized scheme}{73}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{74}} +\newlabel{ch6:p2GPUAsync}{{5.3.3}{74}} +\newlabel{algo:ch6p2AsyncSyncComp}{{5.12}{76}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.12}Computing function in the final asynchronous scheme}{76}} +\newlabel{algo:ch6p2syncGPU}{{5.13}{77}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.13}Computing function in the final asynchronous scheme}{77}} +\newlabel{algo:ch6p2FullOverAsyncMain}{{5.14}{79}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.14}Initialization of the main process of complete overlap with asynchronism}{79}} +\newlabel{algo:ch6p2FullOverAsyncComp1}{{5.15}{80}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{80}} +\newlabel{algo:ch6p2FullOverAsyncComp2}{{5.16}{81}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{81}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.3.4}Experimental validation}{82}} +\newlabel{sec:ch6p2expes}{{5.3.4}{82}} +\@writefile{lof}{\contentsline {figure}{\numberline {5.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{83}} +\newlabel{fig:ch6p2syncasync}{{5.6}{83}} +\@writefile{lof}{\contentsline {figure}{\numberline {5.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{84}} +\newlabel{fig:ch6p2aux}{{5.7}{84}} +\@writefile{toc}{\contentsline {section}{\numberline {5.4}Perspective: A unifying programming model}{85}} +\newlabel{sec:ch6p3unify}{{5.4}{85}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.4.1}Resources}{85}} +\newlabel{sec:ch6p3resources}{{5.4.1}{85}} +\newlabel{algo:ch6p3ORWLresources}{{5.17}{86}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{86}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.4.2}Control}{86}} +\newlabel{sec:ch6p3ORWLcontrol}{{5.4.2}{86}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.4.3}Example: block-cyclic matrix multiplication (MM)}{87}} +\newlabel{sec:ch6p3ORWLMM}{{5.4.3}{87}} +\newlabel{algo:ch6p3ORWLBCCMM}{{5.18}{87}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.18}Block-cyclic matrix multiplication, high level per task view}{87}} +\newlabel{algo:ch6p3ORWLlcopy}{{5.19}{88}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.19}An iterative local copy operation}{88}} +\newlabel{algo:ch6p3ORWLrcopy}{{5.20}{88}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{88}} +\newlabel{algo:ch6p3ORWLtrans}{{5.21}{88}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{88}} +\newlabel{algo:ch6p3ORWLdecl}{{5.22}{89}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.22}Dynamic declaration of handles to represent the resources}{89}} +\newlabel{algo:ch6p3ORWLinit}{{5.23}{90}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {5.23}Dynamic initialization of access mode and priorities}{90}} +\@writefile{toc}{\contentsline {subsection}{\numberline {5.4.4}Tasks and operations}{90}} +\newlabel{sec:ch6p3tasks}{{5.4.4}{90}} +\@writefile{toc}{\contentsline {section}{\numberline {5.5}Conclusion}{91}} +\newlabel{ch6:conclu}{{5.5}{91}} +\@writefile{toc}{\contentsline {section}{\numberline {5.6}Glossary}{91}} +\@writefile{toc}{\contentsline {section}{Bibliography}{92}} \@setckpt{Chapters/chapter6/ch6}{ -\setcounter{page}{68} +\setcounter{page}{94} \setcounter{equation}{0} \setcounter{enumi}{4} \setcounter{enumii}{0} @@ -115,7 +115,7 @@ \setcounter{footnote}{0} \setcounter{mpfootnote}{0} \setcounter{part}{1} -\setcounter{chapter}{3} +\setcounter{chapter}{5} \setcounter{section}{6} \setcounter{subsection}{0} \setcounter{subsubsection}{0} @@ -132,7 +132,7 @@ \setcounter{lstnumber}{17} \setcounter{ContinuedFloat}{0} \setcounter{float@type}{16} -\setcounter{algorithm}{2} +\setcounter{algorithm}{4} \setcounter{ALC@unique}{0} \setcounter{ALC@line}{0} \setcounter{ALC@rem}{0}