X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/blobdiff_plain/d6c0eb8af968b2b5d3562240345777b17776520b..b7e61e1f68e950462bff7221fe17c38d2ce7b3c0:/BookGPU/Chapters/chapter6/ch6.aux diff --git a/BookGPU/Chapters/chapter6/ch6.aux b/BookGPU/Chapters/chapter6/ch6.aux index bfe00ce..7d02f32 100644 --- a/BookGPU/Chapters/chapter6/ch6.aux +++ b/BookGPU/Chapters/chapter6/ch6.aux @@ -3,110 +3,110 @@ \@writefile{toc}{\author{Stephane Vialle}{}} \@writefile{toc}{\author{Jens Gustedt}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {6}Development methodologies for GPU and cluster of GPUs}{87}} +\@writefile{toc}{\contentsline {chapter}{\numberline {6}Development methodologies for GPU and cluster of GPUs}{89}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {section}{\numberline {6.1}Introduction}{88}} -\newlabel{ch6:intro}{{6.1}{88}} -\@writefile{toc}{\contentsline {section}{\numberline {6.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{88}} -\newlabel{ch6:part1}{{6.2}{88}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.1}Synchronous parallel algorithms on GPU clusters}{88}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{90}} -\newlabel{fig:ch6p1overlapnative}{{6.1}{90}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.2}Native overlap of CPU communications and GPU computations}{90}} -\newlabel{algo:ch6p1overlapnative}{{6.1}{91}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{91}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{92}} -\newlabel{fig:ch6p1overlapseqsequence}{{6.2}{92}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{92}} -\newlabel{algo:ch6p1overlapseqsequence}{{6.2}{93}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{93}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{95}} -\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{95}} -\newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{95}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{95}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{97}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{98}} -\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{98}} -\newlabel{algo:ch6p1overlapinterleaved}{{6.4}{98}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{98}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{100}} -\newlabel{ch6:p1expes}{{6.2.5}{100}} -\newlabel{ch6:p1block-cyclic}{{6.2.5}{100}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{101}} -\newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{101}} -\@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{102}} -\newlabel{ch6:part2}{{6.3}{102}} -\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{102}} -\newlabel{algo:ch6p2sync}{{3}{102}} -\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{103}} -\newlabel{algo:ch6p2async}{{4}{103}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{104}} -\newlabel{ch6:p2BasicAsync}{{6.3.1}{104}} -\newlabel{algo:ch6p2BasicAsync}{{6.5}{104}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{104}} -\newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{105}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{105}} -\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{107}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{107}} -\newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{107}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{107}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{109}} -\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{109}} -\newlabel{algo:ch6p2Sync}{{6.9}{109}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{109}} -\newlabel{algo:ch6p2SyncComp}{{6.10}{110}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{110}} -\newlabel{algo:ch6p2SyncReceptions}{{6.11}{112}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{112}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{113}} -\newlabel{ch6:p2GPUAsync}{{6.3.3}{113}} -\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{114}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{114}} -\newlabel{algo:ch6p2syncGPU}{{6.13}{115}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{115}} -\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{118}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{118}} -\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{119}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{119}} -\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{120}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{120}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{121}} -\newlabel{sec:ch6p2expes}{{6.3.4}{121}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{122}} -\newlabel{fig:ch6p2syncasync}{{6.6}{122}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{123}} -\newlabel{fig:ch6p2aux}{{6.7}{123}} -\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{124}} -\newlabel{sec:ch6p3unify}{{6.4}{124}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{124}} -\newlabel{sec:ch6p3resources}{{6.4.1}{124}} -\newlabel{algo:ch6p3ORWLresources}{{6.17}{125}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{125}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{125}} -\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{125}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{126}} -\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{126}} -\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{126}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{126}} -\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{127}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{127}} -\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{127}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{127}} -\newlabel{algo:ch6p3ORWLtrans}{{6.21}{127}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{127}} -\newlabel{algo:ch6p3ORWLdecl}{{6.22}{128}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{128}} -\newlabel{algo:ch6p3ORWLinit}{{6.23}{128}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{128}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{129}} -\newlabel{sec:ch6p3tasks}{{6.4.4}{129}} -\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{130}} -\newlabel{ch6:conclu}{{6.5}{130}} -\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{130}} -\@writefile{toc}{\contentsline {section}{Bibliography}{131}} +\@writefile{toc}{\contentsline {section}{\numberline {6.1}Introduction}{90}} +\newlabel{ch6:intro}{{6.1}{90}} +\@writefile{toc}{\contentsline {section}{\numberline {6.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{90}} +\newlabel{ch6:part1}{{6.2}{90}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.1}Synchronous parallel algorithms on GPU clusters}{90}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{92}} +\newlabel{fig:ch6p1overlapnative}{{6.1}{92}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.2}Native overlap of CPU communications and GPU computations}{92}} +\newlabel{algo:ch6p1overlapnative}{{6.1}{93}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{93}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{94}} +\newlabel{fig:ch6p1overlapseqsequence}{{6.2}{94}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{94}} +\newlabel{algo:ch6p1overlapseqsequence}{{6.2}{95}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{95}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{97}} +\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{97}} +\newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{97}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{97}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{99}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{100}} +\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{100}} +\newlabel{algo:ch6p1overlapinterleaved}{{6.4}{100}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{100}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{102}} +\newlabel{ch6:p1expes}{{6.2.5}{102}} +\newlabel{ch6:p1block-cyclic}{{6.2.5}{102}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{103}} +\newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{103}} +\@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{104}} +\newlabel{ch6:part2}{{6.3}{104}} +\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{104}} +\newlabel{algo:ch6p2sync}{{3}{104}} +\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{105}} +\newlabel{algo:ch6p2async}{{4}{105}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{106}} +\newlabel{ch6:p2BasicAsync}{{6.3.1}{106}} +\newlabel{algo:ch6p2BasicAsync}{{6.5}{106}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{106}} +\newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{107}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{107}} +\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{109}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{109}} +\newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{109}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{109}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{111}} +\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{111}} +\newlabel{algo:ch6p2Sync}{{6.9}{111}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{111}} +\newlabel{algo:ch6p2SyncComp}{{6.10}{112}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{112}} +\newlabel{algo:ch6p2SyncReceptions}{{6.11}{114}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{114}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{115}} +\newlabel{ch6:p2GPUAsync}{{6.3.3}{115}} +\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{116}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{116}} +\newlabel{algo:ch6p2syncGPU}{{6.13}{117}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{117}} +\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{120}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{120}} +\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{121}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{121}} +\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{122}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{122}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{123}} +\newlabel{sec:ch6p2expes}{{6.3.4}{123}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{124}} +\newlabel{fig:ch6p2syncasync}{{6.6}{124}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{125}} +\newlabel{fig:ch6p2aux}{{6.7}{125}} +\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{126}} +\newlabel{sec:ch6p3unify}{{6.4}{126}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{126}} +\newlabel{sec:ch6p3resources}{{6.4.1}{126}} +\newlabel{algo:ch6p3ORWLresources}{{6.17}{127}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{127}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{127}} +\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{127}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{128}} +\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{128}} +\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{128}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{128}} +\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{129}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{129}} +\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{129}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{129}} +\newlabel{algo:ch6p3ORWLtrans}{{6.21}{129}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{129}} +\newlabel{algo:ch6p3ORWLdecl}{{6.22}{130}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{130}} +\newlabel{algo:ch6p3ORWLinit}{{6.23}{130}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{130}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{131}} +\newlabel{sec:ch6p3tasks}{{6.4.4}{131}} +\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{132}} +\newlabel{ch6:conclu}{{6.5}{132}} +\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{132}} +\@writefile{toc}{\contentsline {section}{Bibliography}{133}} \@setckpt{Chapters/chapter6/ch6}{ -\setcounter{page}{133} +\setcounter{page}{135} \setcounter{equation}{0} \setcounter{enumi}{4} \setcounter{enumii}{0}