BookGPU/Chapters/chapter6/ch6.aux

   1 \relax
   2 \@writefile{toc}{\author{Sylvain Contassot-Vivier}{}}
   3 \@writefile{toc}{\author{Stephane Vialle}{}}
   4 \@writefile{toc}{\author{Jens Gustedt}{}}
   5 \@writefile{loa}{\addvspace {10\p@ }}
   6 \@writefile{toc}{\contentsline {chapter}{\numberline {3}Development methodologies for GPU and cluster of GPUs}{23}}
   7 \@writefile{lof}{\addvspace {10\p@ }}
   8 \@writefile{lot}{\addvspace {10\p@ }}
   9 \@writefile{toc}{\contentsline {section}{\numberline {3.1}Introduction}{24}}
  10 \newlabel{ch6:intro}{{3.1}{24}}
  11 \@writefile{toc}{\contentsline {section}{\numberline {3.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{24}}
  12 \newlabel{ch6:part1}{{3.2}{24}}
  13 \@writefile{toc}{\contentsline {subsection}{\numberline {3.2.1}Synchronous parallel algorithms on GPU clusters}{24}}
  14 \@writefile{lof}{\contentsline {figure}{\numberline {3.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{26}}
  15 \newlabel{fig:ch6p1overlapnative}{{3.1}{26}}
  16 \@writefile{toc}{\contentsline {subsection}{\numberline {3.2.2}Native overlap of CPU communications and GPU computations}{26}}
  17 \newlabel{algo:ch6p1overlapnative}{{3.1}{27}}
  18 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{27}}
  19 \@writefile{lof}{\contentsline {figure}{\numberline {3.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{28}}
  20 \newlabel{fig:ch6p1overlapseqsequence}{{3.2}{28}}
  21 \@writefile{toc}{\contentsline {subsection}{\numberline {3.2.3}Overlapping with sequences of transfers and computations}{28}}
  22 \newlabel{algo:ch6p1overlapseqsequence}{{3.2}{29}}
  23 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{29}}
  24 \@writefile{lof}{\contentsline {figure}{\numberline {3.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{30}}
  25 \newlabel{fig:ch6p1overlapstreamsequence}{{3.3}{30}}
  26 \newlabel{algo:ch6p1overlapstreamsequence}{{3.3}{31}}
  27 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{31}}
  28 \@writefile{lof}{\contentsline {figure}{\numberline {3.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{33}}
  29 \newlabel{fig:ch6p1overlapinterleaved}{{3.4}{33}}
  30 \@writefile{toc}{\contentsline {subsection}{\numberline {3.2.4}Interleaved communications-transfers-computations overlapping}{33}}
  31 \newlabel{algo:ch6p1overlapinterleaved}{{3.4}{34}}
  32 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{34}}
  33 \@writefile{toc}{\contentsline {subsection}{\numberline {3.2.5}Experimental validation}{36}}
  34 \newlabel{ch6:p1expes}{{3.2.5}{36}}
  35 \newlabel{ch6:p1block-cyclic}{{3.2.5}{36}}
  36 \@writefile{lof}{\contentsline {figure}{\numberline {3.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{37}}
  37 \newlabel{fig:ch6p1syncexpematrixprod}{{3.5}{37}}
  38 \@writefile{toc}{\contentsline {section}{\numberline {3.3}General scheme of asynchronous parallel code with computation/communication overlapping}{38}}
  39 \newlabel{ch6:part2}{{3.3}{38}}
  40 \@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces Synchronous iterative scheme\relax }}{38}}
  41 \newlabel{algo:ch6p2sync}{{1}{38}}
  42 \@writefile{loa}{\contentsline {algorithm}{\numberline {2}{\ignorespaces Asynchronous iterative scheme\relax }}{38}}
  43 \newlabel{algo:ch6p2async}{{2}{38}}
  44 \@writefile{toc}{\contentsline {subsection}{\numberline {3.3.1}A basic asynchronous scheme}{40}}
  45 \newlabel{ch6:p2BasicAsync}{{3.3.1}{40}}
  46 \newlabel{algo:ch6p2BasicAsync}{{3.5}{40}}
  47 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.5}Initialization of the basic asynchronous scheme}{40}}
  48 \newlabel{algo:ch6p2BasicAsyncComp}{{3.6}{41}}
  49 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.6}Computing function in the basic asynchronous scheme}{41}}
  50 \newlabel{algo:ch6p2BasicAsyncSendings}{{3.7}{42}}
  51 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.7}Sending function in the basic asynchronous scheme}{42}}
  52 \newlabel{algo:ch6p2BasicAsyncReceptions}{{3.8}{43}}
  53 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.8}Reception function in the basic asynchronous scheme}{43}}
  54 \@writefile{toc}{\contentsline {subsection}{\numberline {3.3.2}Synchronization of the asynchronous scheme}{44}}
  55 \newlabel{ch6:p2SsyncOverAsync}{{3.3.2}{44}}
  56 \newlabel{algo:ch6p2Sync}{{3.9}{45}}
  57 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.9}Initialization of the synchronized scheme}{45}}
  58 \newlabel{algo:ch6p2SyncComp}{{3.10}{46}}
  59 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.10}Computing function in the synchronized scheme}{46}}
  60 \newlabel{algo:ch6p2SyncReceptions}{{3.11}{47}}
  61 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.11}Reception function in the synchronized scheme}{47}}
  62 \@writefile{toc}{\contentsline {subsection}{\numberline {3.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{48}}
  63 \newlabel{ch6:p2GPUAsync}{{3.3.3}{48}}
  64 \newlabel{algo:ch6p2AsyncSyncComp}{{3.12}{50}}
  65 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.12}Computing function in the final asynchronous scheme}{50}}
  66 \newlabel{algo:ch6p2syncGPU}{{3.13}{51}}
  67 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.13}Computing function in the final asynchronous scheme}{51}}
  68 \newlabel{algo:ch6p2FullOverAsyncMain}{{3.14}{53}}
  69 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.14}Initialization of the main process of complete overlap with asynchronism}{53}}
  70 \newlabel{algo:ch6p2FullOverAsyncComp1}{{3.15}{54}}
  71 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{54}}
  72 \newlabel{algo:ch6p2FullOverAsyncComp2}{{3.16}{55}}
  73 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{55}}
  74 \@writefile{toc}{\contentsline {subsection}{\numberline {3.3.4}Experimental validation}{56}}
  75 \newlabel{sec:ch6p2expes}{{3.3.4}{56}}
  76 \@writefile{lof}{\contentsline {figure}{\numberline {3.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{57}}
  77 \newlabel{fig:ch6p2syncasync}{{3.6}{57}}
  78 \@writefile{lof}{\contentsline {figure}{\numberline {3.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{58}}
  79 \newlabel{fig:ch6p2aux}{{3.7}{58}}
  80 \@writefile{toc}{\contentsline {section}{\numberline {3.4}Perspective: A unifying programming model}{59}}
  81 \newlabel{sec:ch6p3unify}{{3.4}{59}}
  82 \@writefile{toc}{\contentsline {subsection}{\numberline {3.4.1}Resources}{59}}
  83 \newlabel{sec:ch6p3resources}{{3.4.1}{59}}
  84 \newlabel{algo:ch6p3ORWLresources}{{3.17}{60}}
  85 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{60}}
  86 \@writefile{toc}{\contentsline {subsection}{\numberline {3.4.2}Control}{60}}
  87 \newlabel{sec:ch6p3ORWLcontrol}{{3.4.2}{60}}
  88 \@writefile{toc}{\contentsline {subsection}{\numberline {3.4.3}Example: block-cyclic matrix multiplication (MM)}{61}}
  89 \newlabel{sec:ch6p3ORWLMM}{{3.4.3}{61}}
  90 \newlabel{algo:ch6p3ORWLBCCMM}{{3.18}{61}}
  91 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.18}Block-cyclic matrix multiplication, high level per task view}{61}}
  92 \newlabel{algo:ch6p3ORWLlcopy}{{3.19}{62}}
  93 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.19}An iterative local copy operation}{62}}
  94 \newlabel{algo:ch6p3ORWLrcopy}{{3.20}{62}}
  95 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{62}}
  96 \newlabel{algo:ch6p3ORWLtrans}{{3.21}{62}}
  97 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{62}}
  98 \newlabel{algo:ch6p3ORWLdecl}{{3.22}{63}}
  99 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.22}Dynamic declaration of handles to represent the resources}{63}}
 100 \newlabel{algo:ch6p3ORWLinit}{{3.23}{64}}
 101 \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.23}Dynamic initialization of access mode and priorities}{64}}
 102 \@writefile{toc}{\contentsline {subsection}{\numberline {3.4.4}Tasks and operations}{64}}
 103 \newlabel{sec:ch6p3tasks}{{3.4.4}{64}}
 104 \@writefile{toc}{\contentsline {section}{\numberline {3.5}Conclusion}{65}}
 105 \newlabel{ch6:conclu}{{3.5}{65}}
 106 \@writefile{toc}{\contentsline {section}{\numberline {3.6}Glossary}{65}}
 107 \@writefile{toc}{\contentsline {section}{Bibliography}{66}}
 108 \@setckpt{Chapters/chapter6/ch6}{
 109 \setcounter{page}{68}
 110 \setcounter{equation}{0}
 111 \setcounter{enumi}{4}
 112 \setcounter{enumii}{0}
 113 \setcounter{enumiii}{0}
 114 \setcounter{enumiv}{21}
 115 \setcounter{footnote}{0}
 116 \setcounter{mpfootnote}{0}
 117 \setcounter{part}{1}
 118 \setcounter{chapter}{3}
 119 \setcounter{section}{6}
 120 \setcounter{subsection}{0}
 121 \setcounter{subsubsection}{0}
 122 \setcounter{paragraph}{0}
 123 \setcounter{subparagraph}{0}
 124 \setcounter{figure}{7}
 125 \setcounter{table}{0}
 126 \setcounter{numauthors}{0}
 127 \setcounter{parentequation}{0}
 128 \setcounter{subfigure}{0}
 129 \setcounter{lofdepth}{1}
 130 \setcounter{subtable}{0}
 131 \setcounter{lotdepth}{1}
 132 \setcounter{lstnumber}{17}
 133 \setcounter{ContinuedFloat}{0}
 134 \setcounter{float@type}{16}
 135 \setcounter{algorithm}{2}
 136 \setcounter{ALC@unique}{0}
 137 \setcounter{ALC@line}{0}
 138 \setcounter{ALC@rem}{0}
 139 \setcounter{ALC@depth}{0}
 140 \setcounter{AlgoLine}{0}
 141 \setcounter{algocfline}{0}
 142 \setcounter{algocfproc}{0}
 143 \setcounter{algocf}{0}
 144 \setcounter{proposition}{0}
 145 \setcounter{proof}{0}
 146 \setcounter{lstlisting}{23}
 147 }