From: couturie Date: Wed, 20 Mar 2013 20:45:59 +0000 (+0100) Subject: new X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/390b92667ac7e43b5bbd34c7fd7b8c8314a05b3d?ds=sidebyside;hp=-c new --- 390b92667ac7e43b5bbd34c7fd7b8c8314a05b3d diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index 46546ef..de8e359 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -152,8 +152,8 @@ \maketitle \frontmatter -\include{frontmatter/Foreword} -\include{frontmatter/preface} +%\include{frontmatter/Foreword} +%\include{frontmatter/preface} \listoffigures \listoftables @@ -164,22 +164,28 @@ \include{Chapters/symbollist} \setcounter{page}{1} -\part{This is a Part} +\part{Presentation of GPUs} \include{Chapters/chapter1/ch1} \include{Chapters/chapter2/ch2} +\part{Image processing} \include{Chapters/chapter3/ch3} +\part{Software development} \include{Chapters/chapter5/ch5} \include{Chapters/chapter6/ch6} -\include{Chapters/chapter7/ch7} +\part{Optimization} \include{Chapters/chapter8/ch8} \include{Chapters/chapter9/ch9} + +\part{Numerical applications} +\include{Chapters/chapter7/ch7} \include{Chapters/chapter11/ch11} \include{Chapters/chapter12/ch12} \include{Chapters/chapter13/ch13} \include{Chapters/chapter14/ch14} \include{Chapters/chapter15/ch15} \include{Chapters/chapter16/ch16} - \include{Chapters/chapter18/ch18} +\part{Other} +\include{Chapters/chapter18/ch18} \bibliographystyle{hep} %%%\bibliography{biblio} diff --git a/BookGPU/Chapters/chapter12/ch12.aux b/BookGPU/Chapters/chapter12/ch12.aux index f25a484..82783b4 100644 --- a/BookGPU/Chapters/chapter12/ch12.aux +++ b/BookGPU/Chapters/chapter12/ch12.aux @@ -3,81 +3,81 @@ \@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} \@writefile{toc}{\author{Jacques Bahi}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {11}Solving sparse linear systems with GMRES and CG methods on GPU clusters}{251}} +\@writefile{toc}{\contentsline {chapter}{\numberline {11}Solving sparse linear systems with GMRES and CG methods on GPU clusters}{259}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\newlabel{ch12}{{11}{251}} -\@writefile{toc}{\contentsline {section}{\numberline {11.1}Introduction}{251}} -\newlabel{ch12:sec:01}{{11.1}{251}} -\@writefile{toc}{\contentsline {section}{\numberline {11.2}Krylov iterative methods}{252}} -\newlabel{ch12:sec:02}{{11.2}{252}} -\newlabel{ch12:eq:01}{{11.1}{252}} -\newlabel{ch12:eq:02}{{11.2}{252}} -\newlabel{ch12:eq:03}{{11.3}{252}} -\newlabel{ch12:eq:11}{{11.4}{253}} -\@writefile{toc}{\contentsline {subsection}{\numberline {11.2.1}CG method}{253}} -\newlabel{ch12:sec:02.01}{{11.2.1}{253}} -\newlabel{ch12:eq:04}{{11.5}{253}} -\newlabel{ch12:eq:05}{{11.6}{253}} -\newlabel{ch12:eq:06}{{11.7}{253}} -\newlabel{ch12:eq:07}{{11.8}{253}} -\newlabel{ch12:eq:08}{{11.9}{253}} -\newlabel{ch12:eq:09}{{11.10}{253}} -\@writefile{loa}{\contentsline {algocf}{\numberline {9}{\ignorespaces Left-preconditioned CG method\relax }}{254}} -\newlabel{ch12:alg:01}{{9}{254}} -\newlabel{ch12:eq:10}{{11.11}{254}} -\@writefile{toc}{\contentsline {subsection}{\numberline {11.2.2}GMRES method}{255}} -\newlabel{ch12:sec:02.02}{{11.2.2}{255}} -\newlabel{ch12:eq:12}{{11.12}{255}} -\newlabel{ch12:eq:13}{{11.13}{255}} -\newlabel{ch12:eq:14}{{11.14}{255}} -\newlabel{ch12:eq:15}{{11.15}{255}} -\newlabel{ch12:eq:16}{{11.16}{255}} -\newlabel{ch12:eq:17}{{11.17}{255}} -\newlabel{ch12:eq:18}{{11.18}{255}} -\newlabel{ch12:eq:19}{{11.19}{255}} -\@writefile{loa}{\contentsline {algocf}{\numberline {10}{\ignorespaces Left-preconditioned GMRES method with restarts\relax }}{256}} -\newlabel{ch12:alg:02}{{10}{256}} -\@writefile{toc}{\contentsline {section}{\numberline {11.3}Parallel implementation on a GPU cluster}{257}} -\newlabel{ch12:sec:03}{{11.3}{257}} -\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.1}Data partitioning}{257}} -\newlabel{ch12:sec:03.01}{{11.3.1}{257}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.1}{\ignorespaces A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.\relax }}{258}} -\newlabel{ch12:fig:01}{{11.1}{258}} -\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.2}GPU computing}{258}} -\newlabel{ch12:sec:03.02}{{11.3.2}{258}} -\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.3}Data communications}{259}} -\newlabel{ch12:sec:03.03}{{11.3.3}{259}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.2}{\ignorespaces Data exchanges between \textit {Node 1} and its neighbors \textit {Node 0}, \textit {Node 2} and \textit {Node 3}.\relax }}{260}} -\newlabel{ch12:fig:02}{{11.2}{260}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.3}{\ignorespaces Columns reordering of a sparse sub-matrix.\relax }}{261}} -\newlabel{ch12:fig:03}{{11.3}{261}} -\@writefile{toc}{\contentsline {section}{\numberline {11.4}Experimental results}{262}} -\newlabel{ch12:sec:04}{{11.4}{262}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.4}{\ignorespaces General scheme of the GPU cluster of tests composed of six machines, each with two GPUs.\relax }}{262}} -\newlabel{ch12:fig:04}{{11.4}{262}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.5}{\ignorespaces Sketches of sparse matrices chosen from the Davis collection.\relax }}{263}} -\newlabel{ch12:fig:05}{{11.5}{263}} -\@writefile{lot}{\contentsline {table}{\numberline {11.1}{\ignorespaces Main characteristics of sparse matrices chosen from the Davis collection.\relax }}{263}} -\newlabel{ch12:tab:01}{{11.1}{263}} -\@writefile{lot}{\contentsline {table}{\numberline {11.2}{\ignorespaces Performances of the parallel CG method on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{264}} -\newlabel{ch12:tab:02}{{11.2}{264}} -\@writefile{lot}{\contentsline {table}{\numberline {11.3}{\ignorespaces Performances of the parallel GMRES method on a cluster 24 CPU cores vs. on cluster of 12 GPUs.\relax }}{264}} -\newlabel{ch12:tab:03}{{11.3}{264}} -\newlabel{ch12:eq:20}{{11.20}{265}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.6}{\ignorespaces Parallel generation of a large sparse matrix by four computing nodes.\relax }}{266}} -\newlabel{ch12:fig:06}{{11.6}{266}} -\@writefile{lot}{\contentsline {table}{\numberline {11.4}{\ignorespaces Main characteristics of sparse banded matrices generated from those of the Davis collection.\relax }}{266}} -\newlabel{ch12:tab:04}{{11.4}{266}} -\@writefile{lot}{\contentsline {table}{\numberline {11.5}{\ignorespaces Performances of the parallel CG method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{267}} -\newlabel{ch12:tab:05}{{11.5}{267}} -\@writefile{lot}{\contentsline {table}{\numberline {11.6}{\ignorespaces Performances of the parallel GMRES method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{267}} -\newlabel{ch12:tab:06}{{11.6}{267}} -\@writefile{toc}{\contentsline {section}{\numberline {11.5}Conclusion}{267}} -\newlabel{ch12:sec:05}{{11.5}{267}} -\@writefile{toc}{\contentsline {section}{Bibliography}{268}} +\newlabel{ch12}{{11}{259}} +\@writefile{toc}{\contentsline {section}{\numberline {11.1}Introduction}{259}} +\newlabel{ch12:sec:01}{{11.1}{259}} +\@writefile{toc}{\contentsline {section}{\numberline {11.2}Krylov iterative methods}{260}} +\newlabel{ch12:sec:02}{{11.2}{260}} +\newlabel{ch12:eq:01}{{11.1}{260}} +\newlabel{ch12:eq:02}{{11.2}{260}} +\newlabel{ch12:eq:03}{{11.3}{260}} +\newlabel{ch12:eq:11}{{11.4}{261}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.2.1}CG method}{261}} +\newlabel{ch12:sec:02.01}{{11.2.1}{261}} +\newlabel{ch12:eq:04}{{11.5}{261}} +\newlabel{ch12:eq:05}{{11.6}{261}} +\newlabel{ch12:eq:06}{{11.7}{261}} +\newlabel{ch12:eq:07}{{11.8}{261}} +\newlabel{ch12:eq:08}{{11.9}{261}} +\newlabel{ch12:eq:09}{{11.10}{261}} +\@writefile{loa}{\contentsline {algocf}{\numberline {9}{\ignorespaces Left-preconditioned CG method\relax }}{262}} +\newlabel{ch12:alg:01}{{9}{262}} +\newlabel{ch12:eq:10}{{11.11}{262}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.2.2}GMRES method}{263}} +\newlabel{ch12:sec:02.02}{{11.2.2}{263}} +\newlabel{ch12:eq:12}{{11.12}{263}} +\newlabel{ch12:eq:13}{{11.13}{263}} +\newlabel{ch12:eq:14}{{11.14}{263}} +\newlabel{ch12:eq:15}{{11.15}{263}} +\newlabel{ch12:eq:16}{{11.16}{263}} +\newlabel{ch12:eq:17}{{11.17}{263}} +\newlabel{ch12:eq:18}{{11.18}{263}} +\newlabel{ch12:eq:19}{{11.19}{263}} +\@writefile{loa}{\contentsline {algocf}{\numberline {10}{\ignorespaces Left-preconditioned GMRES method with restarts\relax }}{264}} +\newlabel{ch12:alg:02}{{10}{264}} +\@writefile{toc}{\contentsline {section}{\numberline {11.3}Parallel implementation on a GPU cluster}{265}} +\newlabel{ch12:sec:03}{{11.3}{265}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.1}Data partitioning}{265}} +\newlabel{ch12:sec:03.01}{{11.3.1}{265}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.1}{\ignorespaces A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.\relax }}{266}} +\newlabel{ch12:fig:01}{{11.1}{266}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.2}GPU computing}{266}} +\newlabel{ch12:sec:03.02}{{11.3.2}{266}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.3}Data communications}{267}} +\newlabel{ch12:sec:03.03}{{11.3.3}{267}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.2}{\ignorespaces Data exchanges between \textit {Node 1} and its neighbors \textit {Node 0}, \textit {Node 2} and \textit {Node 3}.\relax }}{268}} +\newlabel{ch12:fig:02}{{11.2}{268}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.3}{\ignorespaces Columns reordering of a sparse sub-matrix.\relax }}{269}} +\newlabel{ch12:fig:03}{{11.3}{269}} +\@writefile{toc}{\contentsline {section}{\numberline {11.4}Experimental results}{270}} +\newlabel{ch12:sec:04}{{11.4}{270}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.4}{\ignorespaces General scheme of the GPU cluster of tests composed of six machines, each with two GPUs.\relax }}{270}} +\newlabel{ch12:fig:04}{{11.4}{270}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.5}{\ignorespaces Sketches of sparse matrices chosen from the Davis collection.\relax }}{271}} +\newlabel{ch12:fig:05}{{11.5}{271}} +\@writefile{lot}{\contentsline {table}{\numberline {11.1}{\ignorespaces Main characteristics of sparse matrices chosen from the Davis collection.\relax }}{271}} +\newlabel{ch12:tab:01}{{11.1}{271}} +\@writefile{lot}{\contentsline {table}{\numberline {11.2}{\ignorespaces Performances of the parallel CG method on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{272}} +\newlabel{ch12:tab:02}{{11.2}{272}} +\@writefile{lot}{\contentsline {table}{\numberline {11.3}{\ignorespaces Performances of the parallel GMRES method on a cluster 24 CPU cores vs. on cluster of 12 GPUs.\relax }}{272}} +\newlabel{ch12:tab:03}{{11.3}{272}} +\newlabel{ch12:eq:20}{{11.20}{273}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.6}{\ignorespaces Parallel generation of a large sparse matrix by four computing nodes.\relax }}{274}} +\newlabel{ch12:fig:06}{{11.6}{274}} +\@writefile{lot}{\contentsline {table}{\numberline {11.4}{\ignorespaces Main characteristics of sparse banded matrices generated from those of the Davis collection.\relax }}{274}} +\newlabel{ch12:tab:04}{{11.4}{274}} +\@writefile{lot}{\contentsline {table}{\numberline {11.5}{\ignorespaces Performances of the parallel CG method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{275}} +\newlabel{ch12:tab:05}{{11.5}{275}} +\@writefile{lot}{\contentsline {table}{\numberline {11.6}{\ignorespaces Performances of the parallel GMRES method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{275}} +\newlabel{ch12:tab:06}{{11.6}{275}} +\@writefile{toc}{\contentsline {section}{\numberline {11.5}Conclusion}{275}} +\newlabel{ch12:sec:05}{{11.5}{275}} +\@writefile{toc}{\contentsline {section}{Bibliography}{276}} \@setckpt{Chapters/chapter12/ch12}{ -\setcounter{page}{270} +\setcounter{page}{278} \setcounter{equation}{22} \setcounter{enumi}{4} \setcounter{enumii}{0} @@ -85,7 +85,7 @@ \setcounter{enumiv}{10} \setcounter{footnote}{0} \setcounter{mpfootnote}{0} -\setcounter{part}{1} +\setcounter{part}{5} \setcounter{chapter}{11} \setcounter{section}{5} \setcounter{subsection}{0} diff --git a/BookGPU/Chapters/chapter13/ch13.aux b/BookGPU/Chapters/chapter13/ch13.aux index 750fdc1..f830fe0 100644 --- a/BookGPU/Chapters/chapter13/ch13.aux +++ b/BookGPU/Chapters/chapter13/ch13.aux @@ -5,86 +5,86 @@ \@writefile{toc}{\author{Pierre Spit\IeC {\'e}ri}{}} \@writefile{toc}{\author{Jacques Bahi}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {12}Solving sparse nonlinear systems of obstacle problems on GPU clusters}{271}} +\@writefile{toc}{\contentsline {chapter}{\numberline {12}Solving sparse nonlinear systems of obstacle problems on GPU clusters}{279}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\newlabel{ch13}{{12}{271}} -\@writefile{toc}{\contentsline {section}{\numberline {12.1}Introduction}{271}} -\newlabel{ch13:sec:01}{{12.1}{271}} -\@writefile{toc}{\contentsline {section}{\numberline {12.2}Obstacle problems}{272}} -\newlabel{ch13:sec:02}{{12.2}{272}} -\@writefile{toc}{\contentsline {subsection}{\numberline {12.2.1}Mathematical model}{272}} -\newlabel{ch13:sec:02.01}{{12.2.1}{272}} -\newlabel{ch13:eq:01}{{12.1}{272}} -\newlabel{ch13:eq:02}{{12.2}{272}} -\@writefile{toc}{\contentsline {subsection}{\numberline {12.2.2}Discretization}{273}} -\newlabel{ch13:sec:02.02}{{12.2.2}{273}} -\newlabel{ch13:eq:03}{{12.3}{273}} -\newlabel{ch13:eq:04}{{12.4}{273}} -\newlabel{ch13:eq:05}{{12.5}{273}} -\@writefile{toc}{\contentsline {section}{\numberline {12.3}Parallel iterative method}{274}} -\newlabel{ch13:sec:03}{{12.3}{274}} -\newlabel{ch13:eq:06}{{12.6}{274}} -\newlabel{ch13:eq:07}{{12.7}{274}} -\newlabel{ch13:eq:08}{{12.8}{274}} -\newlabel{ch13:eq:09}{{12.9}{274}} -\newlabel{ch13:eq:10}{{12.10}{275}} -\newlabel{ch13:eq:11}{{12.11}{275}} -\newlabel{ch13:eq:12}{{12.12}{275}} -\newlabel{ch13:eq:13}{{12.13}{276}} -\newlabel{ch13:eq:14}{{12.14}{276}} -\newlabel{ch13:eq:15}{{12.15}{276}} -\newlabel{ch13:eq:16}{{12.16}{276}} -\@writefile{toc}{\contentsline {section}{\numberline {12.4}Parallel implementation on a GPU cluster}{277}} -\newlabel{ch13:sec:04}{{12.4}{277}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.1}{\ignorespaces Data partitioning of a problem to be solved among $S=3\times 4$ computing nodes.\relax }}{277}} -\newlabel{ch13:fig:01}{{12.1}{277}} -\@writefile{loa}{\contentsline {algocf}{\numberline {11}{\ignorespaces Parallel solving of the obstacle problem on a GPU cluster\relax }}{278}} -\newlabel{ch13:alg:01}{{11}{278}} -\newlabel{ch13:eq:18}{{12.17}{278}} -\@writefile{loa}{\contentsline {algocf}{\numberline {12}{\ignorespaces Parallel iterative solving of the nonlinear systems on a GPU cluster ($Solve()$ function)\relax }}{279}} -\newlabel{ch13:alg:02}{{12}{279}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.2}{\ignorespaces Decomposition of a sub-problem in a GPU into $nz$ slices.\relax }}{280}} -\newlabel{ch13:fig:02}{{12.2}{280}} -\newlabel{ch13:list:01}{{12.1}{280}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.1}Skeleton codes of a GPU kernel and a CPU function}{280}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.3}{\ignorespaces Matrix constant coefficients in a three-dimensional domain.\relax }}{282}} -\newlabel{ch13:fig:03}{{12.3}{282}} -\newlabel{ch13:eq:17}{{12.18}{282}} -\newlabel{ch13:list:02}{{12.2}{282}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.2}GPU kernels of the projected Richardson method}{282}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.4}{\ignorespaces Computation of a vector element with the projected Richardson method.\relax }}{284}} -\newlabel{ch13:fig:04}{{12.4}{284}} -\newlabel{ch13:list:03}{{12.3}{284}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.3}Memory access to the cache texture memory}{284}} -\@writefile{toc}{\contentsline {section}{\numberline {12.5}Experimental tests on a GPU cluster}{285}} -\newlabel{ch13:sec:05}{{12.5}{285}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.5}{\ignorespaces GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs.\relax }}{287}} -\newlabel{ch13:fig:05}{{12.5}{287}} -\@writefile{lot}{\contentsline {table}{\numberline {12.1}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 24 CPU cores.\relax }}{287}} -\newlabel{ch13:tab:01}{{12.1}{287}} -\@writefile{lot}{\contentsline {table}{\numberline {12.2}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.\relax }}{288}} -\newlabel{ch13:tab:02}{{12.2}{288}} -\@writefile{toc}{\contentsline {section}{\numberline {12.6}Red-Black ordering technique}{288}} -\newlabel{ch13:sec:06}{{12.6}{288}} -\newlabel{ch13:list:04}{{12.4}{289}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.4}GPU kernels of the projected Richardson method using the red-black technique}{289}} -\newlabel{ch13:fig:06.01}{{12.6(a)}{290}} -\newlabel{sub@ch13:fig:06.01}{{(a)}{290}} -\newlabel{ch13:fig:06.02}{{12.6(b)}{290}} -\newlabel{sub@ch13:fig:06.02}{{(b)}{290}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.6}{\ignorespaces Red-Black ordering for computing the iterate vector elements in a three-dimensional space.\relax }}{290}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Red-Black ordering on x, y and z axises}}}{290}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Red-Black ordering on y axis}}}{290}} -\@writefile{lot}{\contentsline {table}{\numberline {12.3}{\ignorespaces Execution times in seconds of the parallel projected Richardson method using read-black ordering technique implemented on a cluster of 12 GPUs.\relax }}{291}} -\newlabel{ch13:tab:03}{{12.3}{291}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.7}{\ignorespaces Weak scaling of both synchronous and asynchronous algorithms of the projected Richardson method using red-black ordering technique.\relax }}{292}} -\newlabel{ch13:fig:07}{{12.7}{292}} -\@writefile{toc}{\contentsline {section}{\numberline {12.7}Conclusion}{292}} -\newlabel{ch13:sec:07}{{12.7}{292}} -\@writefile{toc}{\contentsline {section}{Bibliography}{293}} +\newlabel{ch13}{{12}{279}} +\@writefile{toc}{\contentsline {section}{\numberline {12.1}Introduction}{279}} +\newlabel{ch13:sec:01}{{12.1}{279}} +\@writefile{toc}{\contentsline {section}{\numberline {12.2}Obstacle problems}{280}} +\newlabel{ch13:sec:02}{{12.2}{280}} +\@writefile{toc}{\contentsline {subsection}{\numberline {12.2.1}Mathematical model}{280}} +\newlabel{ch13:sec:02.01}{{12.2.1}{280}} +\newlabel{ch13:eq:01}{{12.1}{280}} +\newlabel{ch13:eq:02}{{12.2}{280}} +\@writefile{toc}{\contentsline {subsection}{\numberline {12.2.2}Discretization}{281}} +\newlabel{ch13:sec:02.02}{{12.2.2}{281}} +\newlabel{ch13:eq:03}{{12.3}{281}} +\newlabel{ch13:eq:04}{{12.4}{281}} +\newlabel{ch13:eq:05}{{12.5}{281}} +\@writefile{toc}{\contentsline {section}{\numberline {12.3}Parallel iterative method}{282}} +\newlabel{ch13:sec:03}{{12.3}{282}} +\newlabel{ch13:eq:06}{{12.6}{282}} +\newlabel{ch13:eq:07}{{12.7}{282}} +\newlabel{ch13:eq:08}{{12.8}{282}} +\newlabel{ch13:eq:09}{{12.9}{282}} +\newlabel{ch13:eq:10}{{12.10}{283}} +\newlabel{ch13:eq:11}{{12.11}{283}} +\newlabel{ch13:eq:12}{{12.12}{283}} +\newlabel{ch13:eq:13}{{12.13}{284}} +\newlabel{ch13:eq:14}{{12.14}{284}} +\newlabel{ch13:eq:15}{{12.15}{284}} +\newlabel{ch13:eq:16}{{12.16}{284}} +\@writefile{toc}{\contentsline {section}{\numberline {12.4}Parallel implementation on a GPU cluster}{285}} +\newlabel{ch13:sec:04}{{12.4}{285}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.1}{\ignorespaces Data partitioning of a problem to be solved among $S=3\times 4$ computing nodes.\relax }}{285}} +\newlabel{ch13:fig:01}{{12.1}{285}} +\@writefile{loa}{\contentsline {algocf}{\numberline {11}{\ignorespaces Parallel solving of the obstacle problem on a GPU cluster\relax }}{286}} +\newlabel{ch13:alg:01}{{11}{286}} +\newlabel{ch13:eq:18}{{12.17}{286}} +\@writefile{loa}{\contentsline {algocf}{\numberline {12}{\ignorespaces Parallel iterative solving of the nonlinear systems on a GPU cluster ($Solve()$ function)\relax }}{287}} +\newlabel{ch13:alg:02}{{12}{287}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.2}{\ignorespaces Decomposition of a sub-problem in a GPU into $nz$ slices.\relax }}{288}} +\newlabel{ch13:fig:02}{{12.2}{288}} +\newlabel{ch13:list:01}{{12.1}{288}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.1}Skeleton codes of a GPU kernel and a CPU function}{288}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.3}{\ignorespaces Matrix constant coefficients in a three-dimensional domain.\relax }}{290}} +\newlabel{ch13:fig:03}{{12.3}{290}} +\newlabel{ch13:eq:17}{{12.18}{290}} +\newlabel{ch13:list:02}{{12.2}{290}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.2}GPU kernels of the projected Richardson method}{290}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.4}{\ignorespaces Computation of a vector element with the projected Richardson method.\relax }}{292}} +\newlabel{ch13:fig:04}{{12.4}{292}} +\newlabel{ch13:list:03}{{12.3}{292}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.3}Memory access to the cache texture memory}{292}} +\@writefile{toc}{\contentsline {section}{\numberline {12.5}Experimental tests on a GPU cluster}{293}} +\newlabel{ch13:sec:05}{{12.5}{293}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.5}{\ignorespaces GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs.\relax }}{295}} +\newlabel{ch13:fig:05}{{12.5}{295}} +\@writefile{lot}{\contentsline {table}{\numberline {12.1}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 24 CPU cores.\relax }}{295}} +\newlabel{ch13:tab:01}{{12.1}{295}} +\@writefile{lot}{\contentsline {table}{\numberline {12.2}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.\relax }}{296}} +\newlabel{ch13:tab:02}{{12.2}{296}} +\@writefile{toc}{\contentsline {section}{\numberline {12.6}Red-Black ordering technique}{296}} +\newlabel{ch13:sec:06}{{12.6}{296}} +\newlabel{ch13:list:04}{{12.4}{297}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.4}GPU kernels of the projected Richardson method using the red-black technique}{297}} +\newlabel{ch13:fig:06.01}{{12.6(a)}{298}} +\newlabel{sub@ch13:fig:06.01}{{(a)}{298}} +\newlabel{ch13:fig:06.02}{{12.6(b)}{298}} +\newlabel{sub@ch13:fig:06.02}{{(b)}{298}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.6}{\ignorespaces Red-Black ordering for computing the iterate vector elements in a three-dimensional space.\relax }}{298}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Red-Black ordering on x, y and z axises}}}{298}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Red-Black ordering on y axis}}}{298}} +\@writefile{lot}{\contentsline {table}{\numberline {12.3}{\ignorespaces Execution times in seconds of the parallel projected Richardson method using read-black ordering technique implemented on a cluster of 12 GPUs.\relax }}{299}} +\newlabel{ch13:tab:03}{{12.3}{299}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.7}{\ignorespaces Weak scaling of both synchronous and asynchronous algorithms of the projected Richardson method using red-black ordering technique.\relax }}{300}} +\newlabel{ch13:fig:07}{{12.7}{300}} +\@writefile{toc}{\contentsline {section}{\numberline {12.7}Conclusion}{300}} +\newlabel{ch13:sec:07}{{12.7}{300}} +\@writefile{toc}{\contentsline {section}{Bibliography}{301}} \@setckpt{Chapters/chapter13/ch13}{ -\setcounter{page}{295} +\setcounter{page}{303} \setcounter{equation}{18} \setcounter{enumi}{4} \setcounter{enumii}{0} @@ -92,7 +92,7 @@ \setcounter{enumiv}{15} \setcounter{footnote}{0} \setcounter{mpfootnote}{0} -\setcounter{part}{1} +\setcounter{part}{5} \setcounter{chapter}{12} \setcounter{section}{7} \setcounter{subsection}{0} diff --git a/BookGPU/Chapters/chapter16/ch16.aux b/BookGPU/Chapters/chapter16/ch16.aux index 19fea95..75c0afb 100644 --- a/BookGPU/Chapters/chapter16/ch16.aux +++ b/BookGPU/Chapters/chapter16/ch16.aux @@ -4,72 +4,72 @@ \@writefile{toc}{\author{H. Wang}{}} \@writefile{toc}{\author{H. Yu}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {15}GPU-Accelerated Envelope-Following Method}{335}} +\@writefile{toc}{\contentsline {chapter}{\numberline {15}GPU-Accelerated Envelope-Following Method}{343}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {section}{\numberline {15.1}Introduction}{335}} -\newlabel{fig:ef1}{{15.1(a)}{337}} -\newlabel{sub@fig:ef1}{{(a)}{337}} -\newlabel{fig:ef2}{{15.1(b)}{337}} -\newlabel{sub@fig:ef2}{{(b)}{337}} -\@writefile{lof}{\contentsline {figure}{\numberline {15.1}{\ignorespaces Transient envelope-following analysis. (Both two figures reflect backward-Euler style envelope-following.)\relax }}{337}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Illustration of one envelope skip.}}}{337}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {The envelope changes in a slow time scale.}}}{337}} -\newlabel{fig:ef_intro}{{15.1}{337}} -\@writefile{toc}{\contentsline {section}{\numberline {15.2}The envelope-following method in a nutshell}{338}} -\newlabel{sec:ef}{{15.2}{338}} -\newlabel{eq:dae}{{15.1}{338}} -\newlabel{eq:Newton}{{15.2}{339}} -\newlabel{eq:A}{{15.3}{339}} -\@writefile{toc}{\contentsline {section}{\numberline {15.3}New parallel envelope-following method}{340}} -\newlabel{sec:gmres}{{15.3}{340}} -\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.1}GMRES solver for Newton update equation}{340}} -\@writefile{lof}{\contentsline {figure}{\numberline {15.2}{\ignorespaces The flow of envelope-following method.\relax }}{341}} -\newlabel{fig:ef_flow}{{15.2}{341}} -\@writefile{loa}{\contentsline {algocf}{\numberline {14}{\ignorespaces Standard GMRES algorithm.\relax }}{342}} -\newlabel{alg:GMRES}{{14}{342}} -\newlabel{line:mvp}{{5}{342}} -\newlabel{line:newnorm}{{11}{342}} -\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.2}Parallelization on GPU platforms}{342}} -\newlabel{sec:gpu}{{15.3.2}{342}} -\@writefile{lof}{\contentsline {figure}{\numberline {15.3}{\ignorespaces GPU parallel solver for envelope-following update.\relax }}{343}} -\newlabel{fig:gmres}{{15.3}{343}} -\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.3}Gear-2 based sensitivity calculation}{344}} -\newlabel{sec:gear}{{15.3.3}{344}} -\newlabel{eq:BE}{{15.4}{344}} -\newlabel{eq:sens1}{{15.5}{344}} -\newlabel{eq:Gear_t2}{{15.6}{345}} -\newlabel{eq:sens2}{{15.7}{345}} -\newlabel{eq:Gear_t3}{{15.8}{345}} -\newlabel{eq:sensM}{{15.9}{345}} -\@writefile{loa}{\contentsline {algocf}{\numberline {15}{\ignorespaces The matrix-free method for Krylov subspace construction.\relax }}{346}} -\newlabel{alg:mf_Gear}{{15}{346}} -\newlabel{line:mf_Gear_loop}{{4}{346}} -\newlabel{line:shift}{{8}{346}} -\@writefile{toc}{\contentsline {section}{\numberline {15.4}Numerical examples}{346}} -\newlabel{sec:exp}{{15.4}{346}} -\@writefile{lof}{\contentsline {figure}{\numberline {15.4}{\ignorespaces Diagram of a zero-voltage quasi-resonant flyback converter.\relax }}{347}} -\newlabel{fig:flyback}{{15.4}{347}} -\@writefile{lof}{\contentsline {figure}{\numberline {15.5}{\ignorespaces Illustration of power/ground network model.\relax }}{347}} -\newlabel{fig:pg}{{15.5}{347}} -\newlabel{fig:flybackWhole}{{15.6(a)}{348}} -\newlabel{sub@fig:flybackWhole}{{(a)}{348}} -\newlabel{fig:flybackZoom}{{15.6(b)}{348}} -\newlabel{sub@fig:flybackZoom}{{(b)}{348}} -\@writefile{lof}{\contentsline {figure}{\numberline {15.6}{\ignorespaces Flyback converter solution calculated by envelope-following. The red curve is traditional SPICE simulation result, and the back curve is the envelope-following output with simulation points marked.\relax }}{348}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {The whole plot}}}{348}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Detail of one EF simulation period}}}{348}} -\newlabel{fig:flyback_wave}{{15.6}{348}} -\@writefile{lof}{\contentsline {figure}{\numberline {15.7}{\ignorespaces Buck converter solution calculated by envelope-following.\relax }}{349}} -\newlabel{fig:buck_wave}{{15.7}{349}} -\@writefile{lot}{\contentsline {table}{\numberline {15.1}{\ignorespaces CPU and GPU time comparisons (in seconds) for solving Newton update equation with the proposed Gear-2 sensitivity. \relax }}{349}} -\newlabel{table:circuit}{{15.1}{349}} -\@writefile{toc}{\contentsline {section}{\numberline {15.5}Summary}{350}} -\newlabel{sec:summary}{{15.5}{350}} -\@writefile{toc}{\contentsline {section}{\numberline {15.6}Glossary}{350}} -\@writefile{toc}{\contentsline {section}{Bibliography}{350}} +\@writefile{toc}{\contentsline {section}{\numberline {15.1}Introduction}{343}} +\newlabel{fig:ef1}{{15.1(a)}{345}} +\newlabel{sub@fig:ef1}{{(a)}{345}} +\newlabel{fig:ef2}{{15.1(b)}{345}} +\newlabel{sub@fig:ef2}{{(b)}{345}} +\@writefile{lof}{\contentsline {figure}{\numberline {15.1}{\ignorespaces Transient envelope-following analysis. (Both two figures reflect backward-Euler style envelope-following.)\relax }}{345}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Illustration of one envelope skip.}}}{345}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {The envelope changes in a slow time scale.}}}{345}} +\newlabel{fig:ef_intro}{{15.1}{345}} +\@writefile{toc}{\contentsline {section}{\numberline {15.2}The envelope-following method in a nutshell}{346}} +\newlabel{sec:ef}{{15.2}{346}} +\newlabel{eq:dae}{{15.1}{346}} +\newlabel{eq:Newton}{{15.2}{347}} +\newlabel{eq:A}{{15.3}{347}} +\@writefile{toc}{\contentsline {section}{\numberline {15.3}New parallel envelope-following method}{348}} +\newlabel{sec:gmres}{{15.3}{348}} +\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.1}GMRES solver for Newton update equation}{348}} +\@writefile{lof}{\contentsline {figure}{\numberline {15.2}{\ignorespaces The flow of envelope-following method.\relax }}{349}} +\newlabel{fig:ef_flow}{{15.2}{349}} +\@writefile{loa}{\contentsline {algocf}{\numberline {14}{\ignorespaces Standard GMRES algorithm.\relax }}{350}} +\newlabel{alg:GMRES}{{14}{350}} +\newlabel{line:mvp}{{5}{350}} +\newlabel{line:newnorm}{{11}{350}} +\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.2}Parallelization on GPU platforms}{350}} +\newlabel{sec:gpu}{{15.3.2}{350}} +\@writefile{lof}{\contentsline {figure}{\numberline {15.3}{\ignorespaces GPU parallel solver for envelope-following update.\relax }}{351}} +\newlabel{fig:gmres}{{15.3}{351}} +\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.3}Gear-2 based sensitivity calculation}{352}} +\newlabel{sec:gear}{{15.3.3}{352}} +\newlabel{eq:BE}{{15.4}{352}} +\newlabel{eq:sens1}{{15.5}{352}} +\newlabel{eq:Gear_t2}{{15.6}{353}} +\newlabel{eq:sens2}{{15.7}{353}} +\newlabel{eq:Gear_t3}{{15.8}{353}} +\newlabel{eq:sensM}{{15.9}{353}} +\@writefile{loa}{\contentsline {algocf}{\numberline {15}{\ignorespaces The matrix-free method for Krylov subspace construction.\relax }}{354}} +\newlabel{alg:mf_Gear}{{15}{354}} +\newlabel{line:mf_Gear_loop}{{4}{354}} +\newlabel{line:shift}{{8}{354}} +\@writefile{toc}{\contentsline {section}{\numberline {15.4}Numerical examples}{354}} +\newlabel{sec:exp}{{15.4}{354}} +\@writefile{lof}{\contentsline {figure}{\numberline {15.4}{\ignorespaces Diagram of a zero-voltage quasi-resonant flyback converter.\relax }}{355}} +\newlabel{fig:flyback}{{15.4}{355}} +\@writefile{lof}{\contentsline {figure}{\numberline {15.5}{\ignorespaces Illustration of power/ground network model.\relax }}{355}} +\newlabel{fig:pg}{{15.5}{355}} +\newlabel{fig:flybackWhole}{{15.6(a)}{356}} +\newlabel{sub@fig:flybackWhole}{{(a)}{356}} +\newlabel{fig:flybackZoom}{{15.6(b)}{356}} +\newlabel{sub@fig:flybackZoom}{{(b)}{356}} +\@writefile{lof}{\contentsline {figure}{\numberline {15.6}{\ignorespaces Flyback converter solution calculated by envelope-following. The red curve is traditional SPICE simulation result, and the back curve is the envelope-following output with simulation points marked.\relax }}{356}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {The whole plot}}}{356}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Detail of one EF simulation period}}}{356}} +\newlabel{fig:flyback_wave}{{15.6}{356}} +\@writefile{lof}{\contentsline {figure}{\numberline {15.7}{\ignorespaces Buck converter solution calculated by envelope-following.\relax }}{357}} +\newlabel{fig:buck_wave}{{15.7}{357}} +\@writefile{lot}{\contentsline {table}{\numberline {15.1}{\ignorespaces CPU and GPU time comparisons (in seconds) for solving Newton update equation with the proposed Gear-2 sensitivity. \relax }}{357}} +\newlabel{table:circuit}{{15.1}{357}} +\@writefile{toc}{\contentsline {section}{\numberline {15.5}Summary}{358}} +\newlabel{sec:summary}{{15.5}{358}} +\@writefile{toc}{\contentsline {section}{\numberline {15.6}Glossary}{358}} +\@writefile{toc}{\contentsline {section}{Bibliography}{358}} \@setckpt{Chapters/chapter16/ch16}{ -\setcounter{page}{352} +\setcounter{page}{360} \setcounter{equation}{9} \setcounter{enumi}{2} \setcounter{enumii}{0} @@ -77,7 +77,7 @@ \setcounter{enumiv}{22} \setcounter{footnote}{0} \setcounter{mpfootnote}{0} -\setcounter{part}{1} +\setcounter{part}{5} \setcounter{chapter}{15} \setcounter{section}{6} \setcounter{subsection}{0} diff --git a/BookGPU/Chapters/chapter18/ch18.aux b/BookGPU/Chapters/chapter18/ch18.aux index e01c7b1..618a140 100644 --- a/BookGPU/Chapters/chapter18/ch18.aux +++ b/BookGPU/Chapters/chapter18/ch18.aux @@ -2,44 +2,45 @@ \@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} \@writefile{toc}{\author{Christophe Guyeux}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {16}Pseudorandom Number Generator on GPU}{353}} +\@writefile{toc}{\contentsline {chapter}{\numberline {16}Pseudorandom Number Generator on GPU}{363}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\newlabel{chapter18}{{16}{353}} -\@writefile{toc}{\contentsline {section}{\numberline {16.1}Introduction}{353}} -\@writefile{toc}{\contentsline {section}{\numberline {16.2}Basic Remindees}{355}} -\newlabel{section:BASIC RECALLS}{{16.2}{355}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.1}A Short Presentation of Chaos}{355}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.2}On Devaney's Definition of Chaos}{355}} -\newlabel{sec:dev}{{16.2.2}{355}} -\newlabel{Devaney}{{16.1}{355}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.3}Chaotic iterations}{356}} -\newlabel{subsection:Chaotic iterations}{{16.2.3}{356}} -\newlabel{Chaotic iterations}{{2}{356}} -\newlabel{eq:generalIC}{{16.4}{357}} -\newlabel{equation Oplus}{{16.5}{357}} -\@writefile{toc}{\contentsline {section}{\numberline {16.3}Toward Efficiency and Improvement for CI PRNG}{357}} -\newlabel{sec:efficient PRNG}{{16.3}{357}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{357}} -\newlabel{algo:seqCIPRNG}{{16.1}{357}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {16.1}C code of the sequential PRNG based on chaotic iterations}{357}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{358}} -\newlabel{sec:efficient PRNG gpu}{{16.3.2}{358}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.3}Naive Version for GPU}{358}} -\@writefile{loa}{\contentsline {algocf}{\numberline {16}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{359}} -\newlabel{algo:gpu_kernel}{{16}{359}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.4}Improved Version for GPU}{359}} -\newlabel{IR}{{17}{360}} -\@writefile{loa}{\contentsline {algocf}{\numberline {17}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{360}} -\newlabel{algo:gpu_kernel2}{{17}{360}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.5}Chaos Evaluation of the Improved Version}{360}} -\@writefile{toc}{\contentsline {section}{\numberline {16.4}Experiments}{361}} -\newlabel{sec:experiments}{{16.4}{361}} -\@writefile{toc}{\contentsline {section}{Bibliography}{361}} -\@writefile{lof}{\contentsline {figure}{\numberline {16.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{362}} -\newlabel{fig:time_xorlike_gpu}{{16.1}{362}} +\newlabel{chapter18}{{16}{363}} +\@writefile{toc}{\contentsline {section}{\numberline {16.1}Introduction}{363}} +\@writefile{toc}{\contentsline {section}{\numberline {16.2}Basic Remindees}{365}} +\newlabel{section:BASIC RECALLS}{{16.2}{365}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.1}A Short Presentation of Chaos}{365}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.2}On Devaney's Definition of Chaos}{365}} +\newlabel{sec:dev}{{16.2.2}{365}} +\newlabel{Devaney}{{16.1}{365}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.3}Chaotic iterations}{366}} +\newlabel{subsection:Chaotic iterations}{{16.2.3}{366}} +\newlabel{Chaotic iterations}{{2}{366}} +\newlabel{eq:generalIC}{{16.4}{367}} +\newlabel{equation Oplus}{{16.5}{367}} +\@writefile{toc}{\contentsline {section}{\numberline {16.3}Toward Efficiency and Improvement for CI PRNG}{367}} +\newlabel{sec:efficient PRNG}{{16.3}{367}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{367}} +\newlabel{algo:seqCIPRNG}{{16.1}{367}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {16.1}C code of the sequential PRNG based on chaotic iterations}{367}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{368}} +\newlabel{sec:efficient PRNG gpu}{{16.3.2}{368}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.3}Naive Version for GPU}{368}} +\@writefile{loa}{\contentsline {algocf}{\numberline {16}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{369}} +\newlabel{algo:gpu_kernel}{{16}{369}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.4}Improved Version for GPU}{369}} +\newlabel{IR}{{17}{370}} +\@writefile{loa}{\contentsline {algocf}{\numberline {17}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{370}} +\newlabel{algo:gpu_kernel2}{{17}{370}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.5}Chaos Evaluation of the Improved Version}{370}} +\@writefile{toc}{\contentsline {section}{\numberline {16.4}Experiments}{371}} +\newlabel{sec:experiments}{{16.4}{371}} +\@writefile{toc}{\contentsline {section}{\numberline {16.5}Summary}{371}} +\@writefile{lof}{\contentsline {figure}{\numberline {16.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{372}} +\newlabel{fig:time_xorlike_gpu}{{16.1}{372}} +\@writefile{toc}{\contentsline {section}{Bibliography}{373}} \@setckpt{Chapters/chapter18/ch18}{ -\setcounter{page}{364} +\setcounter{page}{375} \setcounter{equation}{5} \setcounter{enumi}{2} \setcounter{enumii}{0} @@ -47,9 +48,9 @@ \setcounter{enumiv}{17} \setcounter{footnote}{2} \setcounter{mpfootnote}{0} -\setcounter{part}{1} +\setcounter{part}{6} \setcounter{chapter}{16} -\setcounter{section}{4} +\setcounter{section}{5} \setcounter{subsection}{0} \setcounter{subsubsection}{0} \setcounter{paragraph}{0} diff --git a/BookGPU/Chapters/chapter3/ch3.aux b/BookGPU/Chapters/chapter3/ch3.aux index 91483ed..f86e67e 100644 --- a/BookGPU/Chapters/chapter3/ch3.aux +++ b/BookGPU/Chapters/chapter3/ch3.aux @@ -1,116 +1,116 @@ \relax \@writefile{toc}{\author{Gilles Perrot}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {3}Setting up the environnement.}{23}} +\@writefile{toc}{\contentsline {chapter}{\numberline {3}Setting up the environnement.}{25}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\newlabel{algo:memcopy:H2D}{{7}{23}} -\newlabel{algo:memcopy:kernel}{{8}{23}} -\newlabel{algo:memcopy:D2H}{{9}{23}} -\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{23}} -\newlabel{algo:memcopy}{{1}{23}} -\@writefile{toc}{\contentsline {section}{\numberline {3.1}Data transfers, memory management.}{24}} -\newlabel{lst:main1}{{3.1}{25}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}Generic main.cu file used to launch CUDA kernels}{25}} -\newlabel{lst:fkern1}{{3.2}{25}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}fast\_kernels.cu file featuring one kernel skeleton}{25}} -\newlabel{lst:mkfile}{{3.3}{26}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}Generic Makefile based on those provided by NV SDK}{26}} -\@writefile{toc}{\contentsline {section}{\numberline {3.2}Performance measurements}{26}} -\newlabel{lst:chronos}{{3.4}{26}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Time measurement technique using cutil functions}{26}} +\newlabel{algo:memcopy:H2D}{{7}{25}} +\newlabel{algo:memcopy:kernel}{{8}{25}} +\newlabel{algo:memcopy:D2H}{{9}{25}} +\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{25}} +\newlabel{algo:memcopy}{{1}{25}} +\@writefile{toc}{\contentsline {section}{\numberline {3.1}Data transfers, memory management.}{26}} +\newlabel{lst:main1}{{3.1}{27}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}Generic main.cu file used to launch CUDA kernels}{27}} +\newlabel{lst:fkern1}{{3.2}{27}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}fast\_kernels.cu file featuring one kernel skeleton}{27}} +\newlabel{lst:mkfile}{{3.3}{28}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}Generic Makefile based on those provided by NV SDK}{28}} +\@writefile{toc}{\contentsline {section}{\numberline {3.2}Performance measurements}{28}} +\newlabel{lst:chronos}{{3.4}{28}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Time measurement technique using cutil functions}{28}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {4}Implementing a fast median filter}{29}} +\@writefile{toc}{\contentsline {chapter}{\numberline {4}Implementing a fast median filter}{31}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} \@writefile{toc}{\author{Gilles Perrot}{}} -\@writefile{toc}{\contentsline {section}{\numberline {4.1}Introduction}{29}} -\@writefile{toc}{\contentsline {section}{\numberline {4.2}Median filtering}{30}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.1}Basic principles}{30}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.2}A naive implementation}{30}} -\newlabel{img:sap_example_ref}{{4.1(a)}{31}} -\newlabel{sub@img:sap_example_ref}{{(a)}{31}} -\newlabel{img:sap_example_med3}{{4.1(b)}{31}} -\newlabel{sub@img:sap_example_med3}{{(b)}{31}} -\newlabel{img:sap_example_med5}{{4.1(c)}{31}} -\newlabel{sub@img:sap_example_med5}{{(c)}{31}} -\newlabel{img:sap_example_med3_it2}{{4.1(d)}{31}} -\newlabel{sub@img:sap_example_med3_it2}{{(d)}{31}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Exemple of median filtering, applied to salt \& pepper noise reduction.\relax }}{31}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted by salt and pepper noise of density 0.25}}}{31}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ median filter}}}{31}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ median filter}}}{31}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image denoised by 2 iterations of a $3\times 3$ median filter}}}{31}} -\newlabel{fig:sap_examples}{{4.1}{31}} -\newlabel{lst:medianGeneric}{{4.1}{32}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.1}Generic CUDA kernel achieving median filtering}{32}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Exemple of 5x5 median filtering\relax }}{33}} -\newlabel{fig:median_1}{{4.2}{33}} -\newlabel{algoMedianGeneric}{{2}{33}} -\newlabel{algoMedianGeneric:memcpyH2D}{{1}{33}} -\newlabel{algoMedianGeneric:cptstart}{{3}{33}} -\newlabel{algoMedianGeneric:cptend}{{5}{33}} -\newlabel{algoMedianGeneric:memcpyD2H}{{7}{33}} -\@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{33}} -\@writefile{toc}{\contentsline {section}{\numberline {4.3}NVidia GPU tuning recipes}{33}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{34}} -\newlabel{fig:median_overlap}{{4.3}{34}} -\@writefile{lot}{\contentsline {table}{\numberline {4.1}{\ignorespaces Performance results of \texttt {kernel medianR}. \relax }}{34}} -\newlabel{tab:medianHisto1}{{4.1}{34}} -\@writefile{toc}{\contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers }{35}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.1}The simplest way}{35}} -\newlabel{lst:kernelMedian3RegTri9}{{4.2}{36}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}3$\times $3 median filter kernel using one register per neighborhood pixel and bubble sort}{36}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Further optimization}{36}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{36}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs on GPU C2070 and CPU for generic median, 3$\times $3 median register-only and \textit {libJacket}.\relax }}{37}} -\newlabel{fig:compMedians1}{{4.4}{37}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{37}} -\newlabel{fig:forgetful_selection}{{4.5}{37}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{38}} -\newlabel{fig:forgetful3}{{4.6}{38}} -\newlabel{lst:medianForget1pix3}{{4.3}{39}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{39}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{40}} -\newlabel{fig:median3_overlap}{{4.7}{40}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{40}} -\newlabel{lst:medianForget2pix3}{{4.4}{40}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{40}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{41}} -\newlabel{fig:compMedians2}{{4.8}{41}} -\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{42}} -\newlabel{sec:median5}{{4.5.1}{42}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{42}} -\newlabel{lst:medianForget2pix5}{{4.5}{42}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{42}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.\relax }}{43}} -\newlabel{fig:median5overlap}{{4.9}{43}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.\relax }}{43}} -\newlabel{fig:median5overlap}{{4.10}{43}} -\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{44}} -\newlabel{tab:median5comp}{{4.2}{44}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{45}} -\newlabel{lst:medianSeparable}{{4.6}{45}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{45}} -\newlabel{img:sap_example_ref}{{4.11(a)}{46}} -\newlabel{sub@img:sap_example_ref}{{(a)}{46}} -\newlabel{img:sap_example_sep_med3}{{4.11(b)}{46}} -\newlabel{sub@img:sap_example_sep_med3}{{(b)}{46}} -\newlabel{img:sap_example_sep_med5}{{4.11(c)}{46}} -\newlabel{sub@img:sap_example_sep_med5}{{(c)}{46}} -\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{46}} -\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{46}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{46}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{46}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{46}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{46}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{46}} -\newlabel{fig:sap_examples2}{{4.11}{46}} -\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{47}} -\newlabel{tab:medianSeparable}{{4.3}{47}} -\@writefile{toc}{\contentsline {section}{Bibliography}{48}} +\@writefile{toc}{\contentsline {section}{\numberline {4.1}Introduction}{31}} +\@writefile{toc}{\contentsline {section}{\numberline {4.2}Median filtering}{32}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.1}Basic principles}{32}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.2}A naive implementation}{32}} +\newlabel{img:sap_example_ref}{{4.1(a)}{33}} +\newlabel{sub@img:sap_example_ref}{{(a)}{33}} +\newlabel{img:sap_example_med3}{{4.1(b)}{33}} +\newlabel{sub@img:sap_example_med3}{{(b)}{33}} +\newlabel{img:sap_example_med5}{{4.1(c)}{33}} +\newlabel{sub@img:sap_example_med5}{{(c)}{33}} +\newlabel{img:sap_example_med3_it2}{{4.1(d)}{33}} +\newlabel{sub@img:sap_example_med3_it2}{{(d)}{33}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Exemple of median filtering, applied to salt \& pepper noise reduction.\relax }}{33}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted by salt and pepper noise of density 0.25}}}{33}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ median filter}}}{33}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ median filter}}}{33}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image denoised by 2 iterations of a $3\times 3$ median filter}}}{33}} +\newlabel{fig:sap_examples}{{4.1}{33}} +\newlabel{lst:medianGeneric}{{4.1}{34}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.1}Generic CUDA kernel achieving median filtering}{34}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Exemple of 5x5 median filtering\relax }}{35}} +\newlabel{fig:median_1}{{4.2}{35}} +\newlabel{algoMedianGeneric}{{2}{35}} +\newlabel{algoMedianGeneric:memcpyH2D}{{1}{35}} +\newlabel{algoMedianGeneric:cptstart}{{3}{35}} +\newlabel{algoMedianGeneric:cptend}{{5}{35}} +\newlabel{algoMedianGeneric:memcpyD2H}{{7}{35}} +\@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{35}} +\@writefile{toc}{\contentsline {section}{\numberline {4.3}NVidia GPU tuning recipes}{35}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{36}} +\newlabel{fig:median_overlap}{{4.3}{36}} +\@writefile{lot}{\contentsline {table}{\numberline {4.1}{\ignorespaces Performance results of \texttt {kernel medianR}. \relax }}{36}} +\newlabel{tab:medianHisto1}{{4.1}{36}} +\@writefile{toc}{\contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers }{37}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.1}The simplest way}{37}} +\newlabel{lst:kernelMedian3RegTri9}{{4.2}{38}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}3$\times $3 median filter kernel using one register per neighborhood pixel and bubble sort}{38}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Further optimization}{38}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{38}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs on GPU C2070 and CPU for generic median, 3$\times $3 median register-only and \textit {libJacket}.\relax }}{39}} +\newlabel{fig:compMedians1}{{4.4}{39}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{39}} +\newlabel{fig:forgetful_selection}{{4.5}{39}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{40}} +\newlabel{fig:forgetful3}{{4.6}{40}} +\newlabel{lst:medianForget1pix3}{{4.3}{41}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{41}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{42}} +\newlabel{fig:median3_overlap}{{4.7}{42}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{42}} +\newlabel{lst:medianForget2pix3}{{4.4}{42}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{42}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{43}} +\newlabel{fig:compMedians2}{{4.8}{43}} +\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{44}} +\newlabel{sec:median5}{{4.5.1}{44}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{44}} +\newlabel{lst:medianForget2pix5}{{4.5}{44}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{44}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.\relax }}{45}} +\newlabel{fig:median5overlap}{{4.9}{45}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.\relax }}{45}} +\newlabel{fig:median5overlap}{{4.10}{45}} +\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{46}} +\newlabel{tab:median5comp}{{4.2}{46}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{47}} +\newlabel{lst:medianSeparable}{{4.6}{47}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{47}} +\newlabel{img:sap_example_ref}{{4.11(a)}{48}} +\newlabel{sub@img:sap_example_ref}{{(a)}{48}} +\newlabel{img:sap_example_sep_med3}{{4.11(b)}{48}} +\newlabel{sub@img:sap_example_sep_med3}{{(b)}{48}} +\newlabel{img:sap_example_sep_med5}{{4.11(c)}{48}} +\newlabel{sub@img:sap_example_sep_med5}{{(c)}{48}} +\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{48}} +\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{48}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{48}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{48}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{48}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{48}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{48}} +\newlabel{fig:sap_examples2}{{4.11}{48}} +\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{49}} +\newlabel{tab:medianSeparable}{{4.3}{49}} +\@writefile{toc}{\contentsline {section}{Bibliography}{50}} \@setckpt{Chapters/chapter3/ch3}{ -\setcounter{page}{50} +\setcounter{page}{52} \setcounter{equation}{0} \setcounter{enumi}{3} \setcounter{enumii}{0} @@ -118,7 +118,7 @@ \setcounter{enumiv}{10} \setcounter{footnote}{0} \setcounter{mpfootnote}{0} -\setcounter{part}{1} +\setcounter{part}{2} \setcounter{chapter}{4} \setcounter{section}{5} \setcounter{subsection}{2} diff --git a/BookGPU/Chapters/chapter6/ch6.aux b/BookGPU/Chapters/chapter6/ch6.aux index af524e9..1527dfc 100644 --- a/BookGPU/Chapters/chapter6/ch6.aux +++ b/BookGPU/Chapters/chapter6/ch6.aux @@ -3,110 +3,110 @@ \@writefile{toc}{\author{Stephane Vialle}{}} \@writefile{toc}{\author{Jens Gustedt}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {6}Development methodologies for GPU and cluster of GPUs}{83}} +\@writefile{toc}{\contentsline {chapter}{\numberline {6}Development methodologies for GPU and cluster of GPUs}{87}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {section}{\numberline {6.1}Introduction}{84}} -\newlabel{ch6:intro}{{6.1}{84}} -\@writefile{toc}{\contentsline {section}{\numberline {6.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{84}} -\newlabel{ch6:part1}{{6.2}{84}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.1}Synchronous parallel algorithms on GPU clusters}{84}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{86}} -\newlabel{fig:ch6p1overlapnative}{{6.1}{86}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.2}Native overlap of CPU communications and GPU computations}{86}} -\newlabel{algo:ch6p1overlapnative}{{6.1}{87}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{87}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{88}} -\newlabel{fig:ch6p1overlapseqsequence}{{6.2}{88}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{88}} -\newlabel{algo:ch6p1overlapseqsequence}{{6.2}{89}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{89}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{91}} -\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{91}} -\newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{91}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{91}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{93}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{94}} -\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{94}} -\newlabel{algo:ch6p1overlapinterleaved}{{6.4}{94}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{94}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{96}} -\newlabel{ch6:p1expes}{{6.2.5}{96}} -\newlabel{ch6:p1block-cyclic}{{6.2.5}{96}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{97}} -\newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{97}} -\@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{98}} -\newlabel{ch6:part2}{{6.3}{98}} -\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{98}} -\newlabel{algo:ch6p2sync}{{3}{98}} -\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{99}} -\newlabel{algo:ch6p2async}{{4}{99}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{100}} -\newlabel{ch6:p2BasicAsync}{{6.3.1}{100}} -\newlabel{algo:ch6p2BasicAsync}{{6.5}{100}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{100}} -\newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{101}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{101}} -\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{103}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{103}} -\newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{103}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{103}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{105}} -\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{105}} -\newlabel{algo:ch6p2Sync}{{6.9}{105}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{105}} -\newlabel{algo:ch6p2SyncComp}{{6.10}{106}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{106}} -\newlabel{algo:ch6p2SyncReceptions}{{6.11}{108}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{108}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{109}} -\newlabel{ch6:p2GPUAsync}{{6.3.3}{109}} -\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{110}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{110}} -\newlabel{algo:ch6p2syncGPU}{{6.13}{111}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{111}} -\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{114}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{114}} -\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{115}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{115}} -\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{116}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{116}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{117}} -\newlabel{sec:ch6p2expes}{{6.3.4}{117}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{118}} -\newlabel{fig:ch6p2syncasync}{{6.6}{118}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{119}} -\newlabel{fig:ch6p2aux}{{6.7}{119}} -\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{120}} -\newlabel{sec:ch6p3unify}{{6.4}{120}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{120}} -\newlabel{sec:ch6p3resources}{{6.4.1}{120}} -\newlabel{algo:ch6p3ORWLresources}{{6.17}{121}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{121}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{121}} -\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{121}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{122}} -\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{122}} -\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{122}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{122}} -\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{123}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{123}} -\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{123}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{123}} -\newlabel{algo:ch6p3ORWLtrans}{{6.21}{123}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{123}} -\newlabel{algo:ch6p3ORWLdecl}{{6.22}{124}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{124}} -\newlabel{algo:ch6p3ORWLinit}{{6.23}{124}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{124}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{125}} -\newlabel{sec:ch6p3tasks}{{6.4.4}{125}} -\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{126}} -\newlabel{ch6:conclu}{{6.5}{126}} -\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{126}} -\@writefile{toc}{\contentsline {section}{Bibliography}{127}} +\@writefile{toc}{\contentsline {section}{\numberline {6.1}Introduction}{88}} +\newlabel{ch6:intro}{{6.1}{88}} +\@writefile{toc}{\contentsline {section}{\numberline {6.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{88}} +\newlabel{ch6:part1}{{6.2}{88}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.1}Synchronous parallel algorithms on GPU clusters}{88}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{90}} +\newlabel{fig:ch6p1overlapnative}{{6.1}{90}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.2}Native overlap of CPU communications and GPU computations}{90}} +\newlabel{algo:ch6p1overlapnative}{{6.1}{91}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{91}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{92}} +\newlabel{fig:ch6p1overlapseqsequence}{{6.2}{92}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{92}} +\newlabel{algo:ch6p1overlapseqsequence}{{6.2}{93}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{93}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{95}} +\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{95}} +\newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{95}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{95}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{97}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{98}} +\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{98}} +\newlabel{algo:ch6p1overlapinterleaved}{{6.4}{98}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{98}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{100}} +\newlabel{ch6:p1expes}{{6.2.5}{100}} +\newlabel{ch6:p1block-cyclic}{{6.2.5}{100}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{101}} +\newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{101}} +\@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{102}} +\newlabel{ch6:part2}{{6.3}{102}} +\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{102}} +\newlabel{algo:ch6p2sync}{{3}{102}} +\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{103}} +\newlabel{algo:ch6p2async}{{4}{103}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{104}} +\newlabel{ch6:p2BasicAsync}{{6.3.1}{104}} +\newlabel{algo:ch6p2BasicAsync}{{6.5}{104}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{104}} +\newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{105}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{105}} +\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{107}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{107}} +\newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{107}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{107}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{109}} +\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{109}} +\newlabel{algo:ch6p2Sync}{{6.9}{109}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{109}} +\newlabel{algo:ch6p2SyncComp}{{6.10}{110}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{110}} +\newlabel{algo:ch6p2SyncReceptions}{{6.11}{112}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{112}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{113}} +\newlabel{ch6:p2GPUAsync}{{6.3.3}{113}} +\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{114}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{114}} +\newlabel{algo:ch6p2syncGPU}{{6.13}{115}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{115}} +\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{118}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{118}} +\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{119}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{119}} +\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{120}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{120}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{121}} +\newlabel{sec:ch6p2expes}{{6.3.4}{121}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{122}} +\newlabel{fig:ch6p2syncasync}{{6.6}{122}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{123}} +\newlabel{fig:ch6p2aux}{{6.7}{123}} +\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{124}} +\newlabel{sec:ch6p3unify}{{6.4}{124}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{124}} +\newlabel{sec:ch6p3resources}{{6.4.1}{124}} +\newlabel{algo:ch6p3ORWLresources}{{6.17}{125}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{125}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{125}} +\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{125}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{126}} +\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{126}} +\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{126}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{126}} +\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{127}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{127}} +\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{127}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{127}} +\newlabel{algo:ch6p3ORWLtrans}{{6.21}{127}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{127}} +\newlabel{algo:ch6p3ORWLdecl}{{6.22}{128}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{128}} +\newlabel{algo:ch6p3ORWLinit}{{6.23}{128}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{128}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{129}} +\newlabel{sec:ch6p3tasks}{{6.4.4}{129}} +\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{130}} +\newlabel{ch6:conclu}{{6.5}{130}} +\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{130}} +\@writefile{toc}{\contentsline {section}{Bibliography}{131}} \@setckpt{Chapters/chapter6/ch6}{ -\setcounter{page}{129} +\setcounter{page}{133} \setcounter{equation}{0} \setcounter{enumi}{4} \setcounter{enumii}{0} @@ -114,7 +114,7 @@ \setcounter{enumiv}{21} \setcounter{footnote}{0} \setcounter{mpfootnote}{0} -\setcounter{part}{1} +\setcounter{part}{3} \setcounter{chapter}{6} \setcounter{section}{6} \setcounter{subsection}{0}