]> AND Private Git Repository - book_gpu.git/commitdiff
Logo AND Algorithmique Numérique Distribuée

Private GIT Repository
new
authorcouturie <couturie@carcariass.(none)>
Wed, 20 Mar 2013 20:45:59 +0000 (21:45 +0100)
committercouturie <couturie@carcariass.(none)>
Wed, 20 Mar 2013 20:45:59 +0000 (21:45 +0100)
BookGPU/BookGPU.tex
BookGPU/Chapters/chapter12/ch12.aux
BookGPU/Chapters/chapter13/ch13.aux
BookGPU/Chapters/chapter16/ch16.aux
BookGPU/Chapters/chapter18/ch18.aux
BookGPU/Chapters/chapter3/ch3.aux
BookGPU/Chapters/chapter6/ch6.aux

index 46546ef7d70eee7d14d170ca160e6fdfbde89d23..de8e3594470be8197aa8db190db0795abef80f85 100755 (executable)
 \maketitle
 
 \frontmatter
 \maketitle
 
 \frontmatter
-\include{frontmatter/Foreword}
-\include{frontmatter/preface}
+%\include{frontmatter/Foreword}
+%\include{frontmatter/preface}
 
 \listoffigures
 \listoftables
 
 \listoffigures
 \listoftables
 \include{Chapters/symbollist}
 
 \setcounter{page}{1}
 \include{Chapters/symbollist}
 
 \setcounter{page}{1}
-\part{This is a Part}
+\part{Presentation of GPUs}
 \include{Chapters/chapter1/ch1}
 \include{Chapters/chapter2/ch2}
 \include{Chapters/chapter1/ch1}
 \include{Chapters/chapter2/ch2}
+\part{Image processing}
 \include{Chapters/chapter3/ch3}
 \include{Chapters/chapter3/ch3}
+\part{Software development}
 \include{Chapters/chapter5/ch5}
 \include{Chapters/chapter6/ch6}
 \include{Chapters/chapter5/ch5}
 \include{Chapters/chapter6/ch6}
-\include{Chapters/chapter7/ch7}
+\part{Optimization}
 \include{Chapters/chapter8/ch8}
 \include{Chapters/chapter9/ch9}
 \include{Chapters/chapter8/ch8}
 \include{Chapters/chapter9/ch9}
+
+\part{Numerical applications}
+\include{Chapters/chapter7/ch7}
 \include{Chapters/chapter11/ch11}
 \include{Chapters/chapter12/ch12}
 \include{Chapters/chapter13/ch13}
 \include{Chapters/chapter14/ch14}
 \include{Chapters/chapter15/ch15}
 \include{Chapters/chapter16/ch16}
 \include{Chapters/chapter11/ch11}
 \include{Chapters/chapter12/ch12}
 \include{Chapters/chapter13/ch13}
 \include{Chapters/chapter14/ch14}
 \include{Chapters/chapter15/ch15}
 \include{Chapters/chapter16/ch16}
- \include{Chapters/chapter18/ch18}
+\part{Other}
+\include{Chapters/chapter18/ch18}
 
 \bibliographystyle{hep}
 %%%\bibliography{biblio}
 
 \bibliographystyle{hep}
 %%%\bibliography{biblio}
index f25a4846176ecd98674eebc03655f6ecfa97d3e0..82783b42e49cb403d0eafb436d9e0f793ddf250f 100644 (file)
@@ -3,81 +3,81 @@
 \@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}}
 \@writefile{toc}{\author{Jacques Bahi}{}}
 \@writefile{loa}{\addvspace {10\p@ }}
 \@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}}
 \@writefile{toc}{\author{Jacques Bahi}{}}
 \@writefile{loa}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {chapter}{\numberline {11}Solving sparse linear systems with GMRES and CG methods on GPU clusters}{251}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {11}Solving sparse linear systems with GMRES and CG methods on GPU clusters}{259}}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
-\newlabel{ch12}{{11}{251}}
-\@writefile{toc}{\contentsline {section}{\numberline {11.1}Introduction}{251}}
-\newlabel{ch12:sec:01}{{11.1}{251}}
-\@writefile{toc}{\contentsline {section}{\numberline {11.2}Krylov iterative methods}{252}}
-\newlabel{ch12:sec:02}{{11.2}{252}}
-\newlabel{ch12:eq:01}{{11.1}{252}}
-\newlabel{ch12:eq:02}{{11.2}{252}}
-\newlabel{ch12:eq:03}{{11.3}{252}}
-\newlabel{ch12:eq:11}{{11.4}{253}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {11.2.1}CG method}{253}}
-\newlabel{ch12:sec:02.01}{{11.2.1}{253}}
-\newlabel{ch12:eq:04}{{11.5}{253}}
-\newlabel{ch12:eq:05}{{11.6}{253}}
-\newlabel{ch12:eq:06}{{11.7}{253}}
-\newlabel{ch12:eq:07}{{11.8}{253}}
-\newlabel{ch12:eq:08}{{11.9}{253}}
-\newlabel{ch12:eq:09}{{11.10}{253}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {9}{\ignorespaces Left-preconditioned CG method\relax }}{254}}
-\newlabel{ch12:alg:01}{{9}{254}}
-\newlabel{ch12:eq:10}{{11.11}{254}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {11.2.2}GMRES method}{255}}
-\newlabel{ch12:sec:02.02}{{11.2.2}{255}}
-\newlabel{ch12:eq:12}{{11.12}{255}}
-\newlabel{ch12:eq:13}{{11.13}{255}}
-\newlabel{ch12:eq:14}{{11.14}{255}}
-\newlabel{ch12:eq:15}{{11.15}{255}}
-\newlabel{ch12:eq:16}{{11.16}{255}}
-\newlabel{ch12:eq:17}{{11.17}{255}}
-\newlabel{ch12:eq:18}{{11.18}{255}}
-\newlabel{ch12:eq:19}{{11.19}{255}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {10}{\ignorespaces Left-preconditioned GMRES method with restarts\relax }}{256}}
-\newlabel{ch12:alg:02}{{10}{256}}
-\@writefile{toc}{\contentsline {section}{\numberline {11.3}Parallel implementation on a GPU cluster}{257}}
-\newlabel{ch12:sec:03}{{11.3}{257}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.1}Data partitioning}{257}}
-\newlabel{ch12:sec:03.01}{{11.3.1}{257}}
-\@writefile{lof}{\contentsline {figure}{\numberline {11.1}{\ignorespaces A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.\relax }}{258}}
-\newlabel{ch12:fig:01}{{11.1}{258}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.2}GPU computing}{258}}
-\newlabel{ch12:sec:03.02}{{11.3.2}{258}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.3}Data communications}{259}}
-\newlabel{ch12:sec:03.03}{{11.3.3}{259}}
-\@writefile{lof}{\contentsline {figure}{\numberline {11.2}{\ignorespaces Data exchanges between \textit  {Node 1} and its neighbors \textit  {Node 0}, \textit  {Node 2} and \textit  {Node 3}.\relax }}{260}}
-\newlabel{ch12:fig:02}{{11.2}{260}}
-\@writefile{lof}{\contentsline {figure}{\numberline {11.3}{\ignorespaces Columns reordering of a sparse sub-matrix.\relax }}{261}}
-\newlabel{ch12:fig:03}{{11.3}{261}}
-\@writefile{toc}{\contentsline {section}{\numberline {11.4}Experimental results}{262}}
-\newlabel{ch12:sec:04}{{11.4}{262}}
-\@writefile{lof}{\contentsline {figure}{\numberline {11.4}{\ignorespaces General scheme of the GPU cluster of tests composed of six machines, each with two GPUs.\relax }}{262}}
-\newlabel{ch12:fig:04}{{11.4}{262}}
-\@writefile{lof}{\contentsline {figure}{\numberline {11.5}{\ignorespaces Sketches of sparse matrices chosen from the Davis collection.\relax }}{263}}
-\newlabel{ch12:fig:05}{{11.5}{263}}
-\@writefile{lot}{\contentsline {table}{\numberline {11.1}{\ignorespaces Main characteristics of sparse matrices chosen from the Davis collection.\relax }}{263}}
-\newlabel{ch12:tab:01}{{11.1}{263}}
-\@writefile{lot}{\contentsline {table}{\numberline {11.2}{\ignorespaces Performances of the parallel CG method on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{264}}
-\newlabel{ch12:tab:02}{{11.2}{264}}
-\@writefile{lot}{\contentsline {table}{\numberline {11.3}{\ignorespaces Performances of the parallel GMRES method on a cluster 24 CPU cores vs. on cluster of 12 GPUs.\relax }}{264}}
-\newlabel{ch12:tab:03}{{11.3}{264}}
-\newlabel{ch12:eq:20}{{11.20}{265}}
-\@writefile{lof}{\contentsline {figure}{\numberline {11.6}{\ignorespaces Parallel generation of a large sparse matrix by four computing nodes.\relax }}{266}}
-\newlabel{ch12:fig:06}{{11.6}{266}}
-\@writefile{lot}{\contentsline {table}{\numberline {11.4}{\ignorespaces Main characteristics of sparse banded matrices generated from those of the Davis collection.\relax }}{266}}
-\newlabel{ch12:tab:04}{{11.4}{266}}
-\@writefile{lot}{\contentsline {table}{\numberline {11.5}{\ignorespaces Performances of the parallel CG method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{267}}
-\newlabel{ch12:tab:05}{{11.5}{267}}
-\@writefile{lot}{\contentsline {table}{\numberline {11.6}{\ignorespaces Performances of the parallel GMRES method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{267}}
-\newlabel{ch12:tab:06}{{11.6}{267}}
-\@writefile{toc}{\contentsline {section}{\numberline {11.5}Conclusion}{267}}
-\newlabel{ch12:sec:05}{{11.5}{267}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{268}}
+\newlabel{ch12}{{11}{259}}
+\@writefile{toc}{\contentsline {section}{\numberline {11.1}Introduction}{259}}
+\newlabel{ch12:sec:01}{{11.1}{259}}
+\@writefile{toc}{\contentsline {section}{\numberline {11.2}Krylov iterative methods}{260}}
+\newlabel{ch12:sec:02}{{11.2}{260}}
+\newlabel{ch12:eq:01}{{11.1}{260}}
+\newlabel{ch12:eq:02}{{11.2}{260}}
+\newlabel{ch12:eq:03}{{11.3}{260}}
+\newlabel{ch12:eq:11}{{11.4}{261}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {11.2.1}CG method}{261}}
+\newlabel{ch12:sec:02.01}{{11.2.1}{261}}
+\newlabel{ch12:eq:04}{{11.5}{261}}
+\newlabel{ch12:eq:05}{{11.6}{261}}
+\newlabel{ch12:eq:06}{{11.7}{261}}
+\newlabel{ch12:eq:07}{{11.8}{261}}
+\newlabel{ch12:eq:08}{{11.9}{261}}
+\newlabel{ch12:eq:09}{{11.10}{261}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {9}{\ignorespaces Left-preconditioned CG method\relax }}{262}}
+\newlabel{ch12:alg:01}{{9}{262}}
+\newlabel{ch12:eq:10}{{11.11}{262}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {11.2.2}GMRES method}{263}}
+\newlabel{ch12:sec:02.02}{{11.2.2}{263}}
+\newlabel{ch12:eq:12}{{11.12}{263}}
+\newlabel{ch12:eq:13}{{11.13}{263}}
+\newlabel{ch12:eq:14}{{11.14}{263}}
+\newlabel{ch12:eq:15}{{11.15}{263}}
+\newlabel{ch12:eq:16}{{11.16}{263}}
+\newlabel{ch12:eq:17}{{11.17}{263}}
+\newlabel{ch12:eq:18}{{11.18}{263}}
+\newlabel{ch12:eq:19}{{11.19}{263}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {10}{\ignorespaces Left-preconditioned GMRES method with restarts\relax }}{264}}
+\newlabel{ch12:alg:02}{{10}{264}}
+\@writefile{toc}{\contentsline {section}{\numberline {11.3}Parallel implementation on a GPU cluster}{265}}
+\newlabel{ch12:sec:03}{{11.3}{265}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.1}Data partitioning}{265}}
+\newlabel{ch12:sec:03.01}{{11.3.1}{265}}
+\@writefile{lof}{\contentsline {figure}{\numberline {11.1}{\ignorespaces A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.\relax }}{266}}
+\newlabel{ch12:fig:01}{{11.1}{266}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.2}GPU computing}{266}}
+\newlabel{ch12:sec:03.02}{{11.3.2}{266}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.3}Data communications}{267}}
+\newlabel{ch12:sec:03.03}{{11.3.3}{267}}
+\@writefile{lof}{\contentsline {figure}{\numberline {11.2}{\ignorespaces Data exchanges between \textit  {Node 1} and its neighbors \textit  {Node 0}, \textit  {Node 2} and \textit  {Node 3}.\relax }}{268}}
+\newlabel{ch12:fig:02}{{11.2}{268}}
+\@writefile{lof}{\contentsline {figure}{\numberline {11.3}{\ignorespaces Columns reordering of a sparse sub-matrix.\relax }}{269}}
+\newlabel{ch12:fig:03}{{11.3}{269}}
+\@writefile{toc}{\contentsline {section}{\numberline {11.4}Experimental results}{270}}
+\newlabel{ch12:sec:04}{{11.4}{270}}
+\@writefile{lof}{\contentsline {figure}{\numberline {11.4}{\ignorespaces General scheme of the GPU cluster of tests composed of six machines, each with two GPUs.\relax }}{270}}
+\newlabel{ch12:fig:04}{{11.4}{270}}
+\@writefile{lof}{\contentsline {figure}{\numberline {11.5}{\ignorespaces Sketches of sparse matrices chosen from the Davis collection.\relax }}{271}}
+\newlabel{ch12:fig:05}{{11.5}{271}}
+\@writefile{lot}{\contentsline {table}{\numberline {11.1}{\ignorespaces Main characteristics of sparse matrices chosen from the Davis collection.\relax }}{271}}
+\newlabel{ch12:tab:01}{{11.1}{271}}
+\@writefile{lot}{\contentsline {table}{\numberline {11.2}{\ignorespaces Performances of the parallel CG method on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{272}}
+\newlabel{ch12:tab:02}{{11.2}{272}}
+\@writefile{lot}{\contentsline {table}{\numberline {11.3}{\ignorespaces Performances of the parallel GMRES method on a cluster 24 CPU cores vs. on cluster of 12 GPUs.\relax }}{272}}
+\newlabel{ch12:tab:03}{{11.3}{272}}
+\newlabel{ch12:eq:20}{{11.20}{273}}
+\@writefile{lof}{\contentsline {figure}{\numberline {11.6}{\ignorespaces Parallel generation of a large sparse matrix by four computing nodes.\relax }}{274}}
+\newlabel{ch12:fig:06}{{11.6}{274}}
+\@writefile{lot}{\contentsline {table}{\numberline {11.4}{\ignorespaces Main characteristics of sparse banded matrices generated from those of the Davis collection.\relax }}{274}}
+\newlabel{ch12:tab:04}{{11.4}{274}}
+\@writefile{lot}{\contentsline {table}{\numberline {11.5}{\ignorespaces Performances of the parallel CG method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{275}}
+\newlabel{ch12:tab:05}{{11.5}{275}}
+\@writefile{lot}{\contentsline {table}{\numberline {11.6}{\ignorespaces Performances of the parallel GMRES method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{275}}
+\newlabel{ch12:tab:06}{{11.6}{275}}
+\@writefile{toc}{\contentsline {section}{\numberline {11.5}Conclusion}{275}}
+\newlabel{ch12:sec:05}{{11.5}{275}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{276}}
 \@setckpt{Chapters/chapter12/ch12}{
 \@setckpt{Chapters/chapter12/ch12}{
-\setcounter{page}{270}
+\setcounter{page}{278}
 \setcounter{equation}{22}
 \setcounter{enumi}{4}
 \setcounter{enumii}{0}
 \setcounter{equation}{22}
 \setcounter{enumi}{4}
 \setcounter{enumii}{0}
@@ -85,7 +85,7 @@
 \setcounter{enumiv}{10}
 \setcounter{footnote}{0}
 \setcounter{mpfootnote}{0}
 \setcounter{enumiv}{10}
 \setcounter{footnote}{0}
 \setcounter{mpfootnote}{0}
-\setcounter{part}{1}
+\setcounter{part}{5}
 \setcounter{chapter}{11}
 \setcounter{section}{5}
 \setcounter{subsection}{0}
 \setcounter{chapter}{11}
 \setcounter{section}{5}
 \setcounter{subsection}{0}
index 750fdc1c9f303e188b86c2c717848e54e4cf3e7f..f830fe0daf9d90de75314f4545079eddada211d2 100644 (file)
@@ -5,86 +5,86 @@
 \@writefile{toc}{\author{Pierre Spit\IeC {\'e}ri}{}}
 \@writefile{toc}{\author{Jacques Bahi}{}}
 \@writefile{loa}{\addvspace {10\p@ }}
 \@writefile{toc}{\author{Pierre Spit\IeC {\'e}ri}{}}
 \@writefile{toc}{\author{Jacques Bahi}{}}
 \@writefile{loa}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {chapter}{\numberline {12}Solving sparse nonlinear systems of obstacle problems on GPU clusters}{271}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {12}Solving sparse nonlinear systems of obstacle problems on GPU clusters}{279}}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
-\newlabel{ch13}{{12}{271}}
-\@writefile{toc}{\contentsline {section}{\numberline {12.1}Introduction}{271}}
-\newlabel{ch13:sec:01}{{12.1}{271}}
-\@writefile{toc}{\contentsline {section}{\numberline {12.2}Obstacle problems}{272}}
-\newlabel{ch13:sec:02}{{12.2}{272}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {12.2.1}Mathematical model}{272}}
-\newlabel{ch13:sec:02.01}{{12.2.1}{272}}
-\newlabel{ch13:eq:01}{{12.1}{272}}
-\newlabel{ch13:eq:02}{{12.2}{272}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {12.2.2}Discretization}{273}}
-\newlabel{ch13:sec:02.02}{{12.2.2}{273}}
-\newlabel{ch13:eq:03}{{12.3}{273}}
-\newlabel{ch13:eq:04}{{12.4}{273}}
-\newlabel{ch13:eq:05}{{12.5}{273}}
-\@writefile{toc}{\contentsline {section}{\numberline {12.3}Parallel iterative method}{274}}
-\newlabel{ch13:sec:03}{{12.3}{274}}
-\newlabel{ch13:eq:06}{{12.6}{274}}
-\newlabel{ch13:eq:07}{{12.7}{274}}
-\newlabel{ch13:eq:08}{{12.8}{274}}
-\newlabel{ch13:eq:09}{{12.9}{274}}
-\newlabel{ch13:eq:10}{{12.10}{275}}
-\newlabel{ch13:eq:11}{{12.11}{275}}
-\newlabel{ch13:eq:12}{{12.12}{275}}
-\newlabel{ch13:eq:13}{{12.13}{276}}
-\newlabel{ch13:eq:14}{{12.14}{276}}
-\newlabel{ch13:eq:15}{{12.15}{276}}
-\newlabel{ch13:eq:16}{{12.16}{276}}
-\@writefile{toc}{\contentsline {section}{\numberline {12.4}Parallel implementation on a GPU cluster}{277}}
-\newlabel{ch13:sec:04}{{12.4}{277}}
-\@writefile{lof}{\contentsline {figure}{\numberline {12.1}{\ignorespaces Data partitioning of a problem to be solved among $S=3\times 4$ computing nodes.\relax }}{277}}
-\newlabel{ch13:fig:01}{{12.1}{277}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {11}{\ignorespaces Parallel solving of the obstacle problem on a GPU cluster\relax }}{278}}
-\newlabel{ch13:alg:01}{{11}{278}}
-\newlabel{ch13:eq:18}{{12.17}{278}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {12}{\ignorespaces Parallel iterative solving of the nonlinear systems on a GPU cluster ($Solve()$ function)\relax }}{279}}
-\newlabel{ch13:alg:02}{{12}{279}}
-\@writefile{lof}{\contentsline {figure}{\numberline {12.2}{\ignorespaces Decomposition of a sub-problem in a GPU into $nz$ slices.\relax }}{280}}
-\newlabel{ch13:fig:02}{{12.2}{280}}
-\newlabel{ch13:list:01}{{12.1}{280}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.1}Skeleton codes of a GPU kernel and a CPU function}{280}}
-\@writefile{lof}{\contentsline {figure}{\numberline {12.3}{\ignorespaces Matrix constant coefficients in a three-dimensional domain.\relax }}{282}}
-\newlabel{ch13:fig:03}{{12.3}{282}}
-\newlabel{ch13:eq:17}{{12.18}{282}}
-\newlabel{ch13:list:02}{{12.2}{282}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.2}GPU kernels of the projected Richardson method}{282}}
-\@writefile{lof}{\contentsline {figure}{\numberline {12.4}{\ignorespaces Computation of a vector element with the projected Richardson method.\relax }}{284}}
-\newlabel{ch13:fig:04}{{12.4}{284}}
-\newlabel{ch13:list:03}{{12.3}{284}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.3}Memory access to the cache texture memory}{284}}
-\@writefile{toc}{\contentsline {section}{\numberline {12.5}Experimental tests on a GPU cluster}{285}}
-\newlabel{ch13:sec:05}{{12.5}{285}}
-\@writefile{lof}{\contentsline {figure}{\numberline {12.5}{\ignorespaces GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs.\relax }}{287}}
-\newlabel{ch13:fig:05}{{12.5}{287}}
-\@writefile{lot}{\contentsline {table}{\numberline {12.1}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 24 CPU cores.\relax }}{287}}
-\newlabel{ch13:tab:01}{{12.1}{287}}
-\@writefile{lot}{\contentsline {table}{\numberline {12.2}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.\relax }}{288}}
-\newlabel{ch13:tab:02}{{12.2}{288}}
-\@writefile{toc}{\contentsline {section}{\numberline {12.6}Red-Black ordering technique}{288}}
-\newlabel{ch13:sec:06}{{12.6}{288}}
-\newlabel{ch13:list:04}{{12.4}{289}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.4}GPU kernels of the projected Richardson method using the red-black technique}{289}}
-\newlabel{ch13:fig:06.01}{{12.6(a)}{290}}
-\newlabel{sub@ch13:fig:06.01}{{(a)}{290}}
-\newlabel{ch13:fig:06.02}{{12.6(b)}{290}}
-\newlabel{sub@ch13:fig:06.02}{{(b)}{290}}
-\@writefile{lof}{\contentsline {figure}{\numberline {12.6}{\ignorespaces Red-Black ordering for computing the iterate vector elements in a three-dimensional space.\relax }}{290}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Red-Black ordering on x, y and z axises}}}{290}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Red-Black ordering on y axis}}}{290}}
-\@writefile{lot}{\contentsline {table}{\numberline {12.3}{\ignorespaces Execution times in seconds of the parallel projected Richardson method using read-black ordering technique implemented on a cluster of 12 GPUs.\relax }}{291}}
-\newlabel{ch13:tab:03}{{12.3}{291}}
-\@writefile{lof}{\contentsline {figure}{\numberline {12.7}{\ignorespaces Weak scaling of both synchronous and asynchronous algorithms of the projected Richardson method using red-black ordering technique.\relax }}{292}}
-\newlabel{ch13:fig:07}{{12.7}{292}}
-\@writefile{toc}{\contentsline {section}{\numberline {12.7}Conclusion}{292}}
-\newlabel{ch13:sec:07}{{12.7}{292}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{293}}
+\newlabel{ch13}{{12}{279}}
+\@writefile{toc}{\contentsline {section}{\numberline {12.1}Introduction}{279}}
+\newlabel{ch13:sec:01}{{12.1}{279}}
+\@writefile{toc}{\contentsline {section}{\numberline {12.2}Obstacle problems}{280}}
+\newlabel{ch13:sec:02}{{12.2}{280}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {12.2.1}Mathematical model}{280}}
+\newlabel{ch13:sec:02.01}{{12.2.1}{280}}
+\newlabel{ch13:eq:01}{{12.1}{280}}
+\newlabel{ch13:eq:02}{{12.2}{280}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {12.2.2}Discretization}{281}}
+\newlabel{ch13:sec:02.02}{{12.2.2}{281}}
+\newlabel{ch13:eq:03}{{12.3}{281}}
+\newlabel{ch13:eq:04}{{12.4}{281}}
+\newlabel{ch13:eq:05}{{12.5}{281}}
+\@writefile{toc}{\contentsline {section}{\numberline {12.3}Parallel iterative method}{282}}
+\newlabel{ch13:sec:03}{{12.3}{282}}
+\newlabel{ch13:eq:06}{{12.6}{282}}
+\newlabel{ch13:eq:07}{{12.7}{282}}
+\newlabel{ch13:eq:08}{{12.8}{282}}
+\newlabel{ch13:eq:09}{{12.9}{282}}
+\newlabel{ch13:eq:10}{{12.10}{283}}
+\newlabel{ch13:eq:11}{{12.11}{283}}
+\newlabel{ch13:eq:12}{{12.12}{283}}
+\newlabel{ch13:eq:13}{{12.13}{284}}
+\newlabel{ch13:eq:14}{{12.14}{284}}
+\newlabel{ch13:eq:15}{{12.15}{284}}
+\newlabel{ch13:eq:16}{{12.16}{284}}
+\@writefile{toc}{\contentsline {section}{\numberline {12.4}Parallel implementation on a GPU cluster}{285}}
+\newlabel{ch13:sec:04}{{12.4}{285}}
+\@writefile{lof}{\contentsline {figure}{\numberline {12.1}{\ignorespaces Data partitioning of a problem to be solved among $S=3\times 4$ computing nodes.\relax }}{285}}
+\newlabel{ch13:fig:01}{{12.1}{285}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {11}{\ignorespaces Parallel solving of the obstacle problem on a GPU cluster\relax }}{286}}
+\newlabel{ch13:alg:01}{{11}{286}}
+\newlabel{ch13:eq:18}{{12.17}{286}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {12}{\ignorespaces Parallel iterative solving of the nonlinear systems on a GPU cluster ($Solve()$ function)\relax }}{287}}
+\newlabel{ch13:alg:02}{{12}{287}}
+\@writefile{lof}{\contentsline {figure}{\numberline {12.2}{\ignorespaces Decomposition of a sub-problem in a GPU into $nz$ slices.\relax }}{288}}
+\newlabel{ch13:fig:02}{{12.2}{288}}
+\newlabel{ch13:list:01}{{12.1}{288}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.1}Skeleton codes of a GPU kernel and a CPU function}{288}}
+\@writefile{lof}{\contentsline {figure}{\numberline {12.3}{\ignorespaces Matrix constant coefficients in a three-dimensional domain.\relax }}{290}}
+\newlabel{ch13:fig:03}{{12.3}{290}}
+\newlabel{ch13:eq:17}{{12.18}{290}}
+\newlabel{ch13:list:02}{{12.2}{290}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.2}GPU kernels of the projected Richardson method}{290}}
+\@writefile{lof}{\contentsline {figure}{\numberline {12.4}{\ignorespaces Computation of a vector element with the projected Richardson method.\relax }}{292}}
+\newlabel{ch13:fig:04}{{12.4}{292}}
+\newlabel{ch13:list:03}{{12.3}{292}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.3}Memory access to the cache texture memory}{292}}
+\@writefile{toc}{\contentsline {section}{\numberline {12.5}Experimental tests on a GPU cluster}{293}}
+\newlabel{ch13:sec:05}{{12.5}{293}}
+\@writefile{lof}{\contentsline {figure}{\numberline {12.5}{\ignorespaces GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs.\relax }}{295}}
+\newlabel{ch13:fig:05}{{12.5}{295}}
+\@writefile{lot}{\contentsline {table}{\numberline {12.1}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 24 CPU cores.\relax }}{295}}
+\newlabel{ch13:tab:01}{{12.1}{295}}
+\@writefile{lot}{\contentsline {table}{\numberline {12.2}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.\relax }}{296}}
+\newlabel{ch13:tab:02}{{12.2}{296}}
+\@writefile{toc}{\contentsline {section}{\numberline {12.6}Red-Black ordering technique}{296}}
+\newlabel{ch13:sec:06}{{12.6}{296}}
+\newlabel{ch13:list:04}{{12.4}{297}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.4}GPU kernels of the projected Richardson method using the red-black technique}{297}}
+\newlabel{ch13:fig:06.01}{{12.6(a)}{298}}
+\newlabel{sub@ch13:fig:06.01}{{(a)}{298}}
+\newlabel{ch13:fig:06.02}{{12.6(b)}{298}}
+\newlabel{sub@ch13:fig:06.02}{{(b)}{298}}
+\@writefile{lof}{\contentsline {figure}{\numberline {12.6}{\ignorespaces Red-Black ordering for computing the iterate vector elements in a three-dimensional space.\relax }}{298}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Red-Black ordering on x, y and z axises}}}{298}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Red-Black ordering on y axis}}}{298}}
+\@writefile{lot}{\contentsline {table}{\numberline {12.3}{\ignorespaces Execution times in seconds of the parallel projected Richardson method using read-black ordering technique implemented on a cluster of 12 GPUs.\relax }}{299}}
+\newlabel{ch13:tab:03}{{12.3}{299}}
+\@writefile{lof}{\contentsline {figure}{\numberline {12.7}{\ignorespaces Weak scaling of both synchronous and asynchronous algorithms of the projected Richardson method using red-black ordering technique.\relax }}{300}}
+\newlabel{ch13:fig:07}{{12.7}{300}}
+\@writefile{toc}{\contentsline {section}{\numberline {12.7}Conclusion}{300}}
+\newlabel{ch13:sec:07}{{12.7}{300}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{301}}
 \@setckpt{Chapters/chapter13/ch13}{
 \@setckpt{Chapters/chapter13/ch13}{
-\setcounter{page}{295}
+\setcounter{page}{303}
 \setcounter{equation}{18}
 \setcounter{enumi}{4}
 \setcounter{enumii}{0}
 \setcounter{equation}{18}
 \setcounter{enumi}{4}
 \setcounter{enumii}{0}
@@ -92,7 +92,7 @@
 \setcounter{enumiv}{15}
 \setcounter{footnote}{0}
 \setcounter{mpfootnote}{0}
 \setcounter{enumiv}{15}
 \setcounter{footnote}{0}
 \setcounter{mpfootnote}{0}
-\setcounter{part}{1}
+\setcounter{part}{5}
 \setcounter{chapter}{12}
 \setcounter{section}{7}
 \setcounter{subsection}{0}
 \setcounter{chapter}{12}
 \setcounter{section}{7}
 \setcounter{subsection}{0}
index 19fea95ef2498f31557e04c6e6c92baf75b7873e..75c0afb4c5612b0bd5ea7e973039adad5b949951 100644 (file)
@@ -4,72 +4,72 @@
 \@writefile{toc}{\author{H. Wang}{}}
 \@writefile{toc}{\author{H. Yu}{}}
 \@writefile{loa}{\addvspace {10\p@ }}
 \@writefile{toc}{\author{H. Wang}{}}
 \@writefile{toc}{\author{H. Yu}{}}
 \@writefile{loa}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {chapter}{\numberline {15}GPU-Accelerated Envelope-Following Method}{335}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {15}GPU-Accelerated Envelope-Following Method}{343}}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {section}{\numberline {15.1}Introduction}{335}}
-\newlabel{fig:ef1}{{15.1(a)}{337}}
-\newlabel{sub@fig:ef1}{{(a)}{337}}
-\newlabel{fig:ef2}{{15.1(b)}{337}}
-\newlabel{sub@fig:ef2}{{(b)}{337}}
-\@writefile{lof}{\contentsline {figure}{\numberline {15.1}{\ignorespaces Transient envelope-following analysis. (Both two figures reflect backward-Euler style envelope-following.)\relax }}{337}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Illustration of one envelope skip.}}}{337}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {The envelope changes in a slow time scale.}}}{337}}
-\newlabel{fig:ef_intro}{{15.1}{337}}
-\@writefile{toc}{\contentsline {section}{\numberline {15.2}The envelope-following method in a nutshell}{338}}
-\newlabel{sec:ef}{{15.2}{338}}
-\newlabel{eq:dae}{{15.1}{338}}
-\newlabel{eq:Newton}{{15.2}{339}}
-\newlabel{eq:A}{{15.3}{339}}
-\@writefile{toc}{\contentsline {section}{\numberline {15.3}New parallel envelope-following method}{340}}
-\newlabel{sec:gmres}{{15.3}{340}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.1}GMRES solver for Newton update equation}{340}}
-\@writefile{lof}{\contentsline {figure}{\numberline {15.2}{\ignorespaces The flow of envelope-following method.\relax }}{341}}
-\newlabel{fig:ef_flow}{{15.2}{341}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {14}{\ignorespaces Standard GMRES algorithm.\relax }}{342}}
-\newlabel{alg:GMRES}{{14}{342}}
-\newlabel{line:mvp}{{5}{342}}
-\newlabel{line:newnorm}{{11}{342}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.2}Parallelization on GPU platforms}{342}}
-\newlabel{sec:gpu}{{15.3.2}{342}}
-\@writefile{lof}{\contentsline {figure}{\numberline {15.3}{\ignorespaces GPU parallel solver for envelope-following update.\relax }}{343}}
-\newlabel{fig:gmres}{{15.3}{343}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.3}Gear-2 based sensitivity calculation}{344}}
-\newlabel{sec:gear}{{15.3.3}{344}}
-\newlabel{eq:BE}{{15.4}{344}}
-\newlabel{eq:sens1}{{15.5}{344}}
-\newlabel{eq:Gear_t2}{{15.6}{345}}
-\newlabel{eq:sens2}{{15.7}{345}}
-\newlabel{eq:Gear_t3}{{15.8}{345}}
-\newlabel{eq:sensM}{{15.9}{345}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {15}{\ignorespaces The matrix-free method for Krylov subspace construction.\relax }}{346}}
-\newlabel{alg:mf_Gear}{{15}{346}}
-\newlabel{line:mf_Gear_loop}{{4}{346}}
-\newlabel{line:shift}{{8}{346}}
-\@writefile{toc}{\contentsline {section}{\numberline {15.4}Numerical examples}{346}}
-\newlabel{sec:exp}{{15.4}{346}}
-\@writefile{lof}{\contentsline {figure}{\numberline {15.4}{\ignorespaces Diagram of a zero-voltage quasi-resonant flyback converter.\relax }}{347}}
-\newlabel{fig:flyback}{{15.4}{347}}
-\@writefile{lof}{\contentsline {figure}{\numberline {15.5}{\ignorespaces Illustration of power/ground network model.\relax }}{347}}
-\newlabel{fig:pg}{{15.5}{347}}
-\newlabel{fig:flybackWhole}{{15.6(a)}{348}}
-\newlabel{sub@fig:flybackWhole}{{(a)}{348}}
-\newlabel{fig:flybackZoom}{{15.6(b)}{348}}
-\newlabel{sub@fig:flybackZoom}{{(b)}{348}}
-\@writefile{lof}{\contentsline {figure}{\numberline {15.6}{\ignorespaces Flyback converter solution calculated by envelope-following. The red curve is traditional SPICE simulation result, and the back curve is the envelope-following output with simulation points marked.\relax }}{348}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {The whole plot}}}{348}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Detail of one EF simulation period}}}{348}}
-\newlabel{fig:flyback_wave}{{15.6}{348}}
-\@writefile{lof}{\contentsline {figure}{\numberline {15.7}{\ignorespaces Buck converter solution calculated by envelope-following.\relax }}{349}}
-\newlabel{fig:buck_wave}{{15.7}{349}}
-\@writefile{lot}{\contentsline {table}{\numberline {15.1}{\ignorespaces CPU and GPU time comparisons (in seconds) for solving Newton update equation with the proposed Gear-2 sensitivity. \relax }}{349}}
-\newlabel{table:circuit}{{15.1}{349}}
-\@writefile{toc}{\contentsline {section}{\numberline {15.5}Summary}{350}}
-\newlabel{sec:summary}{{15.5}{350}}
-\@writefile{toc}{\contentsline {section}{\numberline {15.6}Glossary}{350}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{350}}
+\@writefile{toc}{\contentsline {section}{\numberline {15.1}Introduction}{343}}
+\newlabel{fig:ef1}{{15.1(a)}{345}}
+\newlabel{sub@fig:ef1}{{(a)}{345}}
+\newlabel{fig:ef2}{{15.1(b)}{345}}
+\newlabel{sub@fig:ef2}{{(b)}{345}}
+\@writefile{lof}{\contentsline {figure}{\numberline {15.1}{\ignorespaces Transient envelope-following analysis. (Both two figures reflect backward-Euler style envelope-following.)\relax }}{345}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Illustration of one envelope skip.}}}{345}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {The envelope changes in a slow time scale.}}}{345}}
+\newlabel{fig:ef_intro}{{15.1}{345}}
+\@writefile{toc}{\contentsline {section}{\numberline {15.2}The envelope-following method in a nutshell}{346}}
+\newlabel{sec:ef}{{15.2}{346}}
+\newlabel{eq:dae}{{15.1}{346}}
+\newlabel{eq:Newton}{{15.2}{347}}
+\newlabel{eq:A}{{15.3}{347}}
+\@writefile{toc}{\contentsline {section}{\numberline {15.3}New parallel envelope-following method}{348}}
+\newlabel{sec:gmres}{{15.3}{348}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.1}GMRES solver for Newton update equation}{348}}
+\@writefile{lof}{\contentsline {figure}{\numberline {15.2}{\ignorespaces The flow of envelope-following method.\relax }}{349}}
+\newlabel{fig:ef_flow}{{15.2}{349}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {14}{\ignorespaces Standard GMRES algorithm.\relax }}{350}}
+\newlabel{alg:GMRES}{{14}{350}}
+\newlabel{line:mvp}{{5}{350}}
+\newlabel{line:newnorm}{{11}{350}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.2}Parallelization on GPU platforms}{350}}
+\newlabel{sec:gpu}{{15.3.2}{350}}
+\@writefile{lof}{\contentsline {figure}{\numberline {15.3}{\ignorespaces GPU parallel solver for envelope-following update.\relax }}{351}}
+\newlabel{fig:gmres}{{15.3}{351}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.3}Gear-2 based sensitivity calculation}{352}}
+\newlabel{sec:gear}{{15.3.3}{352}}
+\newlabel{eq:BE}{{15.4}{352}}
+\newlabel{eq:sens1}{{15.5}{352}}
+\newlabel{eq:Gear_t2}{{15.6}{353}}
+\newlabel{eq:sens2}{{15.7}{353}}
+\newlabel{eq:Gear_t3}{{15.8}{353}}
+\newlabel{eq:sensM}{{15.9}{353}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {15}{\ignorespaces The matrix-free method for Krylov subspace construction.\relax }}{354}}
+\newlabel{alg:mf_Gear}{{15}{354}}
+\newlabel{line:mf_Gear_loop}{{4}{354}}
+\newlabel{line:shift}{{8}{354}}
+\@writefile{toc}{\contentsline {section}{\numberline {15.4}Numerical examples}{354}}
+\newlabel{sec:exp}{{15.4}{354}}
+\@writefile{lof}{\contentsline {figure}{\numberline {15.4}{\ignorespaces Diagram of a zero-voltage quasi-resonant flyback converter.\relax }}{355}}
+\newlabel{fig:flyback}{{15.4}{355}}
+\@writefile{lof}{\contentsline {figure}{\numberline {15.5}{\ignorespaces Illustration of power/ground network model.\relax }}{355}}
+\newlabel{fig:pg}{{15.5}{355}}
+\newlabel{fig:flybackWhole}{{15.6(a)}{356}}
+\newlabel{sub@fig:flybackWhole}{{(a)}{356}}
+\newlabel{fig:flybackZoom}{{15.6(b)}{356}}
+\newlabel{sub@fig:flybackZoom}{{(b)}{356}}
+\@writefile{lof}{\contentsline {figure}{\numberline {15.6}{\ignorespaces Flyback converter solution calculated by envelope-following. The red curve is traditional SPICE simulation result, and the back curve is the envelope-following output with simulation points marked.\relax }}{356}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {The whole plot}}}{356}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Detail of one EF simulation period}}}{356}}
+\newlabel{fig:flyback_wave}{{15.6}{356}}
+\@writefile{lof}{\contentsline {figure}{\numberline {15.7}{\ignorespaces Buck converter solution calculated by envelope-following.\relax }}{357}}
+\newlabel{fig:buck_wave}{{15.7}{357}}
+\@writefile{lot}{\contentsline {table}{\numberline {15.1}{\ignorespaces CPU and GPU time comparisons (in seconds) for solving Newton update equation with the proposed Gear-2 sensitivity. \relax }}{357}}
+\newlabel{table:circuit}{{15.1}{357}}
+\@writefile{toc}{\contentsline {section}{\numberline {15.5}Summary}{358}}
+\newlabel{sec:summary}{{15.5}{358}}
+\@writefile{toc}{\contentsline {section}{\numberline {15.6}Glossary}{358}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{358}}
 \@setckpt{Chapters/chapter16/ch16}{
 \@setckpt{Chapters/chapter16/ch16}{
-\setcounter{page}{352}
+\setcounter{page}{360}
 \setcounter{equation}{9}
 \setcounter{enumi}{2}
 \setcounter{enumii}{0}
 \setcounter{equation}{9}
 \setcounter{enumi}{2}
 \setcounter{enumii}{0}
@@ -77,7 +77,7 @@
 \setcounter{enumiv}{22}
 \setcounter{footnote}{0}
 \setcounter{mpfootnote}{0}
 \setcounter{enumiv}{22}
 \setcounter{footnote}{0}
 \setcounter{mpfootnote}{0}
-\setcounter{part}{1}
+\setcounter{part}{5}
 \setcounter{chapter}{15}
 \setcounter{section}{6}
 \setcounter{subsection}{0}
 \setcounter{chapter}{15}
 \setcounter{section}{6}
 \setcounter{subsection}{0}
index e01c7b1472307619c994be0dda704936d5f304b3..618a140e26c2df0ad4ea45c05cefd783183a0249 100644 (file)
@@ -2,44 +2,45 @@
 \@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}}
 \@writefile{toc}{\author{Christophe Guyeux}{}}
 \@writefile{loa}{\addvspace {10\p@ }}
 \@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}}
 \@writefile{toc}{\author{Christophe Guyeux}{}}
 \@writefile{loa}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {chapter}{\numberline {16}Pseudorandom Number Generator on GPU}{353}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {16}Pseudorandom Number Generator on GPU}{363}}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
-\newlabel{chapter18}{{16}{353}}
-\@writefile{toc}{\contentsline {section}{\numberline {16.1}Introduction}{353}}
-\@writefile{toc}{\contentsline {section}{\numberline {16.2}Basic Remindees}{355}}
-\newlabel{section:BASIC RECALLS}{{16.2}{355}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.1}A Short Presentation of Chaos}{355}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.2}On Devaney's Definition of Chaos}{355}}
-\newlabel{sec:dev}{{16.2.2}{355}}
-\newlabel{Devaney}{{16.1}{355}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.3}Chaotic iterations}{356}}
-\newlabel{subsection:Chaotic iterations}{{16.2.3}{356}}
-\newlabel{Chaotic iterations}{{2}{356}}
-\newlabel{eq:generalIC}{{16.4}{357}}
-\newlabel{equation Oplus}{{16.5}{357}}
-\@writefile{toc}{\contentsline {section}{\numberline {16.3}Toward Efficiency and Improvement for CI PRNG}{357}}
-\newlabel{sec:efficient PRNG}{{16.3}{357}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{357}}
-\newlabel{algo:seqCIPRNG}{{16.1}{357}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {16.1}C code of the sequential PRNG based on chaotic iterations}{357}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{358}}
-\newlabel{sec:efficient PRNG gpu}{{16.3.2}{358}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.3}Naive Version for GPU}{358}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {16}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{359}}
-\newlabel{algo:gpu_kernel}{{16}{359}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.4}Improved Version for GPU}{359}}
-\newlabel{IR}{{17}{360}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {17}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{360}}
-\newlabel{algo:gpu_kernel2}{{17}{360}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.5}Chaos Evaluation of the Improved Version}{360}}
-\@writefile{toc}{\contentsline {section}{\numberline {16.4}Experiments}{361}}
-\newlabel{sec:experiments}{{16.4}{361}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{361}}
-\@writefile{lof}{\contentsline {figure}{\numberline {16.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{362}}
-\newlabel{fig:time_xorlike_gpu}{{16.1}{362}}
+\newlabel{chapter18}{{16}{363}}
+\@writefile{toc}{\contentsline {section}{\numberline {16.1}Introduction}{363}}
+\@writefile{toc}{\contentsline {section}{\numberline {16.2}Basic Remindees}{365}}
+\newlabel{section:BASIC RECALLS}{{16.2}{365}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.1}A Short Presentation of Chaos}{365}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.2}On Devaney's Definition of Chaos}{365}}
+\newlabel{sec:dev}{{16.2.2}{365}}
+\newlabel{Devaney}{{16.1}{365}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.3}Chaotic iterations}{366}}
+\newlabel{subsection:Chaotic iterations}{{16.2.3}{366}}
+\newlabel{Chaotic iterations}{{2}{366}}
+\newlabel{eq:generalIC}{{16.4}{367}}
+\newlabel{equation Oplus}{{16.5}{367}}
+\@writefile{toc}{\contentsline {section}{\numberline {16.3}Toward Efficiency and Improvement for CI PRNG}{367}}
+\newlabel{sec:efficient PRNG}{{16.3}{367}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{367}}
+\newlabel{algo:seqCIPRNG}{{16.1}{367}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {16.1}C code of the sequential PRNG based on chaotic iterations}{367}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{368}}
+\newlabel{sec:efficient PRNG gpu}{{16.3.2}{368}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.3}Naive Version for GPU}{368}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {16}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{369}}
+\newlabel{algo:gpu_kernel}{{16}{369}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.4}Improved Version for GPU}{369}}
+\newlabel{IR}{{17}{370}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {17}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{370}}
+\newlabel{algo:gpu_kernel2}{{17}{370}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.5}Chaos Evaluation of the Improved Version}{370}}
+\@writefile{toc}{\contentsline {section}{\numberline {16.4}Experiments}{371}}
+\newlabel{sec:experiments}{{16.4}{371}}
+\@writefile{toc}{\contentsline {section}{\numberline {16.5}Summary}{371}}
+\@writefile{lof}{\contentsline {figure}{\numberline {16.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{372}}
+\newlabel{fig:time_xorlike_gpu}{{16.1}{372}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{373}}
 \@setckpt{Chapters/chapter18/ch18}{
 \@setckpt{Chapters/chapter18/ch18}{
-\setcounter{page}{364}
+\setcounter{page}{375}
 \setcounter{equation}{5}
 \setcounter{enumi}{2}
 \setcounter{enumii}{0}
 \setcounter{equation}{5}
 \setcounter{enumi}{2}
 \setcounter{enumii}{0}
@@ -47,9 +48,9 @@
 \setcounter{enumiv}{17}
 \setcounter{footnote}{2}
 \setcounter{mpfootnote}{0}
 \setcounter{enumiv}{17}
 \setcounter{footnote}{2}
 \setcounter{mpfootnote}{0}
-\setcounter{part}{1}
+\setcounter{part}{6}
 \setcounter{chapter}{16}
 \setcounter{chapter}{16}
-\setcounter{section}{4}
+\setcounter{section}{5}
 \setcounter{subsection}{0}
 \setcounter{subsubsection}{0}
 \setcounter{paragraph}{0}
 \setcounter{subsection}{0}
 \setcounter{subsubsection}{0}
 \setcounter{paragraph}{0}
index 91483edb7edc673fa600b4e6a310c9a4febf6a1c..f86e67e12564ec21a5c44ffeabf4ad7bcd0fd187 100644 (file)
 \relax 
 \@writefile{toc}{\author{Gilles Perrot}{}}
 \@writefile{loa}{\addvspace {10\p@ }}
 \relax 
 \@writefile{toc}{\author{Gilles Perrot}{}}
 \@writefile{loa}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {chapter}{\numberline {3}Setting up the environnement.}{23}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {3}Setting up the environnement.}{25}}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
-\newlabel{algo:memcopy:H2D}{{7}{23}}
-\newlabel{algo:memcopy:kernel}{{8}{23}}
-\newlabel{algo:memcopy:D2H}{{9}{23}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{23}}
-\newlabel{algo:memcopy}{{1}{23}}
-\@writefile{toc}{\contentsline {section}{\numberline {3.1}Data transfers, memory management.}{24}}
-\newlabel{lst:main1}{{3.1}{25}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}Generic main.cu file used to launch CUDA kernels}{25}}
-\newlabel{lst:fkern1}{{3.2}{25}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}fast\_kernels.cu file featuring one kernel skeleton}{25}}
-\newlabel{lst:mkfile}{{3.3}{26}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}Generic Makefile based on those provided by NV SDK}{26}}
-\@writefile{toc}{\contentsline {section}{\numberline {3.2}Performance measurements}{26}}
-\newlabel{lst:chronos}{{3.4}{26}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Time measurement technique using cutil functions}{26}}
+\newlabel{algo:memcopy:H2D}{{7}{25}}
+\newlabel{algo:memcopy:kernel}{{8}{25}}
+\newlabel{algo:memcopy:D2H}{{9}{25}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{25}}
+\newlabel{algo:memcopy}{{1}{25}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.1}Data transfers, memory management.}{26}}
+\newlabel{lst:main1}{{3.1}{27}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}Generic main.cu file used to launch CUDA kernels}{27}}
+\newlabel{lst:fkern1}{{3.2}{27}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}fast\_kernels.cu file featuring one kernel skeleton}{27}}
+\newlabel{lst:mkfile}{{3.3}{28}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}Generic Makefile based on those provided by NV SDK}{28}}
+\@writefile{toc}{\contentsline {section}{\numberline {3.2}Performance measurements}{28}}
+\newlabel{lst:chronos}{{3.4}{28}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Time measurement technique using cutil functions}{28}}
 \@writefile{loa}{\addvspace {10\p@ }}
 \@writefile{loa}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {chapter}{\numberline {4}Implementing a fast median filter}{29}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {4}Implementing a fast median filter}{31}}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
 \@writefile{toc}{\author{Gilles Perrot}{}}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
 \@writefile{toc}{\author{Gilles Perrot}{}}
-\@writefile{toc}{\contentsline {section}{\numberline {4.1}Introduction}{29}}
-\@writefile{toc}{\contentsline {section}{\numberline {4.2}Median filtering}{30}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.1}Basic principles}{30}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.2}A naive implementation}{30}}
-\newlabel{img:sap_example_ref}{{4.1(a)}{31}}
-\newlabel{sub@img:sap_example_ref}{{(a)}{31}}
-\newlabel{img:sap_example_med3}{{4.1(b)}{31}}
-\newlabel{sub@img:sap_example_med3}{{(b)}{31}}
-\newlabel{img:sap_example_med5}{{4.1(c)}{31}}
-\newlabel{sub@img:sap_example_med5}{{(c)}{31}}
-\newlabel{img:sap_example_med3_it2}{{4.1(d)}{31}}
-\newlabel{sub@img:sap_example_med3_it2}{{(d)}{31}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Exemple of median filtering, applied to salt \& pepper noise reduction.\relax }}{31}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted by salt and pepper noise of density 0.25}}}{31}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ median filter}}}{31}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ median filter}}}{31}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image denoised by 2 iterations of a $3\times 3$ median filter}}}{31}}
-\newlabel{fig:sap_examples}{{4.1}{31}}
-\newlabel{lst:medianGeneric}{{4.1}{32}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.1}Generic CUDA kernel achieving median filtering}{32}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Exemple of 5x5 median filtering\relax }}{33}}
-\newlabel{fig:median_1}{{4.2}{33}}
-\newlabel{algoMedianGeneric}{{2}{33}}
-\newlabel{algoMedianGeneric:memcpyH2D}{{1}{33}}
-\newlabel{algoMedianGeneric:cptstart}{{3}{33}}
-\newlabel{algoMedianGeneric:cptend}{{5}{33}}
-\newlabel{algoMedianGeneric:memcpyD2H}{{7}{33}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{33}}
-\@writefile{toc}{\contentsline {section}{\numberline {4.3}NVidia GPU tuning recipes}{33}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{34}}
-\newlabel{fig:median_overlap}{{4.3}{34}}
-\@writefile{lot}{\contentsline {table}{\numberline {4.1}{\ignorespaces Performance results of \texttt  {kernel medianR}. \relax }}{34}}
-\newlabel{tab:medianHisto1}{{4.1}{34}}
-\@writefile{toc}{\contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers }{35}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.1}The simplest way}{35}}
-\newlabel{lst:kernelMedian3RegTri9}{{4.2}{36}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}3$\times $3 median filter kernel using one register per neighborhood pixel and bubble sort}{36}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Further optimization}{36}}
-\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{36}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs on GPU C2070 and CPU for generic median, 3$\times $3 median register-only and \textit  {libJacket}.\relax }}{37}}
-\newlabel{fig:compMedians1}{{4.4}{37}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{37}}
-\newlabel{fig:forgetful_selection}{{4.5}{37}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{38}}
-\newlabel{fig:forgetful3}{{4.6}{38}}
-\newlabel{lst:medianForget1pix3}{{4.3}{39}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{39}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{40}}
-\newlabel{fig:median3_overlap}{{4.7}{40}}
-\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{40}}
-\newlabel{lst:medianForget2pix3}{{4.4}{40}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{40}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{41}}
-\newlabel{fig:compMedians2}{{4.8}{41}}
-\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{42}}
-\newlabel{sec:median5}{{4.5.1}{42}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{42}}
-\newlabel{lst:medianForget2pix5}{{4.5}{42}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{42}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.\relax }}{43}}
-\newlabel{fig:median5overlap}{{4.9}{43}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.\relax }}{43}}
-\newlabel{fig:median5overlap}{{4.10}{43}}
-\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{44}}
-\newlabel{tab:median5comp}{{4.2}{44}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{45}}
-\newlabel{lst:medianSeparable}{{4.6}{45}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{45}}
-\newlabel{img:sap_example_ref}{{4.11(a)}{46}}
-\newlabel{sub@img:sap_example_ref}{{(a)}{46}}
-\newlabel{img:sap_example_sep_med3}{{4.11(b)}{46}}
-\newlabel{sub@img:sap_example_sep_med3}{{(b)}{46}}
-\newlabel{img:sap_example_sep_med5}{{4.11(c)}{46}}
-\newlabel{sub@img:sap_example_sep_med5}{{(c)}{46}}
-\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{46}}
-\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{46}}
-\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{46}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{46}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{46}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{46}}
-\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{46}}
-\newlabel{fig:sap_examples2}{{4.11}{46}}
-\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{47}}
-\newlabel{tab:medianSeparable}{{4.3}{47}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{48}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.1}Introduction}{31}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.2}Median filtering}{32}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.1}Basic principles}{32}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.2}A naive implementation}{32}}
+\newlabel{img:sap_example_ref}{{4.1(a)}{33}}
+\newlabel{sub@img:sap_example_ref}{{(a)}{33}}
+\newlabel{img:sap_example_med3}{{4.1(b)}{33}}
+\newlabel{sub@img:sap_example_med3}{{(b)}{33}}
+\newlabel{img:sap_example_med5}{{4.1(c)}{33}}
+\newlabel{sub@img:sap_example_med5}{{(c)}{33}}
+\newlabel{img:sap_example_med3_it2}{{4.1(d)}{33}}
+\newlabel{sub@img:sap_example_med3_it2}{{(d)}{33}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Exemple of median filtering, applied to salt \& pepper noise reduction.\relax }}{33}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted by salt and pepper noise of density 0.25}}}{33}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ median filter}}}{33}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ median filter}}}{33}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image denoised by 2 iterations of a $3\times 3$ median filter}}}{33}}
+\newlabel{fig:sap_examples}{{4.1}{33}}
+\newlabel{lst:medianGeneric}{{4.1}{34}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.1}Generic CUDA kernel achieving median filtering}{34}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Exemple of 5x5 median filtering\relax }}{35}}
+\newlabel{fig:median_1}{{4.2}{35}}
+\newlabel{algoMedianGeneric}{{2}{35}}
+\newlabel{algoMedianGeneric:memcpyH2D}{{1}{35}}
+\newlabel{algoMedianGeneric:cptstart}{{3}{35}}
+\newlabel{algoMedianGeneric:cptend}{{5}{35}}
+\newlabel{algoMedianGeneric:memcpyD2H}{{7}{35}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{35}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.3}NVidia GPU tuning recipes}{35}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{36}}
+\newlabel{fig:median_overlap}{{4.3}{36}}
+\@writefile{lot}{\contentsline {table}{\numberline {4.1}{\ignorespaces Performance results of \texttt  {kernel medianR}. \relax }}{36}}
+\newlabel{tab:medianHisto1}{{4.1}{36}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers }{37}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.1}The simplest way}{37}}
+\newlabel{lst:kernelMedian3RegTri9}{{4.2}{38}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}3$\times $3 median filter kernel using one register per neighborhood pixel and bubble sort}{38}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Further optimization}{38}}
+\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{38}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs on GPU C2070 and CPU for generic median, 3$\times $3 median register-only and \textit  {libJacket}.\relax }}{39}}
+\newlabel{fig:compMedians1}{{4.4}{39}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{39}}
+\newlabel{fig:forgetful_selection}{{4.5}{39}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{40}}
+\newlabel{fig:forgetful3}{{4.6}{40}}
+\newlabel{lst:medianForget1pix3}{{4.3}{41}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{41}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{42}}
+\newlabel{fig:median3_overlap}{{4.7}{42}}
+\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{42}}
+\newlabel{lst:medianForget2pix3}{{4.4}{42}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{42}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{43}}
+\newlabel{fig:compMedians2}{{4.8}{43}}
+\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{44}}
+\newlabel{sec:median5}{{4.5.1}{44}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{44}}
+\newlabel{lst:medianForget2pix5}{{4.5}{44}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{44}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.\relax }}{45}}
+\newlabel{fig:median5overlap}{{4.9}{45}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.\relax }}{45}}
+\newlabel{fig:median5overlap}{{4.10}{45}}
+\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{46}}
+\newlabel{tab:median5comp}{{4.2}{46}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{47}}
+\newlabel{lst:medianSeparable}{{4.6}{47}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{47}}
+\newlabel{img:sap_example_ref}{{4.11(a)}{48}}
+\newlabel{sub@img:sap_example_ref}{{(a)}{48}}
+\newlabel{img:sap_example_sep_med3}{{4.11(b)}{48}}
+\newlabel{sub@img:sap_example_sep_med3}{{(b)}{48}}
+\newlabel{img:sap_example_sep_med5}{{4.11(c)}{48}}
+\newlabel{sub@img:sap_example_sep_med5}{{(c)}{48}}
+\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{48}}
+\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{48}}
+\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{48}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{48}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{48}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{48}}
+\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{48}}
+\newlabel{fig:sap_examples2}{{4.11}{48}}
+\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{49}}
+\newlabel{tab:medianSeparable}{{4.3}{49}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{50}}
 \@setckpt{Chapters/chapter3/ch3}{
 \@setckpt{Chapters/chapter3/ch3}{
-\setcounter{page}{50}
+\setcounter{page}{52}
 \setcounter{equation}{0}
 \setcounter{enumi}{3}
 \setcounter{enumii}{0}
 \setcounter{equation}{0}
 \setcounter{enumi}{3}
 \setcounter{enumii}{0}
 \setcounter{enumiv}{10}
 \setcounter{footnote}{0}
 \setcounter{mpfootnote}{0}
 \setcounter{enumiv}{10}
 \setcounter{footnote}{0}
 \setcounter{mpfootnote}{0}
-\setcounter{part}{1}
+\setcounter{part}{2}
 \setcounter{chapter}{4}
 \setcounter{section}{5}
 \setcounter{subsection}{2}
 \setcounter{chapter}{4}
 \setcounter{section}{5}
 \setcounter{subsection}{2}
index af524e9c27187bce5c11e149f365d2f1b8519497..1527dfc3ae2e6e8045898f091dba911e2bdea2ab 100644 (file)
 \@writefile{toc}{\author{Stephane Vialle}{}}
 \@writefile{toc}{\author{Jens Gustedt}{}}
 \@writefile{loa}{\addvspace {10\p@ }}
 \@writefile{toc}{\author{Stephane Vialle}{}}
 \@writefile{toc}{\author{Jens Gustedt}{}}
 \@writefile{loa}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {chapter}{\numberline {6}Development methodologies for GPU and cluster of GPUs}{83}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {6}Development methodologies for GPU and cluster of GPUs}{87}}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
 \@writefile{lof}{\addvspace {10\p@ }}
 \@writefile{lot}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {section}{\numberline {6.1}Introduction}{84}}
-\newlabel{ch6:intro}{{6.1}{84}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{84}}
-\newlabel{ch6:part1}{{6.2}{84}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.1}Synchronous parallel algorithms on GPU clusters}{84}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{86}}
-\newlabel{fig:ch6p1overlapnative}{{6.1}{86}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.2}Native overlap of CPU communications and GPU computations}{86}}
-\newlabel{algo:ch6p1overlapnative}{{6.1}{87}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{87}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{88}}
-\newlabel{fig:ch6p1overlapseqsequence}{{6.2}{88}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{88}}
-\newlabel{algo:ch6p1overlapseqsequence}{{6.2}{89}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{89}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{91}}
-\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{91}}
-\newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{91}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{91}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{93}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{94}}
-\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{94}}
-\newlabel{algo:ch6p1overlapinterleaved}{{6.4}{94}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{94}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{96}}
-\newlabel{ch6:p1expes}{{6.2.5}{96}}
-\newlabel{ch6:p1block-cyclic}{{6.2.5}{96}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{97}}
-\newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{97}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{98}}
-\newlabel{ch6:part2}{{6.3}{98}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{98}}
-\newlabel{algo:ch6p2sync}{{3}{98}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{99}}
-\newlabel{algo:ch6p2async}{{4}{99}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{100}}
-\newlabel{ch6:p2BasicAsync}{{6.3.1}{100}}
-\newlabel{algo:ch6p2BasicAsync}{{6.5}{100}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{100}}
-\newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{101}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{101}}
-\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{103}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{103}}
-\newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{103}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{103}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{105}}
-\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{105}}
-\newlabel{algo:ch6p2Sync}{{6.9}{105}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{105}}
-\newlabel{algo:ch6p2SyncComp}{{6.10}{106}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{106}}
-\newlabel{algo:ch6p2SyncReceptions}{{6.11}{108}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{108}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{109}}
-\newlabel{ch6:p2GPUAsync}{{6.3.3}{109}}
-\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{110}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{110}}
-\newlabel{algo:ch6p2syncGPU}{{6.13}{111}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{111}}
-\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{114}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{114}}
-\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{115}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{115}}
-\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{116}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{116}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{117}}
-\newlabel{sec:ch6p2expes}{{6.3.4}{117}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{118}}
-\newlabel{fig:ch6p2syncasync}{{6.6}{118}}
-\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{119}}
-\newlabel{fig:ch6p2aux}{{6.7}{119}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{120}}
-\newlabel{sec:ch6p3unify}{{6.4}{120}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{120}}
-\newlabel{sec:ch6p3resources}{{6.4.1}{120}}
-\newlabel{algo:ch6p3ORWLresources}{{6.17}{121}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{121}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{121}}
-\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{121}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{122}}
-\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{122}}
-\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{122}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{122}}
-\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{123}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{123}}
-\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{123}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{123}}
-\newlabel{algo:ch6p3ORWLtrans}{{6.21}{123}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{123}}
-\newlabel{algo:ch6p3ORWLdecl}{{6.22}{124}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{124}}
-\newlabel{algo:ch6p3ORWLinit}{{6.23}{124}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{124}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{125}}
-\newlabel{sec:ch6p3tasks}{{6.4.4}{125}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{126}}
-\newlabel{ch6:conclu}{{6.5}{126}}
-\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{126}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{127}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.1}Introduction}{88}}
+\newlabel{ch6:intro}{{6.1}{88}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{88}}
+\newlabel{ch6:part1}{{6.2}{88}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.1}Synchronous parallel algorithms on GPU clusters}{88}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{90}}
+\newlabel{fig:ch6p1overlapnative}{{6.1}{90}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.2}Native overlap of CPU communications and GPU computations}{90}}
+\newlabel{algo:ch6p1overlapnative}{{6.1}{91}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{91}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{92}}
+\newlabel{fig:ch6p1overlapseqsequence}{{6.2}{92}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{92}}
+\newlabel{algo:ch6p1overlapseqsequence}{{6.2}{93}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{93}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{95}}
+\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{95}}
+\newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{95}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{95}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{97}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{98}}
+\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{98}}
+\newlabel{algo:ch6p1overlapinterleaved}{{6.4}{98}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{98}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{100}}
+\newlabel{ch6:p1expes}{{6.2.5}{100}}
+\newlabel{ch6:p1block-cyclic}{{6.2.5}{100}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{101}}
+\newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{101}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{102}}
+\newlabel{ch6:part2}{{6.3}{102}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{102}}
+\newlabel{algo:ch6p2sync}{{3}{102}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{103}}
+\newlabel{algo:ch6p2async}{{4}{103}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{104}}
+\newlabel{ch6:p2BasicAsync}{{6.3.1}{104}}
+\newlabel{algo:ch6p2BasicAsync}{{6.5}{104}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{104}}
+\newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{105}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{105}}
+\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{107}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{107}}
+\newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{107}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{107}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{109}}
+\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{109}}
+\newlabel{algo:ch6p2Sync}{{6.9}{109}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{109}}
+\newlabel{algo:ch6p2SyncComp}{{6.10}{110}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{110}}
+\newlabel{algo:ch6p2SyncReceptions}{{6.11}{112}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{112}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{113}}
+\newlabel{ch6:p2GPUAsync}{{6.3.3}{113}}
+\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{114}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{114}}
+\newlabel{algo:ch6p2syncGPU}{{6.13}{115}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{115}}
+\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{118}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{118}}
+\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{119}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{119}}
+\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{120}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{120}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{121}}
+\newlabel{sec:ch6p2expes}{{6.3.4}{121}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{122}}
+\newlabel{fig:ch6p2syncasync}{{6.6}{122}}
+\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{123}}
+\newlabel{fig:ch6p2aux}{{6.7}{123}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{124}}
+\newlabel{sec:ch6p3unify}{{6.4}{124}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{124}}
+\newlabel{sec:ch6p3resources}{{6.4.1}{124}}
+\newlabel{algo:ch6p3ORWLresources}{{6.17}{125}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{125}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{125}}
+\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{125}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{126}}
+\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{126}}
+\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{126}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{126}}
+\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{127}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{127}}
+\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{127}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{127}}
+\newlabel{algo:ch6p3ORWLtrans}{{6.21}{127}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{127}}
+\newlabel{algo:ch6p3ORWLdecl}{{6.22}{128}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{128}}
+\newlabel{algo:ch6p3ORWLinit}{{6.23}{128}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{128}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{129}}
+\newlabel{sec:ch6p3tasks}{{6.4.4}{129}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{130}}
+\newlabel{ch6:conclu}{{6.5}{130}}
+\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{130}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{131}}
 \@setckpt{Chapters/chapter6/ch6}{
 \@setckpt{Chapters/chapter6/ch6}{
-\setcounter{page}{129}
+\setcounter{page}{133}
 \setcounter{equation}{0}
 \setcounter{enumi}{4}
 \setcounter{enumii}{0}
 \setcounter{equation}{0}
 \setcounter{enumi}{4}
 \setcounter{enumii}{0}
 \setcounter{enumiv}{21}
 \setcounter{footnote}{0}
 \setcounter{mpfootnote}{0}
 \setcounter{enumiv}{21}
 \setcounter{footnote}{0}
 \setcounter{mpfootnote}{0}
-\setcounter{part}{1}
+\setcounter{part}{3}
 \setcounter{chapter}{6}
 \setcounter{section}{6}
 \setcounter{subsection}{0}
 \setcounter{chapter}{6}
 \setcounter{section}{6}
 \setcounter{subsection}{0}