From: couturie Date: Sun, 3 Mar 2013 10:29:51 +0000 (+0100) Subject: update ch3 X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/beb4b32a38394d2c847ba5a733a4906781d95881?hp=bc21a4e60ae5e4ef525ce6933905824c6597401d update ch3 --- diff --git a/BookGPU/Chapters/chapter12/ch12.aux b/BookGPU/Chapters/chapter12/ch12.aux index 26d263a..d3b6ca3 100644 --- a/BookGPU/Chapters/chapter12/ch12.aux +++ b/BookGPU/Chapters/chapter12/ch12.aux @@ -3,107 +3,107 @@ \@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} \@writefile{toc}{\author{Jacques Bahi}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {11}Solving sparse linear systems with GMRES and CG methods on GPU clusters}{249}} +\@writefile{toc}{\contentsline {chapter}{\numberline {11}Solving sparse linear systems with GMRES and CG methods on GPU clusters}{251}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\newlabel{ch12}{{11}{249}} -\@writefile{toc}{\contentsline {section}{\numberline {11.1}Introduction}{249}} -\newlabel{ch12:sec:01}{{11.1}{249}} -\@writefile{toc}{\contentsline {section}{\numberline {11.2}Krylov iterative methods}{250}} -\newlabel{ch12:sec:02}{{11.2}{250}} -\newlabel{ch12:eq:01}{{11.1}{250}} -\newlabel{ch12:eq:02}{{11.2}{250}} -\newlabel{ch12:eq:03}{{11.3}{250}} -\newlabel{ch12:eq:11}{{11.4}{251}} -\@writefile{toc}{\contentsline {subsection}{\numberline {11.2.1}CG method}{251}} -\newlabel{ch12:sec:02.01}{{11.2.1}{251}} -\newlabel{ch12:eq:04}{{11.5}{251}} -\newlabel{ch12:eq:05}{{11.6}{251}} -\newlabel{ch12:eq:06}{{11.7}{251}} -\newlabel{ch12:eq:07}{{11.8}{251}} -\newlabel{ch12:eq:08}{{11.9}{251}} -\newlabel{ch12:eq:09}{{11.10}{251}} -\@writefile{loa}{\contentsline {algocf}{\numberline {9}{\ignorespaces Left-preconditioned CG method\relax }}{252}} -\newlabel{ch12:alg:01}{{9}{252}} -\newlabel{ch12:eq:10}{{11.11}{252}} -\@writefile{toc}{\contentsline {subsection}{\numberline {11.2.2}GMRES method}{253}} -\newlabel{ch12:sec:02.02}{{11.2.2}{253}} -\newlabel{ch12:eq:12}{{11.12}{253}} -\newlabel{ch12:eq:13}{{11.13}{253}} -\newlabel{ch12:eq:14}{{11.14}{253}} -\newlabel{ch12:eq:15}{{11.15}{253}} -\newlabel{ch12:eq:16}{{11.16}{253}} -\newlabel{ch12:eq:17}{{11.17}{253}} -\newlabel{ch12:eq:18}{{11.18}{253}} -\newlabel{ch12:eq:19}{{11.19}{253}} -\@writefile{loa}{\contentsline {algocf}{\numberline {10}{\ignorespaces Left-preconditioned GMRES method with restarts\relax }}{254}} -\newlabel{ch12:alg:02}{{10}{254}} -\@writefile{toc}{\contentsline {section}{\numberline {11.3}Parallel implementation on a GPU cluster}{255}} -\newlabel{ch12:sec:03}{{11.3}{255}} -\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.1}Data partitioning}{255}} -\newlabel{ch12:sec:03.01}{{11.3.1}{255}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.1}{\ignorespaces A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.\relax }}{256}} -\newlabel{ch12:fig:01}{{11.1}{256}} -\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.2}GPU computing}{256}} -\newlabel{ch12:sec:03.02}{{11.3.2}{256}} -\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.3}Data communications}{257}} -\newlabel{ch12:sec:03.03}{{11.3.3}{257}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.2}{\ignorespaces Data exchanges between \textit {Node 1} and its neighbors \textit {Node 0}, \textit {Node 2} and \textit {Node 3}.\relax }}{258}} -\newlabel{ch12:fig:02}{{11.2}{258}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.3}{\ignorespaces Columns reordering of a sparse sub-matrix.\relax }}{259}} -\newlabel{ch12:fig:03}{{11.3}{259}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.4}{\ignorespaces General scheme of the GPU cluster of tests composed of six machines, each with two GPUs.\relax }}{260}} -\newlabel{ch12:fig:04}{{11.4}{260}} -\@writefile{toc}{\contentsline {section}{\numberline {11.4}Experimental results}{260}} -\newlabel{ch12:sec:04}{{11.4}{260}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.5}{\ignorespaces Sketches of sparse matrices chosen from the Davis's collection.\relax }}{261}} -\newlabel{ch12:fig:05}{{11.5}{261}} -\@writefile{lot}{\contentsline {table}{\numberline {11.1}{\ignorespaces Main characteristics of sparse matrices chosen from the Davis's collection.\relax }}{262}} -\newlabel{ch12:tab:01}{{11.1}{262}} -\@writefile{lot}{\contentsline {table}{\numberline {11.2}{\ignorespaces Performances of the parallel CG method on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{262}} -\newlabel{ch12:tab:02}{{11.2}{262}} -\@writefile{lot}{\contentsline {table}{\numberline {11.3}{\ignorespaces Performances of the parallel GMRES method on a cluster 24 CPU cores vs. on cluster of 12 GPUs.\relax }}{263}} -\newlabel{ch12:tab:03}{{11.3}{263}} -\newlabel{ch12:eq:20}{{11.20}{263}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.6}{\ignorespaces Parallel generation of a large sparse matrix by four computing nodes.\relax }}{264}} -\newlabel{ch12:fig:06}{{11.6}{264}} -\@writefile{lot}{\contentsline {table}{\numberline {11.4}{\ignorespaces Main characteristics of sparse banded matrices generated from those of the Davis's collection.\relax }}{265}} -\newlabel{ch12:tab:04}{{11.4}{265}} -\@writefile{lot}{\contentsline {table}{\numberline {11.5}{\ignorespaces Performances of the parallel CG method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{265}} -\newlabel{ch12:tab:05}{{11.5}{265}} -\@writefile{toc}{\contentsline {section}{\numberline {11.5}Hypergraph partitioning}{265}} -\newlabel{ch12:sec:05}{{11.5}{265}} -\@writefile{lot}{\contentsline {table}{\numberline {11.6}{\ignorespaces Performances of the parallel GMRES method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{266}} -\newlabel{ch12:tab:06}{{11.6}{266}} -\@writefile{lot}{\contentsline {table}{\numberline {11.7}{\ignorespaces Main characteristics of sparse five-bands matrices generated from those of the Davis's collection.\relax }}{266}} -\newlabel{ch12:tab:07}{{11.7}{266}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.7}{\ignorespaces Parallel generation of a large sparse five-bands matrix by four computing nodes.\relax }}{267}} -\newlabel{ch12:fig:07}{{11.7}{267}} -\@writefile{lot}{\contentsline {table}{\numberline {11.8}{\ignorespaces Performances of parallel CG solver for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs\relax }}{267}} -\newlabel{ch12:tab:08}{{11.8}{267}} -\@writefile{lot}{\contentsline {table}{\numberline {11.9}{\ignorespaces Performances of parallel GMRES solver for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs\relax }}{268}} -\newlabel{ch12:tab:09}{{11.9}{268}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.8}{\ignorespaces An example of the hypergraph partitioning of a sparse matrix decomposed between three computing nodes.\relax }}{269}} -\newlabel{ch12:fig:08}{{11.8}{269}} -\@writefile{lot}{\contentsline {table}{\numberline {11.10}{\ignorespaces Performances of the parallel CG solver using hypergraph partitioning for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPU.\relax }}{270}} -\newlabel{ch12:tab:10}{{11.10}{270}} -\@writefile{lot}{\contentsline {table}{\numberline {11.11}{\ignorespaces Performances of the parallel GMRES solver using hypergraph partitioning for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPU.\relax }}{271}} -\newlabel{ch12:tab:11}{{11.11}{271}} -\@writefile{lot}{\contentsline {table}{\numberline {11.12}{\ignorespaces The total communication volume between 12 GPU computing nodes without and with the hypergraph partitioning method.\relax }}{272}} -\newlabel{ch12:tab:12}{{11.12}{272}} -\newlabel{ch12:fig:09.01}{{11.9(a)}{273}} -\newlabel{sub@ch12:fig:09.01}{{(a)}{273}} -\newlabel{ch12:fig:09.02}{{11.9(b)}{273}} -\newlabel{sub@ch12:fig:09.02}{{(b)}{273}} -\@writefile{lof}{\contentsline {figure}{\numberline {11.9}{\ignorespaces Weak-scaling of the parallel CG and GMRES solvers on a GPU cluster for solving large sparse linear systems.\relax }}{273}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Sparse band matrices}}}{273}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Sparse five-bands matrices}}}{273}} -\newlabel{ch12:fig:09}{{11.9}{273}} -\@writefile{toc}{\contentsline {section}{\numberline {11.6}Conclusion}{273}} -\newlabel{ch12:sec:06}{{11.6}{273}} -\@writefile{toc}{\contentsline {section}{Bibliography}{274}} +\newlabel{ch12}{{11}{251}} +\@writefile{toc}{\contentsline {section}{\numberline {11.1}Introduction}{251}} +\newlabel{ch12:sec:01}{{11.1}{251}} +\@writefile{toc}{\contentsline {section}{\numberline {11.2}Krylov iterative methods}{252}} +\newlabel{ch12:sec:02}{{11.2}{252}} +\newlabel{ch12:eq:01}{{11.1}{252}} +\newlabel{ch12:eq:02}{{11.2}{252}} +\newlabel{ch12:eq:03}{{11.3}{252}} +\newlabel{ch12:eq:11}{{11.4}{253}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.2.1}CG method}{253}} +\newlabel{ch12:sec:02.01}{{11.2.1}{253}} +\newlabel{ch12:eq:04}{{11.5}{253}} +\newlabel{ch12:eq:05}{{11.6}{253}} +\newlabel{ch12:eq:06}{{11.7}{253}} +\newlabel{ch12:eq:07}{{11.8}{253}} +\newlabel{ch12:eq:08}{{11.9}{253}} +\newlabel{ch12:eq:09}{{11.10}{253}} +\@writefile{loa}{\contentsline {algocf}{\numberline {9}{\ignorespaces Left-preconditioned CG method\relax }}{254}} +\newlabel{ch12:alg:01}{{9}{254}} +\newlabel{ch12:eq:10}{{11.11}{254}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.2.2}GMRES method}{255}} +\newlabel{ch12:sec:02.02}{{11.2.2}{255}} +\newlabel{ch12:eq:12}{{11.12}{255}} +\newlabel{ch12:eq:13}{{11.13}{255}} +\newlabel{ch12:eq:14}{{11.14}{255}} +\newlabel{ch12:eq:15}{{11.15}{255}} +\newlabel{ch12:eq:16}{{11.16}{255}} +\newlabel{ch12:eq:17}{{11.17}{255}} +\newlabel{ch12:eq:18}{{11.18}{255}} +\newlabel{ch12:eq:19}{{11.19}{255}} +\@writefile{loa}{\contentsline {algocf}{\numberline {10}{\ignorespaces Left-preconditioned GMRES method with restarts\relax }}{256}} +\newlabel{ch12:alg:02}{{10}{256}} +\@writefile{toc}{\contentsline {section}{\numberline {11.3}Parallel implementation on a GPU cluster}{257}} +\newlabel{ch12:sec:03}{{11.3}{257}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.1}Data partitioning}{257}} +\newlabel{ch12:sec:03.01}{{11.3.1}{257}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.1}{\ignorespaces A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.\relax }}{258}} +\newlabel{ch12:fig:01}{{11.1}{258}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.2}GPU computing}{258}} +\newlabel{ch12:sec:03.02}{{11.3.2}{258}} +\@writefile{toc}{\contentsline {subsection}{\numberline {11.3.3}Data communications}{259}} +\newlabel{ch12:sec:03.03}{{11.3.3}{259}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.2}{\ignorespaces Data exchanges between \textit {Node 1} and its neighbors \textit {Node 0}, \textit {Node 2} and \textit {Node 3}.\relax }}{260}} +\newlabel{ch12:fig:02}{{11.2}{260}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.3}{\ignorespaces Columns reordering of a sparse sub-matrix.\relax }}{261}} +\newlabel{ch12:fig:03}{{11.3}{261}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.4}{\ignorespaces General scheme of the GPU cluster of tests composed of six machines, each with two GPUs.\relax }}{262}} +\newlabel{ch12:fig:04}{{11.4}{262}} +\@writefile{toc}{\contentsline {section}{\numberline {11.4}Experimental results}{262}} +\newlabel{ch12:sec:04}{{11.4}{262}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.5}{\ignorespaces Sketches of sparse matrices chosen from the Davis's collection.\relax }}{263}} +\newlabel{ch12:fig:05}{{11.5}{263}} +\@writefile{lot}{\contentsline {table}{\numberline {11.1}{\ignorespaces Main characteristics of sparse matrices chosen from the Davis's collection.\relax }}{264}} +\newlabel{ch12:tab:01}{{11.1}{264}} +\@writefile{lot}{\contentsline {table}{\numberline {11.2}{\ignorespaces Performances of the parallel CG method on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{264}} +\newlabel{ch12:tab:02}{{11.2}{264}} +\@writefile{lot}{\contentsline {table}{\numberline {11.3}{\ignorespaces Performances of the parallel GMRES method on a cluster 24 CPU cores vs. on cluster of 12 GPUs.\relax }}{265}} +\newlabel{ch12:tab:03}{{11.3}{265}} +\newlabel{ch12:eq:20}{{11.20}{265}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.6}{\ignorespaces Parallel generation of a large sparse matrix by four computing nodes.\relax }}{266}} +\newlabel{ch12:fig:06}{{11.6}{266}} +\@writefile{lot}{\contentsline {table}{\numberline {11.4}{\ignorespaces Main characteristics of sparse banded matrices generated from those of the Davis's collection.\relax }}{267}} +\newlabel{ch12:tab:04}{{11.4}{267}} +\@writefile{lot}{\contentsline {table}{\numberline {11.5}{\ignorespaces Performances of the parallel CG method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{267}} +\newlabel{ch12:tab:05}{{11.5}{267}} +\@writefile{toc}{\contentsline {section}{\numberline {11.5}Hypergraph partitioning}{267}} +\newlabel{ch12:sec:05}{{11.5}{267}} +\@writefile{lot}{\contentsline {table}{\numberline {11.6}{\ignorespaces Performances of the parallel GMRES method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{268}} +\newlabel{ch12:tab:06}{{11.6}{268}} +\@writefile{lot}{\contentsline {table}{\numberline {11.7}{\ignorespaces Main characteristics of sparse five-bands matrices generated from those of the Davis's collection.\relax }}{268}} +\newlabel{ch12:tab:07}{{11.7}{268}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.7}{\ignorespaces Parallel generation of a large sparse five-bands matrix by four computing nodes.\relax }}{269}} +\newlabel{ch12:fig:07}{{11.7}{269}} +\@writefile{lot}{\contentsline {table}{\numberline {11.8}{\ignorespaces Performances of parallel CG solver for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs\relax }}{269}} +\newlabel{ch12:tab:08}{{11.8}{269}} +\@writefile{lot}{\contentsline {table}{\numberline {11.9}{\ignorespaces Performances of parallel GMRES solver for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs\relax }}{270}} +\newlabel{ch12:tab:09}{{11.9}{270}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.8}{\ignorespaces An example of the hypergraph partitioning of a sparse matrix decomposed between three computing nodes.\relax }}{271}} +\newlabel{ch12:fig:08}{{11.8}{271}} +\@writefile{lot}{\contentsline {table}{\numberline {11.10}{\ignorespaces Performances of the parallel CG solver using hypergraph partitioning for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPU.\relax }}{272}} +\newlabel{ch12:tab:10}{{11.10}{272}} +\@writefile{lot}{\contentsline {table}{\numberline {11.11}{\ignorespaces Performances of the parallel GMRES solver using hypergraph partitioning for solving linear systems associated to sparse five-bands matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPU.\relax }}{273}} +\newlabel{ch12:tab:11}{{11.11}{273}} +\@writefile{lot}{\contentsline {table}{\numberline {11.12}{\ignorespaces The total communication volume between 12 GPU computing nodes without and with the hypergraph partitioning method.\relax }}{274}} +\newlabel{ch12:tab:12}{{11.12}{274}} +\newlabel{ch12:fig:09.01}{{11.9(a)}{275}} +\newlabel{sub@ch12:fig:09.01}{{(a)}{275}} +\newlabel{ch12:fig:09.02}{{11.9(b)}{275}} +\newlabel{sub@ch12:fig:09.02}{{(b)}{275}} +\@writefile{lof}{\contentsline {figure}{\numberline {11.9}{\ignorespaces Weak-scaling of the parallel CG and GMRES solvers on a GPU cluster for solving large sparse linear systems.\relax }}{275}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Sparse band matrices}}}{275}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Sparse five-bands matrices}}}{275}} +\newlabel{ch12:fig:09}{{11.9}{275}} +\@writefile{toc}{\contentsline {section}{\numberline {11.6}Conclusion}{275}} +\newlabel{ch12:sec:06}{{11.6}{275}} +\@writefile{toc}{\contentsline {section}{Bibliography}{276}} \@setckpt{Chapters/chapter12/ch12}{ -\setcounter{page}{276} +\setcounter{page}{278} \setcounter{equation}{25} \setcounter{enumi}{4} \setcounter{enumii}{0} diff --git a/BookGPU/Chapters/chapter13/ch13.aux b/BookGPU/Chapters/chapter13/ch13.aux index 9460acd..bb6c472 100644 --- a/BookGPU/Chapters/chapter13/ch13.aux +++ b/BookGPU/Chapters/chapter13/ch13.aux @@ -5,86 +5,86 @@ \@writefile{toc}{\author{Pierre Spit\IeC {\'e}ri}{}} \@writefile{toc}{\author{Jacques Bahi}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {12}Solving sparse nonlinear systems of obstacle problems on GPU clusters}{277}} +\@writefile{toc}{\contentsline {chapter}{\numberline {12}Solving sparse nonlinear systems of obstacle problems on GPU clusters}{279}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\newlabel{ch13}{{12}{277}} -\@writefile{toc}{\contentsline {section}{\numberline {12.1}Introduction}{277}} -\newlabel{ch13:sec:01}{{12.1}{277}} -\@writefile{toc}{\contentsline {section}{\numberline {12.2}Obstacle problems}{278}} -\newlabel{ch13:sec:02}{{12.2}{278}} -\@writefile{toc}{\contentsline {subsection}{\numberline {12.2.1}Mathematical model}{278}} -\newlabel{ch13:sec:02.01}{{12.2.1}{278}} -\newlabel{ch13:eq:01}{{12.1}{278}} -\newlabel{ch13:eq:02}{{12.2}{278}} -\@writefile{toc}{\contentsline {subsection}{\numberline {12.2.2}Discretization}{279}} -\newlabel{ch13:sec:02.02}{{12.2.2}{279}} -\newlabel{ch13:eq:03}{{12.3}{279}} -\newlabel{ch13:eq:04}{{12.4}{279}} -\newlabel{ch13:eq:05}{{12.5}{279}} -\@writefile{toc}{\contentsline {section}{\numberline {12.3}Parallel iterative method}{280}} -\newlabel{ch13:sec:03}{{12.3}{280}} -\newlabel{ch13:eq:06}{{12.6}{280}} -\newlabel{ch13:eq:07}{{12.7}{280}} -\newlabel{ch13:eq:08}{{12.8}{280}} -\newlabel{ch13:eq:09}{{12.9}{280}} -\newlabel{ch13:eq:10}{{12.10}{281}} -\newlabel{ch13:eq:11}{{12.11}{281}} -\newlabel{ch13:eq:12}{{12.12}{281}} -\newlabel{ch13:eq:13}{{12.13}{282}} -\newlabel{ch13:eq:14}{{12.14}{282}} -\newlabel{ch13:eq:15}{{12.15}{282}} -\newlabel{ch13:eq:16}{{12.16}{282}} -\@writefile{toc}{\contentsline {section}{\numberline {12.4}Parallel implementation on a GPU cluster}{283}} -\newlabel{ch13:sec:04}{{12.4}{283}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.1}{\ignorespaces Data partitioning of a problem to be solved among $S=3\times 4$ computing nodes.\relax }}{283}} -\newlabel{ch13:fig:01}{{12.1}{283}} -\@writefile{loa}{\contentsline {algocf}{\numberline {11}{\ignorespaces Parallel solving of the obstacle problem on a GPU cluster\relax }}{284}} -\newlabel{ch13:alg:01}{{11}{284}} -\newlabel{ch13:eq:18}{{12.17}{284}} -\@writefile{loa}{\contentsline {algocf}{\numberline {12}{\ignorespaces Parallel iterative solving of the nonlinear systems on a GPU cluster ($Solve()$ function)\relax }}{285}} -\newlabel{ch13:alg:02}{{12}{285}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.2}{\ignorespaces Decomposition of a sub-problem in a GPU into $nz$ slices.\relax }}{286}} -\newlabel{ch13:fig:02}{{12.2}{286}} -\newlabel{ch13:list:01}{{12.1}{286}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.1}Skeleton codes of a GPU kernel and a CPU function}{286}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.3}{\ignorespaces Matrix constant coefficients in a three-dimensional domain.\relax }}{288}} -\newlabel{ch13:fig:03}{{12.3}{288}} -\newlabel{ch13:eq:17}{{12.18}{288}} -\newlabel{ch13:list:02}{{12.2}{288}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.2}GPU kernels of the projected Richardson method}{288}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.4}{\ignorespaces Computation of a vector element with the projected Richardson method.\relax }}{290}} -\newlabel{ch13:fig:04}{{12.4}{290}} -\newlabel{ch13:list:03}{{12.3}{290}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.3}Memory access to the cache texture memory}{290}} -\@writefile{toc}{\contentsline {section}{\numberline {12.5}Experimental tests on a GPU cluster}{291}} -\newlabel{ch13:sec:05}{{12.5}{291}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.5}{\ignorespaces GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs.\relax }}{293}} -\newlabel{ch13:fig:05}{{12.5}{293}} -\@writefile{lot}{\contentsline {table}{\numberline {12.1}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 24 CPU cores.\relax }}{293}} -\newlabel{ch13:tab:01}{{12.1}{293}} -\@writefile{lot}{\contentsline {table}{\numberline {12.2}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.\relax }}{294}} -\newlabel{ch13:tab:02}{{12.2}{294}} -\@writefile{toc}{\contentsline {section}{\numberline {12.6}Red-Black ordering technique}{294}} -\newlabel{ch13:sec:06}{{12.6}{294}} -\newlabel{ch13:list:04}{{12.4}{295}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.4}GPU kernels of the projected Richardson method using the red-black technique}{295}} -\newlabel{ch13:fig:06.01}{{12.6(a)}{296}} -\newlabel{sub@ch13:fig:06.01}{{(a)}{296}} -\newlabel{ch13:fig:06.02}{{12.6(b)}{296}} -\newlabel{sub@ch13:fig:06.02}{{(b)}{296}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.6}{\ignorespaces Red-Black ordering for computing the iterate vector elements in a three-dimensional space.\relax }}{296}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Red-Black ordering on x, y and z axises}}}{296}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Red-Black ordering on y axis}}}{296}} -\@writefile{lot}{\contentsline {table}{\numberline {12.3}{\ignorespaces Execution times in seconds of the parallel projected Richardson method using read-black ordering technique implemented on a cluster of 12 GPUs.\relax }}{297}} -\newlabel{ch13:tab:03}{{12.3}{297}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.7}{\ignorespaces Weak scaling of both synchronous and asynchronous algorithms of the projected Richardson method using red-black ordering technique.\relax }}{298}} -\newlabel{ch13:fig:07}{{12.7}{298}} -\@writefile{toc}{\contentsline {section}{\numberline {12.7}Conclusion}{298}} -\newlabel{ch13:sec:07}{{12.7}{298}} -\@writefile{toc}{\contentsline {section}{Bibliography}{299}} +\newlabel{ch13}{{12}{279}} +\@writefile{toc}{\contentsline {section}{\numberline {12.1}Introduction}{279}} +\newlabel{ch13:sec:01}{{12.1}{279}} +\@writefile{toc}{\contentsline {section}{\numberline {12.2}Obstacle problems}{280}} +\newlabel{ch13:sec:02}{{12.2}{280}} +\@writefile{toc}{\contentsline {subsection}{\numberline {12.2.1}Mathematical model}{280}} +\newlabel{ch13:sec:02.01}{{12.2.1}{280}} +\newlabel{ch13:eq:01}{{12.1}{280}} +\newlabel{ch13:eq:02}{{12.2}{280}} +\@writefile{toc}{\contentsline {subsection}{\numberline {12.2.2}Discretization}{281}} +\newlabel{ch13:sec:02.02}{{12.2.2}{281}} +\newlabel{ch13:eq:03}{{12.3}{281}} +\newlabel{ch13:eq:04}{{12.4}{281}} +\newlabel{ch13:eq:05}{{12.5}{281}} +\@writefile{toc}{\contentsline {section}{\numberline {12.3}Parallel iterative method}{282}} +\newlabel{ch13:sec:03}{{12.3}{282}} +\newlabel{ch13:eq:06}{{12.6}{282}} +\newlabel{ch13:eq:07}{{12.7}{282}} +\newlabel{ch13:eq:08}{{12.8}{282}} +\newlabel{ch13:eq:09}{{12.9}{282}} +\newlabel{ch13:eq:10}{{12.10}{283}} +\newlabel{ch13:eq:11}{{12.11}{283}} +\newlabel{ch13:eq:12}{{12.12}{283}} +\newlabel{ch13:eq:13}{{12.13}{284}} +\newlabel{ch13:eq:14}{{12.14}{284}} +\newlabel{ch13:eq:15}{{12.15}{284}} +\newlabel{ch13:eq:16}{{12.16}{284}} +\@writefile{toc}{\contentsline {section}{\numberline {12.4}Parallel implementation on a GPU cluster}{285}} +\newlabel{ch13:sec:04}{{12.4}{285}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.1}{\ignorespaces Data partitioning of a problem to be solved among $S=3\times 4$ computing nodes.\relax }}{285}} +\newlabel{ch13:fig:01}{{12.1}{285}} +\@writefile{loa}{\contentsline {algocf}{\numberline {11}{\ignorespaces Parallel solving of the obstacle problem on a GPU cluster\relax }}{286}} +\newlabel{ch13:alg:01}{{11}{286}} +\newlabel{ch13:eq:18}{{12.17}{286}} +\@writefile{loa}{\contentsline {algocf}{\numberline {12}{\ignorespaces Parallel iterative solving of the nonlinear systems on a GPU cluster ($Solve()$ function)\relax }}{287}} +\newlabel{ch13:alg:02}{{12}{287}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.2}{\ignorespaces Decomposition of a sub-problem in a GPU into $nz$ slices.\relax }}{288}} +\newlabel{ch13:fig:02}{{12.2}{288}} +\newlabel{ch13:list:01}{{12.1}{288}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.1}Skeleton codes of a GPU kernel and a CPU function}{288}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.3}{\ignorespaces Matrix constant coefficients in a three-dimensional domain.\relax }}{290}} +\newlabel{ch13:fig:03}{{12.3}{290}} +\newlabel{ch13:eq:17}{{12.18}{290}} +\newlabel{ch13:list:02}{{12.2}{290}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.2}GPU kernels of the projected Richardson method}{290}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.4}{\ignorespaces Computation of a vector element with the projected Richardson method.\relax }}{292}} +\newlabel{ch13:fig:04}{{12.4}{292}} +\newlabel{ch13:list:03}{{12.3}{292}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.3}Memory access to the cache texture memory}{292}} +\@writefile{toc}{\contentsline {section}{\numberline {12.5}Experimental tests on a GPU cluster}{293}} +\newlabel{ch13:sec:05}{{12.5}{293}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.5}{\ignorespaces GPU cluster of tests composed of 12 computing nodes (six machines, each with two GPUs.\relax }}{295}} +\newlabel{ch13:fig:05}{{12.5}{295}} +\@writefile{lot}{\contentsline {table}{\numberline {12.1}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 24 CPU cores.\relax }}{295}} +\newlabel{ch13:tab:01}{{12.1}{295}} +\@writefile{lot}{\contentsline {table}{\numberline {12.2}{\ignorespaces Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.\relax }}{296}} +\newlabel{ch13:tab:02}{{12.2}{296}} +\@writefile{toc}{\contentsline {section}{\numberline {12.6}Red-Black ordering technique}{296}} +\newlabel{ch13:sec:06}{{12.6}{296}} +\newlabel{ch13:list:04}{{12.4}{297}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {12.4}GPU kernels of the projected Richardson method using the red-black technique}{297}} +\newlabel{ch13:fig:06.01}{{12.6(a)}{298}} +\newlabel{sub@ch13:fig:06.01}{{(a)}{298}} +\newlabel{ch13:fig:06.02}{{12.6(b)}{298}} +\newlabel{sub@ch13:fig:06.02}{{(b)}{298}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.6}{\ignorespaces Red-Black ordering for computing the iterate vector elements in a three-dimensional space.\relax }}{298}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Red-Black ordering on x, y and z axises}}}{298}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Red-Black ordering on y axis}}}{298}} +\@writefile{lot}{\contentsline {table}{\numberline {12.3}{\ignorespaces Execution times in seconds of the parallel projected Richardson method using read-black ordering technique implemented on a cluster of 12 GPUs.\relax }}{299}} +\newlabel{ch13:tab:03}{{12.3}{299}} +\@writefile{lof}{\contentsline {figure}{\numberline {12.7}{\ignorespaces Weak scaling of both synchronous and asynchronous algorithms of the projected Richardson method using red-black ordering technique.\relax }}{300}} +\newlabel{ch13:fig:07}{{12.7}{300}} +\@writefile{toc}{\contentsline {section}{\numberline {12.7}Conclusion}{300}} +\newlabel{ch13:sec:07}{{12.7}{300}} +\@writefile{toc}{\contentsline {section}{Bibliography}{301}} \@setckpt{Chapters/chapter13/ch13}{ -\setcounter{page}{301} +\setcounter{page}{303} \setcounter{equation}{18} \setcounter{enumi}{4} \setcounter{enumii}{0} diff --git a/BookGPU/Chapters/chapter16/ch16.aux b/BookGPU/Chapters/chapter16/ch16.aux index dfac2f4..1b44748 100644 --- a/BookGPU/Chapters/chapter16/ch16.aux +++ b/BookGPU/Chapters/chapter16/ch16.aux @@ -4,72 +4,72 @@ \@writefile{toc}{\author{H. Wang}{}} \@writefile{toc}{\author{H. Yu}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {15}GPU-Accelerated Envelope-Following Method}{341}} +\@writefile{toc}{\contentsline {chapter}{\numberline {15}GPU-Accelerated Envelope-Following Method}{343}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {section}{\numberline {15.1}Introduction}{341}} -\newlabel{fig:ef1}{{15.1(a)}{343}} -\newlabel{sub@fig:ef1}{{(a)}{343}} -\newlabel{fig:ef2}{{15.1(b)}{343}} -\newlabel{sub@fig:ef2}{{(b)}{343}} -\@writefile{lof}{\contentsline {figure}{\numberline {15.1}{\ignorespaces Transient envelope-following analysis. (Both two figures reflect backward-Euler style envelope-following.)\relax }}{343}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Illustration of one envelope skip.}}}{343}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {The envelope changes in a slow time scale.}}}{343}} -\newlabel{fig:ef_intro}{{15.1}{343}} -\@writefile{toc}{\contentsline {section}{\numberline {15.2}The envelope-following method in a nutshell}{344}} -\newlabel{sec:ef}{{15.2}{344}} -\newlabel{eq:dae}{{15.1}{344}} -\newlabel{eq:Newton}{{15.2}{345}} -\newlabel{eq:A}{{15.3}{345}} -\@writefile{toc}{\contentsline {section}{\numberline {15.3}New parallel envelope-following method}{346}} -\newlabel{sec:gmres}{{15.3}{346}} -\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.1}GMRES solver for Newton update equation}{346}} -\@writefile{lof}{\contentsline {figure}{\numberline {15.2}{\ignorespaces The flow of envelope-following method.\relax }}{347}} -\newlabel{fig:ef_flow}{{15.2}{347}} -\@writefile{loa}{\contentsline {algocf}{\numberline {14}{\ignorespaces Standard GMRES algorithm.\relax }}{348}} -\newlabel{alg:GMRES}{{14}{348}} -\newlabel{line:mvp}{{5}{348}} -\newlabel{line:newnorm}{{11}{348}} -\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.2}Parallelization on GPU platforms}{348}} -\newlabel{sec:gpu}{{15.3.2}{348}} -\@writefile{lof}{\contentsline {figure}{\numberline {15.3}{\ignorespaces GPU parallel solver for envelope-following update.\relax }}{349}} -\newlabel{fig:gmres}{{15.3}{349}} -\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.3}Gear-2 based sensitivity calculation}{350}} -\newlabel{sec:gear}{{15.3.3}{350}} -\newlabel{eq:BE}{{15.4}{350}} -\newlabel{eq:sens1}{{15.5}{350}} -\newlabel{eq:Gear_t2}{{15.6}{351}} -\newlabel{eq:sens2}{{15.7}{351}} -\newlabel{eq:Gear_t3}{{15.8}{351}} -\newlabel{eq:sensM}{{15.9}{351}} -\@writefile{loa}{\contentsline {algocf}{\numberline {15}{\ignorespaces The matrix-free method for Krylov subspace construction.\relax }}{352}} -\newlabel{alg:mf_Gear}{{15}{352}} -\newlabel{line:mf_Gear_loop}{{4}{352}} -\newlabel{line:shift}{{8}{352}} -\@writefile{toc}{\contentsline {section}{\numberline {15.4}Numerical examples}{352}} -\newlabel{sec:exp}{{15.4}{352}} -\@writefile{lof}{\contentsline {figure}{\numberline {15.4}{\ignorespaces Diagram of a zero-voltage quasi-resonant flyback converter.\relax }}{353}} -\newlabel{fig:flyback}{{15.4}{353}} -\@writefile{lof}{\contentsline {figure}{\numberline {15.5}{\ignorespaces Illustration of power/ground network model.\relax }}{353}} -\newlabel{fig:pg}{{15.5}{353}} -\newlabel{fig:flybackWhole}{{15.6(a)}{354}} -\newlabel{sub@fig:flybackWhole}{{(a)}{354}} -\newlabel{fig:flybackZoom}{{15.6(b)}{354}} -\newlabel{sub@fig:flybackZoom}{{(b)}{354}} -\@writefile{lof}{\contentsline {figure}{\numberline {15.6}{\ignorespaces Flyback converter solution calculated by envelope-following. The red curve is traditional SPICE simulation result, and the back curve is the envelope-following output with simulation points marked.\relax }}{354}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {The whole plot}}}{354}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Detail of one EF simulation period}}}{354}} -\newlabel{fig:flyback_wave}{{15.6}{354}} -\@writefile{lof}{\contentsline {figure}{\numberline {15.7}{\ignorespaces Buck converter solution calculated by envelope-following.\relax }}{355}} -\newlabel{fig:buck_wave}{{15.7}{355}} -\@writefile{lot}{\contentsline {table}{\numberline {15.1}{\ignorespaces CPU and GPU time comparisons (in seconds) for solving Newton update equation with the proposed Gear-2 sensitivity. \relax }}{355}} -\newlabel{table:circuit}{{15.1}{355}} -\@writefile{toc}{\contentsline {section}{\numberline {15.5}Summary}{356}} -\newlabel{sec:summary}{{15.5}{356}} -\@writefile{toc}{\contentsline {section}{\numberline {15.6}Glossary}{356}} -\@writefile{toc}{\contentsline {section}{Bibliography}{356}} +\@writefile{toc}{\contentsline {section}{\numberline {15.1}Introduction}{343}} +\newlabel{fig:ef1}{{15.1(a)}{345}} +\newlabel{sub@fig:ef1}{{(a)}{345}} +\newlabel{fig:ef2}{{15.1(b)}{345}} +\newlabel{sub@fig:ef2}{{(b)}{345}} +\@writefile{lof}{\contentsline {figure}{\numberline {15.1}{\ignorespaces Transient envelope-following analysis. (Both two figures reflect backward-Euler style envelope-following.)\relax }}{345}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Illustration of one envelope skip.}}}{345}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {The envelope changes in a slow time scale.}}}{345}} +\newlabel{fig:ef_intro}{{15.1}{345}} +\@writefile{toc}{\contentsline {section}{\numberline {15.2}The envelope-following method in a nutshell}{346}} +\newlabel{sec:ef}{{15.2}{346}} +\newlabel{eq:dae}{{15.1}{346}} +\newlabel{eq:Newton}{{15.2}{347}} +\newlabel{eq:A}{{15.3}{347}} +\@writefile{toc}{\contentsline {section}{\numberline {15.3}New parallel envelope-following method}{348}} +\newlabel{sec:gmres}{{15.3}{348}} +\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.1}GMRES solver for Newton update equation}{348}} +\@writefile{lof}{\contentsline {figure}{\numberline {15.2}{\ignorespaces The flow of envelope-following method.\relax }}{349}} +\newlabel{fig:ef_flow}{{15.2}{349}} +\@writefile{loa}{\contentsline {algocf}{\numberline {14}{\ignorespaces Standard GMRES algorithm.\relax }}{350}} +\newlabel{alg:GMRES}{{14}{350}} +\newlabel{line:mvp}{{5}{350}} +\newlabel{line:newnorm}{{11}{350}} +\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.2}Parallelization on GPU platforms}{350}} +\newlabel{sec:gpu}{{15.3.2}{350}} +\@writefile{lof}{\contentsline {figure}{\numberline {15.3}{\ignorespaces GPU parallel solver for envelope-following update.\relax }}{351}} +\newlabel{fig:gmres}{{15.3}{351}} +\@writefile{toc}{\contentsline {subsection}{\numberline {15.3.3}Gear-2 based sensitivity calculation}{352}} +\newlabel{sec:gear}{{15.3.3}{352}} +\newlabel{eq:BE}{{15.4}{352}} +\newlabel{eq:sens1}{{15.5}{352}} +\newlabel{eq:Gear_t2}{{15.6}{353}} +\newlabel{eq:sens2}{{15.7}{353}} +\newlabel{eq:Gear_t3}{{15.8}{353}} +\newlabel{eq:sensM}{{15.9}{353}} +\@writefile{loa}{\contentsline {algocf}{\numberline {15}{\ignorespaces The matrix-free method for Krylov subspace construction.\relax }}{354}} +\newlabel{alg:mf_Gear}{{15}{354}} +\newlabel{line:mf_Gear_loop}{{4}{354}} +\newlabel{line:shift}{{8}{354}} +\@writefile{toc}{\contentsline {section}{\numberline {15.4}Numerical examples}{354}} +\newlabel{sec:exp}{{15.4}{354}} +\@writefile{lof}{\contentsline {figure}{\numberline {15.4}{\ignorespaces Diagram of a zero-voltage quasi-resonant flyback converter.\relax }}{355}} +\newlabel{fig:flyback}{{15.4}{355}} +\@writefile{lof}{\contentsline {figure}{\numberline {15.5}{\ignorespaces Illustration of power/ground network model.\relax }}{355}} +\newlabel{fig:pg}{{15.5}{355}} +\newlabel{fig:flybackWhole}{{15.6(a)}{356}} +\newlabel{sub@fig:flybackWhole}{{(a)}{356}} +\newlabel{fig:flybackZoom}{{15.6(b)}{356}} +\newlabel{sub@fig:flybackZoom}{{(b)}{356}} +\@writefile{lof}{\contentsline {figure}{\numberline {15.6}{\ignorespaces Flyback converter solution calculated by envelope-following. The red curve is traditional SPICE simulation result, and the back curve is the envelope-following output with simulation points marked.\relax }}{356}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {The whole plot}}}{356}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Detail of one EF simulation period}}}{356}} +\newlabel{fig:flyback_wave}{{15.6}{356}} +\@writefile{lof}{\contentsline {figure}{\numberline {15.7}{\ignorespaces Buck converter solution calculated by envelope-following.\relax }}{357}} +\newlabel{fig:buck_wave}{{15.7}{357}} +\@writefile{lot}{\contentsline {table}{\numberline {15.1}{\ignorespaces CPU and GPU time comparisons (in seconds) for solving Newton update equation with the proposed Gear-2 sensitivity. \relax }}{357}} +\newlabel{table:circuit}{{15.1}{357}} +\@writefile{toc}{\contentsline {section}{\numberline {15.5}Summary}{358}} +\newlabel{sec:summary}{{15.5}{358}} +\@writefile{toc}{\contentsline {section}{\numberline {15.6}Glossary}{358}} +\@writefile{toc}{\contentsline {section}{Bibliography}{358}} \@setckpt{Chapters/chapter16/ch16}{ -\setcounter{page}{358} +\setcounter{page}{360} \setcounter{equation}{9} \setcounter{enumi}{2} \setcounter{enumii}{0} diff --git a/BookGPU/Chapters/chapter18/ch18.aux b/BookGPU/Chapters/chapter18/ch18.aux index db2a3d1..e8d12a3 100644 --- a/BookGPU/Chapters/chapter18/ch18.aux +++ b/BookGPU/Chapters/chapter18/ch18.aux @@ -2,44 +2,44 @@ \@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} \@writefile{toc}{\author{Christophe Guyeux}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {16}Pseudorandom Number Generator on GPU}{359}} +\@writefile{toc}{\contentsline {chapter}{\numberline {16}Pseudorandom Number Generator on GPU}{361}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\newlabel{chapter18}{{16}{359}} -\@writefile{toc}{\contentsline {section}{\numberline {16.1}Introduction}{359}} -\@writefile{toc}{\contentsline {section}{\numberline {16.2}Basic Recalls}{361}} -\newlabel{section:BASIC RECALLS}{{16.2}{361}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.1}A Short Presentation of Chaos}{361}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.2}On Devaney's Definition of Chaos}{361}} -\newlabel{sec:dev}{{16.2.2}{361}} -\newlabel{Devaney}{{16.1}{361}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.3}Chaotic iterations}{362}} -\newlabel{subsection:Chaotic iterations}{{16.2.3}{362}} -\newlabel{Chaotic iterations}{{2}{362}} -\newlabel{eq:generalIC}{{16.4}{363}} -\newlabel{equation Oplus}{{16.5}{363}} -\@writefile{toc}{\contentsline {section}{\numberline {16.3}Toward Efficiency and Improvement for CI PRNG}{363}} -\newlabel{sec:efficient PRNG}{{16.3}{363}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{363}} -\newlabel{algo:seqCIPRNG}{{16.1}{363}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {16.1}C code of the sequential PRNG based on chaotic iterations}{363}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{364}} -\newlabel{sec:efficient PRNG gpu}{{16.3.2}{364}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.3}Naive Version for GPU}{364}} -\@writefile{loa}{\contentsline {algocf}{\numberline {16}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{365}} -\newlabel{algo:gpu_kernel}{{16}{365}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.4}Improved Version for GPU}{365}} -\newlabel{IR}{{17}{366}} -\@writefile{loa}{\contentsline {algocf}{\numberline {17}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{366}} -\newlabel{algo:gpu_kernel2}{{17}{366}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.5}Chaos Evaluation of the Improved Version}{366}} -\@writefile{toc}{\contentsline {section}{\numberline {16.4}Experiments}{367}} -\newlabel{sec:experiments}{{16.4}{367}} -\@writefile{lof}{\contentsline {figure}{\numberline {16.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{368}} -\newlabel{fig:time_xorlike_gpu}{{16.1}{368}} -\@writefile{toc}{\contentsline {section}{Bibliography}{369}} +\newlabel{chapter18}{{16}{361}} +\@writefile{toc}{\contentsline {section}{\numberline {16.1}Introduction}{361}} +\@writefile{toc}{\contentsline {section}{\numberline {16.2}Basic Recalls}{363}} +\newlabel{section:BASIC RECALLS}{{16.2}{363}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.1}A Short Presentation of Chaos}{363}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.2}On Devaney's Definition of Chaos}{363}} +\newlabel{sec:dev}{{16.2.2}{363}} +\newlabel{Devaney}{{16.1}{363}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.3}Chaotic iterations}{364}} +\newlabel{subsection:Chaotic iterations}{{16.2.3}{364}} +\newlabel{Chaotic iterations}{{2}{364}} +\newlabel{eq:generalIC}{{16.4}{365}} +\newlabel{equation Oplus}{{16.5}{365}} +\@writefile{toc}{\contentsline {section}{\numberline {16.3}Toward Efficiency and Improvement for CI PRNG}{365}} +\newlabel{sec:efficient PRNG}{{16.3}{365}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{365}} +\newlabel{algo:seqCIPRNG}{{16.1}{365}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {16.1}C code of the sequential PRNG based on chaotic iterations}{365}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{366}} +\newlabel{sec:efficient PRNG gpu}{{16.3.2}{366}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.3}Naive Version for GPU}{366}} +\@writefile{loa}{\contentsline {algocf}{\numberline {16}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{367}} +\newlabel{algo:gpu_kernel}{{16}{367}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.4}Improved Version for GPU}{367}} +\newlabel{IR}{{17}{368}} +\@writefile{loa}{\contentsline {algocf}{\numberline {17}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{368}} +\newlabel{algo:gpu_kernel2}{{17}{368}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.5}Chaos Evaluation of the Improved Version}{368}} +\@writefile{toc}{\contentsline {section}{\numberline {16.4}Experiments}{369}} +\newlabel{sec:experiments}{{16.4}{369}} +\@writefile{lof}{\contentsline {figure}{\numberline {16.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{370}} +\newlabel{fig:time_xorlike_gpu}{{16.1}{370}} +\@writefile{toc}{\contentsline {section}{Bibliography}{371}} \@setckpt{Chapters/chapter18/ch18}{ -\setcounter{page}{371} +\setcounter{page}{373} \setcounter{equation}{5} \setcounter{enumi}{2} \setcounter{enumii}{0} diff --git a/BookGPU/Chapters/chapter3/biblio3.bib b/BookGPU/Chapters/chapter3/biblio3.bib index 927c841..56fda5a 100755 --- a/BookGPU/Chapters/chapter3/biblio3.bib +++ b/BookGPU/Chapters/chapter3/biblio3.bib @@ -471,4 +471,39 @@ volume={98}, number={3}, pages={493-504}, doi={10.1016/j.bpj.2009.10.037}, +} +@article{Sanchez-2-2012, +year={2012}, +issn={1939-8018}, +journal={Journal of Signal Processing Systems}, +doi={10.1007/s11265-012-0715-1}, +title={Highly Parallelable Bidimensional Median Filter for Modern Parallel Programming Models}, +url={http://dx.doi.org/10.1007/s11265-012-0715-1}, +publisher={Springer US}, +keywords={Nonlinear filters; Parallel algorithms; Image processing}, +author={Sánchez, RicardoM. and Rodríguez, PaulA.}, +pages={1-15}, +language={English} +} + +@inproceedings{Batcher:1968:SNA:1468075.1468121, + author = {Batcher, K. E.}, + title = {Sorting networks and their applications}, + booktitle = {Proceedings of the April 30--May 2, 1968, spring joint computer conference}, + series = {AFIPS '68 (Spring)}, + year = {1968}, + location = {Atlantic City, New Jersey}, + pages = {307--314}, + numpages = {8}, + url = {http://doi.acm.org/10.1145/1468075.1468121}, + doi = {10.1145/1468075.1468121}, + acmid = {1468121}, + publisher = {ACM}, + address = {New York, NY, USA}, +} +@book{cormen2001introduction, + title={Introduction to algorithms}, + author={Cormen, Thomas H and Leiserson, Charles E and Rivest, Ronald L and Stein, Clifford}, + year={2001}, + publisher={MIT press} } \ No newline at end of file diff --git a/BookGPU/Chapters/chapter3/ch3.aux b/BookGPU/Chapters/chapter3/ch3.aux index 0d1505e..cb0cf96 100644 --- a/BookGPU/Chapters/chapter3/ch3.aux +++ b/BookGPU/Chapters/chapter3/ch3.aux @@ -23,6 +23,7 @@ \@writefile{toc}{\contentsline {chapter}{\numberline {4}Implementing a fast median filter}{29}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} +\@writefile{toc}{\author{Gilles Perrot}{}} \@writefile{toc}{\contentsline {section}{\numberline {4.1}Introduction}{29}} \@writefile{toc}{\contentsline {section}{\numberline {4.2}Median filtering}{30}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.2.1}Basic principles}{30}} @@ -61,56 +62,60 @@ \newlabel{lst:kernelMedian3RegTri9}{{4.2}{36}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}3$\times $3 median filter kernel using one register per neighborhood pixel and bubble sort}{36}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Further optimization}{36}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs on GPU C2070 and CPU for generic median, in 3$\times $3 median register-only and \textit {libJacket}.\relax }}{37}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs on GPU C2070 and CPU for generic median, 3$\times $3 median register-only and \textit {libJacket}.\relax }}{37}} \newlabel{fig:compMedians1}{{4.4}{37}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{37}} -\newlabel{fig:forgetful_selection}{{4.5}{37}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count}{37}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{37}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{38}} +\newlabel{fig:forgetful_selection}{{4.5}{38}} \newlabel{lst:medianForget1pix3}{{4.3}{38}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method}{38}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{39}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{40}} -\newlabel{fig:median3_overlap}{{4.6}{40}} -\newlabel{lst:medianForget2pix3}{{4.4}{40}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{40}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{41}} -\newlabel{fig:compMedians2}{{4.7}{41}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{38}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{39}} +\newlabel{fig:forgetful3}{{4.6}{39}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{40}} +\newlabel{fig:median3_overlap}{{4.7}{40}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{40}} +\newlabel{lst:medianForget2pix3}{{4.4}{41}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{41}} \@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{41}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.\relax }}{42}} -\newlabel{fig:median5overlap}{{4.8}{42}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{42}} +\newlabel{fig:compMedians2}{{4.8}{42}} \newlabel{sec:median5}{{4.5.1}{42}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{42}} \newlabel{lst:medianForget2pix5}{{4.5}{42}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{42}} -\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{44}} -\newlabel{tab:median5comp}{{4.2}{44}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{44}} -\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{45}} -\newlabel{tab:medianSeparable}{{4.3}{45}} -\newlabel{lst:medianSeparable}{{4.6}{45}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{45}} -\newlabel{img:sap_example_ref}{{4.9(a)}{46}} -\newlabel{sub@img:sap_example_ref}{{(a)}{46}} -\newlabel{img:sap_example_sep_med3}{{4.9(b)}{46}} -\newlabel{sub@img:sap_example_sep_med3}{{(b)}{46}} -\newlabel{img:sap_example_sep_med5}{{4.9(c)}{46}} -\newlabel{sub@img:sap_example_sep_med5}{{(c)}{46}} -\newlabel{img:sap_example_sep_med3_it2}{{4.9(d)}{46}} -\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{46}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{46}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{46}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{46}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{46}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{46}} -\newlabel{fig:sap_examples2}{{4.9}{46}} -\@writefile{toc}{\contentsline {section}{Bibliography}{47}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.\relax }}{43}} +\newlabel{fig:median5overlap}{{4.9}{43}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.\relax }}{43}} +\newlabel{fig:median5overlap}{{4.10}{43}} +\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{45}} +\newlabel{tab:median5comp}{{4.2}{45}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{45}} +\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{46}} +\newlabel{tab:medianSeparable}{{4.3}{46}} +\newlabel{lst:medianSeparable}{{4.6}{46}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{46}} +\newlabel{img:sap_example_ref}{{4.11(a)}{47}} +\newlabel{sub@img:sap_example_ref}{{(a)}{47}} +\newlabel{img:sap_example_sep_med3}{{4.11(b)}{47}} +\newlabel{sub@img:sap_example_sep_med3}{{(b)}{47}} +\newlabel{img:sap_example_sep_med5}{{4.11(c)}{47}} +\newlabel{sub@img:sap_example_sep_med5}{{(c)}{47}} +\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{47}} +\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{47}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{47}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{47}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{47}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{47}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{47}} +\newlabel{fig:sap_examples2}{{4.11}{47}} +\@writefile{toc}{\contentsline {section}{Bibliography}{49}} \@setckpt{Chapters/chapter3/ch3}{ -\setcounter{page}{49} +\setcounter{page}{50} \setcounter{equation}{0} \setcounter{enumi}{3} \setcounter{enumii}{0} \setcounter{enumiii}{0} -\setcounter{enumiv}{9} +\setcounter{enumiv}{10} \setcounter{footnote}{0} \setcounter{mpfootnote}{0} \setcounter{part}{1} @@ -120,9 +125,9 @@ \setcounter{subsubsection}{0} \setcounter{paragraph}{0} \setcounter{subparagraph}{0} -\setcounter{figure}{9} +\setcounter{figure}{11} \setcounter{table}{3} -\setcounter{numauthors}{0} +\setcounter{numauthors}{1} \setcounter{parentequation}{0} \setcounter{subfigure}{0} \setcounter{lofdepth}{1} diff --git a/BookGPU/Chapters/chapter3/ch3.tex b/BookGPU/Chapters/chapter3/ch3.tex index e04a36c..94c3e97 100755 --- a/BookGPU/Chapters/chapter3/ch3.tex +++ b/BookGPU/Chapters/chapter3/ch3.tex @@ -109,9 +109,9 @@ It remains that global runtime can still be faster than similar processes run on Therefore, to fully optimize global runtimes, it is important to pay attention to how memory transfers are done. This leads us to propose, in the following section, an overall code structure to be used with all our kernel examples. -Obviously, our code originally accepts various image dimensions and can process color images. +Obviously, our code originally accepts various image dimensions and can process color images when an extrapolated definition of the median filter is choosen. However, so as to propose concise and more readable code, we will assume the following limitations: -8 or 16~bit-coded gray-level input images whose dimensions $H\times W$ are multiples of 512 pixels. +16~bit-coded gray-level input images whose dimensions $H\times W$ are multiples of 512 pixels. \begin{algorithm} \SetNlSty{}{}{:} @@ -131,18 +131,18 @@ However, so as to propose concise and more readable code, we will assume the fol \section{Data transfers, memory management.} This section deals with the following issues: \begin{enumerate} -\item Data transfer from CPU memory to GPU global memory: several GPU memory areas are available as destination memory but the 2-D caching mechanism of texture memory, specifically designed for fetching neighboring pixels, is currently the fastest way to fetch gray-level pixel values inside a kernel computation. This has lead us to choose \textbf{texture memory} as primary GPU memory area for images. -\item Data fetching from GPU global memory to kernel local memory: as said above, we use texture memory. Depending on which process is run, texture data is used either by direct fetching in kernel local memory or through a prefetching in shared memory. +\item Data transfer from CPU memory to GPU global memory: several GPU memory areas are available as destination memory but the 2-D caching mechanism of texture memory \index{memory~hierarchy!texture~memory}, specifically designed for fetching neighboring pixels, is currently the fastest way to fetch gray-level pixel values inside a kernel computation. This has led us to choose \textbf{texture memory} as primary GPU memory area for images. +\item Data fetching from GPU global memory to kernel local memory: as said above, we use texture memory \index{memory~hierarchy!texture~memory}. Depending on which process is run, texture data is used either by direct fetching in kernel local memory or through a prefetching \index{prefetching} in shared memory \index{memory~hierarchy!shared~memory}. \item Data outputting from kernels to GPU memory: there is actually no alternative to global memory, as kernels can not directly write into texture memory and as copying from texture to CPU memory would not be faster than from simple global memory. -\item Data transfer from GPU global memory to CPU memory: it can be drastically accelerated by use of \textbf{pinned memory}, keeping in mind it has to be used sparingly. +\item Data transfer from GPU global memory to CPU memory: it can be drastically accelerated by use of \textbf{pinned memory} \index{memory~hierarchy!pinned~memory}, keeping in mind it has to be used sparingly. \end{enumerate} -Algorithm \ref{algo:memcopy} summarizes all the above considerations and describe how data are handled in our examples. For more information on how to handle the different types of GPU memory, we suggest to refer to CUDA programmer's guide. +Algorithm \ref{algo:memcopy} summarizes all the above considerations and describes how data are handled in our examples. For more information on how to handle the different types of GPU memory, we suggest to refer to the CUDA programmer's guide. -At debug stage, for simplicity's sake, we use the \textbf{cutil} library supplied by the NVidia developpement kit (SDK). Thus, in order to easily implement our examples, we suggest readers download and install the latest NVidia-SDK (ours is SDK4.0), create a new directory \textit{SDK-root-dir/C/src/fast\_kernels} and adapt the generic \textit{Makefile} that can be found in each sub-directory of \textit{SDK-root-dir/C/src/}. Then, only two more files will be enough to have a fully operational environnement: \textit{main.cu} and \textit{fast\_kernels.cu}. +At debug stage, for simplicity's sake, we use the \textbf{cutil} \index{Cutil library} library supplied by the NVidia development kit (SDK). Thus, in order to easily implement our examples, we suggest readers to download and to install the latest NVidia-SDK (ours is SDK4.0), create a new directory \textit{SDK-root-dir/C/src/fast\_kernels} and adapt the generic \textit{Makefile} that can be found in each sub-directory of \textit{SDK-root-dir/C/src/}. Then, only two more files will be enough to have a fully operational environnement: \textit{main.cu} and \textit{fast\_kernels.cu}. Listings \ref{lst:main1}, \ref{lst:fkern1} and \ref{lst:mkfile} implement all the above considerations minimally, while remaining functional. The main file of Listing \ref{lst:main1} is a simplified version of our actual main file. -It has to be noticed that cutil functions \texttt{cutLoadPGMi} and \texttt{cutSavePGMi} only operate on unsigned integer data. As data is coded in short integer format for performance reasons, the use of these functions involves casting data after loading and before saving. This may be overcome by use of a different library. Actually, our choice was to modify the above mentioned cutil functions. +It has to be noticed that cutil functions \texttt{cutLoadPGMi} \index{Cutil library!cutLoadPGMi} and \texttt{cutSavePGMi} \index{Cutil library!cutSavePGMi} only operate on unsigned integer data. As data is coded in short integer format for performance reasons, the use of these functions involves casting data after loading and before saving. This may be overcome by use of a different library. Actually, our choice was to modify the above mentioned cutil functions. Listing \ref{lst:fkern1} gives a minimal kernel skeleton that will serve as the basis for all other kernels. Lines 5 and 6 determine the coordinates $(i, j)$ of the pixel to be processed, each pixel being associated to one thread. The instruction in line 8 combines writing the output gray-level value into global memory and fetching the input gray-level value from 2-D texture memory. @@ -156,9 +156,9 @@ The Makefile given in Listing \ref{lst:mkfile} shows how to adapt examples given \section{Performance measurements} -As our goal is to design very fast implementations of basic image processing algorithms, we need to make quite accurate time-measurements, within the order of magnitude of $0.01~ms$. Again, the easiest way of doing so is to use the helper functions of the cutil library. As usual, as the durations we are measuring are short and possibly suject to non neglectable variations, a good practice is to measure multiple executions and issue the mean runtime. All time results given in this chapter have been obtained through 1000 calls to each kernel. +As our goal is to design very fast implementations of basic image processing algorithms, we need to make quite accurate time-measurements, within the order of magnitude of $0.01~ms$. Again, the easiest way of doing so is to use the helper functions of the cutil library. As usual, as the durations we are measuring are short and possibly subject to non neglectable variations, a good practice is to measure multiple executions and issue the mean runtime. All time results given in this chapter have been obtained through 1000 calls to each kernel. -Listing \ref{lst:chronos} shows how to use the dedicated cutil functions. Timer declaration and creation only need to be performed once while reset, start and stop can be used as often as necessary. Synchronization is mandatory before stopping the timer (Line 7), to avoid runtime measurement being biased. +Listing \ref{lst:chronos} shows how to use the dedicated cutil functions \index{Cutil library!Timer usage}. Timer declaration and creation only need to be performed once while reset, start and stop can be used as often as necessary. Synchronization is mandatory before stopping the timer (Line 7), to avoid runtime measurement being biased. \lstinputlisting[label={lst:chronos},caption=Time measurement technique using cutil functions]{Chapters/chapter3/code/exChronos.cu} In an attempt to provide relevant speedup values, we either implemented CPU versions of the algorithms studied, or used the values found in existing literature. Still, the large number and diversity of hardware platforms and GPU cards makes it impossible to benchmark every possible combination and significant differences may occur between the speedups we announce and those obtained with different devices. As a reference, our developing platform details as follows: @@ -181,15 +181,15 @@ Last, like many authors, we chose to use the pixel throughput value of each proc In order to estimate the potential for improvement of each kernel, a reference throughput measurement, involving identity kernel of Listing \ref{lst:fkern1}, was performed. As this kernel only fetches input values from texture memory and outputs them to global memory without doing any computation, it represents the smallest, thus fastest, possible process and is taken as the reference throughput value (100\%). The same measurement was performed on CPU, with a maximum effective pixel throughput of 130~Mpixel per second. On GPU, depending on grid parameters it amounts to 800~MPixels/s on GTX280 and 1300~Mpixels/s on C2070. - \chapter{Implementing a fast median filter} +\chapterauthor{Gilles Perrot}{FEMTO-ST Institute} \section{Introduction} Median filtering is a well-known method used in a wide range of application frameworks as well as a standalone filter especially for \textit{salt and pepper} denoising. It is able to highly reduce power of noise without blurring edges too much. First introduced by Tukey in \cite{tukey77}, it has been widely studied since then, and many researchers have proposed efficient implementations of it, adapted to various hypothesis, architectures and processors. -Originally, its main drawbacks were its compute complexity, its non linearity and its data-dependent runtime. Several researchers have adress these issues and designed, for example, efficient histogram-based median filter with predictible runtime \cite{Huang:1981:TDS:539567, Weiss:2006:FMB:1179352.1141918}. +Originally, its main drawbacks were its compute complexity, its non linearity and its data-dependent runtime. Several researchers have addressed these issues and designed, for example, efficient histogram-based median filter with predictible runtimes \cite{Huang:1981:TDS:539567, Weiss:2006:FMB:1179352.1141918}. -More recently, the advent of GPUs opened new perspectives in terms of image processing performance, and some researchers managed to take advantage of the new graphic capabilities: in that respect, we can cite the Branchless Vectorized Median filter (BVM) \cite{5402362, chen09} which allows very interesting runtimes on CUDA-enabled devices but, as far as we know, the fastest implementation to date is the histogram-based CCDS median filter \cite{6288187}. +More recently, the advent of GPUs opened new perspectives in terms of image processing performance, and some researchers managed to take advantage of the new graphic capabilities: in that respect, we can cite the Branchless Vectorized Median filter (BVM) \cite{5402362, chen09} which allows very interesting runtimes on CUDA-enabled devices but, as far as we know, the fastest implementation to date is the histogram-based PCMF median filter \cite{Sanchez-2-2012}. Some of the following implementations feature very fast runtimes; They are targeted on Nvidia Tesla GPU (Fermi architecture, compute capability 2.x) but may easily be adapted to other models e.g. those of compute capability 1.3. @@ -213,9 +213,9 @@ Figure \ref{fig:sap_examples} shows an example of a $512\times 512$ pixel image, As mentioned earlier, the selection of the median value can be performed by more than one technique, using either histogram-based or sorting methods, each of them having its own benefits and drawbacks as will be discussed further down. \subsection{A naive implementation} -As a reference, Listing \ref{lst:medianGeneric} gives a simple, not to say simplistic implementation of a CUDA kernel (\texttt{kernel\_medianR}) achieving generic $n\times n$ histogram-based median filtering. Its runtime has a very low data dependency, but this implementation does not suit very well GPU architecture. Each pixel loads the whole of its $n\times n$ neighborhood meaning that one pixel is loaded multiple times inside one single thread block, and above all, the use of a local vector (histogram[]) considerably downgrades performance, as the compiler automatically stores such vectors in local memory (slow). +As a reference, Listing \ref{lst:medianGeneric} gives a simple, not to say simplistic implementation of a CUDA kernel (\texttt{kernel\_medianR}) achieving generic $n\times n$ histogram-based median filtering. Its runtime has a very low data dependency, but this implementation does not suit very well GPU architecture. Each pixel loads the whole of its $n\times n$ neighborhood meaning that one pixel is loaded multiple times inside one single thread block, and above all, the use of a local vector (histogram[]) considerably downgrades performance, as the compiler automatically stores such vectors in local memory (slow) \index{memory~hierarchy!local~memory}. -Table \ref{tab:medianHisto1} displays measured runtimes of \texttt{kernel\_medianR} and pixel throughputs for each GPU version and for both CPU and GPU implementations. Usual window sizes of $3\times 3$, $5\times 5$ and $7\times 7$ are shown. Though some specific applications require larger window sizes and dedicated algorithms , such small square window sizes are most widely used in general purpose image processing. GPU runtimes have been obtained with a grid of 64-thread blocks. This block size, is a good compromise in this case. +Table \ref{tab:medianHisto1} displays measured runtimes of \texttt{kernel\_medianR} and pixel throughputs for each GPU version and for both CPU and GPU implementations. Usual window sizes of $3\times 3$, $5\times 5$ and $7\times 7$ are shown. Though some specific applications require larger window sizes and dedicated algorithms, such small square window sizes are most widely used in general purpose image processing. GPU runtimes have been obtained with a grid of 64-thread blocks. This block size, is a good compromise in this case. The first observation to make when analysing results of Table \ref{tab:medianHisto1} is that, on CPU, window size has almost no influence on the effective pixel throughput. Since inner loops that fill the histogram vector contain very few fetching instructions (from 9 to 49, depending on the window size), it is not surprising to note their neglectable impact compared to outer loops that fetch image pixels (from 256k to 16M instructions). @@ -292,35 +292,35 @@ copy data from GPU global memory to CPU memory\label{algoMedianGeneric:memcpyD2H \section{NVidia GPU tuning recipes} When designing GPU code, besides thinking of the actual data computing process, one must choose the memory type into which to store temporary data. Three types of GPU memory are available: \begin{enumerate} -\item \textbf{Global memory, the most versatile:}\\Offers the largest storing space and global scope but is slowest (400 cycles latency). \textbf{Texture memory} is physically included into it, but allows access through an efficient 2-D caching mechanism. -\item \textbf{Registers, the fastest:}\\Allow access wihtout latency, but only 63 registers are available per thread (thread scope), with a maximum of 32K per Symetric Multiprocessor (SM). -\item \textbf{Shared memory, a complex compromise:}\\All threads in one block can access 48~KBytes of shared memory, which is faster than global memory (20 cycles latency) but slower than registers. +\item \textbf{Global memory, the most versatile:} \index{memory~hierarchy!global~memory}\\Offers the largest storing space and global scope but is slowest (400 cycles latency). \textbf{Texture memory} is physically included into it, but allows access through an efficient 2-D caching mechanism. +\item \textbf{Registers, the fastest:} \index{memory~hierarchy!registers}\\Allow access wihtout latency, but only 63 registers are available per thread (thread scope), with a maximum of 32K per Symetric Multiprocessor (SM) \index{register count}w. +\item \textbf{Shared memory, a complex compromise:} \index{memory~hierarchy!shared~memory}\\All threads in one block can access 48~KBytes of shared memory, which is faster than global memory (20 cycles latency) but slower than registers. However, bank conflicts can occur if two threads of a warp try to access data stored in one single memory bank. In such cases, the parallel process is re-serialized which may cause significant performance decrease. One easy way to avoid it is to ensure that two consecutive threads in one block always access 32-bit data at two consecutive adresses. \end{enumerate} \noindent As observed earlier, designing a median filter GPU implementation using only global memory is fairly straightforward, but its performance remains quite low even if it is faster than CPU. -To overcome this, the most frequent choice made in efficient implementations found in literature is to use shared memory. Such option implies prefetching data prior to doing the actual computations, a relevant choice, as each pixel of an image belongs to n$\times$n different neighborhoods. Thus, it can be expected that fetching each gray-level value from global memory only once should be more efficient than doing it each time it is required. One of the most efficient implementations using shared memory is presented in \cite{5402362}. In the case of the generic kernel of Listing \ref{lst:medianGeneric}, using shared memory without further optimization would not bring valuable speedup because that would just move redundancy from texture to shared memory fetching and would generate bank conflicts. For information, we wrote such a version of the generic median kernel and our measurements showed a speedup of around 3\% (as an example: 32ms for 5$\times$5 median on a 1024$^2$ pixel image, i.e. 33~Mpixel/s ). +To overcome this, the most frequent choice made in efficient implementations found in literature is to use shared memory. Such option implies prefetching \index{prefetching}data prior to doing the actual computations, a relevant choice, as each pixel of an image belongs to n$\times$n different neighborhoods. Thus, it can be expected that fetching each gray-level value from global memory only once should be more efficient than doing it each time it is required. One of the most efficient implementations using shared memory is presented in \cite{5402362}. In the case of the generic kernel of Listing \ref{lst:medianGeneric}, using shared memory without further optimization would not bring valuable speedup because that would just move redundancy from texture to shared memory fetching and would generate bank conflicts. For information, we wrote such a version of the generic median kernel and our measurements showed a speedup of around 3\% (as an example: 32ms for 5$\times$5 median on a 1024$^2$ pixel image, i.e. 33~Mpixel/s ). -As for registers, designing a generic median filter that would only use that type of memory seems difficult, due to the above mentioned 63 register-per-thread limitation. +As for registers, designing a generic median filter that would only use that type of memory seems difficult, due to the above mentioned 63 register-per-thread limitation \index{register count}. Yet, nothing forbids us to design fixed-size filters, each of them specific to one of the most popular window sizes. It might be worth the effort as dramatic increase in performance could be expected. -Another track to follow in order to improve performance of GPU implementations consists in hiding latencies generated by arithmetic instruction calls and memory accesses. Both can be partially hidden by introducing Instruction-Level Parallelism (ILP) and by increasing the data count output by each thread. Though such techniques may seem to break the NVidia occupancy paradigm, they can lead to dramatically higher data throughput values. +Another track to follow in order to improve performance of GPU implementations consists in hiding latencies generated by arithmetic instruction calls and memory accesses. Both can be partially hidden by introducing Instruction-Level Parallelism \index{Instruction-Level Parallelism}(ILP) and by increasing the data count output by each thread. Though such techniques may seem to break the NVidia occupancy paradigm, they can lead to dramatically higher data throughput values. The following sections illustrate these ideas and detail the design of the fastest CUDA median filter known to date. \section{A 3$\times$3 median filter: using registers } Designing a median filter dedicated to the smallest possible square window size is a good challenge to start using registers. -One first issue is that the exclusive use of registers forbids us to implement a naive histogram-based method. In a \textit{8-bit gray level pixel per thread} rule, each histogram requires one 256-element vector to store its values, i.e. four times the maximum register count allowed per thread (63). Considering that a 3$\times$3 median filter involves only 9 pixel values per thread, it seem obvious they can be sorted within the 63-register limit. +One first issue is that the exclusive use of registers forbids us to implement a naive histogram-based method. In a \textit{8-bit gray level pixel per thread} rule, each histogram requires one 256-element vector to store its values, i.e. four times the maximum register count allowed per thread (63)\index{register count}. Considering that a 3$\times$3 median filter involves only 9 pixel values per thread, it seem obvious they can be sorted within the 63-register limit. \subsection{The simplest way} -In the case of a 3$\times$3 median filter, the simplest solution consists in associating one register to each gray-level value, then sorting those 9 values and selecting the fifth one, i.e. the median value. For such a small amount of data to sort, a simple selection method is well indicated. As shown in Listing \ref{lst:kernelMedian3RegTri9} (\texttt{kernelMedian3RegTri9()}), the constraint of only using registers leads to adopt an unusual manner of coding. However, results are persuasive: runtimes are divided by around 120 on GTX280 and 80 on C2070, while only reduced by a 3.5 factor on CPU. -The diagram of Figure \ref{fig:compMedians1} summarizes these first results. Only C2070 throughputs are shown and compared to CPU results. We included the maximum effective pixel throughput in order to see the improvement potential of the different implementations. We also introduced throughputd achieved by \textit{libJacket}, a commercial implementation, as it was the fastest known implementation of a 3$\times$3 median filter to date, as illustrated in \cite{chen09}. One of the authors of libJacket kindly posted the CUDA code of its 3$\times$3 median filter, that we inserted into our own coding structure. The algorithm itself is quite similar to ours, but running it in our own environement produced higher throughput values than those published in \cite{chen09}, not due to different hardware capabilities between our GTX280 and the GTX260 used in the paper, but to the way we perform memory transfers and to our register-only method of storing temporary data. +In the case of a 3$\times$3 median filter, the simplest solution consists in associating one register to each gray-level value, then sorting those 9 values and selecting the fifth one, i.e. the median value. For such a small amount of data to sort, a simple selection method is well indicated. As shown in Listing \ref{lst:kernelMedian3RegTri9} (\texttt{kernelMedian3RegSort9()}), the constraint of only using registers leads to adopt an unusual manner of coding. However, results are persuasive: runtimes are divided by around 120 on GTX280 and 80 on C2070, while only reduced by a 3.5 factor on CPU. +The diagram of Figure \ref{fig:compMedians1} summarizes these first results, obtained with a block size of 128 threads on GTX280 and 256 on C2070. Only C2070 throughputs are shown and compared to CPU results. We included the maximum effective pixel throughput in order to see the improvement potential of the different implementations. We also introduced throughputd achieved by \textit{libJacket}, a commercial implementation, as it was the fastest known implementation of a 3$\times$3 median filter to date, as illustrated in \cite{chen09}. One of the authors of libJacket kindly posted the CUDA code of its 3$\times$3 median filter, that we inserted into our own coding structure. The algorithm itself is quite similar to ours, but running it in our own environement produced higher throughput values than those published in \cite{chen09}, not due to different hardware capabilities between our GTX280 and the GTX260 used in the paper, but to the way we perform memory transfers and to our register-only method of storing temporary data. \lstinputlisting[label={lst:kernelMedian3RegTri9},caption= 3$\times$3 median filter kernel using one register per neighborhood pixel and bubble sort]{Chapters/chapter3/code/kernMedianRegTri9.cu} \begin{figure} \centering - \includegraphics[width=11cm]{Chapters/chapter3/img/debitPlot1.pdf} - \caption{Comparison of pixel throughputs on GPU C2070 and CPU for generic median, in 3$\times$3 median register-only and \textit{libJacket}.} + \includegraphics[width=15cm]{Chapters/chapter3/img/debitPlot1.pdf} + \caption{Comparison of pixel throughputs on GPU C2070 and CPU for generic median, 3$\times$3 median register-only and \textit{libJacket}.} \label{fig:compMedians1} \end{figure} @@ -332,8 +332,8 @@ Running the above register-only 3$\times$3 median filter through the NVidia CUDA \end{itemize} -\subsubsection{Reducing register count} -Our current kernel (\texttt{kernelMedian3RegTri9}) uses one register per gray-level value, which amounts to 9 registers for the entire 3$\times$3 window. +\subsubsection{Reducing register count \index{register count}} +Our current kernel (\texttt{kernelMedian3RegSort9}) uses one register per gray-level value, which amounts to 9 registers for the entire 3$\times$3 window. This count can be reduced by use of an iterative sorting process called \textit{forgetful selection}, where both \textit{extrema} are eliminated at each sorting stage, until only 3 elements remain. The question is to find out the minimal register count $k_{n^2}$ that allows the selection of the median amoung $n^2$ values. The answer can be evaluated considering that, when eliminating the maximum and the minimum values, one has to make sure not to eliminate the global median value. Such a situation is illustrated in Figure \ref{fig:forgetful_selection} for a 3$\times$3 median filter. For better comprehension, the 9 elements of the 3$\times$3 pixel window have been represented in a row. \begin{figure} \centering @@ -353,11 +353,19 @@ which leads to: $$k_{n^2}=\lceil \frac{n^2}{2}\rceil+1 $$ This rule can be applied to the first eliminating stage and remains true with the next ones as each stage suppresses exactly two values, one above and one below the median value. In our 3$\times$3 pixel window example, the minimum register count becomes $k_9=\lceil 9/2\rceil+1 = 6$. +This iterative process is illustrated in Figure \ref{fig:forgetful3}, where it achieves one entire $3\times 3$ median selection, beginning with $k_9=6$ elements. +\begin{figure} + \centering + \includegraphics[width=5cm]{Chapters/chapter3/img/forgetful_selectionb.png} + \caption{Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.} + \label{fig:forgetful3} +\end{figure} + The \textit{forgetful selection} method, used in \cite{mcguire2008median} does not imply full sorting of values, but only selecting minimum and maximum values, which, at the price of a few iteration steps ($n^2-k$), reduces arithmetic complexity. -Listing \ref{lst:medianForget1pix3} details this process where forgetful selection is achieved by use of simple 2-value sorting function ($s()$, lines 1 to 5) that swaps input values if necessary. Moreover, whenever possible, in order to increase the Instruction-Level Parallelism, successive calls to $s()$ are done with independant elements as arguments. This is illustrated by the macro definitions of lines 7 to 14. +Listing \ref{lst:medianForget1pix3} details this process where forgetful selection is achieved by use of simple 2-value swapping function ($s()$, lines 1 to 5) that swaps input values if necessary, so as to achieve the first steps of a \textit{bitonic}-like sorting network (\cite{Batcher:1968:SNA:1468075.1468121}). Moreover, whenever possible, in order to increase the Instruction-Level Parallelism \index{Instruction-Level Parallelism}, successive calls to $s()$ are done with independant elements as arguments. This is illustrated by the macro definitions of lines 7 to 14 and by Figure \ref{fig:bitonic} which details the first iteration of the $5\times 5$ selection, starting with $k_{25}=14$ elements. -\lstinputlisting[label={lst:medianForget1pix3},caption= 3$\times$3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method]{Chapters/chapter3/code/kernMedianForget1pix3.cu} +\lstinputlisting[label={lst:medianForget1pix3},caption= 3$\times$3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070. ]{Chapters/chapter3/code/kernMedianForget1pix3.cu} Our such modified kernel provides significantly improved runtimes: a speedup of around 16\% is obtained, and pixel throughput reaches around 1000~MPixel/s on C2070. @@ -379,7 +387,7 @@ Running this $3\times 3$ kernel saves another 10\% runtime, as shown in Figure \ \begin{figure} \centering - \includegraphics[width=11cm]{Chapters/chapter3/img/debitPlot2.pdf} + \includegraphics[width=15cm]{Chapters/chapter3/img/debitPlot2.pdf} \caption{Comparison of pixel throughput on GPU C2070 for the different 3$\times$3 median kernels.} \label{fig:compMedians2} \end{figure} @@ -392,11 +400,18 @@ The next two sections will first detail the particular case of the 5$\times$5 me The minimum register count required to apply the forgetful selection method to a 5$\times$5 median filter is $k_{25}=\lceil 25/2\rceil+1 = 14$. Moreover, two adjacent overlapping windows share 20 pixels ($n^2-one\_column$) so that, when processing 2 pixels simultaneously, a count of 7 common selection stages can be carried out from the first selection stage with 14 common values to the processing of the last common value. That allows to limit register count to 22 per thread. Figure \ref{fig:median5overlap} describes the distribution of overlapping pixels, implemented in Listing \ref{lst:medianForget2pix5}: common selection stages take place from line 25 to line 37, while the remaining separate selection stages occur between lines 45 and 62 after the separation of line 40. \begin{figure} \centering - \includegraphics[width=6cm]{Chapters/chapter3/img/median5_overlap.png} + \includegraphics[width=6cm]{Chapters/chapter3/img/median5_overlap4.png} \caption{Reducing register count in a 5$\times$5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.} \label{fig:median5overlap} \end{figure} +\begin{figure} + \centering + \includegraphics[width=6cm]{Chapters/chapter3/img/forgetful_selection4.png} + \caption{First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.} + \label{fig:median5overlap} +\end{figure} + \lstinputlisting[label={lst:medianForget2pix5},caption=kernel 5$\times$5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.]{Chapters/chapter3/code/kernMedian2pix5.cu} Timing results follow the same variations with image size as in previously presented kernels. That is why Table \ref{tab:median5comp} shows only throughput values obtained for C2070 card and 4096$\times$4096 pixel image. diff --git a/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu b/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu index 363b181..29922a3 100755 --- a/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu +++ b/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu @@ -1,4 +1,4 @@ -__global__ void kernel_Median3RegTri9( short *output, +__global__ void kernel_Median3RegSort9( short *output, int i_dim, int j_dim) { int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; diff --git a/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu~ b/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu~ index cd173eb..363b181 100755 --- a/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu~ +++ b/BookGPU/Chapters/chapter3/code/kernMedianRegTri9.cu~ @@ -1,19 +1,11 @@ -__global__ void kernel_Median3RegTri9( short *output, int i_dim, int j_dim) +__global__ void kernel_Median3RegTri9( short *output, + int i_dim, int j_dim) { - - // coordonnees absolues du point int j = __mul24(blockIdx.x,blockDim.x) + threadIdx.x ; int i = __mul24(blockIdx.y,blockDim.y) + threadIdx.y ; - - /************************************************************************** - * tri(s) - **************************************************************************/ - int a0, a1, a2, a3, a4, a5, a6, a7, a8 ; + int a0, a1, a2, a3, a4, a5, a6, a7, a8 ; // 1 register per pixel - /******************************************************************************** - * affectation des valeurs - ********************************************************************************/ - a0 = tex2D(tex_img_ins, j-1, i-1) ; + a0 = tex2D(tex_img_ins, j-1, i-1) ; // fetching values a1 = tex2D(tex_img_ins, j , i-1) ; a2 = tex2D(tex_img_ins, j+1, i-1) ; a3 = tex2D(tex_img_ins, j-1, i) ; @@ -23,10 +15,7 @@ __global__ void kernel_Median3RegTri9( short *output, int i_dim, int j_dim) a7 = tex2D(tex_img_ins, j , i+1) ; a8 = tex2D(tex_img_ins, j+1, i+1) ; - //tri selection - bubTriReg9(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8); + bubReg9(&a0,&a1,&a2,&a3,&a4,&a5,&a6,&a7,&a8); // bubble sort - //median au milieu ! - output[ __mul24(i, j_dim) +j ] = a4 ; - -} + output[ __mul24(i, j_dim) +j ] = a4 ; // median at the middle + } diff --git a/BookGPU/Chapters/chapter3/code/kernMedianSeparable.cu b/BookGPU/Chapters/chapter3/code/kernMedianSeparable.cu index f4ad2c6..bdf7023 100755 --- a/BookGPU/Chapters/chapter3/code/kernMedianSeparable.cu +++ b/BookGPU/Chapters/chapter3/code/kernMedianSeparable.cu @@ -1,7 +1,7 @@ __global__ void kernel_medianV_sh( short *output, int i_dim, int j_dim, int r) { - int idc, val, min, max, inf, egal, sup, mxinf, minsup, estim ; + int idc, val, min, max, inf, equal, sup, mxinf, minsup, estim ; //coordinates in the block int ib = threadIdx.y ; @@ -40,7 +40,7 @@ __global__ void kernel_medianV_sh( short *output, int i_dim, int j_dim, int r) while (1) { estim = (min+max)/2 ; - inf = sup = egal = 0 ; + inf = sup = equal = 0 ; mxinf = min ; minsup= max ; for (idc =0; idc< 2*r+1 ; idc++) @@ -54,7 +54,7 @@ __global__ void kernel_medianV_sh( short *output, int i_dim, int j_dim, int r) { sup++; if( val < minsup) minsup = val ; - } else egal++ ; + } else equal++ ; } if ( (inf <= (r+1))&&(sup <=(r+1)) ) break ; else if (inf>sup) max = mxinf ; @@ -62,7 +62,7 @@ __global__ void kernel_medianV_sh( short *output, int i_dim, int j_dim, int r) } if ( inf >= r+1 ) val = mxinf ; - else if (inf+egal >= r+1) val = estim ; + else if (inf+equal >= r+1) val = estim ; else val = minsup ; output[ __mul24(j, i_dim) +i ] = val ; diff --git a/BookGPU/Chapters/chapter3/code/kernMedianSeparable.cu~ b/BookGPU/Chapters/chapter3/code/kernMedianSeparable.cu~ index 5c79c82..f4ad2c6 100755 --- a/BookGPU/Chapters/chapter3/code/kernMedianSeparable.cu~ +++ b/BookGPU/Chapters/chapter3/code/kernMedianSeparable.cu~ @@ -6,17 +6,14 @@ __global__ void kernel_medianV_sh( short *output, int i_dim, int j_dim, int r) //coordinates in the block int ib = threadIdx.y ; int jb = threadIdx.x ; - int idx_h = __mul24(ib+r,blockDim.x) + jb ; // index pixel deans shmem (bloc+halo) + int idx_h = __mul24(ib+r,blockDim.x) +jb; // base pixel index int offset = __mul24(blockDim.x,r) ; - // coordonnees absolues du point int j = __mul24(blockIdx.x,blockDim.x) + jb ; int i = __mul24(blockIdx.y,blockDim.y) + ib ; - extern __shared__ int buff[] ; - /*********************************************************************************** - * CHARGEMENT DATA EN SHARED MEM - ***********************************************************************************/ + // DATA PREFETCHING INTO SHARED MEM + extern __shared__ int buff[] ; buff[ idx_h ] = tex2D(tex_img_ins, j, i) ; if (ib < r) @@ -29,10 +26,8 @@ __global__ void kernel_medianV_sh( short *output, int i_dim, int j_dim, int r) } __syncthreads() ; - /********************************************************************************************** - * TRI VERTICAL par algo TORBEN MOGENSEN - * (a little bit slow but saves memory => faster !) - **********************************************************************************************/ + + // TORBEN MOGENSEN SORTING min = max = buff[ ib*blockDim.x +jb] ; for (idc= 0 ; idc< 2*r+1 ; idc++ ) diff --git a/BookGPU/Chapters/chapter3/img/debitPlot1.pdf b/BookGPU/Chapters/chapter3/img/debitPlot1.pdf index 204c562..9cd73d0 100644 Binary files a/BookGPU/Chapters/chapter3/img/debitPlot1.pdf and b/BookGPU/Chapters/chapter3/img/debitPlot1.pdf differ diff --git a/BookGPU/Chapters/chapter3/img/debitPlot2.pdf b/BookGPU/Chapters/chapter3/img/debitPlot2.pdf index 255472b..e0b3d0d 100644 Binary files a/BookGPU/Chapters/chapter3/img/debitPlot2.pdf and b/BookGPU/Chapters/chapter3/img/debitPlot2.pdf differ diff --git a/BookGPU/Chapters/chapter6/ch6.aux b/BookGPU/Chapters/chapter6/ch6.aux index ef830ac..2be0894 100644 --- a/BookGPU/Chapters/chapter6/ch6.aux +++ b/BookGPU/Chapters/chapter6/ch6.aux @@ -3,110 +3,110 @@ \@writefile{toc}{\author{Stephane Vialle}{}} \@writefile{toc}{\author{Jens Gustedt}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {6}Development methodologies for GPU and cluster of GPUs}{81}} +\@writefile{toc}{\contentsline {chapter}{\numberline {6}Development methodologies for GPU and cluster of GPUs}{83}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {section}{\numberline {6.1}Introduction}{82}} -\newlabel{ch6:intro}{{6.1}{82}} -\@writefile{toc}{\contentsline {section}{\numberline {6.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{82}} -\newlabel{ch6:part1}{{6.2}{82}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.1}Synchronous parallel algorithms on GPU clusters}{82}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{84}} -\newlabel{fig:ch6p1overlapnative}{{6.1}{84}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.2}Native overlap of CPU communications and GPU computations}{84}} -\newlabel{algo:ch6p1overlapnative}{{6.1}{85}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{85}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{86}} -\newlabel{fig:ch6p1overlapseqsequence}{{6.2}{86}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{86}} -\newlabel{algo:ch6p1overlapseqsequence}{{6.2}{87}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{87}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{88}} -\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{88}} -\newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{89}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{89}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{91}} -\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{91}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{91}} -\newlabel{algo:ch6p1overlapinterleaved}{{6.4}{92}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{92}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{94}} -\newlabel{ch6:p1expes}{{6.2.5}{94}} -\newlabel{ch6:p1block-cyclic}{{6.2.5}{94}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{95}} -\newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{95}} -\@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{96}} -\newlabel{ch6:part2}{{6.3}{96}} -\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{96}} -\newlabel{algo:ch6p2sync}{{3}{96}} -\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{96}} -\newlabel{algo:ch6p2async}{{4}{96}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{97}} -\newlabel{ch6:p2BasicAsync}{{6.3.1}{97}} -\newlabel{algo:ch6p2BasicAsync}{{6.5}{98}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{98}} -\newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{99}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{99}} -\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{100}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{100}} -\newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{101}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{101}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{102}} -\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{102}} -\newlabel{algo:ch6p2Sync}{{6.9}{103}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{103}} -\newlabel{algo:ch6p2SyncComp}{{6.10}{104}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{104}} -\newlabel{algo:ch6p2SyncReceptions}{{6.11}{105}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{105}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{106}} -\newlabel{ch6:p2GPUAsync}{{6.3.3}{106}} -\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{107}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{107}} -\newlabel{algo:ch6p2syncGPU}{{6.13}{109}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{109}} -\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{111}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{111}} -\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{112}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{112}} -\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{113}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{113}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{114}} -\newlabel{sec:ch6p2expes}{{6.3.4}{114}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{115}} -\newlabel{fig:ch6p2syncasync}{{6.6}{115}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{116}} -\newlabel{fig:ch6p2aux}{{6.7}{116}} -\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{117}} -\newlabel{sec:ch6p3unify}{{6.4}{117}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{117}} -\newlabel{sec:ch6p3resources}{{6.4.1}{117}} -\newlabel{algo:ch6p3ORWLresources}{{6.17}{118}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{118}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{118}} -\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{118}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{119}} -\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{119}} -\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{119}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{119}} -\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{120}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{120}} -\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{120}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{120}} -\newlabel{algo:ch6p3ORWLtrans}{{6.21}{120}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{120}} -\newlabel{algo:ch6p3ORWLdecl}{{6.22}{121}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{121}} -\newlabel{algo:ch6p3ORWLinit}{{6.23}{122}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{122}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{122}} -\newlabel{sec:ch6p3tasks}{{6.4.4}{122}} -\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{123}} -\newlabel{ch6:conclu}{{6.5}{123}} -\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{123}} -\@writefile{toc}{\contentsline {section}{Bibliography}{124}} +\@writefile{toc}{\contentsline {section}{\numberline {6.1}Introduction}{84}} +\newlabel{ch6:intro}{{6.1}{84}} +\@writefile{toc}{\contentsline {section}{\numberline {6.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{84}} +\newlabel{ch6:part1}{{6.2}{84}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.1}Synchronous parallel algorithms on GPU clusters}{84}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{86}} +\newlabel{fig:ch6p1overlapnative}{{6.1}{86}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.2}Native overlap of CPU communications and GPU computations}{86}} +\newlabel{algo:ch6p1overlapnative}{{6.1}{87}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{87}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{88}} +\newlabel{fig:ch6p1overlapseqsequence}{{6.2}{88}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{88}} +\newlabel{algo:ch6p1overlapseqsequence}{{6.2}{89}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{89}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{90}} +\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{90}} +\newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{91}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{91}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{93}} +\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{93}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{93}} +\newlabel{algo:ch6p1overlapinterleaved}{{6.4}{94}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{94}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{96}} +\newlabel{ch6:p1expes}{{6.2.5}{96}} +\newlabel{ch6:p1block-cyclic}{{6.2.5}{96}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{97}} +\newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{97}} +\@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{98}} +\newlabel{ch6:part2}{{6.3}{98}} +\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{98}} +\newlabel{algo:ch6p2sync}{{3}{98}} +\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{98}} +\newlabel{algo:ch6p2async}{{4}{98}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{99}} +\newlabel{ch6:p2BasicAsync}{{6.3.1}{99}} +\newlabel{algo:ch6p2BasicAsync}{{6.5}{100}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{100}} +\newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{101}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{101}} +\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{102}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{102}} +\newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{103}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{103}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{104}} +\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{104}} +\newlabel{algo:ch6p2Sync}{{6.9}{105}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{105}} +\newlabel{algo:ch6p2SyncComp}{{6.10}{106}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{106}} +\newlabel{algo:ch6p2SyncReceptions}{{6.11}{107}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{107}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{108}} +\newlabel{ch6:p2GPUAsync}{{6.3.3}{108}} +\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{109}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{109}} +\newlabel{algo:ch6p2syncGPU}{{6.13}{111}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{111}} +\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{113}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{113}} +\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{114}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{114}} +\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{115}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{115}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{116}} +\newlabel{sec:ch6p2expes}{{6.3.4}{116}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{117}} +\newlabel{fig:ch6p2syncasync}{{6.6}{117}} +\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{118}} +\newlabel{fig:ch6p2aux}{{6.7}{118}} +\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{119}} +\newlabel{sec:ch6p3unify}{{6.4}{119}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{119}} +\newlabel{sec:ch6p3resources}{{6.4.1}{119}} +\newlabel{algo:ch6p3ORWLresources}{{6.17}{120}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{120}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{120}} +\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{120}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{121}} +\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{121}} +\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{121}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{121}} +\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{122}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{122}} +\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{122}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{122}} +\newlabel{algo:ch6p3ORWLtrans}{{6.21}{122}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{122}} +\newlabel{algo:ch6p3ORWLdecl}{{6.22}{123}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{123}} +\newlabel{algo:ch6p3ORWLinit}{{6.23}{124}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{124}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{124}} +\newlabel{sec:ch6p3tasks}{{6.4.4}{124}} +\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{125}} +\newlabel{ch6:conclu}{{6.5}{125}} +\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{125}} +\@writefile{toc}{\contentsline {section}{Bibliography}{126}} \@setckpt{Chapters/chapter6/ch6}{ -\setcounter{page}{126} +\setcounter{page}{128} \setcounter{equation}{0} \setcounter{enumi}{4} \setcounter{enumii}{0}