From: Raphael Couturier Date: Wed, 24 Apr 2013 21:02:03 +0000 (+0200) Subject: new X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/078df8cd2b7dba79784d8e374afb7c5ee3dc4d58?hp=-c new --- 078df8cd2b7dba79784d8e374afb7c5ee3dc4d58 diff --git a/BookGPU/Chapters/chapter12/ch12.aux b/BookGPU/Chapters/chapter12/ch12.aux deleted file mode 100644 index 65b67a8..0000000 --- a/BookGPU/Chapters/chapter12/ch12.aux +++ /dev/null @@ -1,123 +0,0 @@ -\relax -\@writefile{toc}{\author{Lilia Ziane Khodja}{}} -\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} -\@writefile{toc}{\author{Jacques Bahi}{}} -\@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {12}Solving sparse linear systems with GMRES and CG methods on GPU clusters}{291}} -\@writefile{lof}{\addvspace {10\p@ }} -\@writefile{lot}{\addvspace {10\p@ }} -\newlabel{ch12}{{12}{291}} -\@writefile{toc}{\contentsline {section}{\numberline {12.1}Introduction}{291}} -\newlabel{ch12:sec:01}{{12.1}{291}} -\@writefile{toc}{\contentsline {section}{\numberline {12.2}Krylov iterative methods}{292}} -\newlabel{ch12:sec:02}{{12.2}{292}} -\newlabel{ch12:eq:01}{{12.1}{292}} -\newlabel{ch12:eq:02}{{12.2}{292}} -\newlabel{ch12:eq:03}{{12.3}{292}} -\newlabel{ch12:eq:11}{{12.4}{293}} -\@writefile{toc}{\contentsline {subsection}{\numberline {12.2.1}CG method}{293}} -\newlabel{ch12:sec:02.01}{{12.2.1}{293}} -\newlabel{ch12:eq:04}{{12.5}{293}} -\newlabel{ch12:eq:05}{{12.6}{293}} -\newlabel{ch12:eq:06}{{12.7}{293}} -\newlabel{ch12:eq:07}{{12.8}{293}} -\newlabel{ch12:eq:08}{{12.9}{293}} -\newlabel{ch12:eq:09}{{12.10}{293}} -\@writefile{loa}{\contentsline {algocf}{\numberline {12}{\ignorespaces Left-preconditioned CG method\relax }}{294}} -\newlabel{ch12:alg:01}{{12}{294}} -\newlabel{ch12:eq:10}{{12.11}{294}} -\@writefile{toc}{\contentsline {subsection}{\numberline {12.2.2}GMRES method}{295}} -\newlabel{ch12:sec:02.02}{{12.2.2}{295}} -\newlabel{ch12:eq:12}{{12.12}{295}} -\newlabel{ch12:eq:13}{{12.13}{295}} -\newlabel{ch12:eq:14}{{12.14}{295}} -\newlabel{ch12:eq:15}{{12.15}{295}} -\newlabel{ch12:eq:16}{{12.16}{295}} -\newlabel{ch12:eq:17}{{12.17}{295}} -\newlabel{ch12:eq:18}{{12.18}{295}} -\newlabel{ch12:eq:19}{{12.19}{295}} -\@writefile{loa}{\contentsline {algocf}{\numberline {13}{\ignorespaces Left-preconditioned GMRES method with restarts\relax }}{296}} -\newlabel{ch12:alg:02}{{13}{296}} -\@writefile{toc}{\contentsline {section}{\numberline {12.3}Parallel implementation on a GPU cluster}{297}} -\newlabel{ch12:sec:03}{{12.3}{297}} -\@writefile{toc}{\contentsline {subsection}{\numberline {12.3.1}Data partitioning}{297}} -\newlabel{ch12:sec:03.01}{{12.3.1}{297}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.1}{\ignorespaces A data partitioning of the sparse matrix $A$, the solution vector $x$ and the right-hand side $b$ into four portions.\relax }}{298}} -\newlabel{ch12:fig:01}{{12.1}{298}} -\@writefile{toc}{\contentsline {subsection}{\numberline {12.3.2}GPU computing}{298}} -\newlabel{ch12:sec:03.02}{{12.3.2}{298}} -\@writefile{toc}{\contentsline {subsection}{\numberline {12.3.3}Data communications}{299}} -\newlabel{ch12:sec:03.03}{{12.3.3}{299}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.2}{\ignorespaces Data exchanges between \textit {Node 1} and its neighbors \textit {Node 0}, \textit {Node 2} and \textit {Node 3}.\relax }}{300}} -\newlabel{ch12:fig:02}{{12.2}{300}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.3}{\ignorespaces Columns reordering of a sparse sub-matrix.\relax }}{301}} -\newlabel{ch12:fig:03}{{12.3}{301}} -\@writefile{toc}{\contentsline {section}{\numberline {12.4}Experimental results}{302}} -\newlabel{ch12:sec:04}{{12.4}{302}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.4}{\ignorespaces General scheme of the GPU cluster of tests composed of six machines, each with two GPUs.\relax }}{302}} -\newlabel{ch12:fig:04}{{12.4}{302}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.5}{\ignorespaces Sketches of sparse matrices chosen from the Davis collection.\relax }}{303}} -\newlabel{ch12:fig:05}{{12.5}{303}} -\@writefile{lot}{\contentsline {table}{\numberline {12.1}{\ignorespaces Main characteristics of sparse matrices chosen from the Davis collection.\relax }}{303}} -\newlabel{ch12:tab:01}{{12.1}{303}} -\@writefile{lot}{\contentsline {table}{\numberline {12.2}{\ignorespaces Performances of the parallel CG method on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{304}} -\newlabel{ch12:tab:02}{{12.2}{304}} -\@writefile{lot}{\contentsline {table}{\numberline {12.3}{\ignorespaces Performances of the parallel GMRES method on a cluster 24 CPU cores vs. on cluster of 12 GPUs.\relax }}{304}} -\newlabel{ch12:tab:03}{{12.3}{304}} -\newlabel{ch12:eq:20}{{12.20}{305}} -\@writefile{lof}{\contentsline {figure}{\numberline {12.6}{\ignorespaces Parallel generation of a large sparse matrix by four computing nodes.\relax }}{306}} -\newlabel{ch12:fig:06}{{12.6}{306}} -\@writefile{lot}{\contentsline {table}{\numberline {12.4}{\ignorespaces Main characteristics of sparse banded matrices generated from those of the Davis collection.\relax }}{306}} -\newlabel{ch12:tab:04}{{12.4}{306}} -\@writefile{lot}{\contentsline {table}{\numberline {12.5}{\ignorespaces Performances of the parallel CG method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{307}} -\newlabel{ch12:tab:05}{{12.5}{307}} -\@writefile{toc}{\contentsline {section}{\numberline {12.5}Conclusion}{307}} -\newlabel{ch12:sec:05}{{12.5}{307}} -\@writefile{lot}{\contentsline {table}{\numberline {12.6}{\ignorespaces Performances of the parallel GMRES method for solving linear systems associated to sparse banded matrices on a cluster of 24 CPU cores vs. on a cluster of 12 GPUs.\relax }}{308}} -\newlabel{ch12:tab:06}{{12.6}{308}} -\@writefile{toc}{\contentsline {section}{Bibliography}{308}} -\@setckpt{Chapters/chapter12/ch12}{ -\setcounter{page}{310} -\setcounter{equation}{22} -\setcounter{enumi}{2} -\setcounter{enumii}{0} -\setcounter{enumiii}{0} -\setcounter{enumiv}{10} -\setcounter{footnote}{0} -\setcounter{mpfootnote}{0} -\setcounter{part}{5} -\setcounter{chapter}{12} -\setcounter{section}{5} -\setcounter{subsection}{0} -\setcounter{subsubsection}{0} -\setcounter{paragraph}{0} -\setcounter{subparagraph}{0} -\setcounter{figure}{6} -\setcounter{table}{6} -\setcounter{numauthors}{0} -\setcounter{parentequation}{46} -\setcounter{subfigure}{0} -\setcounter{lofdepth}{1} -\setcounter{subtable}{0} -\setcounter{lotdepth}{1} -\setcounter{lstnumber}{50} -\setcounter{ContinuedFloat}{0} -\setcounter{AlgoLine}{29} -\setcounter{algocfline}{13} -\setcounter{algocfproc}{13} -\setcounter{algocf}{13} -\setcounter{nprt@mantissa@digitsbefore}{0} -\setcounter{nprt@mantissa@digitsafter}{0} -\setcounter{nprt@exponent@digitsbefore}{0} -\setcounter{nprt@exponent@digitsafter}{0} -\setcounter{nprt@digitsfirstblock}{0} -\setcounter{nprt@blockcnt}{0} -\setcounter{nprt@cntprint}{0} -\setcounter{proposition}{1} -\setcounter{theorem}{0} -\setcounter{exercise}{0} -\setcounter{example}{0} -\setcounter{definition}{0} -\setcounter{proof}{1} -\setcounter{lstlisting}{0} -} diff --git a/BookGPU/Chapters/chapter16/ch16.aux b/BookGPU/Chapters/chapter16/ch16.aux deleted file mode 100644 index 75d0256..0000000 --- a/BookGPU/Chapters/chapter16/ch16.aux +++ /dev/null @@ -1,115 +0,0 @@ -\relax -\@writefile{toc}{\author{X.-X. Liu}{}} -\@writefile{toc}{\author{S. X.-D. Tan}{}} -\@writefile{toc}{\author{H. Wang}{}} -\@writefile{toc}{\author{H. Yu}{}} -\@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {16}GPU-Accelerated Envelope-Following Method}{375}} -\@writefile{lof}{\addvspace {10\p@ }} -\@writefile{lot}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {section}{\numberline {16.1}Introduction}{375}} -\newlabel{fig:ef1}{{16.1(a)}{377}} -\newlabel{sub@fig:ef1}{{(a)}{377}} -\newlabel{fig:ef2}{{16.1(b)}{377}} -\newlabel{sub@fig:ef2}{{(b)}{377}} -\@writefile{lof}{\contentsline {figure}{\numberline {16.1}{\ignorespaces Transient envelope-following analysis. (Both two figures reflect backward-Euler style envelope-following.)\relax }}{377}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Illustration of one envelope skip.}}}{377}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {The envelope changes in a slow time scale.}}}{377}} -\newlabel{fig:ef_intro}{{16.1}{377}} -\@writefile{toc}{\contentsline {section}{\numberline {16.2}The envelope-following method in a nutshell}{378}} -\newlabel{sec:ef}{{16.2}{378}} -\newlabel{eq:dae}{{16.1}{378}} -\newlabel{eq:Newton}{{16.2}{379}} -\newlabel{eq:A}{{16.3}{379}} -\@writefile{toc}{\contentsline {section}{\numberline {16.3}New parallel envelope-following method}{380}} -\newlabel{sec:gmres}{{16.3}{380}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.1}GMRES solver for Newton update equation}{380}} -\@writefile{lof}{\contentsline {figure}{\numberline {16.2}{\ignorespaces The flow of envelope-following method.\relax }}{381}} -\newlabel{fig:ef_flow}{{16.2}{381}} -\@writefile{loa}{\contentsline {algocf}{\numberline {17}{\ignorespaces Standard GMRES algorithm.\relax }}{382}} -\newlabel{alg:GMRES}{{17}{382}} -\newlabel{line:mvp}{{5}{382}} -\newlabel{line:newnorm}{{11}{382}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.2}Parallelization on GPU platforms}{382}} -\newlabel{sec:gpu}{{16.3.2}{382}} -\@writefile{lof}{\contentsline {figure}{\numberline {16.3}{\ignorespaces GPU parallel solver for envelope-following update.\relax }}{383}} -\newlabel{fig:gmres}{{16.3}{383}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.3}Gear-2 based sensitivity calculation}{384}} -\newlabel{sec:gear}{{16.3.3}{384}} -\newlabel{eq:BE}{{16.4}{384}} -\newlabel{eq:sens1}{{16.5}{384}} -\newlabel{eq:Gear_t2}{{16.6}{385}} -\newlabel{eq:sens2}{{16.7}{385}} -\newlabel{eq:Gear_t3}{{16.8}{385}} -\newlabel{eq:sensM}{{16.9}{385}} -\@writefile{loa}{\contentsline {algocf}{\numberline {18}{\ignorespaces The matrix-free method for Krylov subspace construction.\relax }}{386}} -\newlabel{alg:mf_Gear}{{18}{386}} -\newlabel{line:mf_Gear_loop}{{4}{386}} -\newlabel{line:shift}{{8}{386}} -\@writefile{toc}{\contentsline {section}{\numberline {16.4}Numerical examples}{386}} -\newlabel{sec:exp}{{16.4}{386}} -\@writefile{lof}{\contentsline {figure}{\numberline {16.4}{\ignorespaces Diagram of a zero-voltage quasi-resonant flyback converter.\relax }}{387}} -\newlabel{fig:flyback}{{16.4}{387}} -\@writefile{lof}{\contentsline {figure}{\numberline {16.5}{\ignorespaces Illustration of power/ground network model.\relax }}{387}} -\newlabel{fig:pg}{{16.5}{387}} -\newlabel{fig:flybackWhole}{{16.6(a)}{388}} -\newlabel{sub@fig:flybackWhole}{{(a)}{388}} -\newlabel{fig:flybackZoom}{{16.6(b)}{388}} -\newlabel{sub@fig:flybackZoom}{{(b)}{388}} -\@writefile{lof}{\contentsline {figure}{\numberline {16.6}{\ignorespaces Flyback converter solution calculated by envelope-following. The red curve is traditional SPICE simulation result, and the back curve is the envelope-following output with simulation points marked.\relax }}{388}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {The whole plot}}}{388}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Detail of one EF simulation period}}}{388}} -\newlabel{fig:flyback_wave}{{16.6}{388}} -\@writefile{lof}{\contentsline {figure}{\numberline {16.7}{\ignorespaces Buck converter solution calculated by envelope-following.\relax }}{389}} -\newlabel{fig:buck_wave}{{16.7}{389}} -\@writefile{lot}{\contentsline {table}{\numberline {16.1}{\ignorespaces CPU and GPU time comparisons (in seconds) for solving Newton update equation with the proposed Gear-2 sensitivity. \relax }}{389}} -\newlabel{table:circuit}{{16.1}{389}} -\@writefile{toc}{\contentsline {section}{\numberline {16.5}Summary}{390}} -\newlabel{sec:summary}{{16.5}{390}} -\@writefile{toc}{\contentsline {section}{\numberline {16.6}Glossary}{390}} -\@writefile{toc}{\contentsline {section}{Bibliography}{390}} -\@setckpt{Chapters/chapter16/ch16}{ -\setcounter{page}{392} -\setcounter{equation}{9} -\setcounter{enumi}{2} -\setcounter{enumii}{0} -\setcounter{enumiii}{0} -\setcounter{enumiv}{22} -\setcounter{footnote}{0} -\setcounter{mpfootnote}{0} -\setcounter{part}{5} -\setcounter{chapter}{16} -\setcounter{section}{6} -\setcounter{subsection}{0} -\setcounter{subsubsection}{0} -\setcounter{paragraph}{0} -\setcounter{subparagraph}{0} -\setcounter{figure}{7} -\setcounter{table}{1} -\setcounter{numauthors}{0} -\setcounter{parentequation}{4} -\setcounter{subfigure}{0} -\setcounter{lofdepth}{1} -\setcounter{subtable}{0} -\setcounter{lotdepth}{1} -\setcounter{lstnumber}{9} -\setcounter{ContinuedFloat}{0} -\setcounter{AlgoLine}{8} -\setcounter{algocfline}{18} -\setcounter{algocfproc}{18} -\setcounter{algocf}{18} -\setcounter{nprt@mantissa@digitsbefore}{0} -\setcounter{nprt@mantissa@digitsafter}{0} -\setcounter{nprt@exponent@digitsbefore}{0} -\setcounter{nprt@exponent@digitsafter}{0} -\setcounter{nprt@digitsfirstblock}{0} -\setcounter{nprt@blockcnt}{0} -\setcounter{nprt@cntprint}{0} -\setcounter{proposition}{1} -\setcounter{theorem}{0} -\setcounter{exercise}{0} -\setcounter{example}{0} -\setcounter{definition}{0} -\setcounter{proof}{1} -\setcounter{lstlisting}{0} -} diff --git a/BookGPU/Chapters/chapter17/ch17.aux b/BookGPU/Chapters/chapter17/ch17.aux deleted file mode 100644 index 005df6d..0000000 --- a/BookGPU/Chapters/chapter17/ch17.aux +++ /dev/null @@ -1,121 +0,0 @@ -\relax -\@writefile{toc}{\author{Guillaume Laville}{}} -\@writefile{toc}{\author{Christophe Lang}{}} -\@writefile{toc}{\author{Kamel Mazouzi}{}} -\@writefile{toc}{\author{Nicolas Marilleau}{}} -\@writefile{toc}{\author{B\IeC {\'e}n\IeC {\'e}dicte Herrmann}{}} -\@writefile{toc}{\author{Laurent Philippe}{}} -\@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {17}Implementing Multi-Agent Systems on GPU}{395}} -\@writefile{lof}{\addvspace {10\p@ }} -\@writefile{lot}{\addvspace {10\p@ }} -\newlabel{chapter17}{{17}{396}} -\@writefile{toc}{\contentsline {section}{\numberline {17.1}Introduction}{396}} -\newlabel{ch17:intro}{{17.1}{396}} -\@writefile{toc}{\contentsline {section}{\numberline {17.2}Running Agent-Based Simulations}{397}} -\newlabel{ch17:ABM}{{17.2}{397}} -\@writefile{toc}{\contentsline {subsection}{\numberline {17.2.1}Multi-agent systems and parallelism}{397}} -\@writefile{toc}{\contentsline {subsection}{\numberline {17.2.2}MAS Implementation on GPU}{399}} -\newlabel{ch17:subsec:gpu}{{17.2.2}{399}} -\@writefile{toc}{\contentsline {section}{\numberline {17.3}A first practical example}{400}} -\newlabel{ch17:sec:1stmodel}{{17.3}{400}} -\@writefile{toc}{\contentsline {subsection}{\numberline {17.3.1}The Collembola model}{400}} -\newlabel{ch17:subsec:collembolamodel}{{17.3.1}{400}} -\@writefile{lof}{\contentsline {figure}{\numberline {17.1}{\ignorespaces Evolution algorithm of Collembola model\relax }}{401}} -\newlabel{ch17:fig:collem_algorithm}{{17.1}{401}} -\@writefile{toc}{\contentsline {subsection}{\numberline {17.3.2}Collembola Implementation}{401}} -\newlabel{ch17:listing:collembola-diffuse}{{17.1}{402}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {17.1}Collembola OpenCL Diffusion kernel}{402}} -\newlabel{ch17:listing:collembola-reduc}{{17.2}{402}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {17.2}Collembola OpenCL reduction kernel}{402}} -\@writefile{toc}{\contentsline {subsection}{\numberline {17.3.3}Collembola performance}{403}} -\@writefile{lof}{\contentsline {figure}{\numberline {17.2}{\ignorespaces Performance of the Collembola model on CPU and GPU\relax }}{404}} -\newlabel{ch17:fig:mior_perfs_collem}{{17.2}{404}} -\@writefile{toc}{\contentsline {section}{\numberline {17.4}Second example}{404}} -\newlabel{ch17:sec:2ndmodel}{{17.4}{404}} -\@writefile{toc}{\contentsline {subsection}{\numberline {17.4.1}The MIOR model}{404}} -\newlabel{ch17:subsec:miormodel}{{17.4.1}{404}} -\@writefile{loa}{\contentsline {algocf}{\numberline {19}{\ignorespaces Evolution step of each Meta-Mior (microbial colony) agent\relax }}{405}} -\newlabel{ch17:seqalgo}{{19}{405}} -\@writefile{toc}{\contentsline {subsection}{\numberline {17.4.2}MIOR Implementation}{405}} -\@writefile{lof}{\contentsline {figure}{\numberline {17.3}{\ignorespaces Execution distribution retained on GPU\relax }}{406}} -\newlabel{ch17:fig:gpu_distribution}{{17.3}{406}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {17.4.2.1}Execution mapping on GPU}{406}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {17.4.2.2}Data structures translation}{407}} -\newlabel{ch17:subsec:datastructures}{{17.4.2.2}{407}} -\newlabel{ch17:listing:mior_data_structures}{{17.3}{407}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {17.3}Main data structures used in a MIOR simulation}{407}} -\@writefile{lof}{\contentsline {figure}{\numberline {17.4}{\ignorespaces Compact representation of the topology of a MIOR simulation\relax }}{408}} -\newlabel{ch17:fig:csr_representation}{{17.4}{408}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {17.4.2.3}Critical resources access management}{408}} -\newlabel{ch17:subsec:concurrency}{{17.4.2.3}{408}} -\newlabel{ch17:listing:mior_kernels}{{17.4}{409}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {17.4}Main MIOR kernel}{409}} -\newlabel{ch17:fig:mior_launcher}{{17.5}{410}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {17.5}MIOR simulation launcher}{410}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {17.4.2.4}Termination detection}{410}} -\@writefile{toc}{\contentsline {subsection}{\numberline {17.4.3}Performance of MIOR implementations}{411}} -\newlabel{ch17:subsec:miorexperiments}{{17.4.3}{411}} -\@writefile{lof}{\contentsline {figure}{\numberline {17.5}{\ignorespaces CPU and GPU performance on a Tesla C1060 node\relax }}{412}} -\newlabel{ch17:fig:mior_perfs_tesla}{{17.5}{412}} -\@writefile{lof}{\contentsline {figure}{\numberline {17.6}{\ignorespaces CPU and GPU performance on a personal computer with a Geforce 8800GT\relax }}{413}} -\newlabel{ch17:fig:mior_perfs_8800gt}{{17.6}{413}} -\@writefile{toc}{\contentsline {section}{\numberline {17.5}Analysis and recommendations}{413}} -\newlabel{ch17:analysis}{{17.5}{413}} -\@writefile{lof}{\contentsline {figure}{\numberline {17.7}{\ignorespaces Execution time of one multi-simulation kernel on the Tesla platform\relax }}{414}} -\newlabel{ch17:fig:monokernel_graph}{{17.7}{414}} -\@writefile{lof}{\contentsline {figure}{\numberline {17.8}{\ignorespaces Total execution time for 1000 simulations on the Tesla platform, while varying the number of simulations for each kernel\relax }}{414}} -\newlabel{ch17:fig:multikernel_graph}{{17.8}{414}} -\@writefile{toc}{\contentsline {subsection}{\numberline {17.5.1}Analysis}{414}} -\@writefile{toc}{\contentsline {subsection}{\numberline {17.5.2}MAS execution workflow}{415}} -\@writefile{toc}{\contentsline {subsection}{\numberline {17.5.3}Implementation challenges}{416}} -\@writefile{toc}{\contentsline {subsection}{\numberline {17.5.4}MCSMA}{416}} -\newlabel{ch17:Mcsma}{{17.5.4}{416}} -\@writefile{toc}{\contentsline {section}{\numberline {17.6}Conclusion}{417}} -\newlabel{ch17:conclusion}{{17.6}{417}} -\@writefile{toc}{\contentsline {section}{Bibliography}{418}} -\@setckpt{Chapters/chapter17/ch17}{ -\setcounter{page}{422} -\setcounter{equation}{0} -\setcounter{enumi}{3} -\setcounter{enumii}{0} -\setcounter{enumiii}{0} -\setcounter{enumiv}{25} -\setcounter{footnote}{1} -\setcounter{mpfootnote}{0} -\setcounter{part}{6} -\setcounter{chapter}{17} -\setcounter{section}{6} -\setcounter{subsection}{0} -\setcounter{subsubsection}{0} -\setcounter{paragraph}{0} -\setcounter{subparagraph}{0} -\setcounter{figure}{8} -\setcounter{table}{0} -\setcounter{numauthors}{0} -\setcounter{parentequation}{4} -\setcounter{subfigure}{0} -\setcounter{lofdepth}{1} -\setcounter{subtable}{0} -\setcounter{lotdepth}{1} -\setcounter{lstnumber}{21} -\setcounter{ContinuedFloat}{0} -\setcounter{AlgoLine}{17} -\setcounter{algocfline}{19} -\setcounter{algocfproc}{19} -\setcounter{algocf}{19} -\setcounter{nprt@mantissa@digitsbefore}{0} -\setcounter{nprt@mantissa@digitsafter}{0} -\setcounter{nprt@exponent@digitsbefore}{0} -\setcounter{nprt@exponent@digitsafter}{0} -\setcounter{nprt@digitsfirstblock}{0} -\setcounter{nprt@blockcnt}{0} -\setcounter{nprt@cntprint}{0} -\setcounter{proposition}{1} -\setcounter{theorem}{0} -\setcounter{exercise}{0} -\setcounter{example}{0} -\setcounter{definition}{0} -\setcounter{proof}{1} -\setcounter{lstlisting}{5} -} diff --git a/BookGPU/Chapters/chapter18/ch18.aux b/BookGPU/Chapters/chapter18/ch18.aux deleted file mode 100644 index 884902a..0000000 --- a/BookGPU/Chapters/chapter18/ch18.aux +++ /dev/null @@ -1,86 +0,0 @@ -\relax -\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} -\@writefile{toc}{\author{Christophe Guyeux}{}} -\@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {18}Pseudorandom Number Generator on GPU}{423}} -\@writefile{lof}{\addvspace {10\p@ }} -\@writefile{lot}{\addvspace {10\p@ }} -\newlabel{chapter18}{{18}{423}} -\@writefile{toc}{\contentsline {section}{\numberline {18.1}Introduction}{423}} -\@writefile{toc}{\contentsline {section}{\numberline {18.2}Basic Remindees}{425}} -\newlabel{section:BASIC RECALLS}{{18.2}{425}} -\@writefile{toc}{\contentsline {subsection}{\numberline {18.2.1}A Short Presentation of Chaos}{425}} -\@writefile{toc}{\contentsline {subsection}{\numberline {18.2.2}On Devaney's Definition of Chaos}{425}} -\newlabel{sec:dev}{{18.2.2}{425}} -\newlabel{Devaney}{{18.1}{425}} -\@writefile{toc}{\contentsline {subsection}{\numberline {18.2.3}Chaotic iterations}{426}} -\newlabel{subsection:Chaotic iterations}{{18.2.3}{426}} -\newlabel{Chaotic iterations}{{2}{426}} -\newlabel{eq:generalIC}{{18.4}{427}} -\newlabel{equation Oplus}{{18.5}{427}} -\@writefile{toc}{\contentsline {section}{\numberline {18.3}Toward Efficiency and Improvement for CI PRNG}{427}} -\newlabel{sec:efficient PRNG}{{18.3}{427}} -\@writefile{toc}{\contentsline {subsection}{\numberline {18.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{427}} -\newlabel{algo:seqCIPRNG}{{18.1}{427}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {18.1}C code of the sequential PRNG based on chaotic iterations}{427}} -\@writefile{toc}{\contentsline {subsection}{\numberline {18.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{428}} -\newlabel{sec:efficient PRNG gpu}{{18.3.2}{428}} -\@writefile{toc}{\contentsline {subsection}{\numberline {18.3.3}Naive Version for GPU}{428}} -\@writefile{loa}{\contentsline {algocf}{\numberline {20}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{429}} -\newlabel{algo:gpu_kernel}{{20}{429}} -\@writefile{toc}{\contentsline {subsection}{\numberline {18.3.4}Improved Version for GPU}{429}} -\newlabel{IR}{{21}{430}} -\@writefile{loa}{\contentsline {algocf}{\numberline {21}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{430}} -\newlabel{algo:gpu_kernel2}{{21}{430}} -\@writefile{toc}{\contentsline {subsection}{\numberline {18.3.5}Chaos Evaluation of the Improved Version}{430}} -\@writefile{toc}{\contentsline {section}{\numberline {18.4}Experiments}{431}} -\newlabel{sec:experiments}{{18.4}{431}} -\@writefile{toc}{\contentsline {section}{\numberline {18.5}Summary}{431}} -\@writefile{lof}{\contentsline {figure}{\numberline {18.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{432}} -\newlabel{fig:time_xorlike_gpu}{{18.1}{432}} -\@writefile{toc}{\contentsline {section}{Bibliography}{433}} -\@setckpt{Chapters/chapter18/ch18}{ -\setcounter{page}{435} -\setcounter{equation}{5} -\setcounter{enumi}{3} -\setcounter{enumii}{0} -\setcounter{enumiii}{0} -\setcounter{enumiv}{17} -\setcounter{footnote}{2} -\setcounter{mpfootnote}{0} -\setcounter{part}{6} -\setcounter{chapter}{18} -\setcounter{section}{5} -\setcounter{subsection}{0} -\setcounter{subsubsection}{0} -\setcounter{paragraph}{0} -\setcounter{subparagraph}{0} -\setcounter{figure}{1} -\setcounter{table}{0} -\setcounter{numauthors}{0} -\setcounter{parentequation}{4} -\setcounter{subfigure}{0} -\setcounter{lofdepth}{1} -\setcounter{subtable}{0} -\setcounter{lotdepth}{1} -\setcounter{lstnumber}{15} -\setcounter{ContinuedFloat}{0} -\setcounter{AlgoLine}{14} -\setcounter{algocfline}{21} -\setcounter{algocfproc}{21} -\setcounter{algocf}{21} -\setcounter{nprt@mantissa@digitsbefore}{0} -\setcounter{nprt@mantissa@digitsafter}{0} -\setcounter{nprt@exponent@digitsbefore}{0} -\setcounter{nprt@exponent@digitsafter}{0} -\setcounter{nprt@digitsfirstblock}{0} -\setcounter{nprt@blockcnt}{0} -\setcounter{nprt@cntprint}{0} -\setcounter{proposition}{1} -\setcounter{theorem}{0} -\setcounter{exercise}{0} -\setcounter{example}{0} -\setcounter{definition}{2} -\setcounter{proof}{1} -\setcounter{lstlisting}{1} -} diff --git a/BookGPU/Chapters/chapter3/ch3.aux b/BookGPU/Chapters/chapter3/ch3.aux index 298a450..b6d8bfd 100644 --- a/BookGPU/Chapters/chapter3/ch3.aux +++ b/BookGPU/Chapters/chapter3/ch3.aux @@ -1,121 +1,120 @@ \relax \@writefile{toc}{\author{Gilles Perrot}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {3}Setting up the environnement.}{25}} +\@writefile{toc}{\contentsline {chapter}{\numberline {3}Setting up the environnement.}{23}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\newlabel{algo:memcopy:H2D}{{7}{25}} -\newlabel{algo:memcopy:kernel}{{8}{25}} -\newlabel{algo:memcopy:D2H}{{9}{25}} -\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{25}} -\newlabel{algo:memcopy}{{1}{25}} -\@writefile{toc}{\contentsline {section}{\numberline {3.1}Data transfers, memory management.}{26}} -\newlabel{lst:main1}{{3.1}{27}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}Generic main.cu file used to launch CUDA kernels}{27}} -\newlabel{lst:fkern1}{{3.2}{27}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}fast\_kernels.cu file featuring one kernel skeleton}{27}} -\newlabel{lst:mkfile}{{3.3}{28}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}Generic Makefile based on those provided by NV SDK}{28}} -\@writefile{toc}{\contentsline {section}{\numberline {3.2}Performance measurements}{28}} -\newlabel{lst:chronos}{{3.4}{28}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Time measurement technique using cutil functions}{28}} +\newlabel{algo:memcopy:H2D}{{7}{23}} +\newlabel{algo:memcopy:kernel}{{8}{23}} +\newlabel{algo:memcopy:D2H}{{9}{23}} +\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{23}} +\newlabel{algo:memcopy}{{1}{23}} +\@writefile{toc}{\contentsline {section}{\numberline {3.1}Data transfers, memory management.}{24}} +\newlabel{lst:main1}{{3.1}{25}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.1}Generic main.cu file used to launch CUDA kernels}{25}} +\newlabel{lst:fkern1}{{3.2}{25}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.2}fast\_kernels.cu file featuring one kernel skeleton}{25}} +\newlabel{lst:mkfile}{{3.3}{26}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.3}Generic Makefile based on those provided by NV SDK}{26}} +\@writefile{toc}{\contentsline {section}{\numberline {3.2}Performance measurements}{26}} +\newlabel{lst:chronos}{{3.4}{26}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Time measurement technique using cutil functions}{26}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {4}Implementing a fast median filter}{31}} +\@writefile{toc}{\contentsline {chapter}{\numberline {4}Implementing a fast median filter}{29}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} \@writefile{toc}{\author{Gilles Perrot}{}} -\@writefile{toc}{\contentsline {section}{\numberline {4.1}Introduction}{31}} -\@writefile{toc}{\contentsline {section}{\numberline {4.2}Median filtering}{32}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.1}Basic principles}{32}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.2}A naive implementation}{32}} -\newlabel{img:sap_example_ref}{{4.1(a)}{33}} -\newlabel{sub@img:sap_example_ref}{{(a)}{33}} -\newlabel{img:sap_example_med3}{{4.1(b)}{33}} -\newlabel{sub@img:sap_example_med3}{{(b)}{33}} -\newlabel{img:sap_example_med5}{{4.1(c)}{33}} -\newlabel{sub@img:sap_example_med5}{{(c)}{33}} -\newlabel{img:sap_example_med3_it2}{{4.1(d)}{33}} -\newlabel{sub@img:sap_example_med3_it2}{{(d)}{33}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Exemple of median filtering, applied to salt \& pepper noise reduction.\relax }}{33}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted by salt and pepper noise of density 0.25}}}{33}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ median filter}}}{33}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ median filter}}}{33}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image denoised by 2 iterations of a $3\times 3$ median filter}}}{33}} -\newlabel{fig:sap_examples}{{4.1}{33}} -\newlabel{lst:medianGeneric}{{4.1}{34}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.1}Generic CUDA kernel achieving median filtering}{34}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Exemple of 5x5 median filtering\relax }}{35}} -\newlabel{fig:median_1}{{4.2}{35}} -\newlabel{algoMedianGeneric}{{2}{35}} -\newlabel{algoMedianGeneric:memcpyH2D}{{1}{35}} -\newlabel{algoMedianGeneric:cptstart}{{3}{35}} -\newlabel{algoMedianGeneric:cptend}{{5}{35}} -\newlabel{algoMedianGeneric:memcpyD2H}{{7}{35}} -\@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{35}} -\@writefile{toc}{\contentsline {section}{\numberline {4.3}NVidia GPU tuning recipes}{35}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{36}} -\newlabel{fig:median_overlap}{{4.3}{36}} -\@writefile{lot}{\contentsline {table}{\numberline {4.1}{\ignorespaces Performance results of \texttt {kernel medianR}. \relax }}{36}} -\newlabel{tab:medianHisto1}{{4.1}{36}} -\@writefile{toc}{\contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers }{37}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.1}The simplest way}{37}} -\newlabel{lst:kernelMedian3RegTri9}{{4.2}{38}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}3$\times $3 median filter kernel using one register per neighborhood pixel and bubble sort}{38}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Further optimization}{38}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs on GPU C2070 and CPU for generic median, 3$\times $3 median register-only and \textit {libJacket}.\relax }}{39}} -\newlabel{fig:compMedians1}{{4.4}{39}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{39}} -\newlabel{fig:forgetful_selection}{{4.5}{39}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{39}} -\newlabel{lst:medianForget1pix3}{{4.3}{40}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{40}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{41}} -\newlabel{fig:forgetful3}{{4.6}{41}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{42}} -\newlabel{fig:median3_overlap}{{4.7}{42}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{42}} -\newlabel{lst:medianForget2pix3}{{4.4}{42}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{42}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{43}} -\newlabel{fig:compMedians2}{{4.8}{43}} -\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{44}} -\newlabel{sec:median5}{{4.5.1}{44}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{44}} -\newlabel{lst:medianForget2pix5}{{4.5}{44}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{44}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.\relax }}{45}} -\newlabel{fig:median5overlap}{{4.9}{45}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.\relax }}{45}} -\newlabel{fig:median5overlap}{{4.10}{45}} -\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{46}} -\newlabel{tab:median5comp}{{4.2}{46}} -\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{47}} -\newlabel{lst:medianSeparable}{{4.6}{47}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{47}} -\newlabel{img:sap_example_ref}{{4.11(a)}{48}} -\newlabel{sub@img:sap_example_ref}{{(a)}{48}} -\newlabel{img:sap_example_sep_med3}{{4.11(b)}{48}} -\newlabel{sub@img:sap_example_sep_med3}{{(b)}{48}} -\newlabel{img:sap_example_sep_med5}{{4.11(c)}{48}} -\newlabel{sub@img:sap_example_sep_med5}{{(c)}{48}} -\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{48}} -\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{48}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{48}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{48}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{48}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{48}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{48}} -\newlabel{fig:sap_examples2}{{4.11}{48}} -\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{49}} -\newlabel{tab:medianSeparable}{{4.3}{49}} -\@writefile{toc}{\contentsline {section}{Bibliography}{50}} +\@writefile{toc}{\contentsline {section}{\numberline {4.1}Introduction}{29}} +\@writefile{toc}{\contentsline {section}{\numberline {4.2}Median filtering}{30}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.1}Basic principles}{30}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.2.2}A naive implementation}{30}} +\newlabel{img:sap_example_ref}{{4.1(a)}{31}} +\newlabel{sub@img:sap_example_ref}{{(a)}{31}} +\newlabel{img:sap_example_med3}{{4.1(b)}{31}} +\newlabel{sub@img:sap_example_med3}{{(b)}{31}} +\newlabel{img:sap_example_med5}{{4.1(c)}{31}} +\newlabel{sub@img:sap_example_med5}{{(c)}{31}} +\newlabel{img:sap_example_med3_it2}{{4.1(d)}{31}} +\newlabel{sub@img:sap_example_med3_it2}{{(d)}{31}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Exemple of median filtering, applied to salt \& pepper noise reduction.\relax }}{31}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted by salt and pepper noise of density 0.25}}}{31}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ median filter}}}{31}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ median filter}}}{31}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image denoised by 2 iterations of a $3\times 3$ median filter}}}{31}} +\newlabel{fig:sap_examples}{{4.1}{31}} +\newlabel{lst:medianGeneric}{{4.1}{32}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.1}Generic CUDA kernel achieving median filtering}{32}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Exemple of 5x5 median filtering\relax }}{33}} +\newlabel{fig:median_1}{{4.2}{33}} +\newlabel{algoMedianGeneric}{{2}{33}} +\newlabel{algoMedianGeneric:memcpyH2D}{{1}{33}} +\newlabel{algoMedianGeneric:cptstart}{{3}{33}} +\newlabel{algoMedianGeneric:cptend}{{5}{33}} +\newlabel{algoMedianGeneric:memcpyD2H}{{7}{33}} +\@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{33}} +\@writefile{toc}{\contentsline {section}{\numberline {4.3}NVidia GPU tuning recipes}{33}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{34}} +\newlabel{fig:median_overlap}{{4.3}{34}} +\@writefile{lot}{\contentsline {table}{\numberline {4.1}{\ignorespaces Performance results of \texttt {kernel medianR}. \relax }}{34}} +\newlabel{tab:medianHisto1}{{4.1}{34}} +\@writefile{toc}{\contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers }{35}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.1}The simplest way}{35}} +\newlabel{lst:kernelMedian3RegTri9}{{4.2}{36}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.2}3$\times $3 median filter kernel using one register per neighborhood pixel and bubble sort}{36}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.4.2}Further optimization}{36}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{36}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.4}{\ignorespaces Comparison of pixel throughputs on GPU C2070 and CPU for generic median, 3$\times $3 median register-only and \textit {libJacket}.\relax }}{37}} +\newlabel{fig:compMedians1}{{4.4}{37}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{37}} +\newlabel{fig:forgetful_selection}{{4.5}{37}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{38}} +\newlabel{fig:forgetful3}{{4.6}{38}} +\newlabel{lst:medianForget1pix3}{{4.3}{39}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{39}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{40}} +\newlabel{fig:median3_overlap}{{4.7}{40}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{40}} +\newlabel{lst:medianForget2pix3}{{4.4}{40}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{40}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{41}} +\newlabel{fig:compMedians2}{{4.8}{41}} +\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{42}} +\newlabel{sec:median5}{{4.5.1}{42}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{42}} +\newlabel{lst:medianForget2pix5}{{4.5}{42}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{42}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.\relax }}{43}} +\newlabel{fig:median5overlap}{{4.9}{43}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.\relax }}{43}} +\newlabel{fig:median5overlap}{{4.10}{43}} +\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{44}} +\newlabel{tab:median5comp}{{4.2}{44}} +\@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{45}} +\newlabel{lst:medianSeparable}{{4.6}{45}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{45}} +\newlabel{img:sap_example_ref}{{4.11(a)}{46}} +\newlabel{sub@img:sap_example_ref}{{(a)}{46}} +\newlabel{img:sap_example_sep_med3}{{4.11(b)}{46}} +\newlabel{sub@img:sap_example_sep_med3}{{(b)}{46}} +\newlabel{img:sap_example_sep_med5}{{4.11(c)}{46}} +\newlabel{sub@img:sap_example_sep_med5}{{(c)}{46}} +\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{46}} +\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{46}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{46}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{46}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{46}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{46}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{46}} +\newlabel{fig:sap_examples2}{{4.11}{46}} +\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{47}} +\newlabel{tab:medianSeparable}{{4.3}{47}} \@setckpt{Chapters/chapter3/ch3}{ -\setcounter{page}{52} +\setcounter{page}{49} \setcounter{equation}{0} \setcounter{enumi}{3} \setcounter{enumii}{0} \setcounter{enumiii}{0} -\setcounter{enumiv}{10} +\setcounter{enumiv}{0} \setcounter{footnote}{0} \setcounter{mpfootnote}{0} \setcounter{part}{2} diff --git a/BookGPU/Chapters/chapter6/ch6.aux b/BookGPU/Chapters/chapter6/ch6.aux deleted file mode 100644 index bfe00ce..0000000 --- a/BookGPU/Chapters/chapter6/ch6.aux +++ /dev/null @@ -1,152 +0,0 @@ -\relax -\@writefile{toc}{\author{Sylvain Contassot-Vivier}{}} -\@writefile{toc}{\author{Stephane Vialle}{}} -\@writefile{toc}{\author{Jens Gustedt}{}} -\@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {6}Development methodologies for GPU and cluster of GPUs}{87}} -\@writefile{lof}{\addvspace {10\p@ }} -\@writefile{lot}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {section}{\numberline {6.1}Introduction}{88}} -\newlabel{ch6:intro}{{6.1}{88}} -\@writefile{toc}{\contentsline {section}{\numberline {6.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{88}} -\newlabel{ch6:part1}{{6.2}{88}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.1}Synchronous parallel algorithms on GPU clusters}{88}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.1}{\ignorespaces Native overlap of internode CPU communications with GPU computations.\relax }}{90}} -\newlabel{fig:ch6p1overlapnative}{{6.1}{90}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.2}Native overlap of CPU communications and GPU computations}{90}} -\newlabel{algo:ch6p1overlapnative}{{6.1}{91}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.1}Generic scheme implicitly overlapping MPI communications with CUDA GPU computations}{91}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.2}{\ignorespaces Overlap of internode CPU communications with a sequence of CPU/GPU data transfers and GPU computations.\relax }}{92}} -\newlabel{fig:ch6p1overlapseqsequence}{{6.2}{92}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.3}Overlapping with sequences of transfers and computations}{92}} -\newlabel{algo:ch6p1overlapseqsequence}{{6.2}{93}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.2}Generic scheme explicitly overlapping MPI communications with sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{93}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.3}{\ignorespaces Overlap of internode CPU communications with a streamed sequence of CPU/GPU data transfers and GPU computations.\relax }}{95}} -\newlabel{fig:ch6p1overlapstreamsequence}{{6.3}{95}} -\newlabel{algo:ch6p1overlapstreamsequence}{{6.3}{95}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.3}Generic scheme explicitly overlapping MPI communications with streamed sequences of CUDA CPU/GPU transfers and CUDA GPU computations}{95}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.4}Interleaved communications-transfers-computations overlapping}{97}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.4}{\ignorespaces Complete overlap of internode CPU communications, CPU/GPU data transfers and GPU computations, interleaving computation-communication iterations\relax }}{98}} -\newlabel{fig:ch6p1overlapinterleaved}{{6.4}{98}} -\newlabel{algo:ch6p1overlapinterleaved}{{6.4}{98}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.4}Generic scheme explicitly overlapping MPI communications, CUDA CPU/GPU transfers and CUDA GPU computations, interleaving computation-communication iterations}{98}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.2.5}Experimental validation}{100}} -\newlabel{ch6:p1expes}{{6.2.5}{100}} -\newlabel{ch6:p1block-cyclic}{{6.2.5}{100}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.5}{\ignorespaces Experimental performances of different synchronous algorithms computing a dense matrix product\relax }}{101}} -\newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{101}} -\@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{102}} -\newlabel{ch6:part2}{{6.3}{102}} -\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{102}} -\newlabel{algo:ch6p2sync}{{3}{102}} -\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{103}} -\newlabel{algo:ch6p2async}{{4}{103}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{104}} -\newlabel{ch6:p2BasicAsync}{{6.3.1}{104}} -\newlabel{algo:ch6p2BasicAsync}{{6.5}{104}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{104}} -\newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{105}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.6}Computing function in the basic asynchronous scheme}{105}} -\newlabel{algo:ch6p2BasicAsyncSendings}{{6.7}{107}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.7}Sending function in the basic asynchronous scheme}{107}} -\newlabel{algo:ch6p2BasicAsyncReceptions}{{6.8}{107}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.8}Reception function in the basic asynchronous scheme}{107}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.2}Synchronization of the asynchronous scheme}{109}} -\newlabel{ch6:p2SsyncOverAsync}{{6.3.2}{109}} -\newlabel{algo:ch6p2Sync}{{6.9}{109}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.9}Initialization of the synchronized scheme}{109}} -\newlabel{algo:ch6p2SyncComp}{{6.10}{110}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.10}Computing function in the synchronized scheme}{110}} -\newlabel{algo:ch6p2SyncReceptions}{{6.11}{112}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{112}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{113}} -\newlabel{ch6:p2GPUAsync}{{6.3.3}{113}} -\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{114}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{114}} -\newlabel{algo:ch6p2syncGPU}{{6.13}{115}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{115}} -\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{118}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.14}Initialization of the main process of complete overlap with asynchronism}{118}} -\newlabel{algo:ch6p2FullOverAsyncComp1}{{6.15}{119}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.15}Computing function in the final asynchronous scheme with CPU/GPU overlap}{119}} -\newlabel{algo:ch6p2FullOverAsyncComp2}{{6.16}{120}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.16}Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap}{120}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.4}Experimental validation}{121}} -\newlabel{sec:ch6p2expes}{{6.3.4}{121}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.6}{\ignorespaces Computation times of the test application in synchronous and asynchronous modes.\relax }}{122}} -\newlabel{fig:ch6p2syncasync}{{6.6}{122}} -\@writefile{lof}{\contentsline {figure}{\numberline {6.7}{\ignorespaces Computation times with or without overlap of Jacobian updatings in asynchronous mode.\relax }}{123}} -\newlabel{fig:ch6p2aux}{{6.7}{123}} -\@writefile{toc}{\contentsline {section}{\numberline {6.4}Perspective: A unifying programming model}{124}} -\newlabel{sec:ch6p3unify}{{6.4}{124}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.1}Resources}{124}} -\newlabel{sec:ch6p3resources}{{6.4.1}{124}} -\newlabel{algo:ch6p3ORWLresources}{{6.17}{125}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.17}Declaration of ORWL resources for a block-cyclic matrix multiplication}{125}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.2}Control}{125}} -\newlabel{sec:ch6p3ORWLcontrol}{{6.4.2}{125}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.3}Example: block-cyclic matrix multiplication (MM)}{126}} -\newlabel{sec:ch6p3ORWLMM}{{6.4.3}{126}} -\newlabel{algo:ch6p3ORWLBCCMM}{{6.18}{126}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.18}Block-cyclic matrix multiplication, high level per task view}{126}} -\newlabel{algo:ch6p3ORWLlcopy}{{6.19}{127}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.19}An iterative local copy operation}{127}} -\newlabel{algo:ch6p3ORWLrcopy}{{6.20}{127}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.20}An iterative remote copy operation as part of a block cyclic matrix multiplication task}{127}} -\newlabel{algo:ch6p3ORWLtrans}{{6.21}{127}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.21}An iterative GPU transfer and compute operation as part of a block cyclic matrix multiplication task}{127}} -\newlabel{algo:ch6p3ORWLdecl}{{6.22}{128}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.22}Dynamic declaration of handles to represent the resources}{128}} -\newlabel{algo:ch6p3ORWLinit}{{6.23}{128}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.23}Dynamic initialization of access mode and priorities}{128}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.4.4}Tasks and operations}{129}} -\newlabel{sec:ch6p3tasks}{{6.4.4}{129}} -\@writefile{toc}{\contentsline {section}{\numberline {6.5}Conclusion}{130}} -\newlabel{ch6:conclu}{{6.5}{130}} -\@writefile{toc}{\contentsline {section}{\numberline {6.6}Glossary}{130}} -\@writefile{toc}{\contentsline {section}{Bibliography}{131}} -\@setckpt{Chapters/chapter6/ch6}{ -\setcounter{page}{133} -\setcounter{equation}{0} -\setcounter{enumi}{4} -\setcounter{enumii}{0} -\setcounter{enumiii}{0} -\setcounter{enumiv}{21} -\setcounter{footnote}{0} -\setcounter{mpfootnote}{0} -\setcounter{part}{3} -\setcounter{chapter}{6} -\setcounter{section}{6} -\setcounter{subsection}{0} -\setcounter{subsubsection}{0} -\setcounter{paragraph}{0} -\setcounter{subparagraph}{0} -\setcounter{figure}{7} -\setcounter{table}{0} -\setcounter{numauthors}{0} -\setcounter{parentequation}{8} -\setcounter{subfigure}{0} -\setcounter{lofdepth}{1} -\setcounter{subtable}{0} -\setcounter{lotdepth}{1} -\setcounter{lstnumber}{17} -\setcounter{ContinuedFloat}{0} -\setcounter{AlgoLine}{6} -\setcounter{algocfline}{4} -\setcounter{algocfproc}{4} -\setcounter{algocf}{4} -\setcounter{nprt@mantissa@digitsbefore}{0} -\setcounter{nprt@mantissa@digitsafter}{0} -\setcounter{nprt@exponent@digitsbefore}{0} -\setcounter{nprt@exponent@digitsafter}{0} -\setcounter{nprt@digitsfirstblock}{0} -\setcounter{nprt@blockcnt}{0} -\setcounter{nprt@cntprint}{0} -\setcounter{proposition}{0} -\setcounter{theorem}{0} -\setcounter{exercise}{0} -\setcounter{example}{0} -\setcounter{definition}{0} -\setcounter{proof}{0} -\setcounter{lstlisting}{23} -}