From 013c063fb5cb99cc33b413d937f330acc42ca73e Mon Sep 17 00:00:00 2001 From: couturie Date: Wed, 6 Feb 2013 14:19:18 +0100 Subject: [PATCH] ch18 --- BookGPU/Chapters/chapter18/biblio.bib | 35 ++ BookGPU/Chapters/chapter18/ch1.aux | 64 +++ BookGPU/Chapters/chapter18/ch18.aux | 71 ++++ BookGPU/Chapters/chapter18/ch18.tex | 392 ++++++++++++++++++ BookGPU/Chapters/chapter18/ch18.tex~ | 296 +++++++++++++ .../chapter18/figures/curve_time_bbs_gpu.pdf | Bin 0 -> 7587 bytes .../figures/curve_time_xorlike_gpu.pdf | Bin 0 -> 8005 bytes BookGPU/Chapters/chapter18/preamble.aux | 32 ++ 8 files changed, 890 insertions(+) create mode 100644 BookGPU/Chapters/chapter18/biblio.bib create mode 100644 BookGPU/Chapters/chapter18/ch1.aux create mode 100644 BookGPU/Chapters/chapter18/ch18.aux create mode 100755 BookGPU/Chapters/chapter18/ch18.tex create mode 100755 BookGPU/Chapters/chapter18/ch18.tex~ create mode 100644 BookGPU/Chapters/chapter18/figures/curve_time_bbs_gpu.pdf create mode 100644 BookGPU/Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf create mode 100644 BookGPU/Chapters/chapter18/preamble.aux diff --git a/BookGPU/Chapters/chapter18/biblio.bib b/BookGPU/Chapters/chapter18/biblio.bib new file mode 100644 index 0000000..f3b0ccc --- /dev/null +++ b/BookGPU/Chapters/chapter18/biblio.bib @@ -0,0 +1,35 @@ +@misc{ch1:cuda, + author = {{NVIDIA Corporation}}, + keywords = {CUDA}, + note = {Version 4.0}, + title = {{NVIDIA CUDA C} Programming Guide}, + year = 2011 +} + +@Article{ch1:Buck:2004:BGS, + author = "I. Buck and T. Foley and D. Horn and J. + Sugerman and K. Fatahalian and M. Houston and P. + Hanrahan", + title = "{Brook} for {GPUs}: stream computing on graphics + hardware", + journal = "ACM Transactions on Graphics", + volume = "23", + number = "3", + pages = "777--786", + month = aug, + year = "2004", +} + + +@article{ch1:CMR:12, + author = "B. Cloutier and B. K. Muite and P. 
Rigge", + title = "A comparison of CPU and GPU performance for Fourier pseudospectral simulations of the Navier-Stokes, Cubic Nonlinear Schrödinger and Sine Gordon Equations", + journal = "Computational Physics (physics.comp-ph)", + year = "2012", + archivePrefix = "arXiv", + eprint = "1206.3215", + primaryClass = "physics.comp-ph", +} + + + diff --git a/BookGPU/Chapters/chapter18/ch1.aux b/BookGPU/Chapters/chapter18/ch1.aux new file mode 100644 index 0000000..2c84cf4 --- /dev/null +++ b/BookGPU/Chapters/chapter18/ch1.aux @@ -0,0 +1,64 @@ +\relax +\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} +\@writefile{loa}{\addvspace {10\p@ }} +\@writefile{toc}{\contentsline {chapter}{\numberline {1}Presentation of the GPU architecture and of the Cuda environment}{3}} +\@writefile{lof}{\addvspace {10\p@ }} +\@writefile{lot}{\addvspace {10\p@ }} +\newlabel{chapter1}{{1}{3}} +\@writefile{toc}{\contentsline {section}{\numberline {1.1}Introduction}{3}} +\newlabel{ch1:intro}{{1.1}{3}} +\@writefile{toc}{\contentsline {section}{\numberline {1.2}Brief history of Video Card}{3}} +\@writefile{toc}{\contentsline {section}{\numberline {1.3}GPGPU}{4}} +\@writefile{toc}{\contentsline {section}{\numberline {1.4}Architecture of current GPUs}{5}} +\@writefile{lof}{\contentsline {figure}{\numberline {1.1}{\ignorespaces Comparison of number of cores in a CPU and in a GPU.\relax }}{5}} +\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}} +\newlabel{ch1:fig:comparison_cpu_gpu}{{1.1}{5}} +\@writefile{lof}{\contentsline {figure}{\numberline {1.2}{\ignorespaces Comparison of low latency of CPU and high throughput of GPU.\relax }}{6}} +\newlabel{ch1:fig:latency_throughput}{{1.2}{6}} +\@writefile{toc}{\contentsline {section}{\numberline {1.5}Kinds of parallelism}{7}} +\@writefile{toc}{\contentsline {section}{\numberline {1.6}Cuda Multithreading}{7}} +\@writefile{lof}{\contentsline {figure}{\numberline {1.3}{\ignorespaces Scalability of GPU.\relax }}{8}} +\newlabel{ch1:fig:scalability}{{1.3}{8}} +\@writefile{toc}{\contentsline {section}{\numberline {1.7}Memory hierarchy}{9}} +\@writefile{lof}{\contentsline {figure}{\numberline {1.4}{\ignorespaces Memory hierarchy of a GPU.\relax }}{10}} +\newlabel{ch1:fig:memory_hierarchy}{{1.4}{10}} +\@writefile{toc}{\contentsline {section}{\numberline {1.8}Conclusion}{10}} +\@writefile{toc}{\contentsline {section}{Bibliography}{11}} +\@setckpt{Chapters/chapter1/ch1}{ +\setcounter{page}{12} +\setcounter{equation}{0} +\setcounter{enumi}{0} +\setcounter{enumii}{0} +\setcounter{enumiii}{0} +\setcounter{enumiv}{3} +\setcounter{footnote}{0} +\setcounter{mpfootnote}{0} +\setcounter{part}{1} +\setcounter{chapter}{1} +\setcounter{section}{8} +\setcounter{subsection}{0} +\setcounter{subsubsection}{0} +\setcounter{paragraph}{0} +\setcounter{subparagraph}{0} +\setcounter{figure}{4} +\setcounter{table}{0} +\setcounter{numauthors}{0} +\setcounter{parentequation}{0} +\setcounter{subfigure}{0} +\setcounter{lofdepth}{1} +\setcounter{subtable}{0} +\setcounter{lotdepth}{1} +\setcounter{lstnumber}{1} +\setcounter{ContinuedFloat}{0} +\setcounter{AlgoLine}{0} +\setcounter{algocfline}{0} +\setcounter{algocfproc}{0} +\setcounter{algocf}{0} +\setcounter{proposition}{0} +\setcounter{theorem}{0} +\setcounter{exercise}{0} +\setcounter{example}{0} +\setcounter{definition}{0} +\setcounter{proof}{0} +\setcounter{lstlisting}{0} +} diff --git a/BookGPU/Chapters/chapter18/ch18.aux b/BookGPU/Chapters/chapter18/ch18.aux new file mode 100644 index 0000000..f9f97ae --- /dev/null +++ 
b/BookGPU/Chapters/chapter18/ch18.aux @@ -0,0 +1,71 @@ +\relax +\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} +\@writefile{toc}{\author{Christophe Guyeux}{}} +\@writefile{loa}{\addvspace {10\p@ }} +\@writefile{toc}{\contentsline {chapter}{\numberline {14}Pseudo Random Number Generator on GPU}{305}} +\@writefile{lof}{\addvspace {10\p@ }} +\@writefile{lot}{\addvspace {10\p@ }} +\newlabel{chapter18}{{14}{305}} +\@writefile{toc}{\contentsline {section}{\numberline {14.1}Introduction}{305}} +\@writefile{toc}{\contentsline {section}{\numberline {14.2}Basic Recalls}{306}} +\newlabel{section:BASIC RECALLS}{{14.2}{306}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.2.1}Devaney's Chaotic Dynamical Systems}{306}} +\@writefile{toc}{\contentsline {section}{\numberline {14.3}Toward Efficiency and Improvement for CI PRNG}{306}} +\newlabel{sec:efficient PRNG}{{14.3}{306}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{306}} +\newlabel{algo:seqCIPRNG}{{14.1}{306}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {14.1}C code of the sequential PRNG based on chaotic iterations}{306}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{307}} +\newlabel{sec:efficient PRNG gpu}{{14.3.2}{307}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.3}Naive Version for GPU}{307}} +\@writefile{loa}{\contentsline {algocf}{\numberline {12}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{308}} +\newlabel{algo:gpu_kernel}{{12}{308}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.4}Improved Version for GPU}{308}} +\newlabel{IR}{{13}{309}} +\@writefile{loa}{\contentsline {algocf}{\numberline {13}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{309}} +\newlabel{algo:gpu_kernel2}{{13}{309}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.5}Chaos Evaluation of the Improved Version}{309}} +\@writefile{toc}{\contentsline {section}{\numberline {14.4}Experiments}{310}} +\newlabel{sec:experiments}{{14.4}{310}} +\@writefile{lof}{\contentsline {figure}{\numberline {14.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{311}} +\newlabel{fig:time_xorlike_gpu}{{14.1}{311}} +\@writefile{lof}{\contentsline {figure}{\numberline {14.2}{\ignorespaces Quantity of pseudorandom numbers generated per second using the BBS-based PRNG\relax }}{312}} +\newlabel{fig:time_bbs_gpu}{{14.2}{312}} +\@setckpt{Chapters/chapter18/ch18}{ +\setcounter{page}{313} +\setcounter{equation}{0} +\setcounter{enumi}{2} +\setcounter{enumii}{0} +\setcounter{enumiii}{0} +\setcounter{enumiv}{0} +\setcounter{footnote}{2} +\setcounter{mpfootnote}{0} +\setcounter{part}{1} +\setcounter{chapter}{14} +\setcounter{section}{4} +\setcounter{subsection}{0} +\setcounter{subsubsection}{0} +\setcounter{paragraph}{0} +\setcounter{subparagraph}{0} +\setcounter{figure}{2} +\setcounter{table}{0} +\setcounter{numauthors}{0} +\setcounter{parentequation}{4} +\setcounter{subfigure}{0} +\setcounter{lofdepth}{1} +\setcounter{subtable}{0} +\setcounter{lotdepth}{1} +\setcounter{lstnumber}{15} +\setcounter{ContinuedFloat}{0} +\setcounter{AlgoLine}{14} +\setcounter{algocfline}{13} +\setcounter{algocfproc}{13} +\setcounter{algocf}{13} +\setcounter{proposition}{1} +\setcounter{theorem}{0} +\setcounter{exercise}{0} 
+\setcounter{example}{0} +\setcounter{definition}{0} +\setcounter{proof}{1} +\setcounter{lstlisting}{1} +}
diff --git a/BookGPU/Chapters/chapter18/ch18.tex b/BookGPU/Chapters/chapter18/ch18.tex new file mode 100755 index 0000000..3808995 --- /dev/null +++ b/BookGPU/Chapters/chapter18/ch18.tex @@ -0,0 +1,392 @@
+\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte}
+\chapterauthor{Christophe Guyeux}{Femto-ST Institute, University of Franche-Comte}
+
+\chapter{Pseudo Random Number Generator on GPU}
+\label{chapter18}
+
+\section{Introduction}
+
+Randomness is of importance in many fields, such as scientific simulation or cryptography. ``Random numbers'' can be generated by a deterministic and reproducible algorithm called a pseudorandom number generator (PRNG). In this chapter, we focus on such reproducible generators, useful for instance in Monte-Carlo based simulators. These domains need PRNGs that are statistically irreproachable. In some fields, such as numerical simulation, speed is also a strong requirement that is usually attained by using parallel architectures. In that case, a recurrent problem is that a deflation of the statistical qualities is often reported when a good PRNG is parallelized. This is why ad hoc PRNGs must be designed for each possible architecture in order to achieve both speed and randomness. On the other hand, speed is not the main requirement in cryptography: the great need is to define \emph{secure} generators able to withstand malicious attacks. Roughly speaking, an attacker should not be able in practice to distinguish between numbers obtained with the secure generator and a truly random sequence. In an equivalent formulation, he or she should not be able (in practice) to predict the next bit of the generator, having the knowledge of all the binary digits that have already been released. ``Being able in practice'' refers here to the possibility of achieving this attack in polynomial time, and to the exponential growth of the difficulty of this challenge when the size of the parameters of the PRNG increases.
+
+Finally, a small part of the community working in this domain focuses on a third requirement, that is, to define chaotic generators.
+The main idea is to benefit from a chaotic dynamical system to obtain a generator that is unpredictable, disordered, and sensitive to its seed; in other words, chaotic.
+The goal is to map a given chaotic dynamics into a sequence that seems random and unassailable due to chaos.
+However, the chaotic maps used as a pattern are defined on the real line, whereas computers deal with finite precision numbers.
+This distortion leads to a deflation of both the chaotic properties and the speed.
+Furthermore, authors of such chaotic generators often claim that their PRNGs are secure due to their chaos properties, but there is no obvious relation between chaos and security as it is understood in cryptography.
+This is why the use of chaos for PRNGs still remains marginal and disputable.
+
+The remainder of this chapter is organized as follows.
+TO BE COMPLETED
+
+\section{Basic Recalls}
+\label{section:BASIC RECALLS}
+
+This section is devoted to basic definitions and terminologies in the fields of topological chaos and chaotic iterations. We assume the reader is familiar with basic notions of topology (see for instance~\cite{Devaney}).
+
+\subsection{Devaney's Chaotic Dynamical Systems}
+
+TO BE COMPLETED
+
+\section{Toward Efficiency and Improvement for CI PRNG}
+\label{sec:efficient PRNG}
+
+\subsection{First Efficient Implementation of a PRNG based on Chaotic Iterations}
+%
+%Based on the proof presented in the previous section, it is now possible to
+%improve the speed of the generator formerly presented in~\cite{bgw09:ip,guyeux10}.
+%The first idea is to consider
+%that the provided strategy is a pseudorandom Boolean vector obtained by a
+%given PRNG.
+%An iteration of the system is simply the bitwise exclusive or between
+%the last computed state and the current strategy.
+%Topological properties of disorder exhibited by chaotic
+%iterations can be inherited by the inputted generator; we hope by doing so to
+%obtain some statistical improvements while preserving speed.
+%
+%%RAPH: I removed all of this
+%% Let us give an example using 16-bit numbers, to clearly understand how the bitwise xor operations
+%% are done.
+%% Suppose that $x$ and the strategy $S^i$ are given as binary vectors.
+%% Table~\ref{TableExemple} shows the result of $x \oplus S^i$.
+%% \begin{table}
+%% \begin{scriptsize}
+%% $$
+%% \begin{array}{|cc|cccccccccccccccc|}
+%% \hline
+%% x &=&1&0&1&1&1&0&1&0&1&0&0&1&0&0&1&0\\
+%% \hline
+%% S^i &=&0&1&1&0&0&1&1&0&1&1&1&0&0&1&1&1\\
+%% \hline
+%% x \oplus S^i&=&1&1&0&1&1&1&0&0&0&1&1&1&0&1&0&1\\
+%% \hline
+%% \hline
+%% \end{array}
+%% $$
+%% \end{scriptsize}
+%% \caption{Example of an arbitrary round of the proposed generator}
+%% \label{TableExemple}
+%% \end{table}
+
+\lstset{language=C,caption={C code of the sequential PRNG based on chaotic iterations},label={algo:seqCIPRNG}}
+\begin{small}
+\begin{lstlisting}
+unsigned int CIPRNG() {
+  static unsigned int x = 123123123;  /* internal state of the generator */
+  unsigned long t1 = xorshift();      /* three 64-bit xor-like PRNGs */
+  unsigned long t2 = xor128();
+  unsigned long t3 = xorwow();
+  x = x^(unsigned int)t1;             /* xor with the 32 least significant bits */
+  x = x^(unsigned int)(t2>>32);       /* xor with the 32 most significant bits */
+  x = x^(unsigned int)(t3>>32);
+  x = x^(unsigned int)t2;
+  x = x^(unsigned int)(t1>>32);
+  x = x^(unsigned int)t3;
+  return x;
+}
+\end{lstlisting}
+\end{small}
+
+In Listing~\ref{algo:seqCIPRNG} a sequential version of the proposed PRNG based on chaotic iterations is presented. The xor operator is represented by \textasciicircum. This function uses three classical 64-bit PRNGs, namely the \texttt{xorshift}, the \texttt{xor128}, and the \texttt{xorwow}~\cite{Marsaglia2003}. In the following, we call them ``xor-like PRNGs''. As each xor-like PRNG produces 64-bit numbers whereas our proposed generator works with 32-bit ones, we use the cast \texttt{(unsigned int)}, which selects the 32 least significant bits of a given integer, and the code \texttt{(unsigned int)(t$>>$32)} in order to obtain the 32 most significant bits of \texttt{t}.
+
+Producing a pseudorandom number thus requires 6 xor operations on 32-bit numbers provided by 3 64-bit PRNGs. This version successfully passes the stringent BigCrush battery of tests~\cite{LEcuyerS07}. At this point, we have thus defined an efficient and statistically unbiased generator.
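+
+For completeness, we recall what such xor-like generators look like. The following sketch (ours, given only as an illustration) shows Marsaglia's classical \texttt{xor128} together with a 64-bit xorshift; the shift triples and seeds are those published in~\cite{Marsaglia2003}, but the exact 64-bit variants used in our implementation may differ.
+
+\begin{small}
+\begin{lstlisting}
+/* Marsaglia's xor128: 128 bits of state, period 2^128-1 */
+unsigned int xor128() {
+  static unsigned int x=123456789, y=362436069, z=521288629, w=88675123;
+  unsigned int t = x^(x<<11);
+  x=y; y=z; z=w;
+  return w = (w^(w>>19))^(t^(t>>8));
+}
+
+/* 64-bit xorshift with the classical (13,7,17) shift triple */
+unsigned long xorshift64() {
+  static unsigned long x = 88172645463325252UL;
+  x ^= x<<13; x ^= x>>7; x ^= x<<17;
+  return x;
+}
+\end{lstlisting}
+\end{small}
+These generators only use shift and xor instructions, which explains their speed.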
The speed of this generator is directly related to the use of linear operations, but for the same reason, this fast generator cannot be proven secure.
+
+\subsection{Efficient PRNGs based on Chaotic Iterations on GPU}
+\label{sec:efficient PRNG gpu}
+
+In order to benefit from the computing power of GPUs, a program needs to have independent blocks of threads that can be computed simultaneously. In general, the larger the number of threads, the smaller the amount of local memory used, and the fewer the branching instructions (if, while, etc.), the better the performance on the GPU. Obviously, having these requirements in mind, it is possible to build a program similar to the one presented in Listing~\ref{algo:seqCIPRNG}, which computes pseudorandom numbers on the GPU. To do so, we must first recall that in the CUDA~\cite{Nvid10} environment, threads have a local identifier called \texttt{ThreadIdx}, which is relative to the block containing them. Furthermore, in CUDA, the parts of the code that are executed by the GPU are called {\it kernels}.
+
+\subsection{Naive Version for GPU}
+
+A quite similar version adapted to the GPU can be deduced from the CPU one. The principle is simply to have each thread of the GPU compute the CPU version of our PRNG. Of course, the three xor-like PRNGs used in these computations must have different parameters. In a given thread, these parameters are randomly picked from another PRNG. The initialization stage is performed by the CPU. To do so, the ISAAC PRNG~\cite{Jenkins96} is used to set all the parameters embedded into each thread.
+
+The implementation of the three xor-like PRNGs is straightforward when their parameters have been allocated in the GPU memory. Each xor-like PRNG works with an internal number $x$ that stores the last generated pseudorandom number. Additionally, the implementations of the xor128, the xorshift, and the xorwow respectively require 4, 5, and 6 unsigned longs as internal variables.
+
+\begin{algorithm}
+\begin{small}
+\KwIn{InternalVarXorLikeArray: array with the internal variables of the 3 xor-like PRNGs in global memory\;
+NumThreads: number of threads\;}
+\KwOut{NewNb: array containing random numbers in global memory}
+\If{threadIdx is concerned by the computation} {
+  retrieve data from InternalVarXorLikeArray[threadIdx] in local variables\;
+  \For{i=1 to n} {
+    compute a new pseudorandom number as in Listing~\ref{algo:seqCIPRNG}\;
+    store this number in NewNb[NumThreads*threadIdx+i]\;
+  }
+  store the internal variables in InternalVarXorLikeArray[threadIdx]\;
+}
+\end{small}
+\caption{Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations}
+\label{algo:gpu_kernel}
+\end{algorithm}
+
+Algorithm~\ref{algo:gpu_kernel} presents a naive implementation of the proposed PRNG on GPU. Due to the memory available on the GPU and the number of threads used simultaneously, the number of random numbers that a thread can generate inside a kernel is limited (\emph{i.e.}, the variable \texttt{n} in Algorithm~\ref{algo:gpu_kernel}). For instance, if $100,000$ threads are used and if $n=100$\footnote{in fact, we also need to add the initial seed (a 32-bit number)}, then the memory required to store all the internal variables of the xor-like PRNGs\footnote{we multiply this number by $2$ in order to count 32-bit numbers} and the pseudorandom numbers generated by our PRNG is equal to $100,000\times ((4+5+6)\times 2+(1+100))=13,100,000$ 32-bit numbers, that is, approximately $52$MB.
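+
+To make the naive version more concrete, a possible CUDA realization of Algorithm~\ref{algo:gpu_kernel} is sketched below. This is only an illustration, not the code actually benchmarked: for brevity, each thread carries three independent 64-bit xorshift states instead of the full xorshift/xor128/xorwow trio, the seed handling is simplified, and all names (\texttt{ciprng\_naive}, \texttt{state}, \texttt{out}) are ours.
+
+\begin{small}
+\begin{lstlisting}
+__device__ unsigned long long xs64(unsigned long long *s) {
+  unsigned long long x = *s;        /* one 64-bit xorshift step */
+  x ^= x<<13; x ^= x>>7; x ^= x<<17;
+  return *s = x;
+}
+
+__global__ void ciprng_naive(unsigned long long *state,
+                             unsigned int *out, int nbThreads, int n) {
+  int tid = blockIdx.x*blockDim.x + threadIdx.x;
+  if (tid >= nbThreads) return;
+  /* retrieve the internal variables in local registers */
+  unsigned long long s1=state[3*tid], s2=state[3*tid+1], s3=state[3*tid+2];
+  unsigned int x = (unsigned int)s1;     /* illustrative seed */
+  for (int i = 0; i < n; i++) {          /* the 6 xors of the CIPRNG listing */
+    unsigned long long t1=xs64(&s1), t2=xs64(&s2), t3=xs64(&s3);
+    x ^= (unsigned int)t1;       x ^= (unsigned int)(t2>>32);
+    x ^= (unsigned int)(t3>>32); x ^= (unsigned int)t2;
+    x ^= (unsigned int)(t1>>32); x ^= (unsigned int)t3;
+    /* store in global memory, packed contiguously per thread */
+    out[(unsigned long long)tid*n + i] = x;
+  }
+  /* store the internal variables back */
+  state[3*tid]=s1; state[3*tid+1]=s2; state[3*tid+2]=s3;
+}
+\end{lstlisting}
+\end{small}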
+
+This generator is able to pass the whole BigCrush battery of tests, for all the versions that have been tested, whatever their number of threads (called \texttt{NumThreads} in our algorithm, and tested up to $5$ million).
+
+\subsection{Improved Version for GPU}
+
+As GPU cards using CUDA have shared memory between the threads of the same block, it is possible to use this feature in order to simplify the previous algorithm, i.e., to use fewer than 3 xor-like PRNGs. The solution consists in computing only one xor-like PRNG per thread, saving its result into shared memory, and then using the results of some other threads in the same block. In order to define which thread uses the results of which other ones, we can use combination arrays that contain the indexes of all the threads for which a combination has been performed.
+
+In Algorithm~\ref{algo:gpu_kernel2}, two combination arrays are used. The variable \texttt{offset} is computed using the value of \texttt{combination\_size}. Then we can compute \texttt{o1} and \texttt{o2}, representing the indexes of the other threads whose results are used by the current one. In this algorithm, we consider that a 32-bit xor-like PRNG has been chosen. In practice, we use the xor128 proposed in~\cite{Marsaglia2003}, in which unsigned longs (64 bits) have been replaced by unsigned integers (32 bits).
+
+This version can also pass the whole {\it BigCrush} battery of tests.
+
+\begin{algorithm}
+\begin{small}
+\KwIn{InternalVarXorLikeArray: array with the internal variables of 1 xor-like PRNG in global memory\;
+NumThreads: number of threads\;
+array\_comb1, array\_comb2: arrays containing combinations of size combination\_size\;}
+\KwOut{NewNb: array containing random numbers in global memory}
+\If{threadIdx is concerned} {
+  retrieve data from InternalVarXorLikeArray[threadIdx] in local variables including shared memory and x\;
+  offset = threadIdx\%combination\_size\;
+  o1 = threadIdx-offset+array\_comb1[offset]\;
+  o2 = threadIdx-offset+array\_comb2[offset]\;
+  \For{i=1 to n} {
+    t=xor-like()\;
+    t=t\textasciicircum shmem[o1]\textasciicircum shmem[o2]\;
+    shmem[threadIdx]=t\;
+    x = x\textasciicircum t\;
+    store the new number in NewNb[NumThreads*threadIdx+i]\;
+  }
+  store the internal variables in InternalVarXorLikeArray[threadIdx]\;
+}
+\end{small}
+\caption{Main kernel for the GPU efficient version of the chaotic iterations based PRNG\label{IR}}
+\label{algo:gpu_kernel2}
+\end{algorithm}
+
+\subsection{Chaos Evaluation of the Improved Version}
+
+A run of Algorithm~\ref{algo:gpu_kernel2} consists of an operation ($x=x\oplus t$) having the form of Equation~\ref{equation Oplus}, which is equivalent to the iterative system of Eq.~\ref{eq:generalIC}. That is, an iteration of the general chaotic iterations is realized between the last stored value $x$ of the thread and a strategy $t$ (obtained by a bitwise exclusive or between a value provided by a xor-like() call and two values previously obtained by two other threads). To be certain that we are in the framework of Theorem~\ref{t:chaos des general}, we must guarantee that this dynamical system iterates on the space $\mathcal{X} = \mathcal{P}\left(\llbracket 1, \mathsf{N} \rrbracket\right)^\mathds{N}\times\mathds{B}^\mathsf{N}$. The left term $x$ obviously belongs to $\mathds{B}^\mathsf{N}$.
+To prevent any flaw in the chaotic properties, we must check that the right term (the last $t$), corresponding to the strategies, can possibly be equal to any integer of $\llbracket 1, \mathsf{N} \rrbracket$.
+
+Such a result is obvious: for the xor-like(), all the integers belonging to its interval of definition can occur at each iteration, and thus the last $t$ respects the requirement. Furthermore, it is possible to prove by an immediate mathematical induction that, as the initial $x$ is uniformly distributed (it is provided by a cryptographically secure PRNG), the two other stored values shmem[o1] and shmem[o2] are uniformly distributed too (this is the induction hypothesis), and thus the next $x$ is uniformly distributed as well.
+
+Thus Algorithm~\ref{algo:gpu_kernel2} is a concrete realization of the general chaotic iterations presented previously, and for this reason it satisfies Devaney's formulation of chaotic behavior.
+
+\section{Experiments}
+\label{sec:experiments}
+
+Different experiments have been performed in order to measure the generation speed. We have used a first computer equipped with a Tesla C1060 NVidia GPU card and an Intel Xeon E5530 clocked at 2.40 GHz, and a second computer equipped with a smaller CPU and a GeForce GTX 280. Both cards have 240 cores.
+
+In Figure~\ref{fig:time_xorlike_gpu} we compare the quantity of pseudorandom numbers generated per second with various xor-like based PRNGs. In this figure, the optimized versions use the {\it xor64} described in~\cite{Marsaglia2003}, whereas the naive versions embed the three xor-like PRNGs described in Listing~\ref{algo:seqCIPRNG}. In order to obtain optimal performance, the storage of pseudorandom numbers into the GPU memory has been removed. This step is time consuming and slows down number generation. Moreover, this storage is completely useless for applications that consume the pseudorandom numbers directly after generation. We can see that when the number of threads is greater than approximately 30,000 and lower than 5 million, the number of pseudorandom numbers generated per second is almost constant. With the naive version, this value ranges from 2.5 to 3 GSamples/s. With the optimized version, it is approximately equal to 20 GSamples/s. Finally, we can remark that both GPU cards give quite similar results, but in practice the Tesla C1060 has more memory than the GTX 280, and this memory should be of better quality. As a comparison, Listing~\ref{algo:seqCIPRNG} leads to the generation of about 138 MSamples/s when using one core of the Xeon E5530.
+
+\begin{figure}[htbp]
+\begin{center}
+ \includegraphics[scale=0.65]{Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf}
+\end{center}
+\caption{Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG}
+\label{fig:time_xorlike_gpu}
+\end{figure}
+
+In Figure~\ref{fig:time_bbs_gpu} we highlight the performance of the optimized BBS-based PRNG on GPU. On the Tesla C1060 we obtain approximately 700 MSamples/s and on the GTX 280 about 670 MSamples/s, which is obviously slower than the xorlike-based PRNG on GPU. However, we will show in the next sections that this new PRNG has a strong level of security, which is necessarily paid for by a reduction in speed.
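+
+As a reminder, a BBS generator iterates $x_{i+1} = x_i^2 \bmod M$, where $M=pq$ is the product of two large primes both congruent to $3$ modulo $4$, and only a few least significant bits of each $x_i$ are output. The toy sketch below (ours) uses a deliberately tiny, and thus insecure, modulus chosen only for readability; the GPU version discussed above obviously works with much larger numbers.
+
+\begin{small}
+\begin{lstlisting}
+/* Toy Blum-Blum-Shub step: x_{i+1} = x_i^2 mod M, one output bit per step.
+   M = 11*23 = 253 (both primes are congruent to 3 mod 4);
+   a real BBS modulus has hundreds of digits. */
+unsigned int bbs_bit() {
+  static unsigned long long x = 3;        /* seed, coprime with M */
+  const unsigned long long M = 11ULL*23ULL;
+  x = (x*x) % M;
+  return (unsigned int)(x & 1);           /* keep the least significant bit */
+}
+\end{lstlisting}
+\end{small}
+The modular squaring is the reason why BBS is both provably secure (under the assumed hardness of factoring $M$) and much slower than the purely linear xor-like operations.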
+
+\begin{figure}[htbp]
+\begin{center}
+ \includegraphics[scale=0.65]{Chapters/chapter18/figures/curve_time_bbs_gpu.pdf}
+\end{center}
+\caption{Quantity of pseudorandom numbers generated per second using the BBS-based PRNG}
+\label{fig:time_bbs_gpu}
+\end{figure}
+
+All these experiments allow us to conclude that it is possible to generate a very large quantity of statistically perfect pseudorandom numbers with the xor-like version. To a certain extent, this is also the case with the secure BBS-based version, the speed deflation being explained by the fact that the former version has ``only'' chaotic properties and statistical perfection, whereas the latter is also cryptographically secure, as shown in the next sections.
+
+\putbib[Chapters/chapter18/biblio]
diff --git a/BookGPU/Chapters/chapter18/ch18.tex~ b/BookGPU/Chapters/chapter18/ch18.tex~ new file mode 100755 index 0000000..6c3f1f1 --- /dev/null +++ b/BookGPU/Chapters/chapter18/ch18.tex~ @@ -0,0 +1,296 @@
+\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte}
+
+\chapter{Presentation of the GPU architecture and of the Cuda environment}
+\label{chapter1}
+
+\section{Introduction}\label{ch1:intro}
+
+This chapter introduces the Graphics Processing Unit (GPU) architecture and all the concepts needed to understand how GPUs work and how they can be used to speed up the execution of some algorithms. First of all, this chapter gives a brief history of the development of graphics cards, up to the time when they started to be used for general purpose computation. Then the architecture of a GPU is illustrated. There are many fundamental differences between a GPU and a traditional processor. In order to benefit from the power of a GPU, a Cuda programmer needs to use threads. These have some particularities which enable the Cuda model to be efficient and scalable when some constraints are addressed.
+
+\section{Brief history of video cards}
+
+Video cards, or graphics cards, have been introduced in personal computers to produce high quality graphics faster than classical Central Processing Units (CPUs) and to relieve the CPU of this task. In general, display tasks are very repetitive and very specific. Hence, some manufacturers have produced more and more sophisticated video cards, providing 2D acceleration, then 3D acceleration, then some lighting transforms. Video cards have their own memory to perform their computations. For at least two decades, every personal computer has had a video card, either a simple one for desktop computers or one providing many accelerations for game and/or graphics oriented computers. In the latter case, the graphics card may be more expensive than the CPU.
+
+Since 2000, video cards have allowed users to apply arithmetic operations simultaneously on a sequence of pixels, an approach later called stream processing. In this case, the information of the pixels (color, location, and other data) is combined in order to produce a pixel color that can be displayed on a screen. Simultaneous computations are provided by shaders, which calculate rendering effects on graphics hardware with a high degree of flexibility. These shaders handle the stream data with pipelines.
+
+Some researchers tried to apply those operations to other data, representing something different from pixels, and consequently this resulted in the first uses of video cards for performing general purpose computation.
The programming model was not easy to use at all and was very dependent on the hardware constraints. More precisely, it consisted in using either DirectX or OpenGL functions providing an interface to some classical video operations (memory transfers, texture manipulation, etc.). Floating point operations were most of the time out of reach. Obviously, when something went wrong, programmers had no way (and no tools) to detect it.
+
+\section{GPGPU}
+
+In order to benefit from the computing power of more recent video cards, Cuda was first proposed in 2007 by NVidia. It unifies the programming model for some of their most powerful video cards. Cuda~\cite{ch1:cuda} has quickly been considered by the scientific community as a great advance for general purpose graphics processing unit (GPGPU) computing. Of course, other programming models have been proposed. The other well-known alternative is OpenCL, which aims at proposing an alternative to Cuda and which is multi-platform and portable. This is a great advantage since it is even possible to execute OpenCL programs on traditional CPUs. The main drawback is that it is less closely tied to the hardware and consequently sometimes provides less efficient programs. Moreover, Cuda benefits from more mature compilation and optimization procedures. Other less known environments have been proposed, but most of them have been discontinued; for example, we can cite FireStream by ATI, which is not maintained anymore and has been replaced by OpenCL, and BrookGPU by Stanford University~\cite{ch1:Buck:2004:BGS}. Another environment based on pragmas (the insertion of pragma directives inside the code to help the compiler generate efficient code) is called OpenACC. For a comparison with OpenCL, interested readers may refer to~\cite{ch1:CMR:12}.
+
+\section{Architecture of current GPUs}
+
+The architecture \index{architecture of a GPU} of current GPUs is constantly evolving. Nevertheless, some trends remain constant throughout this evolution. The processing units composing a GPU are far simpler than those of a traditional CPU, which makes it much easier to integrate many computing units inside a GPU card than to do so with many cores inside a CPU. In 2012, the most powerful GPUs have more than 500 cores and the most powerful CPUs have 8 cores. Figure~\ref{ch1:fig:comparison_cpu_gpu} shows the number of cores inside a CPU and inside a GPU. In fact, in a current NVidia GPU there are multiprocessors which have 32 cores (for example on Fermi cards). The core clock of a CPU is generally around 3 GHz, whereas that of a GPU is about 1.5 GHz. Although the core clock of GPU cores is slower, the number of cores inside a GPU provides more computational power. This measure is commonly represented by the number of floating point operations per second. Nowadays the most powerful GPUs provide more than 1 TFlops, i.e. $10^{12}$ floating point operations per second. Nevertheless, GPUs are very efficient at performing some operations but not all kinds of operations. They are very efficient at executing repetitive work in which only the data change. It is important to keep in mind that multiprocessors inside a GPU have 32 cores. Later we will see that these 32 cores need to do the same work to get maximum performance.
+
+\begin{figure}[b!]
+\centerline{\includegraphics[]{Chapters/chapter1/figures/nb_cores_CPU_GPU.pdf}}
+\caption{Comparison of number of cores in a CPU and in a GPU.}
+%[Comparison of number of cores in a CPU and in a GPU]
+\label{ch1:fig:comparison_cpu_gpu}
+\end{figure}
+
+On the most powerful GPU cards, called Fermi, multiprocessors are called streaming multiprocessors (SMs). Each SM contains 32 cores and is able to perform 32 floating point or integer operations on 32-bit numbers per clock, or 16 floating point operations on 64-bit numbers per clock. SMs have their own registers, execution pipelines, and caches. On the Fermi architecture, there are 64 KB of shared memory plus L1 cache and 32,768 32-bit registers per SM. More precisely, the programmer can decide what amount of shared memory and L1 cache an SM can use, under the constraint that the sum of both amounts is less than or equal to 64 KB.
+
+Threads are used to benefit from the large number of cores of a GPU. These threads are different from traditional CPU threads. In Chapter~\ref{chapter2}, some examples of GPU programming will make the details of GPU threads explicit. Threads are gathered into groups of 32 threads, called ``warps''. Those warps are important when designing an algorithm for a GPU.
+
+Another big difference between CPUs and GPUs is memory latency. In a CPU, everything is optimized to obtain a low latency architecture. This is possible through the use of cache memories. Moreover, nowadays CPUs perform many performance optimizations such as speculative execution, which, roughly speaking, consists in executing a small part of code in advance even if this work later reveals itself to be useless. On the contrary, GPUs do not have low latency memory and, in comparison, have small cache memories. Nevertheless, the architecture of GPUs is optimized for throughput computation and takes memory latency into account.
+
+\begin{figure}[b!]
+\centerline{\includegraphics[scale=0.7]{Chapters/chapter1/figures/low_latency_vs_high_throughput.pdf}}
+\caption{Comparison of low latency of CPU and high throughput of GPU.}
+\label{ch1:fig:latency_throughput}
+\end{figure}
+
+Figure~\ref{ch1:fig:latency_throughput} illustrates the main difference in memory latency between a CPU and a GPU. In a CPU, tasks ``ti'' are executed one by one with a short memory latency to get the data to process. After some tasks, there is a context switch that allows the CPU to run concurrent applications and/or multi-threaded applications. Memory latencies are longer in a GPU; there, the principle used to obtain a high throughput is to have many tasks to compute. Later we will see that those tasks are called threads with Cuda. With this principle, as soon as a task is finished the next one is ready to be executed, while the wait for data of the previous task is overlapped by the computation of other tasks.
+
+\section{Kinds of parallelism}
+
+Many kinds of parallelism are available depending on the type of hardware. Roughly speaking, there are three classes of parallelism: instruction-level parallelism, data parallelism, and task parallelism.
+
+Instruction-level parallelism consists in re-ordering some instructions in order to execute some of them in parallel without changing the result of the code. In modern CPUs, instruction pipelines allow the processor to execute instructions faster. With a pipeline, a processor can execute multiple instructions simultaneously, because the output of one stage is the input of the next one.
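+
+The following small C function (ours, purely illustrative) shows the idea: the first two additions are independent and can be executed in parallel by the pipeline, whereas the last one depends on both results and must wait for them.
+
+\begin{small}
+\begin{lstlisting}
+int ilp_example(int b, int c, int e, int f) {
+  int a = b + c;   /* independent of the next line: can be pipelined */
+  int d = e + f;   /* may execute in parallel with the previous one */
+  return a + d;    /* depends on both results: must come last */
+}
+\end{lstlisting}
+\end{small}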
+
+Data parallelism consists in executing the same program with different data on different computing units. Of course, no dependency should exist between the data. For example, it is easy to parallelize loops without dependency using the data parallelism paradigm. This paradigm is linked with the Single Instruction Multiple Data (SIMD) architecture. This is the kind of parallelism provided by GPUs.
+
+Task parallelism is the common parallelism achieved on clusters, grids, and high performance architectures, where different tasks are executed by different computing units.
+
+\section{Cuda Multithreading}
+
+The data parallelism of Cuda is more precisely based on the Single Instruction Multiple Thread (SIMT) model. This is due to the fact that the programmer accesses the cores through threads. In the Cuda model, all cores execute the same set of instructions but with different data. This model has similarities with the vector programming model proposed for vector machines through the 1970s into the 90s, notably the various Cray platforms. On the Cuda architecture, the performance is driven by the use of a huge number of threads (from thousands up to millions). The particularity of the model is that there is no context switching as in CPUs, and each thread has its own registers. In practice, threads are executed by an SM and are gathered into groups of 32 threads. Those groups are called ``warps''. Each SM alternately executes ``active warps'' and warps that become temporarily inactive while waiting for data (as shown in Figure~\ref{ch1:fig:latency_throughput}).
+
+The key to scalability in the Cuda model is the use of a huge number of threads. In practice, threads are not only gathered in warps but also in thread blocks. A thread block is executed by only one SM and it cannot migrate. The typical size of a thread block is a power of two (for example: 64, 128, 256, or 512).
+
+In this way, without changing anything inside a Cuda code, it is possible to run it on a small Cuda device or on the most powerful Tesla Cuda cards. Blocks are executed in any order depending on the number of SMs available. So the programmer must design the code with this issue in mind. This independence between thread blocks provides the scalability of Cuda codes.
+
+\begin{figure}[b!]
+\centerline{\includegraphics[scale=0.65]{Chapters/chapter1/figures/scalability.pdf}}
+\caption{Scalability of GPU.}
+\label{ch1:fig:scalability}
+\end{figure}
+
+A kernel is a function which contains a block of instructions that are executed by the threads of a GPU. When the problem considered is a two or three dimensional problem, it is possible to group thread blocks into a grid. In practice, the number of thread blocks and the size of thread blocks are given as parameters to each kernel. Figure~\ref{ch1:fig:scalability} illustrates an example of a kernel composed of 8 thread blocks. If this kernel is executed on a small device containing only 2 SMs, blocks are executed 2 by 2 in any order. If the kernel is executed on a larger Cuda device containing 4 SMs, blocks are executed 4 by 4 simultaneously. The execution should be approximately twice as fast in the latter case. Of course, that depends on other parameters that will be described later.
+
+Thread blocks provide a way to cooperate, in the sense that threads of the same block cooperatively load and store blocks of memory they all use; a small kernel sketch of such cooperation is given below.
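+
+As an illustration, the following hypothetical kernel (ours; the fixed block size of 256 threads is an assumption) sums the elements of an array chunk handled by one block: each thread loads one element into shared memory, and the threads of the block then cooperate to reduce these values into a single result per block.
+
+\begin{small}
+\begin{lstlisting}
+__global__ void block_sum(const float *in, float *out, int n) {
+  __shared__ float buf[256];           /* one entry per thread of the block */
+  int tid = threadIdx.x;
+  int i = blockIdx.x*blockDim.x + tid;
+  buf[tid] = (i < n) ? in[i] : 0.0f;   /* cooperative load */
+  __syncthreads();                     /* wait until all loads are done */
+  for (int s = blockDim.x/2; s > 0; s >>= 1) {
+    if (tid < s) buf[tid] += buf[tid+s];  /* pairwise partial sums */
+    __syncthreads();
+  }
+  if (tid == 0) out[blockIdx.x] = buf[0]; /* one result per block */
+}
+\end{lstlisting}
+\end{small}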
Synchronization of threads within the same block is possible (but not between threads of different blocks). Threads of the same block can also share results in order to compute a single result. In Chapter~\ref{chapter2}, some examples will illustrate this.
+
+\section{Memory hierarchy}
+
+The memory hierarchy of GPUs\index{memory~hierarchy} is different from that of CPUs. In practice, there are registers\index{memory~hierarchy!registers}, local memory\index{memory~hierarchy!local~memory}, shared memory\index{memory~hierarchy!shared~memory}, cache memory\index{memory~hierarchy!cache~memory}, and global memory\index{memory~hierarchy!global~memory}.
+
+As previously mentioned, each thread can access its own registers. It is important to keep in mind that the number of registers per block is limited. On recent cards, this number is limited to 64K registers per SM. Access to registers is very fast, so it is a good idea to use them whenever possible.
+
+Likewise, each thread can access local memory which, in practice, is much slower than registers. Local memory is automatically used by the compiler when all the registers are occupied. So the best idea is to optimize the use of registers, even if this implies reducing the number of threads per block.
+
+\begin{figure}[hbtp!]
+\centerline{\includegraphics[scale=0.60]{Chapters/chapter1/figures/memory_hierarchy.pdf}}
+\caption{Memory hierarchy of a GPU.}
+\label{ch1:fig:memory_hierarchy}
+\end{figure}
+
+Shared memory allows cooperation between the threads of the same block. This kind of memory is fast, but it needs to be manipulated manually, and its size is limited. It is accessible during the execution of a kernel. So the principle is to fill the shared memory at the start of the kernel with the global data that are used very frequently; then the threads can access it for their computations. They can obviously change the content of this shared memory, either with computations or by loading other data, and they can store its content back in global memory. So shared memory can be seen as a cache memory which is managed manually. This obviously requires an effort from the programmer.
+
+On recent cards, the programmer may decide what amount of cache memory and shared memory is attributed to a kernel. The cache memory is an L1 cache which is directly managed by the GPU. Sometimes this cache provides very efficient results, and sometimes the use of shared memory is a better solution.
+
+Figure~\ref{ch1:fig:memory_hierarchy} illustrates the memory hierarchy of a GPU. Threads are represented on the top of the figure. They can access their own registers and their local memory. Threads of the same block can access the shared memory of this block. The cache memory is not represented here, but it is local to a thread. Then each block can access the global memory of the GPU.
+
+\section{Conclusion}
+
+In this chapter, a brief presentation of the video card, which has later been used to perform computation, has been given. The architecture of a GPU has been illustrated, focusing on the particularities of GPUs in terms of parallelism, memory latency, and threads. In order to design an efficient algorithm for GPU, it is essential to have all these parameters in mind.
+
+
+\putbib[Chapters/chapter1/biblio]
+
diff --git a/BookGPU/Chapters/chapter18/figures/curve_time_bbs_gpu.pdf b/BookGPU/Chapters/chapter18/figures/curve_time_bbs_gpu.pdf new file mode 100644 index 0000000000000000000000000000000000000000..a6a1080ad960e9f60d4952c52c0eaa979f7db36f GIT binary patch literal 7587 [base85-encoded binary data omitted]
ze*}xW8Im`v2;J^G8H=krtCK2)?FG{9K$I#HToANSg`l%p0ADcKjqSsO=0H(XBco_| z1PqV)2qX`dk2jOe1F))C zz?;mYcru|IITJw6Nu4E~$J7M*-ucNm|Dw6b-lnGix52!;_;Jj~Q`cuxe7vAqhOz#C z872RsW8Ep=)%5n^FlQHnI|ZaNseFyNkZ)0tp6(QN8k^%q=7+JjH&kgM^PuPkp%{dd zG@(HroyVk-W|x@`lQ-93()r6qok?bSXugntf$KU`o5`}{BplmRUqOZ;epipQcQVoO0VU>|6I1&k8;Kb%AEV$$M zRSTh5^J7Z^YCvVLJyZ(sJ1*aK5Lm~D=LvCGhz!N=C1-yQl?Ihxp&}qZ&(1vn9*2YU zqXF|U6afVlF!>L_T7VHSc&N<%D-4ClLuJz6U^wI-v`8chD$oChhb8`jha&Kc%D>?e ziGR?d(Q1%cf5$`rfrldE{(zzJ7^tZHD=m(Q`U%G4kf8!32l`A6fL8QC=-UGp04>>2 z_MH9f0D#6U8XK6cPC_MCeZWx@O(fzmSOT7;0S1#yu5+AA?$wu_;$Hx literal 0 HcmV?d00001 diff --git a/BookGPU/Chapters/chapter18/preamble.aux b/BookGPU/Chapters/chapter18/preamble.aux new file mode 100644 index 0000000..65abe7e --- /dev/null +++ b/BookGPU/Chapters/chapter18/preamble.aux @@ -0,0 +1,32 @@ +\relax +\@setckpt{Chapters/chapter1/preamble}{ +\setcounter{page}{1} +\setcounter{equation}{0} +\setcounter{enumi}{0} +\setcounter{enumii}{0} +\setcounter{enumiii}{0} +\setcounter{enumiv}{0} +\setcounter{footnote}{0} +\setcounter{mpfootnote}{0} +\setcounter{part}{0} +\setcounter{chapter}{0} +\setcounter{section}{0} +\setcounter{subsection}{0} +\setcounter{subsubsection}{0} +\setcounter{paragraph}{0} +\setcounter{subparagraph}{0} +\setcounter{figure}{0} +\setcounter{table}{0} +\setcounter{numauthors}{0} +\setcounter{parentequation}{0} +\setcounter{subfigure}{0} +\setcounter{lofdepth}{1} +\setcounter{subtable}{0} +\setcounter{lotdepth}{1} +\setcounter{lstnumber}{1} +\setcounter{ContinuedFloat}{0} +\setcounter{AlgoLine}{0} +\setcounter{algocfline}{0} +\setcounter{algocfproc}{0} +\setcounter{algocf}{0} +} -- 2.39.5