From 013c063fb5cb99cc33b413d937f330acc42ca73e Mon Sep 17 00:00:00 2001 From: couturie <couturie@extinction> Date: Wed, 6 Feb 2013 14:19:18 +0100 Subject: [PATCH] ch18 --- BookGPU/Chapters/chapter18/biblio.bib | 35 ++ BookGPU/Chapters/chapter18/ch1.aux | 64 +++ BookGPU/Chapters/chapter18/ch18.aux | 71 ++++ BookGPU/Chapters/chapter18/ch18.tex | 392 ++++++++++++++++++ BookGPU/Chapters/chapter18/ch18.tex~ | 296 +++++++++++++ .../chapter18/figures/curve_time_bbs_gpu.pdf | Bin 0 -> 7587 bytes .../figures/curve_time_xorlike_gpu.pdf | Bin 0 -> 8005 bytes BookGPU/Chapters/chapter18/preamble.aux | 32 ++ 8 files changed, 890 insertions(+) create mode 100644 BookGPU/Chapters/chapter18/biblio.bib create mode 100644 BookGPU/Chapters/chapter18/ch1.aux create mode 100644 BookGPU/Chapters/chapter18/ch18.aux create mode 100755 BookGPU/Chapters/chapter18/ch18.tex create mode 100755 BookGPU/Chapters/chapter18/ch18.tex~ create mode 100644 BookGPU/Chapters/chapter18/figures/curve_time_bbs_gpu.pdf create mode 100644 BookGPU/Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf create mode 100644 BookGPU/Chapters/chapter18/preamble.aux diff --git a/BookGPU/Chapters/chapter18/biblio.bib b/BookGPU/Chapters/chapter18/biblio.bib new file mode 100644 index 0000000..f3b0ccc --- /dev/null +++ b/BookGPU/Chapters/chapter18/biblio.bib @@ -0,0 +1,35 @@ +@misc{ch1:cuda, + author = {{NVIDIA Corporation}}, + keywords = {CUDA}, + note = {Version 4.0}, + title = {{NVIDIA CUDA C} Programming Guide}, + year = 2011 +} + +@Article{ch1:Buck:2004:BGS, + author = "I. Buck and T. Foley and D. Horn and J. + Sugerman and K. Fatahalian and M. Houston and P. + Hanrahan", + title = "{Brook} for {GPUs}: stream computing on graphics + hardware", + journal = "ACM Transactions on Graphics", + volume = "23", + number = "3", + pages = "777--786", + month = aug, + year = "2004", +} + + +@article{ch1:CMR:12, + author = "B. Cloutier and B. K. Muite and P. 
Rigge", + title = "A comparison of CPU and GPU performance for Fourier pseudospectral simulations of the Navier-Stokes, Cubic Nonlinear Schrödinger and Sine Gordon Equations", + journal = "Computational Physics (physics.comp-ph)", + year = "2012", + archivePrefix = "arXiv", + eprint = "1206.3215", + primaryClass = "physics.comp-ph", +} + + + diff --git a/BookGPU/Chapters/chapter18/ch1.aux b/BookGPU/Chapters/chapter18/ch1.aux new file mode 100644 index 0000000..2c84cf4 --- /dev/null +++ b/BookGPU/Chapters/chapter18/ch1.aux @@ -0,0 +1,64 @@ +\relax +\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} +\@writefile{loa}{\addvspace {10\p@ }} +\@writefile{toc}{\contentsline {chapter}{\numberline {1}Presentation of the GPU architecture and of the Cuda environment}{3}} +\@writefile{lof}{\addvspace {10\p@ }} +\@writefile{lot}{\addvspace {10\p@ }} +\newlabel{chapter1}{{1}{3}} +\@writefile{toc}{\contentsline {section}{\numberline {1.1}Introduction}{3}} +\newlabel{ch1:intro}{{1.1}{3}} +\@writefile{toc}{\contentsline {section}{\numberline {1.2}Brief history of Video Card}{3}} +\@writefile{toc}{\contentsline {section}{\numberline {1.3}GPGPU}{4}} +\@writefile{toc}{\contentsline {section}{\numberline {1.4}Architecture of current GPUs}{5}} +\@writefile{lof}{\contentsline {figure}{\numberline {1.1}{\ignorespaces Comparison of number of cores in a CPU and in a GPU.\relax }}{5}} +\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}} +\newlabel{ch1:fig:comparison_cpu_gpu}{{1.1}{5}} +\@writefile{lof}{\contentsline {figure}{\numberline {1.2}{\ignorespaces Comparison of low latency of CPU and high throughput of GPU.\relax }}{6}} +\newlabel{ch1:fig:latency_throughput}{{1.2}{6}} +\@writefile{toc}{\contentsline {section}{\numberline {1.5}Kinds of parallelism}{7}} +\@writefile{toc}{\contentsline {section}{\numberline {1.6}Cuda Multithreading}{7}} +\@writefile{lof}{\contentsline {figure}{\numberline {1.3}{\ignorespaces Scalability of GPU.\relax }}{8}} +\newlabel{ch1:fig:scalability}{{1.3}{8}} +\@writefile{toc}{\contentsline {section}{\numberline {1.7}Memory hierarchy}{9}} +\@writefile{lof}{\contentsline {figure}{\numberline {1.4}{\ignorespaces Memory hierarchy of a GPU.\relax }}{10}} +\newlabel{ch1:fig:memory_hierarchy}{{1.4}{10}} +\@writefile{toc}{\contentsline {section}{\numberline {1.8}Conclusion}{10}} +\@writefile{toc}{\contentsline {section}{Bibliography}{11}} +\@setckpt{Chapters/chapter1/ch1}{ +\setcounter{page}{12} +\setcounter{equation}{0} +\setcounter{enumi}{0} +\setcounter{enumii}{0} +\setcounter{enumiii}{0} +\setcounter{enumiv}{3} +\setcounter{footnote}{0} +\setcounter{mpfootnote}{0} +\setcounter{part}{1} +\setcounter{chapter}{1} +\setcounter{section}{8} +\setcounter{subsection}{0} +\setcounter{subsubsection}{0} +\setcounter{paragraph}{0} +\setcounter{subparagraph}{0} +\setcounter{figure}{4} +\setcounter{table}{0} +\setcounter{numauthors}{0} +\setcounter{parentequation}{0} +\setcounter{subfigure}{0} +\setcounter{lofdepth}{1} +\setcounter{subtable}{0} +\setcounter{lotdepth}{1} +\setcounter{lstnumber}{1} +\setcounter{ContinuedFloat}{0} +\setcounter{AlgoLine}{0} +\setcounter{algocfline}{0} +\setcounter{algocfproc}{0} +\setcounter{algocf}{0} +\setcounter{proposition}{0} +\setcounter{theorem}{0} +\setcounter{exercise}{0} +\setcounter{example}{0} +\setcounter{definition}{0} +\setcounter{proof}{0} +\setcounter{lstlisting}{0} +} diff --git a/BookGPU/Chapters/chapter18/ch18.aux b/BookGPU/Chapters/chapter18/ch18.aux new file mode 100644 index 0000000..f9f97ae --- /dev/null +++ 
b/BookGPU/Chapters/chapter18/ch18.aux @@ -0,0 +1,71 @@ +\relax +\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} +\@writefile{toc}{\author{Christophe Guyeux}{}} +\@writefile{loa}{\addvspace {10\p@ }} +\@writefile{toc}{\contentsline {chapter}{\numberline {14}Pseudo Random Number Generator on GPU}{305}} +\@writefile{lof}{\addvspace {10\p@ }} +\@writefile{lot}{\addvspace {10\p@ }} +\newlabel{chapter18}{{14}{305}} +\@writefile{toc}{\contentsline {section}{\numberline {14.1}Introduction}{305}} +\@writefile{toc}{\contentsline {section}{\numberline {14.2}Basic Recalls}{306}} +\newlabel{section:BASIC RECALLS}{{14.2}{306}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.2.1}Devaney's Chaotic Dynamical Systems}{306}} +\@writefile{toc}{\contentsline {section}{\numberline {14.3}Toward Efficiency and Improvement for CI PRNG}{306}} +\newlabel{sec:efficient PRNG}{{14.3}{306}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{306}} +\newlabel{algo:seqCIPRNG}{{14.1}{306}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {14.1}C code of the sequential PRNG based on chaotic iterations}{306}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{307}} +\newlabel{sec:efficient PRNG gpu}{{14.3.2}{307}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.3}Naive Version for GPU}{307}} +\@writefile{loa}{\contentsline {algocf}{\numberline {12}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{308}} +\newlabel{algo:gpu_kernel}{{12}{308}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.4}Improved Version for GPU}{308}} +\newlabel{IR}{{13}{309}} +\@writefile{loa}{\contentsline {algocf}{\numberline {13}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{309}} +\newlabel{algo:gpu_kernel2}{{13}{309}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.5}Chaos Evaluation of the Improved Version}{309}} +\@writefile{toc}{\contentsline {section}{\numberline {14.4}Experiments}{310}} +\newlabel{sec:experiments}{{14.4}{310}} +\@writefile{lof}{\contentsline {figure}{\numberline {14.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{311}} +\newlabel{fig:time_xorlike_gpu}{{14.1}{311}} +\@writefile{lof}{\contentsline {figure}{\numberline {14.2}{\ignorespaces Quantity of pseudorandom numbers generated per second using the BBS-based PRNG\relax }}{312}} +\newlabel{fig:time_bbs_gpu}{{14.2}{312}} +\@setckpt{Chapters/chapter18/ch18}{ +\setcounter{page}{313} +\setcounter{equation}{0} +\setcounter{enumi}{2} +\setcounter{enumii}{0} +\setcounter{enumiii}{0} +\setcounter{enumiv}{0} +\setcounter{footnote}{2} +\setcounter{mpfootnote}{0} +\setcounter{part}{1} +\setcounter{chapter}{14} +\setcounter{section}{4} +\setcounter{subsection}{0} +\setcounter{subsubsection}{0} +\setcounter{paragraph}{0} +\setcounter{subparagraph}{0} +\setcounter{figure}{2} +\setcounter{table}{0} +\setcounter{numauthors}{0} +\setcounter{parentequation}{4} +\setcounter{subfigure}{0} +\setcounter{lofdepth}{1} +\setcounter{subtable}{0} +\setcounter{lotdepth}{1} +\setcounter{lstnumber}{15} +\setcounter{ContinuedFloat}{0} +\setcounter{AlgoLine}{14} +\setcounter{algocfline}{13} +\setcounter{algocfproc}{13} +\setcounter{algocf}{13} +\setcounter{proposition}{1} +\setcounter{theorem}{0} +\setcounter{exercise}{0} 
+\setcounter{example}{0}
+\setcounter{definition}{0}
+\setcounter{proof}{1}
+\setcounter{lstlisting}{1}
+}
diff --git a/BookGPU/Chapters/chapter18/ch18.tex b/BookGPU/Chapters/chapter18/ch18.tex
new file mode 100755
index 0000000..3808995
--- /dev/null
+++ b/BookGPU/Chapters/chapter18/ch18.tex
@@ -0,0 +1,392 @@
+\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte}
+\chapterauthor{Christophe Guyeux}{Femto-ST Institute, University of Franche-Comte}
+
+
+\chapter{Pseudorandom Number Generator on GPU}
+\label{chapter18}
+
+\section{Introduction}
+
+Randomness is of importance in many fields, such as scientific simulation or
+cryptography. ``Random numbers'' can be generated either by a deterministic and
+reproducible algorithm, called a pseudorandom number generator (PRNG), or by a
+physical non-deterministic process. In this chapter, we focus on reproducible
+generators, useful for instance in Monte-Carlo based simulators. These domains
+need PRNGs that are statistically irreproachable. In some fields, such as
+numerical simulation, speed is also a strong requirement, which is usually
+attained by using parallel architectures. In that case, a recurrent problem is
+that a deflation of the statistical qualities is often reported when a good
+PRNG is parallelized. This is why ad hoc PRNGs must be found for each possible
+architecture in order to achieve both speed and randomness. On the other hand,
+speed is not the main requirement in cryptography: the great need is to define
+\emph{secure} generators able to withstand malicious attacks. Roughly speaking,
+an attacker should not be able in practice to distinguish between numbers
+obtained with the secure generator and a truly random sequence. Equivalently,
+he or she should not be able (in practice) to predict the next bit of the
+generator, given all the binary digits already released. ``Being able in
+practice'' refers here to the possibility to achieve this attack in polynomial
+time, and to the exponential growth of the difficulty of this challenge when
+the size of the parameters of the PRNG increases.
+
+
+Finally, a small part of the community working in this domain focuses on a
+third requirement, which is to define chaotic generators.
+The main idea is to benefit from a chaotic dynamical system to obtain a
+generator that is unpredictable, disordered, sensitive to its seed, or in other
+words chaotic.
+The goal is to map a given chaotic dynamics into a sequence that seems random
+and unassailable due to chaos.
+However, the chaotic maps used as a pattern are defined on the real line,
+whereas computers deal with finite precision numbers.
+This distortion leads to a deflation of both the chaotic properties and the speed.
+Furthermore, authors of such chaotic generators often claim that their PRNGs
+are secure due to their chaos properties, but there is no obvious relation
+between chaos and security as it is understood in cryptography.
+This is why the use of chaos for PRNGs remains marginal and disputable.
+
+
+The remainder of this chapter is organized as follows.
+TO BE COMPLETED
+
+
+\section{Basic Recalls}
+\label{section:BASIC RECALLS}
+
+This section is devoted to basic definitions and terminology in the fields of
+topological chaos and chaotic iterations. We assume the reader is familiar
+with basic notions of topology (see for instance~\cite{Devaney}).
+
+
+\subsection{Devaney's Chaotic Dynamical Systems}
+
+
+TO BE COMPLETED
+
+
+
+\section{Toward Efficiency and Improvement for CI PRNG}
+\label{sec:efficient PRNG}
+
+\subsection{First Efficient Implementation of a PRNG based on Chaotic Iterations}
+%
+%Based on the proof presented in the previous section, it is now possible to
+%improve the speed of the generator formerly presented in~\cite{bgw09:ip,guyeux10}.
+%The first idea is to consider
+%that the provided strategy is a pseudorandom Boolean vector obtained by a
+%given PRNG.
+%An iteration of the system is simply the bitwise exclusive or between
+%the last computed state and the current strategy.
+%Topological properties of disorder exhibited by chaotic
+%iterations can be inherited by the inputted generator; we hope by doing so to
+%obtain some statistical improvements while preserving speed.
+%
+%%RAPH: I removed all of this
+%% Let us give an example using 16-bit numbers, to clearly understand how the bitwise xor operations
+%% are
+%% done.
+%% Suppose that $x$ and the strategy $S^i$ are given as
+%% binary vectors.
+%% Table~\ref{TableExemple} shows the result of $x \oplus S^i$.
+
+%% \begin{table}
+%% \begin{scriptsize}
+%% $$
+%% \begin{array}{|cc|cccccccccccccccc|}
+%% \hline
+%% x &=&1&0&1&1&1&0&1&0&1&0&0&1&0&0&1&0\\
+%% \hline
+%% S^i &=&0&1&1&0&0&1&1&0&1&1&1&0&0&1&1&1\\
+%% \hline
+%% x \oplus S^i&=&1&1&0&1&1&1&0&0&0&1&1&1&0&1&0&1\\
+%% \hline
+
+%% \hline
+%% \end{array}
+%% $$
+%% \end{scriptsize}
+%% \caption{Example of an arbitrary round of the proposed generator}
+%% \label{TableExemple}
+%% \end{table}
+
+
+
+
+\lstset{language=C,caption={C code of the sequential PRNG based on chaotic iterations},label={algo:seqCIPRNG}}
+\begin{small}
+\begin{lstlisting}
+unsigned int CIPRNG() {
+  static unsigned int x = 123123123; /* internal state: the last value  */
+  unsigned long t1 = xorshift();     /* three 64-bit xor-like PRNGs     */
+  unsigned long t2 = xor128();
+  unsigned long t3 = xorwow();
+  x = x^(unsigned int)t1;            /* 32 least significant bits of t1 */
+  x = x^(unsigned int)(t2>>32);      /* 32 most significant bits of t2  */
+  x = x^(unsigned int)(t3>>32);
+  x = x^(unsigned int)t2;
+  x = x^(unsigned int)(t1>>32);
+  x = x^(unsigned int)t3;
+  return x;
+}
+\end{lstlisting}
+\end{small}
+
+
+
+In Listing~\ref{algo:seqCIPRNG} a sequential version of the proposed PRNG based
+on chaotic iterations is presented. The xor operator is represented by
+\textasciicircum. This function uses three classical 64-bit PRNGs, namely the
+\texttt{xorshift}, the \texttt{xor128}, and the
+\texttt{xorwow}~\cite{Marsaglia2003}. In the following, we call them ``xor-like
+PRNGs''. As each xor-like PRNG produces 64 bits whereas our proposed generator
+works with 32 bits, we use the cast \texttt{(unsigned int)}, which selects the
+32 least significant bits of a given integer, and the expression \texttt{(unsigned
+ int)(t$>>$32)} in order to obtain the 32 most significant bits of \texttt{t}.
+
+Producing a pseudorandom number thus requires 6 xor operations on the 6 32-bit
+numbers provided by the 3 64-bit PRNGs. This version successfully passes the
+stringent BigCrush battery of tests~\cite{LEcuyerS07}.
+At this point, we have thus
+defined an efficient and statistically unbiased generator. Its speed is
+directly related to the use of linear operations, but for the same reason,
+this fast generator cannot be proven to be secure.
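+
+To fix ideas, the listing below gives a minimal sketch of what one of these
+xor-like generators may look like. It follows the 64-bit xorshift given by
+Marsaglia~\cite{Marsaglia2003} (the shift triple and the seed are taken from
+that paper, and a 64-bit \texttt{unsigned long} is assumed); the actual
+\texttt{xorshift}, \texttt{xor128}, and \texttt{xorwow} keep larger internal
+states and use other shift constants.
+
+\lstset{language=C,caption={A minimal 64-bit xorshift, for illustration only},label={algo:xorshiftSketch}}
+\begin{small}
+\begin{lstlisting}
+unsigned long xorshift(void) {
+  /* internal state: a single nonzero 64-bit word */
+  static unsigned long x = 88172645463325252UL;
+  x ^= x << 13;  /* three xorshift steps scramble the state */
+  x ^= x >> 7;
+  x ^= x << 17;
+  return x;
+}
+\end{lstlisting}
+\end{small}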
+
+
+
+\subsection{Efficient PRNGs based on Chaotic Iterations on GPU}
+\label{sec:efficient PRNG gpu}
+
+In order to benefit from the computing power of a GPU, a program
+needs to have independent blocks of threads that can be computed
+simultaneously. In general, the larger the number of threads is, the
+less local memory is used, and the fewer branching instructions are
+used (if, while, etc.), the better the performance on GPU is.
+Obviously, having these requirements in mind, it is possible to build
+a program similar to the one presented in Listing
+\ref{algo:seqCIPRNG}, which computes pseudorandom numbers on GPU. To
+do so, we must first recall that in the CUDA~\cite{Nvid10}
+environment, threads have a local identifier called
+\texttt{threadIdx}, which is relative to the block containing
+them. Furthermore, in CUDA, the parts of the code that are executed by the GPU
+are called {\it kernels}.
+
+
+\subsection{Naive Version for GPU}
+
+
+It is possible to deduce from the CPU version a quite similar version adapted to GPU.
+The principle is simple: each thread of the GPU computes the CPU version of our PRNG.
+Of course, the three xor-like
+PRNGs used in these computations must have different parameters.
+In a given thread, these parameters are
+randomly picked from another PRNG.
+The initialization stage is performed by the CPU.
+To do so, the ISAAC PRNG~\cite{Jenkins96} is used to set all the
+parameters embedded into each thread.
+
+The implementation of the three
+xor-like PRNGs is straightforward once their parameters have been
+allocated in the GPU memory. Each xor-like PRNG works with an internal
+number $x$ that stores the last generated pseudorandom number. Additionally, the
+implementations of the xor128, the xorshift, and the xorwow respectively require
+4, 5, and 6 unsigned longs as internal variables.
+
+
+\begin{algorithm}
+\begin{small}
+\KwIn{InternalVarXorLikeArray: array with the internal variables of the 3 xor-like
+PRNGs in global memory\;
+NumThreads: number of threads\;}
+\KwOut{NewNb: array containing random numbers in global memory}
+\If{threadIdx is concerned by the computation} {
+  retrieve data from InternalVarXorLikeArray[threadIdx] in local variables\;
+  \For{i=1 to n} {
+    compute a new pseudorandom number as in Listing~\ref{algo:seqCIPRNG}\;
+    store the new number in NewNb[NumThreads*threadIdx+i]\;
+  }
+  store internal variables in InternalVarXorLikeArray[threadIdx]\;
+}
+\end{small}
+\caption{Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations}
+\label{algo:gpu_kernel}
+\end{algorithm}
+
+
+
+Algorithm~\ref{algo:gpu_kernel} presents a naive implementation of the proposed PRNG on
+GPU. Due to the available memory in the GPU and the number of threads
+used simultaneously, the number of random numbers that a thread can generate
+inside a kernel is limited (\emph{i.e.}, the variable \texttt{n} in
+Algorithm~\ref{algo:gpu_kernel}). For instance, if $100,000$ threads are used and
+if $n=100$\footnote{in fact, we need to add the initial seed (a 32-bit number)},
+then the memory required to store all of the internal variables of both the xor-like
+PRNGs\footnote{we multiply this number by $2$ in order to count 32-bit numbers}
+and the pseudorandom numbers generated by our PRNG is equal to $100,000\times ((4+5+6)\times
+2+(1+100))=13,100,000$ 32-bit numbers, that is, approximately $52$MB.
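+
+For illustration purposes, the following listing is one possible CUDA
+realization of Algorithm~\ref{algo:gpu_kernel}. It is only a sketch, not the
+actual implementation: a single 64-bit xorshift plays the role of the three
+xor-like PRNGs, and the state layout, names, and output indexing are
+illustrative assumptions.
+
+\lstset{language=C,caption={A CUDA sketch of the naive kernel (illustration only)},label={algo:gpu_kernel_sketch}}
+\begin{small}
+\begin{lstlisting}
+__device__ unsigned long xorshift64(unsigned long *s) {
+  unsigned long x = *s;      /* one 64-bit xorshift step */
+  x ^= x << 13;  x ^= x >> 7;  x ^= x << 17;
+  return *s = x;
+}
+
+__global__ void naiveCIPRNG(unsigned long *state, unsigned int *newNb,
+                            int numThreads, int n) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < numThreads) {                 /* thread concerned?      */
+    unsigned long s = state[2 * tid];     /* xor-like variables     */
+    unsigned int x = (unsigned int)state[2 * tid + 1];  /* seed x   */
+    for (int i = 0; i < n; i++) {
+      unsigned long t = xorshift64(&s);
+      x ^= (unsigned int)t;               /* 32 LSBs of t           */
+      x ^= (unsigned int)(t >> 32);       /* 32 MSBs of t           */
+      newNb[(size_t)n * tid + i] = x;     /* this thread's slice    */
+    }
+    state[2 * tid] = s;                   /* save the state back    */
+    state[2 * tid + 1] = x;
+  }
+}
+\end{lstlisting}
+\end{small}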
+
+This generator is able to pass the whole BigCrush battery of tests, for all
+the versions that have been tested, whatever their number of threads
+(called \texttt{NumThreads} in our algorithm, and tested up to $5$ million).
+
+
+\subsection{Improved Version for GPU}
+
+As GPU cards using CUDA have shared memory between the threads of the same block, it
+is possible to use this feature in order to simplify the previous algorithm,
+i.e., to use fewer than 3 xor-like PRNGs. The solution consists in computing only
+one xor-like PRNG per thread, saving its result into shared memory, and then
+using the results of some other threads in the same block of threads. In order
+to define which thread uses the result of which other one, we can use
+combination arrays that contain, for each position, the indexes of the threads
+whose results are to be combined.
+
+In Algorithm~\ref{algo:gpu_kernel2}, two combination arrays are used. The
+variable \texttt{offset} is computed using the value of
+\texttt{combination\_size}. Then we can compute \texttt{o1} and \texttt{o2},
+representing the indexes of the other threads whose results are used by the
+current one. In this algorithm, we consider that a 32-bit xor-like PRNG has
+been chosen. In practice, we use the xor128 proposed in~\cite{Marsaglia2003}, in
+which the unsigned longs (64 bits) have been replaced by unsigned integers (32
+bits).
+
+This version can also pass the whole {\it BigCrush} battery of tests.
+
+\begin{algorithm}
+\begin{small}
+\KwIn{InternalVarXorLikeArray: array with the internal variables of 1 xor-like PRNG
+in global memory\;
+NumThreads: Number of threads\;
+array\_comb1, array\_comb2: Arrays containing combinations of size combination\_size\;}
+
+\KwOut{NewNb: array containing random numbers in global memory}
+\If{threadIdx is concerned by the computation} {
+  retrieve data from InternalVarXorLikeArray[threadIdx] in local variables including shared memory and x\;
+  offset = threadIdx\%combination\_size\;
+  o1 = threadIdx-offset+array\_comb1[offset]\;
+  o2 = threadIdx-offset+array\_comb2[offset]\;
+  \For{i=1 to n} {
+    t=xor-like()\;
+    t=t\textasciicircum shmem[o1]\textasciicircum shmem[o2]\;
+    shmem[threadIdx]=t\;
+    x = x\textasciicircum t\;
+
+    store the new number in NewNb[NumThreads*threadIdx+i]\;
+  }
+  store internal variables in InternalVarXorLikeArray[threadIdx]\;
+}
+\end{small}
+\caption{Main kernel for the chaotic iterations based PRNG GPU efficient
+version\label{IR}}
+\label{algo:gpu_kernel2}
+\end{algorithm}
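+
+As before, a hedged CUDA sketch of Algorithm~\ref{algo:gpu_kernel2} is given
+below for illustration only. A simple 32-bit xorshift stands in for the 32-bit
+xor128, the state layout and names are assumptions, and, as in the algorithm,
+no synchronization is used: a combination group is assumed to fit within one
+warp, whose threads progress together.
+
+\lstset{language=C,caption={A CUDA sketch of the improved kernel (illustration only)},label={algo:gpu_kernel2_sketch}}
+\begin{small}
+\begin{lstlisting}
+#define COMB_SIZE 32                   /* assumed combination size  */
+
+__device__ unsigned int xor32(unsigned int *s) {
+  unsigned int x = *s;                 /* one 32-bit xorshift step  */
+  x ^= x << 13;  x ^= x >> 17;  x ^= x << 5;
+  return *s = x;
+}
+
+__global__ void improvedCIPRNG(unsigned int *state, unsigned int *newNb,
+                               const int *comb1, const int *comb2,
+                               int numThreads, int n) {
+  extern __shared__ unsigned int shmem[];    /* one slot per thread */
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < numThreads) {
+    unsigned int s = state[3 * tid];         /* xor-like variable   */
+    unsigned int x = state[3 * tid + 1];     /* last CI value       */
+    shmem[threadIdx.x] = state[3 * tid + 2]; /* last shared value   */
+    int offset = threadIdx.x % COMB_SIZE;
+    int o1 = threadIdx.x - offset + comb1[offset];
+    int o2 = threadIdx.x - offset + comb2[offset];
+    for (int i = 0; i < n; i++) {
+      unsigned int t = xor32(&s);            /* one xor-like call   */
+      t ^= shmem[o1] ^ shmem[o2];  /* combine with 2 other threads  */
+      shmem[threadIdx.x] = t;
+      x ^= t;                      /* chaotic iteration x = x xor t */
+      newNb[(size_t)n * tid + i] = x;
+    }
+    state[3 * tid] = s;                      /* save the state back */
+    state[3 * tid + 1] = x;
+    state[3 * tid + 2] = shmem[threadIdx.x];
+  }
+}
+\end{lstlisting}
+\end{small}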
+
+\subsection{Chaos Evaluation of the Improved Version}
+
+A run of Algorithm~\ref{algo:gpu_kernel2} consists of an operation ($x=x\oplus t$) having
+the form of Equation~\ref{equation Oplus}, which is equivalent to the iterative
+system of Eq.~\ref{eq:generalIC}. That is, an iteration of the general chaotic
+iterations is realized between the last stored value $x$ of the thread and a strategy $t$
+(obtained by a bitwise exclusive or between a value provided by a xor-like() call
+and two values previously obtained by two other threads).
+To be certain that we are in the framework of Theorem~\ref{t:chaos des general},
+we must guarantee that this dynamical system iterates on the space
+$\mathcal{X} = \mathcal{P}\left(\llbracket 1, \mathsf{N} \rrbracket\right)^\mathds{N}\times\mathds{B}^\mathsf{N}$.
+The left term $x$ obviously belongs to $\mathds{B}^\mathsf{N}$.
+To prevent any flaw in the chaotic properties, we must check that the right
+term (the last $t$), corresponding to the strategies, can possibly be equal to any
+integer of $\llbracket 1, \mathsf{N} \rrbracket$.
+
+Such a result is obvious: for the xor-like(), all the
+integers belonging to its interval of definition can occur at each iteration, and thus the
+last $t$ respects the requirement. Furthermore, it is possible to
+prove by an immediate mathematical induction that, as the initial $x$
+is uniformly distributed (it is provided by a cryptographically secure PRNG),
+the two other stored values shmem[o1] and shmem[o2] are uniformly distributed too
+(this is the induction hypothesis), and thus the next $x$ is finally uniformly distributed.
+
+Thus Algorithm~\ref{algo:gpu_kernel2} is a concrete realization of the general
+chaotic iterations presented previously, and for this reason, it satisfies
+Devaney's formulation of chaotic behavior.
+
+\section{Experiments}
+\label{sec:experiments}
+
+Different experiments have been performed in order to measure the generation
+speed. We have used a first computer equipped with a Tesla C1060 NVidia GPU card
+and an
+Intel Xeon E5530 clocked at 2.40 GHz, and
+a second computer equipped with a smaller CPU and a GeForce GTX 280. Both
+cards have 240 cores.
+
+In Figure~\ref{fig:time_xorlike_gpu} we compare the quantity of pseudorandom numbers
+generated per second with various xor-like based PRNGs. In this figure, the optimized
+versions use the {\it xor64} described in~\cite{Marsaglia2003}, whereas the naive versions
+embed the three xor-like PRNGs described in Listing~\ref{algo:seqCIPRNG}. In
+order to obtain optimal performance, the storage of the pseudorandom numbers
+into the GPU memory has been removed. This step is time consuming and slows down
+the number generation. Moreover, this storage is completely
+useless for applications that consume the pseudorandom
+numbers directly after generation. We can see that when the number of threads is greater
+than approximately 30,000 and lower than 5 million, the number of pseudorandom numbers generated
+per second is almost constant. With the naive version, this value ranges from 2.5 to
+3 GSamples/s. With the optimized version, it is approximately equal to
+20 GSamples/s. Finally, we can remark that both GPU cards behave quite similarly; in
+practice, however, the Tesla C1060 has more memory than the GTX 280, and this memory
+should be of better quality.
+As a comparison, Listing~\ref{algo:seqCIPRNG} leads to the generation of about
+138 MSamples/s when using one core of the Xeon E5530.
+
+\begin{figure}[htbp]
+\begin{center}
+  \includegraphics[scale=0.65]{Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf}
+\end{center}
+\caption{Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG}
+\label{fig:time_xorlike_gpu}
+\end{figure}
+
+
+
+
+
+In Figure~\ref{fig:time_bbs_gpu} we highlight the performance of the optimized
+BBS-based PRNG on GPU. On the Tesla C1060 we obtain approximately 700 MSamples/s
+and on the GTX 280 about 670 MSamples/s, which is obviously slower than the
+xorlike-based PRNG on GPU. However, we will show in the next sections that this
+new PRNG has a strong level of security, which necessarily comes at the price of a speed
+reduction.
+
+\begin{figure}[htbp]
+\begin{center}
+  \includegraphics[scale=0.65]{Chapters/chapter18/figures/curve_time_bbs_gpu.pdf}
+\end{center}
+\caption{Quantity of pseudorandom numbers generated per second using the BBS-based PRNG}
+\label{fig:time_bbs_gpu}
+\end{figure}
+
+All these experiments allow us to conclude that it is possible to
+generate a very large quantity of statistically flawless pseudorandom numbers with the xor-like version.
+To a certain extent, it is also the case with the secure BBS-based version, its lower speed being
+explained by the fact that the former version has ``only''
+chaotic properties and statistical perfection, whereas the latter is also cryptographically secure,
+as shown in the next sections.
+
+
+
+
+
+
+
+
+
+
+
+\putbib[Chapters/chapter18/biblio]
+
diff --git a/BookGPU/Chapters/chapter18/ch18.tex~ b/BookGPU/Chapters/chapter18/ch18.tex~
new file mode 100755
index 0000000..6c3f1f1
--- /dev/null
+++ b/BookGPU/Chapters/chapter18/ch18.tex~
@@ -0,0 +1,296 @@
+\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte}
+
+
+\chapter{Presentation of the GPU architecture and of the Cuda environment}
+\label{chapter1}
+
+\section{Introduction}\label{ch1:intro}
+
+This chapter introduces the Graphics Processing Unit (GPU) architecture and all
+the concepts needed to understand how GPUs work and can be used to speed up the
+execution of some algorithms. First of all, this chapter gives a brief history of
+the development of graphics cards, up to the point where they started to be used
+for general purpose computation. Then the architecture of a GPU is
+illustrated. There are many fundamental differences between a GPU and a
+traditional processor. In order to benefit from the power of a GPU, a Cuda
+programmer needs to use threads. They have some particularities which enable the
+Cuda model to be efficient and scalable when some constraints are addressed.
+
+
+
+\section{Brief history of video cards}
+
+Video cards or graphics cards have been introduced in personal computers to
+produce high quality graphics faster than classical Central Processing Units
+(CPU) and to relieve the CPU of this task. In general, display tasks are very
+repetitive and very specific. Hence, some manufacturers have produced more and
+more sophisticated video cards, providing 2D acceleration, then 3D acceleration,
+then some lighting transforms. Video cards have their own memory in which to
+perform their computations. For at least two decades, every personal computer has
+had a video card, either a simple one for desktop computers or one providing many
+accelerations for game- and/or graphics-oriented computers. In the latter case,
+the graphics card may be more expensive than the CPU.
+
+Since 2000, video cards have allowed users to apply arithmetic operations
+simultaneously on a sequence of pixels, later also called stream processing. In
+this case, the information describing the pixels (color, location, and other
+attributes) is combined in order to produce a pixel color that can be displayed
+on a screen. Simultaneous computations are provided by shaders, which calculate
+rendering effects on graphics hardware with a high degree of flexibility. These
+shaders handle the stream data with pipelines.
+
+
+Some researchers tried to apply those operations on other data, representing
+something different from pixels, and consequently this resulted in the first
+uses of video cards for performing general purpose computation.
+The programming
+model was not easy to use at all and was very dependent on the hardware
+constraints. More precisely, it consisted in using either DirectX or OpenGL
+functions providing an interface to some classical operations for video
+processing (memory transfers, texture manipulation, etc.). Floating point
+operations were most of the time unavailable. Obviously, when something went
+wrong, programmers had no way (nor the tools) to detect it.
+
+\section{GPGPU}
+
+In order to benefit from the computing power of more recent video cards, Cuda
+was first proposed in 2007 by NVidia. It unifies the programming model for some
+of their most powerful video cards. Cuda~\cite{ch1:cuda} has quickly been
+considered by the scientific community as a great advance for general purpose
+graphics processing unit (GPGPU) computing. Of course, other programming models
+have been proposed. The other well-known alternative is OpenCL, which aims at
+proposing an alternative to Cuda and which is multi-platform and portable. This
+is a great advantage, since it is even possible to execute OpenCL programs on
+traditional CPUs. The main drawback is that it is less tightly coupled to the
+hardware and consequently sometimes provides less efficient programs. Moreover,
+Cuda benefits from more mature compilation and optimization procedures. Other
+less known environments have been proposed, but most of them have been
+discontinued; for example, we can cite FireStream by ATI, which is not maintained
+anymore and has been replaced by OpenCL, and BrookGPU by Stanford
+University~\cite{ch1:Buck:2004:BGS}.
+Another environment, based on pragmas (the insertion of pragma directives inside
+the code to help the compiler generate efficient code), is called OpenACC. For a
+comparison with OpenCL, interested readers may refer to~\cite{ch1:CMR:12}.
+
+
+
+\section{Architecture of current GPUs}
+
+The architecture \index{architecture of a GPU} of current GPUs is constantly
+evolving. Nevertheless, some trends remain constant throughout this evolution.
+The processing units composing a GPU are far simpler than those of a traditional
+CPU, and it is much easier to integrate many computing units inside a GPU card
+than to do so with many cores inside a CPU. In 2012, the most powerful GPUs have
+more than 500 cores and the most powerful CPUs have 8
+cores. Figure~\ref{ch1:fig:comparison_cpu_gpu} shows the number of cores inside
+a CPU and inside a GPU. In fact, in a current NVidia GPU, there are
+multiprocessors which have 32 cores (for example on Fermi cards). The core clock
+of a CPU is generally around 3 GHz, whereas that of a GPU is about 1.5 GHz. Although
+the core clock of GPU cores is slower, the number of cores inside a GPU provides
+more computational power. This measure is commonly represented by the number of
+floating point operations per second. Nowadays the most powerful GPUs provide more
+than 1 TFlops, i.e., $10^{12}$ floating point operations per second.
+Nevertheless, GPUs are very efficient at some operations but not at all
+kinds of operations. They are very efficient at executing repetitive work in which
+only the data change. It is important to keep in mind that multiprocessors
+inside a GPU have 32 cores. Later we will see that these 32 cores need to do the
+same work to get maximum performance.
+
+\begin{figure}[b!]
+\centerline{\includegraphics[]{Chapters/chapter1/figures/nb_cores_CPU_GPU.pdf}}
+\caption{Comparison of number of cores in a CPU and in a GPU.}
+%[Comparison of number of cores in a CPU and in a GPU]
+\label{ch1:fig:comparison_cpu_gpu}
+\end{figure}
+
+On the most powerful GPU cards, called Fermi, multiprocessors are called streaming
+multiprocessors (SM). Each SM contains 32 cores and is able to perform 32
+floating point or integer operations on 32-bit numbers per clock, or 16 floating
+point operations on 64-bit numbers per clock. SMs have their own registers,
+execution pipelines and caches. On the Fermi architecture, there are 64KB of
+shared memory plus L1 cache and 32,768 32-bit registers per SM. More precisely,
+the programmer can decide what amount of shared memory and L1 cache an SM can
+use. The constraint is that the sum of both amounts should be less than or equal
+to 64KB.
+
+Threads are used to benefit from the large number of cores of a GPU. Those
+threads are different from traditional CPU threads. In
+chapter~\ref{chapter2}, some examples of GPU programming will make the
+details of GPU threads explicit. Threads are gathered into groups of 32
+threads, called ``warps''. Those warps are important when designing an algorithm
+for GPU.
+
+
+Another big difference between CPUs and GPUs is memory latency. In a CPU,
+everything is optimized to obtain a low latency architecture. This is possible
+through the use of cache memories. Moreover, nowadays CPUs perform many
+performance optimizations, such as speculative execution, which roughly speaking
+consists in executing a small part of code in advance even if later this work
+turns out to be useless. On the contrary, GPUs do not have low latency
+memory; in comparison, they have small cache memories. Nevertheless, the
+architecture of GPUs is optimized for throughput computation and it takes
+memory latency into account.
+
+
+
+\begin{figure}[b!]
+\centerline{\includegraphics[scale=0.7]{Chapters/chapter1/figures/low_latency_vs_high_throughput.pdf}}
+\caption{Comparison of low latency of CPU and high throughput of GPU.}
+\label{ch1:fig:latency_throughput}
+\end{figure}
+
+Figure~\ref{ch1:fig:latency_throughput} illustrates the main difference in
+memory latency between a CPU and a GPU. In a CPU, tasks ``ti'' are executed one
+by one with a short memory latency to get the data to process. After some tasks,
+there is a context switch that allows the CPU to run concurrent applications
+and/or multi-threaded applications. Memory latencies are longer in a GPU; there,
+the principle used to obtain a high throughput is to have many tasks to
+compute. Later we will see that those tasks are called threads with Cuda. With
+this principle, as soon as a task is finished the next one is ready to be
+executed, while the wait for the data of the previous task is overlapped by the
+computation of other tasks.
+
+
+
+\section{Kinds of parallelism}
+
+Many kinds of parallelism are available depending on the type of hardware.
+Roughly speaking, there are three classes of parallelism: instruction-level
+parallelism, data parallelism, and task parallelism.
+
+Instruction-level parallelism consists in re-ordering some instructions in order
+to execute some of them in parallel without changing the result of the code.
+In modern CPUs, instruction pipelines allow the processor to execute instructions
+faster. With a pipeline, a processor can execute multiple instructions
+simultaneously, because the output of one stage is the input of the
+next one.
+
+Data parallelism consists in executing the same program with different data on
+different computing units. Of course, no dependency should exist between the
+data. For example, it is easy to parallelize loops without dependencies using the
+data parallelism paradigm. This paradigm is linked with the Single Instruction
+Multiple Data (SIMD) architecture. This is the kind of parallelism provided by
+GPUs.
+
+Task parallelism is the common parallelism achieved on clusters, grids, and
+high performance architectures, where different tasks are executed by different
+computing units.
+
+\section{Cuda Multithreading}
+
+The data parallelism of Cuda is more precisely based on the Single Instruction
+Multiple Thread (SIMT) model. This is due to the fact that the programmer accesses
+the cores through threads. In the Cuda model, all cores
+execute the same set of instructions but with different data. This model has
+similarities with the vector programming model proposed for vector machines through
+the 1970s into the 90s, notably the various Cray platforms. On the Cuda
+architecture, the performance is driven by the use of a huge number of threads
+(from thousands up to millions). The particularity of the model is that there
+is no context switching as in CPUs, and each thread has its own registers. In
+practice, threads are executed by an SM and are gathered into groups of 32
+threads. Those groups are called ``warps''. Each SM alternately executes
+``active warps'' and warps that become temporarily inactive while waiting for data
+(as shown in Figure~\ref{ch1:fig:latency_throughput}).
+
+The key to scalability in the Cuda model is the use of a huge number of threads.
+In practice, threads are not only gathered in warps but also in thread blocks. A
+thread block is executed by only one SM and it cannot migrate. The typical size of
+a thread block is a power of two (for example: 64, 128, 256 or 512).
+
+
+
+Without changing anything inside a Cuda code, it is possible to
+run it on a small Cuda device or on the most powerful Tesla Cuda cards.
+Blocks are executed in any order, depending on the number of SMs available. So
+the programmer must design his or her code with this issue in mind. This
+independence between thread blocks provides the scalability of Cuda codes.
+
+\begin{figure}[b!]
+\centerline{\includegraphics[scale=0.65]{Chapters/chapter1/figures/scalability.pdf}}
+\caption{Scalability of GPU.}
+\label{ch1:fig:scalability}
+\end{figure}
+
+
+A kernel is a function which contains a block of instructions that are executed
+by the threads of a GPU. When the problem considered is a two- or three-dimensional
+problem, it is possible to group thread blocks into a grid. In
+practice, the number of thread blocks and the size of thread blocks are given as
+parameters to each kernel. Figure~\ref{ch1:fig:scalability} illustrates an
+example of a kernel composed of 8 thread blocks. Then this kernel is executed on
+a small device containing only 2 SMs. So in this case, blocks are executed 2
+by 2 in any order. If the kernel is executed on a larger Cuda device containing
+4 SMs, blocks are executed 4 by 4 simultaneously. The execution should be
+approximately twice as fast in the latter case. Of course, that depends on other
+parameters that will be described later. A minimal example of a kernel and its
+launch configuration is sketched below.
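+
+This hypothetical example (not taken from the book) launches a kernel on a
+grid of 8 thread blocks of 128 threads each; each thread simply increments one
+array element, and its global index is computed from the block and thread
+identifiers.
+
+\lstset{language=C,caption={A minimal Cuda kernel and its launch (hypothetical example)},label={ch1:lst:kernel_sketch}}
+\begin{small}
+\begin{lstlisting}
+#include <cuda_runtime.h>
+
+__global__ void add_one(float *a, int n) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x; /* global index */
+  if (i < n) a[i] += 1.0f;
+}
+
+int main(void) {
+  const int n = 1024;
+  float *d_a;
+  cudaMalloc(&d_a, n * sizeof(float));   /* allocate on the GPU    */
+  cudaMemset(d_a, 0, n * sizeof(float));
+  add_one<<<8, 128>>>(d_a, n);           /* 8 blocks, 128 threads  */
+  cudaDeviceSynchronize();               /* wait for the kernel    */
+  cudaFree(d_a);
+  return 0;
+}
+\end{lstlisting}
+\end{small}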
+Thread blocks provide a way to cooperate, in the sense that threads of the same
+block can cooperatively load and store blocks of memory they all
+use. Threads within the same block can synchronize (but threads of
+different blocks cannot). Threads of the same block can also share results
+in order to compute a single result. In chapter~\ref{chapter2}, some examples
+will illustrate this.
+
+
+\section{Memory hierarchy}
+
+The memory hierarchy of GPUs\index{memory~hierarchy} is different from that of
+CPUs. In practice, there are registers\index{memory~hierarchy!registers}, local
+memory\index{memory~hierarchy!local~memory}, shared
+memory\index{memory~hierarchy!shared~memory}, cache
+memory\index{memory~hierarchy!cache~memory} and global
+memory\index{memory~hierarchy!global~memory}.
+
+
+As previously mentioned, each thread can access its own registers. It is
+important to keep in mind that the number of registers is limited. On
+recent cards, it is limited to 64K registers per SM. Access to registers is
+very fast, so it is a good idea to use them whenever possible.
+
+Likewise, each thread can access local memory which, in practice, is much slower
+than registers. Local memory is automatically used by the compiler when all the
+registers are occupied. So the best idea is to optimize the use of registers,
+even if this implies reducing the number of threads per block.
+
+\begin{figure}[hbtp!]
+\centerline{\includegraphics[scale=0.60]{Chapters/chapter1/figures/memory_hierarchy.pdf}}
+\caption{Memory hierarchy of a GPU.}
+\label{ch1:fig:memory_hierarchy}
+\end{figure}
+
+
+
+Shared memory allows cooperation between the threads of the same block. This kind
+of memory is fast, but it must be managed manually and its size is
+limited. It is accessible during the execution of a kernel. So the principle is
+to fill the shared memory at the start of the kernel with global data that are
+used very frequently; then the threads can access it for their computations. They
+can obviously change the content of this shared memory, either with computations
+or by loading other data, and they can store its content back in global memory. So
+shared memory can be seen as a manually managed cache memory. This obviously
+requires an effort from the programmer.
+
+On recent cards, the programmer may decide what amount of cache memory and
+shared memory is attributed to a kernel. The cache memory is an L1 cache which is
+directly managed by the GPU. Sometimes, this cache provides very efficient
+results and sometimes the use of shared memory is a better solution.
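+
+The following minimal sketch (a hypothetical example, not taken from the book)
+illustrates this principle, assuming blocks of at most 256 threads: each block
+first stages its slice of global data in shared memory, synchronizes, and then
+reuses the staged values.
+
+\lstset{language=C,caption={Staging global data in shared memory (hypothetical example)},label={ch1:lst:shared_sketch}}
+\begin{small}
+\begin{lstlisting}
+__global__ void scale_tile(const float *in, float *out, float k, int n) {
+  __shared__ float tile[256];                 /* one tile per block  */
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n) tile[threadIdx.x] = in[i];       /* fill shared memory  */
+  __syncthreads();                            /* whole block ready   */
+  if (i < n) out[i] = k * tile[threadIdx.x];  /* reuse fast memory   */
+}
+\end{lstlisting}
+\end{small}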
+Figure~\ref{ch1:fig:memory_hierarchy} illustrates the memory hierarchy of a
+GPU. Threads are represented at the top of the figure. They can access their
+own registers and their local memory. Threads of the same block can access
+the shared memory of that block. The cache memory is not represented here, but
+it is local to a thread. Finally, each block can access the global memory of the
+GPU.
+
+\section{Conclusion}
+
+In this chapter, a brief presentation of the video card, which has later been
+used to perform computation, has been given. The architecture of a GPU has been
+illustrated, focusing on the particularities of GPUs in terms of parallelism,
+memory latency, and threads. In order to design an efficient algorithm for GPU,
+it is essential to have all these parameters in mind.
+
+
+\putbib[Chapters/chapter1/biblio]
+
diff --git a/BookGPU/Chapters/chapter18/figures/curve_time_bbs_gpu.pdf b/BookGPU/Chapters/chapter18/figures/curve_time_bbs_gpu.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..a6a1080ad960e9f60d4952c52c0eaa979f7db36f
GIT binary patch
literal 7587
(base85 binary data omitted)
znlBBPQNNXUqicIy*I}I?kCY{AR~{NxyV)n|iUs>62M&ZD`XZu_{QOF=?9gVd#1-aR zMVIr6vK5Leq~f}H3*k5ATox^_;ANL%1+gW%@1jRy=0ACvm&&t^DYshU-sXZabl+AR ze}i76lci84*}N08P~?61X369GbL>pkMfY4jZTc`FR9QJ@=eH2vipw`Kr<}^nQnDzQ zw(MMa@7k0RqBN4GJ<jm!37hg{P)oY!hg6A8*e?&_S7etCj}F^scywDP*u8QrA7`0j zF{yj-KHCM>RhcSf_s{%QaeCiL7dh#s?L|kc_Pf<%KJ8TLPT;&}T1H(d%IQyh`^wMQ z3`NlRWcW?uU=8Bn`ls<BT(UprdTeI`ZM^oTi0&Q@1%Hoo_V=rTE&BbmUv6p>Yt{ZZ z^65<F7n&m@Z+-Iq-UaXpt9fH^-*Q>+tVi~w##IvLG7UXDw%p#NG|a453@GsM`kE7$ z==aFM1#4q?+oM`0IsCnOXCnNL_=v*A8ftcjcv;q-@T*BH^xs|-B{JaBH$(<R%y$QN zPEIT`KRNHdho#rXRWD^e-}1PrHqgi$St0ZM((oSRO~a>d99kEPMh`Q`yHZp0^sr_N z&kknzoS$6PyjtrYyyf3|*;Zm}UOLqFl-FF`9WbVl-s0BG>|5bnzSJX&Jgh%N9BJ2D z8t+0c$?kkt_w8uz=#u`k?_TGAMMz4{m*A=y_TW`z`r<sER}Ss;ANsU>(4?flS0Oit z6Z0_2tva7=L<~B!UeDijJiT;OH1PQsUYxg)o_B+J;z~Q^VlT$F1^8~4&u@}_wg%o^ zzT5z#X!G&Zx3I$Eg17r4%MML^dNZkZBKtMR@$294rv{=i+>LifU=u!f^R3q*o<?p; z)hs`;rtTk0zo_^@wz_uhEnKNvBs|3XmV1JeWDPDo-q?LVbKDwVo1}AX3tEuvqU9KY z2UahVnutiKXQ+rx00bp>SlsfC9v3IQ+WOm$YTqnPhA{aLS3T=v1pbqN3%mb1Qs8SN zPgoir>*wmkdg^AEexPqk_NepGp<HKsQ-9g7qx17j<Ju6FnxT`o4-`Jh?A)m#&?{l~ zXp}A3aVcX**Hms#U2J@`$?o*y@6<_{frRVzl#5I9W%a_ZNLn}_dbJ6Ecb?Q>MW0l~ zf_C?Q!Mwo}lai@!1`%nmGOr%^fO^8iw_E8+H-4d(3`dEt#~ggOTGi;hMuvX=nd4y- z=_fXP-W_Sfc=6<7w;|(9?D^+%o|;ZQN61uKh~biVcS8s&H+HC-#VmLqpR^%(PeH54 zRm(+sr#Y(;0z$&apkZWV?3hcR%-C|3>XG&q!I@-C+i+N`@6wYGkO@`(DO<O|FQ{L? zJ#KwDzU6SIRkqD&P+qOR*Sf2c2VBdT*&`{q8sA04H8JTS+(3q9JEo8^UZHT=GQi5I zx9xuS-rAk0&P8O^1ap0kg^99-(Hfh+66uV`erHDD?rrt5$FKU*<~OHm$;k#kNWZHS zB1tzWbu(MnvG`!Za9dB!vg8ObAgL4g@(QlLX>CW;#stgH=Muj@{g^EGgcjQGea_YP z_8t`FxMj1AA7{d5UK_lcrxR-}Dks7!wzU{PC2s0k>BH6u6YI()R_Do8n1z==Bfm9o zs#}~VcNqTGP4MdI)uy;D4hScSJx4bTr{H=_R6__oTI5FUKDPkVkPXzH%aajxwrjK> zt!U|}&sKXKl31cxD#&K>CEb}<`3X%qw$Gd5jPnd$)jgCvBtluMqVB{!-P6!zJ<2V5 z!ctyHmBU@z-G2g~8rT{1>9)>sT!Od&tW!KZ5ReOAu5%KrY;bG0@-|Eey7zLKq}9N@ z_ZOrVFaB^>N#=Nd;)`yF3zcDlcKDq;MG=8s7@Zu}wdkRbllbM6YTHF<XYUx*7)uO1 ziXIC>#$QOiNZnz$ypu+hzQd_$q!&EA%PD!0a4xD?5PmlPo_5*W(ewir7nL=>OcAgj zTv|Top0D|^c`{5d%sISX>F|-1>M-Tg1zDICezmy=|JfaFM^9+wYaLL0IqYKVx_Do0 zc(8&2=lNb&dj47^iO1AUE92jjMbiCGoBZX`zwf<dgoOACIH?LT5JbvGAoe`~&Pm1g zYw)fgdi&fE=VU*=w>colp#Oz^N@DE(B5JEOug&t4AM>L>?fpED<KxRZEUtR%c0_&q z0K1L*nxAsuPSg$iBeL|f>&n?<T~kg{8>`t<I@kU1I`^=zR@&+*x)n<%O4qKpOwt)S zB3EH+JwIz~olkV;x~cZKHt*_BzFxA@1`%=xBn+GL8t=SS5KmKDk=Y|gy&)!p4H>zs zZ#N-GdZ&1z@D?^w{>jsUT18Wts7Hk!o>kaait7z}a->0}Vusn$8*(R?{NwAGf60Po zb}%Ptn{CC4`(U<$74>j3W6e@ddanU1BN=CdIA6HCEC2du%SUNdeD}3RE_{V5*D#wf z*5D=Emj?~5oHi9~xSTsAmg)DhvLS;Yn|EGo_b?xEDx)*@L)<rxyV3sEK-J5xmrx}l z<@a7C9a&JY@B%!oRHP~9N@7D4HO)i`PK!xT?Dh-cm8@w7S^L2dQ0ZYaBIlL)VgBks z<?)?1gQge6B$KuTbcKFAR<(ED?c}Lg0V2nz;n{khJ!C1jslk5N{ufVZ-uhowr3u*G zjk(#2y}HA!0QU0JhfGf|lrC4eC$HUmob7t6?CMjC(l;pdu(%m9{=6(kpTT{!@pN!? zp+MmL8n2l#MG<ZMh?U$`f~mWssu^;WM(*ZyHM+XkuS>~0e$1uX*hcYXe5&WxGT!0? 
z*M||+9cog?TJ#R9+G;OkRIe>M(<)jSCi=<S_d~#hy2Hr4Eo-J!)laTv>IJs95+aAZ zRqC7?DlQ=h$aN7COZy%<zM-R@eRw}`tN@tzxWRX);=TDnX}fBQ_tjxTs&%iW!DW)u z`US^2!>S6izh*7>K#{`Qx?(-b<TanQDO~dpn4l(@U$PqSZRG@;aaf)6w<1(%37((p zcLEcv;hM5{LvF7No8@;(-BbR$H4Jf`P;_Usz1VkKY)(Magk8Xwu=+bgF%K;Uq~CeP z$T!y|kLbdm_34svR}9m-vgLJ>5zhipVIPL<pLN=s!I#{q{#)+Re4L6+L<x|#za@On zx`lf(Qf1ZlpN9XPG%^<DU-4M3Ux^q+jJjQv8Gru5-rK3snsz?AnucE&iN`d1$E3_3 z1_#QcyMza_UlEMyv-;0xv$<3Jk3+;ZYpb~v1PV*Qd_O^G>jgju7Y2n7cv5I|mJZ@w zc@+YnlXVczIBTRe$B^PhHw)rY?1O9^NI?ve1{tBJ3)9vkLeL3CHGs%brA~vaJODbT z>Bt+}S((TS83Hr|d^kQ3m;eO$Fj+j!03F1%h$b`^l0gJ8&BABsAhucC0(>@`$)NKA zw3>z*N)?4s0jON)H0jUgG60ks9s#N3lBt?@5RNjV5W3Pqc=7ohO%U|=_gC}BsIj>; z5T&7^0V2^L8m$U(sPY0?d}4qqi>CnDn|6f0?YkI@r#9`A8j0-#(zk&qH6%D^XbxYX zvsr-9Fwv9k$Jariuw)zxhr_Aj@Hh=s6pBJpMM57{G)5gw#c7Zz>UheWCC*PC{$dK6 zfNWu)5I`eM2qPl<ktkdpgsH6+VCu!@@n@h7KwXW1LV&X?{}&vBGV-6-4yAE+@!AlN zrZt2MPy>x1gi6=mj|He2QB+BQ1Ax^8keYZ+98Oac189T4aY8AFL=7Qm9fI2b3(=oA ze-dSL9iT$crLp`tOg10Ds-Xc6kx%ksLZ@vegzC?jotB+t)dq#>`6)X8V!VhPYwQ0z zVm>~?LKgCA8nH=!J`hl0ZuDQ4$-kIcGU>aU96v5|dMA)cAcaX0dc=d0i-P<llQgMp zt`8BaV<LwG!6-yNRNf#|ga}=2Xp&FoGby^$`%It7pBXUe!hNI3B(i8ahyYbGg-Y~e z@)5eq8)gE}r0dQMrX|21TA&e0SPX3Lj3RrA@0><m61126xO9q&ipp<Vrn!V#zRN;m z@+{T|Is2!sxl$3<%m1eA7X|;JUZzP<Yk^7!{NA{9VZgui`S%q4=+@9ZKnLM3?8SfF z4Ss@^=6Z|re`+loAW%%0At<cE4XZ5N>(EOw0Dv&i>1~e00B9r%1@S>0VVcEAQ%FFe zkys=iheKgDAyEh<5;3QV%~hUD#~(Wv;&BkxmohYfkS%WtiT|Bhm^+B9@5lFoRQyyO z8@_|e0bB|d0$3rSPuOo~J^<tc1_w}qSsDt1flzMY2Vl+7@EX`ZXecxRiqWrp(4zj} zLn2WS0RKB5@((@~Mgw|>@Eadi1A=*fqiG-^*!eda7V{?`RtORQ%7;c{|L_IEHGlGj z&m}@IBNuuJ3V`<XAn5G^769$oP#1-m06PFQXHnU}GzJO5s78RRHUVv*PQVhdSUetW wU`S9mA{b(DC_}uy36h|0Y;3Ii|8uwt-!Sm_L@s|iK?D>IdMB%Byu}3eKQE%<DF6Tf literal 0 HcmV?d00001 diff --git a/BookGPU/Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf b/BookGPU/Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2437d2e19e4bb599ec97bf0c340742815ccff97f GIT binary patch literal 8005 zcmb_Bc|4Te_od`DNSh+XlhWA6Gy9Bfy!MnWTXu~x!!%<?Gh+=cWGPZ4g+fJ(B}KBl zsK`<z$u6ZRN+P5v{hqP3et*B;_mB6P&%?dvp6#A<?z#7#N8Uo;5P?#~!sKg)V$Z{H z01{xk?SW}(f+kd!2hS70Ll8^Qkj~^$IRI$LB=e~HR0`XjN+Q9yJPwuY1@lkIIlY6u z!C=kd$*;<Lq{1FwnWTv5_H23~y2`TZ?r3z(N{ks(Xlcc{>FhVdn{+qd+srr?TXChN zWs>L9ks5vW?DbD^lU*;zUzMDmeqCoj*5=;OV(o=ZH{_=5QN1{G+RfCo-p*m|<i?T7 zX80F{^1gQ;J<Gf9Ty7z<ef?IO_nFk@Ke(B+;<cx5&0xab2)5TWZE||Nk`=gPto8KX zp5xo<{$W<X$tv2W{`yGTmctUsyGNsz7b?bB(hJYfQU^|7eXNi`OQwH}GRufv06LW| z09EgO$4tsz0J3_v#~Uj(;BVT0@mn|9e&<$Udxy=@;DL1qE}AIgB@{_bYJwR%A%y0k z)y@>l^%;+um<Cc)j=?;z8PS_^%+!(%D~wv3>iHh4yJb#;<+hb3b7~DHX*iFiF=Tix z((+X&CtM7#V7r0{`TGNs^>|#F7~d?z2QmUm)i)HwZJX{I#M899`a(y`-^;EopGtYC z0gsQ1ej@4{;LvpLi=>ayW<_kVZaz(2?^>AQT(czl`UA;Qm1@<a*BSS>Us#jmlx~lC z>UL=-%ry5=cGSnUJ*T(UVM|Spt}VG|7HVB`PbZ$%IZ<NFN$ikF8qWy!MbtO+d!-s( z2)?faH&mp?DX-3dh0bcRJ3i0qM&YrGkNRK@Ndt=#giYh<zGv1p6x=ePpVW<4#xl|! 
z-|@1}vV0!p7^#MxZvf`?GmH^5POp<l!Zp*M#KcKA<UYCxdhz}W@kI`uZ}J9e%No<w zVm2FJiE}!Wh_ex{m<ZI@fwa8QcAd7~*EM5K`7_5TL&WN1N3Xwh8^SK%p>WvjsB<E& zS&bgPZFX{JRLehxo^G?w%hD*h7RsV-=-PhCQHu9A%2DQn1t)cvxk$)Er}o#UA8dFy z<@8WyZPV7;6hx2a#sOy3wYSPE&A;t`Tvg}tGP~zt3zjeqn_l$LKbYI8Ez>YKHd@&` z*wHO(7I7&#v9jkMRAWR}O|#6=L1~MIk&6cz%KbCjQqi$~uljFYs%)6)2nq?04f!M; zLOPw58d5D@VdkakKO{_g*f8|5IjbfX6t9Wczv$~HUI`%<K30|=Fhuch|0wl-`=`5D zM_0Sb%2nsK$`q~-+!LUwByZjbw>vH_|7a|X_eyJhA#$`#d2MiK%!PX|t<$Aj)ii@~ zM$$_}g*uyKOpH*ii1V&jMz1?;Of{b>Q+QKDE^&N<&vto{7nt#My`Xqjn^v|<$gOIn z@RoI$?D9<VM1~I|+q>Q_{-q4>?S~B?fnv0yy82zQzjvy~R<#VTJDu<GLd$*K@ZvFL z(LC2JRi!6ZJV`oUvTa-R=1ot6i-*S(k2sB38|t<v?qC|}awLVXcx?%@Oiyfz4SRI@ zLD-YAu)n>!!c?TSl%8(gf_VApEon!j){bp11>P#O<m1#0)sG8>?Im=C@5ltjgjFpQ zK6tKki?G^L%`X9;?uL)bM1&%Oq?dOY5}D231yAyOv&ECdgtR^JyT{LoMZhKMmVL6% zUtHeIBzDEoD(!o0M%2}lQ)o+gjmt#O?CQ9k)Y_Ex7<yrCO=-H&8wkBOp!W^*8f0J4 zVb~jmo+>!DEQh&K%;EXg_3a0*t&Dl}N_2ncKH~CkyRZ-O$D$jxHngOA$(ARaTk$w< z#guDQ!KU%2OL7fmgpG6HRn_=aafZ4?1J2%HX<=ddmMvJ*Z7=t~dp@zFE_bZ=RI)C} zj2F*lC&v~P-7>n_6?8E_DEkoZR9=ITona>Ec~;!1O<ktwA0PFBH-2x+ux~`*X}6jy zbl78!RF?aE7KU2<%+1f&{QoE{5(}G$VSi+G>i};mXhHU%f~Hh=I$4+P4>%&BVH^?* zVDLz17&OSCvUmW7pZ8&)C6&we;ZUetXh*Q%uqjqlo+AkD831Ta_2)t82L3!FD;~5f z04P2mh{TW$k)u$v9cYsP^`Jh)kqhuQ7fTofnpcZ2Vs5j6xIuaWvx;#6a0lI;3pm2~ zYFomb0f>VSpXE=aeD3r6)EuQAnMY=_Jr-EZYi~;Bk=>yY05u2rA)D|kgCEm?f#w#b z0E$GKS8{e%^P(&XFg~5-4>q&YTo@(7h`)~VyYqv(WG<C&7*OBNSkG{W@-`~dm&&74 z$g}3sr*bJAx;Kx_0o3>`X3a4>0-2M};qvr6$s7QK22IHGZ4?>}?xMT%Jh_fAG=VSZ z&mWG+w;=xqVZd-WJh1TcVGHe_|Nq1THrFPg(epeAze$-Lf#LCdi_Br+A!{t)&%uA? z_gxb5hjhN~3vl#zSwswG0XnNU62R~w5Z!ltsLlWJVbIGDL>6=^p|d<7v#@knT>AIE zm5&>bANBk|f&97<eSAaz6hgoGVYX{V_M-l0hv`BK#}PpU5NH$;AmGse0gH0xpN>4Z zkegtj4wu4TL=Y5;f3o*BD&50#zRUMDe~eEG1C61urBifR9!x5Lgn?E(s+TRac(5?g zl+NWs)0uVhtOG&2c{VtV8pIWf2xu`XfO?=2Xa-t<P|$!jpdCmC-9QTH4pPBcp&$*U z(S50)2j~gXK`)R6vgj--$OhTaUvJQx>B9v%AQ$9PeW@&vOZNx4OfuIK<a(1SRFDVq z{Meum=nDpbfnXq&!&aQN40Il(VgM9BHhx|W{QkU6=T3}29~hz8{p<<7g%c(QsfHtw zzMmy+Y(=cCYFbH~`cgK_wWb(aV~ulcT<+^xoDjG#Ju|XaB~ng60JrqMw33SiLwR%H z+t?NdiKT)UZ%O1uid(yovIHa)cP^2<EsZD(+*jsj-dZ*B4gP4_L7CB7@BZ*yS;g0P zUt<oSap_A14%+WNg4UKNXel-o-Ken@(0sNOTybhu!pwp4Z3PD`H|!@ZIU{nSm_G4E z%EirJOT?@p?5>|e`rj6IA@y)fmV$;B4zY{1abs~$^}~lV&os<fEY=5v!M86)!^v^+ z=dQ+qui&2*@7HHXud&H3?T*mSvCf3oStS_Q)73sy_4S=}f9ZjWx_{D(^NmLhw6&#` zcX!qp4i60s6r7ZG(rd;P_Vw3ZSY@NeIjo76$C;lw{na&jjR9h3@X5jWgtM!UCnu&B zWhh)Bs$r=N|2+=wQzZkULPl_pHi-`jS=~blO3|DPcdr;&FO@LJxsZLuQz_iRJiv-% zy`g`HSK1c^q5G+@w3N$#D@8h(30mbofhnB~j7TzwQaqMqYHum9XE`o0$>@w#t7gHP zhW$mMZ&tPc<tvh#+@2If`C#X%=!DpS^uZ(0W#l6U*UxL6UsoWO<7Ohs&WswQxLjYZ z;>NJk^y(>yqleOTXt4ZuwD7^MOxdty?t2b)WqlZJ4cYrF?KN_P<c&UFaMAX=V(YJl zryu@owCh>^G{P9B__<<}ibug6=c5@Z50~7dF~?~}QX*wbu0Ec&h2=@$<NuaA5cwB< zm06OpB=%@zWO&)q+Fi>qD@Vrs2g1cJIxZD$c$TaY`A$gib-|-tiM#Y5Gu=}}a7%fW zA8%9p?YL1w-m*hZGloO%(^~RiiDn%Wjk#M%w(kSHfnnRX?rW!Viq~tVuXJh>o+y+w zwtoI1ZL4bSvf#_n%SApI2g|O+=+{fHo0#%7(5w%Px-DBa)xSnR-<L-p${l}qH9GG& zPrNEle8YjmyZ@3FT-H<CVSh5f);2lV<g*Qy!Ly0+YI(7IvwmB_sqWoEJB`D8MW3#J zda60pE^qyk#YLglluxfX6W#pze63gArXh;?vLh04+Rq$JX(tBLvo=z@pMQviBUjUU z{LxDJ2Rt-CzBRg6_78an>tfqqLDyD(Y8G8j?C>q0sjq8x?P+g3hLrE$_wl{H3Sm`D zA+ziGopN`Tk#$y!Tw3q4a#jdYM2tJwiwU)&B*~-V5``>8-H2JZiL?KOi|Vo&^)W}M z+xJ=BZ?gLQcw-Goqwv-VXEWI?m_WJRi)C-8^zOUXBD}6g8^P6<8i{$cW>3n#eqFuB zQ0`TqeoK+BbxvYI=?5KRifn{z9=*%F-X}U`r*h&ri%_eYR>v&Oa3Aix-T5FnLpnFB zz%%9Wfr>*?Doc9fYcmoaCyv8+o0=F!YH$om#V1CUzv5S{lslzTcUZ7$rQ5d;MWdtU zkM}q>nRW`S=S*Msiv7fB*VFVjD-YMx#OLODO4#>FkHqVk)ucQq+GTG)Q=?9o%^y3u zk(w%p_i5`jlsfJajUs=wvjNHpONAm@MMqMqKj#;>744BIzF&Lci_@W1jn5L?XbE?c z_7<<3*mdr4=;fTkoWu$Iiz8RPbutzu(JBp{<Q=xJW`qizpooXSV0sC)o)wMPWHpu_ 
zaV+pYIr(00rqJ}pzGtsJK{uOJ8I8@MUD2w~l}`98eu!(nsAgY^{+QztF{r1FZ^>Gz z@hDD8`2oekVdi6+rCVXyz?heYZhpe3!hwF+yY~@azlQAdsWDabL%n){QQK%+)712# zQ%}WSbn&W__6{j$AIMr|vOi-VDiJf%%kr<YhAuP5YokJaj4sud<ieDnJhxXImbAkx zZ40P=<u`!Y!p2DE`afUX;QW`?v)nhg1O|zbx1}|mpRKskXk_18hraBw8+9daAX7p5 zM(3#Z=OYz>Qp}8VdH3s)K|oe=s7{j!ax(8UT9kRU9LqT@Pxl;_wJ;87((x7J22GAO znx{KhyY79HjDg3Voyrv4KE9SabFX{Zxx%jJH~YpI8sDrfQadv*PrfRiKDPuV-%~|# zS}sCveYx0wGLI?me1pS154vq^6}zpmyCXfMr>*4PbY7GElUT&mJ+b!Y-Jv-u#V#AF zr)hZ~!jEDJy2H}-28=up?VT7W@Jpme<cxGW#-WH*@bQHFuIL>Bkwt|z7sJgMu~)}l zI-hxOvb5^#CwkPPp7_1@#G6_=k51ZsGZIy-j{4F&%JP%Rlv+9bV54s8Awj$Dkg4wW zgv_KDjTh|Hm7X-de5Xnn@T^qHUX2JzMjbJtYgQta0x>OuoAQL5mfd^3H(cj)eO%=y z<k0anc&c+pPs&XF*AlV!wJ$O>w=!yzUcbMy3Mu<$amkUIEp6H)+=_c?rnoaXf*zA8 zh|-6rCbum&?fyq2+OFa0qPAd9*A7y719l{O`M~0=kWu0CG|hm!Zl8`A)fwpRcMUi{ z_-L`UuVv?hH8S1B6_VG_Nw03LP5CM?B|54{bkS((>lG5S_FUTNgT!xID%gGE@0Cwu zRAtyR<r81(9t<oN5wR7IGq8L-iq|uuC|g}VAW6#(4lUhRzs!TQ^PPT+P=0ijdDfaX zMsEc9n$C`4lyBe0QMaNfjVRd`A%_#iW?}N@a$7>yxltR0ol|+|rw1&b_D?g{h6eeu zAMWZ`B&-V#v$UA%?OuJjW_v|X*X`Ui<!qr!@>a_WLYFES<l>sw*EW=jV_x^}?m;Sw zo`}A+Z{-fOawntsWr^|U0n1pa;1>s@{Qq)EI8?sL<x5y_ybXH1bfl$qxW`2?N0F2g zQ+UHlfOvJ_%kFoX-QS|(@7mMQP2R>!gm7+Wmy1%e;3~z06)^EFc@eKhEGTjD*(#D= zQ_sRKNO5(nO)s?!spYfJi`I)Y27lW4psXq7Tl3BI0AMCB@gGp05?@<q+6(ld+L}5Z zzd6+W*+|<^IGdq+<K`2A8v(_al@9zPaOW#|Q~QV8IcRIKrYX_p1d)gU^x{Shhs_7B z`RIO%=-Doiv0m?*^iTjJ;{yDqC781&dLZwNS!CBI3o(i47`Eh@wP(G@c}dutxkW2| zZWOsmRhVag8(ze?BVyum+f6a|d1&ystoUJ-myL1Br-PI=aobOYe=Z*LZ9MzlN+6@} zop$1_MtLErME}KOa#JNSu5KazDb^hccB^kQI>KZ-Kh<p&oQ~|3x*FTB!CLu#b;pEn z;^p{o=PBp&R}6D)MO35pUKXu*Gk!e|Sd-bg{uXLWTZpCR_S6;2hwj)`!f!6U7_k4? zx1A*ud;FaNH!Txam%_~_++S8_;2*r$e^@fI_{)cW_tuH)0z|Y@*ybW9HwD?28bjoE zIfZt!SL^a&7eW98OKQ#TTYEQ=(h?*yn<NP3UQb;1wV+;u8{4K%tbJNPXu{mibl5Mq zLM{EaR~@-SQDgPT&v_Td4=TI1%3jGW_Z~FSa7*nzGJa&rKUB`<Ro*$fYuy{{wr_at z*6-e%rIWQxC}&aW*e&<bbSaYo$x3FE)l@{``(u7`ANzBjAC?V^Xb-3+XxAkoF2(9^ zc$hua7c{QdzuM)-v;7l<Ji*U5Tx-9yU)(6==<c%uen`0@xjaYaYGCHtf@60Cd!}4S zruS|KPOO3j02L?o4G!%;C?dB?KQnb?&rYmC&(5nc?@veV0z%cy)bD=IdEk&L*Coy< z4GD-DP7|qf70L^`#!ZMWi0IOY5WMCqG8jM#d|TMs?wYAIxGU%RDqG>vy%ZItb@feE zp+T?Avnktzrw_5h525VDRgaT0EJxmLuHAW5x+b(HJzCaZeYCNb9yimP#H|jI<Lz9H zI&-r-^smr8ma?okPDPHH8Eo>j5$)TXrJUp6)R(VPBpj51KjkrK!XmE5%W}7T61pd! z?JV6km3Ap-;zYC)u0KS&OycGqs@=(WhTY_&M>(n&;-uckT34Rc(P=JPdEVD<lbTt3 z>equWN^8wcwY$sO9j_PwI>@qt*Hxu;PdABGdz?>rw@Pj^w>t4osd(oeb+GP!ZKp!% zJ%y!SEY(4Buk)Ad`ds+!JM!2nAA|aa#Fw{*wfe8^gIy06Aa=v6#4Hpy86WR6irW?^ zJEA8$vZGV(*7{vGy4y0&C)p;)nC$x2)E#v3ZqPD=ui8f)&bja}ZNJtzW^eRA-`^JQ z`aiB2cBUo^cMKF3ulD_pp{eZ;U2Ygu9^gjxptH2#?@KG-0Nq^+z6)oHH1*b_deXNA za;R2;X4aHI21U&su1$hzYLlS?3l-r{_D0ZV%i&xAy2@$E=~<c>%JC8W)&0G^y`Yi< z;P1s`an=2`;Iky^P@4}1;lL~kkD&#(GqnJCY&MfY=K*L{HB}S>g;4=$9O%aB$L25q zlqwz$vE;bZ)GeWE%N#?frv>-q@x0YR(9h3L)eob}=6HZ8H8nL5i3ZVV1cZU$2C#T! 
ze*}xW8Im`v2;J^G8H=krtCK2)?FG{9K$I#HToANSg`l%p0ADcKjqSsO=0H(XBco_| z1P<j!M4(Vq3IYi|2m+Qy#<@|5Sg6jhAj$hD4Sx{@4M4ImkO`ocI#fw=_n}ZZT5ux^ z6Trxm&E?G%KmejD0R;!=XZ|lJ1bO5?&mD5({Nyzu9CcHuc0dczhw4})Q)v1KGd98( zutpQr0i?Q`ItH(<jsrBo-!LJUL!^4pM-b?v$bUil6Xs8(Y>qV)2qX`dk2jOe1F))C zz?;mYcru|IITJw6Nu4E~$J7M*-ucNm|Dw6b-lnGix52!;_;Jj~Q`cuxe7vAqhOz#C z872RsW8Ep=)%5n^FlQHnI|ZaNseFyNkZ)0tp6(QN8k^%q=7+JjH&kgM^PuPkp%{dd zG@(HroyVk-W|x@`lQ-93()r6qok?bSXu<sv?o=AthslGJ6cpzS&!m&)8nYDO4=zxP z!jA<uXO58-b?*X45(QdHJ{&srHx{!fd=}pcp|<;cH4xJDPeu!2!jF~zjoB{@{zI_L zf}pel#SQp9X^~*Szx4cjjDBQlDF180{rGwKkDcJBqSHcNQTR`3MG-2R@!bQNmA_aO z`0Lz^?F*ox>gntf$KU`o5`}{BplmRUqOZ;epipQcQVoO0VU>|6I1&k8;Kb%AEV$$M zRSTh5^J7Z^YCvVLJyZ(sJ1*aK5Lm~D=LvCGhz!N=C1-yQl?Ihxp&}qZ&(1vn9*2YU zqXF|U6afVlF!>L_T7VHSc&N<%D-4ClLuJz6U^wI-v`8chD$oChhb8`jha&Kc%D>?e ziGR?d(Q1%cf5$`rfrldE{(zzJ7^tZHD=m(Q`U%G4kf8!32l`A6fL8QC=-UGp04>>2 z_MH9f0D#6U8XK6cPC_MCeZWx@O(fzmSOT<FVTgu?YC1YZH3Jl0A4Aa5L7}lahNS-= b!<qm6fy*Ovc(V>7;0S1#yu5+AA?$wu_;$Hx literal 0 HcmV?d00001 diff --git a/BookGPU/Chapters/chapter18/preamble.aux b/BookGPU/Chapters/chapter18/preamble.aux new file mode 100644 index 0000000..65abe7e --- /dev/null +++ b/BookGPU/Chapters/chapter18/preamble.aux @@ -0,0 +1,32 @@ +\relax +\@setckpt{Chapters/chapter1/preamble}{ +\setcounter{page}{1} +\setcounter{equation}{0} +\setcounter{enumi}{0} +\setcounter{enumii}{0} +\setcounter{enumiii}{0} +\setcounter{enumiv}{0} +\setcounter{footnote}{0} +\setcounter{mpfootnote}{0} +\setcounter{part}{0} +\setcounter{chapter}{0} +\setcounter{section}{0} +\setcounter{subsection}{0} +\setcounter{subsubsection}{0} +\setcounter{paragraph}{0} +\setcounter{subparagraph}{0} +\setcounter{figure}{0} +\setcounter{table}{0} +\setcounter{numauthors}{0} +\setcounter{parentequation}{0} +\setcounter{subfigure}{0} +\setcounter{lofdepth}{1} +\setcounter{subtable}{0} +\setcounter{lotdepth}{1} +\setcounter{lstnumber}{1} +\setcounter{ContinuedFloat}{0} +\setcounter{AlgoLine}{0} +\setcounter{algocfline}{0} +\setcounter{algocfproc}{0} +\setcounter{algocf}{0} +} -- 2.39.5