From 013c063fb5cb99cc33b413d937f330acc42ca73e Mon Sep 17 00:00:00 2001
From: couturie <couturie@extinction>
Date: Wed, 6 Feb 2013 14:19:18 +0100
Subject: [PATCH] ch18

---
 BookGPU/Chapters/chapter18/biblio.bib         |  35 ++
 BookGPU/Chapters/chapter18/ch1.aux            |  64 +++
 BookGPU/Chapters/chapter18/ch18.aux           |  71 ++++
 BookGPU/Chapters/chapter18/ch18.tex           | 392 ++++++++++++++++++
 BookGPU/Chapters/chapter18/ch18.tex~          | 296 +++++++++++++
 .../chapter18/figures/curve_time_bbs_gpu.pdf  | Bin 0 -> 7587 bytes
 .../figures/curve_time_xorlike_gpu.pdf        | Bin 0 -> 8005 bytes
 BookGPU/Chapters/chapter18/preamble.aux       |  32 ++
 8 files changed, 890 insertions(+)
 create mode 100644 BookGPU/Chapters/chapter18/biblio.bib
 create mode 100644 BookGPU/Chapters/chapter18/ch1.aux
 create mode 100644 BookGPU/Chapters/chapter18/ch18.aux
 create mode 100755 BookGPU/Chapters/chapter18/ch18.tex
 create mode 100755 BookGPU/Chapters/chapter18/ch18.tex~
 create mode 100644 BookGPU/Chapters/chapter18/figures/curve_time_bbs_gpu.pdf
 create mode 100644 BookGPU/Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf
 create mode 100644 BookGPU/Chapters/chapter18/preamble.aux

diff --git a/BookGPU/Chapters/chapter18/biblio.bib b/BookGPU/Chapters/chapter18/biblio.bib
new file mode 100644
index 0000000..f3b0ccc
--- /dev/null
+++ b/BookGPU/Chapters/chapter18/biblio.bib
@@ -0,0 +1,35 @@
+@misc{ch1:cuda,
+  author = {{NVIDIA Corporation}},
+  keywords = {CUDA},
+  note = {Version 4.0},
+  title = {{NVIDIA CUDA C} Programming Guide},
+  year = 2011
+}
+
+@Article{ch1:Buck:2004:BGS,
+  author =	"I. Buck and T. Foley and D. Horn and J.
+		 Sugerman and K. Fatahalian and M. Houston and P.
+		 Hanrahan",
+  title =	"{Brook} for {GPUs}: stream computing on graphics
+		 hardware",
+  journal =	"ACM Transactions on Graphics",
+  volume =	"23",
+  number =	"3",
+  pages =	"777--786",
+  month =	aug,
+  year = 	"2004",
+}
+
+
+@article{ch1:CMR:12,
+     author    = "B. Cloutier and B. K. Muite and P. Rigge",
+     title     = "A comparison of CPU and GPU performance for Fourier pseudospectral simulations of the Navier-Stokes, Cubic Nonlinear Schrödinger and Sine Gordon Equations",
+     journal   = "Computational Physics (physics.comp-ph)",
+     year      = "2012",
+     archivePrefix = "arXiv",
+     eprint        = "1206.3215",
+     primaryClass  = "physics.comp-ph",
+}
+
+
+
diff --git a/BookGPU/Chapters/chapter18/ch1.aux b/BookGPU/Chapters/chapter18/ch1.aux
new file mode 100644
index 0000000..2c84cf4
--- /dev/null
+++ b/BookGPU/Chapters/chapter18/ch1.aux
@@ -0,0 +1,64 @@
+\relax 
+\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}}
+\@writefile{loa}{\addvspace {10\p@ }}
+\@writefile{toc}{\contentsline {chapter}{\numberline {1}Presentation of the GPU architecture and of the Cuda environment}{3}}
+\@writefile{lof}{\addvspace {10\p@ }}
+\@writefile{lot}{\addvspace {10\p@ }}
+\newlabel{chapter1}{{1}{3}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.1}Introduction}{3}}
+\newlabel{ch1:intro}{{1.1}{3}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.2}Brief history of Video Card}{3}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.3}GPGPU}{4}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.4}Architecture of current GPUs}{5}}
+\@writefile{lof}{\contentsline {figure}{\numberline {1.1}{\ignorespaces Comparison of number of cores in a CPU and in a GPU.\relax }}{5}}
+\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
+\newlabel{ch1:fig:comparison_cpu_gpu}{{1.1}{5}}
+\@writefile{lof}{\contentsline {figure}{\numberline {1.2}{\ignorespaces Comparison of low latency of CPU and high throughput of GPU.\relax }}{6}}
+\newlabel{ch1:fig:latency_throughput}{{1.2}{6}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.5}Kinds of parallelism}{7}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.6}Cuda Multithreading}{7}}
+\@writefile{lof}{\contentsline {figure}{\numberline {1.3}{\ignorespaces Scalability of GPU.\relax }}{8}}
+\newlabel{ch1:fig:scalability}{{1.3}{8}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.7}Memory hierarchy}{9}}
+\@writefile{lof}{\contentsline {figure}{\numberline {1.4}{\ignorespaces Memory hierarchy of a GPU.\relax }}{10}}
+\newlabel{ch1:fig:memory_hierarchy}{{1.4}{10}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.8}Conclusion}{10}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{11}}
+\@setckpt{Chapters/chapter1/ch1}{
+\setcounter{page}{12}
+\setcounter{equation}{0}
+\setcounter{enumi}{0}
+\setcounter{enumii}{0}
+\setcounter{enumiii}{0}
+\setcounter{enumiv}{3}
+\setcounter{footnote}{0}
+\setcounter{mpfootnote}{0}
+\setcounter{part}{1}
+\setcounter{chapter}{1}
+\setcounter{section}{8}
+\setcounter{subsection}{0}
+\setcounter{subsubsection}{0}
+\setcounter{paragraph}{0}
+\setcounter{subparagraph}{0}
+\setcounter{figure}{4}
+\setcounter{table}{0}
+\setcounter{numauthors}{0}
+\setcounter{parentequation}{0}
+\setcounter{subfigure}{0}
+\setcounter{lofdepth}{1}
+\setcounter{subtable}{0}
+\setcounter{lotdepth}{1}
+\setcounter{lstnumber}{1}
+\setcounter{ContinuedFloat}{0}
+\setcounter{AlgoLine}{0}
+\setcounter{algocfline}{0}
+\setcounter{algocfproc}{0}
+\setcounter{algocf}{0}
+\setcounter{proposition}{0}
+\setcounter{theorem}{0}
+\setcounter{exercise}{0}
+\setcounter{example}{0}
+\setcounter{definition}{0}
+\setcounter{proof}{0}
+\setcounter{lstlisting}{0}
+}
diff --git a/BookGPU/Chapters/chapter18/ch18.aux b/BookGPU/Chapters/chapter18/ch18.aux
new file mode 100644
index 0000000..f9f97ae
--- /dev/null
+++ b/BookGPU/Chapters/chapter18/ch18.aux
@@ -0,0 +1,71 @@
+\relax 
+\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}}
+\@writefile{toc}{\author{Christophe Guyeux}{}}
+\@writefile{loa}{\addvspace {10\p@ }}
+\@writefile{toc}{\contentsline {chapter}{\numberline {14}Pseudo Random Number Generator on GPU}{305}}
+\@writefile{lof}{\addvspace {10\p@ }}
+\@writefile{lot}{\addvspace {10\p@ }}
+\newlabel{chapter18}{{14}{305}}
+\@writefile{toc}{\contentsline {section}{\numberline {14.1}Introduction}{305}}
+\@writefile{toc}{\contentsline {section}{\numberline {14.2}Basic Recalls}{306}}
+\newlabel{section:BASIC RECALLS}{{14.2}{306}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {14.2.1}Devaney's Chaotic Dynamical Systems}{306}}
+\@writefile{toc}{\contentsline {section}{\numberline {14.3}Toward Efficiency and Improvement for CI PRNG}{306}}
+\newlabel{sec:efficient PRNG}{{14.3}{306}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{306}}
+\newlabel{algo:seqCIPRNG}{{14.1}{306}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {14.1}C code of the sequential PRNG based on chaotic iterations}{306}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{307}}
+\newlabel{sec:efficient PRNG gpu}{{14.3.2}{307}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.3}Naive Version for GPU}{307}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {12}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{308}}
+\newlabel{algo:gpu_kernel}{{12}{308}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.4}Improved Version for GPU}{308}}
+\newlabel{IR}{{13}{309}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {13}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{309}}
+\newlabel{algo:gpu_kernel2}{{13}{309}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.5}Chaos Evaluation of the Improved Version}{309}}
+\@writefile{toc}{\contentsline {section}{\numberline {14.4}Experiments}{310}}
+\newlabel{sec:experiments}{{14.4}{310}}
+\@writefile{lof}{\contentsline {figure}{\numberline {14.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{311}}
+\newlabel{fig:time_xorlike_gpu}{{14.1}{311}}
+\@writefile{lof}{\contentsline {figure}{\numberline {14.2}{\ignorespaces Quantity of pseudorandom numbers generated per second using the BBS-based PRNG\relax }}{312}}
+\newlabel{fig:time_bbs_gpu}{{14.2}{312}}
+\@setckpt{Chapters/chapter18/ch18}{
+\setcounter{page}{313}
+\setcounter{equation}{0}
+\setcounter{enumi}{2}
+\setcounter{enumii}{0}
+\setcounter{enumiii}{0}
+\setcounter{enumiv}{0}
+\setcounter{footnote}{2}
+\setcounter{mpfootnote}{0}
+\setcounter{part}{1}
+\setcounter{chapter}{14}
+\setcounter{section}{4}
+\setcounter{subsection}{0}
+\setcounter{subsubsection}{0}
+\setcounter{paragraph}{0}
+\setcounter{subparagraph}{0}
+\setcounter{figure}{2}
+\setcounter{table}{0}
+\setcounter{numauthors}{0}
+\setcounter{parentequation}{4}
+\setcounter{subfigure}{0}
+\setcounter{lofdepth}{1}
+\setcounter{subtable}{0}
+\setcounter{lotdepth}{1}
+\setcounter{lstnumber}{15}
+\setcounter{ContinuedFloat}{0}
+\setcounter{AlgoLine}{14}
+\setcounter{algocfline}{13}
+\setcounter{algocfproc}{13}
+\setcounter{algocf}{13}
+\setcounter{proposition}{1}
+\setcounter{theorem}{0}
+\setcounter{exercise}{0}
+\setcounter{example}{0}
+\setcounter{definition}{0}
+\setcounter{proof}{1}
+\setcounter{lstlisting}{1}
+}
diff --git a/BookGPU/Chapters/chapter18/ch18.tex b/BookGPU/Chapters/chapter18/ch18.tex
new file mode 100755
index 0000000..3808995
--- /dev/null
+++ b/BookGPU/Chapters/chapter18/ch18.tex
@@ -0,0 +1,392 @@
+\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte}
+\chapterauthor{Christophe Guyeux}{Femto-ST Institute, University of Franche-Comte}
+
+
+\chapter{Pseudo Random Number Generator on GPU}
+\label{chapter18}
+
+\section{Introduction}
+
+Randomness is important in many fields, such as scientific
+simulations or cryptography.  ``Random numbers'' can be generated by
+a deterministic and reproducible algorithm called a pseudorandom
+number generator (PRNG). In this chapter, we focus on reproducible
+generators, useful for instance in Monte-Carlo based simulators.
+These domains need PRNGs that are statistically irreproachable.  In
+some fields, such as numerical simulation, speed is a strong
+requirement that is usually attained by using parallel architectures.
+In that case, a recurrent problem is that a deflation of the
+statistical qualities is often reported when a good PRNG is
+parallelized.  This is why ad hoc PRNGs must be found for each
+possible architecture in order to achieve both speed and randomness.
+On the other hand, speed is not the main requirement in cryptography:
+the great need is to define \emph{secure} generators able to
+withstand malicious attacks. Roughly speaking, an attacker should not
+be able in practice to distinguish between numbers obtained with the
+secure generator and a truly random sequence.  In an equivalent
+formulation, he or she should not be able (in practice) to predict
+the next bit of the generator, given knowledge of all the binary
+digits that have already been released. ``Being able in practice''
+refers here to the possibility of achieving this attack in polynomial
+time, and to the exponential growth of the difficulty of this
+challenge when the size of the parameters of the PRNG increases.
+
+
+Finally, a small part of the community working in this domain focuses
+on a third requirement, namely the definition of chaotic generators.
+The main idea is to benefit from a chaotic dynamical system to obtain
+a generator that is unpredictable, disordered, sensitive to its seed,
+or in other words chaotic.  The goal is to map a given chaotic
+dynamics into a sequence that seems random and unassailable due to
+chaos.  However, the chaotic maps used as a pattern are defined on the
+real line, whereas computers deal with finite precision numbers.
+This distortion leads to a deflation of both the chaotic properties
+and the speed.  Furthermore, authors of such chaotic generators often
+claim that their PRNG is secure due to its chaos properties, but
+there is no obvious relation between chaos and security as it is
+understood in cryptography.  This is why the use of chaos for PRNGs
+still remains marginal and disputable.
+
+
+The remainder of this chapter is organized as follows.
+TO BE COMPLETED
+
+
+\section{Basic Reminders}
+\label{section:BASIC RECALLS}
+
+This section is devoted to basic definitions and terminology in the
+fields of topological chaos and chaotic iterations. We assume the
+reader is familiar with basic notions of topology (see for
+instance~\cite{Devaney}).
+
+
+\subsection{Devaney's Chaotic Dynamical Systems}
+
+
+TO BE COMPLETED
+
+
+
+\section{Toward Efficiency and Improvement for CI PRNG}
+\label{sec:efficient PRNG}
+
+\subsection{First Efficient Implementation of a PRNG based on Chaotic Iterations}
+%
+%Based on the proof presented in the previous section, it is now possible to 
+%improve the speed of the generator formerly presented in~\cite{bgw09:ip,guyeux10}. 
+%The first idea is to consider
+%that the provided strategy is a pseudorandom Boolean vector obtained by a
+%given PRNG.
+%An iteration of the system is simply the bitwise exclusive or between
+%the last computed state and the current strategy.
+%Topological properties of disorder exhibited by chaotic 
+%iterations can be inherited by the inputted generator, we hope by doing so to 
+%obtain some statistical improvements while preserving speed.
+%
+%%RAPH : j'ai viré tout ca
+%% Let us give an example using 16-bits numbers, to clearly understand how the bitwise xor operations
+%% are
+%% done.  
+%% Suppose  that $x$ and the  strategy $S^i$ are given as
+%% binary vectors.
+%% Table~\ref{TableExemple} shows the result of $x \oplus S^i$.
+
+%% \begin{table}
+%% \begin{scriptsize}
+%% $$
+%% \begin{array}{|cc|cccccccccccccccc|}
+%% \hline
+%% x      &=&1&0&1&1&1&0&1&0&1&0&0&1&0&0&1&0\\
+%% \hline
+%% S^i      &=&0&1&1&0&0&1&1&0&1&1&1&0&0&1&1&1\\
+%% \hline
+%% x \oplus S^i&=&1&1&0&1&1&1&0&0&0&1&1&1&0&1&0&1\\
+%% \hline
+
+%% \hline
+%%  \end{array}
+%% $$
+%% \end{scriptsize}
+%% \caption{Example of an arbitrary round of the proposed generator}
+%% \label{TableExemple}
+%% \end{table}
+
+
+
+
+\lstset{language=C,caption={C code of the sequential PRNG based on chaotic iterations},label={algo:seqCIPRNG}}
+\begin{small}
+\begin{lstlisting}
+
+unsigned int CIPRNG() {
+  static unsigned int x = 123123123;
+  unsigned long t1 = xorshift();
+  unsigned long t2 = xor128();
+  unsigned long t3 = xorwow();
+  x = x^(unsigned int)t1;
+  x = x^(unsigned int)(t2>>32);
+  x = x^(unsigned int)(t3>>32);
+  x = x^(unsigned int)t2;
+  x = x^(unsigned int)(t1>>32);
+  x = x^(unsigned int)t3;
+  return x;
+}
+\end{lstlisting}
+\end{small}
+
+
+
+Listing~\ref{algo:seqCIPRNG} presents a sequential version of the
+proposed PRNG based on chaotic iterations.  The xor operator is
+represented by \textasciicircum.  This function uses three classical
+64-bit PRNGs, namely \texttt{xorshift}, \texttt{xor128}, and
+\texttt{xorwow}~\cite{Marsaglia2003}.  In the following, we call them
+``xor-like PRNGs''.  As each xor-like PRNG works with 64 bits whereas
+our proposed generator works with 32 bits, we use the cast
+\texttt{(unsigned int)}, which selects the 32 least significant bits
+of a given integer, and the expression \texttt{(unsigned
+  int)(t$>>$32)} to obtain the 32 most significant bits of \texttt{t}.
+
+Thus, producing a pseudorandom number requires 6 xor operations on 6
+32-bit numbers that are provided by 3 64-bit PRNGs.  This version
+successfully passes the stringent BigCrush battery of
+tests~\cite{LEcuyerS07}.  At this point, we have thus defined an
+efficient and statistically unbiased generator. Its speed is directly
+related to the use of linear operations, but for the same reason,
+this fast generator cannot be proven secure.
+
+
+
+\subsection{Efficient PRNGs based on Chaotic Iterations on GPU}
+\label{sec:efficient PRNG gpu}
+
+In order to benefit from the computing power of a GPU, a program
+needs to have independent blocks of threads that can be computed
+simultaneously. In general, the larger the number of threads, the
+more local memory is used, and the fewer branching instructions (if,
+while, ...) are used, the better the performance on GPU is.
+Obviously, with these requirements in mind, it is possible to build a
+program similar to the one presented in Listing~\ref{algo:seqCIPRNG}
+that computes pseudorandom numbers on GPU.  To do so, we must first
+recall that in the CUDA~\cite{Nvid10} environment, threads have a
+local identifier called \texttt{ThreadIdx}, which is relative to the
+block containing them. Furthermore, in CUDA, the parts of the code
+that are executed by the GPU are called {\it kernels}.
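+
+As an illustration (this sketch is not part of the generator and the
+kernel shown is hypothetical), a minimal kernel looks as follows; the
+computation of a global thread identifier from \texttt{blockIdx},
+\texttt{blockDim}, and \texttt{threadIdx} is the standard CUDA idiom:
+
+\begin{small}
+\begin{lstlisting}
+// Minimal CUDA kernel: each thread writes its global index.
+__global__ void identity(unsigned int *out, int n) {
+  // threadIdx.x is local to the block; combining it with the
+  // block index yields a unique global identifier.
+  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  if (id < n)
+    out[id] = id;
+}
+\end{lstlisting}
+\end{small}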
+
+
+\subsection{Naive Version for GPU}
+
+ 
+It is possible to deduce from the CPU version a quite similar version
+adapted to the GPU.  The principle is simply to make each thread of
+the GPU compute the CPU version of our PRNG.  Of course, the three
+xor-like PRNGs used in these computations must have different
+parameters.  In a given thread, these parameters are randomly picked
+from another PRNG.  The initialization stage is performed by the CPU.
+To do so, the ISAAC PRNG~\cite{Jenkins96} is used to set all the
+parameters embedded in each thread.
+
+The implementation of the three xor-like PRNGs is straightforward
+once their parameters have been allocated in the GPU memory.  Each
+xor-like PRNG works with an internal number $x$ that stores the last
+generated pseudorandom number. Additionally, the implementations of
+xor128, xorshift, and xorwow respectively require 4, 5, and 6
+unsigned longs as internal variables.
+
+
+\begin{algorithm}
+\begin{small}
+\KwIn{InternalVarXorLikeArray: array with internal variables of the 3 xor-like
+PRNGs in global memory\;
+NumThreads: number of threads\;}
+\KwOut{NewNb: array containing random numbers in global memory}
+\If{threadIdx is concerned by the computation} {
+  retrieve data from InternalVarXorLikeArray[threadIdx] in local variables\;
+  \For{i=1 to n} {
+    compute a new pseudorandom number as in Listing~\ref{algo:seqCIPRNG}\;
+    store the new number in NewNb[NumThreads*threadIdx+i]\;
+  }
+  store internal variables in InternalVarXorLikeArray[threadIdx]\;
+}
+\end{small}
+\caption{Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations}
+\label{algo:gpu_kernel}
+\end{algorithm}
+
+
+
+Algorithm~\ref{algo:gpu_kernel} presents a naive implementation of
+the proposed PRNG on GPU.  Due to the memory available on the GPU and
+the number of threads used simultaneously, the number of random
+numbers that a thread can generate inside a kernel is limited
+(\emph{i.e.}, the variable \texttt{n} in
+Algorithm~\ref{algo:gpu_kernel}). For instance, if $100,000$ threads
+are used and if $n=100$\footnote{in fact, we need to add the initial
+seed (a 32-bit number)}, then the memory required to store all the
+internal variables of the xor-like PRNGs\footnote{we multiply this
+number by $2$ in order to count 32-bit numbers} and the pseudorandom
+numbers generated by our PRNG is equal to $100,000\times ((4+5+6)\times
+2+(1+100))=13,100,000$ 32-bit numbers, that is, approximately $52$MB.
+
+This generator is able to pass the whole BigCrush battery of tests
+for all the versions that have been tested, whatever their number of
+threads (called \texttt{NumThreads} in our algorithm, tested with up
+to $5$ million threads).
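+
+A possible CUDA sketch of this kernel is given below. It is an
+illustration under simplifying assumptions, not the exact code used
+for the experiments: a single 64-bit xorshift stands in for the three
+xor-like PRNGs, and each thread writes its numbers in a contiguous
+slice of the output array.
+
+\lstset{language=C,caption={Possible CUDA sketch of the naive kernel (illustration only)},label={algo:gpu_kernel_sketch}}
+\begin{small}
+\begin{lstlisting}
+__device__ unsigned long xorshift64(unsigned long *s) {
+  unsigned long v = *s;          // classical 64-bit xorshift step
+  v ^= v << 13;  v ^= v >> 7;  v ^= v << 17;
+  return *s = v;
+}
+
+__global__ void naive_ci_prng(unsigned long *state, unsigned int *x,
+                              unsigned int *out, int n, int numThreads) {
+  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  if (id < numThreads) {         // "threadIdx is concerned"
+    unsigned long s = state[id]; // retrieve internal variables
+    unsigned int  v = x[id];     // last value computed by this thread
+    for (int i = 0; i < n; i++) {
+      unsigned long t = xorshift64(&s);  // one xor-like draw
+      v ^= (unsigned int)t;              // 32 least significant bits
+      v ^= (unsigned int)(t >> 32);      // 32 most significant bits
+      out[(size_t)id * n + i] = v;       // store the new number
+    }
+    state[id] = s;               // store internal variables back
+    x[id] = v;
+  }
+}
+\end{lstlisting}
+\end{small}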
+
+
+\subsection{Improved Version for GPU}
+
+As GPU cards using CUDA have shared memory between threads of the
+same block, it is possible to use this feature in order to simplify
+the previous algorithm, i.e., to use fewer than 3 xor-like PRNGs. The
+solution consists in computing only one xor-like PRNG per thread,
+saving it into shared memory, and then using the results of some
+other threads in the same block of threads. In order to define which
+thread uses the result of which other one, we can use a combination
+array that contains the indexes of all threads and for which a
+combination has been performed.
+
+In Algorithm~\ref{algo:gpu_kernel2}, two combination arrays are used.
+The variable \texttt{offset} is computed using the value of
+\texttt{combination\_size}.  Then we can compute \texttt{o1} and
+\texttt{o2}, representing the indexes of the other threads whose
+results are used by the current one.  In this algorithm, we consider
+that a 32-bit xor-like PRNG has been chosen. In practice, we use the
+xor128 proposed in~\cite{Marsaglia2003}, in which unsigned longs (64
+bits) have been replaced by unsigned integers (32 bits).
+
+This version  can also pass the whole {\it BigCrush} battery of tests.
+
+\begin{algorithm}
+\begin{small}
+\KwIn{InternalVarXorLikeArray: array with internal variables of 1 xor-like PRNGs
+in global memory\;
+NumThreads: Number of threads\;
+array\_comb1, array\_comb2: Arrays containing combinations of size combination\_size\;}
+
+\KwOut{NewNb: array containing random numbers in global memory}
+\If{threadId is concerned} {
+  retrieve data from InternalVarXorLikeArray[threadId] in local variables including shared memory and x\;
+  offset = threadIdx\%combination\_size\;
+  o1 = threadIdx-offset+array\_comb1[offset]\;
+  o2 = threadIdx-offset+array\_comb2[offset]\;
+  \For{i=1 to n} {
+    t=xor-like()\;
+    t=t\textasciicircum shmem[o1]\textasciicircum shmem[o2]\;
+    shmem[threadIdx]=t\;
+    x = x\textasciicircum t\;
+
+    store the new number in NewNb[NumThreads*threadId+i]\;
+  }
+  store internal variables in InternalVarXorLikeArray[threadId]\;
+}
+\end{small}
+\caption{Main kernel for the chaotic iterations based PRNG GPU efficient
+version\label{IR}}
+\label{algo:gpu_kernel2} 
+\end{algorithm}
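+
+A possible CUDA sketch of this improved kernel is shown below, again
+as an illustration rather than the exact experimental code: the block
+size is assumed to be fixed and a multiple of
+\texttt{combination\_size}, the total number of threads is assumed to
+be a multiple of the block size, and explicit
+\texttt{\_\_syncthreads()} barriers are added where the pseudocode
+leaves the synchronization of the shared memory accesses implicit.
+
+\lstset{language=C,caption={Possible CUDA sketch of the improved kernel (illustration only)},label={algo:gpu_kernel2_sketch}}
+\begin{small}
+\begin{lstlisting}
+#define BLOCK_SIZE 256
+
+__device__ unsigned int xorshift32(unsigned int *s) {
+  unsigned int v = *s;           // 32-bit xorshift step
+  v ^= v << 13;  v ^= v >> 17;  v ^= v << 5;
+  return *s = v;
+}
+
+// comb1 and comb2 hold block-local indexes inside each group of
+// comb_size threads.
+__global__ void improved_ci_prng(unsigned int *state, unsigned int *x,
+                                 int *comb1, int *comb2, int comb_size,
+                                 unsigned int *out, int n) {
+  __shared__ unsigned int shmem[BLOCK_SIZE];
+  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  unsigned int s = state[id];    // internal variable of the xor-like
+  unsigned int v = x[id];        // last value computed by this thread
+  int offset = threadIdx.x % comb_size;
+  int o1 = threadIdx.x - offset + comb1[offset];
+  int o2 = threadIdx.x - offset + comb2[offset];
+  shmem[threadIdx.x] = s;        // seed the shared values
+  __syncthreads();
+  for (int i = 0; i < n; i++) {
+    unsigned int t = xorshift32(&s);
+    t ^= shmem[o1] ^ shmem[o2];  // combine with two other threads
+    __syncthreads();             // all reads done before overwriting
+    shmem[threadIdx.x] = t;
+    __syncthreads();             // all writes visible to next round
+    v ^= t;
+    out[(size_t)id * n + i] = v; // store the new number
+  }
+  state[id] = s;
+  x[id] = v;
+}
+\end{lstlisting}
+\end{small}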
+
+\subsection{Chaos Evaluation of the Improved Version}
+
+A run of Algorithm~\ref{algo:gpu_kernel2} consists of an operation
+($x=x\oplus t$) having the form of Equation~\ref{equation Oplus},
+which is equivalent to the iterative system of
+Eq.~\ref{eq:generalIC}. That is, an iteration of the general chaotic
+iterations is realized between the last stored value $x$ of the
+thread and a strategy $t$ (obtained by a bitwise exclusive or between
+a value provided by a xor-like() call and two values previously
+obtained by two other threads).  To be certain that we are in the
+framework of Theorem~\ref{t:chaos des general}, we must guarantee
+that this dynamical system iterates on the space $\mathcal{X} =
+\mathcal{P}\left(\llbracket 1, \mathsf{N}
+\rrbracket\right)^\mathds{N}\times\mathds{B}^\mathsf{N}$.  The left
+term $x$ obviously belongs to $\mathds{B}^\mathsf{N}$.  To prevent
+any flaw in the chaotic properties, we must check that the right term
+(the last $t$), corresponding to the strategies, can possibly be
+equal to any integer of $\llbracket 1, \mathsf{N} \rrbracket$.
+
+Such a result is obvious: for the xor-like(), all the integers
+belonging to its interval of definition can occur at each iteration,
+and thus the last $t$ respects the requirement. Furthermore, it is
+possible to prove by an immediate mathematical induction that, as the
+initial $x$ is uniformly distributed (it is provided by a
+cryptographically secure PRNG), the two other stored values
+\texttt{shmem[o1]} and \texttt{shmem[o2]} are uniformly distributed
+too (this is the induction hypothesis), and thus the next $x$ is
+uniformly distributed as well.
+
+Thus Algorithm~\ref{algo:gpu_kernel2} is a concrete realization of
+the general chaotic iterations presented previously, and for this
+reason, it satisfies Devaney's formulation of chaotic behavior.
+
+\section{Experiments}
+\label{sec:experiments}
+
+Different experiments have been performed in order to measure the
+generation speed. We have used a first computer equipped with a Tesla
+C1060 NVidia GPU card and an Intel Xeon E5530 clocked at 2.40 GHz,
+and a second computer equipped with a less powerful CPU and a GeForce
+GTX 280.  Both cards have 240 cores.
+
+In Figure~\ref{fig:time_xorlike_gpu} we compare the quantity of
+pseudorandom numbers generated per second with various xor-like based
+PRNGs. In this figure, the optimized versions use the {\it xor64}
+described in~\cite{Marsaglia2003}, whereas the naive versions embed
+the three xor-like PRNGs described in Listing~\ref{algo:seqCIPRNG}.
+In order to obtain optimal performance, the storage of pseudorandom
+numbers in the GPU memory has been removed. This step is time
+consuming and slows down number generation.  Moreover, this storage
+is completely useless for applications that consume the pseudorandom
+numbers directly after generation. We can see that when the number of
+threads is greater than approximately 30,000 and lower than 5
+million, the number of pseudorandom numbers generated per second is
+almost constant.  With the naive version, this value ranges from 2.5
+to 3 GSamples/s.  With the optimized version, it is approximately
+equal to 20 GSamples/s. Finally, we can remark that both GPU cards
+behave quite similarly, although in practice the Tesla C1060 has more
+memory than the GTX 280, and this memory should be of better quality.
+As a comparison, Listing~\ref{algo:seqCIPRNG} leads to the generation
+of about 138 MSamples/s when using one core of the Xeon E5530.
+
+\begin{figure}[htbp]
+\begin{center}
+  \includegraphics[scale=0.65]{Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf}
+\end{center}
+\caption{Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG}
+\label{fig:time_xorlike_gpu}
+\end{figure}
+
+
+
+
+
+In Figure~\ref{fig:time_bbs_gpu} we highlight the performance of the
+optimized BBS-based PRNG on GPU.  On the Tesla C1060 we obtain
+approximately 700 MSamples/s and on the GTX 280 about 670 MSamples/s,
+which is obviously slower than the xorlike-based PRNG on GPU.
+However, we will show in the next sections that this new PRNG has a
+strong level of security, which is necessarily paid for by a speed
+reduction.
+
+\begin{figure}[htbp]
+\begin{center}
+  \includegraphics[scale=0.65]{Chapters/chapter18/figures/curve_time_bbs_gpu.pdf}
+\end{center}
+\caption{Quantity of pseudorandom numbers generated per second using the BBS-based PRNG}
+\label{fig:time_bbs_gpu}
+\end{figure}
+
+All these experiments allow us to conclude that it is possible to
+generate a very large quantity of statistically perfect pseudorandom
+numbers with the xor-like version.  To a certain extent, this is also
+the case with the secure BBS-based version, the speed deflation being
+explained by the fact that the former version has ``only'' chaotic
+properties and statistical perfection, whereas the latter is also
+cryptographically secure, as shown in the next sections.
+
+
+
+
+
+
+
+
+
+
+
+\putbib[Chapters/chapter18/biblio]
+
diff --git a/BookGPU/Chapters/chapter18/ch18.tex~ b/BookGPU/Chapters/chapter18/ch18.tex~
new file mode 100755
index 0000000..6c3f1f1
--- /dev/null
+++ b/BookGPU/Chapters/chapter18/ch18.tex~
@@ -0,0 +1,296 @@
+\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte}
+
+
+\chapter{Presentation of the GPU architecture and of the Cuda environment}
+\label{chapter1}
+
+\section{Introduction}\label{ch1:intro}
+
+This chapter introduces the Graphics Processing Unit (GPU)
+architecture and all the concepts needed to understand how GPUs work
+and can be used to speed up the execution of some algorithms. First
+of all, this chapter gives a brief history of the development of
+graphics cards, up to the point when they started to be used for
+general purpose computation. Then the architecture of a GPU is
+illustrated.  There are many fundamental differences between a GPU
+and a traditional processor. In order to benefit from the power of a
+GPU, a Cuda programmer needs to use threads. They have some
+particularities which enable the Cuda model to be efficient and
+scalable when some constraints are addressed.
+
+
+
+\section{Brief history of video cards}
+
+Video cards, or graphics cards, were introduced in personal computers
+to produce high quality graphics faster than classical Central
+Processing Units (CPUs) and to relieve the CPU of this task. In
+general, display tasks are very repetitive and very specific.  Hence,
+some manufacturers have produced more and more sophisticated video
+cards, providing 2D accelerations, then 3D accelerations, then some
+light transforms. Video cards have their own memory in which to
+perform their computations.  For at least two decades, every personal
+computer has had a video card, either a simple one for desktop
+computers or one providing many accelerations for game and/or
+graphics oriented computers.  In the latter case, the graphics card
+may be more expensive than the CPU.
+
+Since 2000, video cards have allowed users to apply arithmetic
+operations simultaneously on a sequence of pixels, later also called
+stream processing. In this case, the information of the pixels
+(color, location and other information) is combined in order to
+produce a pixel color that can be displayed on a screen.
+Simultaneous computations are provided by shaders, which calculate
+rendering effects on graphics hardware with a high degree of
+flexibility. These shaders handle the stream data with pipelines.
+
+
+Some researchers tried to apply those operations to other data,
+representing something different from pixels, and consequently this
+resulted in the first uses of video cards for performing general
+purpose computation. The programming model was not easy to use at all
+and was very dependent on the hardware constraints.  More precisely,
+it consisted in using either DirectX or OpenGL functions providing an
+interface to some classical operations for video processing (memory
+transfers, texture manipulation, ...).  Floating point operations
+were most of the time out of reach.  Obviously, when something went
+wrong, programmers had no way (and no tools) to detect it.
+
+\section{GPGPU}
+
+In order to benefit from the computing power of more recent video
+cards, Cuda was first proposed in 2007 by NVidia. It unifies the
+programming model for some of their most powerful video cards.
+Cuda~\cite{ch1:cuda} was quickly considered by the scientific
+community as a great advance for general purpose graphics processing
+unit (GPGPU) computing.  Of course, other programming models have
+been proposed. The other well-known alternative is OpenCL, which aims
+at proposing an alternative to Cuda and which is multi-platform and
+portable. This is a great advantage since it is even possible to
+execute OpenCL programs on traditional CPUs.  The main drawback is
+that it is less closely tied to the hardware and consequently
+sometimes provides less efficient programs. Moreover, Cuda benefits
+from more mature compilation and optimization procedures.  Other less
+known environments have been proposed, but most of them have been
+discontinued; for example we can cite FireStream by ATI, which is not
+maintained anymore and has been replaced by OpenCL, and BrookGPU by
+Stanford University~\cite{ch1:Buck:2004:BGS}.  Another environment
+based on pragmas (insertion of pragma directives inside the code to
+help the compiler generate efficient code) is called OpenACC.  For a
+comparison with OpenCL, interested readers may refer
+to~\cite{ch1:CMR:12}.
+
+
+
+\section{Architecture of current GPUs}
+
+The architecture \index{architecture of a GPU} of current GPUs is
+constantly evolving.  Nevertheless, some trends remain constant
+throughout this evolution.  The processing units composing a GPU are
+far simpler than a traditional CPU, but it is much easier to
+integrate many computing units inside a GPU card than to do so with
+many cores inside a CPU. This is due to the fact that the cores of a
+GPU are simpler than the cores of a CPU.  In 2012, the most powerful
+GPUs have more than 500 cores while the most powerful CPUs have 8
+cores. Figure~\ref{ch1:fig:comparison_cpu_gpu} shows the number of
+cores inside a CPU and inside a GPU.  In fact, in a current NVidia
+GPU, there are multiprocessors which have 32 cores (for example, on
+Fermi cards). The core clock of a CPU is generally around 3GHz,
+whereas that of a GPU is about 1.5GHz.  Although the core clock of
+GPU cores is slower, the number of cores inside a GPU provides more
+computational power.  This measure is commonly represented by the
+number of floating point operations per second. Nowadays the most
+powerful GPUs provide more than 1TFlops, i.e.  $10^{12}$ floating
+point operations per second.  Nevertheless, GPUs are very efficient
+at performing some operations, but not all kinds of operations. They
+are very efficient at executing repetitive work in which only the
+data change. It is important to keep in mind that multiprocessors
+inside a GPU have 32 cores. Later we will see that these 32 cores
+need to do the same work to get maximum performance.
+
+\begin{figure}[b!]
+\centerline{\includegraphics[]{Chapters/chapter1/figures/nb_cores_CPU_GPU.pdf}}
+\caption{Comparison of number of cores in a CPU and in a GPU.}
+%[Comparison of number of cores in a CPU and in a GPU]
+\label{ch1:fig:comparison_cpu_gpu}
+\end{figure}
+
+On the most powerful GPU cards, called Fermi, multiprocessors are
+called streaming multiprocessors (SMs). Each SM contains 32 cores and
+is able to perform 32 floating point or integer operations on 32-bit
+numbers per clock, or 16 floating point operations on 64-bit numbers
+per clock. SMs have their own registers, execution pipelines and
+caches.  On the Fermi architecture, there are 64KB of shared memory
+plus L1 cache and 32,768 32-bit registers per SM. More precisely, the
+programmer can decide what amount of shared memory and L1 cache an SM
+can use.  The constraint is that the sum of both amounts must be less
+than or equal to 64KB.
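+
+As an illustration (the kernel name \texttt{myKernel} is
+hypothetical; the call itself belongs to the CUDA runtime API), this
+split can be requested per kernel:
+
+\begin{small}
+\begin{lstlisting}
+// Prefer 48KB of shared memory and 16KB of L1 cache for this kernel.
+cudaFuncSetCacheConfig(myKernel, cudaFuncCachePreferShared);
+// Or prefer a larger L1 cache instead:
+// cudaFuncSetCacheConfig(myKernel, cudaFuncCachePreferL1);
+\end{lstlisting}
+\end{small}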
+
+Threads are used to benefit from the large number of cores of a GPU.
+Those threads are different from traditional CPU threads.  In
+Chapter~\ref{chapter2}, some examples of GPU programming will detail
+the behavior of GPU threads. However, threads are gathered into
+groups of 32 threads, called ``warps''. Those warps are important
+when designing an algorithm for GPU.
+
+
+Another big difference between CPUs and GPUs is memory latency.  In a
+CPU, everything is optimized to obtain a low latency architecture.
+This is possible through the use of cache memories. Moreover,
+nowadays CPUs perform many performance optimizations, such as
+speculative execution, which, roughly speaking, consists in executing
+a small part of the code in advance even if this work later reveals
+itself to be useless. Unlike CPUs, GPUs do not have low latency
+memory; in comparison, GPUs have only small cache memories.
+Nevertheless, the architecture of GPUs is optimized for throughput
+computation and it takes memory latency into account.
+
+
+
+\begin{figure}[b!]
+\centerline{\includegraphics[scale=0.7]{Chapters/chapter1/figures/low_latency_vs_high_throughput.pdf}}
+\caption{Comparison of low latency of CPU and high throughput of GPU.}
+\label{ch1:fig:latency_throughput}
+\end{figure}
+
+Figure~\ref{ch1:fig:latency_throughput} illustrates the main
+difference in memory latency between a CPU and a GPU. In a CPU, tasks
+``ti'' are executed one by one with a short memory latency to get the
+data to process. After some tasks, there is a context switch that
+allows the CPU to run concurrent applications and/or multi-threaded
+applications. Memory latencies are longer in a GPU; the principle
+used to obtain a high throughput is to have many tasks to
+compute. Later we will see that those tasks are called threads in
+Cuda. With this principle, as soon as a task is finished the next one
+is ready to be executed while the wait for data for the previous task
+is overlapped by the computation of other tasks.
+
+
+
+\section{Kinds of parallelism}
+
+Many  kinds  of parallelism  are  amiable according  to  the  type of  hardware.
+Roughly  speaking, there  are three  classes of  parallelism: instruction-level
+parallelism, data parallelism and task parallelism.
+
+Instruction-level parallelism consists in re-ordering some
+instructions in order to execute some of them in parallel without
+changing the result of the code.  In modern CPUs, instruction
+pipelines allow the processor to execute instructions faster.  With a
+pipeline, a processor can execute multiple instructions
+simultaneously, because the output of a task is the input of the next
+one.
+
+Data parallelism consists in executing the same program with
+different data on different computing units.  Of course, no
+dependency should exist between the data. For example, it is easy to
+parallelize loops without dependencies using the data parallelism
+paradigm. This paradigm is linked with the Single Instruction
+Multiple Data (SIMD) architecture. This is the kind of parallelism
+provided by GPUs.
+
+Task parallelism is the common parallelism achieved on clusters,
+grids and high performance architectures, where different tasks are
+executed by different computing units.
+
+\section{Cuda Multithreading}
+
+The data parallelism of Cuda is more precisely based on the Single
+Instruction Multiple Thread (SIMT) model. This is due to the fact
+that the programmer accesses the cores through threads. In the Cuda
+model, all cores execute the same set of instructions but with
+different data. This model has similarities with the vector
+programming model proposed for vector machines through the 1970s into
+the 90s, notably the various Cray platforms.  On the Cuda
+architecture, the performance is driven by the use of a huge number
+of threads (from thousands up to millions). The particularity of the
+model is that there is no context switching as in CPUs, and each
+thread has its own registers. In practice, threads are executed by
+SMs and are gathered into groups of 32 threads.  Those groups are
+called ``warps''. Each SM alternately executes ``active warps'' and
+warps that have become temporarily inactive while waiting for data
+(as shown in Figure~\ref{ch1:fig:latency_throughput}).
+
+The key to scalability in the Cuda model is the use of a huge number
+of threads.  In practice, threads are gathered not only in warps but
+also in thread blocks. A thread block is executed by only one SM and
+it cannot migrate. The typical size of a thread block is a power of
+two (for example, 64, 128, 256 or 512).
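+
+As an illustration (the kernel name \texttt{myKernel} and the sizes
+are hypothetical), the number of thread blocks and the number of
+threads per block are chosen on the host side when launching a
+kernel:
+
+\begin{small}
+\begin{lstlisting}
+int n = 1000000;                // number of elements to process
+int threadsPerBlock = 256;      // a power of two, as noted above
+// enough blocks to cover all n elements
+int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
+// myKernel<<<blocks, threadsPerBlock>>>(d_data, n);
+\end{lstlisting}
+\end{small}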
+
+
+
+Without changing anything inside the Cuda code, it is then possible
+to run it on a small Cuda device or on the most powerful Tesla Cuda
+cards.  Blocks are executed in any order, depending on the number of
+SMs available, so the programmer must design code with this issue in
+mind.  This independence between thread blocks provides the
+scalability of Cuda codes.
+
+\begin{figure}[b!]
+\centerline{\includegraphics[scale=0.65]{Chapters/chapter1/figures/scalability.pdf}}
+\caption{Scalability of GPU.}
+\label{ch1:fig:scalability}
+\end{figure}
+
+
+A kernel is a function which contains a block of instructions that
+are executed by the threads of a GPU.  When the problem considered is
+a two or three dimensional problem, it is possible to group thread
+blocks into a grid.  In practice, the number of thread blocks and the
+size of thread blocks are given as parameters to each kernel.
+Figure~\ref{ch1:fig:scalability} illustrates an example of a kernel
+composed of 8 thread blocks. This kernel is first executed on a small
+device containing only 2 SMs.  In this case, blocks are executed 2 by
+2 in any order.  If the kernel is executed on a larger Cuda device
+containing 4 SMs, blocks are executed 4 by 4 simultaneously.  The
+execution time should be approximately twice as fast in the latter
+case. Of course, that depends on other parameters that will be
+described later.
+
+Thread blocks provide a way to cooperate, in the sense that threads
+of the same block cooperatively load and store blocks of memory they
+all use. Synchronizations of threads in the same block are possible
+(but not between threads of different blocks). Threads of the same
+block can also share results in order to compute a single result. In
+Chapter~\ref{chapter2}, some examples will illustrate this.
+
+
+\section{Memory hierarchy}
+
+The memory hierarchy of GPUs\index{memory~hierarchy} is different
+from that of CPUs.  In practice, there are
+registers\index{memory~hierarchy!registers}, local
+memory\index{memory~hierarchy!local~memory}, shared
+memory\index{memory~hierarchy!shared~memory}, cache
+memory\index{memory~hierarchy!cache~memory} and global
+memory\index{memory~hierarchy!global~memory}.
+
+
+As previously mentioned, each thread can access its own registers.
+It is important to keep in mind that the number of registers per
+block is limited. On recent cards, this number is limited to 64K
+registers per SM.  Access to registers is very fast, so it is a good
+idea to use them whenever possible.
+
+Likewise, each thread can access local memory which, in practice, is
+much slower than registers.  Local memory is automatically used by
+the compiler when all the registers are occupied. So the best idea is
+to optimize the use of registers, even if this implies reducing the
+number of threads per block.
+
+\begin{figure}[hbtp!]
+\centerline{\includegraphics[scale=0.60]{Chapters/chapter1/figures/memory_hierarchy.pdf}}
+\caption{Memory hierarchy of a GPU.}
+\label{ch1:fig:memory_hierarchy}
+\end{figure}
+
+
+
+Shared memory allows cooperation between threads of the same block.
+This kind of memory is fast, but it needs to be managed manually and
+its size is limited.  It is accessible during the execution of a
+kernel. So the principle is to fill the shared memory at the start of
+the kernel with the global data that are used very frequently, then
+the threads can access it for their computations.  Threads can
+obviously change the content of this shared memory, either with
+computations or by loading other data, and they can store its content
+in the global memory. So shared memory can be seen as a manually
+managed cache memory. This obviously requires an effort from the
+programmer.
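+
+A minimal sketch of this staging pattern is given below (the
+block-wide sum and the block size of 256 threads are assumptions for
+the illustration): data are loaded once into shared memory and then
+reused several times from there.
+
+\begin{small}
+\begin{lstlisting}
+// Each block sums its 256 elements, using shared memory as a
+// manually managed cache.
+__global__ void block_sum(const float *in, float *out, int n) {
+  __shared__ float buf[256];
+  int id = blockIdx.x * 256 + threadIdx.x;
+  buf[threadIdx.x] = (id < n) ? in[id] : 0.0f; // fill shared memory
+  __syncthreads();                             // wait for all loads
+  for (int s = 128; s > 0; s >>= 1) {          // tree reduction
+    if (threadIdx.x < s)
+      buf[threadIdx.x] += buf[threadIdx.x + s];
+    __syncthreads();
+  }
+  if (threadIdx.x == 0)
+    out[blockIdx.x] = buf[0];   // one partial sum per block
+}
+\end{lstlisting}
+\end{small}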
+
+On recent cards, the programmer may decide what amount of cache
+memory and shared memory is attributed to a kernel. The cache memory
+is an L1 cache which is directly managed by the GPU. Sometimes this
+cache provides very efficient results and sometimes the use of shared
+memory is a better solution.
+
+
+
+
+Figure~\ref{ch1:fig:memory_hierarchy} illustrates the memory
+hierarchy of a GPU. Threads are represented at the top of the
+figure. They can access their own registers and their local
+memory. Threads of the same block can access the shared memory of
+that block. The cache memory is not represented here, but it is local
+to a thread. Finally, each block can access the global memory of the
+GPU.
+
+\section{Conclusion}
+
+In this chapter, a brief presentation of video cards, which have
+since been used to perform computation, has been given. The
+architecture of a GPU has been illustrated, focusing on the
+particularities of GPUs in terms of parallelism, memory latency and
+threads. In order to design an efficient algorithm for GPU, it is
+essential to keep all these parameters in mind.
+
+
+\putbib[Chapters/chapter1/biblio]
+
diff --git a/BookGPU/Chapters/chapter18/figures/curve_time_bbs_gpu.pdf b/BookGPU/Chapters/chapter18/figures/curve_time_bbs_gpu.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..a6a1080ad960e9f60d4952c52c0eaa979f7db36f
GIT binary patch
(base85-encoded binary data for curve_time_bbs_gpu.pdf, 7587 bytes, omitted)

diff --git a/BookGPU/Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf b/BookGPU/Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..2437d2e19e4bb599ec97bf0c340742815ccff97f
GIT binary patch
(base85-encoded binary data for curve_time_xorlike_gpu.pdf, 8005 bytes, omitted)

diff --git a/BookGPU/Chapters/chapter18/preamble.aux b/BookGPU/Chapters/chapter18/preamble.aux
new file mode 100644
index 0000000..65abe7e
--- /dev/null
+++ b/BookGPU/Chapters/chapter18/preamble.aux
@@ -0,0 +1,32 @@
+\relax 
+\@setckpt{Chapters/chapter1/preamble}{
+\setcounter{page}{1}
+\setcounter{equation}{0}
+\setcounter{enumi}{0}
+\setcounter{enumii}{0}
+\setcounter{enumiii}{0}
+\setcounter{enumiv}{0}
+\setcounter{footnote}{0}
+\setcounter{mpfootnote}{0}
+\setcounter{part}{0}
+\setcounter{chapter}{0}
+\setcounter{section}{0}
+\setcounter{subsection}{0}
+\setcounter{subsubsection}{0}
+\setcounter{paragraph}{0}
+\setcounter{subparagraph}{0}
+\setcounter{figure}{0}
+\setcounter{table}{0}
+\setcounter{numauthors}{0}
+\setcounter{parentequation}{0}
+\setcounter{subfigure}{0}
+\setcounter{lofdepth}{1}
+\setcounter{subtable}{0}
+\setcounter{lotdepth}{1}
+\setcounter{lstnumber}{1}
+\setcounter{ContinuedFloat}{0}
+\setcounter{AlgoLine}{0}
+\setcounter{algocfline}{0}
+\setcounter{algocfproc}{0}
+\setcounter{algocf}{0}
+}
-- 
2.39.5