--- /dev/null
+@misc{ch1:cuda,
+ author = {{NVIDIA Corporation}},
+ keywords = {CUDA},
+ note = {Version 4.0},
+ title = {{NVIDIA CUDA C} Programming Guide},
+ year = 2011
+}
+
+@Article{ch1:Buck:2004:BGS,
+ author = "I. Buck and T. Foley and D. Horn and J.
+ Sugerman and K. Fatahalian and M. Houston and P.
+ Hanrahan",
+ title = "{Brook} for {GPUs}: stream computing on graphics
+ hardware",
+ journal = "ACM Transactions on Graphics",
+ volume = "23",
+ number = "3",
+ pages = "777--786",
+ month = aug,
+ year = "2004",
+}
+
+
+@misc{ch1:CMR:12,
+ author = "B. Cloutier and B. K. Muite and P. Rigge",
+ title = "A comparison of {CPU} and {GPU} performance for {Fourier} pseudospectral simulations of the {Navier-Stokes}, cubic nonlinear {Schr\"{o}dinger} and {sine-Gordon} equations",
+ year = "2012",
+ archivePrefix = "arXiv",
+ eprint = "1206.3215",
+ primaryClass = "physics.comp-ph",
+ note = "arXiv preprint arXiv:1206.3215",
+}
+
+
+
--- /dev/null
+\relax
+\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}}
+\@writefile{loa}{\addvspace {10\p@ }}
+\@writefile{toc}{\contentsline {chapter}{\numberline {1}Presentation of the GPU architecture and of the Cuda environment}{3}}
+\@writefile{lof}{\addvspace {10\p@ }}
+\@writefile{lot}{\addvspace {10\p@ }}
+\newlabel{chapter1}{{1}{3}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.1}Introduction}{3}}
+\newlabel{ch1:intro}{{1.1}{3}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.2}Brief history of video cards}{3}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.3}GPGPU}{4}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.4}Architecture of current GPUs}{5}}
+\@writefile{lof}{\contentsline {figure}{\numberline {1.1}{\ignorespaces Comparison of number of cores in a CPU and in a GPU.\relax }}{5}}
+\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
+\newlabel{ch1:fig:comparison_cpu_gpu}{{1.1}{5}}
+\@writefile{lof}{\contentsline {figure}{\numberline {1.2}{\ignorespaces Comparison of low latency of CPU and high throughput of GPU.\relax }}{6}}
+\newlabel{ch1:fig:latency_throughput}{{1.2}{6}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.5}Kinds of parallelism}{7}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.6}Cuda Multithreading}{7}}
+\@writefile{lof}{\contentsline {figure}{\numberline {1.3}{\ignorespaces Scalability of GPU.\relax }}{8}}
+\newlabel{ch1:fig:scalability}{{1.3}{8}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.7}Memory hierarchy}{9}}
+\@writefile{lof}{\contentsline {figure}{\numberline {1.4}{\ignorespaces Memory hierarchy of a GPU.\relax }}{10}}
+\newlabel{ch1:fig:memory_hierarchy}{{1.4}{10}}
+\@writefile{toc}{\contentsline {section}{\numberline {1.8}Conclusion}{10}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{11}}
+\@setckpt{Chapters/chapter1/ch1}{
+\setcounter{page}{12}
+\setcounter{equation}{0}
+\setcounter{enumi}{0}
+\setcounter{enumii}{0}
+\setcounter{enumiii}{0}
+\setcounter{enumiv}{3}
+\setcounter{footnote}{0}
+\setcounter{mpfootnote}{0}
+\setcounter{part}{1}
+\setcounter{chapter}{1}
+\setcounter{section}{8}
+\setcounter{subsection}{0}
+\setcounter{subsubsection}{0}
+\setcounter{paragraph}{0}
+\setcounter{subparagraph}{0}
+\setcounter{figure}{4}
+\setcounter{table}{0}
+\setcounter{numauthors}{0}
+\setcounter{parentequation}{0}
+\setcounter{subfigure}{0}
+\setcounter{lofdepth}{1}
+\setcounter{subtable}{0}
+\setcounter{lotdepth}{1}
+\setcounter{lstnumber}{1}
+\setcounter{ContinuedFloat}{0}
+\setcounter{AlgoLine}{0}
+\setcounter{algocfline}{0}
+\setcounter{algocfproc}{0}
+\setcounter{algocf}{0}
+\setcounter{proposition}{0}
+\setcounter{theorem}{0}
+\setcounter{exercise}{0}
+\setcounter{example}{0}
+\setcounter{definition}{0}
+\setcounter{proof}{0}
+\setcounter{lstlisting}{0}
+}
--- /dev/null
+\relax
+\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}}
+\@writefile{toc}{\author{Christophe Guyeux}{}}
+\@writefile{loa}{\addvspace {10\p@ }}
+\@writefile{toc}{\contentsline {chapter}{\numberline {14}Pseudo Random Number Generator on GPU}{305}}
+\@writefile{lof}{\addvspace {10\p@ }}
+\@writefile{lot}{\addvspace {10\p@ }}
+\newlabel{chapter18}{{14}{305}}
+\@writefile{toc}{\contentsline {section}{\numberline {14.1}Introduction}{305}}
+\@writefile{toc}{\contentsline {section}{\numberline {14.2}Basic Reminders}{306}}
+\newlabel{section:BASIC RECALLS}{{14.2}{306}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {14.2.1}Devaney's Chaotic Dynamical Systems}{306}}
+\@writefile{toc}{\contentsline {section}{\numberline {14.3}Toward Efficiency and Improvement for CI PRNG}{306}}
+\newlabel{sec:efficient PRNG}{{14.3}{306}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{306}}
+\newlabel{algo:seqCIPRNG}{{14.1}{306}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {14.1}C code of the sequential PRNG based on chaotic iterations}{306}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{307}}
+\newlabel{sec:efficient PRNG gpu}{{14.3.2}{307}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.3}Naive Version for GPU}{307}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {12}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{308}}
+\newlabel{algo:gpu_kernel}{{12}{308}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.4}Improved Version for GPU}{308}}
+\newlabel{IR}{{13}{309}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {13}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{309}}
+\newlabel{algo:gpu_kernel2}{{13}{309}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.5}Chaos Evaluation of the Improved Version}{309}}
+\@writefile{toc}{\contentsline {section}{\numberline {14.4}Experiments}{310}}
+\newlabel{sec:experiments}{{14.4}{310}}
+\@writefile{lof}{\contentsline {figure}{\numberline {14.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{311}}
+\newlabel{fig:time_xorlike_gpu}{{14.1}{311}}
+\@writefile{lof}{\contentsline {figure}{\numberline {14.2}{\ignorespaces Quantity of pseudorandom numbers generated per second using the BBS-based PRNG\relax }}{312}}
+\newlabel{fig:time_bbs_gpu}{{14.2}{312}}
+\@setckpt{Chapters/chapter18/ch18}{
+\setcounter{page}{313}
+\setcounter{equation}{0}
+\setcounter{enumi}{2}
+\setcounter{enumii}{0}
+\setcounter{enumiii}{0}
+\setcounter{enumiv}{0}
+\setcounter{footnote}{2}
+\setcounter{mpfootnote}{0}
+\setcounter{part}{1}
+\setcounter{chapter}{14}
+\setcounter{section}{4}
+\setcounter{subsection}{0}
+\setcounter{subsubsection}{0}
+\setcounter{paragraph}{0}
+\setcounter{subparagraph}{0}
+\setcounter{figure}{2}
+\setcounter{table}{0}
+\setcounter{numauthors}{0}
+\setcounter{parentequation}{4}
+\setcounter{subfigure}{0}
+\setcounter{lofdepth}{1}
+\setcounter{subtable}{0}
+\setcounter{lotdepth}{1}
+\setcounter{lstnumber}{15}
+\setcounter{ContinuedFloat}{0}
+\setcounter{AlgoLine}{14}
+\setcounter{algocfline}{13}
+\setcounter{algocfproc}{13}
+\setcounter{algocf}{13}
+\setcounter{proposition}{1}
+\setcounter{theorem}{0}
+\setcounter{exercise}{0}
+\setcounter{example}{0}
+\setcounter{definition}{0}
+\setcounter{proof}{1}
+\setcounter{lstlisting}{1}
+}
--- /dev/null
+\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comté}
+\chapterauthor{Christophe Guyeux}{Femto-ST Institute, University of Franche-Comté}
+
+
+\chapter{Pseudo Random Number Generator on GPU}
+\label{chapter18}
+
+\section{Introduction}
+
+Randomness is of importance in many fields, such as scientific
+simulations and cryptography. ``Random numbers'' can be generated
+either by a physical, nondeterministic process, or by a deterministic
+and reproducible algorithm called a pseudorandom number generator
+(PRNG). In this chapter, we focus on reproducible generators, useful
+for instance in Monte-Carlo based simulators. These domains need
+PRNGs that are statistically irreproachable. In some fields, such as
+numerical simulations, speed is also a strong requirement that is
+usually attained by using parallel architectures. In that case, a
+recurrent problem is that a deflation of the statistical qualities is
+often reported when a good PRNG is parallelized. This is why ad hoc
+PRNGs must be found for each possible architecture in order to
+achieve both speed and randomness. On the other hand, speed is not
+the main requirement in cryptography: the great need is to
+define \emph{secure} generators able to withstand malicious
+attacks. Roughly speaking, an attacker should not be able in practice
+to distinguish between numbers obtained with the secure generator and
+a truly random sequence. In an equivalent formulation, he or she
+should not be able (in practice) to predict the next bit of the
+generator, having the knowledge of all the binary digits that have
+been already released. ``Being able in practice'' refers here to the
+possibility to achieve this attack in polynomial time, and to the
+exponential growth of the difficulty of this challenge when the size
+of the parameters of the PRNG increases.
+
+
+Finally, a small part of the community working in this domain focuses
+on a third requirement, that is, to define chaotic generators.
+The main idea is to benefit from a chaotic dynamical system to obtain a
+generator that is unpredictable, disordered, sensitive to its seed, or, in
+other words, chaotic. The aim is to map a given chaotic dynamics into a
+sequence that seems random and unassailable due to chaos.
+However, the chaotic maps used as a pattern are defined on the real line,
+whereas computers deal with finite precision numbers.
+This distortion leads to a deflation of both the chaotic properties and the speed.
+Furthermore, authors of such chaotic generators often claim their PRNG
+to be secure due to its chaos properties, but there is no obvious relation
+between chaos and security as it is understood in cryptography.
+This is why the use of chaos for PRNGs remains marginal and disputable.
+
+
+The remainder of this chapter is organized as follows.
+TO BE COMPLETED
+
+
+\section{Basic Reminders}
+\label{section:BASIC RECALLS}
+
+This section is devoted to basic definitions and terminology in the fields of
+topological chaos and chaotic iterations. We assume the reader is familiar
+with basic notions of topology (see for instance~\cite{Devaney}).
+
+
+\subsection{Devaney's Chaotic Dynamical Systems}
+
+
+TO BE COMPLETED
+
+
+
+\section{Toward Efficiency and Improvement for CI PRNG}
+\label{sec:efficient PRNG}
+
+\subsection{First Efficient Implementation of a PRNG based on Chaotic Iterations}
+%
+%Based on the proof presented in the previous section, it is now possible to
+%improve the speed of the generator formerly presented in~\cite{bgw09:ip,guyeux10}.
+%The first idea is to consider
+%that the provided strategy is a pseudorandom Boolean vector obtained by a
+%given PRNG.
+%An iteration of the system is simply the bitwise exclusive or between
+%the last computed state and the current strategy.
+%Topological properties of disorder exhibited by chaotic
+%iterations can be inherited by the inputted generator, we hope by doing so to
+%obtain some statistical improvements while preserving speed.
+%
+%%RAPH : j'ai viré tout ca
+%% Let us give an example using 16-bits numbers, to clearly understand how the bitwise xor operations
+%% are
+%% done.
+%% Suppose that $x$ and the strategy $S^i$ are given as
+%% binary vectors.
+%% Table~\ref{TableExemple} shows the result of $x \oplus S^i$.
+
+%% \begin{table}
+%% \begin{scriptsize}
+%% $$
+%% \begin{array}{|cc|cccccccccccccccc|}
+%% \hline
+%% x &=&1&0&1&1&1&0&1&0&1&0&0&1&0&0&1&0\\
+%% \hline
+%% S^i &=&0&1&1&0&0&1&1&0&1&1&1&0&0&1&1&1\\
+%% \hline
+%% x \oplus S^i&=&1&1&0&1&1&1&0&0&0&1&1&1&0&1&0&1\\
+%% \hline
+
+%% \hline
+%% \end{array}
+%% $$
+%% \end{scriptsize}
+%% \caption{Example of an arbitrary round of the proposed generator}
+%% \label{TableExemple}
+%% \end{table}
+
+
+
+
+\begin{small}
+\begin{lstlisting}[language=C,caption={C code of the sequential PRNG based on chaotic iterations},label={algo:seqCIPRNG}]
+
+/* 64-bit xor-like generators; implementations defined elsewhere. */
+unsigned long xorshift();
+unsigned long xor128();
+unsigned long xorwow();
+
+unsigned int CIPRNG() {
+  static unsigned int x = 123123123;   /* internal state */
+  unsigned long t1 = xorshift();
+  unsigned long t2 = xor128();
+  unsigned long t3 = xorwow();
+  x = x^(unsigned int)t1;              /* 32 least significant bits */
+  x = x^(unsigned int)(t2>>32);        /* 32 most significant bits  */
+  x = x^(unsigned int)(t3>>32);
+  x = x^(unsigned int)t2;
+  x = x^(unsigned int)(t1>>32);
+  x = x^(unsigned int)t3;
+  return x;
+}
+\end{lstlisting}
+\end{small}
+
+
+
+Listing~\ref{algo:seqCIPRNG} presents a sequential version of the proposed
+PRNG based on chaotic iterations. The xor operator is represented by
+\textasciicircum. This function uses three classical 64-bit PRNGs, namely
+\texttt{xorshift}, \texttt{xor128}, and \texttt{xorwow}~\cite{Marsaglia2003}.
+In the following, we call them ``xor-like PRNGs''. As each xor-like PRNG
+produces 64 bits whereas our proposed generator works with 32 bits, we use
+the cast \texttt{(unsigned int)}, which selects the 32 least significant
+bits of a given integer, and the expression \texttt{(unsigned int)(t$>>$32)}
+to obtain the 32 most significant bits of \texttt{t}.
+
+Producing a pseudorandom number thus requires six xor operations on six
+32-bit numbers provided by three 64-bit PRNGs. This version successfully
+passes the stringent BigCrush battery of tests~\cite{LEcuyerS07}. At this
+point, we have thus defined an efficient and statistically unbiased
+generator. Its speed is directly related to the use of linear operations,
+but for the same reason, this fast generator cannot be proven secure.
+
+
+
+\subsection{Efficient PRNGs based on Chaotic Iterations on GPU}
+\label{sec:efficient PRNG gpu}
+
+In order to benefit from the computing power of a GPU, a program needs to
+have independent blocks of threads that can be computed simultaneously. In
+general, the larger the number of threads, the less local memory is used
+and the fewer branching instructions (if, while, etc.) are used, the
+better the performance on the GPU is. Obviously, with these requirements
+in mind, it is possible to build a program similar to the one presented in
+Listing~\ref{algo:seqCIPRNG}, which computes pseudorandom numbers on the
+GPU. To do so, we must first recall that in the CUDA~\cite{Nvid10}
+environment, threads have a local identifier called \texttt{ThreadIdx},
+which is relative to the block containing them. Furthermore, in CUDA, the
+parts of the code that are executed by the GPU are called {\it kernels}.
+
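+As a minimal illustration (our sketch, not code from this chapter), a
+kernel computing a global index for each thread from \texttt{ThreadIdx}
+and the block coordinates looks as follows:
+
+\begin{lstlisting}[language=C]
+// Minimal CUDA sketch: each thread derives a global index from its
+// local identifier threadIdx and the coordinates of its block.
+__global__ void identity_kernel(unsigned int *out, int numThreads) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < numThreads)
+    out[tid] = tid;
+}
+\end{lstlisting}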
+
+\subsection{Naive Version for GPU}
+
+
+It is possible to deduce from the CPU version a quite similar version
+adapted to the GPU. The principle consists in having each thread of the
+GPU compute the CPU version of our PRNG. Of course, the three xor-like
+PRNGs used in these computations must have different parameters. In a
+given thread, these parameters are randomly picked using another PRNG.
+The initialization stage is performed by the CPU. To do so, the ISAAC
+PRNG~\cite{Jenkins96} is used to set all the parameters embedded in each
+thread.
+
+The implementation of the three xor-like PRNGs is straightforward once
+their parameters have been allocated in the GPU memory. Each xor-like PRNG
+works with an internal number $x$ that stores the last generated
+pseudorandom number. Additionally, the implementations of the xor128, the
+xorshift, and the xorwow require 4, 5, and 6 unsigned longs, respectively,
+as internal variables.
+
+
+\begin{algorithm}
+\begin{small}
+\KwIn{InternalVarXorLikeArray: array with internal variables of the 3 xor-like
+PRNGs in global memory\;
+NumThreads: number of threads\;}
+\KwOut{NewNb: array containing random numbers in global memory}
+\If{threadIdx is concerned by the computation} {
+ retrieve data from InternalVarXorLikeArray[threadIdx] in local variables\;
+ \For{i=1 to n} {
+ compute a new pseudorandom number as in Listing~\ref{algo:seqCIPRNG}\;
+ store the new number in NewNb[NumThreads*threadIdx+i]\;
+ }
+ store internal variables in InternalVarXorLikeArray[threadIdx]\;
+}
+\end{small}
+\caption{Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations}
+\label{algo:gpu_kernel}
+\end{algorithm}
+
+
+
+Algorithm~\ref{algo:gpu_kernel} presents a naive implementation of the
+proposed PRNG on GPU. Due to the available memory in the GPU and the number
+of threads used simultaneously, the number of random numbers that a thread
+can generate inside a kernel is limited (\emph{i.e.}, the variable
+\texttt{n} in Algorithm~\ref{algo:gpu_kernel}). For instance, if $100,000$
+threads are used and if $n=100$\footnote{in fact, we need to add the
+initial seed (a 32-bit number)}, then the memory required to store all the
+internal variables of both the xor-like PRNGs\footnote{we multiply this
+number by $2$ in order to count 32-bit numbers} and the pseudorandom
+numbers generated by our PRNG is equal to $100,000\times ((4+5+6)\times
+2+(1+100))=13,100,000$ 32-bit numbers, that is, approximately $52$MB.
+
+This generator is able to pass the whole BigCrush battery of tests for all
+the versions that have been tested, whatever their number of threads
+(called \texttt{NumThreads} in our algorithm, tested up to $5$ million).
+
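+A possible CUDA realization of Algorithm~\ref{algo:gpu_kernel} is sketched
+below. It is only an illustration under our own naming: \texttt{state\_t}
+packs the $4+5+6$ unsigned longs of the three xor-like PRNGs together with
+the 32-bit value $x$, \texttt{ciprng\_step()} stands for one round of
+Listing~\ref{algo:seqCIPRNG}, and the memory layout of \texttt{NewNb} is
+one possible choice among others.
+
+\begin{lstlisting}[language=C]
+// Hypothetical per-thread state: the internal variables of the three
+// xor-like PRNGs plus the last 32-bit value x (names are ours).
+typedef struct { unsigned long s[15]; unsigned int x; } state_t;
+
+// One round of the chaotic iterations PRNG (definition omitted here).
+__device__ unsigned int ciprng_step(state_t *st);
+
+__global__ void ciprng_naive(state_t *states, unsigned int *NewNb,
+                             int numThreads, int n) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < numThreads) {
+    state_t st = states[tid];              // fetch internal variables
+    for (int i = 0; i < n; i++)            // n numbers per thread
+      NewNb[i * numThreads + tid] = ciprng_step(&st);
+    states[tid] = st;                      // write the state back
+  }
+}
+\end{lstlisting}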
+
+\subsection{Improved Version for GPU}
+
+As GPU cards using CUDA have shared memory between threads of the same
+block, it is possible to use this feature in order to simplify the previous
+algorithm, i.e., to use fewer than 3 xor-like PRNGs. The solution consists
+in computing only one xor-like PRNG per thread, saving its result into
+shared memory, and then using the results of some other threads in the same
+block. In order to define which thread uses the result of which other one,
+we can use a combination array that contains the indexes of the threads
+with which each result is combined.
+
+In Algorithm~\ref{algo:gpu_kernel2}, two combination arrays are used. The
+variable \texttt{offset} is computed using the value of
+\texttt{combination\_size}. Then we can compute \texttt{o1} and \texttt{o2}
+representing the indexes of the other threads whose results are used by the
+current one. In this algorithm, we consider that a 32-bit xor-like PRNG has
+been chosen. In practice, we use the xor128 proposed
+in~\cite{Marsaglia2003}, in which the unsigned longs (64 bits) have been
+replaced by unsigned integers (32 bits).
+
+This version can also pass the whole {\it BigCrush} battery of tests.
+
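+For reference, a minimal C sketch of Marsaglia's xor128 with this
+replacement applied is given below (the seeds are those of his paper; in
+the kernel, the state is kept in per-thread variables rather than
+\texttt{static} ones):
+
+\begin{lstlisting}[language=C]
+/* Marsaglia's xor128 with the unsigned longs replaced by 32-bit
+   unsigned ints. */
+unsigned int xor128(void) {
+  static unsigned int x = 123456789, y = 362436069,
+                      z = 521288629, w = 88675123;
+  unsigned int t = x ^ (x << 11);
+  x = y; y = z; z = w;
+  return w = w ^ (w >> 19) ^ (t ^ (t >> 8));
+}
+\end{lstlisting}
+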
+\begin{algorithm}
+\begin{small}
+\KwIn{InternalVarXorLikeArray: array with internal variables of 1 xor-like PRNG
+in global memory\;
+NumThreads: number of threads\;
+array\_comb1, array\_comb2: arrays containing combinations of size combination\_size\;}
+
+\KwOut{NewNb: array containing random numbers in global memory}
+\If{threadIdx is concerned} {
+ retrieve data from InternalVarXorLikeArray[threadIdx] in local variables including shared memory and x\;
+ offset = threadIdx\%combination\_size\;
+ o1 = threadIdx-offset+array\_comb1[offset]\;
+ o2 = threadIdx-offset+array\_comb2[offset]\;
+ \For{i=1 to n} {
+ t=xor-like()\;
+ t=t\textasciicircum shmem[o1]\textasciicircum shmem[o2]\;
+ shmem[threadIdx]=t\;
+ x = x\textasciicircum t\;
+ store the new number in NewNb[NumThreads*threadIdx+i]\;
+ }
+ store internal variables in InternalVarXorLikeArray[threadIdx]\;
+}
+\end{small}
+\caption{Main kernel for the chaotic iterations based PRNG GPU efficient
+version\label{IR}}
+\label{algo:gpu_kernel2}
+\end{algorithm}
+
+\subsection{Chaos Evaluation of the Improved Version}
+
+A run of Algorithm~\ref{algo:gpu_kernel2} consists of an operation
+($x=x\oplus t$) having the form of Equation~\ref{equation Oplus}, which is
+equivalent to the iterative system of Eq.~\ref{eq:generalIC}. That is, an
+iteration of the general chaotic iterations is realized between the last
+stored value $x$ of the thread and a strategy $t$ (obtained by a bitwise
+exclusive or between a value provided by a xor-like() call and two values
+previously obtained by two other threads).
+To be certain that we are in the framework of Theorem~\ref{t:chaos des general},
+we must guarantee that this dynamical system iterates on the space
+$\mathcal{X} = \mathcal{P}\left(\llbracket 1, \mathsf{N} \rrbracket\right)^\mathds{N}\times\mathds{B}^\mathsf{N}$.
+The left term $x$ obviously belongs to $\mathds{B}^\mathsf{N}$.
+To prevent any flaw in the chaotic properties, we must check that the right
+term (the last $t$), corresponding to the strategies, can possibly be equal
+to any integer of $\llbracket 1, \mathsf{N} \rrbracket$.
+
+Such a result is obvious: for the xor-like(), all the integers belonging to
+its interval of definition can occur at each iteration, and thus the last
+$t$ respects the requirement. Furthermore, it is possible to prove by an
+immediate mathematical induction that, as the initial $x$ is uniformly
+distributed (it is provided by a cryptographically secure PRNG), the two
+other stored values shmem[o1] and shmem[o2] are uniformly distributed too
+(this is the induction hypothesis), and thus the next $x$ is uniformly
+distributed as well.
+
+Thus Algorithm~\ref{algo:gpu_kernel2} is a concrete realization of the
+general chaotic iterations presented previously, and for this reason, it
+satisfies Devaney's formulation of chaotic behavior.
+
+\section{Experiments}
+\label{sec:experiments}
+
+Different experiments have been performed in order to measure the
+generation speed. We have used a first computer equipped with a Tesla C1060
+NVidia GPU card and an Intel Xeon E5530 clocked at 2.40 GHz, and a second
+computer equipped with a smaller CPU and a GeForce GTX 280. Both cards have
+240 cores.
+
+In Figure~\ref{fig:time_xorlike_gpu} we compare the quantity of
+pseudorandom numbers generated per second with various xor-like based
+PRNGs. In this figure, the optimized versions use the {\it xor64} described
+in~\cite{Marsaglia2003}, whereas the naive versions embed the three
+xor-like PRNGs described in Listing~\ref{algo:seqCIPRNG}. In order to
+obtain optimal performance, the storage of pseudorandom numbers in the GPU
+memory has been removed. This step is time consuming and slows down the
+generation of numbers. Moreover, this storage is completely useless for
+applications that consume the pseudorandom numbers directly after
+generation. We can see that when the number of threads is greater than
+approximately 30,000 and lower than 5 million, the number of pseudorandom
+numbers generated per second is almost constant. With the naive version,
+this value ranges from 2.5 to 3 GSamples/s. With the optimized version, it
+is approximately equal to 20 GSamples/s. Finally, we can remark that both
+GPU cards give quite similar results, but in practice, the Tesla C1060 has
+more memory than the GTX 280, and this memory should be of better quality.
+As a comparison, Listing~\ref{algo:seqCIPRNG} leads to the generation of
+about 138 MSamples/s when using one core of the Xeon E5530.
+
+\begin{figure}[htbp]
+\begin{center}
+ \includegraphics[scale=0.65]{Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf}
+\end{center}
+\caption{Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG}
+\label{fig:time_xorlike_gpu}
+\end{figure}
+
+
+
+
+
+In Figure~\ref{fig:time_bbs_gpu} we highlight the performance of the
+optimized BBS-based PRNG on GPU. On the Tesla C1060 we obtain approximately
+700 MSamples/s and on the GTX 280 about 670 MSamples/s, which is obviously
+slower than the xorlike-based PRNG on GPU. However, we will show in the
+next sections that this new PRNG has a strong level of security, which is
+necessarily paid for by a speed reduction.
+
+\begin{figure}[htbp]
+\begin{center}
+ \includegraphics[scale=0.65]{Chapters/chapter18/figures/curve_time_bbs_gpu.pdf}
+\end{center}
+\caption{Quantity of pseudorandom numbers generated per second using the BBS-based PRNG}
+\label{fig:time_bbs_gpu}
+\end{figure}
+
+All these experiments allow us to conclude that it is possible to generate
+a very large quantity of statistically perfect pseudorandom numbers with
+the xor-like version. To a certain extent, this is also the case with the
+secure BBS-based version, the speed deflation being explained by the fact
+that the former version has ``only'' chaotic properties and statistical
+perfection, whereas the latter is also cryptographically secure, as is
+shown in the next sections.
+
+
+
+
+
+
+
+
+
+
+
+\putbib[Chapters/chapter18/biblio]
+
--- /dev/null
+\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comté}
+
+
+\chapter{Presentation of the GPU architecture and of the Cuda environment}
+\label{chapter1}
+
+\section{Introduction}\label{ch1:intro}
+
+This chapter introduces the Graphics Processing Unit (GPU) architecture and
+all the concepts needed to understand how GPUs work and can be used to
+speed up the execution of some algorithms. First of all, this chapter gives
+a brief history of the development of graphics cards until their use for
+general purpose computation. Then the architecture of a GPU is illustrated.
+There are many fundamental differences between a GPU and a traditional
+processor. In order to benefit from the power of a GPU, a Cuda programmer
+needs to use threads. They have some particularities which enable the Cuda
+model to be efficient and scalable when some constraints are addressed.
+
+
+
+\section{Brief history of video cards}
+
+Video cards or graphics cards have been introduced in personal computers to
+produce high quality graphics faster than classical Central Processing
+Units (CPUs) and to relieve the CPU of this task. In general, display tasks
+are very repetitive and very specific. Hence, some manufacturers have
+produced more and more sophisticated video cards, providing 2D
+accelerations, then 3D accelerations, then some light transforms. Video
+cards have their own memory to perform their computations. For at least two
+decades, every personal computer has had a video card, a simple one for
+desktop computers or one providing many accelerations for game and/or
+graphics oriented computers. In the latter case, the graphics card may be
+more expensive than the CPU.
+
+Since 2000, video cards have allowed users to apply arithmetic operations
+simultaneously on a sequence of pixels, an approach later called stream
+processing. In this case, the information of the pixels (color, location
+and other information) is combined in order to produce a pixel color that
+can be displayed on a screen. Simultaneous computations are provided by
+shaders, which calculate rendering effects on graphics hardware with a high
+degree of flexibility. These shaders handle the stream data with pipelines.
+
+
+Some researchers tried to apply those operations to other data,
+representing something different from pixels, and consequently this
+resulted in the first uses of video cards for performing general purpose
+computations. The programming model was not easy to use at all and was very
+dependent on the hardware constraints. More precisely, it consisted in
+using either DirectX or OpenGL functions providing an interface to some
+classical operations for video operations (memory transfers, texture
+manipulation, etc.). Floating point operations were most of the time not
+supported. Obviously, when something went wrong, programmers had no way
+(and no tools) to detect it.
+
+\section{GPGPU}
+
+In order to benefit from the computing power of more recent video cards,
+Cuda was first proposed in 2007 by NVidia. It unifies the programming model
+for some of their most efficient video cards. Cuda~\cite{ch1:cuda} has
+quickly been considered by the scientific community as a great advance for
+general purpose graphics processing unit (GPGPU) computing. Of course,
+other programming models have been proposed. The other well-known
+alternative is OpenCL, which aims at proposing an alternative to Cuda and
+which is multi-platform and portable. This is a great advantage since it is
+even possible to execute OpenCL programs on traditional CPUs. The main
+drawback is that it is less tightly coupled to the hardware and
+consequently sometimes provides less efficient programs. Moreover, Cuda
+benefits from more mature compilation and optimization procedures. Other
+less known environments have been proposed, but most of them have been
+discontinued; for example we can cite FireStream by ATI, which is no longer
+maintained and has been replaced by OpenCL, and BrookGPU by Stanford
+University~\cite{ch1:Buck:2004:BGS}. Another environment based on pragmas
+(insertion of pragma directives inside the code to help the compiler
+generate efficient code) is called OpenACC. For a comparison with OpenCL,
+interested readers may refer to~\cite{ch1:CMR:12}.
+
+
+
+\section{Architecture of current GPUs}
+
+The architecture \index{architecture of a GPU} of current GPUs is
+constantly evolving. Nevertheless some trends remain constant throughout
+this evolution. Processing units composing a GPU are far simpler than those
+of a traditional CPU, but it is much easier to integrate many computing
+units inside a GPU card than to do so with many cores inside a CPU. In
+2012, the most powerful GPUs have more than 500 cores, whereas the most
+powerful CPUs have 8 cores. Figure~\ref{ch1:fig:comparison_cpu_gpu}
+illustrates the number of cores inside a CPU and inside a GPU. In fact, in
+a current NVidia GPU, there are multiprocessors which have 32 cores (for
+example, on Fermi cards). The core clock of a CPU is generally around 3GHz
+and that of a GPU is about 1.5GHz. Although the core clock of GPU cores is
+slower, the number of cores inside a GPU provides more computational power.
+This measure is commonly represented by the number of floating point
+operations per second. Nowadays the most powerful GPUs provide more than
+1TFlops, i.e. $10^{12}$ floating point operations per second. Nevertheless,
+GPUs are very efficient at performing some operations but not all kinds of
+operations. They are very efficient at executing repetitive work in which
+only the data change. It is important to keep in mind that multiprocessors
+inside a GPU have 32 cores. Later we will see that these 32 cores need to
+do the same work to get maximum performance.
+
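+As a rough, illustrative calculation (the figures are ours and only orders
+of magnitude), this peak rate is the product of the number of cores, the
+core clock, and the number of floating point operations each core can issue
+per clock cycle:
+$$512~\mbox{cores} \times 1.5~\mbox{GHz} \times 2~\mbox{ops/cycle} \approx
+1.5~\mbox{TFlops}.$$
+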
+\begin{figure}[b!]
+\centerline{\includegraphics[]{Chapters/chapter1/figures/nb_cores_CPU_GPU.pdf}}
+\caption{Comparison of number of cores in a CPU and in a GPU.}
+%[Comparison of number of cores in a CPU and in a GPU]
+\label{ch1:fig:comparison_cpu_gpu}
+\end{figure}
+
+On the most powerful GPU cards, called Fermi, multiprocessors are called
+streaming multiprocessors (SMs). Each SM contains 32 cores and is able to
+perform 32 floating point or integer operations on 32-bit numbers per clock
+cycle, or 16 floating point operations on 64-bit numbers per clock cycle.
+SMs have their own registers, execution pipelines and caches. On the Fermi
+architecture, there are 64KB of shared memory plus L1 cache and 32,768
+32-bit registers per SM. More precisely, the programmer can decide what
+amount of shared memory and L1 cache an SM can use. The constraint is that
+the sum of both amounts must be less than or equal to 64KB.
+
+Threads are used to benefit from the large number of cores of a GPU. These
+threads are different from traditional threads for a CPU. In
+chapter~\ref{chapter2}, some examples of GPU programming will detail the
+behavior of GPU threads. Threads are gathered into groups of 32 threads,
+called ``warps''. These warps are important when designing an algorithm
+for GPU.
+
+
+Another big difference between CPUs and GPUs is memory latency. In a CPU,
+everything is optimized to obtain a low latency architecture. This is
+possible through the use of cache memories. Moreover, nowadays CPUs carry
+out many performance optimizations such as speculative execution, which,
+roughly speaking, consists in executing a small part of the code in advance
+even if this work later reveals itself to be useless. In contrast, GPUs do
+not have low latency memory; in comparison, GPUs have small cache memories.
+Nevertheless, the architecture of a GPU is optimized for throughput
+computation and it takes into account the memory latency.
+
+
+
+\begin{figure}[b!]
+\centerline{\includegraphics[scale=0.7]{Chapters/chapter1/figures/low_latency_vs_high_throughput.pdf}}
+\caption{Comparison of low latency of CPU and high throughput of GPU.}
+\label{ch1:fig:latency_throughput}
+\end{figure}
+
+Figure~\ref{ch1:fig:latency_throughput} illustrates the main difference in
+memory latency between a CPU and a GPU. In a CPU, tasks ``ti'' are executed
+one by one with a short memory latency to get the data to process. After
+some tasks, there is a context switch that allows the CPU to run concurrent
+applications and/or multi-threaded applications. Memory latencies are
+longer in a GPU; the principle to obtain a high throughput is to have many
+tasks to compute. Later we will see that those tasks are called threads
+with Cuda. With this principle, as soon as a task is finished the next one
+is ready to be executed, while the wait for data for the previous task is
+overlapped by the computation of other tasks.
+
+
+
+\section{Kinds of parallelism}
+
+Many kinds of parallelism are available according to the type of hardware.
+Roughly speaking, there are three classes of parallelism: instruction-level
+parallelism, data parallelism and task parallelism.
+
+Instruction-level parallelism consists in re-ordering some instructions in
+order to execute some of them in parallel without changing the result of
+the code. In modern CPUs, instruction pipelines allow the processor to
+execute instructions faster. With a pipeline, a processor can execute
+multiple instructions simultaneously because the output of one stage is the
+input of the next one.
+
+Data parallelism consists in executing the same program with different data
+on different computing units. Of course, no dependency should exist between
+the data. For example, it is easy to parallelize loops without dependencies
+using the data parallelism paradigm. This paradigm is linked with the
+Single Instruction Multiple Data (SIMD) architecture. This is the kind of
+parallelism provided by GPUs.
+
+Task parallelism is the common parallelism achieved on clusters, grids and
+other high performance architectures, where different tasks are executed by
+different computing units.
+
+\section{Cuda Multithreading}
+
+The data parallelism of Cuda is more precisely based on the Single
+Instruction Multiple Thread (SIMT) model. This is due to the fact that a
+programmer accesses the cores through threads. In the Cuda model, all cores
+execute the same set of instructions but with different data. This model
+has similarities with the vector programming model proposed for vector
+machines through the 1970s into the 90s, notably the various Cray
+platforms. On the Cuda architecture, performance is driven by the use of a
+huge number of threads (from thousands up to millions). The particularity
+of the model is that there is no context switching as in CPUs and each
+thread has its own registers. In practice, threads are executed by SMs and
+are gathered into groups of 32 threads. Those groups are called ``warps''.
+Each SM alternately executes ``active warps'' and sets aside warps that
+become temporarily inactive while waiting for data (as shown in
+Figure~\ref{ch1:fig:latency_throughput}).
+
+The key to scalability in the Cuda model is the use of a huge number of
+threads. In practice, threads are not only gathered in warps but also in
+thread blocks. A thread block is executed by only one SM and it cannot
+migrate. The typical size of a thread block is a power of two (for
+example: 64, 128, 256 or 512).
+
+
+
+Consequently, without changing anything inside a Cuda code, it is possible
+to run it on a small Cuda device as well as on the most powerful Tesla Cuda
+cards. Blocks are executed in any order depending on the number of SMs
+available. So the programmer must design his or her code with this issue in
+mind. This independence between thread blocks provides the scalability of
+Cuda codes.
+
+\begin{figure}[b!]
+\centerline{\includegraphics[scale=0.65]{Chapters/chapter1/figures/scalability.pdf}}
+\caption{Scalability of GPU.}
+\label{ch1:fig:scalability}
+\end{figure}
+
+
+A kernel is a function which contains a block of instructions that are
+executed by the threads of a GPU. When the problem considered is a two or
+three dimensional problem, it is possible to group thread blocks into a
+grid. In practice, the number of thread blocks and the size of the thread
+blocks are given as parameters to each kernel.
+Figure~\ref{ch1:fig:scalability} illustrates an example of a kernel
+composed of 8 thread blocks. This kernel is first executed on a small
+device containing only 2 SMs, so blocks are executed 2 by 2 in any order.
+If the kernel is executed on a larger Cuda device containing 4 SMs, blocks
+are executed 4 by 4 simultaneously. The execution should be approximately
+twice as fast in the latter case. Of course, that depends on other
+parameters that will be described later.
+
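+For instance (a sketch with names of our own choosing, not code from this
+chapter), covering \texttt{N} data elements with blocks of 256 threads is
+typically written as follows:
+
+\begin{lstlisting}[language=C]
+// Launch enough blocks of 256 threads to cover N elements;
+// my_kernel, d_data and N are hypothetical names.
+int threadsPerBlock = 256;
+int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
+my_kernel<<<blocksPerGrid, threadsPerBlock>>>(d_data, N);
+\end{lstlisting}
+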
+Thread blocks provide a way to cooperate in the sense that threads of the
+same block cooperatively load and store blocks of memory they all use.
+Synchronizations of threads in the same block are possible (but not between
+threads of different blocks). Threads of the same block can also share
+results in order to compute a single result. In chapter~\ref{chapter2},
+some examples will illustrate this.
+
+
+\section{Memory hierarchy}
+
+The memory hierarchy of GPUs\index{memory~hierarchy} is different from
+that of CPUs. In practice, there are registers\index{memory~hierarchy!registers}, local
+memory\index{memory~hierarchy!local~memory}, shared
+memory\index{memory~hierarchy!shared~memory}, cache
+memory\index{memory~hierarchy!cache~memory} and global
+memory\index{memory~hierarchy!global~memory}.
+
+
+As previously mentioned, each thread can access its own registers. It is
+important to keep in mind that the number of registers per block is
+limited. On recent cards, this number is limited to 64K registers per SM.
+Access to registers is very fast, so it is a good idea to use them whenever
+possible.
+
+Likewise, each thread can access local memory which, in practice, is much
+slower than registers. Local memory is automatically used by the compiler
+when all the registers are occupied. So the best idea is to optimize the
+use of registers even if this implies reducing the number of threads per
+block.
+
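+With Cuda, this trade-off can be suggested to the compiler through
+\texttt{\_\_launch\_bounds\_\_}, shown here on a hypothetical kernel:
+
+\begin{lstlisting}[language=C]
+// Cap the block size at 128 threads so that the compiler may assign
+// more registers to each thread (the kernel body is illustrative).
+__global__ void __launch_bounds__(128)
+scale_kernel(float *d, float a, int n) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n) d[i] *= a;
+}
+\end{lstlisting}
+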
+\begin{figure}[hbtp!]
+\centerline{\includegraphics[scale=0.60]{Chapters/chapter1/figures/memory_hierarchy.pdf}}
+\caption{Memory hierarchy of a GPU.}
+\label{ch1:fig:memory_hierarchy}
+\end{figure}
+
+
+
+Shared memory allows cooperation between threads of the same block. This
+kind of memory is fast, but it must be managed manually and its size is
+limited. It is accessible during the execution of a kernel. So the
+principle is to fill the shared memory at the start of the kernel with
+global data that are used very frequently; then threads can access it for
+their computations. Threads can obviously change the content of this shared
+memory either with computations or by loading other data, and they can
+store its content in the global memory. So shared memory can be seen as a
+manually managed cache memory. This obviously requires effort from the
+programmer.
+
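+The following minimal kernel (our illustration, assuming a launch with 256
+threads per block) shows this pattern: a cooperative load into shared
+memory, a synchronization, and then computation on the staged data:
+
+\begin{lstlisting}[language=C]
+__global__ void sum_tile(float *in, float *out) {
+  __shared__ float tile[256];          // one tile per block
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  tile[threadIdx.x] = in[i];           // cooperative load
+  __syncthreads();                     // wait for the whole block
+  if (threadIdx.x == 0) {              // one thread combines the tile
+    float s = 0.0f;
+    for (int j = 0; j < blockDim.x; j++) s += tile[j];
+    out[blockIdx.x] = s;
+  }
+}
+\end{lstlisting}
+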
+On recent cards, the programmer may decide what amount of cache memory and
+shared memory is attributed to a kernel. The cache memory is an L1 cache
+which is directly managed by the GPU. Sometimes, this cache provides very
+efficient results and sometimes the use of shared memory is a better
+solution.
+
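+This choice is exposed by the Cuda runtime API; a minimal sketch (the
+kernel name is hypothetical):
+
+\begin{lstlisting}[language=C]
+__global__ void my_kernel(float *data) { /* ... */ }
+
+void configure(void) {
+  // Request the 48KB shared memory / 16KB L1 split for this kernel;
+  // cudaFuncCachePreferL1 would request the opposite split.
+  cudaFuncSetCacheConfig(my_kernel, cudaFuncCachePreferShared);
+}
+\end{lstlisting}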
+
+
+
+Figure~\ref{ch1:fig:memory_hierarchy} illustrates the memory hierarchy of a
+GPU. Threads are represented at the top of the figure. They can access
+their own registers and their local memory. Threads of the same block can
+access the shared memory of that block. The cache memory is not represented
+here, but it is local to a thread. Finally, each block can access the
+global memory of the GPU.
+
+\section{Conclusion}
+
+In this chapter, a brief presentation of the video card, which has later
+been used to perform computation, has been given. The architecture of a GPU
+has been illustrated, focusing on the particularities of GPUs in terms of
+parallelism, memory latency and threads. In order to design an efficient
+algorithm for a GPU, it is essential to have all these parameters in mind.
+
+
+\putbib[Chapters/chapter1/biblio]
+
--- /dev/null
+\relax
+\@setckpt{Chapters/chapter1/preamble}{
+\setcounter{page}{1}
+\setcounter{equation}{0}
+\setcounter{enumi}{0}
+\setcounter{enumii}{0}
+\setcounter{enumiii}{0}
+\setcounter{enumiv}{0}
+\setcounter{footnote}{0}
+\setcounter{mpfootnote}{0}
+\setcounter{part}{0}
+\setcounter{chapter}{0}
+\setcounter{section}{0}
+\setcounter{subsection}{0}
+\setcounter{subsubsection}{0}
+\setcounter{paragraph}{0}
+\setcounter{subparagraph}{0}
+\setcounter{figure}{0}
+\setcounter{table}{0}
+\setcounter{numauthors}{0}
+\setcounter{parentequation}{0}
+\setcounter{subfigure}{0}
+\setcounter{lofdepth}{1}
+\setcounter{subtable}{0}
+\setcounter{lotdepth}{1}
+\setcounter{lstnumber}{1}
+\setcounter{ContinuedFloat}{0}
+\setcounter{AlgoLine}{0}
+\setcounter{algocfline}{0}
+\setcounter{algocfproc}{0}
+\setcounter{algocf}{0}
+}