From: couturie Date: Wed, 6 Feb 2013 13:19:18 +0000 (+0100) Subject: ch18 X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/013c063fb5cb99cc33b413d937f330acc42ca73e?ds=sidebyside;hp=-c ch18 --- 013c063fb5cb99cc33b413d937f330acc42ca73e diff --git a/BookGPU/Chapters/chapter18/biblio.bib b/BookGPU/Chapters/chapter18/biblio.bib new file mode 100644 index 0000000..f3b0ccc --- /dev/null +++ b/BookGPU/Chapters/chapter18/biblio.bib @@ -0,0 +1,35 @@ +@misc{ch1:cuda, + author = {{NVIDIA Corporation}}, + keywords = {CUDA}, + note = {Version 4.0}, + title = {{NVIDIA CUDA C} Programming Guide}, + year = 2011 +} + +@Article{ch1:Buck:2004:BGS, + author = "I. Buck and T. Foley and D. Horn and J. + Sugerman and K. Fatahalian and M. Houston and P. + Hanrahan", + title = "{Brook} for {GPUs}: stream computing on graphics + hardware", + journal = "ACM Transactions on Graphics", + volume = "23", + number = "3", + pages = "777--786", + month = aug, + year = "2004", +} + + +@article{ch1:CMR:12, + author = "B. Cloutier and B. K. Muite and P. 
Rigge", + title = "A comparison of CPU and GPU performance for Fourier pseudospectral simulations of the Navier-Stokes, Cubic Nonlinear Schrödinger and Sine Gordon Equations", + journal = "Computational Physics (physics.comp-ph)", + year = "2012", + archivePrefix = "arXiv", + eprint = "1206.3215", + primaryClass = "physics.comp-ph", +} + + + diff --git a/BookGPU/Chapters/chapter18/ch1.aux b/BookGPU/Chapters/chapter18/ch1.aux new file mode 100644 index 0000000..2c84cf4 --- /dev/null +++ b/BookGPU/Chapters/chapter18/ch1.aux @@ -0,0 +1,64 @@ +\relax +\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} +\@writefile{loa}{\addvspace {10\p@ }} +\@writefile{toc}{\contentsline {chapter}{\numberline {1}Presentation of the GPU architecture and of the Cuda environment}{3}} +\@writefile{lof}{\addvspace {10\p@ }} +\@writefile{lot}{\addvspace {10\p@ }} +\newlabel{chapter1}{{1}{3}} +\@writefile{toc}{\contentsline {section}{\numberline {1.1}Introduction}{3}} +\newlabel{ch1:intro}{{1.1}{3}} +\@writefile{toc}{\contentsline {section}{\numberline {1.2}Brief history of Video Card}{3}} +\@writefile{toc}{\contentsline {section}{\numberline {1.3}GPGPU}{4}} +\@writefile{toc}{\contentsline {section}{\numberline {1.4}Architecture of current GPUs}{5}} +\@writefile{lof}{\contentsline {figure}{\numberline {1.1}{\ignorespaces Comparison of number of cores in a CPU and in a GPU.\relax }}{5}} +\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}} +\newlabel{ch1:fig:comparison_cpu_gpu}{{1.1}{5}} +\@writefile{lof}{\contentsline {figure}{\numberline {1.2}{\ignorespaces Comparison of low latency of CPU and high throughput of GPU.\relax }}{6}} +\newlabel{ch1:fig:latency_throughput}{{1.2}{6}} +\@writefile{toc}{\contentsline {section}{\numberline {1.5}Kinds of parallelism}{7}} +\@writefile{toc}{\contentsline {section}{\numberline {1.6}Cuda Multithreading}{7}} +\@writefile{lof}{\contentsline {figure}{\numberline {1.3}{\ignorespaces Scalability of GPU.\relax }}{8}} 
+\newlabel{ch1:fig:scalability}{{1.3}{8}} +\@writefile{toc}{\contentsline {section}{\numberline {1.7}Memory hierarchy}{9}} +\@writefile{lof}{\contentsline {figure}{\numberline {1.4}{\ignorespaces Memory hierarchy of a GPU.\relax }}{10}} +\newlabel{ch1:fig:memory_hierarchy}{{1.4}{10}} +\@writefile{toc}{\contentsline {section}{\numberline {1.8}Conclusion}{10}} +\@writefile{toc}{\contentsline {section}{Bibliography}{11}} +\@setckpt{Chapters/chapter1/ch1}{ +\setcounter{page}{12} +\setcounter{equation}{0} +\setcounter{enumi}{0} +\setcounter{enumii}{0} +\setcounter{enumiii}{0} +\setcounter{enumiv}{3} +\setcounter{footnote}{0} +\setcounter{mpfootnote}{0} +\setcounter{part}{1} +\setcounter{chapter}{1} +\setcounter{section}{8} +\setcounter{subsection}{0} +\setcounter{subsubsection}{0} +\setcounter{paragraph}{0} +\setcounter{subparagraph}{0} +\setcounter{figure}{4} +\setcounter{table}{0} +\setcounter{numauthors}{0} +\setcounter{parentequation}{0} +\setcounter{subfigure}{0} +\setcounter{lofdepth}{1} +\setcounter{subtable}{0} +\setcounter{lotdepth}{1} +\setcounter{lstnumber}{1} +\setcounter{ContinuedFloat}{0} +\setcounter{AlgoLine}{0} +\setcounter{algocfline}{0} +\setcounter{algocfproc}{0} +\setcounter{algocf}{0} +\setcounter{proposition}{0} +\setcounter{theorem}{0} +\setcounter{exercise}{0} +\setcounter{example}{0} +\setcounter{definition}{0} +\setcounter{proof}{0} +\setcounter{lstlisting}{0} +} diff --git a/BookGPU/Chapters/chapter18/ch18.aux b/BookGPU/Chapters/chapter18/ch18.aux new file mode 100644 index 0000000..f9f97ae --- /dev/null +++ b/BookGPU/Chapters/chapter18/ch18.aux @@ -0,0 +1,71 @@ +\relax +\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} +\@writefile{toc}{\author{Christophe Guyeux}{}} +\@writefile{loa}{\addvspace {10\p@ }} +\@writefile{toc}{\contentsline {chapter}{\numberline {14}Pseudo Random Number Generator on GPU}{305}} +\@writefile{lof}{\addvspace {10\p@ }} +\@writefile{lot}{\addvspace {10\p@ }} +\newlabel{chapter18}{{14}{305}} 
+\@writefile{toc}{\contentsline {section}{\numberline {14.1}Introduction}{305}} +\@writefile{toc}{\contentsline {section}{\numberline {14.2}Basic Recalls}{306}} +\newlabel{section:BASIC RECALLS}{{14.2}{306}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.2.1}Devaney's Chaotic Dynamical Systems}{306}} +\@writefile{toc}{\contentsline {section}{\numberline {14.3}Toward Efficiency and Improvement for CI PRNG}{306}} +\newlabel{sec:efficient PRNG}{{14.3}{306}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{306}} +\newlabel{algo:seqCIPRNG}{{14.1}{306}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {14.1}C code of the sequential PRNG based on chaotic iterations}{306}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{307}} +\newlabel{sec:efficient PRNG gpu}{{14.3.2}{307}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.3}Naive Version for GPU}{307}} +\@writefile{loa}{\contentsline {algocf}{\numberline {12}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{308}} +\newlabel{algo:gpu_kernel}{{12}{308}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.4}Improved Version for GPU}{308}} +\newlabel{IR}{{13}{309}} +\@writefile{loa}{\contentsline {algocf}{\numberline {13}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{309}} +\newlabel{algo:gpu_kernel2}{{13}{309}} +\@writefile{toc}{\contentsline {subsection}{\numberline {14.3.5}Chaos Evaluation of the Improved Version}{309}} +\@writefile{toc}{\contentsline {section}{\numberline {14.4}Experiments}{310}} +\newlabel{sec:experiments}{{14.4}{310}} +\@writefile{lof}{\contentsline {figure}{\numberline {14.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{311}} 
+\newlabel{fig:time_xorlike_gpu}{{14.1}{311}} +\@writefile{lof}{\contentsline {figure}{\numberline {14.2}{\ignorespaces Quantity of pseudorandom numbers generated per second using the BBS-based PRNG\relax }}{312}} +\newlabel{fig:time_bbs_gpu}{{14.2}{312}} +\@setckpt{Chapters/chapter18/ch18}{ +\setcounter{page}{313} +\setcounter{equation}{0} +\setcounter{enumi}{2} +\setcounter{enumii}{0} +\setcounter{enumiii}{0} +\setcounter{enumiv}{0} +\setcounter{footnote}{2} +\setcounter{mpfootnote}{0} +\setcounter{part}{1} +\setcounter{chapter}{14} +\setcounter{section}{4} +\setcounter{subsection}{0} +\setcounter{subsubsection}{0} +\setcounter{paragraph}{0} +\setcounter{subparagraph}{0} +\setcounter{figure}{2} +\setcounter{table}{0} +\setcounter{numauthors}{0} +\setcounter{parentequation}{4} +\setcounter{subfigure}{0} +\setcounter{lofdepth}{1} +\setcounter{subtable}{0} +\setcounter{lotdepth}{1} +\setcounter{lstnumber}{15} +\setcounter{ContinuedFloat}{0} +\setcounter{AlgoLine}{14} +\setcounter{algocfline}{13} +\setcounter{algocfproc}{13} +\setcounter{algocf}{13} +\setcounter{proposition}{1} +\setcounter{theorem}{0} +\setcounter{exercise}{0} +\setcounter{example}{0} +\setcounter{definition}{0} +\setcounter{proof}{1} +\setcounter{lstlisting}{1} +} diff --git a/BookGPU/Chapters/chapter18/ch18.tex b/BookGPU/Chapters/chapter18/ch18.tex new file mode 100755 index 0000000..3808995 --- /dev/null +++ b/BookGPU/Chapters/chapter18/ch18.tex @@ -0,0 +1,392 @@ +\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte} +\chapterauthor{Christophe Guyeux}{Femto-ST Institute, University of Franche-Comte} + + +\chapter{Pseudo Random Number Generator on GPU} +\label{chapter18} + +\section{Introduction} + +Randomness is of importance in many fields such as scientific +simulations or cryptography. ``Random numbers'' can mainly be +generated either by a deterministic and reproducible algorithm called +a pseudorandom number generator (PRNG). 
In this paper, we focus on +reproducible generators, useful for instance in Monte-Carlo based +simulators. These domains need PRNGs that are statistically +irreproachable. In some fields such as numerical simulations, +speed is a strong requirement that is usually attained by using +parallel architectures. In that case, a recurrent problem is that a +deflation of the statistical qualities is often reported, when the +parallelization of a good PRNG is realized. This +is why ad-hoc PRNGs for each possible architecture must be found to +achieve both speed and randomness. On the other hand, speed is not +the main requirement in cryptography: the great need is to +define \emph{secure} generators able to withstand malicious +attacks. Roughly speaking, an attacker should not be able in practice +to make the distinction between numbers obtained with the secure +generator and a true random sequence. Or, in an equivalent +formulation, he or she should not be able (in practice) to predict the +next bit of the generator, having the knowledge of all the binary +digits that have been already released. ``Being able in practice'' +refers here to the possibility to achieve this attack in polynomial +time, and to the exponential growth of the difficulty of this +challenge when the size of the parameters of the PRNG increases. + + +Finally, a small part of the community working in this domain focuses on a +third requirement, that is to define chaotic generators. +The main idea is to take benefit from a chaotic dynamical system to obtain a +generator that is unpredictable, disordered, sensitive to its seed, or in other words, chaotic.
+Their desire is to map a given chaotic dynamics into a sequence that seems random +and unassailable due to chaos. +However, the chaotic maps used as a pattern are defined in the real line +whereas computers deal with finite precision numbers. +This distortion leads to a deflation of both chaotic properties and speed. +Furthermore, authors of such chaotic generators often claim their PRNG +as secure due to their chaos properties, but there is no obvious relation +between chaos and security as it is understood in cryptography. +This is why the use of chaos for PRNG still remains marginal and disputable. + + +The remainder of this paper is organized as follows. +A COMPLETER + + +\section{Basic Recalls} +\label{section:BASIC RECALLS} + +This section is devoted to basic definitions and terminologies in the fields of +topological chaos and chaotic iterations. We assume the reader is familiar +with basic notions on topology (see for instance~\cite{Devaney}). + + +\subsection{Devaney's Chaotic Dynamical Systems} + + +A COMPLETER + + + +\section{Toward Efficiency and Improvement for CI PRNG} +\label{sec:efficient PRNG} + +\subsection{First Efficient Implementation of a PRNG based on Chaotic Iterations} +% +%Based on the proof presented in the previous section, it is now possible to +%improve the speed of the generator formerly presented in~\cite{bgw09:ip,guyeux10}. +%The first idea is to consider +%that the provided strategy is a pseudorandom Boolean vector obtained by a +%given PRNG. +%An iteration of the system is simply the bitwise exclusive or between +%the last computed state and the current strategy. +%Topological properties of disorder exhibited by chaotic +%iterations can be inherited by the inputted generator, we hope by doing so to +%obtain some statistical improvements while preserving speed. +% +%%RAPH : j'ai viré tout ca +%% Let us give an example using 16-bits numbers, to clearly understand how the bitwise xor operations +%% are +%% done. 
+%% Suppose that $x$ and the strategy $S^i$ are given as +%% binary vectors. +%% Table~\ref{TableExemple} shows the result of $x \oplus S^i$. + +%% \begin{table} +%% \begin{scriptsize} +%% $$ +%% \begin{array}{|cc|cccccccccccccccc|} +%% \hline +%% x &=&1&0&1&1&1&0&1&0&1&0&0&1&0&0&1&0\\ +%% \hline +%% S^i &=&0&1&1&0&0&1&1&0&1&1&1&0&0&1&1&1\\ +%% \hline +%% x \oplus S^i&=&1&1&0&1&1&1&0&0&0&1&1&1&0&1&0&1\\ +%% \hline + +%% \hline +%% \end{array} +%% $$ +%% \end{scriptsize} +%% \caption{Example of an arbitrary round of the proposed generator} +%% \label{TableExemple} +%% \end{table} + + + + +\lstset{language=C,caption={C code of the sequential PRNG based on chaotic iterations},label={algo:seqCIPRNG}} +\begin{small} +\begin{lstlisting} + +unsigned int CIPRNG() { + static unsigned int x = 123123123; + unsigned long t1 = xorshift(); + unsigned long t2 = xor128(); + unsigned long t3 = xorwow(); + x = x^(unsigned int)t1; + x = x^(unsigned int)(t2>>32); + x = x^(unsigned int)(t3>>32); + x = x^(unsigned int)t2; + x = x^(unsigned int)(t1>>32); + x = x^(unsigned int)t3; + return x; +} +\end{lstlisting} +\end{small} + + + +In Listing~\ref{algo:seqCIPRNG} a sequential version of the proposed PRNG based +on chaotic iterations is presented. The xor operator is represented by +\textasciicircum. This function uses three classical 64-bits PRNGs, namely the +\texttt{xorshift}, the \texttt{xor128}, and the +\texttt{xorwow}~\cite{Marsaglia2003}. In the following, we call them ``xor-like +PRNGs''. As each xor-like PRNG uses 64-bits whereas our proposed generator +works with 32-bits, we use the command \texttt{(unsigned int)}, that selects the +32 least significant bits of a given integer, and the code \texttt{(unsigned + int)(t$>>$32)} in order to obtain the 32 most significant bits of \texttt{t}. + +Thus producing a pseudorandom number needs 6 xor operations with 6 32-bits numbers +that are provided by 3 64-bits PRNGs. 
This version successfully passes the +stringent BigCrush battery of tests~\cite{LEcuyerS07}. +At this point, we thus +have defined an efficient and statistically unbiased generator. Its speed is +directly related to the use of linear operations, but for the same reason, +this fast generator cannot be proven as secure. + + + +\subsection{Efficient PRNGs based on Chaotic Iterations on GPU} +\label{sec:efficient PRNG gpu} + +In order to take benefits from the computing power of GPU, a program +needs to have independent blocks of threads that can be computed +simultaneously. In general, the larger the number of threads is, the +more local memory is used, and the less branching instructions are +used (if, while, ...), the better the performances on GPU is. +Obviously, having these requirements in mind, it is possible to build +a program similar to the one presented in Listing +\ref{algo:seqCIPRNG}, which computes pseudorandom numbers on GPU. To +do so, we must firstly recall that in the CUDA~\cite{Nvid10} +environment, threads have a local identifier called +\texttt{ThreadIdx}, which is relative to the block containing +them. Furthermore, in CUDA, parts of the code that are executed by the GPU, are +called {\it kernels}. + + +\subsection{Naive Version for GPU} + + +It is possible to deduce from the CPU version a quite similar version adapted to GPU. +The simple principle consists in making each thread of the GPU computing the CPU version of our PRNG. +Of course, the three xor-like +PRNGs used in these computations must have different parameters. +In a given thread, these parameters are +randomly picked from another PRNGs. +The initialization stage is performed by the CPU. +To do it, the ISAAC PRNG~\cite{Jenkins96} is used to set all the +parameters embedded into each thread. + +The implementation of the three +xor-like PRNGs is straightforward when their parameters have been +allocated in the GPU memory. 
Each xor-like works with an internal +number $x$ that saves the last generated pseudorandom number. Additionally, the +implementation of the xor128, the xorshift, and the xorwow respectively require +4, 5, and 6 unsigned long as internal variables. + + +\begin{algorithm} +\begin{small} +\KwIn{InternalVarXorLikeArray: array with internal variables of the 3 xor-like +PRNGs in global memory\; +NumThreads: number of threads\;} +\KwOut{NewNb: array containing random numbers in global memory} +\If{threadIdx is concerned by the computation} { + retrieve data from InternalVarXorLikeArray[threadIdx] in local variables\; + \For{i=1 to n} { + compute a new PRNG as in Listing\ref{algo:seqCIPRNG}\; + store the new PRNG in NewNb[NumThreads*threadIdx+i]\; + } + store internal variables in InternalVarXorLikeArray[threadIdx]\; +} +\end{small} +\caption{Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations} +\label{algo:gpu_kernel} +\end{algorithm} + + + +Algorithm~\ref{algo:gpu_kernel} presents a naive implementation of the proposed PRNG on +GPU. Due to the available memory in the GPU and the number of threads +used simultaneously, the number of random numbers that a thread can generate +inside a kernel is limited (\emph{i.e.}, the variable \texttt{n} in +algorithm~\ref{algo:gpu_kernel}). For instance, if $100,000$ threads are used and +if $n=100$\footnote{in fact, we need to add the initial seed (a 32-bits number)}, +then the memory required to store all of the internals variables of both the xor-like +PRNGs\footnote{we multiply this number by $2$ in order to count 32-bits numbers} +and the pseudorandom numbers generated by our PRNG, is equal to $100,000\times ((4+5+6)\times +2+(1+100))=1,310,000$ 32-bits numbers, that is, approximately $52$Mb. 
+ +This generator is able to pass the whole BigCrush battery of tests, for all +the versions that have been tested depending on their number of threads +(called \texttt{NumThreads} in our algorithm, tested up to $5$ million). + + +\subsection{Improved Version for GPU} + +As GPU cards using CUDA have shared memory between threads of the same block, it +is possible to use this feature in order to simplify the previous algorithm, +i.e., to use less than 3 xor-like PRNGs. The solution consists in computing only +one xor-like PRNG by thread, saving it into the shared memory, and then to use the results +of some other threads in the same block of threads. In order to define which +thread uses the result of which other one, we can use a combination array that +contains the indexes of all threads and for which a combination has been +performed. + +In Algorithm~\ref{algo:gpu_kernel2}, two combination arrays are used. The +variable \texttt{offset} is computed using the value of +\texttt{combination\_size}. Then we can compute \texttt{o1} and \texttt{o2} +representing the indexes of the other threads whose results are used by the +current one. In this algorithm, we consider that a 32-bits xor-like PRNG has +been chosen. In practice, we use the xor128 proposed in~\cite{Marsaglia2003} in +which unsigned longs (64 bits) have been replaced by unsigned integers (32 +bits). + +This version can also pass the whole {\it BigCrush} battery of tests. 
+ +\begin{algorithm} +\begin{small} +\KwIn{InternalVarXorLikeArray: array with internal variables of 1 xor-like PRNGs +in global memory\; +NumThreads: Number of threads\; +array\_comb1, array\_comb2: Arrays containing combinations of size combination\_size\;} + +\KwOut{NewNb: array containing random numbers in global memory} +\If{threadId is concerned} { + retrieve data from InternalVarXorLikeArray[threadId] in local variables including shared memory and x\; + offset = threadIdx\%combination\_size\; + o1 = threadIdx-offset+array\_comb1[offset]\; + o2 = threadIdx-offset+array\_comb2[offset]\; + \For{i=1 to n} { + t=xor-like()\; + t=t\textasciicircum shmem[o1]\textasciicircum shmem[o2]\; + shared\_mem[threadId]=t\; + x = x\textasciicircum t\; + + store the new PRNG in NewNb[NumThreads*threadId+i]\; + } + store internal variables in InternalVarXorLikeArray[threadId]\; +} +\end{small} +\caption{Main kernel for the chaotic iterations based PRNG GPU efficient +version\label{IR}} +\label{algo:gpu_kernel2} +\end{algorithm} + +\subsection{Chaos Evaluation of the Improved Version} + +A run of Algorithm~\ref{algo:gpu_kernel2} consists in an operation ($x=x\oplus t$) having +the form of Equation~\ref{equation Oplus}, which is equivalent to the iterative +system of Eq.~\ref{eq:generalIC}. That is, an iteration of the general chaotic +iterations is realized between the last stored value $x$ of the thread and a strategy $t$ +(obtained by a bitwise exclusive or between a value provided by a xor-like() call +and two values previously obtained by two other threads). +To be certain that we are in the framework of Theorem~\ref{t:chaos des general}, +we must guarantee that this dynamical system iterates on the space +$\mathcal{X} = \mathcal{P}\left(\llbracket 1, \mathsf{N} \rrbracket\right)^\mathds{N}\times\mathds{B}^\mathsf{N}$. +The left term $x$ obviously belongs to $\mathds{B}^ \mathsf{N}$. 
+To prevent from any flaws of chaotic properties, we must check that the right +term (the last $t$), corresponding to the strategies, can possibly be equal to any +integer of $\llbracket 1, \mathsf{N} \rrbracket$. + +Such a result is obvious, as for the xor-like(), all the +integers belonging into its interval of definition can occur at each iteration, and thus the +last $t$ respects the requirement. Furthermore, it is possible to +prove by an immediate mathematical induction that, as the initial $x$ +is uniformly distributed (it is provided by a cryptographically secure PRNG), +the two other stored values shmem[o1] and shmem[o2] are uniformly distributed too, +(this is the induction hypothesis), and thus the next $x$ is finally uniformly distributed. + +Thus Algorithm~\ref{algo:gpu_kernel2} is a concrete realization of the general +chaotic iterations presented previously, and for this reason, it satisfies the +Devaney's formulation of a chaotic behavior. + +\section{Experiments} +\label{sec:experiments} + +Different experiments have been performed in order to measure the generation +speed. We have used a first computer equipped with a Tesla C1060 NVidia GPU card +and an +Intel Xeon E5530 cadenced at 2.40 GHz, and +a second computer equipped with a smaller CPU and a GeForce GTX 280. +All the +cards have 240 cores. + +In Figure~\ref{fig:time_xorlike_gpu} we compare the quantity of pseudorandom numbers +generated per second with various xor-like based PRNGs. In this figure, the optimized +versions use the {\it xor64} described in~\cite{Marsaglia2003}, whereas the naive versions +embed the three xor-like PRNGs described in Listing~\ref{algo:seqCIPRNG}. In +order to obtain the optimal performances, the storage of pseudorandom numbers +into the GPU memory has been removed. This step is time consuming and slows down the numbers +generation. 
Moreover this storage is completely +useless, in case of applications that consume the pseudorandom +numbers directly after generation. We can see that when the number of threads is greater +than approximately 30,000 and lower than 5 million, the number of pseudorandom numbers generated +per second is almost constant. With the naive version, this value ranges from 2.5 to +3GSamples/s. With the optimized version, it is approximately equal to +20GSamples/s. Finally we can remark that both GPU cards are quite similar, but in +practice, the Tesla C1060 has more memory than the GTX 280, and this memory +should be of better quality. +As a comparison, Listing~\ref{algo:seqCIPRNG} leads to the generation of about +138MSample/s when using one core of the Xeon E5530. + +\begin{figure}[htbp] +\begin{center} + \includegraphics[scale=0.65]{Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf} +\end{center} +\caption{Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG} +\label{fig:time_xorlike_gpu} +\end{figure} + + + + + +In Figure~\ref{fig:time_bbs_gpu} we highlight the performances of the optimized +BBS-based PRNG on GPU. On the Tesla C1060 we obtain approximately 700MSample/s +and on the GTX 280 about 670MSample/s, which is obviously slower than the +xorlike-based PRNG on GPU. However, we will show in the next sections that this +new PRNG has a strong level of security, which is necessarily paid by a speed +reduction. + +\begin{figure}[htbp] +\begin{center} + \includegraphics[scale=0.65]{Chapters/chapter18/figures/curve_time_bbs_gpu.pdf} +\end{center} +\caption{Quantity of pseudorandom numbers generated per second using the BBS-based PRNG} +\label{fig:time_bbs_gpu} +\end{figure} + +All these experiments allow us to conclude that it is possible to +generate a very large quantity of pseudorandom numbers statistically perfect with the xor-like version. 
+To a certain extend, it is also the case with the secure BBS-based version, the speed deflation being +explained by the fact that the former version has ``only'' +chaotic properties and statistical perfection, whereas the latter is also cryptographically secure, +as it is shown in the next sections. + + + + + + + + + + + +\putbib[Chapters/chapter18/biblio] + diff --git a/BookGPU/Chapters/chapter18/ch18.tex~ b/BookGPU/Chapters/chapter18/ch18.tex~ new file mode 100755 index 0000000..6c3f1f1 --- /dev/null +++ b/BookGPU/Chapters/chapter18/ch18.tex~ @@ -0,0 +1,296 @@ +\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte} + + +\chapter{Presentation of the GPU architecture and of the Cuda environment} +\label{chapter1} + +\section{Introduction}\label{ch1:intro} + +This chapter introduces the Graphics Processing Unit (GPU) architecture and all +the concepts needed to understand how GPUs work and can be used to speed up the +execution of some algorithms. First of all this chapter gives a brief history of +the development of Graphics card until they have been used in order to make +general purpose computation. Then the architecture of a GPU is +illustrated. There are many fundamental differences between a GPU and a +tradition processor. In order to benefit from the power of a GPU, a Cuda +programmer needs to use threads. They have some particularities which enable the +Cuda model to be efficient and scalable when some constraints are addressed. + + + +\section{Brief history of Video Card} + +Video cards or Graphics cards have been introduced in personal computers to +produce high quality graphics faster than classical Central Processing Units +(CPU) and to alleviate CPU from this task. In general, display tasks are very +repetitive and very specific. Hence, some manufacturers have produced more and +more sophisticated video cards, providing 2D accelerations then 3D accelerations, +then some light transforms. 
Video cards own their own memory to perform their +computation. For at least two decades, every personal computer has had a video +card which is simple for desktop computers or which provides many accelerations +for game and/or graphic oriented computers. In the latter case, graphic cards +may be more expensive than a CPU. + +Since 2000, video cards have allowed users to apply arithmetic operations +simultaneously on a sequence of pixels, also later called stream processing. In +this case, the information of the pixels (color, location and other information) are +combined in order to produce a pixel color that can be displayed on a screen. +Simultaneous computations are provided by shaders which calculate rendering +effects on graphics hardware with a high degree of flexibility. These shaders +handles the stream data with pipelines. + + +Some researchers tried to apply those operations on other data, representing +something different from pixels, and consequently this resulted in the first +uses of video cards for performing general purpose computation. The programming +model was not easy to use at all and was very dependent of the hardware +constraints. More precisely it consisted in using either DirectX of OpenGL +functions providing an interface to some classical operations for videos +operations (memory transfers, texture manipulation, ...). Floating point +operations were most of the time unimaginable. Obviously when something went +wrong, programmers had no way (and neither the tools) to detect it. + +\section{GPGPU} + +In order to benefit from the computing power of more recent video cards, Cuda +was first proposed in 2007 by NVidia. It unifies the programming model for some +of their most performant video cards. Cuda~\cite{ch1:cuda} has quickly been +considered by the scientific community as a great advance for general purpose +graphics processing unit (GPGPU) computing. Of course other programming models +have been proposed. 
The other well-known alternative is OpenCL which aims at +proposing an alternative to Cuda and which is multi-platform and portable. This +is a great advantage since it is even possible to execute OpenCL programs on +traditional CPUs. The main drawback is that it is less tight with the hardware +and consequently sometimes provides less efficient programs. Moreover, Cuda +benefits from more mature compilation and optimization procedures. Other less +known environments have been proposed, but most of them have been stopped, for +example we can cite: FireStream by ATI which is not maintained anymore and +replaced by OpenCL, BrookGPU by Standford University~\cite{ch1:Buck:2004:BGS}. +Another environment based on pragma (insertion of pragma directives inside the +code to help the compiler to generate efficient code) is call OpenACC. For a +comparison with OpenCL, interested readers may refer to~\cite{ch1:CMR:12}. + + + +\section{Architecture of current GPUs} + +The architecture \index{architecture of a GPU} of current GPUs is constantly +evolving. Nevertheless some trends remain constant throughout this evolution. +Processing units composing a GPU are far more simple than a traditional CPU but +it is much easier to integrate many computing units inside a GPU card than to do +so with many cores inside a CPU. This is due to the fact that the cores of a GPU are +simpler than the cores of a CPU. In 2012, the most powerful GPUs own more than 500 +cores and the most powerful CPUs have 8 +cores. Figure~\ref{ch1:fig:comparison_cpu_gpu} shows the number of cores inside +a CPU and inside a GPU. In fact, in a current NVidia GPU, there are +multiprocessors which have 32 cores (for example on Fermi cards). The core clock +of CPU is generally around 3GHz and the one of GPU is about 1.5GHz. Although +the core clock of GPU cores is slower, the amount of cores inside a GPU provides +more computational power. 
This measure is commonly represented by the number of +floating point operation per seconds. Nowadays the most powerful GPUs provide more +than 1TFlops, i.e. $10^{12}$ floating point operations per second. +Nevertheless GPUs are very efficient to perform some operations but not all +kinds of operations. They are very efficient to execute repetitive work in which +only the data change. It is important to keep in mind that multiprocessors +inside a GPU have 32 cores. Later we will see that these 32 cores need to do the +same work to get maximum performance. + +\begin{figure}[b!] +\centerline{\includegraphics[]{Chapters/chapter1/figures/nb_cores_CPU_GPU.pdf}} +\caption{Comparison of number of cores in a CPU and in a GPU.} +%[Comparison of number of cores in a CPU and in a GPU] +\label{ch1:fig:comparison_cpu_gpu} +\end{figure} + +On the most powerful GPU cards, called Fermi, multiprocessors are called streaming +multiprocessors (SM). Each SM contains 32 cores and is able to perform 32 +floating points or integer operations on 32 bits numbers per clock or 16 floating +points on 64 bits number per clock. SMs have their own registers, execution +pipelines and caches. On Fermi architecture, there are 64Kb shared memory + L1 +cache and 32,536 32bits registers per SM. More precisely the programmer can +decide what amount of shared memory and L1 cache SM can use. The constraint is +that the sum of both amounts should be less or equal to 64Kb. + +Threads are used to benefit from the important number of cores of a GPU. Those +threads are different from traditional threads for CPU. In +chapter~\ref{chapter2}, some examples of GPU programming will explicit the +details of the GPU threads. However, threads are gathered into blocks of 32 +threads, called ``warps''. Those warps are important when designing an algorithm +for GPU. + + +Another big difference between CPU and GPU is the latency of memory. In CPU, +everything is optimized to obtain a low latency architecture. 
This is possible
+through the use of cache memories. Moreover, nowadays CPUs perform many
+performance optimizations such as speculative execution which roughly speaking
+consists in executing a small part of code in advance even if later this work
+reveals itself to be useless. On the contrary, GPUs do not have low latency
+memory. In comparison GPUs have small cache memories. Nevertheless the
+architecture of GPUs is optimized for throughput computation and it takes into
+account the memory latency.
+
+
+
+\begin{figure}[b!]
+\centerline{\includegraphics[scale=0.7]{Chapters/chapter1/figures/low_latency_vs_high_throughput.pdf}}
+\caption{Comparison of low latency of CPU and high throughput of GPU.}
+\label{ch1:fig:latency_throughput}
+\end{figure}
+
+Figure~\ref{ch1:fig:latency_throughput} illustrates the main difference of
+memory latency between a CPU and a GPU. In a CPU, tasks ``ti'' are executed one
+by one with a short memory latency to get the data to process. After some tasks,
+there is a context switch that allows the CPU to run concurrent applications
+and/or multi-threaded applications. Memory latencies are longer in a GPU;
+the principle to obtain a high throughput is to have many tasks to
+compute. Later we will see that those tasks are called threads with Cuda. With
+this principle, as soon as a task is finished the next one is ready to be
+executed while the wait for data for the previous task is overlapped by
+computation of other tasks.
+
+
+
+\section{Kinds of parallelism}
+
+Many kinds of parallelism are available according to the type of hardware.
+Roughly speaking, there are three classes of parallelism: instruction-level
+parallelism, data parallelism and task parallelism.
+
+Instruction-level parallelism consists in re-ordering some instructions in order
+to execute some of them in parallel without changing the result of the code.
+In modern CPUs, instruction pipelines allow the processor to execute instructions
+faster. 
With a pipeline a processor can execute multiple instructions
+simultaneously due to the fact that the output of a task is the input of the
+next one.
+
+Data parallelism consists in executing the same program with different data on
+different computing units. Of course, no dependency should exist between the
+data. For example, it is easy to parallelize loops without dependency using the
+data parallelism paradigm. This paradigm is linked with the Single Instruction
+Multiple Data (SIMD) architecture. This is the kind of parallelism provided by
+GPUs.
+
+Task parallelism is the common parallelism achieved on clusters and grids and
+high performance architectures where different tasks are executed by different
+computing units.
+
+\section{Cuda Multithreading}
+
+The data parallelism of Cuda is more precisely based on the Single Instruction
+Multiple Thread (SIMT) model. This is due to the fact that a programmer accesses
+the cores through threads. In the Cuda model, all cores
+execute the same set of instructions but with different data. This model has
+similarities with the vector programming model proposed for vector machines through
+the 1970s into the 90s, notably the various Cray platforms. On the Cuda
+architecture, the performance is driven by the use of a huge number of threads
+(from thousands up to millions). The particularity of the model is that there
+is no context switching as in CPUs and each thread has its own registers. In
+practice, threads are executed by SMs and are gathered into groups of 32
+threads. Those groups are called ``warps''. Each SM alternatively executes
+``active warps'' and warps becoming temporarily inactive due to waiting for data
+(as shown in Figure~\ref{ch1:fig:latency_throughput}).
+
+The key to scalability in the Cuda model is the use of a huge number of threads.
+In practice, threads are not only gathered in warps but also in thread blocks. 
A
+thread block is executed by only one SM and it cannot migrate. The typical size of
+a thread block is a power of two (for example: 64, 128, 256 or 512).
+
+
+
+In this case, without changing anything inside a Cuda code, it is possible to
+run your code with a small Cuda device or with the most powerful Tesla Cuda cards.
+Blocks are executed in any order depending on the number of SMs available. So
+the programmer must design his or her code having this issue in mind. This
+independence between thread blocks provides the scalability of Cuda codes.
+
+\begin{figure}[b!]
+\centerline{\includegraphics[scale=0.65]{Chapters/chapter1/figures/scalability.pdf}}
+\caption{Scalability of GPU.}
+\label{ch1:fig:scalability}
+\end{figure}
+
+
+A kernel is a function which contains a block of instructions that are executed
+by the threads of a GPU. When the problem considered is a two dimensional or three
+dimensional problem, it is possible to group thread blocks into a grid. In
+practice, the number of thread blocks and the size of thread blocks is given as
+parameters to each kernel. Figure~\ref{ch1:fig:scalability} illustrates an
+example of a kernel composed of 8 thread blocks. Then this kernel is executed on
+a small device containing only 2 SMs. So in this case, blocks are executed 2
+by 2 in any order. If the kernel is executed on a larger Cuda device containing
+4 SMs, blocks are executed 4 by 4 simultaneously. The execution should be
+approximately twice as fast in the latter case. Of course, that depends on other
+parameters that will be described later.
+
+Thread blocks provide a way to cooperate in the sense that threads of the same
+block cooperatively load and store blocks of memory they all
+use. Synchronizations of threads in the same block are possible (but not between
+threads of different blocks). Threads of the same block can also share results
+in order to compute a single result. In chapter~\ref{chapter2}, some examples
+will make that explicit. 
+
+
+\section{Memory hierarchy}
+
+The memory hierarchy of GPUs\index{memory~hierarchy} is different from that of
+CPUs. In practice, there are registers\index{memory~hierarchy!registers}, local
+memory\index{memory~hierarchy!local~memory}, shared
+memory\index{memory~hierarchy!shared~memory}, cache
+memory\index{memory~hierarchy!cache~memory} and global
+memory\index{memory~hierarchy!global~memory}.
+
+
+As previously mentioned each thread can access its own registers. It is
+important to keep in mind that the number of registers per block is limited. On
+recent cards, this number is limited to 64Kb per SM. Access to registers is
+very fast, so it is a good idea to use them whenever possible.
+
+Likewise each thread can access local memory which, in practice, is much slower
+than registers. Local memory is automatically used by the compiler when all the
+registers are occupied. So the best idea is to optimize the use of registers
+even if this implies reducing the number of threads per block.
+
+\begin{figure}[hbtp!]
+\centerline{\includegraphics[scale=0.60]{Chapters/chapter1/figures/memory_hierarchy.pdf}}
+\caption{Memory hierarchy of a GPU.}
+\label{ch1:fig:memory_hierarchy}
+\end{figure}
+
+
+
+Shared memory allows cooperation between threads of the same block. This kind
+of memory is fast, but it requires to be manipulated manually and its size is
+limited. It is accessible during the execution of a kernel. So the principle is
+to fill the shared memory at the start of the kernel with global data that are
+used very frequently, then threads can access it for their computation. They
+can obviously change the content of this shared memory either with computation
+or by loading other data and they can store its content in the global memory. So
+shared memory can be seen as a cache memory which is managed manually. This
+obviously requires an effort from the programmer. 
+
+On recent cards, the programmer may decide what amount of cache memory and
+shared memory is attributed to a kernel. The cache memory is an L1 cache which is
+directly managed by the GPU. Sometimes, this cache provides very efficient
+results and sometimes the use of shared memory is a better solution.
+
+
+
+
+Figure~\ref{ch1:fig:memory_hierarchy} illustrates the memory hierarchy of a
+GPU. Threads are represented on the top of the figure. They can access their
+own registers and their local memory. Threads of the same block can access
+the shared memory of this block. The cache memory is not represented here but it
+is local to a thread. Then each block can access the global memory of the
+GPU.
+
+ \section{Conclusion}
+
+In this chapter, a brief presentation of the video card, which has later been
+used to perform computation, has been given. The architecture of a GPU has been
+illustrated focusing on the particularity of GPUs in terms of parallelism, memory
+latency and threads. In order to design an efficient algorithm for GPU, it is
+essential to have all these parameters in mind. 
+ + +\putbib[Chapters/chapter1/biblio] + diff --git a/BookGPU/Chapters/chapter18/figures/curve_time_bbs_gpu.pdf b/BookGPU/Chapters/chapter18/figures/curve_time_bbs_gpu.pdf new file mode 100644 index 0000000..a6a1080 Binary files /dev/null and b/BookGPU/Chapters/chapter18/figures/curve_time_bbs_gpu.pdf differ diff --git a/BookGPU/Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf b/BookGPU/Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf new file mode 100644 index 0000000..2437d2e Binary files /dev/null and b/BookGPU/Chapters/chapter18/figures/curve_time_xorlike_gpu.pdf differ diff --git a/BookGPU/Chapters/chapter18/preamble.aux b/BookGPU/Chapters/chapter18/preamble.aux new file mode 100644 index 0000000..65abe7e --- /dev/null +++ b/BookGPU/Chapters/chapter18/preamble.aux @@ -0,0 +1,32 @@ +\relax +\@setckpt{Chapters/chapter1/preamble}{ +\setcounter{page}{1} +\setcounter{equation}{0} +\setcounter{enumi}{0} +\setcounter{enumii}{0} +\setcounter{enumiii}{0} +\setcounter{enumiv}{0} +\setcounter{footnote}{0} +\setcounter{mpfootnote}{0} +\setcounter{part}{0} +\setcounter{chapter}{0} +\setcounter{section}{0} +\setcounter{subsection}{0} +\setcounter{subsubsection}{0} +\setcounter{paragraph}{0} +\setcounter{subparagraph}{0} +\setcounter{figure}{0} +\setcounter{table}{0} +\setcounter{numauthors}{0} +\setcounter{parentequation}{0} +\setcounter{subfigure}{0} +\setcounter{lofdepth}{1} +\setcounter{subtable}{0} +\setcounter{lotdepth}{1} +\setcounter{lstnumber}{1} +\setcounter{ContinuedFloat}{0} +\setcounter{AlgoLine}{0} +\setcounter{algocfline}{0} +\setcounter{algocfproc}{0} +\setcounter{algocf}{0} +}