un peu plus

[prng_gpu.git] / prng_gpu.tex
diff --git a/prng_gpu.tex b/prng_gpu.tex

index 57526f26331a2a8d9bed7cee992d881f7425977d..dc91965a2009f5b3aa21cbdadfad7c89dee1b629 100644 (file)
--- a/prng_gpu.tex
+++ b/prng_gpu.tex
@@ -8,6 +8,7 @@
  \usepackage{moreverb}
  \usepackage{commath}
  \usepackage{algorithm2e}
+\usepackage{listings}
  \usepackage[standard]{ntheorem}
  
  % Pour mathds : les ensembles IR, IN, etc.
@@ -246,7 +247,7 @@ return $x$\;
  \end{algorithm}
  
  \begin{algorithm}[h!]
-\SetAlgoLined
+%\SetAlgoLined                        %%RAPH: cette ligne provoque une erreur chez moi
  \KwIn{the internal configuration $z$ (a 32-bit word)}
  \KwOut{$y$ (a 32-bit word)}
  $z\leftarrow{z\oplus{(z\ll13)}}$\;
@@ -666,60 +667,197 @@ Indeed this result is weaker than the theorem establishing the chaos for the fin
  
  \section{Efficient prng based on chaotic iterations}
  
-On parle du séquentiel avec des nombres 64 bits\\
-
-Faire le lien avec le paragraphe précédent (je considère que la stratégie s'appelle $S^i$\\
-
  In  order to  implement efficiently  a PRNG  based on  chaotic iterations  it is
  possible to improve  previous works [ref]. One solution  consists in considering
-that the  strategy used $S^i$  contains all the  bits for which the  negation is
-achieved out. Then instead of applying  the negation on these bits we can simply
-apply the  xor operator between  the current number  and the strategy  $S^i$. In
+that the  strategy used contains all the  bits for which the  negation is
+achieved out. Then in order to apply  the negation on these bits we can simply
+apply the  xor operator between  the current number  and the strategy. In
  order to obtain the strategy we also use a classical PRNG.
  
-\begin{figure}[htbp]
-\begin{center}
-\fbox{
-\begin{minipage}{14cm}
-unsigned int CIprng() \{\\
-  static unsigned int x = 123123123;\\
-  unsigned long t1 = xorshift();\\
-  unsigned long t2 = xor128();\\
-  unsigned long t3 = xorwow();\\
-  x = x\textasciicircum (unsigned int)t1;\\
-  x = x\textasciicircum (unsigned int)(t2$>>$32);\\
-  x = x\textasciicircum (unsigned int)(t3$>>$32);\\
-  x = x\textasciicircum (unsigned int)t2;\\
-  x = x\textasciicircum (unsigned int)(t1$>>$32);\\
-  x = x\textasciicircum (unsigned int)t3;\\
-  return x;\\
-\}
-\end{minipage}
+Here  is an  example with  16-bits numbers  showing how  the bit  operations are
+applied.  Suppose  that $x$ and the  strategy $S^i$ are defined  in binary mode.
+Then the following table shows the result of $x$ xor $S^i$.
+$$
+\begin{array}{|cc|cccccccccccccccc|}
+\hline
+x      &=&1&0&1&1&1&0&1&0&1&0&0&1&0&0&1&0\\
+\hline
+S^i      &=&0&1&1&0&0&1&1&0&1&1&1&0&0&1&1&1\\
+\hline
+x \oplus S^i&=&1&1&0&1&1&1&0&0&0&1&1&1&0&1&0&1\\
+\hline
+
+\hline
+ \end{array}
+$$
+
+%% \begin{figure}[htbp]
+%% \begin{center}
+%% \fbox{
+%% \begin{minipage}{14cm}
+%% unsigned int CIprng() \{\\
+%%   static unsigned int x = 123123123;\\
+%%   unsigned long t1 = xorshift();\\
+%%   unsigned long t2 = xor128();\\
+%%   unsigned long t3 = xorwow();\\
+%%   x = x\textasciicircum (unsigned int)t1;\\
+%%   x = x\textasciicircum (unsigned int)(t2$>>$32);\\
+%%   x = x\textasciicircum (unsigned int)(t3$>>$32);\\
+%%   x = x\textasciicircum (unsigned int)t2;\\
+%%   x = x\textasciicircum (unsigned int)(t1$>>$32);\\
+%%   x = x\textasciicircum (unsigned int)t3;\\
+%%   return x;\\
+%% \}
+%% \end{minipage}
+%% }
+%% \end{center}
+%% \caption{sequential Chaotic Iteration PRNG}
+%% \label{algo:seqCIprng}
+%% \end{figure}
+
+
+
+\lstset{language=C,caption={C code of the sequential chaotic iterations based PRNG},label=algo:seqCIprng}
+\begin{lstlisting}
+unsigned int CIprng() {
+  static unsigned int x = 123123123;
+  unsigned long t1 = xorshift();
+  unsigned long t2 = xor128();
+  unsigned long t3 = xorwow();
+  x = x^(unsigned int)t1;
+  x = x^(unsigned int)(t2>>32);
+  x = x^(unsigned int)(t3>>32);
+  x = x^(unsigned int)t2;
+  x = x^(unsigned int)(t1>>32);
+  x = x^(unsigned int)t3;
+  return x;
  }
-\end{center}
-\caption{sequential Chaotic Iteration PRNG}
-\label{algo:seqCIprng}
-\end{figure}
+\end{lstlisting}
+
+
  
-In Figure~\ref{algo:seqCIprng}  a sequential  version of our  chaotic iterations
-based PRNG  is presented.  This version  uses three classical 64  bits PRNG: the
-\texttt{xorshift},  the \texttt{xor128}  and the  \texttt{xorwow}.   These three
-PRNGs  are presented  in~\cite{Marsaglia2003}.   As each  PRNG  used works  with
-64-bits and as  our PRNG works with 32 bits, the  use of \texttt{(unsigned int)}
-selects the 32 least  significant bits whereas \texttt{(unsigned int)(t3$>>$32)}
-selects the 32  most significants bits of the  variable \texttt{t}. This version
-sucesses   the   BigCrush   of    the   TestU01   battery   [P.   L’ecuyer   and
+
+
+In listing~\ref{algo:seqCIprng}  a sequential  version of our  chaotic iterations
+based PRNG  is presented. The xor operator is represented by \textasciicircum. This function  uses three classical  64-bits PRNG: the
+\texttt{xorshift},   the  \texttt{xor128}  and   the  \texttt{xorwow}.   In  the
+following,  we  call them  xor-like  PRNGSs.   These  three PRNGs  are  presented
+in~\cite{Marsaglia2003}.  As each xor-like  PRNG used works with 64-bits  and as our PRNG
+works  with 32-bits, the  use of  \texttt{(unsigned int)}  selects the  32 least
+significant bits  whereas \texttt{(unsigned int)(t3$>>$32)} selects  the 32 most
+significants  bits of the  variable \texttt{t}.  So to  produce a  random number
+realizes 6  xor operations with  6 32-bits numbers  produced by 3  64-bits PRNG.
+This version  successes the  BigCrush of the  TestU01 battery [P.   L’ecuyer and
    R. Simard. Testu01].
  
  \section{Efficient prng based on chaotic iterations on GPU}
  
-On parle du passage du sequentiel au GPU
+In  order to benefit  from computing  power of  GPU, a  program needs  to define
+independent blocks of threads which  can be computed simultaneously. In general,
+the larger the number of threads is,  the more local memory is used and the less
+branching  instructions are  used (if,  while, ...),  the better  performance is
+obtained  on  GPU.  So  with  algorithm  \ref{algo:seqCIprng}  presented in  the
+previous section, it is possible to  build a similar program which computes PRNG
+on  GPU. 
+
+
+\subsection{Naive version for GPU}
+
+From the CPU version, it is possible  to obtain a quite similar version for GPU.
+The principe consists in assigning the computation of a PRNG as in sequential to
+each thread  of the  GPU.  Of course,  it is  essential that the  three xor-like
+PRNGs  used for  our computation  have different  parameters. So  we  chose them
+randomly with  another PRNG. As the  initialisation is performed by  the CPU, we
+have chosen to use the ISAAC PRNG  [ref] to initalize all the parameters for the
+GPU version  of our  PRNG.  The  implementation of the  three xor-like  PRNGs is
+straightforward  as soon  as their  parameters have  been allocated  in  the GPU
+memory. Each xor-like  PRNGs used works with an internal  number $x$ which keeps
+the last generated random numbers. Other internal variables are also used by the
+xor-like PRNGs. More  precisely, the implementation of the  xor128, the xorshift
+and  the xorwow  respectively  require 4,  5  and 6  unsigned  long as  internal
+variables.
+
+\begin{algorithm}
+
+\KwIn{InternalVarXorLikeArray: array with internal variables of the 3 xor-like PRNGs in global memory\;
+NumThreads: Number of threads\;}
+\KwOut{NewNb: array containing random numbers in global memory}
+\If{threadId is concerned} {
+  retrieve data from InternalVarXorLikeArray[threadId] in local variables\;
+  \For{i=1 to n} {
+    compute a new PRNG as in Listing\ref{algo:seqCIprng}\;
+    store the new PRNG in NewNb[NumThreads*threadId+i]\;
+  }
+  store internal variables in InternalVarXorLikeArray[threadId]\;
+}
+
+\caption{main kernel for the chaotic iterations based PRNG GPU naive version}
+\label{algo:gpu_kernel}
+\end{algorithm}
+
+According to  the available  memory in the  GPU and  the number of  threads used
+simultenaously, the number of random numbers that a thread can generate inside a
+kernel     is     limited,      i.e.      the     variable     \texttt{n}     in
+algorithm~\ref{algo:gpu_kernel}. For example, if  $100,000$ threads are used and
+if $n=100$\footnote{in fact, we need to add the initial seed (a 32-bits number)}
+then   the  memory   required   to  store   internals   variables  of   xor-like
+PRNGs\footnote{we multiply this number by $2$ in order to count 32-bits numbers}
+and  random  number of  our  PRNG  is  equals to  $100,000\times  ((4+5+6)\times
+2+(1+100))=1,310,000$ 32-bits numbers, i.e. about $52$Mb.
+
+All the  tests performed  to pass the  BigCrush of TestU01  succeeded. Different
+number of threads have been tested upto $10$ millions.
+
+\begin{remark}
+Algorithm~\ref{algo:gpu_kernel}  has  the  advantage to  manipulate  independent
+PRNGs, so this version is easily usable on a cluster of computer. The only thing
+to ensure is to use a single ISAAC PRNG. For this, a simple solution consists in
+using a master node for the initialization which computes the initial parameters
+for all the differents nodes involves in the computation.
+\end{remark}
  
+\subsection{Improved version for GPU}
+
+As GPU cards using CUDA have shared memory between threads of the same block, it
+is possible  to use this  feature in order  to simplify the  previous algorithm,
+i.e. using  less than 3 xor-like  PRNGs. The solution consists  in comuting only
+one xor-like PRNG  by thread, saving in into shared  memory and accessing result
+of some other threads in the same block of threads.
+
+\begin{algorithm}
+
+\KwIn{InternalVarXorLikeArray: array with internal variables of 1 xor-like PRNGs in global memory\;
+NumThreads: Number of threads\;
+tab1, tab2: Arrays containing permutations\;}
+
+\KwOut{NewNb: array containing random numbers in global memory}
+\If{threadId is concerned} {
+  retrieve data from InternalVarXorLikeArray[threadId] in local variables\;
+  offset = threadId\%32;
+  \For{i=1 to n} {
+    t=xor-like()\;
+    shared\_mem[threadId]=(unsigned int)t\;
+    x = x$\oplus$ (unsigned int) t\;
+    x = x$\oplus$ (unsigned int) (t>>32)\;
+    x = x$\oplus$ shared[tab1[offset]]\;
+    x = x$\oplus$ shared[tab2[offset]]\;
+
+    store the new PRNG in NewNb[NumThreads*threadId+i]\;
+  }
+  store internal variables in InternalVarXorLikeArray[threadId]\;
+}
+
+\caption{main kernel for the chaotic iterations based PRNG GPU efficient version}
+\label{algo:gpu_kernel2}
+\end{algorithm}
  \section{Experiments}
  
-On passe le BigCrush\\
-On donne des temps de générations sur GPU/CPU\\
-On donne des temps de générations de nombre sur GPU puis on rappatrie sur CPU / CPU ? bof bof, on verra
+Differents experiments have been performed in order to measure the generation speed.
+
+
+First of all we have compared the time to generate X random numbers with both the CPU version and the GPU version. 
+
+Faire une courbe du nombre de random en fonction du nombre de threads, éventuellement en fonction du nombres de threads par bloc.
  
  
  \section{Conclusion}