ajout courbe

[prng_gpu.git] / prng_gpu.tex
diff --git a/prng_gpu.tex b/prng_gpu.tex

index 57526f26331a2a8d9bed7cee992d881f7425977d..d1fb7a67a7f66791e989f8b6111d6aa4b3c52ec0 100644 (file)
--- a/prng_gpu.tex
+++ b/prng_gpu.tex
@@ -8,6 +8,7 @@
  \usepackage{moreverb}
  \usepackage{commath}
  \usepackage{algorithm2e}
+\usepackage{listings}
  \usepackage[standard]{ntheorem}
  
  % Pour mathds : les ensembles IR, IN, etc.
@@ -246,7 +247,7 @@ return $x$\;
  \end{algorithm}
  
  \begin{algorithm}[h!]
-\SetAlgoLined
+%\SetAlgoLined                        %%RAPH: cette ligne provoque une erreur chez moi
  \KwIn{the internal configuration $z$ (a 32-bit word)}
  \KwOut{$y$ (a 32-bit word)}
  $z\leftarrow{z\oplus{(z\ll13)}}$\;
@@ -666,60 +667,223 @@ Indeed this result is weaker than the theorem establishing the chaos for the fin
  
  \section{Efficient prng based on chaotic iterations}
  
-On parle du séquentiel avec des nombres 64 bits\\
-
-Faire le lien avec le paragraphe précédent (je considère que la stratégie s'appelle $S^i$\\
-
  In  order to  implement efficiently  a PRNG  based on  chaotic iterations  it is
  possible to improve  previous works [ref]. One solution  consists in considering
-that the  strategy used $S^i$  contains all the  bits for which the  negation is
-achieved out. Then instead of applying  the negation on these bits we can simply
-apply the  xor operator between  the current number  and the strategy  $S^i$. In
+that the  strategy used contains all the  bits for which the  negation is
+achieved out. Then in order to apply  the negation on these bits we can simply
+apply the  xor operator between  the current number  and the strategy. In
  order to obtain the strategy we also use a classical PRNG.
  
-\begin{figure}[htbp]
-\begin{center}
-\fbox{
-\begin{minipage}{14cm}
-unsigned int CIprng() \{\\
-  static unsigned int x = 123123123;\\
-  unsigned long t1 = xorshift();\\
-  unsigned long t2 = xor128();\\
-  unsigned long t3 = xorwow();\\
-  x = x\textasciicircum (unsigned int)t1;\\
-  x = x\textasciicircum (unsigned int)(t2$>>$32);\\
-  x = x\textasciicircum (unsigned int)(t3$>>$32);\\
-  x = x\textasciicircum (unsigned int)t2;\\
-  x = x\textasciicircum (unsigned int)(t1$>>$32);\\
-  x = x\textasciicircum (unsigned int)t3;\\
-  return x;\\
-\}
-\end{minipage}
+Here  is an  example with  16-bits numbers  showing how  the bitwise  operations are
+applied.  Suppose  that $x$ and the  strategy $S^i$ are defined  in binary mode.
+Then the following table shows the result of $x$ xor $S^i$.
+$$
+\begin{array}{|cc|cccccccccccccccc|}
+\hline
+x      &=&1&0&1&1&1&0&1&0&1&0&0&1&0&0&1&0\\
+\hline
+S^i      &=&0&1&1&0&0&1&1&0&1&1&1&0&0&1&1&1\\
+\hline
+x \oplus S^i&=&1&1&0&1&1&1&0&0&0&1&1&1&0&1&0&1\\
+\hline
+
+\hline
+ \end{array}
+$$
+
+%% \begin{figure}[htbp]
+%% \begin{center}
+%% \fbox{
+%% \begin{minipage}{14cm}
+%% unsigned int CIprng() \{\\
+%%   static unsigned int x = 123123123;\\
+%%   unsigned long t1 = xorshift();\\
+%%   unsigned long t2 = xor128();\\
+%%   unsigned long t3 = xorwow();\\
+%%   x = x\textasciicircum (unsigned int)t1;\\
+%%   x = x\textasciicircum (unsigned int)(t2$>>$32);\\
+%%   x = x\textasciicircum (unsigned int)(t3$>>$32);\\
+%%   x = x\textasciicircum (unsigned int)t2;\\
+%%   x = x\textasciicircum (unsigned int)(t1$>>$32);\\
+%%   x = x\textasciicircum (unsigned int)t3;\\
+%%   return x;\\
+%% \}
+%% \end{minipage}
+%% }
+%% \end{center}
+%% \caption{sequential Chaotic Iteration PRNG}
+%% \label{algo:seqCIprng}
+%% \end{figure}
+
+
+
+\lstset{language=C,caption={C code of the sequential chaotic iterations based PRNG},label=algo:seqCIprng}
+\begin{lstlisting}
+unsigned int CIprng() {
+  static unsigned int x = 123123123;
+  unsigned long t1 = xorshift();
+  unsigned long t2 = xor128();
+  unsigned long t3 = xorwow();
+  x = x^(unsigned int)t1;
+  x = x^(unsigned int)(t2>>32);
+  x = x^(unsigned int)(t3>>32);
+  x = x^(unsigned int)t2;
+  x = x^(unsigned int)(t1>>32);
+  x = x^(unsigned int)t3;
+  return x;
  }
-\end{center}
-\caption{sequential Chaotic Iteration PRNG}
-\label{algo:seqCIprng}
-\end{figure}
+\end{lstlisting}
  
-In Figure~\ref{algo:seqCIprng}  a sequential  version of our  chaotic iterations
-based PRNG  is presented.  This version  uses three classical 64  bits PRNG: the
-\texttt{xorshift},  the \texttt{xor128}  and the  \texttt{xorwow}.   These three
-PRNGs  are presented  in~\cite{Marsaglia2003}.   As each  PRNG  used works  with
-64-bits and as  our PRNG works with 32 bits, the  use of \texttt{(unsigned int)}
-selects the 32 least  significant bits whereas \texttt{(unsigned int)(t3$>>$32)}
-selects the 32  most significants bits of the  variable \texttt{t}. This version
-sucesses   the   BigCrush   of    the   TestU01   battery   [P.   L’ecuyer   and
-  R. Simard. Testu01].
+
+
+
+
+In listing~\ref{algo:seqCIprng}  a sequential version of  our chaotic iterations
+based   PRNG    is   presented.   The    xor   operator   is    represented   by
+\textasciicircum.  This   function  uses  three  classical   64-bits  PRNG:  the
+\texttt{xorshift},  the   \texttt{xor128}  and  the   \texttt{xorwow}.   In  the
+following,  we call  them  xor-like  PRNGSs.  These  three  PRNGs are  presented
+in~\cite{Marsaglia2003}.  As each  xor-like PRNG used works with  64-bits and as
+our PRNG works  with 32-bits, the use of \texttt{(unsigned  int)} selects the 32
+least significant bits whereas  \texttt{(unsigned int)(t3$>>$32)} selects the 32
+most  significants bits  of the  variable \texttt{t}.   So to  produce  a random
+number realizes  6 xor operations with  6 32-bits numbers produced  by 3 64-bits
+PRNG.  This version successes the  BigCrush of the TestU01 battery [P.  L’ecuyer
+  and R. Simard. Testu01].
  
  \section{Efficient prng based on chaotic iterations on GPU}
  
-On parle du passage du sequentiel au GPU
+In  order to benefit  from computing  power of  GPU, a  program needs  to define
+independent blocks of threads which  can be computed simultaneously. In general,
+the larger the number of threads is,  the more local memory is used and the less
+branching  instructions are  used (if,  while, ...),  the better  performance is
+obtained  on  GPU.  So  with  algorithm  \ref{algo:seqCIprng}  presented in  the
+previous section, it is possible to  build a similar program which computes PRNG
+on  GPU. In  the CUDA  [ref] environment,  threads have  a  local identificator,
+called \texttt{ThreadIdx} relative to the block containing them.
+
+
+\subsection{Naive version for GPU}
+
+From the CPU version, it is possible  to obtain a quite similar version for GPU.
+The principe consists in assigning the computation of a PRNG as in sequential to
+each thread  of the  GPU.  Of course,  it is  essential that the  three xor-like
+PRNGs  used for  our computation  have different  parameters. So  we  chose them
+randomly with  another PRNG. As the  initialisation is performed by  the CPU, we
+have chosen to use the ISAAC PRNG  [ref] to initalize all the parameters for the
+GPU version  of our  PRNG.  The  implementation of the  three xor-like  PRNGs is
+straightforward  as soon  as their  parameters have  been allocated  in  the GPU
+memory. Each xor-like  PRNGs used works with an internal  number $x$ which keeps
+the last generated random numbers. Other internal variables are also used by the
+xor-like PRNGs. More  precisely, the implementation of the  xor128, the xorshift
+and  the xorwow  respectively  require 4,  5  and 6  unsigned  long as  internal
+variables.
+
+\begin{algorithm}
+
+\KwIn{InternalVarXorLikeArray: array with internal variables of the 3 xor-like PRNGs in global memory\;
+NumThreads: Number of threads\;}
+\KwOut{NewNb: array containing random numbers in global memory}
+\If{threadIdx is concerned by the computation} {
+  retrieve data from InternalVarXorLikeArray[threadIdx] in local variables\;
+  \For{i=1 to n} {
+    compute a new PRNG as in Listing\ref{algo:seqCIprng}\;
+    store the new PRNG in NewNb[NumThreads*threadIdx+i]\;
+  }
+  store internal variables in InternalVarXorLikeArray[threadIdx]\;
+}
+
+\caption{main kernel for the chaotic iterations based PRNG GPU naive version}
+\label{algo:gpu_kernel}
+\end{algorithm}
+
+Algorithm~\ref{algo:gpu_kernel}  presents a naive  implementation of  PRNG using
+GPU.  According  to the available  memory in the  GPU and the number  of threads
+used simultenaously,  the number  of random numbers  that a thread  can generate
+inside   a    kernel   is   limited,   i.e.    the    variable   \texttt{n}   in
+algorithm~\ref{algo:gpu_kernel}. For example, if  $100,000$ threads are used and
+if $n=100$\footnote{in fact, we need to add the initial seed (a 32-bits number)}
+then   the  memory   required   to  store   internals   variables  of   xor-like
+PRNGs\footnote{we multiply this number by $2$ in order to count 32-bits numbers}
+and  random  number of  our  PRNG  is  equals to  $100,000\times  ((4+5+6)\times
+2+(1+100))=1,310,000$ 32-bits numbers, i.e. about $52$Mb.
+
+All the  tests performed  to pass the  BigCrush of TestU01  succeeded. Different
+number of threads, called \texttt{NumThreads} in our algorithm, have been tested
+upto $10$ millions.
+
+\begin{remark}
+Algorithm~\ref{algo:gpu_kernel}  has  the  advantage to  manipulate  independent
+PRNGs, so this version is easily usable on a cluster of computer. The only thing
+to ensure is to use a single ISAAC PRNG. For this, a simple solution consists in
+using a master node for the initialization which computes the initial parameters
+for all the differents nodes involves in the computation.
+\end{remark}
+
+\subsection{Improved version for GPU}
+
+As GPU cards using CUDA have shared memory between threads of the same block, it
+is possible  to use this  feature in order  to simplify the  previous algorithm,
+i.e. using less  than 3 xor-like PRNGs. The solution  consists in computing only
+one xor-like PRNG by thread, saving  it into shared memory and using the results
+of some  other threads in the  same block of  threads. In order to  define which
+thread uses the result of which other  one, we can use a permutation array which
+contains  the indexes  of  all threads  and  for which  a  permutation has  been
+performed.  In Algorithm~\ref{algo:gpu_kernel2}, 2 permutations arrays are used.
+The    variable   \texttt{offset}    is    computed   using    the   value    of
+\texttt{permutation\_size}.   Then we  can compute  \texttt{o1}  and \texttt{o2}
+which represent the indexes of the  other threads for which the results are used
+by the  current thread. In  the algorithm, we  consider that a  64-bits xor-like
+PRNG is used, that is why both 32-bits parts are used.
+
+This version also succeed to the BigCrush batteries of tests.
+
+\begin{algorithm}
+
+\KwIn{InternalVarXorLikeArray: array with internal variables of 1 xor-like PRNGs in global memory\;
+NumThreads: Number of threads\;
+tab1, tab2: Arrays containing permutations of size permutation\_size\;}
+
+\KwOut{NewNb: array containing random numbers in global memory}
+\If{threadId is concerned} {
+  retrieve data from InternalVarXorLikeArray[threadId] in local variables\;
+  offset = threadIdx\%permutation\_size\;
+  o1 = threadIdx-offset+tab1[offset]\;
+  o2 = threadIdx-offset+tab2[offset]\;
+  \For{i=1 to n} {
+    t=xor-like()\;
+    shared\_mem[threadId]=(unsigned int)t\;
+    x = x $\oplus$ (unsigned int) t\;
+    x = x $\oplus$ (unsigned int) (t>>32)\;
+    x = x $\oplus$ shared[o1]\;
+    x = x $\oplus$ shared[o2]\;
+
+    store the new PRNG in NewNb[NumThreads*threadId+i]\;
+  }
+  store internal variables in InternalVarXorLikeArray[threadId]\;
+}
+
+\caption{main kernel for the chaotic iterations based PRNG GPU efficient version}
+\label{algo:gpu_kernel2}
+\end{algorithm}
+
+
  
  \section{Experiments}
  
-On passe le BigCrush\\
-On donne des temps de générations sur GPU/CPU\\
-On donne des temps de générations de nombre sur GPU puis on rappatrie sur CPU / CPU ? bof bof, on verra
+Differents experiments have been performed in order to measure the generation speed.
+\begin{figure}[t]
+\begin{center}
+  \includegraphics[scale=.5]{curve_time_gpu.pdf}
+
+\end{center}
+\caption{Number of random numbers generated per second}
+\label{fig:time_naive_gpu}
+\end{figure}
+
+First of all we have compared the time to generate X random numbers with both the CPU version and the GPU version. 
+
+Faire une courbe du nombre de random en fonction du nombre de threads, éventuellement en fonction du nombres de threads par bloc.
  
  
  \section{Conclusion}