pch

[prng_gpu.git] / prng_gpu.tex
diff --git a/prng_gpu.tex b/prng_gpu.tex

index f21f19e817e2590d6e96c48bf88b3257b4c37b8c..0a88df58f8b1204d8d1f118090fd03f9bcdb229c 100644 (file)
--- a/prng_gpu.tex
+++ b/prng_gpu.tex
@@ -7,7 +7,7 @@
  \usepackage{amscd}
  \usepackage{moreverb}
  \usepackage{commath}
-\usepackage{algorithm2e}
+\usepackage[ruled,vlined]{algorithm2e}
  \usepackage{listings}
  \usepackage[standard]{ntheorem}
  
@@ -216,7 +216,10 @@ We can finally remark that, to the best of our knowledge, no GPU implementation
  \label{section:BASIC RECALLS}
  
  This section is devoted to basic definitions and terminologies in the fields of
-topological chaos and chaotic iterations.
+topological chaos and chaotic iterations. We assume the reader is familiar
+with basic notions on topology (see for instance~\cite{Devaney}).
+
+
  \subsection{Devaney's Chaotic Dynamical Systems}
  
  In the sequel $S^{n}$ denotes the $n^{th}$ term of a sequence $S$ and $V_{i}$
@@ -229,7 +232,7 @@ Consider a topological space $(\mathcal{X},\tau)$ and a continuous function $f :
  \mathcal{X} \rightarrow \mathcal{X}$.
  
  \begin{definition}
-$f$ is said to be \emph{topologically transitive} if, for any pair of open sets
+The function $f$ is said to be \emph{topologically transitive} if, for any pair of open sets
  $U,V \subset \mathcal{X}$, there exists $k>0$ such that $f^k(U) \cap V \neq
  \varnothing$.
  \end{definition}
@@ -248,7 +251,7 @@ necessarily the same period).
  
  
  \begin{definition}[Devaney's formulation of chaos~\cite{Devaney}]
-$f$ is said to be \emph{chaotic} on $(\mathcal{X},\tau)$ if $f$ is regular and
+The function $f$ is said to be \emph{chaotic} on $(\mathcal{X},\tau)$ if $f$ is regular and
  topologically transitive.
  \end{definition}
  
@@ -256,12 +259,12 @@ The chaos property is strongly linked to the notion of ``sensitivity'', defined
  on a metric space $(\mathcal{X},d)$ by:
  
  \begin{definition}
-\label{sensitivity} $f$ has \emph{sensitive dependence on initial conditions}
+\label{sensitivity} The function $f$ has \emph{sensitive dependence on initial conditions}
  if there exists $\delta >0$ such that, for any $x\in \mathcal{X}$ and any
  neighborhood $V$ of $x$, there exist $y\in V$ and $n > 0$ such that
  $d\left(f^{n}(x), f^{n}(y)\right) >\delta $.
  
-$\delta$ is called the \emph{constant of sensitivity} of $f$.
+The constant $\delta$ is called the \emph{constant of sensitivity} of $f$.
  \end{definition}
  
  Indeed, Banks \emph{et al.} have proven in~\cite{Banks92} that when $f$ is
@@ -786,7 +789,7 @@ where $(s^0,s^1, \hdots)$ is the strategy of $Y$, satisfies the properties
  claimed in the lemma.
  \end{proof}
  
-We can now prove the Theorem~\ref{t:chaos des general}...
+We can now prove the Theorem~\ref{t:chaos des general}.
  
  \begin{proof}[Theorem~\ref{t:chaos des general}]
  Firstly, strong transitivity implies transitivity.
@@ -856,7 +859,9 @@ $$
  
  
  
-\lstset{language=C,caption={C code of the sequential PRNG based on chaotic iterations},label=algo:seqCIPRNG}
+
+\lstset{language=C,caption={C code of the sequential PRNG based on chaotic iteration\
+s},label=algo:seqCIPRNG}
  \begin{lstlisting}
  unsigned int CIPRNG() {
    static unsigned int x = 123123123;
@@ -876,19 +881,18 @@ unsigned int CIPRNG() {
  
  
  
+In Listing~\ref{algo:seqCIPRNG} a sequential  version of the proposed PRNG based
+on  chaotic  iterations  is  presented.   The xor  operator  is  represented  by
+\textasciicircum.  This function uses  three classical 64-bits PRNGs, namely the
+\texttt{xorshift},         the          \texttt{xor128},         and         the
+\texttt{xorwow}~\cite{Marsaglia2003}.  In the following, we call them ``xor-like
+PRNGs''.   As each  xor-like PRNG  uses 64-bits  whereas our  proposed generator
+works with 32-bits, we use the command \texttt{(unsigned int)}, that selects the
+32 least  significant bits  of a given  integer, and the  code \texttt{(unsigned
+  int)(t$>>$32)} in order to obtain the 32 most significant bits of \texttt{t}.
  
-In Listing~\ref{algo:seqCIPRNG}  a sequential version of  the proposed PRNG based on chaotic iterations
- is  presented.  The xor operator is  represented by \textasciicircum.
-This  function uses  three classical  64-bits PRNGs, namely the  \texttt{xorshift}, the
-\texttt{xor128},  and  the  \texttt{xorwow}~\cite{Marsaglia2003}.   In  the following,  we  call  them
-``xor-like PRNGs''. 
-As
-each xor-like PRNG  uses 64-bits whereas our proposed generator works with 32-bits,
-we use the command \texttt{(unsigned int)}, that selects the 32 least significant bits of a given integer, and the code
-\texttt{(unsigned int)(t3$>>$32)}  in order to obtain the 32 most significant  bits of \texttt{t}.   
-
-So producing a  pseudorandom number needs  6 xor operations
-with 6 32-bits  numbers that are provided by 3 64-bits PRNGs.   This version successfully passes the
+So producing a pseudorandom number needs 6 xor operations with 6 32-bits numbers
+that  are provided by  3 64-bits  PRNGs.  This  version successfully  passes the
  stringent BigCrush battery of tests~\cite{LEcuyerS07}.
  
  \section{Efficient PRNGs based on Chaotic Iterations on GPU}
@@ -982,12 +986,14 @@ thread uses the result of which other  one, we can use a combination array that
  contains  the indexes  of  all threads  and  for which  a combination has  been
  performed. 
  
-In Algorithm~\ref{algo:gpu_kernel2}, two combination arrays are used.
-The    variable   \texttt{offset}    is    computed   using    the   value    of
+In  Algorithm~\ref{algo:gpu_kernel2},  two  combination  arrays are  used.   The
+variable     \texttt{offset}    is     computed    using     the     value    of
  \texttt{combination\_size}.   Then we  can compute  \texttt{o1}  and \texttt{o2}
-representing the indexes of the  other threads whose results are used
-by the  current one. In  this algorithm, we  consider that a  64-bits xor-like
-PRNG has been chosen, and so its two 32-bits parts are used.
+representing the  indexes of  the other  threads whose results  are used  by the
+current one.   In this algorithm, we  consider that a 32-bits  xor-like PRNG has
+been chosen. In practice, we  use the xor128 proposed in~\cite{Marsaglia2003} in
+which  unsigned longs  (64 bits)  have been  replaced by  unsigned  integers (32
+bits).
  
  This version also can pass the whole {\it BigCrush} battery of tests.
  
@@ -996,28 +1002,28 @@ This version also can pass the whole {\it BigCrush} battery of tests.
  \KwIn{InternalVarXorLikeArray: array with internal variables of 1 xor-like PRNGs
  in global memory\;
  NumThreads: Number of threads\;
-tab1, tab2: Arrays containing combinations of size combination\_size\;}
+array\_comb1, array\_comb2: Arrays containing combinations of size combination\_size\;}
  
  \KwOut{NewNb: array containing random numbers in global memory}
  \If{threadId is concerned} {
    retrieve data from InternalVarXorLikeArray[threadId] in local variables including shared memory and x\;
    offset = threadIdx\%combination\_size\;
-  o1 = threadIdx-offset+tab1[offset]\;
-  o2 = threadIdx-offset+tab2[offset]\;
+  o1 = threadIdx-offset+array\_comb1[offset]\;
+  o2 = threadIdx-offset+array\_comb2[offset]\;
    \For{i=1 to n} {
      t=xor-like()\;
-    t=t $\wedge$ shmem[o1] $\wedge$ shmem[o2]\;
+    t=t\textasciicircum shmem[o1]\textasciicircum shmem[o2]\;
      shared\_mem[threadId]=t\;
-    x = x $\wedge$ t\;
+    x = x\textasciicircum t\;
  
      store the new PRNG in NewNb[NumThreads*threadId+i]\;
    }
    store internal variables in InternalVarXorLikeArray[threadId]\;
  }
  
-\caption{main kernel for the chaotic iterations based PRNG GPU efficient
-version}
-\label{algo:gpu_kernel2}
+\caption{Main kernel for the chaotic iterations based PRNG GPU efficient
+version\label{IR}}
+\label{algo:gpu_kernel2} 
  \end{algorithm}
  
  \subsection{Theoretical Evaluation of the Improved Version}
@@ -1089,13 +1095,12 @@ As a  comparison,   Listing~\ref{algo:seqCIPRNG}  leads   to the  generation of
  
  
  
-In Figure~\ref{fig:time_bbs_gpu}  we highlight the performances  of the optimized
-BBS-based  PRNG on GPU. On the  Tesla C1060 we
-obtain approximately 700MSample/s and on the GTX 280 about 670MSample/s, which is
-obviously slower than the xorlike-based PRNG on GPU. However, we will show in the 
-next sections that 
-this new PRNG has a strong level of security, which is necessary paid by a speed
-reduction. 
+In Figure~\ref{fig:time_bbs_gpu} we highlight  the performances of the optimized
+BBS-based PRNG on GPU.  On  the Tesla C1060 we obtain approximately 700MSample/s
+and  on the  GTX 280  about  670MSample/s, which  is obviously  slower than  the
+xorlike-based PRNG on GPU. However, we  will show in the next sections that this
+new PRNG  has a strong  level of  security, which is  necessary paid by  a speed
+reduction.
  
  \begin{figure}[htbp]
  \begin{center}
@@ -1127,17 +1132,17 @@ In this section the concatenation of two strings $u$ and $v$ is classically
  denoted by $uv$.
  In a cryptographic context, a pseudorandom generator is a deterministic
  algorithm $G$ transforming strings  into strings and such that, for any
-seed $k$ of length $k$, $G(k)$ (the output of $G$ on the input $k$) has size
-$\ell_G(k)$ with $\ell_G(k)>k$.
+seed $m$ of length $m$, $G(m)$ (the output of $G$ on the input $m$) has size
+$\ell_G(m)$ with $\ell_G(m)>m$.
  The notion of {\it secure} PRNGs can now be defined as follows. 
  
  \begin{definition}
  A cryptographic PRNG $G$ is secure if for any probabilistic polynomial time
  algorithm $D$, for any positive polynomial $p$, and for all sufficiently
-large $k$'s,
-$$| \mathrm{Pr}[D(G(U_k))=1]-Pr[D(U_{\ell_G(k)})=1]|< \frac{1}{p(k)},$$
+large $m$'s,
+$$| \mathrm{Pr}[D(G(U_m))=1]-Pr[D(U_{\ell_G(m)})=1]|< \frac{1}{p(m)},$$
  where $U_r$ is the uniform distribution over $\{0,1\}^r$ and the
-probabilities are taken over $U_N$, $U_{\ell_G(N)}$ as well as over the
+probabilities are taken over $U_m$, $U_{\ell_G(m)}$ as well as over the
  internal coin tosses of $D$. 
  \end{definition}
  
@@ -1146,7 +1151,7 @@ distinguish a perfect uniform random generator from $G$ with a non
  negligible probability. The interested reader is referred
  to~\cite[chapter~3]{Goldreich} for more information. Note that it is
  quite easily possible to change the function $\ell$ into any polynomial
-function $\ell^\prime$ satisfying $\ell^\prime(N)>N)$~\cite[Chapter 3.3]{Goldreich}.
+function $\ell^\prime$ satisfying $\ell^\prime(m)>m)$~\cite[Chapter 3.3]{Goldreich}.
  
  The generation schema developed in (\ref{equation Oplus}) is based on a
  pseudorandom generator. Let $H$ be a cryptographic PRNG. We may assume,
@@ -1246,13 +1251,13 @@ very slow and only usable for cryptographic applications.
    
  The modulus operation is the most time consuming operation for current
  GPU cards.  So in order to obtain quite reasonable performances, it is
-required to use only modulus  on 32 bits integer numbers. Consequently
+required to use only modulus  on 32-bits integer numbers. Consequently
  $x_n^2$ need  to be lesser than $2^{32}$,  and thus the number $M$ must be
  lesser than $2^{16}$.  So in practice we can choose prime numbers around
-256 that are congruent to 3 modulus 4.  With 32 bits numbers, only the
+256 that are congruent to 3 modulus 4.  With 32-bits numbers, only the
  4 least significant bits of $x_n$ can be chosen (the maximum number of
  indistinguishable    bits    is    lesser    than   or    equals    to
-$log_2(log_2(M))$). In other words, to generate a  32 bits number, we need to use
+$log_2(log_2(M))$). In other words, to generate a  32-bits number, we need to use
  8 times  the BBS  algorithm with possibly different  combinations of  $M$. This
  approach is  not sufficient to be able to pass  all the TestU01,
  as small values of  $M$ for the BBS  lead to
@@ -1271,9 +1276,9 @@ character  \& is for the  bitwise AND. Thus using  \&7 with  a number
  gives the last 3 bits, providing so a number between 0 and 7.
  \item
  Secondly, after the  generation of the 8 BBS numbers  for each thread, we
-have a 32 bits number whose period is possibly quite small. So
+have a 32-bits number whose period is possibly quite small. So
  to add randomness,  we generate 4 more BBS numbers   to
-shift  the 32 bits  numbers, and  add up to  6 new  bits.  This  improvement is
+shift  the 32-bits  numbers, and  add up to  6 new  bits.  This  improvement is
  described  in Algorithm~\ref{algo:bbs_gpu}.  In  practice, the last 2 bits
  of the first new BBS number are  used to make a left shift of at most
  3 bits. The  last 3 bits of the  second new BBS number are  add to the
@@ -1293,29 +1298,33 @@ variable for BBS number 8 is stored in place 1.
  \KwIn{InternalVarBBSArray: array with internal variables of the 8 BBS
  in global memory\;
  NumThreads: Number of threads\;
-tab: 2D Arrays containing 16 combinations (in first dimension)  of size combination\_size (in second dimension)\;}
+array\_comb: 2D Arrays containing 16 combinations (in first dimension)  of size combination\_size (in second dimension)\;
+array\_shift[4]=\{0,1,3,7\}\;
+}
  
  \KwOut{NewNb: array containing random numbers in global memory}
  \If{threadId is concerned} {
    retrieve data from InternalVarBBSArray[threadId] in local variables including shared memory and x\;
    we consider that bbs1 ... bbs8 represent the internal states of the 8 BBS numbers\;
    offset = threadIdx\%combination\_size\;
-  o1 = threadIdx-offset+tab[bbs1\&7][offset]\;
-  o2 = threadIdx-offset+tab[8+bbs2\&7][offset]\;
+  o1 = threadIdx-offset+array\_comb[bbs1\&7][offset]\;
+  o2 = threadIdx-offset+array\_comb[8+bbs2\&7][offset]\;
    \For{i=1 to n} {
-    t<<=4\;
+    t$<<$=4\;
      t|=BBS1(bbs1)\&15\;
      ...\;
-    t<<=4\;
+    t$<<$=4\;
      t|=BBS8(bbs8)\&15\;
-    //two new shifts\;
-    t<<=BBS3(bbs3)\&3\;
-    t|=BBS1(bbs1)\&7\;
-     t<<=BBS7(bbs7)\&3\;
-    t|=BBS2(bbs2)\&7\;
-    t=t \^{ } shmem[o1] \^{ }    shmem[o2]\;
+    \tcp{two new shifts}
+    shift=BBS3(bbs3)\&3\;
+    t$<<$=shift\;
+    t|=BBS1(bbs1)\&array\_shift[shift]\;
+    shift=BBS7(bbs7)\&3\;
+    t$<<$=shift\;
+    t|=BBS2(bbs2)\&array\_shift[shift]\;
+    t=t\textasciicircum  shmem[o1]\textasciicircum     shmem[o2]\;
      shared\_mem[threadId]=t\;
-    x = x \^{ }    t\;
+    x = x\textasciicircum   t\;
  
      store the new PRNG in NewNb[NumThreads*threadId+i]\;
    }
@@ -1326,31 +1335,30 @@ tab: 2D Arrays containing 16 combinations (in first dimension)  of size combinat
  \label{algo:bbs_gpu}
  \end{algorithm}
  
-In Algorithm~\ref{algo:bbs_gpu}, $n$ is for the quantity
-of random numbers that a thread has to generate.
-The operation t<<=4 performs a left shift of 4 bits
-on the variable  $t$ and stores the result  in $t$, and 
-$BBS1(bbs1)\&15$ selects
-the last  four bits of the result  of $BBS1$. 
-Thus an operation of the form $t<<=4; t|=BBS1(bbs1)\&15\;$
-realizes in $t$ a left shift of 4 bits, and then puts
-the 4 last bits of $BBS1(bbs1)$ in the four last
-positions of $t$.
-Let us remark that to initialize $t$ is not a necessity as we
-fill it 4 bits by 4 bits, until having obtained 32 bits.
-The two last new shifts are realized in order to enlarge
-the small periods of the BBS used here, to introduce a kind of variability.
-In these operations, we make twice a left shift of $t$ of \emph{at most}
-3 bits and we put \emph{exactly} the 3 last bits from a BBS into 
-the 3 last bits of $t$, leading possibly to a loss of a few 
-bits of $t$. 
-
-It should  be noticed that this generator has another time the form $x^{n+1} = x^n \oplus S^n$,
+In Algorithm~\ref{algo:bbs_gpu}, $n$ is for  the quantity of random numbers that
+a thread has to  generate.  The operation t<<=4 performs a left  shift of 4 bits
+on the variable  $t$ and stores the result in  $t$, and $BBS1(bbs1)\&15$ selects
+the last  four bits  of the  result of $BBS1$.   Thus an  operation of  the form
+$t<<=4; t|=BBS1(bbs1)\&15\;$  realizes in $t$ a  left shift of 4  bits, and then
+puts the 4 last bits of $BBS1(bbs1)$  in the four last positions of $t$.  Let us
+remark that the initialization $t$ is not a  necessity as we fill it 4 bits by 4
+bits, until  having obtained 32-bits.  The  two last new shifts  are realized in
+order to enlarge the small periods of  the BBS used here, to introduce a kind of
+variability.  In these operations, we make twice a left shift of $t$ of \emph{at
+  most}  3 bits,  represented by  \texttt{shift} in  the algorithm,  and  we put
+\emph{exactly} the \texttt{shift}  last bits from a BBS  into the \texttt{shift}
+last bits of $t$. For this, an array named \texttt{array\_shift}, containing the
+correspondance between the  shift and the number obtained  with \texttt{shift} 1
+to make the \texttt{and} operation is used. For example, with a left shift of 0,
+we  make an  and operation  with 0,  with  a left  shift of  3, we  make an  and
+operation with 7 (represented by 111 in binary mode).
+
+It should  be noticed that this generator has once more the form $x^{n+1} = x^n \oplus S^n$,
  where $S^n$ is referred in this algorithm as $t$: each iteration of this
  PRNG ends with $x = x \wedge t$. This $S^n$ is only constituted
  by secure bits produced by the BBS generator, and thus, due to
  Proposition~\ref{cryptopreuve}, the resulted PRNG is cryptographically
-secure
+secure.