pch

[prng_gpu.git] / prng_gpu.tex
diff --git a/prng_gpu.tex b/prng_gpu.tex

index c646316ef6549811294655cd3dbb38366f54663f..0a88df58f8b1204d8d1f118090fd03f9bcdb229c 100644 (file)
--- a/prng_gpu.tex
+++ b/prng_gpu.tex
@@ -7,7 +7,7 @@
  \usepackage{amscd}
  \usepackage{moreverb}
  \usepackage{commath}
  \usepackage{amscd}
  \usepackage{moreverb}
  \usepackage{commath}
-\usepackage{algorithm2e}
+\usepackage[ruled,vlined]{algorithm2e}
  \usepackage{listings}
  \usepackage[standard]{ntheorem}
  
  \usepackage{listings}
  \usepackage[standard]{ntheorem}
  
@@ -216,7 +216,10 @@ We can finally remark that, to the best of our knowledge, no GPU implementation
  \label{section:BASIC RECALLS}
  
  This section is devoted to basic definitions and terminologies in the fields of
  \label{section:BASIC RECALLS}
  
  This section is devoted to basic definitions and terminologies in the fields of
-topological chaos and chaotic iterations.
+topological chaos and chaotic iterations. We assume the reader is familiar
+with basic notions of topology (see for instance~\cite{Devaney}).
+
+
  \subsection{Devaney's Chaotic Dynamical Systems}
  
  In the sequel $S^{n}$ denotes the $n^{th}$ term of a sequence $S$ and $V_{i}$
  \subsection{Devaney's Chaotic Dynamical Systems}
  
  In the sequel $S^{n}$ denotes the $n^{th}$ term of a sequence $S$ and $V_{i}$
@@ -229,7 +232,7 @@ Consider a topological space $(\mathcal{X},\tau)$ and a continuous function $f :
  \mathcal{X} \rightarrow \mathcal{X}$.
  
  \begin{definition}
  \mathcal{X} \rightarrow \mathcal{X}$.
  
  \begin{definition}
-$f$ is said to be \emph{topologically transitive} if, for any pair of open sets
+The function $f$ is said to be \emph{topologically transitive} if, for any pair of open sets
  $U,V \subset \mathcal{X}$, there exists $k>0$ such that $f^k(U) \cap V \neq
  \varnothing$.
  \end{definition}
  $U,V \subset \mathcal{X}$, there exists $k>0$ such that $f^k(U) \cap V \neq
  \varnothing$.
  \end{definition}
@@ -248,7 +251,7 @@ necessarily the same period).
  
  
  \begin{definition}[Devaney's formulation of chaos~\cite{Devaney}]
  
  
  \begin{definition}[Devaney's formulation of chaos~\cite{Devaney}]
-$f$ is said to be \emph{chaotic} on $(\mathcal{X},\tau)$ if $f$ is regular and
+The function $f$ is said to be \emph{chaotic} on $(\mathcal{X},\tau)$ if $f$ is regular and
  topologically transitive.
  \end{definition}
  
  topologically transitive.
  \end{definition}
  
@@ -256,12 +259,12 @@ The chaos property is strongly linked to the notion of ``sensitivity'', defined
  on a metric space $(\mathcal{X},d)$ by:
  
  \begin{definition}
  on a metric space $(\mathcal{X},d)$ by:
  
  \begin{definition}
-\label{sensitivity} $f$ has \emph{sensitive dependence on initial conditions}
+\label{sensitivity} The function $f$ has \emph{sensitive dependence on initial conditions}
  if there exists $\delta >0$ such that, for any $x\in \mathcal{X}$ and any
  neighborhood $V$ of $x$, there exist $y\in V$ and $n > 0$ such that
  $d\left(f^{n}(x), f^{n}(y)\right) >\delta $.
  
  if there exists $\delta >0$ such that, for any $x\in \mathcal{X}$ and any
  neighborhood $V$ of $x$, there exist $y\in V$ and $n > 0$ such that
  $d\left(f^{n}(x), f^{n}(y)\right) >\delta $.
  
-$\delta$ is called the \emph{constant of sensitivity} of $f$.
+The constant $\delta$ is called the \emph{constant of sensitivity} of $f$.
  \end{definition}
  
  Indeed, Banks \emph{et al.} have proven in~\cite{Banks92} that when $f$ is
  \end{definition}
  
  Indeed, Banks \emph{et al.} have proven in~\cite{Banks92} that when $f$ is
@@ -786,7 +789,7 @@ where $(s^0,s^1, \hdots)$ is the strategy of $Y$, satisfies the properties
  claimed in the lemma.
  \end{proof}
  
  claimed in the lemma.
  \end{proof}
  
-We can now prove the Theorem~\ref{t:chaos des general}...
+We can now prove Theorem~\ref{t:chaos des general}.
  
  \begin{proof}[Theorem~\ref{t:chaos des general}]
  Firstly, strong transitivity implies transitivity.
  
  \begin{proof}[Theorem~\ref{t:chaos des general}]
  Firstly, strong transitivity implies transitivity.
@@ -856,7 +859,9 @@ $$
  
  
  
  
  
  
-\lstset{language=C,caption={C code of the sequential PRNG based on chaotic iterations},label=algo:seqCIPRNG}
+
+\lstset{language=C,caption={C code of the sequential PRNG based on chaotic iterations},%
+label=algo:seqCIPRNG}
  \begin{lstlisting}
  unsigned int CIPRNG() {
    static unsigned int x = 123123123;
  \begin{lstlisting}
  unsigned int CIPRNG() {
    static unsigned int x = 123123123;
@@ -876,7 +881,6 @@ unsigned int CIPRNG() {
  
  
  
  
  
  
-
  In Listing~\ref{algo:seqCIPRNG} a sequential  version of the proposed PRNG based
  on  chaotic  iterations  is  presented.   The xor  operator  is  represented  by
  \textasciicircum.  This function uses  three classical 64-bits PRNGs, namely the
  In Listing~\ref{algo:seqCIPRNG} a sequential  version of the proposed PRNG based
  on  chaotic  iterations  is  presented.   The xor  operator  is  represented  by
  \textasciicircum.  This function uses  three classical 64-bits PRNGs, namely the
@@ -1008,18 +1012,18 @@ array\_comb1, array\_comb2: Arrays containing combinations of size combination\_
    o2 = threadIdx-offset+array\_comb2[offset]\;
    \For{i=1 to n} {
      t=xor-like()\;
    o2 = threadIdx-offset+array\_comb2[offset]\;
    \For{i=1 to n} {
      t=xor-like()\;
-    t=t $\wedge$ shmem[o1] $\wedge$ shmem[o2]\;
+    t=t\textasciicircum shmem[o1]\textasciicircum shmem[o2]\;
      shared\_mem[threadId]=t\;
      shared\_mem[threadId]=t\;
-    x = x $\wedge$ t\;
+    x = x\textasciicircum t\;
  
      store the new PRNG in NewNb[NumThreads*threadId+i]\;
    }
    store internal variables in InternalVarXorLikeArray[threadId]\;
  }
  
  
      store the new PRNG in NewNb[NumThreads*threadId+i]\;
    }
    store internal variables in InternalVarXorLikeArray[threadId]\;
  }
  
-\caption{main kernel for the chaotic iterations based PRNG GPU efficient
-version}
-\label{algo:gpu_kernel2}
+\caption{Main kernel for the chaotic iterations based PRNG GPU efficient
+version\label{IR}}
+\label{algo:gpu_kernel2} 
  \end{algorithm}
  
  \subsection{Theoretical Evaluation of the Improved Version}
  \end{algorithm}
  
  \subsection{Theoretical Evaluation of the Improved Version}
@@ -1091,13 +1095,12 @@ As a  comparison,   Listing~\ref{algo:seqCIPRNG}  leads   to the  generation of
  
  
  
  
  
  
-In Figure~\ref{fig:time_bbs_gpu}  we highlight the performances  of the optimized
-BBS-based  PRNG on GPU. On the  Tesla C1060 we
-obtain approximately 700MSample/s and on the GTX 280 about 670MSample/s, which is
-obviously slower than the xorlike-based PRNG on GPU. However, we will show in the 
-next sections that 
-this new PRNG has a strong level of security, which is necessary paid by a speed
-reduction. 
+In Figure~\ref{fig:time_bbs_gpu} we highlight  the performances of the optimized
+BBS-based PRNG on GPU.  On  the Tesla C1060 we obtain approximately 700MSample/s
+and  on the  GTX 280  about  670MSample/s, which  is obviously  slower than  the
+xorlike-based PRNG on GPU. However, we  will show in the next sections that this
+new PRNG  has a strong  level of  security, which is  necessarily paid for by  a speed
+reduction.
  
  \begin{figure}[htbp]
  \begin{center}
  
  \begin{figure}[htbp]
  \begin{center}
@@ -1129,17 +1132,17 @@ In this section the concatenation of two strings $u$ and $v$ is classically
  denoted by $uv$.
  In a cryptographic context, a pseudorandom generator is a deterministic
  algorithm $G$ transforming strings  into strings and such that, for any
  denoted by $uv$.
  In a cryptographic context, a pseudorandom generator is a deterministic
  algorithm $G$ transforming strings  into strings and such that, for any
-seed $k$ of length $k$, $G(k)$ (the output of $G$ on the input $k$) has size
-$\ell_G(k)$ with $\ell_G(k)>k$.
+seed $m$ of length $m$, $G(m)$ (the output of $G$ on the input $m$) has size
+$\ell_G(m)$ with $\ell_G(m)>m$.
  The notion of {\it secure} PRNGs can now be defined as follows. 
  
  \begin{definition}
  A cryptographic PRNG $G$ is secure if for any probabilistic polynomial time
  algorithm $D$, for any positive polynomial $p$, and for all sufficiently
  The notion of {\it secure} PRNGs can now be defined as follows. 
  
  \begin{definition}
  A cryptographic PRNG $G$ is secure if for any probabilistic polynomial time
  algorithm $D$, for any positive polynomial $p$, and for all sufficiently
-large $k$'s,
-$$| \mathrm{Pr}[D(G(U_k))=1]-Pr[D(U_{\ell_G(k)})=1]|< \frac{1}{p(k)},$$
+large $m$'s,
+$$| \mathrm{Pr}[D(G(U_m))=1]-\mathrm{Pr}[D(U_{\ell_G(m)})=1]|< \frac{1}{p(m)},$$
  where $U_r$ is the uniform distribution over $\{0,1\}^r$ and the
  where $U_r$ is the uniform distribution over $\{0,1\}^r$ and the
-probabilities are taken over $U_N$, $U_{\ell_G(N)}$ as well as over the
+probabilities are taken over $U_m$, $U_{\ell_G(m)}$ as well as over the
  internal coin tosses of $D$. 
  \end{definition}
  
  internal coin tosses of $D$. 
  \end{definition}
  
@@ -1148,7 +1151,7 @@ distinguish a perfect uniform random generator from $G$ with a non
  negligible probability. The interested reader is referred
  to~\cite[chapter~3]{Goldreich} for more information. Note that it is
  quite easily possible to change the function $\ell$ into any polynomial
  negligible probability. The interested reader is referred
  to~\cite[chapter~3]{Goldreich} for more information. Note that it is
  quite easily possible to change the function $\ell$ into any polynomial
-function $\ell^\prime$ satisfying $\ell^\prime(N)>N)$~\cite[Chapter 3.3]{Goldreich}.
+function $\ell^\prime$ satisfying $\ell^\prime(m)>m$~\cite[Chapter 3.3]{Goldreich}.
  
  The generation schema developed in (\ref{equation Oplus}) is based on a
  pseudorandom generator. Let $H$ be a cryptographic PRNG. We may assume,
  
  The generation schema developed in (\ref{equation Oplus}) is based on a
  pseudorandom generator. Let $H$ be a cryptographic PRNG. We may assume,
@@ -1248,13 +1251,13 @@ very slow and only usable for cryptographic applications.
    
  The modulus operation is the most time consuming operation for current
  GPU cards.  So in order to obtain quite reasonable performances, it is
    
  The modulus operation is the most time consuming operation for current
  GPU cards.  So in order to obtain quite reasonable performances, it is
-required to use only modulus  on 32 bits integer numbers. Consequently
+required to use only modulus  on 32-bit integer numbers. Consequently
  $x_n^2$ need  to be lesser than $2^{32}$,  and thus the number $M$ must be
  lesser than $2^{16}$.  So in practice we can choose prime numbers around
  $x_n^2$ need  to be lesser than $2^{32}$,  and thus the number $M$ must be
  lesser than $2^{16}$.  So in practice we can choose prime numbers around
-256 that are congruent to 3 modulus 4.  With 32 bits numbers, only the
+256 that are congruent to 3 modulo 4.  With 32-bit numbers, only the
  4 least significant bits of $x_n$ can be chosen (the maximum number of
  indistinguishable    bits    is    lesser    than   or    equals    to
  4 least significant bits of $x_n$ can be chosen (the maximum number of
  indistinguishable    bits    is    lesser    than   or    equals    to
-$log_2(log_2(M))$). In other words, to generate a  32 bits number, we need to use
+$\log_2(\log_2(M))$). In other words, to generate a  32-bit number, we need to use
  8 times  the BBS  algorithm with possibly different  combinations of  $M$. This
  approach is  not sufficient to be able to pass  all the TestU01,
  as small values of  $M$ for the BBS  lead to
  8 times  the BBS  algorithm with possibly different  combinations of  $M$. This
  approach is  not sufficient to be able to pass  all the TestU01,
  as small values of  $M$ for the BBS  lead to
@@ -1273,9 +1276,9 @@ character  \& is for the  bitwise AND. Thus using  \&7 with  a number
  gives the last 3 bits, providing so a number between 0 and 7.
  \item
  Secondly, after the  generation of the 8 BBS numbers  for each thread, we
  gives the last 3 bits, providing so a number between 0 and 7.
  \item
  Secondly, after the  generation of the 8 BBS numbers  for each thread, we
-have a 32 bits number whose period is possibly quite small. So
+have a 32-bit number whose period is possibly quite small. So
  to add randomness,  we generate 4 more BBS numbers   to
  to add randomness,  we generate 4 more BBS numbers   to
-shift  the 32 bits  numbers, and  add up to  6 new  bits.  This  improvement is
+shift  the 32-bit  numbers, and  add up to  6 new  bits.  This  improvement is
  described  in Algorithm~\ref{algo:bbs_gpu}.  In  practice, the last 2 bits
  of the first new BBS number are  used to make a left shift of at most
  3 bits. The  last 3 bits of the  second new BBS number are  add to the
  described  in Algorithm~\ref{algo:bbs_gpu}.  In  practice, the last 2 bits
  of the first new BBS number are  used to make a left shift of at most
  3 bits. The  last 3 bits of the  second new BBS number are  add to the
@@ -1296,7 +1299,7 @@ variable for BBS number 8 is stored in place 1.
  in global memory\;
  NumThreads: Number of threads\;
  array\_comb: 2D Arrays containing 16 combinations (in first dimension)  of size combination\_size (in second dimension)\;
  in global memory\;
  NumThreads: Number of threads\;
  array\_comb: 2D Arrays containing 16 combinations (in first dimension)  of size combination\_size (in second dimension)\;
-
+array\_shift[4]=\{0,1,3,7\}\;
  }
  
  \KwOut{NewNb: array containing random numbers in global memory}
  }
  
  \KwOut{NewNb: array containing random numbers in global memory}
@@ -1307,19 +1310,21 @@ array\_comb: 2D Arrays containing 16 combinations (in first dimension)  of size
    o1 = threadIdx-offset+array\_comb[bbs1\&7][offset]\;
    o2 = threadIdx-offset+array\_comb[8+bbs2\&7][offset]\;
    \For{i=1 to n} {
    o1 = threadIdx-offset+array\_comb[bbs1\&7][offset]\;
    o2 = threadIdx-offset+array\_comb[8+bbs2\&7][offset]\;
    \For{i=1 to n} {
-    t<<=4\;
+    t$<<$=4\;
      t|=BBS1(bbs1)\&15\;
      ...\;
      t|=BBS1(bbs1)\&15\;
      ...\;
-    t<<=4\;
+    t$<<$=4\;
      t|=BBS8(bbs8)\&15\;
      t|=BBS8(bbs8)\&15\;
-    //two new shifts\;
-    t<<=BBS3(bbs3)\&3\;
-    t|=BBS1(bbs1)\&7\;
-     t<<=BBS7(bbs7)\&3\;
-    t|=BBS2(bbs2)\&7\;
-    t=t \^{ } shmem[o1] \^{ }    shmem[o2]\;
+    \tcp{two new shifts}
+    shift=BBS3(bbs3)\&3\;
+    t$<<$=shift\;
+    t|=BBS1(bbs1)\&array\_shift[shift]\;
+    shift=BBS7(bbs7)\&3\;
+    t$<<$=shift\;
+    t|=BBS2(bbs2)\&array\_shift[shift]\;
+    t=t\textasciicircum  shmem[o1]\textasciicircum     shmem[o2]\;
      shared\_mem[threadId]=t\;
      shared\_mem[threadId]=t\;
-    x = x \^{ }    t\;
+    x = x\textasciicircum   t\;
  
      store the new PRNG in NewNb[NumThreads*threadId+i]\;
    }
  
      store the new PRNG in NewNb[NumThreads*threadId+i]\;
    }
@@ -1330,31 +1335,30 @@ array\_comb: 2D Arrays containing 16 combinations (in first dimension)  of size
  \label{algo:bbs_gpu}
  \end{algorithm}
  
  \label{algo:bbs_gpu}
  \end{algorithm}
  
-In Algorithm~\ref{algo:bbs_gpu}, $n$ is for the quantity
-of random numbers that a thread has to generate.
-The operation t<<=4 performs a left shift of 4 bits
-on the variable  $t$ and stores the result  in $t$, and 
-$BBS1(bbs1)\&15$ selects
-the last  four bits of the result  of $BBS1$. 
-Thus an operation of the form $t<<=4; t|=BBS1(bbs1)\&15\;$
-realizes in $t$ a left shift of 4 bits, and then puts
-the 4 last bits of $BBS1(bbs1)$ in the four last
-positions of $t$.
-Let us remark that to initialize $t$ is not a necessity as we
-fill it 4 bits by 4 bits, until having obtained 32 bits.
-The two last new shifts are realized in order to enlarge
-the small periods of the BBS used here, to introduce a kind of variability.
-In these operations, we make twice a left shift of $t$ of \emph{at most}
-3 bits and we put \emph{exactly} the 3 last bits from a BBS into 
-the 3 last bits of $t$, leading possibly to a loss of a few 
-bits of $t$. 
-
-It should  be noticed that this generator has another time the form $x^{n+1} = x^n \oplus S^n$,
+In Algorithm~\ref{algo:bbs_gpu}, $n$ is for  the quantity of random numbers that
+a thread has to  generate.  The operation t$<<$=4 performs a left  shift of 4 bits
+on the variable  $t$ and stores the result in  $t$, and $BBS1(bbs1)\&15$ selects
+the last  four bits  of the  result of $BBS1$.   Thus an  operation of  the form
+$t<<=4; t|=BBS1(bbs1)\&15\;$  realizes in $t$ a  left shift of 4  bits, and then
+puts the 4 last bits of $BBS1(bbs1)$  in the four last positions of $t$.  Let us
+remark that the initialization of $t$ is not a  necessity as we fill it 4 bits by 4
+bits, until  having obtained 32 bits.  The  two last new shifts  are realized in
+order to enlarge the small periods of  the BBS used here, to introduce a kind of
+variability.  In these operations, we make twice a left shift of $t$ of \emph{at
+  most}  3 bits,  represented by  \texttt{shift} in  the algorithm,  and  we put
+\emph{exactly} the \texttt{shift}  last bits from a BBS  into the \texttt{shift}
+last bits of $t$. For this, an array named \texttt{array\_shift} is used: it gives the
+correspondence between the shift and the mask, made of \texttt{shift} bits set to 1,
+that is applied in the \texttt{and} operation. For example, with a left shift of 0,
+we make an \texttt{and} operation with 0, whereas with a left shift of 3, we make an
+\texttt{and} operation with 7 (represented by 111 in binary).
+
+It should  be noticed that this generator has once more the form $x^{n+1} = x^n \oplus S^n$,
  where $S^n$ is referred in this algorithm as $t$: each iteration of this
  PRNG ends with $x = x \wedge t$. This $S^n$ is only constituted
  by secure bits produced by the BBS generator, and thus, due to
  Proposition~\ref{cryptopreuve}, the resulted PRNG is cryptographically
  where $S^n$ is referred in this algorithm as $t$: each iteration of this
  PRNG ends with $x = x \wedge t$. This $S^n$ is only constituted
  by secure bits produced by the BBS generator, and thus, due to
  Proposition~\ref{cryptopreuve}, the resulted PRNG is cryptographically
-secure
+secure.