Merge branch 'master' of ssh://bilbo.iut-bm.univ-fcomte.fr/prng_gpu

[prng_gpu.git] / prng_gpu.tex
diff --git a/prng_gpu.tex b/prng_gpu.tex

index d2e5e0547d736054ce547ef49dd39e7b5f0c1a3c..e5b80cc94aa90c5eaa6dd82c2962c3350f16e94c 100644 (file)
--- a/prng_gpu.tex
+++ b/prng_gpu.tex
@@ -900,7 +900,7 @@ tab1, tab2: Arrays containing permutations of size permutation\_size\;}
  
  \KwOut{NewNb: array containing random numbers in global memory}
  \If{threadId is concerned} {
  
  \KwOut{NewNb: array containing random numbers in global memory}
  \If{threadId is concerned} {
-  retrieve data from InternalVarXorLikeArray[threadId] in local variables including shared memory\;
+  retrieve data from InternalVarXorLikeArray[threadId] in local variables including shared memory and x\;
    offset = threadIdx\%permutation\_size\;
    o1 = threadIdx-offset+tab1[offset]\;
    o2 = threadIdx-offset+tab2[offset]\;
    offset = threadIdx\%permutation\_size\;
    o1 = threadIdx-offset+tab1[offset]\;
    o2 = threadIdx-offset+tab2[offset]\;
@@ -943,6 +943,25 @@ Thus Algorithm~\ref{algo:gpu_kernel2} is a concrete realization of the general
  chaotic iterations presented previously, and for this reason, it satisfies the 
  Devaney's formulation of a chaotic behavior.
  
  chaotic iterations presented previously, and for this reason, it satisfies the 
  Devaney's formulation of a chaotic behavior.
  
+\section{A cryptographically secure prng for GPU}
+
+It is  possible to build a  cryptographically secure prng based  on the previous
+algorithm (algorithm~\ref{algo:gpu_kernel2}).   It simply consists  in replacing
+the  {\it  xor-like} algorithm  by  another  cryptographically  secure prng.  In
+practice, we suggest  to use the BBS algorithm~\cite{BBS}  which takes the form:
+$$x_{n+1}=x_n^2~ mod~ M$$  where $M$ is the product of  two prime numbers. Those
+prime numbers  need to be congruent  to 3 modulus  4. In practice, this  PRNG is
+known to  be slow and  not efficient for  the generation of random  numbers. For
+current  GPU   cards,  the  modulus   operation  is  the  most   time  consuming
+operation. So in  order to obtain quite reasonable  performances, it is required
+to use only modulus on 32  bits integer numbers. Consequently $x_n^2$ need to be
+less than  $2^{32}$ and the  number $M$  need to be  less than $2^{16}$.   So in
+pratice we can  choose prime numbers around 256 that are  congruent to 3 modulus
+4.  With  32 bits numbers,  only the  4 least significant  bits of $x_n$  can be
+chosen  (the   maximum  number  of   undistinguishing  is  less  or   equals  to
+$log_2(log_2(x_n))$). So  to generate a 32 bits  number, we need to  use 8 times
+the BBS algorithm, with different combinations of $M$ is required.
+
  \section{Experiments}
  \label{sec:experiments}
  
  \section{Experiments}
  \label{sec:experiments}
  
@@ -955,8 +974,8 @@ cards have 240 cores.
  In Figure~\ref{fig:time_gpu}  we compare the number of  random numbers generated
  per second. The xor-like prng  is a xor64 described in~\cite{Marsaglia2003}.  In
  order to obtain the optimal performance  we remove the storage of random numbers
  In Figure~\ref{fig:time_gpu}  we compare the number of  random numbers generated
  per second. The xor-like prng  is a xor64 described in~\cite{Marsaglia2003}.  In
  order to obtain the optimal performance  we remove the storage of random numbers
-in the GPU memory. This step is time consumming and slows down the random number
-generation.  Moreover, if you are interested by applications that consome random
+in the GPU memory. This step is time consuming and slows down the random number
+generation.  Moreover, if you are interested by applications that consume random
  numbers  directly   when  they  are  generated,  their   storage  is  completely
  useless. In this  figure we can see  that when the number of  threads is greater
  than approximately 30,000 upto 5 millions the number of random numbers generated
  numbers  directly   when  they  are  generated,  their   storage  is  completely
  useless. In this  figure we can see  that when the number of  threads is greater
  than approximately 30,000 upto 5 millions the number of random numbers generated
@@ -968,10 +987,10 @@ should be of better quality.
  
  \begin{figure}[htbp]
  \begin{center}
  
  \begin{figure}[htbp]
  \begin{center}
-  \includegraphics[scale=.7]{curve_time_gpu.pdf}
+  \includegraphics[scale=.7]{curve_time_xorlike_gpu.pdf}
  \end{center}
  \end{center}
-\caption{Number of random numbers generated per second}
-\label{fig:time_gpu}
+\caption{Number of random numbers generated per second with the xorlike based prng}
+\label{fig:time_xorlike_gpu}
  \end{figure}
  
  
  \end{figure}
  
  
@@ -981,6 +1000,15 @@ In  comparison,   Listing~\ref{algo:seqCIprng}  allows  us   to  generate  about
  
  
  
  
  
  
+\begin{figure}[htbp]
+\begin{center}
+  \includegraphics[scale=.7]{curve_time_bbs_gpu.pdf}
+\end{center}
+\caption{Number of random numbers generated per second with the bbs based prng}
+\label{fig:time_bbs_gpu}
+\end{figure}
+
+
  
  %% \section{Cryptanalysis of the Proposed PRNG}
  
  
  %% \section{Cryptanalysis of the Proposed PRNG}