un peu plus

[prng_gpu.git] / prng_gpu.tex
diff --git a/prng_gpu.tex b/prng_gpu.tex

index 00991e9cb0d3fe30956f7ea5d4f311d801d8b6ca..dc91965a2009f5b3aa21cbdadfad7c89dee1b629 100644 (file)
--- a/prng_gpu.tex
+++ b/prng_gpu.tex
@@ -758,12 +758,18 @@ the larger the number of threads is,  the more local memory is used and the less
  branching  instructions are  used (if,  while, ...),  the better  performance is
  obtained  on  GPU.  So  with  algorithm  \ref{algo:seqCIprng}  presented in  the
  previous section, it is possible to  build a similar program which computes PRNG
  branching  instructions are  used (if,  while, ...),  the better  performance is
  obtained  on  GPU.  So  with  algorithm  \ref{algo:seqCIprng}  presented in  the
  previous section, it is possible to  build a similar program which computes PRNG
-on  GPU. The principe  consists in  assigning the  computation of  a PRNG  as in
-sequential to each thread of the GPU.  Of course, it is essential that the three
-xor-like PRNGs used  for our computation have different  parameters. So we chose
-them randomly with another PRNG. As  the initialisation is performed by the CPU,
-we have chosen to  use the ISAAC PRNG [ref] to initalize  all the parameters for
-the GPU version of our PRNG.   The implementation of the three xor-like PRNGs is
+on  GPU. 
+
+
+\subsection{Naive version for GPU}
+
+From the CPU version, it is possible  to obtain a quite similar version for GPU.
+The principle consists in assigning the computation of a PRNG as in sequential to
+each thread  of the  GPU.  Of course,  it is  essential that the  three xor-like
+PRNGs  used for  our computation  have different  parameters. So  we  chose them
+randomly with  another PRNG. As the  initialisation is performed by  the CPU, we
+have chosen to use the ISAAC PRNG  [ref] to initialize all the parameters for the
+GPU version  of our  PRNG.  The  implementation of the  three xor-like  PRNGs is
  straightforward  as soon  as their  parameters have  been allocated  in  the GPU
  memory. Each xor-like  PRNG used works with an internal  number $x$ which keeps
  the last generated random numbers. Other internal variables are also used by the
  straightforward  as soon  as their  parameters have  been allocated  in  the GPU
  memory. Each xor-like  PRNG used works with an internal  number $x$ which keeps
  the last generated random numbers. Other internal variables are also used by the
@@ -785,7 +791,7 @@ NumThreads: Number of threads\;}
    store internal variables in InternalVarXorLikeArray[threadId]\;
  }
  
    store internal variables in InternalVarXorLikeArray[threadId]\;
  }
  
-\caption{main kernel for the chaotic iterations based PRNG GPU version}
+\caption{main kernel for the chaotic iterations based PRNG GPU naive version}
  \label{algo:gpu_kernel}
  \end{algorithm}
  
  \label{algo:gpu_kernel}
  \end{algorithm}
  
@@ -802,6 +808,48 @@ and  random  number of  our  PRNG  is  equals to  $100,000\times  ((4+5+6)\times
  All the  tests performed  to pass the  BigCrush of TestU01  succeeded. Different
  numbers of threads have been tested, up to $10$ million.
  
  All the  tests performed  to pass the  BigCrush of TestU01  succeeded. Different
  numbers of threads have been tested, up to $10$ million.
  
+\begin{remark}
+Algorithm~\ref{algo:gpu_kernel} has the advantage of manipulating independent
+PRNGs, so this version is easily usable on a cluster of computers. The only thing
+to ensure is to use a single ISAAC PRNG. For this, a simple solution consists in
+using a master node for the initialization which computes the initial parameters
+for all the different nodes involved in the computation.
+\end{remark}
+
+\subsection{Improved version for GPU}
+
+As GPU cards using CUDA have shared memory between threads of the same block, it
+is possible to use this feature in order to simplify the previous algorithm,
+i.e., using fewer than 3 xor-like PRNGs. The solution consists in computing only
+one xor-like PRNG per thread, saving it into shared memory and accessing the results
+of some other threads in the same block of threads.
+
+\begin{algorithm}
+
+\KwIn{InternalVarXorLikeArray: array with internal variables of 1 xor-like PRNG in global memory\;
+NumThreads: Number of threads\;
+tab1, tab2: Arrays containing permutations\;}
+
+\KwOut{NewNb: array containing random numbers in global memory}
+\If{threadId is concerned} {
+  retrieve data from InternalVarXorLikeArray[threadId] in local variables\;
+  offset = threadId\%32\;
+  \For{i=1 to n} {
+    t=xor-like()\;
+    shared\_mem[threadId]=(unsigned int)t\;
+    x = x$\oplus$ (unsigned int) t\;
+    x = x$\oplus$ (unsigned int) (t$>>$32)\;
+    x = x$\oplus$ shared\_mem[tab1[offset]]\;
+    x = x$\oplus$ shared\_mem[tab2[offset]]\;
+
+    store the new PRNG in NewNb[NumThreads*threadId+i]\;
+  }
+  store internal variables in InternalVarXorLikeArray[threadId]\;
+}
+
+\caption{main kernel for the chaotic iterations based PRNG GPU efficient version}
+\label{algo:gpu_kernel2}
+\end{algorithm}
  \section{Experiments}
  
  Different experiments have been performed in order to measure the generation speed.
  \section{Experiments}
  
  Different experiments have been performed in order to measure the generation speed.