From 50092af11c4cbab1c3048d46875f66476118537c Mon Sep 17 00:00:00 2001
From: couturie <couturie@carcariass.(none)>
Date: Wed, 14 Sep 2011 22:03:48 +0200
Subject: [PATCH] un peu plus

---
 prng_gpu.tex | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/prng_gpu.tex b/prng_gpu.tex
index 8d76043..dc91965 100644
--- a/prng_gpu.tex
+++ b/prng_gpu.tex
@@ -761,7 +761,7 @@ previous section, it is possible to  build a similar program which computes PRNG
 on  GPU. 
 
 
-\subsection{Naive version}
+\subsection{Naive version for GPU}
 
 From the CPU version, it is possible  to obtain a quite similar version for GPU.
 The principe consists in assigning the computation of a PRNG as in sequential to
@@ -791,7 +791,7 @@ NumThreads: Number of threads\;}
   store internal variables in InternalVarXorLikeArray[threadId]\;
 }
 
-\caption{main kernel for the chaotic iterations based PRNG GPU version}
+\caption{main kernel for the chaotic iterations based PRNG GPU naive version}
 \label{algo:gpu_kernel}
 \end{algorithm}
 
@@ -816,15 +816,40 @@ using a master node for the initialization which computes the initial parameters
 for all the differents nodes involves in the computation.
 \end{remark}
 
-\subsection{Version more suited to GPU}
+\subsection{Improved version for GPU}
 
-As GPU offers  shared memory mechanism between threads of the  same block, it is
-possible to  use this in  order to simplify  the previous algorithm,  i.e. using
-less than 3 xor-like PRNGs. The solution consists in 
+As GPU cards using CUDA have shared memory between threads of the same block, it
+is possible to use this feature in order to simplify the previous algorithm,
+i.e.\ using fewer than 3 xor-like PRNGs. The solution consists in computing only
+one xor-like PRNG per thread, saving it into shared memory and using the results
+of some other threads in the same block of threads.
 
-  threads of the same block  compute a random
-number and uses other random numbers of
+\begin{algorithm}
+
+\KwIn{InternalVarXorLikeArray: array with internal variables of 1 xor-like PRNG per thread in global memory\;
+NumThreads: Number of threads\;
+tab1, tab2: Arrays containing permutations\;}
+
+\KwOut{NewNb: array containing random numbers in global memory}
+\If{threadId is concerned} {
+  retrieve data from InternalVarXorLikeArray[threadId] in local variables\;
+  offset = threadId\%32\;
+  \For{i=1 to n} {
+    t=xor-like()\;
+    shared\_mem[threadId]=(unsigned int)t\;
+    x = x$\oplus$ (unsigned int) t\;
+    x = x$\oplus$ (unsigned int) (t$>>$32)\;
+    x = x$\oplus$ shared\_mem[tab1[offset]]\;
+    x = x$\oplus$ shared\_mem[tab2[offset]]\;
 
+    store the new PRNG in NewNb[NumThreads*threadId+i]\;
+  }
+  store internal variables in InternalVarXorLikeArray[threadId]\;
+}
+
+\caption{main kernel for the chaotic iterations based PRNG GPU efficient version}
+\label{algo:gpu_kernel2}
+\end{algorithm}
 \section{Experiments}
 
 Differents experiments have been performed in order to measure the generation speed.
-- 
2.39.5