X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/prng_gpu.git/blobdiff_plain/2460d1d2aaa1c52db31709934ee63e66cbcb8116..a97a587bc408ff330d3c6292bcf7d0c488ecae16:/prng_gpu.tex

diff --git a/prng_gpu.tex b/prng_gpu.tex
index c48aeda..d95e87f 100644
--- a/prng_gpu.tex
+++ b/prng_gpu.tex
@@ -906,17 +906,15 @@ tab1, tab2: Arrays containing permutations of size permutation\_size\;}
 
 \KwOut{NewNb: array containing random numbers in global memory}
 \If{threadId is concerned} {
-  retrieve data from InternalVarXorLikeArray[threadId] in local variables\;
+  retrieve data from InternalVarXorLikeArray[threadId] into local variables, including shared memory\;
   offset = threadIdx\%permutation\_size\;
   o1 = threadIdx-offset+tab1[offset]\;
   o2 = threadIdx-offset+tab2[offset]\;
   \For{i=1 to n} {
     t=xor-like()\;
-    shared\_mem[threadId]=(unsigned int)t\;
-    x = x $\oplus$ (unsigned int) t\;
-    x = x $\oplus$ (unsigned int) (t>>32)\;
-    x = x $\oplus$ shared[o1]\;
-    x = x $\oplus$ shared[o2]\;
+    t = t $\oplus$ shared\_mem[o1] $\oplus$ shared\_mem[o2]\;
+    shared\_mem[threadId]=t\;
+    x = x $\oplus$ t\;
 
     store the new PRNG in NewNb[NumThreads*threadId+i]\;
   }
@@ -930,9 +928,9 @@ version}
 
 \subsection{Theoretical Evaluation of the Improved Version}
 
-A run of Algorithm~\ref{algo:gpu_kernel2} consists in four operations having 
+A run of Algorithm~\ref{algo:gpu_kernel2} consists of three operations having 
 the form of Equation~\ref{equation Oplus}, which is equivalent to the iterative
-system of Eq.~\ref{eq:generalIC}. That is, four iterations of the general chaotic
+system of Eq.~\ref{eq:generalIC}. That is, three iterations of the general chaotic
 iterations are realized between two stored values of the PRNG.
 To be certain that we are in the framework of Theorem~\ref{t:chaos des general},
 we must guarantee that this dynamical system iterates on the space 
@@ -956,18 +954,23 @@ Devaney's formulation of a chaotic behavior.
 
 Different experiments  have been  performed in order  to measure  the generation
 speed. We have used  a computer equiped with Tesla C1060 NVidia  GPU card and an
-Intel Xeon E5530 cadenced at 2.40 GHz for our experiments.
+Intel Xeon E5530 clocked at 2.40~GHz for our experiments, and we have used
+another one equipped with a less powerful CPU and a GeForce GTX 280. Both
+cards have 240 cores.
 
 In Figure~\ref{fig:time_gpu}  we compare the number of  random numbers generated
-per second.   In order  to obtain the  optimal number  we remove the  storage of
-random numbers  in the GPU memory. This  step is time consumming  and slows down
-the random number  generation.  Moreover, if you are  interested by applications
-that consome  random number directly when  they are generated,  their storage is
-completely useless. In this figure we can see that when the number of threads is
-greater than approximately  30,000 upto 5 millions the  number of random numbers
-generated per second is almost constant.   With the naive version, it is between
-2.5  and  3GSample/s.   With the  optimized  version,  it  is almost  equals  to
-20GSample/s.
+per second. The xor-like PRNG is a xor64 described in~\cite{Marsaglia2003}.  In
+order to obtain the optimal performance we remove the storage of random numbers
+in the GPU memory. This step is time consuming and slows down the random number
+generation.  Moreover, if you are interested in applications that consume random
+numbers directly when they are generated, their storage is completely
+useless. In this figure we can see that when the number of threads is greater
+than approximately 30,000 and up to 5 million, the number of random numbers generated
+per second is almost constant.  With the naive version, it is between 2.5 and
+3~GSample/s.  With the optimized version, it is approximately equal to
+20~GSample/s. Finally, we can remark that both GPU cards are quite similar. In
+practice, the Tesla C1060 has more memory than the GTX 280 and this memory
+should be of better quality.
 
 \begin{figure}[htbp]
 \begin{center}