petites modifs

[prng_gpu.git] / prng_gpu.tex
diff --git a/prng_gpu.tex b/prng_gpu.tex

index c48aeda1cc9aa4636e434c7410060fbc3c39f26b..d95e87f0e69d9d47dab61d8e1e8cb708628cf177 100644 (file)
--- a/prng_gpu.tex
+++ b/prng_gpu.tex
@@ -906,17 +906,15 @@ tab1, tab2: Arrays containing permutations of size permutation\_size\;}
  
  \KwOut{NewNb: array containing random numbers in global memory}
  \If{threadId is concerned} {
-  retrieve data from InternalVarXorLikeArray[threadId] in local variables\;
+  retrieve data from InternalVarXorLikeArray[threadId] in local variables including shared memory\;
    offset = threadIdx\%permutation\_size\;
    o1 = threadIdx-offset+tab1[offset]\;
    o2 = threadIdx-offset+tab2[offset]\;
    \For{i=1 to n} {
      t=xor-like()\;
-    shared\_mem[threadId]=(unsigned int)t\;
-    x = x $\oplus$ (unsigned int) t\;
-    x = x $\oplus$ (unsigned int) (t>>32)\;
-    x = x $\oplus$ shared[o1]\;
-    x = x $\oplus$ shared[o2]\;
+    t=t$\oplus$shared\_mem[o1]$\oplus$shared\_mem[o2]\;
+    shared\_mem[threadId]=t\;
+    x = x $\oplus$ t\;
  
      store the new PRNG in NewNb[NumThreads*threadId+i]\;
    }
@@ -930,9 +928,9 @@ version}
  
  \subsection{Theoretical Evaluation of the Improved Version}
  
-A run of Algorithm~\ref{algo:gpu_kernel2} consists in four operations having 
+A run of Algorithm~\ref{algo:gpu_kernel2} consists of three operations having 
  the form of Equation~\ref{equation Oplus}, which is equivalent to the iterative
-system of Eq.~\ref{eq:generalIC}. That is, four iterations of the general chaotic
+system of Eq.~\ref{eq:generalIC}. That is, three iterations of the general chaotic
  iterations are realized between two stored values of the PRNG.
  To be certain that we are in the framework of Theorem~\ref{t:chaos des general},
  we must guarantee that this dynamical system iterates on the space 
@@ -956,18 +954,23 @@ Devaney's formulation of a chaotic behavior.
  
  Different experiments  have been  performed in order  to measure  the generation
  speed. We have used  a computer equipped with a Tesla C1060 NVIDIA  GPU card and an
-Intel Xeon E5530 cadenced at 2.40 GHz for our experiments.
+Intel  Xeon E5530 clocked  at 2.40  GHz for  our experiments,  and we  have used
+another one  equipped with  a less powerful  CPU and  a GeForce GTX  280. Both
+cards have 240 cores.
  
  In Figure~\ref{fig:time_gpu}  we compare the number of  random numbers generated
-per second.   In order  to obtain the  optimal number  we remove the  storage of
-random numbers  in the GPU memory. This  step is time consumming  and slows down
-the random number  generation.  Moreover, if you are  interested by applications
-that consome  random number directly when  they are generated,  their storage is
-completely useless. In this figure we can see that when the number of threads is
-greater than approximately  30,000 upto 5 millions the  number of random numbers
-generated per second is almost constant.   With the naive version, it is between
-2.5  and  3GSample/s.   With the  optimized  version,  it  is almost  equals  to
-20GSample/s.
+per second. The xor-like PRNG  is the xor64 generator described in~\cite{Marsaglia2003}.  In
+order to obtain the optimal performance  we remove the storage of random numbers
+in the GPU memory. This step is time-consuming and slows down the random number
+generation.  Moreover, if you are interested in applications that consume random
+numbers  directly   when  they  are  generated,  their   storage  is  completely
+useless. In this  figure we can see  that when the number of  threads is greater
+than approximately 30,000, up to 5 million, the number of random numbers generated
+per second  is almost constant.  With the  naive version, it is  between 2.5 and
+3GSample/s.   With  the  optimized   version,  it  is  approximately  equal  to
+20GSample/s. Finally  we can remark  that both GPU  cards are quite  similar. In
+practice,  the Tesla C1060  has more  memory than  the GTX  280 and  this memory
+should be of better quality.
  
  \begin{figure}[htbp]
  \begin{center}