petites modifs

[prng_gpu.git] / prng_gpu.tex
diff --git a/prng_gpu.tex b/prng_gpu.tex

index c48aeda1cc9aa4636e434c7410060fbc3c39f26b..d95e87f0e69d9d47dab61d8e1e8cb708628cf177 100644 (file)
--- a/prng_gpu.tex
+++ b/prng_gpu.tex
@@ -906,17 +906,15 @@ tab1, tab2: Arrays containing permutations of size permutation\_size\;}
  
  \KwOut{NewNb: array containing random numbers in global memory}
  \If{threadId is concerned} {
  
  \KwOut{NewNb: array containing random numbers in global memory}
  \If{threadId is concerned} {
-  retrieve data from InternalVarXorLikeArray[threadId] in local variables\;
+  retrieve data from InternalVarXorLikeArray[threadId] in local variables including shared memory\;
    offset = threadIdx\%permutation\_size\;
    o1 = threadIdx-offset+tab1[offset]\;
    o2 = threadIdx-offset+tab2[offset]\;
    \For{i=1 to n} {
      t=xor-like()\;
    offset = threadIdx\%permutation\_size\;
    o1 = threadIdx-offset+tab1[offset]\;
    o2 = threadIdx-offset+tab2[offset]\;
    \For{i=1 to n} {
      t=xor-like()\;
-    shared\_mem[threadId]=(unsigned int)t\;
-    x = x $\oplus$ (unsigned int) t\;
-    x = x $\oplus$ (unsigned int) (t>>32)\;
-    x = x $\oplus$ shared[o1]\;
-    x = x $\oplus$ shared[o2]\;
+    t=t$\oplus$shared\_mem[o1]$\oplus$shared\_mem[o2]\;
+    shared\_mem[threadId]=t\;
+    x = x $\oplus$ t\;
  
      store the new PRNG in NewNb[NumThreads*threadId+i]\;
    }
  
      store the new PRNG in NewNb[NumThreads*threadId+i]\;
    }
@@ -930,9 +928,9 @@ version}
  
  \subsection{Theoretical Evaluation of the Improved Version}
  
  
  \subsection{Theoretical Evaluation of the Improved Version}
  
-A run of Algorithm~\ref{algo:gpu_kernel2} consists in four operations having 
+A run of Algorithm~\ref{algo:gpu_kernel2} consists of three operations having 
  the form of Equation~\ref{equation Oplus}, which is equivalent to the iterative
  the form of Equation~\ref{equation Oplus}, which is equivalent to the iterative
-system of Eq.~\ref{eq:generalIC}. That is, four iterations of the general chaotic
+system of Eq.~\ref{eq:generalIC}. That is, three iterations of the general chaotic
  iterations are realized between two stored values of the PRNG.
  To be certain that we are in the framework of Theorem~\ref{t:chaos des general},
  we must guarantee that this dynamical system iterates on the space 
  iterations are realized between two stored values of the PRNG.
  To be certain that we are in the framework of Theorem~\ref{t:chaos des general},
  we must guarantee that this dynamical system iterates on the space 
@@ -956,18 +954,23 @@ Devaney's formulation of a chaotic behavior.
  
  Different experiments  have been  performed in order  to measure  the generation
  speed. We have used a computer equipped with a Tesla C1060 NVidia GPU card and an
  
  Different experiments  have been  performed in order  to measure  the generation
  speed. We have used a computer equipped with a Tesla C1060 NVidia GPU card and an
-Intel Xeon E5530 cadenced at 2.40 GHz for our experiments.
+Intel Xeon E5530 clocked at 2.40~GHz for our experiments, and we have used
+another one equipped with a less powerful CPU and a GeForce GTX~280. Both
+cards have 240 cores.
  
  In Figure~\ref{fig:time_gpu}  we compare the number of  random numbers generated
  
  In Figure~\ref{fig:time_gpu}  we compare the number of  random numbers generated
-per second.   In order  to obtain the  optimal number  we remove the  storage of
-random numbers  in the GPU memory. This  step is time consumming  and slows down
-the random number  generation.  Moreover, if you are  interested by applications
-that consome  random number directly when  they are generated,  their storage is
-completely useless. In this figure we can see that when the number of threads is
-greater than approximately  30,000 upto 5 millions the  number of random numbers
-generated per second is almost constant.   With the naive version, it is between
-2.5  and  3GSample/s.   With the  optimized  version,  it  is almost  equals  to
-20GSample/s.
+per second. The xor-like PRNG is a xor64 described in~\cite{Marsaglia2003}. In
+order to obtain the optimal performance we remove the storage of random numbers
+in the GPU memory. This step is time-consuming and slows down the random number
+generation. Moreover, if you are interested in applications that consume random
+numbers directly when they are generated, their storage is completely
+useless. In this figure we can see that when the number of threads is greater
+than approximately 30,000 and up to 5 million, the number of random numbers
+generated per second is almost constant. With the naive version, it is between
+2.5 and 3~GSample/s. With the optimized version, it is approximately equal to
+20~GSample/s. Finally, we can remark that both GPU cards are quite similar. In
+practice, the Tesla C1060 has more memory than the GTX~280 and this memory
+should be of better quality.
  
  \begin{figure}[htbp]
  \begin{center}
  
  \begin{figure}[htbp]
  \begin{center}