From 50092af11c4cbab1c3048d46875f66476118537c Mon Sep 17 00:00:00 2001 From: couturie Date: Wed, 14 Sep 2011 22:03:48 +0200 Subject: [PATCH] un peu plus --- prng_gpu.tex | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/prng_gpu.tex b/prng_gpu.tex index 8d76043..dc91965 100644 --- a/prng_gpu.tex +++ b/prng_gpu.tex @@ -761,7 +761,7 @@ previous section, it is possible to build a similar program which computes PRNG on GPU. -\subsection{Naive version} +\subsection{Naive version for GPU} From the CPU version, it is possible to obtain a quite similar version for GPU. The principe consists in assigning the computation of a PRNG as in sequential to @@ -791,7 +791,7 @@ NumThreads: Number of threads\;} store internal variables in InternalVarXorLikeArray[threadId]\; } -\caption{main kernel for the chaotic iterations based PRNG GPU version} +\caption{main kernel for the chaotic iterations based PRNG GPU naive version} \label{algo:gpu_kernel} \end{algorithm} @@ -816,15 +816,40 @@ using a master node for the initialization which computes the initial parameters for all the differents nodes involves in the computation. \end{remark} -\subsection{Version more suited to GPU} +\subsection{Improved version for GPU} -As GPU offers shared memory mechanism between threads of the same block, it is -possible to use this in order to simplify the previous algorithm, i.e. using -less than 3 xor-like PRNGs. The solution consists in +As GPU cards using CUDA have shared memory between threads of the same block, it +is possible to use this feature in order to simplify the previous algorithm, +i.e. using fewer than 3 xor-like PRNGs. The solution consists in computing only +one xor-like PRNG per thread, saving it into shared memory and accessing the results +of some other threads in the same block of threads. 
- threads of the same block compute a random -number and uses other random numbers of +\begin{algorithm} + +\KwIn{InternalVarXorLikeArray: array with internal variables of 1 xor-like PRNG in global memory\; +NumThreads: Number of threads\; +tab1, tab2: Arrays containing permutations\;} + +\KwOut{NewNb: array containing random numbers in global memory} +\If{threadId is concerned} { + retrieve data from InternalVarXorLikeArray[threadId] in local variables\; + offset = threadId\%32\; + \For{i=1 to n} { + t=xor-like()\; + shared\_mem[threadId]=(unsigned int)t\; + x = x$\oplus$ (unsigned int) t\; + x = x$\oplus$ (unsigned int) (t$>>$32)\; + x = x$\oplus$ shared\_mem[tab1[offset]]\; + x = x$\oplus$ shared\_mem[tab2[offset]]\; + store the new PRNG in NewNb[NumThreads*threadId+i]\; + } + store internal variables in InternalVarXorLikeArray[threadId]\; +} + +\caption{main kernel for the chaotic iterations based PRNG GPU efficient version} +\label{algo:gpu_kernel2} +\end{algorithm} \section{Experiments} Differents experiments have been performed in order to measure the generation speed. -- 2.39.5