X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/kahina_paper2.git/blobdiff_plain/3cf07b0e550de12f9012804f80af6c5417d16544..ebc3cd7d745664101ef0a6988b3c5164c260b917:/paper.tex diff --git a/paper.tex b/paper.tex index 7237cf6..bae5b94 100644 --- a/paper.tex +++ b/paper.tex @@ -779,26 +779,19 @@ implementation of Ehrlich-Aberth method. \KwOut{$Z$ (solution vector of roots)} Initialize the polynomial $P$ and its derivative $P'$\; Set the initial values of vector $Z$\; -Copy $P$, $P'$ and $Z$ from the CPU memory to the GPU memory\; +Copy $P$, $P'$ and $Z$ from CPU to GPU\; \While{\emph{not convergence}}{ - $Z_{prev}$ = Kernel\_Save($Z$)\; - $Z$ = Kernel\_Update($P,P',Z$)\; - Kernel\_Test\_Convergence($Z,Z_{prev},\epsilon$)\; + $Z^{prev}$ = KernelSave($Z,n$)\; + $Z$ = KernelUpdate($P,P',Z^{prev},n$)\; + $\Delta Z$ = KernelComputeError($Z,Z^{prev},n$)\; + $\Delta Z_{max}$ = CudaMaxFunction($\Delta Z,n$)\; + TestConvergence($\Delta Z_{max},\epsilon$)\; } -Copy $Z$ from the GPU memory to the CPU memory\; +Copy $Z$ from GPU to CPU\; \label{alg1-cuda} +\RC{Si l'algo vous convient, il faudrait le détailler précisément\LZK{J'ai modifié l'algo. Sinon, est ce qu'on doit mettre en paramètre $Z^{prev}$ ou $Z$ tout court (dans le cas où on exploite l'asynchronisme des threads cuda!) pour le Kernel\_Update? }} \end{algorithm} - - - - - - - - - -\RC{Si l'algo vous convient, il faudrait le détailler précisément} \section{The EA algorithm on Multiple GPUs} \label{sec4} @@ -857,38 +850,67 @@ arrays containing all the roots. %% roots sufficiently converge. -\begin{algorithm}[h] -\label{alg2-cuda-openmp} +%% \begin{algorithm}[h] +%% \label{alg2-cuda-openmp} +%% \LinesNumbered +%% \SetAlgoNoLine +%% \caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method} + +%% \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance +%% threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num\_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)} + +%% \KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)} + +%% \BlankLine + +%% Initialization of P\; +%% Initialization of Pu\; +%% Initialization of the solution vector $Z^{0}$\; +%% Start of a parallel part with OpenMP (Z, $\Delta z$, P are shared variables)\; +%% gpu\_id=cudaGetDevice()\; +%% Allocate memory on GPU\; +%% Compute local size and offet according to gpu\_id\; +%% \While {$error > \epsilon$}{ +%% copy Z from CPU to GPU\; +%% $ ZPrec_{loc}=kernel\_save(Z_{loc})$\; +%% $ Z_{loc}=kernel\_update(Z,P,Pu)$\; +%% $\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\; +%% $ error= Max(\Delta z)$\; +%% copy $Z_{loc}$ from GPU to Z in CPU +%% } +%%\end{algorithm} + +\begin{algorithm}[htpb] \LinesNumbered \SetAlgoNoLine -\caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method} - -\KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance - threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num\_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)} - -\KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)} - -\BlankLine - -Initialization of P\; -Initialization of Pu\; -Initialization of the solution vector $Z^{0}$\; -Start of a parallel part with OpenMP (Z, $\Delta z$, P are shared variables)\; -gpu\_id=cudaGetDevice()\; -Allocate memory on GPU\; -Compute local size and offet according to gpu\_id\; -\While {$error > \epsilon$}{ - copy Z from CPU to GPU\; -$ ZPrec_{loc}=kernel\_save(Z_{loc})$\; -$ Z_{loc}=kernel\_update(Z,P,Pu)$\; -$\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\; -$ error= Max(\Delta z)$\; - copy $Z_{loc}$ from GPU to Z in CPU +\caption{Finding roots of polynomials with the Ehrlich-Aberth method on multiple GPUs using OpenMP} +\KwIn{$n$ (polynomial's degree), $\epsilon$ (tolerance threshold), $ngpu$ (number of GPUs)} +\KwOut{$Z$ (solution vector of roots)} +Initialize the polynomial $P$ and its derivative $P'$\; +Set the initial values of vector $Z$\; +Start of a parallel part with OpenMP ($Z$, $\Delta Z$, $\Delta Z_{max}$, $P$ are shared variables)\; +$id_{gpu}$ = cudaGetDevice()\; +$n_{loc}$ = $n/ngpu$ (local size)\; +%$idx$ = $id_{gpu}\times n_{loc}$ (local offset)\; +Copy $P$, $P'$ from CPU to GPU\; +\While{\emph{not convergence}}{ + Copy $Z$ from CPU to GPU\; + $Z^{prev}$ = KernelSave($Z,n$)\; + $Z_{loc}$ = KernelUpdate($P,P',Z^{prev},n_{loc}$)\; + $\Delta Z_{loc}$ = KernelComputeError($Z_{loc},Z^{prev}_{loc},n_{loc}$)\; + $\Delta Z_{max}[id_{gpu}]$ = CudaMaxFunction($\Delta Z_{loc},n_{loc}$)\; + Copy $Z_{loc}$ from GPU to $Z$ in CPU\; + $max$ = MaxFunction($\Delta Z_{max},ngpu$)\; + TestConvergence($max,\epsilon$)\; } +\label{alg2-cuda-openmp} +\LZK{J'ai modifié l'algo. Le $P$ est mis shared. Qu'en est-il pour $P'$?} \end{algorithm} + + \subsection{an MPI-CUDA approach} %\begin{figure}[htbp] %\centering