X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/kahina_paper2.git/blobdiff_plain/3cf07b0e550de12f9012804f80af6c5417d16544..ebc3cd7d745664101ef0a6988b3c5164c260b917:/paper.tex

diff --git a/paper.tex b/paper.tex
index 7237cf6..bae5b94 100644
--- a/paper.tex
+++ b/paper.tex
@@ -779,26 +779,19 @@ implementation of Ehrlich-Aberth method.
 \KwOut{$Z$ (solution vector of roots)}
 Initialize the polynomial $P$ and its derivative $P'$\;
 Set the initial values of vector $Z$\;
-Copy $P$, $P'$ and $Z$ from the CPU memory to the GPU memory\;
+Copy $P$, $P'$ and $Z$ from CPU to GPU\;
 \While{\emph{not convergence}}{
-  $Z_{prev}$ = Kernel\_Save($Z$)\;
-  $Z$ = Kernel\_Update($P,P',Z$)\;
-  Kernel\_Test\_Convergence($Z,Z_{prev},\epsilon$)\;
+  $Z^{prev}$ = KernelSave($Z,n$)\;
+  $Z$ = KernelUpdate($P,P',Z^{prev},n$)\;
+  $\Delta Z$ = KernelComputeError($Z,Z^{prev},n$)\;
+  $\Delta Z_{max}$ = CudaMaxFunction($\Delta Z,n$)\;
+  TestConvergence($\Delta Z_{max},\epsilon$)\;
 }
-Copy $Z$ from the GPU memory to the CPU memory\;
+Copy $Z$ from GPU to CPU\;
 \label{alg1-cuda}
+\RC{Si l'algo vous convient, il faudrait le dÃ©tailler prÃ©cisÃ©ment\LZK{J'ai modifiÃ© l'algo. Sinon, est ce qu'on doit mettre en paramÃ¨tre $Z^{prev}$ ou $Z$ tout court (dans le cas oÃ¹ on exploite l'asynchronisme des threads cuda!) pour le Kernel\_Update? }}
 \end{algorithm}
 
-
-
-
-
-
-
-
-
-
-\RC{Si l'algo vous convient, il faudrait le dÃ©tailler prÃ©cisÃ©ment}
  
 \section{The EA algorithm on Multiple GPUs}
 \label{sec4}
@@ -857,38 +850,67 @@ arrays containing all the roots.
 %% roots sufficiently converge.
 
 
-\begin{algorithm}[h]
-\label{alg2-cuda-openmp}
+%% \begin{algorithm}[h]
+%%   \label{alg2-cuda-openmp}
+%%   \LinesNumbered
+%%   \SetAlgoNoLine
+%%   \caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
+
+%%   \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
+%%     threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num\_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
+
+%%   \KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
+
+%%   \BlankLine
+
+%%   Initialization of P\;
+%%   Initialization of Pu\;
+%%   Initialization of the solution vector $Z^{0}$\;
+%%   Start of a parallel part with OpenMP (Z, $\Delta z$, P are shared variables)\;
+%%   gpu\_id=cudaGetDevice()\; 
+%%   Allocate memory on GPU\;
+%%   Compute local size and offet according to gpu\_id\;
+%%   \While {$error > \epsilon$}{
+%%     copy Z from CPU to GPU\;
+%%     $ ZPrec_{loc}=kernel\_save(Z_{loc})$\;
+%%     $ Z_{loc}=kernel\_update(Z,P,Pu)$\;
+%%     $\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\;
+%%     $  error= Max(\Delta z)$\;
+%%     copy $Z_{loc}$ from GPU to Z in CPU
+%%   }
+%%\end{algorithm}
+
+\begin{algorithm}[htpb]
 \LinesNumbered
 \SetAlgoNoLine
-\caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
-
-\KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
-  threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num\_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
-
-\KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
-
-\BlankLine
-
-Initialization of P\;
-Initialization of Pu\;
-Initialization of the solution vector $Z^{0}$\;
-Start of a parallel part with OpenMP (Z, $\Delta z$, P are shared variables)\;
-gpu\_id=cudaGetDevice()\; 
-Allocate memory on GPU\;
-Compute local size and offet according to gpu\_id\;
-\While {$error > \epsilon$}{
-  copy Z from CPU to GPU\;
-$ ZPrec_{loc}=kernel\_save(Z_{loc})$\;
-$ Z_{loc}=kernel\_update(Z,P,Pu)$\;
-$\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\;
-$  error= Max(\Delta z)$\;
-  copy $Z_{loc}$ from GPU to Z in CPU
+\caption{Finding roots of polynomials with the Ehrlich-Aberth method on multiple GPUs using OpenMP}
+\KwIn{$n$ (polynomial's degree), $\epsilon$ (tolerance threshold), $ngpu$ (number of GPUs)}
+\KwOut{$Z$ (solution vector of roots)}
+Initialize the polynomial $P$ and its derivative $P'$\;
+Set the initial values of vector $Z$\;
+Start of a parallel part with OpenMP ($Z$, $\Delta Z$, $\Delta Z_{max}$, $P$ are shared variables)\;
+$id_{gpu}$ = cudaGetDevice()\;
+$n_{loc}$ = $n/ngpu$ (local size)\;
+%$idx$ = $id_{gpu}\times n_{loc}$ (local offset)\;
+Copy $P$, $P'$ from CPU to GPU\;
+\While{\emph{not convergence}}{
+  Copy $Z$ from CPU to GPU\;
+  $Z^{prev}$ = KernelSave($Z,n$)\;
+  $Z_{loc}$ = KernelUpdate($P,P',Z^{prev},n_{loc}$)\;
+  $\Delta Z_{loc}$ = KernelComputeError($Z_{loc},Z^{prev}_{loc},n_{loc}$)\;
+  $\Delta Z_{max}[id_{gpu}]$ = CudaMaxFunction($\Delta Z_{loc},n_{loc}$)\;
+  Copy $Z_{loc}$ from GPU to $Z$ in CPU\;
+  $max$ = MaxFunction($\Delta Z_{max},ngpu$)\;
+  TestConvergence($max,\epsilon$)\;
 }
+\label{alg2-cuda-openmp}
+\LZK{J'ai modifiÃ© l'algo. Le $P$ est mis shared. Qu'en est-il pour $P'$?}
 \end{algorithm}
 
 
 
+
+
 \subsection{an MPI-CUDA approach}
 %\begin{figure}[htbp]
 %\centering