From ebc3cd7d745664101ef0a6988b3c5164c260b917 Mon Sep 17 00:00:00 2001
From: zianekhodja
Date: Sun, 17 Jan 2016 01:44:16 +0100
Subject: [PATCH] Modification of algorithms 1 and 2

---
 paper.tex | 102 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 62 insertions(+), 40 deletions(-)

diff --git a/paper.tex b/paper.tex
index 477f14f..bae5b94 100644
--- a/paper.tex
+++ b/paper.tex
@@ -779,26 +779,19 @@ implementation of Ehrlich-Aberth method.
 \KwOut{$Z$ (solution vector of roots)}
 Initialize the polynomial $P$ and its derivative $P'$\;
 Set the initial values of vector $Z$\;
-Copy $P$, $P'$ and $Z$ from the CPU memory to the GPU memory\;
+Copy $P$, $P'$ and $Z$ from CPU to GPU\;
 \While{\emph{not convergence}}{
-  $Z_{prev}$ = Kernel\_Save($Z,n$)\;
-  $Z$ = Kernel\_Update($P,P',Z,n$)\;
-  Kernel\_Test\_Convergence($Z,Z_{prev},n,\epsilon$)\;
+  $Z^{prev}$ = KernelSave($Z,n$)\;
+  $Z$ = KernelUpdate($P,P',Z^{prev},n$)\;
+  $\Delta Z$ = KernelComputeError($Z,Z^{prev},n$)\;
+  $\Delta Z_{max}$ = CudaMaxFunction($\Delta Z,n$)\;
+  TestConvergence($\Delta Z_{max},\epsilon$)\;
 }
-Copy $Z$ from the GPU memory to the CPU memory\;
+Copy $Z$ from GPU to CPU\;
 \label{alg1-cuda}
+\RC{If this algorithm suits you, it should still be detailed precisely\LZK{I have modified the algorithm. Otherwise, should Kernel\_Update take $Z^{prev}$ as a parameter or simply $Z$ (in the case where we exploit the asynchronism of the CUDA threads)?}}
 \end{algorithm}
-
-
-
-
-
-
-
-
-
-\RC{Si l'algo vous convient, il faudrait le détailler précisément}
 
 \section{The EA algorithm on Multiple GPUs}
 \label{sec4}
 
@@ -857,38 +850,67 @@ arrays containing all the roots.
 %% roots sufficiently converge.
 
 
-\begin{algorithm}[h]
-\label{alg2-cuda-openmp}
+%% \begin{algorithm}[h]
+%% \label{alg2-cuda-openmp}
+%% \LinesNumbered
+%% \SetAlgoNoLine
+%% \caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
+
+%% \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
+%% threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num\_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
+
+%% \KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
+
+%% \BlankLine
+
+%% Initialization of P\;
+%% Initialization of Pu\;
+%% Initialization of the solution vector $Z^{0}$\;
+%% Start of a parallel part with OpenMP (Z, $\Delta z$, P are shared variables)\;
+%% gpu\_id=cudaGetDevice()\;
+%% Allocate memory on GPU\;
+%% Compute local size and offet according to gpu\_id\;
+%% \While {$error > \epsilon$}{
+%%   copy Z from CPU to GPU\;
+%%   $ ZPrec_{loc}=kernel\_save(Z_{loc})$\;
+%%   $ Z_{loc}=kernel\_update(Z,P,Pu)$\;
+%%   $\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\;
+%%   $ error= Max(\Delta z)$\;
+%%   copy $Z_{loc}$ from GPU to Z in CPU
+%% }
+%%\end{algorithm}
+
+\begin{algorithm}[htpb]
 \LinesNumbered
 \SetAlgoNoLine
-\caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
-
-\KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
- threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num\_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
-
-\KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
-
-\BlankLine
-
-Initialization of P\;
-Initialization of Pu\;
-Initialization of the solution vector $Z^{0}$\;
-Start of a parallel part with OpenMP (Z, $\Delta z$, P are shared variables)\;
-gpu\_id=cudaGetDevice()\;
-Allocate memory on GPU\;
-Compute local size and offet according to gpu\_id\;
-\While {$error > \epsilon$}{
-  copy Z from CPU to GPU\;
-$ ZPrec_{loc}=kernel\_save(Z_{loc})$\;
-$ Z_{loc}=kernel\_update(Z,P,Pu)$\;
-$\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\;
-$ error= Max(\Delta z)$\;
-  copy $Z_{loc}$ from GPU to Z in CPU
+\caption{Finding roots of polynomials with the Ehrlich-Aberth method on multiple GPUs using OpenMP}
+\KwIn{$n$ (polynomial's degree), $\epsilon$ (tolerance threshold), $ngpu$ (number of GPUs)}
+\KwOut{$Z$ (solution vector of roots)}
+Initialize the polynomial $P$ and its derivative $P'$\;
+Set the initial values of vector $Z$\;
+Start of a parallel part with OpenMP ($Z$, $\Delta Z$, $\Delta Z_{max}$, $P$ are shared variables)\;
+$id_{gpu}$ = cudaGetDevice()\;
+$n_{loc}$ = $n/ngpu$ (local size)\;
+%$idx$ = $id_{gpu}\times n_{loc}$ (local offset)\;
+Copy $P$, $P'$ from CPU to GPU\;
+\While{\emph{not convergence}}{
+  Copy $Z$ from CPU to GPU\;
+  $Z^{prev}$ = KernelSave($Z,n$)\;
+  $Z_{loc}$ = KernelUpdate($P,P',Z^{prev},n_{loc}$)\;
+  $\Delta Z_{loc}$ = KernelComputeError($Z_{loc},Z^{prev}_{loc},n_{loc}$)\;
+  $\Delta Z_{max}[id_{gpu}]$ = CudaMaxFunction($\Delta Z_{loc},n_{loc}$)\;
+  Copy $Z_{loc}$ from GPU to $Z$ in CPU\;
+  $max$ = MaxFunction($\Delta Z_{max},ngpu$)\;
+  TestConvergence($max,\epsilon$)\;
 }
+\label{alg2-cuda-openmp}
+\LZK{I have modified the algorithm. $P$ is declared shared. What about $P'$?}
 \end{algorithm}
+
+
 
 \subsection{an MPI-CUDA approach}
 %\begin{figure}[htbp]
 %\centering
-- 
2.39.5
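
Note on the sketch below: the patch only edits the paper's pseudocode, but the structure of Algorithm 2 maps directly onto an OpenMP/CUDA host loop, so a minimal illustrative sketch may help readers of the diff. Everything in it is an assumption rather than the authors' code: the kernel prototypes kernel_save, kernel_update and kernel_compute_error stand in for the paper's KernelSave, KernelUpdate and KernelComputeError (they are declared but not defined, so the sketch does not link on its own), ea_multi_gpu is a hypothetical wrapper, and the 256-thread block size is an arbitrary choice. It shows one OpenMP thread driving one GPU, each GPU updating its own block of n/ngpu roots, and the host reducing the per-GPU error maxima to test convergence.

// Illustrative sketch only (CUDA C + OpenMP), not the authors' implementation.
// kernel_save, kernel_update and kernel_compute_error are placeholders for the
// kernels described in the paper; they are declared but not defined here.
#include <omp.h>
#include <cuda_runtime.h>
#include <cuComplex.h>
#include <thrust/reduce.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>
#include <math.h>
#include <stdlib.h>

// Assumed kernels (Algorithm 2's KernelSave, KernelUpdate, KernelComputeError).
__global__ void kernel_save(cuDoubleComplex *zprev, const cuDoubleComplex *z, int n);
__global__ void kernel_update(cuDoubleComplex *z, const cuDoubleComplex *zprev,
                              const cuDoubleComplex *p, const cuDoubleComplex *pu,
                              int offset, int nloc, int n);
__global__ void kernel_compute_error(double *dz, const cuDoubleComplex *z,
                                     const cuDoubleComplex *zprev, int offset, int nloc);

// One OpenMP thread per GPU; each GPU updates its own block of n/ngpu roots.
void ea_multi_gpu(cuDoubleComplex *Z, const cuDoubleComplex *P, const cuDoubleComplex *Pu,
                  int n, int ngpu, double eps)
{
    double *dzmax = (double *)calloc(ngpu, sizeof(double)); // per-GPU error maxima (shared)
    int converged = 0;

    #pragma omp parallel num_threads(ngpu) shared(Z, dzmax, converged)
    {
        int id   = omp_get_thread_num();
        int nloc = n / ngpu;              // local block size (n assumed divisible by ngpu)
        int off  = id * nloc;             // offset of this GPU's block of roots
        int thr  = 256, blk = (n + thr - 1) / thr, blk_loc = (nloc + thr - 1) / thr;

        cudaSetDevice(id);
        cuDoubleComplex *dZ, *dZprev, *dP, *dPu;
        double *dDz;
        cudaMalloc((void **)&dZ,     n * sizeof(cuDoubleComplex));
        cudaMalloc((void **)&dZprev, n * sizeof(cuDoubleComplex));
        cudaMalloc((void **)&dP,     (n + 1) * sizeof(cuDoubleComplex));
        cudaMalloc((void **)&dPu,    n * sizeof(cuDoubleComplex));
        cudaMalloc((void **)&dDz,    nloc * sizeof(double));
        cudaMemcpy(dP, P, (n + 1) * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
        cudaMemcpy(dPu, Pu, n * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);

        while (!converged) {
            // Every GPU needs the whole current root vector to update its own block.
            cudaMemcpy(dZ, Z, n * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
            kernel_save<<<blk, thr>>>(dZprev, dZ, n);
            kernel_update<<<blk_loc, thr>>>(dZ, dZprev, dP, dPu, off, nloc, n);
            kernel_compute_error<<<blk_loc, thr>>>(dDz, dZ, dZprev, off, nloc);
            // Maximum of the local errors on the device (the role of CudaMaxFunction).
            dzmax[id] = thrust::reduce(thrust::device, dDz, dDz + nloc, 0.0,
                                       thrust::maximum<double>());
            // Copy only the locally updated block back into the shared host vector Z.
            cudaMemcpy(Z + off, dZ + off, nloc * sizeof(cuDoubleComplex),
                       cudaMemcpyDeviceToHost);

            #pragma omp barrier            // all blocks of Z and all dzmax entries are ready
            #pragma omp single
            {
                double m = 0.0;
                for (int g = 0; g < ngpu; g++) m = fmax(m, dzmax[g]);
                converged = (m <= eps);    // host-side convergence test over all GPUs
            }                              // implicit barrier: every thread sees the flag
        }
        cudaFree(dZ); cudaFree(dZprev); cudaFree(dP); cudaFree(dPu); cudaFree(dDz);
    }
    free(dzmax);
}

As in Algorithm 2, the whole vector $Z$ is sent to every GPU at each iteration because the Ehrlich-Aberth update of one root uses all the other roots; only the locally updated block is copied back to the host. A real implementation would also check CUDA error codes and handle a degree n that is not a multiple of ngpu.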