modification algo 1 et 2

[kahina_paper2.git] / paper.tex
diff --git a/paper.tex b/paper.tex

index 7237cf69f26c8713fac8be40a0d3d10ab18f7421..bae5b946a9464c92cc243cb25f207bccdb6c1b07 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -779,26 +779,19 @@ implementation of Ehrlich-Aberth method.
  \KwOut{$Z$ (solution vector of roots)}
  Initialize the polynomial $P$ and its derivative $P'$\;
  Set the initial values of vector $Z$\;
-Copy $P$, $P'$ and $Z$ from the CPU memory to the GPU memory\;
+Copy $P$, $P'$ and $Z$ from CPU to GPU\;
  \While{\emph{not convergence}}{
-  $Z_{prev}$ = Kernel\_Save($Z$)\;
-  $Z$ = Kernel\_Update($P,P',Z$)\;
-  Kernel\_Test\_Convergence($Z,Z_{prev},\epsilon$)\;
+  $Z^{prev}$ = KernelSave($Z,n$)\;
+  $Z$ = KernelUpdate($P,P',Z^{prev},n$)\;
+  $\Delta Z$ = KernelComputeError($Z,Z^{prev},n$)\;
+  $\Delta Z_{max}$ = CudaMaxFunction($\Delta Z,n$)\;
+  TestConvergence($\Delta Z_{max},\epsilon$)\;
  }
-Copy $Z$ from the GPU memory to the CPU memory\;
+Copy $Z$ from GPU to CPU\;
  \label{alg1-cuda}
+\RC{Si l'algo vous convient, il faudrait le détailler précisément\LZK{J'ai modifié l'algo. Sinon, est ce qu'on doit mettre en paramètre $Z^{prev}$ ou $Z$ tout court (dans le cas où on exploite l'asynchronisme des threads cuda!) pour le Kernel\_Update? }}
  \end{algorithm}
  
-
-
-
-
-
-
-
-
-
-\RC{Si l'algo vous convient, il faudrait le détailler précisément}
   
  \section{The EA algorithm on Multiple GPUs}
  \label{sec4}
@@ -857,38 +850,67 @@ arrays containing all the roots.
  %% roots sufficiently converge.
  
  
-\begin{algorithm}[h]
-\label{alg2-cuda-openmp}
+%% \begin{algorithm}[h]
+%%   \label{alg2-cuda-openmp}
+%%   \LinesNumbered
+%%   \SetAlgoNoLine
+%%   \caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
+
+%%   \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
+%%     threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num\_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
+
+%%   \KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
+
+%%   \BlankLine
+
+%%   Initialization of P\;
+%%   Initialization of Pu\;
+%%   Initialization of the solution vector $Z^{0}$\;
+%%   Start of a parallel part with OpenMP (Z, $\Delta z$, P are shared variables)\;
+%%   gpu\_id=cudaGetDevice()\; 
+%%   Allocate memory on GPU\;
+%%   Compute local size and offet according to gpu\_id\;
+%%   \While {$error > \epsilon$}{
+%%     copy Z from CPU to GPU\;
+%%     $ ZPrec_{loc}=kernel\_save(Z_{loc})$\;
+%%     $ Z_{loc}=kernel\_update(Z,P,Pu)$\;
+%%     $\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\;
+%%     $  error= Max(\Delta z)$\;
+%%     copy $Z_{loc}$ from GPU to Z in CPU
+%%   }
+%%\end{algorithm}
+
+\begin{algorithm}[htpb]
  \LinesNumbered
  \SetAlgoNoLine
-\caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
-
-\KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
-  threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num\_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
-
-\KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
-
-\BlankLine
-
-Initialization of P\;
-Initialization of Pu\;
-Initialization of the solution vector $Z^{0}$\;
-Start of a parallel part with OpenMP (Z, $\Delta z$, P are shared variables)\;
-gpu\_id=cudaGetDevice()\; 
-Allocate memory on GPU\;
-Compute local size and offet according to gpu\_id\;
-\While {$error > \epsilon$}{
-  copy Z from CPU to GPU\;
-$ ZPrec_{loc}=kernel\_save(Z_{loc})$\;
-$ Z_{loc}=kernel\_update(Z,P,Pu)$\;
-$\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\;
-$  error= Max(\Delta z)$\;
-  copy $Z_{loc}$ from GPU to Z in CPU
+\caption{Finding roots of polynomials with the Ehrlich-Aberth method on multiple GPUs using OpenMP}
+\KwIn{$n$ (polynomial's degree), $\epsilon$ (tolerance threshold), $ngpu$ (number of GPUs)}
+\KwOut{$Z$ (solution vector of roots)}
+Initialize the polynomial $P$ and its derivative $P'$\;
+Set the initial values of vector $Z$\;
+Start of a parallel part with OpenMP ($Z$, $\Delta Z$, $\Delta Z_{max}$, $P$ are shared variables)\;
+$id_{gpu}$ = cudaGetDevice()\;
+$n_{loc}$ = $n/ngpu$ (local size)\;
+%$idx$ = $id_{gpu}\times n_{loc}$ (local offset)\;
+Copy $P$, $P'$ from CPU to GPU\;
+\While{\emph{not convergence}}{
+  Copy $Z$ from CPU to GPU\;
+  $Z^{prev}$ = KernelSave($Z,n$)\;
+  $Z_{loc}$ = KernelUpdate($P,P',Z^{prev},n_{loc}$)\;
+  $\Delta Z_{loc}$ = KernelComputeError($Z_{loc},Z^{prev}_{loc},n_{loc}$)\;
+  $\Delta Z_{max}[id_{gpu}]$ = CudaMaxFunction($\Delta Z_{loc},n_{loc}$)\;
+  Copy $Z_{loc}$ from GPU to $Z$ in CPU\;
+  $max$ = MaxFunction($\Delta Z_{max},ngpu$)\;
+  TestConvergence($max,\epsilon$)\;
  }
+\label{alg2-cuda-openmp}
+\LZK{J'ai modifié l'algo. Le $P$ est mis shared. Qu'en est-il pour $P'$?}
  \end{algorithm}
  
  
  
+
+
  \subsection{an MPI-CUDA approach}
  %\begin{figure}[htbp]
  %\centering