-%% \begin{enumerate}
-%% \begin{algorithm}[htpb]
-%% \label{alg2-cuda-openmp}
-%% %\LinesNumbered
-%% \caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
-
-%% \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
-%% threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
-
-%% \KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
-
-%% \BlankLine
-
-%% \item Initialization of P\;
-%% \item Initialization of Pu\;
-%% \item Initialization of the solution vector $Z^{0}$\;
-%% \verb=omp_set_num_threads(num_gpus);=
-%% \verb=#pragma omp parallel shared(Z,$\Delta$ z,P);=
-%% \verb=cudaGetDevice(gpu_id);=
-%% \item Allocate and copy initial data from CPU memory to the GPU global memories\;
-%% \item index= $Size/num\_gpus$\;
-%% \item k=0\;
-%% \While {$error > \epsilon$}{
-%% \item Let $\Delta z=0$\;
-%% \item $ kernel\_save(ZPrec,Z)$\;
-%% \item k=k+1\;
-%% \item $ kernel\_update(Z,P,Pu,index)$\;
-%% \item $kernel\_testConverge(\Delta z[gpu\_id],Z,ZPrec)$\;
-%% %\verb=#pragma omp barrier;=
-%% \item error= Max($\Delta z$)\;
-%% }
-
-%% \item Copy results from GPU memories to CPU memory\;
-%% \end{algorithm}
-%% \end{enumerate}
-%% ~\\
-%% \RC{C'est encore pire ici, on ne voit pas les comm CPU <-> GPU }
-
-
-\subsection{Multi-GPU : an MPI-CUDA approach}
+%% \begin{algorithm}[h]
+%% \label{alg2-cuda-openmp}
+%% \LinesNumbered
+%% \SetAlgoNoLine
+%% \caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
+
+%% \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
+%% threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num\_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
+
+%% \KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
+
+%% \BlankLine
+
+%% Initialization of P\;
+%% Initialization of Pu\;
+%% Initialization of the solution vector $Z^{0}$\;
+%% Start of a parallel part with OpenMP (Z, $\Delta z$, P are shared variables)\;
+%% gpu\_id=cudaGetDevice()\;
+%% Allocate memory on GPU\;
+%% Compute local size and offset according to gpu\_id\;
+%% \While {$error > \epsilon$}{
+%% copy Z from CPU to GPU\;
+%% $ ZPrec_{loc}=kernel\_save(Z_{loc})$\;
+%% $ Z_{loc}=kernel\_update(Z,P,Pu)$\;
+%% $\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\;
+%% $ error= Max(\Delta z)$\;
+%% copy $Z_{loc}$ from GPU to Z in CPU
+%% }
+%%\end{algorithm}
+
+\begin{algorithm}[htpb]
+\LinesNumbered
+\SetAlgoNoLine
+\caption{Finding roots of polynomials with the Ehrlich-Aberth method on multiple GPUs using OpenMP}
+\label{alg2-cuda-openmp}% keep directly after \caption so \ref yields the algorithm number, not a line number
+\KwIn{$n$ (polynomial's degree), $\epsilon$ (tolerance threshold), $ngpu$ (number of GPUs)}
+\KwOut{$Z$ (solution vector of roots)}
+Initialize the polynomial $P$ and its derivative $P'$\;
+Set the initial values of vector $Z$\;
+Start of a parallel part with OpenMP ($Z$, $\Delta Z$, $\Delta Z_{max}$, $P$ are shared variables)\;
+$id_{gpu}$ = cudaGetDevice()\;
+$n_{loc}$ = $n/ngpu$ (local size)\;
+%$idx$ = $id_{gpu}\times n_{loc}$ (local offset)\;
+Copy $P$, $P'$ from CPU to GPU\;
+\While{\emph{not convergence}}{
+ Copy $Z$ from CPU to GPU\;
+ $Z^{prev}$ = KernelSave($Z,n$)\;
+ $Z_{loc}$ = KernelUpdate($P,P',Z^{prev},n_{loc}$)\;
+ $\Delta Z_{loc}$ = KernelComputeError($Z_{loc},Z^{prev}_{loc},n_{loc}$)\;
+ $\Delta Z_{max}[id_{gpu}]$ = CudaMaxFunction($\Delta Z_{loc},n_{loc}$)\;
+ Copy $Z_{loc}$ from GPU to $Z$ in CPU\;
+ $max$ = MaxFunction($\Delta Z_{max},ngpu$)\;
+ TestConvergence($max,\epsilon$)\;
+}
+\LZK{J'ai modifié l'algo. Le $P$ est mis shared. Qu'en est-il pour $P'$?}
+\end{algorithm}
+
+
+
+
+
+\subsection{An MPI-CUDA approach}