une contrib à signaler

[kahina_paper2.git] / paper.tex
diff --git a/paper.tex b/paper.tex

index 2ee181ae60f1fe2012fbe30549aa396a4c097521..2d47f2266da4954109cc2b793fe6b804b1b02bb0 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -1,17 +1,8 @@
-
-
-
  \documentclass[conference]{IEEEtran}
  \documentclass[conference]{IEEEtran}
-
  \usepackage[ruled,vlined]{algorithm2e}
  \usepackage[ruled,vlined]{algorithm2e}
-
-
  \hyphenation{op-tical net-works semi-conduc-tor}
  \hyphenation{op-tical net-works semi-conduc-tor}
-
  \bibliographystyle{IEEEtran}
  
  \bibliographystyle{IEEEtran}
  
-
-
  \usepackage{amsfonts}
  \usepackage[utf8]{inputenc}
  \usepackage[T1]{fontenc}
  \usepackage{amsfonts}
  \usepackage[utf8]{inputenc}
  \usepackage[T1]{fontenc}
@@ -26,8 +17,6 @@
    \todo[color=orange!10,#1]{\sffamily\textbf{AS:} #2}\xspace}
  
  
    \todo[color=orange!10,#1]{\sffamily\textbf{AS:} #2}\xspace}
  
  
-
-
  \begin{document}
  
  \title{Two parallel implementations of Ehrlich-Aberth algorithm for root-finding of polynomials on multiple GPUs with OpenMP and MPI}
  \begin{document}
  
  \title{Two parallel implementations of Ehrlich-Aberth algorithm for root-finding of polynomials on multiple GPUs with OpenMP and MPI}
@@ -120,10 +109,11 @@ In this paper we propose the parallelization of Ehrlich-Aberth method which has
  \item The parallel implementation of Ehrlich-Aberth algorithm on a multi-GPU platform with a distributed memory using MPI API, such that each GPU is attached and managed by a MPI process. The GPUs exchange their data by message-passing communications. 
   \end{itemize}
  This latter approach is more used on clusters to solve very complex problems that are too large for traditional supercomputers, which are very expensive to build and run.
  \item The parallel implementation of Ehrlich-Aberth algorithm on a multi-GPU platform with a distributed memory using MPI API, such that each GPU is attached and managed by a MPI process. The GPUs exchange their data by message-passing communications. 
   \end{itemize}
  This latter approach is more used on clusters to solve very complex problems that are too large for traditional supercomputers, which are very expensive to build and run.
-\LZK{Pas d'autres contributions possibles? J'ai supprimé les deux premiers points proposés précédemment.}
+\LZK{Pas d'autres contributions possibles? J'ai supprimé les deux premiers points proposés précédemment.
+\AS{La résolution du problème pour des polynomes pleins de degré 6M est une contribution aussi à mon avis}}
  
  The paper is organized as follows. In Section~\ref{sec2} we present three different parallel programming models OpenMP, MPI and CUDA. In Section~\ref{sec3} we present the implementation of the Ehrlich-Aberth algorithm on a single GPU. In Section~\ref{sec4} we present the parallel implementations of the Ehrlich-Aberth algorithm on multiple GPUs using the OpenMP and MPI approaches. In section~\ref{sec5} we present our experiments and discuss them. Finally, Section~\ref{sec6} concludes this paper and gives some hints for future research directions in this topic. 
  
  The paper is organized as follows. In Section~\ref{sec2} we present three different parallel programming models OpenMP, MPI and CUDA. In Section~\ref{sec3} we present the implementation of the Ehrlich-Aberth algorithm on a single GPU. In Section~\ref{sec4} we present the parallel implementations of the Ehrlich-Aberth algorithm on multiple GPUs using the OpenMP and MPI approaches. In section~\ref{sec5} we present our experiments and discuss them. Finally, Section~\ref{sec6} concludes this paper and gives some hints for future research directions in this topic. 
-%\LZK{A revoir toute cette organization: je viens de la revoir}
+
  
  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  
  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -247,20 +237,14 @@ implementation of Ehrlich-Aberth method.
  Initialize the polynomial $P$ and its derivative $P'$\;
  Set the initial values of vector $Z$\;
  Copy $P$, $P'$ and $Z$ from CPU to GPU\;
  Initialize the polynomial $P$ and its derivative $P'$\;
  Set the initial values of vector $Z$\;
  Copy $P$, $P'$ and $Z$ from CPU to GPU\;
-\While{\emph{not convergence}}{
+\While{$\Delta Z_{max} > \epsilon$}{
    $Z^{prev}$ = KernelSave($Z,n$)\;
    $Z$ = KernelUpdate($P,P',Z,n$)\;
    $Z^{prev}$ = KernelSave($Z,n$)\;
    $Z$ = KernelUpdate($P,P',Z,n$)\;
-  $\Delta Z$ = KernelComputeError($Z,Z^{prev},n$)\;
-  $\Delta Z_{max}$ = CudaMaxFunction($\Delta Z,n$)\;
-  TestConvergence($\Delta Z_{max},\epsilon$)\;
+  $\Delta Z_{max}$ = KernelComputeError($Z,Z^{prev},n$)\;
  }
  Copy $Z$ from GPU to CPU\;
  \label{alg1-cuda}
  }
  Copy $Z$ from GPU to CPU\;
  \label{alg1-cuda}
-\LZK{J'ai modifié l'algo. Sinon, est ce qu'on doit mettre en paramètre
-  $Z^{prev}$ ou $Z$ tout court (dans le cas où on exploite
-  l'asynchronisme des threads cuda!) pour le Kernel\_Update? }
-\RC{Le $Z_{prev}$ sert à calculer l'erreur donc j'ai remis Z. La ligne
-avec TestConvergence ca fait une ligne de plus.}
+\RC{La ligne avec TestConvergence ca fait une ligne de plus.\LZK{Oui j'ai hésité à l'ajouter. On peut faire le test dans la condition de while mais quelle est la valeur initiale de $\Delta Z_{max}$?! Ou bien on s'en fiche?}}
  \end{algorithm}
  
   
  \end{algorithm}
  
   
@@ -301,28 +285,24 @@ arrays containing all the roots.
  \KwOut{$Z$ (solution vector of roots)}
  Initialize the polynomial $P$ and its derivative $P'$\;
  Set the initial values of vector $Z$\;
  \KwOut{$Z$ (solution vector of roots)}
  Initialize the polynomial $P$ and its derivative $P'$\;
  Set the initial values of vector $Z$\;
-Start of a parallel part with OpenMP ($Z$, $\Delta Z$, $\Delta
-Z_{max}$, $P$, $P'$ are shared variables)\;
+Start of a parallel part with OpenMP ($Z$, $\Delta Z_{max}$, $P$, $P'$ are shared variables)\;
  $id_{gpu}$ = cudaGetDevice()\;
  $n_{loc}$ = $n/ngpu$ (local size)\;
  $id_{gpu}$ = cudaGetDevice()\;
  $n_{loc}$ = $n/ngpu$ (local size)\;
-%$idx$ = $id_{gpu}\times n_{loc}$ (local offset)\;
+$offset$ = $id_{gpu}\times n_{loc}$ (local offset)\;
  Copy $P$, $P'$ from CPU to GPU\;
  Copy $P$, $P'$ from CPU to GPU\;
-\While{\emph{not convergence}}{
+\While{$max > \epsilon$}{
    Copy $Z$ from CPU to GPU\;
    $Z^{prev}$ = KernelSave($Z,n$)\;
    Copy $Z$ from CPU to GPU\;
    $Z^{prev}$ = KernelSave($Z,n$)\;
-  $Z_{loc}$ = KernelUpdate($P,P',Z^{prev},n_{loc}$)\;
-  $\Delta Z_{loc}$ = KernelComputeError($Z_{loc},Z^{prev}_{loc},n_{loc}$)\;
-  $\Delta Z_{max}[id_{gpu}]$ = CudaMaxFunction($\Delta Z_{loc},n_{loc}$)\;
-  Copy $Z_{loc}$ from GPU to $Z$ in CPU\;
-  $max$ = MaxFunction($\Delta Z_{max},ngpu$)\;
-  TestConvergence($max,\epsilon$)\;
+  $Z[offset]$ = KernelUpdate($P,P',Z,n_{loc}$)\;
+  $\Delta Z_{max}[id_{gpu}]$ = KernelComputeError($Z[offset],Z^{prev}[offset],n_{loc}$)\;
+  Copy $Z[offset]$ from GPU to $Z$ in CPU\;
+  $max$ = MaxFunction($\Delta Z_{max},ngpu$)\; 
  }
  \label{alg2-cuda-openmp}
  }
  \label{alg2-cuda-openmp}
-\LZK{J'ai modifié l'algo. Le $P$ est mis shared. Qu'en est-il pour
-  $P'$?}\RC{Je l'ai rajouté. Bon sinon le n\_loc ne remplace pas
+\RC{Je l'ai rajouté. Bon sinon le n\_loc ne remplace pas
    vraiment un offset et une taille mais bon... et là il y a 4 lignes
    pour la convergence, c'est bcp ... Zloc, Zmax, max et
    vraiment un offset et une taille mais bon... et là il y a 4 lignes
    pour la convergence, c'est bcp ... Zloc, Zmax, max et
-  testconvergence. On pourrait faire mieux}
+  testconvergence. On pourrait faire mieux\LZK{Modifié, c'est bon!}}
  \end{algorithm}
  
  
  \end{algorithm}