une contrib à signaler

[kahina_paper2.git] / paper.tex
diff --git a/paper.tex b/paper.tex

index b579c69cf1e7bc134798b74fb1a2cbe0e38cb963..2d47f2266da4954109cc2b793fe6b804b1b02bb0 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -17,8 +17,6 @@
    \todo[color=orange!10,#1]{\sffamily\textbf{AS:} #2}\xspace}
  
  
    \todo[color=orange!10,#1]{\sffamily\textbf{AS:} #2}\xspace}
  
  
-
-
  \begin{document}
  
  \title{Two parallel implementations of Ehrlich-Aberth algorithm for root-finding of polynomials on multiple GPUs with OpenMP and MPI}
  \begin{document}
  
  \title{Two parallel implementations of Ehrlich-Aberth algorithm for root-finding of polynomials on multiple GPUs with OpenMP and MPI}
@@ -111,7 +109,8 @@ In this paper we propose the parallelization of Ehrlich-Aberth method which has
  \item The parallel implementation of Ehrlich-Aberth algorithm on a multi-GPU platform with a distributed memory using MPI API, such that each GPU is attached and managed by a MPI process. The GPUs exchange their data by message-passing communications. 
   \end{itemize}
  This latter approach is more used on clusters to solve very complex problems that are too large for traditional supercomputers, which are very expensive to build and run.
  \item The parallel implementation of Ehrlich-Aberth algorithm on a multi-GPU platform with a distributed memory using MPI API, such that each GPU is attached and managed by a MPI process. The GPUs exchange their data by message-passing communications. 
   \end{itemize}
  This latter approach is more used on clusters to solve very complex problems that are too large for traditional supercomputers, which are very expensive to build and run.
-\LZK{Pas d'autres contributions possibles? J'ai supprimé les deux premiers points proposés précédemment.}
+\LZK{Pas d'autres contributions possibles? J'ai supprimé les deux premiers points proposés précédemment.
+\AS{La résolution du problème pour des polynomes pleins de degré 6M est une contribution aussi à mon avis}}
  
  The paper is organized as follows. In Section~\ref{sec2} we present three different parallel programming models OpenMP, MPI and CUDA. In Section~\ref{sec3} we present the implementation of the Ehrlich-Aberth algorithm on a single GPU. In Section~\ref{sec4} we present the parallel implementations of the Ehrlich-Aberth algorithm on multiple GPUs using the OpenMP and MPI approaches. In section~\ref{sec5} we present our experiments and discuss them. Finally, Section~\ref{sec6} concludes this paper and gives some hints for future research directions in this topic. 
  
  
  The paper is organized as follows. In Section~\ref{sec2} we present three different parallel programming models OpenMP, MPI and CUDA. In Section~\ref{sec3} we present the implementation of the Ehrlich-Aberth algorithm on a single GPU. In Section~\ref{sec4} we present the parallel implementations of the Ehrlich-Aberth algorithm on multiple GPUs using the OpenMP and MPI approaches. In section~\ref{sec5} we present our experiments and discuss them. Finally, Section~\ref{sec6} concludes this paper and gives some hints for future research directions in this topic. 
  
@@ -241,8 +240,7 @@ Copy $P$, $P'$ and $Z$ from CPU to GPU\;
  \While{$\Delta Z_{max} > \epsilon$}{
    $Z^{prev}$ = KernelSave($Z,n$)\;
    $Z$ = KernelUpdate($P,P',Z,n$)\;
  \While{$\Delta Z_{max} > \epsilon$}{
    $Z^{prev}$ = KernelSave($Z,n$)\;
    $Z$ = KernelUpdate($P,P',Z,n$)\;
-  $\Delta Z$ = KernelComputeError($Z,Z^{prev},n$)\;
-  $\Delta Z_{max}$ = CudaMaxFunction($\Delta Z,n$)\;
+  $\Delta Z_{max}$ = KernelComputeError($Z,Z^{prev},n$)\;
  }
  Copy $Z$ from GPU to CPU\;
  \label{alg1-cuda}
  }
  Copy $Z$ from GPU to CPU\;
  \label{alg1-cuda}
@@ -287,28 +285,24 @@ arrays containing all the roots.
  \KwOut{$Z$ (solution vector of roots)}
  Initialize the polynomial $P$ and its derivative $P'$\;
  Set the initial values of vector $Z$\;
  \KwOut{$Z$ (solution vector of roots)}
  Initialize the polynomial $P$ and its derivative $P'$\;
  Set the initial values of vector $Z$\;
-Start of a parallel part with OpenMP ($Z$, $\Delta Z$, $\Delta
-Z_{max}$, $P$, $P'$ are shared variables)\;
+Start of a parallel part with OpenMP ($Z$, $\Delta Z_{max}$, $P$, $P'$ are shared variables)\;
  $id_{gpu}$ = cudaGetDevice()\;
  $n_{loc}$ = $n/ngpu$ (local size)\;
  $id_{gpu}$ = cudaGetDevice()\;
  $n_{loc}$ = $n/ngpu$ (local size)\;
-%$idx$ = $id_{gpu}\times n_{loc}$ (local offset)\;
+$offset$ = $id_{gpu}\times n_{loc}$ (local offset)\;
  Copy $P$, $P'$ from CPU to GPU\;
  Copy $P$, $P'$ from CPU to GPU\;
-\While{\emph{not convergence}}{
+\While{$max > \epsilon$}{
    Copy $Z$ from CPU to GPU\;
    $Z^{prev}$ = KernelSave($Z,n$)\;
    Copy $Z$ from CPU to GPU\;
    $Z^{prev}$ = KernelSave($Z,n$)\;
-  $Z_{loc}$ = KernelUpdate($P,P',Z^{prev},n_{loc}$)\;
-  $\Delta Z_{loc}$ = KernelComputeError($Z_{loc},Z^{prev}_{loc},n_{loc}$)\;
-  $\Delta Z_{max}[id_{gpu}]$ = CudaMaxFunction($\Delta Z_{loc},n_{loc}$)\;
-  Copy $Z_{loc}$ from GPU to $Z$ in CPU\;
-  $max$ = MaxFunction($\Delta Z_{max},ngpu$)\;
-  TestConvergence($max,\epsilon$)\;
+  $Z[offset]$ = KernelUpdate($P,P',Z,n_{loc}$)\;
+  $\Delta Z_{max}[id_{gpu}]$ = KernelComputeError($Z[offset],Z^{prev}[offset],n_{loc}$)\;
+  Copy $Z[offset]$ from GPU to $Z$ in CPU\;
+  $max$ = MaxFunction($\Delta Z_{max},ngpu$)\; 
  }
  \label{alg2-cuda-openmp}
  }
  \label{alg2-cuda-openmp}
-\LZK{J'ai modifié l'algo. Le $P$ est mis shared. Qu'en est-il pour
-  $P'$?}\RC{Je l'ai rajouté. Bon sinon le n\_loc ne remplace pas
+\RC{Je l'ai rajouté. Bon sinon le n\_loc ne remplace pas
    vraiment un offset et une taille mais bon... et là il y a 4 lignes
    pour la convergence, c'est bcp ... Zloc, Zmax, max et
    vraiment un offset et une taille mais bon... et là il y a 4 lignes
    pour la convergence, c'est bcp ... Zloc, Zmax, max et
-  testconvergence. On pourrait faire mieux}
+  testconvergence. On pourrait faire mieux\LZK{Modifié, c'est bon!}}
  \end{algorithm}
  
  
  \end{algorithm}