\todo[color=orange!10,#1]{\sffamily\textbf{AS:} #2}\xspace}
\begin{document}
\title{Two parallel implementations of the Ehrlich-Aberth algorithm for root-finding of polynomials on multiple GPUs with OpenMP and MPI}
\item The parallel implementation of the Ehrlich-Aberth algorithm on a multi-GPU platform with distributed memory using the MPI API, such that each GPU is attached to and managed by an MPI process. The GPUs exchange their data through message-passing communications.
\end{itemize}
This latter approach is more commonly used on clusters to solve very large problems that would otherwise require traditional supercomputers, which are very expensive to build and run.
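
To fix ideas, the sketch below illustrates this organization in plain C with MPI and the CUDA runtime. It is only a minimal sketch: the variable names, the polynomial degree and the use of \texttt{MPI\_Allgather} for the data exchange are illustrative assumptions, not the exact code of our implementation.
\begin{verbatim}
#include <stdlib.h>
#include <mpi.h>
#include <cuda_runtime.h>

/* Minimal sketch: one MPI process per GPU; the updated blocks of roots
 * are exchanged by message passing (here with MPI_Allgather).
 * Assumes n is a multiple of the number of processes.                  */
int main(int argc, char **argv)
{
    int rank, size, ngpus = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    cudaGetDeviceCount(&ngpus);
    cudaSetDevice(rank % ngpus);        /* attach this process to one GPU */

    int n = 1000000;                    /* polynomial degree (example)    */
    int n_loc = n / size;               /* roots handled by this process  */
    double *Z     = malloc(2 * n * sizeof(double));     /* all roots      */
    double *Z_loc = malloc(2 * n_loc * sizeof(double)); /* local block    */

    /* ... launch the CUDA kernels that update Z_loc on this GPU ... */

    /* Exchange the updated blocks so that every process sees the full Z. */
    MPI_Allgather(Z_loc, 2 * n_loc, MPI_DOUBLE,
                  Z,     2 * n_loc, MPI_DOUBLE, MPI_COMM_WORLD);

    free(Z_loc); free(Z);
    MPI_Finalize();
    return 0;
}
\end{verbatim}
With one process per GPU, the same code runs unchanged on a single multi-GPU node or across the nodes of a cluster.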
-\LZK{No other possible contributions? I removed the first two points proposed previously.}
+\LZK{No other possible contributions? I removed the first two points proposed previously.
+\AS{In my opinion, solving the problem for full polynomials of degree 6M is also a contribution.}}
The paper is organized as follows. In Section~\ref{sec2} we present three different parallel programming models: OpenMP, MPI, and CUDA. In Section~\ref{sec3} we present the implementation of the Ehrlich-Aberth algorithm on a single GPU. In Section~\ref{sec4} we present the parallel implementations of the Ehrlich-Aberth algorithm on multiple GPUs using the OpenMP and MPI approaches. In Section~\ref{sec5} we present our experiments and discuss them. Finally, Section~\ref{sec6} concludes this paper and gives some hints for future research directions on this topic.
\While{$\Delta Z_{max} > \epsilon$}{
$Z^{prev}$ = KernelSave($Z,n$)\;
$Z$ = KernelUpdate($P,P',Z,n$)\;
- $\Delta Z$ = KernelComputeError($Z,Z^{prev},n$)\;
- $\Delta Z_{max}$ = CudaMaxFunction($\Delta Z,n$)\;
+ $\Delta Z_{max}$ = KernelComputeError($Z,Z^{prev},n$)\;
}
Copy $Z$ from GPU to CPU\;
\label{alg1-cuda}
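
For reference, the host part of this single-GPU version can be sketched in CUDA C as follows. This is only a minimal sketch under our own naming: the kernel bodies are omitted, roots are stored as \texttt{double2} values, and we assume that the error kernel already reduces the per-root errors to a single maximum on the device.
\begin{verbatim}
#include <cuda_runtime.h>

/* Kernels of Algorithm 1 (bodies omitted in this sketch); a root is
 * stored as a double2 (real part in .x, imaginary part in .y).        */
__global__ void kernel_save(double2 *Zprev, const double2 *Z, int n);
__global__ void kernel_update(double2 *Z, const double2 *P,
                              const double2 *Pp, int n);
__global__ void kernel_compute_error(double *dZmax, const double2 *Z,
                                     const double2 *Zprev, int n);

/* Host loop: iterate until the maximum displacement of the roots
 * falls below the tolerance eps.                                      */
void ehrlich_aberth_gpu(double2 *dZ, double2 *dZprev, double2 *dP,
                        double2 *dPp, double *d_dZmax, int n, double eps)
{
    int threads = 256;
    int blocks  = (n + threads - 1) / threads;
    double dZmax = eps + 1.0;           /* force at least one iteration */

    while (dZmax > eps) {
        kernel_save<<<blocks, threads>>>(dZprev, dZ, n);
        kernel_update<<<blocks, threads>>>(dZ, dP, dPp, n);
        /* Assumed to reduce the per-root error to one maximum on device. */
        kernel_compute_error<<<blocks, threads>>>(d_dZmax, dZ, dZprev, n);
        cudaMemcpy(&dZmax, d_dZmax, sizeof(double), cudaMemcpyDeviceToHost);
    }
    /* The converged roots dZ are then copied back to the CPU.            */
}
\end{verbatim}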
\KwOut{$Z$ (solution vector of roots)}
Initialize the polynomial $P$ and its derivative $P'$\;
Set the initial values of vector $Z$\;
-Start of a parallel part with OpenMP ($Z$, $\Delta Z$, $\Delta
-Z_{max}$, $P$, $P'$ are shared variables)\;
+Start of a parallel part with OpenMP ($Z$, $\Delta Z_{max}$, $P$, $P'$ are shared variables)\;
$id_{gpu}$ = cudaGetDevice()\;
$n_{loc}$ = $n/ngpu$ (local size)\;
-%$idx$ = $id_{gpu}\times n_{loc}$ (local offset)\;
+$offset$ = $id_{gpu}\times n_{loc}$ (local offset)\;
Copy $P$, $P'$ from CPU to GPU\;
-\While{\emph{not convergence}}{
+\While{$max > \epsilon$}{
Copy $Z$ from CPU to GPU\;
$Z^{prev}$ = KernelSave($Z,n$)\;
- $Z_{loc}$ = KernelUpdate($P,P',Z^{prev},n_{loc}$)\;
- $\Delta Z_{loc}$ = KernelComputeError($Z_{loc},Z^{prev}_{loc},n_{loc}$)\;
- $\Delta Z_{max}[id_{gpu}]$ = CudaMaxFunction($\Delta Z_{loc},n_{loc}$)\;
- Copy $Z_{loc}$ from GPU to $Z$ in CPU\;
- $max$ = MaxFunction($\Delta Z_{max},ngpu$)\;
- TestConvergence($max,\epsilon$)\;
+ $Z[offset]$ = KernelUpdate($P,P',Z,n_{loc}$)\;
+ $\Delta Z_{max}[id_{gpu}]$ = KernelComputeError($Z[offset],Z^{prev}[offset],n_{loc}$)\;
+ Copy $Z[offset]$ from GPU to $Z$ in CPU\;
+ $max$ = MaxFunction($\Delta Z_{max},ngpu$)\;
}
\label{alg2-cuda-openmp}
-\LZK{I modified the algorithm. $P$ is now shared. What about
- $P'$?}\RC{I added it. Well, otherwise n\_loc does not really replace
+\RC{I added it. Well, otherwise n\_loc does not really replace
 an offset and a size, but fine... and there are 4 lines here for the
 convergence, which is a lot... Zloc, Zmax, max and
- testconvergence. We could do better}
+ testconvergence. We could do better\LZK{Modified, it's fine now!}}
\end{algorithm}
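
A possible host-side skeleton of this OpenMP version, again with illustrative names, is sketched below: each OpenMP thread is attached to one GPU, updates its block of $Z$ through an assumed wrapper \texttt{gpu\_update\_block} (copy to the GPU, KernelSave, KernelUpdate, KernelComputeError, copy back), and the shared array of local errors is reduced on the CPU to test convergence. It is a sketch of the approach, not our exact code.
\begin{verbatim}
#include <omp.h>
#include <cuda_runtime.h>

/* Assumed wrapper: copies Z to the current GPU, runs the three kernels
 * of the algorithm on the block [offset, offset + n_loc), copies the
 * block back into Z and returns the local error Delta Z_max.           */
double gpu_update_block(double *Z, const double *P, const double *Pp,
                        int n, int offset, int n_loc);

void ehrlich_aberth_openmp(double *Z, const double *P, const double *Pp,
                           int n, int ngpu, double eps)
{
    double delta_max[64];               /* one entry per GPU (ngpu <= 64) */
    double max = eps + 1.0;             /* shared convergence value       */

    #pragma omp parallel num_threads(ngpu) shared(Z, delta_max, max)
    {
        int id     = omp_get_thread_num();
        int n_loc  = n / ngpu;          /* local size                     */
        int offset = id * n_loc;        /* local offset                   */
        cudaSetDevice(id);              /* attach this thread to one GPU  */

        while (max > eps) {
            delta_max[id] = gpu_update_block(Z, P, Pp, n, offset, n_loc);
            #pragma omp barrier         /* wait for every GPU's block     */
            #pragma omp single          /* one thread reduces the errors  */
            {
                max = 0.0;
                for (int i = 0; i < ngpu; i++)
                    if (delta_max[i] > max) max = delta_max[i];
            }                           /* implicit barrier after single  */
        }
    }
}
\end{verbatim}
The explicit barrier before the reduction and the implicit barrier at the end of the \texttt{single} region ensure that all threads see a consistent value of \texttt{max} before testing convergence again.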