modif commentaire fig 9

[kahina_paper2.git] / paper.tex
diff --git a/paper.tex b/paper.tex

index d6dd2ffd305a3930af2cfacaf70c02ecf01a3079..25a169dda86bcd4c918377d559c199142723f0d9 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -178,7 +178,7 @@
  
  % *** SPECIALIZED LIST PACKAGES ***
  %
-\usepackage{algorithmic}
+
  % algorithmic.sty was written by Peter Williams and Rogerio Brito.
  % This package provides an algorithmic environment fo describing algorithms.
  % You can use the algorithmic environment in-text or within a figure
@@ -772,10 +772,10 @@ CUDA running threads like threads on a CPU host. In the following
  paragraph Algorithm~\ref{alg1-cuda} shows the GPU parallel
  implementation of Ehrlich-Aberth method.
  
-\begin{enumerate}
  \begin{algorithm}[htpb]
  \label{alg1-cuda}
-%\LinesNumbered
+\LinesNumbered
+\SetAlgoNoLine
  \caption{CUDA Algorithm to find roots with the Ehrlich-Aberth method}
  
  \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
@@ -785,24 +785,19 @@ implementation of Ehrlich-Aberth method.
  
  %\BlankLine
  
-\item Initialization of P\;
-\item Initialization of Pu\;
-\item Initialization of the solution vector $Z^{0}$\;
-\item Allocate and copy initial data to the GPU global memory\;
-\item \While {$\Delta z_{max} > \epsilon$}{
-\item   $ kernel\_save(ZPrec,Z)$\;
-\item   $ kernel\_update(Z,P,Pu)$\;
-\item $\Delta z_{max}=kernel\_testConverge(Z,ZPrec)$\;
+Initialization of P\;
+Initialization of Pu\;
+Initialization of the solution vector $Z^{0}$\;
+Allocate and copy initial data to the GPU global memory\;
+\While {$\Delta z_{max} > \epsilon$}{
+   $ kernel\_save(ZPrec,Z)$\;
+   $ kernel\_update(Z,P,Pu)$\;
+ $\Delta z_{max}=kernel\_testConverge(Z,ZPrec)$\;
  
  }
-\item Copy results from GPU memory to CPU memory\;
+Copy results from GPU memory to CPU memory\;
  \end{algorithm}
-\end{enumerate}
-~\\ 
  
-\RC{Au final, on laisse ce code, on l'explique, si c'est kahina qui
-  rajoute l'explication, il faut absolument ajouter \KG{dfsdfsd}, car
-  l'anglais sera à relire et je ne veux pas tout relire... }
   
  \section{The EA algorithm on Multiple GPUs}
  \label{sec4}
@@ -859,43 +854,40 @@ shared memory arrays containing all the roots.
  %% roots sufficiently converge.
  
  
-%% \begin{enumerate}
-%% \begin{algorithm}[htpb]
-%% \label{alg2-cuda-openmp}
-%% %\LinesNumbered
-%% \caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
-
-%% \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
-%%   threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
-
-%% \KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
-
-%% \BlankLine
+\begin{algorithm}[htpb]
+\label{alg2-cuda-openmp}
+\LinesNumbered
+\SetAlgoNoLine
+\caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
  
-%% \item Initialization of P\;
-%% \item Initialization of Pu\;
-%% \item Initialization of the solution vector $Z^{0}$\;
-%% \verb=omp_set_num_threads(num_gpus);=
-%% \verb=#pragma omp parallel shared(Z,$\Delta$ z,P);=
-%% \verb=cudaGetDevice(gpu_id);=
-%% \item Allocate and copy initial data from CPU memory to the GPU global memories\;
-%% \item index= $Size/num\_gpus$\;
-%% \item k=0\;
-%% \While {$error > \epsilon$}{
-%% \item Let $\Delta z=0$\;
-%% \item $ kernel\_save(ZPrec,Z)$\;
-%% \item  k=k+1\;
-%% \item $ kernel\_update(Z,P,Pu,index)$\;
-%% \item $kernel\_testConverge(\Delta z[gpu\_id],Z,ZPrec)$\;
-%% %\verb=#pragma omp barrier;=
-%% \item error= Max($\Delta z$)\;
-%% }
+\KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
+  threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num\_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
+
+\KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
+
+\BlankLine
+
+Initialization of P\;
+Initialization of Pu\;
+Initialization of the solution vector $Z^{0}$\;
+omp\_set\_num\_threads(num\_gpus)\;
+\#pragma omp parallel shared(Z,$\Delta$ z,P)\;
+\Indp
+{
+gpu\_id=cudaGetDevice()\;
+Allocate memory on GPU\;
+Compute local size and offet according to gpu\_id\;
+\While {$error > \epsilon$}{
+  copy Z from CPU to GPU\;
+$ ZPrec_{loc}=kernel\_save(Z_{loc})$\;
+$ Z_{loc}=kernel\_update(Z,P,Pu)$\;
+$\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\;
+$  error= Max(\Delta z)$\;
+  copy $Z_{loc}$ from GPU to Z in CPU
+}
+\Indm}
+\end{algorithm}
  
-%% \item Copy results from GPU memories to CPU memory\;
-%% \end{algorithm}
-%% \end{enumerate}
-%% ~\\ 
-%% \RC{C'est encore pire ici, on ne voit pas les comm CPU <-> GPU }
  
  
  \subsection{Multi-GPU : an MPI-CUDA approach}
@@ -1107,11 +1099,12 @@ sparse and full polynomials ranging from 1,000,000 to 5,000,000.
  \label{fig:09}
  \end{figure}
  In Figure~\ref{fig:09} we can see that both approaches are scalable
-and can solve very high degree polynomials. With full polynomial both
-approaches give very similar results. However, for sparse polynomials
-there are a noticeable difference in favour of MPI when the degree is
-above 4 millions. Between 1  and 3 millions, OpenMP is more effecient.
-Under 1 million, OpenMPI and MPI are almost equivalent.
+and can solve very high degree polynomials. In addition, with full polynomial as well as sparse ones, both
+approaches give very similar results.
+
+%SIDER JE viens de virer \c ca For sparse polynomials here are a noticeable difference in favour of MPI when the degree is
+%above 4 millions. Between 1  and 3 millions, OpenMP is more effecient.
+%Under 1 million, OpenMPI and MPI are almost equivalent.
  
  %SIDER : il faut une explication sur les différences ici aussi.