modif commentaire fig 9

[kahina_paper2.git] / paper.tex
diff --git a/paper.tex b/paper.tex

index a236a5a8804aabe89b76018001f618f52c3a017a..25a169dda86bcd4c918377d559c199142723f0d9 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -775,6 +775,7 @@ implementation of Ehrlich-Aberth method.
  \begin{algorithm}[htpb]
  \label{alg1-cuda}
  \LinesNumbered
+\SetAlgoNoLine
  \caption{CUDA Algorithm to find roots with the Ehrlich-Aberth method}
  
  \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
@@ -797,11 +798,6 @@ Allocate and copy initial data to the GPU global memory\;
  Copy results from GPU memory to CPU memory\;
  \end{algorithm}
  
-~\\ 
-
-\RC{Au final, on laisse ce code, on l'explique, si c'est kahina qui
-  rajoute l'explication, il faut absolument ajouter \KG{dfsdfsd}, car
-  l'anglais sera à relire et je ne veux pas tout relire... }
   
  \section{The EA algorithm on Multiple GPUs}
  \label{sec4}
@@ -858,43 +854,40 @@ shared memory arrays containing all the roots.
  %% roots sufficiently converge.
  
  
-%% \begin{enumerate}
-%% \begin{algorithm}[htpb]
-%% \label{alg2-cuda-openmp}
-%% %\LinesNumbered
-%% \caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
+\begin{algorithm}[htpb]
+\label{alg2-cuda-openmp}
+\LinesNumbered
+\SetAlgoNoLine
+\caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
  
-%% \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
-%%   threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
+\KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
+  threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num\_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
  
-%% \KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
+\KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
  
-%% \BlankLine
+\BlankLine
  
-%% \item Initialization of P\;
-%% \item Initialization of Pu\;
-%% \item Initialization of the solution vector $Z^{0}$\;
-%% \verb=omp_set_num_threads(num_gpus);=
-%% \verb=#pragma omp parallel shared(Z,$\Delta$ z,P);=
-%% \verb=cudaGetDevice(gpu_id);=
-%% \item Allocate and copy initial data from CPU memory to the GPU global memories\;
-%% \item index= $Size/num\_gpus$\;
-%% \item k=0\;
-%% \While {$error > \epsilon$}{
-%% \item Let $\Delta z=0$\;
-%% \item $ kernel\_save(ZPrec,Z)$\;
-%% \item  k=k+1\;
-%% \item $ kernel\_update(Z,P,Pu,index)$\;
-%% \item $kernel\_testConverge(\Delta z[gpu\_id],Z,ZPrec)$\;
-%% %\verb=#pragma omp barrier;=
-%% \item error= Max($\Delta z$)\;
-%% }
+Initialization of P\;
+Initialization of Pu\;
+Initialization of the solution vector $Z^{0}$\;
+omp\_set\_num\_threads(num\_gpus)\;
+\#pragma omp parallel shared(Z,$\Delta$ z,P)\;
+\Indp
+{
+gpu\_id=cudaGetDevice()\;
+Allocate memory on GPU\;
+Compute local size and offet according to gpu\_id\;
+\While {$error > \epsilon$}{
+  copy Z from CPU to GPU\;
+$ ZPrec_{loc}=kernel\_save(Z_{loc})$\;
+$ Z_{loc}=kernel\_update(Z,P,Pu)$\;
+$\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\;
+$  error= Max(\Delta z)$\;
+  copy $Z_{loc}$ from GPU to Z in CPU
+}
+\Indm}
+\end{algorithm}
  
-%% \item Copy results from GPU memories to CPU memory\;
-%% \end{algorithm}
-%% \end{enumerate}
-%% ~\\ 
-%% \RC{C'est encore pire ici, on ne voit pas les comm CPU <-> GPU }
  
  
  \subsection{Multi-GPU : an MPI-CUDA approach}
@@ -1106,11 +1099,12 @@ sparse and full polynomials ranging from 1,000,000 to 5,000,000.
  \label{fig:09}
  \end{figure}
  In Figure~\ref{fig:09} we can see that both approaches are scalable
-and can solve very high degree polynomials. With full polynomial both
-approaches give very similar results. However, for sparse polynomials
-there are a noticeable difference in favour of MPI when the degree is
-above 4 millions. Between 1  and 3 millions, OpenMP is more effecient.
-Under 1 million, OpenMPI and MPI are almost equivalent.
+and can solve very high degree polynomials. In addition, with full polynomial as well as sparse ones, both
+approaches give very similar results.
+
+%SIDER JE viens de virer \c ca For sparse polynomials here are a noticeable difference in favour of MPI when the degree is
+%above 4 millions. Between 1  and 3 millions, OpenMP is more effecient.
+%Under 1 million, OpenMPI and MPI are almost equivalent.
  
  %SIDER : il faut une explication sur les différences ici aussi.