X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/kahina_paper2.git/blobdiff_plain/1e4a27cd6dd739a4ee1c228d704a036ac34609c8..4db665ef149942bc02e41b0556b37ccb0a072955:/paper.tex?ds=sidebyside

diff --git a/paper.tex b/paper.tex
index a236a5a..25a169d 100644
--- a/paper.tex
+++ b/paper.tex
@@ -775,6 +775,7 @@ implementation of Ehrlich-Aberth method.
 \begin{algorithm}[htpb]
 \label{alg1-cuda}
 \LinesNumbered
+\SetAlgoNoLine
 \caption{CUDA Algorithm to find roots with the Ehrlich-Aberth method}
 
 \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
@@ -797,11 +798,6 @@ Allocate and copy initial data to the GPU global memory\;
 Copy results from GPU memory to CPU memory\;
 \end{algorithm}
 
-~\\ 
-
-\RC{Au final, on laisse ce code, on l'explique, si c'est kahina qui
-  rajoute l'explication, il faut absolument ajouter \KG{dfsdfsd}, car
-  l'anglais sera Ã  relire et je ne veux pas tout relire... }
  
 \section{The EA algorithm on Multiple GPUs}
 \label{sec4}
@@ -858,43 +854,40 @@ shared memory arrays containing all the roots.
 %% roots sufficiently converge.
 
 
-%% \begin{enumerate}
-%% \begin{algorithm}[htpb]
-%% \label{alg2-cuda-openmp}
-%% %\LinesNumbered
-%% \caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
+\begin{algorithm}[htpb]
+\label{alg2-cuda-openmp}
+\LinesNumbered
+\SetAlgoNoLine
+\caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
 
-%% \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
-%%   threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
+\KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
+  threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num\_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
 
-%% \KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
+\KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
 
-%% \BlankLine
+\BlankLine
 
-%% \item Initialization of P\;
-%% \item Initialization of Pu\;
-%% \item Initialization of the solution vector $Z^{0}$\;
-%% \verb=omp_set_num_threads(num_gpus);=
-%% \verb=#pragma omp parallel shared(Z,$\Delta$ z,P);=
-%% \verb=cudaGetDevice(gpu_id);=
-%% \item Allocate and copy initial data from CPU memory to the GPU global memories\;
-%% \item index= $Size/num\_gpus$\;
-%% \item k=0\;
-%% \While {$error > \epsilon$}{
-%% \item Let $\Delta z=0$\;
-%% \item $ kernel\_save(ZPrec,Z)$\;
-%% \item  k=k+1\;
-%% \item $ kernel\_update(Z,P,Pu,index)$\;
-%% \item $kernel\_testConverge(\Delta z[gpu\_id],Z,ZPrec)$\;
-%% %\verb=#pragma omp barrier;=
-%% \item error= Max($\Delta z$)\;
-%% }
+Initialization of P\;
+Initialization of Pu\;
+Initialization of the solution vector $Z^{0}$\;
+omp\_set\_num\_threads(num\_gpus)\;
+\#pragma omp parallel shared(Z,$\Delta$ z,P)\;
+\Indp
+{
+gpu\_id=cudaGetDevice()\;
+Allocate memory on GPU\;
+Compute local size and offet according to gpu\_id\;
+\While {$error > \epsilon$}{
+  copy Z from CPU to GPU\;
+$ ZPrec_{loc}=kernel\_save(Z_{loc})$\;
+$ Z_{loc}=kernel\_update(Z,P,Pu)$\;
+$\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\;
+$  error= Max(\Delta z)$\;
+  copy $Z_{loc}$ from GPU to Z in CPU
+}
+\Indm}
+\end{algorithm}
 
-%% \item Copy results from GPU memories to CPU memory\;
-%% \end{algorithm}
-%% \end{enumerate}
-%% ~\\ 
-%% \RC{C'est encore pire ici, on ne voit pas les comm CPU <-> GPU }
 
 
 \subsection{Multi-GPU : an MPI-CUDA approach}
@@ -1106,11 +1099,12 @@ sparse and full polynomials ranging from 1,000,000 to 5,000,000.
 \label{fig:09}
 \end{figure}
 In Figure~\ref{fig:09} we can see that both approaches are scalable
-and can solve very high degree polynomials. With full polynomial both
-approaches give very similar results. However, for sparse polynomials
-there are a noticeable difference in favour of MPI when the degree is
-above 4 millions. Between 1  and 3 millions, OpenMP is more effecient.
-Under 1 million, OpenMPI and MPI are almost equivalent.
+and can solve very high degree polynomials. In addition, with full polynomial as well as sparse ones, both
+approaches give very similar results.
+
+%SIDER JE viens de virer \c ca For sparse polynomials here are a noticeable difference in favour of MPI when the degree is
+%above 4 millions. Between 1  and 3 millions, OpenMP is more effecient.
+%Under 1 million, OpenMPI and MPI are almost equivalent.
 
 %SIDER : il faut une explication sur les diffÃ©rences ici aussi.