X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/kahina_paper2.git/blobdiff_plain/53e4e1ca95c2a0805ab15a71eb47157aeb187aed..4db665ef149942bc02e41b0556b37ccb0a072955:/paper.tex

diff --git a/paper.tex b/paper.tex
index d1ead14..25a169d 100644
--- a/paper.tex
+++ b/paper.tex
@@ -178,7 +178,7 @@
 % *** SPECIALIZED LIST PACKAGES ***
 %
-\usepackage{algorithmic}
+
 % algorithmic.sty was written by Peter Williams and Rogerio Brito.
 % This package provides an algorithmic environment for describing algorithms.
 % You can use the algorithmic environment in-text or within a figure
@@ -772,10 +772,10 @@ CUDA running threads like threads on a CPU host. In the following paragraph
 Algorithm~\ref{alg1-cuda} shows the GPU parallel implementation of the Ehrlich-Aberth method.
 
-\begin{enumerate}
 \begin{algorithm}[htpb]
 \label{alg1-cuda}
-%\LinesNumbered
+\LinesNumbered
+\SetAlgoNoLine
 \caption{CUDA Algorithm to find roots with the Ehrlich-Aberth method}
 
 \KwIn{$Z^{0}$ (Initial roots vector), $\varepsilon$ (Error tolerance
@@ -785,27 +785,19 @@ implementation of Ehrlich-Aberth method.
 
 %\BlankLine
 
-\item Initialization of P\;
-\item Initialization of Pu\;
-\item Initialization of the solution vector $Z^{0}$\;
-\item Allocate and copy initial data to the GPU global memory\;
-\item k=0\;
-\item \While {$\Delta z_{max} > \epsilon$}{
-\item Let $\Delta z_{max}=0$\;
-\item $ kernel\_save(ZPrec,Z)$\;
-\item k=k+1\;
-\item $ kernel\_update(Z,P,Pu)$\;
-\item $kernel\_testConverge(\Delta z_{max},Z,ZPrec)$\;
+Initialization of P\;
+Initialization of Pu\;
+Initialization of the solution vector $Z^{0}$\;
+Allocate and copy initial data to the GPU global memory\;
+\While {$\Delta z_{max} > \epsilon$}{
+  $kernel\_save(ZPrec,Z)$\;
+  $kernel\_update(Z,P,Pu)$\;
+  $\Delta z_{max}=kernel\_testConverge(Z,ZPrec)$\;
 }
-\item Copy results from GPU memory to CPU memory\;
+Copy results from GPU memory to CPU memory\;
 \end{algorithm}
-\end{enumerate}
-~\\
-\RC{Au final, on laisse ce code, on l'explique, si c'est kahina qui
-  rajoute l'explication, il faut absolument ajouter \KG{dfsdfsd}, car
-  l'anglais sera à relire et je ne veux pas tout relire... }
 
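To make the revised Algorithm~\ref{alg1-cuda} concrete, a minimal host-side CUDA sketch of the loop is given below. This is a sketch, not the authors' implementation: only the kernel names (kernel_save, kernel_update, kernel_testConverge) and the loop structure come from the pseudocode above; the cuDoubleComplex coefficient arrays, the Jacobi-style read from ZPrec inside kernel_update (the pseudocode passes only Z, P, Pu), and the Thrust max-reduction used to obtain Delta z_max are assumptions made for illustration.

    // Sketch of Algorithm alg1-cuda; kernel names from the paper, the rest assumed.
    #include <cuComplex.h>
    #include <cuda_runtime.h>
    #include <thrust/device_ptr.h>
    #include <thrust/reduce.h>
    #include <thrust/functional.h>

    // kernel_save: keep Z^{k} so the convergence test can compare Z^{k+1} to it.
    __global__ void kernel_save(cuDoubleComplex *zPrec, const cuDoubleComplex *z, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) zPrec[i] = z[i];
    }

    // kernel_update: one thread per root. Reads the saved vector zPrec (a
    // race-free Jacobi sweep; an assumption, since the pseudocode only lists
    // Z, P, Pu) and writes the Ehrlich-Aberth update into z.
    __global__ void kernel_update(cuDoubleComplex *z, const cuDoubleComplex *zPrec,
                                  const cuDoubleComplex *p, const cuDoubleComplex *pu,
                                  int degree, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= n) return;
        cuDoubleComplex zi = zPrec[i];
        cuDoubleComplex pv = p[degree];                  // Horner: P(z_i)
        for (int k = degree - 1; k >= 0; --k) pv = cuCadd(cuCmul(pv, zi), p[k]);
        cuDoubleComplex pd = pu[degree - 1];             // Horner: P'(z_i)
        for (int k = degree - 2; k >= 0; --k) pd = cuCadd(cuCmul(pd, zi), pu[k]);
        cuDoubleComplex q = cuCdiv(pv, pd);              // Newton correction P/P'
        cuDoubleComplex s = make_cuDoubleComplex(0.0, 0.0);
        for (int j = 0; j < n; ++j)                      // sum_{j != i} 1/(z_i - z_j)
            if (j != i) s = cuCadd(s, cuCdiv(make_cuDoubleComplex(1.0, 0.0),
                                             cuCsub(zi, zPrec[j])));
        cuDoubleComplex d = cuCsub(make_cuDoubleComplex(1.0, 0.0), cuCmul(q, s));
        z[i] = cuCsub(zi, cuCdiv(q, d));                 // z_i - (P/P')/(1 - (P/P') s)
    }

    // kernel_testConverge: per-root displacement |z_i^{k+1} - z_i^{k}|.
    __global__ void kernel_testConverge(double *dz, const cuDoubleComplex *z,
                                        const cuDoubleComplex *zPrec, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) dz[i] = cuCabs(cuCsub(z[i], zPrec[i]));
    }

    // Host driver following Algorithm alg1-cuda step by step.
    void ehrlichAberthGPU(cuDoubleComplex *hZ, const cuDoubleComplex *hP,
                          const cuDoubleComplex *hPu, int degree, double eps)
    {
        int n = degree;                                  // one root per degree
        cuDoubleComplex *dZ, *dZPrec, *dP, *dPu; double *dDz;
        cudaMalloc(&dZ, n * sizeof *dZ);            cudaMalloc(&dZPrec, n * sizeof *dZPrec);
        cudaMalloc(&dP, (degree + 1) * sizeof *dP); cudaMalloc(&dPu, degree * sizeof *dPu);
        cudaMalloc(&dDz, n * sizeof *dDz);
        // "Allocate and copy initial data to the GPU global memory"
        cudaMemcpy(dZ, hZ, n * sizeof *dZ, cudaMemcpyHostToDevice);
        cudaMemcpy(dP, hP, (degree + 1) * sizeof *dP, cudaMemcpyHostToDevice);
        cudaMemcpy(dPu, hPu, degree * sizeof *dPu, cudaMemcpyHostToDevice);
        int t = 256, b = (n + t - 1) / t;
        double dzMax = eps + 1.0;
        while (dzMax > eps) {                            // while Delta z_max > epsilon
            kernel_save<<<b, t>>>(dZPrec, dZ, n);
            kernel_update<<<b, t>>>(dZ, dZPrec, dP, dPu, degree, n);
            kernel_testConverge<<<b, t>>>(dDz, dZ, dZPrec, n);
            thrust::device_ptr<double> e = thrust::device_pointer_cast(dDz);
            dzMax = thrust::reduce(e, e + n, 0.0, thrust::maximum<double>());
        }
        // "Copy results from GPU memory to CPU memory"
        cudaMemcpy(hZ, dZ, n * sizeof *dZ, cudaMemcpyDeviceToHost);
        cudaFree(dZ); cudaFree(dZPrec); cudaFree(dP); cudaFree(dPu); cudaFree(dDz);
    }

Reading the saved copy ZPrec makes each iteration a pure Jacobi sweep, so the three kernels can be launched back-to-back on the same stream with no extra synchronization, since launches on one stream already execute in order.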
 \section{The EA algorithm on Multiple GPUs}
 \label{sec4}
@@ -862,43 +854,40 @@ shared memory arrays containing all the roots.
 %% roots sufficiently converge.
-%% \begin{enumerate}
-%% \begin{algorithm}[htpb]
-%% \label{alg2-cuda-openmp}
-%% %\LinesNumbered
-%% \caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
-
-%% \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
-%% threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
-
-%% \KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
-
-%% \BlankLine
+\begin{algorithm}[htpb]
+\label{alg2-cuda-openmp}
+\LinesNumbered
+\SetAlgoNoLine
+\caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
 
-%% \item Initialization of P\;
-%% \item Initialization of Pu\;
-%% \item Initialization of the solution vector $Z^{0}$\;
-%% \verb=omp_set_num_threads(num_gpus);=
-%% \verb=#pragma omp parallel shared(Z,$\Delta$ z,P);=
-%% \verb=cudaGetDevice(gpu_id);=
-%% \item Allocate and copy initial data from CPU memory to the GPU global memories\;
-%% \item index= $Size/num\_gpus$\;
-%% \item k=0\;
-%% \While {$error > \epsilon$}{
-%% \item Let $\Delta z=0$\;
-%% \item $ kernel\_save(ZPrec,Z)$\;
-%% \item k=k+1\;
-%% \item $ kernel\_update(Z,P,Pu,index)$\;
-%% \item $kernel\_testConverge(\Delta z[gpu\_id],Z,ZPrec)$\;
-%% %\verb=#pragma omp barrier;=
-%% \item error= Max($\Delta z$)\;
-%% }
+\KwIn{$Z^{0}$ (Initial roots vector), $\varepsilon$ (Error tolerance
+  threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ (Vector of errors for the stop condition), $num\_gpus$ (number of OpenMP threads / number of GPUs), $Size$ (number of roots)}
+
+\KwOut{$Z$ (Roots vector), $ZPrec$ (Previous roots vector)}
+
+\BlankLine
+
+Initialization of P\;
+Initialization of Pu\;
+Initialization of the solution vector $Z^{0}$\;
+omp\_set\_num\_threads(num\_gpus)\;
+\#pragma omp parallel shared(Z,$\Delta z$,P)\;
+\Indp
+{
+gpu\_id=cudaGetDevice()\;
+Allocate memory on GPU\;
+Compute local size and offset according to gpu\_id\;
+\While {$error > \epsilon$}{
+  Copy Z from CPU to GPU\;
+  $ZPrec_{loc}=kernel\_save(Z_{loc})$\;
+  $Z_{loc}=kernel\_update(Z,P,Pu)$\;
+  $\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\;
+  $error=Max(\Delta z)$\;
+  Copy $Z_{loc}$ from GPU to $Z$ in CPU\;
+}
+\Indm}
+\end{algorithm}
 
-%% \item Copy results from GPU memories to CPU memory\;
-%% \end{algorithm}
-%% \end{enumerate}
-%% ~\\
-%% \RC{C'est encore pire ici, on ne voit pas les comm CPU <-> GPU }
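The OpenMP/CUDA interplay that Algorithm~\ref{alg2-cuda-openmp} leaves implicit (thread-to-GPU binding, the two copies of Z per iteration, and where the threads must synchronize) can be made explicit with a host-side sketch like the one below. Again a sketch under stated assumptions, not the paper's code: each OpenMP thread is bound with cudaSetDevice(omp_get_thread_num()) (the pseudocode's cudaGetDevice() only queries the current device), Size is assumed divisible by num_gpus, and for brevity kernel_save, kernel_update and kernel_testConv are fused into one slice kernel; the barrier placement is our reading of where Z and error must be consistent.

    // Sketch of Algorithm alg2-cuda-openmp: one OpenMP thread drives one GPU,
    // each updating its slice Z_loc of the shared root vector Z.
    #include <omp.h>
    #include <cuda_runtime.h>
    #include <cuComplex.h>
    #include <thrust/device_ptr.h>
    #include <thrust/reduce.h>
    #include <thrust/functional.h>
    #include <algorithm>

    // Fused slice kernel (illustrative): thread i recomputes root offset+i from
    // the freshly copied full vector z and records its displacement in dzLoc.
    __global__ void kernel_update_slice(cuDoubleComplex *zLoc, double *dzLoc,
                                        const cuDoubleComplex *z,
                                        const cuDoubleComplex *p,
                                        const cuDoubleComplex *pu,
                                        int degree, int size, int offset, int loc)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= loc) return;
        cuDoubleComplex zi = z[offset + i];
        cuDoubleComplex pv = p[degree];                  // Horner: P(z_i)
        for (int k = degree - 1; k >= 0; --k) pv = cuCadd(cuCmul(pv, zi), p[k]);
        cuDoubleComplex pd = pu[degree - 1];             // Horner: P'(z_i)
        for (int k = degree - 2; k >= 0; --k) pd = cuCadd(cuCmul(pd, zi), pu[k]);
        cuDoubleComplex q = cuCdiv(pv, pd);
        cuDoubleComplex s = make_cuDoubleComplex(0.0, 0.0);
        for (int j = 0; j < size; ++j)                   // sum over ALL roots
            if (j != offset + i)
                s = cuCadd(s, cuCdiv(make_cuDoubleComplex(1.0, 0.0),
                                     cuCsub(zi, z[j])));
        cuDoubleComplex d = cuCsub(make_cuDoubleComplex(1.0, 0.0), cuCmul(q, s));
        cuDoubleComplex zn = cuCsub(zi, cuCdiv(q, d));   // Ehrlich-Aberth update
        dzLoc[i] = cuCabs(cuCsub(zn, zi));               // convergence measure
        zLoc[i] = zn;
    }

    void eaMultiGPU(cuDoubleComplex *Z, const cuDoubleComplex *P,
                    const cuDoubleComplex *Pu, int degree, int size,
                    int num_gpus, double eps)
    {
        double *dz = new double[num_gpus];               // Delta z: one error per GPU
        double error = eps + 1.0;
        omp_set_num_threads(num_gpus);
        #pragma omp parallel shared(Z, dz, error)
        {
            int gpu_id = omp_get_thread_num();
            cudaSetDevice(gpu_id);                       // bind this thread to one GPU
            int loc = size / num_gpus;                   // assumes size % num_gpus == 0
            int offset = gpu_id * loc;
            cuDoubleComplex *dZ, *dZloc, *dP, *dPu; double *dDz;
            cudaMalloc(&dZ, size * sizeof *dZ);          cudaMalloc(&dZloc, loc * sizeof *dZloc);
            cudaMalloc(&dP, (degree + 1) * sizeof *dP);  cudaMalloc(&dPu, degree * sizeof *dPu);
            cudaMalloc(&dDz, loc * sizeof *dDz);
            cudaMemcpy(dP, P, (degree + 1) * sizeof *dP, cudaMemcpyHostToDevice);
            cudaMemcpy(dPu, Pu, degree * sizeof *dPu, cudaMemcpyHostToDevice);
            int t = 256, b = (loc + t - 1) / t;
            while (error > eps) {
                // copy Z from CPU to GPU: every GPU needs all roots, not just its slice
                cudaMemcpy(dZ, Z, size * sizeof *dZ, cudaMemcpyHostToDevice);
                #pragma omp barrier                      // all snapshots taken before Z is rewritten
                kernel_update_slice<<<b, t>>>(dZloc, dDz, dZ, dP, dPu,
                                              degree, size, offset, loc);
                thrust::device_ptr<double> e = thrust::device_pointer_cast(dDz);
                dz[gpu_id] = thrust::reduce(e, e + loc, 0.0, thrust::maximum<double>());
                // copy Z_loc from GPU back into the shared CPU vector Z
                cudaMemcpy(Z + offset, dZloc, loc * sizeof *dZloc, cudaMemcpyDeviceToHost);
                #pragma omp barrier                      // every slice of Z is now fresh
                #pragma omp single
                error = *std::max_element(dz, dz + num_gpus);  // error = Max(Delta z)
            }
            cudaFree(dZ); cudaFree(dZloc); cudaFree(dP); cudaFree(dPu); cudaFree(dDz);
        }
        delete[] dz;
    }

The first barrier ensures every thread has copied Z to its GPU before any slice of Z is overwritten; the single construct, with its implicit closing barrier, makes all threads agree on error before testing the loop condition again.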
 
 \subsection{Multi-GPU: an MPI-CUDA approach}
@@ -1110,11 +1099,12 @@ sparse and full polynomials ranging from 1,000,000 to 5,000,000.
 \label{fig:09}
 \end{figure}
 In Figure~\ref{fig:09} we can see that both approaches are scalable
-and can solve very high degree polynomials. With full polynomial both
-approaches give very similar results. However, for sparse polynomials
-there are a noticeable difference in favour of MPI when the degree is
-above 4 millions. Between 1 and 3 millions, OpenMP is more effecient.
-Under 1 million, OpenMPI and MPI are almost equivalent.
+and can solve very high degree polynomials. In addition, with full polynomials as well as sparse
+ones, both approaches give very similar results.
+
+%SIDER I just removed this: For sparse polynomials here are a noticeable difference in favour of MPI when the degree is
+%above 4 millions. Between 1 and 3 millions, OpenMP is more effecient.
+%Under 1 million, OpenMPI and MPI are almost equivalent.
 
%SIDER: an explanation of the differences is needed here as well.