X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/kahina_paper2.git/blobdiff_plain/1e4a27cd6dd739a4ee1c228d704a036ac34609c8..4db665ef149942bc02e41b0556b37ccb0a072955:/paper.tex?ds=sidebyside diff --git a/paper.tex b/paper.tex index a236a5a..25a169d 100644 --- a/paper.tex +++ b/paper.tex @@ -775,6 +775,7 @@ implementation of Ehrlich-Aberth method. \begin{algorithm}[htpb] \label{alg1-cuda} \LinesNumbered +\SetAlgoNoLine \caption{CUDA Algorithm to find roots with the Ehrlich-Aberth method} \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance @@ -797,11 +798,6 @@ Allocate and copy initial data to the GPU global memory\; Copy results from GPU memory to CPU memory\; \end{algorithm} -~\\ - -\RC{Au final, on laisse ce code, on l'explique, si c'est kahina qui - rajoute l'explication, il faut absolument ajouter \KG{dfsdfsd}, car - l'anglais sera à relire et je ne veux pas tout relire... } \section{The EA algorithm on Multiple GPUs} \label{sec4} @@ -858,43 +854,40 @@ shared memory arrays containing all the roots. %% roots sufficiently converge. -%% \begin{enumerate} -%% \begin{algorithm}[htpb] -%% \label{alg2-cuda-openmp} -%% %\LinesNumbered -%% \caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method} +\begin{algorithm}[htpb] +\label{alg2-cuda-openmp} +\LinesNumbered +\SetAlgoNoLine +\caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method} -%% \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance -%% threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)} +\KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance + threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num\_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)} -%% \KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)} +\KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)} -%% \BlankLine +\BlankLine -%% \item Initialization of P\; -%% \item Initialization of Pu\; -%% \item Initialization of the solution vector $Z^{0}$\; -%% \verb=omp_set_num_threads(num_gpus);= -%% \verb=#pragma omp parallel shared(Z,$\Delta$ z,P);= -%% \verb=cudaGetDevice(gpu_id);= -%% \item Allocate and copy initial data from CPU memory to the GPU global memories\; -%% \item index= $Size/num\_gpus$\; -%% \item k=0\; -%% \While {$error > \epsilon$}{ -%% \item Let $\Delta z=0$\; -%% \item $ kernel\_save(ZPrec,Z)$\; -%% \item k=k+1\; -%% \item $ kernel\_update(Z,P,Pu,index)$\; -%% \item $kernel\_testConverge(\Delta z[gpu\_id],Z,ZPrec)$\; -%% %\verb=#pragma omp barrier;= -%% \item error= Max($\Delta z$)\; -%% } +Initialization of P\; +Initialization of Pu\; +Initialization of the solution vector $Z^{0}$\; +omp\_set\_num\_threads(num\_gpus)\; +\#pragma omp parallel shared(Z,$\Delta$ z,P)\; +\Indp +{ +gpu\_id=cudaGetDevice()\; +Allocate memory on GPU\; +Compute local size and offet according to gpu\_id\; +\While {$error > \epsilon$}{ + copy Z from CPU to GPU\; +$ ZPrec_{loc}=kernel\_save(Z_{loc})$\; +$ Z_{loc}=kernel\_update(Z,P,Pu)$\; +$\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\; +$ error= Max(\Delta z)$\; + copy $Z_{loc}$ from GPU to Z in CPU +} +\Indm} +\end{algorithm} -%% \item Copy results from GPU memories to CPU memory\; -%% \end{algorithm} -%% \end{enumerate} -%% ~\\ -%% \RC{C'est encore pire ici, on ne voit pas les comm CPU <-> GPU } \subsection{Multi-GPU : an MPI-CUDA approach} @@ -1106,11 +1099,12 @@ sparse and full polynomials ranging from 1,000,000 to 5,000,000. \label{fig:09} \end{figure} In Figure~\ref{fig:09} we can see that both approaches are scalable -and can solve very high degree polynomials. With full polynomial both -approaches give very similar results. However, for sparse polynomials -there are a noticeable difference in favour of MPI when the degree is -above 4 millions. Between 1 and 3 millions, OpenMP is more effecient. -Under 1 million, OpenMPI and MPI are almost equivalent. +and can solve very high degree polynomials. In addition, with full polynomial as well as sparse ones, both +approaches give very similar results. + +%SIDER JE viens de virer \c ca For sparse polynomials here are a noticeable difference in favour of MPI when the degree is +%above 4 millions. Between 1 and 3 millions, OpenMP is more effecient. +%Under 1 million, OpenMPI and MPI are almost equivalent. %SIDER : il faut une explication sur les différences ici aussi.