X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/kahina_paper2.git/blobdiff_plain/72b8c7d942e80fd4d5d76f2ed0ad0f899a085d85..f9bf26cc810e585f03a82f4a0a5e8c64b96842ca:/paper.tex?ds=sidebyside

diff --git a/paper.tex b/paper.tex
index 32c2753..962c7f9 100644
--- a/paper.tex
+++ b/paper.tex
@@ -530,7 +530,7 @@ Algorithm 1 shows the GPU parallel implementation of Ehrlich-Aberth method.
 Algorithm~\ref{alg2-cuda} shows a sketch of the Ehrlich-Aberth method using CUDA.
 
 \begin{enumerate}
-\begin{algorithm}[H]
+\begin{algorithm}[htpb]
 \label{alg2-cuda}
 %\LinesNumbered
 \caption{CUDA Algorithm to find roots with the Ehrlich-Aberth method}
@@ -565,24 +565,113 @@ Algorithm~\ref{alg2-cuda} shows a sketch of the Ehrlich-Aberth method using CUDA
 \section{The EA algorithm on Multi-GPU}
 
 \subsection{MGPU (OpenMP-CUDA)approach}
+Before beginning the calculation, our implementation parallel with OpenMP and CUDA shares the input data between threads OpenMP, these input data sotn Z: the vector solution, P: the polynomial to solve,
+
+Before starting computations, our parallel implementation shared input data of the root finding polynomial between OpenMP threads. From Algorithm 1, the input data are the solution vector $Z$, the polynomial to solve $P$. Let number of OpenMP threads is equal to the number of GPUs, each threads OpenMP ( T-omp) checks one GPU,  and control a part of the shared memory, that is a part of the vector Z  like: $(n/Nbr_gpu)$ roots, n: the polynomial's degrees, $Nbr_gpu$ the number of GPUs. Then every GPU will have a grid of computation organized with its performances and the size of data of which it checks. In principle a grid is set by two parameter DimGrid, the number of block per grid, DimBloc: the number of threads per block. The following schema  shows the architecture of (CUDA,OpenMP).
+
+
+
+Each thread OpenMP compute the kernels on GPUs,than after each iteration they copy out the data from GPU memory to CPU shared memory. The kernels are re-runs is up to the roots converge sufficiently. Here are below the corresponding algorithm:
+\begin{enumerate}
+\begin{algorithm}[htpb]
+\label{alg2-cuda}
+%\LinesNumbered
+\caption{CUDA OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
+
+\KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
+  threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degrees), $\Delta z_{max}$ (Maximum value of stop condition)}
+
+\KwOut {$Z$ (Solution root's vector), $ZPrec$ (Previous solution root's vector)}
+
+\BlankLine
+// selection du GPU\;
+\item cudaSetDevice(i)\;
+// allocations memoire\;
+\verb= #pragma omp single=
+\item hostAlloc(P,Pu,Z)\;
+\verb= #pragma omp parallel shared(Z,âzmax,P)=
+\item deviceAlloc(dP,dPu,dZ)\;
+\verb= #pragma omp barrier=
+// transfers CPU-GPU and compute GPU\;
+\item copyH2D(P,dP)\;
+\item copyH2D(Pu,dPu)\;
+\item copyH2D(Zi,dZi)\;
+\While {$\Delta z_{max} > \epsilon$}{
+\item Let $\Delta z_{max}=0$\;
+\item $ kernel\_save(ZPrec,Z)$\;
+\item  k=k+1\;
+//each GPU i  compute the new root for his part dZi
+\item $ kernel\_update(dZi,P,Pu)$\;
+\item $kernel\_testConverge(\Delta z_{max},dZi,ZPrec)$\;
+}
+\item copyD2H(dZ,Zi)\;
+ // fin omp parallel\;
+
+\end{algorithm}
+\end{enumerate}
+~\\ 
+
+
 \subsection{MGPU (MPI-CUDA)approach}
 
 \section{experiments}
 
 \begin{figure}[htbp]
 \centering
-  \includegraphics[angle=-90,width=0.8\textwidth]{GPU_openmp}
-\caption{Execution times in seconds of the Ehrlich-Aberth method on GPUs using shared memory paradigm with OpenMP}
+  \includegraphics[angle=-90,width=0.5\textwidth]{Sparse_openmp}
+\caption{Execution times in seconds of the Ehrlich-Aberth method for solving sparse polynomials on GPUs using shared memory paradigm with OpenMP}
 \label{fig:01}
 \end{figure}
 
 \begin{figure}[htbp]
 \centering
-  \includegraphics[angle=-90,width=0.8\textwidth]{GPU_mpi}
-\caption{Execution times in seconds of the Ehrlich-Aberth method on GPUs using distributed memory paradigm with MPI}
+  \includegraphics[angle=-90,width=0.5\textwidth]{Sparse_mpi}
+\caption{Execution times in seconds of the Ehrlich-Aberth method for solving sparse polynomials on GPUs using distributed memory paradigm with MPI}
 \label{fig:02}
 \end{figure}
 
+\begin{figure}[htbp]
+\centering
+  \includegraphics[angle=-90,width=0.5\textwidth]{Full_openmp}
+\caption{Execution times in seconds of the Ehrlich-Aberth method for solving full polynomials on GPUs using shared memory paradigm with OpenMP}
+\label{fig:03}
+\end{figure}
+
+\begin{figure}[htbp]
+\centering
+  \includegraphics[angle=-90,width=0.5\textwidth]{Full_mpi}
+\caption{Execution times in seconds of the Ehrlich-Aberth method for full polynomials on GPUs using distributed memory paradigm with MPI}
+\label{fig:04}
+\end{figure}
+
+\begin{figure}[htbp]
+\centering
+  \includegraphics[angle=-90,width=0.5\textwidth]{Sparse_mpivsomp}
+\caption{Comparaison between MPI and OpenMP versions of the Ehrlich-Aberth method for solving sparse plynomials on GPUs}
+\label{fig:05}
+\end{figure}
+
+\begin{figure}[htbp]
+\centering
+  \includegraphics[angle=-90,width=0.5\textwidth]{Full_mpivsomp}
+\caption{Comparaison between MPI and OpenMP versions of the Ehrlich-Aberth method for solving full polynomials on GPUs}
+\label{fig:06}
+\end{figure}
+
+\begin{figure}[htbp]
+\centering
+  \includegraphics[angle=-90,width=0.5\textwidth]{MPI_mpivsomp}
+\caption{Comparaison of execution times of the Ehrlich-Aberth method for solving sparse and full polynomials on GPUs with distributed memory paradigm using MPI}
+\label{fig:07}
+\end{figure}
+
+\begin{figure}[htbp]
+\centering
+  \includegraphics[angle=-90,width=0.5\textwidth]{OMP_mpivsomp}
+\caption{Comparaison of execution times of the Ehrlich-Aberth method for solving sparse and full polynomials on GPUs with shared memory paradigm using OpenMP}
+\label{fig:08}
+\end{figure}
+
 % An example of a floating figure using the graphicx package.
 % Note that \label must occur AFTER (or within) \caption.
 % For figures, \caption should occur after the \includegraphics.