the EA implementation on (CUDA, OpenMP)

[kahina_paper2.git] / paper.tex
diff --git a/paper.tex b/paper.tex

index 9d151a778728c1579d564d66c6d38d2a0a096f5c..88d882317d55082178510383bde12d407168e403 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -530,7 +530,7 @@ Algorithm 1 shows the GPU parallel implementation of Ehrlich-Aberth method.
  Algorithm~\ref{alg2-cuda} shows a sketch of the Ehrlich-Aberth method using CUDA.
  
  \begin{enumerate}
  Algorithm~\ref{alg2-cuda} shows a sketch of the Ehrlich-Aberth method using CUDA.
  
  \begin{enumerate}
-\begin{algorithm}[H]
+\begin{algorithm}[htpb]
  \label{alg2-cuda}
  %\LinesNumbered
  \caption{CUDA Algorithm to find roots with the Ehrlich-Aberth method}
  \label{alg2-cuda}
  %\LinesNumbered
  \caption{CUDA Algorithm to find roots with the Ehrlich-Aberth method}
@@ -565,25 +565,111 @@ Algorithm~\ref{alg2-cuda} shows a sketch of the Ehrlich-Aberth method using CUDA
  \section{The EA algorithm on Multi-GPU}
  
  \subsection{MGPU (OpenMP-CUDA)approach}
  \section{The EA algorithm on Multi-GPU}
  
  \subsection{MGPU (OpenMP-CUDA)approach}
-\subsection{MGPU (OpenMP-CUDA)approach}
+Before starting the computations, our parallel implementation shares the input data of the root-finding problem between the OpenMP threads. From Algorithm 1, the input data are the solution vector $Z$ and the polynomial to solve $P$. Let the number of OpenMP threads be equal to the number of GPUs; each OpenMP thread (T-omp) controls one GPU and manages a part of the shared memory, that is, a part of the vector $Z$ containing $n/Nbr_{gpu}$ roots, where $n$ is the polynomial's degree and $Nbr_{gpu}$ is the number of GPUs. Every GPU then has a computation grid organized according to its performance and the size of the data it handles. In principle, a grid is set by two parameters: DimGrid, the number of blocks per grid, and DimBloc, the number of threads per block. The following schema shows the architecture of (CUDA, OpenMP).
+
+
+
+Each OpenMP thread launches the kernels on its GPU; then, after each iteration, the threads copy the data from GPU memory back to the CPU shared memory. The kernels are re-run until the roots converge sufficiently. The corresponding algorithm is given below:
+\begin{enumerate}
+\begin{algorithm}[htpb]
+%\LinesNumbered
+\caption{CUDA OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
+\label{alg3-cuda-openmp}
+
+\KwIn{$Z^{0}$ (Initial vector of roots), $\varepsilon$ (Error tolerance
+  threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z_{max}$ (Maximum value of stop condition)}
+
+\KwOut {$Z$ (Solution vector of roots), $ZPrec$ (Previous solution vector of roots)}
+
+\BlankLine
+// GPU selection\;
+\item cudaSetDevice(i)\;
+// memory allocations\;
+\verb= #pragma omp single=
+\item hostAlloc(P,Pu,Z)\;
+\verb= #pragma omp parallel shared(Z,∆zmax,P)=
+\item deviceAlloc(dP,dPu,dZ)\;
+\verb= #pragma omp barrier=
+// CPU-GPU transfers and GPU computation\;
+\item copyH2D(P,dP)\;
+\item copyH2D(Pu,dPu)\;
+\item copyH2D(Zi,dZi)\;
+\While {$\Delta z_{max} > \varepsilon$}{
+\item Let $\Delta z_{max}=0$\;
+\item $ kernel\_save(ZPrec,Z)$\;
+\item  k=k+1\;
+// each GPU i computes the new roots for its part dZi
+\item $ kernel\_update(dZi,P,Pu)$\;
+\item $kernel\_testConverge(\Delta z_{max},dZi,ZPrec)$\;
+}
+\item copyD2H(dZ,Zi)\;
+ // end of omp parallel\;
+
+\end{algorithm}
+\end{enumerate}
+~\\ 
+
+
  \subsection{MGPU (MPI-CUDA)approach}
  \subsection{MGPU (MPI-CUDA)approach}
+
  \section{experiments}
  \section{experiments}
+
  \begin{figure}[htbp]
  \centering
  \begin{figure}[htbp]
  \centering
-  \includegraphics[angle=-90,width=0.8\textwidth]{GPU_openmp}
-\caption{Execution times in seconds of the Ehrlich-Aberth method on GPUs using shared memory paradigm with OpenMP}
+  \includegraphics[angle=-90,width=0.5\textwidth]{Sparse_openmp}
+\caption{Execution times in seconds of the Ehrlich-Aberth method for solving sparse polynomials on GPUs using shared memory paradigm with OpenMP}
  \label{fig:01}
  \end{figure}
  
  \begin{figure}[htbp]
  \centering
  \label{fig:01}
  \end{figure}
  
  \begin{figure}[htbp]
  \centering
-  \includegraphics[angle=-90,width=0.8\textwidth]{GPU_mpi}
-\caption{Execution times in seconds of the Ehrlich-Aberth method on GPUs using distributed memory paradigm with MPI}
+  \includegraphics[angle=-90,width=0.5\textwidth]{Sparse_mpi}
+\caption{Execution times in seconds of the Ehrlich-Aberth method for solving sparse polynomials on GPUs using distributed memory paradigm with MPI}
  \label{fig:02}
  \end{figure}
  \label{fig:02}
  \end{figure}
-\subsection{MGPU (MPI-CUDA)approach}
  
  
-\section{experiments}
+\begin{figure}[htbp]
+\centering
+  \includegraphics[angle=-90,width=0.5\textwidth]{Full_openmp}
+\caption{Execution times in seconds of the Ehrlich-Aberth method for solving full polynomials on GPUs using shared memory paradigm with OpenMP}
+\label{fig:03}
+\end{figure}
+
+\begin{figure}[htbp]
+\centering
+  \includegraphics[angle=-90,width=0.5\textwidth]{Full_mpi}
+\caption{Execution times in seconds of the Ehrlich-Aberth method for solving full polynomials on GPUs using distributed memory paradigm with MPI}
+\label{fig:04}
+\end{figure}
+
+\begin{figure}[htbp]
+\centering
+  \includegraphics[angle=-90,width=0.5\textwidth]{Sparse_mpivsomp}
+\caption{Comparison between MPI and OpenMP versions of the Ehrlich-Aberth method for solving sparse polynomials on GPUs}
+\label{fig:05}
+\end{figure}
+
+\begin{figure}[htbp]
+\centering
+  \includegraphics[angle=-90,width=0.5\textwidth]{Full_mpivsomp}
+\caption{Comparison between MPI and OpenMP versions of the Ehrlich-Aberth method for solving full polynomials on GPUs}
+\label{fig:06}
+\end{figure}
+
+\begin{figure}[htbp]
+\centering
+  \includegraphics[angle=-90,width=0.5\textwidth]{MPI_mpivsomp}
+\caption{Comparison of execution times of the Ehrlich-Aberth method for solving sparse and full polynomials on GPUs with distributed memory paradigm using MPI}
+\label{fig:07}
+\end{figure}
+
+\begin{figure}[htbp]
+\centering
+  \includegraphics[angle=-90,width=0.5\textwidth]{OMP_mpivsomp}
+\caption{Comparison of execution times of the Ehrlich-Aberth method for solving sparse and full polynomials on GPUs with shared memory paradigm using OpenMP}
+\label{fig:08}
+\end{figure}
+
  % An example of a floating figure using the graphicx package.
  % Note that \label must occur AFTER (or within) \caption.
  % For figures, \caption should occur after the \includegraphics.
  % An example of a floating figure using the graphicx package.
  % Note that \label must occur AFTER (or within) \caption.
  % For figures, \caption should occur after the \includegraphics.