the EA implementation on (CUDA, OpenMP)

[kahina_paper2.git] / paper.tex
diff --git a/paper.tex b/paper.tex

index 7bba549b888ad58b61958c8307169a1cec326f38..88d882317d55082178510383bde12d407168e403 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -565,6 +565,51 @@ Algorithm~\ref{alg2-cuda} shows a sketch of the Ehrlich-Aberth method using CUDA
  \section{The EA algorithm on Multi-GPU}
  
  \subsection{MGPU (OpenMP-CUDA)approach}
  \section{The EA algorithm on Multi-GPU}
  
  \subsection{MGPU (OpenMP-CUDA)approach}
+Before starting computations, our parallel implementation shared input data of the root finding polynomial between OpenMP threads. From Algorithm 1, the input data are the solution vector $Z$, the polynomial to solve $P$. Let number of OpenMP threads is equal to the number of GPUs, each threads OpenMP ( T-omp) checks one GPU,  and control a part of the shared memory, that is a part of the vector Z  like: $(n/Nbr_gpu)$ roots, n: the polynomial's degrees, $Nbr_gpu$ the number of GPUs. Then every GPU will have a grid of computation organized with its performances and the size of data of which it checks. In principle a grid is set by two parameter DimGrid, the number of block per grid, DimBloc: the number of threads per block. The following schema  shows the architecture of (CUDA,OpenMP).
+
+
+
+Each thread OpenMP compute the kernels on GPUs,than after each iteration they copy out the data from GPU memory to CPU shared memory. The kernels are re-runs is up to the roots converge sufficiently. Here are below the corresponding algorithm:
+\begin{enumerate}
+\begin{algorithm}[htpb]
+\label{alg2-cuda}
+%\LinesNumbered
+\caption{CUDA OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
+
+\KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
+  threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degrees), $\Delta z_{max}$ (Maximum value of stop condition)}
+
+\KwOut {$Z$ (Solution root's vector), $ZPrec$ (Previous solution root's vector)}
+
+\BlankLine
+// selection du GPU\;
+\item cudaSetDevice(i)\;
+// allocations memoire\;
+\verb= #pragma omp single=
+\item hostAlloc(P,Pu,Z)\;
+\verb= #pragma omp parallel shared(Z,∆zmax,P)=
+\item deviceAlloc(dP,dPu,dZ)\;
+\verb= #pragma omp barrier=
+// transfers CPU-GPU and compute GPU\;
+\item copyH2D(P,dP)\;
+\item copyH2D(Pu,dPu)\;
+\item copyH2D(Zi,dZi)\;
+\While {$\Delta z_{max} > \epsilon$}{
+\item Let $\Delta z_{max}=0$\;
+\item $ kernel\_save(ZPrec,Z)$\;
+\item  k=k+1\;
+//each GPU i  compute the new root for his part dZi
+\item $ kernel\_update(dZi,P,Pu)$\;
+\item $kernel\_testConverge(\Delta z_{max},dZi,ZPrec)$\;
+}
+\item copyD2H(dZ,Zi)\;
+ // fin omp parallel\;
+
+\end{algorithm}
+\end{enumerate}
+~\\ 
+
+
  \subsection{MGPU (MPI-CUDA)approach}
  
  \section{experiments}
  \subsection{MGPU (MPI-CUDA)approach}
  
  \section{experiments}
@@ -611,6 +656,20 @@ Algorithm~\ref{alg2-cuda} shows a sketch of the Ehrlich-Aberth method using CUDA
  \label{fig:06}
  \end{figure}
  
  \label{fig:06}
  \end{figure}
  
+\begin{figure}[htbp]
+\centering
+  \includegraphics[angle=-90,width=0.5\textwidth]{MPI_mpivsomp}
+\caption{Comparaison of execution times of the Ehrlich-Aberth method for solving sparse and full polynomials on GPUs with distributed memory paradigm using MPI}
+\label{fig:07}
+\end{figure}
+
+\begin{figure}[htbp]
+\centering
+  \includegraphics[angle=-90,width=0.5\textwidth]{OMP_mpivsomp}
+\caption{Comparaison of execution times of the Ehrlich-Aberth method for solving sparse and full polynomials on GPUs with shared memory paradigm using OpenMP}
+\label{fig:08}
+\end{figure}
+
  % An example of a floating figure using the graphicx package.
  % Note that \label must occur AFTER (or within) \caption.
  % For figures, \caption should occur after the \includegraphics.
  % An example of a floating figure using the graphicx package.
  % Note that \label must occur AFTER (or within) \caption.
  % For figures, \caption should occur after the \includegraphics.