MAJ

author Kahina <kahina@kahina-VPCEH3K1E.(none)>

Mon, 21 Dec 2015 06:21:14 +0000 (07:21 +0100)

committer Kahina <kahina@kahina-VPCEH3K1E.(none)>

Mon, 21 Dec 2015 06:21:14 +0000 (07:21 +0100)
author Kahina <kahina@kahina-VPCEH3K1E.(none)>
Mon, 21 Dec 2015 06:21:14 +0000 (07:21 +0100)
committer Kahina <kahina@kahina-VPCEH3K1E.(none)>
Mon, 21 Dec 2015 06:21:14 +0000 (07:21 +0100)
diff --git a/paper.tex b/paper.tex

index 88d882317d55082178510383bde12d407168e403..8d0dc139fe614160c642fd590bdf67254aee59ca 100644 (file)
--- a/paper.tex
+++ b/paper.tex
@@ -492,7 +492,14 @@ We introduced three paradigms of parallel programming. Our objective consist to
  
  \section{The EA algorithm on single GPU}
  \subsection{the EA method}
  
  \section{The EA algorithm on single GPU}
  \subsection{the EA method}
-the Ehrlich-Aberth method is an iterative  method , contain 4 steps, start from the initial approximations of all the
+%\begin{figure}[htbp]
+%\centering
+ % \includegraphics[angle=-90,width=0.5\textwidth]{EA-Algorithm}
+%\caption{The Ehrlich-Aberth algorithm on single GPU}
+%\label{fig:03}
+%\end{figure}
+
+the Ehrlich-Aberth method is an iterative  method, contain 4 steps, start from the initial approximations of all the
  roots of the polynomial,the second step initialize the solution vector $Z$ using the Guggenheimer method to assure the distinction of the initial vector roots, than in step 3 we apply the the iterative function based on the Newton's method and Weiestrass operator[...,...], wich will make it possible to converge to the roots solution, provided that all the root are different. At the end of each application of the iterative function, a stop condition is verified consists in stopping the iterative process when the whole of the modules of the roots
  are lower than a fixed value $ε$ 
  \subsection{EA parallel implementation on CUDA}
  roots of the polynomial,the second step initialize the solution vector $Z$ using the Guggenheimer method to assure the distinction of the initial vector roots, than in step 3 we apply the the iterative function based on the Newton's method and Weiestrass operator[...,...], wich will make it possible to converge to the roots solution, provided that all the root are different. At the end of each application of the iterative function, a stop condition is verified consists in stopping the iterative process when the whole of the modules of the roots
  are lower than a fixed value $ε$ 
  \subsection{EA parallel implementation on CUDA}
@@ -567,6 +574,12 @@ Algorithm~\ref{alg2-cuda} shows a sketch of the Ehrlich-Aberth method using CUDA
  \subsection{MGPU (OpenMP-CUDA)approach}
  Before starting computations, our parallel implementation shared input data of the root finding polynomial between OpenMP threads. From Algorithm 1, the input data are the solution vector $Z$, the polynomial to solve $P$. Let number of OpenMP threads is equal to the number of GPUs, each threads OpenMP ( T-omp) checks one GPU,  and control a part of the shared memory, that is a part of the vector Z  like: $(n/Nbr_gpu)$ roots, n: the polynomial's degrees, $Nbr_gpu$ the number of GPUs. Then every GPU will have a grid of computation organized with its performances and the size of data of which it checks. In principle a grid is set by two parameter DimGrid, the number of block per grid, DimBloc: the number of threads per block. The following schema  shows the architecture of (CUDA,OpenMP).
  
  \subsection{MGPU (OpenMP-CUDA)approach}
  Before starting computations, our parallel implementation shared input data of the root finding polynomial between OpenMP threads. From Algorithm 1, the input data are the solution vector $Z$, the polynomial to solve $P$. Let number of OpenMP threads is equal to the number of GPUs, each threads OpenMP ( T-omp) checks one GPU,  and control a part of the shared memory, that is a part of the vector Z  like: $(n/Nbr_gpu)$ roots, n: the polynomial's degrees, $Nbr_gpu$ the number of GPUs. Then every GPU will have a grid of computation organized with its performances and the size of data of which it checks. In principle a grid is set by two parameter DimGrid, the number of block per grid, DimBloc: the number of threads per block. The following schema  shows the architecture of (CUDA,OpenMP).
  
+%\begin{figure}[htbp]
+%\centering
+ % \includegraphics[angle=-90,width=0.5\textwidth]{OpenMP-CUDA}
+%\caption{The OpenMP-CUDA architecture}
+%\label{fig:03}
+%\end{figure}
  
  
  Each thread OpenMP compute the kernels on GPUs,than after each iteration they copy out the data from GPU memory to CPU shared memory. The kernels are re-runs is up to the roots converge sufficiently. Here are below the corresponding algorithm:
  
  
  Each thread OpenMP compute the kernels on GPUs,than after each iteration they copy out the data from GPU memory to CPU shared memory. The kernels are re-runs is up to the roots converge sufficiently. Here are below the corresponding algorithm:
@@ -574,43 +587,83 @@ Each thread OpenMP compute the kernels on GPUs,than after each iteration they co
  \begin{algorithm}[htpb]
  \label{alg2-cuda}
  %\LinesNumbered
  \begin{algorithm}[htpb]
  \label{alg2-cuda}
  %\LinesNumbered
-\caption{CUDA OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
+\caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
  
  \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
  
  \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
-  threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degrees), $\Delta z_{max}$ (Maximum value of stop condition)}
+  threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degrees), $\Delta z$ ( Vector of errors of stop condition), $num_gpus$ (number of OpenMP threads/ number of GPUs), Size (number of roots)}
  
  \KwOut {$Z$ (Solution root's vector), $ZPrec$ (Previous solution root's vector)}
  
  \BlankLine
  
  \KwOut {$Z$ (Solution root's vector), $ZPrec$ (Previous solution root's vector)}
  
  \BlankLine
-// selection du GPU\;
-\item cudaSetDevice(i)\;
-// allocations memoire\;
-\verb= #pragma omp single=
-\item hostAlloc(P,Pu,Z)\;
-\verb= #pragma omp parallel shared(Z,∆zmax,P)=
-\item deviceAlloc(dP,dPu,dZ)\;
-\verb= #pragma omp barrier=
-// transfers CPU-GPU and compute GPU\;
-\item copyH2D(P,dP)\;
-\item copyH2D(Pu,dPu)\;
-\item copyH2D(Zi,dZi)\;
-\While {$\Delta z_{max} > \epsilon$}{
-\item Let $\Delta z_{max}=0$\;
+
+\item Initialization of the of P\;
+\item Initialization of the of Pu\;
+\item Initialization of the solution vector $Z^{0}$\;
+\verb=omp_set_num_threads(num_gpus);=
+\verb=cudaGetDevice(gpu_id);=
+\verb=#pragma omp parallel shared(Z,$\Delta$ z,P);=
+\item Allocate and copy initial data from CPU memory to the GPU global memories\;
+\item index= $Size/num\_gpus$\;
+\item k=0\;
+\While {$error > \epsilon$}{
+\item Let $\Delta z=0$\;
  \item $ kernel\_save(ZPrec,Z)$\;
  \item  k=k+1\;
  \item $ kernel\_save(ZPrec,Z)$\;
  \item  k=k+1\;
-//each GPU i  compute the new root for his part dZi
-\item $ kernel\_update(dZi,P,Pu)$\;
-\item $kernel\_testConverge(\Delta z_{max},dZi,ZPrec)$\;
+\item $ kernel\_update(Z,P,Pu,index)$\;
+\item $kernel\_testConverge(\Delta z[gpu\_id],Z,ZPrec)$\;
+%\verb=#pragma omp barrier;=
+\item error= Max($\Delta z$)\;
  }
  }
-\item copyD2H(dZ,Zi)\;
- // fin omp parallel\;
  
  
+\item Copy results from GPU memories to CPU memory\;
  \end{algorithm}
  \end{enumerate}
  ~\\ 
  
  
  \end{algorithm}
  \end{enumerate}
  ~\\ 
  
  
-\subsection{MGPU (MPI-CUDA)approach}
+
+\subsection{Multi-GPU (MPI-CUDA)approach}
+%\begin{figure}[htbp]
+%\centering
+ % \includegraphics[angle=-90,width=0.2\textwidth]{MPI-CUDA}
+%\caption{The MPI-CUDA architecture }
+%\label{fig:03}
+%\end{figure}
+
+
+\begin{enumerate}
+\begin{algorithm}[htpb]
+\label{alg2-cuda}
+%\LinesNumbered
+\caption{CUDA-MPI Algorithm to find roots with the Ehrlich-Aberth method}
+
+\KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
+  threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degrees), $\Delta z$ ( error of stop condition), $num_gpus$ (number of MPI processes/ number of GPUs), Size (number of roots)}
+
+\KwOut {$Z$ (Solution root's vector), $ZPrec$ (Previous solution root's vector)}
+
+\BlankLine
+\item Initialization of the P\;
+\item Initialization of the Pu\;
+\item Initialization of the solution vector $Z^{0}$\;
+\item Allocate and copy initial data from CPU memories to the GPU global memories\;
+\item $index= Size/num_gpus$\;
+\item k=0\;
+\While {$error > \epsilon$}{
+\item Let $\Delta z=0$\;
+\item $ kernel\_save(ZPrec,Z)$\;
+\item  k=k+1\;
+\item $ kernel\_update(Z,P,Pu,index)$\;
+\item $kernel\_testConverge(\Delta z,Z,ZPrec)$\;
+\item Copy results from GPU memories to CPU memories\;
+\item Send $Z[id]$ to all neighboring processes\;
+\item Receive $Z[j]$ from neighboring process j\;
+\item ComputeMaxError($\Delta z$,error)\;
+
+}
+\end{algorithm}
+\end{enumerate}
+~\\ 
  
  \section{experiments}
  
  
  \section{experiments}
author	Kahina <kahina@kahina-VPCEH3K1E.(none)>
	Mon, 21 Dec 2015 06:21:14 +0000 (07:21 +0100)
committer	Kahina <kahina@kahina-VPCEH3K1E.(none)>
	Mon, 21 Dec 2015 06:21:14 +0000 (07:21 +0100)