X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/kahina_paper2.git/blobdiff_plain/1e4a27cd6dd739a4ee1c228d704a036ac34609c8..329c650efca3cd8d8d10932efa575b528cf2d8bf:/paper.tex

diff --git a/paper.tex b/paper.tex
index a236a5a..e826d2d 100644
--- a/paper.tex
+++ b/paper.tex
@@ -775,6 +775,7 @@ implementation of Ehrlich-Aberth method.
 \begin{algorithm}[htpb]
 \label{alg1-cuda}
 \LinesNumbered
+\SetAlgoNoLine
 \caption{CUDA Algorithm to find roots with the Ehrlich-Aberth method}
 
 \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
@@ -797,11 +798,6 @@ Allocate and copy initial data to the GPU global memory\;
 Copy results from GPU memory to CPU memory\;
 \end{algorithm}
 
-~\\ 
-
-\RC{Au final, on laisse ce code, on l'explique, si c'est kahina qui
-  rajoute l'explication, il faut absolument ajouter \KG{dfsdfsd}, car
-  l'anglais sera Ã  relire et je ne veux pas tout relire... }
  
 \section{The EA algorithm on Multiple GPUs}
 \label{sec4}
@@ -858,43 +854,40 @@ shared memory arrays containing all the roots.
 %% roots sufficiently converge.
 
 
-%% \begin{enumerate}
-%% \begin{algorithm}[htpb]
-%% \label{alg2-cuda-openmp}
-%% %\LinesNumbered
-%% \caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
+\begin{algorithm}[htpb]
+\label{alg2-cuda-openmp}
+\LinesNumbered
+\SetAlgoNoLine
+\caption{CUDA-OpenMP Algorithm to find roots with the Ehrlich-Aberth method}
 
-%% \KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
-%%   threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
+\KwIn{$Z^{0}$ (Initial root's vector), $\varepsilon$ (Error tolerance
+  threshold), P (Polynomial to solve), Pu (Derivative of P), $n$ (Polynomial degree), $\Delta z$ ( Vector of errors for stop condition), $num\_gpus$ (number of OpenMP threads/ Number of GPUs), $Size$ (number of roots)}
 
-%% \KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
+\KwOut {$Z$ ( Root's vector), $ZPrec$ (Previous root's vector)}
 
-%% \BlankLine
+\BlankLine
 
-%% \item Initialization of P\;
-%% \item Initialization of Pu\;
-%% \item Initialization of the solution vector $Z^{0}$\;
-%% \verb=omp_set_num_threads(num_gpus);=
-%% \verb=#pragma omp parallel shared(Z,$\Delta$ z,P);=
-%% \verb=cudaGetDevice(gpu_id);=
-%% \item Allocate and copy initial data from CPU memory to the GPU global memories\;
-%% \item index= $Size/num\_gpus$\;
-%% \item k=0\;
-%% \While {$error > \epsilon$}{
-%% \item Let $\Delta z=0$\;
-%% \item $ kernel\_save(ZPrec,Z)$\;
-%% \item  k=k+1\;
-%% \item $ kernel\_update(Z,P,Pu,index)$\;
-%% \item $kernel\_testConverge(\Delta z[gpu\_id],Z,ZPrec)$\;
-%% %\verb=#pragma omp barrier;=
-%% \item error= Max($\Delta z$)\;
-%% }
+Initialization of P\;
+Initialization of Pu\;
+Initialization of the solution vector $Z^{0}$\;
+omp\_set\_num\_threads(num\_gpus)\;
+\#pragma omp parallel shared(Z,$\Delta$ z,P)\;
+\Indp
+{
+gpu\_id=cudaGetDevice()\;
+Allocate memory on GPU\;
+Compute local size and offet according to gpu\_id\;
+\While {$error > \epsilon$}{
+  copy Z from CPU to GPU\;
+$ ZPrec_{loc}=kernel\_save(Z_{loc})$\;
+$ Z_{loc}=kernel\_update(Z,P,Pu)$\;
+$\Delta z[gpu\_id] = kernel\_testConv(Z_{loc},ZPrec_{loc})$\;
+$  error= Max(\Delta z)$\;
+  copy $Z_{loc}$ from GPU to Z in CPU
+}
+\Indm}
+\end{algorithm}
 
-%% \item Copy results from GPU memories to CPU memory\;
-%% \end{algorithm}
-%% \end{enumerate}
-%% ~\\ 
-%% \RC{C'est encore pire ici, on ne voit pas les comm CPU <-> GPU }
 
 
 \subsection{Multi-GPU : an MPI-CUDA approach}