From: couturie Date: Tue, 4 Dec 2012 19:49:46 +0000 (+0100) Subject: ch15 X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/3c87d3518228d3b7bcfbd0a08cb79682f05132ee ch15 --- diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index e2a1c03..7936982 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -17,7 +17,10 @@ \usepackage{algorithmicx} \usepackage[lined,boxed,commentsnumbered]{algorithm2e} \usepackage{epstopdf} - +\usepackage{url} +\usepackage{multirow} +\usepackage{layouts} +\usepackage{comment} \frenchspacing \tolerance=5000 @@ -114,6 +117,7 @@ \include{Chapters/chapter8/ch8} \include{Chapters/chapter11/ch11} \include{Chapters/chapter14/ch14} +\include{Chapters/chapter15/ch15} \bibliographystyle{hep} %%%\bibliography{biblio} diff --git a/BookGPU/Chapters/chapter15/biblio.bib b/BookGPU/Chapters/chapter15/biblio.bib new file mode 100644 index 0000000..d0bb53f --- /dev/null +++ b/BookGPU/Chapters/chapter15/biblio.bib @@ -0,0 +1,167 @@ +@Inproceedings{PF_PDSEC2011, + author ={P. Fortin and R. Habel and F.~J\'ez\'equel and J.-L. Lamotte and N.S. Scott}, + title = {Deployment on GPUs of an application in computational atomic physics}, + booktitle = {{12th IEEE International Workshop on Parallel and Distributed Scientific +and Engineering Computing (PDSEC) in conjunction with the 25th International Parallel and Distributed Processing Symposium (IPDPS)}}, +year = 2011} +%note={8 pages}} +@Comment address= {{Anchorage, Alaska, USA}}, +@Comment month = may, + + +@article{Burke_1987, +author={P.G. Burke and C.J. Noble and M.P. Scott}, +title={{R-matrix theory of electron scattering at intermediate energies}}, +journal={Proceedings of the Royal Society of London A}, +volume=410, +year=1987, +pages={287--310} + } +% %Proc. Roy. Soc. A } + +@Article{2DRMP, +author ={N.S. Scott and M.P. Scott and P.G. Burke and T. Stitt and V. Faro-Maza and C. Denis and A. Maniopoulou}, +title ={{2DRMP: A~suite of two-dimensional R-matrix propagation codes}}, +journal ={Computer Physics Communications}, +volume={180}, +year =2009, +pages={2424--2449} +} +%note={ISSN: {0010-4655}, doi:10.1016/j.cpc.2009.07.017} + +@Article{FARM_2DRMP, +author ={ V.M. Burke and C.J. Noble and V. Faro-Maza and A. Maniopoulou and N.S. Scott}, +title ={ {FARM\_2DRMP: a version of FARM for use with 2DRMP}}, +journal ={Computer Physics Communications}, +volume={180}, +year =2009, +pages={2450--2451} +} +% note={ISSN: {0010-4655}, doi:10.1016/j.cpc.2009.07.017} + +@INPROCEEDINGS{VECPAR, + author = {T. Stitt and + N.S. Scott and + M.P. Scott and + P.G. Burke}, + title = {{2-D R-Matrix Propagation: A Large Scale Electron Scattering + Simulation Dominated by the Multiplication of Dynamically + Changing Matrices}}, + booktitle = {Proc. of VECPAR'02}, + year = {2002}, + pages = {354-367} +} +% month={June}, +%, +% ee = {http://link.springer.de/link/service/series/0558/bibs/2565/25650354.htm} +%} +%, +% crossref = {DBLP:conf/vecpar/2002}, +% bibsource = {DBLP, http://dblp.uni-trier.de} +% } + +@Misc{CPC, + title= {{CPC}: {C}omputer {P}hysics {C}ommunications}, + howpublished = {Queen's University, Belfast}, + note = {\url{http://cpc.cs.qub.ac.uk/}}, + key={CPC} +} + + + +@Misc{CUBLAS, + title={{NVIDIA} {CUDA} {T}oolkit 4.1, {CUBLAS} {L}ibrary}, + month = {January}, + year = {2012}, + key={NVIDIA CUBLAS} +} +@Comment howpublished = {available at: \url{http://docs.nvidia.com/cuda/cublas}}, + + + + +@Misc{CUDA_ProgGuide, + title={{NVIDIA} {CUDA} {C} {P}rogramming {G}uide, version 4.1}, + month = {November}, + year = {2011}, + key={NVIDIA CUDA} +} +@Comment howpublished = {available at: \url{http://docs.nvidia.com/cuda/cuda-c-programming-guide}}, + + + +@Misc{CUDA_ProgGuide_3.2, + title={{NVIDIA} {CUDA} {C} {P}rogramming {G}uide}, + year={2010}, + howpublished = {version 3.2, available at: \url{www.nvidia.com/object/cuda_get.html}} +} + + +@Misc{CUDA_stream, + title={{CUDA C/C++ Streams and Concurrency, NVIDIA}}, + author={S. Rennich}, + month={January}, + year={2012}, +} + + + + +@Misc{GOTO, + title= { Goto\uppercase{BLAS}}, + howpublished = { \url{http://www.tacc.utexas.edu/tacc-projects/gotoblas2}} +} + +@Misc{MAGMA, + title={{MAGMA (Matrix Algebra on GPU and Multicore Architectures)}}, + howpublished={Available at: \url{http://icl.cs.utk.edu/magma}}, +key={MAGMA} +} + +@Misc{MAGMA_0.2_UserGuide, + title={{MAGMA (Matrix Algebra on GPU and Multicore Architectures) version 0.2 Users' Guide}}, + author={S. Tomov and R. Nath and P. Du and J. Dongarra}, + howpublished={available at: \url{http://icl.cs.utk.edu/magma}} +} + + +@INPROCEEDINGS{NTD10a, + author = {R. Nath and S. Tomov and J. Dongarra}, + title = {{Accelerating GPU Kernels for Dense Linear Algebra}}, + booktitle={Proc. of VECPAR'10}, + year={2010} +} +% month={June}, + + +@article{NTD10b, + author = {Nath, R. and Tomov, S. and Dongarra, J.}, + title = {{An Improved Magma Gemm For Fermi Graphics Processing Units}}, + journal = {International Journal of High Performance Computing Applications}, + volume = {24}, + number = {4}, + year = {2010}, + pages = {511--515}, +} + @Comment issue_date = {November 2010}, + @Comment publisher = {Sage Publications, Inc.}, + @Comment address = {Thousand Oaks, CA, USA}, + @Comment keywords = {CUDA matrix mutiply, Fermi, GPU BLAS, dense linear algebra, hybrid computing}, + @Comment numpages = {5}, + @Comment url = {http://dx.doi.org/10.1177/1094342010385729}, + @Comment doi = {10.1177/1094342010385729}, + @Comment acmid = {1889710}, + @Comment issn = {1094-3420}, + @Comment month = nov, + + +@Misc{CUBLAS3.1, +title= {{CUDA CUBLAS Library 3.1}}, + howpublished = { \url{http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/CUBLAS_Library_3.1.pdf} } +} + +@Misc{CUBLAS3.2, +title= {{CUDA CUBLAS Library 3.2}}, + howpublished = { \url{http://developer.download.nvidia.com/compute/cuda/3_2_prod/toolkit/docs/CUBLAS_Library.pdf} } +} + diff --git a/BookGPU/Chapters/chapter15/ch15.tex b/BookGPU/Chapters/chapter15/ch15.tex new file mode 100644 index 0000000..1eca6ec --- /dev/null +++ b/BookGPU/Chapters/chapter15/ch15.tex @@ -0,0 +1,1168 @@ +\chapterauthor{Pierre Fortin}{Laboratoire d'Informatique de Paris 6, University Paris 6} +\chapterauthor{Rachid Habel}{T\'el\'ecom SudParis} +\chapterauthor{Fabienne J\'ez\'equel}{Laboratoire d'Informatique de Paris 6, University Paris 6} +\chapterauthor{Jean-Luc Lamotte}{Laboratoire d'Informatique de Paris 6, University Paris 6} +\chapterauthor{Stan Scott}{School of Electronics, Electrical Engineering \& Computer Science, +The Queen's University of Belfast} + +\newcommand{\fixme}[1]{{\bf #1}} + +\chapter{Numerical validation and performance optimization on GPUs of +an application in atomic physics} +\label{chapter15} + +\section{Introduction}\label{ch15:intro} +As described in Chapter~\ref{chapter1}, GPUs are characterized by hundreds +of cores and theoretically perform one order of magnitude better than CPUs. +An important factor to consider when programming on GPUs + is the cost of +data transfers between CPU memory and GPU memory. Thus, to have good +performance on +GPUs, applications should be coarse-grained and have a high arithmetic +intensity +($i.e.$ the ratio of arithmetic operations to memory operations). +Another important aspect of GPU programming is that floating-point +operations are preferably performed in single precision, if the + validity of results is not impacted by that format. +The GPU compute power for floating-point operations is indeed greater in +single precision than in double precision. +The peak performance ratio between single precision and double +precision varies for example for NVIDIA GPUs from $12$ for the first Tesla +GPUs (C1060), +to $2$ for the Fermi GPUs (C2050 and C2070) +and to $3$ for the latest Kepler architecture (K20/K20X). +As far as AMD GPUs are concerned, the latest AMD GPU (Tahiti HD 7970) +presents a ratio of $4$. +Moreover, GPU internal memory accesses and CPU-GPU data transfers are +faster in single precision than in double precision, +because of the different format lengths. + +This chapter describes the deployment on GPUs of PROP, a program of the +2DRMP~\cite{FARM_2DRMP,2DRMP} suite which models electron collisions +with H-like atoms and ions at intermediate energies. 2DRMP operates successfully on serial +computers, high performance clusters and supercomputers. The primary +purpose of the PROP program is to propagate a global +R-matrix~\cite{Burke_1987}, $\Re$, in the two-electron configuration +space. +The propagation needs to be performed for all collision energies, +for instance hundreds of energies, +which are independent. +Propagation equations are dominated by matrix multiplications involving sub-matrices of $\Re$. +However, the matrix multiplications are not +straightforward in the sense that $\Re$ dynamically changes the designation of its rows and +columns and increases in size as the propagation proceeds \cite{VECPAR}. + +In a preliminary investigation PROP was selected by GENCI\footnote{GENCI: Grand Equipement National + de Calcul Intensif, \url{www.genci.fr}} and +CAPS\footnote{CAPS is a software company providing products and solutions + for manycore application programming and deployment, + \url{www.caps-entreprise.com}}, +following their first call for projects in 2009-2010 +aimed at +deploying applications on hybrid systems based on GPUs. +First CAPS +recast the propagation equations with larger matrices. +For matrix products the GPU performance gain over CPU increases indeed +with the matrix size, since the +CPU-GPU transfer overhead becomes less significant and since CPUs are +still more efficient for fine computation grains. +Then, using HMPP\footnote{ +HMPP or {\em CAPS compiler}, see: \url{www.caps-entreprise.com/hmpp.html}}, +a commercial +hybrid and parallel compiler, CAPS +developed a version of PROP, in +which matrix multiplications are performed on +the GPU or the CPU, depending on the matrix size. +Unfortunately this partial GPU implementation of PROP does not offer +significant acceleration. + +The work described in this chapter, which is based on a study presented in \cite{PF_PDSEC2011}, aims at + improving PROP performance on +GPUs by exploring two directions. First, because the original version of PROP is written +in double precision, +we study the numerical stability of PROP in single precision. +Second, we deploy the whole +computation code of PROP on +GPUs to avoid the overhead generated by +data transfers +and we propose successive improvements +(including a specific one to the Fermi architecture) +in order to optimize the GPU code. + + + + +\section{2DRMP and the PROP program} +\label{s:2DRMP_PROP} +\subsection{Principles of R-matrix propagation} +2DRMP~\cite{FARM_2DRMP,2DRMP} is part of the CPC library\footnote{CPC: +Computer Physics Communications, +\url{http://cpc.cs.qub.ac.uk/}}. +It is a suite of seven +programs aimed at creating virtual experiments on high performance and grid +architectures to enable the study of electron scattering from H-like +atoms and ions at intermediate energies. The 2DRMP suite uses the +two-dimensional $R$-matrix propagation approach~\cite{Burke_1987}. +In 2DRMP the two-electron configuration space ($r_1$,$r_2$) is +divided into sectors. +Figure~\ref{prop} shows the division of the two-electron configuration +space ($r_1$,$r_2$) into 4 vertical $strips$ representing 10 $sectors$. +The key computation in 2DRMP, performed by the PROP program, is the +propagation of a global +$R$-matrix, $\Re$, from sector to sector across the internal region, as shown in Fig.~\ref{prop}. + +\begin{figure}[h] +\begin{center} +\includegraphics*[width=0.65\linewidth]{Chapters/chapter15/figures/prop.pdf} +\caption{\label{prop} Subdivision of the configuration space +($r_1$,$r_2$) into a set of connected sectors.} +\end{center} +\end{figure} + +\begin{figure}[h] +\begin{center} +\includegraphics*[width=0.8\linewidth]{Chapters/chapter15/figures/Domain.pdf} +\caption{\label{domain} Propagation of the R-matrix from domain D to domain D'.} +\end{center} +\end{figure} + +We consider the general situation in +Fig.~\ref{domain} where we assume that we already know +the global $R$-matrix, $\Re^{I}$, associated with the boundary defined +by edges 5, 2, 1 and 6 +in domain $D$ and we wish to +evaluate the new global $R$-matrix, $\Re^{O}$, associated with edges 5, 3, 4 and 6 +in domain $D'$ following propagation across subregion $d$. +Input edges are denoted by I (edges 1 and~2), output edges by O (edges 3 and 4) and +common edges by X (edges 5 and~6). +Because of symmetry, only the lower half of domains $D$ and $D'$ has to be considered. +The global $R$-matrices, $\Re^{I}$ in domain $D$ and $\Re^{O}$ in +domain $D'$, can be written as: +\begin{equation} +\Re^{I} = \left(\begin{array}{cc} + \Re_{II}^{I} & \Re_{IX}^{I}\\ + \Re_{XI}^{I} & \Re_{XX}^{I} + \end{array}\right) +\ \ +\Re^{O} = \left(\begin{array}{cc} + \Re_{OO}^{O} & \Re_{OX}^{O}\\ + \Re_{XO}^{O} & \Re_{XX}^{O} + \end{array}\right). +\label{eq:RI_RO} +\end{equation} + + + +From the set of local $R$-matrices, $\mathbf{R}_{ij}$ ($i,j\in \{1,2,3,4\}$) +associated +with subregion $d$, we can define +\begin{subequations} +\begin{eqnarray} + \mathbf{r}_{II} = \left(\begin{array}{cc} + \mathbf{R}_{11} & \mathbf{R}_{12}\\ + \mathbf{R}_{21} & \mathbf{R}_{22} + \end{array}\right), \label{eqaa} & + \mathbf{r}_{IO} = \left(\begin{array}{cc} + \mathbf{R}_{13} & \mathbf{R}_{14}\\ + \mathbf{R}_{23} & \mathbf{R}_{24} + \end{array}\right), \label{eqbb}\\ + \mathbf{r}_{OI} = \left(\begin{array}{cc} + \mathbf{R}_{31} & \mathbf{R}_{32}\\ + \mathbf{R}_{41} & \mathbf{R}_{42} + \end{array}\right), \label{eqcc} & + \mathbf{r}_{OO} = \left(\begin{array}{cc} + \mathbf{R}_{33} & \mathbf{R}_{34}\\ + \mathbf{R}_{43} & \mathbf{R}_{44} + \end{array}\right),\label{eqdd} +\end{eqnarray} +\end{subequations} +where $I$ represents the input edges 1 and 2, and $O$ represents +the output edges 3 and 4 (see Fig.~\ref{domain}). +The propagation across each sector is characterized by equations~(\ref{eq1}) to (\ref{eq4}). +\begin{subequations} +\begin{eqnarray} +\Re^{O}_{OO} &=& \mathbf{r}_{OO} - \mathbf{r}_{IO}^T (r_{II} + \Re^{I}_{II})^{-1}\mathbf{r}_{IO}, \label{eq1} \\ +\Re^{O}_{OX} &=& \mathbf{r}_{IO}^T (\mathbf{r}_{II} + \Re^{I}_{II})^{-1}\Re^{I}_{IX}, \label{eq2} \\ + \Re^{O}_{XO} &=& \Re^{I}_{XI}(\mathbf{r}_{II} + \Re^{I}_{II})^{-1}\mathbf{r}_{IO}, \label{eq3} \\ + \Re^{O}_{XX} &=& \Re^{I}_{XX} - \Re^{I}_{XI}(\mathbf{r}_{II} +\Re^{I}_{II})^{-1}\Re^{I}_{IX}. \label{eq4} +\end{eqnarray} +\end{subequations} + +The matrix inversions are not explicitly performed. To compute +$(r_{II} + \Re^{I}_{II})^{-1}\mathbf{r}_{IO}$ and $(\mathbf{r}_{II} + \Re^{I}_{II})^{-1}\Re^{I}_{IX}$, +two linear systems are solved. + + +\medskip + +While equations (\ref{eq1})-(\ref{eq4}) can be applied to the +propagation across a general subregion two special situations should be +noted: propagation across a diagonal subregion and propagation across +a subregion bounded by the $r_{1}$-axis at the beginning of a new +strip. + +In the case of a diagonal subregion, from symmetry considerations, +edge 2 is identical to edge 1 and edge 3 is identical to edge~4. +Accordingly, with only one input edge and one output edge equations +(\ref{eqaa})-(\ref{eqdd}) become: +\begin{subequations} +\begin{eqnarray} +\mathbf{r}_{II} = 2\mathbf{R}_{11}, \ +\mathbf{r}_{IO} = 2\mathbf{R}_{14}, \label{eq4b}\\ +\mathbf{r}_{OI} = 2\mathbf{R}_{41}, \ +\mathbf{r}_{OO} = 2\mathbf{R}_{44}. \label{eq4d} +\end{eqnarray} +\end{subequations} +In the case of a subregion bounded by the $r_1$-axis at the beginning +of a new strip, we note that the input boundary $I$ consists of only +one edge. When propagating across the first subregion in the second +strip there is no common boundary $X$: in this case only equation +(\ref{eq1}) needs to be solved. + +\medskip + +Having obtained the global $R$-matrix $\Re$ on the boundary of the +innermost subregion (labeled $0$ in Fig.~\ref{prop}), $\Re$ is propagated across +each subregion in the order indicated in Fig.~\ref{prop}, +working systematically from the +$r_1$-axis at the bottom of each strip across all subregions to the +diagonal. + + + +\subsection{Description of the PROP program} +\label{sec:PROP} + +\begin{table}[t] +\begin{center} +\begin{tabular}{|c|c|c|c|c|c|} + \hline + \multirow{2}{0.09\linewidth}{\centering Data set} & + \multirow{2}{0.15\linewidth}{\centering + Local $R$-\\matrix size} & + \multirow{2}{0.07\linewidth}{\centering Strips} & + \multirow{2}{0.09\linewidth}{\centering Sectors} & + \multirow{2}{0.19\linewidth}{\centering Final global \\$R$-matrix size} & + \multirow{2}{0.15\linewidth}{\centering Scattering\\energies} \\ + & & & & & \\ + \hline + Small & 90x90 & 4 & 10 & 360x360 & 6\\ + \hline + Medium & 90x90 & 4 & 10 & 360x360 & 64\\ + \hline + Large & 383x383 & 20 & 210 & 7660x7660 & 6\\ + \hline + Huge & 383x383 & 20 & 210 & 7660x7660 & 64\\ \hline +\end{tabular} +\caption{\label{data-sets}Characteristics of four data sets} +\end{center} +\end{table} + +The PROP program computes the propagation of the $R$-matrix across the sectors of the internal region. +Table~\ref{data-sets} shows four different +data sets used in this study and highlights the principal parameters of the PROP program. + PROP execution can be described by Algorithm~\ref{prop-algo}. +First, amplitude arrays and +correction data are read from data files generated by the preceding +program of the 2DRMP suite. +Then, local $R$-matrices are constructed from amplitude arrays. +Correction data is used to compute correction vectors added to the diagonal of the local +$R$-matrices. The local $R$-matrices, together with the input $R$-matrix, + $\Re^{I}$, +computed on the previous sector, are used to compute the current output +$R$-matrix, +$\Re^{O}$. + At the end of a sector evaluation, +the output $R$-matrix becomes the input $R$-matrix +for the next evaluation. + +%% \begin{algorithm} +%% \caption{\label{prop-algo}PROP algorithm} +%% \begin{algorithmic} +%% \FOR{all scattering energies} +%% \FOR{all sectors} +%% \STATE Read amplitude arrays +%% \STATE Read correction data +%% \STATE Construct local $R$-matrices +%% \STATE From $\Re^{I}$ and local $R$-matrices, compute $\Re^{O}$ +%% \STATE $\Re^{O}$ becomes $\Re^{I}$ for the next sector +%% \ENDFOR +%% \STATE Compute physical $R$-Matrix +%% \ENDFOR +%% \end{algorithmic} +%% \end{algorithm} + + +On the first sector, there is no input $R$-matrix yet. To bootstrap +the propagation, the first output $R$-matrix is constructed using only +one local $R$-matrix. On the last sector, that is, on the boundary of +the inner region, a physical $R$-matrix corresponding to the output +$R$-matrix is computed and stored into an output file. + +In the PROP program, sectors are characterized into four types, +depending on the computation performed: +\begin{itemize} +\item the starting sector (labeled 0 in Fig.~\ref{prop}) +\item the axis sectors (labeled 1, 3 and 6 in Fig.~\ref{prop}) +\item the diagonal sectors (labeled 2, 5 and 9 in Fig.~\ref{prop}) +\item the off-diagonal sectors (labeled 4, 7 and 8 in Fig.~\ref{prop}). +\end{itemize} + + +The serial version of PROP is implemented in Fortran~90 and uses +for linear algebra operations BLAS and LAPACK routines +which are fully optimized for x86 architecture. +This +program +serially propagates the $R$-matrix for +all scattering energies. +Since the propagations for these different +energies are independent, there also +exists an embarrassingly parallel version of +PROP +that spreads the computations of +several energies +among multiple CPU nodes via +MPI. + + + +\subsection{CAPS implementation} +\label{caps} + +In order to handle larger matrices, and thus obtain better GPU speedup, CAPS +recast equations (\ref{eq1}) to (\ref{eq4}) into one equation. +The output $R$-matrix $\Re^{O}$ defined by equation~(\ref{eq:RI_RO}) is now computed as follows. +\begin{equation}\label{eq_CAPS_1} +\Re^{O} = \Re^{O^{\ \prime}} + U A^{-1} V, +\end{equation} +\begin{equation}\label{eq_CAPS_2} +{\rm with} \ +\Re^{O^{\ \prime}}= \left(\begin{array}{cc} + \mathbf{r}_{OO} & 0\\ + 0 & \Re^I_{XX} \end{array}\right), \ +U= \left(\begin{array}{c} + \mathbf-{r}_{IO}^{T}\\ + \Re^I_{XI} \end{array}\right), +\end{equation} +\begin{equation}\label{eq_CAPS_3} +A= \mathbf{r}_{II} + \Re^I_{II} \ {\rm and} \ +V= (\mathbf{r}_{IO}\ \ \ -\Re^I_{IX}). +\end{equation} + +To compute $W=A^{-1}V$, no matrix inversion is performed. The matrix +system $AW=V$ is solved. +This reimplementation of PROP reduces the number of equations to be +solved and the number of matrix copies for evaluating each sector. +For instance, for an off-diagonal sector, +copies fall from 22 to 5, matrix multiplications from 4 to~1 and calls +to a linear equation solver from 2 to 1. + +To implement this version, CAPS +used HMPP, a +commercial hybrid and parallel compiler, +based on compiler directives like the new OpenACC standard\footnote{See: \url{www.openacc-standard.org}}. +If the matrices are large enough (the limit sizes are experimental parameters), +they are multiplied on the GPU, otherwise on the CPU. +CAPS + used the MKL BLAS implementation on an Intel Xeon +x5560 quad core CPU (2.8 GHz) +and the CUBLAS library (CUDA 2.2) on one Tesla C1060 GPU. +On the large data set (see Table~\ref{data-sets}), CAPS + obtained a speedup of 1.15 for the GPU +version over the CPU one (with multi-threaded MKL calls on the four +CPU cores). This limited gain in performance is mainly +due to the use of double precision computation +and to the small or medium sizes of most matrices. +For these matrices, the computation gain on +the GPU is indeed +strongly affected by the overhead +generated by transferring these matrices from +the CPU memory to the GPU memory to perform each matrix multiplication and then +transferring the result back to the CPU memory. + +Our goal is to speedup PROP more significantly by porting the whole +code to the GPU and therefore avoiding +the +intermediate data transfers between +the host (CPU) and the GPU. We will also study the +stability of PROP in single precision because +single precision computation is faster on the GPU +and CPU-GPU data transfers are twice as fast as those performed in +double precision. + + + +\section{Numerical validation of PROP in single precision} +\label{single-precision} + +\begin{comment} +\begin{table}[h] +\begin{center} +\begin{tabular}{|c|c|} + \hline + relative error interval & \# occurrences \\ + \hline + [0, 1.E-8) & 18 \\ + \hline + [1.E-8, 1.E-6) & 1241 \\ + \hline + [1.E-6, 1.E-4) & 48728 \\ + \hline + [1.E-4, 1.E-2) & 184065 \\ + \hline + [1.E-2, 1) & 27723 \\ + \hline + [1, 100) & 304 \\ + \hline + [100, $+\infty$) & 1 \\ + \hline +\end{tabular} +\end{center} +\caption{\label{sp-distrib}Error distribution for medium case in single precision} +\end{table} +\end{comment} + + +Floating-point input data, computation and output data of PROP are +originally in double precision format. +PROP produces a standard $R$-matrix H-file \cite{FARM_2DRMP} + and a collection of Rmat00X files (where X +ranges from 0 to the number of scattering energies - 1) +holding the physical R-matrix for each +energy. +The H-file and the Rmat00X files are binary input files of the FARM program \cite{FARM_2DRMP} +(last program of the 2DRMP suite). +Their text equivalent are the prop.out +and the prop00X.out files. +To study the validity of PROP results in single precision, +first, +reference results are + generated by running the serial version of PROP in double precision. +Data used in the most costly computation parts are read from input files in +double precision format and then +cast to single precision format. +PROP results (input of FARM) are computed in single precision and written +into files in double precision. + +\subsection{Medium case study} +\begin{figure}[h] +\begin{center} +\includegraphics*[width=0.9\linewidth]{Chapters/chapter15/figures/error.pdf} +\caption{\label{fig:sp-distrib} Error distribution for medium case in single precision} +\end{center} +\end{figure} + +The physical $R$-matrices, in +the prop00X.out files, are compared to the +reference ones for the medium case (see Table~\ref{data-sets}). + The relative +error distribution is +given in Fig.~\ref{fig:sp-distrib}. +We focus on the largest errors. +\begin{itemize} +\item Errors greater than 100: the only impacted value is of order 1.E-6 +and is negligible compared to the other ones +in the same prop00X.out file. + +\item Errors between 1 and 100: the values corresponding to the + largest errors are of order 1.E-3 and are negligible compared to + the majority of the other values which range between 1.E-2 and + 1.E-1. + +\item Errors between 1.E-2 and 1: the largest errors ($\ge$ 6\%) + impact values the order of magnitude of which is at most 1.E-1. + These values are negligible. + Relative errors of approximately 5\% impact values the order of + magnitude of which is at most 1.E2. + For instance, the value 164 produced by the reference version of + PROP becomes 172 in the single precision version. + +\end{itemize} + +To study the impact of the single precision version of PROP on the +FARM program, the cross-section +results files corresponding to +transitions +{1s1s}, +{1s2s}, {1s2p}, {1s3s}, {1s3p}, {1s3d}, +{1s4s}, {2p4d} are compared to the reference ones. +Table~\ref{sp-farm} shows that all cross-section files are impacted by +errors. Indeed in the {2p4d} file, four relative errors are +greater than one and the maximum relative error is 1.60. +However the largest errors impact negligible values. For example, the maximum +error (1.60) impacts a reference value which is 4.5E-4. The largest +values are impacted by low errors. For instance, the maximum value +(1.16) is impacted by a relative error of the order 1.E-3. + +\begin{table}[t] +\begin{center} +\begin{tabular}{|c|c||c|c|} \hline + file & largest relative error & file & largest relative error\\ \hline + {1s1s} & 0.02& {1s3p} & 0.11 \\ \hline + {1s2s} & 0.06 & {1s3d} & 0.22 \\ \hline + {1s2p} & 0.08 & {1s4s} & 0.20 \\ \hline + {1s3s} & 0.17 &2p4d & 1.60 \\ \hline +\end{tabular} +\caption{\label{sp-farm}Impact on FARM of the single precision version of PROP} +\end{center} +\end{table} + +To examine in more detail the impact of PROP on FARM, +cross sections above the ionization threshold (1 Ryd) +are compared in single and +double precision for +transitions amongst the 1s, \dots 4s, 2p, \dots 4p, 3d, 4d target states. +This comparison is carried out by generating 45 plots. In all the + plots, results in single and double precision match except for few + scattering energies which are very close to pseudo-state thresholds. +For example Fig.~\ref{1s2p} and \ref{1s4d} present the scattering energies corresponding to the +{1s2p} and {1s4d} cross-sections computed in single and double precision. For some cross-sections, +increasing a threshold parameter from 1.E-4 to 1.E-3 in the FARM +program +results in energies close to threshold being avoided + and therefore +the cross-sections in double and single precision match more +accurately. +This is the case for instance for cross-section 1s2p (see Fig.~\ref{1s2p3}). +However for other cross-sections (such as 1s4d) some problematic energies remain even if the +threshold parameter in the FARM +program is increased to 1.E-3 (see Fig.~\ref{1s4d3}). A higher +threshold parameter would be required for such cross-sections. + +\begin{figure}[t] +\centering +\subfigure[threshold = 1.E-4]{ +\includegraphics*[width=.76\linewidth]{Chapters/chapter15/figures/1s2p.pdf} + \label{1s2p} + } +\subfigure[threshold = 1.E-3]{ +\includegraphics*[width=.76\linewidth]{Chapters/chapter15/figures/1s2p3.pdf} + \label{1s2p3} + } +\label{fig:1s2p_10sectors} +\caption{1s2p cross-section, 10 sectors} +\end{figure} + +\begin{figure}[t] +\centering +\subfigure[threshold = 1.E-4]{ +\includegraphics*[width=.76\linewidth]{Chapters/chapter15/figures/1s4d.pdf} + \label{1s4d} + } +\subfigure[threshold = 1.E-3]{ +\includegraphics*[width=.76\linewidth]{Chapters/chapter15/figures/1s4d3.pdf} + \label{1s4d3} + } +\label{fig:1s4d_10sectors} +\caption{1s4d cross-section, 10 sectors} +\end{figure} + +As a conclusion, the medium case study shows that the execution of +PROP in single precision leads to a few inexact scattering energies to +be computed by the FARM program for some cross-sections. +Thanks to a suitable threshold parameter in the FARM program these problematic energies may possibly +be skipped. +Instead of investigating deeper the choice of such a parameter for the medium case, we analyze the +single precision computation in a more +realistic case in Sect.~\ref{huge}. +\begin{comment} +The conclusion of the medium case study is that running PROP in single +precision gives relatively stable results provided that suitable +parameter values are used in the FARM program in order to skip the +problematic energies that are too close to the pseudo-state +thresholds. To verify if this conclusion is still valid with a larger +data set, the single precision computation is analyzed in a more +realistic case in Sect.~\ref{huge}. +\end{comment} + +\subsection{Huge case study}\label{huge} + + +\begin{figure}[t] + \centering +\includegraphics*[width=.76\linewidth]{Chapters/chapter15/figures/1s2pHT.pdf} +\caption{\label{1s2pHT}1s2p cross-section, threshold = 1.E-4, 210 sectors} +\end{figure} + +\begin{figure}[t] + \centering +\includegraphics*[width=.76\linewidth]{Chapters/chapter15/figures/1s2pHT.pdf} +\caption{\label{1s4dHT}1s4d cross-section, threshold = 1.E-4, 210 sectors} +\end{figure} + +We study here the impact on FARM of the PROP program run in +single precision for the huge case (see Table~\ref{data-sets}). +The cross-sections +corresponding to all +atomic target states 1s \dots 7i are explored, which +leads to +406 comparison plots. +It should be noted that in this case, over the same energy range above the ionization threshold, the density of pseudo-state thresholds is significantly increased compared to the medium case. +As expected, all the plots exhibit large differences between single and double +precision cross-sections. +For example Fig.~\ref{1s2pHT} and \ref{1s4dHT} present the 1s2p and 1s4d cross-sections computed in +single and double precision for the huge case. +We can conclude that PROP in single precision gives invalid results +for realistic simulation cases above the ionization threshold. +Therefore the deployment of PROP on GPU, described in Sect.~\ref{gpu-implem}, +has been carried out in double precision. + +\section{Towards a complete deployment of PROP on GPUs} +\label{gpu-implem} + +We now detail how PROP has been progressively deployed on +GPUs in double precision in order to avoid the +expensive memory transfers between the host and the GPU. +Different versions with successive improvements and optimizations are presented. +We use CUDA~\cite{CUDA_ProgGuide} for GPU programming, as well as the +CUBLAS~\cite{CUBLAS} +and MAGMA \cite{MAGMA} libraries for linear algebra operations. +Since PROP is written in Fortran 90, {\em wrappers} in C are used to +enable calls to CUDA kernels from PROP routines. + + +\subsection{Computing the output $R$-matrix on GPU} +\label{gpu-RO} + +\begin{figure}[h] + \centering + \includegraphics[width=0.7\linewidth]{Chapters/chapter15/figures/offdiagonal_nb.pdf} + \caption{\label{offdiagonal} The six steps of an off-diagonal sector + evaluation.} +\end{figure} + +As mentioned in Algorithm~\ref{prop-algo}, evaluating a sector +mainly consists in constructing local $R$-matrices and in computing +one output $R$-matrix, $\Re^{O}$. In this first step of the porting +process, referred to as GPU V1\label{gpuv1}, +we only consider the computation of $\Re^{O}$ on the GPU. +We distinguish the following six steps, related to equations +(\ref{eq_CAPS_1}), (\ref{eq_CAPS_2}) and (\ref{eq_CAPS_3}), and illustrated in +Fig.~\ref{offdiagonal} for an off-diagonal sector. + +\begin{description} +\item[Step 1] (``Input copies''):~data are copied from $\Re^{I}$ + to temporary arrays ($A$, $U$, $V$) and to $\Re^{O}$. + These copies, along with possible scalings or transpositions, are + implemented as CUDA kernels which can be applied to two + matrices of any size and starting at any offset. + Memory accesses are coalesced \cite{CUDA_ProgGuide} in order to + provide the best performance for such memory-bound kernels. +\item[Step 2] (``Local copies''):~data are copied from + local $R$-matrices to temporary arrays ($U$, $V$) and to $\Re^{O}$. + Moreover data from local $R$-matrix + $\mathbf{r}_{II}$ + is added to matrix $A$ (via a CUDA kernel) and zeroes are written in + $\Re^{O}$ where required. +\item[Step 3] (``Linear system solving''):~matrix $A$ is factorized + using the MAGMA DGETRF + routine and the result is stored in-place. +\item[Step 4] (``Linear system solving'' cont.):~the matrix system + of linear equations $AW$ = $V$ is solved using the MAGMA DGETRS +routine. The solution is stored in matrix $V$. +\item[Step 5] (``Output matrix product''):~matrix $U$ + is multiplied by matrix $V$ using the CUBLAS DGEMM + routine. The result is stored in a temporary matrix~$t$. +\item[Step 6] (``Output add''):~$t$ is added to $\Re^{O}$ (CUDA + kernel). +\end{description} + +All the involved matrices are stored in the GPU memory. Only the +local $R$-matrices are first constructed on the host and then sent +to the GPU memory, since these matrices vary from sector to sector. +The evaluation of the axis and diagonal sectors is similar. +However, fewer operations and copies are required because of +symmetry considerations \cite{2DRMP}. + +\subsection{Constructing the local $R$-matrices on GPU} + +\begin{figure}[t] + \centering + \includegraphics[width=0.7\linewidth]{Chapters/chapter15/figures/amplitudes_nb.pdf} + \caption{\label{amplitudes} Constructing the local $R$-matrix R34 + from the $j$ amplitude array associated with edge 4 and the $i$ + amplitude array associated with edge~3.} +\end{figure} + +Local $R$-matrices are constructed using two three dimensional arrays, +$i$ and $j$. Each three dimensional array contains four +matrices corresponding to the surface amplitudes associated with the +four edges of a sector. Those matrices are named {\em amplitude arrays}. + $j$ amplitude arrays are read from data files and $i$ amplitude arrays +are obtained by scaling each row of the $j$ amplitude arrays. +The main part of the construction of a local $R$-matrix, +presented in Fig.~\ref{amplitudes}, +is a matrix product between +one $i$ amplitude array and one transposed $j$ amplitude array +which is performed by a single DGEMM +BLAS call. +In this version, hereafter referred to as GPU +V2\label{gpuv2}, $i$ and $j$ amplitude arrays are transferred to the +GPU memory and the required matrix multiplications are performed on +the GPU (via CUBLAS routines). + + +The involved matrices having medium sizes (either $3066 \times 383$ or +$5997 \times 383$), +performing these matrix multiplications +on the GPU is expected to be faster than on the CPU. +However, this implies a greater communication volume +between the CPU and the GPU +since the $i$ and $j$ amplitude arrays are larger than the local +$R$-matrices. +It can be noticed that correction data are also used in the +construction of a local $R$-matrix, +but this is a minor part in the +computation. However, these correction data also have to be +transferred from the CPU to the GPU for each sector. + +\subsection{Scaling amplitude arrays on GPU} +It +should be worthwhile to try to reduce the CPU-GPU data +transfers of the GPU V2, where the $i$ and $j$ amplitude arrays are +constructed on the host and then sent to the GPU memory for each sector. +In this new version, hereafter referred to as GPU V3\label{gpuv3}, we +transfer only the $j$ amplitude arrays and the +required scaling factors (stored in one 1D array) to the GPU memory, +so that the $i$ amplitude arrays are +then directly computed on the GPU by multiplying the $j$ amplitude +arrays by these scaling factors (via a CUDA kernel). +Therefore, we save the transfer of four $i$ amplitude arrays on +each sector by transferring only this 1D array of scaling factors. +Moreover, scaling $j$ amplitude arrays is expected to be faster on the +GPU than on the CPU, thanks to the massively parallel architecture of +the GPU and thanks to its higher internal memory bandwidth. + +\subsection{Using double-buffering to overlap I/O and computation} + +\begin{figure}[t] + \centering + \includegraphics[width=0.8\linewidth]{Chapters/chapter15/figures/C1060_V3_IO_COMP.pdf} + \caption{\label{overlapping} Compute and I/O times for the GPU V3 on + one C1060.} +\end{figure} + +As described in Algorithm~\ref{prop-algo}, there are two main steps in +the propagation across a sector: reading amplitude arrays +and correction data from I/O files and +evaluating the current sector. +Fig.~\ref{overlapping} shows the I/O times and the evaluation times +of each sector for the huge case execution (210 sectors, 20 strips) of the GPU V3 +on one C1060. +Whereas the times required by the off-diagonal sectors are similar +within each of the 20 strips, +the times for diagonal sectors of each strip +are the shortest ones, the times for the axis sectors being +intermediate. +The I/O times are roughly constant among all strips. +The evaluation time is equivalent to the I/O +time for the first sectors. But this evaluation time grows +linearly with the strip number, and rapidly exceeds the I/O +time. + +It is thus interesting to use a double-buffering technique to overlap the +I/O time with the evaluation time: +for each sector, the evaluation of sector $n$ is performed +(on GPU) simultaneously with the reading of data for sector +$n+1$ (on CPU). This requires the duplication in the CPU memory of all the +data structures +used for storing data read from I/O files for each sector. +This version, hereafter referred to as GPU +V4\label{gpuv4}, uses POSIX threads. Two threads are +executed concurrently: an I/O thread that reads data from I/O files +for each sector, and a computation thread, dedicated to the propagation +of the global $R$-matrix, that performs successively for each sector +all necessary computations on GPU, +as well as all required CPU-GPU data transfers. +The evaluation of a sector uses the data read for this sector as well +as the global $R$-matrix computed on the previous sector. +This dependency requires synchronizations between the I/O thread and +the computation thread which are implemented through standard POSIX +thread mechanisms. + + +\subsection{Matrix padding} +The CUBLAS DGEMM +performance and the MAGMA DGETRF/DGETRS +performance is reduced when the sizes (or +the leading dimensions) of the matrix are not multiples of the inner blocking size \cite{NTD10a}. +This inner blocking size can be 32 or 64, depending on the computation +and on the underlying +GPU architecture \cite{MAGMA,NTD10b}. +In this version (GPU V5\label{gpuv5}), +the matrices are therefore padded with $0.0$ (and $1.0$ on the diagonal for the linear systems) +so that their sizes are +multiples of 64. +This corresponds indeed to the optimal size for the matrix product on the +Fermi architecture \cite{NTD10b}. And as far as linear system solving is +concerned, all the matrices have sizes which are multiples of 383: we +therefore use padding to obtain multiples of 384 (which are +again multiples of 64). +It can be noticed that this padding has to be performed dynamically +as the matrices increase in size during the propagation +(when possible the + maximum required storage space is however allocated only once in the + GPU memory). + + + +\section{Performance results} +\subsection{PROP deployment on GPU} + +\begin{table*}[ht] +\begin{center} +\begin{tabular}{|c||c|c||} + \hline + PROP version & \multicolumn{2}{c|}{Execution time} \\ + \hline + \hline + CPU version: 1 core & \multicolumn{2}{c|}{201m32s} \\ + \hline + CPU version: 4 cores & \multicolumn{2}{c|}{113m28s} \\ + \hline \hline +GPU version & C1060 & C2050 \\ + \hline\hline + GPU V1 (\S~\ref{gpuv1}) & 79m25s & 66m22s \\ + \hline + GPU V2 (\S~\ref{gpuv2}) & 47m58s & 29m52s \\ + \hline + GPU V3 (\S~\ref{gpuv3}) & 41m28s & 23m46s \\ + \hline + GPU V4 (\S~\ref{gpuv4}) & 27m21s & 13m55s\\ + \hline + GPU V5 (\S~\ref{gpuv5}) & 24m27s & 12m39s \\ + \hline +\end{tabular} +\caption{\label{table:time} +Execution time of PROP on CPU and GPU} +\end{center} +\end{table*} + +\begin{comment} +\begin{table*}[ht] +\begin{center} +\begin{tabular}{|c||c|c||} + \hline + PROP version & \multicolumn{2}{c|}{Execution time} \\ + \hline \hline +CPU version & 1 core & 4 cores \\\hline +& {201m32s} & {113m28s} \\ \hline \hline +GPU version & C1060 & C2050 \\ + \hline\hline + GPU V1 (\ref{gpuv1}) & 79m25s & 66m22s \\ + \hline + GPU V2 (\ref{gpuv2}) & 47m58s & 29m52s \\ + \hline + GPU V3 (\ref{gpuv3}) & 41m28s & 23m46s \\ + \hline + GPU V4 (\ref{gpuv4}) & 27m21s & 13m55s\\ + \hline + GPU V5 (\ref{gpuv5}) & 24m27s & 12m39s \\ + \hline +\end{tabular} +\caption{\label{table:time} +Execution time of the successive GPU versions} +\end{center} +\end{table*} +\end{comment} + +\begin{figure}[h] +\centering +\subfigure[Speedup over 1 CPU core]{ +\includegraphics*[width=0.76 + \linewidth]{Chapters/chapter15/figures/histo_speedup_1core.pdf} + \label{fig:speedup_1core} + } + +\subfigure[Speedup over 4 CPU cores]{ +\includegraphics*[width=0.76 + \linewidth]{Chapters/chapter15/figures/histo_speedup_4cores.pdf} + \label{fig:speedup_4cores} + } +\label{fig:speedup} +\caption{Speedup of the successive GPU versions.} +\end{figure} + +Table~\ref{table:time} presents +the execution times +of PROP on CPUs and GPUs, +each version solves the propagation equations in the +form~(\ref{eq_CAPS_1}-\ref{eq_CAPS_3}) as proposed by CAPS. +Fig.~\ref{fig:speedup_1core} (respectively \ref{fig:speedup_4cores}) +shows the speedup of the successive GPU versions +over one CPU core (respectively four CPU cores). +We use here Intel Q8200 quad-core CPUs (2.33 GHz), one C1060 GPU and +one C2050 (Fermi) GPU, located at + UPMC (Universit\'e Pierre et Marie Curie, Paris, France). +As a remark, the execution times measured on the C2050 would be the same +on the C2070 and on the C2075, the only difference between these GPUs +being their memory size and their TDP (Thermal Design Power). +We emphasize that the execution times correspond to the +complete propagation for all six energies of the large case (see +Table~\ref{data-sets}), that is to say to the complete execution of +the PROP program. +Since energies are independent, execution times for more energies +(e.g. the huge case) should be proportional +to those reported in Table~\ref{table:time}. + +These tests, which have been performed with CUDA 3.2, CUBLAS 3.2 and +MAGMA 0.2, +show that the successive GPU versions of PROP offer +increasing, and at the end interesting, speedups. +More precisely, +V2 shows that it is worth increasing slightly the +CPU-GPU communication volume in order to perform +large enough matrix products on the GPU. + This communication volume can fortunately be +reduced thanks to~V3, +which also accelerates the computation of +amplitude arrays thanks to the GPU. +The +double-buffering technique implemented in V4 + effectively enables the overlapping of +I/O operations with computation, while the +padding implemented in V5 also improves the computation times. +It +is noticed that the padding +does offer much more performance gain with, +for example, CUDA 3.1 and the MAGMA DGEMM~\cite{NTD10b}: the +speedup with respect to one +CPU core was increased from 6.3 to 8.1 on C1060, and from 9.5 to 14.3 +on C2050. +Indeed since CUBLAS 3.2 performance has been improved for non block multiple +matrix sizes through MAGMA code~\cite{NTD10a}. +Although for all versions the C2050 (with its improved +double precision performance) offers up to almost +double speedup compared to +the C1060, the performance obtained with both architectures justifies the use of +the GPU for such an application. + +\subsection{PROP execution profile} + +\begin{figure}[t] + \centering + \includegraphics*[width=0.64\linewidth]{Chapters/chapter15/figures/CPU_1_CORE_TIMES.pdf} +\caption{CPU (1 core) execution times for the off-diagonal sectors of the large case.} +\label{fig:CPU-timing} +\end{figure} + +\begin{figure}[t] + \centering + \subfigure[GPU V5 on one C1060]{ +\includegraphics*[width=0.64\linewidth]{Chapters/chapter15/figures/GPU_V5_C1060_TIMES.pdf} +\label{GPU-timing}} + + \subfigure[GPU V5 on one C2050]{ +\includegraphics*[width=0.64\linewidth]{Chapters/chapter15/figures/GPU_V5_C2050_TIMES.pdf} +\label{fermi-timing}} + +\caption{GPU execution times for the off-diagonal sectors of + the large case.} +\label{fig:profileGPU} +\end{figure} + +We detail here the execution profile on +the CPU and the GPU for the evaluation of all off-diagonal sectors +(the most representative ones) for a complete energy propagation. + Fig.~\ref{fig:CPU-timing} and \ref{fig:profileGPU} show CPU and GPU execution times for the +171 off-diagonal sectors of the large case (see Table \ref{data-sets}). +``Copying, adding, scaling'' corresponds to the amplitude + array construction (scaling) as well as to steps 1, 2 and 6 in + Sect.~\ref{gpu-RO}, all implemented via CUDA kernels. +``CPU-GPU transfers'' +aggregate transfer times for the $j$ amplitude +arrays and the scaling factors, as well as for the correction data. + ``Amplitude matrix product'' corresponds to the DGEMM call to + construct the local R-matrices from the $i$ and $j$ amplitude + arrays. +``Linear system solving'' and ``Output matrix product'' correspond +respectively to steps 3-4 and to step 5 in Sect.~\ref{gpu-RO}. + +On one CPU core (see Fig.~\ref{fig:CPU-timing}), + matrix products for the construction of the local +$R$-matrices require the same +computation time during the whole propagation. Likewise the CPU time required by + matrix products for the output $R$-matrix is constant within each + strip. But as the global $R$-matrix is propagated from strip to + strip, the sizes of +the matrices $U$ and $V$ increase, so does their multiplication time. +The time required to solve the linear system increases +slightly during the propagation. +These three operations (``Amplitude matrix product'', ``Output matrix +product'' and ``Linear system solving'') are clearly dominant in terms +of computation +time compared to the other remaining operations, which justify our +primary focus on such three linear algebra operations. + + +On the C1060 (see Fig.~\ref{GPU-timing}), we have +generally managed to obtain a similar speedup for all operations +(around 8, which corresponds to Fig.~\ref{fig:speedup_1core}). Only the linear system solving +presents a lower speedup (around~4). +The CUDA kernels and the remaining CPU-GPU transfers make a minor contribution +to the overall computation time and do not require +additional improvements. + +On the C2050 (see Fig.~\ref{fermi-timing}), additional speedup is +obtained for all operations, except for the +CPU-GPU transfers and the linear system solving. +The CPU-GPU transfers are mainly due to the $j$ amplitude arrays, and +currently still correspond to minor times. When required, the +double-buffering technique may also be used to overlap such transfers +with computation on the GPU. + + + +\section{Propagation of multiple concurrent energies on GPU} + +Finally, we present here an improvement that can +benefit from the Fermi architecture, as well as from the newest Kepler +architecture, +both of which enable the concurrent execution of multiple +CUDA kernels, thus offering additional speedup on +GPUs for small or medium computation grain kernels. +In our case, the performance gain on the GPU is indeed limited +since most matrices have small or medium sizes. +By using multiple streams within one CUDA context~\cite{CUDA_ProgGuide}, +we can propagate multiple energies +concurrently on the Fermi GPU. +It can be noticed that all GPU computations for all streams are +launched by the same host thread. We therefore rely here on the {\em legacy +API} of CUBLAS~\cite{CUBLAS} (like MAGMA) +without thread safety problems. +A {\em breadth first} issue order is used for kernel +launchs \cite{CUDA_stream}: for a given GPU kernel, all kernel launchs +are indeed issued together in the host thread, using one stream for each +concurrent energy, in order to maximize concurrent kernel +execution. +Of course, the memory available on the GPU must be large enough to +store all data structures required by each energy. +Moreover, multiple streams are also used within the +propagation of each single energy +in order to enable concurrent executions among the required kernels. + + +\begin{table}[t] +\begin{center} +\begin{tabular}{|c|c||c|c|c|c|c|} + \hline + \multirow{4}{0.18\linewidth}{Medium case} & Number of & + \multirow{2}{0.07\linewidth}{\centering 1} & + \multirow{2}{0.07\linewidth}{\centering 2} & + \multirow{2}{0.07\linewidth}{\centering 4} & + \multirow{2}{0.07\linewidth}{\centering 8} & + \multirow{2}{0.07\linewidth}{\centering 16} \\ + & energies & & & & & \\ + \cline{2-7} + & Time (s) & 11.18 & 6.87 & 5.32 & 4.96 & 4.76 \\ + \cline{2-7} + & Speedup & - & 1.63 & 2.10 & 2.26 & 2.35 \\ + \hline + \hline + \multirow{4}{0.18\linewidth}{Large case} & Number of & + \multirow{2}{0.07\linewidth}{\centering 1} & + \multicolumn{2}{c|}{\multirow{2}{0.07\linewidth}{\centering 2}} & + \multicolumn{2}{c|}{\multirow{2}{0.07\linewidth}{\centering 3}} \\ + & energies & & \multicolumn{2}{c|}{~} & \multicolumn{2}{c|}{~} \\ + \cline{2-7} + & Time (s) & 509.51 & \multicolumn{2}{c|}{451.49} & \multicolumn{2}{c|}{436.72} \\ + \cline{2-7} + & Speedup & - & \multicolumn{2}{c|}{1.13} & \multicolumn{2}{c|}{1.17} \\ + \hline +\end{tabular} +\caption{\label{t:perfs_V6} Performance results with multiple + concurrent energies + on one C2070 GPU. GPU initialization times are not considered here. } +\end{center} +\end{table} + +In order to have enough GPU memory to run two or three concurrent +energies for the large case, we use one C2070 GPU +(featuring 6GB of memory) +with one Intel X5650 hex-core CPU, CUDA 4.1 and CUBLAS 4.1, as +well as the latest MAGMA release (version 1.3.0). +Substantial changes have been required +in the MAGMA calls with respect to the previous versions of PROP that were using MAGMA 0.2. + Table~\ref{t:perfs_V6} presents the speedups +obtained on the Fermi GPU for multiple concurrent +energies (up to sixteen since this is the maximum number of concurrent +kernel launches currently supported \cite{CUDA_ProgGuide}). +With the medium case, speedups greater than 2 are obtained with four +concurrent energies or more. +With the more realistic large case, the performance gain is lower mainly because of +the increase in matrix sizes, which implies a better GPU usage +with only one energy on the GPU. The concurrent execution of multiple +kernels is also limited by other operations on the +GPU \cite{CUDA_ProgGuide,CUDA_stream} and by the current MAGMA code which +prevents concurrent MAGMA calls in different streams. +Better speedups can be here expected on the latest Kepler GPUs which +offer additional compute power, and whose {\em Hyper-Q} feature may help +improve further the GPU utilization with concurrent energies. +On the contrary, the same code on the C1060 shows no speedup + since the concurrent kernel launches are +serialized on this previous GPU architecture. + + + + + + + + +\section{Conclusion and future work} +\label{conclusion} + + +In this chapter, we have presented our methodology and our results in +the deployment on +a GPU of an application (the PROP program) in atomic physics. + +We have started by studying the numerical stability of PROP using +single precision arithmetic. This has shown that PROP +using single precision, while relatively stable for some small cases, +gives unsatisfactory results for realistic simulation cases above the +ionization threshold where there is a +significant density of pseudo-states. It is +expected that this may not be the case below the ionization threshold +where the actual target states are less dense. This requires further +investigation. + +We have +therefore deployed the PROP code in double precision on +a GPU, with successive improvements. The different GPU versions +each offer increasing speedups over the CPU version. +Compared to the single (respectively four) core(s) CPU version, the +optimal GPU implementation +gives a speedup of 8.2 (resp. 4.6) on one C1060 GPU, +and a speedup of 15.9 (resp. 9.0) on one +C2050 GPU with improved double precision performance. +An additional gain of around 15\% +can also be obtained on one Fermi GPU +with large memory (C2070) thanks to concurrent kernel execution. + +Such speedups +cannot be directly compared with the 1.15 speedup +obtained with the HMPP version, since in our tests the CPUs are +different and the CUBLAS versions are more recent. +However, the programming effort required +progressively to deploy PROP on GPUs clearly offers improved and interesting speedups for this +real-life application in double precision with varying-size matrices. + + +We are currently working on a hybrid CPU-GPU version that spreads the +computations of the independent energies on both the CPU +and the GPU. This will enable +multiple energy execution on the CPU, with +one or several core(s) dedicated to each energy (via multi-threaded +BLAS libraries). Multiple +concurrent energies may also be propagated on each Fermi GPU. +By merging this work with the current MPI PROP program, we will +obtain a scalable hybrid CPU-GPU version. +This final version will offer an additional level of parallelism +thanks to the MPI +standard in order to exploit multiple +nodes with multiple CPU cores and possibly multiple GPU cards. + +\putbib[Chapters/chapter15/biblio] + diff --git a/BookGPU/Chapters/chapter15/figures/1s2p.pdf b/BookGPU/Chapters/chapter15/figures/1s2p.pdf new file mode 100644 index 0000000..2cd2c18 Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/1s2p.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/1s2p3.pdf b/BookGPU/Chapters/chapter15/figures/1s2p3.pdf new file mode 100644 index 0000000..d87e962 Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/1s2p3.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/1s2pHT.pdf b/BookGPU/Chapters/chapter15/figures/1s2pHT.pdf new file mode 100644 index 0000000..9ae1822 Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/1s2pHT.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/1s4d.pdf b/BookGPU/Chapters/chapter15/figures/1s4d.pdf new file mode 100644 index 0000000..4e889d7 Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/1s4d.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/1s4d3.pdf b/BookGPU/Chapters/chapter15/figures/1s4d3.pdf new file mode 100644 index 0000000..56ff4f4 Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/1s4d3.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/1s4dHT.pdf b/BookGPU/Chapters/chapter15/figures/1s4dHT.pdf new file mode 100644 index 0000000..9857001 Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/1s4dHT.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/C1060_V3_IO_COMP.pdf b/BookGPU/Chapters/chapter15/figures/C1060_V3_IO_COMP.pdf new file mode 100644 index 0000000..03acc05 Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/C1060_V3_IO_COMP.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/CPU_1_CORE_TIMES.pdf b/BookGPU/Chapters/chapter15/figures/CPU_1_CORE_TIMES.pdf new file mode 100644 index 0000000..ec8f980 Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/CPU_1_CORE_TIMES.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/Domain.pdf b/BookGPU/Chapters/chapter15/figures/Domain.pdf new file mode 100644 index 0000000..2027845 Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/Domain.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/GPU_V5_C1060_TIMES.pdf b/BookGPU/Chapters/chapter15/figures/GPU_V5_C1060_TIMES.pdf new file mode 100644 index 0000000..dc316ca Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/GPU_V5_C1060_TIMES.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/GPU_V5_C2050_TIMES.pdf b/BookGPU/Chapters/chapter15/figures/GPU_V5_C2050_TIMES.pdf new file mode 100644 index 0000000..0522c60 Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/GPU_V5_C2050_TIMES.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/amplitudes_nb.pdf b/BookGPU/Chapters/chapter15/figures/amplitudes_nb.pdf new file mode 100644 index 0000000..c1f43a3 Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/amplitudes_nb.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/error.pdf b/BookGPU/Chapters/chapter15/figures/error.pdf new file mode 100644 index 0000000..203190a Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/error.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/histo_speedup_1core.pdf b/BookGPU/Chapters/chapter15/figures/histo_speedup_1core.pdf new file mode 100644 index 0000000..f859253 Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/histo_speedup_1core.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/histo_speedup_4cores.pdf b/BookGPU/Chapters/chapter15/figures/histo_speedup_4cores.pdf new file mode 100644 index 0000000..cc1d80b Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/histo_speedup_4cores.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/offdiagonal_nb.pdf b/BookGPU/Chapters/chapter15/figures/offdiagonal_nb.pdf new file mode 100644 index 0000000..46b67be Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/offdiagonal_nb.pdf differ diff --git a/BookGPU/Chapters/chapter15/figures/prop.pdf b/BookGPU/Chapters/chapter15/figures/prop.pdf new file mode 100644 index 0000000..481fe64 Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/prop.pdf differ diff --git a/BookGPU/Makefile b/BookGPU/Makefile index 011ff95..43eb90f 100644 --- a/BookGPU/Makefile +++ b/BookGPU/Makefile @@ -7,7 +7,8 @@ all: bibtex bu1 bibtex bu2 bibtex bu3 -# bibtex bu4 + bibtex bu4 + bibtex bu6 makeindex ${BOOK}.idx pdflatex ${BOOK} pdflatex ${BOOK}