From: couturie <couturie@carcariass.(none)>
Date: Tue, 4 Dec 2012 19:49:46 +0000 (+0100)
Subject: ch15
X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/3c87d3518228d3b7bcfbd0a08cb79682f05132ee?ds=inline

ch15
---

diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex
index e2a1c03..7936982 100755
--- a/BookGPU/BookGPU.tex
+++ b/BookGPU/BookGPU.tex
@@ -17,7 +17,10 @@
 \usepackage{algorithmicx}
 \usepackage[lined,boxed,commentsnumbered]{algorithm2e}
 \usepackage{epstopdf}
-
+\usepackage{url}
+\usepackage{multirow}
+\usepackage{layouts}
+\usepackage{comment}
 
 \frenchspacing
 \tolerance=5000
@@ -114,6 +117,7 @@
 \include{Chapters/chapter8/ch8}
 \include{Chapters/chapter11/ch11}
 \include{Chapters/chapter14/ch14}
+\include{Chapters/chapter15/ch15}
 
 \bibliographystyle{hep}
 %%%\bibliography{biblio}
diff --git a/BookGPU/Chapters/chapter15/biblio.bib b/BookGPU/Chapters/chapter15/biblio.bib
new file mode 100644
index 0000000..d0bb53f
--- /dev/null
+++ b/BookGPU/Chapters/chapter15/biblio.bib
@@ -0,0 +1,167 @@
+@Inproceedings{PF_PDSEC2011,
+  author ={P. Fortin and  R. Habel and  F.~J\'ez\'equel and J.-L. Lamotte and N.S. Scott},
+  title = {Deployment on GPUs of an application in computational atomic physics},
+  booktitle = {{12th IEEE International Workshop on Parallel and Distributed Scientific 
+and Engineering Computing (PDSEC) in conjunction with the 25th International Parallel and Distributed Processing Symposium (IPDPS)}},
+year = 2011}
+%note={8 pages}}
+@Comment address=  {{Anchorage, Alaska, USA}},
+@Comment month = may, 
+
+
+@article{Burke_1987,
+author={P.G. Burke and C.J. Noble and M.P. Scott},
+title={{R-matrix theory of electron scattering at intermediate energies}},
+journal={Proceedings of the Royal Society of London A},
+volume=410,
+year=1987,
+pages={287--310}
+ }
+% %Proc. Roy. Soc. A }
+
+@Article{2DRMP,
+author ={N.S. Scott and M.P. Scott and P.G. Burke and T. Stitt and V. Faro-Maza and C. Denis and A. Maniopoulou},
+title ={{2DRMP: A~suite of two-dimensional R-matrix propagation codes}},
+journal ={Computer Physics Communications},
+volume={180},
+year =2009,
+pages={2424--2449}
+}
+%note={ISSN: {0010-4655}, doi:10.1016/j.cpc.2009.07.017}
+
+@Article{FARM_2DRMP,
+author ={  V.M. Burke and C.J. Noble and V. Faro-Maza and A. Maniopoulou and N.S. Scott},
+title ={  {FARM\_2DRMP: a version of FARM for use with 2DRMP}},
+journal ={Computer Physics Communications},
+volume={180},
+year =2009,
+pages={2450--2451}
+}
+% note={ISSN: {0010-4655}, doi:10.1016/j.cpc.2009.07.017}
+
+@INPROCEEDINGS{VECPAR,
+   author    = {T. Stitt and
+                N.S. Scott and
+                M.P. Scott and
+                P.G. Burke},
+   title     = {{2-D R-Matrix Propagation: A Large Scale Electron Scattering
+                Simulation Dominated by the Multiplication of Dynamically
+                Changing Matrices}},
+   booktitle = {Proc. of VECPAR'02},  
+   year      = {2002},
+   pages     = {354-367}
+}
+% month={June},
+%,
+%   ee        = {http://link.springer.de/link/service/series/0558/bibs/2565/25650354.htm}
+%}
+%,
+%   crossref  = {DBLP:conf/vecpar/2002},
+%   bibsource = {DBLP, http://dblp.uni-trier.de}
+% }
+
+@Misc{CPC,
+  title= {{CPC}: {C}omputer {P}hysics {C}ommunications},
+  howpublished = {Queen's University, Belfast},
+  note = 	 {\url{http://cpc.cs.qub.ac.uk/}},
+  key={CPC}
+}
+
+
+
+@Misc{CUBLAS,
+  title={{NVIDIA} {CUDA} {T}oolkit 4.1, {CUBLAS} {L}ibrary},
+  month = {January}, 
+  year = {2012}, 
+  key={NVIDIA CUBLAS}
+}
+@Comment howpublished = {available at: \url{http://docs.nvidia.com/cuda/cublas}},
+
+
+
+
+@Misc{CUDA_ProgGuide,
+  title={{NVIDIA} {CUDA} {C} {P}rogramming {G}uide, version 4.1},
+  month = {November}, 
+  year = {2011}, 
+  key={NVIDIA CUDA}
+}
+@Comment  howpublished = {available at: \url{http://docs.nvidia.com/cuda/cuda-c-programming-guide}},
+
+
+
+@Misc{CUDA_ProgGuide_3.2,
+  title={{NVIDIA} {CUDA} {C} {P}rogramming {G}uide},
+  year={2010},
+  howpublished = {version 3.2, available at: \url{www.nvidia.com/object/cuda_get.html}}
+}
+
+
+@Misc{CUDA_stream,
+  title={{CUDA C/C++ Streams and Concurrency, NVIDIA}},
+  author={S. Rennich}, 
+  month={January},
+  year={2012}, 
+}
+
+
+
+
+@Misc{GOTO,
+  title= {  Goto\uppercase{BLAS}},
+  howpublished = {  \url{http://www.tacc.utexas.edu/tacc-projects/gotoblas2}}
+}
+
+@Misc{MAGMA,
+ title={{MAGMA (Matrix Algebra on GPU and Multicore Architectures)}},
+ howpublished={Available at: \url{http://icl.cs.utk.edu/magma}}, 
+key={MAGMA}
+}
+
+@Misc{MAGMA_0.2_UserGuide,
+ title={{MAGMA (Matrix Algebra on GPU and Multicore Architectures) version 0.2 Users' Guide}},
+ author={S. Tomov and R. Nath and P. Du and J. Dongarra},
+ howpublished={available at: \url{http://icl.cs.utk.edu/magma}}
+}
+
+
+@INPROCEEDINGS{NTD10a,
+  author = {R. Nath and S. Tomov and J. Dongarra},
+  title = {{Accelerating GPU Kernels for Dense Linear Algebra}}, 
+  booktitle={Proc. of VECPAR'10},
+  year={2010}
+}
+%   month={June},
+
+
+@article{NTD10b,
+ author = {Nath, R. and Tomov, S. and Dongarra, J.},
+ title = {{An Improved Magma Gemm For Fermi Graphics Processing Units}},
+ journal = {International Journal of High Performance Computing Applications},
+ volume = {24},
+ number = {4},
+ year = {2010},
+ pages = {511--515},
+} 
+ @Comment issue_date = {November  2010},
+ @Comment publisher = {Sage Publications, Inc.},
+ @Comment address = {Thousand Oaks, CA, USA},
+ @Comment keywords = {CUDA matrix mutiply, Fermi, GPU BLAS, dense linear algebra, hybrid computing},
+ @Comment numpages = {5},
+ @Comment url = {http://dx.doi.org/10.1177/1094342010385729},
+ @Comment doi = {10.1177/1094342010385729},
+ @Comment acmid = {1889710},
+ @Comment issn = {1094-3420},
+ @Comment month = nov,
+
+
+@Misc{CUBLAS3.1,
+title= {{CUDA CUBLAS Library 3.1}},
+  howpublished = {  \url{http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/CUBLAS_Library_3.1.pdf} }
+}
+
+@Misc{CUBLAS3.2,
+title= {{CUDA CUBLAS Library 3.2}},
+  howpublished = {  \url{http://developer.download.nvidia.com/compute/cuda/3_2_prod/toolkit/docs/CUBLAS_Library.pdf} }
+}
+
diff --git a/BookGPU/Chapters/chapter15/ch15.tex b/BookGPU/Chapters/chapter15/ch15.tex
new file mode 100644
index 0000000..1eca6ec
--- /dev/null
+++ b/BookGPU/Chapters/chapter15/ch15.tex
@@ -0,0 +1,1168 @@
+\chapterauthor{Pierre Fortin}{Laboratoire d'Informatique de Paris 6, University Paris 6}
+\chapterauthor{Rachid Habel}{T\'el\'ecom SudParis}
+\chapterauthor{Fabienne J\'ez\'equel}{Laboratoire d'Informatique de Paris 6, University Paris 6}
+\chapterauthor{Jean-Luc Lamotte}{Laboratoire d'Informatique de Paris 6, University Paris 6}
+\chapterauthor{Stan Scott}{School of Electronics, Electrical Engineering \& Computer Science,
+The Queen's University of Belfast}
+
+\newcommand{\fixme}[1]{{\bf #1}}
+
+\chapter{Numerical validation and performance optimization on GPUs of
+an application in atomic physics} 
+\label{chapter15}
+
+\section{Introduction}\label{ch15:intro}
+As described in Chapter~\ref{chapter1}, GPUs are characterized by hundreds 
+of cores and theoretically perform one order of magnitude better than CPUs.  
+An important factor to consider when programming on GPUs
+ is the cost of
+data transfers between CPU memory and GPU memory. Thus, to have good
+performance on
+GPUs, applications should be coarse-grained and have a high arithmetic
+intensity 
+($i.e.$ the ratio of arithmetic operations to memory operations). 
+Another important aspect of GPU programming is that floating-point
+operations are preferably performed in single precision, if the
+ validity of results is not impacted by that format.   
+The GPU compute power for floating-point operations is indeed greater in
+single precision than in double precision.  
+The peak performance ratio between single precision and double
+precision varies for example for NVIDIA GPUs from $12$ for the first Tesla
+GPUs (C1060), 
+to $2$ for the Fermi GPUs (C2050 and C2070)  
+and to $3$ for the latest Kepler architecture (K20/K20X).  
+As far as AMD GPUs are concerned, the latest AMD GPU (Tahiti HD 7970)
+presents a ratio of $4$.  
+Moreover, GPU internal memory accesses and CPU-GPU data transfers are
+faster in single precision than in double precision,  
+because of the different format lengths. 
+
+This chapter describes the deployment on GPUs of PROP, a program of the
+2DRMP~\cite{FARM_2DRMP,2DRMP} suite which models electron collisions
+with H-like atoms and ions at intermediate energies. 2DRMP operates successfully on serial
+computers, high performance clusters and supercomputers.  The primary
+purpose of the PROP program is to propagate a global
+R-matrix~\cite{Burke_1987}, $\Re$, in the two-electron configuration
+space.
+The propagation needs to be performed for all collision energies, 
+for instance hundreds of energies,
+which are independent.
+Propagation equations are dominated by matrix multiplications involving sub-matrices of $\Re$.
+However, the matrix multiplications are not
+straightforward in the sense that $\Re$ dynamically changes the designation of its rows and
+columns and increases in size as the propagation proceeds \cite{VECPAR}.
+
+In a preliminary investigation PROP was selected by GENCI\footnote{GENCI: Grand Equipement National
+  de Calcul Intensif, \url{www.genci.fr}} and
+CAPS\footnote{CAPS is a software company providing products and solutions
+  for manycore application programming and deployment,
+  \url{www.caps-entreprise.com}},
+following their first call for projects in 2009-2010 
+aimed at
+deploying applications on hybrid systems based on GPUs.
+First CAPS  
+recast the propagation equations with larger matrices.  
+For matrix products the GPU performance gain over CPU increases indeed 
+with the matrix size, since the 
+CPU-GPU transfer overhead becomes less significant and since CPUs are
+still more efficient for fine computation grains. 
+Then, using HMPP\footnote{
+HMPP or {\em CAPS compiler}, see: \url{www.caps-entreprise.com/hmpp.html}},
+a commercial 
+hybrid and parallel compiler, CAPS 
+developed a version of PROP, in
+which matrix multiplications are performed on
+the GPU or the CPU, depending on the matrix size.
+Unfortunately this partial GPU implementation of PROP does not offer
+significant acceleration. 
+
+The work described in this chapter, which is based on a study presented in \cite{PF_PDSEC2011}, aims at 
+ improving PROP performance on 
+GPUs by exploring two directions. First, because the original version of PROP is written
+in double precision, 
+we study the numerical stability of PROP in single precision. 
+Second, we deploy  the whole
+computation code of PROP on
+GPUs to avoid the overhead generated by
+data transfers 
+and we propose successive improvements
+(including a specific one to the Fermi architecture) 
+in order to optimize the GPU code.
+
+
+
+
+\section{2DRMP and the PROP program}
+\label{s:2DRMP_PROP}
+\subsection{Principles of R-matrix propagation}
+2DRMP~\cite{FARM_2DRMP,2DRMP} is part of the CPC library\footnote{CPC:
+Computer Physics Communications, 
+\url{http://cpc.cs.qub.ac.uk/}}.  
+It is a suite of seven 
+programs aimed at creating virtual experiments on high performance and grid
+architectures to enable the study of electron scattering from H-like
+atoms and ions at intermediate energies.  The 2DRMP suite  uses the
+two-dimensional $R$-matrix propagation approach~\cite{Burke_1987}. 
+In 2DRMP the two-electron configuration  space  ($r_1$,$r_2$) is
+divided into sectors. 
+Figure~\ref{prop} shows the division of the two-electron configuration
+space ($r_1$,$r_2$) into 4 vertical $strips$ representing 10 $sectors$. 
+The key computation in 2DRMP, performed by the PROP program, is the
+propagation of a global 
+$R$-matrix, $\Re$, from sector to sector across the internal region, as shown in Fig.~\ref{prop}. 
+
+\begin{figure}[h]
+\begin{center}
+\includegraphics*[width=0.65\linewidth]{Chapters/chapter15/figures/prop.pdf} 
+\caption{\label{prop} Subdivision of the configuration space 
+($r_1$,$r_2$) into a set of connected sectors.}
+\end{center}
+\end{figure}
+
+\begin{figure}[h]
+\begin{center}
+\includegraphics*[width=0.8\linewidth]{Chapters/chapter15/figures/Domain.pdf} 
+\caption{\label{domain} Propagation of the R-matrix from domain D to domain D'.}
+\end{center}
+\end{figure}
+
+We consider the general situation in
+Fig.~\ref{domain} where we assume that we already know
+the global $R$-matrix, $\Re^{I}$, associated with the boundary defined
+by edges 5, 2, 1 and 6 
+in domain $D$ and we wish to
+evaluate the new global $R$-matrix, $\Re^{O}$, associated with edges 5, 3, 4 and 6 
+in domain $D'$ following propagation across subregion $d$.
+Input edges are denoted by I (edges 1 and~2), output edges by O (edges 3 and 4) and
+common edges by X (edges 5 and~6).
+Because of symmetry, only the lower half of domains $D$ and $D'$ has to be considered. 
+The global $R$-matrices, $\Re^{I}$ in domain $D$ and $\Re^{O}$ in
+domain $D'$, can be written as:
+\begin{equation}
+\Re^{I} = \left(\begin{array}{cc}
+      \Re_{II}^{I} & \Re_{IX}^{I}\\
+      \Re_{XI}^{I} & \Re_{XX}^{I}
+    \end{array}\right) 
+\ \
+\Re^{O} = \left(\begin{array}{cc}
+      \Re_{OO}^{O} & \Re_{OX}^{O}\\
+      \Re_{XO}^{O} & \Re_{XX}^{O}
+    \end{array}\right).
+\label{eq:RI_RO}
+\end{equation}
+
+
+
+From the set of local $R$-matrices, $\mathbf{R}_{ij}$ ($i,j\in \{1,2,3,4\}$)
+associated 
+with subregion $d$, we can define
+\begin{subequations}
+\begin{eqnarray}
+    \mathbf{r}_{II} = \left(\begin{array}{cc}
+      \mathbf{R}_{11} & \mathbf{R}_{12}\\
+      \mathbf{R}_{21} & \mathbf{R}_{22}
+    \end{array}\right), \label{eqaa} & 
+    \mathbf{r}_{IO} = \left(\begin{array}{cc}
+      \mathbf{R}_{13} & \mathbf{R}_{14}\\
+      \mathbf{R}_{23} & \mathbf{R}_{24}
+    \end{array}\right), \label{eqbb}\\
+    \mathbf{r}_{OI} = \left(\begin{array}{cc}
+      \mathbf{R}_{31} & \mathbf{R}_{32}\\
+      \mathbf{R}_{41} & \mathbf{R}_{42}
+    \end{array}\right), \label{eqcc} &
+   \mathbf{r}_{OO} = \left(\begin{array}{cc}
+      \mathbf{R}_{33} & \mathbf{R}_{34}\\
+      \mathbf{R}_{43} & \mathbf{R}_{44}
+    \end{array}\right),\label{eqdd}
+\end{eqnarray}
+\end{subequations}
+where $I$ represents the input edges 1 and 2, and $O$ represents
+the output edges 3 and 4 (see Fig.~\ref{domain}). 
+The propagation across each sector is characterized by equations~(\ref{eq1}) to (\ref{eq4}).
+\begin{subequations}
+\begin{eqnarray}
+\Re^{O}_{OO} &=& \mathbf{r}_{OO} - \mathbf{r}_{IO}^T (r_{II} + \Re^{I}_{II})^{-1}\mathbf{r}_{IO}, \label{eq1} \\
+\Re^{O}_{OX} &=& \mathbf{r}_{IO}^T (\mathbf{r}_{II} + \Re^{I}_{II})^{-1}\Re^{I}_{IX},  \label{eq2} \\
+   \Re^{O}_{XO} &=& \Re^{I}_{XI}(\mathbf{r}_{II} + \Re^{I}_{II})^{-1}\mathbf{r}_{IO},  \label{eq3} \\
+   \Re^{O}_{XX} &=& \Re^{I}_{XX} - \Re^{I}_{XI}(\mathbf{r}_{II} +\Re^{I}_{II})^{-1}\Re^{I}_{IX}. \label{eq4}
+\end{eqnarray}
+\end{subequations}
+
+The matrix inversions are not explicitly performed. To compute
+$(r_{II} + \Re^{I}_{II})^{-1}\mathbf{r}_{IO}$ and $(\mathbf{r}_{II} + \Re^{I}_{II})^{-1}\Re^{I}_{IX}$,
+two linear systems are solved.
+
+
+\medskip
+
+While equations (\ref{eq1})-(\ref{eq4}) can be applied to the
+propagation across a general subregion two special situations should be
+noted: propagation across a diagonal subregion and propagation across
+a subregion bounded by the $r_{1}$-axis at the beginning of a new
+strip.
+
+In the case of a diagonal subregion, from symmetry considerations,
+edge 2 is identical to edge 1 and edge 3 is identical to edge~4. 
+Accordingly, with only one input edge and one output edge equations
+(\ref{eqaa})-(\ref{eqdd}) become: 
+\begin{subequations}
+\begin{eqnarray}
+\mathbf{r}_{II} = 2\mathbf{R}_{11}, \ 
+\mathbf{r}_{IO} = 2\mathbf{R}_{14}, \label{eq4b}\\
+\mathbf{r}_{OI} = 2\mathbf{R}_{41}, \ 
+\mathbf{r}_{OO} = 2\mathbf{R}_{44}. \label{eq4d}
+\end{eqnarray}
+\end{subequations}
+In the case of a subregion bounded by the $r_1$-axis at the beginning
+of a new strip, we note that the input boundary $I$ consists of only
+one edge. When propagating across the first subregion in the second
+strip there is no common boundary $X$: in this case only equation
+(\ref{eq1}) needs to be solved.
+
+\medskip
+
+Having obtained the global $R$-matrix $\Re$ on the boundary of the
+innermost subregion (labeled $0$ in Fig.~\ref{prop}), $\Re$ is propagated across
+each subregion in the order indicated in Fig.~\ref{prop},
+working systematically from the
+$r_1$-axis at the bottom of each strip across all subregions to the
+diagonal. 
+
+
+
+\subsection{Description of the PROP program}
+\label{sec:PROP}
+
+\begin{table}[t]
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|}
+ \hline
+ \multirow{2}{0.09\linewidth}{\centering Data set} &
+ \multirow{2}{0.15\linewidth}{\centering
+  Local $R$-\\matrix size} &
+ \multirow{2}{0.07\linewidth}{\centering Strips} &
+ \multirow{2}{0.09\linewidth}{\centering Sectors} &
+ \multirow{2}{0.19\linewidth}{\centering Final global \\$R$-matrix size} &
+ \multirow{2}{0.15\linewidth}{\centering Scattering\\energies} \\
+  & & & & & \\
+  \hline
+  Small & 90x90  & 4 & 10 & 360x360 & 6\\
+ \hline
+  Medium  & 90x90  & 4 & 10 & 360x360 & 64\\
+ \hline
+  Large  & 383x383  & 20 & 210 &  7660x7660 & 6\\
+ \hline
+  Huge  & 383x383  & 20 & 210 &  7660x7660 & 64\\ \hline
+\end{tabular}
+\caption{\label{data-sets}Characteristics of four data sets}
+\end{center}
+\end{table}
+
+The PROP program computes the propagation of the $R$-matrix across the sectors of the internal region.  
+Table~\ref{data-sets} shows four different 
+data sets used in this study and highlights the principal parameters of the PROP program. 
+ PROP execution can be described by Algorithm~\ref{prop-algo}. 
+First, amplitude arrays and
+correction data are read from data files generated by the preceding
+program of the 2DRMP suite.  
+Then, local $R$-matrices are constructed from amplitude arrays. 
+Correction data is used to compute correction vectors added to the diagonal of the local 
+$R$-matrices. The local $R$-matrices, together with the input $R$-matrix, 
+ $\Re^{I}$,
+computed on the previous sector, are used to compute the current output
+$R$-matrix, 
+$\Re^{O}$. 
+ At the end of a sector evaluation,
+the output $R$-matrix becomes  the input $R$-matrix 
+for the next evaluation.  
+
+%% \begin{algorithm}
+%% \caption{\label{prop-algo}PROP algorithm}
+%% \begin{algorithmic}
+%% \FOR{all scattering energies}
+%%  \FOR{all sectors}
+%%  \STATE Read amplitude arrays
+%%  \STATE Read correction data
+%% \STATE Construct local $R$-matrices
+%% \STATE From $\Re^{I}$ and local $R$-matrices, compute $\Re^{O}$
+%% \STATE $\Re^{O}$ becomes $\Re^{I}$ for the next sector 
+%%  \ENDFOR
+%%  \STATE Compute physical $R$-Matrix 
+%% \ENDFOR
+%% \end{algorithmic}
+%% \end{algorithm}
+
+
+On the first sector, there is no input $R$-matrix yet. To bootstrap
+the propagation, the first output $R$-matrix is constructed using only
+one local $R$-matrix.  On the last sector, that is, on the boundary of
+the inner region, a physical $R$-matrix corresponding to the output
+$R$-matrix is computed and stored into an output file.
+
+In the PROP program, sectors are characterized into four types,
+depending on the computation performed: 
+\begin{itemize}
+\item the starting sector (labeled 0 in Fig.~\ref{prop})
+\item the axis sectors (labeled 1, 3 and 6 in Fig.~\ref{prop})
+\item the diagonal sectors (labeled 2, 5 and 9 in Fig.~\ref{prop})
+\item the off-diagonal sectors (labeled 4, 7 and 8 in Fig.~\ref{prop}).
+\end{itemize}
+
+
+The serial version of PROP is implemented in Fortran~90 and uses
+for linear algebra operations BLAS and LAPACK routines
+which are fully optimized for x86 architecture.
+This 
+program 
+serially propagates the $R$-matrix for
+all scattering energies. 
+Since the propagations for these different 
+energies are independent, there also 
+exists an embarrassingly parallel version of 
+PROP 
+that spreads the computations of 
+several energies
+among multiple CPU nodes via  
+MPI. 
+
+
+
+\subsection{CAPS implementation}
+\label{caps}
+
+In order to handle larger matrices, and thus obtain better GPU  speedup, CAPS  
+recast equations (\ref{eq1}) to (\ref{eq4}) into one equation.
+The output $R$-matrix $\Re^{O}$ defined by equation~(\ref{eq:RI_RO}) is now computed as follows. 
+\begin{equation}\label{eq_CAPS_1}
+\Re^{O} = \Re^{O^{\ \prime}} + U A^{-1} V, 
+\end{equation}
+\begin{equation}\label{eq_CAPS_2}
+{\rm with} \
+\Re^{O^{\ \prime}}= \left(\begin{array}{cc}
+      \mathbf{r}_{OO} & 0\\
+      0 & \Re^I_{XX}    \end{array}\right), \
+U= \left(\begin{array}{c}
+      \mathbf-{r}_{IO}^{T}\\
+      \Re^I_{XI}    \end{array}\right), 
+\end{equation}
+\begin{equation}\label{eq_CAPS_3}
+A= \mathbf{r}_{II} + \Re^I_{II} \ {\rm and}  \
+V= (\mathbf{r}_{IO}\ \ \ -\Re^I_{IX}).
+\end{equation}
+
+To compute $W=A^{-1}V$, no matrix inversion is performed. The matrix
+system $AW=V$ is solved. 
+This reimplementation of PROP reduces the number of equations to be
+solved and the number of matrix copies for evaluating each sector.
+For instance, for an off-diagonal sector, 
+copies fall from 22 to 5, matrix multiplications from 4 to~1 and calls
+to a linear equation solver from 2 to 1. 
+
+To implement this version, CAPS 
+used HMPP, a 
+commercial hybrid and parallel compiler, 
+based on compiler directives like the new OpenACC standard\footnote{See: \url{www.openacc-standard.org}}.  
+If the matrices are large enough (the limit sizes are experimental parameters), 
+they are multiplied on the GPU, otherwise on the CPU. 
+CAPS 
+ used the MKL BLAS implementation on an Intel Xeon
+x5560 quad core CPU (2.8 GHz) 
+and the CUBLAS library (CUDA 2.2) on one Tesla C1060 GPU. 
+On the large data set (see Table~\ref{data-sets}), CAPS 
+ obtained a speedup of 1.15 for the GPU 
+version over the CPU one (with multi-threaded MKL calls on the four
+CPU cores). This limited gain in performance is mainly
+due to the use of double precision computation 
+and to the small or medium sizes of most matrices.
+For these matrices, the computation gain on  
+the GPU is indeed
+strongly affected by the overhead 
+generated by transferring these matrices from  
+the CPU memory to the GPU memory to perform each matrix multiplication and then
+transferring the result back to the CPU memory. 
+
+Our goal is to speedup PROP more significantly by porting the whole
+code to the GPU and therefore avoiding 
+the 
+intermediate data transfers between
+the host (CPU) and the GPU. We will also study the
+stability of PROP in single precision because 
+single precision computation is faster on the GPU  
+and CPU-GPU data transfers are twice as fast as those performed in
+double precision. 
+
+
+
+\section{Numerical validation of PROP in single precision}
+\label{single-precision}
+
+\begin{comment}
+\begin{table}[h]
+\begin{center}
+\begin{tabular}{|c|c|}
+  \hline
+   relative error interval & \# occurrences \\
+  \hline
+   [0, 1.E-8) & 18 \\
+  \hline
+   [1.E-8, 1.E-6) & 1241 \\
+  \hline
+   [1.E-6, 1.E-4) & 48728 \\
+  \hline
+   [1.E-4, 1.E-2) & 184065 \\
+  \hline
+   [1.E-2, 1) & 27723 \\
+  \hline
+   [1, 100) & 304 \\
+  \hline
+   [100, $+\infty$) & 1 \\
+  \hline
+\end{tabular}
+\end{center}
+\caption{\label{sp-distrib}Error distribution for medium case in single precision}
+\end{table}
+\end{comment}
+
+
+Floating-point input data, computation and output data of PROP are 
+originally in double precision format.
+PROP produces a standard $R$-matrix H-file \cite{FARM_2DRMP}
+ and a collection of Rmat00X files (where X
+ranges from 0 to the number of scattering energies - 1)
+holding the physical R-matrix for each 
+energy.
+The  H-file  and the Rmat00X files are binary input files of the FARM program \cite{FARM_2DRMP}
+(last program of the 2DRMP suite).
+Their text equivalent are the  prop.out 
+and the prop00X.out files. 
+To study the validity of PROP results in  single precision,
+first,
+reference results are 
+ generated by running the serial version of PROP in double precision.
+Data used in the most costly computation parts are read from input files in
+double precision format and then 
+cast to single precision format.
+PROP results  (input of FARM) are computed in single precision and  written
+into files in double precision. 
+
+\subsection{Medium case study}
+\begin{figure}[h]
+\begin{center}
+\includegraphics*[width=0.9\linewidth]{Chapters/chapter15/figures/error.pdf} 
+\caption{\label{fig:sp-distrib} Error distribution for medium case in single precision}
+\end{center}
+\end{figure}
+
+The physical $R$-matrices, in
+the prop00X.out files, are compared to the
+reference ones for the medium case (see Table~\ref{data-sets}). 
+ The relative
+error distribution is
+given in Fig.~\ref{fig:sp-distrib}. 
+We focus on the largest errors. 
+\begin{itemize}
+\item Errors greater than 100: the only impacted value is of order 1.E-6
+and is negligible compared to the other ones 
+in the same prop00X.out file.
+
+\item Errors between 1 and 100: the values corresponding to the
+  largest errors are of order 1.E-3 and are negligible compared to
+  the majority of the other values which range between 1.E-2 and
+  1.E-1.
+
+\item Errors between 1.E-2 and 1: the largest errors ($\ge$ 6\%)
+  impact values the order of magnitude of which is at most 1.E-1.
+  These values are negligible. 
+  Relative errors of approximately 5\% impact values the order of
+  magnitude of which is at most 1.E2. 
+  For instance, the value 164 produced by the reference version of
+  PROP becomes 172 in the single precision version.
+
+\end{itemize}
+
+To study the impact of the single precision version of PROP on the
+FARM program, the cross-section
+results files corresponding to 
+transitions 
+{1s1s}, 
+{1s2s}, {1s2p}, {1s3s}, {1s3p}, {1s3d},
+{1s4s}, {2p4d} are compared to the reference ones.  
+Table~\ref{sp-farm} shows that all cross-section files are impacted by
+errors.  Indeed in the  {2p4d} file,  four relative errors are 
+greater than one and the maximum relative error is 1.60. 
+However the largest errors impact negligible values. For example, the maximum
+error (1.60) impacts a reference value which is 4.5E-4.  The largest 
+values are impacted by low errors. For instance, the maximum value
+(1.16) is impacted by a relative error of the order 1.E-3. 
+
+\begin{table}[t] 
+\begin{center}
+\begin{tabular}{|c|c||c|c|} \hline
+  file & largest relative error & file & largest relative error\\ \hline
+ {1s1s} & 0.02& {1s3p} & 0.11  \\ \hline
+ {1s2s} & 0.06 &  {1s3d} &  0.22 \\ \hline
+ {1s2p} &  0.08 & {1s4s} &  0.20  \\ \hline
+ {1s3s} &  0.17 &2p4d & 1.60  \\ \hline
+\end{tabular}
+\caption{\label{sp-farm}Impact  on FARM  of the single precision version of PROP}
+\end{center}
+\end{table}
+
+To examine in more detail the impact of PROP on FARM, 
+cross sections above the ionization threshold (1 Ryd)
+are compared in single and
+double precision  for 
+transitions amongst the 1s, \dots 4s, 2p, \dots 4p, 3d, 4d target states.  
+This comparison is carried out by generating 45 plots.  In all the
+ plots, results in single and double precision match except for few
+ scattering energies which are very close to pseudo-state thresholds. 
+For example Fig.~\ref{1s2p} and \ref{1s4d} present the scattering energies corresponding to the
+{1s2p} and {1s4d} cross-sections computed in single and double precision.   For some cross-sections, 
+increasing a threshold parameter from 1.E-4 to 1.E-3 in the FARM
+program
+results in energies close to threshold being avoided
+ and therefore
+the cross-sections in double and single precision match more
+accurately. 
+This is the case for instance for cross-section 1s2p (see Fig.~\ref{1s2p3}). 
+However for other cross-sections (such as 1s4d) some problematic energies remain even if the 
+threshold parameter in  the FARM
+program is increased to 1.E-3 (see Fig.~\ref{1s4d3}).  A higher 
+threshold parameter would be required for such cross-sections. 
+
+\begin{figure}[t]
+\centering
+\subfigure[threshold = 1.E-4]{ 
+\includegraphics*[width=.76\linewidth]{Chapters/chapter15/figures/1s2p.pdf}
+   \label{1s2p}
+ }
+\subfigure[threshold = 1.E-3]{
+\includegraphics*[width=.76\linewidth]{Chapters/chapter15/figures/1s2p3.pdf}
+ \label{1s2p3}
+ }
+\label{fig:1s2p_10sectors}
+\caption{1s2p cross-section, 10 sectors}
+\end{figure}
+
+\begin{figure}[t]
+\centering
+\subfigure[threshold = 1.E-4]{
+\includegraphics*[width=.76\linewidth]{Chapters/chapter15/figures/1s4d.pdf}
+   \label{1s4d}
+ }
+\subfigure[threshold = 1.E-3]{
+\includegraphics*[width=.76\linewidth]{Chapters/chapter15/figures/1s4d3.pdf}
+ \label{1s4d3}
+ }
+\label{fig:1s4d_10sectors}
+\caption{1s4d cross-section, 10 sectors}
+\end{figure}
+
+As a conclusion, the medium case study shows that the execution of
+PROP in single precision leads to a few inexact scattering energies to
+be computed by the FARM program for some cross-sections.
+Thanks to a suitable threshold parameter in the FARM program these problematic energies may possibly 
+be skipped. 
+Instead of investigating deeper the choice of such a parameter for the medium case, we analyze the 
+single precision computation  in a more
+realistic case in Sect.~\ref{huge}. 
+\begin{comment}
+The conclusion of the medium case study is that running PROP in single
+precision gives relatively stable results provided that suitable
+parameter values are used in the FARM program in order to skip the
+problematic energies that are too close to the pseudo-state
+thresholds.  To verify if this conclusion is still valid with a larger
+data set, the single precision computation is analyzed in a more
+realistic case in Sect.~\ref{huge}.
+\end{comment}
+
+\subsection{Huge case study}\label{huge}
+
+
+\begin{figure}[t] 
+  \centering
+\includegraphics*[width=.76\linewidth]{Chapters/chapter15/figures/1s2pHT.pdf}
+\caption{\label{1s2pHT}1s2p cross-section, threshold = 1.E-4, 210 sectors}
+\end{figure}
+
+\begin{figure}[t] 
+  \centering
+\includegraphics*[width=.76\linewidth]{Chapters/chapter15/figures/1s2pHT.pdf}
+\caption{\label{1s4dHT}1s4d cross-section, threshold = 1.E-4, 210 sectors}
+\end{figure}
+
+We study here the impact on FARM of the PROP program run in
+single precision for the huge case (see Table~\ref{data-sets}).
+The cross-sections
+corresponding to all
+atomic target states 1s \dots 7i are explored, which
+leads to 
+406 comparison plots. 
+It should be noted that in this case, over the same energy range above the ionization threshold, the density of pseudo-state thresholds is significantly increased compared to the medium case.
+As expected, all the plots exhibit large differences between single and double
+precision cross-sections. 
+For example Fig.~\ref{1s2pHT} and  \ref{1s4dHT} present the 1s2p and 1s4d cross-sections computed in
+single and double precision for the huge case.  
+We can conclude that PROP in single precision gives invalid results 
+for realistic simulation cases above the ionization threshold.
+Therefore the  deployment of PROP on GPU, described in Sect.~\ref{gpu-implem},
+has been carried out in double precision. 
+
+\section{Towards a complete deployment of PROP on GPUs} 
+\label{gpu-implem}
+
+We now detail how PROP has been progressively deployed on
+GPUs in double precision in order to avoid the
+expensive memory transfers between the host and the GPU.
+Different versions with successive improvements and optimizations are presented.
+We use CUDA~\cite{CUDA_ProgGuide} for GPU programming, as well as the
+CUBLAS~\cite{CUBLAS} 
+and MAGMA \cite{MAGMA} libraries for linear algebra operations.
+Since PROP is written in Fortran 90, {\em wrappers} in C are used to
+enable calls to CUDA kernels from PROP routines.
+
+
+\subsection{Computing the output $R$-matrix on GPU}
+\label{gpu-RO}
+
+\begin{figure}[h]
+  \centering
+  \includegraphics[width=0.7\linewidth]{Chapters/chapter15/figures/offdiagonal_nb.pdf}
+  \caption{\label{offdiagonal} The six steps of an off-diagonal sector
+    evaluation.}
+\end{figure}
+
+As mentioned in Algorithm~\ref{prop-algo}, evaluating a sector
+mainly consists in constructing local $R$-matrices and in computing
+one output $R$-matrix, $\Re^{O}$. In this first step of the porting
+process, referred to as GPU V1\label{gpuv1},
+we only consider the computation of $\Re^{O}$ on the GPU.
+We distinguish the following six steps, related to equations
+(\ref{eq_CAPS_1}), (\ref{eq_CAPS_2}) and (\ref{eq_CAPS_3}), and illustrated in
+Fig.~\ref{offdiagonal} for an off-diagonal sector.
+
+\begin{description}
+\item[Step 1] (``Input copies''):~data are copied from $\Re^{I}$
+  to temporary arrays ($A$, $U$, $V$) and to $\Re^{O}$.
+  These copies, along with possible scalings or transpositions, are
+  implemented as CUDA kernels which can be applied to two
+  matrices of any size and starting at any offset. 
+  Memory accesses are coalesced \cite{CUDA_ProgGuide} in order to
+  provide the best performance for such memory-bound kernels.
+\item[Step 2] (``Local copies''):~data are copied from
+  local $R$-matrices to temporary arrays ($U$, $V$) and to $\Re^{O}$.
+  Moreover data from local $R$-matrix
+  $\mathbf{r}_{II}$ 
+  is added to matrix $A$ (via a CUDA kernel) and zeroes are written in
+   $\Re^{O}$  where required.
+\item[Step 3] (``Linear system solving''):~matrix $A$ is factorized
+  using the MAGMA DGETRF 
+   routine and the result is stored in-place.
+\item[Step 4] (``Linear system solving'' cont.):~the matrix system
+ of linear equations  $AW$ = $V$ is solved using the MAGMA DGETRS 
+routine. The  solution is stored in matrix $V$.
+\item[Step 5] (``Output matrix product''):~matrix $U$
+  is multiplied by matrix $V$ using the CUBLAS DGEMM 
+  routine. The result is stored in a temporary matrix~$t$.
+\item[Step 6] (``Output add''):~$t$ is added to $\Re^{O}$ (CUDA
+  kernel).
+\end{description}
+
+All the involved matrices are stored in the GPU memory. Only the
+local $R$-matrices are first constructed on the host and then sent
+to the GPU memory, since these matrices vary from sector to sector.
+The evaluation of the axis and diagonal sectors is similar.
+However, fewer operations and copies are required because of
+symmetry considerations \cite{2DRMP}.
+
+\subsection{Constructing the local $R$-matrices on GPU}
+
+\begin{figure}[t]
+ \centering
+  \includegraphics[width=0.7\linewidth]{Chapters/chapter15/figures/amplitudes_nb.pdf} 
+ \caption{\label{amplitudes} Constructing the local $R$-matrix R34
+ from the $j$ amplitude array associated with edge 4 and the $i$
+ amplitude array associated with edge~3.}
+\end{figure}
+
+Local $R$-matrices are constructed using two three dimensional arrays,
+$i$ and $j$. Each three dimensional array contains four
+matrices corresponding to the surface amplitudes associated with the
+four edges of a sector. Those matrices are named {\em amplitude arrays}.
+ $j$ amplitude arrays are read from data files and $i$ amplitude arrays
+are obtained by scaling each row of the $j$ amplitude arrays. 
+The main part of the construction of a local $R$-matrix,
+presented in Fig.~\ref{amplitudes},
+is a matrix product between
+one $i$ amplitude array and one transposed $j$ amplitude array 
+which is performed by a single DGEMM 
+BLAS call. 
+In this version, hereafter referred to as GPU
+V2\label{gpuv2}, $i$ and $j$ amplitude arrays are transferred to the
+GPU memory and the required matrix multiplications are performed on
+the GPU (via CUBLAS routines).
+
+
+The involved matrices having medium sizes (either $3066 \times 383$ or
+$5997 \times 383$),
+performing these matrix multiplications
+on the GPU is expected to be faster than on the CPU.
+However, this implies a greater communication volume
+between the CPU and the GPU
+since the $i$ and $j$ amplitude arrays are larger than the local
+$R$-matrices.
+It can be noticed that correction data are also used in the
+construction of a local $R$-matrix,
+but this is a minor part in the
+computation. However, these correction data also have to be
+transferred from the CPU to the GPU for each sector.
+
+\subsection{Scaling amplitude arrays on GPU}
+It  
+should be worthwhile to try to reduce the CPU-GPU data
+transfers of the GPU V2, where the $i$ and $j$ amplitude arrays are
+constructed on the host and then sent to the GPU memory for each sector. 
+In this new version, hereafter referred to as GPU V3\label{gpuv3}, we
+transfer only the $j$ amplitude arrays and the
+required scaling factors (stored in one 1D array) to the GPU memory,
+so that the $i$ amplitude arrays are
+then directly computed on the GPU by multiplying the $j$ amplitude
+arrays by these scaling factors (via a CUDA kernel).
+Therefore, we save the transfer of four $i$ amplitude arrays on
+each sector by transferring  only this 1D array of scaling factors. 
+Moreover, scaling $j$ amplitude arrays is expected to be faster on the
+GPU than on the CPU, thanks to the massively parallel architecture of
+the GPU and thanks to its higher internal memory bandwidth.
+
+\subsection{Using double-buffering to overlap I/O and computation}
+
+\begin{figure}[t]
+  \centering
+  \includegraphics[width=0.8\linewidth]{Chapters/chapter15/figures/C1060_V3_IO_COMP.pdf} 
+  \caption{\label{overlapping} Compute and I/O times for the GPU V3 on
+    one C1060.}  
+\end{figure} 
+
+As described in Algorithm~\ref{prop-algo}, there are two main steps in
+the propagation across a sector: reading  amplitude arrays
+and correction data from I/O files and
+evaluating the current sector.
+Fig.~\ref{overlapping} shows the I/O times and the evaluation times
+of each sector for the huge case execution (210 sectors, 20 strips) of the GPU V3 
+on one C1060. 
+Whereas the times required by the off-diagonal sectors are similar
+within each of the 20 strips, 
+the times for diagonal sectors of each strip
+are the shortest ones, the times for the axis sectors being
+intermediate.
+The I/O times are roughly constant among all strips.
+The evaluation time is equivalent to the I/O
+time for the first sectors. But this evaluation time grows 
+linearly with the strip number, and rapidly exceeds the I/O 
+time.
+
+It is thus interesting to use a double-buffering technique to overlap the
+I/O time with the evaluation time:
+for each sector, the evaluation of sector $n$ is performed
+(on GPU) simultaneously with the reading of data for sector
+$n+1$ (on CPU). This requires the duplication in the CPU memory of all the
+data structures
+used for storing data read from I/O files for each sector.
+This version, hereafter referred to as GPU
+V4\label{gpuv4}, uses POSIX threads. Two threads are
+executed concurrently: an I/O thread that reads data from I/O files
+for each sector, and a computation thread, dedicated to the propagation
+of the global $R$-matrix, that performs successively for each sector
+all necessary computations on GPU, 
+as well as all required CPU-GPU data transfers.
+The evaluation of a sector uses the data read for this sector as well
+as the global $R$-matrix computed on the previous sector.
+This dependency requires synchronizations between the I/O thread and
+the computation thread which are implemented through standard POSIX
+thread mechanisms.
+
+
+\subsection{Matrix padding}
+The CUBLAS DGEMM 
+performance and the MAGMA DGETRF/DGETRS 
+performance is reduced when the sizes (or
+the leading dimensions) of the matrix are not multiples of the inner blocking size \cite{NTD10a}.
+This inner blocking size can be 32 or 64, depending on the computation
+and on the underlying  
+GPU architecture \cite{MAGMA,NTD10b}. 
+In this version (GPU V5\label{gpuv5}), 
+the matrices are therefore padded with $0.0$ (and $1.0$ on the diagonal for the linear systems)
+so that their sizes are 
+multiples of 64.
+This corresponds indeed to the optimal size for the matrix product on the
+Fermi architecture \cite{NTD10b}. And as far as linear system solving is
+concerned, all the matrices have sizes which are multiples of 383: we
+therefore use padding to obtain multiples of 384 (which are 
+again  multiples of 64). 
+It can be noticed that this padding has to be performed dynamically
+as the matrices increase in size during the propagation 
+(when possible the
+ maximum required storage space is however allocated only once in the
+ GPU memory). 
+
+
+
+\section{Performance results}
+\subsection{PROP deployment on GPU}
+
+\begin{table*}[ht]
+\begin{center}
+\begin{tabular}{|c||c|c||}
+ \hline
+  PROP version & \multicolumn{2}{c|}{Execution time} \\
+  \hline
+  \hline
+  CPU version: 1  core & \multicolumn{2}{c|}{201m32s} \\
+  \hline
+  CPU version: 4  cores &  \multicolumn{2}{c|}{113m28s} \\
+  \hline \hline
+GPU version  & C1060 & C2050 \\
+  \hline\hline
+  GPU V1 (\S~\ref{gpuv1}) & 79m25s & 66m22s  \\
+  \hline
+  GPU V2 (\S~\ref{gpuv2}) & 47m58s & 29m52s \\
+  \hline
+  GPU V3 (\S~\ref{gpuv3}) & 41m28s & 23m46s \\
+  \hline
+  GPU V4 (\S~\ref{gpuv4}) & 27m21s & 13m55s\\
+  \hline
+  GPU V5 (\S~\ref{gpuv5}) & 24m27s & 12m39s  \\
+  \hline
+\end{tabular}
+\caption{\label{table:time} 
+Execution time of PROP on CPU and GPU}
+\end{center}
+\end{table*}
+
+\begin{comment}
+\begin{table*}[ht]
+\begin{center}
+\begin{tabular}{|c||c|c||}
+ \hline
+  PROP version & \multicolumn{2}{c|}{Execution time} \\
+  \hline  \hline
+CPU version & 1 core & 4 cores  \\\hline
+& {201m32s} & {113m28s} \\ \hline  \hline
+GPU version  & C1060 & C2050 \\
+  \hline\hline
+  GPU V1 (\ref{gpuv1}) & 79m25s & 66m22s  \\
+  \hline
+  GPU V2 (\ref{gpuv2}) & 47m58s & 29m52s \\
+  \hline
+  GPU V3 (\ref{gpuv3}) & 41m28s & 23m46s \\
+  \hline
+  GPU V4 (\ref{gpuv4}) & 27m21s & 13m55s\\
+  \hline
+  GPU V5 (\ref{gpuv5}) & 24m27s & 12m39s  \\
+  \hline
+\end{tabular}
+\caption{\label{table:time} 
+Execution time of the successive GPU versions}
+\end{center}
+\end{table*}
+\end{comment}
+
+\begin{figure}[h]
+\centering
+\subfigure[Speedup over 1 CPU core]{
+\includegraphics*[width=0.76
+        \linewidth]{Chapters/chapter15/figures/histo_speedup_1core.pdf}
+   \label{fig:speedup_1core}
+ }
+
+\subfigure[Speedup over 4 CPU cores]{
+\includegraphics*[width=0.76
+        \linewidth]{Chapters/chapter15/figures/histo_speedup_4cores.pdf}
+ \label{fig:speedup_4cores}
+ }
+\label{fig:speedup}
+\caption{Speedup of the successive GPU versions.}
+\end{figure}
+
+Table~\ref{table:time} presents 
+the execution times 
+of PROP on CPUs and GPUs, 
+each version solves the propagation equations in the 
+form~(\ref{eq_CAPS_1}-\ref{eq_CAPS_3}) as proposed by CAPS. 
+Fig.~\ref{fig:speedup_1core} (respectively \ref{fig:speedup_4cores})
+shows the speedup of the successive GPU versions
+over one CPU core (respectively four CPU cores). 
+We use here Intel Q8200 quad-core CPUs (2.33 GHz), one C1060 GPU and
+one C2050 (Fermi) GPU, located at 
+ UPMC (Universit\'e Pierre et Marie Curie, Paris, France). 
+As a remark, the execution times measured on the C2050 would be the same 
+on the C2070 and on  the C2075, the only difference between these GPUs 
+being their memory size and their TDP (Thermal Design Power). 
+We emphasize that the execution times correspond to the
+complete propagation for all six energies of the large case (see
+Table~\ref{data-sets}), that is to say to the complete execution of
+the PROP program.  
+Since energies are independent, execution times for more energies
+(e.g. the huge case) should be proportional
+to those reported in Table~\ref{table:time}.  
+
+These tests, which have been performed with CUDA 3.2, CUBLAS 3.2 and 
+MAGMA 0.2, 
+show that the successive GPU versions of PROP offer 
+increasing, and at the end interesting, speedups.
+More precisely, 
+V2 shows that it is worth increasing slightly the
+CPU-GPU communication volume in order to perform
+large enough matrix products on the GPU. 
+ This communication volume can fortunately be
+reduced thanks to~V3,  
+which also accelerates the computation of
+amplitude arrays thanks to the GPU. 
+The 
+double-buffering technique implemented in V4
+ effectively enables the overlapping of 
+I/O operations with computation, while the 
+padding implemented in V5 also improves the computation times.
+It 
+is noticed that the padding 
+does offer much more performance gain with,  
+for example,  CUDA 3.1 and the MAGMA DGEMM~\cite{NTD10b}:  the
+speedup with respect to one 
+CPU core was increased from 6.3 to 8.1 on C1060, and from 9.5 to 14.3
+on C2050. 
+Indeed since CUBLAS 3.2 performance has been improved for non block multiple
+matrix sizes through MAGMA code~\cite{NTD10a}. 
+Although for all versions the C2050 (with its improved
+double precision performance) offers up to almost 
+double speedup compared to 
+the C1060, the performance obtained with both architectures justifies the use of 
+the GPU for such an application.
+
+\subsection{PROP execution profile}
+
+\begin{figure}[t]
+  \centering
+ \includegraphics*[width=0.64\linewidth]{Chapters/chapter15/figures/CPU_1_CORE_TIMES.pdf}
+\caption{CPU (1 core)  execution times for the off-diagonal sectors of the large case.}
+\label{fig:CPU-timing}
+\end{figure}
+
+\begin{figure}[t]
+  \centering
+  \subfigure[GPU V5 on one C1060]{ 
+\includegraphics*[width=0.64\linewidth]{Chapters/chapter15/figures/GPU_V5_C1060_TIMES.pdf}
+\label{GPU-timing}} 
+
+  \subfigure[GPU V5 on one C2050]{
+\includegraphics*[width=0.64\linewidth]{Chapters/chapter15/figures/GPU_V5_C2050_TIMES.pdf}
+\label{fermi-timing}} 
+ 
+\caption{GPU execution times for the off-diagonal sectors of
+  the large case.}
+\label{fig:profileGPU}
+\end{figure}
+
+We detail here the execution profile on 
+the CPU and the GPU for the evaluation of all off-diagonal sectors 
+(the most representative ones) for a complete energy propagation. 
+ Fig.~\ref{fig:CPU-timing} and \ref{fig:profileGPU} show CPU and GPU execution times for the 
+171 off-diagonal sectors of  the large case (see Table \ref{data-sets}). 
+``Copying, adding, scaling'' corresponds to the amplitude
+  array construction (scaling) as well as to steps 1, 2 and 6 in
+  Sect.~\ref{gpu-RO}, all implemented via CUDA kernels.
+``CPU-GPU transfers'' 
+aggregate transfer times for the $j$ amplitude
+arrays and the scaling factors, as well as for the correction data.
+ ``Amplitude matrix product'' corresponds to the DGEMM call to
+ construct the local R-matrices from the $i$ and $j$ amplitude
+ arrays. 
+``Linear system solving'' and ``Output matrix product'' correspond
+respectively to steps 3-4 and to step 5 in Sect.~\ref{gpu-RO}.
+
+On one CPU core (see  Fig.~\ref{fig:CPU-timing}), 
+ matrix products for the construction of the local
+$R$-matrices require the same 
+computation time during the whole propagation. Likewise the CPU time required by
+ matrix products for the output $R$-matrix is constant within each
+ strip. But as the global $R$-matrix is propagated from strip to
+ strip, the sizes of
+the matrices $U$ and $V$ increase, so does their multiplication time.
+The time required to solve the linear system increases
+slightly during the propagation.
+These three operations (``Amplitude matrix product'', ``Output matrix
+product'' and ``Linear system solving'') are clearly dominant in terms
+of computation
+time compared to the other remaining operations, which justify our
+primary focus on such three linear algebra operations.
+
+
+On the C1060 (see Fig.~\ref{GPU-timing}), we have
+generally managed to obtain a similar speedup for all operations
+(around 8, which corresponds to Fig.~\ref{fig:speedup_1core}). Only the linear system solving
+presents a lower speedup (around~4). 
+The CUDA kernels and the remaining CPU-GPU transfers make a minor contribution
+to the overall computation time and do not require
+additional improvements.
+
+On the C2050 (see Fig.~\ref{fermi-timing}), additional speedup is
+obtained for all operations, except for the 
+CPU-GPU transfers and the linear system solving.
+The CPU-GPU transfers are mainly due to the $j$ amplitude arrays, and
+currently still correspond to minor times. When required, the
+double-buffering technique may also be used to overlap such transfers
+with computation on the GPU.
+
+
+
+\section{Propagation of multiple concurrent energies on GPU}
+
+Finally, we present here an improvement that can  
+benefit from the Fermi architecture, as well as from the newest Kepler
+architecture, 
+both of which enable the concurrent execution of multiple 
+CUDA kernels, thus offering additional speedup on  
+GPUs for small or medium computation grain kernels.
+In our case, the performance gain on the GPU is indeed limited
+since most matrices have small or medium sizes. 
+By using multiple streams within one CUDA context~\cite{CUDA_ProgGuide},
+we can propagate multiple energies
+concurrently on the Fermi GPU. 
+It can be noticed that all GPU computations for all streams are
+launched by the same host thread. We therefore rely here on the {\em legacy
+API} of CUBLAS~\cite{CUBLAS} (like MAGMA)
+without thread safety problems. 
+A {\em breadth first} issue order is used for kernel
+launchs \cite{CUDA_stream}: for a given GPU kernel, all kernel launchs
+are indeed issued together in the host thread, using one stream for each
+concurrent energy, in order to maximize concurrent kernel
+execution.  
+Of course, the memory available on the GPU must be large enough to
+store all data structures required by each energy. 
+Moreover, multiple streams are also used within the
+propagation of each single energy 
+in order to enable concurrent executions among the required kernels.
+
+
+\begin{table}[t]
+\begin{center}
+\begin{tabular}{|c|c||c|c|c|c|c|}
+  \hline
+  \multirow{4}{0.18\linewidth}{Medium case} & Number of & 
+  \multirow{2}{0.07\linewidth}{\centering 1} & 
+  \multirow{2}{0.07\linewidth}{\centering 2} & 
+  \multirow{2}{0.07\linewidth}{\centering 4} & 
+  \multirow{2}{0.07\linewidth}{\centering 8} & 
+  \multirow{2}{0.07\linewidth}{\centering 16} \\  
+  & energies & & & & & \\  
+  \cline{2-7}
+  & Time (s) &  11.18 & 6.87 & 5.32 & 4.96 & 4.76 \\
+  \cline{2-7}
+  & Speedup & - & 1.63 & 2.10 & 2.26 & 2.35 \\
+  \hline
+  \hline
+  \multirow{4}{0.18\linewidth}{Large case} & Number of & 
+  \multirow{2}{0.07\linewidth}{\centering 1} & 
+  \multicolumn{2}{c|}{\multirow{2}{0.07\linewidth}{\centering 2}} & 
+  \multicolumn{2}{c|}{\multirow{2}{0.07\linewidth}{\centering 3}} \\
+  & energies & & \multicolumn{2}{c|}{~} & \multicolumn{2}{c|}{~}  \\  
+  \cline{2-7}
+  & Time (s) & 509.51 & \multicolumn{2}{c|}{451.49} & \multicolumn{2}{c|}{436.72}  \\  
+  \cline{2-7}
+  & Speedup & - & \multicolumn{2}{c|}{1.13} & \multicolumn{2}{c|}{1.17}  \\  
+  \hline
+\end{tabular}
+\caption{\label{t:perfs_V6} Performance results with multiple
+  concurrent energies 
+  on one C2070 GPU. GPU initialization times are not considered here. }
+\end{center}
+\end{table}
+
+In order to have enough GPU memory to run two or three concurrent
+energies for the large case, we use one C2070 GPU 
+(featuring 6GB of memory) 
+with one Intel X5650 hex-core CPU, CUDA 4.1 and CUBLAS 4.1, as
+well as the latest MAGMA release (version 1.3.0). 
+Substantial changes have been required 
+in the MAGMA calls with respect to the previous versions of PROP that were using MAGMA 0.2. 
+ Table~\ref{t:perfs_V6} presents the speedups 
+obtained on the Fermi GPU for multiple concurrent
+energies (up to sixteen since this is the maximum number of concurrent
+kernel launches currently supported \cite{CUDA_ProgGuide}). 
+With the medium case, speedups greater than 2 are obtained with four
+concurrent energies or more. 
+With the more realistic large case, the performance gain is lower mainly because of
+the increase in matrix sizes, which implies a better GPU usage
+with only one energy on the GPU. The concurrent execution of multiple
+kernels is also limited by other operations on the
+GPU \cite{CUDA_ProgGuide,CUDA_stream} and by the current MAGMA code which
+prevents concurrent MAGMA calls in different streams. 
+Better speedups can be here expected on the latest Kepler GPUs which
+offer additional compute power, and whose {\em Hyper-Q} feature may help
+improve further the GPU utilization with concurrent energies. 
+On the contrary, the same code on the C1060 shows no speedup
+ since the concurrent kernel launches are
+serialized on this previous GPU architecture. 
+
+
+
+
+
+
+
+
+\section{Conclusion and future work}
+\label{conclusion}
+
+
+In this chapter, we have presented our methodology and our results in
+the deployment on 
+a GPU of an application (the PROP program) in atomic physics. 
+
+We have started by studying the numerical stability of PROP using
+single precision arithmetic. This has shown that PROP
+using single precision, while relatively stable for some small cases,
+gives unsatisfactory results for realistic simulation cases above the
+ionization threshold where there is a 
+significant density of pseudo-states. It is
+expected that this may not be the case below the ionization threshold
+where the actual target states are less dense. This requires further
+investigation. 
+
+We have 
+therefore deployed the PROP code in double precision on 
+a GPU, with successive improvements. The different GPU versions 
+each offer increasing speedups over the CPU version.
+Compared to the single (respectively four) core(s) CPU version, the
+optimal GPU implementation 
+gives a speedup of 8.2 (resp. 4.6) on one C1060 GPU,
+and a speedup of 15.9 (resp. 9.0) on one 
+C2050 GPU with improved double precision performance.
+An additional gain of around 15\% 
+can also be obtained on one Fermi GPU
+with large memory (C2070) thanks to concurrent kernel execution. 
+
+Such speedups 
+cannot be directly compared with the 1.15 speedup 
+obtained with the HMPP version, since in our tests the CPUs are
+different and the CUBLAS versions are more recent.  
+However, the programming effort required 
+progressively to deploy PROP on GPUs clearly offers improved and interesting speedups for this 
+real-life   application in double precision with varying-size matrices. 
+
+
+We are currently working on a hybrid CPU-GPU version that spreads the
+computations of the independent energies on both the CPU
+and the GPU. This will enable 
+multiple energy execution on the CPU, with 
+one or several core(s) dedicated to each energy (via multi-threaded
+BLAS libraries). Multiple 
+concurrent energies may also be propagated on each Fermi GPU. 
+By merging this work with the current MPI PROP program, we will
+obtain a scalable hybrid CPU-GPU version.
+This final version will offer an additional level of parallelism
+thanks to the MPI
+standard in order to exploit multiple
+nodes with multiple CPU cores and possibly multiple GPU cards. 
+
+\putbib[Chapters/chapter15/biblio]
+
diff --git a/BookGPU/Chapters/chapter15/figures/1s2p.pdf b/BookGPU/Chapters/chapter15/figures/1s2p.pdf
new file mode 100644
index 0000000..2cd2c18
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/1s2p.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/1s2p3.pdf b/BookGPU/Chapters/chapter15/figures/1s2p3.pdf
new file mode 100644
index 0000000..d87e962
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/1s2p3.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/1s2pHT.pdf b/BookGPU/Chapters/chapter15/figures/1s2pHT.pdf
new file mode 100644
index 0000000..9ae1822
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/1s2pHT.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/1s4d.pdf b/BookGPU/Chapters/chapter15/figures/1s4d.pdf
new file mode 100644
index 0000000..4e889d7
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/1s4d.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/1s4d3.pdf b/BookGPU/Chapters/chapter15/figures/1s4d3.pdf
new file mode 100644
index 0000000..56ff4f4
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/1s4d3.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/1s4dHT.pdf b/BookGPU/Chapters/chapter15/figures/1s4dHT.pdf
new file mode 100644
index 0000000..9857001
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/1s4dHT.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/C1060_V3_IO_COMP.pdf b/BookGPU/Chapters/chapter15/figures/C1060_V3_IO_COMP.pdf
new file mode 100644
index 0000000..03acc05
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/C1060_V3_IO_COMP.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/CPU_1_CORE_TIMES.pdf b/BookGPU/Chapters/chapter15/figures/CPU_1_CORE_TIMES.pdf
new file mode 100644
index 0000000..ec8f980
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/CPU_1_CORE_TIMES.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/Domain.pdf b/BookGPU/Chapters/chapter15/figures/Domain.pdf
new file mode 100644
index 0000000..2027845
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/Domain.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/GPU_V5_C1060_TIMES.pdf b/BookGPU/Chapters/chapter15/figures/GPU_V5_C1060_TIMES.pdf
new file mode 100644
index 0000000..dc316ca
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/GPU_V5_C1060_TIMES.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/GPU_V5_C2050_TIMES.pdf b/BookGPU/Chapters/chapter15/figures/GPU_V5_C2050_TIMES.pdf
new file mode 100644
index 0000000..0522c60
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/GPU_V5_C2050_TIMES.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/amplitudes_nb.pdf b/BookGPU/Chapters/chapter15/figures/amplitudes_nb.pdf
new file mode 100644
index 0000000..c1f43a3
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/amplitudes_nb.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/error.pdf b/BookGPU/Chapters/chapter15/figures/error.pdf
new file mode 100644
index 0000000..203190a
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/error.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/histo_speedup_1core.pdf b/BookGPU/Chapters/chapter15/figures/histo_speedup_1core.pdf
new file mode 100644
index 0000000..f859253
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/histo_speedup_1core.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/histo_speedup_4cores.pdf b/BookGPU/Chapters/chapter15/figures/histo_speedup_4cores.pdf
new file mode 100644
index 0000000..cc1d80b
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/histo_speedup_4cores.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/offdiagonal_nb.pdf b/BookGPU/Chapters/chapter15/figures/offdiagonal_nb.pdf
new file mode 100644
index 0000000..46b67be
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/offdiagonal_nb.pdf differ
diff --git a/BookGPU/Chapters/chapter15/figures/prop.pdf b/BookGPU/Chapters/chapter15/figures/prop.pdf
new file mode 100644
index 0000000..481fe64
Binary files /dev/null and b/BookGPU/Chapters/chapter15/figures/prop.pdf differ
diff --git a/BookGPU/Makefile b/BookGPU/Makefile
index 011ff95..43eb90f 100644
--- a/BookGPU/Makefile
+++ b/BookGPU/Makefile
@@ -7,7 +7,8 @@ all:
 	bibtex bu1
 	bibtex bu2
 	bibtex bu3
-#	bibtex bu4
+	bibtex bu4
+	bibtex bu6	
 	makeindex  ${BOOK}.idx
 	pdflatex ${BOOK}
 	pdflatex ${BOOK}