\usepackage{cite}
%\usepackage{algorithm}
%\usepackage{algorithmic}
-\usepackage[lined,boxed,commentsnumbered]{algorithm2e}
+\usepackage[ruled,lined,linesnumbered]{algorithm2e}
\usepackage{epstopdf}
\usepackage{url}
\usepackage{multirow}
\setcounter{page}{1}
\part{This is a Part}
-%\include{Chapters/chapter1/ch1}
-%\include{Chapters/chapter2/ch2}
-%\include{Chapters/chapter3/ch3}
-%\include{Chapters/chapter5/ch5}
-%\include{Chapters/chapter6/ch6}
-%\include{Chapters/chapter7/ch7}
-%\include{Chapters/chapter8/ch8}
+\include{Chapters/chapter1/ch1}
+\include{Chapters/chapter2/ch2}
+\include{Chapters/chapter3/ch3}
+\include{Chapters/chapter5/ch5}
+\include{Chapters/chapter6/ch6}
+\include{Chapters/chapter7/ch7}
+\include{Chapters/chapter8/ch8}
\include{Chapters/chapter9/ch9}
-%\include{Chapters/chapter11/ch11}
-%\include{Chapters/chapter14/ch14}
-%\include{Chapters/chapter15/ch15}
+\include{Chapters/chapter11/ch11}
+\include{Chapters/chapter14/ch14}
+\include{Chapters/chapter15/ch15}
\bibliographystyle{hep}
%%%\bibliography{biblio}
\chapterauthor{Stan Scott}{School of Electronics, Electrical Engineering \& Computer Science,
The Queen's University of Belfast}
-\newcommand{\fixme}[1]{{\bf #1}}
+%\newcommand{\fixme}[1]{{\bf #1}}
\chapter[Numerical validation and performance optimization on GPUs in atomic physics]{Numerical validation and performance optimization on GPUs of an application in atomic physics}
\label{chapter15}
the output $R$-matrix becomes the input $R$-matrix
for the next evaluation.
+%% \begin{algorithm}
+%% \caption{\label{prop-algo}PROP algorithm}
+%% \begin{algorithmic}
+%% \FOR{all scattering energies}
+%% \FOR{all sectors}
+%% \STATE Read amplitude arrays
+%% \STATE Read correction data
+%% \STATE Construct local $R$-matrices
+%% \STATE From $\Re^{I}$ and local $R$-matrices, compute $\Re^{O}$
+%% \STATE $\Re^{O}$ becomes $\Re^{I}$ for the next sector
+%% \ENDFOR
+%% \STATE Compute physical $R$-Matrix
+%% \ENDFOR
+%% \end{algorithmic}
+%% \end{algorithm}
+
\begin{algorithm}
\caption{\label{prop-algo}PROP algorithm}
-\begin{algorithmic}
-\FOR{all scattering energies}
- \FOR{all sectors}
- \STATE Read amplitude arrays
- \STATE Read correction data
-\STATE Construct local $R$-matrices
-\STATE From $\Re^{I}$ and local $R$-matrices, compute $\Re^{O}$
-\STATE $\Re^{O}$ becomes $\Re^{I}$ for the next sector
- \ENDFOR
- \STATE Compute physical $R$-Matrix
-\ENDFOR
-\end{algorithmic}
+%\begin{algorithmic}
+\For{all scattering energies} {
+ \For{all sectors}{
+ Read amplitude arrays\;
+ Read correction data\;
+ Construct local $R$-matrices\;
+ From $\Re^{I}$ and local $R$-matrices, compute $\Re^{O}$\;
+ $\Re^{O}$ becomes $\Re^{I}$ for the next sector\;
+ }
+ Compute physical $R$-matrix\;
+}
+%\end{algorithmic}
\end{algorithm}
\@writefile{toc}{\contentsline {chapter}{\numberline {3}Setting up the environnement.}{23}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
-\newlabel{algo:memcopy:H2D}{{\caption@xref {algo:memcopy:H2D}{ on input line 124}}{23}}
-\newlabel{algo:memcopy:kernel}{{\caption@xref {algo:memcopy:kernel}{ on input line 125}}{23}}
-\newlabel{algo:memcopy:D2H}{{\caption@xref {algo:memcopy:D2H}{ on input line 126}}{23}}
-\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{23}}
+\newlabel{algo:memcopy:H2D}{{7}{23}}
+\newlabel{algo:memcopy:kernel}{{8}{23}}
+\newlabel{algo:memcopy:D2H}{{9}{23}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{23}}
\newlabel{algo:memcopy}{{1}{23}}
\@writefile{toc}{\contentsline {section}{\numberline {3.1}Data transfers, memory management.}{24}}
\newlabel{lst:main1}{{3.1}{25}}
\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Exemple of 5x5 median filtering\relax }}{33}}
\newlabel{fig:median_1}{{4.2}{33}}
\newlabel{algoMedianGeneric}{{2}{33}}
-\newlabel{algoMedianGeneric:memcpyH2D}{{\caption@xref {algoMedianGeneric:memcpyH2D}{ on input line 241}}{33}}
-\newlabel{algoMedianGeneric:cptstart}{{\caption@xref {algoMedianGeneric:cptstart}{ on input line 246}}{33}}
-\newlabel{algoMedianGeneric:cptend}{{\caption@xref {algoMedianGeneric:cptend}{ on input line 246}}{33}}
-\newlabel{algoMedianGeneric:memcpyD2H}{{\caption@xref {algoMedianGeneric:memcpyD2H}{ on input line 247}}{33}}
-\@writefile{loa}{\contentsline {algorithm}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{33}}
+\newlabel{algoMedianGeneric:memcpyH2D}{{1}{33}}
+\newlabel{algoMedianGeneric:cptstart}{{3}{33}}
+\newlabel{algoMedianGeneric:cptend}{{5}{33}}
+\newlabel{algoMedianGeneric:memcpyD2H}{{7}{33}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{33}}
\@writefile{toc}{\contentsline {section}{\numberline {4.3}NVidia GPU tuning recipes}{33}}
\@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{34}}
\newlabel{fig:median_overlap}{{4.3}{34}}
\setcounter{lotdepth}{1}
\setcounter{lstnumber}{70}
\setcounter{ContinuedFloat}{0}
-\setcounter{float@type}{16}
-\setcounter{algorithm}{2}
-\setcounter{ALC@unique}{0}
-\setcounter{ALC@line}{0}
-\setcounter{ALC@rem}{0}
-\setcounter{ALC@depth}{0}
-\setcounter{AlgoLine}{0}
-\setcounter{algocfline}{0}
-\setcounter{algocfproc}{0}
-\setcounter{algocf}{0}
+\setcounter{AlgoLine}{7}
+\setcounter{algocfline}{2}
+\setcounter{algocfproc}{2}
+\setcounter{algocf}{2}
\setcounter{proposition}{0}
\setcounter{theorem}{0}
\setcounter{exercise}{0}
\begin{algorithm}
\SetNlSty{}{}{:}
- allocate and populate CPU memory \textbf{h\_in}\;\\
- allocate CPU pinned-memory \textbf{h\_out}\;\\
- allocate GPU global memory \textbf{d\_out}\;\\
- declare GPU texture reference \textbf{tex\_img\_in}\;\\
- allocate GPU array in global memory \textbf{array\_img\_in}\;\\
- bind GPU array \textbf{array\_img\_in} to texture \textbf{tex\_img\_in}\;\\
- copy data from \textbf{h\_in} to \textbf{array\_img\_in}\label{algo:memcopy:H2D}\;\\
- kernel\kl gridDim,blockDim\kr()\tcc*[f]{outputs to d\_out}\label{algo:memcopy:kernel}\;\\
- copy data from \textbf{d\_out} to \textbf{h\_out} \label{algo:memcopy:D2H}\;\\
+ allocate and populate CPU memory \textbf{h\_in}\;
+ allocate CPU pinned-memory \textbf{h\_out}\;
+ allocate GPU global memory \textbf{d\_out}\;
+ declare GPU texture reference \textbf{tex\_img\_in}\;
+ allocate GPU array in global memory \textbf{array\_img\_in}\;
+ bind GPU array \textbf{array\_img\_in} to texture \textbf{tex\_img\_in}\;
+ copy data from \textbf{h\_in} to \textbf{array\_img\_in}\label{algo:memcopy:H2D}\;
+ kernel\kl gridDim,blockDim\kr()\tcc*[f]{outputs to d\_out}\label{algo:memcopy:kernel}\;
+ copy data from \textbf{d\_out} to \textbf{h\_out} \label{algo:memcopy:D2H}\;
\caption{Global memory management on CPU and GPU sides.}
\label{algo:memcopy}
\end{algorithm}
%\SetNlSty{}{}{:}
% \SetLine
%\linesnumbered
- copy data from CPU to GPU texture memory\label{algoMedianGeneric:memcpyH2D}\;\\
+ copy data from CPU to GPU texture memory\label{algoMedianGeneric:memcpyH2D}\;
\ForEach(\tcc*[f]{in parallel}){pixel at position $(x, y)$}{
- Read gray-level values of the n$\times$n neighborhood\label{algoMedianGeneric:cptstart}\;\\
- Selects the median value among those n$\times$n values\;\\
- Outputs the new gray-level value \label{algoMedianGeneric:cptend}\;\\
+ Read gray-level values of the n$\times$n neighborhood\label{algoMedianGeneric:cptstart}\;
+ Select the median value among those n$\times$n values\;
+ Output the new gray-level value \label{algoMedianGeneric:cptend}\;
}
-copy data from GPU global memory to CPU memory\label{algoMedianGeneric:memcpyD2H}\;\\
+copy data from GPU global memory to CPU memory\label{algoMedianGeneric:memcpyD2H}\;
\caption{\label{algoMedianGeneric}generic n$\times$n median filter}
\end{algorithm}
\newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{95}}
\@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{96}}
\newlabel{ch6:part2}{{6.3}{96}}
-\@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{96}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{96}}
\newlabel{algo:ch6p2sync}{{3}{96}}
-\@writefile{loa}{\contentsline {algorithm}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{96}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{96}}
\newlabel{algo:ch6p2async}{{4}{96}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{98}}
-\newlabel{ch6:p2BasicAsync}{{6.3.1}{98}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{97}}
+\newlabel{ch6:p2BasicAsync}{{6.3.1}{97}}
\newlabel{algo:ch6p2BasicAsync}{{6.5}{98}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{98}}
\newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{99}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{105}}
\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{106}}
\newlabel{ch6:p2GPUAsync}{{6.3.3}{106}}
-\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{108}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{108}}
+\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{107}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{107}}
\newlabel{algo:ch6p2syncGPU}{{6.13}{109}}
\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{109}}
\newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{111}}
\setcounter{lotdepth}{1}
\setcounter{lstnumber}{17}
\setcounter{ContinuedFloat}{0}
-\setcounter{float@type}{16}
-\setcounter{algorithm}{4}
-\setcounter{ALC@unique}{0}
-\setcounter{ALC@line}{0}
-\setcounter{ALC@rem}{0}
-\setcounter{ALC@depth}{0}
\setcounter{AlgoLine}{0}
-\setcounter{algocfline}{0}
-\setcounter{algocfproc}{0}
-\setcounter{algocf}{0}
+\setcounter{algocfline}{4}
+\setcounter{algocfproc}{4}
+\setcounter{algocf}{4}
\setcounter{proposition}{0}
\setcounter{theorem}{0}
\setcounter{exercise}{0}
bibtex bu6
bibtex bu7
bibtex bu8
- bibtex bu9
+ bibtex bu9
+# Do not add a bibtex run for chapter 14: its references are already included.
+ bibtex bu11
makeindex ${BOOK}.idx
pdflatex ${BOOK}
pdflatex ${BOOK}