From: Raphael Couturier Date: Tue, 22 Jan 2013 16:00:42 +0000 (+0100) Subject: new X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/8ad1643f80bdf5681bdb9cca04ff30378cb44cb8?ds=inline;hp=-c new --- 8ad1643f80bdf5681bdb9cca04ff30378cb44cb8 diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index 8bb03d4..edd313e 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -15,7 +15,7 @@ \usepackage{cite} %\usepackage{algorithm} %\usepackage{algorithmic} -\usepackage[lined,boxed,commentsnumbered]{algorithm2e} +\usepackage[ruled,lined,linesnumbered]{algorithm2e} \usepackage{epstopdf} \usepackage{url} \usepackage{multirow} @@ -130,17 +130,17 @@ \setcounter{page}{1} \part{This is a Part} -%\include{Chapters/chapter1/ch1} -%\include{Chapters/chapter2/ch2} -%\include{Chapters/chapter3/ch3} -%\include{Chapters/chapter5/ch5} -%\include{Chapters/chapter6/ch6} -%\include{Chapters/chapter7/ch7} -%\include{Chapters/chapter8/ch8} +\include{Chapters/chapter1/ch1} +\include{Chapters/chapter2/ch2} +\include{Chapters/chapter3/ch3} +\include{Chapters/chapter5/ch5} +\include{Chapters/chapter6/ch6} +\include{Chapters/chapter7/ch7} +\include{Chapters/chapter8/ch8} \include{Chapters/chapter9/ch9} -%\include{Chapters/chapter11/ch11} -%\include{Chapters/chapter14/ch14} -%\include{Chapters/chapter15/ch15} +\include{Chapters/chapter11/ch11} +\include{Chapters/chapter14/ch14} +\include{Chapters/chapter15/ch15} \bibliographystyle{hep} %%%\bibliography{biblio} diff --git a/BookGPU/Chapters/chapter15/ch15.tex b/BookGPU/Chapters/chapter15/ch15.tex index 5dae9b2..9b0bf27 100644 --- a/BookGPU/Chapters/chapter15/ch15.tex +++ b/BookGPU/Chapters/chapter15/ch15.tex @@ -5,7 +5,7 @@ \chapterauthor{Stan Scott}{School of Electronics, Electrical Engineering \& Computer Science, The Queen's University of Belfast} -\newcommand{\fixme}[1]{{\bf #1}} +%\newcommand{\fixme}[1]{{\bf #1}} \chapter[Numerical validation and performance optimization on GPUs in atomic physics]{Numerical validation and performance optimization on GPUs of an application in atomic physics} \label{chapter15} @@ -275,20 +275,36 @@ $\Re^{O}$. the output $R$-matrix becomes the input $R$-matrix for the next evaluation. +%% \begin{algorithm} +%% \caption{\label{prop-algo}PROP algorithm} +%% \begin{algorithmic} +%% \FOR{all scattering energies} +%% \FOR{all sectors} +%% \STATE Read amplitude arrays +%% \STATE Read correction data +%% \STATE Construct local $R$-matrices +%% \STATE From $\Re^{I}$ and local $R$-matrices, compute $\Re^{O}$ +%% \STATE $\Re^{O}$ becomes $\Re^{I}$ for the next sector +%% \ENDFOR +%% \STATE Compute physical $R$-Matrix +%% \ENDFOR +%% \end{algorithmic} +%% \end{algorithm} + \begin{algorithm} \caption{\label{prop-algo}PROP algorithm} -\begin{algorithmic} -\FOR{all scattering energies} - \FOR{all sectors} - \STATE Read amplitude arrays - \STATE Read correction data -\STATE Construct local $R$-matrices -\STATE From $\Re^{I}$ and local $R$-matrices, compute $\Re^{O}$ -\STATE $\Re^{O}$ becomes $\Re^{I}$ for the next sector - \ENDFOR - \STATE Compute physical $R$-Matrix -\ENDFOR -\end{algorithmic} +%\begin{algorithmic} +\For{all scattering energies} { + \For{all sectors}{ + Read amplitude arrays\; + Read correction data\; + Construct local $R$-matrices\; + From $\Re^{I}$ and local $R$-matrices, compute $\Re^{O}$\; + $\Re^{O}$ becomes $\Re^{I}$ for the next sector\; + } + Compute physical $R$-Matrix \; +} +%\end{algorithmic} \end{algorithm} diff --git a/BookGPU/Chapters/chapter3/ch3.aux b/BookGPU/Chapters/chapter3/ch3.aux index 6cc3707..0d1505e 100644 --- a/BookGPU/Chapters/chapter3/ch3.aux +++ b/BookGPU/Chapters/chapter3/ch3.aux @@ -4,10 +4,10 @@ \@writefile{toc}{\contentsline {chapter}{\numberline {3}Setting up the environnement.}{23}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\newlabel{algo:memcopy:H2D}{{\caption@xref {algo:memcopy:H2D}{ on input line 124}}{23}} -\newlabel{algo:memcopy:kernel}{{\caption@xref {algo:memcopy:kernel}{ on input line 125}}{23}} -\newlabel{algo:memcopy:D2H}{{\caption@xref {algo:memcopy:D2H}{ on input line 126}}{23}} -\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{23}} +\newlabel{algo:memcopy:H2D}{{7}{23}} +\newlabel{algo:memcopy:kernel}{{8}{23}} +\newlabel{algo:memcopy:D2H}{{9}{23}} +\@writefile{loa}{\contentsline {algocf}{\numberline {1}{\ignorespaces Global memory management on CPU and GPU sides.\relax }}{23}} \newlabel{algo:memcopy}{{1}{23}} \@writefile{toc}{\contentsline {section}{\numberline {3.1}Data transfers, memory management.}{24}} \newlabel{lst:main1}{{3.1}{25}} @@ -46,11 +46,11 @@ \@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Exemple of 5x5 median filtering\relax }}{33}} \newlabel{fig:median_1}{{4.2}{33}} \newlabel{algoMedianGeneric}{{2}{33}} -\newlabel{algoMedianGeneric:memcpyH2D}{{\caption@xref {algoMedianGeneric:memcpyH2D}{ on input line 241}}{33}} -\newlabel{algoMedianGeneric:cptstart}{{\caption@xref {algoMedianGeneric:cptstart}{ on input line 246}}{33}} -\newlabel{algoMedianGeneric:cptend}{{\caption@xref {algoMedianGeneric:cptend}{ on input line 246}}{33}} -\newlabel{algoMedianGeneric:memcpyD2H}{{\caption@xref {algoMedianGeneric:memcpyD2H}{ on input line 247}}{33}} -\@writefile{loa}{\contentsline {algorithm}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{33}} +\newlabel{algoMedianGeneric:memcpyH2D}{{1}{33}} +\newlabel{algoMedianGeneric:cptstart}{{3}{33}} +\newlabel{algoMedianGeneric:cptend}{{5}{33}} +\newlabel{algoMedianGeneric:memcpyD2H}{{7}{33}} +\@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{33}} \@writefile{toc}{\contentsline {section}{\numberline {4.3}NVidia GPU tuning recipes}{33}} \@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{34}} \newlabel{fig:median_overlap}{{4.3}{34}} @@ -130,16 +130,10 @@ \setcounter{lotdepth}{1} \setcounter{lstnumber}{70} \setcounter{ContinuedFloat}{0} -\setcounter{float@type}{16} -\setcounter{algorithm}{2} -\setcounter{ALC@unique}{0} -\setcounter{ALC@line}{0} -\setcounter{ALC@rem}{0} -\setcounter{ALC@depth}{0} -\setcounter{AlgoLine}{0} -\setcounter{algocfline}{0} -\setcounter{algocfproc}{0} -\setcounter{algocf}{0} +\setcounter{AlgoLine}{7} +\setcounter{algocfline}{2} +\setcounter{algocfproc}{2} +\setcounter{algocf}{2} \setcounter{proposition}{0} \setcounter{theorem}{0} \setcounter{exercise}{0} diff --git a/BookGPU/Chapters/chapter3/ch3.tex b/BookGPU/Chapters/chapter3/ch3.tex index 83bb339..e04a36c 100755 --- a/BookGPU/Chapters/chapter3/ch3.tex +++ b/BookGPU/Chapters/chapter3/ch3.tex @@ -115,15 +115,15 @@ However, so as to propose concise and more readable code, we will assume the fol \begin{algorithm} \SetNlSty{}{}{:} - allocate and populate CPU memory \textbf{h\_in}\;\\ - allocate CPU pinned-memory \textbf{h\_out}\;\\ - allocate GPU global memory \textbf{d\_out}\;\\ - declare GPU texture reference \textbf{tex\_img\_in}\;\\ - allocate GPU array in global memory \textbf{array\_img\_in}\;\\ - bind GPU array \textbf{array\_img\_in} to texture \textbf{tex\_img\_in}\;\\ - copy data from \textbf{h\_in} to \textbf{array\_img\_in}\label{algo:memcopy:H2D}\;\\ - kernel\kl gridDim,blockDim\kr()\tcc*[f]{outputs to d\_out}\label{algo:memcopy:kernel}\;\\ - copy data from \textbf{d\_out} to \textbf{h\_out} \label{algo:memcopy:D2H}\;\\ + allocate and populate CPU memory \textbf{h\_in}\; + allocate CPU pinned-memory \textbf{h\_out}\; + allocate GPU global memory \textbf{d\_out}\; + declare GPU texture reference \textbf{tex\_img\_in}\; + allocate GPU array in global memory \textbf{array\_img\_in}\; + bind GPU array \textbf{array\_img\_in} to texture \textbf{tex\_img\_in}\; + copy data from \textbf{h\_in} to \textbf{array\_img\_in}\label{algo:memcopy:H2D}\; + kernel\kl gridDim,blockDim\kr()\tcc*[f]{outputs to d\_out}\label{algo:memcopy:kernel}\; + copy data from \textbf{d\_out} to \textbf{h\_out} \label{algo:memcopy:D2H}\; \caption{Global memory management on CPU and GPU sides.} \label{algo:memcopy} \end{algorithm} @@ -238,13 +238,13 @@ On the GPU's side, we note high dependence on window size due to the redundancy %\SetNlSty{}{}{:} % \SetLine %\linesnumbered - copy data from CPU to GPU texture memory\label{algoMedianGeneric:memcpyH2D}\;\\ + copy data from CPU to GPU texture memory\label{algoMedianGeneric:memcpyH2D}\; \ForEach(\tcc*[f]{in parallel}){pixel at position $(x, y)$}{ - Read gray-level values of the n$\times$n neighborhood\label{algoMedianGeneric:cptstart}\;\\ - Selects the median value among those n$\times$n values\;\\ - Outputs the new gray-level value \label{algoMedianGeneric:cptend}\;\\ + Read gray-level values of the n$\times$n neighborhood\label{algoMedianGeneric:cptstart}\; + Selects the median value among those n$\times$n values\; + Outputs the new gray-level value \label{algoMedianGeneric:cptend}\; } -copy data from GPU global memory to CPU memory\label{algoMedianGeneric:memcpyD2H}\;\\ +copy data from GPU global memory to CPU memory\label{algoMedianGeneric:memcpyD2H}\; \caption{\label{algoMedianGeneric}generic n$\times$n median filter} \end{algorithm} diff --git a/BookGPU/Chapters/chapter6/ch6.aux b/BookGPU/Chapters/chapter6/ch6.aux index e79cde5..ef830ac 100644 --- a/BookGPU/Chapters/chapter6/ch6.aux +++ b/BookGPU/Chapters/chapter6/ch6.aux @@ -37,12 +37,12 @@ \newlabel{fig:ch6p1syncexpematrixprod}{{6.5}{95}} \@writefile{toc}{\contentsline {section}{\numberline {6.3}General scheme of asynchronous parallel code with computation/communication overlapping}{96}} \newlabel{ch6:part2}{{6.3}{96}} -\@writefile{loa}{\contentsline {algorithm}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{96}} +\@writefile{loa}{\contentsline {algocf}{\numberline {3}{\ignorespaces Synchronous iterative scheme\relax }}{96}} \newlabel{algo:ch6p2sync}{{3}{96}} -\@writefile{loa}{\contentsline {algorithm}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{96}} +\@writefile{loa}{\contentsline {algocf}{\numberline {4}{\ignorespaces Asynchronous iterative scheme\relax }}{96}} \newlabel{algo:ch6p2async}{{4}{96}} -\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{98}} -\newlabel{ch6:p2BasicAsync}{{6.3.1}{98}} +\@writefile{toc}{\contentsline {subsection}{\numberline {6.3.1}A basic asynchronous scheme}{97}} +\newlabel{ch6:p2BasicAsync}{{6.3.1}{97}} \newlabel{algo:ch6p2BasicAsync}{{6.5}{98}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.5}Initialization of the basic asynchronous scheme}{98}} \newlabel{algo:ch6p2BasicAsyncComp}{{6.6}{99}} @@ -61,8 +61,8 @@ \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.11}Reception function in the synchronized scheme}{105}} \@writefile{toc}{\contentsline {subsection}{\numberline {6.3.3}Asynchronous scheme using MPI, OpenMP and CUDA}{106}} \newlabel{ch6:p2GPUAsync}{{6.3.3}{106}} -\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{108}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{108}} +\newlabel{algo:ch6p2AsyncSyncComp}{{6.12}{107}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {6.12}Computing function in the final asynchronous scheme}{107}} \newlabel{algo:ch6p2syncGPU}{{6.13}{109}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {6.13}Computing function in the final asynchronous scheme}{109}} \newlabel{algo:ch6p2FullOverAsyncMain}{{6.14}{111}} @@ -131,16 +131,10 @@ \setcounter{lotdepth}{1} \setcounter{lstnumber}{17} \setcounter{ContinuedFloat}{0} -\setcounter{float@type}{16} -\setcounter{algorithm}{4} -\setcounter{ALC@unique}{0} -\setcounter{ALC@line}{0} -\setcounter{ALC@rem}{0} -\setcounter{ALC@depth}{0} \setcounter{AlgoLine}{0} -\setcounter{algocfline}{0} -\setcounter{algocfproc}{0} -\setcounter{algocf}{0} +\setcounter{algocfline}{4} +\setcounter{algocfproc}{4} +\setcounter{algocf}{4} \setcounter{proposition}{0} \setcounter{theorem}{0} \setcounter{exercise}{0} diff --git a/BookGPU/Makefile b/BookGPU/Makefile index 1371704..9280d0b 100644 --- a/BookGPU/Makefile +++ b/BookGPU/Makefile @@ -12,7 +12,9 @@ all: bibtex bu6 bibtex bu7 bibtex bu8 - bibtex bu9 + bibtex bu9 +#don't put chapter 14, refs are included + bibtex bu11 makeindex ${BOOK}.idx pdflatex ${BOOK} pdflatex ${BOOK}