From: couturie
Date: Mon, 5 Aug 2013 14:57:47 +0000 (+0200)
Subject: new
X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/9204f1de91750cacde93b719fa722d0320040454?ds=sidebyside

new
---

diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex
index e57e595..a078efb 100755
--- a/BookGPU/BookGPU.tex
+++ b/BookGPU/BookGPU.tex
@@ -154,7 +154,7 @@
 \makeindex
-%\includeonly{Chapters/chapter19/ch19}
+%\includeonly{Chapters/chapter5/ch5}
 \DeclareCaptionLabelSeparator{colon}{. }
 \begin{document}
@@ -173,7 +173,7 @@
 \author{Raphaël Couturier}
 \maketitle
-\cleardoublepage
+%\cleardoublepage
 \frontmatter
 %\include{frontmatter/Foreword}
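For anyone building the book from this tree, the commented-out \includeonly line toggled in the hunk above is the standard LaTeX mechanism for compiling a single chapter while editing. A minimal sketch of its use, assuming the chapters are pulled into BookGPU.tex with \include (the paths below simply mirror the ones in the diff):

    % preamble: process only chapter 5, reusing the page numbers and
    % cross-references recorded in the other chapters' .aux files
    \includeonly{Chapters/chapter5/ch5}
    ...
    \begin{document}
    ...
    \include{Chapters/chapter5/ch5}  % \include (not \input) is required for \includeonly to take effect

Re-commenting the line restores a full build; the stale .aux checkpoints left behind by such partial builds are exactly what the extended Makefile clean rule at the end of this commit sweeps away.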
diff --git a/BookGPU/Chapters/chapter18/ch1.aux b/BookGPU/Chapters/chapter18/ch1.aux
deleted file mode 100644
index 2c84cf4..0000000
--- a/BookGPU/Chapters/chapter18/ch1.aux
+++ /dev/null
@@ -1,64 +0,0 @@
-\relax
-\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}}
-\@writefile{loa}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {chapter}{\numberline {1}Presentation of the GPU architecture and of the Cuda environment}{3}}
-\@writefile{lof}{\addvspace {10\p@ }}
-\@writefile{lot}{\addvspace {10\p@ }}
-\newlabel{chapter1}{{1}{3}}
-\@writefile{toc}{\contentsline {section}{\numberline {1.1}Introduction}{3}}
-\newlabel{ch1:intro}{{1.1}{3}}
-\@writefile{toc}{\contentsline {section}{\numberline {1.2}Brief history of Video Card}{3}}
-\@writefile{toc}{\contentsline {section}{\numberline {1.3}GPGPU}{4}}
-\@writefile{toc}{\contentsline {section}{\numberline {1.4}Architecture of current GPUs}{5}}
-\@writefile{lof}{\contentsline {figure}{\numberline {1.1}{\ignorespaces Comparison of number of cores in a CPU and in a GPU.\relax }}{5}}
-\providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
-\newlabel{ch1:fig:comparison_cpu_gpu}{{1.1}{5}}
-\@writefile{lof}{\contentsline {figure}{\numberline {1.2}{\ignorespaces Comparison of low latency of CPU and high throughput of GPU.\relax }}{6}}
-\newlabel{ch1:fig:latency_throughput}{{1.2}{6}}
-\@writefile{toc}{\contentsline {section}{\numberline {1.5}Kinds of parallelism}{7}}
-\@writefile{toc}{\contentsline {section}{\numberline {1.6}Cuda Multithreading}{7}}
-\@writefile{lof}{\contentsline {figure}{\numberline {1.3}{\ignorespaces Scalability of GPU.\relax }}{8}}
-\newlabel{ch1:fig:scalability}{{1.3}{8}}
-\@writefile{toc}{\contentsline {section}{\numberline {1.7}Memory hierarchy}{9}}
-\@writefile{lof}{\contentsline {figure}{\numberline {1.4}{\ignorespaces Memory hierarchy of a GPU.\relax }}{10}}
-\newlabel{ch1:fig:memory_hierarchy}{{1.4}{10}}
-\@writefile{toc}{\contentsline {section}{\numberline {1.8}Conclusion}{10}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{11}}
-\@setckpt{Chapters/chapter1/ch1}{
-\setcounter{page}{12}
-\setcounter{equation}{0}
-\setcounter{enumi}{0}
-\setcounter{enumii}{0}
-\setcounter{enumiii}{0}
-\setcounter{enumiv}{3}
-\setcounter{footnote}{0}
-\setcounter{mpfootnote}{0}
-\setcounter{part}{1}
-\setcounter{chapter}{1}
-\setcounter{section}{8}
-\setcounter{subsection}{0}
-\setcounter{subsubsection}{0}
-\setcounter{paragraph}{0}
-\setcounter{subparagraph}{0}
-\setcounter{figure}{4}
-\setcounter{table}{0}
-\setcounter{numauthors}{0}
-\setcounter{parentequation}{0}
-\setcounter{subfigure}{0}
-\setcounter{lofdepth}{1}
-\setcounter{subtable}{0}
-\setcounter{lotdepth}{1}
-\setcounter{lstnumber}{1}
-\setcounter{ContinuedFloat}{0}
-\setcounter{AlgoLine}{0}
-\setcounter{algocfline}{0}
-\setcounter{algocfproc}{0}
-\setcounter{algocf}{0}
-\setcounter{proposition}{0}
-\setcounter{theorem}{0}
-\setcounter{exercise}{0}
-\setcounter{example}{0}
-\setcounter{definition}{0}
-\setcounter{proof}{0}
-\setcounter{lstlisting}{0}
-}
diff --git a/BookGPU/Chapters/chapter18/preamble.aux b/BookGPU/Chapters/chapter18/preamble.aux
deleted file mode 100644
index 65abe7e..0000000
--- a/BookGPU/Chapters/chapter18/preamble.aux
+++ /dev/null
@@ -1,32 +0,0 @@
-\relax
-\@setckpt{Chapters/chapter1/preamble}{
-\setcounter{page}{1}
-\setcounter{equation}{0}
-\setcounter{enumi}{0}
-\setcounter{enumii}{0}
-\setcounter{enumiii}{0}
-\setcounter{enumiv}{0}
-\setcounter{footnote}{0}
-\setcounter{mpfootnote}{0}
-\setcounter{part}{0}
-\setcounter{chapter}{0}
-\setcounter{section}{0}
-\setcounter{subsection}{0}
-\setcounter{subsubsection}{0}
-\setcounter{paragraph}{0}
-\setcounter{subparagraph}{0}
-\setcounter{figure}{0}
-\setcounter{table}{0}
-\setcounter{numauthors}{0}
-\setcounter{parentequation}{0}
-\setcounter{subfigure}{0}
-\setcounter{lofdepth}{1}
-\setcounter{subtable}{0}
-\setcounter{lotdepth}{1}
-\setcounter{lstnumber}{1}
-\setcounter{ContinuedFloat}{0}
-\setcounter{AlgoLine}{0}
-\setcounter{algocfline}{0}
-\setcounter{algocfproc}{0}
-\setcounter{algocf}{0}
-}
diff --git a/BookGPU/Chapters/chapter3/code/medianGeneric.cu.aux b/BookGPU/Chapters/chapter3/code/medianGeneric.cu.aux
deleted file mode 100755
index 03ee019..0000000
--- a/BookGPU/Chapters/chapter3/code/medianGeneric.cu.aux
+++ /dev/null
@@ -1,32 +0,0 @@
-\relax
-\@setckpt{code/medianGeneric.cu}{
-\setcounter{page}{4}
-\setcounter{equation}{0}
-\setcounter{enumi}{0}
-\setcounter{enumii}{0}
-\setcounter{enumiii}{0}
-\setcounter{enumiv}{0}
-\setcounter{footnote}{0}
-\setcounter{mpfootnote}{0}
-\setcounter{part}{0}
-\setcounter{chapter}{1}
-\setcounter{section}{2}
-\setcounter{subsection}{1}
-\setcounter{subsubsection}{0}
-\setcounter{paragraph}{0}
-\setcounter{subparagraph}{0}
-\setcounter{figure}{2}
-\setcounter{table}{0}
-\setcounter{parentequation}{0}
-\setcounter{subfigure}{0}
-\setcounter{lofdepth}{1}
-\setcounter{subtable}{0}
-\setcounter{lotdepth}{1}
-\setcounter{AlgoLine}{7}
-\setcounter{algocfline}{1}
-\setcounter{algocfproc}{1}
-\setcounter{algocf}{1}
-\setcounter{lstnumber}{1}
-\setcounter{ContinuedFloat}{0}
-\setcounter{lstlisting}{0}
-}
diff --git a/BookGPU/Chapters/chapter7/ch7.tex b/BookGPU/Chapters/chapter7/ch7.tex
index e74681b..f9f7fd4 100644
--- a/BookGPU/Chapters/chapter7/ch7.tex
+++ b/BookGPU/Chapters/chapter7/ch7.tex
@@ -807,7 +807,7 @@ A harmonic analysis of the wave spectrum at the shoal center line is computed an
 {\scriptsize\input{Chapters/chapter7/figures/WhalinWaveHarmonics_T3_single.tikz}}
 }
 % \end{center}
- \caption[Harmonic analysis for the experiment of Whalin for $T=1,2,3\,$s.]{Harmonic analysis for the experiment of Whalin for $T=1,2,3\,$s respectively. Measured experimental and computed results (single-precision) are in good agreement. Test environment 1.}\label{ch7:whalinresults}
+ \caption[Harmonic analysis for the experiment of Whalin for $T=1,2,3\,$s.]{Harmonic analysis for the experiment of Whalin for $T=1,2,3\,$s. Measured experimental and computed results (single-precision) are in good agreement. Test environment 1.}\label{ch7:whalinresults}
 \end{figure}

 \subsection{Acceleration via parallelism in time using parareal}\label{ch7:parareal}\index{parareal}
@@ -822,7 +822,7 @@ The parareal algorithm has been introduced as a component in our in-house GPU li
 In Section \ref{ch5:parareal} it is assumed that communication costs can be neglected and a simple model for the algorithmic work complexity is derived. It is found that there are four key discretization parameters for parareal that need to be balanced appropriately in order to achieve high parallel efficiency: the number of coarse-grained time intervals $N$, the number of iterations $K$, the ratio between the computational cost of the coarse to the fine propagator $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}$, and the ratio between fine and coarse time step sizes $\delta t/\delta T$.
 % How to obtain speed-up
-Ideally, the ratio $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}$ is small and convergence happens in $k=1$ iteration. This is seldom the case though, as it requires the coarse propagator to achieve accuracy close to that of the fine propagator while at the same time being substantially cheaper computationally, these two objectives obviously being conflicting. Obtaining the highest possible speed-up is a matter of trade-off, typically, the more GPUs used, the faster the coarse propagator should be. The performance of parareal is problem- and discretization-dependent and as such one would suspect that different wave parameters influence the suitability of the method. This was investigated in \cite{ch7:ASNP12} and indeed the performance does change with wave parameters. Typically the method works better for deep water waves with low to medium wave amplitudes.
+Ideally, the ratio $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}$ is small and convergence happens in $k=1$ iteration. This is seldom the case though, as it requires the coarse propagator to achieve accuracy close to that of the fine propagator while at the same time being substantially cheaper computationally; these two objectives obviously conflict. Obtaining the highest possible speed-up is a matter of trade-off: typically, the more GPUs used, the faster the coarse propagator should be. The performance of parareal is problem- and discretization-dependent, and as such one would suspect that different wave parameters influence the suitability of the method. This was investigated in \cite{ch7:ASNP12}, and indeed the performance does change with wave parameters. Typically the method works better for deep-water waves of low to medium amplitude.
 %
 \begin{figure}[!htb]
 \begin{center}
@@ -842,7 +842,7 @@ Ideally, the ratio $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}$ is small an
 %
 % What did we do and what are the results
-We have performed a scalability study for parareal using 2D nonlinear stream function waves based on a discretization with $(N_x,N_z)=(33,9)$ collocation points, cf. Figure \ref{ch7:fig:DDPA_SPEEDUP}. The study shows that moderate speedup is possible for this hyperbolic system. Using four GPU nodes, a speedup of slightly more than two was achieved while using sixteen GPU nodes resulted in a speedup of slightly less than five. As noticed in Figure \ref{ch7:fig:DDPA_SPEEDUP}, parallel efficiency decreases quite fast when using more GPUs. This limitation is due to the usage of a fairly slow and accurate coarse propagator and is linked to a known difficulty with parareal applied to hyperbolic systems. For hyperbolic systems, instabilities tend to arise when using a very inaccurate coarse propagator. This prevents using a large number of time subdomains, as this by Amdahl's law also requires a very fast coarse propagator. The numbers are still impressive though, considering that the speedup due to parareal comes as additional speedup to an already efficient and fast code.
+We have performed a scalability study for parareal using 2D nonlinear stream function waves based on a discretization with $(N_x,N_z)=(33,9)$ collocation points, cf. Figure \ref{ch7:fig:DDPA_SPEEDUP}. The study shows that moderate speedup is possible for this hyperbolic system. Using four GPU nodes, a speedup of slightly more than two was achieved, while using sixteen GPU nodes resulted in a speedup of slightly less than five. As seen in Figure \ref{ch7:fig:DDPA_SPEEDUP}, parallel efficiency decreases quite fast when using more GPUs. This limitation is due to the use of a fairly slow and accurate coarse propagator and is linked to a known difficulty with parareal applied to hyperbolic systems. For hyperbolic systems, instabilities tend to arise when using a very inaccurate coarse propagator. This prevents using a large number of time subdomains, as this, by Amdahl's law, also requires a very fast coarse propagator. The numbers are still impressive though, considering that the speedup due to parareal comes as additional speedup to an already efficient and fast code.

 Performance results for the Whalin test case are also shown in Figure \ref{ch7:fig:whalinparareal}. There is a natural limitation to how much we can increase $R$ (the ratio between the complexity of the fine and coarse propagators), because of stability issues with the coarse propagator. In this test case we simulate for $t\in[0,1]\,$s, using up to $32$ GPUs. For low $R$ and only two GPUs, there is no speedup gain, but for the configuration with eight or more GPUs and $R\geq6$, we are able to get more than $2$ times speedup. Though these hyperbolic systems are not optimal for performance tuning using the parareal method, results still confirm that reasonable speedups are in fact possible on heterogeneous systems.
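To make the trade-off between $N$, $K$, and $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}$ discussed in the hunks above concrete, a common back-of-the-envelope estimate can be used; it assumes negligible communication cost (as in Section \ref{ch5:parareal}) but is a generic sketch rather than the exact expression derived there. Counting, per parareal iteration, one serial coarse sweep of cost $N\,\mathcal{C}_\mathcal{G}$ and one fully parallel fine sweep of cost $\mathcal{C}_\mathcal{F}$ against a serial fine solve of cost $N\,\mathcal{C}_\mathcal{F}$ gives

\[
  S(N,K) \;\approx\; \frac{N\,\mathcal{C}_\mathcal{F}}{K\,\bigl(N\,\mathcal{C}_\mathcal{G}+\mathcal{C}_\mathcal{F}\bigr)}
  \;=\; \frac{1}{K\,\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}+K/N}
  \;\le\; \min\!\left(\frac{N}{K},\;\frac{\mathcal{C}_\mathcal{F}}{K\,\mathcal{C}_\mathcal{G}}\right).
\]

With illustrative values (assumed here, not measured) of $N=16$, $K=2$, and $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}=1/32$, the estimate gives $S\approx 1/(2/32+2/16)\approx 5.3$, the same order of magnitude as the speedup of slightly less than five reported above for sixteen GPU nodes. The bound $N/K$ also makes explicit why fast convergence and a cheap-but-stable coarse propagator are both needed as the number of GPUs grows.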
 \begin{figure}[!htb]
@@ -868,7 +868,7 @@ Finally, we remark that the parareal algorithm is also a fault-tolerant algorith
 %\newpage
 \subsection{Towards real-time interactive ship simulation}\index{real-time simulation}
-A fast GPU-accelerated ship hydrodynamic model is developed for real-time interactive ship simulation by modification of the unified potential flow model presented in Section \ref{ch7:goveq}. The target scientific application is an interactive full mission marine simulator, where multiple ships controlled by naval officers can navigate in a near-realistic virtual marine environment. Full mission simulators are used for education and training of naval officers in critical manoeuvring operations and for evaluation of ship and marine infrastructure designs. To predict the motion of ships, a hydrodynamics model is required for prediction of forces by \eqref{ch7:forcecalc} which is affected by the kinematic properties of the model, cf. Section \ref{ch7:dispkin}. The state-of-the-art for such a hydrodynamic model in today's real-time ship simulators is based on fast interpolation and proper scaling of experimental model data. The amount of experimental model data is limited with respect to hull forms and configurations, requiring the need for extrapolation that compromises the accuracy.
+A fast GPU-accelerated ship hydrodynamic model is developed for real-time interactive ship simulation by modification of the unified potential flow model presented in Section \ref{ch7:goveq}. The target scientific application is an interactive full-mission marine simulator, where multiple ships controlled by naval officers can navigate in a near-realistic virtual marine environment. Full-mission simulators are used for education and training of naval officers in critical maneuvering operations and for evaluation of ship and marine infrastructure designs. To predict the motion of ships, a hydrodynamics model is required for the prediction of forces by \eqref{ch7:forcecalc}, which is affected by the kinematic properties of the model, cf. Section \ref{ch7:dispkin}. The state-of-the-art for such a hydrodynamic model in today's real-time ship simulators is based on fast interpolation and proper scaling of experimental model data. The amount of experimental model data is limited with respect to hull forms and configurations, requiring extrapolation that compromises the accuracy.

 Current and ongoing work aims at removing these limitations by replacing the existing hydrodynamic model and instead calculating, at full scale, the flow field, wave field, ship-structure, and ship-ship interaction forces in real time using massively parallel computation technology. The potential flow model (OceanWave3D) presented in Section \ref{ch7:goveq} is suitable as the modeling basis for this purpose since it is robust, accurate, efficient, and scalable to arbitrarily large domains. Furthermore, it can accurately account for dispersive waves in the range from shallow to deep waters in marine settings where the sea bed may be uneven.
@@ -901,7 +901,7 @@ where $U$ is the velocity of the ship. The unsteady linear water problem is used
 \end{subequations}
 where the pressure on the ship hull $p_{ship}$ is calculated explicitly based on a quasi-static approximation which is determined by assuming $\partial_t\phi_1\approx0$ and rewriting \eqref{ch7:quasistatic}. In general, a ship hull is a complex surface in three-dimensional space, but its draft can be approximated by a single-valued function of the horizontal coordinates $\eta_0 = \eta_0(x,y)$, and the no-flux condition on the ship hull is approximated by a flat-ship approximation. Radiation boundary conditions are approximated by a Sommerfeld absorbing boundary condition \cite{ch7:DgayguiJolySJAM1994} on the vertical sides of the physical domain to let waves escape the domain.
-The modified numerical model can still be based on flexible-order finite difference method as discussed in Section \ref{ch7:sec:nummodel}. The computational bottleneck problem is the efficient solution of the Laplace problem twice which can be done efficiently by the GPU-accelerated iterative PDC method as explained in section \ref{ch7:PDCmethod}. A snapshot of the steady state wave field is provided in the introduction to this chapter. Computed resistance curves for a Series 60 hull moving at forward speed corresponding to Froude number $F_n=0.316$ knots in calm water are compared to experimental data \cite{ch7:TodaEtAl1992} in Figure \ref{ch7:fig:shiphydro} (a). The computed Kelvin wave system is shown in Figure \ref{ch7:fig:shiphydro} (b). The computed results compare well with experiments at moderate ship Froudes numbers $F_n=U/\sqrt{gh}$ in the range 0.1-0.25 as expected for a linear model. The real-time constraint required to fulfill the interactive and visualization requirements can currently be met with the GPU-accelerated hydrodynamics model for problem sizes of approximately $10^6$ for ship Froude numbers in the range 0.1-0.3. The modeling and real-time aspects will be addressed in more detail in ongoing work.
+The modified numerical model can still be based on the flexible-order finite difference method as discussed in Section \ref{ch7:sec:nummodel}. The computational bottleneck is the solution of the Laplace problem twice, which can be done efficiently by the GPU-accelerated iterative PDC method as explained in Section \ref{ch7:PDCmethod}. A snapshot of the steady-state wave field is provided in the introduction to this chapter. Computed resistance curves for a Series 60 hull moving at forward speed corresponding to Froude number $F_n=0.316$ in calm water are compared to experimental data \cite{ch7:TodaEtAl1992} in Figure \ref{ch7:fig:shiphydro} (a). The computed Kelvin wave system is shown in Figure \ref{ch7:fig:shiphydro} (b). The computed results compare well with experiments at moderate ship Froude numbers $F_n=U/\sqrt{gh}$ in the range 0.1--0.25, as expected for a linear model. The real-time constraint required to fulfill the interactive and visualization requirements can currently be met with the GPU-accelerated hydrodynamics model for problem sizes of approximately $10^6$ for ship Froude numbers in the range 0.1--0.3. The modeling and real-time aspects will be addressed in more detail in ongoing work.
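The iterative PDC (preconditioned defect correction) solver referred to in the paragraph above can be summarized schematically; the concrete operators and preconditioner are those defined in Section \ref{ch7:PDCmethod}, so the following is a generic sketch rather than the exact implementation. Writing one of the discrete Laplace problems as $\mathcal{A}\phi=b$, with $\mathcal{M}\approx\mathcal{A}$ a much cheaper low-order approximation used as preconditioner, the iteration repeats until the residual is sufficiently small:

\[
  r^{[k]} = b-\mathcal{A}\phi^{[k]}, \qquad
  \mathcal{M}\,\delta^{[k]} = r^{[k]}, \qquad
  \phi^{[k+1]} = \phi^{[k]}+\delta^{[k]}.
\]

Each iteration then requires only a matrix-free application of the high-order operator and an approximate solve with the low-order one, which is what makes the method map well onto the GPU; here it is invoked twice, presumably once for each of the two potential flow subproblems introduced above.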
 %
 \begin{figure}[!htb]
@@ -921,13 +921,13 @@ The modified numerical model can still be based on flexible-order finite differe
 %\newpage

 \section{Conclusion and future work}
-We have presented implementation details together with several novel results on development of a new massively parallel and scalable tool for simulation of nonlinear free surface water waves on heterogenous hardware. The tool is based on the unified potential flow model referred to as OceanWave3D \cite{ch7:EBL08} which provides the basis for efficient and scalable simulation of water waves over uneven bottoms on arbitrary domain sizes. We have demonstrated in a few examples how we can accelerate performance by using single-precision math without comprimising accuracy. We have shown that performance can be accelerated by introducing concurrency in the time integration using the parareal algorithm and for the first time in a heterogenous setup based on the use of multiple GPUs. Interestingly, we find that parallel computations using parareal may be more efficient than using conventional data-parallel distributed computations in a multi-GPU setup for moderate problem sizes. We have measured absolute performance and scalability using several of the most recent generations of NVIDIA GPUs to detail the efficiency of the current code. This is useful to predict time to results as explained in \cite{ch7:EngsigKarupEtAl2011} and may be compared against other wave models in fair comparisons.
+We have presented implementation details together with several novel results on the development of a new massively parallel and scalable tool for simulation of nonlinear free surface water waves on heterogeneous hardware. The tool is based on the unified potential flow model referred to as OceanWave3D \cite{ch7:EBL08}, which provides the basis for efficient and scalable simulation of water waves over uneven bottoms on arbitrary domain sizes. We have demonstrated in a few examples how we can accelerate performance by using single-precision math without compromising accuracy. We have shown that performance can be accelerated by introducing concurrency in the time integration using the parareal algorithm, and for the first time in a heterogeneous setup based on the use of multiple GPUs. Interestingly, we find that parallel computations using parareal may be more efficient than conventional data-parallel distributed computations in a multi-GPU setup for moderate problem sizes. We have measured absolute performance and scalability using several of the most recent generations of NVIDIA GPUs to detail the efficiency of the current code. This is useful for predicting time to results, as explained in \cite{ch7:EngsigKarupEtAl2011}, and allows fair comparisons against other wave models.

 Work in progress focuses on extending the governing equations to account for physics currently missing from the model, such as wave runup and wave breaking. Also, we plan to extend the domain decomposition method to unstructured grids of blocks that can be boundary-fitted to more general bottom-mounted structures, to be able to address wave-structure problems, cf. \cite{ch7:EHBM06,ch7:EHBW08}. For example, this will provide the basis for simulations of wave transformations in large harbor areas or for predicting wave climates in near-coastal areas. We anticipate that a tool based on the proposed parallel solution strategies will be useful for further advancement of fast and robust analysis techniques and for large-scale simulation of free surface waves (e.g., for use as an efficient far-field solver at large scales) and will be a basis for next-generation wave models. We also expect that the tool can be useful for hybrid-solution strategies, with local flow features possibly resolved by other models, and for advancing the state-of-the-art in fast physics-based wave-body simulations, e.g., ship-wave interactions in ship simulation where real-time constraints are imposed due to visualization. These subjects will be part of ongoing work addressing application aspects.

-\section{Acknowledgment}
+\section{Acknowledgments}

 This work was supported by grant no. 09-070032 from the Danish Research Council for Technology and Production Sciences. Special thanks go to Professor Jan S. Hesthaven for supporting parts of this work. Scalability and performance tests were done in the GPUlab at DTU Informatics, Technical University of Denmark, and on the GPU cluster at the Center for Computing and Visualization, Brown University, USA. NVIDIA Corporation is acknowledged for generous hardware donations to the facilities of the GPUlab.

diff --git a/BookGPU/Makefile b/BookGPU/Makefile
index 1ab7327..18c5a86 100644
--- a/BookGPU/Makefile
+++ b/BookGPU/Makefile
@@ -35,4 +35,5 @@ all:

 clean:
 	rm -rf *~ *.aux *.bbl *.blg *.dvi *.lon *.not *.lof *.log *.lot *.toc *.ind *.idx *.ilg *.aux
+	find . -name "*aux" -print |xargs rm
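A side note on the new clean rule: the pattern "*aux" matches any file whose name merely ends in aux (not only .aux files), and piping an empty match list into xargs rm makes rm fail with "missing operand" when the tree is already clean. A slightly more defensive variant of the added line (a sketch, not part of this commit, assuming GNU or BSD find) would be:

	find . -name "*.aux" -type f -delete

which needs no xargs, handles paths containing spaces, and quietly does nothing when there are no matches (recipe lines in a Makefile must still start with a tab). Alternatively, keeping the pipe but using GNU xargs with the -r/--no-run-if-empty flag avoids the empty-input failure.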