last version

[book_gpu.git] / BookGPU / Chapters / chapter16 / gpu.tex
diff --git a/BookGPU/Chapters/chapter16/gpu.tex b/BookGPU/Chapters/chapter16/gpu.tex

index f4f62f1611a2ddd1632077ad0daa8118d44d4b4d..4d4d6ef365353105617ca6b9bde55f864738f997 100644 (file)
--- a/BookGPU/Chapters/chapter16/gpu.tex
+++ b/BookGPU/Chapters/chapter16/gpu.tex
@@ -5,7 +5,7 @@ In this section, we explain how to efficiently
  use matrix-free GMRES to solve
  the Newton update problems with implicit sensitivity calculation,
  i.e., the steps enclosed by the double dashed block
  use matrix-free GMRES to solve
  the Newton update problems with implicit sensitivity calculation,
  i.e., the steps enclosed by the double dashed block
-in Fig.~\ref{fig:ef_flow}.
+in Figure~\ref{fig:ef_flow}.
  Then implementation issues of GPU acceleration
  will be discussed in detail. 
  Finally,  the Gear-2 integration is briefly introduced.
  Then implementation issues of GPU acceleration
  will be discussed in detail. 
  Finally,  the Gear-2 integration is briefly introduced.
@@ -15,13 +15,13 @@ Finally,  the Gear-2 integration is briefly introduced.
  The \underline{G}eneralized \underline{M}inimum \underline{Res}idual
  (GMRES) method is an iterative method for solving
  systems of linear equations ($A x=b$) with a dense matrix $A$.
  The \underline{G}eneralized \underline{M}inimum \underline{Res}idual
  (GMRES) method is an iterative method for solving
  systems of linear equations ($A x=b$) with a dense matrix $A$.
-The standard GMRES\index{GMRES} is given in Algorithm~\ref{alg:GMRES}.
-It constructs a Krylov subspace\index{Krylov subspace} with order $m$,
+The standard GMRES\index{iterative method!GMRES} is given in Algorithm~\ref{alg:GMRES}.
+It constructs a Krylov subspace\index{iterative method!Krylov subspace} with order $m$,
  \[ \mathcal{K}_m = \mathrm{span}( b, A b, A^2 b,\ldots, A^{m-1} b ),\]
  where the approximate solution $x_m$ resides.
  In practice, an orthonormal basis $V_m$ that spans the
  subspace $\mathcal{K}_{m}$ can be generated by
  \[ \mathcal{K}_m = \mathrm{span}( b, A b, A^2 b,\ldots, A^{m-1} b ),\]
  where the approximate solution $x_m$ resides.
  In practice, an orthonormal basis $V_m$ that spans the
  subspace $\mathcal{K}_{m}$ can be generated by
-the Arnoldi iteration\index{Arnoldi iterations}.
+the Arnoldi iterations\index{iterative method!Arnoldi iterations}.
  The goal of GMRES is to search for an optimal coefficient $y$
  such that the linear combination $x_m = V_m y$ will minimize
  its residual $\| b-Ax_m \|_2$.
  The goal of GMRES is to search for an optimal coefficient $y$
  such that the linear combination $x_m = V_m y$ will minimize
  its residual $\| b-Ax_m \|_2$.
@@ -78,7 +78,7 @@ a preset tolerance~\cite{Golub:Book'96}.
  %% \end{algorithm}
  
  \begin{algorithm}
  %% \end{algorithm}
  
  \begin{algorithm}
-\caption{Standard GMRES\index{GMRES} algorithm.} \label{alg:GMRES}
+\caption{standard GMRES\index{iterative method!GMRES} algorithm} \label{alg:GMRES}
    \KwIn{ $ A \in \mathbb{R}^{N \times N}$, $b \in \mathbb{R}^N$,
        and initial guess $x_0 \in \mathbb{R}^N$}
    \KwOut{ $x \in \mathbb{R}^N$: $\| b - A x\|_2 < tol$}
    \KwIn{ $ A \in \mathbb{R}^{N \times N}$, $b \in \mathbb{R}^N$,
        and initial guess $x_0 \in \mathbb{R}^N$}
    \KwOut{ $x \in \mathbb{R}^N$: $\| b - A x\|_2 < tol$}
@@ -160,7 +160,7 @@ period in order to solve a Newton update.
  At each time step, SPICE\index{SPICE} has
  to linearize device models, stamp matrix elements
  into MNA (short for modified nodal analysis\index{modified nodal analysis, or MNA}) matrices,
  At each time step, SPICE\index{SPICE} has
  to linearize device models, stamp matrix elements
  into MNA (short for modified nodal analysis\index{modified nodal analysis, or MNA}) matrices,
-and solve circuit equations in its inner Newton iteration\index{Newton iteration}.
+and solve circuit equations in its inner Newton iteration\index{iterative method!Newton iteration}.
  When convergence is attained,
  circuit states are saved and then the next time step begins.
  This is also the time when we store the needed matrices
  When convergence is attained,
  circuit states are saved and then the next time step begins.
  This is also the time when we store the needed matrices
@@ -225,7 +225,7 @@ Hence, in consideration of the serial nature of the triangularization,
  the small size of the Hessenberg matrix,
  and the frequent inspection of values by the host, it is
  preferable to allocate $\tilde{H}$ in CPU (host) memory.
  the small size of the Hessenberg matrix,
  and the frequent inspection of values by the host, it is
  preferable to allocate $\tilde{H}$ in CPU (host) memory.
-As shown in Fig.~\ref{fig:gmres}, the memory copy from device to host
+As shown in Figure~\ref{fig:gmres}, the memory copy from device to host
  is called each time the Arnoldi iteration generates a new vector
  and the orthogonalization produces the vector $h$.
  
  is called each time the Arnoldi iteration generates a new vector
  and the orthogonalization produces the vector $h$.