From fd7d6f1c6c149f79839200277703c9ef950150f8 Mon Sep 17 00:00:00 2001 From: couturie Date: Mon, 23 Sep 2013 21:20:38 +0200 Subject: [PATCH] new --- BookGPU/Chapters/chapter1/ch1.tex | 22 ++++++++++++---------- BookGPU/Chapters/chapter15/ch15.tex | 4 +++- BookGPU/Chapters/chapter16/exp.tex | 2 +- BookGPU/Chapters/chapter16/intro.tex | 2 +- BookGPU/Chapters/chapter2/ch2.tex | 6 +++--- BookGPU/Chapters/chapter3/ch3.tex | 2 +- BookGPU/Chapters/chapter4/ch4.tex | 4 ++-- BookGPU/Chapters/chapter5/ch5.tex | 2 ++ BookGPU/Chapters/chapter7/ch7.tex | 2 +- 9 files changed, 26 insertions(+), 20 deletions(-) diff --git a/BookGPU/Chapters/chapter1/ch1.tex b/BookGPU/Chapters/chapter1/ch1.tex index 68605f6..2fef3a4 100755 --- a/BookGPU/Chapters/chapter1/ch1.tex +++ b/BookGPU/Chapters/chapter1/ch1.tex @@ -58,8 +58,8 @@ graphics processing unit (GPGPU) computing. Of course other programming models have been proposed. The other well-known alternative is OpenCL which aims at proposing an alternative to CUDA and which is multiplatform and portable. This is a great advantage since it is even possible to execute OpenCL programs on -traditional CPUs. The main drawback is that it is less tight with the hardware -and consequently sometimes provides less efficient programs. Moreover, CUDA +traditional CPUs. The main drawback is that it is less close to the hardware +and consequently it sometimes provides less efficient programs. Moreover, CUDA benefits from more mature compilation and optimization procedures. Other less known environments have been proposed, but most of them have been discontinued, such FireStream by ATI which is not maintained anymore and has been replaced by @@ -127,11 +127,7 @@ account the memory latency. -\begin{figure}[t!] -\centerline{\includegraphics[scale=0.7]{Chapters/chapter1/figures/low_latency_vs_high_throughput.pdf}} -\caption{Comparison of low latency of a CPU and high throughput of a GPU.} -\label{ch1:fig:latency_throughput} -\end{figure} + Figure~\ref{ch1:fig:latency_throughput} illustrates the main difference of memory latency between a CPU and a GPU. In a CPU, tasks ``ti'' are executed one @@ -144,7 +140,13 @@ this principle, as soon as a task is finished the next one is ready to be executed while the wait for data for the previous task is overlapped by the computation of other tasks. +\clearpage +\begin{figure}[t!] +\centerline{\includegraphics[scale=0.7]{Chapters/chapter1/figures/low_latency_vs_high_throughput.pdf}} +\caption{Comparison of low latency of a CPU and high throughput of a GPU.} +\label{ch1:fig:latency_throughput} +\end{figure} \section{Kinds of parallelism} @@ -169,7 +171,7 @@ GPUs. Task parallelism is the common parallelism achieved on clusters and grids and high performance architectures where different tasks are executed by different computing units. - +\clearpage \section{CUDA multithreading} The data parallelism of CUDA is more precisely based on the Single Instruction @@ -265,7 +267,7 @@ to fill the shared memory at the start of the kernel with global data that are used very frequently, then threads can access it for their computation. Threads can obviously change the content of this shared memory either with computation or by loading other data and they can store its content in the global memory. So -shared memory can be seen as a cache memory manageable manually. This +shared memory can be seen as a cache memory which is manageable manually. This obviously requires an effort from the programmer. 
On recent cards, the programmer may decide what amount of cache memory and @@ -282,7 +284,7 @@ own registers and their local memory. Threads of the same block can access the shared memory of that block. The cache memory is not represented here but it is local to a thread. Then each block can access the global memory of the GPU. - +\clearpage \section{Conclusion} In this chapter, a brief presentation of the video card, which has later been diff --git a/BookGPU/Chapters/chapter15/ch15.tex b/BookGPU/Chapters/chapter15/ch15.tex index 2d507e4..7860441 100644 --- a/BookGPU/Chapters/chapter15/ch15.tex +++ b/BookGPU/Chapters/chapter15/ch15.tex @@ -1093,7 +1093,9 @@ in order to enable concurrent executions among the required kernels. & Speedup & - & \multicolumn{2}{c|}{1.13} & \multicolumn{2}{c|}{1.17} \\ \hline \end{tabular} -\caption{\label{t:perfs_V6} Performance results with multiple +\caption[Performance results with multiple + concurrent energies + on one C2070 GPU.]{\label{t:perfs_V6} Performance results with multiple concurrent energies on one C2070 GPU. GPU initialization times are not considered here. } \end{center} diff --git a/BookGPU/Chapters/chapter16/exp.tex b/BookGPU/Chapters/chapter16/exp.tex index 8e5894d..7fda02b 100644 --- a/BookGPU/Chapters/chapter16/exp.tex +++ b/BookGPU/Chapters/chapter16/exp.tex @@ -47,7 +47,7 @@ Section~\ref{sec:gpu}. \includegraphics[width=.6\textwidth]{./Chapters/chapter16/figures/flyback_zoomin_emb.eps} \label{fig:flybackZoom} } -\caption{Flyback converter solution calculated by envelope-following. +\caption[Flyback converter solution calculated by envelope-following.]{Flyback converter solution calculated by envelope-following. The red curve is traditional SPICE simulation result, and the back curve is the envelope-following output with simulation points marked.} diff --git a/BookGPU/Chapters/chapter16/intro.tex b/BookGPU/Chapters/chapter16/intro.tex index 4cbc91f..875093f 100644 --- a/BookGPU/Chapters/chapter16/intro.tex +++ b/BookGPU/Chapters/chapter16/intro.tex @@ -75,7 +75,7 @@ next envelope step. \subfigure[The envelope changes in a slow time scale.] {\resizebox{.9\textwidth}{!}{\input{./Chapters/chapter16/figures/envelope.pdf_t}} \label{fig:ef2} } - \caption{Transient envelope-following\index{envelope-following} analysis. + \caption[Transient envelope-following\index{envelope-following} analysis.]{Transient envelope-following\index{envelope-following} analysis. (Both two figures reflect backward Euler\index{Euler!backward Euler} style envelope-following.)} \label{fig:ef_intro} \end{figure} diff --git a/BookGPU/Chapters/chapter2/ch2.tex b/BookGPU/Chapters/chapter2/ch2.tex index 7fc8471..490d753 100755 --- a/BookGPU/Chapters/chapter2/ch2.tex +++ b/BookGPU/Chapters/chapter2/ch2.tex @@ -24,8 +24,8 @@ are executed on a GPU. This code is in Listing~\ref{ch2:lst:ex1}. As GPUs have their own memory, the first step consists of allocating memory on the GPU. A call to \texttt{cudaMalloc}\index{CUDA functions!cudaMalloc} -allocates memory on the GPU. {\bf REREAD The first parameter of this function is a pointer -on a memory on the device, i.e. the GPU.} The second parameter represents the +allocates memory on the GPU. The first parameter of this function is a pointer +to memory on the device, i.e. the GPU. The second parameter represents the size of the allocated variables, this size is expressed in bits. \pagebreak \lstinputlisting[label=ch2:lst:ex1,caption=simple example]{Chapters/chapter2/ex1.cu} @@ -71,7 +71,7 @@ gives the size of each block.
- +\pagebreak \section{Second example: using CUBLAS \index{CUBLAS}} \label{ch2:2ex} diff --git a/BookGPU/Chapters/chapter3/ch3.tex b/BookGPU/Chapters/chapter3/ch3.tex index 1c6453d..ac546dc 100755 --- a/BookGPU/Chapters/chapter3/ch3.tex +++ b/BookGPU/Chapters/chapter3/ch3.tex @@ -256,7 +256,7 @@ Listing \ref{lst:medianForget1pix3} details this process where forgetful selecti \begin{figure}[b] \centering \includegraphics[width=6cm]{Chapters/chapter3/img/forgetful_selection.png} - \caption{Forgetful selection with the minimal element register count. Illustration for $3\times 3$ pixel window represented in a row and supposed sorted.} + \caption{Forgetful selection with the minimal element register count. Illustration for $3\times 3$ pixel window represented in a row and supposedly sorted.} \label{fig:forgetful_selection} \end{figure} \begin{figure} diff --git a/BookGPU/Chapters/chapter4/ch4.tex b/BookGPU/Chapters/chapter4/ch4.tex index 90612c9..ed3f531 100644 --- a/BookGPU/Chapters/chapter4/ch4.tex +++ b/BookGPU/Chapters/chapter4/ch4.tex @@ -113,7 +113,7 @@ $\mathbf{2048\times 2048}$&1.178&1549 &\bf 3.265&\bf 875 &6.398&529 \\\hline $\mathbf{4096\times 4096}$&4.700&1585 &13.05&533 &25.56&533 \\\hline \end{tabular} } -\caption[Timings (time) and throughput values (TP in MP/s) of one register-only non-separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a C2070 card.]{Timings (time) and throughput values (TP in MPx/s) of one register-only non-separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a C2070 card (fermi architecture). Data transfer duration are those of Table \ref{tab:memcpy1}. The bold value points out the result obtained in the reference situation.} +\caption[Timings (time) and throughput values (TP in MP/s) of one register-only nonseparable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a C2070 card.]{Timings (time) and throughput values (TP in MPx/s) of one register-only nonseparable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a C2070 card (Fermi architecture). Data transfer durations are those of Table \ref{tab:memcpy1}. The bold value points out the result obtained in the reference situation.} \label{tab:convoNonSepReg1} \end{table} @@ -142,7 +142,7 @@ $\mathbf{2048\times 2048}$&0.801&1092 &\bf 2.189&\bf 802 &4.278&573 \\\hline $\mathbf{4096\times 4096}$&3.171&1075 &8.720&793 &17.076&569 \\\hline \end{tabular} } -\caption[Timings (time) and throughput values (TP in MP/s) of one register-only non-separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a GTX280.]{Timings (time) and throughput values (TP in MP/s) of one register-only non-separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer duration are those of Table \ref{tab:memcpy1}. The bold value points out the result obtained in the reference situation.} +\caption[Timings (time) and throughput values (TP in MP/s) of one register-only nonseparable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a GTX280.]{Timings (time) and throughput values (TP in MP/s) of one register-only nonseparable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$, and $7\times 7$ pixels, on a GTX280 (GT200 architecture). 
Data transfer durations are those of Table \ref{tab:memcpy1}. The bold value points out the result obtained in the reference situation.} \label{tab:convoNonSepReg3} \end{table} diff --git a/BookGPU/Chapters/chapter5/ch5.tex b/BookGPU/Chapters/chapter5/ch5.tex index 0a04352..26752ea 100644 --- a/BookGPU/Chapters/chapter5/ch5.tex +++ b/BookGPU/Chapters/chapter5/ch5.tex @@ -450,6 +450,8 @@ If grid ghost layers are updated whenever information from adjacent subdomains i Distributed performance for the finite difference stencil operation is illustrated in Figure \ref{ch5:fig:multigpu}. The timings include the compute time for the finite difference approximation and the time for updating ghost layers via message passing. It is obvious from Figure \ref{ch5:fig:multigpu:a} that communication overhead dominates for the smallest problem sizes, where the non distributed grid (1 GPU) is fastest. However, communication overhead does not grow as rapidly as computation times, due to the surface-to-volume ratio. Therefore message passing becomes less influential for large problems, where reasonable performance speedups are obtained. Figure \ref{ch5:fig:multigpu:b} demonstrates how the computational performance on multi-GPU systems can be significantly improved for various stencil sizes. With this simple domain decomposition technique, developers are able to implement applications based on heterogeneous distributed computing, without explicitly dealing with message passing and it is still possible to provide user specific implementations of the topology class for customized grid updates. +\clearpage + % TODO: Should we put in the DD algebra? \begin{figure}[!htb] diff --git a/BookGPU/Chapters/chapter7/ch7.tex b/BookGPU/Chapters/chapter7/ch7.tex index e084f01..2b8b30e 100644 --- a/BookGPU/Chapters/chapter7/ch7.tex +++ b/BookGPU/Chapters/chapter7/ch7.tex @@ -696,7 +696,7 @@ where $m$ is one of the scalar functions $\phi,u,w$ describing kinematics; $c$ i \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/lineardispersion_Nx30-HL90-p6_Linear-eps-converted-to.pdf} } \end{center} -\caption{The accuracy in phase celerity $c$ determined by \eqref{ch7:errdisp} for small-amplitude (linear) wave. +\caption[The accuracy in phase celerity $c$ determined by \eqref{ch7:errdisp} for small-amplitude (linear) wave.]{The accuracy in phase celerity $c$ determined by \eqref{ch7:errdisp} for small-amplitude (linear) wave. $N_z\in[6,12]$. Sixth order scheme.} \label{ch7:figlinear} \end{figure} -- 2.39.5
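
The chapter 2 hunk above refers to a listing (ex1.cu) that this patch does not carry. The sketch below is only a minimal, hypothetical reconstruction of the allocation-and-launch pattern that passage describes — it is not the book's listing, and the kernel name, array size, and launch configuration are invented for illustration. Note that the size argument passed to cudaMalloc is a count of bytes.

#include <cstdio>
#include <cuda_runtime.h>

// Each thread adds 1.0f to one element of the array.
__global__ void add_one(float *d_data, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    d_data[i] += 1.0f;
}

int main()
{
  const int n = 1024;
  const size_t size = n * sizeof(float);   // cudaMalloc expects a size in bytes

  float h_data[n];
  for (int i = 0; i < n; i++)
    h_data[i] = (float)i;

  // First parameter: address of a pointer that will receive the device address;
  // second parameter: number of bytes to allocate on the GPU.
  float *d_data = NULL;
  cudaMalloc((void **)&d_data, size);

  // Copy input to the GPU, run the kernel, copy the result back.
  cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);

  // Launch configuration: number of blocks in the grid, then threads per block.
  int threads = 128;
  int blocks  = (n + threads - 1) / threads;
  add_one<<<blocks, threads>>>(d_data, n);

  cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);
  cudaFree(d_data);

  printf("h_data[0] = %f, h_data[%d] = %f\n", h_data[0], n - 1, h_data[n - 1]);
  return 0;
}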