new

[book_gpu.git] / BookGPU / Chapters / chapter6 / PartieSync.tex
diff --git a/BookGPU/Chapters/chapter6/PartieSync.tex b/BookGPU/Chapters/chapter6/PartieSync.tex

index d8d281c62d8a33318c07d763447a694e4663a9cb..7fdaace7f3468648572dbfd896e3413a6ec3bac6 100755 (executable)
--- a/BookGPU/Chapters/chapter6/PartieSync.tex
+++ b/BookGPU/Chapters/chapter6/PartieSync.tex
@@ -97,7 +97,7 @@ parallel programming schemes on a GPU cluster:
  Using CUDA\index{CUDA}, GPU kernel executions are nonblocking, and GPU/CPU data
  transfers\index{CUDA!data transfer}
  are blocking or nonblocking operations. All GPU kernel executions and CPU/GPU
  Using CUDA\index{CUDA}, GPU kernel executions are nonblocking, and GPU/CPU data
  transfers\index{CUDA!data transfer}
  are blocking or nonblocking operations. All GPU kernel executions and CPU/GPU
-data transfers are associated to "streams,"\index{CUDA!stream} and all operations on a same stream
+data transfers are associated with ``streams'',\index{CUDA!stream} and all operations on the same stream
  are serialized. When transferring data from the CPU to the GPU, then running GPU
  computations, and finally transferring results from the GPU to the CPU, there is
  a natural synchronization and serialization if these operations are achieved on
  are serialized. When transferring data from the CPU to the GPU, then running GPU
  computations, and finally transferring results from the GPU to the CPU, there is
  a natural synchronization and serialization if these operations are achieved on
@@ -489,7 +489,7 @@ working on  independent subsets of  data.  \Lst{algo:ch6p1overlapstreamsequence}
  is not so generic as \Lst{algo:ch6p1overlapseqsequence}.
  
  
  is not so generic as \Lst{algo:ch6p1overlapseqsequence}.
  
  
-\subsection{Interleaved communications-transfers-computations\\overlapping}
+\subsection{Interleaved communications-transfers-computations overlapping}
  
  Many algorithms do not support splitting data transfers and kernel calls, and
  cannot exploit CUDA streams, for example, when each GPU thread requires access to
  
  Many algorithms do not support splitting data transfers and kernel calls, and
  cannot exploit CUDA streams, for example, when each GPU thread requires access to
@@ -506,7 +506,8 @@ and twice as many GPU buffers.
  \begin{figure}[t]
    \centering
    \includegraphics{Chapters/chapter6/figures/Sync-CompleteInterleaveOverlap.pdf}
  \begin{figure}[t]
    \centering
    \includegraphics{Chapters/chapter6/figures/Sync-CompleteInterleaveOverlap.pdf}
-  \caption{Complete overlap of internode CPU communications, CPU/GPU data transfers, and GPU
+  \caption[Complete overlap of internode CPU communications,\break\hfill CPU/GPU data transfers, and GPU
+  computations, interleaving computation-communication iterations.]{Complete overlap of internode CPU communications, CPU/GPU data transfers, and GPU
    computations, interleaving computation-communication iterations.}
    \label{fig:ch6p1overlapinterleaved}
  \end{figure}
    computations, interleaving computation-communication iterations.}
    \label{fig:ch6p1overlapinterleaved}
  \end{figure}