From ecd01808b5702d940bd77107a2bf829d3832179b Mon Sep 17 00:00:00 2001
From: couturie <couturie@extinction>
Date: Wed, 7 Aug 2013 20:53:22 +0200
Subject: [PATCH] new

---
 BookGPU/Chapters/chapter1/ch1.tex     | 12 ++++++------
 BookGPU/Chapters/chapter2/ch2.tex     |  4 ++--
 BookGPU/Chapters/chapter8/biblio8.bib | 13 ++++++++++++-
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/BookGPU/Chapters/chapter1/ch1.tex b/BookGPU/Chapters/chapter1/ch1.tex
index 9c3d8af..e3cbd81 100755
--- a/BookGPU/Chapters/chapter1/ch1.tex
+++ b/BookGPU/Chapters/chapter1/ch1.tex
@@ -139,12 +139,12 @@ Figure~\ref{ch1:fig:latency_throughput}  illustrates   the  main  difference  of
 memory latency between a CPU and a  GPU. In a CPU, tasks ``ti'' are executed one
 by one with a short memory latency to get the data to process. After some tasks,
 there is  a context switch  that allows the  CPU to run  concurrent applications
-and/or multi-threaded  applications. {\bf REPHRASE} Memory latencies  are longer in a  GPU, the
+and/or multi-threaded  applications.  Memory latencies  are longer in a  GPU. The
  principle  to   obtain  a  high  throughput  is  to   have  many  tasks  to
 compute. Later we  will see that these tasks are called  threads with CUDA. With
 this  principle, as soon  as a  task is  finished the  next one  is ready  to be
-executed  while the  wait for  data for  the previous  task is  overlapped by
-computation of other tasks. {\bf HERE}
+executed  while the  wait for  data for  the previous  task is  overlapped by the
+computation of other tasks. 
 
 
 
@@ -215,14 +215,14 @@ by the  threads of a GPU.   When the problem considered  is a two-dimensional or
 practice, the number of  thread blocks and the size of thread  blocks are given as
 parameters  to  each  kernel.   Figure~\ref{ch1:fig:scalability}  illustrates  an
 example of a kernel composed of 8 thread blocks. Then this kernel is executed on
-a small device containing only 2 SMs. {\bf RELIRE} So in  this case, blocks are executed 2
+a small device containing only 2 SMs.  So in  this case, blocks are executed 2
 by 2 in any order.  If the kernel is executed on a larger CUDA device containing
 4 SMs, blocks are executed 4 by 4 simultaneously.  The execution times should be
 approximately twice faster in the latter  case. Of course, that depends on other
 parameters that will be described later (in this chapter and other chapters).
 
-{\bf RELIRE}
-Thread blocks provide a way to cooperation  in the sense that threads of the same
+
+Thread blocks provide a way to cooperate  in the sense that threads of the same
 block   cooperatively    load   and   store   blocks   of    memory   they   all
 use. Synchronizations of threads in the same block are possible (but not between
 threads of different  blocks). Threads of the same block  can also share results
diff --git a/BookGPU/Chapters/chapter2/ch2.tex b/BookGPU/Chapters/chapter2/ch2.tex
index bd48e2a..68c309a 100755
--- a/BookGPU/Chapters/chapter2/ch2.tex
+++ b/BookGPU/Chapters/chapter2/ch2.tex
@@ -60,8 +60,8 @@ the           values            of           the           block           index
 (called  \texttt{blockIdx} \index{CUDA keywords!blockIdx}  in CUDA)  and  of the
 thread   index   (called   \texttt{threadIdx}\index{CUDA keywords!threadIdx}   in
 CUDA). Blocks of threads and thread  indexes can be decomposed into 1 dimension,
-2 dimensions, or  3 dimensions. {\bf A REGARDER} According to the  dimension of manipulated data,
-the appropriate dimension  can be useful. In our example,  only one dimension is
+2 dimensions, or  3 dimensions.  According to the  dimension of manipulated data,
+the dimension of blocks of threads  must be chosen carefully. In our example,  only one dimension is
 used.   Then using the notation  \texttt{.x}, we  can access  the  first dimension
 (\texttt{.y}  and \texttt{.z},  respectively allow access  to the  second and
 third dimension).   The variable \texttt{blockDim}\index{CUDA keywords!blockDim}
diff --git a/BookGPU/Chapters/chapter8/biblio8.bib b/BookGPU/Chapters/chapter8/biblio8.bib
index 7aadc57..a915ef3 100644
--- a/BookGPU/Chapters/chapter8/biblio8.bib
+++ b/BookGPU/Chapters/chapter8/biblio8.bib
@@ -3,6 +3,8 @@
    title = 	"A parallel algorithm for graph matching and its MasPar implementation",
    journal = 	"IEEE Transactions on Parallel and Distributed Systems",
    volume = 	"8",
+number = "5",
+pages="490--501",
    year = 	"1997"
    }
 
@@ -10,6 +12,8 @@
  author = 	 {T. Carneiro and A. E. Muritibab and M. Negreirosc and G. A. Lima de Campos},
  title = 	 {A New Parallel Schema for Branch-and-Bound Algorithms Using {GPGPU}},
  booktitle = 	 {23rd International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)},
+pages="41--47",
+  address="New York, USA",
  year = 	 {2011}
 }
 
@@ -56,6 +60,7 @@
  author = 	 {T. Han and T. S. Abdelrahman},
  title = 	 {Reducing branch divergence in {GPU} programs},
  booktitle = 	 {{Proceedings of the Fourth Workshop on General Purpose Processing on Graphics Processing Units (GPGPU-4), ACM}},
+ pages="1--8",
  year = 	 {2011},
  publisher = 	 {New York, USA}
 }
@@ -113,13 +118,18 @@ NOTE =		"Habilitation to Direct Research"
 	TITLE ="An Extension of {J}ohnson's results on Job-Lot Scheduling",
 	JOURNAL ="Naval Research Logistis Quarterly",
 	YEAR ="1956",
-	NOTE ="3:3"
+  pages="61--68",
+	volume ="3",
+number="3",
 }
 
 @ARTICLE{ch8:LGMitten_1959,
 	AUTHOR ="L. G. Mitten",
 	TITLE ="Sequencing $n$ jobs on two machines with arbitrary time lags",
 	JOURNAL ="Management Science",
+  volume="5",
+number="3",
+pages="293--298",
 	YEAR ="1959"
 }
 
@@ -128,6 +138,7 @@ NOTE =		"Habilitation to Direct Research"
  title = 	 {A grid-enabled branch and bound algorithm for solving challenging combinatorial optimization problems},
  booktitle = {{Proceedings of 21th IEEE International Parallel and Distributed Processing Symposium (IPDPS)}},
  year = 	 {2007},
+pages = "1--9",
  month = 	 {March},
  publisher = {Long Beach, California}
 }
-- 
2.39.5