From ecd01808b5702d940bd77107a2bf829d3832179b Mon Sep 17 00:00:00 2001 From: couturie Date: Wed, 7 Aug 2013 20:53:22 +0200 Subject: [PATCH] new --- BookGPU/Chapters/chapter1/ch1.tex | 12 ++++++------ BookGPU/Chapters/chapter2/ch2.tex | 4 ++-- BookGPU/Chapters/chapter8/biblio8.bib | 13 ++++++++++++- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/BookGPU/Chapters/chapter1/ch1.tex b/BookGPU/Chapters/chapter1/ch1.tex index 9c3d8af..e3cbd81 100755 --- a/BookGPU/Chapters/chapter1/ch1.tex +++ b/BookGPU/Chapters/chapter1/ch1.tex @@ -139,12 +139,12 @@ Figure~\ref{ch1:fig:latency_throughput} illustrates the main difference of memory latency between a CPU and a GPU. In a CPU, tasks ``ti'' are executed one by one with a short memory latency to get the data to process. After some tasks, there is a context switch that allows the CPU to run concurrent applications -and/or multi-threaded applications. {\bf REPHRASE} Memory latencies are longer in a GPU, the +and/or multi-threaded applications. Memory latencies are longer in a GPU. The principle to obtain a high throughput is to have many tasks to compute. Later we will see that these tasks are called threads with CUDA. With this principle, as soon as a task is finished the next one is ready to be -executed while the wait for data for the previous task is overlapped by -computation of other tasks. {\bf HERE} +executed while the wait for data for the previous task is overlapped by the +computation of other tasks. @@ -215,14 +215,14 @@ by the threads of a GPU. When the problem considered is a two-dimensional or practice, the number of thread blocks and the size of thread blocks are given as parameters to each kernel. Figure~\ref{ch1:fig:scalability} illustrates an example of a kernel composed of 8 thread blocks. Then this kernel is executed on -a small device containing only 2 SMs. {\bf RELIRE} So in this case, blocks are executed 2 +a small device containing only 2 SMs. 
So in this case, blocks are executed 2 by 2 in any order. If the kernel is executed on a larger CUDA device containing 4 SMs, blocks are executed 4 by 4 simultaneously. The execution times should be approximately twice faster in the latter case. Of course, that depends on other parameters that will be described later (in this chapter and other chapters). -{\bf RELIRE} -Thread blocks provide a way to cooperation in the sense that threads of the same + +Thread blocks provide a way to cooperate in the sense that threads of the same block cooperatively load and store blocks of memory they all use. Synchronizations of threads in the same block are possible (but not between threads of different blocks). Threads of the same block can also share results diff --git a/BookGPU/Chapters/chapter2/ch2.tex b/BookGPU/Chapters/chapter2/ch2.tex index bd48e2a..68c309a 100755 --- a/BookGPU/Chapters/chapter2/ch2.tex +++ b/BookGPU/Chapters/chapter2/ch2.tex @@ -60,8 +60,8 @@ the values of the block index (called \texttt{blockIdx} \index{CUDA keywords!blockIdx} in CUDA) and of the thread index (called \texttt{threadIdx}\index{CUDA keywords!threadIdx} in CUDA). Blocks of threads and thread indexes can be decomposed into 1 dimension, -2 dimensions, or 3 dimensions. {\bf A REGARDER} According to the dimension of manipulated data, -the appropriate dimension can be useful. In our example, only one dimension is +2 dimensions, or 3 dimensions. According to the dimension of manipulated data, +the dimension of blocks of threads must be chosen carefully. In our example, only one dimension is used. Then using the notation \texttt{.x}, we can access the first dimension (\texttt{.y} and \texttt{.z}, respectively allow access to the second and third dimension). 
The variable \texttt{blockDim}\index{CUDA keywords!blockDim} diff --git a/BookGPU/Chapters/chapter8/biblio8.bib b/BookGPU/Chapters/chapter8/biblio8.bib index 7aadc57..a915ef3 100644 --- a/BookGPU/Chapters/chapter8/biblio8.bib +++ b/BookGPU/Chapters/chapter8/biblio8.bib @@ -3,6 +3,8 @@ title = "A parallel algorithm for graph matching and its MasPar implementation", journal = "IEEE Transactions on Parallel and Distributed Systems", volume = "8", +number = "5", +pages="490-501", year = "1997" } @@ -10,6 +12,8 @@ author = {T. Carneiro and A. E. Muritibab and M. Negreirosc and G. A. Lima de Campos}, title = {A New Parallel Schema for Branch-and-Bound Algorithms Using {GPGPU}}, booktitle = {23rd International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)}, +pages="41-47", + address="New York, USA", year = {2011} } @@ -56,6 +60,7 @@ author = {T. Han and T. S. Abdelrahman}, title = {Reducing branch divergence in {GPU} programs}, booktitle = {{Proceedings of the Fourth Workshop on General Purpose Processing on Graphics Processing Units (GPGPU-4), ACM}}, + pages="1-8", year = {2011}, publisher = {New York, USA} } @@ -113,13 +118,18 @@ NOTE = "Habilitation to Direct Research" TITLE ="An Extension of {J}ohnson's results on Job-Lot Scheduling", JOURNAL ="Naval Research Logistis Quarterly", YEAR ="1956", - NOTE ="3:3" + pages="61-68", + volume ="3", +number="3", } @ARTICLE{ch8:LGMitten_1959, AUTHOR ="L. G. Mitten", TITLE ="Sequencing $n$ jobs on two machines with arbitrary time lags", JOURNAL ="Management Science", + volume="5", +number="3", +pages="293-298", YEAR ="1959" } @@ -128,6 +138,7 @@ NOTE = "Habilitation to Direct Research" title = {A grid-enabled branch and bound algorithm for solving challenging combinatorial optimization problems}, booktitle = {{Proceedings of 21th IEEE International Parallel and Distributed Processing Symposium (IPDPS)}}, year = {2007}, +pages = "1-9", month = {March}, publisher = {Long Beach, California} } -- 2.39.5