From 41b7ae7deef002c6c884c2d7cf428d3f9303adf3 Mon Sep 17 00:00:00 2001 From: couturie Date: Wed, 1 May 2013 17:51:46 +0200 Subject: [PATCH 01/16] correct --- BookGPU/BookGPU.tex | 4 ++-- BookGPU/Chapters/chapter14/ch14.tex | 4 ++-- BookGPU/Chapters/chapter17/biblio.bib | 20 ++++++++++---------- BookGPU/Chapters/chapter17/ch17.tex | 4 ++-- BookGPU/Chapters/chapter18/biblio18.bib | 4 ++-- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index bc6236c..0c854a5 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -196,11 +196,11 @@ \include{Chapters/chapter11/ch11} \include{Chapters/chapter12/ch12} \include{Chapters/chapter13/ch13} -\include{Chapters/chapter14/ch14} %index +\include{Chapters/chapter14/ch14} \include{Chapters/chapter15/ch15} \include{Chapters/chapter16/ch16} \part{Other} -\include{Chapters/chapter17/ch17} %index +\include{Chapters/chapter17/ch17} \include{Chapters/chapter18/ch18} \include{Chapters/chapter19/ch19} diff --git a/BookGPU/Chapters/chapter14/ch14.tex b/BookGPU/Chapters/chapter14/ch14.tex index 9759ba7..d50db31 100755 --- a/BookGPU/Chapters/chapter14/ch14.tex +++ b/BookGPU/Chapters/chapter14/ch14.tex @@ -7,7 +7,7 @@ application} \section{Introduction} -The lattice Boltzmann (LB) method (for an overview see, e.g., +The lattice Boltzmann (LB) method \index{Lattice Boltzmann method} (for an overview see, e.g., \cite{succi-book}) has become a popular approach to a variety of fluid dynamics problems. It provides a way to solve the incompressible, isothermal Navier-Stokes equations and has the attractive features of @@ -55,7 +55,7 @@ and particle suspensions, and typically require additional physics beyond the bare Navier-Stokes equations to provide a full description~\cite{aidun2010}. The representation of this extra physics raises additional design questions for the application -programmer. Here, we consider the \textit{Ludwig} code \cite{desplat}, +programmer. 
Here, we consider the \textit{Ludwig} code \cite{desplat}\index{Ludwig code}, an LB application developed specifically for complex fluids (\textit{Ludwig} was named for Boltzmann, 1844--1906). We will present the steps diff --git a/BookGPU/Chapters/chapter17/biblio.bib b/BookGPU/Chapters/chapter17/biblio.bib index 575f9cf..f049a24 100644 --- a/BookGPU/Chapters/chapter17/biblio.bib +++ b/BookGPU/Chapters/chapter17/biblio.bib @@ -2,7 +2,7 @@ % Encoding: ISO8859_1 @incollection{Odell:2003:RRD:1807559.1807562, - author = {Odell, James J. and Van Dyke Parunak, H. and Fleischer, Mitchell}, + author = {Odell, J. J. and Van Dyke Parunak, H. and Fleischer, M.}, chapter = {The role of roles in designing effective agent organizations}, title = {Software engineering for large-scale multi-agent systems}, editor = {Garcia, Alessandro and Lucena, Carlos and Zambonelli, Franco and Omicini, Andrea and Castro, Jaelson}, @@ -119,7 +119,7 @@ } @ARTICLE{Bleiweiss_2008, - author = {Bleiweiss, Avi}, + author = {Bleiweiss, A.}, title = {Multi Agent Navigation on the GPU}, journal = {GDC09 Game Developers Conference 2009}, year = {2008} @@ -348,8 +348,8 @@ } @INPROCEEDINGS{Silveira:2010:PRG:1948395.1948446, - author = {Silveira, Renato and Fischer, Leonardo and Ferreira, Jos\'{e} Ant\^{o}nio - Salini and Prestes, Edson and Nedel, Luciana}, + author = {Silveira, R. and Fischer, L. and Ferreira, J. A. + S. and Prestes, E. and Nedel, L.}, title = {Path-planning for RTS games based on potential fields}, booktitle = {Proceedings of the Third international conference on Motion in games}, year = {2010}, @@ -569,7 +569,7 @@ OPTaddress = {London} @InProceedings{Cosenza2011, -author = {Cosenza,B and Cordasco, G and De Chiara, R. and Scarano, V.}, +author = {Cosenza, B. and Cordasco, G. and De Chiara, R. 
and Scarano, V.}, title = {Distributed Load Balancing for Parallel Agent-based Simulations}, booktitle = {19th Euromicro International Conference on Parallel, Distributed and Network-Based Computing}, year = {2011}, @@ -577,10 +577,10 @@ address = {Ayia Napa, Cyprus} } @article{Chuffart2010, - author = {Chuffart, F and - Dumoulin, N and - Faure, T and - Deffuant, G}, + author = {Chuffart, F. and + Dumoulin, N. and + Faure, T. and + Deffuant, G.}, title = {SimExplorer: Programming Experimental Designs on Models and Managing Quality of Modelling Process}, journal = {IJAEIS}, @@ -648,7 +648,7 @@ year={2011} } @inproceedings{Aaby10, - author = {Aaby, Brandon G. and Perumalla, Kalyan S. and Seal, Sudip K.}, + author = {Aaby, B. G. and Perumalla, K. S. and Seal, S. K.}, title = {Efficient simulation of agent-based models on multi-GPU and multi-core clusters}, booktitle = {Proceedings of the 3rd International ICST Conference on Simulation Tools and Techniques}, series = {SIMUTools '10}, diff --git a/BookGPU/Chapters/chapter17/ch17.tex b/BookGPU/Chapters/chapter17/ch17.tex index 80da091..28e8251 100755 --- a/BookGPU/Chapters/chapter17/ch17.tex +++ b/BookGPU/Chapters/chapter17/ch17.tex @@ -47,7 +47,7 @@ solution to increase simulation performance but Graphical Processing Units (GPU) are also a promising technology with an attractive performance/cost ratio. -Conceptually a MAS is a distributed system as it favors the definition +Conceptually a MAS\index{Multi-Agent System} is a distributed system as it favors the definition and description of large sets of individuals, the agents, that can be run in parallel. As a large set of agents could have the same behavior a SIMD model should fit the simulation execution. Most of the @@ -299,7 +299,7 @@ collembolas in fields and forests. It is based on a diffusion algorithm which illustrates the case of agents with a simple behavior and few synchronization problems. 
-\subsection{The Collembola model} +\subsection{The Collembola model\index{Collembola model}} \label{ch17:subsec:collembolamodel} The Collembola model is an example of multi-agent system using GIS (Geographical Information System) diff --git a/BookGPU/Chapters/chapter18/biblio18.bib b/BookGPU/Chapters/chapter18/biblio18.bib index 2d0aed1..a2d65e4 100644 --- a/BookGPU/Chapters/chapter18/biblio18.bib +++ b/BookGPU/Chapters/chapter18/biblio18.bib @@ -48,7 +48,7 @@ @Book{Goldreich, - author = {Oded Goldreich}, + author = {O. Goldreich}, ALTeditor = {}, title = {Foundations of Cryptography: Basic Tools}, publisher = {Cambridge University Press}, @@ -58,7 +58,7 @@ @book{kellert1994wake, title={In the wake of chaos: unpredictable order in dynamical systems}, - author={Kellert, S.H.}, + author={Kellert, S. H.}, isbn={9780226429762}, lccn={lc92030355}, series={Science and its conceptual foundations}, -- 2.39.5 From c8aecc94debd6aebfcbdcb9813ebd4a0f7de093e Mon Sep 17 00:00:00 2001 From: couturie Date: Wed, 1 May 2013 18:14:17 +0200 Subject: [PATCH 02/16] small correc --- BookGPU/Chapters/chapter13/biblio13.bib | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/BookGPU/Chapters/chapter13/biblio13.bib b/BookGPU/Chapters/chapter13/biblio13.bib index 1ccc901..0903bfe 100644 --- a/BookGPU/Chapters/chapter13/biblio13.bib +++ b/BookGPU/Chapters/chapter13/biblio13.bib @@ -2,10 +2,10 @@ title = {Asynchronous grid computation for {A}merican options derivatives}, author = {Chau, M. and Couturier, R. and Bahi, J. M. and Spiteri, P.}, journal = {Advances in Engineering Software}, -volume = {}, -number = {0}, -pages = {}, -note = {}, +volume = {***-***}, +number = {***-***}, +pages = {***-***}, +note = {Online version first}, year = {2012}, } @@ -40,13 +40,10 @@ year = {}, year = {1989}, } -@article{ch13:ref5, +@book{ch13:ref5, author = {Bahi, J. M. and Contassot-Vivier, S. 
and Couturier, R.}, title = {Parallel iterative algorithms: from sequential to grid computing}, journal = {Chapman \& Hall/CRC, Numerical Analysis \& Scientific Computating 1 (2007)}, - volume = {}, - number = {}, - pages = {}, year = {2007}, } -- 2.39.5 From 5d35abe2b7e45bef96c118ab97504d14cf0620f3 Mon Sep 17 00:00:00 2001 From: couturie Date: Wed, 1 May 2013 19:05:53 +0200 Subject: [PATCH 03/16] new --- BookGPU/Chapters/chapter12/ch12.tex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BookGPU/Chapters/chapter12/ch12.tex b/BookGPU/Chapters/chapter12/ch12.tex index 254c0cb..63eae8c 100755 --- a/BookGPU/Chapters/chapter12/ch12.tex +++ b/BookGPU/Chapters/chapter12/ch12.tex @@ -1,4 +1,4 @@ -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +1%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% %% %% CHAPTER 12 %% %% %% -- 2.39.5 From 6f31b862f879417ba764d85798f4ce9197c6cfe6 Mon Sep 17 00:00:00 2001 From: Raphael Couturier Date: Fri, 3 May 2013 09:58:39 +0200 Subject: [PATCH 04/16] new --- BookGPU/BookGPU.tex | 2 +- BookGPU/Chapters/chapter12/ch12.tex | 28 +++++++++++++++------------- BookGPU/Chapters/chapter19/ch19.tex | 15 ++++++++------- BookGPU/Makefile | 3 +++ 4 files changed, 27 insertions(+), 21 deletions(-) diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index 0c854a5..77937b8 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -147,7 +147,7 @@ \makeindex -%\includeonly{Chapters/chapter10/ch10} +%\includeonly{Chapters/chapter19/ch19} \begin{document} diff --git a/BookGPU/Chapters/chapter12/ch12.tex b/BookGPU/Chapters/chapter12/ch12.tex index 63eae8c..8b869b3 100755 --- a/BookGPU/Chapters/chapter12/ch12.tex +++ b/BookGPU/Chapters/chapter12/ch12.tex @@ -729,19 +729,6 @@ initial matrix. \label{ch12:tab:04} \end{table} -We have used the parallel CG and GMRES algorithms for solving sparse linear systems of $25$ -million unknown values. The sparse matrices associated to these linear systems are generated -from those presented in Table~\ref{ch12:tab:01}. 
Their main characteristics are given in Table~\ref{ch12:tab:04}. -Tables~\ref{ch12:tab:05} and~\ref{ch12:tab:06} shows the performances of the parallel CG and -GMRES solvers, respectively, obtained on a cluster of $24$ CPU cores and on a cluster of $12$ -GPUs. Obviously, we can notice from these tables that solving large sparse linear systems on -a GPU cluster is more efficient than on a CPU cluster (see relative gains $\tau$). We can also -notice that the execution times of the CG method, whether in a CPU cluster or in a GPU cluster, -are better than those of the GMRES method for solving large symmetric linear systems. In fact, the -CG method is characterized by a better convergence\index{Convergence} rate and a shorter execution -time of an iteration than those of the GMRES method. Moreover, an iteration of the parallel GMRES -method requires more data exchanges between computing nodes compared to the parallel CG method. - \begin{table} \begin{center} \begin{tabular}{|c|c|c|c|c|c|c|} @@ -802,6 +789,21 @@ on a cluster of 12 GPUs.} \end{center} \end{table} + +We have used the parallel CG and GMRES algorithms for solving sparse linear systems of $25$ +million unknown values. The sparse matrices associated to these linear systems are generated +from those presented in Table~\ref{ch12:tab:01}. Their main characteristics are given in Table~\ref{ch12:tab:04}. +Tables~\ref{ch12:tab:05} and~\ref{ch12:tab:06} shows the performances of the parallel CG and +GMRES solvers, respectively, obtained on a cluster of $24$ CPU cores and on a cluster of $12$ +GPUs. Obviously, we can notice from these tables that solving large sparse linear systems on +a GPU cluster is more efficient than on a CPU cluster (see relative gains $\tau$). We can also +notice that the execution times of the CG method, whether in a CPU cluster or in a GPU cluster, +are better than those of the GMRES method for solving large symmetric linear systems. 
In fact, the +CG method is characterized by a better convergence\index{Convergence} rate and a shorter execution +time of an iteration than those of the GMRES method. Moreover, an iteration of the parallel GMRES +method requires more data exchanges between computing nodes compared to the parallel CG method. + + %%--------------------------%% %% SECTION 5 %% %%--------------------------%% diff --git a/BookGPU/Chapters/chapter19/ch19.tex b/BookGPU/Chapters/chapter19/ch19.tex index ea65a11..e25a23a 100755 --- a/BookGPU/Chapters/chapter19/ch19.tex +++ b/BookGPU/Chapters/chapter19/ch19.tex @@ -1,4 +1,5 @@ -\chapterauthor{Bertil Schmidt, Hoang-Vu Dang}{Johannes Gutenberg University of Mainz} +\chapterauthor{Bertil Schmidt}{Johannes Gutenberg University of Mainz} +\chapterauthor{Hoang-Vu Dang}{Johannes Gutenberg University of Mainz} \chapter{Solving large sparse linear systems for integer factorization on GPUs} \label{chapter19} @@ -101,7 +102,7 @@ We can treat $x$, $y$ and $S_i$ as vectors of block width $m$ or $n$. Assuming $ \subsubsection*{Coordinate list (COO)\index{Compressed storage format!COO}} For each non-zero, both its column and row indices are explicitly stored. The Cusp implementation \cite{ch19:cusp} stores elements in sorted order of row indices ensuring that entries with the same row index are stored contiguously. - \begin{lstlisting} + \begin{lstlisting}[caption={}] coo.row_index = {0, 1, 1, 2, 2, 3, 3, 3, 4, 5, 5} coo.col_index = {2, 1, 4, 1, 3, 0, 2, 4, 2, 0, 5} coo.value = {3, 1, 5, 2, 4, 6, 8, 10, 9, 7, 11} @@ -109,7 +110,7 @@ For each non-zero, both its column and row indices are explicitly stored. The Cu \subsubsection*{Compressed Sparse Row (CSR)\index{Compressed storage format!CSR}} Non-zeros are sorted by the row index, and only their column indices are explicitly stored in a column array. Additionally, the vector $row\_start$ stores indices of the first non-zero element of each row in the column array. 
- \begin{lstlisting} + \begin{lstlisting}[caption={}] csr.row_start = {0, 1, 3, 5, 8, 9, 12} csr.col_index = {2, 1, 4, 1, 3, 0, 2, 4, 2, 0, 5} csr.value = {3, 1, 5, 2, 4, 6, 8, 10, 9, 7, 11} @@ -117,7 +118,7 @@ For each non-zero, both its column and row indices are explicitly stored. The Cu \subsubsection*{Ellpack (ELL)\index{Compressed storage format!ELL}} Let $K$ be the maximum number of non-zero elements in any row of the matrix. Then, for each row, ELL stores exactly $K$ elements (extra padding is required for rows that contain less than $K$ non-zero elements). Only column indices are required to store in an array, the row index can be implied since exactly $K$ elements are stored per row. The Cusp implementation store the column indices in a transposed manner so that consecutive threads can access consecutive memory addresses. - \begin{lstlisting} + \begin{lstlisting}[caption={}] ell.col_index = { 2, 1, 1, 0, 2, 0, *, 4, 3, 2, *, 5, @@ -129,7 +130,7 @@ For each non-zero, both its column and row indices are explicitly stored. The Cu \end{lstlisting} \subsubsection*{Hybrid (HYB)\index{Compressed storage format!HYB}} The HYB format heuristically computes an value $K$ and store $K$ non-zeros per rows in the ELL format. When a row has more than $K$ non-zeros, the trailing non-zeros are stored in COO. This design decreases the storage overhead due to ELL padding elements and thus improve the overall performance. - \begin{lstlisting} + \begin{lstlisting}[caption={}] hyb.nnz_per_row = 2 hyb.ell.col_index = {2, 1, 1, 0, 2, 0, *, 4, 3, 2, *, 5} hyb.ell.value = {3, 1, 2, 6, 9, 7, *, 5, 4, 8, *, 11} @@ -139,7 +140,7 @@ For each non-zero, both its column and row indices are explicitly stored. The Cu \end{lstlisting} \subsubsection*{Sliced Ellpack (SLE)\index{Compressed storage format!SLE}} This format partitions the matrix into horizontal slices of $S$ adjacent rows \cite{ch19:sle}. Each slice is stored in ELLPACK format. 
The maximum number of non-zeros may be different for each slice. An additional array $slice\_start$ is used to index the first element in each slice. The matrix rows are usually sorted by the number of non-zeros per row in order to move rows with similar number of non-zeros together. - \begin{lstlisting} + \begin{lstlisting}[caption={}] sle.slice_size = 2 sle.col_index = { 2, 1, *, 4, @@ -453,7 +454,7 @@ We use the Intel MKL library 10.3 in order to compare SCOO performance to an opt \label{ch19:conclusion} In this chapter, we have presented our implementation of iterative SpMV for NFS matrices on GPUs with the CUDA programming language. Our GPU implementation takes advantage of the variety of sparseness properties in NFS matrices to produce suitable formats for different parts. The GPU implementation shows promising improvement over an optimized CPU implementation. As the size of integers in factorization projects is expected to increase further, the linear algebrea step of NFS will become an even bigger bottleneck. The size and sparseness of matrices generated by the NFS sieving step are growing significantly with the size of the integer to be factored. Thus, a big GPU cluster is required to accelerate the linear algebra step. However, in order to achieve scalability for bigger problem sizes, the amount of GPU RAM and data transfer bandwidth need to be increased in addition to the number of GPUs. -We further adapted the proposed Sliced COO format to single-precision floating-point numbers and evaluated it with large and sparse matrices derived from other computational science applications. We have published our code at [-to be updated-]. +We further adapted the proposed Sliced COO format to single-precision floating-point numbers and evaluated it with large and sparse matrices derived from other computational science applications. We have published our code at https://github.com/danghvu/cudaSpmv. 
\putbib[Chapters/chapter19/biblio] diff --git a/BookGPU/Makefile b/BookGPU/Makefile index e6e4e03..22e9fd7 100644 --- a/BookGPU/Makefile +++ b/BookGPU/Makefile @@ -1,6 +1,9 @@ BOOK=BookGPU + +#page 124 pdf, figures pas belles + all: pdflatex ${BOOK} pdflatex ${BOOK} -- 2.39.5 From 2a52431cae6d35c53f8db5003df57d95bd2f635f Mon Sep 17 00:00:00 2001 From: couturie Date: Sun, 5 May 2013 14:33:50 +0200 Subject: [PATCH 05/16] correct ch17 --- BookGPU/Chapters/chapter17/ch17.tex | 4 ++-- BookGPU/Chapters/chapter17/code/collem_kernel_diffuse.cl | 3 ++- BookGPU/Chapters/chapter17/code/mior_launcher.java | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/BookGPU/Chapters/chapter17/ch17.tex b/BookGPU/Chapters/chapter17/ch17.tex index 28e8251..4aa06b8 100755 --- a/BookGPU/Chapters/chapter17/ch17.tex +++ b/BookGPU/Chapters/chapter17/ch17.tex @@ -1,7 +1,7 @@ \chapterauthor{Guillaume Laville}{Femto-ST Institute, University of Franche-Comte, France} \chapterauthor{Christophe Lang}{Femto-ST Institute, University of Franche-Comte, France} -\chapterauthor{Kamel Mazouzi}{Femto-ST Institute, University of Franche-Comte, France} -\chapterauthor{Nicolas Marilleau}{Femto-ST Institute, University of Franche-Comte, France} +\chapterauthor{Kamel Mazouzi}{Franche-Comte Computing Center, University of Franche-Comte, France} +\chapterauthor{Nicolas Marilleau}{UMMISCO, Institut de Recherche pour le Développement (IRD), France} \chapterauthor{Bénédicte Herrmann}{Femto-ST Institute, University of Franche-Comte, France} \chapterauthor{Laurent Philippe}{Femto-ST Institute, University of Franche-Comte, France} diff --git a/BookGPU/Chapters/chapter17/code/collem_kernel_diffuse.cl b/BookGPU/Chapters/chapter17/code/collem_kernel_diffuse.cl index 93de319..2bfa979 100644 --- a/BookGPU/Chapters/chapter17/code/collem_kernel_diffuse.cl +++ b/BookGPU/Chapters/chapter17/code/collem_kernel_diffuse.cl @@ -15,7 +15,8 @@ kernel void diffuse( // _syncthreads() in CUDA 
barrier(CLK_GLOBAL_MEM_FENCE); - // Retrieve neighbors surplus and add them to the current cell population + // Retrieve neighbors surplus and add them to the current cell + // population int surplus = OVERFLOW(world, overflows, i + 1, j) + OVERFLOW(world, overflows, i - 1, j) + OVERFLOW(world, overflows, i, j - 1) + OVERFLOW(world, overflows, i, j + 1); CELL(world, newPopulations, i, j) = CELL(world, populations, i, j) + surplus / 8.0 - OVERFLOW(world, overflows, i, j); diff --git a/BookGPU/Chapters/chapter17/code/mior_launcher.java b/BookGPU/Chapters/chapter17/code/mior_launcher.java index 3b648eb..159b88a 100644 --- a/BookGPU/Chapters/chapter17/code/mior_launcher.java +++ b/BookGPU/Chapters/chapter17/code/mior_launcher.java @@ -3,7 +3,8 @@ protected void doLiveImpl() { simulateKernel.setArguments(mmMem, omMem, worldsMem, mmOffsetsMem, omOffsetsMem, mmCSRMem, omCSRMem, partsMem); if (blockSize < Math.max(nbOM, nbMM)) { - throw new RuntimeException("blockSize (" + blockSize + ") too small to execute the simulation"); + throw new RuntimeException("blockSize (" + blockSize + + ") too small to execute the simulation"); } OCLEvent event = queue.enqueue1DKernel(simulateKernel, nbSim * blockSize, blockSize); @@ -15,6 +16,5 @@ protected void doLiveImpl() { queue.blockingReadBuffer(mmMem, mmList, 0, mmMem.getSize()); queue.blockingReadBuffer(omMem, omList, 0, omMem.getSize()); queue.blockingReadBuffer(worldsMem, worlds, 0, worldsMem.getSize()); - System.out.println("copy"); } } -- 2.39.5 From 8db670a863cf32ab3e5fbd99ecc161d6317c763f Mon Sep 17 00:00:00 2001 From: couturie Date: Sun, 5 May 2013 17:54:18 +0200 Subject: [PATCH 06/16] correct --- BookGPU/BookGPU.tex | 4 +- BookGPU/Chapters/chapter11/biblio11.bib | 42 +++++++++---------- BookGPU/Chapters/chapter11/code3.cu | 5 ++- BookGPU/Chapters/chapter14/biblio14.bib | 20 ++++----- BookGPU/Chapters/chapter15/biblio.bib | 14 +++---- BookGPU/Chapters/chapter15/ch15.tex | 19 +++++---- BookGPU/Chapters/chapter19/code.cu | 7 
+++- BookGPU/Chapters/chapter7/biblio7.bib | 54 ++++++++++++------------- BookGPU/Chapters/chapter7/ch7.tex | 3 +- BookGPU/Chapters/chapter8/biblio8.bib | 20 ++++----- BookGPU/Chapters/chapter9/biblio9.bib | 4 +- 11 files changed, 98 insertions(+), 94 deletions(-) diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index 77937b8..1ca9c1a 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -147,7 +147,7 @@ \makeindex -%\includeonly{Chapters/chapter19/ch19} +%\includeonly{Chapters/chapter15/ch15} \begin{document} @@ -189,7 +189,7 @@ \part{Optimization} \include{Chapters/chapter8/ch8} \include{Chapters/chapter9/ch9} -\include{Chapters/chapter10/ch10} +\include{Chapters/chapter10/ch10} %revoir ce chapitre \part{Numerical applications} \include{Chapters/chapter7/ch7} diff --git a/BookGPU/Chapters/chapter11/biblio11.bib b/BookGPU/Chapters/chapter11/biblio11.bib index a39c5ce..c6d75ac 100644 --- a/BookGPU/Chapters/chapter11/biblio11.bib +++ b/BookGPU/Chapters/chapter11/biblio11.bib @@ -17,7 +17,7 @@ } @article{tridiag_GPU, - author = {G\"ddeke , D. and Strzodka, R.}, + author = {G\"oddeke , D. and Strzodka, R.}, title = {Cyclic Reduction Tridiagonal Solvers on {GPU}s Applied to Mixed Precision Multigrid}, journal = { IEEE Transactions on Parallel and Distributed Systems}, volume = {22}, @@ -56,7 +56,7 @@ } @inproceedings{Abbas2011, - author = {Abbas, M. and Majid, A.A. and Awang, M.N.H. and Ali, J.M.}, + author = {Abbas, M. and Majid, A. A. and Awang, M. N. H. and Ali, J. M.}, title = {Monotonicity Preserving Interpolation using Rational Spline}, booktitle = {International MultiConference of Engineers and Computer Scientists (IMECS '11)}, address = {Hong Kong}, @@ -66,7 +66,7 @@ } @article{abel1826, - author = {Abel, N.H.}, + author = {Abel, N. 
H.}, title = {Untersuchungen der Funktionen zweier unabh\"angigen ver\"anderlichen Gr\"ossen $x$ und $y$ wie $f(x, y)$, welche die Eigenschaft haben, dass $f(z, f(x, y))$ eine symmetrische Funktion von $x, y$ und $z$ ist.}, journal = {J. Reine Angew. Math.}, volume = {1}, @@ -143,7 +143,7 @@ year = {1984} } - author = {Adams, M.J.}, + author = {Adams, M. J.}, title = {Chemometrics in Analytical Spectroscopy}, publisher = {Royal Society of Chemistry}, address = {London}, @@ -2557,7 +2557,7 @@ Contents(R)/Clinical Medicine. } @article{Bellman1973, - author = {Bellman, R.E. and Giertz, M.}, + author = {Bellman, R. E. and Giertz, M.}, title = {On the analytic formalism of the theory of fuzzy sets}, journal = {Inform. Sci.}, volume = {5}, @@ -2566,7 +2566,7 @@ Contents(R)/Clinical Medicine. } @article{Bellman1970_MS, - author = {Bellman, R.E. and Zadeh, L.}, + author = {Bellman, R. E. and Zadeh, L.}, title = {Decisionmaking in a fuzzy environment}, journal = {Management Science}, volume = {17}, @@ -2894,7 +2894,7 @@ constraint}, } @article{Blum1973, - author = {Blum, M. and Floyd, R.W. and Watt, V. and Rive, R.L. and Tarjan, R.E.}, + author = {Blum, M. and Floyd, R. W. and Watt, V. and Rive, R. L. and Tarjan, R. E.}, title = {Time Bounds for Selection}, journal = {Journal of Computer and System Sciences}, volume = {7}, @@ -3524,7 +3524,7 @@ Royal Australian Chemical Institute; RACI}, } @article{Carlson1985_SIAM, - author = {Carlson, R.E. and Fritsch, F.N.}, + author = {Carlson, R. E. and Fritsch, F. N.}, title = {Monotone piecewise bicubic interpolation}, journal = {SIAM J. Numer. Anal.}, volume = {22}, @@ -6672,7 +6672,7 @@ Schrödinger equation}, } @article{Fritsch1980, - author = {Fritsch, F.N. and Carlson, R.E.}, + author = {Fritsch, F. N. and Carlson, R. E.}, title = {Monotone pieceuiuse cubic interpolation}, journal = {SIAM J. Numer. Anal.}, volume = {17 }, @@ -7443,7 +7443,7 @@ Genetic algorithms.}, } @article{Gregory1982, - author = {Gregory, J.A. 
and Delbourgo, R.}, + author = {Gregory, J. A. and Delbourgo, R.}, title = {Piecewise rational quadratic interpolation to monotonic data}, journal = {IMA Journal of Numerical Analysis}, volume = {2}, @@ -9120,7 +9120,7 @@ review}, } @article{Jupp_1978, - author = {Jupp, D.L.B.}, + author = {Jupp, D. L. B.}, title = {Approximation to data by splines with free knots}, journal = {SIAM J. Numer. Anal.}, volume = {15 }, @@ -9302,7 +9302,7 @@ ELECTR STRUCT and LASER, POB 1527, GR-71110 IRAKLION.}, } @article{Kearsley_2006, - author = {Kearsley, A.J.}, + author = {Kearsley, A. J.}, title = {Projections Onto Order Simplexes and Isotonic Regression}, journal = {J. Res. Natl. Inst. Stand. Technol.}, volume = {111}, @@ -9311,7 +9311,7 @@ ELECTR STRUCT and LASER, POB 1527, GR-71110 IRAKLION.}, } @article{Kelley1960_siam, - author = {Kelley, J.E.}, + author = {Kelley, J. E.}, title = {The cutting-plane method for solving convex programs}, journal = {J. of SIAM}, volume = {8}, @@ -10825,7 +10825,7 @@ PRINCETON, NJ 08544, USA.}, } @article{Lyche1973, - author = {Lyche, T. and Schumaker, L.L.}, + author = {Lyche, T. and Schumaker, L. L.}, title = {Computation of smoothing and interpolating naturel splines via local bases}, journal = {SIAM J. Numer. Anal.}, volume = {10}, @@ -11342,7 +11342,7 @@ undergraduate text}, } @article{McAllister1981_ACM, - author = {McAllister, D.F. and Roulier, J.A.}, + author = {McAllister, D. F. and Roulier, J. A.}, title = {An algorithm for computing a shape-preserving oscillatory quadratic spline}, journal = {ACM Trans. Math. Software }, volume = {7}, @@ -13486,7 +13486,7 @@ IRAKLION, GREECE.}, } @book{Robertson_book, - author = {Robertson, T. and Wright, F.T. and Dykstra, R.L.}, + author = {Robertson, T. and Wright, F. T. and Dykstra, R. 
L.}, title = {Order Restricted Statistical Inference}, publisher = {Wiley}, address = {Chichester ; New York}, @@ -14261,7 +14261,7 @@ Reprint available from: Schoen F UNIV FLORENCE FLORENCE ITALY}, } @book{Schumaker1981_book, - author = {Schumaker, L.L.}, + author = {Schumaker, L. L.}, title = {Spline functions: Basic theory}, publisher = {Wiley}, address = {New York}, @@ -14269,7 +14269,7 @@ Reprint available from: Schoen F UNIV FLORENCE FLORENCE ITALY}, } @article{Schumaker1983, - author = {Schumaker, L.L.}, + author = {Schumaker, L. L.}, title = {On Shape Preserving Quadratic Spline Interpolation}, journal = {SIAM Journal on Numerical Analysis}, volume = {20}, @@ -14298,7 +14298,7 @@ Princeton, NJ 08544, USA, .}, } @article{Sch, - author = {Schweikert, D.G.}, + author = {Schweikert, D. G.}, title = {An interpolation curve using a spline in tension}, journal = {J. Math. Phys. }, volume = {45}, @@ -14307,7 +14307,7 @@ Princeton, NJ 08544, USA, .}, } @article{Schweikert1966, - author = {Schweikert, D.G.}, + author = {Schweikert, D. G.}, title = {An interpolation curve using a spline in tension}, journal = {J. Math. Phys.}, volume = {45}, @@ -15640,7 +15640,7 @@ United States}, year = {2000} } - author = {Warren, J. and Noone, J. and Smith, B. and Ruffin, R. and Frith, P. and van der Zwaag, B. and Beliakov, G. and Frankel, H.}, + author = {Warren, J. and Noone, J. and Smith, B. and Ruffin, R. and Frith, P. and Van der Zwaag, B. and Beliakov, G. 
and Frankel, H.}, title = {Automated Attention Flags in Chronic Disease Care Planning}, journal = {Ausralian Medical Journal}, volume = {175}, diff --git a/BookGPU/Chapters/chapter11/code3.cu b/BookGPU/Chapters/chapter11/code3.cu index 37d8409..5da77e6 100644 --- a/BookGPU/Chapters/chapter11/code3.cu +++ b/BookGPU/Chapters/chapter11/code3.cu @@ -11,8 +11,9 @@ __device__ void Bisection_device(T z, T* t, int mi,int ma,int* l) *l = mi-1; } -/* Kernel to evaluates monotone spline for a sequence of query points residing in the array z of size m -*/ +// Kernel to evaluate monotone spline for a sequence of query points +// residing in the array z of size m + template __global__ void d_MonSplineValue(Tx* z, int K, double* t, double * alpha, double * beta, double * gamma, int T, Ty *value) { diff --git a/BookGPU/Chapters/chapter14/biblio14.bib b/BookGPU/Chapters/chapter14/biblio14.bib index b5e4141..ec060bc 100644 --- a/BookGPU/Chapters/chapter14/biblio14.bib +++ b/BookGPU/Chapters/chapter14/biblio14.bib @@ -16,7 +16,7 @@ } @Article{aidun2010, - author = { Aidun, C.K. and Clausen, J.R.}, + author = { Aidun, C. K. and Clausen, J. R.}, title = {Lattice Boltzmann method for complex flows}, journal = {Ann. Rev. Fluid Mech.}, year = {2010}, @@ -35,7 +35,7 @@ } @Article{wei2004, - author = {Wei, X. and Li, W. and M\"uller, K. and Kaufman, A.E.}, + author = {Wei, X. and Li, W. and M\"uller, K. and Kaufman, A. E.}, title = {The lattice Boltzmann method for simulating gaseous phenomena}, journal = {IEEE Transactions on Visualization and Computer Graphics}, year = {2004}, @@ -83,7 +83,7 @@ @Article{myre2011, - author = { Myre, J. and Walsh, S.D.C. and Lilja, D. and Saar, M.O.}, + author = { Myre, J. and Walsh, S. D. C. and Lilja, D. and Saar, M. O.}, title = {Performance analysis of single-phase, multiphase, and multicomponent lattice Boltzmann fluid flow simulations on GPU clusters}, journal = {Concurrency Computat.: Pract. 
Exper.}, year = {2011}, @@ -102,7 +102,7 @@ } @Article{bernaschi2010, - author = {Bernaschi, M. and Fatica, M. and Melchionna, S. and Succi, S; and Kaxiras, E.}, + author = {Bernaschi, M. and Fatica, M. and Melchionna, S. and Succi, S. and Kaxiras, E.}, title = {A flexible high-performance lattice Boltzmann GPU code for the simulations of fluid flow in complex geometries}, journal = {Concurrency Computat.: Pract. Exper.}, @@ -170,7 +170,7 @@ lattice Boltzmann method on GPU cluster}, } @Article{walshsaar2012, - author = {Walsh, S.D.C. and Saar, M.O.}, + author = {Walsh, S. D. C. and Saar, M. O.}, title = {Developing extensible lattice Boltzmann simulators for general-purpose graphics-processing units}, journal = {Comm. Comput. Phys.}, year = {2013}, @@ -203,7 +203,7 @@ hierarchical and distributed auto-tuning}, @Article{ladd1994, - author = {Ladd, A.J.C.}, + author = {Ladd, A. J. C.}, title = {Numerical simulations of particle suspensions via a discretized Boltzmann equation. Part 1. Theoretical foundation and Part II. Numerical results}, journal = {J. Fluid Mech.}, year = {1994}, @@ -214,7 +214,7 @@ hierarchical and distributed auto-tuning}, @Article{nguyen2002, - author = {Nguyen, N.-Q. and Ladd, A.J.C.}, + author = {Nguyen, N.-Q. and Ladd, A. J. C.}, title = {Lubrication corrections for lattice Boltzmann simulations of particle suspensions}, journal = {Phys. Rev. E}, year = {2002}, @@ -223,7 +223,7 @@ hierarchical and distributed auto-tuning}, } @Article{ch14:immersed1, - author = {Peskin, C.S.}, + author = {Peskin, C. S.}, title = {Flow patterns around heart valves; a numerical method}, journal = {J. Comp. Phys.}, year = {1972}, @@ -233,7 +233,7 @@ hierarchical and distributed auto-tuning}, } @Article{ch14:immersed2, - author = {Peskin, C.S.}, + author = {Peskin, C. 
S.}, title = {The immersed boundary method}, journal = {Acta Numerica}, year = {2002}, @@ -243,7 +243,7 @@ hierarchical and distributed auto-tuning}, } @Article{ch14:immersed-lb, - author = {Feng, Z.-G. and Michaelides, E.E}, + author = {Feng, Z.-G. and Michaelides, E. E.}, title = {The immersed boundary-lattice Boltzmann method for solving fluid-particles interaction problem}, journal = {J. Comp. Phys.}, diff --git a/BookGPU/Chapters/chapter15/biblio.bib b/BookGPU/Chapters/chapter15/biblio.bib index d0bb53f..8cb390d 100644 --- a/BookGPU/Chapters/chapter15/biblio.bib +++ b/BookGPU/Chapters/chapter15/biblio.bib @@ -1,5 +1,5 @@ @Inproceedings{PF_PDSEC2011, - author ={P. Fortin and R. Habel and F.~J\'ez\'equel and J.-L. Lamotte and N.S. Scott}, + author ={P. Fortin and R. Habel and F.~J\'ez\'equel and J.-L. Lamotte and N. S. Scott}, title = {Deployment on GPUs of an application in computational atomic physics}, booktitle = {{12th IEEE International Workshop on Parallel and Distributed Scientific and Engineering Computing (PDSEC) in conjunction with the 25th International Parallel and Distributed Processing Symposium (IPDPS)}}, year = 2011} @article{Burke_1987, -author={P.G. Burke and C.J. Noble and M.P. Scott}, +author={P. G. Burke and C. J. Noble and M. P. Scott}, title={{R-matrix theory of electron scattering at intermediate energies}}, journal={Proceedings of the Royal Society of London A}, volume=410, @@ -20,7 +20,7 @@ pages={287--310} % %Proc. Roy. Soc. A } @Article{2DRMP, -author ={N.S. Scott and M.P. Scott and P.G. Burke and T. Stitt and V. Faro-Maza and C. Denis and A. Maniopoulou}, +author ={N. S. Scott and M. P. Scott and P. G. Burke and T. Stitt and V. Faro-Maza and C. Denis and A. 
Maniopoulou}, title ={{2DRMP: A~suite of two-dimensional R-matrix propagation codes}}, journal ={Computer Physics Communications}, volume={180}, @@ -30,7 +30,7 @@ pages={2424--2449} %note={ISSN: {0010-4655}, doi:10.1016/j.cpc.2009.07.017} @Article{FARM_2DRMP, -author ={ V.M. Burke and C.J. Noble and V. Faro-Maza and A. Maniopoulou and N.S. Scott}, +author ={V. M. Burke and C. J. Noble and V. Faro-Maza and A. Maniopoulou and N. S. Scott}, title ={ {FARM\_2DRMP: a version of FARM for use with 2DRMP}}, journal ={Computer Physics Communications}, volume={180}, @@ -41,9 +41,9 @@ pages={2450--2451} @INPROCEEDINGS{VECPAR, author = {T. Stitt and - N.S. Scott and - M.P. Scott and - P.G. Burke}, + N. S. Scott and + M. P. Scott and + P. G. Burke}, title = {{2-D R-Matrix Propagation: A Large Scale Electron Scattering Simulation Dominated by the Multiplication of Dynamically Changing Matrices}}, diff --git a/BookGPU/Chapters/chapter15/ch15.tex b/BookGPU/Chapters/chapter15/ch15.tex index 9b0bf27..cf464c7 100644 --- a/BookGPU/Chapters/chapter15/ch15.tex +++ b/BookGPU/Chapters/chapter15/ch15.tex @@ -825,7 +825,7 @@ as the matrices increase in size during the propagation \section{Performance results} \subsection{PROP deployment on GPU} -\begin{table*}[ht] +\begin{table}[ht] \begin{center} \begin{tabular}{|c||c|c||} \hline @@ -849,13 +849,13 @@ GPU version & C1060 & C2050 \\ GPU V5 (\S~\ref{gpuv5}) & 24m27s & 12m39s \\ \hline \end{tabular} -\caption{\label{table:time} -Execution time of PROP on CPU and GPU} \end{center} -\end{table*} +\caption{Execution time of PROP on CPU and GPU} +\label{table:time} +\end{table} -\begin{comment} -\begin{table*}[ht] + +\begin{table}[ht] \begin{center} \begin{tabular}{|c||c|c||} \hline @@ -876,11 +876,10 @@ GPU version & C1060 & C2050 \\ GPU V5 (\ref{gpuv5}) & 24m27s & 12m39s \\ \hline \end{tabular} -\caption{\label{table:time} -Execution time of the successive GPU versions} \end{center} -\end{table*} -\end{comment} +\caption{Execution time of the 
successive GPU versions} +\label{table:time} +\end{table} \begin{figure}[h] \centering diff --git a/BookGPU/Chapters/chapter19/code.cu b/BookGPU/Chapters/chapter19/code.cu index 3a95e55..c7bad40 100644 --- a/BookGPU/Chapters/chapter19/code.cu +++ b/BookGPU/Chapters/chapter19/code.cu @@ -1,4 +1,5 @@ -// compute y = B*x (B is stored in SCOO formats [ cols, rows, values, offsets, numPacks, numRows ]) +// compute y = B*x (B is stored in SCOO formats [ cols, rows, values, +//offsets, numPacks, numRows ]) // LANE_SIZE = 2^k // NUM_ROWS_PER_SLICE is computed based on sparsity template @@ -13,7 +14,9 @@ sliced_coo_kernel( const float * x, float * y) { - const int thread_lane = threadIdx.x & (LANE_SIZE-1); // ~ threadIdx.x % LANE_SIZE + // ~ threadIdx.x % LANE_SIZE + const int thread_lane = threadIdx.x & (LANE_SIZE-1); + const int row_lane = threadIdx.x/(LANE_SIZE); __shared__ float sdata[NUM_ROWS_PER_SLICE][LANE_SIZE]; diff --git a/BookGPU/Chapters/chapter7/biblio7.bib b/BookGPU/Chapters/chapter7/biblio7.bib index 6d7e3f9..e48f2ce 100644 --- a/BookGPU/Chapters/chapter7/biblio7.bib +++ b/BookGPU/Chapters/chapter7/biblio7.bib @@ -37,7 +37,7 @@ DOI = "10.1002/fld.2675" } @ARTICLE{ch7:EHBM06, -AUTHOR = "Engsig-Karup, A.P. and Hesthaven, J.S. and Bingham, H.B. and Madsen, P.", +AUTHOR = "Engsig-Karup, A. P. and Hesthaven, J. S. and Bingham, H. B. and Madsen, P.", TITLE = "Nodal {DG-FEM} solutions of high-order {B}oussinesq-type equations", JOURNAL = JEM, YEAR = "2006", @@ -46,7 +46,7 @@ PAGES = "351--370" } @ARTICLE{ch7:EHBW08, -AUTHOR = "Engsig-Karup, A.P. and Hesthaven, J.S. and Bingham, H.B. and Warburton, T.", +AUTHOR = "Engsig-Karup, A. P. and Hesthaven, J. S. and Bingham, H. B. and Warburton, T.", TITLE = "{DG-FEM} solution for nonlinear wave-structure interaction using Boussinesq-type equations", JOURNAL = CE, YEAR = "2008", @@ -55,7 +55,7 @@ PAGES = "197--208" } @INPROCEEDINGS{ch7:Glimberg2011, - AUTHOR = {Stefan L. Glimberg and Allan P. Engsig-Karup and Morten G. 
Madsen}, + AUTHOR = {S. L. Glimberg and A. P. Engsig-Karup and M. G. Madsen}, TITLE = {A Fast GPU-accelerated Mixed-precision Strategy for Fully Nonlinear Water Wave Computations}, BOOKTITLE = {Numerical Mathematics and Advanced Applications 2011, Proceedings of ENUMATH 2011, the 9th European Conference on Numerical Mathematics and Advanced Applications, Leicester, September 2011}, YEAR = {2011}, @@ -64,7 +64,7 @@ PAGES = "197--208" } @ARTICLE{ch7:EBL08, -AUTHOR = "Engsig-Karup, A.P. and Bingham, H.B. and Lindberg, O.", +AUTHOR = "Engsig-Karup, A. P. and Bingham, H. B. and Lindberg, O.", TITLE = "An efficient flexible-order model for {3D} nonlinear water waves", YEAR = "2009", JOURNAL = JCP, @@ -141,7 +141,7 @@ PAGES = "211--228" } @ARTICLE{ch7:MBS03, -AUTHOR = "Madsen, P.A. and Bingham, H.B. and Sch{\"a}ffer, H. A.", +AUTHOR = "Madsen, P. A. and Bingham, H. B. and Sch{\"a}ffer, H. A.", TITLE = "Boussinesq-type formulations for fully nonlinear and extremely dispersive water waves: derivation and analysis", JOURNAL = RSL, YEAR = "2003", @@ -150,7 +150,7 @@ PAGES = "1075--1104" } @ARTICLE{ch7:MBL02, -AUTHOR = "Madsen, P.A. and Bingham, H.B. and Liu, H.", +AUTHOR = "Madsen, P. A. and Bingham, H. B. and Liu, H.", TITLE = "A new Boussinesq method for fully nonlinear waves from shallow to deep water", JOURNAL = JFM, YEAR = "2002", @@ -168,7 +168,7 @@ PAGES = "{319--333}" } @article {ch7:LynettEtAl2004a, - AUTHOR = {Lynett, P. and Liu, P.L.-F.}, + AUTHOR = {Lynett, P. and Liu, P. L.-F.}, TITLE = {A two-layer approach to wave modelling}, JOURNAL = {Proc. Roy. Soc. London Ser. A}, FJOURNAL = {Proceedings of the Royal Society. London. Series A. @@ -179,7 +179,7 @@ PAGES = "{319--333}" } @incollection {ch7:TsaiYue1996, - AUTHOR = {Tsai, Wu-ting and Yue, Dick K. P.}, + AUTHOR = {Tsai, W. and Yue, D. K. 
P.}, TITLE = {Computation of nonlinear free-surface flows}, BOOKTITLE = {Annual review of fluid mechanics, Vol.\ 28}, PAGES = {249--278}, @@ -191,7 +191,7 @@ PAGES = "{319--333}" } @article {ch7:LynettEtAl2004b, - AUTHOR = {Lynett, P. and Liu, P.L.-F.}, + AUTHOR = {Lynett, P. and Liu, P. L.-F.}, TITLE = {Linear analysis of the multi-layer model}, JOURNAL = CE, VOLUME = {51}, @@ -200,7 +200,7 @@ PAGES = "{319--333}" } @article {ch7:DiasBridges2006, - AUTHOR = {Dias, Fr{\'e}d{\'e}ric and Bridges, Thomas J.}, + AUTHOR = {Dias, F. and Bridges, T. J.}, TITLE = {The numerical computation of freely propagating time-dependent irrotational water waves}, JOURNAL = {Fluid Dynam. Res.}, @@ -240,7 +240,7 @@ PAGES = "285--297" @incollection {ch7:Yeung1982, - AUTHOR = {Yeung, Ronald W.}, + AUTHOR = {Yeung, R. W.}, TITLE = {Numerical methods in free-surface flows}, BOOKTITLE = {Annual review of fluid mechanics, Vol. 14}, PAGES = {395--442}, @@ -263,13 +263,13 @@ note = "", issn = "0079-6611", doi = "10.1016/j.pocean.2007.05.005", url = "http://www.sciencedirect.com/science/article/pii/S0079661107001206", -author = "L. Cavaleri and J.-H.G.M. Alves and F. Ardhuin and A. Babanin and M. Banner and K. Belibassakis and M. Benoit and M. Donelan and J. Groeneweg and T.H.C. Herbers and P. Hwang and P.A.E.M. Janssen and T. Janssen and I.V. Lavrenov and R. Magne and J. Monbaliu and M. Onorato and V. Polnikov and D. Resio and W.E. Rogers and A. Sheremet and J. McKee Smith and H.L. Tolman and G. van Vledder and J. Wolf and I. Young", +author = "L. Cavaleri and J.-H. G. M. Alves and F. Ardhuin and A. Babanin and M. Banner and K. Belibassakis and M. Benoit and M. Donelan and J. Groeneweg and T.H.C. Herbers and P. Hwang and P. A. E. M. Janssen and T. Janssen and I. V. Lavrenov and R. Magne and J. Monbaliu and M. Onorato and V. Polnikov and D. Resio and W. E. Rogers and A. Sheremet and J. McKee Smith and H. L. Tolman and G. van Vledder and J. Wolf and I. 
Young", keywords = "Wind waves", -keywords = "Wind‰ÛÒwave generation", -keywords = "Wave‰ÛÒwave interaction", +keywords = "Wind-wave generation", +keywords = "Wave-wave interaction", keywords = "Wave propagation", keywords = "Wave dissipation", -keywords = "Wave‰ÛÒcurrent interaction", +keywords = "Wave-current interaction", keywords = "Numerics" } @@ -303,7 +303,7 @@ YEAR = "1982" } @BOOK{ch7:Hackbusch1982, -AUTHOR = "Hackbusch W.", +AUTHOR = "Hackbusch, W.", TITLE = "On multigrid iterations with defect correction. In: Hackbusch, W.; Trottenberg, U. (eds): Lecture Notes in Math.", BOOKTITLE = "Multigrid Methods", VOLUME = "960", @@ -313,7 +313,7 @@ YEAR = "1982" } @article {ch7:MR744926, - AUTHOR = {Schaffer, Steve}, + AUTHOR = {Schaffer, S.}, TITLE = {Higher order multigrid methods}, JOURNAL = {Math. Comp.}, FJOURNAL = {Mathematics of Computation}, @@ -352,7 +352,7 @@ year = "2011", issn = "1631-0721", doi = "10.1016/j.crme.2010.11.002", url = "http://www.sciencedirect.com/science/article/pii/S1631072110002032", -author = "David E. Keyes", +author = "D. E. Keyes", keywords = "Computer science", keywords = "Exaflop", keywords = "Informatiquealgorithmique", @@ -375,7 +375,7 @@ YEAR = "2006" @book{ch7:Whalin1971, title={The Limit of Applicability of Linear Wave Refraction Theory in a Convergence Zone}, - author={Whalin, R.W. and United States. Army. Corps of Engineers and Waterways Experiment Station (U.S.)}, + author={Whalin, R. W. and United States. Army. Corps of Engineers and Waterways Experiment Station (U.S.)}, series={Research report}, url={http://books.google.dk/books?id=wwvWSgAACAAJ}, year={1971}, @@ -383,7 +383,7 @@ YEAR = "2006" } @article{ch7:AbottEtAl1984, -author = "Abott, M.B. and McCowan, A.D. and Warren, I.R.", +author = "Abott, M. B. and McCowan, A. D. and Warren, I. 
R.", title = "Accuracy of short-wave numerical models", journal = "ASCE Journal of Hydraulic Engineering", volume = "110", @@ -393,7 +393,7 @@ year = "1984" } @article{ch7:AbottEtAl1978, -author = "Abott, M.B. Petersens, H.M. and Skovgaard, O.", +author = "Abott, M. B. Petersens, H. M. and Skovgaard, O.", title = "On the numerical modelling of short waves in shallow water", journal = "Journal of Hydraulic Research", volume = "16", @@ -403,7 +403,7 @@ year = "1978" } @ARTICLE{ch7:MS98, -AUTHOR = "Madsen, P.A. and Sch{\"{a}}ffer, H. A.", +AUTHOR = "Madsen, P. A. and Sch{\"{a}}ffer, H. A.", TITLE = "Higher order Boussinesq-type equations for surface gravity waves - derivation and analysis.", JOURNAL = "In Advances in Coastal and Ocean Engineering", VOLUME = "356", @@ -412,7 +412,7 @@ PAGES = "3123--3181" } @article{ch7:Peregrine1967, -author = "Peregrine, D.H.", +author = "Peregrine, D. H.", title = "Long waves on a beach", journal = "Journal of Fluid Mechanics", volume = "27", @@ -430,7 +430,7 @@ published = {SIAM} @ARTICLE{ch7:GlimbergEtAl2012, - AUTHOR = {Stefan L. Glimberg and Allan P. Engsig-Karup}, + AUTHOR = {S. L. Glimberg and A. P. Engsig-Karup}, TITLE = {On a Multi-GPU Implementation of a Free Surface Water Wave Model for Large-scale Simulations}, JOURNAL = {Submitted to: Special Issue of the Journal Parallel Computing}, YEAR = {2012}, @@ -479,7 +479,7 @@ published = {SIAM} @mastersthesis{ch7:ASNP12, - author = {Allan S. Nielsen}, + author = {A. S. Nielsen}, title = {Feasibility study of the Parareal algorithm}, school = {Technical University of Denmark, Department of Informatics and Mathematical Modeling}, year = {2012}, @@ -487,7 +487,7 @@ published = {SIAM} } @Book{ch7:Higham:2002:ASN, - author = "Nicholas J. Higham", + author = "N. J. 
Higham", title = "Accuracy and Stability of Numerical Algorithms", publisher = "Society for Industrial and Applied Mathematics", address = "Philadelphia, PA, USA", @@ -526,7 +526,7 @@ note = "", issn = "0378-3839", doi = "10.1016/j.coastaleng.2008.10.012", url = "http://www.sciencedirect.com/science/article/pii/S0378383908001865", -author = "Harry B. Bingham and Per A. Madsen and David R. Fuhrman", +author = "H. B. Bingham and P. A. Madsen and D. R. Fuhrman", keywords = "Boussinesq-type equations", keywords = "Nonlinear waves", keywords = "Potential flow", diff --git a/BookGPU/Chapters/chapter7/ch7.tex b/BookGPU/Chapters/chapter7/ch7.tex index c18d417..45fc453 100644 --- a/BookGPU/Chapters/chapter7/ch7.tex +++ b/BookGPU/Chapters/chapter7/ch7.tex @@ -535,7 +535,8 @@ __global__ void laplace_sigma_transformed( { size_type offset_i = i < alpha ? 2*alpha-i : i >= Ns-alpha ? Ns-1-i : alpha; size_type row_i = offset_i*rank; - size_type offset_j = alpha; // Always centered stencils in x-dir + // Always centered stencils in x-dir + size_type offset_j = alpha; size_type row_j = alpha*rank; value_type dhdx = hx[j]; diff --git a/BookGPU/Chapters/chapter8/biblio8.bib b/BookGPU/Chapters/chapter8/biblio8.bib index 78424f2..3e10526 100644 --- a/BookGPU/Chapters/chapter8/biblio8.bib +++ b/BookGPU/Chapters/chapter8/biblio8.bib @@ -7,14 +7,14 @@ } @InProceedings{ch8:Carneiro_2011, - author = {T. Carneiro and A. Einstein Muritibab and M. Negreirosc and G. Augusto Lima de Campos}, + author = {T. Carneiro and A. E. Muritibab and M. Negreirosc and G. A. Lima de Campos}, title = {A New Parallel Schema for Branch-and-Bound Algorithms Using GPGPU}, booktitle = {23rd International Symposium on Computer Architecture and High Performance Computing (SBAC-PAD)}, year = {2011} } @ARTICLE{ch8:Casadoa_2008, - author = "L.G. Casadoa and J.A. Martíneza and I. Garcíaa and E.M.T. Hendrixb.", + author = "L. G. Casadoa and J. A. Martíneza and I. Garcíaa and E. M. T. 
Hendrixb.", title = "Branch-and-Bound interval global optimization on shared memory multiprocessors", journal = "Optimization Methods and Software", volume = "23, No.5", @@ -32,7 +32,7 @@ } @Article{ch8:Garey_1976, - author = {M.R. Garey and D.S. Johnson and R. Sethi}, + author = {M. R. Garey and D. S. Johnson and R. Sethi}, title = {{The complexity of flow-shop and job-shop scheduling}}, journal = {Mathematics of Operations Research}, year = {1976}, @@ -42,7 +42,7 @@ } @Article{ch8:Gendron_1994, - author = {B. Gendron and T.G. Crainic}, + author = {B. Gendron and T. G. Crainic}, title = {Parallel {B}ranch and {B}ound {A}lgorithms: {S}urvey and {S}ynthesis}, journal = {Operations Research}, year = {1994}, @@ -52,7 +52,7 @@ } @InProceedings{ch8:Han, - author = {T. Han and T.S. Abdelrahman}, + author = {T. Han and T. S. Abdelrahman}, title = {Reducing branch divergence in GPU programs}, booktitle = {{In Proceedings of the Fourth Workshop on General Purpose Processing on Graphics Processing Units (GPGPU-4), ACM}}, year = {2011}, @@ -60,7 +60,7 @@ } @Article{ch8:Johnson_1954, - author = {S.M. Johnson}, + author = {S. M. Johnson}, title = {{Optimal two and three-stage production schedules with setup times included}}, journal = {Naval Research Logistics Quarterly}, year = {1954}, @@ -77,7 +77,7 @@ } @Article{ch8:Lenstra_1978, - author = {J.K. Lenstra and B.J. Lageweg and A.H.G. Rinnooy Kan}, + author = {J. K. Lenstra and B. J. Lageweg and A. H. G. Rinnooy Kan}, title = {{A General bounding scheme for the permutation flow-shop problem}}, journal = {Operations Research}, year = {1978}, @@ -108,7 +108,7 @@ NOTE = "Th\`ese HDR" } @ARTICLE{ch8:JRJackson_1956, - AUTHOR ="J.R.Jackson", + AUTHOR ="J. R. Jackson", TITLE ="An Extension of Johnson's results on Job-Lot Scheduling", JOURNAL ="Naval Research Logistics Quarterly", YEAR ="1956", @@ -116,7 +116,7 @@ NOTE = "Th\`ese HDR" } @ARTICLE{ch8:LGMitten_1959, - AUTHOR ="L.G.Mitten", + AUTHOR ="L. G. 
Mitten", TITLE ="Sequencing n jobs on two machines with arbitrary time lags", JOURNAL ="Management Science", YEAR ="1959" @@ -141,7 +141,7 @@ NOTE = "Th\`ese HDR" } @InProceedings{ch8:Zhang, - author = {E.Z. Zhang and Y. Jiang and Z. Guo and X. Shen}, + author = {E. Z. Zhang and Y. Jiang and Z. Guo and X. Shen}, title = {Streamlining GPU applications on the fly: thread divergence elimination through runtime thread-data remapping}, booktitle = {{In Proceedings of the 24th ACM International Conference on Supercomputing (ICS'10), ACM.}}, year = {2010}, diff --git a/BookGPU/Chapters/chapter9/biblio9.bib b/BookGPU/Chapters/chapter9/biblio9.bib index 5c32a72..041c461 100644 --- a/BookGPU/Chapters/chapter9/biblio9.bib +++ b/BookGPU/Chapters/chapter9/biblio9.bib @@ -228,7 +228,7 @@ year = {2013} } @incollection{paradiseoGPU, -author={Melab, N. and Luong, T.V. and Boufaras, K. and Talbi, E.G.}, +author={Melab, N. and Luong, T. V. and Boufaras, K. and Talbi, E.-G.}, title={{Towards ParadisEO-MO-GPU: A Framework for GPU-Based Local Search Metaheuristics}}, booktitle={Advances in Computational Intelligence}, series={Lecture Notes in Computer Science}, @@ -238,7 +238,7 @@ year={2011} } @incollection{luongMultiStart, -author={T. V. Luong and N. Melab and E-G. Talbi}, +author={T. V. Luong and N. Melab and E.-G. 
Talbi}, title={{GPU-Based Multi-start Local Search Algorithms}}, booktitle={Learning and Intelligent Optimization}, series={Lecture Notes in Computer Science}, -- 2.39.5 From 6e645292c9053655e4e44bbebf1c1eb751a554a6 Mon Sep 17 00:00:00 2001 From: couturie Date: Sun, 5 May 2013 19:00:57 +0200 Subject: [PATCH 07/16] new --- BookGPU/BookGPU.tex | 4 +- BookGPU/Chapters/chapter5/biblio5.bib | 48 ++++++------ BookGPU/Chapters/chapter5/ch5.tex | 10 +-- BookGPU/Chapters/chapter6/PartieAsync.tex | 92 ++++++++++++++--------- BookGPU/Chapters/chapter6/PartieSync.tex | 58 +++++++------- BookGPU/Chapters/chapter6/biblio6.bib | 36 ++++----- BookGPU/Chapters/chapter6/ch6.tex | 1 - BookGPU/Chapters/chapter8/ch8.tex | 44 +++++------ 8 files changed, 153 insertions(+), 140 deletions(-) diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index 1ca9c1a..e011a89 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -147,7 +147,7 @@ \makeindex -%\includeonly{Chapters/chapter15/ch15} +%\includeonly{Chapters/chapter5/ch5} \begin{document} @@ -184,7 +184,7 @@ \include{Chapters/chapter3/ch3} \include{Chapters/chapter4/ch4} \part{Software development} -\include{Chapters/chapter5/ch5} +\include{Chapters/chapter5/ch5} \include{Chapters/chapter6/ch6} \part{Optimization} \include{Chapters/chapter8/ch8} diff --git a/BookGPU/Chapters/chapter5/biblio5.bib b/BookGPU/Chapters/chapter5/biblio5.bib index 8cff215..04267a1 100644 --- a/BookGPU/Chapters/chapter5/biblio5.bib +++ b/BookGPU/Chapters/chapter5/biblio5.bib @@ -2,13 +2,13 @@ % Encoding: Cp1252 @INCOLLECTION{ch5:Bajrovic2011, - author = {Bajrovic, Enes and Traff, Jesper}, + author = {Bajrovic, E. and Traff, J.}, title = {Using MPI Derived Datatypes in Numerical Libraries}, booktitle = {Recent Advances in the Message Passing Interface}, publisher = {Springer Berlin / Heidelberg}, year = {2011}, - editor = {Cotronis, Yiannis and Danalis, Anthony and Nikolopoulos, Dimitrios - and Dongarra, Jack}, + editor = {Cotronis, Y. and Danalis, A. 
and Nikolopoulos, D. + and Dongarra, J.}, volume = {6960}, series = {Lecture Notes in Computer Science}, pages = {29-38}, @@ -23,7 +23,7 @@ } @ARTICLE{ch5:Bell2011, - author = {Nathan Bell and Jared Hoberock}, + author = {N. Bell and J. Hoberock}, title = {Thrust: A Productivity-Oriented Library for CUDA}, journal = {in GPU Computing Gems, Jade Edition, Edited by Wen-mei W. Hwu}, year = {2011}, @@ -48,7 +48,7 @@ title = {Design Patterns - Elements of Reusable Object-Oriented Software}, publisher = {Addison-Wesley Professional Computing Series}, year = {1995}, - author = {Erich Gamma and Richard Helm and Ralph Johnson and John Vlissides}, + author = {E. Gamma and R. Helm and R. Johnson and J. Vlissides}, owner = {slgl}, timestamp = {2012.09.10} } @@ -58,7 +58,7 @@ Interface, {\rm 2nd edition}}, publisher = {MIT Press}, year = {1999}, - author = {William Gropp and Ewing Lusk and Anthony Skjellum}, + author = {W. Gropp and E. Lusk and A. Skjellum}, address = {Cambridge, MA}, area = {M}, areaseq = {0} @@ -75,14 +75,14 @@ } @INCOLLECTION{ch5:Hoefler2011, - author = {Hoefler, Torsten and Snir, Marc}, + author = {Hoefler, T. and Snir, M.}, title = {Writing Parallel Libraries with MPI - Common Practice, Issues, and Extensions}, booktitle = {Recent Advances in the Message Passing Interface}, publisher = {Springer Berlin / Heidelberg}, year = {2011}, - editor = {Cotronis, Yiannis and Danalis, Anthony and Nikolopoulos, Dimitrios - and Dongarra, Jack}, + editor = {Cotronis, Y. and Danalis, A. and Nikolopoulos, D. + and Dongarra, J.}, volume = {6960}, series = {Lecture Notes in Computer Science}, pages = {345-355}, @@ -100,7 +100,7 @@ - steady-state and time-dependent problems}, publisher = {SIAM}, year = {2007}, - author = {Randall J. LeVeque}, + author = {R. J. 
LeVeque}, pages = {I-XV, 1-341}, bibsource = {DBLP, http://dblp.uni-trier.de}, ee = {http://www.ec-securehost.com/SIAM/OT98.html}, @@ -108,7 +108,7 @@ } @TECHREPORT{ch5:Skjellum1994, - author = {Anthony Skjellum and Nathan E. Doss and Purushotham V. Bangaloret}, + author = {A. Skjellum and N. E. Doss and P. V. Bangaloret}, title = {Writing Libraries in MPI}, institution = {Department of Computer Science and NSF Engineering Research Center for Computational Fiels Simulation. Mississippi State University}, @@ -133,7 +133,7 @@ title = {C++ Templates: The Complete Guide}, publisher = {Addison-Wesley Professional}, year = {2002}, - author = {David Vandevoorde and Nicolai M. Josuttis}, + author = {D. Vandevoorde and N. M. Josuttis}, pages = {552}, edition = {1st}, month = {November}, @@ -143,7 +143,7 @@ } @article{ch5:Korson1992, - author = {Korson, Tim and McGregor, John D.}, + author = {Korson, T. and McGregor, J. D.}, title = {Technical criteria for the specification and evaluation of object-oriented libraries}, journal = {Softw. Eng. J.}, issue_date = {March 1992}, @@ -168,7 +168,7 @@ } @INPROCEEDINGS{ch5:Glimberg2011, - AUTHOR = {Stefan L. Glimberg and Allan P. Engsig-Karup and Morten G. Madsen}, + AUTHOR = {S. L. Glimberg and A. P. Engsig-Karup and M. G. Madsen}, TITLE = {A Fast GPU-accelerated Mixed-precision Strategy for Fully Nonlinear Water Wave Computations}, BOOKTITLE = {Numerical Mathematics and Advanced Applications 2011, Proceedings of ENUMATH 2011, the 9th European Conference on Numerical Mathematics and Advanced Applications, Leicester, September 2011}, YEAR = {2011}, @@ -234,7 +234,7 @@ Since real world applications are naturally parallel and hardware is naturally p @book{ch5:chorin1993, title={A Mathematical Introduction to Fluid Mechanics}, - author={Chorin, A.J. and Marsden, J.E.}, + author={Chorin, A. J. and Marsden, J. 
E.}, isbn={9780387979182}, lccn={98115991}, series={Texts in Applied Mathematics}, @@ -244,7 +244,7 @@ Since real world applications are naturally parallel and hardware is naturally p } @book{ch5:Saad2003, - author = {Saad, Yousef}, + author = {Saad, Y.}, title = {Iterative Methods for Sparse Linear Systems}, year = {2003}, isbn = {0898715342}, @@ -255,7 +255,7 @@ Since real world applications are naturally parallel and hardware is naturally p @book{ch5:Kelley1995, title={Iterative Methods for Linear and Nonlinear Equations}, - author={Kelley, C.T.}, + author={Kelley, C. T.}, isbn={9780898713527}, lccn={lc95032249}, series={Frontiers in Applied Mathematics Series}, @@ -265,7 +265,7 @@ Since real world applications are naturally parallel and hardware is naturally p } @techreport{ch5:YMTR08, - author = {Yvon Maday}, + author = {Y. Maday}, title = {The parareal in time algorithm}, institution = {Universite Pierr\'{e} et Marie Curie}, year = {2008}, @@ -302,7 +302,7 @@ Since real world applications are naturally parallel and hardware is naturally p } @mastersthesis{ch5:ASNP12, - author = {Allan S. Nielsen}, + author = {A. S. Nielsen}, title = {Feasibility study of the Parareal algorithm}, school = {Technical University of Denmark, Department of Informatics and Mathematical Modeling}, year = {2012}, @@ -327,7 +327,7 @@ Since real world applications are naturally parallel and hardware is naturally p } @TECHREPORT{ch5:ScientificGrandChallenges2010, - author = {David L. Brown and Paul Messina et. al}, + author = {D. L. Brown and P. Messina et. al}, title = {Scientific Grand Challenges, Crosscutting technologies for computing at the exascale}, institution = {U.S. Department of Energy}, year = {2010}, @@ -336,7 +336,7 @@ Since real world applications are naturally parallel and hardware is naturally p } @article{ch5:Keyes2011, -author = {David E. Keyes}, +author = {D. E. 
Keyes}, title = {{Exaflop/s: The why and the how}}, journal = {Comptes Rendus Mecanique}, volume = {339}, @@ -381,7 +381,7 @@ masid = {49649121} } @inproceedings{ch5:Bell2009, - author = {Bell, Nathan and Garland, Michael}, + author = {Bell, N. and Garland, M.}, title = {Implementing sparse matrix-vector multiplication on throughput-oriented processors}, booktitle = {SC '09: Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis}, year = {2009}, @@ -394,7 +394,7 @@ masid = {49649121} } @book{ch5:Kirk2010, - author = {Kirk, David B. and Hwu, Wen-mei W.}, + author = {Kirk, D. B. and Hwu, W.-M. W.}, title = {Programming Massively Parallel Processors: A Hands-on Approach}, year = {2010}, isbn = {0123814723, 9780123814722}, @@ -405,7 +405,7 @@ masid = {49649121} @book{ch5:Trottenberg2001, title={Multigrid}, - author={Trottenberg, U. and Oosterlee, C.W. and Sch{\"u}ller, A.}, + author={Trottenberg, U. and Oosterlee, C. W. and Sch{\"u}ller, A.}, isbn={9780127010700}, lccn={00103940}, url={http://books.google.dk/books?id=9ysyNPZoR24C}, diff --git a/BookGPU/Chapters/chapter5/ch5.tex b/BookGPU/Chapters/chapter5/ch5.tex index 1c5da65..1d3d957 100644 --- a/BookGPU/Chapters/chapter5/ch5.tex +++ b/BookGPU/Chapters/chapter5/ch5.tex @@ -174,16 +174,16 @@ where $u(x,y,t)$ is the unknown heat distribution, $\kappa$ is a heat conductivi u(x,y,t_0) = \sin(\pi x)\,\sin(\pi y), & \qquad (x,y) \in \Omega. \end{align} An illustrative example of the numerical solution to the heat problem, using \eqref{ch5:eq:heatinit} as the initial condition is given in Figure \ref{ch5:fig:heatsolution}. 
-\begin{figure}[!htb] +\begin{figure}[!htbp] \begin{center} \setlength\figurewidth{0.3\textwidth} % - \setlength\figureheight{0.32\textwidth} % + \setlength\figureheight{0.3\textwidth} % \subfigure[$t=0.00s$]%{\input{Chapters/chapter5/figures/HeatSolution0.tikz}} -{\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/HeatSolution0_conv.pdf}} +{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_conv.pdf}} \subfigure[$t=0.05s$]%{\input{Chapters/chapter5/figures/HeatSolution0.049307.tikz}} -{\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/HeatSolution0_049307_conv.pdf}} +{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_049307_conv.pdf}} %\subfigure[$t=0.10s$]{\input{Chapters/chapter5/figures/HeatSolution0.099723.tikz}} -{\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/HeatSolution0_099723_conv.pdf}} +{\includegraphics[width=0.48\textwidth]{Chapters/chapter5/figures/HeatSolution0_099723_conv.pdf}} \end{center} \caption{Discrete solution at times $t=0s$ and $t=0.05s$, using \eqref{ch5:eq:heatinit} as initial condition and a small $20\times20$ numerical grid.}\label{ch5:fig:heatsolution} \end{figure} diff --git a/BookGPU/Chapters/chapter6/PartieAsync.tex b/BookGPU/Chapters/chapter6/PartieAsync.tex index 4a2b6d2..9c8de06 100644 --- a/BookGPU/Chapters/chapter6/PartieAsync.tex +++ b/BookGPU/Chapters/chapter6/PartieAsync.tex @@ -157,11 +157,15 @@ So, the global organization of this scheme is set up in \Lst{algo:ch6p2BasicAsyn % \label{algo:ch6p2BasicAsync} \begin{Listing}{algo:ch6p2BasicAsync}{Initialization of the basic asynchronous scheme} // Variables declaration and initialization -omp_lock_t lockSend; // Controls the sendings from the computing thread -omp_lock_t lockRec; // Ensures the initial reception of external data -char Finished = 0; // Boolean indicating the end of the process -char SendsInProgress = 0; // Boolean indicating if previous data sendings are still 
in progress -double Threshold; // Threshold of the residual for convergence detection +// Controls the sendings from the computing thread +omp_lock_t lockSend; +// Ensures the initial reception of external data +omp_lock_t lockRec; +char Finished = 0; // Boolean indicating the end of the process +// Boolean indicating if previous data sendings are still in progress +char SendsInProgress = 0; +// Threshold of the residual for convergence detection +double Threshold; // Parameters reading ... @@ -177,9 +181,10 @@ MPI_Comm_rank(MPI_COMM_WORLD, &numP); // OpenMP initialization (mainly declarations and setting up of locks) omp_set_num_threads(3); omp_init_lock(&lockSend); -omp_set_lock(&lockSend); // Initially locked, unlocked to start sendings +omp_set_lock(&lockSend);//Initially locked, unlocked to start sendings omp_init_lock(&lockRec); -omp_set_lock(&lockRec); // Initially locked, unlocked when initial data are received +//Initially locked, unlocked when initial data are received +omp_set_lock(&lockRec); #pragma omp parallel { @@ -236,7 +241,8 @@ double residual; // Residual of the current iteration // Computation loop while(!Finished){ - // Sendings of data dependencies if there is no previous sending in progress + // Sendings of data dependencies if there is no previous sending + // in progress if(!SendsInProgress){ // Potential copy of data to be sent in additional buffers ... 
@@ -345,7 +351,8 @@ The last function, detailed in \Lst{algo:ch6p2BasicAsyncReceptions}, does all th % \label{algo:ch6p2BasicAsyncReceptions} \begin{Listing}{algo:ch6p2BasicAsyncReceptions}{Reception function in the basic asynchronous scheme} // Variables declaration and initialization -char countReceipts = 0; // Boolean indicating whether receptions are counted or not +char countReceipts = 0; // Boolean indicating whether receptions are +// counted or not int nbEndMsg = 0; // Number of end messages received int arrived = 0; // Boolean indicating if a message is arrived int srcNd; // Source node of the message @@ -359,10 +366,12 @@ while(!Finished){ // Management of data messages switch(status.MPI_TAG){ case tagCom: // Management of data messages - srcNd = status.MPI_SOURCE; // Get the source node of the message + // Get the source node of the message + srcNd = status.MPI_SOURCE; // Actual data reception in the corresponding buffer MPI_Recv(dataBufferOf(srcNd), nbDataOf(srcNd), dataTypeOf(srcNd), srcNd, tagCom, MPI_COMM_WORLD, &status); - // Unlocking of the computing thread when data are received from all dependencies + // Unlocking of the computing thread when data are received + // from all dependencies if(countReceipts == 1 && ... @\emph{receptions from ALL dependencies}@ ...){ omp_unset_lock(&lockRec); countReceipts = 0; // No more counting after first iteration @@ -454,10 +463,14 @@ required to change the operating mode. \begin{Listing}{algo:ch6p2Sync}{Initialization of the synchronized scheme} // Variables declarations and initialization ... 
-omp_lock_t lockStates; // Controls the synchronous exchange of local states -omp_lock_t lockIter; // Controls the synchronization at the end of each iteration -char localCV = 0; // Boolean indicating whether the local stabilization is reached or not -int nbOtherCVs = 0; // Number of other nodes being in local stabilization +// Controls the synchronous exchange of local states +omp_lock_t lockStates; +// Controls the synchronization at the end of each iteration +omp_lock_t lockIter; +//Boolean indicating whether the local stabilization is reached or not +char localCV = 0; +// Number of other nodes being in local stabilization +int nbOtherCVs = 0; // Parameters reading ... @@ -468,9 +481,12 @@ int nbOtherCVs = 0; // Number of other nodes being in local stabilization // OpenMP initialization (mainly declarations and setting up of locks) ... omp_init_lock(&lockStates); -omp_set_lock(&lockStates); // Initially locked, unlocked when all state messages are received +// Initially locked, unlocked when all state messages are received +omp_set_lock(&lockStates); omp_init_lock(&lockIter); -omp_set_lock(&lockIter); // Initially locked, unlocked when all "end of iteration" messages are received +// Initially locked, unlocked when all "end of iteration" messages are +// received +omp_set_lock(&lockIter); // Threads launching #pragma omp parallel @@ -548,7 +564,7 @@ while(!Finished){ // Waiting for the state messages receptions from the other nodes omp_set_lock(&lockStates); - // Determination of global convergence (if all nodes are in local CV) + //Determination of global convergence (if all nodes are in local CV) if(localCV + nbOtherCVs == nbP){ // Entering global CV state Finished = 1; @@ -632,10 +648,11 @@ while(!Finished){ case tagState: // Management of local state messages // Actual reception of the message MPI_Recv(&recvdState, 1, MPI_CHAR, status.MPI_SOURCE, tagState, MPI_COMM_WORLD, &status); - // Updates of numbers of stabilized nodes and received state msgs + // 
Updates of numbers of stabilized nodes and received state msgs nbOtherCVs += recvdState; nbStateMsg++; - // Unlocking of the computing thread when states of all other nodes are received + // Unlocking of the computing thread when states of all other + // nodes are received if(nbStateMsg == nbP-1){ nbStateMsg = 0; omp_unset_lock(&lockStates); @@ -645,7 +662,8 @@ while(!Finished){ // Actual reception of the message in dummy buffer MPI_Recv(dummyBuffer, 1, MPI_CHAR, status.MPI_SOURCE, tagIter, MPI_COMM_WORLD, &status); nbIterMsg++; // Update of the nb of iteration messages - // Unlocking of the computing thread when iteration messages are received from all other nodes + // Unlocking of the computing thread when iteration messages + // are received from all other nodes if(nbIterMsg == nbP - 1){ nbIterMsg = 0; omp_unset_lock(&lockIter); @@ -740,9 +758,9 @@ iterations done (\texttt{nbSyncIter}). \begin{Listing}{algo:ch6p2AsyncSyncComp}{Computing function in the final asynchronous scheme}% without GPU computing.} // Variables declarations and initialization ... 
-OpMode curMode = SYNC; // Current operating mode (always begin in sync) -double asyncStart; // Starting time of the current async section -int nbSyncIter = 0; // Number of sync iterations done in async mode +OpMode curMode = SYNC;// Current operating mode (always begin in sync) +double asyncStart; // Starting time of the current async section +int nbSyncIter = 0; // Number of sync iterations done in async mode // Computation loop while(!Finished){ @@ -751,14 +769,15 @@ while(!Finished){ // Entering synchronous mode when asyncDuration is reached @% // (additional conditions can be specified if needed) @ if(MPI_Wtime() - asyncStart >= asyncDuration){ - // Waiting for the end of previous sends before starting sync mode + // Waiting for the end of previous sends before starting sync mode omp_set_lock(&lockSendsDone); curMode = SYNC; // Entering synchronous mode stampData(dataToSend, SYNC); // Mark data to send with sync flag nbSyncIter = 0; } }else{ - // In main async mode, going back to async mode when the max number of sync iterations are done + // In main async mode, going back to async mode when the max number + // of sync iterations are done if(mainMode == ASYNC){ nbSyncIter++; // Update of the number of sync iterations done if(nbSyncIter == 2){ @@ -827,12 +846,14 @@ dim3 Dg, Db; // CUDA kernel grids // Computation loop while(!Finished){ - // Determination of the dynamic operating mode, sendings of data dependencies and blocking data receptions in sync mode + // Determination of the dynamic operating mode, sendings of data + // dependencies and blocking data receptions in sync mode ... // Local GPU computation // Data transfers from node RAM to GPU CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer"); - ... // There may be several data transfers: typically A and b in linear problems + ... 
// There may be several data transfers: typically A and b in + // linear problems // GPU grid definition Db.x = BLOCK_SIZE_X; // BLOCK_SIZE_# are kernel design dependent Db.y = BLOCK_SIZE_Y; @@ -903,12 +924,13 @@ omp_lock_t lockRes; // Informs aux thread about new results omp_lock_t lockWrite; // Controls exclusion of results access ... auxRes ... ; // Results of auxiliary computations -// Parameters reading, MPI initialization, data initialization and distribution +// Parameters reading, MPI initialization, data initialization and +// distribution ... // OpenMP initialization ... omp_init_lock(&lockAux); -omp_set_lock(&lockAux); // Unlocked when new aux results are available +omp_set_lock(&lockAux);//Unlocked when new aux results are available omp_init_lock(&lockRes); omp_set_lock(&lockRes); // Unlocked when new results are available omp_init_lock(&lockWrite); @@ -963,10 +985,12 @@ dim3 Dg, Db; // CUDA kernel grids // Computation loop while(!Finished){ - // Determination of the dynamic operating mode, sendings of data dependencies and blocking data receptions in sync mode + // Determination of the dynamic operating mode, sendings of data + // dependencies and blocking data receptions in sync mode ... // Local GPU computation - // Data transfers from node RAM to GPU, GPU grid definition and init of shared mem + // Data transfers from node RAM to GPU, GPU grid definition and init + // of shared mem CHECK_CUDA_SUCCESS(cudaMemcpyToSymbol(dataOnGPU, dataInRAM, inputsSize, 0, cudaMemcpyHostToDevice), "Data transfer"); ... // Kernel call @@ -1027,13 +1051,13 @@ while(!Finished){ for(ind=0; ind GPU (sync. op) + cudaMemcpy(gpuInputTabAdr, cpuInputTabAdr, // Data transfer: + sizeof(float)*N, // CPU --> GPU (sync. op) cudaMemcpyHostToDevice); - gpuKernel_k1<<>>(); // GPU comp. (async. op) - MPI_Sendrecv_replace(cpuInputTabAdr, // MPI comms. (sync. op) + gpuKernel_k1<<>>(); // GPU comp. (async. op) + MPI_Sendrecv_replace(cpuInputTabAdr, // MPI comms. (sync. 
op) N,MPI_FLOAT, dest, 0, src, 0, ...); // IF there is (now) a result to transfer from the GPU to the CPU: - cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer: - sizeof(float), // GPU --> CPU (sync. op) + cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer: + sizeof(float), // GPU --> CPU (sync. op) cudaMemcpyDeviceToHost); } ... @@ -258,7 +258,7 @@ omp_set_num_threads(2); // - Thread 0: achieves MPI communications if (omp_get_thread_num() == 0) { - MPI_Sendrecv(current, // MPI comms. (sync. op) + MPI_Sendrecv(current, // MPI comms. (sync. op) N, MPI_FLOAT, dest, 0, future, N, MPI_FLOAT, dest, 0, ...); @@ -266,13 +266,13 @@ omp_set_num_threads(2); // - Thread 1: achieves the GPU sequence (GPU computations and // CPU/GPU data transfers) } else if (omp_get_thread_num() == 1) { - cudaMemcpy(gpuInputTabAdr, current, // Data transfer: - sizeof(float)*N, // CPU --> GPU (sync. op) + cudaMemcpy(gpuInputTabAdr, current, // Data transfer: + sizeof(float)*N, // CPU --> GPU (sync. op) cudaMemcpyHostToDevice); - gpuKernel_k1<<>>(); // GPU comp. (async. op) - // IF there is (now) a result to transfer from the GPU to the CPU: - cudaMemcpy(cpuResTabAdr + i, gpuResAdr, // Data transfer: - sizeof(float), // GPU --> CPU (sync. op) + gpuKernel_k1<<>>(); // GPU comp. (async. op) + // IF there is (now) a result to transfer from the GPU to the CPU: + cudaMemcpy(cpuResTabAdr + i, gpuResAdr,// Data transfer: + sizeof(float), // GPU --> CPU (sync. op) cudaMemcpyDeviceToHost); } @@ -393,27 +393,27 @@ omp_set_num_threads(2); for (int i = 0; i < NbIter; i++) { // - Thread 0: achieves MPI communications if (omp_get_thread_num() == 0) { - MPI_Sendrecv(current, // MPI comms. (sync. op) + MPI_Sendrecv(current, // MPI comms. (sync. 
op) N, MPI_FLOAT, dest, 0, future, N, MPI_FLOAT, dest, 0, ...); - // - Thread 1: achieves the streamed GPU sequence (GPU computations - // and CPU/GPU data transfers) + // - Thread 1: achieves the streamed GPU sequence (GPU + // computations and CPU/GPU data transfers) } else if (omp_get_thread_num() == 1) { - for (int s = 0; s < NbS; s++) { // Start all data transfers: + for (int s = 0; s < NbS; s++) { // Start all data transfers: cudaMemcpyAsync(gpuInputTabAdr + s*stride, // CPU --> GPU current + s*stride, // (async. ops) sizeof(float)*stride, cudaMemcpyHostToDevice, TabS[s]); } - for (int s = 0; s < NbS; s++) { // Start all GPU comps. (async.) + for (int s = 0; s < NbS; s++) { // Start all GPU comps. (async.) gpuKernel_k1<<>>(gpuInputTabAdr + s*stride); } - cudaThreadSynchronize(); // Wait all threads are ended - // IF there is (now) a result to transfer from the GPU to the CPU: - cudaMemcpy(cpuResTabAdr, // Data transfers: - gpuResAdr, // GPU --> CPU (sync. op) + cudaThreadSynchronize(); // Wait all threads are ended + // IF there is (now) a result to transfer from the GPU to the CPU: + cudaMemcpy(cpuResTabAdr, // Data transfers: + gpuResAdr, // GPU --> CPU (sync. op) sizeof(float), cudaMemcpyDeviceToHost); } @@ -539,7 +539,7 @@ omp_set_num_threads(3); // - Thread 0: achieves MPI communications if (omp_get_thread_num() == 0) { if (i < NbIter) { - MPI_Sendrecv(cpuCurrent, // MPI comms. (sync. op) + MPI_Sendrecv(cpuCurrent, // MPI comms. (sync. op) N, MPI_FLOAT, dest, 0, cpuFuture, N, MPI_FLOAT, dest, 0, ...); @@ -547,17 +547,17 @@ omp_set_num_threads(3); // - Thread 1: achieves the CPU/GPU data transfers } else if (omp_get_thread_num() == 1) { if (i < NbIter) { - cudaMemcpy(gpuFuture, cpuCurrent, // Data transfer: - sizeof(float)*N, // CPU --> GPU (sync. op) + cudaMemcpy(gpuFuture, cpuCurrent, // Data transfer: + sizeof(float)*N, // CPU --> GPU (sync. 
op) cudaMemcpyHostToDevice); } - // - Thread 2: achieves the GPU computations and the result transfer + // - Thread 2: achieves the GPU computations and the result transfer } else if (omp_get_thread_num() == 2) { if (i > 0) { - gpuKernel_k1<<>>(gpuCurrent); // GPU comp. (async. op) + gpuKernel_k1<<>>(gpuCurrent);// GPU comp. (async. op) // IF there is (now) a result to transfer from GPU to CPU: - cudaMemcpy(cpuResTabAdr + (i-1), // Data transfer: - gpuResAdr, sizeof(float), // GPU --> CPU (sync. op) + cudaMemcpy(cpuResTabAdr + (i-1), // Data transfer: + gpuResAdr, sizeof(float),// GPU --> CPU (sync. op) cudaMemcpyDeviceToHost); } } diff --git a/BookGPU/Chapters/chapter6/biblio6.bib b/BookGPU/Chapters/chapter6/biblio6.bib index 65dfda7..641c3cf 100644 --- a/BookGPU/Chapters/chapter6/biblio6.bib +++ b/BookGPU/Chapters/chapter6/biblio6.bib @@ -17,7 +17,7 @@ @Book{BT89, - author = {D.P.~Bertsekas and J.N.~Tsitsiklis}, + author = {D. P. Bertsekas and J. N. Tsitsiklis}, ALTeditor = {}, title = {Parallel and Distributed Computation}, publisher = {Prentice Hall}, @@ -47,7 +47,7 @@ } @InBook{ChVCV13, - author = {St\'{e}phane Vialle and Sylvain Contassot-Vivier}, + author = {S. Vialle and S. Contassot-Vivier}, editor = {Magoul\'{e}s, Fr\'{e}d\'{e}ric}, title = {Patterns for parallel programming on {G}{P}{U}s}, chapter = {Optimization methodology for Parallel Programming of Homogeneous or Hybrid Clusters}, @@ -67,7 +67,7 @@ } @Book{BCC07, - author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l}, + author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.}, title = {Parallel Iterative Algorithms: from sequential to grid computing}, publisher = {Chapman \& Hall/CRC}, year = {2007}, @@ -76,7 +76,7 @@ } @InProceedings{HPCS2002, - author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l}, + author = {Bahi, J. M. and Contassot-Vivier, S. 
and Couturier, R.}, title = {Asynchronism for Iterative Algorithms in a Global Computing Environment}, booktitle = {The 16th Annual International Symposium on High Performance Computing Systems and Applications (HPCS'2002)}, @@ -87,7 +87,7 @@ Computing Systems and Applications (HPCS'2002)}, } @InProceedings{Vecpar08a, - author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l}, + author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.}, title = {An efficient and robust decentralized algorithm for detecting the global convergence in asynchronous iterative algorithms}, booktitle = {8th International Meeting on High Performance Computing for Computational Science, VECPAR'08}, @@ -98,7 +98,7 @@ convergence in asynchronous iterative algorithms}, } @article{ParCo05, - author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l}, + author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.}, title = {Evaluation of the Asynchronous Iterative Algorithms in the Context of Distant Heterogeneous Clusters}, journal = {Parallel Computing}, volume = {31}, @@ -108,7 +108,7 @@ convergence in asynchronous iterative algorithms}, } @InProceedings{ECost10, - author = {Contassot-Vivier, Sylvain and Vialle, St\'{e}phane and Jost, Thomas}, + author = {Contassot-Vivier, S. and Vialle, S. and Jost, T.}, title = {Optimizing computing and energy performances on {GPU} clusters: experimentation on a {PDE} solver}, booktitle = {COST Action IC0804 on Large Scale Distributed Systems,1st Year}, OPTpages = {}, @@ -119,7 +119,7 @@ convergence in asynchronous iterative algorithms}, } @Article{SuperCo05, - author = {Bahi, Jacques M. and Contassot-Vivier, Sylvain and Couturier, Rapha\"{e}l}, + author = {Bahi, J. M. and Contassot-Vivier, S. and Couturier, R.}, title = {Performance comparison of parallel programming environments for implementing {AIAC} algorithms}, journal = {Journal of Supercomputing. 
Special Issue on Performance Modelling and Evaluation of Parallel and Distributed Systems}, year = {2006}, @@ -129,7 +129,7 @@ convergence in asynchronous iterative algorithms}, } @InProceedings{Para10, - author = {Contassot-Vivier, Sylvain and Jost, Thomas and Vialle, St\'{e}phane}, + author = {Contassot-Vivier, S. and Jost, T. and Vialle, S.}, title = {Impact of asynchronism on {GPU} accelerated parallel iterative computations}, booktitle = {PARA 2010 conference: State of the Art in Scientific and Parallel Computing}, OPTpages = {--}, @@ -139,7 +139,7 @@ convergence in asynchronous iterative algorithms}, } @InProceedings{ECost10, - author = {Contassot-Vivier, Sylvain and Vialle, St\'{e}phane and Jost, Thomas}, + author = {Contassot-Vivier, S. and Vialle, S. and Jost, T.}, title = {Optimizing computing and energy performances on {GPU} clusters: experimentation on a {PDE} solver}, booktitle = {COST Action IC0804 on Large Scale Distributed Systems,1st Year}, OPTpages = {}, @@ -150,7 +150,7 @@ convergence in asynchronous iterative algorithms}, } @InCollection{JCVV10, - author = {Thomas Jost and Sylvain Contassot-Vivier and St\'{e}phane Vialle}, + author = {T. Jost and S. Contassot-Vivier and S. Vialle}, title = {An efficient multi-algorithm sparse linear solver for {GPU}s}, booktitle = {Parallel Computing : From Multicores and GPU's to Petascale}, pages = {546--553}, @@ -170,7 +170,7 @@ convergence in asynchronous iterative algorithms}, } @InProceedings{ParCo09, - author = {Thomas Jost and Sylvain Contassot-Vivier and St\'{e}phane Vialle}, + author = {T. Jost and S. Contassot-Vivier and S. Vialle}, title = {An efficient multi-algorithms sparse linear solver for {GPU}s}, booktitle = {EuroGPU mini-symposium of the International Conference on Parallel Computing, ParCo'2009}, pages = {546--553}, @@ -180,7 +180,7 @@ convergence in asynchronous iterative algorithms}, } @InProceedings{BCVG11, - author = {Bahi, Jacques M. 
and Contassot-Vivier, Sylvain and Giersch, Arnaud}, + author = {Bahi, J. M. and Contassot-Vivier, S. and Giersch, A.}, title = {Load Balancing in Dynamic Networks by Bounded Delays Asynchronous Diffusion}, booktitle = {VECPAR 2010}, pages = {352--365}, @@ -217,7 +217,7 @@ convergence in asynchronous iterative algorithms}, } @article{Hoefler08a, -author = {Torsten Hoefler and Andrew Lumsdaine}, +author = {T. Hoefler and A. Lumsdaine}, title = {Overlapping Communication and Computation with High Level Communication Routines}, journal ={Cluster Computing and the Grid, IEEE International Symposium on}, OPTvolume = {0}, @@ -230,7 +230,7 @@ address = {Los Alamitos, CA, USA}, } @Article{Valiant:BSP, - author = {Valiant, Leslie G.}, + author = {Valiant, L. G.}, title = {A bridging model for parallel computation}, journal = {Communications of the ACM}, year = 1990, @@ -240,7 +240,7 @@ address = {Los Alamitos, CA, USA}, } @inproceedings{gustedt:hal-00639289, - AUTHOR = {Gustedt, Jens and Jeanvoine, Emmanuel}, + AUTHOR = {Gustedt, J. and Jeanvoine, E.}, TITLE = {{Relaxed Synchronization with Ordered Read-Write Locks}}, BOOKTITLE = {{Euro-Par 2011: Parallel Processing Workshops}}, YEAR = {2012}, @@ -258,7 +258,7 @@ address = {Los Alamitos, CA, USA}, } @article{clauss:2010:inria-00330024:1, - AUTHOR = {Clauss, Pierre-Nicolas and Gustedt, Jens}, + AUTHOR = {Clauss, P.-N. and Gustedt, J.}, TITLE = {{Iterative Computations with Ordered Read-Write Locks}}, JOURNAL = {{Journal of Parallel and Distributed Computing}}, PUBLISHER = {Elsevier}, @@ -277,7 +277,7 @@ address = {Los Alamitos, CA, USA}, TITLE = {The par{X}{X}{L} Environment: Scalable Fine Grained Development for Large Coarse Grained Platforms}, X-PAYS = {IT}, X-INTERNATIONAL-AUDIENCE = {yes}, - AUTHOR = {Gustedt, Jens AND Vialle, Stéphane AND De Vivo, Amelia}, + AUTHOR = {Gustedt, J. AND Vialle, S. 
AND De Vivo, A.}, BOOKTITLE = {PARA 06}, LONG-BOOKTITLE = {PARA 06: Worshop on state-of-the-art in scientific and parallel computing }, EDITOR = {Bo K{\aa}gstr{\"o}m and others}, diff --git a/BookGPU/Chapters/chapter6/ch6.tex b/BookGPU/Chapters/chapter6/ch6.tex index 0cf6f84..676da7b 100755 --- a/BookGPU/Chapters/chapter6/ch6.tex +++ b/BookGPU/Chapters/chapter6/ch6.tex @@ -9,7 +9,6 @@ \newcommand{\Equ}[1]{(\ref{#1})} \def\Reals{\mathbb{R}} -\definecolor{shadecolor}{rgb}{0.95,0.95,0.95} %\newenvironment{Algo}{\vspace{-1em}\begin{center}\begin{minipage}[h]{0.95\columnwidth}\begin{shaded}\begin{tabbing}% % \hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\=\hspace{3mm}\= \kill} % % { \end{tabbing}\vspace{-1em}\end{shaded}\end{minipage}\end{center}\vspace{-1em}} diff --git a/BookGPU/Chapters/chapter8/ch8.tex b/BookGPU/Chapters/chapter8/ch8.tex index 101a552..a2529b1 100644 --- a/BookGPU/Chapters/chapter8/ch8.tex +++ b/BookGPU/Chapters/chapter8/ch8.tex @@ -499,7 +499,6 @@ To reduce the computation time cost of the term $\min\limits_{(i,j)\in \jmath^2, MM & $m \times (m-1)$ & $m \times (m-1)$ \\ \hline \end{tabular} -\vspace{0.5cm} \caption{The different data structures of the $LB$ algorithm and their associated complexities in memory size and numbers of accesses. The parameters $n$, $m$ and $n'$ designate respectively the total number of jobs, the total number of machines and the number of remaining jobs to be scheduled for the sub-problems the lower bound is being computed.} \label{ch8:tabMemComplex} \end{table} @@ -518,7 +517,7 @@ The focus is put on the shared memory which is a key enabler for many high-perfo In order to achieve further performances, we also take care of adequately use the global memory by judiciously configuring the L1 cache which greatly enables improving performance over direct access to global memory. 
Indeed, the GPU device we are using in our experiments is based on the NVIDIA Fermi architecture which introduced two new hierarchies of memories (L1 $/$ L2 cache) compared to older architectures. -\begin{table*} +\begin{table} \centering \footnotesize \begin{tabular}{|r|r|r|r|r|r|} @@ -535,10 +534,9 @@ $50 \times 20$ & 9.500 (9.5KB) & 9.500 (19KB) & 1.000 (1KB) & 20 (0.04KB) & 380 $20 \times 20$ & 3.800 (3.8KB) & 3.800 (7.6KB) & 400 (0.4KB) & 20 (0.04KB) & 380 (0.76KB) \\ \hline \end{tabular} -\vspace{0.5cm} \caption{The sizes of each data structure for the different experimented problem instances. The sizes are given in number of elements and in bytes (between brackets).} \label{ch8:tabMemSizes} -\end{table*} +\end{table} \vspace{0.2cm} @@ -610,7 +608,7 @@ Using the approach defined in \cite{ch8:Mezmaz_2007}, it is possible to obtain a Table \ref{ch8:instance_time} gives, for each instance according to its number of jobs and its number of machines, the used resolution time with a sequential B\&B. For example, the sequential resolution time of each instance defined with $20$ jobs and $20$ machines is approximately 10 minutes. Of course, the computation time of the lower bound of a sub-problem defined with $20$ jobs and $20$ machines is on average greater than the computation time of the lower bound of a sub-problem defined with $50$ jobs and $20$ machines. Therefore, as shown in this table, the sequential resolution time increases with the size of the instance in order to be sure that the number of sub-problems explored is significant for all instances. -\begin{table*} +\begin{table} \setlength{\tabcolsep}{0.2cm} \renewcommand{\arraystretch}{1.2} \centering @@ -622,10 +620,9 @@ Instance (No. of jobs x No. 
of machines) & 20$\times$20 & 50$\times$20 & 100$\ti Sequential resolution time (minutes) & 10 & 50 & 150 & 300 \\ \hline \end{tabular} -\vspace{0.3cm} \caption{The sequential resolution time of each instance according to its number of jobs and machines} \label{ch8:instance_time} -\end{table*} +\end{table} \subsection{Performance impact of GPU-based parallelism} @@ -637,7 +634,7 @@ The results obtained with the GPU-PTE-BB approach (see Table \ref{ch8:ParaGPU1}) The results show also that the parallel efficiency decreases with the size of the problem instance. For a fixed number of machines (here 20 machines) and a fixed pool size, the obtained speedup decline accordingly with the number of jobs. For instance for a pool size of 262144, the acceleration factor obtained with 200 jobs (13.4) while it is (40.50) for the instances with 20 jobs. This behavior is mainly due to the overhead induced by the transfer of the pool of resulting sub-problems between the CPU and the GPU. For example, for the instances with 200 jobs the size of the pool to exchange between the CPU and the GPU is ten times bigger than the size of the pool for the instances with 20 jobs. -\begin{table*} +\begin{table} \setlength{\tabcolsep}{0.2cm} \renewcommand{\arraystretch}{1.2} \centering @@ -662,16 +659,15 @@ $20 \times $20 & 6.43 & 11.43 & 20.14 & 27.78 & 30.12 & 35.74 & 40.50\\ % \hline % \hline \end{tabular} -\vspace{0.3cm} \caption{Speedups for different problem instances and pool sizes with the GPU-PTE-BB approach.} \label{ch8:ParaGPU1} -\end{table*} +\end{table} The results obtained with the GPU-PEB-BB approach (see Table \ref{ch8:ParaGPU2}) show that evaluating in parallel the bounds of a selected pool, allow to significantly speedup the execution of the B\&B. Indeed, an acceleration factor up to 71.69 is obtained for the 200 $\times$ 20 problem instances using a pool of 262144 sub-problems. The results show also that the parallel efficiency grows with the size of the problem instance. 
For a fixed number of machines (here 20 machines) and a fixed pool size, the obtained speedup grows accordingly with the number of jobs. For instance for a pool size of 262144, the acceleration factor obtained with 200 jobs (71.69) is almost the double of the one obtained with 20 jobs (38.40). As far the pool size tuning is considered, we could notice that this parameter depends strongly on the problem instance being solved. Indeed, while the best acceleration is obtained with a pool size of 8192 sub-problems for the instances 50 $\times$ 20 and 20 $\times$ 20, the best speedups are obtained with a pool size of 262144 sub-problems with the instances 200 $\times$ 20 and 100 $\times$ 20.\\ -\begin{table*} +\begin{table} \setlength{\tabcolsep}{0.2cm} \renewcommand{\arraystretch}{1.2} \centering @@ -696,10 +692,9 @@ $20 \times $20 & 38.74 & \textbf{46.47} & 45.37 & 41.92 & 39.55 & 38.90 & 38.40\ % \hline % \hline \end{tabular} -\vspace{0.3cm} \caption{Speedups for different problem instances and pool sizes with the GPU-PEB-BB approach.} \label{ch8:ParaGPU2} -\end{table*} +\end{table} Compared to the parallel tree exploration-based GPU-accelerated B\&B approach, the parallel evaluation of bounds approach is by far much more efficient wherever the instance is. For example, while the GPU-PEB-BB approach reaches speedup of $\times$71.69 for the instance with 200 jobs on 20 machines, a speedup of a $\times$13.4 is measured with the parallel tree exploration-based approach which corresponds to an acceleration of $\times$5.56 . Moreover, on the contrary to the GPU-PEB-BB approach, in the GPU-PTE-BB the speedups decrease when the problem instance becomes higher. Remember here that while in the GPU-PEB-BB approach all threads evaluate only one node each whatever the permutation size is. In the GPU-PTE-BB, each thread branches all the children of its assigned parent node. 
Therefore, the bigger the size of the permutation is, the bigger the amount of work performed by each thread is and the bigger the difference between the workload is. Indeed, let us suppose that for the instance with $200$ jobs, the thread $0$ handles a node from the level $2$ of the tree and the thread $100$ handles a node from the level $170$ of the tree. In this case, the thread $0$ generates and evaluates $198$ nodes while the thread $100$ decomposes and bounds only $30$ nodes. The problem in this example is that the kernel execution would last until the thread $0$ finishes its work while the other threads might have ended their works and stayed idle. @@ -708,7 +703,7 @@ Compared to the parallel tree exploration-based GPU-accelerated B\&B approach, t The objective of this section is to demonstrate that the thread divergence reduction mechanisms we propose has an impact on the performance of the GPU accelerated B\&B and to evaluate how this impact is significant. In the following, the reported results are obtained with the GPU-accelerated B\&B based on the parallel evaluation of bounds. -\begin{table*}[!h] +\begin{table}[!h] \setlength{\tabcolsep}{0.2cm} \renewcommand{\arraystretch}{1.2} \centering @@ -734,10 +729,9 @@ $20 \times $20 & 41.71 & \textbf{50.28} & 49.19 & 45.90 & 42.03 & 41.80 & 41.65\ % \hline % \hline \end{tabular} -\vspace{0.3cm} \caption{Speedups for different instances and pool sizes using thread divergence management.} \label{ch8:ParaDivergence} -\end{table*} +\end{table} Table~\ref{ch8:ParaDivergence} shows the experimental results obtained using the sorting process and the refactoring approach presented in Section \ref{ch8:ThreadDivergence}. Results show that the proposed optimizations emphasize the GPU acceleration reported in Table~\ref{ch8:ParaGPU2} and obtained without thread divergence reduction. 
For example, for the instances of 200 jobs over 20 machines and a pool size of 262144, the average reported speedup is 77.46 while the average acceleration factor obtained without thread divergence management for the same instances and the same pool size is 71.69 which corresponds to an improvement of 7.68\%. Such considerable but not outstanding improvement is predictable, as claimed in \cite{ch8:Han}, since the factorized part of the branches in the FSP lower bound is very small. @@ -747,7 +741,7 @@ The objective of the experimental study presented in this section is to find the Table~\ref{ch8:PTM-on-SM} reports the speedups obtained for the first experimented scenario where only the matrix $PTM$ is put on the shared memory. Results show that the speedup grows on average with the growing of the pool size in the same way as in Table~\ref{ch8:ParaDivergence}. For the largest problem instance and pool size, putting the PTM matrix on the shared memory improves the speedups up to ($14\%$) compared to those obtained when $PTM$ is on global memory reaching an acceleration of $\times 90.51$ for the problem instances $200 \times 20$ and a pool size of $262144$ sub-problems . -\begin{table*} +\begin{table} \centering \footnotesize \begin{tabular}{|r|r|r|r|r|r|r|r|} @@ -771,14 +765,13 @@ $20 \times $20 & 41.94 & \textbf{60.10} & 48.28 & 39.86 & 39.61 & 38.93 & 37.79 % \hline % \hline \end{tabular} -\vspace{0.3cm} \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. $PTM$ is placed in shared memory and all others are placed in global memory.} \label{ch8:PTM-on-SM} -\end{table*} +\end{table} Table~\ref{ch8:JM-on-SM} reports the behavior of the speedup averaged on the different problem instances (sizes) as a function of the pool size for the scenario where the Johnson's matrix is put on the shared memory. 
Results show that putting the $JM$ matrix on the shared matrix improves more the performances comparing to the first scenario where $PTM$ is put on the shared memory. Indeed, according to Table~\ref{ch8:tabMemComplex}, matrix $JM$ is accessed more frequently than matrix $PTM$. Putting $JM$ matrix on the shared memory allows accelerations up to $\times 97.83$ for the problem instances $200 \times 20$. -\begin{table*} +\begin{table} \centering \footnotesize \begin{tabular}{|r|r|r|r|r|r|r|r|} @@ -802,15 +795,14 @@ $20 \times $20 & 49.00 & \textbf{60.25} & 55.50 & 45.88 & 44.47 & 43.11 & 42.82 % \hline % \hline \end{tabular} -\vspace{0.3cm} \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. $JM$ is placed in shared memory and all others are placed in global memory.} \label{ch8:JM-on-SM} -\end{table*} +\end{table} Table~\ref{ch8:JM-PTM-on-SM} reports the behavior of the average speedup for the different problem instances (sizes) with $20$ machines for the data placement scenario where both $PTM$ and $JM$ are put on shared memory. According to the underlying Table, the scenarios~(3) ($JM$ together or without $PTM$ in shared memory) is clearly better than the scenarii~(1)and~(2) (respectively $PTM$ in shared memory and $JM$ in shared memory) whatever is the problem instance (size). -\begin{table*} +\begin{table} \centering \footnotesize \begin{tabular}{|r|r|r|r|r|r|r|r|} @@ -834,11 +826,9 @@ $20 \times $20 & 53.64 & \textbf{61.47} & 59.55 & 51.39 & 47.40 & 46.53 & 46.37\ % \hline % \hline \end{tabular} -\vspace{0.3cm} - \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. -$PTM$ and $JM$ are placed together in shared memory and all others are placed in global memory.} + \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. 
$PTM$ and $JM$ are placed together in shared memory and all others are placed in global memory.} \label{ch8:JM-PTM-on-SM} -\end{table*} +\end{table} By carefully analyzing each of the scenarii of data placement on the memory hierarchies of the GPU, the recommendation is to put in the shared memory the Johnson's and the processing time matrices ($JM$ and $PTM$) if they fit in together. Otherwise, the whole or a part of the Johnson's matrix has to be put in priority in the shared memory. The other data structures are mapped to the global memory. -- 2.39.5 From be4f0c2285aefdeaab115857dd8d77d7aabe6c25 Mon Sep 17 00:00:00 2001 From: couturie Date: Sun, 5 May 2013 19:44:31 +0200 Subject: [PATCH 08/16] new --- BookGPU/Chapters/chapter2/biblio.bib | 2 +- BookGPU/Chapters/chapter3/biblio3.bib | 18 +++++++++--------- BookGPU/Chapters/chapter4/biblio4.bib | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/BookGPU/Chapters/chapter2/biblio.bib b/BookGPU/Chapters/chapter2/biblio.bib index d577ae8..8b3a959 100644 --- a/BookGPU/Chapters/chapter2/biblio.bib +++ b/BookGPU/Chapters/chapter2/biblio.bib @@ -13,7 +13,7 @@ @Article{ch2:journals/ijhpca/Dongarra02, title = "Basic Linear Algebra Subprograms Technical (Blast) Forum Standard (1)", - author = "Jack Dongarra", + author = "J. Dongarra", journal = "IJHPCA", year = "2002", number = "1", diff --git a/BookGPU/Chapters/chapter3/biblio3.bib b/BookGPU/Chapters/chapter3/biblio3.bib index 56fda5a..219118b 100755 --- a/BookGPU/Chapters/chapter3/biblio3.bib +++ b/BookGPU/Chapters/chapter3/biblio3.bib @@ -258,7 +258,7 @@ } @inproceedings{mcguire2008median, - author = {Morgan Mc{G}uire}, + author = {M. Mc{G}uire}, booktitle = {ShaderX6}, month = {February}, title = {A fast, small-radius GPU median filter}, @@ -327,7 +327,7 @@ } @inproceedings{chen09, - author = {Wei Chen and M. Beister and Y. Kyriakou and M. Kachelries}, + author = {W. Chen and M. Beister and Y. Kyriakou and M. 
Kachelries}, booktitle = {Nuclear Science Symposium Conference Record (NSS/MIC), 2009 IEEE}, doi = {10.1109/NSSMIC.2009.5402323}, issn = {1095-7863}, @@ -339,7 +339,7 @@ } @inproceedings{sanchezICASSP12, - author = {Ricardo M. Sanchez and Paul A. Rodriguez}, + author = {R. M. Sanchez and P. A. Rodriguez}, booktitle = {Acoustics, Speech and Signal Processing (ICASSP), 2012 IEEE International Conference on}, doi = {10.1109/ICASSP.2012.6288187}, issn = {1520-6149}, @@ -362,7 +362,7 @@ doi={10.1109/CIT.2011.60}, ISSN={},} @book{tukey77, -author = {Tukey, John Wilder}, +author = {Tukey, J. W.}, isbn = {0-201-07616-0}, publisher = {Addison-Wesley}, title = {Exploratory Data Analysis}, @@ -383,7 +383,7 @@ ISSN={1095-7863},} @article{Weiss:2006:FMB:1141911.1141918, - author = {Weiss, Ben}, + author = {Weiss, B}, title = {Fast median and bilateral filtering}, journal = {ACM Trans. Graph.}, issue_date = {July 2006}, @@ -403,7 +403,7 @@ ISSN={1095-7863},} } @inproceedings{Weiss:2006:FMB:1179352.1141918, - author = {Weiss, Ben}, + author = {Weiss, B}, title = {Fast median and bilateral filtering}, booktitle = {ACM SIGGRAPH 2006 Papers}, series = {SIGGRAPH '06}, @@ -421,7 +421,7 @@ ISSN={1095-7863},} } @book{Huang:1981:TDS:539567, - author = {Huang, Thomas S.}, + author = {Huang, T. S.}, title = {Two-Dimensional Digital Signal Processing II: Transforms and Median Filters}, year = {1981}, isbn = {0387103597}, @@ -462,7 +462,7 @@ doi={10.1109/TIP.2007.902329}, ISSN={1057-7149},} @ARTICLE{Wu2010, -author={Yong Wu and Mansoureh Eghbali and Jimmy Ou and Rong Lu and Ligia Toro and Enrico Stefani}, +author={Y. Wu and M. Eghbali and J. Ou and R. Lu and L. Toro and E. 
Stefani}, journal={Biophysical Journal}, title={Quantitative determination of spatial protein-protein correlations in fluorescence confocal microscopy.}, year={2010}, @@ -481,7 +481,7 @@ title={Highly Parallelable Bidimensional Median Filter for Modern Parallel Progr url={http://dx.doi.org/10.1007/s11265-012-0715-1}, publisher={Springer US}, keywords={Nonlinear filters; Parallel algorithms; Image processing}, -author={Sánchez, RicardoM. and Rodríguez, PaulA.}, +author={Sánchez, R. M. and Rodríguez, P. A.}, pages={1-15}, language={English} } diff --git a/BookGPU/Chapters/chapter4/biblio4.bib b/BookGPU/Chapters/chapter4/biblio4.bib index 14e5ef7..463e4e4 100644 --- a/BookGPU/Chapters/chapter4/biblio4.bib +++ b/BookGPU/Chapters/chapter4/biblio4.bib @@ -1,6 +1,6 @@ @unpublished{convolutionsoup, title = {Convolution Soup}, - author = {Stam, Joe}, + author = {Stam, J.}, abstract = {Graphics processors can be easily programmed to provide significant acceleration in many common parallel tasks. However, with additional architecture knowledge and understanding of optimization strategies, a savvy programmer can unleash the full potential of the GPU's massive memory bandwidth and ensure the processing resources are utilized to their fullest extent. In this talk, we'll explore several different approaches to a very simple but ubiquitous image processing algorithm, the convolution. A naive approach shows the detrimental impact of poorly written code, a simple approach achieves decent results with little effort or code complexity, and a few highly optimized techniques realize the GPUs full power for the most demanding tasks. 
The techniques explored in this simple but illustrative example will serve as a base for understanding the optimization strategies to apply towards more complex algorithms.}, year = {2010}, month ={8}, -- 2.39.5 From 177d75ae3d1a1061fb9caa43de9afca760ca0d1a Mon Sep 17 00:00:00 2001 From: couturie Date: Mon, 6 May 2013 22:14:50 +0200 Subject: [PATCH 09/16] correct ch 10 --- BookGPU/BookGPU.tex | 19 ++++-- BookGPU/Chapters/chapter10/biblio.bib | 86 ++++++++++++++++++++++----- BookGPU/Chapters/chapter10/ch10.tex | 17 ++++-- BookGPU/Chapters/chapter3/ch3.aux | 66 ++++++++++---------- 4 files changed, 130 insertions(+), 58 deletions(-) diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index e011a89..fae7a1b 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -1,6 +1,6 @@ \documentclass[sunil1,ChapterTOCs]{sunil} \usepackage[utf8]{inputenc} -\usepackage{amssymb} +%\usepackage{amssymb} %usepackage{amsbsy} \usepackage{amsfonts,amssymb} \usepackage{amsmath} @@ -21,7 +21,7 @@ \usepackage{epstopdf} \usepackage{url} \usepackage{multirow} -\usepackage{layouts} +%\usepackage{layouts} \usepackage{comment} \usepackage{xcolor} \usepackage{upquote} @@ -36,6 +36,7 @@ \usepackage{moreverb} \usepackage{commath} \usepackage{numprint} +\usepackage{placeins} %\usepackage{lmodern} %% \usepackage{listings} %% \usepackage{subfigure} @@ -68,9 +69,14 @@ \newcommand{\mbf}{\mathbf} \newcommand{\mc}{\mathcal} -%newcommand{\bs}{\boldsymbol} \newcommand{\N}{\mathcal{N}} \newcommand{\B}{\mathcal{B}} +\newcommand{\bs}{\boldsymbol} + + + +\let\OLDlstinputlisting\lstinputlisting +\renewcommand{\lstinputlisting}[2][]{\FloatBarrier\OLDlstinputlisting[#1]{#2}\FloatBarrier} \DeclareMathOperator*{\argmax}{arg\,max} @@ -147,11 +153,14 @@ \makeindex -%\includeonly{Chapters/chapter5/ch5} +%\includeonly{Chapters/chapter10/ch10} \begin{document} + + + %%bibliography style to adopt \bibliographyunit[\chapter] \defaultbibliographystyle{plain} @@ -189,7 +198,7 @@ \part{Optimization} 
\include{Chapters/chapter8/ch8} \include{Chapters/chapter9/ch9} -\include{Chapters/chapter10/ch10} %revoir ce chapitre +\include{Chapters/chapter10/ch10} \part{Numerical applications} \include{Chapters/chapter7/ch7} diff --git a/BookGPU/Chapters/chapter10/biblio.bib b/BookGPU/Chapters/chapter10/biblio.bib index f5bc1eb..942cdfe 100644 --- a/BookGPU/Chapters/chapter10/biblio.bib +++ b/BookGPU/Chapters/chapter10/biblio.bib @@ -35,7 +35,7 @@ pages = {271-281} } @Article{GILL1989, -author = {P.E. Gill et al.}, +author = {P. E. Gill et al.}, title = {A pratical anti-cycling procedure for linearly constrained optimization}, journal = {Mathematical Programming}, year = {1989}, @@ -46,8 +46,8 @@ pages = {437-474} @Misc{VOLKOV2010, author = {V. Volkov}, title = {Better Performance at Lower Occupancy}, -howpublished = {http://www.cs.berkeley.edu/~volkov/}, -year = {2010} +note = {Presentation at the {GPU} {T}echnology {C}onference 2010 (GTC 2010)}, +howpublished = {http://www.eecs.berkeley.edu/$\sim$volkov}, } @TechReport{MODEL, @@ -59,9 +59,9 @@ year = {2009} @Misc{NETBLIB, author = {J. Dongarra and E. Grosse}, -title = {The {NETLIB} Repository}, +title = {The {NETLIB} {R}epository at {UTK} and {ORNL}}, +institution = {The University of Tennessee and Oak Ridge National Lab}, howpublished = {http://www.netlib.org}, -year = {2010} } @Misc{APMTECH, @@ -108,7 +108,7 @@ year = {2007} } @Book{HILLIER, -author = {F.S Hillier and G.J. Lierberman}, +author = {F. S Hillier and G. J. Lierberman}, title = {Introduction to Operations Research}, year = {2010}, edition = {9} @@ -144,12 +144,19 @@ year = {1980} author = {X. Meyer}, title = {Etude et implmentation de l'algorithme standard du simplexe sur {GPU}s}, school = {University of {Geneva}}, -year = {2011}, +year = {2011} +} + +@InProceedings{MEYER2011, +author = {X. Meyer{,} P. Albuquerque and B. Chopard}, +title = {A multi-{GPU} implementation and performance model for the standard simplex method}, +booktitle = {Proc. 
of the 1st Int'l Symp. and 10th Balkan Conf. on Operational Research, BalcOR}, +year = {2011} } @Article{LALAMI11, author = {M. E. Lalami{,} D. El-Baz and V. Boyer}, -title = {Multi GPU Implementation of the Simplex Algorithm}, +title = {Multi-{GPU} Implementation of the Simplex Algorithm}, journal = {IEEE International Conference on High Performance Computing and Communications}, year = {2011}, pages = {179-186} @@ -157,7 +164,7 @@ pages = {179-186} @Article{JUNG08, author = {J. H. Jung and D. P. O'Leary}, -title = {Implementing an interior point method for linear programs on a CPU-GPU system}, +title = {Implementing an interior point method for linear programs on a {CPU-GPU} system}, journal = {Electronic Transactions on Numerical Analysis}, year = {2008}, pages = {174-189}, @@ -166,7 +173,7 @@ volume = {28} @Article{ELSTER09, author = {D. G. Spampinato{,} A. C. Elster and T. Natvig}, -title = {Modeling Multi-GPU Systems in Parallel Computing: From Multicores and GPU's to Petascale}, +title = {Modeling Multi-{GPU} Systems in Parallel Computing: From Multicores and {GPU}'s to Petascale}, journal = {Advances in Parallel Computing}, year = {2010}, pages = {562-569}, @@ -175,7 +182,9 @@ volume = {19} @book{dantzig1953product, title={The Product Form for the Inverse in the Simplex Method}, - author={Dantzig, G. B. and Orchard-Hays, W. and RAND Corp. Santa Monica Calif.}, + author={G. B. Dantzig and W. Orchard-Hays}, + institution={RAND Corp.}, + address={Santa Monica, California}, url={http://books.google.ch/books?id=XLuttgAACAAJ}, year={1953}, publisher={Defense Technical Information Center} @@ -184,7 +193,7 @@ volume = {19} @article{Marchand:2002:CPI:772382.772395, - author = {Marchand, Hugues and Martin, Alexander and Weismantel, Robert and Wolsey, Laurence}, + author = {Marchand, H. and Martin, A. and Weismantel, R. and Wolsey, L.}, title = {Cutting planes in integer and mixed integer programming}, journal = {Discrete Appl. 
Math.}, issue_date = {15 November 2002}, @@ -224,10 +233,10 @@ journal={Annals of Operations Research}, volume={43}, issue={1}, doi={10.1007/BF02025534}, -title={A fast LU update for linear programming}, +title={A fast {LU} update for linear programming}, url={http://dx.doi.org/10.1007/BF02025534}, publisher={Baltzer Science Publishers, Baarn/Kluwer Academic Publishers}, -author={Suhl, L. M. and Suhl, U. H.}, +author={L. M. Suhl and U. H. Suhl}, pages={33-47}, language={English} } @@ -254,13 +263,13 @@ doi={10.1007/BF01584074}, title={Experiments in mixed-integer linear programming}, url={http://dx.doi.org/10.1007/BF01584074}, publisher={Springer-Verlag}, -author={Benichou, M. and Gauthier, J. M. and Girodet, P. and Hentges, G. and Ribiere, G. and Vincent, O.}, +author={M. Benichou et al.}, pages={76-94}, language={English} } @article{Achterberg05, - author = {Achterberg, T. and Koch, T. and Martin, A.}, + author = {T. Achterberg{,} T. Koch and A. Martin}, title = {Branching rules revisited}, journal = {Oper. Res. Lett.}, issue_date = {January, 2005}, @@ -297,3 +306,48 @@ volume = {20}, number = {5}, pages = {736-773} } + +@article {LEMKE54, +author = {Lemke, C. E.}, +title = {The dual method of solving the linear programming problem}, +journal = {Naval Research Logistics Quarterly}, +volume = {1}, +number = {1}, +publisher = {Wiley Subscription Services, Inc., A Wiley Company}, +issn = {1931-9193}, +url = {http://dx.doi.org/10.1002/nav.3800010107}, +doi = {10.1002/nav.3800010107}, +pages = {36--47}, +year = {1954} +} + + +@article{BARTELS69, + author = {R. H. Bartels and G. H. Golub}, + title = {The simplex method of linear programming using {LU} decomposition}, + journal = {Commun. 
ACM}, + issue_date = {May 1969}, + volume = {12}, + number = {5}, + month = may, + year = {1969}, + issn = {0001-0782}, + pages = {266--268}, + numpages = {3}, + url = {http://doi.acm.org/10.1145/362946.362974}, + doi = {10.1145/362946.362974}, + acmid = {362974}, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {LU decomposition, computational stability, linear programming, round-off errors, simplex method} +} + +@article{BRAYTON70, + title={Some results on sparse matrices}, + author={R. K. Brayton{,} F. G. Gustavson and R. A. Willoughby}, + journal={Mathematics of Computation}, + volume={24}, + number={112}, + pages={937--954}, + year={1970} +} diff --git a/BookGPU/Chapters/chapter10/ch10.tex b/BookGPU/Chapters/chapter10/ch10.tex index 68ecaee..c47af05 100644 --- a/BookGPU/Chapters/chapter10/ch10.tex +++ b/BookGPU/Chapters/chapter10/ch10.tex @@ -5,7 +5,7 @@ \chapterauthor{Bastien Chopard}{Department of Computer Science, University of Geneva} %\chapter{Linear programming on a GPU: a study case based on the simplex method and the branch-cut-and bound algorithm} -\chapter{Linear programming on a GPU: a~study~case} +\chapter{Linear programming on a GPU: a case study} \section{Introduction} \label{chXXX:sec:intro} The simplex method is a well-known optimization algorithm for solving linear programming (LP) models in the field of operations research. It is part of software often employed by businesses for finding solutions to problems such as airline scheduling problems. The original standard simplex method was proposed by Dantzig in 1947~\cite{VCLP}. A more efficient method named the revised simplex, was later developed. Nowadays its sequential implementation can be found in almost all commercial LP solvers. But the always increasing complexity and size of LP problems from the industry, drives the demand for more computational power. Indeed, current implementations of the revised simplex strive to produce the expected results, if any. 
In this context, parallelization is the natural idea to investigate~\cite{HALL}. Already in 1996, Thomadakis et al.~\cite{THOMAD} implemented the standard method on a massively parallel computer and obtained an increase in performances when solving dense or large problems. @@ -117,19 +117,22 @@ Correspondingly, the columns with index $\ell$ and $e$ are exchanged between $\m We then update the constraints matrix $\mbf{A}$, the bound $\mbf{b}$ and the cost $\mbf{c}$ using Gaussian elimination. More precisely, denoting $\mbf{\tilde{I}_m}$, $\mbf{\tilde{A}_\N}$, $\mbf{\tilde{c}_\B}$ and $\mbf{\tilde{c}_\N}$ the resulting elements after the exchange, Gaussian elimination then transforms the tableau +\begin{center} \begin{tabular}{|c|c||c|} $\mbf{\tilde{A}_\N}$ & $\mbf{\tilde{I}_m}$ & $\mbf{b}$ \\ \hline $\mbf{\tilde{c}^T_\N}$ & $\mbf{\tilde{c}^T_\B}$ & $z$ \end{tabular} +\end{center} +\noindent into a tableau with updated values for $\mbf{A_\N}$, $\mbf{c_\N}$ and $\mbf{b}$ - +\begin{center} \begin{tabular}{|c|c||c|} $\mbf{A_\N}$ & $\mbf{I_m}$ & $\mbf{b}$ \\ \hline $\mbf{c^T_\N}$ & $\mbf{c^T_\B}$ & $z-tc_e$ \end{tabular} - +\end{center} The latter is obtained by first dividing the $\ell$-th row by $a_{\ell e}$; the resulting row multiplied by $a_{ie}$ ($i\not=\ell$), resp. by $c_e$, is then added to the $i$-th row, resp. the last row. Hence, $\mbf{c_\B=0}$. 
@@ -590,7 +593,7 @@ The main steps found in the previous implementation are again present in the rev \label{chXXX:algo:simplex-rev} \textit{// Find entering variable $x_e,\, e\in \B$}\; $\bs\tau \leftarrow \mbf{(B^{-1})^T}\mbf{c_\B} $ \hspace*{5mm} \textit{// cublasDgemv}\; - $\bs\upsilon \leftarrow \mbf{c_\N} - \mbf{N^T} \bs\tau $ \hspace*{4mm} \textit{// cublasDgemv} + $\bs\upsilon \leftarrow \mbf{c_\N} - \mbf{N^T} \bs\tau $ \hspace*{4mm} \textit{// cublasDgemv}\; $e \leftarrow argmax \left( \bs\upsilon / \sqrt{\bs\gamma} \right) $\; \If { $e < 0$ } {\Return \texttt{optima\_found} } \textit{// Find leaving variable $x_\ell,\, \ell\in \N$} \; @@ -887,6 +890,7 @@ We used problems from the \textit{NETLIB} repository to illustrate the improveme The test environnement is composed of a CPU server (2 Intel Xeon X5650, 2.67GHz, with 96GB DDR3 RAM) and a GPU computing system (NVIDIA tesla M2090) with the 4.2 CUDA Toolkit. This system connects 4 GPUs to the server. Each GPU has 6GB GDDR5 graphics memory and 512 cores (1.3GHz). \begin{table}[!h] +\begin{center} \begin{tabular}{|l|r|r|r|r|} \hline \multicolumn{1}{|l|}{Problem name} & \multicolumn{1}{l|}{Rows} & \multicolumn{1}{l|}{Cols} & \multicolumn{1}{l|}{NNZ} & \multicolumn{1}{l|}{C-to-R} \\ \hline @@ -900,6 +904,7 @@ scagr25.mps & 472 & 500 & 2029 & 1.1\\ \hline \end{tabular} \caption{NETLIB problems solved in less than 1 second} \label{chXXX:tab:small} +\end{center} \end{table} Let us begin by discussing the performances on problems solved in less than one second. The name, size, number of non-zero elements (NNZ) and columns to rows ratio (C-to-R) of each problems are reported in table~\ref{chXXX:tab:small}. The performances shown in figure~\ref{chXXX:fig:FastSolve} corroborate our previous observations. On these problems the \texttt{Revised-sparse} method doesn't outperform the \texttt{Standard} one. 
This can be explained by two factors: the added communications (kernel calls) for the revised method, and the small size and density of the problems. It is likely that sparse operations on a GPU require larger amounts of data to become more efficient than their dense counterpart. @@ -913,6 +918,7 @@ Let us begin by discussing the performances on problems solved in less than one The problems shown in table~\ref{chXXX:tab:medium} are solved in less than 4 seconds. As we can see in figure~\ref{chXXX:fig:NormalSolve}, the expected trend for the \texttt{Revised-base} and the \texttt{Revised-opti} methods is now confirmed. Let us presently focus on the \texttt{Standard} and \texttt{Revised-sparse} methods. Some of the problems, in particular \textit{czprob.mps} and \textit{nesm.mps}, are solved in a comparable amount of time. The performance gain of the \texttt{Revised-sparse} is related to the C-to-R (columns to rows) ratio of these problems, displaying respectively a 3.8 and a 4.4 ratio. \begin{table}[!h] +\begin{center} \begin{tabular}{|l|r|r|r|r|} \hline \multicolumn{1}{|l|}{Problem name} & \multicolumn{1}{l|}{Rows} & \multicolumn{1}{l|}{Cols} & \multicolumn{1}{l|}{NNZ} & \multicolumn{1}{l|}{C-to-R} \\ \hline @@ -927,6 +933,7 @@ perold.mps & 626 & 1376 & 6026 & 1.0\\ \hline \end{tabular} \caption{NETLIB problems solved in the range of 1 to 4 seconds} \label{chXXX:tab:medium} +\end{center} \end{table} \begin{figure}[!h] @@ -939,6 +946,7 @@ perold.mps & 626 & 1376 & 6026 & 1.0\\ \hline Finally, the biggest problems, and slowest to solve, are given in table~\ref{chXXX:tab:big}. A new tendency can be observed in figure~\ref{chXXX:fig:SlowSolve}. The \texttt{Revised-sparse} method is the fastest on most of the problems. The performances are still close between the best two methods on problems having a C-to-R ratio close to 2 like \textit{bnl2.mps}, \textit{pilot.mps} or \textit{greenbeb.mps}. 
However, when this ratio is greater, the \texttt{Revised-sparse} can be nearly twice as fast as the standard method. This is noticeable on \textit{80bau3b.mps} (4.5) and \textit{fit2p.mps} (4.3). Although the C-to-R ratio of \textit{d6cube.mps} (14.9) exceeds the ones previously mentioned, the \texttt{Revised-sparse} method doesn't show an impressive performance, probably due to the small amount of rows and the density of this problem, which does thus not fully benefit from the lower complexity of sparse operations. \begin{table}[!h] +\begin{center} \begin{tabular}{|l|r|r|r|r|} \hline \multicolumn{1}{|l|}{Problem name} & \multicolumn{1}{l|}{Rows} & \multicolumn{1}{l|}{Cols} & \multicolumn{1}{l|}{NNZ} & \multicolumn{1}{l|}{C-to-R} \\ \hline @@ -954,6 +962,7 @@ pilot87.mps & 2031 & 4883 & 73804 & 2.4\\ \hline \end{tabular} \caption{NETLIB problems solved in more than 5 seconds} \label{chXXX:tab:big} +\end{center} \end{table} \begin{figure}[!h] diff --git a/BookGPU/Chapters/chapter3/ch3.aux b/BookGPU/Chapters/chapter3/ch3.aux index 298a450..a5fefc8 100644 --- a/BookGPU/Chapters/chapter3/ch3.aux +++ b/BookGPU/Chapters/chapter3/ch3.aux @@ -44,8 +44,6 @@ \newlabel{fig:sap_examples}{{4.1}{33}} \newlabel{lst:medianGeneric}{{4.1}{34}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {4.1}Generic CUDA kernel achieving median filtering}{34}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Exemple of 5x5 median filtering\relax }}{35}} -\newlabel{fig:median_1}{{4.2}{35}} \newlabel{algoMedianGeneric}{{2}{35}} \newlabel{algoMedianGeneric:memcpyH2D}{{1}{35}} \newlabel{algoMedianGeneric:cptstart}{{3}{35}} @@ -53,6 +51,8 @@ \newlabel{algoMedianGeneric:memcpyD2H}{{7}{35}} \@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{35}} \@writefile{toc}{\contentsline {section}{\numberline {4.3}NVidia GPU tuning recipes}{35}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces 
Exemple of 5x5 median filtering\relax }}{35}} +\newlabel{fig:median_1}{{4.2}{35}} \@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{36}} \newlabel{fig:median_overlap}{{4.3}{36}} \@writefile{lot}{\contentsline {table}{\numberline {4.1}{\ignorespaces Performance results of \texttt {kernel medianR}. \relax }}{36}} @@ -67,48 +67,48 @@ \@writefile{lof}{\contentsline {figure}{\numberline {4.5}{\ignorespaces Forgetful selection with the minimal element register count. Illustration for 3$\times $3 pixel window represented in a row and supposed sorted.\relax }}{39}} \newlabel{fig:forgetful_selection}{{4.5}{39}} \@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{39}} -\newlabel{lst:medianForget1pix3}{{4.3}{40}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. The optimal thread block size is 128 on GTX280 and 256 on C2070.}{40}} \@writefile{lof}{\contentsline {figure}{\numberline {4.6}{\ignorespaces Determination of the Median value by the forgetful selection process, applied to a $3\times 3$ neighborhood window.\relax }}{41}} \newlabel{fig:forgetful3}{{4.6}{41}} +\newlabel{lst:medianForget1pix3}{{4.3}{41}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.3}3$\times $3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method. 
The optimal thread block size is 128 on GTX280 and 256 on C2070.}{41}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{42}} \@writefile{lof}{\contentsline {figure}{\numberline {4.7}{\ignorespaces Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times $3 median kernel.\relax }}{42}} \newlabel{fig:median3_overlap}{{4.7}{42}} -\@writefile{toc}{\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{42}} -\newlabel{lst:medianForget2pix3}{{4.4}{42}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{42}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{43}} -\newlabel{fig:compMedians2}{{4.8}{43}} -\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{44}} +\newlabel{lst:medianForget2pix3}{{4.4}{43}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.4}3$\times $3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.}{43}} +\@writefile{toc}{\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{43}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.8}{\ignorespaces Comparison of pixel throughput on GPU C2070 for the different 3$\times $3 median kernels.\relax }}{44}} +\newlabel{fig:compMedians2}{{4.8}{44}} \newlabel{sec:median5}{{4.5.1}{44}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{44}} -\newlabel{lst:medianForget2pix5}{{4.5}{44}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{44}} 
\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.\relax }}{45}} \newlabel{fig:median5overlap}{{4.9}{45}} \@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.\relax }}{45}} \newlabel{fig:median5overlap}{{4.10}{45}} -\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{46}} -\newlabel{tab:median5comp}{{4.2}{46}} +\newlabel{lst:medianForget2pix5}{{4.5}{46}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{46}} +\@writefile{lot}{\contentsline {table}{\numberline {4.2}{\ignorespaces Performance of various 5$\times $5 median kernel implementations, applied on 4096$\times $4096 pixel image with C2070 GPU card.\relax }}{47}} +\newlabel{tab:median5comp}{{4.2}{47}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.5.2}Fast approximated n$\times $n median filter }{47}} -\newlabel{lst:medianSeparable}{{4.6}{47}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{47}} -\newlabel{img:sap_example_ref}{{4.11(a)}{48}} -\newlabel{sub@img:sap_example_ref}{{(a)}{48}} -\newlabel{img:sap_example_sep_med3}{{4.11(b)}{48}} -\newlabel{sub@img:sap_example_sep_med3}{{(b)}{48}} 
-\newlabel{img:sap_example_sep_med5}{{4.11(c)}{48}} -\newlabel{sub@img:sap_example_sep_med5}{{(c)}{48}} -\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{48}} -\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{48}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{48}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{48}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{48}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{48}} -\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{48}} -\newlabel{fig:sap_examples2}{{4.11}{48}} -\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{49}} -\newlabel{tab:medianSeparable}{{4.3}{49}} -\@writefile{toc}{\contentsline {section}{Bibliography}{50}} +\@writefile{lot}{\contentsline {table}{\numberline {4.3}{\ignorespaces Measured performance of one generic pseudo-separable median kernel applied to 4096$\times $4096 pixel image with various window sizes.\relax }}{48}} +\newlabel{tab:medianSeparable}{{4.3}{48}} +\newlabel{img:sap_example_ref}{{4.11(a)}{49}} +\newlabel{sub@img:sap_example_ref}{{(a)}{49}} +\newlabel{img:sap_example_sep_med3}{{4.11(b)}{49}} +\newlabel{sub@img:sap_example_sep_med3}{{(b)}{49}} +\newlabel{img:sap_example_sep_med5}{{4.11(c)}{49}} +\newlabel{sub@img:sap_example_sep_med5}{{(c)}{49}} +\newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{49}} 
+\newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{49}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{49}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{49}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{49}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{49}} +\@writefile{lof}{\contentsline {subfigure}{\numberline{(d)}{\ignorespaces {Image background estimation by a $55\times 55$ separable smoother}}}{49}} +\newlabel{fig:sap_examples2}{{4.11}{49}} +\newlabel{lst:medianSeparable}{{4.6}{50}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {4.6}generic pseudo median kernel.}{50}} +\@writefile{toc}{\contentsline {section}{Bibliography}{51}} \@setckpt{Chapters/chapter3/ch3}{ \setcounter{page}{52} \setcounter{equation}{0} -- 2.39.5 From 1352d9478d5f22bc50c797e0d28e560f1026e228 Mon Sep 17 00:00:00 2001 From: couturie Date: Thu, 9 May 2013 11:34:26 +0200 Subject: [PATCH 10/16] remove ch3median --- BookGPU/Chapters/chapter3/chmedian.tex | 461 ------------------------- 1 file changed, 461 deletions(-) delete mode 100755 BookGPU/Chapters/chapter3/chmedian.tex diff --git a/BookGPU/Chapters/chapter3/chmedian.tex b/BookGPU/Chapters/chapter3/chmedian.tex deleted file mode 100755 index d9cce0c..0000000 --- a/BookGPU/Chapters/chapter3/chmedian.tex +++ /dev/null @@ -1,461 +0,0 @@ -\documentclass[a4paper, 11pt]{book} - -\usepackage[T1]{fontenc} -\usepackage[latin1]{inputenc} -\usepackage[cyr]{aeguill} -\usepackage[frenchb]{babel} - -\usepackage{amssymb} -\usepackage{amsmath} -\usepackage{graphicx} -\usepackage{color, xcolor} -%\graphicspath{{img/}} -\usepackage{subfigure} 
-\usepackage[caption=false,font=footnotesize]{subfig} -%\usepackage{epsfig} -\usepackage{makeidx} -%\usepackage[sectionbib]{bibunits} -\usepackage{multicol} -\usepackage{multirow} -\usepackage{tabularx} -\usepackage[ruled,lined,linesnumbered]{algorithm2e} -\usepackage{array} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -% Listings -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\usepackage{listings} -\usepackage{courier} -\lstset{ - language=C, - columns=fixed, - basicstyle=\footnotesize\ttfamily, - numbers=left, - firstnumber=1, - numberstyle=\tiny, - stepnumber=5, - numbersep=5pt, - tabsize=3, - extendedchars=true, - breaklines=true, - keywordstyle=\textbf, - frame=single, - % keywordstyle=[1]\textbf, - %identifierstyle=\textbf, - commentstyle=\color{white}\textbf, - stringstyle=\color{white}\ttfamily, - % xleftmargin=17pt, - % framexleftmargin=17pt, - % framexrightmargin=5pt, - % framexbottommargin=4pt, - backgroundcolor=\color{lightgray}, - } - -\usepackage{caption} -%\DeclareCaptionFont{blue}{\color{blue}} -%\captionsetup[lstlisting]{singlelinecheck=false, labelfont={blue}, textfont={blue}} - -%\DeclareCaptionFont{white}{\color{white}} -%\DeclareCaptionFormat{listing}{\colorbox{gray}{\parbox{\textwidth}{\hspace{15pt}#1#2#3}}} -%\captionsetup[lstlisting]{format=listing,labelfont=white,textfont=white, singleline} -%%%%%%%%%%%%%%%%%%%%%%%% Fin Listings %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -\newcommand{\kl}{\includegraphics[scale=0.6]{kernLeft.png}~} -\newcommand{\kr}{\includegraphics[scale=0.6]{kernRight.png}} - -\begin{document} - -\chapter{Setting up the environnement.} -Image processing using a GPU often means using it as a general purpose computing processor, which soon brings up the issue of data transfers, especially when kernel runtime is fast and/or when large data sets are processed. 
-The truth is that, in certain cases, data transfers between GPU and CPU are slower than the actual computation on GPU. -It remains that global runtime can still be faster than similar processes run on CPU. -Therefore, to fully optimize global runtimes, it is important to pay attention to how memory transfers are done. -This leads us to propose, in the following section, an overall code structure to be used with all our kernel examples. - -Obviously, our code originally accepts various image dimensions and can process color images. -However, so as to propose concise and more readable code, we will assume the following limitations: -8 or 16~bit-coded gray-level input images whose dimensions $H\times W$ are multiples of 512 pixels. - -\begin{algorithm} - \SetNlSty{textbf}{}{:} - allocate and populate CPU memory \textbf{h\_in}\;\\ - allocate CPU pinned-memory \textbf{h\_out}\;\\ - allocate GPU global memory \textbf{d\_out}\;\\ - declare GPU texture reference \textbf{tex\_img\_in}\;\\ - allocate GPU array in global memory \textbf{array\_img\_in}\;\\ - bind GPU array \textbf{array\_img\_in} to texture \textbf{tex\_img\_in}\;\\ - copy data from \textbf{h\_in} to \textbf{array\_img\_in}\label{algo:memcopy:H2D}\; \\ - kernel\kl gridDim,blockDim\kr()\tcc*[f]{outputs to d\_out}\label{algo:memcopy:kernel}\;\\ - copy data from \textbf{d\_out} to \textbf{h\_out} \label{algo:memcopy:D2H}\;\\ -\caption{Global memory management on CPU and GPU sides.} -\label{algo:memcopy} - \end{algorithm} - -\section{Data transfers, memory management.} -This section deals with the following issues: -\begin{enumerate} -\item data transfer from CPU memory to GPU global memory: several GPU memory areas are available as destination memory but the 2-D caching mechanism of texture memory, specifically designed for fetching neighboring pixels, is currently the fastest way to fetch gray-level pixel values inside a kernel computation. 
This has led us to choose \textbf{texture memory} as primary GPU memory area for images.
-\item data fetching from GPU global memory to kernel local memory: as said above, we use texture memory. Depending on which process is run, texture data is used either by direct fetching in kernel local memory or through a prefetching in thread block shared memory.
-\item data outputting from kernels to GPU memory: there is actually no alternative to global memory, as kernels can not directly write into texture memory and as copying from texture to CPU memory would not be faster than from simple global memory.
-\item data transfer from GPU global memory to CPU memory: it can be drastically accelerated by use of \textbf{pinned memory}, keeping in mind it has to be used sparingly.
-\end{enumerate}
-Algorithm \ref{algo:memcopy} summarizes all the above considerations and describes how data are handled in our examples. For more information on how to handle the different types of GPU memory, we suggest referring to the CUDA programmer's guide.
-
-At debug stage, for simplicity's sake, we use the \textbf{cutil} library supplied by the NVidia development kit (SDK). Thus, in order to easily implement our examples, we suggest readers download and install the latest NVidia-SDK (ours is SDK4.0), create a new directory \textit{SDK-root-dir/C/src/fast\_kernels} and adapt the generic \textit{Makefile} present in each sub-directory of \textit{SDK-root-dir/C/src/}. Then, only two more files will be enough to have a fully operational environment: \textit{main.cu} and \textit{fast\_kernels.cu}.
-Listings \ref{lst:main1}, \ref{lst:fkern1} and \ref{lst:mkfile} implement all the above considerations minimally, while remaining functional.
-
-The main file of Listing \ref{lst:main1} is a simplified version of our actual main file.
-It has to be noticed that cutil functions \texttt{cutLoadPGMi} and \texttt{cutSavePGMi} only operate on unsigned integer data. 
As data is coded in short integer format for performance reasons, the use of these functions involves casting data after loading and before saving. This may be overcome by use of a different library. Actually, our choice was to modify the above-mentioned cutil functions.
-
-Listing \ref{lst:fkern1} gives a minimal kernel skeleton that will serve as the basis for all other kernels. Lines 5 and 6 determine the coordinates $(i, j)$ of the pixel to be processed. Each pixel is associated with one thread.
-The instruction in line 8 combines writing the output gray-level value into global memory and fetching the input gray-level value from 2-D texture memory.
-The Makefile given in Listing \ref{lst:mkfile} shows how to adapt examples given in SDK.
-
-\lstinputlisting[label={lst:main1},caption=Generic main.cu file used to launch CUDA kernels]{code/mainSkel.cu}
-
-\lstinputlisting[label={lst:fkern1},caption=fast\_kernels.cu file featuring one kernel skeleton]{code/kernSkel.cu}
-
-\lstinputlisting[label={lst:mkfile},caption=Generic Makefile based on those provided by NV SDK]{code/Makefile}
-
-
-\section{Performance measurements}
-As our goal is to design very fast implementations of basic image processing algorithms, we need to make quite accurate time-measurements, within the order of magnitude of $0.01~ms$. Again, the easiest way of doing so is to use the helper functions of the cutil library. As usual, as the durations we are measuring are short and possibly subject to non-negligible variations, a good practice is to measure multiple executions and issue the mean runtime. All time results given in this chapter have been obtained through 1000 calls to each kernel.
-
-Listing \ref{lst:chronos} shows how to use the dedicated cutil functions. Timer declaration and creation only need to be performed once while reset, start and stop can be used as often as necessary. Synchronization is mandatory before stopping the timer (Line 7), to avoid runtime measure being biased. 
-\lstinputlisting[label={lst:chronos},caption=Time measurement technique using cutil functions]{code/exChronos.cu} - -In an attempt to provide relevant speedup values, we either implemented CPU versions of the algorithms studied, or used the values found in existing literature. Still, the large number and diversity of hardware platforms and GPU cards make it impossible to benchmark every possible combination and significant differences may occur between the speedups we announce and those obtained with different devices. As a reference, our developing platform details as follows: - -\begin{itemize} -\item CPU codes run on: - \begin{itemize} - \item Quad Core Xeon E31245 at 3.3GHz-8GByte RAM running Linux kernel 3.2 - \item Quad Core Xeon E5620 at 2.40GHz-12GByte RAM running Linux kernel 2.6.18 - \end{itemize} -\item GPU codes run on: -\begin{itemize} - \item Nvidia Tesla C2070 hosted by a PC QuadCore Xeon E5620 at 2.4GHz-12GByte RAM, running Linux kernel 2.6.18 - \item NVidia GeForce GTX 280 hosted by a PC QuadCore Xeon X5482 at 3.20GHz-4GByte RAM, running Linux kernel 2.6.32 - \end{itemize} -\end{itemize} - -All kernels have also been tested with various image sizes from 512$\times$512 to 4096$\times$4096 pixels. This allows to guess runtime dependancy over image size. - -Last, like many authors, we chose to use the pixel throughput value of each process in Mega Pixels per second (MP/s) as a performance indicator, including data transfers and kernel runtimes. -In order to estimate the potential for improvement of each kernel, a reference throughput measurement, involving identity kernel of Listing \ref{lst:fkern1}, was performed. As this kernel only fetches input values from texture memory and outputs them to global memory without doing any computation, it represents the smallest, thus fastest, possible process and is taken as the reference throughput value (100\%). 
The same measurement was performed on CPU, with a maximum effective pixel throughput of 130~Mpixel per second. On GPU, depending on grid parameters it amounts to 800~MPixels/s on GTX280 and 1300~Mpixels/s on C2070.
-
-
-
-\chapter{Implementing a fast median filter}
-\section{Introduction}
-Median filtering is a well-known method used in a wide range of application frameworks as well as a standalone filter especially for \textit{salt and pepper} denoising. It is able to highly reduce the power of noise without blurring edges too much.
-
-First introduced by Tukey in \cite{tukey77}, it has been widely studied since then, and many researchers have proposed efficient implementations of it, adapted to various hypotheses, architectures and processors.
-Originally, its main drawbacks were its computational complexity, its non linearity and its data-dependent runtime. Several researchers have addressed these issues and designed, for example, efficient histogram-based median filters with predictable runtime \cite{Huang:1981:TDS:539567, Weiss:2006:FMB:1179352.1141918}.
-
-More recently, the advent of GPUs opened new perspectives in terms of image processing performance, and some researchers managed to take advantage of the new graphic capabilities: in that respect, we can cite the Branchless Vectorized Median filter (BVM) \cite{5402362, chen09} which allows very interesting runtimes on CUDA-enabled devices but, as far as we know, the fastest implementation to date is the histogram-based CCDS median filter \cite{6288187}.
-
-Some of the following implementations feature very fast runtimes. They are targeted on Nvidia Tesla GPU (Fermi architecture, compute capability 2.x) but may easily be adapted to other models e.g. those of compute capability 1.3.
-
-The fastest ones are based on one efficient parallel implementation of the BVM algorithm described in \cite{mcguire2008median}, improving its performance through fine tuning of its implementation. 
-
-\section{Median filtering}
-\subsection{Basic principles}
-Designing a 2-D median filter basically consists in defining a square window $H(i,j)$ for each pixel $I(i,j)$ of the input image, containing $n\times n$ pixels and centered on $I(i,j)$. The output value $I'(i,j)$ is the median value of the gray level values of the $n\times n$ pixels of $H(i,j)$. Figure \ref{fig:median_1} illustrates this principle with an example of a $5\times 5$ median filter applied on pixel $I(5,6)$. The output value is the median value of the 25 values of the dark gray window centered on pixel $I(5,6)$.
-Figure \ref{fig:sap_examples} shows an example of a $512\times 512$ pixel image, corrupted by a \textit{salt and pepper} noise and the denoised versions, output respectively by a $3\times 3$, a $5\times 5$ and a 2-iteration $3\times 3$ median filter.
-\begin{figure}
-\centering
- \subfloat[Airplane image, corrupted by salt \& pepper noise of density 0.25]{\label{img:sap_example_ref} \includegraphics[width=5cm]{img/airplane_sap25.png}}\qquad
- \subfloat[Image denoised by a $3\times 3$ median filter]{\label{img:sap_example_med3} \includegraphics[width=5cm]{img/airplane_sap25_med3.png}}\\
- \subfloat[Image denoised by a $5\times 5$ median filter]{\label{img:sap_example_med5} \includegraphics[width=5cm]{img/airplane_sap25_med5.png}}\qquad
- \subfloat[Image denoised by 2 iterations of a $3\times 3$ median filter]{\label{img:sap_example_med3_it2} \includegraphics[width=5cm]{img/airplane_sap25_med3_it2.png}}\\
- \caption{Example of median filtering, applied to salt \& pepper noise reduction.}
- \label{fig:sap_examples}
-\end{figure}
-
- The generic filtering method is given by Algorithm \ref{algo_median_generic}. 
After the data transfer stage of line \ref{algo_median_generic:memcpyH2D} which copies data from CPU memory to GPU texture memory, the actual median computing occurs between lines \ref{algo_median_generic:cptstart} and lines \ref{algo_median_generic:cptend}, before the final transfer which copies data back to CPU memory at line \ref{algo_median_generic:memcpyD2H}. Obviously, on key issue is the selection method that identifies the median value. But, as shown in figure \ref{fig:median_overlap}, since two neighboring pixels share part of the values to be sorted, a second key issue is how to rule redundancy between consecutive positions of the running window $H(i,j)$. -As mentioned earlier, the selection of the median value can be performed by more than one technique, using either histogram-based or sorting methods, each of them having its own benefits and drawbacks as will be discussed further down. - -\subsection{A naive implementation} -As a reference, Listing \ref{lst:medianGeneric} gives a simple, not to say simplistic implementation of a CUDA kernel (\texttt{kernel\_medianR}) achieving generic $n\times n$ histogram-based median filtering. Its runtime has a very low data dependency, but this implementation does not suit very well GPU architecture. Each pixel loads the whole of its $n\times n$ neighborhood meaning that one pixel is loaded multiple times inside one single thread block, and above all, the use of a local vector (histogram[]) considerably downgrades performance, as the compiler automatically stores such vectors in local memory (slow). - -Table \ref{tab:medianHisto1} displays measured runtimes of \texttt{kernel\_medianR} and pixel throughputs for each GPU version and for both CPU and GPU implementations. Usual window sizes of $3\times 3$, $5\times 5$ and $7\times 7$ are shown. Though some specific applications require larger window sizes and dedicated algorithms , such small square window sizes are most widely used in general purpose image processing. 
GPU runtimes have been obtained with a grid of 64-thread blocks. This block size, is a good compromise in this case. - -The first observation to make when analysing results of Table \ref{tab:medianHisto1} is that, on CPU, window size has almost no influence on the effective pixel throughput. -Since inner loops that fill the histogram vector contain very few fetching instructions (from 9 to 49, depending on the window size), it is not surprising to note neglectable runtime compared to the runtime of outer loops that fetch image pixels (from 256k to 16M instructions). -One could be tempted to claim that CPU has no chance to win, which is not so obvious as it highly depends on what kind of algorithm is run and above all, how it is implemented. Despite a maximum effective throughput potential that is almost five times higher, measured GTX280 throughput values sometimes prove slower than CPU values, as shown in Table \ref{tab:medianHisto1}. - -On the GPU's side, we note high dependence on window size due to the redundancy induced by the multiple fetches of each pixel inside each block, becoming higher with the window size as illustrated by Figure \ref{fig:median_overlap}. On C2070 card, thanks to a more efficient caching mechanism, this effect is lesser. On GPUs, dependency over image size is low, due to slightly more efficient data transfers when copying larger data amounts. Thus transferring a 4096$\times$4096 pixel image (32~MBytes) is a bit faster than transferring 64 times a 512$\times$512 pixel image (0.5~MBytes). 
- -%% mettre l'eau à la bouche - -\lstinputlisting[label={lst:medianGeneric},caption=Generic CUDA kernel achieving median filtering]{code/medianGeneric.cu} - -\begin{figure} - \centering - \includegraphics[width=8cm]{img/median_1.png} - \caption{Exemple of 5x5 median filtering} - \label{fig:median_1} -\end{figure} - -\begin{algorithm} - \SetNlSty{textbf}{}{:} - copy data from CPU to GPU texture memory\label{algo_median_generic:memcpyH2D}\; - \ForEach(\tcc*[f]{in parallel}){pixel at position $(x, y)$}{ - Read gray-level values of the n$\times$n neighborhood\label{algo_median_generic:cptstart}\; - Selects the median ($(n^2/2)^{th}$) value among those n$\times$n values\; - Outputs the new gray-level value \label{algo_median_generic:cptend}\; - } -copy data from GPU global memory to CPU memory\label{algo_median_generic:memcpyD2H}\; -\caption{generic n$\times$n median filter} -\label{algo_median_generic} -\end{algorithm} - -\begin{figure} - \centering - \includegraphics[width=5cm]{img/median_overlap.png} - \caption{Illustration of window overlapping in 5x5 median filtering} - \label{fig:median_overlap} -\end{figure} - - -\begin{table}[h] -%\newcolumntype{I}{!{\vrule width 1.5pt}} -\newlength\savedwidth -\newcommand\whline{\noalign{\global\savedwidth - \arrayrulewidth\global\arrayrulewidth 1.5pt} - \hline \noalign{\global\arrayrulewidth - \savedwidth} -} -\renewcommand{\arraystretch}{1.5} -\centering -{\tiny -\begin{tabular}{|c|l||c|c|c|c|c|c|c|c|c|} -\hline -\multicolumn{2}{|l||}{Processor} & \multicolumn{3}{c|}{\textbf{GTX280}} & \multicolumn{3}{c|}{\textbf{C2070}} & \multicolumn{3}{c|}{\textbf{Xeon}} \\ \hline -\multicolumn{2}{|l||}{\shortstack{Performances$\rightarrow$\\sizes (pixels)$\downarrow$}} & \shortstack{t\\(ms)}& \shortstack{output\\(MP/s)}& \shortstack{rate\\\% }&\shortstack{t\\(ms)}& \shortstack{output\\(MP/s)}& \shortstack{rate\\\% }&\shortstack{t\\(ms)}& \shortstack{output\\(MP/s)}& \shortstack{rate\\\% } \\ \whline 
-\multirow{3}{*}{\rotatebox{90}{512$^2$}} &3$\times$3&11.50 &22 &2.2 &7.58 &33 &3.4 & 19.25& 14&-\\ - &5$\times$5&19.10 &14 &1.3 &8.60 &30 &3.0 &18.49 &14 &-\\ - &7$\times$7&31.30 &8 &0.8 &10.60 &24 &2.5 &20.27 &13 &-\\\whline -\multirow{3}{*}{\rotatebox{90}{1024$^2$}}&3$\times$3&44.50 &23 &2.3 &29.60 &34 &3.5 &75.49 &14 &-\\ - &5$\times$5&71.10 &14 &1.4 &33.00 &31 &3.2 &73.88 &14 &-\\ - &7$\times$7&114.50 &9 &0.9 &39.10 &26 &2.7 &77.40 &13 &-\\\whline -\multirow{3}{*}{\rotatebox{90}{2048$^2$}}&3$\times$3&166.00 &24 &2.4 &115.20 &36 &3.6 &296.18&14 &-\\ - &5$\times$5&261.00&16 &1.5 &128.20&32 &3.3 &294.55&14 &-\\ - &7$\times$7&411.90 &10&1.0 &143.30&28 &2.8 &303.48&14 &-\\\whline -\multirow{3}{*}{\rotatebox{90}{4096$^2$}}&3$\times$3&523.80 &31 &3.0 &435.00 &38 &3.9 &1184.16&14 &-\\ - &5$\times$5&654.10&25 &2.4 &460.20&36 &3.7 &1158.26&14 &-\\ - &7$\times$7&951.30 &17&1.7 &509.60&32 &3.3 &1213.55&14 &-\\\whline - -\end{tabular}} -\caption{Performance results of \texttt{kernel medianR}. } -\label{tab:medianHisto1} -\end{table} - -\section{NVidia GPU tuning recipes} -When designing GPU code, besides thinking of the actual data computing process, one must choose the memory type into which to store temporary data. Three type of GPU memory are available: -\begin{enumerate} -\item \textbf{Global memory, the most versatile:}\\Offers the largest storing space and global scope but is slowest (400 cycles latency). \textbf{Texture memory} is physically included into it, but allows access through an efficient 2-D caching mechanism. -\item \textbf{Registers, the fastest:}\\Allows access wihtout latency, but only 63 registers are available per thread (thread scope), with a maximum of 32K per Symetric Multiprocessor (SM). -\item \textbf{Shared memory, a complex compromise:}\\All threads in one block can access 48~KBytes of shared memory, which is faster than global memory (20 cycles latency) but slower than registers. 
-However, bank conflicts can occur if two threads of a warp try to access data stored in one single memory bank. In such cases, the parallel process is re-serialized which may cause significant performance decrease. One easy way to avoid it is to ensure that two consecutive threads in one block always access 32 bit data at two consecutive adresses. -\end{enumerate} - -\noindent As observed earlier, designing a median filter GPU implementation using only global memory is fairly straightforward, but its performance remains quite low even if it is faster than CPU. -To overcome this, the most frequent choice made in efficient implementations found in literature is to use shared memory. Such option implies prefetching data prior to doing the actual computations, a relevant choice, as each pixel of an image belongs to n$\times$n different neighborhoods. Thus, it can be expected that fetching each gray-level value from global memory only once should be more efficient than do it each time it is required. One of the most efficient implementations using shared memory is presented in \cite{5402362}. In the case of the generic kernel of Listing \ref{lst:medianGeneric}, using shared memory without further optimization would not bring valuable speedup because that would just move redundancy from texture to shared memory fetching and would generate bank conflicts. For information, we wrote such a version of the generic median kernel and our measurements showed a speedup of around 3\% (for example: 32ms for 5$\times$5 median on a 1024$^2$ pixel image). - -As for registers, designing a generic median filter that would only use that type of memory seems difficult, due to the above mentioned 63 register-per-thread limitation. -Yet, nothing forbids us to design fixed-size filters, each of them specific to one of the most popular window sizes. It might be worth the effort as dramatic increase in performance could be expected. 
- -Another track to follow in order to improve performance of GPU implementations consists in hiding latencies generated by arithmetic instruction calls and memory accesses. Both can be partially hidden by introducing Instruction-Level Parallelism (ILP) and by increasing the data count output by each thread. Though such techniques may seem to break the NVidia occupancy paradigm, they can lead to dramatically higher data throughput values. -The following sections illustrate these ideas and detail the design of the fastest CUDA median filter known to date. - -\section{A 3$\times$3 median filter: using registers } -Designing a median filter dedicated to the smallest possible square window size is a good challenge to start using registers. -One first issue is that the exclusive use of registers forbids us to implement a naive histogram-based method. In a \textit{8-bit gray level pixel per thread} rule, each histogram requires one 256-element vector to store its values, e.g. four times the maximum register count allowed per thread (63). Considering a 3$\times$3 median filter involves only 9 pixel values per thread, it seem obvious they can be sorted within the 63-register limit. - -\subsection{The simplest way} -In the case of a 3$\times$3 median filter, the simplest solution consists in associating one register to each gray-level value, then sorting those 9 values and selecting the fifth one, e.g. the median value. For such a small amount of data to sort, a simple selection method is well indicated. As shown in Listing \ref{lst:kernelMedian3RegTri9} (\texttt{kernelMedian3RegTri9()}), the constraint of only using registers leads to adopt an unusual manner of coding. However, results are persuasive: runtimes are divided by around 120 on GTX280 and 80 on C2070, while only reduced by a 3.5 factor on CPU. -The diagram of Figure \ref{fig:compMedians1} summarizes these first results. Only C2070 throughputs are shown and compared to CPU results. 
We included the maximum effective pixel throughput in order to see the improvement potential of the different implementations. We also introduced throughputs achieved by \textit{libJacket}, a commercial implementation, as it was the fastest known implementation of 3$\times$3 median filter to date, as illustrated in \cite{chen09}. One of the authors of libJacket kindly posted the CUDA code of its 3$\times$3 median filter, that we inserted into our own coding structure. The algorithm itself is quite similar to ours, but running it in our own environment produced higher throughput values than those published in \cite{chen09}, not due to different hardware capabilities between our GTX280 and the GTX260 used in the paper, but to the way we perform memory transfers and to our register-only method of storing temporary data.
-
-\lstinputlisting[label={lst:kernelMedian3RegTri9},caption= 3$\times$3 median filter kernel using one register per neighborhood pixel and bubble sort]{code/kernMedianRegTri9.cu}
-
-\begin{figure}
- \centering
- \includegraphics[width=11cm]{img/debitPlot1.png}
- \caption{Comparison of pixel throughputs on GPU C2070 and CPU for generic median, in 3$\times$3 median register-only and \textit{libJacket}.}
- \label{fig:compMedians1}
-\end{figure}
-
-\subsection{Further optimization}
-Running the above register-only 3$\times$3 median filter through the NVidia CUDA profiler teaches us that the memory throughput achieved by the kernel remains quite low. To improve this, two methods can be used:
-\begin{itemize}
-\item increasing the number of concurrent threads, which can be achieved by reducing the number of registers used by each thread.
-\item having each thread process more data which can be achieved at thread level by processing and outputting the gray-level value of two pixels or more. 
-\end{itemize} - - -\subsubsection{Reducing register count} -Our current kernel (\texttt{kernelMedian3RegTri9}) uses one register per gray-level value, which amounts to 9 registers for the entire 3$\times$3 window. -This count can be reduced by use of an iterative sorting process called \textit{forgetful selection}, where both \textit{extrema} are eliminated at each sorting stage, until only 3 elements remain. The question is to find out the minimal register count $k_n$ that allows the selection of the median amoung $n^2$ values. The answer can be evaluated considering that, when eliminating the maximum and the minimum values, one has to make sure not to eliminate the global median value. Such a situation is illustrated in Figure \ref{fig:forgetful_selection} for a 3$\times$3 median filter. For better comprehension, the 9 elements of the 3$\times$3 pixel window have been represented in a row. -\begin{figure} - \centering - \includegraphics[width=6cm]{img/forgetful_selection.png} - \caption{Forgetful selection with the minimal element register count. Illustration for 3$\times$3 pixel window represented in a row and supposed sorted.} - \label{fig:forgetful_selection} -\end{figure} -We must remember that, in the fully sorted vector, the median value will have the middle index e.g. $\lfloor n^2/2\rfloor$. -Moreover, assuming that both \textit{extrema} are eliminated from the first $k$ elements and that the global median is one of them would mean that: -\begin{itemize} -\item if the global median was the minimum among the $k$ elements, then at least $k-1$ elements would have a higher index. Considering the above median definition, at least $k-1$ elements should also have a lower index in the entire vector. -\item if the global median was the maximum among the $k$ elements, then at least $k-1$ elements would have a lower index. Considering the above median definition, at least $k-1$ elements should also have a higher index in the entire vector. 
-\end{itemize} -Therefore, the number $k$ of elements that are part of the first selection stage can be defined by the condition -$$n^2-k \leq \lfloor \frac{n^2}{2} \rfloor -1$$ -which leads to: -$$k_n=\lceil \frac{n^2}{2}\rceil+1 $$ -This rule can be applied to the first eliminating stage and remains true with the next ones as each stage suppresses exactly two values. -In our 3$\times$3 pixel window example, the minimum register count becomes $k_9=\lceil 9/2\rceil+1 = 6$. - -The \textit{forgetful selection} method, used in \cite{mcguire2008median} does not imply full sorting of values, but only selecting minimum and maximum values, which, at the price of a few iteration steps ($n^2-k$), reduces arithmetic complexity. -Listing \ref{lst:medianForget1pix3} details this process where forgetful selection is achieved by use of simple 2-value sorting function ($s()$, lines 1 to 5) that swaps input values if necessary. Moreover, whenever possible, in order to increase the Instruction-Level Parallelism, successive calls to $s()$ are done with independant elements as arguments. This is illustrated by the macro definitions of lines 7 to 14. - -\lstinputlisting[label={lst:medianForget1pix3},caption= 3$\times$3 median filter kernel using the minimum register count of 6 to find the median value by forgetful selection method]{code/kernMedianForget1pix3.cu} - -Our such modified kernel provides significantly improved runtimes: a speedup of around 16\% is obtained, and pixel throughput reaches around 1000~MPixel/s on C2070. 
- -\subsubsection{More data output per thread} -In the case of a kernel achieving an effective memory throughput value far from the GPU peak value, and if enough threads are run, another technique may help hiding memory latency and thus leverage performance: make sure that each thread generates multiple pixel outputs.\\ -Attentive readers could remark that it would increase the register count per thread, which can be compensated by dividing thread block size accordingly, thus allowing to keep the same register count per block. -Moreover, it is now possible to take advantage of window overlapping, first illustrated In Figure \ref{fig:median_overlap}, and further detailed in Figure \ref{fig:median3_overlap}. As the selection is first processed on the first 6 gray-level values, e.g. exactly the number of pixels that overlap between the neighborhoods of two adjacent center pixels, 6 texture fetches and one \texttt{minmax6} selection per thread can be saved. There again, some speedup can be expected through our modified kernel source code presented in Listing \ref{lst:medianForget2pix3}. One important difference lies in the way pixel coordinates are computed from thread indexes. As each thread has to process two pixels, the number of threads in each block is divided by 2, while the grid size remains unchanged. Consequently, in our kernel code, each thread whose block-related coordinates are $(tx, ty)$ will be in charge of processing pixels of block-related coordinates $(2tx, ty)$ and $(2tx+1, ty)$; lines 5 and 6 implement this. 
- -\begin{figure} - \centering - \includegraphics[width=4cm]{img/median3_overlap.png} - \caption{Illustration of how window overlapping is used to combine 2 pixel selections in a 3$\times$3 median kernel.} - \label{fig:median3_overlap} -\end{figure} - -\lstinputlisting[label={lst:medianForget2pix3},caption=3$\times$3 median filter kernel processing 2 output pixel values per thread using combined forgetful selection.]{code/kernMedian2pix3.cu} - -Running this $3\times 3$ kernel saves another 10\% runtime, as shown in Figure \ref{fig:compMedians2} and provides the best peak pixel throughput value known so far on C2070: 1155~Mpixel/s which is 86\% the maximum effective throughput. - -\begin{figure} - \centering - \includegraphics[width=11cm]{img/debitPlot2.png} - \caption{Comparison of pixel throughput on GPU C2070 for the different 3$\times$3 median kernels.} - \label{fig:compMedians2} -\end{figure} - -\section{A 5$\times$5 and more median filter } -Considering the maximum register count allowed per thread (63) and trying to push this technique to its limit potentially allows designing up to 9$\times$9 median filters. Such maximum would actually use $k_{81}=\lceil 81/2\rceil+1 = 42$ registers per thread plus 9, used by the compiler to complete arithmetic operations. This leads to a total register count of 51, which would forbid to compute more than one pixel per thread, but also would limit the number of concurrent threads per block. Our measurements show that this technique is still worth using for the 5$\times$5 median. As for larger window sizes, one option could be using shared memory. -The next two sections will first detail the particular case of the 5$\times$5 median through register-only method and eventually a generic kernel for larger window sizes. 
- -\subsection{A register-only 5$\times$5 median filter \label{sec:median5}} -The minimum register count required to apply the forgetful selection method to a 5$\times$5 median filter is $k_{25}=\lceil 25/2\rceil+1 = 14$. Moreover, two adjacent overlapping windows share 20 pixels ($n^2-one\_column$) so that, when processing 2 pixels simultaneously, a count of 6 common selection stages can be carried out from the first selection stage with 14 common values to the processing of the last common value. That allows to limit register count to 14+8=22 per thread. Figure \ref{fig:median5overlap} describes the distribution of overlapping pixels, implemented in Listing \ref{lst:medianForget2pix5}: common selection stages take place from line 25 to line 37, while the remaining separate selection stages occur between lines 45 and 62 after the separation of line 40. -\begin{figure} - \centering - \includegraphics[width=6cm]{img/median5_overlap.png} - \caption{Reducing register count in a 5$\times$5 register-only median kernel outputting 2 pixels simultaneously. The first 6 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.} - \label{fig:median5overlap} -\end{figure} - -\lstinputlisting[label={lst:medianForget2pix5},caption=kernel 5$\times$5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.]{code/kernMedian2pix5.cu} - -Timing results follow the same variations with image size as in previously presented kernels. That is why Table \ref{tab:median5comp} shows only throughput values obtained for C2070 card and 4096$\times$4096 pixel image. 
- -\begin{table}[h] -%\newlength\savedwidth -\newcommand\whline{\noalign{\global\savedwidth - \arrayrulewidth\global\arrayrulewidth 1.5pt} - \hline \noalign{\global\arrayrulewidth - \savedwidth} -} -\centering -{\scriptsize -\begin{tabular}{|l||c|c|c|c|} -\hline -\textbf{Implementation}&\shortstack{\textbf{registers only}\\\textbf{1 pix/thread}}&\shortstack{\textbf{registers only}\\\textbf{2 pix/thread}}&\shortstack{\textbf{libJacket}\\(interpolated)}&\shortstack{\textbf{shared mem}}\\\whline - \shortstack{\textbf{Throughput}\\\textbf{(MP/s)}}&551&738&152&540\\\hline -\end{tabular} -} -\caption{Performance of various 5$\times$5 median kernel implementations, applied on 4096$\times$4096 pixel image with C2070 GPU card.} -\label{tab:median5comp} -\end{table} - -% \subsection{True n$\times$n median filter} -% Shared memory can represent an efficient way to reduce global or texture loads, but it is also a limiting factor for performance. -% On Fermi GPUs (as C2070), a maximum of 48~kB of per block shared memory is avalaible. With 16-bit coded gray levels, that allows to store up to 24576 values, which can be organised as a square of 156$\times$156 pixels maximum. -% A point is that it is not efficient to use the shared memory at its maximum, as it would reduce the number of blocks beeing run in parallel on each SM. -% Another point is that it is not possible to avoid bank conflicts when designing a generic median kernel. -% Thus, the most efficient way to code a generic, large window, median filter, is to do without shared memory but use texture direct fetching. -% Listing \ref{lst:medianForgetGeneric} reproduce such a code, where the most interesting part is between lines XX and YY, where the forgetfull selection has been generalized to an arbitrary window size. -% Performance results summarized in table \ref{tab:medianForgetGeneric} demonstrate that such a method is far from beeing as efficient as small fixed-size implementations. 
- -% \begin{table}[h] -% %\newlength\savedwidth -% \newcommand\whline{\noalign{\global\savedwidth -% \arrayrulewidth\global\arrayrulewidth 1.5pt} -% \hline \noalign{\global\arrayrulewidth -% \savedwidth} -% } -% \centering -% {\scriptsize -% \begin{tabular}{|l||c|c|c|c|} -% \hline -% \shortstack{\textbf{Window size}\\(in pixels)}&\textbf{121}&\textbf{169}&\textbf{225}&\textbf{441}\\\whline -% \shortstack{\textbf{Throughput}\\\textbf{(MP/s)}}& & & & \\\hline -% \end{tabular} -% } -% \caption{Performance of generic median kernel applied to various window sizes on 4096$\times$4096 pixel image.} -% \label{tab:medianForgetGeneric} -% \end{table} - -% \lstinputlisting[label={lst:medianForgetGeneric},caption= generic median kernel by forgetfull selection.]{code/kernMedianForgetGeneric.cu} - -\subsection{Fast approximated n$\times$n median filter } -Large window median filters are less widespread and used in more specific fields, such as digital microscopy: for example in \cite{paper_bio_background}, an estimation of the background gray-level intensity is achieved through a $111\times 111$ median filtering (processed in around 2s for a 4MPixel image). In such cases, a possible technique is to split median selection into two separate 1-D stages: one in the vertical direction and the other in the horizontal direction. Image processing specialists may object that this method does not select the actual median value. This is true but, in the case of large window sizes and \textit{real-life} images, the so selected value is statistically near the actual median value and often represents an acceptable approximation. Such a filter is sometimes called \textit{smoother}. - -As explained earlier in this section, the use of large window median filters rules out register-only implementation, -which suggests to privilege the use of shared memory. The 1-D operation almost completely avoids bank conflicts in shared memory accesses. 
-Furthermore, the above-described forgetful selection method cannot be used anymore, as too many registers would be required.\\Instead, the Torben Morgensen sorting algorithm is used, as its required register count is both low and constant, and avoids the use of a local vector, unlike histogram-based methods. - -Listing \ref{lst:medianSeparable} presents a kernel code that implements the above considerations and achieves a 1-D vertical $n \times 1$ median filter. The shared memory vector is declared as \texttt{extern} (Line 16) as its size is determined at runtime and passed to the kernel call as an argument. Lines 20 to 29 perform data prefetching, including the $2n$-row halo ($n$ at the bottom and $n$ at the top of each block). Then one synchronization barrier is mandatory (line 31) to ensure that all needed data is ready prior to its use by the different threads. -Torben Morgensen sorting takes place between lines 37 and 71 and eventually, the transposed output value is stored in global memory at line 73. Outputting the transposed image in global memory saves time and allows to re-use the same kernel to achieve the second step, e.g 1-D horizontal $n \times 1$ median filtering. The final transpose is done at transfer time, when copying data from GPU to CPU memory, which once more saves time while actually generating the expected image. -It has to be noticed that this smoother, unlike the technique we proposed for fixed-size median filters, can not be considered as a state-of-the-art technique, as for example the one presented in \cite{4287006}. However, it may be considered as a good, easy to use and efficient alternative as confirmed by the results presented in Table \ref{tab:medianSeparable}. Pixel throughput values achieved by our kernel, though not constant with window size, remain very competitive if window size is kept under $120\times 120$ pixels, especially when outputting 2 pixels per thread (in \cite{4287006}, pixel throughput is around 7MP/s). 
-Figure \ref{fig:sap_examples2} shows an example of a $512\times 512$ pixel image, corrupted by a \textit{salt and pepper} noise and the denoised versions, output respectively by a $3\times 3$, a $5\times 5$ and a 2 iteration $3\items3 $ separable smoother. -\begin{figure} -\centering - \subfloat[Airplane image, corrupted with by salt \& pepper noise of density 0.25]{\label{img:sap_example_ref} \includegraphics[width=5cm]{img/airplane_sap25.png}}\qquad - \subfloat[Image denoised by a $3\times 3$ separable smoother]{\label{img:sap_example_sep_med3} \includegraphics[width=5cm]{img/airplane_sap25_sep_med3.png}}\\ - \subfloat[Image denoised by a $5\times 5$ separable smoother]{\label{img:sap_example_sep_med5} \includegraphics[width=5cm]{img/airplane_sap25_sep_med5.png}}\qquad - \subfloat[Image background estimation by a $55\times 55$ separable smoother]{\label{img:sap_example_sep_med3_it2} \includegraphics[width=5cm]{img/airplane_sap25_sep_med111.png}}\\ - \caption{Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.} - \label{fig:sap_examples2} -\end{figure} - - -\begin{table}[h] -%\newlength\savedwidth -\newcommand\whline{\noalign{\global\savedwidth - \arrayrulewidth\global\arrayrulewidth 1.5pt} - \hline \noalign{\global\arrayrulewidth - \savedwidth} -} -\centering -{\scriptsize -\begin{tabular}{|l||c|c|c|c|} -\hline -\shortstack{\textbf{Window edge size}\\(in pixels)}&\textbf{41}&\textbf{81}&\textbf{111}&\textbf{121}\\\whline - \shortstack{\textbf{Throughput}\\\textbf{(MP/s)}}&60 &33 & 22& 20\\\hline -\end{tabular} -} -\caption{Measured performance of one generic pseudo-separable median kernel applied to 4096$\times$4096 pixel image with various window sizes.} -\label{tab:medianSeparable} -\end{table} - -\lstinputlisting[label={lst:medianSeparable},caption= generic pseudo median kernel.]{code/kernMedianSeparable.cu} - -\bibliographystyle{plain} -\bibliography{/home/perrot/Documents/biblio} -\end{document} -- 2.39.5 From 
32eca9a71cb97b720b022d9fa6f8e753368a2243 Mon Sep 17 00:00:00 2001 From: couturie Date: Thu, 9 May 2013 14:41:13 +0200 Subject: [PATCH 11/16] update --- BookGPU/Chapters/chapter3/ch3.aux | 12 +++++----- BookGPU/Chapters/chapter3/ch3.tex | 16 +++++++------ BookGPU/Chapters/chapter4/ch4.tex | 16 ++++++------- BookGPU/Chapters/chapter5/ch5.tex | 18 +++++++-------- BookGPU/Chapters/chapter6/PartieAsync.tex | 8 +++---- BookGPU/Chapters/chapter6/PartieORWL.tex | 1 + BookGPU/Chapters/chapter7/ch7.tex | 28 ++++++++++++----------- BookGPU/Chapters/chapter8/ch8.tex | 10 ++++---- 8 files changed, 57 insertions(+), 52 deletions(-) diff --git a/BookGPU/Chapters/chapter3/ch3.aux b/BookGPU/Chapters/chapter3/ch3.aux index a5fefc8..58d4a58 100644 --- a/BookGPU/Chapters/chapter3/ch3.aux +++ b/BookGPU/Chapters/chapter3/ch3.aux @@ -36,7 +36,7 @@ \newlabel{sub@img:sap_example_med5}{{(c)}{33}} \newlabel{img:sap_example_med3_it2}{{4.1(d)}{33}} \newlabel{sub@img:sap_example_med3_it2}{{(d)}{33}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Exemple of median filtering, applied to salt \& pepper noise reduction.\relax }}{33}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.1}{\ignorespaces Example of median filtering, applied to salt \& pepper noise reduction.\relax }}{33}} \@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted by salt and pepper noise of density 0.25}}}{33}} \@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ median filter}}}{33}} \@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ median filter}}}{33}} @@ -44,6 +44,8 @@ \newlabel{fig:sap_examples}{{4.1}{33}} \newlabel{lst:medianGeneric}{{4.1}{34}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {4.1}Generic CUDA kernel achieving median filtering}{34}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces 
Example of 5x5 median filtering\relax }}{35}} +\newlabel{fig:median_1}{{4.2}{35}} \newlabel{algoMedianGeneric}{{2}{35}} \newlabel{algoMedianGeneric:memcpyH2D}{{1}{35}} \newlabel{algoMedianGeneric:cptstart}{{3}{35}} @@ -51,8 +53,6 @@ \newlabel{algoMedianGeneric:memcpyD2H}{{7}{35}} \@writefile{loa}{\contentsline {algocf}{\numberline {2}{\ignorespaces generic n$\times $n median filter\relax }}{35}} \@writefile{toc}{\contentsline {section}{\numberline {4.3}NVidia GPU tuning recipes}{35}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.2}{\ignorespaces Exemple of 5x5 median filtering\relax }}{35}} -\newlabel{fig:median_1}{{4.2}{35}} \@writefile{lof}{\contentsline {figure}{\numberline {4.3}{\ignorespaces Illustration of window overlapping in 5x5 median filtering\relax }}{36}} \newlabel{fig:median_overlap}{{4.3}{36}} \@writefile{lot}{\contentsline {table}{\numberline {4.1}{\ignorespaces Performance results of \texttt {kernel medianR}. \relax }}{36}} @@ -81,9 +81,9 @@ \newlabel{fig:compMedians2}{{4.8}{44}} \newlabel{sec:median5}{{4.5.1}{44}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{44}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.\relax }}{45}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.9}{\ignorespaces Reducing register count in a 5$\times $5 register-only median kernel outputting 2 pixels simultaneously.}}{45}} \newlabel{fig:median5overlap}{{4.9}{45}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. 
Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.\relax }}{45}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.10}{\ignorespaces First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network.}}{45}} \newlabel{fig:median5overlap}{{4.10}{45}} \newlabel{lst:medianForget2pix5}{{4.5}{46}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {4.5}kernel 5$\times $5 median filter processing 2 output pixel values per thread by a combined forgetfull selection.}{46}} @@ -100,7 +100,7 @@ \newlabel{sub@img:sap_example_sep_med5}{{(c)}{49}} \newlabel{img:sap_example_sep_med3_it2}{{4.11(d)}{49}} \newlabel{sub@img:sap_example_sep_med3_it2}{{(d)}{49}} -\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{49}} +\@writefile{lof}{\contentsline {figure}{\numberline {4.11}{\ignorespaces Example of separable median filtering (smoother), applied to salt \& pepper noise reduction.\relax }}{49}} \@writefile{lof}{\contentsline {subfigure}{\numberline{(a)}{\ignorespaces {Airplane image, corrupted with by salt and pepper noise of density 0.25}}}{49}} \@writefile{lof}{\contentsline {subfigure}{\numberline{(b)}{\ignorespaces {Image denoised by a $3\times 3$ separable smoother}}}{49}} \@writefile{lof}{\contentsline {subfigure}{\numberline{(c)}{\ignorespaces {Image denoised by a $5\times 5$ separable smoother}}}{49}} diff --git a/BookGPU/Chapters/chapter3/ch3.tex b/BookGPU/Chapters/chapter3/ch3.tex index 78396a9..1a991e2 100755 --- a/BookGPU/Chapters/chapter3/ch3.tex +++ b/BookGPU/Chapters/chapter3/ch3.tex @@ -205,7 +205,7 @@ Figure \ref{fig:sap_examples} shows an example of a $512\times 512$ pixel image, \subfigure[Image denoised by a $3\times 3$ median 
filter]{\label{img:sap_example_med3} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_med3.png}}\\ \subfigure[Image denoised by a $5\times 5$ median filter]{\label{img:sap_example_med5} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_med5.png}}\qquad \subfigure[Image denoised by 2 iterations of a $3\times 3$ median filter]{\label{img:sap_example_med3_it2} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_med3_it2.png}}\\ - \caption{Exemple of median filtering, applied to salt \& pepper noise reduction.} + \caption{Example of median filtering, applied to salt \& pepper noise reduction.} \label{fig:sap_examples} \end{figure} @@ -221,16 +221,18 @@ The first observation to make when analysing results of Table \ref{tab:medianHis Since inner loops that fill the histogram vector contain very few fetching instructions (from 9 to 49, depending on the window size), it is not surprising to note their neglectable impact compared to outer loops that fetch image pixels (from 256k to 16M instructions). One could be tempted to claim that CPU has no chance to win, which is not so obvious as it highly depends on what kind of algorithm is run and above all, how it is implemented. To illustrate this, we can notice that, despite a maximum effective throughput potential that is almost five times higher, measured GTX280 throughput values sometimes prove slower than CPU values, as shown in Table \ref{tab:medianHisto1}. + +\lstinputlisting[label={lst:medianGeneric},caption=Generic CUDA kernel achieving median filtering]{Chapters/chapter3/code/medianGeneric.cu} + + On the GPU's side, we note high dependence on window size due to the redundancy induced by the multiple fetches of each pixel inside each block, becoming higher with the window size as illustrated by Figure \ref{fig:median_overlap}. On C2070 card, thanks to a more efficient caching mechanism, this effect is lesser. 
On GPUs, dependency over image size is low, and due to slightly more efficient data transfers when copying larger data amounts, pixel throughputs increases with image size. As an example, transferring a 4096$\times$4096 pixel image (32~MBytes) is a bit faster than transferring 64 times a 512$\times$512 pixel image (0.5~MBytes). %% mettre l'eau à la bouche -\lstinputlisting[label={lst:medianGeneric},caption=Generic CUDA kernel achieving median filtering]{Chapters/chapter3/code/medianGeneric.cu} - \begin{figure} \centering \includegraphics[width=8cm]{Chapters/chapter3/img/median_1.png} - \caption{Exemple of 5x5 median filtering} + \caption{Example of 5x5 median filtering} \label{fig:median_1} \end{figure} @@ -401,14 +403,14 @@ The minimum register count required to apply the forgetful selection method to a \begin{figure} \centering \includegraphics[width=6cm]{Chapters/chapter3/img/median5_overlap4.png} - \caption{Reducing register count in a 5$\times$5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.} + \caption[Reducing register count in a 5$\times$5 register-only median kernel outputting 2 pixels simultaneously.]{Reducing register count in a 5$\times$5 register-only median kernel outputting 2 pixels simultaneously. The first 7 forgetful selection stages are common to both processed center pixels. Only the last 5 selections have to be done separately.} \label{fig:median5overlap} \end{figure} \begin{figure} \centering \includegraphics[width=6cm]{Chapters/chapter3/img/forgetful_selection4.png} - \caption{First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. 
Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.} + \caption[First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network.]{First iteration of the $5\times 5$ selection process, with $k_{25}=14$, which shows how Instruction Level Parallelism is maximized by the use of an incomplete sorting network. Arrows represent the result of the swapping function, with the lowest value at the starting point and the highest value at the end point.} \label{fig:median5overlap} \end{figure} @@ -451,7 +453,7 @@ Figure \ref{fig:sap_examples2} shows an example of a $512\times 512$ pixel image \subfigure[Image denoised by a $3\times 3$ separable smoother]{\label{img:sap_example_sep_med3} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_sep_med3.png}}\\ \subfigure[Image denoised by a $5\times 5$ separable smoother]{\label{img:sap_example_sep_med5} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_sep_med5.png}}\qquad \subfigure[Image background estimation by a $55\times 55$ separable smoother]{\label{img:sap_example_sep_med3_it2} \includegraphics[width=5cm]{Chapters/chapter3/img/airplane_sap25_sep_med111.png}}\\ - \caption{Exemple of separable median filtering (smoother), applied to salt \& pepper noise reduction.} + \caption{Example of separable median filtering (smoother), applied to salt \& pepper noise reduction.} \label{fig:sap_examples2} \end{figure} diff --git a/BookGPU/Chapters/chapter4/ch4.tex b/BookGPU/Chapters/chapter4/ch4.tex index ea473a1..d618753 100644 --- a/BookGPU/Chapters/chapter4/ch4.tex +++ b/BookGPU/Chapters/chapter4/ch4.tex @@ -70,7 +70,7 @@ For more readability, only part of the connecting lines are shown. 
\begin{figure} \centering \includegraphics[width=11cm]{Chapters/chapter4/img/convo1.png} - \caption{Principle of a generic convolution implementation. The center pixel is represented with a black background and the pixels of its neighborhood are denoted $I_{p,q}$ where $(p,q)$ is the relative position of the neighbor pixel. Elements $h_{t,u}$ are the values of the convolution mask.} + \caption[Principle of a generic convolution implementation.]{Principle of a generic convolution implementation. The center pixel is represented with a black background and the pixels of its neighborhood are denoted $I_{p,q}$ where $(p,q)$ is the relative position of the neighbor pixel. Elements $h_{t,u}$ are the values of the convolution mask.} \label{fig:convoPrinciple} \end{figure} \begin{algorithm} @@ -138,7 +138,7 @@ $\mathbf{2048\times 2048}$&1.178&1549 &\bf 3.265&\bf 875 &6.398&529 \\\hline $\mathbf{4096\times 4096}$&4.700&1585 &13.05&533 &25.56&533 \\\hline \end{tabular} } -\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card (fermi architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.} +\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a C2070 card (fermi architecture). 
Data transfer duration are those of Table \ref{tab:memcpy1}.} \label{tab:convoNonSepReg1} \end{table} @@ -155,7 +155,7 @@ $\mathbf{2048\times 2048}$&0.801&1092 &\bf 2.189&\bf 802 &4.278&573 \\\hline $\mathbf{4096\times 4096}$&3.171&1075 &8.720&793 &17.076&569 \\\hline \end{tabular} } -\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.} +\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of one register-only non separable convolution kernel, for small mask sizes of $3\times 3$, $5\times 5$ and $7\times 7$ pixels, on a GTX280 (GT200 architecture). Data transfer duration are those of Table \ref{tab:memcpy1}.} \label{tab:convoNonSepReg3} \end{table} @@ -250,7 +250,7 @@ $\mathbf{2048\times 2048}$&0.495&2071 &\bf 0.987&1666 &1.615&1334 \\\hline $\mathbf{4096\times 4096}$&1.964&2138 &3.926&1711 &6.416&1364 \\\hline \end{tabular} } -\caption{Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.} +\caption[Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card.]{Timings ($time$) and throughput values ($TP$ in Mpix/s) of our generic fixed mask size convolution kernel run on a C2070 card. 
Data transfer durations are those of Table \ref{tab:memcpy1}.} \label{tab:convoGene8x8p} \end{table} @@ -269,7 +269,7 @@ This proves to be quite efficient and more versatile, but it obviously generates \begin{figure} \centering \includegraphics[width=12cm]{Chapters/chapter4/img/convoShMem.png} - \caption{Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$. Threads in both top corners of the top figure are identified either by a circle or by a star symbol. The image tile, loaded into shared memory includes the pixels to be updated by the threads of the block, as well as its 2-pixel wide halo. Here, circle and star symbols in the image tile show which pixels are actually loaded into one shared memory vector by its corresponding thread. } + \caption[Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$.]{Organization of the prefetching stage of data, for a $5\times 5$ mask and a thread block size of $8\times 4$. Threads in both top corners of the top figure are identified either by a circle or by a star symbol. The image tile, loaded into shared memory includes the pixels to be updated by the threads of the block, as well as its 2-pixel wide halo. Here, circle and star symbols in the image tile show which pixels are actually loaded into one shared memory vector by its corresponding thread. } \label{fig:ShMem1} \end{figure} Still, we also implemented this method, in a similar manner as Nvidia did in its SDK sample code. @@ -304,7 +304,7 @@ $\mathbf{2048\times 2048}$&2023 &\bf 1586 &1172 &818&676&554\\\hline $\mathbf{4096\times 4096}$&2090 &1637 &1195 &830&684&561\\\hline \end{tabular} } -\caption{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card. 
Data transfer durations are those of Table \ref{tab:memcpy1}.} +\caption[Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.} \label{tab:convoGeneSh2} \end{table} \lstinputlisting[label={lst:convoGeneSh1},caption=CUDA kernel achieving a generic convolution operation after a preloading of data in shared memory.]{Chapters/chapter4/code/convoGeneSh1.cu} @@ -354,7 +354,7 @@ $\mathbf{1024\times 1024}$&0.306 &0.333 &\bf 0.333 &\bf 0.378&\bf 0.404&\bf 0.46 $\mathbf{2048\times 2048}$&1.094 &1.191 &\bf 1.260 &\bf 1.444&\bf 1.545&\bf 1.722\\\hline $\mathbf{4096\times 4096}$&4.262 &4.631 &\bf 5.000 &\bf 5.676&\bf 6.105&\bf 6.736\\\hline \end{tabular}} -\caption{Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run on a C2070 card. Timings include data copy. Bold values correspond to situations where separable-convolution kernels run faster than non separable ones.} +\caption[Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run on a C2070 card.]{Performances, in milliseconds, of our generic 8 pixels per thread 1-D convolution kernels using shared memory, run on a C2070 card. Timings include data copy. 
Bold values correspond to situations where separable-convolution kernels run faster than non separable ones.} \label{tab:convoSepSh1} \end{table} \begin{table}[h] @@ -369,7 +369,7 @@ $\mathbf{2048\times 2048}$&1598 &1541 &\bf 1503 &\bf 1410&\bf 1364&\bf 1290\\\hl $\mathbf{4096\times 4096}$&1654 &1596 &\bf 1542 &\bf 1452&\bf 1400&\bf 1330\\\hline \end{tabular} } -\caption{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.} +\caption[Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card.]{Throughput values, in MegaPixel per second, of our generic 8 pixels per thread 1-D convolution kernel using shared memory, run on a C2070 card. Data transfer durations are those of Table \ref{tab:memcpy1}.} \label{tab:convoSepSh2} \end{table} diff --git a/BookGPU/Chapters/chapter5/ch5.tex b/BookGPU/Chapters/chapter5/ch5.tex index 1d3d957..ff24f8c 100644 --- a/BookGPU/Chapters/chapter5/ch5.tex +++ b/BookGPU/Chapters/chapter5/ch5.tex @@ -83,7 +83,7 @@ The library is grouped into component classes. Each component should fulfill a s \begin{center} \input{Chapters/chapter5/figures/component_design.tikz} \end{center} -\caption{Schematic representation of the five main components, their type definitions, and member functions. Because components are template based, the argument types cannot be known in beforehand. This concept rule set ensures compliance between components.}\label{ch5:fig:componentdesign} +\caption[Schematic representation of the five main components, their type definitions, and member functions.]{Schematic representation of the five main components, their type definitions, and member functions. Because components are template based, the argument types cannot be known in beforehand. 
This concept rule set ensures compliance between components.}\label{ch5:fig:componentdesign} \end{figure} A component is implemented as a generic C++ class, that normally takes as template arguments the same types that it offers through type definitions: a matrix takes a vector class as template argument, and a vector class takes a working precision value type. The matrix can then access the working precision via the vector class. Components that rely on multiple template arguments can combine these arguments via type binders to maintain code simplicity. We will demonstrate use of such type binders in the model problem examples. A thorough introduction to template-based programming in C++ can be found in \cite{ch5:Vandevoorde2002}. @@ -270,7 +270,7 @@ Solution times for the heat conduction problem is in itself not very interesting {\includegraphics[width=0.5\textwidth]{Chapters/chapter5/figures/AlphaPerformanceGTX590_N16777216_conv.pdf}} } \end{center} -\caption{Single and double precision floating point operations per second for a two dimensional stencil operator on a numerical grid of size $4096^2$. Various stencil sizes are used $\alpha=1,2,3,4$, equivalent to $5$pt, $9$pt, $13$pt, and $17$pt stencils. Test environment 1.}\label{ch5:fig:stencilperformance} +\caption[Single and double precision floating point operations per second for a two dimensional stencil operator on a numerical grid of size $4096^2$.]{Single and double precision floating point operations per second for a two dimensional stencil operator on a numerical grid of size $4096^2$. Various stencil sizes are used $\alpha=1,2,3,4$, equivalent to $5$pt, $9$pt, $13$pt, and $17$pt stencils. 
Test environment 1.}\label{ch5:fig:stencilperformance} \end{figure} @@ -416,7 +416,7 @@ CUDA enabled GPUs are optimized for high memory bandwidth and fast on-chip perfo %\includegraphics[width=0.6\textwidth]{Chapters/chapter5/figures/GPU2GPU.png} \input{Chapters/chapter5/figures/GPU2GPU.tikz} \end{center} -\caption{Message passing between two GPUs involves several memory transfers across lower bandwidth connections. A kernel that aligns the data to be transferred is required, if the data is not already sequentially stored in device memory.}\label{ch5:fig:gpu2gputransfer} +\caption[Message passing between two GPUs involves several memory transfers across lower bandwidth connections.]{Message passing between two GPUs involves several memory transfers across lower bandwidth connections. A kernel that aligns the data to be transferred is required, if the data is not already sequentially stored in device memory.}\label{ch5:fig:gpu2gputransfer} \end{figure} @@ -441,7 +441,7 @@ An alternative to the preconditioning strategy is to have each subdomain query i \begin{center} \input{Chapters/chapter5/figures/dd2d.tikz} \end{center} -\caption{Domain distribution along the horizontal direction of a two dimensional grid into three subdomains. {\large$\bullet$} and {\scriptsize$\textcolor[rgb]{0.5,0.5,0.5}{\blacksquare}$} represent internal grid points and ghost points respectively.}\label{ch5:fig:dd2d} +\caption[Domain distribution along the horizontal direction of a two dimensional grid into three subdomains.]{Domain distribution along the horizontal direction of a two dimensional grid into three subdomains. {\large$\bullet$} and {\scriptsize$\textcolor[rgb]{0.5,0.5,0.5}{\blacksquare}$} represent internal grid points and ghost points respectively.}\label{ch5:fig:dd2d} \end{figure} Topologies are introduced via an extra template argument to the grid component. 
A grid is by default not distributed, because the default template argument is a non-distribution topology implementation. The grid class is extended with a new member function \texttt{update()}, that makes sure that the topology instance updates all ghost points. The library contains two topology strategies, based on one dimensional and two dimensional distributions of the grid. The number of grid subdomains will be equal to the number of MPI processes executing the program. @@ -481,7 +481,7 @@ One method that introduces concurrency to the solution of evolution problems is \begin{center} \input{Chapters/chapter5/figures/ParallelInTime.tikz} \end{center} -\caption{Time domain decomposition. A compute node is assigned to each individual time sub-main to compute the initial value problem. Consistency at the time sub-domain boundaries is obtained with the application of a computationally cheap integrator in conjunction with the parareal iterative predictor-corrector algorithm, eq. \eqref{ch5:eq:PARAREAL}. }\label{ch5:fig:ParallelInTime} +\caption[Time domain decomposition.]{Time domain decomposition. A compute node is assigned to each individual time sub-main to compute the initial value problem. Consistency at the time sub-domain boundaries is obtained with the application of a computationally cheap integrator in conjunction with the parareal iterative predictor-corrector algorithm, eq. \eqref{ch5:eq:PARAREAL}. }\label{ch5:fig:ParallelInTime} \end{figure} \label{ch5:parareal} @@ -523,7 +523,7 @@ The number of GPUs used for parallelization depends on the number of MPI process \begin{center} \input{Chapters/chapter5/figures/FullyDistributed.tikz} \end{center} -\caption{\label{ch5:fig:FullyDistributedCores} Schematic visualization of a fully distributed work scheduling model for the parareal algorithm as proposed by Aubanel. Each GPU is responsible for computing the solution on a single time sub-domain. 
The computation is initiated at rank $0$ and cascades trough to rank $N$ where the final solution can be fetched. } +\caption[Schematic visualization of a fully distributed work scheduling model for the parareal algorithm as proposed by Aubanel.]{\label{ch5:fig:FullyDistributedCores} Schematic visualization of a fully distributed work scheduling model for the parareal algorithm as proposed by Aubanel. Each GPU is responsible for computing the solution on a single time sub-domain. The computation is initiated at rank $0$ and cascades through to rank $N$ where the final solution can be fetched. } \end{figure} @@ -555,7 +555,7 @@ If we in addition assume that the time spend on coarse propagation is negligible \label{ch5:fig:pararealRvsGPUs:b} } \end{center} - \caption{Parareal convergence properties as a function of $R$ and number GPUs used. The error is measured as the relative difference between the purely sequential solution and the parareal solution.}\label{ch5:fig:pararealRvsGPUs} + \caption[Parareal convergence properties as a function of $R$ and number GPUs used.]{Parareal convergence properties as a function of $R$ and number GPUs used. The error is measured as the relative difference between the purely sequential solution and the parareal solution.}\label{ch5:fig:pararealRvsGPUs} \end{figure} \begin{figure}[!htb] @@ -571,7 +571,7 @@ If we in addition assume that the time spend on coarse propagation is negligible \label{ch5:fig:pararealGPUs:b} } \end{center} - \caption{Parareal performance properties as a function of $R$ and number GPUs used. Notice how the obtained performance depends greatly on the choice of $R$ as a function of the number of GPUs. Test environment 2. }\label{ch5:fig:pararealGPUs} + \caption[Parareal performance properties as a function of $R$ and number GPUs used.]{Parareal performance properties as a function of $R$ and number GPUs used. 
Notice how the obtained performance depends greatly on the choice of $R$ as a function of the number of GPUs. Test environment 2. }\label{ch5:fig:pararealGPUs} \end{figure} %TODO: Do we make this into a subsubsection: @@ -633,4 +633,4 @@ from the Danish Research Council for Technology and Production Sciences. A speci \putbib[Chapters/chapter5/biblio5] % Reset lst label and caption -\lstset{label=,caption=} +%\lstset{label=,caption=} diff --git a/BookGPU/Chapters/chapter6/PartieAsync.tex b/BookGPU/Chapters/chapter6/PartieAsync.tex index 9c8de06..0b20926 100644 --- a/BookGPU/Chapters/chapter6/PartieAsync.tex +++ b/BookGPU/Chapters/chapter6/PartieAsync.tex @@ -754,7 +754,7 @@ iterations done (\texttt{nbSyncIter}). %\begin{algorithm}[H] % \caption{Computing function in the final asynchronous scheme.}% without GPU computing.} % \label{algo:ch6p2AsyncSyncComp} -%\pagebreak +\pagebreak \begin{Listing}{algo:ch6p2AsyncSyncComp}{Computing function in the final asynchronous scheme}% without GPU computing.} // Variables declarations and initialization ... @@ -915,7 +915,7 @@ different according to the application. %\begin{algorithm}[H] % \caption{Initialization of the main process of complete overlap with asynchronism.} % \label{algo:ch6p2FullOverAsyncMain} -\pagebreak +%\pagebreak \begin{Listing}{algo:ch6p2FullOverAsyncMain}{Initialization of the main process of complete overlap with asynchronism} // Variables declarations and initialization ... @@ -977,7 +977,7 @@ MPI_Finalize(); %\begin{algorithm}[H] % \caption{Computing function in the final asynchronous scheme with CPU/GPU overlap.} % \label{algo:ch6p2FullOverAsyncComp1} -\pagebreak +%\pagebreak \begin{Listing}{algo:ch6p2FullOverAsyncComp1}{Computing function in the final asynchronous scheme with CPU/GPU overlap} // Variables declarations and initialization ... 
@@ -1037,7 +1037,7 @@ while(!Finished){ %\begin{algorithm}[H] % \caption{Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap.} % \label{algo:ch6p2FullOverAsyncComp2} -\pagebreak +%\pagebreak \begin{Listing}{algo:ch6p2FullOverAsyncComp2}{Auxiliary computing function in the final asynchronous scheme with CPU/GPU overlap} // Variables declarations and initialization ... auxInput ... // Local array for input data diff --git a/BookGPU/Chapters/chapter6/PartieORWL.tex b/BookGPU/Chapters/chapter6/PartieORWL.tex index 2efb115..ace4698 100644 --- a/BookGPU/Chapters/chapter6/PartieORWL.tex +++ b/BookGPU/Chapters/chapter6/PartieORWL.tex @@ -299,6 +299,7 @@ the resources. %\begin{algorithm}[H] % \caption{Dynamic initialization of access mode and priorities.} % \label{algo:ch6p3ORWLinit} +\pagebreak \begin{Listing}{algo:ch6p3ORWLinit}{Dynamic initialization of access mode and priorities} /* One operation with priority 0 (highest) consists */ /* in copying from current to the GPU and run MM, there. */ diff --git a/BookGPU/Chapters/chapter7/ch7.tex b/BookGPU/Chapters/chapter7/ch7.tex index 45fc453..6ca2080 100644 --- a/BookGPU/Chapters/chapter7/ch7.tex +++ b/BookGPU/Chapters/chapter7/ch7.tex @@ -327,7 +327,7 @@ Similar results were reported for the first time in the context of high-order Bo % MainLaplace2D_ex035_nonlinearLaplace.m \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/SFwaves_snapshots_clustered-eps-converted-to.pdf} } -\caption{Numerical experiments to assess stability properties of numerical wave model. In three cases, computed snapshots are taken of the wave elevation over one wave period of time. In a) the grid distribution of nodes in a one-parameter mapping for the grid is illustrated. 
Results from changes in wave elevation are illustrated for b) a mildly nonlinear standing wave on a highly clustered grid, c) regular stream function wave of medium steepness in shallow water $(kh,H/L)=(0.5,0.0292)$ on a uniform grid ($N_x=80$) and d) for a nonuniform grid with a minimal grid spacing 20 times smaller(!). In every case the step size remains fixed at $\Delta t = T/160$ s corresponding to a Courant number $C_r=c\tfrac{\Delta t}{\Delta x}=0.5$ for the uniform grid. A 6'$th$ order scheme and explicit EKR4 time-stepping is used in each test case.} +\caption[Numerical experiments to assess stability properties of numerical wave model.]{Numerical experiments to assess stability properties of numerical wave model. In three cases, computed snapshots are taken of the wave elevation over one wave period of time. In a) the grid distribution of nodes in a one-parameter mapping for the grid is illustrated. Results from changes in wave elevation are illustrated for b) a mildly nonlinear standing wave on a highly clustered grid, c) regular stream function wave of medium steepness in shallow water $(kh,H/L)=(0.5,0.0292)$ on a uniform grid ($N_x=80$) and d) for a nonuniform grid with a minimal grid spacing 20 times smaller(!). In every case the step size remains fixed at $\Delta t = T/160$ s corresponding to a Courant number $C_r=c\tfrac{\Delta t}{\Delta x}=0.5$ for the uniform grid. A 6'$th$ order scheme and explicit EKR4 time-stepping is used in each test case.} \label{ch7:numexp} \end{figure} %\newpage @@ -384,7 +384,7 @@ The profiles can be reversed by a change of coordinate, i.e. 
$\Gamma(1-x)$, and \includegraphics[width=0.98\textwidth]{Chapters/chapter7/figures/nonlinearwavespenalty-eps-converted-to.pdf} % Nx = 540, 6th order, vertical clustering, Nz=6; } -\caption{Snapshots at intervals $T/8$ over one wave period in time of computed a) small-amplitude $(kh,kH)=(0.63,0.005)$ and b) finite-amplitude $(kh,kH)=(1,0.41)$ stream function waves elevations having reached a steady state after transient startup. Combined wave generation and absorption zones in the western relaxation zone of both a) and b). In b) an absorption zone is positioned next to the eastern boundary and causes minor visible reflections. } +\caption[Snapshots at intervals $T/8$ over one wave period in time.]{Snapshots at intervals $T/8$ over one wave period in time of computed a) small-amplitude $(kh,kH)=(0.63,0.005)$ and b) finite-amplitude $(kh,kH)=(1,0.41)$ stream function waves elevations having reached a steady state after transient startup. Combined wave generation and absorption zones in the western relaxation zone of both a) and b). In b) an absorption zone is positioned next to the eastern boundary and causes minor visible reflections. } \label{ch7:figstandwave} \end{figure} @@ -424,9 +424,6 @@ Numerical modelling of large ocean areas to account for nonlinear wave-wave inte The ratio between necessary data transfers and computational work for the proposed numerical model for free surface water waves is high enough to expect reasonable latency hiding. The data domain decomposition method consists of a logically structured division of the computational domain into multiple subdomains. Each of these subdomains are connected via fictitious ghost layers at the artificial boundaries of width corresponding to the half-width of the finite difference stencils employed. This results in a favourable volume-to-boundary ratio as the problem size increases, diminishing communication overhead for message passing. 
Information between subdomains are exchanged through ghost layers at every step of the iterative PDC method, in connection with the matrix-vector evaluation for the $\sigma$-transformed Laplace problem, and before relaxation steps in the multigrid method. A single global synchronization point occur at most once each iteration, if convergence is monitored, where a global reduction step (inner product) between all processor nodes takes place. The main advantage of this decomposition strategy is, that the decomposition into multiple subdomains is straightforward. However, it comes with the cost of extra data transfers to update the set of fictitious ghost layers. -The parallel domain decomposition solver has been validated against the sequential solvers with respect to algorithmic efficiency to establish that the code produce correct results. An analysis of the numerical efficiency have also been carried out on different GPU systems to identify comparative behaviors as both the problems sizes and number of compute nodes vary. For example, performance scalings on Test environment 1 and Test environment 2 are presented in figure \ref{ch7:fig:multigpuperformance}. The figure confirms that there is only a limited benefit from using multiple GPUs for small problem sizes, since the computational intensity is simply too low to efficiently hide the latency of message passing. A substantial speedup is achieved compared to the single GPU version, while being able to solve even larger systems. -With the linear scaling of memory requirements and improved computational speed, the methodology based on multiple GPUs makes it possible to simulate water waves in very large numerical wave tanks with improved performance. 
- \begin{figure}[!htb] \setlength\figureheight{0.30\textwidth} \setlength\figurewidth{0.33\textwidth} @@ -438,9 +435,14 @@ With the linear scaling of memory requirements and improved computational speed, {\scriptsize\input{Chapters/chapter7/figures/TeslaK20SpeedupGPUvsCPU3D.tikz}} } \end{center} - \caption{Performance timings per PDC iteration as a function of increasing problem size $N$, for single, mixed and double precision arithmetics. Three dimensional nonlinear waves, using $6^{th}$ order finite difference approximations, preconditioned with one multigrid V-cycle and one pre- and post- Red-black Gauss-Seidel smoothing. Speedup compared to fastest known serial implementation. Using Test environment 3. CPU timings represent starting point for our investigations and has been obtained using Fortran 90 code and is based on a single-core run on a Intel Core i7, 2.80GHz processor.}\label{ch7:fig:perftimings} + \caption[Performance timings per PDC iteration as a function of increasing problem size $N$, for single, mixed and double precision arithmetics.]{Performance timings per PDC iteration as a function of increasing problem size $N$, for single, mixed and double precision arithmetics. Three dimensional nonlinear waves, using $6^{th}$ order finite difference approximations, preconditioned with one multigrid V-cycle and one pre- and post- Red-black Gauss-Seidel smoothing. Speedup compared to fastest known serial implementation. Using Test environment 3. CPU timings represent starting point for our investigations and has been obtained using Fortran 90 code and is based on a single-core run on a Intel Core i7, 2.80GHz processor.}\label{ch7:fig:perftimings} \end{figure} +The parallel domain decomposition solver has been validated against the sequential solvers with respect to algorithmic efficiency to establish that the code produce correct results. 
An analysis of the numerical efficiency has also been carried out on different GPU systems to identify comparative behaviors as both the problem sizes and number of compute nodes vary. For example, performance scalings on Test environment 1 and Test environment 2 are presented in figure \ref{ch7:fig:multigpuperformance}. The figure confirms that there is only a limited benefit from using multiple GPUs for small problem sizes, since the computational intensity is simply too low to efficiently hide the latency of message passing. A substantial speedup is achieved compared to the single GPU version, while being able to solve even larger systems. +With the linear scaling of memory requirements and improved computational speed, the methodology based on multiple GPUs makes it possible to simulate water waves in very large numerical wave tanks with improved performance. + + + \begin{figure}[!htb] @@ -454,7 +456,7 @@ With the linear scaling of memory requirements and improved computational speed, {\scriptsize\input{Chapters/chapter7/figures/TeslaM2050MultiGPUScaling3D.tikz}} } \end{center} - \caption{Domain decomposition performance on multi-GPU systems. Performance timings per PDC iteration as a function of increasing problem sizes using single precision. Same setup as in figure \ref{ch7:fig:perftimings}.} + \caption[Domain decomposition performance on multi-GPU systems.]{Domain decomposition performance on multi-GPU systems. Performance timings per PDC iteration as a function of increasing problem sizes using single precision. Same setup as in figure \ref{ch7:fig:perftimings}.} \label{ch7:fig:multigpuperformance} \end{figure} @@ -710,7 +712,7 @@ $N_z\in[6,12]$. 
Sixth order scheme.} \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/kinematicsW_Nx30-HL90-p6_Nonlinear-eps-converted-to.pdf} } \end{center} -\caption{Assessment of kinematic error is presented in terms of the depth-averaged error determined by \eqref{ch7:errkin} for a) scalar velocity potential and b) vertical velocity for a small-amplitude (linear) wave, and c) scalar velocity potential and d) vertical velocity for a finite-amplitude (nonlinear) wave with wave height $H/L=90\%(H/L)_\textrm{max}$. +\caption[Assessment of kinematic error is presented in terms of the depth-averaged error.]{Assessment of kinematic error is presented in terms of the depth-averaged error determined by \eqref{ch7:errkin} for a) scalar velocity potential and b) vertical velocity for a small-amplitude (linear) wave, and c) scalar velocity potential and d) vertical velocity for a finite-amplitude (nonlinear) wave with wave height $H/L=90\%(H/L)_\textrm{max}$. $N_z\in[6,12]$. Sixth order scheme. Clustered vertical grid. } \label{ch7:figlinear2} \end{figure} @@ -738,7 +740,7 @@ Previously reported performance results for the wave model can be taken a step f \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/PrecisionDOUBLE-eps-converted-to.pdf} } \end{center} -\caption{Comparison between convergence histories for single and double precision computations using a PDC method for the solution of the transformed Laplace problem. Very steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0903)$. Discretizaiton based on $(N_x,N_z)=(15,9)$ with 6'$th$ order stencils.} +\caption[Comparison between convergence histories for single and double precision computations using a PDC method for the solution of the transformed Laplace problem.]{Comparison between convergence histories for single and double precision computations using a PDC method for the solution of the transformed Laplace problem. 
Very steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0903)$. Discretization based on $(N_x,N_z)=(15,9)$ with 6'$th$ order stencils.} \label{ch7:convhist} \end{figure} @@ -772,7 +774,7 @@ Results from numerical experiments are presented in figure \ref{ch7:filtering} a \includegraphics[width=0.45\textwidth]{Chapters/chapter7/figures/ComparisonDCWithFiltering-eps-converted-to.pdf} } \end{center} -\caption{Comparison between accuracy as a function of time for double precision calculations vs. single precision with and without filtering. The double precision result are unfiltered in each comparison and shows to be less sensitive to roundoff-errors. Medium steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0502)$. Discretization is based on $(N_x,N_z)=(30,6)$, A courant number of $C_r=0.5$ and 6'$th$ order stencils.} +\caption[Comparison between accuracy as a function of time for double precision calculations vs. single precision with and without filtering.]{Comparison between accuracy as a function of time for double precision calculations vs. single precision with and without filtering. The double precision results are unfiltered in each comparison and show to be less sensitive to roundoff-errors. Medium steep nonlinear stream function wave in intermediate water $(kh,H/L)=(1,0.0502)$. Discretization is based on $(N_x,N_z)=(30,6)$, a Courant number of $C_r=0.5$ and 6'$th$ order stencils.} \label{ch7:filtering} \end{figure} @@ -798,7 +800,7 @@ A harmonic analysis of the wave spectrum at the shoal center line is computed an {\scriptsize\input{Chapters/chapter7/figures/WhalinWaveHarmonics_T3_single.tikz}} } % \end{center} - \caption{Harmonic analysis for the experiment of Whalin for $T=1,2,3\,s$ respectively. Measured experimental and computed results (single precision) are in good agreement. 
Test environment 1.}\label{ch7:whalinresults} + \caption[Harmonic analysis for the experiment of Whalin for $T=1,2,3\,s$ respectively.]{Harmonic analysis for the experiment of Whalin for $T=1,2,3\,s$ respectively. Measured experimental and computed results (single precision) are in good agreement. Test environment 1.}\label{ch7:whalinresults} \end{figure} \subsection{Acceleration via parallelism in time using 'Parareal'}\label{ch7:parareal}\index{parareal} @@ -828,7 +830,7 @@ Ideally, the ratio $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}$ is small an \includegraphics[width=0.5\textwidth]{Chapters/chapter7/figures/PararealSpeedupGTX590_conv.pdf} } \end{center} - \caption{(a) Parareal absolute timings for an increasingly number of water waves traveling one wave length, each wave resolution is ($33\times 9$). (b) Parareal speedup for two to sixteen compute nodes compared to the purely sequential single GPU solver. Notice how insensitive the parareal scheme is to the size of the problem solved. Test environment 2.}\label{ch7:fig:DDPA_SPEEDUP} + \caption[Parareal absolute timings and parareal speedup.]{(a) Parareal absolute timings for an increasingly number of water waves traveling one wave length, each wave resolution is ($33\times 9$). (b) Parareal speedup for two to sixteen compute nodes compared to the purely sequential single GPU solver. Notice how insensitive the parareal scheme is to the size of the problem solved. Test environment 2.}\label{ch7:fig:DDPA_SPEEDUP} \end{figure} % @@ -847,7 +849,7 @@ Performance results for the Whalin test case have also been reported in figure \ {\small\input{Chapters/chapter7/figures/WhalinPararealEfficiency.tikz}} } % \end{center} - \caption{Parallel time integration using the parareal method. $R$ is the ratio between the complexity of the fine and coarse propagators. 
Test environment 2.}\label{ch7:fig:whalinparareal} + \caption[Parallel time integration using the parareal method.]{Parallel time integration using the parareal method. $R$ is the ratio between the complexity of the fine and coarse propagators. Test environment 2.}\label{ch7:fig:whalinparareal} \end{figure} % Comparison with DD diff --git a/BookGPU/Chapters/chapter8/ch8.tex b/BookGPU/Chapters/chapter8/ch8.tex index a2529b1..24a71bc 100644 --- a/BookGPU/Chapters/chapter8/ch8.tex +++ b/BookGPU/Chapters/chapter8/ch8.tex @@ -499,7 +499,7 @@ To reduce the computation time cost of the term $\min\limits_{(i,j)\in \jmath^2, MM & $m \times (m-1)$ & $m \times (m-1)$ \\ \hline \end{tabular} - \caption{The different data structures of the $LB$ algorithm and their associated complexities in memory size and numbers of accesses. The parameters $n$, $m$ and $n'$ designate respectively the total number of jobs, the total number of machines and the number of remaining jobs to be scheduled for the sub-problems the lower bound is being computed.} + \caption[The different data structures of the $LB$ algorithm and their associated complexities in memory size and numbers of accesses.]{The different data structures of the $LB$ algorithm and their associated complexities in memory size and numbers of accesses. The parameters $n$, $m$ and $n'$ designate respectively the total number of jobs, the total number of machines and the number of remaining jobs to be scheduled for the sub-problems the lower bound is being computed.} \label{ch8:tabMemComplex} \end{table} @@ -534,7 +534,7 @@ $50 \times 20$ & 9.500 (9.5KB) & 9.500 (19KB) & 1.000 (1KB) & 20 (0.04KB) & 380 $20 \times 20$ & 3.800 (3.8KB) & 3.800 (7.6KB) & 400 (0.4KB) & 20 (0.04KB) & 380 (0.76KB) \\ \hline \end{tabular} -\caption{The sizes of each data structure for the different experimented problem instances. 
The sizes are given in number of elements and in bytes (between brackets).} +\caption[The sizes of each data structure for the different experimented problem instances.]{The sizes of each data structure for the different experimented problem instances. The sizes are given in number of elements and in bytes (between brackets).} \label{ch8:tabMemSizes} \end{table} @@ -765,7 +765,7 @@ $20 \times $20 & 41.94 & \textbf{60.10} & 48.28 & 39.86 & 39.61 & 38.93 & 37.79 % \hline % \hline \end{tabular} - \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. $PTM$ is placed in shared memory and all others are placed in global memory.} + \caption[Speedup for different FSP instances and pool sizes obtained with data access optimization.]{Speedup for different FSP instances and pool sizes obtained with data access optimization. $PTM$ is placed in shared memory and all others are placed in global memory.} \label{ch8:PTM-on-SM} \end{table} @@ -795,7 +795,7 @@ $20 \times $20 & 49.00 & \textbf{60.25} & 55.50 & 45.88 & 44.47 & 43.11 & 42.82 % \hline % \hline \end{tabular} - \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. + \caption[Speedup for different FSP instances and pool sizes obtained with data access optimization.]{Speedup for different FSP instances and pool sizes obtained with data access optimization. $JM$ is placed in shared memory and all others are placed in global memory.} \label{ch8:JM-on-SM} \end{table} @@ -826,7 +826,7 @@ $20 \times $20 & 53.64 & \textbf{61.47} & 59.55 & 51.39 & 47.40 & 46.53 & 46.37\ % \hline % \hline \end{tabular} - \caption{Speedup for different FSP instances and pool sizes obtained with data access optimization. 
$PTM$ and $JM$ are placed together in shared memory and all others are placed in global memory.} + \caption[Speedup for different FSP instances and pool sizes obtained with data access optimization.]{Speedup for different FSP instances and pool sizes obtained with data access optimization. $PTM$ and $JM$ are placed together in shared memory and all others are placed in global memory.} \label{ch8:JM-PTM-on-SM} \end{table} -- 2.39.5 From ac699bb490942cea02be7b088a0cfa4e0dcdd105 Mon Sep 17 00:00:00 2001 From: couturie Date: Thu, 9 May 2013 20:29:51 +0200 Subject: [PATCH 12/16] new --- BookGPU/BookGPU.tex | 2 +- BookGPU/Chapters/chapter11/ch11.tex | 12 ++++++++---- BookGPU/Chapters/chapter12/ch12.tex | 8 ++++---- BookGPU/Chapters/chapter14/ch14.tex | 3 ++- BookGPU/Chapters/chapter15/ch15.tex | 2 +- BookGPU/Chapters/chapter17/ch17.tex | 2 +- BookGPU/Chapters/chapter7/ch7.tex | 6 +++--- 7 files changed, 20 insertions(+), 15 deletions(-) diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index fae7a1b..7212cf1 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -177,9 +177,9 @@ %\include{frontmatter/Foreword} \include{frontmatter/preface} +\tableofcontents \listoffigures \listoftables -\tableofcontents \mainmatter diff --git a/BookGPU/Chapters/chapter11/ch11.tex b/BookGPU/Chapters/chapter11/ch11.tex index 5f81d1f..a9667e7 100644 --- a/BookGPU/Chapters/chapter11/ch11.tex +++ b/BookGPU/Chapters/chapter11/ch11.tex @@ -104,9 +104,13 @@ $$ It is almost straightforward to parallelise this scheme for GPUs, by processing each subinterval $[x_i,x_{i+1}]$ independently in a separate thread. However, it is not known in advance whether an extra knot $t_i$ needs to be inserted or not, and therefore calculation of the position of the knot in the output sequence of knots ${t_i}$ is problematic for parallel implementation (for a sequential algorithm no such issue arises). 
To avoid serialisation, we decided to insert an additional knot in every interval $[x_i,x_{i+1}]$, but set $t_i=x_i$ when the extra knot is not actually needed. This way we know in advance the position of the output knots and the length of this sequence is $2(n-1)$, and therefore all calculations can now be performed independently. The price we pay is that some of the spline knots can coincide. However, this does not affect spline evaluation, as one of the coinciding knots is simply disregarded, and the spline coefficients are replicated (so for a double knot $t_i=t_{i+1}$, we have $\alpha_i=\alpha_{i+1}$, $\beta_i=\beta_{i+1}$, $\gamma_i=\gamma_{i+1}$). Our implementation is presented in Figures \ref{ch11:algcoef}-\ref{ch11:algcoef1}. +\lstinputlisting[label=ch11:algcoef1,caption=Calculation of monotone spline knots and coefficients.]{Chapters/chapter11/code2.cu} + + At the spline evaluation stage we need to compute $s(z_k)$ for a sequence of query values ${z_k}, k=1,\ldots,K$. For each $z_k$ we locate the interval $[t_i,t_{i+1}]$ containing $z_k$, using bisection algorithm presented in Figure \ref{ch11:algeval}, and then apply the appropriate coefficients of the quadratic function. This is also done in parallel. The bisection algorithm could be implemented using texture memory (to cache the array \texttt{z}), but this is not shown in Figure \ref{ch11:algeval}. +\pagebreak \lstinputlisting[label=ch11:algcoef,caption=Implementation of the kernel for calculating spline knots and coefficients. 
Function fmax is used to avoid division by zero for data with coinciding abscissae.]{Chapters/chapter11/code1.cu} @@ -167,7 +171,6 @@ The bisection algorithm could be implemented using texture memory (to cache the %% \end{figure} -\lstinputlisting[label=ch11:algcoef1,caption=Calculation of monotone spline knots and coefficients.]{Chapters/chapter11/code2.cu} %% \begin{figure}[!hp] %% \renewcommand{\baselinestretch}{1} @@ -409,9 +412,8 @@ with $\hat y(k,l)$ being the unrestricted maximum likelihood estimator of $y_k\l %% %\renewcommand{\baselinestretch}{1} \begin{table}[!h] \begin{center} -\caption{The average CPU time (sec) of the serial PAVA, MLS and parallel MLS algorithms. } \label{ch11:table1} \begin{tabular}{|r|r|r|r|} - +\hline Data & PAVA & MLS & GPU MLS \\ \hline monotone increasing $f$ & & & \\ @@ -430,9 +432,11 @@ $n=10^6$ &0.2&0.1& 38\\ $n=10 \times 10^6$ &1.9& 1.9& 3500 \\ $n=20 \times 10^6$ &3.5& 4.0&-- \\ $n=50 \times 10^6$ &11& 11& -- \\ - +\hline \end{tabular} \end{center} +\caption{The average CPU time (sec) of the serial PAVA, MLS and parallel MLS algorithms. } +\label{ch11:table1} \end{table} %% %\renewcommand{\baselinestretch}{2} diff --git a/BookGPU/Chapters/chapter12/ch12.tex b/BookGPU/Chapters/chapter12/ch12.tex index 8b869b3..7f97864 100755 --- a/BookGPU/Chapters/chapter12/ch12.tex +++ b/BookGPU/Chapters/chapter12/ch12.tex @@ -9,7 +9,7 @@ \chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte, France} \chapterauthor{Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France} -\chapter{Solving sparse linear systems with GMRES and CG methods on GPU clusters} +\chapter[Solving linear systems with GMRES and CG methods on GPU clusters]{Solving sparse linear systems with GMRES and CG methods on GPU clusters} \label{ch12} %%--------------------------%% @@ -688,13 +688,13 @@ from the Davis collection. 
Then, it puts all these copies on the main diagonal o diagonal are filled with sub-copies (left-copy and right-copy in Figure~\ref{ch12:fig:06}) of the same initial matrix. -\begin{figure} +\begin{figure}[htbp] \centerline{\includegraphics[scale=0.30]{Chapters/chapter12/figures/generation}} \caption{Parallel generation of a large sparse matrix by four computing nodes.} \label{ch12:fig:06} \end{figure} -\begin{table}[!h] +\begin{table}[htbp] \centering \begin{tabular}{|c|c|c|c|} \hline @@ -729,7 +729,7 @@ initial matrix. \label{ch12:tab:04} \end{table} -\begin{table} +\begin{table}[htbp] \begin{center} \begin{tabular}{|c|c|c|c|c|c|c|} \hline diff --git a/BookGPU/Chapters/chapter14/ch14.tex b/BookGPU/Chapters/chapter14/ch14.tex index d50db31..bf3cd66 100755 --- a/BookGPU/Chapters/chapter14/ch14.tex +++ b/BookGPU/Chapters/chapter14/ch14.tex @@ -1,6 +1,7 @@ \chapterauthor{Alan Gray and Kevin Stratford}{EPCC, The University of Edinburgh} -\chapter{Ludwig: multiple GPUs for a complex fluid lattice Boltzmann +\chapter[Ludwig: multiple GPUs for a fluid lattice Boltzmann +application]{Ludwig: multiple GPUs for a complex fluid lattice Boltzmann application} %\putbib[biblio] diff --git a/BookGPU/Chapters/chapter15/ch15.tex b/BookGPU/Chapters/chapter15/ch15.tex index cf464c7..c0de11e 100644 --- a/BookGPU/Chapters/chapter15/ch15.tex +++ b/BookGPU/Chapters/chapter15/ch15.tex @@ -7,7 +7,7 @@ The Queen's University of Belfast} %\newcommand{\fixme}[1]{{\bf #1}} -\chapter[Numerical validation and performance optimization on GPUs in atomic physics]{Numerical validation and performance optimization on GPUs of an application in atomic physics} +\chapter[Numerical validation on GPUs in atomic physics]{Numerical validation and performance optimization on GPUs of an application in atomic physics} \label{chapter15} \section{Introduction}\label{ch15:intro} diff --git a/BookGPU/Chapters/chapter17/ch17.tex b/BookGPU/Chapters/chapter17/ch17.tex index 4aa06b8..77cdd11 100755 --- 
a/BookGPU/Chapters/chapter17/ch17.tex +++ b/BookGPU/Chapters/chapter17/ch17.tex @@ -1,7 +1,7 @@ \chapterauthor{Guillaume Laville}{Femto-ST Institute, University of Franche-Comte, France} \chapterauthor{Christophe Lang}{Femto-ST Institute, University of Franche-Comte, France} \chapterauthor{Kamel Mazouzi}{Franche-Comte Computing Center, University of Franche-Comte, France} -\chapterauthor{Nicolas Marilleau}{UMMISCO, Institut de Recherche pour le Développement (IRD), France} +\chapterauthor{Nicolas Marilleau}{UMMISCO, Institut de Recherche pour le Developpement (IRD), France} \chapterauthor{Bénédicte Herrmann}{Femto-ST Institute, University of Franche-Comte, France} \chapterauthor{Laurent Philippe}{Femto-ST Institute, University of Franche-Comte, France} diff --git a/BookGPU/Chapters/chapter7/ch7.tex b/BookGPU/Chapters/chapter7/ch7.tex index 6ca2080..ab70af2 100644 --- a/BookGPU/Chapters/chapter7/ch7.tex +++ b/BookGPU/Chapters/chapter7/ch7.tex @@ -820,14 +820,14 @@ Ideally, the ratio $\mathcal{C}_\mathcal{G}/\mathcal{C}_\mathcal{F}$ is small an \begin{figure}[!htb] \begin{center} \setlength\figureheight{0.35\textwidth} - \setlength\figurewidth{0.37\textwidth} + \setlength\figurewidth{0.35\textwidth} \subfigure[Performance scaling]{ % {\small\input{Chapters/chapter7/figures/PararealScaletestGTX590.tikz}} - \includegraphics[width=0.5\textwidth]{Chapters/chapter7/figures/PararealScaletestGTX590_conv.pdf} + \includegraphics[width=0.47\textwidth]{Chapters/chapter7/figures/PararealScaletestGTX590_conv.pdf} } \subfigure[Speedup]{ % {\small\input{Chapters/chapter7/figures/PararealSpeedupGTX590.tikz}} - \includegraphics[width=0.5\textwidth]{Chapters/chapter7/figures/PararealSpeedupGTX590_conv.pdf} + \includegraphics[width=0.47\textwidth]{Chapters/chapter7/figures/PararealSpeedupGTX590_conv.pdf} } \end{center} \caption[Parareal absolute timings and parareal speedup.]{(a) Parareal absolute timings for an increasingly number of water waves traveling one wave length, each wave 
resolution is ($33\times 9$). (b) Parareal speedup for two to sixteen compute nodes compared to the purely sequential single GPU solver. Notice how insensitive the parareal scheme is to the size of the problem solved. Test environment 2.}\label{ch7:fig:DDPA_SPEEDUP} -- 2.39.5 From 8fd941eeeeccad914f5e2b833bc00c9d2401efd6 Mon Sep 17 00:00:00 2001 From: couturie Date: Fri, 10 May 2013 13:39:05 +0200 Subject: [PATCH 13/16] new --- BookGPU/Chapters/chapter1/ch1.tex | 2 +- BookGPU/Chapters/chapter12/ch12.tex | 6 +++--- BookGPU/Chapters/chapter2/ch2.tex | 2 +- BookGPU/Chapters/chapter3/ch3.aux | 4 ++-- BookGPU/Chapters/chapter3/ch3.tex | 5 +++-- BookGPU/Chapters/chapter4/ch4.tex | 9 ++++++--- BookGPU/sunil.cls | 2 +- 7 files changed, 17 insertions(+), 13 deletions(-) diff --git a/BookGPU/Chapters/chapter1/ch1.tex b/BookGPU/Chapters/chapter1/ch1.tex index 6c3f1f1..17a7e47 100755 --- a/BookGPU/Chapters/chapter1/ch1.tex +++ b/BookGPU/Chapters/chapter1/ch1.tex @@ -1,4 +1,4 @@ -\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte} +\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte, France} \chapter{Presentation of the GPU architecture and of the Cuda environment} diff --git a/BookGPU/Chapters/chapter12/ch12.tex b/BookGPU/Chapters/chapter12/ch12.tex index 7f97864..576a0cd 100755 --- a/BookGPU/Chapters/chapter12/ch12.tex +++ b/BookGPU/Chapters/chapter12/ch12.tex @@ -5,9 +5,9 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %\chapterauthor{}{} -\chapterauthor{Lilia Ziane Khodja}{Femto-ST Institute, University of Franche-Comte, France} -\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte, France} -\chapterauthor{Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France} +\chapterauthor{Lilia Ziane Khodja, Raphaël Couturier and Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France} +%\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte, France} 
+%\chapterauthor{Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France} \chapter[Solving linear systems with GMRES and CG methods on GPU clusters]{Solving sparse linear systems with GMRES and CG methods on GPU clusters} \label{ch12} diff --git a/BookGPU/Chapters/chapter2/ch2.tex b/BookGPU/Chapters/chapter2/ch2.tex index 2eba230..2f4f4ea 100755 --- a/BookGPU/Chapters/chapter2/ch2.tex +++ b/BookGPU/Chapters/chapter2/ch2.tex @@ -1,4 +1,4 @@ -\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte} +\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte, France} \chapter{Introduction to Cuda} \label{chapter2} diff --git a/BookGPU/Chapters/chapter3/ch3.aux b/BookGPU/Chapters/chapter3/ch3.aux index 58d4a58..0f932fd 100644 --- a/BookGPU/Chapters/chapter3/ch3.aux +++ b/BookGPU/Chapters/chapter3/ch3.aux @@ -19,11 +19,11 @@ \@writefile{toc}{\contentsline {section}{\numberline {3.2}Performance measurements}{28}} \newlabel{lst:chronos}{{3.4}{28}} \@writefile{lol}{\contentsline {lstlisting}{\numberline {3.4}Time measurement technique using cutil functions}{28}} +\@writefile{toc}{\author{Gilles Perrot}{}} \@writefile{loa}{\addvspace {10\p@ }} \@writefile{toc}{\contentsline {chapter}{\numberline {4}Implementing a fast median filter}{31}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\@writefile{toc}{\author{Gilles Perrot}{}} \@writefile{toc}{\contentsline {section}{\numberline {4.1}Introduction}{31}} \@writefile{toc}{\contentsline {section}{\numberline {4.2}Median filtering}{32}} \@writefile{toc}{\contentsline {subsection}{\numberline {4.2.1}Basic principles}{32}} @@ -127,7 +127,7 @@ \setcounter{subparagraph}{0} \setcounter{figure}{11} \setcounter{table}{3} -\setcounter{numauthors}{1} +\setcounter{numauthors}{0} \setcounter{parentequation}{0} \setcounter{subfigure}{0} \setcounter{lofdepth}{1} diff --git a/BookGPU/Chapters/chapter3/ch3.tex b/BookGPU/Chapters/chapter3/ch3.tex index 
1a991e2..8cd1767 100755 --- a/BookGPU/Chapters/chapter3/ch3.tex +++ b/BookGPU/Chapters/chapter3/ch3.tex @@ -1,4 +1,4 @@ -\chapterauthor{Gilles Perrot}{FEMTO-ST Institute} +\chapterauthor{Gilles Perrot}{Femto-ST Institute, University of Franche-Comte, France} %\graphicspath{{img/}} @@ -181,8 +181,9 @@ Last, like many authors, we chose to use the pixel throughput value of each proc In order to estimate the potential for improvement of each kernel, a reference throughput measurement, involving identity kernel of Listing \ref{lst:fkern1}, was performed. As this kernel only fetches input values from texture memory and outputs them to global memory without doing any computation, it represents the smallest, thus fastest, possible process and is taken as the reference throughput value (100\%). The same measurement was performed on CPU, with a maximum effective pixel throughput of 130~Mpixel per second. On GPU, depending on grid parameters it amounts to 800~MPixels/s on GTX280 and 1300~Mpixels/s on C2070. +\chapterauthor{Gilles Perrot}{Femto-ST Institute, University of Franche-Comte, France} + \chapter{Implementing a fast median filter} -\chapterauthor{Gilles Perrot}{FEMTO-ST Institute} \section{Introduction} Median filtering is a well-known method used in a wide range of application frameworks as well as a standalone filter especially for \textit{salt and pepper} denoising. It is able to highly reduce power of noise without blurring edges too much. 
diff --git a/BookGPU/Chapters/chapter4/ch4.tex b/BookGPU/Chapters/chapter4/ch4.tex index d618753..e3dbfd5 100644 --- a/BookGPU/Chapters/chapter4/ch4.tex +++ b/BookGPU/Chapters/chapter4/ch4.tex @@ -1,4 +1,7 @@ -\chapterauthor{Gilles Perrot}{FEMTO-ST Institute} +\chapterauthor{Gilles Perrot}{Femto-ST Institute, University of Franche-Comte, France} + +\chapter{Implementing an efficient convolution operation on GPU} + %\newcommand{\kl}{\includegraphics[scale=0.6]{Chapters/chapter4/img/kernLeft.png}~} %\newcommand{\kr}{\includegraphics[scale=0.6]{Chapters/chapter4/img/kernRight.png}} @@ -29,10 +32,10 @@ %% } -\chapter{Implementing an efficient convolution \index{Convolution} operation on GPU} + \section{Overview} In this chapter, after dealing with GPU median filter implementations, -we propose to explore how convolutions can be implemented on modern +we propose to explore how convolutions\index{Convolution} can be implemented on modern GPUs. Widely used in digital image processing filters, the \emph{convolution operation} basically consists in taking the sum of products of elements from two 2-D functions, letting one of the two functions move over diff --git a/BookGPU/sunil.cls b/BookGPU/sunil.cls index bc2fad6..754c025 100755 --- a/BookGPU/sunil.cls +++ b/BookGPU/sunil.cls @@ -1407,7 +1407,7 @@ \@starttoc{lof}% \if@restonecol\twocolumn\fi } -\newcommand*\l@figure{\@dottedtocline{1}{1.5em}{2.3em}} +\newcommand*\l@figure{\@dottedtocline{1}{1.5em}{2.8em}} \newcommand\listoftables{% \if@twocolumn \@restonecoltrue\onecolumn -- 2.39.5 From 715679c7fdeac58424c5052fffe7677f15337801 Mon Sep 17 00:00:00 2001 From: couturie Date: Fri, 10 May 2013 21:20:47 +0200 Subject: [PATCH 14/16] new --- BookGPU/Chapters/chapter10/ch10.tex | 6 +++--- BookGPU/Chapters/chapter11/ch11.tex | 4 ++-- BookGPU/Chapters/chapter13/ch13.tex | 6 +++--- BookGPU/Chapters/chapter14/ch14.tex | 2 +- BookGPU/Chapters/chapter15/ch15.tex | 10 +++++----- BookGPU/Chapters/chapter17/ch17.tex | 8 ++++---- 
BookGPU/Chapters/chapter18/ch18.tex | 4 ++-- BookGPU/Chapters/chapter19/ch19.tex | 4 ++-- BookGPU/Chapters/chapter5/ch5.tex | 8 ++++---- BookGPU/Chapters/chapter7/ch7.tex | 8 ++++---- BookGPU/Chapters/chapter8/ch8.tex | 9 +++++---- BookGPU/Chapters/chapter9/ch9.tex | 4 ++-- BookGPU/sunil.cls | 10 +++++----- 13 files changed, 42 insertions(+), 41 deletions(-) diff --git a/BookGPU/Chapters/chapter10/ch10.tex b/BookGPU/Chapters/chapter10/ch10.tex index c47af05..a91e946 100644 --- a/BookGPU/Chapters/chapter10/ch10.tex +++ b/BookGPU/Chapters/chapter10/ch10.tex @@ -1,8 +1,8 @@ -\chapterauthor{Xavier Meyer}{Department of Computer Science, University of Geneva} -\chapterauthor{Paul Albuquerque}{Institute for Informatics and Telecommunications, hepia, \\ University of Applied Sciences of Western Switzerland - Geneva} -\chapterauthor{Bastien Chopard}{Department of Computer Science, University of Geneva} +\chapterauthor{Xavier Meyer and Bastien Chopard}{Department of Computer Science, University of Geneva, Switzerland} +\chapterauthor{Paul Albuquerque}{Institute for Informatics and Telecommunications, Hepia, \\ University of Applied Sciences of Western Switzerland - Geneva, Switzerland} +%\chapterauthor{Bastien Chopard}{Department of Computer Science, University of Geneva} %\chapter{Linear programming on a GPU: a study case based on the simplex method and the branch-cut-and bound algorithm} \chapter{Linear programming on a GPU: a case study} diff --git a/BookGPU/Chapters/chapter11/ch11.tex b/BookGPU/Chapters/chapter11/ch11.tex index a9667e7..85d5304 100644 --- a/BookGPU/Chapters/chapter11/ch11.tex +++ b/BookGPU/Chapters/chapter11/ch11.tex @@ -1,6 +1,6 @@ -\chapterauthor{Gleb Beliakov}{School of Information Technology, Deakin University, Burwood 3125, Australia} -\chapterauthor{Shaowu Liu}{School of Information Technology, Deakin University, Burwood 3125, Australia} +\chapterauthor{Gleb Beliakov and Shaowu Liu}{School of Information Technology, Deakin University, Burwood 3125, 
Australia} +%\chapterauthor{Shaowu Liu}{School of Information Technology, Deakin University, Burwood 3125, Australia} \chapter{Parallel Monotone Spline Interpolation and Approximation on GPUs} diff --git a/BookGPU/Chapters/chapter13/ch13.tex b/BookGPU/Chapters/chapter13/ch13.tex index 7a1aa42..941f085 100755 --- a/BookGPU/Chapters/chapter13/ch13.tex +++ b/BookGPU/Chapters/chapter13/ch13.tex @@ -5,11 +5,11 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %\chapterauthor{}{} -\chapterauthor{Lilia Ziane Khodja}{Femto-ST Institute, University of Franche-Comte, France} +\chapterauthor{Lilia Ziane Khodja, Raphaël Couturier and Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France} \chapterauthor{Ming Chau}{Advanced Solutions Accelerator, Castelnau Le Lez, France} -\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte, France} +%\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comte, France} \chapterauthor{Pierre Spitéri}{ENSEEIHT-IRIT, Toulouse, France} -\chapterauthor{Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France} +%\chapterauthor{Jacques Bahi}{Femto-ST Institute, University of Franche-Comte, France} \chapter{Solving sparse nonlinear systems of obstacle problems on GPU clusters} diff --git a/BookGPU/Chapters/chapter14/ch14.tex b/BookGPU/Chapters/chapter14/ch14.tex index bf3cd66..318015f 100755 --- a/BookGPU/Chapters/chapter14/ch14.tex +++ b/BookGPU/Chapters/chapter14/ch14.tex @@ -1,4 +1,4 @@ -\chapterauthor{Alan Gray and Kevin Stratford}{EPCC, The University of Edinburgh} +\chapterauthor{Alan Gray and Kevin Stratford}{EPCC, The University of Edinburgh, United Kingdom} \chapter[Ludwig: multiple GPUs for a fluid lattice Boltzmann application]{Ludwig: multiple GPUs for a complex fluid lattice Boltzmann diff --git a/BookGPU/Chapters/chapter15/ch15.tex b/BookGPU/Chapters/chapter15/ch15.tex index c0de11e..11a9e34 100644 --- a/BookGPU/Chapters/chapter15/ch15.tex +++ 
b/BookGPU/Chapters/chapter15/ch15.tex @@ -1,9 +1,9 @@ -\chapterauthor{Pierre Fortin}{Laboratoire d'Informatique de Paris 6, University Paris 6} -\chapterauthor{Rachid Habel}{T\'el\'ecom SudParis} -\chapterauthor{Fabienne J\'ez\'equel}{Laboratoire d'Informatique de Paris 6, University Paris 6} -\chapterauthor{Jean-Luc Lamotte}{Laboratoire d'Informatique de Paris 6, University Paris 6} +\chapterauthor{Pierre Fortin, Fabienne J\'ez\'equel and Jean-Luc Lamotte}{Laboratoire d'Informatique de Paris 6, University Paris 6, France} +\chapterauthor{Rachid Habel}{T\'el\'ecom SudParis, France} +%\chapterauthor{Fabienne J\'ez\'equel}{Laboratoire d'Informatique de Paris 6, University Paris 6} +%\chapterauthor{Jean-Luc Lamotte}{Laboratoire d'Informatique de Paris 6, University Paris 6} \chapterauthor{Stan Scott}{School of Electronics, Electrical Engineering \& Computer Science, -The Queen's University of Belfast} +The Queen's University of Belfast, Ireland} %\newcommand{\fixme}[1]{{\bf #1}} diff --git a/BookGPU/Chapters/chapter17/ch17.tex b/BookGPU/Chapters/chapter17/ch17.tex index 77cdd11..7576284 100755 --- a/BookGPU/Chapters/chapter17/ch17.tex +++ b/BookGPU/Chapters/chapter17/ch17.tex @@ -1,9 +1,9 @@ -\chapterauthor{Guillaume Laville}{Femto-ST Institute, University of Franche-Comte, France} -\chapterauthor{Christophe Lang}{Femto-ST Institute, University of Franche-Comte, France} +\chapterauthor{Guillaume Laville, Christophe Lang, Bénédicte Herrmann and Laurent Philippe}{Femto-ST Institute, University of Franche-Comte, France} +%\chapterauthor{Christophe Lang}{Femto-ST Institute, University of Franche-Comte, France} \chapterauthor{Kamel Mazouzi}{Franche-Comte Computing Center, University of Franche-Comte, France} \chapterauthor{Nicolas Marilleau}{UMMISCO, Institut de Recherche pour le Developpement (IRD), France} -\chapterauthor{Bénédicte Herrmann}{Femto-ST Institute, University of Franche-Comte, France} -\chapterauthor{Laurent Philippe}{Femto-ST Institute, University of 
Franche-Comte, France} +%\chapterauthor{Bénédicte Herrmann}{Femto-ST Institute, University of Franche-Comte, France} +%\chapterauthor{Laurent Philippe}{Femto-ST Institute, University of Franche-Comte, France} \newlength\mylen \newcommand\myinput[1]{% diff --git a/BookGPU/Chapters/chapter18/ch18.tex b/BookGPU/Chapters/chapter18/ch18.tex index 7f77d59..d72e9a3 100755 --- a/BookGPU/Chapters/chapter18/ch18.tex +++ b/BookGPU/Chapters/chapter18/ch18.tex @@ -1,5 +1,5 @@ -\chapterauthor{Raphaël Couturier}{Femto-ST Institute, University of Franche-Comt\'{e}} -\chapterauthor{Christophe Guyeux}{Femto-ST Institute, University of Franche-Comt\'{e}} +\chapterauthor{Raphaël Couturier and Christophe Guyeux}{Femto-ST Institute, University of Franche-Comte, France} +%\chapterauthor{Christophe Guyeux}{Femto-ST Institute, University of Franche-Comt\'{e}} \chapter{Pseudorandom Number Generator on GPU} diff --git a/BookGPU/Chapters/chapter19/ch19.tex b/BookGPU/Chapters/chapter19/ch19.tex index e25a23a..fdf533d 100755 --- a/BookGPU/Chapters/chapter19/ch19.tex +++ b/BookGPU/Chapters/chapter19/ch19.tex @@ -1,5 +1,5 @@ -\chapterauthor{Bertil Schmidt}{Johannes Gutenberg University of Mainz} -\chapterauthor{Hoang-Vu Dang}{Johannes Gutenberg University of Mainz} +\chapterauthor{Bertil Schmidt and Hoang-Vu Dang}{Johannes Gutenberg University of Mainz, Germany} +%\chapterauthor{Hoang-Vu Dang}{Johannes Gutenberg University of Mainz} \chapter{Solving large sparse linear systems for integer factorization on GPUs} \label{chapter19} diff --git a/BookGPU/Chapters/chapter5/ch5.tex b/BookGPU/Chapters/chapter5/ch5.tex index ff24f8c..e37a709 100644 --- a/BookGPU/Chapters/chapter5/ch5.tex +++ b/BookGPU/Chapters/chapter5/ch5.tex @@ -1,7 +1,7 @@ -\chapterauthor{Stefan L. Glimberg}{Technical University of Denmark} -\chapterauthor{Allan P. Engsig-Karup}{Technical University of Denmark} -\chapterauthor{Allan S. 
Nielsen}{Technical University of Denmark} -\chapterauthor{Bernd Dammann}{Technical University of Denmark} +\chapterauthor{Stefan L. Glimberg, Allan P. Engsig-Karup, Allan S. Nielsen and Bernd Dammann}{Technical University of Denmark} +%\chapterauthor{Allan P. Engsig-Karup}{Technical University of Denmark} +%\chapterauthor{Allan S. Nielsen}{Technical University of Denmark} +%\chapterauthor{Bernd Dammann}{Technical University of Denmark} diff --git a/BookGPU/Chapters/chapter7/ch7.tex b/BookGPU/Chapters/chapter7/ch7.tex index ab70af2..1e0dd3d 100644 --- a/BookGPU/Chapters/chapter7/ch7.tex +++ b/BookGPU/Chapters/chapter7/ch7.tex @@ -1,8 +1,8 @@ -\chapterauthor{Allan P. Engsig-Karup}{Technical University of Denmark} -\chapterauthor{Stefan L. Glimberg}{Technical University of Denmark} -\chapterauthor{Allan S. Nielsen}{Technical University of Denmark} -\chapterauthor{Ole Lindberg}{Technical University of Denmark} +\chapterauthor{Allan P. Engsig-Karup, Stefan L. Glimberg, Allan S. Nielsen and Ole Lindberg}{Technical University of Denmark} +%\chapterauthor{Stefan L. Glimberg}{Technical University of Denmark} +%\chapterauthor{Allan S. 
Nielsen}{Technical University of Denmark} +%\chapterauthor{Ole Lindberg}{Technical University of Denmark} \chapter{Fast hydrodynamics on heterogenous many-core hardware} \label{ch7} diff --git a/BookGPU/Chapters/chapter8/ch8.tex b/BookGPU/Chapters/chapter8/ch8.tex index 24a71bc..5d012eb 100644 --- a/BookGPU/Chapters/chapter8/ch8.tex +++ b/BookGPU/Chapters/chapter8/ch8.tex @@ -1,6 +1,6 @@ -\chapterauthor{Imen Chakroun}{Universit\'e Lille 1 CNRS/LIFL, INRIA Lille Nord Europe, Cit\'e scientifique - 59655, Villeneuve d'Ascq cedex, France\\} -\chapterauthor{Nouredine Melab}{Universit\'e Lille 1 CNRS/LIFL, INRIA Lille Nord Europe, Cit\'e scientifique - 59655, Villeneuve d'Ascq cedex, France\\} +\chapterauthor{Imen Chakroun and Nouredine Melab}{University of Lille 1 CNRS/LIFL, INRIA Lille Nord Europe, Cit\'e scientifique, 59655 Villeneuve d'Ascq cedex, France\\} +%\chapterauthor{Nouredine Melab}{Universit\'e Lille 1 CNRS/LIFL, INRIA Lille Nord Europe, Cit\'e scientifique - 59655, Villeneuve d'Ascq cedex, France\\} \chapter{GPU-accelerated Tree-based Exact Optimization Methods} \label{ch8:GPU-accelerated-tree-based-exact-optimization-methods} @@ -254,11 +254,12 @@ In the following, we present how we dealt with the thread/branch divergence issu \vspace{-0.4cm} -\section{Thread divergence \index{Thread divergence}} +\section{Thread divergence} +\label{ch8:ThreadDivergence} \subsection{The thread divergence issue} -During the execution of an application on GPU, to each GPU multiprocessor is assigned one or more thread block(s) to execute. Those threads are partitioned into warps that get scheduled for execution. For each instruction of the flow, the multiprocessor selects a warp that is ready to be run. A warp executes one common instruction at a time, so full efficiency is realized when all threads of a warp agree on their execution path. In this chapter, the G80 model, in which a warp is a pool of 32 threads, is used. 
If threads of a warp diverge via a data-dependent conditional branch, the warp serially executes each branch path taken. Threads that are not on the taken path are disabled, and when all paths complete, the threads converge back to the same execution path. This phenomenon is called thread/branch divergence and often causes serious performance degradations. Branch divergence occurs only within a warp; different warps execute independently regardless of whether they are executing common or disjointed code paths. +During the execution of an application on GPU, to each GPU multiprocessor is assigned one or more thread block(s) to execute. Those threads are partitioned into warps that get scheduled for execution. For each instruction of the flow, the multiprocessor selects a warp that is ready to be run. A warp executes one common instruction at a time, so full efficiency is realized when all threads of a warp agree on their execution path. In this chapter, the G80 model, in which a warp is a pool of 32 threads, is used. If threads of a warp diverge via a data-dependent conditional branch, the warp serially executes each branch path taken. Threads that are not on the taken path are disabled, and when all paths complete, the threads converge back to the same execution path. This phenomenon is called thread/branch divergence\index{Thread divergence} and often causes serious performance degradations. Branch divergence occurs only within a warp; different warps execute independently regardless of whether they are executing common or disjointed code paths. 
\vspace{0.2cm} diff --git a/BookGPU/Chapters/chapter9/ch9.tex b/BookGPU/Chapters/chapter9/ch9.tex index dce1889..efaa83e 100644 --- a/BookGPU/Chapters/chapter9/ch9.tex +++ b/BookGPU/Chapters/chapter9/ch9.tex @@ -1,8 +1,8 @@ -\chapterauthor{Malika Mehdi}{CERIST Research Center, DTISI, 3 rue des frères Aissou, 16030 Ben-Aknoun, Algiers, Algeria} +\chapterauthor{Malika Mehdi and Ahc\`{e}ne Bendjoudi}{CERIST Research Center, DTISI, 3 rue des frères Aissou, 16030 Ben-Aknoun, Algiers, Algeria} \chapterauthor{Lakhdar Loukil}{University of Oran, Algeria} -\chapterauthor{Ahc\`{e}ne Bendjoudi}{CERIST Research Center, DTISI, 3 rue des frères Aissou, 16030 Ben-Aknoun, Algiers, Algeria} +%\chapterauthor{Ahc\`{e}ne Bendjoudi}{CERIST Research Center, DTISI, 3 rue des frères Aissou, 16030 Ben-Aknoun, Algiers, Algeria} \chapterauthor{Nouredine Melab}{Université Lille 1, LIFL/UMR CNRS 8022, 59655-Villeneuve d’Ascq cedex, France} diff --git a/BookGPU/sunil.cls b/BookGPU/sunil.cls index 754c025..c54ae3e 100755 --- a/BookGPU/sunil.cls +++ b/BookGPU/sunil.cls @@ -1300,31 +1300,31 @@ \stepcounter{numauthors} %%\the\c@numauthors \ifnum\c@numauthors=1 % - \sbox\@AUonebox{\CAPlusOneFont#1} + \sbox\@AUonebox{\vbox{\hsize\textwidth\CAPlusOneFont\raggedright\noindent \CAPlusOneFont#1}} \sbox\@AUaffonebox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}} \sbox\@finalAUboxfromone{\copy\@AUonebox} \def\chapter@authorone{\copy\@finalAUboxfromone} \def\chapter@affiliationone{\copy\@AUaffonebox} \fi \ifnum\c@numauthors=2 - \sbox\@AUtwobox{\CAPlusOneFont#1} + \sbox\@AUtwobox{\vbox{\hsize\textwidth\CAPlusOneFont\raggedright\noindent \CAPlusOneFont#1}} \sbox\@AUafftwobox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}} \sbox\@finalAUboxfromtwo{\copy\@AUtwobox} \def\chapter@authortwo{\copy\@finalAUboxfromtwo} \def\chapter@affiliationtwo{\copy\@AUafftwobox} \fi \ifnum\c@numauthors=3 - \sbox\@AUthreebox{\CAPlusOneFont#1} + 
\sbox\@AUthreebox{\vbox{\hsize\textwidth\CAPlusOneFont\raggedright\noindent \CAPlusOneFont#1}} \sbox\@AUaffthreebox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}} \sbox\@finalAUboxfromthree{\copy\@AUthreebox} \def\chapter@authorthree{\copy\@finalAUboxfromthree} \def\chapter@affiliationthree{\copy\@AUaffthreebox} \fi \ifnum\c@numauthors=4 - \sbox\@AUfourbox{\CAPlusOneFont#1} + \sbox\@AUfourbox{\vbox{\hsize\textwidth\CAPlusOneFont\raggedright\noindent \CAPlusOneFont#1}} \sbox\@AUafffourbox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}} \sbox\@finalAUboxfromfour{\copy\@AUfourbox} \def\chapter@authorfour{\copy\@finalAUboxfromfour} \def\chapter@affiliationfour{\copy\@AUafffourbox} \fi \ifnum\c@numauthors=5 - \sbox\@AUfivebox{\CAPlusOneFont#1} + \sbox\@AUfivebox{\vbox{\hsize\textwidth\CAPlusOneFont\raggedright\noindent \CAPlusOneFont#1}} \sbox\@AUafffivebox{\vbox{\hsize\textwidth\CAAPlusOneFont\noindent #2\par}} \sbox\@finalAUboxfromfive{\copy\@AUfivebox} \def\chapter@authorfive{\copy\@finalAUboxfromfive} -- 2.39.5 From a60840091cd318f2c8e530740dd7940fc393e898 Mon Sep 17 00:00:00 2001 From: couturie Date: Sat, 11 May 2013 10:36:43 +0200 Subject: [PATCH 15/16] new --- BookGPU/Chapters/chapter16/ch16.tex | 12 +++++------- BookGPU/Chapters/chapter17/ch17.tex | 2 ++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/BookGPU/Chapters/chapter16/ch16.tex b/BookGPU/Chapters/chapter16/ch16.tex index bfa5d6c..e7ba3ba 100644 --- a/BookGPU/Chapters/chapter16/ch16.tex +++ b/BookGPU/Chapters/chapter16/ch16.tex @@ -1,11 +1,9 @@ -\chapterauthor{X.-X. Liu}{Dept. Electrical Engineering, - University of California, Riverside, CA 92521} -\chapterauthor{S. X.-D. Tan}{Dept. Electrical Engineering, - University of California, Riverside, CA 92521} -\chapterauthor{H. Wang}{Univ. of Electronics Science and Technology of China, +\chapterauthor{Xuexin Liu, Sheldon Xiang-Dong Tan}{Dept. 
Electrical Engineering, + University of California, Riverside, CA 92521, USA} +%\chapterauthor{Sheldon Xiang-Dong Tan}{Dept. Electrical Engineering, University of California, Riverside, CA 92521} +\chapterauthor{Hai Wang}{Univ. of Electronics Science and Technology of China, Chengdu, Sichuan, China} -\chapterauthor{H. Yu}{School of Electrical \& Electronic Engineering, - Nanyang Technological University, Singapore} +\chapterauthor{Hao Yu}{School of Electrical \& Electronic Engineering, Nanyang Technological University, Singapore} % \thanks{ % This research was supported in part by NSF grants under diff --git a/BookGPU/Chapters/chapter17/ch17.tex b/BookGPU/Chapters/chapter17/ch17.tex index 7576284..d6e3a31 100755 --- a/BookGPU/Chapters/chapter17/ch17.tex +++ b/BookGPU/Chapters/chapter17/ch17.tex @@ -374,6 +374,7 @@ the total numbers of access needed to updated those populations. %\lstinputlisting[language=C,caption=Collembola OpenCL %kernels,label=fig:collem_kernels]{Chapters/chapter17/code/collem_kernels.cl} +\pagebreak \lstinputlisting[caption=Collembola OpenCL Diffusion kernel,label=ch17:listing:collembola-diffuse]{Chapters/chapter17/code/collem_kernel_diffuse.cl} The reproduction, diffusion and culling steps are implemented on GPU @@ -646,6 +647,7 @@ implementation by enforcing a quasi-sequential execution. It is necessary, in the case of MIOR as well as for other ABM, to ensure that each work-item is not too constrained in its execution. 
+\pagebreak \lstinputlisting[caption=Main MIOR kernel,label=ch17:listing:mior_kernels]{Chapters/chapter17/code/mior_kernels.cl} From the sequential algorithm 1 where all the agents share the same -- 2.39.5 From afa299e693f71360f1ce504754768b26916e5475 Mon Sep 17 00:00:00 2001 From: couturie Date: Mon, 13 May 2013 15:38:29 +0200 Subject: [PATCH 16/16] correct --- BookGPU/Chapters/chapter15/ch15.tex | 5 +++-- BookGPU/Chapters/chapter17/code/mior_launcher.java | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/BookGPU/Chapters/chapter15/ch15.tex b/BookGPU/Chapters/chapter15/ch15.tex index 11a9e34..0225579 100644 --- a/BookGPU/Chapters/chapter15/ch15.tex +++ b/BookGPU/Chapters/chapter15/ch15.tex @@ -1,9 +1,10 @@ -\chapterauthor{Pierre Fortin, Fabienne J\'ez\'equel and Jean-Luc Lamotte}{Laboratoire d'Informatique de Paris 6, University Paris 6, France} \chapterauthor{Rachid Habel}{T\'el\'ecom SudParis, France} +\chapterauthor{Pierre Fortin, Fabienne J\'ez\'equel and Jean-Luc Lamotte}{Laboratoire d'Informatique de Paris 6, Université Pierre et Marie Curie, France} + %\chapterauthor{Fabienne J\'ez\'equel}{Laboratoire d'Informatique de Paris 6, University Paris 6} %\chapterauthor{Jean-Luc Lamotte}{Laboratoire d'Informatique de Paris 6, University Paris 6} \chapterauthor{Stan Scott}{School of Electronics, Electrical Engineering \& Computer Science, -The Queen's University of Belfast, Ireland} +The Queen's University of Belfast, United Kingdom} %\newcommand{\fixme}[1]{{\bf #1}} diff --git a/BookGPU/Chapters/chapter17/code/mior_launcher.java b/BookGPU/Chapters/chapter17/code/mior_launcher.java index 159b88a..5203858 100644 --- a/BookGPU/Chapters/chapter17/code/mior_launcher.java +++ b/BookGPU/Chapters/chapter17/code/mior_launcher.java @@ -10,7 +10,6 @@ protected void doLiveImpl() { OCLEvent event = queue.enqueue1DKernel(simulateKernel, nbSim * blockSize, blockSize); OCLEvent.waitFor(event); - OCLUtils.printEventStats("simulate", event); if (! 
isBatchModeEnabled()) { queue.blockingReadBuffer(mmMem, mmList, 0, mmMem.getSize()); -- 2.39.5