From: couturie Date: Tue, 8 Oct 2013 18:41:54 +0000 (+0200) Subject: new X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/HEAD?ds=sidebyside new --- diff --git a/BookGPU/BookGPU.toc_new2 b/BookGPU/BookGPU.toc_new2 new file mode 100644 index 0000000..a61d04b --- /dev/null +++ b/BookGPU/BookGPU.toc_new2 @@ -0,0 +1,345 @@ +\contentsline {fm}{List of Figures}{xi} +\contentsline {fm}{List of Tables}{xvii} +\contentsline {fm}{Preface}{xxi} +\contentsline {part}{I\hspace {1em}Presentation of GPUs}{1} +\author {Rapha\IeC {\"e}l Couturier}{} +\contentsline {chapter}{\numberline {1}Presentation of the GPU architecture and of the CUDA\hfill \penalty -\@M environment}{3} +\contentsline {section}{\numberline {1.1}Introduction}{3} +\contentsline {section}{\numberline {1.2}Brief history of the video card}{4} +\contentsline {section}{\numberline {1.3}GPGPU}{4} +\contentsline {section}{\numberline {1.4}Architecture of current GPUs}{5} +\contentsline {section}{\numberline {1.5}Kinds of parallelism}{7} +\contentsline {section}{\numberline {1.6}CUDA multithreading}{8} +\contentsline {section}{\numberline {1.7}Memory hierarchy}{9} +\contentsline {section}{\numberline {1.8}Conclusion}{11} +\author {Rapha\IeC {\"e}l Couturier}{} +\contentsline {chapter}{\numberline {2}Introduction to CUDA}{13} +\contentsline {section}{\numberline {2.1}Introduction}{13} +\contentsline {section}{\numberline {2.2}First example}{13} +\contentsline {section}{\numberline {2.3}Second example: using CUBLAS }{16} +\contentsline {section}{\numberline {2.4}Third example: matrix-matrix multiplication}{18} +\contentsline {section}{\numberline {2.5}Conclusion}{21} +\contentsline {part}{II\hspace {1em}Image processing}{23} +\author {Gilles Perrot}{} +\contentsline {chapter}{\numberline {3}Setting up the environment}{25} +\contentsline {section}{\numberline {3.1}Data transfers, memory management}{25} +\contentsline {section}{\numberline {3.2}Performance measurements}{28} +\author 
{Gilles Perrot}{} +\contentsline {chapter}{\numberline {4}Implementing a fast median filter}{31} +\contentsline {section}{\numberline {4.1}Introduction}{31} +\contentsline {section}{\numberline {4.2}Median filtering}{32} +\contentsline {subsection}{\numberline {4.2.1}Basic principles}{32} +\contentsline {subsection}{\numberline {4.2.2}A naive implementation}{33} +\contentsline {section}{\numberline {4.3}NVIDIA GPU tuning recipes}{35} +\contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers}{37} +\contentsline {subsection}{\numberline {4.4.1}The simplest way}{37} +\contentsline {subsection}{\numberline {4.4.2}Further optimization}{38} +\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{39} +\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{42} +\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{44} +\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{44} +\contentsline {subsection}{\numberline {4.5.2}Fast approximated $n\times n$ median filter }{47} +\author {Gilles Perrot}{} +\contentsline {chapter}{\numberline {5}Implementing an efficient convolution operation on GPU}{53} +\contentsline {section}{\numberline {5.1}Overview}{53} +\contentsline {section}{\numberline {5.2}Definition}{54} +\contentsline {section}{\numberline {5.3}Implementation}{54} +\contentsline {subsection}{\numberline {5.3.1}First test implementation}{55} +\contentsline {subsection}{\numberline {5.3.2}Using parameterizable masks}{58} +\contentsline {subsection}{\numberline {5.3.3}Increasing the number of pixels processed by each thread}{59} +\contentsline {subsection}{\numberline {5.3.4}Using shared memory to store prefetched data}{62} +\contentsline {section}{\numberline {5.4}Separable convolution}{65} +\contentsline {section}{\numberline {5.5}Conclusion}{69} +\contentsline {part}{III\hspace {1em}Software development}{71} +\author {Stefan L. 
Glimberg, Allan P. Engsig-Karup, Allan S. Nielsen, and Bernd Dammann}{} +\contentsline {chapter}{\numberline {6}Development of software components for heterogeneous many-core architectures}{73} +\contentsline {section}{\numberline {6.1}Software development for heterogeneous\hfill \penalty -\@M architectures}{74} +\contentsline {subsection}{\numberline {6.1.1}Test environments}{76} +\contentsline {section}{\numberline {6.2}Heterogeneous library design for PDE solvers}{76} +\contentsline {subsection}{\numberline {6.2.1}Component and concept design}{77} +\contentsline {subsection}{\numberline {6.2.2}A matrix-free finite difference component}{77} +\contentsline {section}{\numberline {6.3}Model problems}{81} +\contentsline {subsection}{\numberline {6.3.1}Heat conduction equation}{82} +\contentsline {subsubsection}{\numberline {6.3.1.1}Assembling the heat conduction solver}{84} +\contentsline {subsubsection}{\numberline {6.3.1.2}Numerical solutions to the heat conduction problem}{86} +\contentsline {subsection}{\numberline {6.3.2}Poisson equation}{87} +\contentsline {subsubsection}{\numberline {6.3.2.1}Assembling the Poisson solver}{89} +\contentsline {subsubsection}{\numberline {6.3.2.2}Numerical solutions to the Poisson problem}{91} +\contentsline {section}{\numberline {6.4}Optimization strategies for multi-GPU systems}{91} +\contentsline {subsection}{\numberline {6.4.1}Spatial domain decomposition}{93} +\contentsline {subsection}{\numberline {6.4.2}Parareal--parallel time integration}{95} +\contentsline {subsubsection}{\numberline {6.4.2.1}The parareal algorithm}{96} +\contentsline {subsubsection}{\numberline {6.4.2.2}Computational complexity}{98} +\contentsline {section}{\numberline {6.5}Conclusion and outlook}{100} +\author {Sylvain Contassot-Vivier}{} +\author {Stephane Vialle}{} +\author {Jens Gustedt}{} +\contentsline {chapter}{\numberline {7}Development methodologies for GPU and cluster of GPUs}{105} +\contentsline {section}{\numberline {7.1}Introduction}{106} 
+\contentsline {section}{\numberline {7.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{106} +\contentsline {subsection}{\numberline {7.2.1}Synchronous parallel algorithms on GPU clusters}{106} +\contentsline {subsection}{\numberline {7.2.2}Native overlap of CPU communications and GPU computations}{108} +\contentsline {subsection}{\numberline {7.2.3}Overlapping with sequences of transfers and computations}{110} +\contentsline {subsection}{\numberline {7.2.4}Interleaved communications-transfers-computations overlapping}{115} +\contentsline {subsection}{\numberline {7.2.5}Experimental validation}{118} +\contentsline {section}{\numberline {7.3}General scheme of asynchronous parallel code with computation/communication overlapping}{120} +\contentsline {subsection}{\numberline {7.3.1}A basic asynchronous scheme}{122} +\contentsline {subsection}{\numberline {7.3.2}Synchronization of the asynchronous scheme}{126} +\contentsline {subsection}{\numberline {7.3.3}Asynchronous scheme using MPI, OpenMP, and CUDA}{130} +\contentsline {subsection}{\numberline {7.3.4}Experimental validation}{137} +\contentsline {section}{\numberline {7.4}Perspective: a unifying programming model}{140} +\contentsline {subsection}{\numberline {7.4.1}Resources}{141} +\contentsline {subsection}{\numberline {7.4.2}Control}{142} +\contentsline {subsection}{\numberline {7.4.3}Example: block-cyclic matrix multiplication (MM)}{142} +\contentsline {subsection}{\numberline {7.4.4}Tasks and operations}{145} +\contentsline {section}{\numberline {7.5}Conclusion}{146} +\contentsline {section}{\numberline {7.6}Glossary}{146} +\contentsline {part}{IV\hspace {1em}Optimization}{151} +\author {Imen Chakroun and Nouredine Melab}{} +\contentsline {chapter}{\numberline {8}GPU-accelerated tree-based exact optimization methods}{153} +\contentsline {section}{\numberline {8.1}Introduction}{154} +\contentsline {section}{\numberline {8.2}Branch-and-bound algorithm}{156} 
+\contentsline {section}{\numberline {8.3}Parallel branch-and-bound algorithms}{157} +\contentsline {subsection}{\numberline {8.3.1}The parallel tree exploration model}{158} +\contentsline {subsection}{\numberline {8.3.2}The parallel evaluation of bounds model}{158} +\contentsline {section}{\numberline {8.4}The flowshop scheduling problem}{159} +\contentsline {subsection}{\numberline {8.4.1}Definition of the flowshop scheduling problem}{159} +\contentsline {subsection}{\numberline {8.4.2}Lower bound for the flowshop scheduling problem}{160} +\contentsline {section}{\numberline {8.5}GPU-accelerated B\&B based on the parallel tree exploration (GPU-PTE-BB)}{161} +\contentsline {section}{\numberline {8.6}GPU-accelerated B\&B based on the parallel evaluation of bounds (GPU-PEB-BB) }{162} +\contentsline {section}{\numberline {8.7}Thread divergence}{164} +\contentsline {subsection}{\numberline {8.7.1}The thread divergence issue}{164} +\contentsline {subsection}{\numberline {8.7.2}Mechanisms for reducing branch divergence}{165} +\contentsline {section}{\numberline {8.8}Memory access optimization}{168} +\contentsline {subsection}{\numberline {8.8.1}Complexity analysis of the memory usage of the lower bound }{169} +\contentsline {subsection}{\numberline {8.8.2}Data placement pattern of the lower bound on GPU}{170} +\contentsline {section}{\numberline {8.9}Experiments}{172} +\contentsline {subsection}{\numberline {8.9.1}Parameters settings}{172} +\contentsline {subsection}{\numberline {8.9.2}Experimental protocol: computing the speedup}{172} +\contentsline {subsection}{\numberline {8.9.3}Performance impact of GPU-based parallelism}{174} +\contentsline {subsection}{\numberline {8.9.4}Thread divergence reduction}{176} +\contentsline {subsection}{\numberline {8.9.5}Data access optimization}{176} +\contentsline {section}{\numberline {8.10}Conclusion and future work}{178} +\author {Malika Mehdi}{} +\author {Ahc\`{e}ne Bendjoudi}{} +\author {Lakhdar Loukil}{} +\author {Nouredine 
Melab}{} +\contentsline {chapter}{\numberline {9}Parallel GPU-accelerated metaheuristics}{183} +\contentsline {section}{\numberline {9.1}Introduction}{184} +\contentsline {section}{\numberline {9.2}Combinatorial optimization}{184} +\contentsline {section}{\numberline {9.3}Parallel models for metaheuristics}{185} +\contentsline {section}{\numberline {9.4}Challenges for the design of GPU-based metaheuristics}{187} +\contentsline {subsection}{\numberline {9.4.1}Data placement on a hierarchical memory}{188} +\contentsline {subsection}{\numberline {9.4.2}Threads synchronization}{189} +\contentsline {subsection}{\numberline {9.4.3}Thread divergence}{189} +\contentsline {subsection}{\numberline {9.4.4}Task distribution and CPU/GPU communication}{190} +\contentsline {section}{\numberline {9.5}State-of-the-art parallel metaheuristics on GPUs}{190} +\contentsline {subsection}{\numberline {9.5.1}Implementing solution-based metaheuristics on GPUs}{191} +\contentsline {subsection}{\numberline {9.5.2}Implementing population-based metaheuristics on GPUs}{193} +\contentsline {subsection}{\numberline {9.5.3}Synthesis of the implementation strategies}{198} +\contentsline {section}{\numberline {9.6}Frameworks for metaheuristics on GPUs}{201} +\contentsline {subsection}{\numberline {9.6.1}PUGACE: framework for implementing evolutionary computation on GPUs}{201} +\contentsline {subsection}{\numberline {9.6.2}ParadisEO-MO-GPU}{202} +\contentsline {subsection}{\numberline {9.6.3}libCUDAOptimize: an open source library of GPU-based metaheuristics}{202} +\contentsline {section}{\numberline {9.7}Case study: accelerating large neighborhood LS method on GPUs for solving the Q3AP}{203} +\contentsline {subsection}{\numberline {9.7.1}The quadratic 3-dimensional assignment problem}{204} +\contentsline {subsection}{\numberline {9.7.2}Iterated tabu search algorithm for the Q3AP}{205} +\contentsline {subsection}{\numberline {9.7.3}Neighborhood functions for the Q3AP}{206} +\contentsline 
{subsection}{\numberline {9.7.4}Design and implementation of a GPU-based iterated tabu search algorithm for the Q3AP}{207} +\contentsline {subsection}{\numberline {9.7.5}Experimental results}{208} +\contentsline {section}{\numberline {9.8}Conclusion}{210} +\author {Xavier Meyer}{} +\author {Bastien Chopard}{} +\author {Paul Albuquerque}{} +\contentsline {chapter}{\numberline {10}Linear programming on a GPU: a\nobreakspace {}case\nobreakspace {}study}{215} +\contentsline {section}{\numberline {10.1}Introduction}{216} +\contentsline {section}{\numberline {10.2}Simplex algorithm}{217} +\contentsline {subsection}{\numberline {10.2.1}Linear programming model}{217} +\contentsline {subsection}{\numberline {10.2.2}Standard simplex algorithm}{217} +\contentsline {subsection}{\numberline {10.2.3}Revised simplex method}{219} +\contentsline {subsection}{\numberline {10.2.4}Heuristics and improvements}{221} +\contentsline {section}{\numberline {10.3}Branch-and-bound algorithm}{224} +\contentsline {subsection}{\numberline {10.3.1}Integer linear programming}{224} +\contentsline {subsection}{\numberline {10.3.2}Branch-and-bound tree}{224} +\contentsline {subsection}{\numberline {10.3.3}Branching strategy}{225} +\contentsline {subsection}{\numberline {10.3.4}Node selection strategy}{226} +\contentsline {subsection}{\numberline {10.3.5}Cutting-plane methods}{227} +\contentsline {section}{\numberline {10.4}CUDA considerations}{228} +\contentsline {subsection}{\numberline {10.4.1}Parallel reduction}{228} +\contentsline {subsection}{\numberline {10.4.2}Kernel optimization}{228} +\contentsline {section}{\numberline {10.5}Implementations}{229} +\contentsline {subsection}{\numberline {10.5.1}Standard simplex}{229} +\contentsline {subsection}{\numberline {10.5.2}Revised simplex}{232} +\contentsline {subsection}{\numberline {10.5.3}Branch-and-bound}{236} +\contentsline {section}{\numberline {10.6}Performance model}{237} +\contentsline {subsection}{\numberline {10.6.1}Levels of 
parallelism}{238} +\contentsline {subsection}{\numberline {10.6.2}Amount of work done by a thread}{238} +\contentsline {subsection}{\numberline {10.6.3}Global performance model}{239} +\contentsline {subsection}{\numberline {10.6.4}A kernel example: \textit {steepest-edge}}{239} +\contentsline {subsection}{\numberline {10.6.5}Standard simplex GPU implementation model}{240} +\contentsline {section}{\numberline {10.7}Measurements and analysis}{241} +\contentsline {subsection}{\numberline {10.7.1}Performance model validation}{241} +\contentsline {subsection}{\numberline {10.7.2}Performance comparison of implementations}{241} +\contentsline {section}{\numberline {10.8}Conclusion and perspectives}{245} +\contentsline {part}{V\hspace {1em}Numerical applications}{249} +\author {Allan P. Engsig-Karup, Stefan L. Glimberg, Allan S. Nielsen, and Ole Lindberg}{} +\contentsline {chapter}{\numberline {11}Fast hydrodynamics on heterogeneous many-core hardware}{251} +\contentsline {section}{\numberline {11.1}On hardware trends and challenges in scientific applications}{253} +\contentsline {section}{\numberline {11.2}On modeling paradigms for highly nonlinear and dispersive water waves}{255} +\contentsline {section}{\numberline {11.3}Governing equations}{256} +\contentsline {subsection}{\numberline {11.3.1}Boundary conditions}{260} +\contentsline {section}{\numberline {11.4}The numerical model}{260} +\contentsline {subsection}{\numberline {11.4.1}Strategies for efficient solution of the Laplace problem}{261} +\contentsline {subsection}{\numberline {11.4.2}Finite difference approximations}{262} +\contentsline {subsection}{\numberline {11.4.3}Time integration}{263} +\contentsline {subsection}{\numberline {11.4.4}Wave generation and absorption}{266} +\contentsline {subsection}{\numberline {11.4.5}Preconditioned Defect Correction (PDC) method}{267} +\contentsline {subsection}{\numberline {11.4.6}Distributed computations via domain decomposition}{269} +\contentsline 
{subsection}{\numberline {11.4.7}Assembling the wave model from library components}{271} +\contentsline {section}{\numberline {11.5}Properties of the numerical model}{276} +\contentsline {subsection}{\numberline {11.5.1}Dispersion and kinematic properties}{276} +\contentsline {section}{\numberline {11.6}Numerical experiments}{277} +\contentsline {subsection}{\numberline {11.6.1}On precision requirements in engineering applications}{277} +\contentsline {subsection}{\numberline {11.6.2}Acceleration via parallelism in time using parareal}{282} +\contentsline {subsection}{\numberline {11.6.3}Towards real-time interactive ship simulation}{286} +\contentsline {section}{\numberline {11.7}Conclusion and future work}{288} +\contentsline {section}{\numberline {11.8}Acknowledgments}{289} +\author {Gleb Beliakov and Shaowu Liu}{} +\contentsline {chapter}{\numberline {12}Parallel monotone spline interpolation and approximation on GPUs}{295} +\contentsline {section}{\numberline {12.1}Introduction}{295} +\contentsline {section}{\numberline {12.2}Monotone splines}{298} +\contentsline {subsection}{\numberline {12.2.1}Monotone quadratic splines}{298} +\contentsline {subsection}{\numberline {12.2.2}Monotone Hermite splines}{302} +\contentsline {section}{\numberline {12.3}Smoothing noisy data via parallel isotone regression}{303} +\contentsline {section}{\numberline {12.4}Conclusion}{308} +\author {Lilia Ziane Khodja, Rapha\IeC {\"e}l Couturier, and Jacques Bahi}{} +\contentsline {chapter}{\numberline {13}Solving sparse linear systems with GMRES and CG methods on GPU clusters}{311} +\contentsline {section}{\numberline {13.1}Introduction}{311} +\contentsline {section}{\numberline {13.2}Krylov iterative methods}{312} +\contentsline {subsection}{\numberline {13.2.1}CG method}{313} +\contentsline {subsection}{\numberline {13.2.2}GMRES method}{314} +\contentsline {section}{\numberline {13.3}Parallel implementation on a GPU cluster}{317} +\contentsline {subsection}{\numberline {13.3.1}Data 
partitioning}{317} +\contentsline {subsection}{\numberline {13.3.2}GPU computing}{317} +\contentsline {subsection}{\numberline {13.3.3}Data communications}{319} +\contentsline {section}{\numberline {13.4}Experimental results}{321} +\contentsline {section}{\numberline {13.5}Conclusion}{327} +\author {Lilia Ziane Khodja, Rapha\IeC {\"e}l Couturier, Jacques Bahi}{} +\author {Ming Chau}{} +\author {Pierre Spit\IeC {\'e}ri}{} +\contentsline {chapter}{\numberline {14}Solving sparse nonlinear systems of obstacle problems on GPU clusters}{331} +\contentsline {section}{\numberline {14.1}Introduction}{331} +\contentsline {section}{\numberline {14.2}Obstacle problems}{332} +\contentsline {subsection}{\numberline {14.2.1}Mathematical model}{332} +\contentsline {subsection}{\numberline {14.2.2}Discretization}{333} +\contentsline {section}{\numberline {14.3}Parallel iterative method}{334} +\contentsline {section}{\numberline {14.4}Parallel implementation on a GPU cluster}{337} +\contentsline {section}{\numberline {14.5}Experimental tests on a GPU cluster}{345} +\contentsline {section}{\numberline {14.6}Red-black ordering technique}{348} +\contentsline {section}{\numberline {14.7}Conclusion}{352} +\author {Alan Gray and Kevin Stratford}{} +\contentsline {chapter}{\numberline {15}Ludwig: multiple GPUs for a complex fluid lattice Boltzmann +application}{355} +\contentsline {section}{\numberline {15.1}Introduction}{355} +\contentsline {section}{\numberline {15.2}Background}{357} +\contentsline {section}{\numberline {15.3}Single GPU implementation}{360} +\contentsline {section}{\numberline {15.4}Multiple GPU implementation}{361} +\contentsline {section}{\numberline {15.5}Moving solid particles}{364} +\contentsline {section}{\numberline {15.6}Summary}{367} +\contentsline {section}{\numberline {15.7}Acknowledgments}{367} +\author {Rachid Habel}{} +\author {Pierre Fortin, Fabienne J\'ez\'equel, Jean-Luc Lamotte}{} +\author {Stan Scott}{} +\contentsline {chapter}{\numberline 
{16}Numerical validation and performance optimization on GPUs of an application in atomic physics}{371} +\contentsline {section}{\numberline {16.1}Introduction}{372} +\contentsline {section}{\numberline {16.2}2DRMP and the PROP program}{373} +\contentsline {subsection}{\numberline {16.2.1}Principles of $R$-matrix propagation}{373} +\contentsline {subsection}{\numberline {16.2.2}Description of the PROP program}{375} +\contentsline {subsection}{\numberline {16.2.3}CAPS implementation}{376} +\contentsline {section}{\numberline {16.3}Numerical validation of PROP in single precision}{377} +\contentsline {subsection}{\numberline {16.3.1}Medium case study}{378} +\contentsline {subsection}{\numberline {16.3.2}Huge case study}{380} +\contentsline {section}{\numberline {16.4}Towards a complete deployment of PROP on GPUs}{381} +\contentsline {subsection}{\numberline {16.4.1}Computing the output $R$-matrix on GPU}{382} +\contentsline {subsection}{\numberline {16.4.2}Constructing the local $R$-matrices on GPU}{384} +\contentsline {subsection}{\numberline {16.4.3}Scaling amplitude arrays on GPU}{385} +\contentsline {subsection}{\numberline {16.4.4}Using double-buffering to overlap I/O and computation}{385} +\contentsline {subsection}{\numberline {16.4.5}Matrix padding}{386} +\contentsline {section}{\numberline {16.5}Performance results}{387} +\contentsline {subsection}{\numberline {16.5.1}PROP deployment on GPU}{387} +\contentsline {subsection}{\numberline {16.5.2}PROP execution profile}{389} +\contentsline {section}{\numberline {16.6}Propagation of multiple concurrent energies on GPU}{391} +\contentsline {section}{\numberline {16.7}Conclusion and future work}{392} +\author {Xuexin Liu, Sheldon Xiang-Dong Tan}{} +\author {Hai Wang}{} +\author {Hao Yu}{} +\contentsline {chapter}{\numberline {17}A GPU-accelerated envelope-following method for switching power converter simulation}{395} +\contentsline {section}{\numberline {17.1}Introduction}{395} +\contentsline 
{section}{\numberline {17.2}The envelope-following method in a nutshell}{398} +\contentsline {section}{\numberline {17.3}New parallel envelope-following method}{400} +\contentsline {subsection}{\numberline {17.3.1}GMRES solver for Newton update equation}{400} +\contentsline {subsection}{\numberline {17.3.2}Parallelization on GPU platforms}{402} +\contentsline {subsection}{\numberline {17.3.3}Gear-2 based sensitivity calculation}{404} +\contentsline {section}{\numberline {17.4}Numerical examples}{406} +\contentsline {section}{\numberline {17.5}Summary}{410} +\contentsline {section}{\numberline {17.6}Glossary}{410} +\contentsline {part}{VI\hspace {1em}Other}{413} +\author {Guillaume Laville, Christophe Lang, B\IeC {\'e}n\IeC {\'e}dicte Herrmann, Laurent Philippe}{} +\author {Kamel Mazouzi}{} +\author {Nicolas Marilleau}{} +\contentsline {chapter}{\numberline {18}Implementing multi-agent systems on GPU}{415} +\contentsline {section}{\numberline {18.1}Introduction}{416} +\contentsline {section}{\numberline {18.2}Running agent-based simulations}{417} +\contentsline {subsection}{\numberline {18.2.1}Multi-agent systems and parallelism}{417} +\contentsline {subsection}{\numberline {18.2.2}MAS implementation on GPU}{418} +\contentsline {section}{\numberline {18.3}A first practical example}{420} +\contentsline {subsection}{\numberline {18.3.1}The Collembola model}{420} +\contentsline {subsection}{\numberline {18.3.2}Collembola implementation}{421} +\contentsline {subsection}{\numberline {18.3.3}Collembola performance}{423} +\contentsline {section}{\numberline {18.4}Second example}{424} +\contentsline {subsection}{\numberline {18.4.1}The MIOR model}{424} +\contentsline {subsection}{\numberline {18.4.2}MIOR implementation}{425} +\contentsline {subsubsection}{\numberline {18.4.2.1}Execution mapping on GPU}{426} +\contentsline {subsubsection}{\numberline {18.4.2.2}Data structures translation}{427} +\contentsline {subsubsection}{\numberline {18.4.2.3}Critical resources access 
management}{428} +\contentsline {subsubsection}{\numberline {18.4.2.4}Termination detection}{430} +\contentsline {subsection}{\numberline {18.4.3}Performance of MIOR implementations}{430} +\contentsline {section}{\numberline {18.5}Analysis and recommendations}{434} +\contentsline {subsection}{\numberline {18.5.1}Analysis}{434} +\contentsline {subsection}{\numberline {18.5.2}MAS execution workflow}{435} +\contentsline {subsection}{\numberline {18.5.3}Implementation challenges}{435} +\contentsline {subsection}{\numberline {18.5.4}MCSMA}{436} +\contentsline {section}{\numberline {18.6}Conclusion}{437} +\author {Rapha\IeC {\"e}l Couturier and Christophe Guyeux}{} +\contentsline {chapter}{\numberline {19}Pseudorandom number generator on GPU}{441} +\contentsline {section}{\numberline {19.1}Introduction}{441} +\contentsline {section}{\numberline {19.2}Basic reminders}{443} +\contentsline {subsection}{\numberline {19.2.1}A short presentation of chaos}{443} +\contentsline {subsection}{\numberline {19.2.2}On Devaney's definition of chaos}{443} +\contentsline {subsection}{\numberline {19.2.3}Chaotic iterations}{444} +\contentsline {section}{\numberline {19.3}Toward efficiency and improvement for CI PRNG}{445} +\contentsline {subsection}{\numberline {19.3.1}First efficient implementation of a PRNG based on chaotic iterations}{445} +\contentsline {subsection}{\numberline {19.3.2}Efficient PRNGs based on chaotic iterations on GPU}{446} +\contentsline {subsection}{\numberline {19.3.3}Naive version for GPU}{446} +\contentsline {subsection}{\numberline {19.3.4}Improved version for GPU}{447} +\contentsline {subsection}{\numberline {19.3.5}Chaos evaluation of the improved version}{448} +\contentsline {section}{\numberline {19.4}Experiments}{449} +\contentsline {section}{\numberline {19.5}Summary}{450} +\author {Bertil Schmidt and Hoang-Vu Dang}{} +\contentsline {chapter}{\numberline {20}Solving large sparse linear systems for integer factorization on GPUs}{453} +\contentsline 
{section}{\numberline {20.1}Introduction}{453} +\contentsline {section}{\numberline {20.2}Block Wiedemann algorithm}{454} +\contentsline {section}{\numberline {20.3}SpMV over GF(2) for NFS matrices using existing formats on GPUs}{455} +\contentsline {section}{\numberline {20.4}A hybrid format for SpMV on GPUs}{459} +\contentsline {subsection}{\numberline {20.4.1}Dense format}{459} +\contentsline {subsection}{\numberline {20.4.2}Sliced COO}{460} +\contentsline {subsection}{\numberline {20.4.3}Determining the cut-off point of each format}{464} +\contentsline {section}{\numberline {20.5}SCOO for single-precision floating-point matrices}{465} +\contentsline {section}{\numberline {20.6}Performance evaluation}{466} +\contentsline {subsection}{\numberline {20.6.1}Experimental setup}{466} +\contentsline {subsection}{\numberline {20.6.2}Experimental results of SpMV on NFS matrix}{466} +\contentsline {subsection}{\numberline {20.6.3}Experimental results of SpMV on floating-point matrices}{467} +\contentsline {subsubsection}{\numberline {20.6.3.1}Performance comparison to existing GPU formats}{467} +\contentsline {subsubsection}{\numberline {20.6.3.2}Performance comparison to a CPU implementation}{469} +\contentsline {section}{\numberline {20.7}Conclusion}{470} +\contentsline {chapter}{Index}{473} diff --git a/BookGPU/bu16.bbl_new b/BookGPU/bu16.bbl_new new file mode 100644 index 0000000..7fe7b12 --- /dev/null +++ b/BookGPU/bu16.bbl_new @@ -0,0 +1,117 @@ +\begin{thebibliography}{10} + +\bibitem{CUDA_Zone} +{CUDA} community showcase. +\newblock \url{http://www.nvidia.com/}. + +\bibitem{ngspice} +{NGSPICE}. +\newblock \url{http://ngspice.sourceforge.net/}. + +\bibitem{AMDMC'06} +{AMD Inc.} +\newblock Multi-core processors---the next evolution in computing ({W}hite + {P}aper), 2006. +\newblock \url{http://multicore.amd.com}. + +\bibitem{streamSDK} +{AMD Inc.} +\newblock {AMD} {S}tream {SDK}. +\newblock \url{http://developer.amd.com/gpu/ATIStreamSDK}, 2011. 
+ +\bibitem{Feldmann:ICCAD'96} +P.~Feldmann and J.~Roychowdhury. +\newblock Computation of circuit waveform envelopes using an efficient, + matrix-decomposed harmonic balance algorithm. +\newblock In {\em Proc. ICCAD}, pages 295--300, Nov. 1996. + +\bibitem{Gear:book'71} +C.~W. Gear. +\newblock {\em Numerical Initial Value Problems in Ordinary Differential + Equations}. +\newblock Prentice-Hall, Englewood Cliffs, NJ, 1971. + +\bibitem{gpgpu} +D.~G\"{o}ddeke. +\newblock General-purpose computation using graphics hardware. +\newblock \url{http://www.gpgpu.org/}, 2011. + +\bibitem{Golub:Book'96} +G.~H. Golub and C.~Van Loan. +\newblock {\em Matrix Computations}. +\newblock The Johns Hopkins University Press, 3rd edition, 1996. + +\bibitem{IntelMC'06} +{Intel Corporation}. +\newblock Intel multi-core processors, making the move to quad-core and beyond + ({W}hite {P}aper), 2006. +\newblock \url{http://www.intel.com/multi-core}. + +\bibitem{Kato:COMPEL'06} +T.~Kato et~al. +\newblock Envelope following analysis of an autonomous power electronic system. +\newblock In {\em IEEE COMPEL'06}, pages 29--33, July 2006. + +\bibitem{openCL} +{Khronos Group}. +\newblock Open {C}omputing {L}anguage ({O}pen{CL}). +\newblock \url{http://www.khronos.org/opencl}, 2011. + +\bibitem{Kirk:Book'10} +D.~B. Kirk and W.-M. Hwu. +\newblock {\em Programming Massively Parallel Processors: A Hands-on Approach}. +\newblock Morgan Kaufmann Publishers Inc., San Francisco, CA, 2010. + +\bibitem{Krein:book'97} +P.~Krein. +\newblock {\em Elements of Power Electronics}. +\newblock Oxford University Press, 1997. + +\bibitem{Kundert:ICCAD'88} +K.~Kundert, J.~White, and A.~Sangiovanni-Vincentelli. +\newblock An envelope following method for the efficient transient simulation + of switching power and filter circuits. +\newblock In {\em Proc. ICCAD}, pages 446--449, Oct. 1988. + +\bibitem{LiuTan1:DATE'12} +X.-X. Liu, S.~X.-D. Tan, H.~Wang, and H.~Yu. 
+\newblock A {GPU}-accelerated envelope-following method for switching power + converter simulation. +\newblock pages 1349--1354, March 2012. + +\bibitem{CUDASDK} +{NVIDIA}, 2011. +\newblock \url{http://developer.nvidia.com/cuda-toolkit-40}. + +\bibitem{nvidia} +{NVIDIA Corporation}, 2011. +\newblock \url{http://www.nvidia.com}. + +\bibitem{cuda} +{{NVIDIA} Corporation}. +\newblock {CUDA} ({C}ompute {U}nified {D}evice {A}rchitecture), 2011. +\newblock \url{http://www.nvidia.com/object/cuda_home.html}. + +\bibitem{Telichevesky:DAC'95} +R.~Telichevesky, K.~Kundert, and J.~White. +\newblock Efficient steady-state analysis based on matrix-free + {K}rylov-subspace methods. +\newblock pages 480--484, 1995. + +\bibitem{Trzynadlowski:Book'10} +A.~M. Trzynadlowski. +\newblock {\em Introduction to Modern Power Electronics}, +\newblock second edition, Wiley, 2010. + +\bibitem{Vlach:Book'94} +J.~Vlach and K.~Singhal. +\newblock {\em Computer Methods for Circuit Analysis and Design}. +\newblock Van Nostrand Reinhold, New York, NY, 1995. + +\bibitem{White:TPE'91} +J.~White and S.~Leeb. +\newblock An envelope-following approach to switching power converter + simulation. +\newblock {\em IEEE Trans. Power Electron.}, 6(2):303--307, Apr. 1991. + +\end{thebibliography}