\contentsline {fm}{List of Figures}{xi}
\contentsline {fm}{List of Tables}{xvii}
\contentsline {fm}{Preface}{xxi}
\contentsline {part}{I\hspace {1em}Presentation of GPUs}{1}
\author {Rapha\IeC {\"e}l Couturier}{}
\contentsline {chapter}{\numberline {1}Presentation of the GPU architecture and of the CUDA\hfill \penalty -\@M environment}{3}
\contentsline {section}{\numberline {1.1}Introduction}{3}
\contentsline {section}{\numberline {1.2}Brief history of the video card}{4}
\contentsline {section}{\numberline {1.3}GPGPU}{4}
\contentsline {section}{\numberline {1.4}Architecture of current GPUs}{5}
\contentsline {section}{\numberline {1.5}Kinds of parallelism}{7}
\contentsline {section}{\numberline {1.6}CUDA multithreading}{8}
\contentsline {section}{\numberline {1.7}Memory hierarchy}{9}
\contentsline {section}{\numberline {1.8}Conclusion}{11}
\author {Rapha\IeC {\"e}l Couturier}{}
\contentsline {chapter}{\numberline {2}Introduction to CUDA}{13}
\contentsline {section}{\numberline {2.1}Introduction}{13}
\contentsline {section}{\numberline {2.2}First example}{13}
\contentsline {section}{\numberline {2.3}Second example: using CUBLAS }{16}
\contentsline {section}{\numberline {2.4}Third example: matrix-matrix multiplication}{18}
\contentsline {section}{\numberline {2.5}Conclusion}{21}
\contentsline {part}{II\hspace {1em}Image processing}{23}
\author {Gilles Perrot}{}
\contentsline {chapter}{\numberline {3}Setting up the environment}{25}
\contentsline {section}{\numberline {3.1}Data transfers, memory management}{25}
\contentsline {section}{\numberline {3.2}Performance measurements}{28}
\author {Gilles Perrot}{}
\contentsline {chapter}{\numberline {4}Implementing a fast median filter}{31}
\contentsline {section}{\numberline {4.1}Introduction}{31}
\contentsline {section}{\numberline {4.2}Median filtering}{32}
\contentsline {subsection}{\numberline {4.2.1}Basic principles}{32}
\contentsline {subsection}{\numberline {4.2.2}A naive implementation}{33}
\contentsline {section}{\numberline {4.3}NVIDIA GPU tuning recipes}{35}
\contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers}{37}
\contentsline {subsection}{\numberline {4.4.1}The simplest way}{37}
\contentsline {subsection}{\numberline {4.4.2}Further optimization}{38}
\contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{39}
\contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{42}
\contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{44}
\contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{44}
\contentsline {subsection}{\numberline {4.5.2}Fast approximated $n\times n$ median filter }{47}
\author {Gilles Perrot}{}
\contentsline {chapter}{\numberline {5}Implementing an efficient convolution operation on GPU}{53}
\contentsline {section}{\numberline {5.1}Overview}{53}
\contentsline {section}{\numberline {5.2}Definition}{54}
\contentsline {section}{\numberline {5.3}Implementation}{54}
\contentsline {subsection}{\numberline {5.3.1}First test implementation}{55}
\contentsline {subsection}{\numberline {5.3.2}Using parameterizable masks}{58}
\contentsline {subsection}{\numberline {5.3.3}Increasing the number of pixels processed by each thread}{59}
\contentsline {subsection}{\numberline {5.3.4}Using shared memory to store prefetched data}{62}
\contentsline {section}{\numberline {5.4}Separable convolution}{65}
\contentsline {section}{\numberline {5.5}Conclusion}{69}
\contentsline {part}{III\hspace {1em}Software development}{71}
\author {Stefan L. Glimberg, Allan P. Engsig-Karup, Allan S. Nielsen, and Bernd Dammann}{}
\contentsline {chapter}{\numberline {6}Development of software components for heterogeneous many-core architectures}{73}
\contentsline {section}{\numberline {6.1}Software development for heterogeneous\hfill \penalty -\@M architectures}{74}
\contentsline {subsection}{\numberline {6.1.1}Test environments}{76}
\contentsline {section}{\numberline {6.2}Heterogeneous library design for PDE solvers}{76}
\contentsline {subsection}{\numberline {6.2.1}Component and concept design}{77}
\contentsline {subsection}{\numberline {6.2.2}A matrix-free finite difference component}{77}
\contentsline {section}{\numberline {6.3}Model problems}{81}
\contentsline {subsection}{\numberline {6.3.1}Heat conduction equation}{82}
\contentsline {subsubsection}{\numberline {6.3.1.1}Assembling the heat conduction solver}{84}
\contentsline {subsubsection}{\numberline {6.3.1.2}Numerical solutions to the heat conduction problem}{86}
\contentsline {subsection}{\numberline {6.3.2}Poisson equation}{87}
\contentsline {subsubsection}{\numberline {6.3.2.1}Assembling the Poisson solver}{89}
\contentsline {subsubsection}{\numberline {6.3.2.2}Numerical solutions to the Poisson problem}{91}
\contentsline {section}{\numberline {6.4}Optimization strategies for multi-GPU systems}{91}
\contentsline {subsection}{\numberline {6.4.1}Spatial domain decomposition}{93}
\contentsline {subsection}{\numberline {6.4.2}Parareal--parallel time integration}{95}
\contentsline {subsubsection}{\numberline {6.4.2.1}The parareal algorithm}{96}
\contentsline {subsubsection}{\numberline {6.4.2.2}Computational complexity}{98}
\contentsline {section}{\numberline {6.5}Conclusion and outlook}{100}
\author {Sylvain Contassot-Vivier}{}
\author {Stephane Vialle}{}
\author {Jens Gustedt}{}
\contentsline {chapter}{\numberline {7}Development methodologies for GPU and cluster of GPUs}{105}
\contentsline {section}{\numberline {7.1}Introduction}{106}
\contentsline {section}{\numberline {7.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{106}
\contentsline {subsection}{\numberline {7.2.1}Synchronous parallel algorithms on GPU clusters}{106}
\contentsline {subsection}{\numberline {7.2.2}Native overlap of CPU communications and GPU computations}{108}
\contentsline {subsection}{\numberline {7.2.3}Overlapping with sequences of transfers and computations}{110}
\contentsline {subsection}{\numberline {7.2.4}Interleaved communications-transfers-computations overlapping}{115}
\contentsline {subsection}{\numberline {7.2.5}Experimental validation}{118}
\contentsline {section}{\numberline {7.3}General scheme of asynchronous parallel code with computation/communication overlapping}{120}
\contentsline {subsection}{\numberline {7.3.1}A basic asynchronous scheme}{122}
\contentsline {subsection}{\numberline {7.3.2}Synchronization of the asynchronous scheme}{126}
\contentsline {subsection}{\numberline {7.3.3}Asynchronous scheme using MPI, OpenMP, and CUDA}{130}
\contentsline {subsection}{\numberline {7.3.4}Experimental validation}{137}
\contentsline {section}{\numberline {7.4}Perspective: a unifying programming model}{140}
\contentsline {subsection}{\numberline {7.4.1}Resources}{141}
\contentsline {subsection}{\numberline {7.4.2}Control}{142}
\contentsline {subsection}{\numberline {7.4.3}Example: block-cyclic matrix multiplication (MM)}{142}
\contentsline {subsection}{\numberline {7.4.4}Tasks and operations}{145}
\contentsline {section}{\numberline {7.5}Conclusion}{146}
\contentsline {section}{\numberline {7.6}Glossary}{146}
\contentsline {part}{IV\hspace {1em}Optimization}{151}
\author {Imen Chakroun and Nouredine Melab}{}
\contentsline {chapter}{\numberline {8}GPU-accelerated tree-based exact optimization methods}{153}
\contentsline {section}{\numberline {8.1}Introduction}{154}
\contentsline {section}{\numberline {8.2}Branch-and-bound algorithm}{156}
\contentsline {section}{\numberline {8.3}Parallel branch-and-bound algorithms}{157}
\contentsline {subsection}{\numberline {8.3.1}The parallel tree exploration model}{158}
\contentsline {subsection}{\numberline {8.3.2}The parallel evaluation of bounds model}{158}
\contentsline {section}{\numberline {8.4}The flowshop scheduling problem}{159}
\contentsline {subsection}{\numberline {8.4.1}Definition of the flowshop scheduling problem}{159}
\contentsline {subsection}{\numberline {8.4.2}Lower bound for the flowshop scheduling problem}{160}
\contentsline {section}{\numberline {8.5}GPU-accelerated B\&B based on the parallel tree exploration (GPU-PTE-BB)}{161}
\contentsline {section}{\numberline {8.6}GPU-accelerated B\&B based on the parallel evaluation of bounds (GPU-PEB-BB) }{162}
\contentsline {section}{\numberline {8.7}Thread divergence}{164}
\contentsline {subsection}{\numberline {8.7.1}The thread divergence issue}{164}
\contentsline {subsection}{\numberline {8.7.2}Mechanisms for reducing branch divergence}{165}
\contentsline {section}{\numberline {8.8}Memory access optimization}{168}
\contentsline {subsection}{\numberline {8.8.1}Complexity analysis of the memory usage of the lower bound }{169}
\contentsline {subsection}{\numberline {8.8.2}Data placement pattern of the lower bound on GPU}{170}
\contentsline {section}{\numberline {8.9}Experiments}{172}
\contentsline {subsection}{\numberline {8.9.1}Parameters settings}{172}
\contentsline {subsection}{\numberline {8.9.2}Experimental protocol: computing the speedup}{172}
\contentsline {subsection}{\numberline {8.9.3}Performance impact of GPU-based parallelism}{174}
\contentsline {subsection}{\numberline {8.9.4}Thread divergence reduction}{176}
\contentsline {subsection}{\numberline {8.9.5}Data access optimization}{176}
\contentsline {section}{\numberline {8.10}Conclusion and future work}{178}
\author {Malika Mehdi}{}
\author {Ahc\`{e}ne Bendjoudi}{}
\author {Lakhdar Loukil}{}
\author {Nouredine Melab}{}
\contentsline {chapter}{\numberline {9}Parallel GPU-accelerated metaheuristics}{183}
\contentsline {section}{\numberline {9.1}Introduction}{184}
\contentsline {section}{\numberline {9.2}Combinatorial optimization}{184}
\contentsline {section}{\numberline {9.3}Parallel models for metaheuristics}{185}
\contentsline {section}{\numberline {9.4}Challenges for the design of GPU-based metaheuristics}{187}
\contentsline {subsection}{\numberline {9.4.1}Data placement on a hierarchical memory}{188}
\contentsline {subsection}{\numberline {9.4.2}Threads synchronization}{189}
\contentsline {subsection}{\numberline {9.4.3}Thread divergence}{189}
\contentsline {subsection}{\numberline {9.4.4}Task distribution and CPU/GPU communication}{190}
\contentsline {section}{\numberline {9.5}State-of-the-art parallel metaheuristics on GPUs}{190}
\contentsline {subsection}{\numberline {9.5.1}Implementing solution-based metaheuristics on GPUs}{191}
\contentsline {subsection}{\numberline {9.5.2}Implementing population-based metaheuristics on GPUs}{193}
\contentsline {subsection}{\numberline {9.5.3}Synthesis of the implementation strategies}{198}
\contentsline {section}{\numberline {9.6}Frameworks for metaheuristics on GPUs}{201}
\contentsline {subsection}{\numberline {9.6.1}PUGACE: framework for implementing evolutionary computation on GPUs}{201}
\contentsline {subsection}{\numberline {9.6.2}ParadisEO-MO-GPU}{202}
\contentsline {subsection}{\numberline {9.6.3}libCUDAOptimize: an open source library of GPU-based metaheuristics}{202}
\contentsline {section}{\numberline {9.7}Case study: accelerating large neighborhood LS method on GPUs for solving the Q3AP}{203}
\contentsline {subsection}{\numberline {9.7.1}The quadratic 3-dimensional assignment problem}{204}
\contentsline {subsection}{\numberline {9.7.2}Iterated tabu search algorithm for the Q3AP}{205}
\contentsline {subsection}{\numberline {9.7.3}Neighborhood functions for the Q3AP}{206}
\contentsline {subsection}{\numberline {9.7.4}Design and implementation of a GPU-based iterated tabu search algorithm for the Q3AP}{207}
\contentsline {subsection}{\numberline {9.7.5}Experimental results}{208}
\contentsline {section}{\numberline {9.8}Conclusion}{210}
\author {Xavier Meyer}{}
\author {Bastien Chopard}{}
\author {Paul Albuquerque}{}
\contentsline {chapter}{\numberline {10}Linear programming on a GPU: a\nobreakspace {}case\nobreakspace {}study}{215}
\contentsline {section}{\numberline {10.1}Introduction}{216}
\contentsline {section}{\numberline {10.2}Simplex algorithm}{217}
\contentsline {subsection}{\numberline {10.2.1}Linear programming model}{217}
\contentsline {subsection}{\numberline {10.2.2}Standard simplex algorithm}{217}
\contentsline {subsection}{\numberline {10.2.3}Revised simplex method}{219}
\contentsline {subsection}{\numberline {10.2.4}Heuristics and improvements}{221}
\contentsline {section}{\numberline {10.3}Branch-and-bound algorithm}{224}
\contentsline {subsection}{\numberline {10.3.1}Integer linear programming}{224}
\contentsline {subsection}{\numberline {10.3.2}Branch-and-bound tree}{224}
\contentsline {subsection}{\numberline {10.3.3}Branching strategy}{225}
\contentsline {subsection}{\numberline {10.3.4}Node selection strategy}{226}
\contentsline {subsection}{\numberline {10.3.5}Cutting-plane methods}{227}
\contentsline {section}{\numberline {10.4}CUDA considerations}{228}
\contentsline {subsection}{\numberline {10.4.1}Parallel reduction}{228}
\contentsline {subsection}{\numberline {10.4.2}Kernel optimization}{228}
\contentsline {section}{\numberline {10.5}Implementations}{229}
\contentsline {subsection}{\numberline {10.5.1}Standard simplex}{229}
\contentsline {subsection}{\numberline {10.5.2}Revised simplex}{232}
\contentsline {subsection}{\numberline {10.5.3}Branch-and-bound}{236}
\contentsline {section}{\numberline {10.6}Performance model}{237}
\contentsline {subsection}{\numberline {10.6.1}Levels of parallelism}{238}
\contentsline {subsection}{\numberline {10.6.2}Amount of work done by a thread}{238}
\contentsline {subsection}{\numberline {10.6.3}Global performance model}{239}
\contentsline {subsection}{\numberline {10.6.4}A kernel example: \textit {steepest-edge}}{239}
\contentsline {subsection}{\numberline {10.6.5}Standard simplex GPU implementation model}{240}
\contentsline {section}{\numberline {10.7}Measurements and analysis}{241}
\contentsline {subsection}{\numberline {10.7.1}Performance model validation}{241}
\contentsline {subsection}{\numberline {10.7.2}Performance comparison of implementations}{241}
\contentsline {section}{\numberline {10.8}Conclusion and perspectives}{245}
\contentsline {part}{V\hspace {1em}Numerical applications}{249}
\author {Allan P. Engsig-Karup, Stefan L. Glimberg, Allan S. Nielsen, and Ole Lindberg}{}
\contentsline {chapter}{\numberline {11}Fast hydrodynamics on heterogeneous many-core hardware}{251}
\contentsline {section}{\numberline {11.1}On hardware trends and challenges in scientific applications}{253}
\contentsline {section}{\numberline {11.2}On modeling paradigms for highly nonlinear and dispersive water waves}{255}
\contentsline {section}{\numberline {11.3}Governing equations}{256}
\contentsline {subsection}{\numberline {11.3.1}Boundary conditions}{260}
\contentsline {section}{\numberline {11.4}The numerical model}{260}
\contentsline {subsection}{\numberline {11.4.1}Strategies for efficient solution of the Laplace problem}{261}
\contentsline {subsection}{\numberline {11.4.2}Finite difference approximations}{262}
\contentsline {subsection}{\numberline {11.4.3}Time integration}{263}
\contentsline {subsection}{\numberline {11.4.4}Wave generation and absorption}{266}
\contentsline {subsection}{\numberline {11.4.5}Preconditioned Defect Correction (PDC) method}{267}
\contentsline {subsection}{\numberline {11.4.6}Distributed computations via domain decomposition}{269}
\contentsline {subsection}{\numberline {11.4.7}Assembling the wave model from library components}{271}
\contentsline {section}{\numberline {11.5}Properties of the numerical model}{276}
\contentsline {subsection}{\numberline {11.5.1}Dispersion and kinematic properties}{276}
\contentsline {section}{\numberline {11.6}Numerical experiments}{277}
\contentsline {subsection}{\numberline {11.6.1}On precision requirements in engineering applications}{277}
\contentsline {subsection}{\numberline {11.6.2}Acceleration via parallelism in time using parareal}{282}
\contentsline {subsection}{\numberline {11.6.3}Towards real-time interactive ship simulation}{286}
\contentsline {section}{\numberline {11.7}Conclusion and future work}{288}
\contentsline {section}{\numberline {11.8}Acknowledgments}{289}
\author {Gleb Beliakov and Shaowu Liu}{}
\contentsline {chapter}{\numberline {12}Parallel monotone spline interpolation and approximation on GPUs}{295}
\contentsline {section}{\numberline {12.1}Introduction}{295}
\contentsline {section}{\numberline {12.2}Monotone splines}{298}
\contentsline {subsection}{\numberline {12.2.1}Monotone quadratic splines}{298}
\contentsline {subsection}{\numberline {12.2.2}Monotone Hermite splines}{302}
\contentsline {section}{\numberline {12.3}Smoothing noisy data via parallel isotone regression}{303}
\contentsline {section}{\numberline {12.4}Conclusion}{308}
\author {Lilia Ziane Khodja, Rapha\IeC {\"e}l Couturier, and Jacques Bahi}{}
\contentsline {chapter}{\numberline {13}Solving sparse linear systems with GMRES and CG methods on GPU clusters}{311}
\contentsline {section}{\numberline {13.1}Introduction}{311}
\contentsline {section}{\numberline {13.2}Krylov iterative methods}{312}
\contentsline {subsection}{\numberline {13.2.1}CG method}{313}
\contentsline {subsection}{\numberline {13.2.2}GMRES method}{314}
\contentsline {section}{\numberline {13.3}Parallel implementation on a GPU cluster}{317}
\contentsline {subsection}{\numberline {13.3.1}Data partitioning}{317}
\contentsline {subsection}{\numberline {13.3.2}GPU computing}{317}
\contentsline {subsection}{\numberline {13.3.3}Data communications}{319}
\contentsline {section}{\numberline {13.4}Experimental results}{321}
\contentsline {section}{\numberline {13.5}Conclusion}{327}
\author {Lilia Ziane Khodja, Rapha\IeC {\"e}l Couturier, Jacques Bahi}{}
\author {Ming Chau}{}
\author {Pierre Spit\IeC {\'e}ri}{}
\contentsline {chapter}{\numberline {14}Solving sparse nonlinear systems of obstacle problems on GPU clusters}{331}
\contentsline {section}{\numberline {14.1}Introduction}{331}
\contentsline {section}{\numberline {14.2}Obstacle problems}{332}
\contentsline {subsection}{\numberline {14.2.1}Mathematical model}{332}
\contentsline {subsection}{\numberline {14.2.2}Discretization}{333}
\contentsline {section}{\numberline {14.3}Parallel iterative method}{334}
\contentsline {section}{\numberline {14.4}Parallel implementation on a GPU cluster}{337}
\contentsline {section}{\numberline {14.5}Experimental tests on a GPU cluster}{345}
\contentsline {section}{\numberline {14.6}Red-black ordering technique}{348}
\contentsline {section}{\numberline {14.7}Conclusion}{352}
\author {Alan Gray and Kevin Stratford}{}
\contentsline {chapter}{\numberline {15}Ludwig: multiple GPUs for a complex fluid lattice Boltzmann application}{355}
\contentsline {section}{\numberline {15.1}Introduction}{355}
\contentsline {section}{\numberline {15.2}Background}{357}
\contentsline {section}{\numberline {15.3}Single GPU implementation}{360}
\contentsline {section}{\numberline {15.4}Multiple GPU implementation}{361}
\contentsline {section}{\numberline {15.5}Moving solid particles}{364}
\contentsline {section}{\numberline {15.6}Summary}{367}
\contentsline {section}{\numberline {15.7}Acknowledgments}{367}
\author {Rachid Habel}{}
\author {Pierre Fortin, Fabienne J\'ez\'equel, Jean-Luc Lamotte}{}
\author {Stan Scott}{}
\contentsline {chapter}{\numberline {16}Numerical validation and performance optimization on GPUs of an application in atomic physics}{371}
\contentsline {section}{\numberline {16.1}Introduction}{372}
\contentsline {section}{\numberline {16.2}2DRMP and the PROP program}{373}
\contentsline {subsection}{\numberline {16.2.1}Principles of $R$-matrix propagation}{373}
\contentsline {subsection}{\numberline {16.2.2}Description of the PROP program}{375}
\contentsline {subsection}{\numberline {16.2.3}CAPS implementation}{376}
\contentsline {section}{\numberline {16.3}Numerical validation of PROP in single precision}{377}
\contentsline {subsection}{\numberline {16.3.1}Medium case study}{378}
\contentsline {subsection}{\numberline {16.3.2}Huge case study}{380}
\contentsline {section}{\numberline {16.4}Towards a complete deployment of PROP on GPUs}{381}
\contentsline {subsection}{\numberline {16.4.1}Computing the output $R$-matrix on GPU}{382}
\contentsline {subsection}{\numberline {16.4.2}Constructing the local $R$-matrices on GPU}{384}
\contentsline {subsection}{\numberline {16.4.3}Scaling amplitude arrays on GPU}{385}
\contentsline {subsection}{\numberline {16.4.4}Using double-buffering to overlap I/O and computation}{385}
\contentsline {subsection}{\numberline {16.4.5}Matrix padding}{386}
\contentsline {section}{\numberline {16.5}Performance results}{387}
\contentsline {subsection}{\numberline {16.5.1}PROP deployment on GPU}{387}
\contentsline {subsection}{\numberline {16.5.2}PROP execution profile}{389}
\contentsline {section}{\numberline {16.6}Propagation of multiple concurrent energies on GPU}{391}
\contentsline {section}{\numberline {16.7}Conclusion and future work}{392}
\author {Xuexin Liu, Sheldon Xiang-Dong Tan}{}
\author {Hai Wang}{}
\author {Hao Yu}{}
\contentsline {chapter}{\numberline {17}A GPU-accelerated envelope-following method for switching power converter simulation}{395}
\contentsline {section}{\numberline {17.1}Introduction}{395}
\contentsline {section}{\numberline {17.2}The envelope-following method in a nutshell}{398}
\contentsline {section}{\numberline {17.3}New parallel envelope-following method}{400}
\contentsline {subsection}{\numberline {17.3.1}GMRES solver for Newton update equation}{400}
\contentsline {subsection}{\numberline {17.3.2}Parallelization on GPU platforms}{402}
\contentsline {subsection}{\numberline {17.3.3}Gear-2 based sensitivity calculation}{404}
\contentsline {section}{\numberline {17.4}Numerical examples}{406}
\contentsline {section}{\numberline {17.5}Summary}{410}
\contentsline {section}{\numberline {17.6}Glossary}{410}
\contentsline {part}{VI\hspace {1em}Other}{413}
\author {Guillaume Laville, Christophe Lang, B\IeC {\'e}n\IeC {\'e}dicte Herrmann, Laurent Philippe}{}
\author {Kamel Mazouzi}{}
\author {Nicolas Marilleau}{}
\contentsline {chapter}{\numberline {18}Implementing multi-agent systems on GPU}{415}
\contentsline {section}{\numberline {18.1}Introduction}{416}
\contentsline {section}{\numberline {18.2}Running agent-based simulations}{417}
\contentsline {subsection}{\numberline {18.2.1}Multi-agent systems and parallelism}{417}
\contentsline {subsection}{\numberline {18.2.2}MAS implementation on GPU}{418}
\contentsline {section}{\numberline {18.3}A first practical example}{420}
\contentsline {subsection}{\numberline {18.3.1}The Collembola model}{420}
\contentsline {subsection}{\numberline {18.3.2}Collembola implementation}{421}
\contentsline {subsection}{\numberline {18.3.3}Collembola performance}{423}
\contentsline {section}{\numberline {18.4}Second example}{424}
\contentsline {subsection}{\numberline {18.4.1}The MIOR model}{424}
\contentsline {subsection}{\numberline {18.4.2}MIOR implementation}{425}
\contentsline {subsubsection}{\numberline {18.4.2.1}Execution mapping on GPU}{426}
\contentsline {subsubsection}{\numberline {18.4.2.2}Data structures translation}{427}
\contentsline {subsubsection}{\numberline {18.4.2.3}Critical resources access management}{428}
\contentsline {subsubsection}{\numberline {18.4.2.4}Termination detection}{430}
\contentsline {subsection}{\numberline {18.4.3}Performance of MIOR implementations}{430}
\contentsline {section}{\numberline {18.5}Analysis and recommendations}{434}
\contentsline {subsection}{\numberline {18.5.1}Analysis}{434}
\contentsline {subsection}{\numberline {18.5.2}MAS execution workflow}{435}
\contentsline {subsection}{\numberline {18.5.3}Implementation challenges}{435}
\contentsline {subsection}{\numberline {18.5.4}MCSMA}{436}
\contentsline {section}{\numberline {18.6}Conclusion}{437}
+\author {Rapha\IeC {\"e}l Couturier and Christophe Guyeux}{}
+\contentsline {chapter}{\numberline {19}Pseudorandom number generator on GPU}{441}
+\contentsline {section}{\numberline {19.1}Introduction}{441}
+\contentsline {section}{\numberline {19.2}Basic reminders}{443}
+\contentsline {subsection}{\numberline {19.2.1}A short presentation of chaos}{443}
+\contentsline {subsection}{\numberline {19.2.2}On Devaney's definition of chaos}{443}
+\contentsline {subsection}{\numberline {19.2.3}Chaotic iterations}{444}
+\contentsline {section}{\numberline {19.3}Toward efficiency and improvement for CI PRNG}{445}
+\contentsline {subsection}{\numberline {19.3.1}First efficient implementation of a PRNG based on chaotic iterations}{445}
+\contentsline {subsection}{\numberline {19.3.2}Efficient PRNGs based on chaotic iterations on GPU}{446}
+\contentsline {subsection}{\numberline {19.3.3}Naive version for GPU}{446}
+\contentsline {subsection}{\numberline {19.3.4}Improved version for GPU}{447}
+\contentsline {subsection}{\numberline {19.3.5}Chaos evaluation of the improved version}{448}
+\contentsline {section}{\numberline {19.4}Experiments}{449}
+\contentsline {section}{\numberline {19.5}Summary}{450}
\author {Bertil Schmidt and Hoang-Vu Dang}{}
\contentsline {chapter}{\numberline {20}Solving large sparse linear systems for integer factorization on GPUs}{453}
\contentsline {section}{\numberline {20.1}Introduction}{453}
\contentsline {section}{\numberline {20.2}Block Wiedemann algorithm}{454}
\contentsline {section}{\numberline {20.3}SpMV over GF(2) for NFS matrices using existing formats on GPUs}{455}
\contentsline {section}{\numberline {20.4}A hybrid format for SpMV on GPUs}{459}
\contentsline {subsection}{\numberline {20.4.1}Dense format}{459}
\contentsline {subsection}{\numberline {20.4.2}Sliced COO}{460}
\contentsline {subsection}{\numberline {20.4.3}Determining the cut-off point of each format}{464}
\contentsline {section}{\numberline {20.5}SCOO for single-precision floating-point matrices}{465}
\contentsline {section}{\numberline {20.6}Performance evaluation}{466}
\contentsline {subsection}{\numberline {20.6.1}Experimental setup}{466}
\contentsline {subsection}{\numberline {20.6.2}Experimental results of SpMV on NFS matrix}{466}
\contentsline {subsection}{\numberline {20.6.3}Experimental results of SpMV on floating-point matrices}{467}
\contentsline {subsubsection}{\numberline {20.6.3.1}Performance comparison to existing GPU formats}{467}
\contentsline {subsubsection}{\numberline {20.6.3.2}Performance comparison to a CPU implementation}{469}
\contentsline {section}{\numberline {20.7}Conclusion}{470}