\contentsline {fm}{List of Figures}{xi} \contentsline {fm}{List of Tables}{xvii} \contentsline {fm}{Preface}{xxi} \contentsline {part}{I\hspace {1em}Presentation of GPUs}{1} \author {Rapha\IeC {\"e}l Couturier}{} \contentsline {chapter}{\numberline {1}Presentation of the GPU architecture and of the CUDA\hfill \penalty -\@M environment}{3} \contentsline {section}{\numberline {1.1}Introduction}{3} \contentsline {section}{\numberline {1.2}Brief history of the video card}{4} \contentsline {section}{\numberline {1.3}GPGPU}{4} \contentsline {section}{\numberline {1.4}Architecture of current GPUs}{5} \contentsline {section}{\numberline {1.5}Kinds of parallelism}{7} \contentsline {section}{\numberline {1.6}CUDA multithreading}{8} \contentsline {section}{\numberline {1.7}Memory hierarchy}{9} \contentsline {section}{\numberline {1.8}Conclusion}{11} \author {Rapha\IeC {\"e}l Couturier}{} \contentsline {chapter}{\numberline {2}Introduction to CUDA}{13} \contentsline {section}{\numberline {2.1}Introduction}{13} \contentsline {section}{\numberline {2.2}First example}{13} \contentsline {section}{\numberline {2.3}Second example: using CUBLAS }{16} \contentsline {section}{\numberline {2.4}Third example: matrix-matrix multiplication}{18} \contentsline {section}{\numberline {2.5}Conclusion}{21} \contentsline {part}{II\hspace {1em}Image processing}{23} \author {Gilles Perrot}{} \contentsline {chapter}{\numberline {3}Setting up the environment}{25} \contentsline {section}{\numberline {3.1}Data transfers, memory management}{25} \contentsline {section}{\numberline {3.2}Performance measurements}{28} \author {Gilles Perrot}{} \contentsline {chapter}{\numberline {4}Implementing a fast median filter}{31} \contentsline {section}{\numberline {4.1}Introduction}{31} \contentsline {section}{\numberline {4.2}Median filtering}{32} \contentsline {subsection}{\numberline {4.2.1}Basic principles}{32} \contentsline {subsection}{\numberline {4.2.2}A naive implementation}{33} \contentsline {section}{\numberline {4.3}NVIDIA GPU tuning recipes}{35} \contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers}{37} \contentsline {subsection}{\numberline {4.4.1}The simplest way}{37} \contentsline {subsection}{\numberline {4.4.2}Further optimization}{38} \contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{39} \contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{42} \contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{44} \contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{44} \contentsline {subsection}{\numberline {4.5.2}Fast approximated $n\times n$ median filter }{47} \author {Gilles Perrot}{} \contentsline {chapter}{\numberline {5}Implementing an efficient convolution operation on GPU}{53} \contentsline {section}{\numberline {5.1}Overview}{53} \contentsline {section}{\numberline {5.2}Definition}{54} \contentsline {section}{\numberline {5.3}Implementation}{54} \contentsline {subsection}{\numberline {5.3.1}First test implementation}{55} \contentsline {subsection}{\numberline {5.3.2}Using parameterizable masks}{58} \contentsline {subsection}{\numberline {5.3.3}Increasing the number of pixels processed by each thread}{59} \contentsline {subsection}{\numberline {5.3.4}Using shared memory to store prefetched data}{62} \contentsline {section}{\numberline {5.4}Separable convolution}{65} \contentsline {section}{\numberline {5.5}Conclusion}{69} \contentsline {part}{III\hspace {1em}Software development}{71} \author {Stefan L. Glimberg, Allan P. Engsig-Karup, Allan S. Nielsen, and Bernd Dammann}{} \contentsline {chapter}{\numberline {6}Development of software components for heterogeneous many-core architectures}{73} \contentsline {section}{\numberline {6.1}Software development for heterogeneous\hfill \penalty -\@M architectures}{74} \contentsline {subsection}{\numberline {6.1.1}Test environments}{76} \contentsline {section}{\numberline {6.2}Heterogeneous library design for PDE solvers}{76} \contentsline {subsection}{\numberline {6.2.1}Component and concept design}{77} \contentsline {subsection}{\numberline {6.2.2}A matrix-free finite difference component}{77} \contentsline {section}{\numberline {6.3}Model problems}{81} \contentsline {subsection}{\numberline {6.3.1}Heat conduction equation}{82} \contentsline {subsubsection}{\numberline {6.3.1.1}Assembling the heat conduction solver}{84} \contentsline {subsubsection}{\numberline {6.3.1.2}Numerical solutions to the heat conduction problem}{86} \contentsline {subsection}{\numberline {6.3.2}Poisson equation}{87} \contentsline {subsubsection}{\numberline {6.3.2.1}Assembling the Poisson solver}{89} \contentsline {subsubsection}{\numberline {6.3.2.2}Numerical solutions to the Poisson problem}{91} \contentsline {section}{\numberline {6.4}Optimization strategies for multi-GPU systems}{91} \contentsline {subsection}{\numberline {6.4.1}Spatial domain decomposition}{93} \contentsline {subsection}{\numberline {6.4.2}Parareal--parallel time integration}{95} \contentsline {subsubsection}{\numberline {6.4.2.1}The parareal algorithm}{96} \contentsline {subsubsection}{\numberline {6.4.2.2}Computational complexity}{98} \contentsline {section}{\numberline {6.5}Conclusion and outlook}{100} \author {Sylvain Contassot-Vivier}{} \author {Stephane Vialle}{} \author {Jens Gustedt}{} \contentsline {chapter}{\numberline {7}Development methodologies for GPU and cluster of GPUs}{105} \contentsline {section}{\numberline {7.1}Introduction}{106} \contentsline {section}{\numberline {7.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{106} \contentsline {subsection}{\numberline {7.2.1}Synchronous parallel algorithms on GPU clusters}{106} \contentsline {subsection}{\numberline {7.2.2}Native overlap of CPU communications and GPU computations}{108} \contentsline {subsection}{\numberline {7.2.3}Overlapping with sequences of transfers and computations}{110} \contentsline {subsection}{\numberline {7.2.4}Interleaved communications-transfers-computations overlapping}{115} \contentsline {subsection}{\numberline {7.2.5}Experimental validation}{118} \contentsline {section}{\numberline {7.3}General scheme of asynchronous parallel code with computation/communication overlapping}{120} \contentsline {subsection}{\numberline {7.3.1}A basic asynchronous scheme}{122} \contentsline {subsection}{\numberline {7.3.2}Synchronization of the asynchronous scheme}{126} \contentsline {subsection}{\numberline {7.3.3}Asynchronous scheme using MPI, OpenMP, and CUDA}{130} \contentsline {subsection}{\numberline {7.3.4}Experimental validation}{137} \contentsline {section}{\numberline {7.4}Perspective: a unifying programming model}{140} \contentsline {subsection}{\numberline {7.4.1}Resources}{141} \contentsline {subsection}{\numberline {7.4.2}Control}{142} \contentsline {subsection}{\numberline {7.4.3}Example: block-cyclic matrix multiplication (MM)}{142} \contentsline {subsection}{\numberline {7.4.4}Tasks and operations}{145} \contentsline {section}{\numberline {7.5}Conclusion}{146} \contentsline {section}{\numberline {7.6}Glossary}{146} \contentsline {part}{IV\hspace {1em}Optimization}{151} \author {Imen Chakroun and Nouredine Melab}{} \contentsline {chapter}{\numberline {8}GPU-accelerated tree-based exact optimization methods}{153} \contentsline {section}{\numberline {8.1}Introduction}{154} \contentsline {section}{\numberline {8.2}Branch-and-bound algorithm}{156} \contentsline {section}{\numberline {8.3}Parallel branch-and-bound algorithms}{157} \contentsline {subsection}{\numberline {8.3.1}The parallel tree exploration model}{158} \contentsline {subsection}{\numberline {8.3.2}The parallel evaluation of bounds model}{158} \contentsline {section}{\numberline {8.4}The flowshop scheduling problem}{159} \contentsline {subsection}{\numberline {8.4.1}Definition of the flowshop scheduling problem}{159} \contentsline {subsection}{\numberline {8.4.2}Lower bound for the flowshop scheduling problem}{160} \contentsline {section}{\numberline {8.5}GPU-accelerated B\&B based on the parallel tree exploration (GPU-PTE-BB)}{161} \contentsline {section}{\numberline {8.6}GPU-accelerated B\&B based on the parallel evaluation of bounds (GPU-PEB-BB) }{162} \contentsline {section}{\numberline {8.7}Thread divergence}{164} \contentsline {subsection}{\numberline {8.7.1}The thread divergence issue}{164} \contentsline {subsection}{\numberline {8.7.2}Mechanisms for reducing branch divergence}{165} \contentsline {section}{\numberline {8.8}Memory access optimization}{168} \contentsline {subsection}{\numberline {8.8.1}Complexity analysis of the memory usage of the lower bound }{169} \contentsline {subsection}{\numberline {8.8.2}Data placement pattern of the lower bound on GPU}{170} \contentsline {section}{\numberline {8.9}Experiments}{172} \contentsline {subsection}{\numberline {8.9.1}Parameters settings}{172} \contentsline {subsection}{\numberline {8.9.2}Experimental protocol: computing the speedup}{172} \contentsline {subsection}{\numberline {8.9.3}Performance impact of GPU-based parallelism}{174} \contentsline {subsection}{\numberline {8.9.4}Thread divergence reduction}{176} \contentsline {subsection}{\numberline {8.9.5}Data access optimization}{176} \contentsline {section}{\numberline {8.10}Conclusion and future work}{178} \author {Malika Mehdi}{} \author {Ahc\`{e}ne Bendjoudi}{} \author {Lakhdar Loukil}{} \author {Nouredine Melab}{} \contentsline {chapter}{\numberline {9}Parallel GPU-accelerated metaheuristics}{183} \contentsline {section}{\numberline {9.1}Introduction}{184} \contentsline {section}{\numberline {9.2}Combinatorial optimization}{184} \contentsline {section}{\numberline {9.3}Parallel models for metaheuristics}{185} \contentsline {section}{\numberline {9.4}Challenges for the design of GPU-based metaheuristics}{187} \contentsline {subsection}{\numberline {9.4.1}Data placement on a hierarchical memory}{188} \contentsline {subsection}{\numberline {9.4.2}Threads synchronization}{189} \contentsline {subsection}{\numberline {9.4.3}Thread divergence}{189} \contentsline {subsection}{\numberline {9.4.4}Task distribution and CPU/GPU communication}{190} \contentsline {section}{\numberline {9.5}State-of-the-art parallel metaheuristics on GPUs}{190} \contentsline {subsection}{\numberline {9.5.1}Implementing solution-based metaheuristics on GPUs}{191} \contentsline {subsection}{\numberline {9.5.2}Implementing population-based metaheuristics on GPUs}{193} \contentsline {subsection}{\numberline {9.5.3}Synthesis of the implementation strategies}{198} \contentsline {section}{\numberline {9.6}Frameworks for metaheuristics on GPUs}{201} \contentsline {subsection}{\numberline {9.6.1}PUGACE: framework for implementing evolutionary computation on GPUs}{201} \contentsline {subsection}{\numberline {9.6.2}ParadisEO-MO-GPU}{202} \contentsline {subsection}{\numberline {9.6.3}libCUDAOptimize: an open source library of GPU-based metaheuristics}{202} \contentsline {section}{\numberline {9.7}Case study: accelerating large neighborhood LS method on GPUs for solving the Q3AP}{203} \contentsline {subsection}{\numberline {9.7.1}The quadratic 3-dimensional assignment problem}{204} \contentsline {subsection}{\numberline {9.7.2}Iterated tabu search algorithm for the Q3AP}{205} \contentsline {subsection}{\numberline {9.7.3}Neighborhood functions for the Q3AP}{206} \contentsline {subsection}{\numberline {9.7.4}Design and implementation of a GPU-based iterated tabu search algorithm for the Q3AP}{207} \contentsline {subsection}{\numberline {9.7.5}Experimental results}{208} \contentsline {section}{\numberline {9.8}Conclusion}{210} \author {Xavier Meyer}{} \author {Bastien Chopard}{} \author {Paul Albuquerque}{} \contentsline {chapter}{\numberline {10}Linear programming on a GPU: a\nobreakspace {}case\nobreakspace {}study}{215} \contentsline {section}{\numberline {10.1}Introduction}{216} \contentsline {section}{\numberline {10.2}Simplex algorithm}{217} \contentsline {subsection}{\numberline {10.2.1}Linear programming model}{217} \contentsline {subsection}{\numberline {10.2.2}Standard simplex algorithm}{217} \contentsline {subsection}{\numberline {10.2.3}Revised simplex method}{219} \contentsline {subsection}{\numberline {10.2.4}Heuristics and improvements}{221} \contentsline {section}{\numberline {10.3}Branch-and-bound algorithm}{224} \contentsline {subsection}{\numberline {10.3.1}Integer linear programming}{224} \contentsline {subsection}{\numberline {10.3.2}Branch-and-bound tree}{224} \contentsline {subsection}{\numberline {10.3.3}Branching strategy}{225} \contentsline {subsection}{\numberline {10.3.4}Node selection strategy}{226} \contentsline {subsection}{\numberline {10.3.5}Cutting-plane methods}{227} \contentsline {section}{\numberline {10.4}CUDA considerations}{228} \contentsline {subsection}{\numberline {10.4.1}Parallel reduction}{228} \contentsline {subsection}{\numberline {10.4.2}Kernel optimization}{228} \contentsline {section}{\numberline {10.5}Implementations}{229} \contentsline {subsection}{\numberline {10.5.1}Standard simplex}{229} \contentsline {subsection}{\numberline {10.5.2}Revised simplex}{232} \contentsline {subsection}{\numberline {10.5.3}Branch-and-bound}{236} \contentsline {section}{\numberline {10.6}Performance model}{237} \contentsline {subsection}{\numberline {10.6.1}Levels of parallelism}{238} \contentsline {subsection}{\numberline {10.6.2}Amount of work done by a thread}{238} \contentsline {subsection}{\numberline {10.6.3}Global performance model}{239} \contentsline {subsection}{\numberline {10.6.4}A kernel example: \textit {steepest-edge}}{239} \contentsline {subsection}{\numberline {10.6.5}Standard simplex GPU implementation model}{240} \contentsline {section}{\numberline {10.7}Measurements and analysis}{241} \contentsline {subsection}{\numberline {10.7.1}Performance model validation}{241} \contentsline {subsection}{\numberline {10.7.2}Performance comparison of implementations}{241} \contentsline {section}{\numberline {10.8}Conclusion and perspectives}{245} \contentsline {part}{V\hspace {1em}Numerical applications}{249} \author {Allan P. Engsig-Karup, Stefan L. Glimberg, Allan S. Nielsen, and Ole Lindberg}{} \contentsline {chapter}{\numberline {11}Fast hydrodynamics on heterogeneous many-core hardware}{251} \contentsline {section}{\numberline {11.1}On hardware trends and challenges in scientific applications}{253} \contentsline {section}{\numberline {11.2}On modeling paradigms for highly nonlinear and dispersive water waves}{255} \contentsline {section}{\numberline {11.3}Governing equations}{256} \contentsline {subsection}{\numberline {11.3.1}Boundary conditions}{260} \contentsline {section}{\numberline {11.4}The numerical model}{260} \contentsline {subsection}{\numberline {11.4.1}Strategies for efficient solution of the Laplace problem}{261} \contentsline {subsection}{\numberline {11.4.2}Finite difference approximations}{262} \contentsline {subsection}{\numberline {11.4.3}Time integration}{263} \contentsline {subsection}{\numberline {11.4.4}Wave generation and absorption}{266} \contentsline {subsection}{\numberline {11.4.5}Preconditioned Defect Correction (PDC) method}{267} \contentsline {subsection}{\numberline {11.4.6}Distributed computations via domain decomposition}{269} \contentsline {subsection}{\numberline {11.4.7}Assembling the wave model from library components}{271} \contentsline {section}{\numberline {11.5}Properties of the numerical model}{276} \contentsline {subsection}{\numberline {11.5.1}Dispersion and kinematic properties}{276} \contentsline {section}{\numberline {11.6}Numerical experiments}{277} \contentsline {subsection}{\numberline {11.6.1}On precision requirements in engineering applications}{277} \contentsline {subsection}{\numberline {11.6.2}Acceleration via parallelism in time using parareal}{282} \contentsline {subsection}{\numberline {11.6.3}Towards real-time interactive ship simulation}{286} \contentsline {section}{\numberline {11.7}Conclusion and future work}{288} \contentsline {section}{\numberline {11.8}Acknowledgments}{289} \author {Gleb Beliakov and Shaowu Liu}{} \contentsline {chapter}{\numberline {12}Parallel monotone spline interpolation and approximation on GPUs}{295} \contentsline {section}{\numberline {12.1}Introduction}{295} \contentsline {section}{\numberline {12.2}Monotone splines}{298} \contentsline {subsection}{\numberline {12.2.1}Monotone quadratic splines}{298} \contentsline {subsection}{\numberline {12.2.2}Monotone Hermite splines}{302} \contentsline {section}{\numberline {12.3}Smoothing noisy data via parallel isotone regression}{303} \contentsline {section}{\numberline {12.4}Conclusion}{308} \author {Lilia Ziane Khodja, Rapha\IeC {\"e}l Couturier, and Jacques Bahi}{} \contentsline {chapter}{\numberline {13}Solving sparse linear systems with GMRES and CG methods on GPU clusters}{311} \contentsline {section}{\numberline {13.1}Introduction}{311} \contentsline {section}{\numberline {13.2}Krylov iterative methods}{312} \contentsline {subsection}{\numberline {13.2.1}CG method}{313} \contentsline {subsection}{\numberline {13.2.2}GMRES method}{314} \contentsline {section}{\numberline {13.3}Parallel implementation on a GPU cluster}{317} \contentsline {subsection}{\numberline {13.3.1}Data partitioning}{317} \contentsline {subsection}{\numberline {13.3.2}GPU computing}{317} \contentsline {subsection}{\numberline {13.3.3}Data communications}{319} \contentsline {section}{\numberline {13.4}Experimental results}{321} \contentsline {section}{\numberline {13.5}Conclusion}{327} \author {Lilia Ziane Khodja, Rapha\IeC {\"e}l Couturier, Jacques Bahi}{} \author {Ming Chau}{} \author {Pierre Spit\IeC {\'e}ri}{} \contentsline {chapter}{\numberline {14}Solving sparse nonlinear systems of obstacle problems on GPU clusters}{331} \contentsline {section}{\numberline {14.1}Introduction}{331} \contentsline {section}{\numberline {14.2}Obstacle problems}{332} \contentsline {subsection}{\numberline {14.2.1}Mathematical model}{332} \contentsline {subsection}{\numberline {14.2.2}Discretization}{333} \contentsline {section}{\numberline {14.3}Parallel iterative method}{334} \contentsline {section}{\numberline {14.4}Parallel implementation on a GPU cluster}{337} \contentsline {section}{\numberline {14.5}Experimental tests on a GPU cluster}{345} \contentsline {section}{\numberline {14.6}Red-black ordering technique}{348} \contentsline {section}{\numberline {14.7}Conclusion}{352} \author {Alan Gray and Kevin Stratford}{} \contentsline {chapter}{\numberline {15}Ludwig: multiple GPUs for a complex fluid lattice Boltzmann application}{355} \contentsline {section}{\numberline {15.1}Introduction}{355} \contentsline {section}{\numberline {15.2}Background}{357} \contentsline {section}{\numberline {15.3}Single GPU implementation}{360} \contentsline {section}{\numberline {15.4}Multiple GPU implementation}{361} \contentsline {section}{\numberline {15.5}Moving solid particles}{364} \contentsline {section}{\numberline {15.6}Summary}{367} \contentsline {section}{\numberline {15.7}Acknowledgments}{367} \author {Rachid Habel}{} \author {Pierre Fortin, Fabienne J\'ez\'equel, Jean-Luc Lamotte}{} \author {Stan Scott}{} \contentsline {chapter}{\numberline {16}Numerical validation and performance optimization on GPUs of an application in atomic physics}{371} \contentsline {section}{\numberline {16.1}Introduction}{372} \contentsline {section}{\numberline {16.2}2DRMP and the PROP program}{373} \contentsline {subsection}{\numberline {16.2.1}Principles of $R$-matrix propagation}{373} \contentsline {subsection}{\numberline {16.2.2}Description of the PROP program}{375} \contentsline {subsection}{\numberline {16.2.3}CAPS implementation}{376} \contentsline {section}{\numberline {16.3}Numerical validation of PROP in single precision}{377} \contentsline {subsection}{\numberline {16.3.1}Medium case study}{378} \contentsline {subsection}{\numberline {16.3.2}Huge case study}{380} \contentsline {section}{\numberline {16.4}Towards a complete deployment of PROP on GPUs}{381} \contentsline {subsection}{\numberline {16.4.1}Computing the output $R$-matrix on GPU}{382} \contentsline {subsection}{\numberline {16.4.2}Constructing the local $R$-matrices on GPU}{384} \contentsline {subsection}{\numberline {16.4.3}Scaling amplitude arrays on GPU}{385} \contentsline {subsection}{\numberline {16.4.4}Using double-buffering to overlap I/O and computation}{385} \contentsline {subsection}{\numberline {16.4.5}Matrix padding}{386} \contentsline {section}{\numberline {16.5}Performance results}{387} \contentsline {subsection}{\numberline {16.5.1}PROP deployment on GPU}{387} \contentsline {subsection}{\numberline {16.5.2}PROP execution profile}{389} \contentsline {section}{\numberline {16.6}Propagation of multiple concurrent energies on GPU}{391} \contentsline {section}{\numberline {16.7}Conclusion and future work}{392} \author {Xuexin Liu, Sheldon Xiang-Dong Tan}{} \author {Hai Wang}{} \author {Hao Yu}{} \contentsline {chapter}{\numberline {17}A GPU-accelerated envelope-following method for switching power converter simulation}{395} \contentsline {section}{\numberline {17.1}Introduction}{395} \contentsline {section}{\numberline {17.2}The envelope-following method in a nutshell}{398} \contentsline {section}{\numberline {17.3}New parallel envelope-following method}{400} \contentsline {subsection}{\numberline {17.3.1}GMRES solver for Newton update equation}{400} \contentsline {subsection}{\numberline {17.3.2}Parallelization on GPU platforms}{402} \contentsline {subsection}{\numberline {17.3.3}Gear-2 based sensitivity calculation}{404} \contentsline {section}{\numberline {17.4}Numerical examples}{406} \contentsline {section}{\numberline {17.5}Summary}{410} \contentsline {section}{\numberline {17.6}Glossary}{410} \contentsline {part}{VI\hspace {1em}Other}{413} \author {Guillaume Laville, Christophe Lang, B\IeC {\'e}n\IeC {\'e}dicte Herrmann, Laurent Philippe}{} \author {Kamel Mazouzi}{} \author {Nicolas Marilleau}{} \contentsline {chapter}{\numberline {18}Implementing multi-agent systems on GPU}{415} \contentsline {section}{\numberline {18.1}Introduction}{416} \contentsline {section}{\numberline {18.2}Running agent-based simulations}{417} \contentsline {subsection}{\numberline {18.2.1}Multi-agent systems and parallelism}{417} \contentsline {subsection}{\numberline {18.2.2}MAS implementation on GPU}{418} \contentsline {section}{\numberline {18.3}A first practical example}{420} \contentsline {subsection}{\numberline {18.3.1}The Collembola model}{420} \contentsline {subsection}{\numberline {18.3.2}Collembola implementation}{421} \contentsline {subsection}{\numberline {18.3.3}Collembola performance}{423} \contentsline {section}{\numberline {18.4}Second example}{424} \contentsline {subsection}{\numberline {18.4.1}The MIOR model}{424} \contentsline {subsection}{\numberline {18.4.2}MIOR implementation}{425} \contentsline {subsubsection}{\numberline {18.4.2.1}Execution mapping on GPU}{426} \contentsline {subsubsection}{\numberline {18.4.2.2}Data structures translation}{427} \contentsline {subsubsection}{\numberline {18.4.2.3}Critical resources access management}{428} \contentsline {subsubsection}{\numberline {18.4.2.4}Termination detection}{430} \contentsline {subsection}{\numberline {18.4.3}Performance of MIOR implementations}{430} \contentsline {section}{\numberline {18.5}Analysis and recommendations}{434} \contentsline {subsection}{\numberline {18.5.1}Analysis}{434} \contentsline {subsection}{\numberline {18.5.2}MAS execution workflow}{435} \contentsline {subsection}{\numberline {18.5.3}Implementation challenges}{435} \contentsline {subsection}{\numberline {18.5.4}MCSMA}{436} \contentsline {section}{\numberline {18.6}Conclusion}{437} \author {Rapha\IeC {\"e}l Couturier and Christophe Guyeux}{} \contentsline {chapter}{\numberline {19}Pseudorandom number generator on GPU}{441} \contentsline {section}{\numberline {19.1}Introduction}{441} \contentsline {section}{\numberline {19.2}Basic reminders}{443} \contentsline {subsection}{\numberline {19.2.1}A short presentation of chaos}{443} \contentsline {subsection}{\numberline {19.2.2}On Devaney's definition of chaos}{443} \contentsline {subsection}{\numberline {19.2.3}Chaotic iterations}{444} \contentsline {section}{\numberline {19.3}Toward efficiency and improvement for CI PRNG}{445} \contentsline {subsection}{\numberline {19.3.1}First efficient implementation of a PRNG based on chaotic iterations}{445} \contentsline {subsection}{\numberline {19.3.2}Efficient PRNGs based on chaotic iterations on GPU}{446} \contentsline {subsection}{\numberline {19.3.3}Naive version for GPU}{446} \contentsline {subsection}{\numberline {19.3.4}Improved version for GPU}{447} \contentsline {subsection}{\numberline {19.3.5}Chaos evaluation of the improved version}{448} \contentsline {section}{\numberline {19.4}Experiments}{449} \contentsline {section}{\numberline {19.5}Summary}{450} \author {Bertil Schmidt and Hoang-Vu Dang}{} \contentsline {chapter}{\numberline {20}Solving large sparse linear systems for integer factorization on GPUs}{453} \contentsline {section}{\numberline {20.1}Introduction}{453} \contentsline {section}{\numberline {20.2}Block Wiedemann algorithm}{454} \contentsline {section}{\numberline {20.3}SpMV over GF(2) for NFS matrices using existing formats on GPUs}{455} \contentsline {section}{\numberline {20.4}A hybrid format for SpMV on GPUs}{459} \contentsline {subsection}{\numberline {20.4.1}Dense format}{459} \contentsline {subsection}{\numberline {20.4.2}Sliced COO}{460} \contentsline {subsection}{\numberline {20.4.3}Determining the cut-off point of each format}{464} \contentsline {section}{\numberline {20.5}SCOO for single-precision floating-point matrices}{465} \contentsline {section}{\numberline {20.6}Performance evaluation}{466} \contentsline {subsection}{\numberline {20.6.1}Experimental setup}{466} \contentsline {subsection}{\numberline {20.6.2}Experimental results of SpMV on NFS matrix}{466} \contentsline {subsection}{\numberline {20.6.3}Experimental results of SpMV on floating-point matrices}{467} \contentsline {subsubsection}{\numberline {20.6.3.1}Performance comparison to existing GPU formats}{467} \contentsline {subsubsection}{\numberline {20.6.3.2}Performance comparison to a CPU implementation}{469} \contentsline {section}{\numberline {20.7}Conclusion}{470} \contentsline {chapter}{Index}{473}