1 \contentsline {fm}{List of Figures}{xi}
2 \contentsline {fm}{List of Tables}{xvii}
3 \contentsline {fm}{Preface}{xxi}
4 \contentsline {part}{I\hspace {1em}Presentation of GPUs}{1}
5 \author {Rapha\IeC {\"e}l Couturier}{}
6 \contentsline {chapter}{\numberline {1}Presentation of the GPU architecture and of the CUDA\hfill \penalty -\@M environment}{3}
7 \contentsline {section}{\numberline {1.1}Introduction}{3}
8 \contentsline {section}{\numberline {1.2}Brief history of the video card}{4}
9 \contentsline {section}{\numberline {1.3}GPGPU}{4}
10 \contentsline {section}{\numberline {1.4}Architecture of current GPUs}{5}
11 \contentsline {section}{\numberline {1.5}Kinds of parallelism}{7}
12 \contentsline {section}{\numberline {1.6}CUDA multithreading}{8}
13 \contentsline {section}{\numberline {1.7}Memory hierarchy}{9}
14 \contentsline {section}{\numberline {1.8}Conclusion}{11}
15 \author {Rapha\IeC {\"e}l Couturier}{}
16 \contentsline {chapter}{\numberline {2}Introduction to CUDA}{13}
17 \contentsline {section}{\numberline {2.1}Introduction}{13}
18 \contentsline {section}{\numberline {2.2}First example}{13}
19 \contentsline {section}{\numberline {2.3}Second example: using CUBLAS }{16}
20 \contentsline {section}{\numberline {2.4}Third example: matrix-matrix multiplication}{18}
21 \contentsline {section}{\numberline {2.5}Conclusion}{21}
22 \contentsline {part}{II\hspace {1em}Image processing}{23}
23 \author {Gilles Perrot}{}
24 \contentsline {chapter}{\numberline {3}Setting up the environment}{25}
25 \contentsline {section}{\numberline {3.1}Data transfers, memory management}{25}
26 \contentsline {section}{\numberline {3.2}Performance measurements}{28}
27 \author {Gilles Perrot}{}
28 \contentsline {chapter}{\numberline {4}Implementing a fast median filter}{31}
29 \contentsline {section}{\numberline {4.1}Introduction}{31}
30 \contentsline {section}{\numberline {4.2}Median filtering}{32}
31 \contentsline {subsection}{\numberline {4.2.1}Basic principles}{32}
32 \contentsline {subsection}{\numberline {4.2.2}A naive implementation}{33}
33 \contentsline {section}{\numberline {4.3}NVIDIA GPU tuning recipes}{35}
34 \contentsline {section}{\numberline {4.4}A 3$\times $3 median filter: using registers}{37}
35 \contentsline {subsection}{\numberline {4.4.1}The simplest way}{37}
36 \contentsline {subsection}{\numberline {4.4.2}Further optimization}{38}
37 \contentsline {subsubsection}{\numberline {4.4.2.1}Reducing register count }{39}
38 \contentsline {subsubsection}{\numberline {4.4.2.2}More data output per thread}{42}
39 \contentsline {section}{\numberline {4.5}A 5$\times $5 and more median filter }{44}
40 \contentsline {subsection}{\numberline {4.5.1}A register-only 5$\times $5 median filter }{44}
41 \contentsline {subsection}{\numberline {4.5.2}Fast approximated $n\times n$ median filter }{47}
42 \author {Gilles Perrot}{}
43 \contentsline {chapter}{\numberline {5}Implementing an efficient convolution operation on GPU}{53}
44 \contentsline {section}{\numberline {5.1}Overview}{53}
45 \contentsline {section}{\numberline {5.2}Definition}{54}
46 \contentsline {section}{\numberline {5.3}Implementation}{54}
47 \contentsline {subsection}{\numberline {5.3.1}First test implementation}{55}
48 \contentsline {subsection}{\numberline {5.3.2}Using parameterizable masks}{58}
49 \contentsline {subsection}{\numberline {5.3.3}Increasing the number of pixels processed by each thread}{59}
50 \contentsline {subsection}{\numberline {5.3.4}Using shared memory to store prefetched data}{62}
51 \contentsline {section}{\numberline {5.4}Separable convolution}{65}
52 \contentsline {section}{\numberline {5.5}Conclusion}{69}
53 \contentsline {part}{III\hspace {1em}Software development}{71}
54 \author {Stefan L. Glimberg, Allan P. Engsig-Karup, Allan S. Nielsen, and Bernd Dammann}{}
55 \contentsline {chapter}{\numberline {6}Development of software components for heterogeneous many-core architectures}{73}
56 \contentsline {section}{\numberline {6.1}Software development for heterogeneous\hfill \penalty -\@M architectures}{74}
57 \contentsline {subsection}{\numberline {6.1.1}Test environments}{76}
58 \contentsline {section}{\numberline {6.2}Heterogeneous library design for PDE solvers}{76}
59 \contentsline {subsection}{\numberline {6.2.1}Component and concept design}{77}
60 \contentsline {subsection}{\numberline {6.2.2}A matrix-free finite difference component}{77}
61 \contentsline {section}{\numberline {6.3}Model problems}{81}
62 \contentsline {subsection}{\numberline {6.3.1}Heat conduction equation}{82}
63 \contentsline {subsubsection}{\numberline {6.3.1.1}Assembling the heat conduction solver}{84}
64 \contentsline {subsubsection}{\numberline {6.3.1.2}Numerical solutions to the heat conduction problem}{86}
65 \contentsline {subsection}{\numberline {6.3.2}Poisson equation}{87}
66 \contentsline {subsubsection}{\numberline {6.3.2.1}Assembling the Poisson solver}{89}
67 \contentsline {subsubsection}{\numberline {6.3.2.2}Numerical solutions to the Poisson problem}{91}
68 \contentsline {section}{\numberline {6.4}Optimization strategies for multi-GPU systems}{91}
69 \contentsline {subsection}{\numberline {6.4.1}Spatial domain decomposition}{93}
70 \contentsline {subsection}{\numberline {6.4.2}Parareal--parallel time integration}{95}
71 \contentsline {subsubsection}{\numberline {6.4.2.1}The parareal algorithm}{96}
72 \contentsline {subsubsection}{\numberline {6.4.2.2}Computational complexity}{98}
73 \contentsline {section}{\numberline {6.5}Conclusion and outlook}{100}
74 \author {Sylvain Contassot-Vivier}{}
75 \author {Stephane Vialle}{}
76 \author {Jens Gustedt}{}
77 \contentsline {chapter}{\numberline {7}Development methodologies for GPU and cluster of GPUs}{105}
78 \contentsline {section}{\numberline {7.1}Introduction}{106}
79 \contentsline {section}{\numberline {7.2}General scheme of synchronous code with computation/communication overlapping in GPU clusters}{106}
80 \contentsline {subsection}{\numberline {7.2.1}Synchronous parallel algorithms on GPU clusters}{106}
81 \contentsline {subsection}{\numberline {7.2.2}Native overlap of CPU communications and GPU computations}{108}
82 \contentsline {subsection}{\numberline {7.2.3}Overlapping with sequences of transfers and computations}{110}
83 \contentsline {subsection}{\numberline {7.2.4}Interleaved communications-transfers-computations overlapping}{115}
84 \contentsline {subsection}{\numberline {7.2.5}Experimental validation}{118}
85 \contentsline {section}{\numberline {7.3}General scheme of asynchronous parallel code with computation/communication overlapping}{120}
86 \contentsline {subsection}{\numberline {7.3.1}A basic asynchronous scheme}{122}
87 \contentsline {subsection}{\numberline {7.3.2}Synchronization of the asynchronous scheme}{126}
88 \contentsline {subsection}{\numberline {7.3.3}Asynchronous scheme using MPI, OpenMP, and CUDA}{130}
89 \contentsline {subsection}{\numberline {7.3.4}Experimental validation}{137}
90 \contentsline {section}{\numberline {7.4}Perspective: a unifying programming model}{140}
91 \contentsline {subsection}{\numberline {7.4.1}Resources}{141}
92 \contentsline {subsection}{\numberline {7.4.2}Control}{142}
93 \contentsline {subsection}{\numberline {7.4.3}Example: block-cyclic matrix multiplication (MM)}{142}
94 \contentsline {subsection}{\numberline {7.4.4}Tasks and operations}{145}
95 \contentsline {section}{\numberline {7.5}Conclusion}{146}
96 \contentsline {section}{\numberline {7.6}Glossary}{146}
97 \contentsline {part}{IV\hspace {1em}Optimization}{151}
98 \author {Imen Chakroun and Nouredine Melab}{}
99 \contentsline {chapter}{\numberline {8}GPU-accelerated tree-based exact optimization methods}{153}
100 \contentsline {section}{\numberline {8.1}Introduction}{154}
101 \contentsline {section}{\numberline {8.2}Branch-and-bound algorithm}{156}
102 \contentsline {section}{\numberline {8.3}Parallel branch-and-bound algorithms}{157}
103 \contentsline {subsection}{\numberline {8.3.1}The parallel tree exploration model}{158}
104 \contentsline {subsection}{\numberline {8.3.2}The parallel evaluation of bounds model}{158}
105 \contentsline {section}{\numberline {8.4}The flowshop scheduling problem}{159}
106 \contentsline {subsection}{\numberline {8.4.1}Definition of the flowshop scheduling problem}{159}
107 \contentsline {subsection}{\numberline {8.4.2}Lower bound for the flowshop scheduling problem}{160}
108 \contentsline {section}{\numberline {8.5}GPU-accelerated B\&B based on the parallel tree exploration (GPU-PTE-BB)}{161}
109 \contentsline {section}{\numberline {8.6}GPU-accelerated B\&B based on the parallel evaluation of bounds (GPU-PEB-BB) }{162}
110 \contentsline {section}{\numberline {8.7}Thread divergence}{164}
111 \contentsline {subsection}{\numberline {8.7.1}The thread divergence issue}{164}
112 \contentsline {subsection}{\numberline {8.7.2}Mechanisms for reducing branch divergence}{165}
113 \contentsline {section}{\numberline {8.8}Memory access optimization}{168}
114 \contentsline {subsection}{\numberline {8.8.1}Complexity analysis of the memory usage of the lower bound }{169}
115 \contentsline {subsection}{\numberline {8.8.2}Data placement pattern of the lower bound on GPU}{170}
116 \contentsline {section}{\numberline {8.9}Experiments}{172}
117 \contentsline {subsection}{\numberline {8.9.1}Parameters settings}{172}
118 \contentsline {subsection}{\numberline {8.9.2}Experimental protocol: computing the speedup}{172}
119 \contentsline {subsection}{\numberline {8.9.3}Performance impact of GPU-based parallelism}{174}
120 \contentsline {subsection}{\numberline {8.9.4}Thread divergence reduction}{176}
121 \contentsline {subsection}{\numberline {8.9.5}Data access optimization}{176}
122 \contentsline {section}{\numberline {8.10}Conclusion and future work}{178}
123 \author {Malika Mehdi}{}
124 \author {Ahc\`{e}ne Bendjoudi}{}
125 \author {Lakhdar Loukil}{}
126 \author {Nouredine Melab}{}
127 \contentsline {chapter}{\numberline {9}Parallel GPU-accelerated metaheuristics}{183}
128 \contentsline {section}{\numberline {9.1}Introduction}{184}
129 \contentsline {section}{\numberline {9.2}Combinatorial optimization}{184}
130 \contentsline {section}{\numberline {9.3}Parallel models for metaheuristics}{185}
131 \contentsline {section}{\numberline {9.4}Challenges for the design of GPU-based metaheuristics}{187}
132 \contentsline {subsection}{\numberline {9.4.1}Data placement on a hierarchical memory}{188}
133 \contentsline {subsection}{\numberline {9.4.2}Threads synchronization}{189}
134 \contentsline {subsection}{\numberline {9.4.3}Thread divergence}{189}
135 \contentsline {subsection}{\numberline {9.4.4}Task distribution and CPU/GPU communication}{190}
136 \contentsline {section}{\numberline {9.5}State-of-the-art parallel metaheuristics on GPUs}{190}
137 \contentsline {subsection}{\numberline {9.5.1}Implementing solution-based metaheuristics on GPUs}{191}
138 \contentsline {subsection}{\numberline {9.5.2}Implementing population-based metaheuristics on GPUs}{193}
139 \contentsline {subsection}{\numberline {9.5.3}Synthesis of the implementation strategies}{198}
140 \contentsline {section}{\numberline {9.6}Frameworks for metaheuristics on GPUs}{201}
141 \contentsline {subsection}{\numberline {9.6.1}PUGACE: framework for implementing evolutionary computation on GPUs}{201}
142 \contentsline {subsection}{\numberline {9.6.2}ParadisEO-MO-GPU}{202}
143 \contentsline {subsection}{\numberline {9.6.3}libCUDAOptimize: an open source library of GPU-based metaheuristics}{202}
144 \contentsline {section}{\numberline {9.7}Case study: accelerating large neighborhood LS method on GPUs for solving the Q3AP}{203}
145 \contentsline {subsection}{\numberline {9.7.1}The quadratic 3-dimensional assignment problem}{204}
146 \contentsline {subsection}{\numberline {9.7.2}Iterated tabu search algorithm for the Q3AP}{205}
147 \contentsline {subsection}{\numberline {9.7.3}Neighborhood functions for the Q3AP}{206}
148 \contentsline {subsection}{\numberline {9.7.4}Design and implementation of a GPU-based iterated tabu search algorithm for the Q3AP}{207}
149 \contentsline {subsection}{\numberline {9.7.5}Experimental results}{208}
150 \contentsline {section}{\numberline {9.8}Conclusion}{210}
151 \author {Xavier Meyer}{}
152 \author {Bastien Chopard}{}
153 \author {Paul Albuquerque}{}
154 \contentsline {chapter}{\numberline {10}Linear programming on a GPU: a\nobreakspace {}case\nobreakspace {}study}{215}
155 \contentsline {section}{\numberline {10.1}Introduction}{216}
156 \contentsline {section}{\numberline {10.2}Simplex algorithm}{217}
157 \contentsline {subsection}{\numberline {10.2.1}Linear programming model}{217}
158 \contentsline {subsection}{\numberline {10.2.2}Standard simplex algorithm}{217}
159 \contentsline {subsection}{\numberline {10.2.3}Revised simplex method}{219}
160 \contentsline {subsection}{\numberline {10.2.4}Heuristics and improvements}{221}
161 \contentsline {section}{\numberline {10.3}Branch-and-bound algorithm}{224}
162 \contentsline {subsection}{\numberline {10.3.1}Integer linear programming}{224}
163 \contentsline {subsection}{\numberline {10.3.2}Branch-and-bound tree}{224}
164 \contentsline {subsection}{\numberline {10.3.3}Branching strategy}{225}
165 \contentsline {subsection}{\numberline {10.3.4}Node selection strategy}{226}
166 \contentsline {subsection}{\numberline {10.3.5}Cutting-plane methods}{227}
167 \contentsline {section}{\numberline {10.4}CUDA considerations}{228}
168 \contentsline {subsection}{\numberline {10.4.1}Parallel reduction}{228}
169 \contentsline {subsection}{\numberline {10.4.2}Kernel optimization}{228}
170 \contentsline {section}{\numberline {10.5}Implementations}{229}
171 \contentsline {subsection}{\numberline {10.5.1}Standard simplex}{229}
172 \contentsline {subsection}{\numberline {10.5.2}Revised simplex}{232}
173 \contentsline {subsection}{\numberline {10.5.3}Branch-and-bound}{236}
174 \contentsline {section}{\numberline {10.6}Performance model}{237}
175 \contentsline {subsection}{\numberline {10.6.1}Levels of parallelism}{238}
176 \contentsline {subsection}{\numberline {10.6.2}Amount of work done by a thread}{238}
177 \contentsline {subsection}{\numberline {10.6.3}Global performance model}{239}
178 \contentsline {subsection}{\numberline {10.6.4}A kernel example: \textit {steepest-edge}}{239}
179 \contentsline {subsection}{\numberline {10.6.5}Standard simplex GPU implementation model}{240}
180 \contentsline {section}{\numberline {10.7}Measurements and analysis}{241}
181 \contentsline {subsection}{\numberline {10.7.1}Performance model validation}{241}
182 \contentsline {subsection}{\numberline {10.7.2}Performance comparison of implementations}{241}
183 \contentsline {section}{\numberline {10.8}Conclusion and perspectives}{245}
184 \contentsline {part}{V\hspace {1em}Numerical applications}{249}
185 \author {Allan P. Engsig-Karup, Stefan L. Glimberg, Allan S. Nielsen, and Ole Lindberg}{}
186 \contentsline {chapter}{\numberline {11}Fast hydrodynamics on heterogeneous many-core hardware}{251}
187 \contentsline {section}{\numberline {11.1}On hardware trends and challenges in scientific applications}{253}
188 \contentsline {section}{\numberline {11.2}On modeling paradigms for highly nonlinear and dispersive water waves}{255}
189 \contentsline {section}{\numberline {11.3}Governing equations}{256}
190 \contentsline {subsection}{\numberline {11.3.1}Boundary conditions}{260}
191 \contentsline {section}{\numberline {11.4}The numerical model}{260}
192 \contentsline {subsection}{\numberline {11.4.1}Strategies for efficient solution of the Laplace problem}{261}
193 \contentsline {subsection}{\numberline {11.4.2}Finite difference approximations}{262}
194 \contentsline {subsection}{\numberline {11.4.3}Time integration}{263}
195 \contentsline {subsection}{\numberline {11.4.4}Wave generation and absorption}{266}
196 \contentsline {subsection}{\numberline {11.4.5}Preconditioned Defect Correction (PDC) method}{267}
197 \contentsline {subsection}{\numberline {11.4.6}Distributed computations via domain decomposition}{269}
198 \contentsline {subsection}{\numberline {11.4.7}Assembling the wave model from library components}{271}
199 \contentsline {section}{\numberline {11.5}Properties of the numerical model}{276}
200 \contentsline {subsection}{\numberline {11.5.1}Dispersion and kinematic properties}{276}
201 \contentsline {section}{\numberline {11.6}Numerical experiments}{277}
202 \contentsline {subsection}{\numberline {11.6.1}On precision requirements in engineering applications}{277}
203 \contentsline {subsection}{\numberline {11.6.2}Acceleration via parallelism in time using parareal}{282}
204 \contentsline {subsection}{\numberline {11.6.3}Towards real-time interactive ship simulation}{286}
205 \contentsline {section}{\numberline {11.7}Conclusion and future work}{288}
206 \contentsline {section}{\numberline {11.8}Acknowledgments}{289}
207 \author {Gleb Beliakov and Shaowu Liu}{}
208 \contentsline {chapter}{\numberline {12}Parallel monotone spline interpolation and approximation on GPUs}{295}
209 \contentsline {section}{\numberline {12.1}Introduction}{295}
210 \contentsline {section}{\numberline {12.2}Monotone splines}{298}
211 \contentsline {subsection}{\numberline {12.2.1}Monotone quadratic splines}{298}
212 \contentsline {subsection}{\numberline {12.2.2}Monotone Hermite splines}{302}
213 \contentsline {section}{\numberline {12.3}Smoothing noisy data via parallel isotone regression}{303}
214 \contentsline {section}{\numberline {12.4}Conclusion}{308}
215 \author {Lilia Ziane Khodja, Rapha\IeC {\"e}l Couturier, and Jacques Bahi}{}
216 \contentsline {chapter}{\numberline {13}Solving sparse linear systems with GMRES and CG methods on GPU clusters}{311}
217 \contentsline {section}{\numberline {13.1}Introduction}{311}
218 \contentsline {section}{\numberline {13.2}Krylov iterative methods}{312}
219 \contentsline {subsection}{\numberline {13.2.1}CG method}{313}
220 \contentsline {subsection}{\numberline {13.2.2}GMRES method}{314}
221 \contentsline {section}{\numberline {13.3}Parallel implementation on a GPU cluster}{317}
222 \contentsline {subsection}{\numberline {13.3.1}Data partitioning}{317}
223 \contentsline {subsection}{\numberline {13.3.2}GPU computing}{317}
224 \contentsline {subsection}{\numberline {13.3.3}Data communications}{319}
225 \contentsline {section}{\numberline {13.4}Experimental results}{321}
226 \contentsline {section}{\numberline {13.5}Conclusion}{327}
227 \author {Lilia Ziane Khodja, Rapha\IeC {\"e}l Couturier, Jacques Bahi}{}
228 \author {Ming Chau}{}
229 \author {Pierre Spit\IeC {\'e}ri}{}
230 \contentsline {chapter}{\numberline {14}Solving sparse nonlinear systems of obstacle problems on GPU clusters}{331}
231 \contentsline {section}{\numberline {14.1}Introduction}{331}
232 \contentsline {section}{\numberline {14.2}Obstacle problems}{332}
233 \contentsline {subsection}{\numberline {14.2.1}Mathematical model}{332}
234 \contentsline {subsection}{\numberline {14.2.2}Discretization}{333}
235 \contentsline {section}{\numberline {14.3}Parallel iterative method}{334}
236 \contentsline {section}{\numberline {14.4}Parallel implementation on a GPU cluster}{337}
237 \contentsline {section}{\numberline {14.5}Experimental tests on a GPU cluster}{345}
238 \contentsline {section}{\numberline {14.6}Red-black ordering technique}{348}
239 \contentsline {section}{\numberline {14.7}Conclusion}{352}
240 \author {Alan Gray and Kevin Stratford}{}
241 \contentsline {chapter}{\numberline {15}Ludwig: multiple GPUs for a complex fluid lattice Boltzmann
242 simulation}{355}
243 \contentsline {section}{\numberline {15.1}Introduction}{355}
244 \contentsline {section}{\numberline {15.2}Background}{357}
245 \contentsline {section}{\numberline {15.3}Single GPU implementation}{360}
246 \contentsline {section}{\numberline {15.4}Multiple GPU implementation}{361}
247 \contentsline {section}{\numberline {15.5}Moving solid particles}{364}
248 \contentsline {section}{\numberline {15.6}Summary}{367}
249 \contentsline {section}{\numberline {15.7}Acknowledgments}{367}
250 \author {Rachid Habel}{}
251 \author {Pierre Fortin, Fabienne J\'ez\'equel, Jean-Luc Lamotte}{}
252 \author {Stan Scott}{}
253 \contentsline {chapter}{\numberline {16}Numerical validation and performance optimization on GPUs of an application in atomic physics}{371}
254 \contentsline {section}{\numberline {16.1}Introduction}{372}
255 \contentsline {section}{\numberline {16.2}2DRMP and the PROP program}{373}
256 \contentsline {subsection}{\numberline {16.2.1}Principles of $R$-matrix propagation}{373}
257 \contentsline {subsection}{\numberline {16.2.2}Description of the PROP program}{375}
258 \contentsline {subsection}{\numberline {16.2.3}CAPS implementation}{376}
259 \contentsline {section}{\numberline {16.3}Numerical validation of PROP in single precision}{377}
260 \contentsline {subsection}{\numberline {16.3.1}Medium case study}{378}
261 \contentsline {subsection}{\numberline {16.3.2}Huge case study}{380}
262 \contentsline {section}{\numberline {16.4}Towards a complete deployment of PROP on GPUs}{381}
263 \contentsline {subsection}{\numberline {16.4.1}Computing the output $R$-matrix on GPU}{382}
264 \contentsline {subsection}{\numberline {16.4.2}Constructing the local $R$-matrices on GPU}{384}
265 \contentsline {subsection}{\numberline {16.4.3}Scaling amplitude arrays on GPU}{385}
266 \contentsline {subsection}{\numberline {16.4.4}Using double-buffering to overlap I/O and computation}{385}
267 \contentsline {subsection}{\numberline {16.4.5}Matrix padding}{386}
268 \contentsline {section}{\numberline {16.5}Performance results}{387}
269 \contentsline {subsection}{\numberline {16.5.1}PROP deployment on GPU}{387}
270 \contentsline {subsection}{\numberline {16.5.2}PROP execution profile}{389}
271 \contentsline {section}{\numberline {16.6}Propagation of multiple concurrent energies on GPU}{391}
272 \contentsline {section}{\numberline {16.7}Conclusion and future work}{392}
273 \author {Xuexin Liu, Sheldon Xiang-Dong Tan}{}
274 \author {Hai Wang}{}
275 \author {Hao Yu}{}
276 \contentsline {chapter}{\numberline {17}A GPU-accelerated envelope-following method for switching power converter simulation}{395}
277 \contentsline {section}{\numberline {17.1}Introduction}{395}
278 \contentsline {section}{\numberline {17.2}The envelope-following method in a nutshell}{398}
279 \contentsline {section}{\numberline {17.3}New parallel envelope-following method}{400}
280 \contentsline {subsection}{\numberline {17.3.1}GMRES solver for Newton update equation}{400}
281 \contentsline {subsection}{\numberline {17.3.2}Parallelization on GPU platforms}{402}
282 \contentsline {subsection}{\numberline {17.3.3}Gear-2 based sensitivity calculation}{404}
283 \contentsline {section}{\numberline {17.4}Numerical examples}{406}
284 \contentsline {section}{\numberline {17.5}Summary}{410}
285 \contentsline {section}{\numberline {17.6}Glossary}{410}
286 \contentsline {part}{VI\hspace {1em}Other}{413}
287 \author {Guillaume Laville, Christophe Lang, B\IeC {\'e}n\IeC {\'e}dicte Herrmann, Laurent Philippe}{}
288 \author {Kamel Mazouzi}{}
289 \author {Nicolas Marilleau}{}
290 \contentsline {chapter}{\numberline {18}Implementing multi-agent systems on GPU}{415}
291 \contentsline {section}{\numberline {18.1}Introduction}{416}
292 \contentsline {section}{\numberline {18.2}Running agent-based simulations}{417}
293 \contentsline {subsection}{\numberline {18.2.1}Multi-agent systems and parallelism}{417}
294 \contentsline {subsection}{\numberline {18.2.2}MAS implementation on GPU}{418}
295 \contentsline {section}{\numberline {18.3}A first practical example}{420}
296 \contentsline {subsection}{\numberline {18.3.1}The Collembola model}{420}
297 \contentsline {subsection}{\numberline {18.3.2}Collembola implementation}{421}
298 \contentsline {subsection}{\numberline {18.3.3}Collembola performance}{423}
299 \contentsline {section}{\numberline {18.4}Second example}{424}
300 \contentsline {subsection}{\numberline {18.4.1}The MIOR model}{424}
301 \contentsline {subsection}{\numberline {18.4.2}MIOR implementation}{425}
302 \contentsline {subsubsection}{\numberline {18.4.2.1}Execution mapping on GPU}{426}
303 \contentsline {subsubsection}{\numberline {18.4.2.2}Data structures translation}{427}
304 \contentsline {subsubsection}{\numberline {18.4.2.3}Critical resources access management}{428}
305 \contentsline {subsubsection}{\numberline {18.4.2.4}Termination detection}{430}
306 \contentsline {subsection}{\numberline {18.4.3}Performance of MIOR implementations}{430}
307 \contentsline {section}{\numberline {18.5}Analysis and recommendations}{434}
308 \contentsline {subsection}{\numberline {18.5.1}Analysis}{434}
309 \contentsline {subsection}{\numberline {18.5.2}MAS execution workflow}{435}
310 \contentsline {subsection}{\numberline {18.5.3}Implementation challenges}{435}
311 \contentsline {subsection}{\numberline {18.5.4}MCSMA}{436}
312 \contentsline {section}{\numberline {18.6}Conclusion}{437}
313 \author {Rapha\IeC {\"e}l Couturier and Christophe Guyeux}{}
314 \contentsline {chapter}{\numberline {19}Pseudorandom number generator on GPU}{441}
315 \contentsline {section}{\numberline {19.1}Introduction}{441}
316 \contentsline {section}{\numberline {19.2}Basic reminders}{443}
317 \contentsline {subsection}{\numberline {19.2.1}A short presentation of chaos}{443}
318 \contentsline {subsection}{\numberline {19.2.2}On Devaney's definition of chaos}{443}
319 \contentsline {subsection}{\numberline {19.2.3}Chaotic iterations}{444}
320 \contentsline {section}{\numberline {19.3}Toward efficiency and improvement for CI PRNG}{445}
321 \contentsline {subsection}{\numberline {19.3.1}First efficient implementation of a PRNG based on chaotic iterations}{445}
322 \contentsline {subsection}{\numberline {19.3.2}Efficient PRNGs based on chaotic iterations on GPU}{446}
323 \contentsline {subsection}{\numberline {19.3.3}Naive version for GPU}{446}
324 \contentsline {subsection}{\numberline {19.3.4}Improved version for GPU}{447}
325 \contentsline {subsection}{\numberline {19.3.5}Chaos evaluation of the improved version}{448}
326 \contentsline {section}{\numberline {19.4}Experiments}{449}
327 \contentsline {section}{\numberline {19.5}Summary}{450}
328 \author {Bertil Schmidt and Hoang-Vu Dang}{}
329 \contentsline {chapter}{\numberline {20}Solving large sparse linear systems for integer factorization on GPUs}{453}
330 \contentsline {section}{\numberline {20.1}Introduction}{453}
331 \contentsline {section}{\numberline {20.2}Block Wiedemann algorithm}{454}
332 \contentsline {section}{\numberline {20.3}SpMV over GF(2) for NFS matrices using existing formats on GPUs}{455}
333 \contentsline {section}{\numberline {20.4}A hybrid format for SpMV on GPUs}{459}
334 \contentsline {subsection}{\numberline {20.4.1}Dense format}{459}
335 \contentsline {subsection}{\numberline {20.4.2}Sliced COO}{460}
336 \contentsline {subsection}{\numberline {20.4.3}Determining the cut-off point of each format}{464}
337 \contentsline {section}{\numberline {20.5}SCOO for single-precision floating-point matrices}{465}
338 \contentsline {section}{\numberline {20.6}Performance evaluation}{466}
339 \contentsline {subsection}{\numberline {20.6.1}Experimental setup}{466}
340 \contentsline {subsection}{\numberline {20.6.2}Experimental results of SpMV on NFS matrix}{466}
341 \contentsline {subsection}{\numberline {20.6.3}Experimental results of SpMV on floating-point matrices}{467}
342 \contentsline {subsubsection}{\numberline {20.6.3.1}Performance comparison to existing GPU formats}{467}
343 \contentsline {subsubsection}{\numberline {20.6.3.2}Performance comparison to a CPU implementation}{469}
344 \contentsline {section}{\numberline {20.7}Conclusion}{470}
345 \contentsline {chapter}{Index}{473}