\include{Chapters/chapter15/ch15}
\include{Chapters/chapter16/ch16}
\part{Other}
+\include{Chapters/chapter17/ch17}
\include{Chapters/chapter18/ch18}
\include{Chapters/chapter19/ch19}
--- /dev/null
+% This file was created with JabRef 2.6.
+% Encoding: ISO8859_1
+
+@incollection{Odell:2003:RRD:1807559.1807562,
+ author = {Odell, James J. and Van Dyke Parunak, H. and Fleischer, Mitchell},
+ chapter = {The role of roles in designing effective agent organizations},
+ title = {Software engineering for large-scale multi-agent systems},
+ editor = {Garcia, Alessandro and Lucena, Carlos and Zambonelli, Franco and Omicini, Andrea and Castro, Jaelson},
+ year = {2003},
+ isbn = {3-540-08772-9},
+ pages = {27--38},
+ numpages = {12},
+ url = {http://dl.acm.org/citation.cfm?id=1807559.1807562},
+ acmid = {1807562},
+ publisher = {Springer-Verlag},
+ address = {Berlin, Heidelberg},
+}
+
+@INPROCEEDINGS{Aaby:2010:ESA:1808143.1808181,
+ author = {Aaby, Brandon G. and Perumalla, Kalyan S. and Seal, Sudip K.},
+ title = {Efficient simulation of agent-based models on multi-GPU and multi-core
+ clusters},
+ booktitle = {Proceedings of the 3rd International ICST Conference on Simulation
+ Tools and Techniques},
+ year = {2010},
+ series = {SIMUTools '10},
+ pages = {29:1--29:10},
+ address = {ICST, Brussels, Belgium, Belgium},
+ publisher = {ICST (Institute for Computer Sciences, Social-Informatics and Telecommunications
+ Engineering)},
+ acmid = {1808181},
+ articleno = {29},
+ doi = {http://dx.doi.org/10.4108/ICST.SIMUTOOLS2010.8822},
+ isbn = {978-963-9799-87-5},
+ keywords = {CUDA, GPU, MPI, agent-based simulation, cluster, computational hierarchy,
+ latency hiding, multi-core, threads},
+ location = {Torremolinos, Malaga, Spain},
+ numpages = {10},
+ url = {http://dx.doi.org/10.4108/ICST.SIMUTOOLS2010.8822}
+}
+
+@MISC{mason_home,
+ title = {{MASON Multiagent Simulation Toolkit}},
+ howpublished = "\url{http://cs.gmu.edu/~eclab/projects/mason/}",
+ note = "[12-oct-2011]"
+}
+
+@MISC{cuda_home,
+ title = {{CUDA: NVIDIA's parallel computing architecture}},
+ howpublished = "\url{http://www.nvidia.com/object/cuda_home.html}",
+ note = "[11-oct-2011]"
+}
+
+@MISC{opencl_home,
+ title = {{OpenCL: The open standard for parallel programming of heterogeneous systems}},
+ howpublished = "\url{http://www.khronos.org/opencl/}",
+ note = "[11-oct-2011]"
+}
+
+@MISC{jocl_home,
+ title = {{JOCL: Java bindings for OpenCL}},
+ howpublished = "\url{http://www.jocl.org/}",
+ note = "[11-oct-2011]"
+}
+
+@MISC{jcuda_home,
+ title = {{JCUDA: Java bindings for CUDA}},
+ howpublished = "\url{http://www.jcuda.org/}",
+ note = "[11-oct-2011]"
+}
+
+@MISC{repast_home2,
+ title = {{Repast Suite, a family of advanced, free and open source agent-based modeling and simulation platforms}},
+ howpublished = "\url{http://repast.sourceforge.net/}",
+ note = "[05-October-2011]"
+}
+
+@MISC{madkit_home,
+ title = {{Madkit, an open source modular and scalable multiagent platform written in Java}},
+ howpublished = "\url{http://www.madkit.org/}",
+ note = "[05-October-2011]"
+}
+
+@MISC{noAuthor,
+ howpublished = {{This reference includes one or more of the authors and has been removed from this review version, in compliance with the submission guidelines.}}
+}
+
+@MISC{jade_home,
+ title = {{JADE (Java Agent DEvelopment Framework)}},
+ howpublished = "\url{http://jade.tilab.com/}",
+ note = "[05-October-2011]"
+}
+
+@MISC{netlogo_home2,
+ title = {{NetLogo is a multi-agent programmable modeling environment}},
+ howpublished = "\url{http://ccl.northwestern.edu/netlogo/}",
+ note = "[05-October-2011]"
+}
+
+
+@MISC{Bleiweiss_2009,
+ author = {Bleiweiss, Avi},
+ title = {Multi Agent Navigation on the GPU},
+ howpublished = "\url{developer.download.nvidia.com/presentations/2009/GDC/MultiAgentGPU.pdf}",
+ year = {2009},
+ note = "[05-October-2011]"
+}
+
+
+@ARTICLE{BarakA.2010,
+  author = {Barak, A. and Ben-Nun, T. and Levy, E. and Shiloh, A.},
+ title = {A Package for OpenCL Based Heterogeneous Computing on Clusters with
+ Many GPU Devices},
+ journal = {Workshop on Parallel Programming and Applications on Accelerator
+ Clusters (PPAAC), IEEE Cluster 2010},
+ year = {2010},
+ owner = {glaville},
+ timestamp = {2011.10.04}
+}
+
+@ARTICLE{Bleiweiss_2008,
+ author = {Bleiweiss, Avi},
+ title = {Multi Agent Navigation on the GPU},
+ journal = {GDC09 Game Developers Conference 2009},
+ year = {2008}
+}
+
+@ARTICLE{C.Cambier2007,
+  author = {Cambier, C. and Masse, D. and Bousso, M. and Perrier, E.},
+ title = {An offer versus demand modelling approach to assess the impact of
+ micro-organisms spatio-temporal dynamics on soil organic matter decomposition
+ rates},
+ journal = {Ecological Modelling},
+ year = {2007},
+ pages = {301--313},
+ owner = {glaville},
+ timestamp = {2011.10.04}
+}
+
+@ARTICLE{C.Cambier2006,
+  author = {Cambier, C. and Masse, D. and Bousso, M. and Perrier, E.},
+ title = {Mior, A spatially explicit, individual based modeling approach to
+ simulate soil microbial and organic matter processes},
+ journal = {Ecological Modelling},
+ year = {2006},
+ owner = {glaville},
+ timestamp = {2011.10.04}
+}
+
+@INPROCEEDINGS{D'Souza:2009:DAA:1639809.1639831,
+ author = {D'Souza, Roshan M. and Lysenko, Mikola and Marino, Simeone and Kirschner,
+ Denise},
+ title = {Data-parallel algorithms for agent-based model simulation of tuberculosis
+ on graphics processing units},
+ booktitle = {Proceedings of the 2009 Spring Simulation Multiconference},
+ year = {2009},
+ series = {SpringSim '09},
+ pages = {21:1--21:12},
+ address = {San Diego, CA, USA},
+ publisher = {Society for Computer Simulation International},
+ acmid = {1639831},
+ articleno = {21},
+ keywords = {GPGPU, agent-based models, integrative systems biology},
+ location = {San Diego, California},
+ numpages = {12},
+ url = {http://dl.acm.org/citation.cfm?id=1639809.1639831}
+}
+
+@ARTICLE{E.Blanchart2009,
+  author = {Blanchart, E. and Marilleau, N. and Drogoul, A. and Perrier, E. and Chotte, JL. and Cambier, C.},
+ title = {SWORM: an agent-based model to simulate the effect of earthworms
+ on soil structure},
+ journal = {EJSS. European Journal of Soil Science},
+ year = {2009},
+ pages = {13-21},
+ owner = {glaville},
+ timestamp = {2011.10.04}
+}
+
+@INPROCEEDINGS{Fischer:2009:GAP:1803298.1803361,
+ author = {Fischer, Leonardo G. and Silveira, Renato and Nedel, Luciana},
+ title = {GPU Accelerated Path-Planning for Multi-agents in Virtual Environments},
+ booktitle = {Proceedings of the 2009 VIII Brazilian Symposium on Games and Digital
+ Entertainment},
+ year = {2009},
+ series = {SBGAMES '09},
+ pages = {101--110},
+ address = {Washington, DC, USA},
+ publisher = {IEEE Computer Society},
+ acmid = {1803361},
+ doi = {http://dx.doi.org/10.1109/SBGAMES.2009.20},
+ isbn = {978-0-7695-3963-8},
+ keywords = {Path-planning, GPGPU, NVIDIA CUDA, Agent Simulation},
+ numpages = {10},
+ url = {http://dx.doi.org/10.1109/SBGAMES.2009.20}
+}
+
+@INPROCEEDINGS{Gomez-Luna:2009:PVS:1616772.1616869,
+ author = {G\'{o}mez-Luna, J. and Gonz\'{a}lez-Linares, J.-M.
+ and Benavides, J.-I. and Guil, N.},
+ title = {Parallelization of a Video Segmentation Algorithm on CUDA---Enabled
+ Graphics Processing Units},
+ booktitle = {15th Euro-Par Conference},
+ year = {2009},
+ pages = {924--935},
+ address = {Berlin, Heidelberg},
+ publisher = {Springer-Verlag},
+ acmid = {1616869},
+  doi = {10.1007/978-3-642-03869-3\_85},
+ isbn = {978-3-642-03868-6},
+ keywords = {CUDA, Canny, Generalized Hough Transform, Video Segmentation},
+ location = {Delft, The Netherlands},
+ numpages = {12},
+ url = {http://dx.doi.org/10.1007/978-3-642-03869-3%5F85}
+}
+
+@incollection {Maitre2009,
+ author = {Maitre, Ogier and Lachiche, Nicolas and Clauss, Philippe and Baumes, Laurent and Corma, Avelino and Collet, Pierre},
+ affiliation = {LSIIT University of Strasbourg, France},
+ title = {Efficient Parallel Implementation of Evolutionary Algorithms on GPGPU Cards},
+ booktitle = {Euro-Par 2009 Parallel Processing},
+ series = {Lecture Notes in Computer Science},
+ editor = {Sips, Henk and Epema, Dick and Lin, Hai-Xiang},
+ publisher = {Springer Berlin / Heidelberg},
+ isbn = {978-3-642-03868-6},
+ keyword = {Computer Science},
+ pages = {974-985},
+ volume = {5704},
+ url = {http://dx.doi.org/10.1007/978-3-642-03869-3%5F89},
+  doi = {10.1007/978-3-642-03869-3\_89},
+ year = {2009}
+}
+
+
+@INPROCEEDINGS{Gutknecht2000,
+ author = {Gutknecht, O. and Ferber, J.},
+ title = {MadKit: a generic multi-agent platform},
+ booktitle = {Proceedings of the fourth international conference on Autonomous
+ agents},
+ year = {2000},
+ series = {AGENTS '00},
+ pages = {78--79},
+ address = {New York, NY, USA},
+ publisher = {ACM},
+ acmid = {337048},
+ doi = {http://doi.acm.org/10.1145/336595.337048},
+ isbn = {1-58113-230-1},
+ location = {Barcelona, Spain},
+ numpages = {2},
+ url = {http://doi.acm.org/10.1145/336595.337048}
+}
+
+@INPROCEEDINGS{J.Breitbart2009,
+  author = {Breitbart, J.},
+ title = {Data structure design for GPU based heterogeneous systems},
+ booktitle = {High Performance Computing \& Simulation, 2009. HPCS '09. International
+ Conference on},
+ year = {2009},
+ pages = {44 - 51},
+ owner = {glaville},
+ timestamp = {2011.10.04}
+}
+
+@BOOK{Jean-PierreTreuil2008,
+ title = {Mod{\'e}lisation et Simulation {\`a} base d'agents},
+  publisher = {Dunod},
+  year = {2008},
+  author = {Treuil, Jean-Pierre and Drogoul, Alexis and Zucker, Jean-Daniel},
+ owner = {glaville},
+ timestamp = {2011.10.04}
+}
+
+@ARTICLE{DBLP:journals/corr/abs-1005-2581,
+ author = {Kamran Karimi and Neil G. Dickson and Firas Hamze},
+ title = {A Performance Comparison of CUDA and OpenCL},
+ journal = {CoRR},
+ year = {2010},
+ volume = {abs/1005.2581},
+ bibsource = {DBLP, http://dblp.uni-trier.de},
+ ee = {http://arxiv.org/abs/1005.2581}
+}
+
+@INPROCEEDINGS{komatsu10iwapt,
+ author = {Kazuhiko Komatsu and Katsuto Sato and Yusuke Arai and Kentaro Koyama
+ and Hiroyuki Takizawa and Hiroaki Kobayashi},
+ title = {Evaluating Performance and Portability of OpenCL Programs},
+ booktitle = {The Fifth International Workshop on Automatic Performance Tuning},
+ year = {2010},
+ month = {June}
+}
+
+@INPROCEEDINGS{Mistry:2011:APF:1964179.1964193,
+ author = {Mistry, Perhaad and Gregg, Chris and Rubin, Norman and Kaeli, David
+ and Hazelwood, Kim},
+ title = {Analyzing program flow within a many-kernel OpenCL application},
+ booktitle = {Proceedings of the Fourth Workshop on General Purpose Processing
+ on Graphics Processing Units},
+ year = {2011},
+ series = {GPGPU-4},
+ pages = {10:1--10:8},
+ address = {New York, NY, USA},
+ publisher = {ACM},
+ acmid = {1964193},
+ articleno = {10},
+ doi = {http://doi.acm.org/10.1145/1964179.1964193},
+ isbn = {978-1-4503-0569-3},
+ keywords = {GPGPU, OpenCL, SURF, computer vision, heterogeneous computing, performance
+ tools, profiling},
+ location = {Newport Beach, California},
+ numpages = {8},
+ url = {http://doi.acm.org/10.1145/1964179.1964193}
+}
+
+@ARTICLE{P.Du2010,
+  author = {Du, P. and Weber, R. and Luszczek, P. and Tomov, S. and Peterson, G. and Dongarra, J.},
+ title = {From CUDA to OpenCL: Towards a Performance-portable Solution for
+ Multi-platform GPU Programming},
+  journal = {Parallel Computing},
+ year = {2010},
+ owner = {glaville},
+ timestamp = {2011.10.04}
+}
+
+@INPROCEEDINGS{Richmond2008,
+  author = {Richmond, P. and Romano, D. M.},
+ title = {Agent Based GPU, A Real-time 3D Simulation and Interactive Visualisation
+ Framework for Massive Agent Based Modelling on the GPU},
+ booktitle = {Proceedings International Workshop on Supervisualisation 2008},
+ year = {2008},
+ owner = {glaville},
+ timestamp = {2011.10.04}
+}
+
+@CONFERENCE{S.Rul2010,
+  author = {Rul, S. and Vandierendonck, H. and D'Haene, J. and Bosschere, K. D.},
+  title = {An experimental study on performance portability of OpenCL kernels},
+ booktitle = {Symposium on Application Accelerators in High Performance Computing
+ (SAAHPC '10)},
+ year = {2010},
+ owner = {glaville},
+ timestamp = {2011.10.04}
+}
+
+@INPROCEEDINGS{Silveira:2010:PRG:1948395.1948446,
+ author = {Silveira, Renato and Fischer, Leonardo and Ferreira, Jos\'{e} Ant\^{o}nio
+ Salini and Prestes, Edson and Nedel, Luciana},
+ title = {Path-planning for RTS games based on potential fields},
+ booktitle = {Proceedings of the Third international conference on Motion in games},
+ year = {2010},
+ series = {MIG'10},
+ pages = {410--421},
+ address = {Berlin, Heidelberg},
+ publisher = {Springer-Verlag},
+ acmid = {1948446},
+ isbn = {3-642-16957-0, 978-3-642-16957-1},
+ keywords = {autonomous agent, navigation, path-planning},
+ location = {Utrecht, The Netherlands},
+ numpages = {12},
+ url = {http://dl.acm.org/citation.cfm?id=1948395.1948446}
+}
+
+@ARTICLE{CAV:CAV261,
+ author = {Silveira, Renato and Prestes, Edson and Nedel, Luciana P.},
+ title = {Managing coherent groups},
+ journal = {Computer Animation and Virtual Worlds},
+ year = {2008},
+ volume = {19},
+ pages = {295--305},
+ number = {3-4},
+ doi = {10.1002/cav.261},
+ issn = {1546-427X},
+ keywords = {groups simulation, motion planning, formation-keeping, potential field},
+ publisher = {John Wiley \& Sons, Ltd.},
+ url = {http://dx.doi.org/10.1002/cav.261}
+}
+
+@ARTICLE{Strippgen_Nagel_2009,
+ author = {Strippgen, David and Nagel, Kai},
+ title = {Multi-agent traffic simulation with CUDA},
+  journal = {2009 International Conference on High Performance Computing \& Simulation},
+  year = {2009},
+  pages = {106--114},
+  publisher = {IEEE},
+ url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=5192895}
+}
+
+@TECHREPORT{Thrane2005,
+  author = {Thrane, Niels and Simonsen, Lars Ole},
+ title = {A comparison of acceleration structures for GPU assisted ray tracing},
+ year = {2005}
+}
+
+@BOOK{2009,
+  title = {NVIDIA CUDA C Programming Best Practices Guide, CUDA Toolkit 2.3},
+ publisher = {NVIDIA Corporation},
+ year = {2009},
+ owner = {glaville},
+ timestamp = {2011.03.24}
+}
+
+@InProceedings{Edmonds04,
+author = {Edmonds, B. and Moss, S.},
+title = {"From KISS to KIDS: An" anti-simplistic" Modeling Approach"},
+booktitle = {MABS 2004},
+pages = {130-144},
+year = {2004}
+}
+
+@PhdThesis{Amouroux11,
+author = {Amouroux, E.},
+title = {"KIMONO: using the modelling process as an aid for research orientation"},
+school = {UPMC},
+year = {2011},
+address = {Paris, France},
+}
+
+@Book{tdz2008:book,
+ AUTHOR = {Treuil, J.-P. and Drogoul, A. and Zucker, J.-D.},
+ TITLE = {Mod{\'e}lisation et simulation {\`a} base d'agents: Approches particulaires, mod{\`e}les {\`a} base d'agents, de la mise en pratique aux questions th{\'e}oriques},
+ YEAR = {2008},
+ PUBLISHER = {Dunod}
+}
+
+@Article{Sean05,
+author = {Luke, S. and Cioffi-Revilla, C. and Panait, L. and Sullivan, K. and Balan, G.},
+title = {MASON: A Multi-Agent Simulation Environment},
+journal = {Simulation: Transactions of the society for Modeling and Simulation International},
+year = {2005},
+volume = {82},
+number = {7},
+pages = {517-527}
+}
+
+@Article{BLA09,
+ AUTHOR = {Blanchart, E. and Marilleau, N. and Drogoul, A. and Perrier, E. and Chotte, JL. and Cambier, C.},
+ TITLE = {SWORM: an agent-based model to simulate the effect of earthworms on soil structure},
+ YEAR = {2009},
+ JOURNAL = {EJSS. European Journal of Soil Science},
+ VOLUME = {60},
+ PAGES = {13-21}
+}
+
+
+@inproceedings{Vowel02,
+  author = {Da Silva, Joao Luis T. and Demazeau, Yves},
+ title = {Vowels co-ordination model},
+ booktitle = {AAMAS},
+ year = {2002},
+ pages = {1129-1136},
+ ee = {http://doi.acm.org/10.1145/545056.545083},
+ bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+ @inproceedings{MAR06,
+ author = { Marilleau, N. and
+ Lang, C. and
+ Chatonnay, P. and
+ Philippe, L.},
+ title = {An Agent Based Framework for Urban Mobility Simulation},
+ booktitle = {PDP},
+ year = {2006},
+ pages = {355-361},
+ ee = {http://doi.ieeecomputersociety.org/10.1109/PDP.2006.19},
+ address = {France},
+ bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+
+@Article{perrier,
+ AUTHOR = {Bird, N and Perrier, E.},
+ TITLE = {The PSF model and soil density scaling},
+ YEAR = {2003},
+ JOURNAL = {European Journal of Soil Science},
+ VOLUME = {54},
+ NUMBER = {3},
+ PAGES = {467-476}
+}
+
+
+@inproceedings{dem02,
+ author = {Joao Luis T. da Silva and
+ Yves Demazeau},
+ title = {Vowels co-ordination model},
+ booktitle = {AAMAS},
+ year = {2002},
+ pages = {1129-1136},
+ address = {Italy},
+ ee = {http://doi.acm.org/10.1145/545056.545083},
+ bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+@Article{netlogo_home,
+author = {Sklar, E.},
+title = {NetLogo, a multi-agent simulation environment},
+journal = {Artificial Life},
+year = {2011},
+volume = {13},
+number = {3},
+pages = {303-311}
+}
+
+@InProceedings{repast_home,
+author = {North, M.J. and Howe, T.R. and Collier, N.T. and Vos, J.R.},
+title = {A Declarative Model Assembly Infrastructure for Verification and Validation},
+booktitle = {Advancing Social Simulation: The First World Congress},
+year = {2007},
+publisher = {Springer},
+address = {Heidelberg, FRG}
+}
+
+
+@INPROCEEDINGS{Guy09clearpath,
+ author = {Stephen J. Guy and Jatin Chhugani and Changkyu Kim and Nadathur Satish and Ming C. Lin and Dinesh Manocha and Pradeep Dubey},
+ title = {ClearPath: Highly Parallel Collision Avoidance for Multi-Agent Simulation},
+ booktitle = {ACM SIGGRAPH/EUROGRAPHICS SYMPOSIUM ON COMPUTER ANIMATION},
+ year = {2009},
+ pages = {177--187},
+ publisher = {ACM}
+}
+
+@inproceedings{Kiran2010,
+ author = {Kiran, Mariam and Richmond, Paul and Holcombe, Mike and Chin, Lee Shawn and Worth, David and Greenough, Chris},
+ title = {FLAME: simulating large populations of agents on parallel hardware architectures},
+ booktitle = {Proceedings of the 9th International Conference on Autonomous Agents and Multiagent Systems: volume 1 - Volume 1},
+ series = {AAMAS '10},
+ year = {2010},
+ isbn = {978-0-9826571-1-9},
+ location = {Toronto, Canada},
+ pages = {1633--1636},
+ numpages = {4},
+ url = {http://dl.acm.org/citation.cfm?id=1838206.1838517},
+ acmid = {1838517},
+ publisher = {International Foundation for Autonomous Agents and Multiagent Systems},
+ address = {Richland, SC},
+ keywords = {simulation techniques, tools and environments},
+}
+
+@inproceedings{Dsouza2007,
+author={D'Souza, R. M. and Lysenko, M. and Rahmani, K.},
+title={Sugarscape on Steroids: Simulating Over a Million Agents at Interactive Rates},
+year={2007},
+booktitle={Proceedings of the Agent 2007 Conference},
+address={Chicago, IL}
+}
+
+@Book{Schweitzer2003,
+author = {Schweitzer, F. and Farmer, J. D.},
+title = {Brownian Agents and Active Particles: Collective Dynamics in the Natural and Social Sciences},
+publisher = {Springer},
+year = {2003},
+series = {Complexity}
+}
+
+@Book{Ferber99,
+author = {Ferber, J.},
+title = { Multi-Agent Systems. An Introduction to Distributed Artificial Intelligence. },
+publisher = {Addison Wesley},
+year = {1999},
+OPTaddress = {London}
+}
+
+
+@InProceedings{Cosenza2011,
+author = {Cosenza, B. and Cordasco, G. and De Chiara, R. and Scarano, V.},
+title = {Distributed Load Balancing for Parallel Agent-based Simulations},
+booktitle = {19th Euromicro International Conference on Parallel, Distributed and Network-Based Computing},
+year = {2011},
+address = {Ayia Napa, Cyprus}
+}
+
+@article{Chuffart2010,
+ author = {Chuffart, F and
+ Dumoulin, N and
+ Faure, T and
+ Deffuant, G},
+ title = {SimExplorer: Programming Experimental Designs on Models
+ and Managing Quality of Modelling Process},
+ journal = {IJAEIS},
+ volume = {1},
+ number = {1},
+ year = {2010},
+ pages = {55-68},
+ ee = {http://dx.doi.org/10.4018/jaeis.2010101304},
+ bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+
+@inproceedings{Blanchart11,
+ author = {Blanchart, E. and
+ Cambier, C. and
+ Canape, C. and
+ Gaudou, B. and
+ Ho, T.-N. and
+ Ho, T.-V. and
+ Lang, C. and
+ Michel, F. and
+ Marilleau, N. and
+ Philippe, L.},
+ title = {EPIS: A Grid Platform to Ease and Optimize Multi-agent Simulators
+ Running},
+ booktitle = {PAAMS},
+ year = {2011},
+ publisher = {Springer},
+ series = {Advances in Intelligent and Soft Computing},
+ volume = {88},
+ pages = {129-134},
+ ee = {http://dx.doi.org/10.1007/978-3-642-19875-5_17},
+ bibsource = {DBLP, http://dblp.uni-trier.de}
+}
+
+
+@inproceedings{Dziekonski2011,
+author={Dziekonski, A. and Lamecki, A. and Mrozowski, M.},
+title={A memory efficient and fast sparse matrix vector product on a {GPU}},
+year={2011},
+booktitle={Progress In Electromagnetics Research, Vol. 116},
+pages={49--63},
+doi={10.2528/PIER11031607}
+}
+
+@article{couturier2011,
+ author = {Couturier, Rapha{\"e}l and Domas, St{\'e}phane},
+ affiliation = {LIFC, IUT Belfort-Montb{\'e}liard, University of Franche Comte, BP 527, 90016 Belfort CEDEX, France},
+ title = {Sparse systems solving on GPUs with GMRES},
+ journal = {The Journal of Supercomputing},
+ publisher = {Springer Netherlands},
+ issn = {0920-8542},
+ keyword = {Computer Science},
+ pages = {1-13},
+ year={2011},
+ url = {http://dx.doi.org/10.1007/s11227-011-0562-z},
+ note = {DOI 10.1007/s11227-011-0562-z}
+}
+
+@article{Horton2011,
+author={Horton, M. and Tomov, S. and Dongarra, J.},
+title={A Class of Hybrid LAPACK Algorithms for Multicore and GPU Architectures},
+journal={Symposium for Application Accelerators in High Performance Computing (SAAHPC'11)},
+year={2011}
+}
+
+@inproceedings{Aaby10,
+ author = {Aaby, Brandon G. and Perumalla, Kalyan S. and Seal, Sudip K.},
+ title = {Efficient simulation of agent-based models on multi-GPU and multi-core clusters},
+ booktitle = {Proceedings of the 3rd International ICST Conference on Simulation Tools and Techniques},
+ series = {SIMUTools '10},
+ year = {2010},
+ isbn = {978-963-9799-87-5},
+ location = {Torremolinos, Malaga, Spain},
+ pages = {29:1--29:10},
+ articleno = {29},
+ numpages = {10},
+ url = {http://dx.doi.org/10.4108/ICST.SIMUTOOLS2010.8822},
+ doi = {http://dx.doi.org/10.4108/ICST.SIMUTOOLS2010.8822},
+ acmid = {1808181},
+ publisher = {ICST (Institute for Computer Sciences, Social-Informatics and Telecommunications Engineering)},
+ address = {ICST, Brussels, Belgium, Belgium},
+ keywords = {CUDA, GPU, MPI, agent-based simulation, cluster, computational hierarchy, latency hiding, multi-core, threads}
+}
+
+
+@article{Durstenfeld1964,
+ author = {Durstenfeld, Richard},
+ title = {Algorithm 235: Random permutation},
+ journal = {Commun. ACM},
+ volume = {7},
+ issue = {7},
+ month = {July},
+ year = {1964},
+ issn = {0001-0782},
+ pages = {420--},
+ url = {http://doi.acm.org/10.1145/364520.364540},
+ doi = {http://doi.acm.org/10.1145/364520.364540},
+ acmid = {364540},
+ publisher = {ACM},
+ address = {New York, NY, USA}
+}
+
+@InBook{Collier11,
+author = {Collier, N.T. and North, M.J.},
+title = {Repast SC++: A Platform for Large-scale Agent-based Modeling},
+chapter = {Large-Scale Computing Techniques for Complex System Simulation},
+publisher = {Wiley},
+year = {2011}
+}
+
+@inproceedings{lmlm+12:ip,
+inhal = {no},
+domainehal = {INFO:INFO_NI, INFO:INFO_DC, INFO:INFO_IU},
+equipe = {cartoon},
+classement = {ACTI},
+author = {Laville, Guillaume and Mazouzi, Kamel and Lang, Christophe and Marilleau, Nicolas and Philippe, Laurent},
+title = {Using {GPU} for Multi-agent Multi-scale Simulations},
+booktitle = {DCAI'12, 9-th Int. Conf. on Advances in Intelligent and Soft Computing},
+pages = {197--204},
+doi = {10.1007/978-3-642-28765-7_23},
+url = {http://dx.doi.org/10.1007/978-3-642-28765-7_23},
+series = {Advances in Intelligent and Soft Computing},
+volume = 151,
+address = {Salamanca, Spain},
+publisher = {Springer},
+month = mar,
+year = 2012,
+}
+
+@inproceedings{lmlm+13:ip,
+inhal = {no},
+domainehal = {INFO:INFO_NI, INFO:INFO_DC, INFO:INFO_IU},
+equipe = {cartoon},
+classement = {ACTI},
+corerank = {C},
+author = {Laville, Guillaume and Mazouzi, Kamel and Lang, Christophe and Marilleau, Nicolas and Philippe, Laurent},
+title = {Using {GPU} for Multi-agent Soil Simulation},
+booktitle = {PDP 2013, 21st Euromicro International Conference on Parallel, Distributed and Network-based Computing},
+pages = {***--***},
+address = {Belfast, Ireland},
+publisher = {IEEE Computer Society Press},
+month = feb,
+year = 2013,
+}
--- /dev/null
+\chapterauthor{G. Laville, C. Lang, K. Mazouzi, N. Marilleau, B. Herrmann, L. Philippe}{Femto-ST Institute, University of Franche-Comt{\'e}}
+
+\newlength\mylen
+\newcommand\myinput[1]{%
+ \settowidth\mylen{\KwIn{}}%
+ \setlength\hangindent{\mylen}%
+ \hspace*{\mylen}#1\\}
+
+\chapter{Implementing MAS on GPU}
+\label{chapter17}
+
+
+\section{Introduction}
+\label{ch17:intro}
+
+In this chapter we introduce the use of Graphics Processing Units
+(GPUs) for multi-agent-based systems, as an example of a
+not-so-regular application that can benefit from the GPU computing
+power. Multi-Agent Systems (MAS) are a simulation paradigm used to
+study the behavior of dynamic systems. Dynamic systems such as
+physical systems are often modeled by mathematical representations
+and their dynamic behavior is simulated with differential
+equations. The simulation of such a system thus often relies on the
+resolution of a linear system that can be efficiently computed on a
+graphics processing unit, as shown in the preceding chapters. But
+when the behavior of the system elements is not uniformly driven by
+the same law, when these elements have their own behavior, the
+modeling process is too complex to rely on formal expressions. In
+this context MAS is a recognized approach to model and simulate
+systems where individuals have an autonomous behavior that cannot be
+simulated by the evolution of a set of variables driven by
+mathematical laws. MAS are often used to simulate natural or
+collective phenomena whose individuals are too numerous or too
+diverse to provide a unified algorithm describing the system
+evolution. The agent-based approach is to divide these complex
+systems into individual self-contained entities, each with its own
+smaller set of attributes and functions. But, as for mathematical
+simulations, when the size of the MAS increases, the need for
+computing power and memory also increases. For this reason,
+multi-agent systems should benefit from the use of distributed
+computing architectures. Clusters and grids are often identified as
+the main solution to increase simulation performance, but Graphics
+Processing Units (GPUs) are also a promising technology with an
+attractive performance/cost ratio.
+
+Conceptually a MAS is a distributed system as it favors the
+definition and description of large sets of individuals, the agents,
+that can be run in parallel. As a large set of agents could have the
+same behavior, a SIMD execution model should fit the simulation
+execution. Most agent-based simulators are however designed with a
+sequential scheme in mind and these simulators seldom use more than
+one core for their execution. Due to simulation scheduling
+constraints, data sharing and exchange between agents and the huge
+amount of interactions between agents and their environment, it is
+indeed rather difficult to distribute an agent-based simulator, for
+instance to take advantage of new multi-threaded computer
+architectures. Guidelines and tools dedicated to the MAS paradigm and
+HPC are thus needed by the complex-system communities. Note that,
+from the described structure (large number of agents sharing data),
+we can conclude that MAS would more easily benefit from many-core
+architectures than from other kinds of parallelism.
+
+Another key point that advocates for the use of many-core
+architectures in MAS is the growing need for multi-scale
+simulations. Multi-scale simulations explore problems with
+interactions between several scales. The different scales use
+different granularities of the structure and potentially different
+models. Most of the time the lower-scale simulations provide results
+to the higher-scale simulations. In that case the execution of the
+simulations can easily be distributed between the local cores and a
+many-core architecture, i.e. a GPU device.
+
+We explore in this chapter the use of many-core architectures to
+execute agent-based simulations. We illustrate our analysis with two
+use cases: the Collembola simulator, designed to simulate the
+diffusion of Collembola between plots of land, and the MIOR simulator
+that reproduces the effects of earthworms on bacterial dynamics in
+bulk soil. In Section \ref{ch17:ABM} we present the work related to
+MAS and parallelization with a special focus on many-core use. In
+Sections \ref{ch17:sec:1stmodel} and \ref{ch17:sec:2ndmodel} we
+present in detail two multi-agent models, their GPU implementations,
+the conducted experiments and their performance results. The first
+model, given in Section \ref{ch17:sec:1stmodel}, illustrates the use
+of a GPU architecture to speed up the execution of some
+computation-intensive functions while the main model is still
+executed on the central processing unit. The second model, given in
+Section \ref{ch17:sec:2ndmodel}, illustrates the use of a GPU
+architecture to implement the whole model on the GPU processor, which
+implies deeper changes in the initial algorithm. In Section
+\ref{ch17:analysis} we propose a more general discussion of these
+implementations and we provide some guidelines. Then we conclude in
+Section \ref{ch17:conclusion} on the possible generalization of our
+work.
+
+
+\section{Running Agent-Based Simulations}
+\label{ch17:ABM}
+
+In this section, we present the context of multi-agent systems (MAS)
+and their parallelization, and we report several existing works on
+using GPUs to simulate multi-agent systems.
+
+\subsection{Multi-agent systems and parallelism}
+
+Agent-based systems are often used to simulate natural or collective
+phenomena whose actors are too numerous or too diverse to provide a
+simple unified algorithm describing the studied system
+dynamics~\cite{Schweitzer2003}. The implementation of an agent-based
+simulation usually starts by designing the underlying Agent-Based
+Model (ABM). Most ABMs are built around a few types of entities such
+as agents and an environment, and around an interaction
+organization~\cite{Vowel02}. In the complex-system domain, the
+environment often describes a real space, its structure
+(e.g. soil textures and porosities) and its dynamics (e.g. organic
+matter decomposition). It is a virtual world in which agents
+represent the evolution of the studied entities (e.g. biotic
+organisms). Each agent is animated by a behavior that can range from
+reactivity (it only reacts to external stimuli) to cognition (it runs
+its own process alongside other individuals). The interaction and
+organization define the functions, types and patterns of
+communication of the member agents in the
+system~\cite{Odell:2003:RRD:1807559.1807562}. Note that, depending on
+the MAS, agents can communicate either directly through special
+primitives or indirectly through the information stored in the
+environment.
+
+Agent-based simulations have been used for more than a decade to
+reproduce, understand and even predict complex system dynamics. They
+have proved their value in various scientific communities. Nowadays,
+generic agent-based frameworks such as Repast~\cite{repast_home} or
+NetLogo~\cite{netlogo_home} are promoted to ease the implementation
+of simulators. Many ABMs, such as the city-wide scale model
+of~\cite{Strippgen_Nagel_2009}, tend however to require a large
+number of agents to provide a realistic behavior and reliable global
+statistics. Moreover, a complete model analysis requires running many
+simulations, organized in an experiment plan, to obtain enough
+confidence in the results. In this case the available computing power
+often limits the simulation size and the range of results, thus
+requiring the use of parallelism to explore bigger configurations.
+
+For that, three major approaches can be identified:
+\begin{enumerate}
+\item parallelizing the experiment plan on a cluster or a grid (one
+  or a few simulations are submitted to each
+  core)~\cite{Blanchart11,Chuffart2010};
+\item parallelizing the simulator on a cluster (the environment of
+  the MAS is split and run on several distributed
+  nodes)~\cite{Cosenza2011,MAR06};
+\item optimizing the simulator by taking advantage of the computer
+  resources (multi-threading, GPU, and so on)~\cite{Aaby10}.
+\end{enumerate}
+
+In the first case, experiments are run independently of each other
+and only the simulation parameters change between two runs, so that a
+simple version of an existing simulator can be used. This approach
+does not, however, allow larger models to be run.
+% It does not imply
+%model code changes except for Graphical Unit Interface extracting as
+%the experiments does not run interactively but in batch.
+In the second
+and the third cases, model and code modifications are necessary. Only
+a few frameworks, however, introduce distribution in agent simulation
+(Madkit~\cite{Gutknecht2000}, MASON~\cite{Sean05},
+RepastHPC~\cite{Collier11}) and parallel implementations are often
+based on the explicit use of threads with shared
+memory~\cite{Guy09clearpath} or cluster libraries such as
+MPI~\cite{Kiran2010}.
+
+Parallelizing a multi-agent simulation is however complex due to
+space and time constraints. Multi-agent simulations are usually based
+on a synchronous execution:
+%by the agents that share the same environment.
+at each time step, numerous events (space data modification, agent
+motion) and interactions between agents happen. Distributing the
+simulation on several computers or grid nodes thus implies
+guaranteeing a distributed synchronous execution and coherency. This
+often leads to poor performance or complex synchronization
+problems. Multi-core execution, or delegating part of this execution
+to other processors such as GPUs~\cite{Bleiweiss_2008}, is thus
+usually easier to implement since all the threads share the data and
+the local clock.
+
+% Different agent patterns can be adopted in an ABMs such as
+% cognitive and reactive ones~\cite{Ferber99}. Cognitive agents act on
+% the environment and interact with other agents according to a complex
+% behavior. This behavior takes a local perception of the virtual world
+% and the agent past (a memory characterized by an internal state and
+% belief, imperfect knowledge about the world) into account. Reactive
+% agents have a much systematic pattern of action based on stimuli
+% response schemes (no or few knowledge and state conservation in agent). The
+% evolution of the ABM environment, in particular, is often represented with
+% this last kind of agents. As their behavior is usually simple, we
+% propose in this chapter to delegate part of the environment and of the
+% reactive agents execution to the graphical processing unit of the
+% computer. This way we can balance the load between both CPU and GPU
+% execution resources.
+
+% In the particular case of multi-scale simulations such as the Sworm
+% simulation~\cite{BLA09} the environment may be used at different
+% levels. Since the representation of the whole simulated environment
+% (i.e. the soil area) would be costly, the environment is organized as
+% a multi-level tree of small soil cubes which can be lazily
+% instantiated during the simulation. This allows to gradually refine
+% distribution details in units of soil as agents progress and need
+% those information, by using a fractal process based on the
+% bigger-grained already instantiated levels. This characteristic,
+% especially for a fractal model, could be the key of the
+% distribution. For instance, each branch of a fractal environment could
+% be identified as an independent area and parallelized. In addition
+% Fractal is a famous approach to describe multi-scale environment (such
+% as soil) and its organization~\cite{perrier}. In that case the lower
+% scale simulations can also be delegated to the GPU card to limit the
+% load of the main (upper scale) simulation.
+
+\subsection{MAS Implementation on GPU}
+\label{ch17:subsec:gpu}
+
+The last few years have seen the appearance of new generations of
+graphics cards based on more general-purpose execution units, which
+are promising for less regular systems such as MAS.
+% These units can be programmed using GPGPU languages to solve various
+% problems such as linear algebra resolutions, usually based on matrix
+% manipulations.
+The matrix-based data representation and SIMD computations are
+however not straightforward in Multi-Agent Systems, where data
+structures and algorithms are tightly coupled to the described
+simulation. Works from the existing literature nevertheless show that
+MAS can benefit from these performance gains on various simulation
+types, such as traffic simulation~\cite{Strippgen_Nagel_2009},
+cellular automata~\cite{Dsouza2007}, mobile-agent based
+path-finding~\cite{Silveira:2010:PRG:1948395.1948446} or genetic
+algorithms~\cite{Maitre2009}. Note that an application-specific
+adaptation process was required for these MAS: some of the previous
+examples are driven by mathematical laws (path-finding) or use a
+natural mapping between a discrete environment (cellular automaton)
+and GPU cores. This mapping often requires algorithmic adaptations in
+other models, but experience shows that the more reactive a MAS is,
+the better suited its implementation is to the GPU.
+
+The first step in the adaptation of an ABM to GPU platforms is the
+choice of the language. On the one hand, the Java programming
+language is often used for the implementation of MAS due to its
+availability on numerous platforms and frameworks and its focus on
+high-level, object-oriented programming. On the other hand, GPU
+platforms can only run specific languages such as OpenCL or
+CUDA. OpenCL (supported on AMD, Intel and NVIDIA hardware) better
+suits the portability concerns across a wide range of hardware needed
+by agent simulators, as opposed to CUDA which is an NVIDIA-specific
+library.
+
+OpenCL is a C library which provides access to the underlying CPU or
+GPU threads using an asynchronous interface. Various OpenCL functions
+allow the compilation and the execution of programs on these
+execution resources, the copying of data buffers between devices, and
+the collection of profiling information.
+
+This library is based around three main concepts:
+
+\begin{itemize}
+\item the \emph{kernel} (similar to a CUDA kernel), which represents
+  a runnable program containing instructions to be executed on the
+  GPU;
+\item the \emph{work-item} (equivalent to a CUDA thread), which
+  represents one running instance of a kernel on the GPU;
+\item the \emph{work-group} (or execution block), which is a set of
+  work-items sharing some memory to speed up data accesses and
+  computations. Synchronization operations such as barriers can only
+  be used within the same work-group.
+\end{itemize}
+
+Running an OpenCL computation consists in launching numerous
+work-items that execute the same kernel. The work-items are submitted
+to a submission queue to optimize the usage of the available cores. A
+computation is achieved once all these kernel instances have
+terminated.
+
+The number of work-items used in each work-group is an important
+implementation choice which determines how many tasks will share the
+same cache memory. Data used by the work-items can be stored as
+N-dimensional matrices in local or global GPU memory. Since the size
+of this memory is often limited to a few hundred kilobytes, choosing
+this number often implies a compromise between the model
+synchronization or data requirements and the available resources.
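+
+As an illustration of these concepts, the following host-side sketch
+shows how a kernel could be submitted to an OpenCL device. It uses
+the standard C API (the simulators described in this chapter access
+the same calls through the JOCL Java bindings presented below); the
+kernel, the buffer content and the work-group size are only
+assumptions made for the example.
+
+\begin{lstlisting}[language=C]
+/* Minimal host-side sketch: copy data to the device, launch one
+   work-item per cell and read the results back. Assumes nbCells is a
+   multiple of the work-group size. */
+#include <CL/cl.h>
+
+void run_step(cl_context ctx, cl_command_queue queue, cl_kernel kernel,
+              size_t nbCells, float *populations)
+{
+    cl_int err;
+    /* global buffer holding one population value per cell */
+    cl_mem buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                nbCells * sizeof(float), populations, &err);
+
+    size_t globalSize = nbCells;  /* one work-item per cell    */
+    size_t localSize  = 64;       /* work-items per work-group */
+
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
+    clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
+                           &globalSize, &localSize, 0, NULL, NULL);
+
+    /* blocking read: returns once all work-items have terminated */
+    clEnqueueReadBuffer(queue, buf, CL_TRUE, 0,
+                        nbCells * sizeof(float), populations, 0, NULL, NULL);
+    clReleaseMemObject(buf);
+}
+\end{lstlisting}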
+
+In the case of agent-based simulations, each agent can be naturally
+mapped to a work-item. Work-groups can then be used to represent
+groups of agents or simulations sharing common data (such as the
+environment) or algorithms (such as the background evolution
+process).
+
+In the following examples the JOCL~\cite{jocl_home} binding is used
+to access the OpenCL platform from the Java programming language.
+
+In the next sections we present two practical cases that will be
+studied in detail, from the model to its implementation and
+performance.
+
+\section{A first practical example}
+\label{ch17:sec:1stmodel}
+
+The first model, the Collembola model, simulates the propagation of
+Collembola in fields and forests. It is based on a diffusion
+algorithm which illustrates the case of agents with a simple behavior
+and few synchronization problems.
+
+\subsection{The Collembola model}
+\label{ch17:subsec:collembolamodel}
+
+The Collembola model is an example of a multi-agent system using GIS
+(Geographical Information System) and survey data (population counts)
+to model the evolution of biodiversity across land plots. A first
+version of this model has been developed with the NetLogo framework
+by Bioemco and UMMISCO researchers. In this model, the biodiversity
+is modeled by populations of arthropod individuals, the Collembola,
+which can reproduce and diffuse to favorable new habitats. The
+simulator allows studying the diffusion of Collembola between plots
+of land depending on their land use (artificial soil, crop,
+forest\ldots). In this model the environment is composed of the
+studied land and the Collembola are the agents. Every land plot is
+divided into several cells, each cell representing a surface unit
+($16 \times 16$ meters). A number of individuals per Collembola
+species is associated with each cell. The model evolution is then
+based on a common diffusion model that diffuses individuals between
+cells. Each step of the simulation is based on four stages, as shown
+in Figure~\ref{ch17:fig:collem_algorithm}:
+
+% \begin{enumerate}
+% \item arrival of new individuals
+% \item reproduction in each cell
+% \item diffusion between cells
+% \item updating of colembola lifetime
+% \end{enumerate}
+
+\begin{figure}[h]
+\centering
+\includegraphics[width=0.6\textwidth]{Chapters/chapter17/figs/algo_collem.pdf}
+\caption{Evolution algorithm of Collembola model}
+\label{ch17:fig:collem_algorithm}
+\end{figure}
+
+The algorithm is quite simple but includes two costly operations, the
+reproduction and the diffusion, that must be parallelized to improve
+the model performance.
+
+The {\bf reproduction} stage consists in updating the total
+population of each plot by taking into account the individuals that
+arrived at the preceding computation step. This stage implies
+processing the whole set of cells of the environment to sum their
+populations. The computed value is recorded in the plot associated
+with each cell. This process can be seen as a reduction operation
+over all the cells associated with one plot to obtain its total
+population.
+
+The {\bf diffusion} stage simulates the natural behavior of the
+Collembola, which tend to occupy the whole space over time. This
+stage consists in computing a new value for each cell depending on
+the population of the neighboring cells. This process can be seen as
+a linear diffusion, at each iteration, of the cell populations across
+their neighbors.
+
+These two processes are quite common in numerical computations, so
+that the Collembola model can be adapted to a GPU execution without
+much difficulty.
+
+\subsection{Collembola Implementation}
+
+In the Collembola simulator, biodiversity is modeled by populations
+of Collembola individuals, which can reproduce and diffuse to
+favorable new habitats. This is implemented in the algorithm as a
+fixed reproduction factor, applied to the size of each population,
+followed by a linear diffusion of each cell population to its eight
+neighbors. These reproduction and diffusion processes are followed by
+two more steps in the GPU implementation. The first one consists in
+culling the populations located in an inhospitable environment, by
+checking the land-use value of each cell and setting its population
+to zero if necessary. The final simulation step is the reduction of
+the cell populations for each plot, to obtain an updated plot
+population for statistical purposes. This separate computation step,
+done while updating each cell population in the reference sequential
+algorithm, is motivated by synchronization constraints and allows
+reducing the total number of accesses needed to update those
+populations.
+
+%\lstinputlisting[language=C,caption=Collembola OpenCL
+%kernels,label=fig:collem_kernels]{Chapters/chapter17/code/collem_kernels.cl}
+\lstinputlisting[caption=Collembola OpenCL Diffusion kernel,label=ch17:listing:collembola-diffuse]{Chapters/chapter17/code/collem_kernel_diffuse.cl}
+
+The reproduction, diffusion and culling steps are implemented on the
+GPU (Figure~\ref{ch17:fig:collem_algorithm}) as a straight mapping of
+each cell to an OpenCL work-item (GPU thread). Listing
+\ref{ch17:listing:collembola-diffuse} gives the kernel for the
+diffusion implementation. To prevent data coherency problems, the
+diffusion step is split into two phases, separated by an execution
+{\it barrier}. In the first phase, each cell diffusion overflow is
+calculated and divided by the number of neighbors. Note that, at the
+model frontier, populations can also overflow outside the environment
+grid, but we do not manage those external populations since there is
+no reason to assume our model to be isolated from its
+surroundings. The per-neighbor overflow value is stored for each cell
+before encountering the barrier. After the barrier is met, each cell
+reads the overflows stored by all its neighbors at the previous step
+and applies them to its own population. In this manner, only one
+barrier is required to ensure the consistency of the population
+values, since no cell ever modifies a value other than its own.
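+
+To make this two-phase structure explicit, the following simplified
+sketch illustrates the pattern on a one-dimensional grid. It is not
+the actual kernel of Listing~\ref{ch17:listing:collembola-diffuse}:
+the buffer names, the diffusion rate and the reduced neighborhood are
+assumptions made for the illustration.
+
+\begin{lstlisting}[language=C]
+/* Two-phase diffusion sketch: phase 1 publishes each cell overflow,
+   the barrier makes these values visible, phase 2 gathers the
+   contributions of the neighbors. */
+__kernel void diffuse_sketch(__global float *population,
+                             __global float *overflow,
+                             const float rate, const int nbCells)
+{
+    int cell = get_global_id(0);
+
+    /* Phase 1: publish the outgoing share of this cell
+       (two neighbors in this 1D example) */
+    if (cell < nbCells) {
+        float out = population[cell] * rate / 2.0f;
+        overflow[cell] = out;
+        population[cell] -= 2.0f * out;
+    }
+
+    /* NB: barrier() only synchronizes work-items of the same
+       work-group, so this sketch assumes the grid is handled by a
+       single work-group. */
+    barrier(CLK_GLOBAL_MEM_FENCE);
+
+    /* Phase 2: each cell only updates its own population, from the
+       overflows published by its neighbors before the barrier */
+    if (cell < nbCells) {
+        if (cell > 0)           population[cell] += overflow[cell - 1];
+        if (cell < nbCells - 1) population[cell] += overflow[cell + 1];
+    }
+}
+\end{lstlisting}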
+
+\lstinputlisting[caption=Collembola OpenCL reduction kernel,label=ch17:listing:collembola-reduc]{Chapters/chapter17/code/collem_kernel_reduc.cl}
+
+Listing \ref{ch17:listing:collembola-reduc} gives the kernel for the
+reduction implementation. The only step requiring numerous
+synchronized accesses is the reduction one: in a first approach, we
+chose to use the {\it atomic\_add} operation to implement this
+process, but more efficient implementations using partial reductions
+and local GPU memory could be implemented.
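+
+Such an alternative could be organized as in the following sketch,
+which performs a tree reduction in local memory with one work-group
+per plot. This is only a possible refinement, not the implemented
+kernel of Listing~\ref{ch17:listing:collembola-reduc}; the buffer
+names and the per-plot layout are assumptions made for the example.
+
+\begin{lstlisting}[language=C]
+/* Partial-reduction sketch: each work-item loads one cell population
+   into local memory, then a tree reduction inside the work-group
+   produces the plot total without any atomic operation.
+   Assumes one work-group per plot and a power-of-two group size. */
+__kernel void reduce_plot_sketch(__global const int *cellPopulation,
+                                 __global int *plotPopulation,
+                                 __local int *scratch,
+                                 const int cellsPerPlot)
+{
+    int plot = get_group_id(0);
+    int lid  = get_local_id(0);
+    int size = get_local_size(0);
+
+    /* load (or zero-pad) this work-item's cell into local memory */
+    scratch[lid] = (lid < cellsPerPlot)
+        ? cellPopulation[plot * cellsPerPlot + lid] : 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    /* tree reduction: halve the number of active work-items each step */
+    for (int stride = size / 2; stride > 0; stride /= 2) {
+        if (lid < stride)
+            scratch[lid] += scratch[lid + stride];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (lid == 0)
+        plotPopulation[plot] = scratch[0];
+}
+\end{lstlisting}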
+
+\subsection{Collembola performance}
+
+In this part we present the performance of the Collembola model on
+various CPU and GPU execution
+platforms. Figure~\ref{ch17:fig:mior_perfs_collem} shows that the
+number of cores and the processor architecture have a strong
+influence on the obtained results:
+
+% : the
+% dual-core processor equipping our Thinkpad platform has two to six
+% longer executions times, compared to a six-core Phenom X6.
+
+% % \begin{figure}[h]
+% %begin{minipage}[t]{0.49\linewidth}
+% \centering \includegraphics[width=0.7\linewidth]{./Chapters/chapter17/figs/collem_perfs.pdf}
+% \caption{Performance CPU et GPU du modèle Collemboles}
+% \label{ch17:fig:mior_perfs_collem}
+% %end{minipage}
+% \end{figure}
+
+% In figure~\ref{ch17:fig:mior_perfs_collem2} the Thinkpad curve is removed
+% to make other trends clearer. Two more observation can be made, using
+% this more detailled scale:
+
+\begin{itemize}
+\item Older GPU cards can be slower than modern processors. This can
+  be explained by the new cache and memory access optimizations
+  implemented in newer generations of GPU devices. These
+  optimizations reduce the penalties associated with irregular and
+  frequent global memory accesses. They are not available on our
+  Tesla nodes.
+\item GPU curves exhibit an odd-even pattern in their performance
+  results. Since this phenomenon is visible on hardware, drivers and
+  OpenCL implementations from two distinct manufacturers, it is
+  likely the result of the decomposition process based on warps of
+  fixed, power-of-two size.
+\item The number of cores is not the only determining factor: an
+  Intel Core i7 2600K processor, even with only four cores, can
+  provide better performance than a Phenom one.
+\end{itemize}
+
+\begin{figure}[h]
+%begin{minipage}[t]{0.49\linewidth}
+\centering
+\includegraphics[width=0.7\linewidth]{./Chapters/chapter17/figs/collem_perfs_nothinkpad.pdf}
+\caption{CPU and GPU performance of the Collembola model}
+\label{ch17:fig:mior_perfs_collem}
+%end{minipage}
+\end{figure}
+
+These results show that using the GPU to parallelize part of the
+simulator results in tangible performance gains over a CPU execution
+on modern hardware. These gains are more mixed on older GPU platforms
+due to their limitations when dealing with the irregular memory or
+execution patterns often encountered in MAS. This can be closely
+linked to the availability of caching facilities on the GPU hardware
+and their dramatic effect depending on the locality and frequency of
+data accesses. In this case, even if the Tesla architecture provides
+more execution cores and is by far the costlier solution, more recent
+and cheaper solutions such as high-end GPUs provide better
+performance when the execution is not constrained by memory size.
+
+\section{Second example}
+\label{ch17:sec:2ndmodel}
+
+The second model, the MIOR model, simulates the behavior of microbial
+colonies. Its execution model is more complex, so that it requires
+changes in the initial algorithm and the use of synchronization to
+benefit from the GPU architecture.
+
+\subsection{The MIOR model}
+\label{ch17:subsec:miormodel}
+
+The MIOR~\cite{C.Cambier2007} model was developed to simulate local
+interactions in soil between microbial colonies and organic
+matter. It reproduces each small cubic unit ($0.002\,m^3$) of soil as
+a MAS.
+
+Multiple implementations of the MIOR model have already been
+realized, in Smalltalk and NetLogo, in 2 or 3 dimensions. The latest
+implementation, used in our work and referred to as MIOR in the rest
+of this chapter, is freely accessible online as
+WebSimMior~\footnote{http://www.IRD.fr/websimmior/}.
+
+MIOR is based on two types of agents: (i) the Meta-Mior (MM),
+which represents microbial colonies consuming carbon, and (ii) the
+Organic Matter (OM), which represents carbon deposits occurring in the soil.
+
+The Meta-Mior agents are characterized by two distinct behaviors:
+\begin{itemize}
+\item \emph{breath}: this action converts mineral carbon from the soil
+  into carbon dioxide ($CO_{2}$) that is released into the soil.
+\item \emph{growth}: by this action each microbial colony fixes the
+  carbon present in the environment to reproduce itself (increase its
+  size). This action is only possible if the colony's breathing needs
+  are covered, i.e. enough mineral carbon is available.
+\end{itemize}
+
+These behaviors are described in Algorithm~\ref{ch17:seqalgo}.
+
+\begin{algorithm}[h]
+\caption{Evolution step of each Meta-Mior (microbial colony) agent}
+\label{ch17:seqalgo}
+\KwIn{A static array $mmList$ of MM agents}
+\myinput{A static array $omList$ of OM agents}
+\myinput{A MIOR environment $world$}
+$breathNeed \gets world.respirationRate \times mm.carbon$\;
+$growthNeed \gets world.growthRate \times mm.carbon$\;
+$availableCarbon \gets totalAccessibleCarbon(mm)$\;
+\uIf{$availableCarbon > breathNeed$}{
+ \tcc{ Breath }
+ $mm.active \gets true$\;
+ $availableCarbon \gets availableCarbon - consumCarbon(mm, breathNeed)$\;
+ $world.CO2 \gets world.CO2 + breathNeed$\;
+ \If{$availableCarbon > 0$}{
+ \tcc{ Growth }
+    $growthConsum \gets min(totalAccessibleCarbon(mm), growthNeed)$\;
+ $consumCarbon(mm, growthConsum)$\;
+ $mm.carbon \gets mm.carbon + growthConsum$\;
+ }
+}
+\Else{
+ $mm.active \gets false$
+}
+\end{algorithm}
+
+Since this simulation takes place at a microscopic scale, a large
+number of these simulations must be executed for each macroscopic
+simulation step to model a realistic-sized unit of soil. This leads to
+large computing needs despite the small computational cost of each
+individual simulation.
+
+
+\subsection{MIOR Implementation}
+
+As pointed out previously, the MIOR implementation required more
+changes to the initial code for it to run on the GPU. As a first attempt, we
+realized a simple GPU implementation of the MIOR simulator, with only
+minimal changes to the CPU algorithm. Execution times showed the
+inefficiency of this approach and highlighted the necessity of
+adapting the simulator to take advantage of the GPU execution
+capabilities~\cite{lmlm+12:ip}. In this part, we show the main changes
+made to adapt the MIOR simulator to GPU architectures.
+
+\subsubsection{Execution mapping on GPU}
+
+\begin{figure}
+\centering
+\includegraphics[width=0.7\textwidth]{Chapters/chapter17/figs/repartition.pdf}
+\caption{Execution distribution retained on GPU}
+\label{ch17:fig:gpu_distribution}
+\end{figure}
+
+Each MIOR simulation is represented by a work-group, and each agent by
+a work-item. A kernel is in charge of the life cycle process of each
+agent of the model. This kernel is executed by all the work-items of
+the simulation, each on its own GPU core.
+
+Using one work-group for each simulation makes it easy to
+execute multiple simulations in parallel, as shown in
+Figure~\ref{ch17:fig:gpu_distribution}. By taking advantage of the
+execution overlap possibilities provided by OpenCL it then becomes
+possible to exploit all the cores at the same time, even if a single
+simulation is too small to use all the available GPU cores. However,
+the maximum size of a work-group is limited (to $512$), which only allows
+us to execute one simulation per work-group when using $310$ threads
+(the number of OM in the reference model) to execute the simulation.
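+
+The maximum work-group size supported by a device can be queried at
+runtime before choosing this mapping. A minimal host-side sketch using
+the standard OpenCL C API (which JOCL mirrors), assuming a valid
+{\it device} handle, is:
+
+\begin{lstlisting}
+// Upper bound on the number of work-items (agents) per work-group
+size_t maxGroupSize;
+clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                sizeof(size_t), &maxGroupSize, NULL);
+\end{lstlisting}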
+
+The use of the GPU to execute multiple simulations is initiated by
+the CPU. The latter keeps total control of the simulator execution
+flow. Thus, optimized scheduling policies (such as submitting kernels
+in batches, limiting the number of kernels, or asynchronously
+retrieving the simulation results) can be defined to minimize the cost
+of data transfers between the CPU and GPUs.
+
+\subsubsection{Data structures translation}
+\label{ch17:subsec:datastructures}
+
+The adaptation of the MIOR model to GPU requires the mapping of the
+data model to OpenCL data structures. The environment and the agents
+are represented by arrays of structures, where each structure describes the
+state of one entity. The behaviors of these entities are implemented
+as OpenCL functions to be called from the kernels during execution.
+
+Since the main program is written in Java, JOCL is responsible for the
+allocation and the mapping of the object data structures to OpenCL
+ones before execution.
+
+Four main data structures
+(Listing~\ref{ch17:listing:mior_data_structures}) are defined: (i) an
+array of MM agents, representing the state of the microbial
+colonies; (ii) an array of OM agents, representing the state of the
+carbon deposits; (iii) a topology matrix, which stores accessibility
+information between the two types of agents of the model; and (iv) a world
+structure, which contains all the global input data (metabolism rates,
+numbers of agents) and output data (quantity of $CO_{2}$ produced) of
+the simulation. These data structures are initialized by the CPU and
+then copied to the GPU.
+
+\lstinputlisting[caption=Main data structures used in a MIOR simulation,label=ch17:listing:mior_data_structures]{Chapters/chapter17/code/data_structures.cl}
+
+The world topology is stored as a two-dimensional matrix which contains
+OM indexes on the abscissa and MM indexes on the ordinate. Each
+agent walks through its line or column of the matrix at each iteration to
+determine which agents can be accessed during the simulation. Since
+many agents are not connected, this matrix is sparse, which introduces
+a large number of useless memory accesses. To reduce the impact of these
+memory accesses we use a compact, optimized representation of this
+matrix based on~\cite{Gomez-Luna:2009:PVS:1616772.1616869}, as
+illustrated in Figure~\ref{ch17:fig:csr_representation}. This compact
+representation considers each line of the matrix as an index list and
+only stores accessible agents contiguously, to reduce the size of the
+list and the number of unproductive accesses.
+
+\begin{figure}[h]
+\centering
+\includegraphics[width=0.8\textwidth]{Chapters/chapter17/figs/grid.pdf}
+%\psfig{file=figs/grid, height=1in}
+\caption{Compact representation of the topology of a MIOR simulation}
+\label{ch17:fig:csr_representation}
+\end{figure}
+
+Since dynamic memory allocation is not yet possible in OpenCL and only
+provided in the latest revisions of the CUDA standard, these matrices
+are statically allocated. The allocation is based on the worst-case
+scenario where all OM and MM are linked, as the real occupation of the
+matrix cells cannot be deduced without some kind of preprocessing
+computation.
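+
+As an illustration, the following sketch shows how a colony could walk its
+compacted list of accessible deposits. The array names ({\it offsets},
+{\it counts}, {\it indexes}) are hypothetical and do not reproduce the exact
+code of the simulator:
+
+\begin{lstlisting}
+// Sketch only: sum the carbon accessible to one colony using the
+// compact topology (offsets/counts/indexes arrays are hypothetical).
+int totalAccessibleCarbon(const int mmId,
+                          global const int *offsets,
+                          global const int *counts,
+                          global const int *indexes,
+                          global const OM *omList)
+{
+	int carbon = 0;
+	for (int k = 0; k < counts[mmId]; k++) {
+		const int omId = indexes[offsets[mmId] + k];
+		carbon += omList[omId].carbon;
+	}
+	return carbon;
+}
+\end{lstlisting}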
+
+\subsubsection{Critical resources access management}
+\label{ch17:subsec:concurrency}
+
+One of the main issues in the MIOR model is to ensure that all the
+microbial colonies have equitable access to carbon resources
+when multiple colonies share the same deposits. Access
+synchronization is mandatory in these cases to prevent conflicting
+updates on the same data that may lead to computation errors (e.g. loss of
+matter).
+
+On a massively parallel architecture such as a GPU, this kind of
+synchronization conflict can however lead to an inefficient
+implementation by enforcing a quasi-sequential execution. It is
+necessary, in the case of MIOR as well as for other ABMs, to ensure
+that each work-item is not too constrained in its execution.
+
+\lstinputlisting[caption=Main MIOR kernel,label=ch17:listing:mior_kernels]{Chapters/chapter17/code/mior_kernels.cl}
+
+From the sequential Algorithm~\ref{ch17:seqalgo}, where all the agents share the same
+data, we have developed a parallel algorithm composed of three
+sequential stages separated by synchronization barriers. This new
+algorithm is based on the distribution of the available OM carbon
+deposits into parts at the beginning of each execution step. The three
+stages, illustrated in Listing~\ref{ch17:listing:mior_kernels}, are the following:
+
+\begin{enumerate}
+\item \emph{distribution}: the available carbon in each carbon deposit
+  (OM) is equitably dispatched among all accessible MM in the form
+  of parts (see the sketch after this list).
+\item \emph{metabolism}: each MM consumes the carbon in its allocated parts
+  for its breathing and growth processes.
+\item \emph{gathering}: unconsumed carbon in the parts is gathered back
+  into the carbon deposits.
+\end{enumerate}
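+
+A minimal sketch of the distribution stage is given below. The helper and
+array names are hypothetical and do not reproduce the exact code of the
+simulator, but they illustrate how each deposit can split its carbon into
+parts without any concurrent write:
+
+\begin{lstlisting}
+// Sketch only: each OM deposit splits its carbon equally between its
+// accessible colonies, one part per (OM, MM) pair, no atomics needed.
+void scatter(const int omId,
+             global OM *omList,
+             global const int *offsets,
+             global const int *counts,
+             global int *parts)
+{
+	const int nbNeighbors = counts[omId];
+	if (nbNeighbors == 0) return;
+
+	const int share = omList[omId].carbon / nbNeighbors;
+	for (int k = 0; k < nbNeighbors; k++) {
+		parts[offsets[omId] + k] = share;
+	}
+	// Keep the integer remainder in the deposit to avoid losing matter
+	omList[omId].carbon -= share * nbNeighbors;
+}
+\end{lstlisting}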
+
+This solution removes the concurrent accesses to shared carbon
+resources present in the first algorithm, and thus the need for
+per-access synchronization, and requires only one kernel launch from
+Java, as described in Listing~\ref{ch17:fig:mior_launcher}.
+
+\lstinputlisting[caption=MIOR simulation launcher,label=ch17:fig:mior_launcher]{Chapters/chapter17/code/mior_launcher.java}
+
+\subsubsection{Termination detection}
+
+The termination of a MIOR simulation is reached when the model
+stabilizes and no more $CO_{2}$ is produced. This termination
+detection can be done either on the CPU or on the GPU, but it requires a
+global view of the system execution.
+
+In the first case, when the CPU controls the GPU simulation process,
+the detection is done in two steps: (i) the CPU starts the execution
+of a simulation step on the GPU, (ii) the CPU retrieves the GPU data
+and determines whether another iteration must be launched or the
+simulation has terminated. This approach allows a fine-grained control
+over the GPU execution, but it requires many costly data transfers, as
+the results of each iteration must be sent from the GPU to the CPU. In the
+case of the MIOR model these costs are mainly due to the inherent
+PCI-Express port latencies rather than to bandwidth limitations, since
+the data sizes remain rather small, on the order of a few dozen
+megabytes.
+
+In the second case the termination detection is directly implemented
+on the GPU by checking the amount of available carbon between two
+iterations. The CPU does not have any feedback while the simulation is
+running, but retrieves the results once the kernel execution is
+finished. This approach minimizes the number of transfers between the
+CPU and the GPU.
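+
+A possible sketch of this GPU-side test is given below. The kernel is
+hypothetical and not the exact code of the simulator: a single work-item
+compares the global $CO_{2}$ counter with its value at the previous
+iteration and raises a flag that the CPU reads once the batch of
+iterations is finished:
+
+\begin{lstlisting}
+// Sketch only: a single work-item updates the termination flag by
+// comparing the CO2 produced between two successive iterations.
+kernel void checkTermination(global const World *world,
+                             global int *previousCO2,
+                             global int *terminated)
+{
+	if (get_global_id(0) == 0) {
+		*terminated = (world->CO2 == *previousCO2);
+		*previousCO2 = world->CO2;
+	}
+}
+\end{lstlisting}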
+
+\subsection{Performance of MIOR implementations}
+\label{ch17:subsec:miorexperiments}
+
+In this part we present several MIOR GPU implementations using the
+distribution/gathering process described in the previous section and
+compare their performance on two distinct hardware platforms, i.e. two
+different GPU devices. Five incremental MIOR implementations were
+realized with an increasing level of adaptation of the algorithm: in
+all cases, we chose the average time over 50 executions as the
+performance indicator.
+
+\begin{itemize}
+\item the \textbf{GPU 1.0} implementation is a direct implementation
+  of the existing algorithm and its data structures, where data
+  dependencies were removed, and which uses the non-compact topology
+  representation described in Section~\ref{ch17:subsec:datastructures}.
+\item the \textbf{GPU 2.0} implementation uses the previously
+  described compact representation of the topology and remains
+  otherwise identical to the GPU 1.0 implementation.
+\item the \textbf{GPU 3.0} implementation introduces the manual
+  copy into local (private) memory of often-accessed global data, such
+  as carbon parts or topology information.
+\item the \textbf{GPU 4.0} implementation is a variant of the
+  GPU 1.0 implementation which allows the execution of
+  multiple simulations for each kernel execution.
+\item the \textbf{GPU 5.0} implementation is a multi-simulation version of the
+  GPU 2.0 implementation, using the execution of multiple simulations
+  for each kernel execution, as for GPU 4.0.
+\end{itemize}
+
+The two last implementations -- \textbf{GPU 4.0} and \textbf{GPU 5.0}
+-- illustrate the gain provided by a better usage of the hardware
+resources, thanks to the execution overlapping capabilities of the driver. A
+sequential version of the MIOR algorithm, labeled \textbf{CPU}, is
+provided for comparison purposes. This sequential version is developed
+in Java, the same language used for the GPU implementations and the Sworm
+model.
+
+For these performance evaluations, two platforms are used. The first
+one is representative of the kind of hardware which is available on
+HPC clusters. It is a cluster node dedicated to GPU computations with
+two Intel X5550 processors running at $2.67$GHz and two Tesla C1060
+GPU devices running at $1.3$GHz and composed of $240$ cores ($30$
+multi-processors). Only one of these GPUs is used in these experiments.
+The second platform illustrates what can be expected
+from a personal desktop computer built a few years ago. It uses an
+Intel Q9300 CPU, running at $2.5$GHz, and a Geforce 8800GT GPU card
+running at $1.5$GHz and composed of $112$ cores ($14$
+multi-processors). The purpose of these two platforms is to assess the
+benefit that can be obtained when a scientist either has access to
+specialized hardware such as a cluster or tries to take advantage of
+his or her own personal computer.
+
+\begin{figure}[h]
+%begin{minipage}[t]{0.49\linewidth}
+\centering
+\includegraphics[width=0.7\linewidth]{Chapters/chapter17/figs/mior_perfs_tesla.pdf}
+\caption{CPU and GPU performance on a Tesla C1060 node}
+\label{ch17:fig:mior_perfs_tesla}
+%end{minipage}
+\end{figure}
+
+Figures~\ref{ch17:fig:mior_perfs_tesla}~and~\ref{ch17:fig:mior_perfs_8800gt}
+show the execution time for $50$ simulations on the two hardware
+platforms. A size factor is applied to the problem: at scale 1,
+the model contains $38$ MM and $310$ OM, while at scale 6 these
+numbers are multiplied by six. The size of the environment is modified
+as well to maintain the same average agent density in the model. This
+scaling factor displays the impact of the chosen simulation size on
+performance.
+
+%hspace{0.02\linewidth}
+%begin{minipage}[t]{0.49\linewidth}
+\begin{figure}[h]
+\centering
+\includegraphics[width=0.7\linewidth]{Chapters/chapter17/figs/mior_perfs_8800gt.pdf}
+\caption{CPU and GPU performance on a personal computer with a Geforce 8800GT}
+\label{ch17:fig:mior_perfs_8800gt}
+%end{minipage}
+\end{figure}
+
+The charts show that for small problems the execution times of all
+the implementations are very close. This is because the GPU execution
+does not have enough threads (representing agents) for an optimal
+usage of GPU resources. This trend changes after scale $5$, where GPU
+2.0 and GPU 3.0 take the advantage over the GPU 1.0 and CPU
+implementations. This advantage continues to grow with the scaling
+factor, and reaches a speedup of $10$ at scale $10$ between the
+fastest single-simulation GPU implementation and the first, naive
+GPU 1.0 implementation.
+
+Multiple trends can be observed in these results. First, optimizations
+for the GPU hardware have a large, positive impact on performance,
+illustrating the strong requirements on the algorithm properties to
+reach execution efficiency. These charts also show that, despite the
+vast difference in the number of cores between the two GPU platforms, the
+same trends can be observed in both cases. We can therefore expect
+similar results on other GPU cards, without the need for further
+adaptations.
+
+\begin{figure}[h]
+\centering
+\includegraphics[width=0.7\linewidth]{Chapters/chapter17/figs/monokernel.pdf}
+\caption{Execution time of one multi-simulation kernel on the Tesla
+ platform}
+\label{ch17:fig:monokernel_graph}
+\end{figure}
+
+\begin{figure}[h]
+\centering
+\includegraphics[width=0.7\linewidth]{Chapters/chapter17/figs/multikernel.pdf}
+\caption{Total execution time for 1000 simulations on the Tesla
+ platform, while varying the number of simulations for each kernel}
+\label{ch17:fig:multikernel_graph}
+\end{figure}
+
+There are two ways to measure simulation performance: (i) by
+executing only one kernel and varying its size (the number of
+simulations executed in parallel), as shown in
+Figure~\ref{ch17:fig:monokernel_graph}, to test the costs linked to the
+parallelization process, or (ii) by executing a fixed number of
+simulations and varying the size
+of each kernel, as shown in Figure~\ref{ch17:fig:multikernel_graph}.
+
+Figure~\ref{ch17:fig:monokernel_graph} illustrates the execution time for
+only one kernel. It shows that for small numbers of simulations run in
+parallel, the compact implementation of the model topology is faster
+than the two-dimensional matrix representation. This trend reverses with
+more than $50$ simulations in parallel, which can be explained either by
+the nonlinear progression of the synchronization costs or by the
+additional memory required for the access-efficient representation.
+
+Figure~\ref{ch17:fig:multikernel_graph} illustrates the execution time of a
+fixed number of simulations. It shows that for a small number of
+simulations run in parallel, the costs resulting from program setup,
+data copies, and kernel launches on the GPU are dominant and very detrimental to
+performance. Once the number of simulations executed for each kernel
+grows, these costs are counterbalanced by computation costs. This
+trend is more marked in the case of the sparse implementation (GPU
+4.0) than the compact one, but appears on both curves. With more than
+$30$ simulations for each kernel, execution times stall, since
+hardware limits are reached. This indicates that the cost of preparing
+and launching kernels becomes negligible compared to the computing
+time once a good GPU occupation rate is achieved.
+
+\section{Analysis and recommendations}
+\label{ch17:analysis}
+
+In this section we synthesize the observations made on the two models
+and identify some recommendations for implementing complex systems on
+GPU platforms.
+
+\subsection{Analysis}
+
+In both the Collembola and the MIOR models, a critical issue is
+the determination of the parts of the simulation which are to be run on
+the GPU and those which are to remain on the CPU. The determination of these
+parts is a classic step of any algorithm parallelization and must
+take into account considerations such as the cost of the different
+parts of the algorithm and the expected gains.
+
+In the case of the Collembola model, two steps of the algorithm were
+ported to the GPU. Both steps use straightforward, easily parallelizable
+operations where a direct gain can be expected by using more
+execution cores without important modifications of the algorithm.
+
+In the MIOR model case, however, no such inherently parallelizable parts
+are evident in the original sequential algorithm. This is mainly
+explained by the high rate of interactions between agents in this model, in
+the form of many operations (breathing, growth) on shared carbon
+resources. In this case the algorithm had to be modified more deeply,
+while keeping in mind the need to remain true to the original
+model, to synchronize the main execution step of all agents in the
+model, to ensure equity, and to minimize the number of
+synchronizations. The minimization is done by factoring the distribution of
+carbon in the model into two separate steps, at the beginning and the end
+of each iteration, rather than at multiple points of the execution.
+
+\subsection{MAS execution workflow}
+
+Many MAS simulations decompose their execution process into discrete
+evolution steps where each step represents a quantum of time (the minimal
+unit of time described). At the end of each step much global
+information, such as graphical displays or results, is updated. This
+execution model may not fit GPU platforms well, as they more or less
+assume a batch-like workflow model. The execution must be
+split into the following ever-repeating steps, sketched in the code below:
+
+\begin{itemize}
+\item Allocation of GPU data buffers
+\item Copy of data from CPU to GPU
+\item GPU kernel execution
+\item Copy of results from GPU to CPU
+\end{itemize}
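+
+The following minimal host-side sketch illustrates these four steps with
+the standard OpenCL C API, which JOCL mirrors almost one-to-one. The
+{\it context}, {\it queue}, {\it kernel}, {\it hostData} and {\it nbCells}
+variables are assumed to be already created:
+
+\begin{lstlisting}
+cl_int err;
+cl_mem buffer = clCreateBuffer(context, CL_MEM_READ_WRITE,   // 1. allocation
+                               sizeof(cl_int) * nbCells, NULL, &err);
+err = clEnqueueWriteBuffer(queue, buffer, CL_TRUE, 0,        // 2. CPU -> GPU
+                           sizeof(cl_int) * nbCells, hostData, 0, NULL, NULL);
+err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
+size_t global = nbCells;
+err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,         // 3. kernel execution
+                             &global, NULL, 0, NULL, NULL);
+err = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0,         // 4. GPU -> CPU
+                          sizeof(cl_int) * nbCells, hostData, 0, NULL, NULL);
+clReleaseMemObject(buffer);
+\end{lstlisting}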
+
+This workflow works well if the data transfer time is
+negligible compared to the GPU execution, or if the transfers can be done in parallel,
+thanks to the asynchronous nature of OpenCL. If we are to update the
+MAS model after each iteration, then performance risks being
+degraded. This is illustrated in the MIOR model by the fact that the
+speedup observed on the GPU is much more significant for bigger
+simulations, which imply longer GPU execution times. Our solution to
+this problem is to desynchronize the execution of the MAS model and
+its GPU parts by requesting the execution of multiple steps of the
+GPU simulations for each launch.
+
+Another related prerequisite of GPU execution is the availability of
+many threads to execute, to allow an efficient exploitation of the
+large number of cores provided by the architecture. In the case of
+MAS models, this means that executing one agent at a time on the GPU is
+meaningless in regard to GPU usage, copying costs, and actual gain in
+execution time, if the process is not already parallel and costly at
+this scale. In the MIOR and Collembola models, this is solved by
+executing the computations for all agents of the model at the same
+time. If the model however has only occasional needs for intensive
+computations, then some kind of batching mechanism is required to store
+pending computations in a queue until the total amount of waiting
+work justifies the transfer costs to the GPU platform.
+
+\subsection{Implementation challenges}
+
+Besides the execution strategy challenges described above, some
+implementation challenges also occur when implementing an OpenCL
+version of a MAS model, mainly related to the underlying limitations
+of the execution platform.
+
+The first one is the impossibility (except in the latest CUDA versions)
+of dynamically allocating memory during execution. This is a problem for
+models where the number of agents can vary during the
+simulation, such as prey-predator models. In this case, the only
+solution is either to overestimate the size of arrays or data structures to
+accommodate these additional individuals, or to use the CPU to resize
+data structures when these situations occur. Both approaches
+trade either memory or performance and are not always practical.
+
+Another limitation is the impossibility of storing pointers in data
+structures, which restricts OpenCL implementations to one-dimensional static
+arrays or specific data structures such as textures. This precludes the
+use of structures like linked lists, graphs, or sparse matrices not
+represented by some combination of static arrays, and can be another
+source of memory or performance losses.
+
+In the case of MIOR, this problem is especially exacerbated in the case
+of neighborhood storage: both representations consume much more memory
+than actually required, since the worst case (all agents have
+access to all other agents) has to be taken into account defensively
+when dimensioning the data structures.
+
+The existence of different generations of GPU hardware is also a
+challenge. Older devices, such as the four-year-old Tesla
+C1060 cards, have very strong expectations in terms of memory accesses
+and require very regular access patterns to perform efficiently. MAS
+with many random accesses, such as MIOR, or many small global memory
+accesses, such as Collembola, are penalized on these older
+cards. Fortunately, these requirements are less present in modern
+cards, which offer caches and other facilities traditionally present
+on CPUs to offset this kind of penalty.
+
+The final concern is related to the previous ones and often results in
+higher memory consumption. The amount of memory available on GPU cards
+is much more limited, and adding memory capacity is more costly
+than expanding CPU RAM. On computing clusters, hardware
+nodes with 128GB of memory or more have become affordable, whereas the
+newer Tesla architectures remain limited to 16GB of memory. This can
+be a concern for big MAS models, as well as for smaller ones which can
+only use memory-inefficient OpenCL structures.
+
+\subsection{MCSMA}
+\label{ch17:Mcsma}
+
+As shown in the previous sections, many data representation choices
+are common to entire classes of MAS. The grid paradigm, for example,
+is often encountered in models where each cell constitutes either the
+elementary unit of simulation (SugarScape~\cite{Dsouza2007}) or a
+discretization of the environment space
+(pathfinding~\cite{Guy09clearpath}). These grids can be considered as
+two- or three-dimensional matrices, whose processing can be directly
+distributed.
+
+Another common data representation encountered in MAS is the
+use of 2D or 3D coordinates to store the position of each agent of
+the model. In this case, even if the environment is no longer
+discrete, the location information still implies computations
+(Euclidean distances) which can be parallelized.
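+
+A generic sketch of such a computation is given below. It is not part of
+the MCSMA API: the kernel and its arguments are hypothetical, with one
+work-item per agent and the built-in OpenCL {\it distance} function:
+
+\begin{lstlisting}
+// Sketch only: Euclidean distance from every agent to a reference point
+kernel void distancesToPoint(global const float2 *positions,
+                             global float *distances,
+                             const float2 target,
+                             const int nbAgents)
+{
+	const int i = get_global_id(0);
+	if (i >= nbAgents) return;
+	distances[i] = distance(positions[i], target);
+}
+\end{lstlisting}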
+
+MCSMA~\cite{lmlm+13:ip} is a framework developed to provide the MAS designer
+with those basic data structures, and the associated operations, to
+facilitate the porting of existing MAS to the GPU. Two levels of
+use are provided to the developer, depending on his or her usage
+profile.
+
+\begin{itemize}
+\item A high-level library, constituted of modules grouping classes
+  of operations. For example, one module provides multiple methods of
+  distance computation on 1D, 2D, or 3D grids, another the
+  diffusion algorithm, and so on.
+\item A low-level API which gives the developer direct access to the
+  GPU and the inner workings of MCSMA, in order to develop his or her
+  own modules when the required operations are not yet provided by the platform.
+\end{itemize}
+
+Both usage levels are illustrated by the two practical cases above. In
+MIOR, the whole algorithm (barring initialization) is ported to the GPU as
+a specific plugin which allows executing $n$ MIOR simulations and
+retrieving their results. This first approach requires extensive
+adaptations of the original algorithm. In Collembola, on the
+contrary, the main steps of the algorithm remain executed on the CPU,
+and only specific operations are delegated to generic, already
+existing diffusion and reduction kernels. The fundamental algorithm
+is not modified and GPU execution is only applied to the specific parts of
+the execution which may benefit from it. These two programming
+approaches allow incremental adaptations of existing Java MAS to
+accelerate their execution on the GPU, while retaining the option to
+develop reusable or more efficient modules to supplement the
+already existing ones.
+
+\section{Conclusion}
+\label{ch17:conclusion}
+
+This chapter addresses the issue of complex system simulation using
+the agent-based paradigm and GPU hardware. From the experiments on two
+existing agent-based models of soil science, we intend to provide
+useful information on the architecture, the algorithm design, and the
+data management needed to run agent-based simulations on GPU, and more
+generally to run compute-intensive applications that are based on
+a not-so-regular model. The first result of this work is that adapting
+the algorithm to a GPU architecture is possible and suitable to speed
+up agent-based simulations, as illustrated by the MIOR model. Coupling
+the CPU with the GPU even seems to be an interesting way to better take
+advantage of the power offered by computers and clusters, as illustrated
+by the Collembola model. From our point of view the adaptation process
+is lighter than a traditional parallelization on distributed nodes and
+not much more difficult than a standard multithreaded parallelization,
+since all the data remain on the same host and can be shared in
+central memory. The OpenCL adaptation also enables a portable simulator
+that can be run on different graphical units. Even using a mainstream
+card such as the GPU card of a standard computer can lead to significant
+performance improvements. This is an interesting result as it opens up
+the field of inexpensive HPC to a large community.
+
+In this perspective, we are working on MCSMA, a development platform
+that would facilitate the use of GPU or many-core architectures for
+multi-agent simulations. Our first work has been the definition of
+common, efficient, reusable data structures, such as grids or
+lists. Another goal is to provide easier means to control the
+distribution of specific processes between the CPU and the GPU, to allow the
+easy exploitation of the strengths of each platform in the same
+multi-agent simulation. We think that the same approach, i.e.
+developing specific environments that facilitate the developer's
+access to GPU power, can be applied to many domains with
+compute-intensive needs, to open GPU use to a larger community.
+
+\putbib[Chapters/chapter17/biblio]
+
--- /dev/null
+#include "collem_structures.h"
+
+kernel void diffuse(
+ global CollemWorld *world,
+ global int *populations,
+ global int *overflows,
+ global int *newPopulations)
+{
+ const int i = get_global_id(0);
+ const int j = get_global_id(1);
+
+ // Compute the population to diffuse for this cell
+ OVERFLOW(world, overflows, i, j) = CELL(world, populations, i, j) * world->diffusionRate;
+
+	// equivalent to __syncthreads() in CUDA
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
+ // Retrieve neighbors surplus and add them to the current cell population
+ int surplus = OVERFLOW(world, overflows, i + 1, j) + OVERFLOW(world, overflows, i - 1, j) + OVERFLOW(world, overflows, i, j - 1) + OVERFLOW(world, overflows, i, j + 1);
+
+ CELL(world, newPopulations, i, j) = CELL(world, populations, i, j) + surplus / 8.0 - OVERFLOW(world, overflows, i, j);
+}
--- /dev/null
+#include "collem_structures.h"
+
+kernel void reduce(
+ global CollemWorld *world,
+ global int *populations,
+ global int *patchesOwners,
+ global int *plotsPopulations)
+{
+ const int i = get_global_id(0);
+ const int j = get_global_id(1);
+
+ // Retrieve the owner parcel
+ const int plotId = CELL(world, patchesOwners, i, j);
+ if (plotId == -1) return;
+
+ // Add cell population to the owner parcel population
+ atomic_add(plotsPopulations + plotId, CELL(world, populations, i, j));
+}
--- /dev/null
+public void run() {
+ newArrivalsKernel.setArguments(worldMem, patchesPopulationsMem, patchesOwnersMem, plotsPopulationsMem, plotsSurfacesMem);
+ reproduceKernel.setArguments(worldMem, patchesPopulationsMem, patchesOwnersMem, plotsPopulationsMem);
+ diffuseKernel.setArguments(worldMem, patchesPopulationsMem, patchesOverflowsMem, newPopulationsMem);
+ deathKernel.setArguments(worldMem, patchesPopulationsMem, patchesTypesMem);
+
+ OCLEvent newArrivalFinished, reproduceFinished, diffuseFinished, copyFinished, deathFinished;
+
+ int i = 1;
+
+ OCLChrono totalTime = new OCLChrono("totalTime");
+ totalTime.start();
+
+ while (true) {
+ newArrivalFinished = enqueueKernel(newArrivalsKernel);
+ reproduceFinished = enqueueKernel(reproduceKernel, newArrivalFinished);
+ diffuseFinished = enqueueKernel(diffuseKernel, reproduceFinished);
+ copyFinished = queue.enqueueCopyBuffer(newPopulationsMem, patchesPopulationsMem, 0L, 0L, patchesPopulationsMem.getSize(), diffuseFinished);
+ OCLEvent.waitFor(copyFinished);
+ deathFinished = enqueueKernel(deathKernel, copyFinished);
+
+ queue.blockingReadBuffer(plotsPopulationsMem, Pointer.to(plotsPopulations), 0, plotsPopulationsMem.getSize(), deathFinished);
+ queue.blockingReadBuffer(patchesPopulationsMem, Pointer.to(patchesPopulations), 0, patchesPopulationsMem.getSize(), deathFinished);
+ queue.blockingReadBuffer(patchesOverflowsMem, Pointer.to(patchesOverflows), 0, patchesOverflowsMem.getSize(), deathFinished);
+
+ i++;
+
+ if (i == 100) break;
+ }
+}
--- /dev/null
+// Microbial colony
+typedef struct MM {
+ float x;
+ float y;
+ int carbon;
+ int dormancy;
+} MM;
+
+// Carbon deposit
+typedef struct OM {
+ float x;
+ float y;
+ int carbon;
+ int lock;
+} OM;
+
+// MIOR Environment
+typedef struct World {
+ int nbMM;
+ int nbOM;
+ int RA; // Action radius
+ float RR; // Breathing rate
+ float GR; // Growth rate
+ float K; // Decomposition rate
+ int width; // Size of the model
+ int minSize; // Minimal size for a colony
+ int CO2; // Total CO2 in the model
+ int lock;
+} World;
--- /dev/null
+kernel void simulate(
+ global MM allMMList[],
+ global OM allOMList[],
+ global World allWorlds[],
+ constant int allMMOffsets[],
+ constant int allOMOffsets[],
+ global const int allMMCSR[],
+ global const int allOMCSR[],
+ global int allParts[])
+{
+ const int iSim = get_group_id(0);
+ const int i = get_local_id(0);
+
+ global World * world = allWorlds + iSim;
+ global MM * mmList = allMMList + iSim * NB_MM;
+ global OM * omList = allOMList + iSim * NB_OM;
+ constant int * mmOffsets = allMMOffsets + iSim * NB_MM;
+ constant int * omOffsets = allOMOffsets + iSim * NB_OM;
+ global const int * mmCSR = allMMCSR;
+ global const int * omCSR = allOMCSR;
+ global int * parts = allParts + iSim * PARTS_SIZE;
+
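+	// Stage 1: distribution -- each OM deposit splits its carbon into parts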
+ if (i < world->nbOM) {
+ scatter(omList, omOffsets, omCSR, parts, world);
+ }
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
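+	// Stage 2: metabolism -- each MM colony breathes and grows using its parts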
+ if (i < world->nbMM) {
+ live(mmList, mmOffsets, mmCSR, parts, world);
+ }
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+
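+	// Stage 3: gathering -- unconsumed carbon parts are returned to the deposits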
+ if (i < world->nbOM) {
+ gather(omList, omOffsets, omCSR, parts, world);
+ }
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}
--- /dev/null
+@Override
+protected void doLiveImpl() {
+ simulateKernel.setArguments(mmMem, omMem, worldsMem, mmOffsetsMem, omOffsetsMem, mmCSRMem, omCSRMem, partsMem);
+
+ if (blockSize < Math.max(nbOM, nbMM)) {
+ throw new RuntimeException("blockSize (" + blockSize + ") too small to execute the simulation");
+ }
+
+ OCLEvent event = queue.enqueue1DKernel(simulateKernel, nbSim * blockSize, blockSize);
+
+ OCLEvent.waitFor(event);
+ OCLUtils.printEventStats("simulate", event);
+
+ if (! isBatchModeEnabled()) {
+ queue.blockingReadBuffer(mmMem, mmList, 0, mmMem.getSize());
+ queue.blockingReadBuffer(omMem, omList, 0, omMem.getSize());
+ queue.blockingReadBuffer(worldsMem, worlds, 0, worldsMem.getSize());
+ System.out.println("copy");
+ }
+}
--- /dev/null
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created with Inkscape (http://www.inkscape.org/) -->
+
+<svg
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:cc="http://creativecommons.org/ns#"
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+ xmlns:svg="http://www.w3.org/2000/svg"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
+ xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
+ width="210mm"
+ height="297mm"
+ id="svg2"
+ version="1.1"
+ inkscape:version="0.48.3.1 r9886"
+ sodipodi:docname="kernels.svg">
+ <defs
+ id="defs4">
+ <marker
+ inkscape:stockid="Arrow1Lstart"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow1Lstart"
+ style="overflow:visible">
+ <path
+ id="path3991"
+ d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;marker-start:none"
+ transform="scale(0.8) translate(12.5,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="DotL"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="DotL"
+ style="overflow:visible">
+ <path
+ id="path4147"
+ d="M -2.5,-1.0 C -2.5,1.7600000 -4.7400000,4.0 -7.5,4.0 C -10.260000,4.0 -12.5,1.7600000 -12.5,-1.0 C -12.5,-3.7600000 -10.260000,-6.0 -7.5,-6.0 C -4.7400000,-6.0 -2.5,-3.7600000 -2.5,-1.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;marker-start:none;marker-end:none"
+ transform="scale(0.8) translate(7.4, 1)" />
+ </marker>
+ <marker
+ inkscape:stockid="TriangleOutL"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="TriangleOutL"
+ style="overflow:visible">
+ <path
+ id="path4228"
+ d="M 5.77,0.0 L -2.88,5.0 L -2.88,-5.0 L 5.77,0.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;marker-start:none"
+ transform="scale(0.8)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow2Lend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow2Lend"
+ style="overflow:visible;">
+ <path
+ id="path4106"
+ style="font-size:12.0;fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
+ d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
+ transform="scale(1.1) rotate(180) translate(1,0)" />
+ </marker>
+ <marker
+ inkscape:stockid="Arrow1Lend"
+ orient="auto"
+ refY="0.0"
+ refX="0.0"
+ id="Arrow1Lend"
+ style="overflow:visible;">
+ <path
+ id="path4088"
+ d="M 0.0,0.0 L 5.0,-5.0 L -12.5,0.0 L 5.0,5.0 L 0.0,0.0 z "
+ style="fill-rule:evenodd;stroke:#000000;stroke-width:1.0pt;marker-start:none;"
+ transform="scale(0.8) rotate(180) translate(12.5,0)" />
+ </marker>
+ </defs>
+ <sodipodi:namedview
+ id="base"
+ pagecolor="#ffffff"
+ bordercolor="#666666"
+ borderopacity="1.0"
+ inkscape:pageopacity="0.0"
+ inkscape:pageshadow="2"
+ inkscape:zoom="0.7"
+ inkscape:cx="-94.649592"
+ inkscape:cy="560.28525"
+ inkscape:document-units="px"
+ inkscape:current-layer="layer1"
+ showgrid="false"
+ inkscape:window-width="1920"
+ inkscape:window-height="1023"
+ inkscape:window-x="0"
+ inkscape:window-y="33"
+ inkscape:window-maximized="1" />
+ <metadata
+ id="metadata7">
+ <rdf:RDF>
+ <cc:Work
+ rdf:about="">
+ <dc:format>image/svg+xml</dc:format>
+ <dc:type
+ rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+ <dc:title></dc:title>
+ </cc:Work>
+ </rdf:RDF>
+ </metadata>
+ <g
+ inkscape:label="Calque 1"
+ inkscape:groupmode="layer"
+ id="layer1">
+ <rect
+ style="fill:none;stroke:#000000;stroke-width:2;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;stroke-miterlimit:4;stroke-dasharray:none"
+ id="rect2989"
+ width="408.57144"
+ height="128.57143"
+ x="132.85715"
+ y="268.07648" />
+ <rect
+ style="fill:none"
+ id="rect3759"
+ width="334.28571"
+ height="137.14285"
+ x="134.28572"
+ y="516.64789" />
+ <rect
+ style="fill:none"
+ id="rect3761"
+ width="375.71429"
+ height="165.71428"
+ x="155.71428"
+ y="476.64789" />
+ <rect
+ style="fill:none;stroke:#000000;stroke-width:2;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;stroke-miterlimit:4;stroke-dasharray:none"
+ id="rect2989-8"
+ width="408.57144"
+ height="128.57143"
+ x="133.10522"
+ y="397.13724" />
+ <rect
+ style="fill:none;stroke:#000000;stroke-width:2;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;stroke-miterlimit:4;stroke-dasharray:none"
+ id="rect2989-82"
+ width="408.57144"
+ height="128.57143"
+ x="133.08492"
+ y="525.93164" />
+ <text
+ xml:space="preserve"
+ style="font-size:40px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+ x="337.14874"
+ y="330.92273"
+ id="text3800"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ x="337.14874"
+ y="330.92273"
+ id="tspan3004">Distribution</tspan><tspan
+ sodipodi:role="line"
+ x="337.14874"
+ y="364.97473"
+ id="tspan3008"
+ style="font-size:24px">(Nb<tspan
+ style="font-size:24px;baseline-shift:sub"
+ id="tspan3026">om</tspan> work-items)</tspan></text>
+ <path
+ style="fill:none;stroke:#000000;stroke-width:2;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-start:url(#Arrow1Lstart);marker-end:none"
+ d="m 330.91385,265.69034 -0.0545,-66.31157 277.77298,0.71801 1.67368,532.6715 -278.99992,0.50508 -0.16325,-78.41117"
+ id="path3814"
+ inkscape:connector-curvature="0"
+ sodipodi:nodetypes="cccccc" />
+ <text
+ xml:space="preserve"
+ style="font-size:40px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+ x="337.14874"
+ y="456.18143"
+ id="text3800-8"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ x="337.14874"
+ y="456.18143"
+ id="tspan3004-9">Live</tspan><tspan
+ sodipodi:role="line"
+ x="337.14874"
+ y="490.23343"
+ id="tspan3008-3"
+ style="font-size:24px">(Nb<tspan
+ style="font-size:24px;baseline-shift:sub"
+ id="tspan3026-4">mm</tspan> work-items)</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:40px;font-style:normal;font-weight:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+ x="337.14874"
+ y="586.70294"
+ id="text3800-8-7"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ x="337.14874"
+ y="586.70294"
+ id="tspan3004-9-3">Gathering</tspan><tspan
+ sodipodi:role="line"
+ x="337.14874"
+ y="620.75494"
+ id="tspan3008-3-0"
+ style="font-size:24px">(Nb<tspan
+ style="font-size:24px;baseline-shift:sub"
+ id="tspan3026-4-6">om</tspan> work-items)</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+ x="14.549108"
+ y="404.36218"
+ id="text3082"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan3084"
+ x="14.549108"
+ y="404.36218"
+ style="font-size:32px">barrier</tspan></text>
+ <text
+ xml:space="preserve"
+ style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+ x="24.285713"
+ y="509.50504"
+ id="text3086"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan3088"
+ x="24.285713"
+ y="509.50504" /></text>
+ <text
+ xml:space="preserve"
+ style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
+ x="14.549108"
+ y="536.86328"
+ id="text3082-8"
+ sodipodi:linespacing="125%"><tspan
+ sodipodi:role="line"
+ id="tspan3084-6"
+ x="14.549108"
+ y="536.86328"
+ style="font-size:32px">barrier</tspan></text>
+ </g>
+</svg>
\@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}}
\@writefile{toc}{\author{Christophe Guyeux}{}}
\@writefile{loa}{\addvspace {10\p@ }}
-\@writefile{toc}{\contentsline {chapter}{\numberline {16}Pseudorandom Number Generator on GPU}{363}}
+\@writefile{toc}{\contentsline {chapter}{\numberline {17}Pseudorandom Number Generator on GPU}{389}}
\@writefile{lof}{\addvspace {10\p@ }}
\@writefile{lot}{\addvspace {10\p@ }}
-\newlabel{chapter18}{{16}{363}}
-\@writefile{toc}{\contentsline {section}{\numberline {16.1}Introduction}{363}}
-\@writefile{toc}{\contentsline {section}{\numberline {16.2}Basic Remindees}{365}}
-\newlabel{section:BASIC RECALLS}{{16.2}{365}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.1}A Short Presentation of Chaos}{365}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.2}On Devaney's Definition of Chaos}{365}}
-\newlabel{sec:dev}{{16.2.2}{365}}
-\newlabel{Devaney}{{16.1}{365}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.3}Chaotic iterations}{366}}
-\newlabel{subsection:Chaotic iterations}{{16.2.3}{366}}
-\newlabel{Chaotic iterations}{{2}{366}}
-\newlabel{eq:generalIC}{{16.4}{367}}
-\newlabel{equation Oplus}{{16.5}{367}}
-\@writefile{toc}{\contentsline {section}{\numberline {16.3}Toward Efficiency and Improvement for CI PRNG}{367}}
-\newlabel{sec:efficient PRNG}{{16.3}{367}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{367}}
-\newlabel{algo:seqCIPRNG}{{16.1}{367}}
-\@writefile{lol}{\contentsline {lstlisting}{\numberline {16.1}C code of the sequential PRNG based on chaotic iterations}{367}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{368}}
-\newlabel{sec:efficient PRNG gpu}{{16.3.2}{368}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.3}Naive Version for GPU}{368}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {16}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{369}}
-\newlabel{algo:gpu_kernel}{{16}{369}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.4}Improved Version for GPU}{369}}
-\newlabel{IR}{{17}{370}}
-\@writefile{loa}{\contentsline {algocf}{\numberline {17}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{370}}
-\newlabel{algo:gpu_kernel2}{{17}{370}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.5}Chaos Evaluation of the Improved Version}{370}}
-\@writefile{toc}{\contentsline {section}{\numberline {16.4}Experiments}{371}}
-\newlabel{sec:experiments}{{16.4}{371}}
-\@writefile{toc}{\contentsline {section}{\numberline {16.5}Summary}{371}}
-\@writefile{lof}{\contentsline {figure}{\numberline {16.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{372}}
-\newlabel{fig:time_xorlike_gpu}{{16.1}{372}}
-\@writefile{toc}{\contentsline {section}{Bibliography}{373}}
+\newlabel{chapter18}{{17}{389}}
+\@writefile{toc}{\contentsline {section}{\numberline {17.1}Introduction}{389}}
+\@writefile{toc}{\contentsline {section}{\numberline {17.2}Basic Remindees}{391}}
+\newlabel{section:BASIC RECALLS}{{17.2}{391}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {17.2.1}A Short Presentation of Chaos}{391}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {17.2.2}On Devaney's Definition of Chaos}{391}}
+\newlabel{sec:dev}{{17.2.2}{391}}
+\newlabel{Devaney}{{17.1}{391}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {17.2.3}Chaotic iterations}{392}}
+\newlabel{subsection:Chaotic iterations}{{17.2.3}{392}}
+\newlabel{Chaotic iterations}{{2}{392}}
+\newlabel{eq:generalIC}{{17.4}{393}}
+\newlabel{equation Oplus}{{17.5}{393}}
+\@writefile{toc}{\contentsline {section}{\numberline {17.3}Toward Efficiency and Improvement for CI PRNG}{393}}
+\newlabel{sec:efficient PRNG}{{17.3}{393}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {17.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{393}}
+\newlabel{algo:seqCIPRNG}{{17.1}{393}}
+\@writefile{lol}{\contentsline {lstlisting}{\numberline {17.1}C code of the sequential PRNG based on chaotic iterations}{393}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {17.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{394}}
+\newlabel{sec:efficient PRNG gpu}{{17.3.2}{394}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {17.3.3}Naive Version for GPU}{394}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {17}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{395}}
+\newlabel{algo:gpu_kernel}{{17}{395}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {17.3.4}Improved Version for GPU}{395}}
+\newlabel{IR}{{18}{396}}
+\@writefile{loa}{\contentsline {algocf}{\numberline {18}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{396}}
+\newlabel{algo:gpu_kernel2}{{18}{396}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {17.3.5}Chaos Evaluation of the Improved Version}{396}}
+\@writefile{toc}{\contentsline {section}{\numberline {17.4}Experiments}{397}}
+\newlabel{sec:experiments}{{17.4}{397}}
+\@writefile{toc}{\contentsline {section}{\numberline {17.5}Summary}{397}}
+\@writefile{lof}{\contentsline {figure}{\numberline {17.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{398}}
+\newlabel{fig:time_xorlike_gpu}{{17.1}{398}}
+\@writefile{toc}{\contentsline {section}{Bibliography}{399}}
\@setckpt{Chapters/chapter18/ch18}{
-\setcounter{page}{375}
+\setcounter{page}{401}
\setcounter{equation}{5}
-\setcounter{enumi}{2}
+\setcounter{enumi}{3}
\setcounter{enumii}{0}
\setcounter{enumiii}{0}
\setcounter{enumiv}{17}
\setcounter{footnote}{2}
\setcounter{mpfootnote}{0}
\setcounter{part}{6}
-\setcounter{chapter}{16}
+\setcounter{chapter}{17}
\setcounter{section}{5}
\setcounter{subsection}{0}
\setcounter{subsubsection}{0}
\setcounter{lstnumber}{15}
\setcounter{ContinuedFloat}{0}
\setcounter{AlgoLine}{14}
-\setcounter{algocfline}{17}
-\setcounter{algocfproc}{17}
-\setcounter{algocf}{17}
+\setcounter{algocfline}{18}
+\setcounter{algocfproc}{18}
+\setcounter{algocf}{18}
\setcounter{nprt@mantissa@digitsbefore}{0}
\setcounter{nprt@mantissa@digitsafter}{0}
\setcounter{nprt@exponent@digitsbefore}{0}
bibtex bu14
bibtex bu15
bibtex bu16
+ bibtex bu17
makeindex ${BOOK}.idx
pdflatex ${BOOK}