From: Raphael Couturier Date: Sat, 6 Apr 2013 05:30:00 +0000 (+0200) Subject: add ch17 X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/book_gpu.git/commitdiff_plain/bdec1b5087c2ea922fcf62ad0591b8d784ddf3b7?ds=inline add ch17 --- diff --git a/BookGPU/BookGPU.tex b/BookGPU/BookGPU.tex index 4c27d29..3cea1e7 100755 --- a/BookGPU/BookGPU.tex +++ b/BookGPU/BookGPU.tex @@ -187,6 +187,7 @@ \include{Chapters/chapter15/ch15} \include{Chapters/chapter16/ch16} \part{Other} +\include{Chapters/chapter17/ch17} \include{Chapters/chapter18/ch18} \include{Chapters/chapter19/ch19} diff --git a/BookGPU/Chapters/chapter17/biblio.bib b/BookGPU/Chapters/chapter17/biblio.bib new file mode 100644 index 0000000..c951cb0 --- /dev/null +++ b/BookGPU/Chapters/chapter17/biblio.bib @@ -0,0 +1,728 @@ +% This file was created with JabRef 2.6. +% Encoding: ISO8859_1 + +@incollection{Odell:2003:RRD:1807559.1807562, + author = {Odell, James J. and Van Dyke Parunak, H. and Fleischer, Mitchell}, + chapter = {The role of roles in designing effective agent organizations}, + title = {Software engineering for large-scale multi-agent systems}, + editor = {Garcia, Alessandro and Lucena, Carlos and Zambonelli, Franco and Omicini, Andrea and Castro, Jaelson}, + year = {2003}, + isbn = {3-540-08772-9}, + pages = {27--38}, + numpages = {12}, + url = {http://dl.acm.org/citation.cfm?id=1807559.1807562}, + acmid = {1807562}, + publisher = {Springer-Verlag}, + address = {Berlin, Heidelberg}, +} + +@INPROCEEDINGS{Aaby:2010:ESA:1808143.1808181, + author = {Aaby, Brandon G. and Perumalla, Kalyan S. and Seal, Sudip K.}, + title = {Efficient simulation of agent-based models on multi-GPU and multi-core + clusters}, + booktitle = {Proceedings of the 3rd International ICST Conference on Simulation + Tools and Techniques}, + year = {2010}, + series = {SIMUTools '10}, + pages = {29:1--29:10}, + address = {ICST, Brussels, Belgium, Belgium}, + publisher = {ICST (Institute for Computer Sciences, Social-Informatics and Telecommunications + Engineering)}, + acmid = {1808181}, + articleno = {29}, + doi = {http://dx.doi.org/10.4108/ICST.SIMUTOOLS2010.8822}, + isbn = {978-963-9799-87-5}, + keywords = {CUDA, GPU, MPI, agent-based simulation, cluster, computational hierarchy, + latency hiding, multi-core, threads}, + location = {Torremolinos, Malaga, Spain}, + numpages = {10}, + url = {http://dx.doi.org/10.4108/ICST.SIMUTOOLS2010.8822} +} + +@MISC{mason_home, + title = {{MASON Multiagent Simulation Toolkit}}, + howpublished = "\url{http://cs.gmu.edu/~eclab/projects/mason/}", + note = "[12-oct-2011]" +} + +@MISC{cuda_home, + title = {{CUDA: NVIDIA's parallel computing architecture}}, + howpublished = "\url{http://www.nvidia.com/object/cuda_home.html}", + note = "[11-oct-2011]" +} + +@MISC{opencl_home, + title = {{OpenCL: The open standard for parallel programming of heterogeneous systems}}, + howpublished = "\url{http://www.khronos.org/opencl/}", + note = "[11-oct-2011]" +} + +@MISC{jocl_home, + title = {{JOCL: Java bindings for OpenCL}}, + howpublished = "\url{http://www.jocl.org/}", + note = "[11-oct-2011]" +} + +@MISC{jcuda_home, + title = {{JCUDA: Java bindings for CUDA}}, + howpublished = "\url{http://www.jcuda.org/}", + note = "[11-oct-2011]" +} + +@MISC{repast_home2, + title = {{Repast Suite, a family of advanced, free and open source agent-based modeling and simulation platforms}}, + howpublished = "\url{http://repast.sourceforge.net/}", + note = "[05-October-2011]" +} + +@MISC{madkit_home, + title = {{Madkit, an open source modular and 
scalable multiagent platform written in Java}}, + howpublished = "\url{http://www.madkit.org/}", + note = "[05-October-2011]" +} + +@MISC{noAuthor, + howpublished = {{This reference includes one or more of the authors and has been removed from this review version, in compliance with the submission guidelines.}} +} + +@MISC{jade_home, + title = {{JADE (Java Agent DEvelopment Framework)}}, + howpublished = "\url{http://jade.tilab.com/}", + note = "[05-October-2011]" +} + +@MISC{netlogo_home2, + title = {{NetLogo is a multi-agent programmable modeling environment}}, + howpublished = "\url{http://ccl.northwestern.edu/netlogo/}", + note = "[05-October-2011]" +} + + +@MISC{Bleiweiss_2009, + author = {Bleiweiss, Avi}, + title = {Multi Agent Navigation on the GPU}, + howpublished = "\url{developer.download.nvidia.com/presentations/2009/GDC/MultiAgentGPU.pdf}", + year = {2009}, + note = "[05-October-2011]" +} + + +@ARTICLE{BarakA.2010, + author = {Barak A., Ben-Nun T., Levy E. and Shiloh A.}, + title = {A Package for OpenCL Based Heterogeneous Computing on Clusters with + Many GPU Devices}, + journal = {Workshop on Parallel Programming and Applications on Accelerator + Clusters (PPAAC), IEEE Cluster 2010}, + year = {2010}, + owner = {glaville}, + timestamp = {2011.10.04} +} + +@ARTICLE{Bleiweiss_2008, + author = {Bleiweiss, Avi}, + title = {Multi Agent Navigation on the GPU}, + journal = {GDC09 Game Developers Conference 2009}, + year = {2008} +} + +@ARTICLE{C.Cambier2007, + author = {C. Cambier, D. Masse, M. Bousso and E. Perrier}, + title = {An offer versus demand modelling approach to assess the impact of + micro-organisms spatio-temporal dynamics on soil organic matter decomposition + rates}, + journal = {Ecological Modelling}, + year = {2007}, + pages = {301--313}, + owner = {glaville}, + timestamp = {2011.10.04} +} + +@ARTICLE{C.Cambier2006, + author = {C. Cambier, D. Masse, M. Bousso and E. Perrier}, + title = {Mior, A spatially explicit, individual based modeling approach to + simulate soil microbial and organic matter processes}, + journal = {Ecological Modelling}, + year = {2006}, + owner = {glaville}, + timestamp = {2011.10.04} +} + +@INPROCEEDINGS{D'Souza:2009:DAA:1639809.1639831, + author = {D'Souza, Roshan M. and Lysenko, Mikola and Marino, Simeone and Kirschner, + Denise}, + title = {Data-parallel algorithms for agent-based model simulation of tuberculosis + on graphics processing units}, + booktitle = {Proceedings of the 2009 Spring Simulation Multiconference}, + year = {2009}, + series = {SpringSim '09}, + pages = {21:1--21:12}, + address = {San Diego, CA, USA}, + publisher = {Society for Computer Simulation International}, + acmid = {1639831}, + articleno = {21}, + keywords = {GPGPU, agent-based models, integrative systems biology}, + location = {San Diego, California}, + numpages = {12}, + url = {http://dl.acm.org/citation.cfm?id=1639809.1639831} +} + +@ARTICLE{E.Blanchart2009, + author = {E. Blanchart, N. Marilleau, A. Drogoul, E. Perrier, JL. + Chotte and C. Cambier}, + title = {SWORM: an agent-based model to simulate the effect of earthworms + on soil structure}, + journal = {EJSS. European Journal of Soil Science}, + year = {2009}, + pages = {13-21}, + owner = {glaville}, + timestamp = {2011.10.04} +} + +@INPROCEEDINGS{Fischer:2009:GAP:1803298.1803361, + author = {Fischer, Leonardo G. 
and Silveira, Renato and Nedel, Luciana}, + title = {GPU Accelerated Path-Planning for Multi-agents in Virtual Environments}, + booktitle = {Proceedings of the 2009 VIII Brazilian Symposium on Games and Digital + Entertainment}, + year = {2009}, + series = {SBGAMES '09}, + pages = {101--110}, + address = {Washington, DC, USA}, + publisher = {IEEE Computer Society}, + acmid = {1803361}, + doi = {http://dx.doi.org/10.1109/SBGAMES.2009.20}, + isbn = {978-0-7695-3963-8}, + keywords = {Path-planning, GPGPU, NVIDIA CUDA, Agent Simulation}, + numpages = {10}, + url = {http://dx.doi.org/10.1109/SBGAMES.2009.20} +} + +@INPROCEEDINGS{Gomez-Luna:2009:PVS:1616772.1616869, + author = {G\'{o}mez-Luna, J. and Gonz\'{a}lez-Linares, J.-M. + and Benavides, J.-I. and Guil, N.}, + title = {Parallelization of a Video Segmentation Algorithm on CUDA---Enabled + Graphics Processing Units}, + booktitle = {15th Euro-Par Conference}, + year = {2009}, + pages = {924--935}, + address = {Berlin, Heidelberg}, + publisher = {Springer-Verlag}, + acmid = {1616869}, + doi = {http://dx.doi.org/10.1007/978-3-642-03869-3%5F85}, + isbn = {978-3-642-03868-6}, + keywords = {CUDA, Canny, Generalized Hough Transform, Video Segmentation}, + location = {Delft, The Netherlands}, + numpages = {12}, + url = {http://dx.doi.org/10.1007/978-3-642-03869-3%5F85} +} + +@incollection {Maitre2009, + author = {Maitre, Ogier and Lachiche, Nicolas and Clauss, Philippe and Baumes, Laurent and Corma, Avelino and Collet, Pierre}, + affiliation = {LSIIT University of Strasbourg, France}, + title = {Efficient Parallel Implementation of Evolutionary Algorithms on GPGPU Cards}, + booktitle = {Euro-Par 2009 Parallel Processing}, + series = {Lecture Notes in Computer Science}, + editor = {Sips, Henk and Epema, Dick and Lin, Hai-Xiang}, + publisher = {Springer Berlin / Heidelberg}, + isbn = {978-3-642-03868-6}, + keyword = {Computer Science}, + pages = {974-985}, + volume = {5704}, + url = {http://dx.doi.org/10.1007/978-3-642-03869-3%5F89}, + note = {10.1007/978-3-642-03869-3%5F89}, + year = {2009} +} + + +@INPROCEEDINGS{Gutknecht2000, + author = {Gutknecht, O. and Ferber, J.}, + title = {MadKit: a generic multi-agent platform}, + booktitle = {Proceedings of the fourth international conference on Autonomous + agents}, + year = {2000}, + series = {AGENTS '00}, + pages = {78--79}, + address = {New York, NY, USA}, + publisher = {ACM}, + acmid = {337048}, + doi = {http://doi.acm.org/10.1145/336595.337048}, + isbn = {1-58113-230-1}, + location = {Barcelona, Spain}, + numpages = {2}, + url = {http://doi.acm.org/10.1145/336595.337048} +} + +@INPROCEEDINGS{J.Breitbart2009, + author = {J. Breitbart, J.; + + Res. Group Programming Languages / Methodologies, Univ. Kassel, Kassel, + Germany}, + title = {Data structure design for GPU based heterogeneous systems}, + booktitle = {High Performance Computing \& Simulation, 2009. HPCS '09. International + Conference on}, + year = {2009}, + pages = {44 - 51}, + owner = {glaville}, + timestamp = {2011.10.04} +} + +@BOOK{Jean-PierreTreuil2008, + title = {Mod{\'e}lisation et Simulation {\`a} base d'agents}, + publisher = {Dunot}, + year = {2008}, + editor = {Dunot}, + author = {Jean-Pierre Treuil, Alexis Drogoul, Jean-Daniel Zucker}, + owner = {glaville}, + timestamp = {2011.10.04} +} + +@ARTICLE{DBLP:journals/corr/abs-1005-2581, + author = {Kamran Karimi and Neil G. 
Dickson and Firas Hamze}, + title = {A Performance Comparison of CUDA and OpenCL}, + journal = {CoRR}, + year = {2010}, + volume = {abs/1005.2581}, + bibsource = {DBLP, http://dblp.uni-trier.de}, + ee = {http://arxiv.org/abs/1005.2581} +} + +@INPROCEEDINGS{komatsu10iwapt, + author = {Kazuhiko Komatsu and Katsuto Sato and Yusuke Arai and Kentaro Koyama + and Hiroyuki Takizawa and Hiroaki Kobayashi}, + title = {Evaluating Performance and Portability of OpenCL Programs}, + booktitle = {The Fifth International Workshop on Automatic Performance Tuning}, + year = {2010}, + month = {June} +} + +@INPROCEEDINGS{Mistry:2011:APF:1964179.1964193, + author = {Mistry, Perhaad and Gregg, Chris and Rubin, Norman and Kaeli, David + and Hazelwood, Kim}, + title = {Analyzing program flow within a many-kernel OpenCL application}, + booktitle = {Proceedings of the Fourth Workshop on General Purpose Processing + on Graphics Processing Units}, + year = {2011}, + series = {GPGPU-4}, + pages = {10:1--10:8}, + address = {New York, NY, USA}, + publisher = {ACM}, + acmid = {1964193}, + articleno = {10}, + doi = {http://doi.acm.org/10.1145/1964179.1964193}, + isbn = {978-1-4503-0569-3}, + keywords = {GPGPU, OpenCL, SURF, computer vision, heterogeneous computing, performance + tools, profiling}, + location = {Newport Beach, California}, + numpages = {8}, + url = {http://doi.acm.org/10.1145/1964179.1964193} +} + +@ARTICLE{P.Du2010, + author = {P. Du, R. Weber, P. Luszczek, S. Tomov, G. Peterson and J. Dongarra,}, + title = {From CUDA to OpenCL: Towards a Performance-portable Solution for + Multi-platform GPU Programming}, + journal = {Parallel Computing, Aug. 2010}, + year = {2010}, + owner = {glaville}, + timestamp = {2011.10.04} +} + +@INPROCEEDINGS{Richmond2008, + author = {Richmond, P., Romano, D. M.}, + title = {Agent Based GPU, A Real-time 3D Simulation and Interactive Visualisation + Framework for Massive Agent Based Modelling on the GPU}, + booktitle = {Proceedings International Workshop on Supervisualisation 2008}, + year = {2008}, + owner = {glaville}, + timestamp = {2011.10.04} +} + +@CONFERENCE{S.Rul2010, + author = {S. Rul, H. Vandierendonck, J. D'Haene, and K. D. Bosschere}, + title = {An experimental study on performance portability of OpenCL kernels. 
+ In Symposium on Application Accelerators in High Performance Computing}, + booktitle = {Symposium on Application Accelerators in High Performance Computing + (SAAHPC '10)}, + year = {2010}, + owner = {glaville}, + timestamp = {2011.10.04} +} + +@INPROCEEDINGS{Silveira:2010:PRG:1948395.1948446, + author = {Silveira, Renato and Fischer, Leonardo and Ferreira, Jos\'{e} Ant\^{o}nio + Salini and Prestes, Edson and Nedel, Luciana}, + title = {Path-planning for RTS games based on potential fields}, + booktitle = {Proceedings of the Third international conference on Motion in games}, + year = {2010}, + series = {MIG'10}, + pages = {410--421}, + address = {Berlin, Heidelberg}, + publisher = {Springer-Verlag}, + acmid = {1948446}, + isbn = {3-642-16957-0, 978-3-642-16957-1}, + keywords = {autonomous agent, navigation, path-planning}, + location = {Utrecht, The Netherlands}, + numpages = {12}, + url = {http://dl.acm.org/citation.cfm?id=1948395.1948446} +} + +@ARTICLE{CAV:CAV261, + author = {Silveira, Renato and Prestes, Edson and Nedel, Luciana P.}, + title = {Managing coherent groups}, + journal = {Computer Animation and Virtual Worlds}, + year = {2008}, + volume = {19}, + pages = {295--305}, + number = {3-4}, + doi = {10.1002/cav.261}, + issn = {1546-427X}, + keywords = {groups simulation, motion planning, formation-keeping, potential field}, + publisher = {John Wiley \& Sons, Ltd.}, + url = {http://dx.doi.org/10.1002/cav.261} +} + +@ARTICLE{Strippgen_Nagel_2009, + author = {Strippgen, David and Nagel, Kai}, + title = {Multi-agent traffic simulation with CUDA}, + journal = {2009 International Conference on High Performance Computing Simulation}, + year = {2009}, + pages = {106--114}, + publisher = {Ieee}, + url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=5192895} +} + +@TECHREPORT{Thrane2005, + author = {Niels Thrane and Lars Ole Simonsen and Advisor Peter Ørbæk}, + title = {A comparison of acceleration structures for GPU assisted ray tracing}, + year = {2005} +} + +@BOOK{2009, + title = {NVIDIA CUDA C Programming Best Practices Guide + + CUDA Toolkit 2.3}, + publisher = {NVIDIA Corporation}, + year = {2009}, + owner = {glaville}, + timestamp = {2011.03.24} +} + +@InProceedings{Edmonds04, +author = {Edmonds, B. and Moss, S.}, +title = {"From KISS to KIDS: An" anti-simplistic" Modeling Approach"}, +booktitle = {MABS 2004}, +pages = {130-144}, +year = {2004} +} + +@PhdThesis{Amouroux11, +author = {Amouroux, E.}, +title = {"KIMONO: using the modelling process as an aid for research orientation"}, +school = {UPMC}, +year = {2011}, +address = {Paris, France}, +} + +@Book{tdz2008:book, + AUTHOR = {Treuil, J.-P. and Drogoul, A. and Zucker, J.-D.}, + TITLE = {Mod{\'e}lisation et simulation {\`a} base d'agents: Approches particulaires, mod{\`e}les {\`a} base d'agents, de la mise en pratique aux questions th{\'e}oriques}, + YEAR = {2008}, + PUBLISHER = {Dunod} +} + +@Article{Sean05, +author = {Sean, L. and Cioffi-Revilla, C. and Panait, L. and Sullivan, K. and Balan, G.}, +title = {MASON: A Multi-Agent Simulation Environment}, +journal = {Simulation: Transactions of the society for Modeling and Simulation International}, +year = {2005}, +volume = {82}, +number = {7}, +pages = {517-527} +} + +@Article{BLA09, + AUTHOR = {Blanchart, E. and Marilleau, N. and Drogoul, A. and Perrier, E. and Chotte, JL. and Cambier, C.}, + TITLE = {SWORM: an agent-based model to simulate the effect of earthworms on soil structure}, + YEAR = {2009}, + JOURNAL = {EJSS. 
European Journal of Soil Science}, + VOLUME = {60}, + PAGES = {13-21} +} + + +@inproceedings{Vowel02, + author = {Da Silva Joao Luis T. and + Demazeau, Yves}, + title = {Vowels co-ordination model}, + booktitle = {AAMAS}, + year = {2002}, + pages = {1129-1136}, + ee = {http://doi.acm.org/10.1145/545056.545083}, + bibsource = {DBLP, http://dblp.uni-trier.de} +} + + @inproceedings{MAR06, + author = { Marilleau, N. and + Lang, C. and + Chatonnay, P. and + Philippe, L.}, + title = {An Agent Based Framework for Urban Mobility Simulation}, + booktitle = {PDP}, + year = {2006}, + pages = {355-361}, + ee = {http://doi.ieeecomputersociety.org/10.1109/PDP.2006.19}, + address = {France}, + bibsource = {DBLP, http://dblp.uni-trier.de} +} + + +@Article{perrier, + AUTHOR = {Bird, N and Perrier, E.}, + TITLE = {The PSF model and soil density scaling}, + YEAR = {2003}, + JOURNAL = {European Journal of Soil Science}, + VOLUME = {54}, + NUMBER = {3}, + PAGES = {467-476} +} + + +@inproceedings{dem02, + author = {Joao Luis T. da Silva and + Yves Demazeau}, + title = {Vowels co-ordination model}, + booktitle = {AAMAS}, + year = {2002}, + pages = {1129-1136}, + address = {Italy}, + ee = {http://doi.acm.org/10.1145/545056.545083}, + bibsource = {DBLP, http://dblp.uni-trier.de} +} +@Article{netlogo_home, +author = {Sklar, E.}, +title = {NetLogo, a multi-agent simulation environment}, +journal = {Artificial Life}, +year = {2011}, +volume = {13}, +number = {3}, +pages = {303-311} +} + +@InProceedings{repast_home, +author = {North, M.J. and Howe, T.R. and Collier, N.T. and Vos, J.R}, +title = {A Declarative Model Assembly Infrastructure for Verification and Validation}, +booktitle = {Advancing Social Simulation: The First World Congress}, +year = {2007}, +editor = {Springer}, +address = {Heidelberg, FRG} +} + + +@INPROCEEDINGS{Guy09clearpath, + author = {Stephen J. Guy and Jatin Chhugani and Changkyu Kim and Nadathur Satish and Ming C. Lin and Dinesh Manocha and Pradeep Dubey}, + title = {ClearPath: Highly Parallel Collision Avoidance for Multi-Agent Simulation}, + booktitle = {ACM SIGGRAPH/EUROGRAPHICS SYMPOSIUM ON COMPUTER ANIMATION}, + year = {2009}, + pages = {177--187}, + publisher = {ACM} +} + +@inproceedings{Kiran2010, + author = {Kiran, Mariam and Richmond, Paul and Holcombe, Mike and Chin, Lee Shawn and Worth, David and Greenough, Chris}, + title = {FLAME: simulating large populations of agents on parallel hardware architectures}, + booktitle = {Proceedings of the 9th International Conference on Autonomous Agents and Multiagent Systems: volume 1 - Volume 1}, + series = {AAMAS '10}, + year = {2010}, + isbn = {978-0-9826571-1-9}, + location = {Toronto, Canada}, + pages = {1633--1636}, + numpages = {4}, + url = {http://dl.acm.org/citation.cfm?id=1838206.1838517}, + acmid = {1838517}, + publisher = {International Foundation for Autonomous Agents and Multiagent Systems}, + address = {Richland, SC}, + keywords = {simulation techniques, tools and environments}, +} + +@inproceedings{Dsouza2007, +author={D'SOUZA R M, Lysenko M, Rahmani K}, +title={Sugarscape on Steroids: Simulating Over a Million Agents at Interactive Rates}, +year={2007}, +booktitle={Proceedings of the Agent 2007 Conference}, +adress={Chicago, IL} +} + +@Book{Schweitzer2003, +author = {Schweitzer, F. and Farmer, J. 
D.}, +title = {Brownian Agents and Active Particles: Collective Dynamics in the Natural and Social Sciences}, +publisher = {Springer}, +year = {2003}, +series = {Complexity} +} + +@Book{Ferber99, +author = {Ferber, J.}, +title = { Multi-Agent Systems. An Introduction to Distributed Artificial Intelligence. }, +publisher = {Addison Wesley}, +year = {1999}, +OPTaddress = {London} +} + + +@InProceedings{Cosenza2011, +author = {Cosenza,B and Cordasco, G and De Chiara, R. and Scarano, V.}, +title = {Distributed Load Balancing for Parallel Agent-based Simulations}, +booktitle = {19th Euromicro International Conference on Parallel, Distributed and Network-Based Computing}, +year = {2011}, +address = {Ayia Napa, Cyprus} +} + +@article{Chuffart2010, + author = {Chuffart, F and + Dumoulin, N and + Faure, T and + Deffuant, G}, + title = {SimExplorer: Programming Experimental Designs on Models + and Managing Quality of Modelling Process}, + journal = {IJAEIS}, + volume = {1}, + number = {1}, + year = {2010}, + pages = {55-68}, + ee = {http://dx.doi.org/10.4018/jaeis.2010101304}, + bibsource = {DBLP, http://dblp.uni-trier.de} +} + + +@inproceedings{Blanchart11, + author = {Blanchart, E. and + Cambier, C. and + Canape, C. and + Gaudou, B. and + Ho, T.-N. and + Ho, T.-V. and + Lang, C. and + Michel, F. and + Marilleau, N. and + Philippe, L.}, + title = {EPIS: A Grid Platform to Ease and Optimize Multi-agent Simulators + Running}, + booktitle = {PAAMS}, + year = {2011}, + publisher = {Springer}, + series = {Advances in Intelligent and Soft Computing}, + volume = {88}, + pages = {129-134}, + ee = {http://dx.doi.org/10.1007/978-3-642-19875-5_17}, + bibsource = {DBLP, http://dblp.uni-trier.de} +} + + +@inproceedings{Dziekonski2011, +author={ Dziekonski, A. and Lamecki, A. and Mrozowski, M.}, +title={A memory efficient and fast sparse matrix vector product on a gpu}, +year={2011}, +booktitle={Progress In Electromagnetics Research, Vol. 116}, +pages={49--63}, +doi={10.2528/PIER11031607} +} + +@article{couturier2011, + author = {Couturier, Rapha{\"e}l and Domas, St{\'e}phane}, + affiliation = {LIFC, IUT Belfort-Montb{\'e}liard, University of Franche Comte, BP 527, 90016 Belfort CEDEX, France}, + title = {Sparse systems solving on GPUs with GMRES}, + journal = {The Journal of Supercomputing}, + publisher = {Springer Netherlands}, + issn = {0920-8542}, + keyword = {Computer Science}, + pages = {1-13}, + year={2011}, + url = {http://dx.doi.org/10.1007/s11227-011-0562-z}, + note = {DOI 10.1007/s11227-011-0562-z} +} + +@article{Horton2011, +author={Horton, M. and Tomov, S. and Dongarra, J.}, +title={A Class of Hybrid LAPACK Algorithms for Multicore and GPU Architectures}, +journal={Symposium for Application Accelerators in High Performance Computing (SAAHPC'11)}, +year={2011} +} + +@inproceedings{Aaby10, + author = {Aaby, Brandon G. and Perumalla, Kalyan S. 
and Seal, Sudip K.}, + title = {Efficient simulation of agent-based models on multi-GPU and multi-core clusters}, + booktitle = {Proceedings of the 3rd International ICST Conference on Simulation Tools and Techniques}, + series = {SIMUTools '10}, + year = {2010}, + isbn = {978-963-9799-87-5}, + location = {Torremolinos, Malaga, Spain}, + pages = {29:1--29:10}, + articleno = {29}, + numpages = {10}, + url = {http://dx.doi.org/10.4108/ICST.SIMUTOOLS2010.8822}, + doi = {http://dx.doi.org/10.4108/ICST.SIMUTOOLS2010.8822}, + acmid = {1808181}, + publisher = {ICST (Institute for Computer Sciences, Social-Informatics and Telecommunications Engineering)}, + address = {ICST, Brussels, Belgium, Belgium}, + keywords = {CUDA, GPU, MPI, agent-based simulation, cluster, computational hierarchy, latency hiding, multi-core, threads} +} + + +@article{Durstenfeld1964, + author = {Durstenfeld, Richard}, + title = {Algorithm 235: Random permutation}, + journal = {Commun. ACM}, + volume = {7}, + issue = {7}, + month = {July}, + year = {1964}, + issn = {0001-0782}, + pages = {420--}, + url = {http://doi.acm.org/10.1145/364520.364540}, + doi = {http://doi.acm.org/10.1145/364520.364540}, + acmid = {364540}, + publisher = {ACM}, + address = {New York, NY, USA} +} + +@InBook{Collier11, +ALTauthor = {Collier, N.T., and North, M.J. }, +title = {Repast SC++: A Platform for Large-scale Agent-based Modeling}, +chapter = {Large-Scale Computing Techniques for Complex System Simulation}, +publisher = {Wiley}, +year = {2011} +} + +@inproceedings{lmlm+12:ip, +inhal = {no}, +domainehal = {INFO:INFO_NI, INFO:INFO_DC, INFO:INFO_IU}, +equipe = {cartoon}, +classement = {ACTI}, +author = {Laville, Guillaume and Mazouzi, Kamel and Lang, Christophe and Marilleau, Nicolas and Philippe, Laurent}, +title = {Using {GPU} for Multi-agent Multi-scale Simulations}, +booktitle = {DCAI'12, 9-th Int. Conf. on Advances in Intelligent and Soft Computing}, +pages = {197--204}, +doi = {10.1007/978-3-642-28765-7_23}, +url = {http://dx.doi.org/10.1007/978-3-642-28765-7_23}, +series = {Advances in Intelligent and Soft Computing}, +volume = 151, +address = {Salamanca, Spain}, +publisher = {Springer}, +month = mar, +year = 2012, +} + +@inproceedings{lmlm+13:ip, +inhal = {no}, +domainehal = {INFO:INFO_NI, INFO:INFO_DC, INFO:INFO_IU}, +equipe = {cartoon}, +classement = {ACTI}, +corerank = {C}, +author = {Laville, Guillaume and Mazouzi, Kamel and Lang, Christophe and Marilleau, Nicolas and Philippe, Laurent}, +title = {Using {GPU} for Multi-agent Soil Simulation}, +booktitle = {PDP 2013, 21st Euromicro International Conference on Parallel, Distributed and Network-based Computing}, +pages = {***--***}, +address = {Belfast, Ireland}, +publisher = {IEEE Computer Society Press}, +month = feb, +year = 2013, +} diff --git a/BookGPU/Chapters/chapter17/ch17.aux b/BookGPU/Chapters/chapter17/ch17.aux new file mode 100644 index 0000000..032ebff --- /dev/null +++ b/BookGPU/Chapters/chapter17/ch17.aux @@ -0,0 +1,116 @@ +\relax +\@writefile{toc}{\author{G. Laville, C. Lang, K. Mazouzi, N. Marilleau, B. Herrmann, L. 
Philippe}{}} +\@writefile{loa}{\addvspace {10\p@ }} +\@writefile{toc}{\contentsline {chapter}{\numberline {16}Implementing MAS on GPU}{363}} +\@writefile{lof}{\addvspace {10\p@ }} +\@writefile{lot}{\addvspace {10\p@ }} +\newlabel{chapter17}{{16}{363}} +\@writefile{toc}{\contentsline {section}{\numberline {16.1}Introduction}{363}} +\newlabel{ch17:intro}{{16.1}{363}} +\@writefile{toc}{\contentsline {section}{\numberline {16.2}Running Agent-Based Simulations}{365}} +\newlabel{ch17:ABM}{{16.2}{365}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.1}Multi-agent systems and parallelism}{365}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.2}MAS Implementation on GPU}{366}} +\newlabel{ch17:subsec:gpu}{{16.2.2}{366}} +\@writefile{toc}{\contentsline {section}{\numberline {16.3}A first practical example}{368}} +\newlabel{ch17:sec:1stmodel}{{16.3}{368}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.1}The Collembola model}{368}} +\newlabel{ch17:subsec:collembolamodel}{{16.3.1}{368}} +\@writefile{lof}{\contentsline {figure}{\numberline {16.1}{\ignorespaces Evolution algorithm of Collembola model\relax }}{369}} +\newlabel{ch17:fig:collem_algorithm}{{16.1}{369}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.2}Collembola Implementation}{369}} +\newlabel{ch17:listing:collembola-diffuse}{{16.1}{369}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {16.1}Collembola OpenCL Diffusion kernel}{369}} +\newlabel{ch17:listing:collembola-reduc}{{16.2}{370}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {16.2}Collembola OpenCL reduction kernel}{370}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.3}Collembola performance}{371}} +\@writefile{lof}{\contentsline {figure}{\numberline {16.2}{\ignorespaces Performances CPU et GPU du mod\IeC {\`e}le Collemboles\relax }}{371}} +\newlabel{ch17:fig:mior_perfs_collem}{{16.2}{371}} +\@writefile{toc}{\contentsline {section}{\numberline {16.4}Second example}{372}} +\newlabel{ch17:sec:2ndmodel}{{16.4}{372}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.4.1}The MIOR model}{372}} +\newlabel{ch17:subsec:miormodel}{{16.4.1}{372}} +\@writefile{loa}{\contentsline {algocf}{\numberline {16}{\ignorespaces Evolution step of each Meta-Mior (microbial colony) agent\relax }}{373}} +\newlabel{ch17:seqalgo}{{16}{373}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.4.2}MIOR Implementation}{373}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {16.4.2.1}Execution mapping on GPU}{373}} +\@writefile{lof}{\contentsline {figure}{\numberline {16.3}{\ignorespaces Execution distribution retained on GPU\relax }}{374}} +\newlabel{ch17:fig:gpu_distribution}{{16.3}{374}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {16.4.2.2}Data structures translation}{374}} +\newlabel{ch17:subsec:datastructures}{{16.4.2.2}{374}} +\newlabel{ch17:listing:mior_data_structures}{{16.3}{375}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {16.3}Main data structures used in a MIOR simulation}{375}} +\@writefile{lof}{\contentsline {figure}{\numberline {16.4}{\ignorespaces Compact representation of the topology of a MIOR simulation\relax }}{376}} +\newlabel{ch17:fig:csr_representation}{{16.4}{376}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {16.4.2.3}Critical resources access management}{376}} +\newlabel{ch17:subsec:concurrency}{{16.4.2.3}{376}} +\newlabel{ch17:listing:mior_kernels}{{16.4}{376}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline 
{16.4}Main MIOR kernel}{376}} +\newlabel{ch17:fig:mior_launcher}{{16.5}{377}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {16.5}MIOR simulation launcher}{377}} +\@writefile{toc}{\contentsline {subsubsection}{\numberline {16.4.2.4}Termination detection}{378}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.4.3}Performance of MIOR implementations}{378}} +\newlabel{ch17:subsec:miorexperiments}{{16.4.3}{378}} +\@writefile{lof}{\contentsline {figure}{\numberline {16.5}{\ignorespaces CPU and GPU performance on a Tesla C1060 node\relax }}{380}} +\newlabel{ch17:fig:mior_perfs_tesla}{{16.5}{380}} +\@writefile{lof}{\contentsline {figure}{\numberline {16.6}{\ignorespaces CPU and GPU performance on a personal computer with a Geforce 8800GT\relax }}{381}} +\newlabel{ch17:fig:mior_perfs_8800gt}{{16.6}{381}} +\@writefile{toc}{\contentsline {section}{\numberline {16.5}Analysis and recommandations}{381}} +\newlabel{ch17:analysis}{{16.5}{381}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.5.1}Analysis}{381}} +\@writefile{lof}{\contentsline {figure}{\numberline {16.7}{\ignorespaces Execution time of one multi-simulation kernel on the Tesla platform\relax }}{382}} +\newlabel{ch17:fig:monokernel_graph}{{16.7}{382}} +\@writefile{lof}{\contentsline {figure}{\numberline {16.8}{\ignorespaces Total execution time for 1000 simulations on the Tesla platform, while varying the number of simulations for each kernel\relax }}{382}} +\newlabel{ch17:fig:multikernel_graph}{{16.8}{382}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.5.2}MAS execution workflow}{382}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.5.3}Implementation challenges}{383}} +\@writefile{toc}{\contentsline {subsection}{\numberline {16.5.4}MCSMA}{384}} +\newlabel{ch17:Mcsma}{{16.5.4}{384}} +\@writefile{toc}{\contentsline {section}{\numberline {16.6}Conclusion}{385}} +\newlabel{ch17:conclusion}{{16.6}{385}} +\@writefile{toc}{\contentsline {section}{Bibliography}{386}} +\@setckpt{Chapters/chapter17/ch17}{ +\setcounter{page}{389} +\setcounter{equation}{0} +\setcounter{enumi}{3} +\setcounter{enumii}{0} +\setcounter{enumiii}{0} +\setcounter{enumiv}{25} +\setcounter{footnote}{1} +\setcounter{mpfootnote}{0} +\setcounter{part}{6} +\setcounter{chapter}{16} +\setcounter{section}{6} +\setcounter{subsection}{0} +\setcounter{subsubsection}{0} +\setcounter{paragraph}{0} +\setcounter{subparagraph}{0} +\setcounter{figure}{8} +\setcounter{table}{0} +\setcounter{numauthors}{0} +\setcounter{parentequation}{4} +\setcounter{subfigure}{0} +\setcounter{lofdepth}{1} +\setcounter{subtable}{0} +\setcounter{lotdepth}{1} +\setcounter{lstnumber}{21} +\setcounter{ContinuedFloat}{0} +\setcounter{AlgoLine}{17} +\setcounter{algocfline}{16} +\setcounter{algocfproc}{16} +\setcounter{algocf}{16} +\setcounter{nprt@mantissa@digitsbefore}{0} +\setcounter{nprt@mantissa@digitsafter}{0} +\setcounter{nprt@exponent@digitsbefore}{0} +\setcounter{nprt@exponent@digitsafter}{0} +\setcounter{nprt@digitsfirstblock}{0} +\setcounter{nprt@blockcnt}{0} +\setcounter{nprt@cntprint}{0} +\setcounter{proposition}{1} +\setcounter{theorem}{0} +\setcounter{exercise}{0} +\setcounter{example}{0} +\setcounter{definition}{0} +\setcounter{proof}{1} +\setcounter{lstlisting}{5} +} diff --git a/BookGPU/Chapters/chapter17/ch17.tex b/BookGPU/Chapters/chapter17/ch17.tex new file mode 100755 index 0000000..9d458de --- /dev/null +++ b/BookGPU/Chapters/chapter17/ch17.tex @@ -0,0 +1,1049 @@ +\chapterauthor{G. Laville, C. Lang, K. Mazouzi, N. Marilleau, B. 
Herrmann, L. Philippe}{Femto-ST Institute, University of Franche-Comt{\'e}} + +\newlength\mylen +\newcommand\myinput[1]{% + \settowidth\mylen{\KwIn{}}% + \setlength\hangindent{\mylen}% + \hspace*{\mylen}#1\\} + +\chapter{Implementing MAS on GPU} +\label{chapter17} + + +\section{Introduction} +\label{ch17:intro} + +In this chapter we introduce the use of Graphical Processing Units for +multi-agents based systems as an example of a not-so-regular +application that could benefit from the GPU computing +power. Multi-Agent Systems or MAS are a simulation paradigm used to +study the behavior of dynamic systems. Dynamic systems as physical +systems are often modeled by mathematical representations and their +dynamic behavior simulated by differential equations. The simulation +of the system thus often relay on the resolution of a linear system +that can be efficiently computed on a graphical processing unit as +shown in the preceeding chapters. But when the behavior of the system +elements is not uniformly driven by the same law, when these elements +have their own behavior, the modeling process is too complex to rely +on formal expressions. In this context MAS is a recognized approach to +model and simulate systems where individuals have an autonomous +behavior that cannot be simulated by the evolution of a set of +variables driven by mathematical laws. MAS are often used to simulate +natural or collective phenomenons whose individuals are too numerous or +various to provide a unified algorithm describing the system +evolution. The agent-based approach is to divide these complex systems +into individual self-contained entities with their smaller set of +attributes and functions. But, as for mathematical simulations, when +the size of the Multi-Agent System (MAS) increases the need of computing +power and memory also increases. For this reason, multi-agent systems +should benefit from the use of distributed computing +architectures. Clusters and Grids are often identified as the main +solution to increase simulation performance but Graphical Processing +Units (GPU) are also a promising technology with an attractive +performance/cost ratio. + +Conceptually a MAS is a distributed system as it favors the definition +and description of large sets of individuals, the agents, that can be +run in parallel. As a large set of agents could have the same behavior +a SIMD model should fit the simulation execution. Most of the +agent-based simulators are however designed with a sequential scheme +in mind and these simulators seldom use more than one core for their +execution. Due to simulation scheduling constraints, data sharing and +exchange between agents and the huge amount of interactions between +agents and their environment, it is indeed rather difficult to +distribute an agent based simulator, for instance, to take advantage of new +multi-threaded computer architectures. Thus, guidelines and tools +dedicated to MAS paradigm and HPC is now a need for other complex +system communities. Note that, from the described structure (large +number of agents sharing data), we can conclude that MAS would more +easily benefit from many-cores architectures than from other kinds of +parallelism. + +Another key point that advocates for the use of many-core in MAS is +the growing need for multi-scale simulations. Multi-scale simulations +explore problems with interactions between several scales. The +different scales use different granularities of the structure and +potentially different models. 
Most of the time the lower scale +simulations provide results to higher scale simulations. In that case +the execution of the simulations can easily be distributed between the +local cores and a many-core architecture, i.e. a GPU device. + +We explore in this chapter the use of many-core architectures to +execute agent-based simulations. We illustrate our reflexion with two +uses cases: the colembola simulator designed to simulate the diffusion +of collembola between plots of land and the MIOR simulator that +reproduces effects of earthworms on bacteria dynamics in a bulked +soil. In Section \ref{ch17:ABM} we present the work related to MAS and +parallelization with a special focus on many-core use. In sections +\ref{ch17:sec:1stmodel} and \ref{ch17:sec:2ndmodel} we present in +details two multi-agent models, their GPU implementations, the +conducted experiments and their performance results. The first model, +given in Section \ref{ch17:sec:1stmodel}, illustrates the use of a GPU +architecture to speed up the execution of some computation intensive +functions while the main model is still executed on the central +processing unit. The second model, given in Section +\ref{ch17:sec:2ndmodel}, illustrates the use of a GPU architecture to +implement the whole model on the GPU processor which implies deeper +changes in the initial algorithm. In Section \ref{ch17:analysis} we +propose a more general reflexion on these implementations and we +provide some guidelines. Then we conclude in Section +\ref{ch17:conclusion} on the possible generalization of our work. + + +\section{Running Agent-Based Simulations} +\label{ch17:ABM} + +In this section, we present the context of multi-agent systems, MAS, +their parallelization and we report several existing works on using +GPU to simulate multi-agent systems. + +\subsection{Multi-agent systems and parallelism} + +Agent-Based systems are often used to simulate natural or collective +phenomenons whose actors are too numerous or various to provide a +simple unified algorithm describing the studied system +dynamic~\cite{Schweitzer2003}. The implementation of an agent based +simulation usually starts by designing the underlying Agent Based +Model (ABM). Most ABM are based around a few types of entities such as Agents, +Environment and around an Interaction Organization~\cite{Vowel02}. In the +complex system domain, the environment often describes a real space, +its structure (e.g. soil textures and porosities) and its dynamics +(e.g. organic matter decomposition). It is a virtual world in which +agents represent studied entities (e.g. biotic organisms) +evolution. The actual agent is animated by a behavior that can range +between reactivity (only react to external stimuli) and cognition +(lives its own process alongside other individuals). Interaction +and Organization define functions, types and patterns of +communications of their member agents in the +system~\cite{Odell:2003:RRD:1807559.1807562}. Note that, depending on +the MAS, agents can communicate either directly through special +primitives or indirectly through the information stored in the +environment. + +Agent based simulations have been used for more than one decade to +reproduce, understand even predic complex system dynamic. They have proved their interest in various +scientific communities. Nowadays generic agent based frameworks are +promoted such as Repast~\cite{repast_home} or +NetLogo~\cite{netlogo_home} to implement simulators. 
Many ABM such as +the crown model representing a city-wide +scale~\cite{Strippgen_Nagel_2009} tend however to require a large +number of agents to provide a realistic behavior and reliable global +statistics. Moreover, an achieved model analysis needs to run many +simulations identified into an experiment plan to obtain enough +confidence in a simulation. In this case the available computing power +often limits the simulation size and the result range thus requiring +the use of parallelism to explore bigger configurations. + +For that, three major approaches can be identified: +\begin{enumerate} +\item parallelizing experiments plan on a cluster or a grid (one or + few simulations are submitted to each + core)~\cite{Blanchart11,Chuffart2010}. +\item parallelizing the simulator on a cluster (the environment of the + MAS is split and run on several distributed + nodes)~\cite{Cosenza2011,MAR06} +\item optimizing simulator by taking advantage of computer resources + (multi-threading, GPU, and so on) \cite{Aaby10} +\end{enumerate} + +In the first case, experiments are run independently each others and only simulation parameters change between two runs so +that a simple version of an existing simulator can be used. This +approach does not, however, allow to run larger models. +% It does not imply +%model code changes except for Graphical Unit Interface extracting as +%the experiments does not run interactively but in batch. +In the second +and the third case, model and code modifications are necessary. Only few +frameworks however introduce distribution in agent simulation +(Madkit~\cite{Gutknecht2000}, MASON~\cite{Sean05}, repastHPC~\cite{Collier11}) and parallel +implementations are often based on the explicit use of threads using +shared memory~\cite{Guy09clearpath} or cluster libraries such as +MPI~\cite{Kiran2010}. + +Parallelizing a multi-agent simulation is however complex due to space +and time constraints. Multi-agent simulations are usually based on a +synchronous execution: +%by the agents that share the same environment. +at each time step, numerous events (space data +modification, agent motion) and interactions between agents happen. +Distributing the simulation on several computers or grid nodes thus +implies to guaranty a distributed synchronous execution and +coherency. This often leads to poor performance or complex synchronization +problems. Multi-cores execution or delegating part of this execution +to others processors as GPUs~\cite{Bleiweiss_2008} are thus usually easier +to implement since all the threads share the data and the local clock. + +% Different agent patterns can be adopted in an ABMs such as +% cognitive and reactive ones~\cite{Ferber99}. Cognitive agents act on +% the environment and interact with other agents according to a complex +% behavior. This behavior takes a local perception of the virtual world +% and the agent past (a memory characterized by an internal state and +% belief, imperfect knowledge about the world) into account. Reactive +% agents have a much systematic pattern of action based on stimuli +% response schemes (no or few knowledge and state conservation in agent). The +% evolution of the ABM environment, in particular, is often represented with +% this last kind of agents. As their behavior is usually simple, we +% propose in this chapter to delegate part of the environment and of the +% reactive agents execution to the graphical processing unit of the +% computer. 
This way we can balance the load between both CPU and GPU +% execution resources. + +% In the particular case of multi-scale simulations such as the Sworm +% simulation~\cite{BLA09} the environment may be used at different +% levels. Since the representation of the whole simulated environment +% (i.e. the soil area) would be costly, the environment is organized as +% a multi-level tree of small soil cubes which can be lazily +% instantiated during the simulation. This allows to gradually refine +% distribution details in units of soil as agents progress and need +% those information, by using a fractal process based on the +% bigger-grained already instantiated levels. This characteristic, +% especially for a fractal model, could be the key of the +% distribution. For instance, each branch of a fractal environment could +% be identified as an independent area and parallelized. In addition +% Fractal is a famous approach to describe multi-scale environment (such +% as soil) and its organization~\cite{perrier}. In that case the lower +% scale simulations can also be delegated to the GPU card to limit the +% load of the main (upper scale) simulation. + +\subsection{MAS Implementation on GPU} +\label{ch17:subsec:gpu} + +The last few years saw the apparition of new generations of graphic +cards based on more general purpose execution units which are +promising for less regular systems as MAS. +% These units can be programmed using GPGPU languages to solve various +% problems such as linear algebra resolutions, usually based on matrix +% manipulations. +The matrix-based data representation and SIMD computations are +however not straightforward in Multi-Agent Systems, where data +structures and algorithms are tightly coupled to the described +simulation. However, works from existing literature show that MAS can +benefit from these performance gains on various simulation types, such +as traffic simulation~\cite{Strippgen_Nagel_2009}, cellular +automatons~\cite{Dsouza2007}, mobile-agent based +path-finding~\cite{Silveira:2010:PRG:1948395.1948446} or genetic +algorithms~\cite{Maitre2009}. Note that an application-specific +adaptation process was required in the case of these MAS: some of the +previous examples are driven by mathematical laws (path-finding) or +use a natural mapping between a discrete environment (cellular +automaton) and GPU cores. Unfortunately, this mapping often requires +algorithmic adaptations in other models but the experience shows that +the more reactive a MAS is the more adapted its implementation is to +GPU. + +The first step in the adaptation of an ABM to GPU platforms is the +choice of the language. On the one hand, the Java programming language +is often used for the implementation of MAS due to its availability +on numerous platforms or frameworks and its focus on high-level, +object-oriented programming. On the other hand, GPU platforms can +only run specific languages as OpenCL or CUDA. OpenCL (supported on +AMD, Intel and NVIDIA hardware) better suits the portability concerns +across a wide range of hardware needed the agent simulators, as +opposed to CUDA which is a NVIDIA-specific library. + +OpenCL is a C library which provides access to the underlying CPUs or +GPUs threads using an asynchronous interface. Various OpenCL functions allow +the compilation and the execution of programs on these execution +resources, the copy of data buffers between devices, or the collection +of profiling information. 
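As an illustration of this host-side interface, the sketch below shows a typical call
sequence using the OpenCL C API; the simulators described in this chapter access the
same functions through the JOCL binding introduced below. The kernel name, buffer size
and (omitted) error handling are placeholders and do not come from the actual simulators.

\begin{lstlisting}[language=C]
#include <CL/cl.h>

// Minimal host-side OpenCL sequence (sketch only): pick a GPU device, build a
// program, copy a buffer, launch one work-item per element and read back the
// results. Error codes are ignored for brevity; "myKernel" is a placeholder.
void run_kernel_sketch(const char *kernelSource, size_t n, float *hostData)
{
  cl_platform_id platform;
  cl_device_id device;
  clGetPlatformIDs(1, &platform, NULL);
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);

  // Compilation of the kernel source for the selected device.
  cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, NULL);
  clBuildProgram(program, 1, &device, NULL, NULL, NULL);
  cl_kernel kernel = clCreateKernel(program, "myKernel", NULL);

  // Data transfer, execution (n work-items) and result retrieval.
  cl_mem buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                 n * sizeof(float), hostData, NULL);
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
  clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &n, NULL, 0, NULL, NULL);
  clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, n * sizeof(float), hostData,
                      0, NULL, NULL);

  clReleaseMemObject(buffer);  clReleaseKernel(kernel);  clReleaseProgram(program);
  clReleaseCommandQueue(queue); clReleaseContext(context);
}
\end{lstlisting}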
This library is based around three main concepts:

\begin{itemize}
\item the \emph{kernel} (similar to a CUDA kernel), which represents a runnable program
  containing the instructions to be executed on the GPU;
\item the \emph{work-item} (equivalent to a CUDA thread), which represents one running
  instance of a kernel on the GPU;
\item the \emph{work-group} (or execution block), which is a set of work-items
  sharing some memory to speed up data accesses and
  computations. Synchronization operations such as barriers can only be
  used across the same work-group.
\end{itemize}

Running an OpenCL computation consists in launching numerous work-items
that execute the same kernel. The work-items are submitted to a
submission queue to optimize the usage of the available cores. A computation is
complete once all these kernel instances have terminated.

The number of work-items used in each work-group is an important
implementation choice which determines how many tasks will share the
same cache memory. Data used by the work-items can be stored as
N-dimensional matrices in local or global GPU memory. Since the size
of this memory is often limited to a few hundred kilobytes,
choosing this number often implies a compromise between the model
synchronization or data requirements and the available resources.

In the case of agent-based simulations, each agent can be naturally
mapped to a work-item. Work-groups can then be used to represent groups
of agents or simulations sharing common data (such as the environment)
or algorithms (such as the background evolution process).

In the following examples the JOCL~\cite{jocl_home} binding is used
to access the OpenCL platform from the Java programming language.

In the next sections we present two practical cases that will be
studied in detail, from the model to its implementation and
performance.

\section{A first practical example}
\label{ch17:sec:1stmodel}

The first model, the Collembola model, simulates the propagation of
collembola in fields and forests. It is based on a diffusion
algorithm which illustrates the case of agents with a simple behavior
and few synchronization problems.

\subsection{The Collembola model}
\label{ch17:subsec:collembolamodel}

The Collembola model is an example of a multi-agent system using GIS (Geographical Information System)
and survey data (population counts)
to model the evolution of the biodiversity
across land plots. A first version of this model has been developed with the NetLogo framework by Bioemco and UMMISCO researchers. In this model, the biodiversity is modeled by populations of
arthropod individuals, the Collembola, which can reproduce and diffuse
to favorable new habitats. The simulator makes it possible to study the diffusion
of collembola between plots of land depending on their land use
(artificial soil, crop, forest\ldots). In this
model the environment is composed of the studied land and the
collembola are agents. Every land plot is divided into several cells,
each cell representing a surface unit ($16\times16$ meters). A number of individuals per collembola species is associated to each cell. The model evolution
is then based on a common diffusion model that diffuses individuals between
cells.
Each step of the simulation is based on four stages, as shown in
Figure~\ref{ch17:fig:collem_algorithm}:

% \begin{enumerate}
% \item arrival of new individuals
% \item reproduction in each cell
% \item diffusion between cells
% \item updating of colembola lifetime
% \end{enumerate}

\begin{figure}[h]
\centering
\includegraphics[width=0.6\textwidth]{Chapters/chapter17/figs/algo_collem.pdf}
\caption{Evolution algorithm of the Collembola model}
\label{ch17:fig:collem_algorithm}
\end{figure}

The algorithm is quite simple but includes two costly operations, the
reproduction and the diffusion, that must be parallelized to improve
the model performance.

The {\bf reproduction} stage consists in updating the total population
of each plot by taking into account the individuals arrived at the preceding
computation step. This stage implies processing the whole set of cells
of the environment to sum their population. The computed value is
recorded in the plot associated to each cell. This process can be
assimilated to a reduction operation over all the population cells
associated to one plot to obtain its population.

The {\bf diffusion} stage simulates the natural behavior of the
collembola that tend to occupy the whole space over time. This
stage consists in computing a new value for each cell depending on
the population of the neighbor cells. This process can be assimilated
to a linear diffusion, at each iteration, of the population of the cells
across their neighbors.

These two processes are quite common in numerical computations, so
that the Collembola model can be adapted to a GPU execution without
much difficulty.

\subsection{Collembola Implementation}

In the Collembola simulator biodiversity is modeled by populations
of Collembola individuals, which can reproduce and diffuse to
favorable new habitats. This is implemented in the algorithm as a fixed
reproduction factor, applied to the size of each population, followed
by a linear diffusion of each cell population to its eight
neighbors. These reproduction and diffusion processes are followed by
two more steps in the GPU implementation. The first one consists in
culling the populations located in an inhospitable environment, by checking the
land use value of each cell and setting its population to zero if
necessary. The final simulation step is the reduction of the cell
populations for each plot, to obtain an updated plot population
for statistical purposes. This separate computation step, done while
updating each cell population in the reference sequential algorithm,
is motivated by synchronization problems and allows a reduction of
the total number of accesses needed to update those populations.

%\lstinputlisting[language=C,caption=Collembola OpenCL
%kernels,label=fig:collem_kernels]{Chapters/chapter17/code/collem_kernels.cl}
\lstinputlisting[caption=Collembola OpenCL Diffusion kernel,label=ch17:listing:collembola-diffuse]{Chapters/chapter17/code/collem_kernel_diffuse.cl}

The reproduction, diffusion and culling steps are implemented on the GPU
(Figure~\ref{ch17:fig:collem_algorithm}) as a straight mapping of each cell
to an OpenCL work-item (GPU thread). Listing
\ref{ch17:listing:collembola-diffuse} gives the kernel for the
diffusion implementation. To prevent data coherency problems, the
diffusion step is split into two phases, separated by an execution
{\it barrier}. In the first phase, the diffusion overflow of each cell is
computed and divided by the number of neighbors. Note that, at the
model frontier, populations can also overflow outside the environment
grid; we do not manage those external populations, since there is no
reason to assume our model to be isolated from its surroundings. The
overflow share for each neighbor is stored for each cell before
reaching the barrier. Once the barrier is passed, each cell reads the
overflows stored by all its neighbors in the previous phase and applies
them to its own population. In this manner, only one barrier is
required to ensure the consistency of the population values, since no
cell ever modifies a value other than its own.
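To make this two-phase scheme more concrete, a simplified sketch is given below. Its
identifiers (grid layout, \texttt{diffusionRate}, buffer names) are illustrative
assumptions and do not come from the actual kernel file, and it assumes that the grid
is processed by a single work-group so that \texttt{barrier()} provides the needed
synchronization; otherwise the two phases must be issued as two successive kernel
launches.

\begin{lstlisting}[language=C]
// Simplified two-phase diffusion sketch (illustrative names, not the actual
// collem_kernel_diffuse.cl). Assumes one work-item per cell, an NDRange that
// exactly matches the grid, and a grid small enough to fit in one work-group
// so that barrier() synchronizes all cells.
__kernel void diffuse_sketch(__global float *population, // population of each cell
                             __global float *overflow,   // share leaving each cell,
                                                         // per neighbor
                             const int width, const int height,
                             const float diffusionRate)
{
  const int x = get_global_id(0);
  const int y = get_global_id(1);
  const int cell = y * width + x;

  // Phase 1: compute the outgoing population of this cell, split between
  // its eight neighbors (shares crossing the grid border are simply lost).
  overflow[cell] = population[cell] * diffusionRate / 8.0f;

  barrier(CLK_GLOBAL_MEM_FENCE); // all overflows are now visible

  // Phase 2: gather the shares stored by the existing neighbors and update
  // the local population; only this work-item writes population[cell].
  float incoming = 0.0f;
  for (int dy = -1; dy <= 1; dy++)
    for (int dx = -1; dx <= 1; dx++) {
      int nx = x + dx, ny = y + dy;
      if ((dx != 0 || dy != 0) && nx >= 0 && nx < width && ny >= 0 && ny < height)
        incoming += overflow[ny * width + nx];
    }
  population[cell] += incoming - 8.0f * overflow[cell];
}
\end{lstlisting}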
\lstinputlisting[caption=Collembola OpenCL reduction kernel,label=ch17:listing:collembola-reduc]{Chapters/chapter17/code/collem_kernel_reduc.cl}

Listing \ref{ch17:listing:collembola-reduc} gives the kernel for the
reduction implementation. The only step requiring numerous
synchronized accesses is the reduction one: in a first approach, we
chose to use the {\it atomic\_add} operation to implement this process, but more
efficient implementations using partial reduction and local GPU memory
could be realized.

\subsection{Collembola performance}

In this part we present the performance of the Collembola model on
various CPU and GPU execution
platforms. Figure~\ref{ch17:fig:mior_perfs_collem} shows that the
number of cores and the processor architecture have a strong influence on the obtained results.

% : the
% dual-core processor equipping our Thinkpad platform has two to six
% longer executions times, compared to a six-core Phenom X6.

% % \begin{figure}[h]
% %begin{minipage}[t]{0.49\linewidth}
% \centering \includegraphics[width=0.7\linewidth]{./Chapters/chapter17/figs/collem_perfs.pdf}
% \caption{CPU and GPU performance of the Collembola model}
% \label{ch17:fig:mior_perfs_collem}
% %end{minipage}
% \end{figure}

% In figure~\ref{ch17:fig:mior_perfs_collem2} the Thinkpad curve is removed
% to make other trends clearer. Two more observation can be made, using
% this more detailled scale:

\begin{itemize}
\item Older GPU cards can be slower than modern processors. This can
  be explained by the new cache and memory access optimizations
  implemented in newer generations of GPU devices. These optimizations
  reduce the penalties associated with irregular and frequent global
  memory accesses. They are not available on our Tesla nodes.
\item GPU curves exhibit an odd-even pattern in their performance
  results. Since this phenomenon is visible on hardware, drivers and
  OpenCL implementations from two distinct manufacturers, it is
  likely the result of the decomposition process based on warps of
  fixed, power-of-two sizes.
\item The number of cores is not the only determining factor: an Intel
  Core i7 2600K processor, even with only four cores, can provide
  better performance than a Phenom one.
\end{itemize}

\begin{figure}[h]
%begin{minipage}[t]{0.49\linewidth}
\centering
\includegraphics[width=0.7\linewidth]{./Chapters/chapter17/figs/collem_perfs_nothinkpad.pdf}
\caption{CPU and GPU performance of the Collembola model}
\label{ch17:fig:mior_perfs_collem}
%end{minipage}
\end{figure}

This figure also shows that using the GPU to parallelize part of the
simulator results in tangible performance gains over a CPU execution
on modern hardware. These gains are more mixed on older GPU platforms
due to their limitations when dealing with the irregular memory or execution
patterns often encountered in MAS systems.
This can be closely linked
to the availability of caching facilities on the GPU hardware and their
dramatic effect depending on the locality and frequency of data
accesses. In this case, even if the Tesla architecture provides more
execution cores and is by far the costlier solution, more recent and cheaper
solutions such as high-end GPUs provide better performance when the
execution is not constrained by memory size.

\section{Second example}
\label{ch17:sec:2ndmodel}

The second model, the MIOR model, simulates the behavior of microbial
colonies. Its execution model is more complex, so that it requires
changes in the initial algorithm and the use of synchronization to
benefit from the GPU architecture.

\subsection{The MIOR model}
\label{ch17:subsec:miormodel}

The MIOR~\cite{C.Cambier2007} model was developed to simulate local
interactions in soil between microbial colonies and organic
matter. It reproduces each small cubic unit ($0.002\ m^3$) of soil as a
MAS.

Multiple implementations of the MIOR model have already been
realized, in Smalltalk and NetLogo, in 2 or 3 dimensions. The last
implementation, used in our work and referenced as MIOR in the
rest of the chapter, is freely accessible online as
WebSimMior~\footnote{http://www.IRD.fr/websimmior/}.

MIOR is based around two types of agents: (i) the Meta-Mior (MM),
which represents microbial colonies consuming carbon and (ii) the
Organic Matter (OM), which represents carbon deposits occurring in soil.

The Meta-Mior agents are characterized by two distinct behaviors:
\begin{itemize}
\item \emph{breathing}: this action converts mineral carbon from the soil
  to carbon dioxide ($CO_{2}$) that is released into the soil;
\item \emph{growth}: by this action each microbial colony fixes the
  carbon present in the environment to reproduce itself (augment its
  size). This action is only possible if the colony breathing needs
  were covered, i.e. enough mineral carbon is available.
\end{itemize}

These behaviors are described in Algorithm~\ref{ch17:seqalgo}.

\begin{algorithm}[h]
\caption{Evolution step of each Meta-Mior (microbial colony) agent}
\label{ch17:seqalgo}
\KwIn{A static array $mmList$ of MM agents}
\myinput{A static array $omList$ of OM agents}
\myinput{A MIOR environment $world$}
$breathNeed \gets world.respirationRate \times mm.carbon$\;
$growthNeed \gets world.growthRate \times mm.carbon$\;
$availableCarbon \gets totalAccessibleCarbon(mm)$\;
\uIf{$availableCarbon > breathNeed$}{
  \tcc{ Breathing }
  $mm.active \gets true$\;
  $availableCarbon \gets availableCarbon - consumCarbon(mm, breathNeed)$\;
  $world.CO2 \gets world.CO2 + breathNeed$\;
  \If{$availableCarbon > 0$}{
    \tcc{ Growth }
    $growthConsum \gets min(totalAccessibleCarbon(mm), growthNeed)$\;
    $consumCarbon(mm, growthConsum)$\;
    $mm.carbon \gets mm.carbon + growthConsum$\;
  }
}
\Else{
  $mm.active \gets false$
}
\end{algorithm}

Since this simulation takes place at a microscopic scale, a large
number of these simulations must be executed for each macroscopic
simulation step to model a realistic-sized unit of soil. This leads to
large computing needs despite the small computation cost of each
individual simulation.


\subsection{MIOR Implementation}

As pointed out previously, the MIOR implementation implied more
changes for the initial code to be run on GPU. As a first attempt, we
realized a simple GPU implementation of the MIOR simulator, with only
minimal changes to the CPU algorithm.
\subsubsection{Data structures translation}
\label{ch17:subsec:datastructures}

The adaptation of the MIOR model to GPU requires the mapping of the
data model to OpenCL data structures. The environment and the agents
are represented by arrays of structures, where each structure
describes the state of one entity. The behaviors of these entities are
implemented as OpenCL functions called from the kernels during
execution.

Since the main program is written in Java, JOCL is responsible for the
allocation and the mapping of the object data structures to OpenCL
ones before execution.

Four main data structures
(Listing~\ref{ch17:listing:mior_data_structures}) are defined: (i) an
array of MM agents, representing the state of the microbial colonies;
(ii) an array of OM agents, representing the state of the carbon
deposits; (iii) a topology matrix, which stores the accessibility
information between the two types of agents of the model; and (iv) a
world structure, which contains all the global input data (metabolism
rates, numbers of agents) and output data (quantity of $CO_{2}$
produced) of the simulation. These data structures are initialized by
the CPU and then copied to the GPU.

\lstinputlisting[caption=Main data structures used in a MIOR simulation,label=ch17:listing:mior_data_structures]{Chapters/chapter17/code/data_structures.cl}

The world topology is stored as a two-dimensional matrix which
contains the OM indexes on the abscissa and the MM indexes on the
ordinate. Each agent walks through its line or column of the matrix at
each iteration to determine which agents can be accessed during the
simulation. Since many agents are not connected, this matrix is
sparse, which introduces a large number of useless memory accesses. To
reduce the impact of these accesses we use a compacted, optimized
representation of this matrix based
on~\cite{Gomez-Luna:2009:PVS:1616772.1616869}, as illustrated in
Figure~\ref{ch17:fig:csr_representation}. This compact representation
considers each line of the matrix as an index list and stores only the
accessible agents, contiguously, to reduce the size of the list and
the number of non-productive accesses.

\begin{figure}[h]
\centering
\includegraphics[width=0.8\textwidth]{Chapters/chapter17/figs/grid.pdf}
\caption{Compact representation of the topology of a MIOR simulation}
\label{ch17:fig:csr_representation}
\end{figure}
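The device function below sketches how a colony could traverse such a
compact row to sum the carbon it can access. It is only an
illustration: it assumes that \texttt{mmOffsets[i]} and
\texttt{mmOffsets[i+1]} delimit the list of deposits accessible to
colony $i$, which is one classic CSR convention, while the exact
layout used by the simulator (see the \texttt{mmOffsets} and
\texttt{mmCSR} arguments of Listing~\ref{ch17:listing:mior_kernels})
may differ.

\begin{lstlisting}[caption={Sketch of a traversal of the compact topology (illustration only)},label=ch17:listing:csr-sketch]
// Sketch only: sums the carbon accessible to colony mmIndex using a
// CSR-like compact topology. Assumes that mmOffsets[mmIndex] and
// mmOffsets[mmIndex + 1] delimit the list of accessible OM indexes,
// which is one classic CSR convention; the layout used by the actual
// simulator may differ. The OM type is the one of the data structures
// listing.
int totalAccessibleCarbonCompact(const int mmIndex,
                                 global const OM *omList,
                                 constant int *mmOffsets,
                                 global const int *mmCSR)
{
    int total = 0;
    for (int k = mmOffsets[mmIndex]; k < mmOffsets[mmIndex + 1]; k++) {
        const int omIndex = mmCSR[k];   // index of an accessible deposit
        total += omList[omIndex].carbon;
    }
    return total;
}
\end{lstlisting}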
Since dynamic memory allocation is not yet possible in OpenCL, and is
only provided in the latest revisions of the CUDA standard, these
matrices are statically allocated. The allocation is based on the
worst-case scenario where all OM and MM are linked, as the real
occupation of the matrix cells cannot be deduced without some kind of
preprocessing computation.

\subsubsection{Critical resources access management}
\label{ch17:subsec:concurrency}

One of the main issues in the MIOR model is to ensure that all the
microbial colonies have equitable access to carbon resources when
multiple colonies share the same deposits. Access synchronizations are
mandatory in these cases to prevent conflicting updates on the same
data, which may lead to computation errors (e.g. loss of matter).

On a massively parallel architecture such as a GPU, this kind of
synchronization conflict can however lead to an inefficient
implementation by enforcing a quasi-sequential execution. It is
necessary, in the case of MIOR as well as of other ABMs, to ensure
that each work-item is not too constrained in its execution.

\lstinputlisting[caption=Main MIOR kernel,label=ch17:listing:mior_kernels]{Chapters/chapter17/code/mior_kernels.cl}

From the sequential Algorithm~\ref{ch17:seqalgo}, where all the agents
share the same data, we have developed a parallel algorithm composed
of three sequential stages separated by synchronization barriers. This
new algorithm is based on the distribution of the available OM carbon
deposits into parts at the beginning of each execution step. The three
stages, illustrated in Listing~\ref{ch17:listing:mior_kernels}, are
the following:

\begin{enumerate}
\item \emph{distribution}: the available carbon in each carbon deposit
  (OM) is equitably dispatched among all accessible MM in the form of
  parts (a sketch of this step is given after this list).
\item \emph{metabolism}: each MM consumes the carbon in its allocated
  parts for its breathing and growth processes.
\item \emph{gathering}: the carbon which remains unconsumed in the
  parts is gathered back into the carbon deposits.
\end{enumerate}
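The following device function sketches a possible shape for the
distribution step (the \texttt{scatter} function called in
Listing~\ref{ch17:listing:mior_kernels}, whose body is not shown in
the listings). It is only an illustration built on several
assumptions: a dense \texttt{parts} array of size
\texttt{NB\_OM}$\times$\texttt{NB\_MM} where entry $(om, mm)$ holds
the share of deposit $om$ reserved for colony $mm$, and a CSR-like
list \texttt{(omOffsets, omCSR)} delimiting the colonies accessible
from each deposit; the layout used by the actual simulator may differ.

\begin{lstlisting}[caption={Sketch of the distribution (scatter) stage (illustration only)},label=ch17:listing:scatter-sketch]
// Sketch only: distribution step executed by one work-item per carbon
// deposit (OM). Assumes parts[omIndex * NB_MM + mmIndex] holds the
// share of deposit omIndex reserved for colony mmIndex, and that
// omOffsets[omIndex] / omOffsets[omIndex + 1] delimit the accessible
// colonies; the actual simulator layout may differ. NB_MM is the
// per-simulation number of colonies, defined at compile time.
void scatter_sketch(const int omIndex,
                    global OM *omList,
                    constant int *omOffsets,
                    global const int *omCSR,
                    global int *parts)
{
    const int begin = omOffsets[omIndex];
    const int end   = omOffsets[omIndex + 1];
    const int nbAccessible = end - begin;
    if (nbAccessible == 0) return;

    // Equitable integer share for each accessible colony; the remainder
    // stays in the deposit and is redistributed at the next step.
    const int share = omList[omIndex].carbon / nbAccessible;

    for (int k = begin; k < end; k++) {
        const int mmIndex = omCSR[k];
        parts[omIndex * NB_MM + mmIndex] = share;
    }
    omList[omIndex].carbon -= share * nbAccessible;
}
\end{lstlisting}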
This solution suppresses the concurrent accesses to shared data, and
thus the need for the fine-grained access synchronizations required by
the first algorithm, and requires only one kernel launch from Java, as
described in Listing~\ref{ch17:fig:mior_launcher}.

\lstinputlisting[caption=MIOR simulation launcher,label=ch17:fig:mior_launcher]{Chapters/chapter17/code/mior_launcher.java}

\subsubsection{Termination detection}

The termination of a MIOR simulation is reached when the model
stabilizes and no more $CO_{2}$ is produced. This termination
detection can be done either on the CPU or on the GPU, but it requires
a global view of the system execution.

In the first case, when the CPU controls the GPU simulation process,
the detection is done in two steps: (i) the CPU starts the execution
of a simulation step on the GPU, and (ii) the CPU retrieves the GPU
data and determines whether another iteration must be launched or the
simulation has terminated. This approach allows a fine-grained control
over the GPU execution, but it requires many costly data transfers, as
the results of each iteration must be sent from the GPU to the CPU. In
the case of the MIOR model these costs are mainly due to the inherent
PCI-Express latencies rather than to bandwidth limitations, since the
data sizes remain rather small, in the order of a few dozen megabytes.

In the second case, the termination detection is directly implemented
on the GPU by checking the amount of available carbon between two
iterations. The CPU does not get any feedback while the simulation is
running, but retrieves the results once the kernel execution is
finished. This approach minimizes the number of transfers between the
CPU and the GPU.
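The fragment below sketches how such a GPU-side test could be written
around the main simulation loop of the kernel. It is only an
illustration: it assumes one work-group per simulation as in
Listing~\ref{ch17:listing:mior_kernels}, a hypothetical maximum number
of iterations \texttt{MAX\_ITER}, and that a stable $CO_{2}$ level
between two iterations means that the model has stabilized; the
commented call stands for one complete
distribution/metabolism/gathering iteration.

\begin{lstlisting}[caption={Sketch of a GPU-side termination test (illustration only)},label=ch17:listing:termination-sketch]
// Sketch only: the main loop iterates the simulation on the GPU and
// stops when the CO2 level no longer changes between two iterations.
// MAX_ITER is a hypothetical compile-time bound; simulation_step()
// stands for one complete MIOR iteration (distribution, metabolism,
// gathering) and is not shown here.
kernel void simulate_until_stable(global World *allWorlds /*, other args */)
{
    global World *world = allWorlds + get_group_id(0);
    local int stop;
    int previousCO2 = -1;

    for (int iter = 0; iter < MAX_ITER; iter++) {
        // simulation_step(...);        // one complete MIOR iteration

        barrier(CLK_GLOBAL_MEM_FENCE);
        if (get_local_id(0) == 0) {
            stop = (world->CO2 == previousCO2);
            previousCO2 = world->CO2;
        }
        barrier(CLK_LOCAL_MEM_FENCE);   // share the decision in the group

        if (stop)
            break;                      // uniform exit for all work-items
    }
}
\end{lstlisting}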
\subsection{Performance of MIOR implementations}
\label{ch17:subsec:miorexperiments}

In this part we present several MIOR GPU implementations using the
distribution/gathering process described in the previous section and
compare their performance on two distinct hardware platforms, i.e. two
different GPU devices. Five incremental MIOR implementations were
realized with an increasing level of adaptation of the algorithm; in
all cases, we chose the average time over 50 executions as the
performance indicator.

\begin{itemize}
\item the \textbf{GPU 1.0} implementation is a direct implementation
  of the existing algorithm and its data structures in which the data
  dependencies were removed, and it uses the non-compact topology
  representation described in
  Section~\ref{ch17:subsec:datastructures};
\item the \textbf{GPU 2.0} implementation uses the previously
  described compact representation of the topology and remains
  otherwise identical to the GPU 1.0 implementation;
\item the \textbf{GPU 3.0} implementation introduces the manual copy
  into local (private) memory of often-accessed global data, such as
  carbon parts or topology information;
\item the \textbf{GPU 4.0} implementation is a variant of the GPU 1.0
  implementation which allows the execution of multiple simulations
  for each kernel execution;
\item the \textbf{GPU 5.0} implementation is a multi-simulation
  version of the GPU 2.0 implementation, executing multiple
  simulations for each kernel launch as GPU 4.0 does.
\end{itemize}

The last two implementations -- \textbf{GPU 4.0} and \textbf{GPU 5.0}
-- illustrate the gain provided by a better usage of the hardware
resources, thanks to the execution overlapping capabilities of the
driver. A sequential version of the MIOR algorithm, labeled
\textbf{CPU}, is provided for comparison purposes. This sequential
version is developed in Java, the same language used for the GPU
implementations and the Sworm model.

For these performance evaluations, two platforms are used. The first
one is representative of the kind of hardware which is available on
HPC clusters. It is a cluster node dedicated to GPU computations with
two Intel X5550 processors running at $2.67$GHz and two Tesla C1060
GPU devices running at $1.3$GHz, each composed of $240$ cores ($30$
multiprocessors). Only one of these GPUs is used in these experiments.
The second platform illustrates what can be expected from a personal
desktop computer built a few years ago. It uses an Intel Q9300 CPU
running at $2.5$GHz and a GeForce 8800GT GPU card running at $1.5$GHz
and composed of $112$ cores ($14$ multiprocessors). The purpose of
these two platforms is to assess the benefit that can be obtained when
a scientist either has access to specialized hardware such as a
cluster or tries to take advantage of his or her own personal
computer.

\begin{figure}[h]
\centering
\includegraphics[width=0.7\linewidth]{Chapters/chapter17/figs/mior_perfs_tesla.pdf}
\caption{CPU and GPU performance on a Tesla C1060 node}
\label{ch17:fig:mior_perfs_tesla}
\end{figure}

Figures~\ref{ch17:fig:mior_perfs_tesla} and~\ref{ch17:fig:mior_perfs_8800gt}
show the execution time for $50$ simulations on the two hardware
platforms. A size factor is applied to the problem: at scale 1 the
model contains $38$ MM and $310$ OM, while at scale 6 these numbers
are multiplied by six. The size of the environment is modified as
well, to maintain the same average agent density in the model. This
scaling factor shows the impact of the chosen simulation size on
performance.

\begin{figure}[h]
\centering
\includegraphics[width=0.7\linewidth]{Chapters/chapter17/figs/mior_perfs_8800gt.pdf}
\caption{CPU and GPU performance on a personal computer with a GeForce 8800GT}
\label{ch17:fig:mior_perfs_8800gt}
\end{figure}

The charts show that for small problems the execution times of all the
implementations are very close. This is because the GPU execution does
not have enough threads (representing agents) for an optimal usage of
the GPU resources. This trend changes after scale $5$, where GPU 2.0
and GPU 3.0 take the advantage over the GPU 1.0 and CPU
implementations. This advantage continues to grow with the scaling
factor and reaches a speedup of $10$ at scale $10$ between the fastest
single-simulation GPU implementation and the first, naive one, GPU
1.0.

Multiple trends can be observed in these results. First, optimizations
for the GPU hardware show a large, positive impact on performance,
illustrating the strong requirements on the algorithm properties to
reach execution efficiency. These charts also show that, despite the
vast difference in the number of cores between the two GPU platforms,
the same trends can be observed in both cases. We can therefore expect
similar results on other GPU cards, without the need for further
adaptations.
\begin{figure}[h]
\centering
\includegraphics[width=0.7\linewidth]{Chapters/chapter17/figs/monokernel.pdf}
\caption{Execution time of one multi-simulation kernel on the Tesla
  platform}
\label{ch17:fig:monokernel_graph}
\end{figure}

\begin{figure}[h]
\centering
\includegraphics[width=0.7\linewidth]{Chapters/chapter17/figs/multikernel.pdf}
\caption{Total execution time for 1000 simulations on the Tesla
  platform, while varying the number of simulations for each kernel}
\label{ch17:fig:multikernel_graph}
\end{figure}

There are two ways to measure the performance of these
multi-simulations: (i) by executing only one kernel and varying its
size (the number of simulations executed in parallel), as shown in
Figure~\ref{ch17:fig:monokernel_graph}, to test the costs linked to
the parallelization process, or (ii) by executing a fixed number of
simulations and varying the size of each kernel, as shown in
Figure~\ref{ch17:fig:multikernel_graph}.

Figure~\ref{ch17:fig:monokernel_graph} illustrates the execution time
of only one kernel. It shows that for small numbers of simulations run
in parallel, the compact implementation of the model topology is
faster than the two-dimensional matrix representation. This trend
reverses with more than $50$ simulations in parallel, which can be
explained either by the non-linear progression of the synchronization
costs or by the additional memory required for the access-efficient
representation.

Figure~\ref{ch17:fig:multikernel_graph} illustrates the execution time
of a fixed number of simulations. It shows that for a small number of
simulations run in parallel, the costs resulting from program setup,
data copies, and kernel launches on the GPU are dominant and very
detrimental to performance. Once the number of simulations executed
for each kernel grows, these costs are counterbalanced by the
computation costs. This trend is more marked in the case of the sparse
implementation (GPU 4.0) than in the compact one, but it appears on
both curves. With more than $30$ simulations for each kernel, the
execution time stalls, since hardware limits are reached. This
indicates that the cost of preparing and launching kernels becomes
negligible compared to the computing time once a good GPU occupation
rate is achieved.

\section{Analysis and recommendations}
\label{ch17:analysis}

In this section we synthesize the observations made on the two models
and identify some recommendations for implementing complex systems on
GPU platforms.

\subsection{Analysis}

In both the Collembola and the MIOR models, a critical issue is the
determination of the parts of the simulation to be run on the GPU and
of those to remain on the CPU. The determination of these parts is a
classic step of any algorithm parallelization and must take into
account considerations such as the cost of the different parts of the
algorithm and the expected gains.

In the case of the Collembola model, two steps of the algorithm were
ported to the GPU. Both steps use straightforward, easily
parallelizable operations where a direct gain can be expected by using
more execution cores, without important modifications of the
algorithm.

In the case of the MIOR model, however, no such inherently
parallelizable parts are evident in the original sequential algorithm.
This is mainly explained by the rate of interactions between agents in
this model, in the form of many operations (breathing, growth) on
shared carbon resources.
In this case the algorithm had to be modified more profoundly, while
keeping in mind the need to remain true to the original model, to
synchronize the main execution step of all agents in the model, to
ensure equity, and to minimize the number of synchronizations. The
minimization is achieved by factoring the distribution of carbon in
the model into two separate steps, at the beginning and at the end of
each iteration, rather than at multiple points of the execution.

\subsection{MAS execution workflow}

Many MAS simulations decompose their execution process into discrete
evolution steps, where each step represents a quantum of time (the
minimal unit of time described). At the end of each step, global
information, graphical displays, or results are updated. This
execution model does not fit GPU platforms well, as they more or less
assume a batch-like workflow model. The execution must be split into
the following ever-repeating steps, illustrated by the sketch after
this list:

\begin{itemize}
\item Allocation of the GPU data buffers
\item Copy of the data from the CPU to the GPU
\item Execution of the GPU kernels
\item Copy of the results from the GPU to the CPU
\end{itemize}
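The following C sketch illustrates this canonical workflow with the
plain OpenCL host API, requesting several model iterations per launch
so that the transfer costs are amortized. It is only an illustration
under stated assumptions: the simulators of this chapter perform the
equivalent calls from Java through a JOCL-based wrapper (see
Listing~\ref{ch17:fig:mior_launcher}), the kernel is assumed to take
the data buffer and the number of steps as its two arguments, and
error checking is omitted.

\begin{lstlisting}[caption={Sketch of the canonical GPU workflow with the C OpenCL API (illustration only)},label=ch17:listing:workflow-sketch]
#include <CL/cl.h>

/* Sketch only: one batch of nbSteps MAS iterations. "context", "queue"
 * and "kernel" are assumed to be already created; the kernel is assumed
 * to take the data buffer and the number of steps as its two arguments;
 * error checking is omitted. */
void run_batch(cl_context context, cl_command_queue queue, cl_kernel kernel,
               int *hostData, size_t dataSize, cl_int nbSteps,
               size_t global, size_t local)
{
    /* 1. Allocation of the GPU data buffer */
    cl_mem buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, dataSize,
                                   NULL, NULL);

    /* 2. Copy of the data from the CPU to the GPU */
    clEnqueueWriteBuffer(queue, buffer, CL_TRUE, 0, dataSize, hostData,
                         0, NULL, NULL);

    /* 3. Kernel execution: nbSteps model iterations in a single launch */
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);
    clSetKernelArg(kernel, 1, sizeof(cl_int), &nbSteps);
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local,
                           0, NULL, NULL);

    /* 4. Copy of the results from the GPU to the CPU (blocking read) */
    clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, dataSize, hostData,
                        0, NULL, NULL);

    clReleaseMemObject(buffer);
}
\end{lstlisting}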
This workflow works well if the data transfer time is negligible
compared to the GPU execution time, or if the transfers can be
overlapped with computation thanks to the asynchronous nature of
OpenCL. If the MAS model is to be updated after each iteration, then
performance risks being degraded. This is illustrated in the MIOR
model by the fact that the speedup observed on the GPU is much more
significant for bigger simulations, which imply longer GPU execution
times. Our solution to this problem is to desynchronize the execution
of the MAS model and of its GPU parts by requesting the execution of
multiple steps of the GPU simulations for each launch.

Another related prerequisite of GPU execution is the availability of
many threads to execute, to allow an efficient exploitation of the
large number of cores provided by the architecture. In the case of MAS
models, this means that executing one agent at a time on the GPU is
meaningless with regard to GPU usage, copying costs, and actual gain
in execution time, if the process is not already parallel and costly
at this scale. In the MIOR and Collembola models, this is solved by
executing the computations of all the agents of the model at the same
time. If the model however has only sporadic needs for intensive
computations, then some kind of batching mechanism is required to
store the waiting treatments in a queue until the total amount of
waiting computations justifies the transfer costs to the GPU platform.

\subsection{Implementation challenges}

Besides the execution strategy challenges described above, some
implementation challenges also occur when developing an OpenCL version
of a MAS model, mainly related to the underlying limitations of the
execution platform.

The first one is the impossibility (except in the latest CUDA
versions) of dynamically allocating memory during execution. This is a
problem for models where the number of agents can vary during the
simulation, such as prey-predator models. In this case, the only
solutions are to overestimate the size of the arrays or data
structures to accommodate these additional individuals, or to use the
CPU to resize the data structures when these situations occur. Both
approaches trade either memory or performance and are not always
practical.

Another limitation is the impossibility of storing pointers in data
structures, which restricts OpenCL implementations to one-dimensional
static arrays or to specific data structures such as textures. This
precludes the usage of structures like linked lists, graphs, or sparse
matrices that are not represented by some combination of static
arrays, and can be another source of memory or performance losses.

In the case of MIOR, this problem is especially exacerbated for the
storage of the neighborhood: both representations consume much more
memory than actually required, since the worst case (all agents have
access to all other agents) has to be taken into account defensively
when dimensioning the data structures.

The existence of different generations of GPU hardware is also a
challenge. Older devices, such as the four-year-old Tesla C1060 cards,
have very strong expectations in terms of memory accesses and require
very regular access patterns to perform efficiently. MAS having many
random accesses, such as MIOR, or many small global memory accesses,
such as Collembola, are penalized on these older cards. Fortunately,
these requirements are relaxed on modern cards, which offer caches and
other facilities traditionally present on CPUs to offset this kind of
penalty.

The final concern is related to the previous ones and often results in
higher memory consumption. The amount of memory available on GPU cards
is much more limited, and adding memory capacity is more costly, than
expanding the RAM of a CPU node. On computing clusters, hardware nodes
with 128GB of memory or more have become affordable, whereas the newer
Tesla architectures remain limited to 16GB of memory. This can be a
concern for big MAS models, or for small ones which can only use
memory-inefficient OpenCL structures.

\subsection{MCSMA}
\label{ch17:Mcsma}

As shown in the previous sections, many data representation choices
are common to entire classes of MAS. The grid paradigm, for example,
is often encountered in models where each cell constitutes either the
elementary unit of simulation (SugarScape~\cite{Dsouza2007}) or a
discretization of the environment space
(pathfinding~\cite{Guy09clearpath}). These grids can be considered as
two- or three-dimensional matrices, whose processing can be directly
distributed.

Another common data representation encountered in MAS systems is the
usage of 2D or 3D coordinates to store the position of each agent of
the model. In this case, even if the environment is no longer
discrete, the location information still implies computations
(Euclidean distances) which can be parallelized, as in the sketch
below.
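The kernel below sketches such a parallel distance computation: one
work-item per agent computes the Euclidean distances between its agent
and every other agent of the model. It is only an illustration and not
part of MCSMA; the \texttt{float2} position array, the row-per-agent
layout of the \texttt{distances} matrix, and the kernel name are
assumptions of this sketch.

\begin{lstlisting}[caption={Sketch of a parallel Euclidean distance computation (illustration only)},label=ch17:listing:distance-sketch]
// Sketch only: one work-item per agent fills one row of the distance
// matrix. The float2 positions and the dense row-per-agent layout of
// "distances" are assumptions of this illustration.
kernel void pairwise_distances(global const float2 *positions,
                               global float *distances,
                               const int nbAgents)
{
    const int i = get_global_id(0);
    if (i >= nbAgents) return;

    const float2 self = positions[i];
    for (int j = 0; j < nbAgents; j++) {
        // distance() is the built-in OpenCL Euclidean distance
        distances[i * nbAgents + j] = distance(self, positions[j]);
    }
}
\end{lstlisting}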
MCSMA~\cite{lmlm+13:ip} is a framework developed to provide the MAS
designer with those basic data structures, and the associated
operations, to facilitate the porting of existing MAS to the GPU. Two
levels of use are provided to the developer, depending on his or her
usage profile:

\begin{itemize}
\item a high-level library, consisting of modules regrouping classes
  of operations: for example, one module provides multiple methods of
  distance computation on 1D, 2D, or 3D grids, another one the
  diffusion algorithm, and so on;
\item a low-level API which gives the developer direct access to the
  GPU and to the inner workings of MCSMA, to develop his or her own
  modules when the required operations are not yet provided by the
  platform.
\end{itemize}

Both usage levels were illustrated in the two practical cases above.
In MIOR, the whole algorithm (barring initialization) is ported to the
GPU as a specific plugin which allows executing $n$ MIOR simulations
and retrieving their results. This first approach requires extensive
adaptations of the original algorithm. In Collembola, on the contrary,
the main steps of the algorithm remain executed on the CPU, and only
specific operations are delegated to generic, already existing,
diffusion and reduction kernels. The fundamental algorithm is not
modified, and GPU execution is only applied to the specific parts of
the execution which may benefit from it. These two programming
approaches allow incremental adaptations of existing Java MAS to
accelerate their execution on the GPU, while retaining the option of
developing new reusable or more efficient modules to supplement the
already existing ones.

\section{Conclusion}
\label{ch17:conclusion}

This chapter addresses the issue of complex system simulation using
the agent-based paradigm and GPU hardware. From the experiments on two
existing agent-based models of soil science, we intend to provide
useful information on the architecture, the algorithm design, and the
data management needed to run agent-based simulations on the GPU, and
more generally to run compute-intensive applications that are based on
a not-so-regular model. The first result of this work is that adapting
the algorithm to a GPU architecture is possible and suitable to speed
up agent-based simulations, as illustrated by the MIOR model. Coupling
the CPU with the GPU even seems to be an interesting way to take
better advantage of the power offered by computers and clusters, as
illustrated by the Collembola model. From our point of view, the
adaptation process is lighter than a traditional parallelization on
distributed nodes and not much more difficult than a standard
multi-threaded parallelization, since all the data remain on the same
host and can be shared in central memory. The OpenCL adaptation also
enables a portable simulator that can be run on different graphics
units. Even using a mainstream card such as the GPU card of a standard
computer can lead to significant performance improvements. This is an
interesting result, as it opens the field of inexpensive HPC to a
large community.

With this perspective in mind, we are working on MCSMA, a development
platform that would facilitate the use of GPU or many-core
architectures for multi-agent simulations. Our first work has been the
definition of common, efficient, reusable data structures, such as
grids or lists. Another goal is to provide easier means to control the
distribution of specific processes between the CPU and the GPU, to
allow the easy exploitation of the strengths of each platform in the
same multi-agent simulation. We think that the same approach, i.e.
developing specific environments that facilitate the developer's
access to the GPU power, can be applied in many domains with
compute-intensive needs, to open GPU use to a larger community.
+ +\putbib[Chapters/chapter17/biblio] + diff --git a/BookGPU/Chapters/chapter17/code/collem_kernel_diffuse.cl b/BookGPU/Chapters/chapter17/code/collem_kernel_diffuse.cl new file mode 100644 index 0000000..93de319 --- /dev/null +++ b/BookGPU/Chapters/chapter17/code/collem_kernel_diffuse.cl @@ -0,0 +1,22 @@ +#include "collem_structures.h" + +kernel void diffuse( + global CollemWorld *world, + global int *populations, + global int *overflows, + global int *newPopulations) +{ + const int i = get_global_id(0); + const int j = get_global_id(1); + + // Compute the population to diffuse for this cell + OVERFLOW(world, overflows, i, j) = CELL(world, populations, i, j) * world->diffusionRate; + + // _syncthreads() in CUDA + barrier(CLK_GLOBAL_MEM_FENCE); + + // Retrieve neighbors surplus and add them to the current cell population + int surplus = OVERFLOW(world, overflows, i + 1, j) + OVERFLOW(world, overflows, i - 1, j) + OVERFLOW(world, overflows, i, j - 1) + OVERFLOW(world, overflows, i, j + 1); + + CELL(world, newPopulations, i, j) = CELL(world, populations, i, j) + surplus / 8.0 - OVERFLOW(world, overflows, i, j); +} diff --git a/BookGPU/Chapters/chapter17/code/collem_kernel_reduc.cl b/BookGPU/Chapters/chapter17/code/collem_kernel_reduc.cl new file mode 100644 index 0000000..609a2b2 --- /dev/null +++ b/BookGPU/Chapters/chapter17/code/collem_kernel_reduc.cl @@ -0,0 +1,18 @@ +#include "collem_structures.h" + +kernel void reduce( + global CollemWorld *world, + global int *populations, + global int *patchesOwners, + global int *plotsPopulations) +{ + const int i = get_global_id(0); + const int j = get_global_id(1); + + // Retrieve the owner parcel + const int plotId = CELL(world, patchesOwners, i, j); + if (plotId == -1) return; + + // Add cell population to the owner parcel population + atomic_add(plotsPopulations + plotId, CELL(world, populations, i, j)); +} diff --git a/BookGPU/Chapters/chapter17/code/collem_kernels.cl b/BookGPU/Chapters/chapter17/code/collem_kernels.cl new file mode 100644 index 0000000..5f019ee --- /dev/null +++ b/BookGPU/Chapters/chapter17/code/collem_kernels.cl @@ -0,0 +1,39 @@ +#include "collem_structures.h" + +kernel void reduce( + global CollemWorld *world, + global int *populations, + global int *patchesOwners, + global int *plotsPopulations) +{ + const int i = get_global_id(0); + const int j = get_global_id(1); + + // Retrieve the owner parcel + const int plotId = CELL(world, patchesOwners, i, j); + if (plotId == -1) return; + + // Add cell population to the owner parcel population + atomic_add(plotsPopulations + plotId, CELL(world, populations, i, j)); +} + +kernel void diffuse( + global CollemWorld *world, + global int *populations, + global int *overflows, + global int *newPopulations) +{ + const int i = get_global_id(0); + const int j = get_global_id(1); + + // Compute the population to diffuse for this cell + OVERFLOW(world, overflows, i, j) = CELL(world, populations, i, j) * world->diffusionRate; + + // _syncthreads() in CUDA + barrier(CLK_GLOBAL_MEM_FENCE); + + // Retrieve neighbors surplus and add them to the current cell population + int surplus = OVERFLOW(world, overflows, i + 1, j) + OVERFLOW(world, overflows, i - 1, j) + OVERFLOW(world, overflows, i, j - 1) + OVERFLOW(world, overflows, i, j + 1); + + CELL(world, newPopulations, i, j) = CELL(world, populations, i, j) + surplus / 8.0 - OVERFLOW(world, overflows, i, j); +} diff --git a/BookGPU/Chapters/chapter17/code/collem_launcher.java b/BookGPU/Chapters/chapter17/code/collem_launcher.java new file mode 
100644 index 0000000..dad5e54 --- /dev/null +++ b/BookGPU/Chapters/chapter17/code/collem_launcher.java @@ -0,0 +1,30 @@ +public void run() { + newArrivalsKernel.setArguments(worldMem, patchesPopulationsMem, patchesOwnersMem, plotsPopulationsMem, plotsSurfacesMem); + reproduceKernel.setArguments(worldMem, patchesPopulationsMem, patchesOwnersMem, plotsPopulationsMem); + diffuseKernel.setArguments(worldMem, patchesPopulationsMem, patchesOverflowsMem, newPopulationsMem); + deathKernel.setArguments(worldMem, patchesPopulationsMem, patchesTypesMem); + + OCLEvent newArrivalFinished, reproduceFinished, diffuseFinished, copyFinished, deathFinished; + + int i = 1; + + OCLChrono totalTime = new OCLChrono("totalTime"); + totalTime.start(); + + while (true) { + newArrivalFinished = enqueueKernel(newArrivalsKernel); + reproduceFinished = enqueueKernel(reproduceKernel, newArrivalFinished); + diffuseFinished = enqueueKernel(diffuseKernel, reproduceFinished); + copyFinished = queue.enqueueCopyBuffer(newPopulationsMem, patchesPopulationsMem, 0L, 0L, patchesPopulationsMem.getSize(), diffuseFinished); + OCLEvent.waitFor(copyFinished); + deathFinished = enqueueKernel(deathKernel, copyFinished); + + queue.blockingReadBuffer(plotsPopulationsMem, Pointer.to(plotsPopulations), 0, plotsPopulationsMem.getSize(), deathFinished); + queue.blockingReadBuffer(patchesPopulationsMem, Pointer.to(patchesPopulations), 0, patchesPopulationsMem.getSize(), deathFinished); + queue.blockingReadBuffer(patchesOverflowsMem, Pointer.to(patchesOverflows), 0, patchesOverflowsMem.getSize(), deathFinished); + + i++; + + if (i == 100) break; + } + diff --git a/BookGPU/Chapters/chapter17/code/data_structures.cl b/BookGPU/Chapters/chapter17/code/data_structures.cl new file mode 100644 index 0000000..6c26382 --- /dev/null +++ b/BookGPU/Chapters/chapter17/code/data_structures.cl @@ -0,0 +1,29 @@ +// Microbial colony +typedef struct MM { + float x; + float y; + int carbon; + int dormancy; +} MM; + +// Carbon deposit +typedef struct OM { + float x; + float y; + int carbon; + int lock; +} OM; + +// MIOR Environment +typedef struct World { + int nbMM; + int nbOM; + int RA; // Action radius + float RR; // Breathing rate + float GR; // Growth rate + float K; // Decomposition rate + int width; // Size of the model + int minSize; // Minimal size for a colony + int CO2; // Total CO2 in the model + int lock; +} World; diff --git a/BookGPU/Chapters/chapter17/code/mior_kernels.cl b/BookGPU/Chapters/chapter17/code/mior_kernels.cl new file mode 100644 index 0000000..ac13507 --- /dev/null +++ b/BookGPU/Chapters/chapter17/code/mior_kernels.cl @@ -0,0 +1,40 @@ +kernel void simulate( + global MM allMMList[], + global OM allOMList[], + global World allWorlds[], + constant int allMMOffsets[], + constant int allOMOffsets[], + global const int allMMCSR[], + global const int allOMCSR[], + global int allParts[]) +{ + const int iSim = get_group_id(0); + const int i = get_local_id(0); + + global World * world = allWorlds + iSim; + global MM * mmList = allMMList + iSim * NB_MM; + global OM * omList = allOMList + iSim * NB_OM; + constant int * mmOffsets = allMMOffsets + iSim * NB_MM; + constant int * omOffsets = allOMOffsets + iSim * NB_OM; + global const int * mmCSR = allMMCSR; + global const int * omCSR = allOMCSR; + global int * parts = allParts + iSim * PARTS_SIZE; + + if (i < world->nbOM) { + scatter(omList, omOffsets, omCSR, parts, world); + } + + barrier(CLK_GLOBAL_MEM_FENCE); + + if (i < world->nbMM) { + live(mmList, mmOffsets, mmCSR, parts, world); + } + + 
barrier(CLK_GLOBAL_MEM_FENCE); + + if (i < world->nbOM) { + gather(omList, omOffsets, omCSR, parts, world); + } + + barrier(CLK_GLOBAL_MEM_FENCE); +} diff --git a/BookGPU/Chapters/chapter17/code/mior_launcher.java b/BookGPU/Chapters/chapter17/code/mior_launcher.java new file mode 100644 index 0000000..3b648eb --- /dev/null +++ b/BookGPU/Chapters/chapter17/code/mior_launcher.java @@ -0,0 +1,20 @@ +@Override +protected void doLiveImpl() { + simulateKernel.setArguments(mmMem, omMem, worldsMem, mmOffsetsMem, omOffsetsMem, mmCSRMem, omCSRMem, partsMem); + + if (blockSize < Math.max(nbOM, nbMM)) { + throw new RuntimeException("blockSize (" + blockSize + ") too small to execute the simulation"); + } + + OCLEvent event = queue.enqueue1DKernel(simulateKernel, nbSim * blockSize, blockSize); + + OCLEvent.waitFor(event); + OCLUtils.printEventStats("simulate", event); + + if (! isBatchModeEnabled()) { + queue.blockingReadBuffer(mmMem, mmList, 0, mmMem.getSize()); + queue.blockingReadBuffer(omMem, omList, 0, omMem.getSize()); + queue.blockingReadBuffer(worldsMem, worlds, 0, worldsMem.getSize()); + System.out.println("copy"); + } +} diff --git a/BookGPU/Chapters/chapter17/figs/algo_collem.pdf b/BookGPU/Chapters/chapter17/figs/algo_collem.pdf new file mode 100644 index 0000000..7d0f60d Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/algo_collem.pdf differ diff --git a/BookGPU/Chapters/chapter17/figs/algo_collem_en.odg b/BookGPU/Chapters/chapter17/figs/algo_collem_en.odg new file mode 100644 index 0000000..b2d252b Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/algo_collem_en.odg differ diff --git a/BookGPU/Chapters/chapter17/figs/collem_perfs.pdf b/BookGPU/Chapters/chapter17/figs/collem_perfs.pdf new file mode 100644 index 0000000..806c599 Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/collem_perfs.pdf differ diff --git a/BookGPU/Chapters/chapter17/figs/collem_perfs_gpuonly.pdf b/BookGPU/Chapters/chapter17/figs/collem_perfs_gpuonly.pdf new file mode 100644 index 0000000..265b5c1 Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/collem_perfs_gpuonly.pdf differ diff --git a/BookGPU/Chapters/chapter17/figs/collem_perfs_nothinkpad-crop.pdf b/BookGPU/Chapters/chapter17/figs/collem_perfs_nothinkpad-crop.pdf new file mode 100644 index 0000000..bb24171 Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/collem_perfs_nothinkpad-crop.pdf differ diff --git a/BookGPU/Chapters/chapter17/figs/collem_perfs_nothinkpad.pdf b/BookGPU/Chapters/chapter17/figs/collem_perfs_nothinkpad.pdf new file mode 100644 index 0000000..643ac42 Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/collem_perfs_nothinkpad.pdf differ diff --git a/BookGPU/Chapters/chapter17/figs/collem_perfs_zoom.pdf b/BookGPU/Chapters/chapter17/figs/collem_perfs_zoom.pdf new file mode 100644 index 0000000..4e7aa77 Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/collem_perfs_zoom.pdf differ diff --git a/BookGPU/Chapters/chapter17/figs/collem_perfs_zoom_10.pdf b/BookGPU/Chapters/chapter17/figs/collem_perfs_zoom_10.pdf new file mode 100644 index 0000000..3ff7a5e Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/collem_perfs_zoom_10.pdf differ diff --git a/BookGPU/Chapters/chapter17/figs/grid.pdf b/BookGPU/Chapters/chapter17/figs/grid.pdf new file mode 100644 index 0000000..13f9680 Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/grid.pdf differ diff --git a/BookGPU/Chapters/chapter17/figs/kernels.pdf b/BookGPU/Chapters/chapter17/figs/kernels.pdf new file mode 
100644 index 0000000..d987e2a Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/kernels.pdf differ diff --git a/BookGPU/Chapters/chapter17/figs/kernels.svg b/BookGPU/Chapters/chapter17/figs/kernels.svg new file mode 100644 index 0000000..bbdb520 --- /dev/null +++ b/BookGPU/Chapters/chapter17/figs/kernels.svg @@ -0,0 +1,251 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + Distribution(Nbom work-items) + + Live(Nbmm work-items) + Gathering(Nbom work-items) + barrier + + barrier + + diff --git a/BookGPU/Chapters/chapter17/figs/mior_perfs_8800gt.pdf b/BookGPU/Chapters/chapter17/figs/mior_perfs_8800gt.pdf new file mode 100644 index 0000000..7a5c53e Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/mior_perfs_8800gt.pdf differ diff --git a/BookGPU/Chapters/chapter17/figs/mior_perfs_tesla.pdf b/BookGPU/Chapters/chapter17/figs/mior_perfs_tesla.pdf new file mode 100644 index 0000000..3a6454d Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/mior_perfs_tesla.pdf differ diff --git a/BookGPU/Chapters/chapter17/figs/monokernel.pdf b/BookGPU/Chapters/chapter17/figs/monokernel.pdf new file mode 100644 index 0000000..27d1134 Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/monokernel.pdf differ diff --git a/BookGPU/Chapters/chapter17/figs/multikernel.pdf b/BookGPU/Chapters/chapter17/figs/multikernel.pdf new file mode 100644 index 0000000..21dc6f3 Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/multikernel.pdf differ diff --git a/BookGPU/Chapters/chapter17/figs/repartition.pdf b/BookGPU/Chapters/chapter17/figs/repartition.pdf new file mode 100644 index 0000000..47ea4c6 Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/repartition.pdf differ diff --git a/BookGPU/Chapters/chapter17/figs/sworm_fractal.pdf b/BookGPU/Chapters/chapter17/figs/sworm_fractal.pdf new file mode 100644 index 0000000..17726ec Binary files /dev/null and b/BookGPU/Chapters/chapter17/figs/sworm_fractal.pdf differ diff --git a/BookGPU/Chapters/chapter18/ch18.aux b/BookGPU/Chapters/chapter18/ch18.aux index b69311d..ab5fa84 100644 --- a/BookGPU/Chapters/chapter18/ch18.aux +++ b/BookGPU/Chapters/chapter18/ch18.aux @@ -2,54 +2,54 @@ \@writefile{toc}{\author{Rapha\IeC {\"e}l Couturier}{}} \@writefile{toc}{\author{Christophe Guyeux}{}} \@writefile{loa}{\addvspace {10\p@ }} -\@writefile{toc}{\contentsline {chapter}{\numberline {16}Pseudorandom Number Generator on GPU}{363}} +\@writefile{toc}{\contentsline {chapter}{\numberline {17}Pseudorandom Number Generator on GPU}{389}} \@writefile{lof}{\addvspace {10\p@ }} \@writefile{lot}{\addvspace {10\p@ }} -\newlabel{chapter18}{{16}{363}} -\@writefile{toc}{\contentsline {section}{\numberline {16.1}Introduction}{363}} -\@writefile{toc}{\contentsline {section}{\numberline {16.2}Basic Remindees}{365}} -\newlabel{section:BASIC RECALLS}{{16.2}{365}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.1}A Short Presentation of Chaos}{365}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.2}On Devaney's Definition of Chaos}{365}} -\newlabel{sec:dev}{{16.2.2}{365}} -\newlabel{Devaney}{{16.1}{365}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.2.3}Chaotic iterations}{366}} -\newlabel{subsection:Chaotic iterations}{{16.2.3}{366}} -\newlabel{Chaotic iterations}{{2}{366}} -\newlabel{eq:generalIC}{{16.4}{367}} -\newlabel{equation Oplus}{{16.5}{367}} -\@writefile{toc}{\contentsline {section}{\numberline {16.3}Toward Efficiency and Improvement for CI PRNG}{367}} 
-\newlabel{sec:efficient PRNG}{{16.3}{367}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{367}} -\newlabel{algo:seqCIPRNG}{{16.1}{367}} -\@writefile{lol}{\contentsline {lstlisting}{\numberline {16.1}C code of the sequential PRNG based on chaotic iterations}{367}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{368}} -\newlabel{sec:efficient PRNG gpu}{{16.3.2}{368}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.3}Naive Version for GPU}{368}} -\@writefile{loa}{\contentsline {algocf}{\numberline {16}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{369}} -\newlabel{algo:gpu_kernel}{{16}{369}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.4}Improved Version for GPU}{369}} -\newlabel{IR}{{17}{370}} -\@writefile{loa}{\contentsline {algocf}{\numberline {17}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{370}} -\newlabel{algo:gpu_kernel2}{{17}{370}} -\@writefile{toc}{\contentsline {subsection}{\numberline {16.3.5}Chaos Evaluation of the Improved Version}{370}} -\@writefile{toc}{\contentsline {section}{\numberline {16.4}Experiments}{371}} -\newlabel{sec:experiments}{{16.4}{371}} -\@writefile{toc}{\contentsline {section}{\numberline {16.5}Summary}{371}} -\@writefile{lof}{\contentsline {figure}{\numberline {16.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{372}} -\newlabel{fig:time_xorlike_gpu}{{16.1}{372}} -\@writefile{toc}{\contentsline {section}{Bibliography}{373}} +\newlabel{chapter18}{{17}{389}} +\@writefile{toc}{\contentsline {section}{\numberline {17.1}Introduction}{389}} +\@writefile{toc}{\contentsline {section}{\numberline {17.2}Basic Remindees}{391}} +\newlabel{section:BASIC RECALLS}{{17.2}{391}} +\@writefile{toc}{\contentsline {subsection}{\numberline {17.2.1}A Short Presentation of Chaos}{391}} +\@writefile{toc}{\contentsline {subsection}{\numberline {17.2.2}On Devaney's Definition of Chaos}{391}} +\newlabel{sec:dev}{{17.2.2}{391}} +\newlabel{Devaney}{{17.1}{391}} +\@writefile{toc}{\contentsline {subsection}{\numberline {17.2.3}Chaotic iterations}{392}} +\newlabel{subsection:Chaotic iterations}{{17.2.3}{392}} +\newlabel{Chaotic iterations}{{2}{392}} +\newlabel{eq:generalIC}{{17.4}{393}} +\newlabel{equation Oplus}{{17.5}{393}} +\@writefile{toc}{\contentsline {section}{\numberline {17.3}Toward Efficiency and Improvement for CI PRNG}{393}} +\newlabel{sec:efficient PRNG}{{17.3}{393}} +\@writefile{toc}{\contentsline {subsection}{\numberline {17.3.1}First Efficient Implementation of a PRNG based on Chaotic Iterations}{393}} +\newlabel{algo:seqCIPRNG}{{17.1}{393}} +\@writefile{lol}{\contentsline {lstlisting}{\numberline {17.1}C code of the sequential PRNG based on chaotic iterations}{393}} +\@writefile{toc}{\contentsline {subsection}{\numberline {17.3.2}Efficient PRNGs based on Chaotic Iterations on GPU}{394}} +\newlabel{sec:efficient PRNG gpu}{{17.3.2}{394}} +\@writefile{toc}{\contentsline {subsection}{\numberline {17.3.3}Naive Version for GPU}{394}} +\@writefile{loa}{\contentsline {algocf}{\numberline {17}{\ignorespaces Main kernel of the GPU ``naive'' version of the PRNG based on chaotic iterations\relax }}{395}} +\newlabel{algo:gpu_kernel}{{17}{395}} +\@writefile{toc}{\contentsline {subsection}{\numberline {17.3.4}Improved Version for 
GPU}{395}} +\newlabel{IR}{{18}{396}} +\@writefile{loa}{\contentsline {algocf}{\numberline {18}{\ignorespaces Main kernel for the chaotic iterations based PRNG GPU efficient version\relax }}{396}} +\newlabel{algo:gpu_kernel2}{{18}{396}} +\@writefile{toc}{\contentsline {subsection}{\numberline {17.3.5}Chaos Evaluation of the Improved Version}{396}} +\@writefile{toc}{\contentsline {section}{\numberline {17.4}Experiments}{397}} +\newlabel{sec:experiments}{{17.4}{397}} +\@writefile{toc}{\contentsline {section}{\numberline {17.5}Summary}{397}} +\@writefile{lof}{\contentsline {figure}{\numberline {17.1}{\ignorespaces Quantity of pseudorandom numbers generated per second with the xorlike-based PRNG\relax }}{398}} +\newlabel{fig:time_xorlike_gpu}{{17.1}{398}} +\@writefile{toc}{\contentsline {section}{Bibliography}{399}} \@setckpt{Chapters/chapter18/ch18}{ -\setcounter{page}{375} +\setcounter{page}{401} \setcounter{equation}{5} -\setcounter{enumi}{2} +\setcounter{enumi}{3} \setcounter{enumii}{0} \setcounter{enumiii}{0} \setcounter{enumiv}{17} \setcounter{footnote}{2} \setcounter{mpfootnote}{0} \setcounter{part}{6} -\setcounter{chapter}{16} +\setcounter{chapter}{17} \setcounter{section}{5} \setcounter{subsection}{0} \setcounter{subsubsection}{0} @@ -66,9 +66,9 @@ \setcounter{lstnumber}{15} \setcounter{ContinuedFloat}{0} \setcounter{AlgoLine}{14} -\setcounter{algocfline}{17} -\setcounter{algocfproc}{17} -\setcounter{algocf}{17} +\setcounter{algocfline}{18} +\setcounter{algocfproc}{18} +\setcounter{algocf}{18} \setcounter{nprt@mantissa@digitsbefore}{0} \setcounter{nprt@mantissa@digitsafter}{0} \setcounter{nprt@exponent@digitsbefore}{0} diff --git a/BookGPU/Makefile b/BookGPU/Makefile index 9956a25..049fb5b 100644 --- a/BookGPU/Makefile +++ b/BookGPU/Makefile @@ -21,6 +21,7 @@ all: bibtex bu14 bibtex bu15 bibtex bu16 + bibtex bu17 makeindex ${BOOK}.idx pdflatex ${BOOK}