1 /* Copyright (c) 2004-2023. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "smpi_host.hpp"
9 #include "simgrid/s4u/Engine.hpp"
10 #include "simgrid/s4u/Barrier.hpp"
11 #include "smpi_comm.hpp"
14 XBT_LOG_EXTERNAL_DEFAULT_CATEGORY(smpi);
16 namespace simgrid::smpi::app {
18 static int universe_size = 0;
22 explicit Instance(int max_no_processes) : size_(max_no_processes)
24 auto* group = new simgrid::smpi::Group(size_);
25 comm_world_ = new simgrid::smpi::Comm(group, nullptr, false, -1);
26 universe_size += max_no_processes;
27 bar_ = s4u::Barrier::create(size_);
31 unsigned int finalized_ranks_ = 0;
34 } // namespace simgrid::smpi::app
36 using simgrid::smpi::app::Instance;
38 static std::map<std::string, Instance, std::less<>> smpi_instances;
40 /** @ingroup smpi_simulation
41 * @brief Registers a running instance of an MPI program.
43 * @param name the reference name of the function.
44 * @param code either the main mpi function
45 * (must have a int ..(int argc, char *argv[]) prototype) or nullptr
46 * (if the function deployment is managed somewhere else —
47 * e.g., when deploying manually or using smpirun)
48 * @param num_processes the size of the instance we want to deploy
50 void SMPI_app_instance_register(const char *name, xbt_main_func_t code, int num_processes)
52 if (code != nullptr) // When started with smpirun, we will not execute a function
53 simgrid::s4u::Engine::get_instance()->register_function(name, code);
55 smpi_instances.try_emplace(name, num_processes);
57 void SMPI_app_instance_start(const char* name, const std::function<void()>& code,
58 std::vector<simgrid::s4u::Host*> const& hosts)
60 xbt_assert(hosts.size(), "Cannot start a SMPI instance on 0 hosts");
61 smpi_instances.try_emplace(name, hosts.size());
64 for (auto* host : hosts) {
65 auto rank_str = std::to_string(rank);
66 std::string actor_name = std::string(name) + "#" + rank_str;
67 auto actor = simgrid::s4u::Actor::create(actor_name, host, code);
68 actor->set_property("instance_id", name);
69 actor->set_property("rank", rank_str);
70 smpi_deployment_register_process(name, rank, actor.get());
76 void smpi_deployment_register_process(const std::string& instance_id, int rank, const simgrid::s4u::Actor* actor)
78 const Instance& instance = smpi_instances.at(instance_id);
79 instance.comm_world_->group()->set_mapping(actor->get_pid(), rank);
82 void smpi_deployment_startup_barrier(const std::string& instance_id)
84 const Instance& instance = smpi_instances.at(instance_id);
85 instance.bar_->wait();
88 void smpi_deployment_unregister_process(const std::string& instance_id)
90 Instance& instance = smpi_instances.at(instance_id);
91 instance.finalized_ranks_++;
93 if (instance.finalized_ranks_ == instance.size_) {
94 simgrid::smpi::Comm::destroy(instance.comm_world_);
95 smpi_instances.erase(instance_id);
99 MPI_Comm* smpi_deployment_comm_world(const std::string& instance_id)
101 Instance& instance = smpi_instances.at(instance_id);
102 return &instance.comm_world_;
105 void smpi_deployment_cleanup_instances(){
106 for (auto const& [name, instance] : smpi_instances) {
107 XBT_INFO("Stalling SMPI instance: %s. Do all your MPI ranks call MPI_Finalize()?", name.c_str());
108 simgrid::smpi::Comm::destroy(instance.comm_world_);
110 smpi_instances.clear();
113 int smpi_get_universe_size()
115 return simgrid::smpi::app::universe_size;
118 /** @brief Auxiliary method to get list of hosts to deploy app */
119 static std::vector<simgrid::s4u::Host*> smpi_get_hosts(const simgrid::s4u::Engine* e, const std::string& hostfile)
121 if (hostfile == "") {
122 return e->get_all_hosts();
124 std::vector<simgrid::s4u::Host*> hosts;
125 std::ifstream in(hostfile.c_str());
126 xbt_assert(in, "smpirun: Cannot open the host file: %s", hostfile.c_str());
128 while (std::getline(in, str)) {
130 hosts.emplace_back(e->host_by_name(str));
132 xbt_assert(not hosts.empty(), "smpirun: the hostfile '%s' is empty", hostfile.c_str());
136 /** @brief Read replay configuration from file */
137 static std::vector<std::string> smpi_read_replay(const std::string& replayfile)
139 std::vector<std::string> replay;
140 if (replayfile == "")
143 std::ifstream in(replayfile.c_str());
144 xbt_assert(in, "smpirun: Cannot open the replay file: %s", replayfile.c_str());
146 while (std::getline(in, str)) {
148 replay.emplace_back(str);
/** @brief Build argument vector to pass to process
 *
 * The first argument is always the rank, as a string. What follows depends on the replay configuration:
 *  - no replay entry: every element of @p run_args is appended;
 *  - exactly one replay entry: nothing else is appended;
 *  - several replay entries: the entry at index @p rank_id is appended (one trace per process).
 *
 * @param rank_id  rank of the process for which the arguments are built
 * @param replay   content of the replay file (empty if this is not a replay execution)
 * @param run_args arguments given by the user for the application
 * @return the arguments of the process
 * @throws std::out_of_range if there are several replay entries but none at index @p rank_id
 */
static std::vector<std::string> smpi_deployment_get_args(int rank_id, const std::vector<std::string>& replay,
                                                         const std::vector<const char*>& run_args)
{
  std::vector<std::string> args{std::to_string(rank_id)};
  // pass arguments to process only if not a replay execution
  if (replay.empty())
    args.insert(args.end(), run_args.begin(), run_args.end());
  /* one trace per process */
  if (replay.size() > 1)
    args.emplace_back(replay.at(rank_id)); // at() and not []: the file may list fewer traces than there are ranks
  return args;
}
/**
 * @brief Deploy an SMPI application from a smpirun call
 *
 * This used to be done in the smpirun script, by parsing either the hostfile or the platform XML.
 * If no hostfile is provided, the list of hosts is retrieved from the engine.
 */
174 int smpi_deployment_smpirun(const simgrid::s4u::Engine* e, const std::string& hostfile, int np,
175 const std::string& replayfile, int map, const std::vector<const char*>& run_args)
177 auto hosts = smpi_get_hosts(e, hostfile);
178 auto replay = smpi_read_replay(replayfile);
179 int hosts_size = static_cast<int>(hosts.size());
183 xbt_assert(np > 0, "Invalid number of process (np must be > 0). Check your np parameter, platform or hostfile");
185 if (np > hosts_size) {
186 XBT_INFO("You requested to use %d ranks, but there is only %d processes in your hostfile...", np, hosts_size);
189 for (int i = 0; i < np; i++) {
190 simgrid::s4u::Host* host = hosts[i % hosts_size];
191 std::string rank_id = std::to_string(i);
192 auto args = smpi_deployment_get_args(i, replay, run_args);
193 auto actor = simgrid::s4u::Actor::create(rank_id, host, rank_id, args);
194 /* keeping the same behavior as done in smpirun script, print mapping rank/process */
196 XBT_INFO("[rank %d] -> %s", i, host->get_cname());
198 actor->set_property("instance_id", "smpirun");
199 actor->set_property("rank", rank_id);
200 if (not replay.empty())
201 actor->set_property("smpi_replay", "true");
202 /* shared trace file, set it to rank 0 */
203 if (i == 0 && replay.size() == 1)
204 actor->set_property("tracefile", replay[0]);