X-Git-Url: http://bilbo.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/blobdiff_plain/53f36dc53dfef67528460488c6d8ee3913153968..c972ddd5a74b83c183357f707e7dfba87d48e750:/src/smpi/internals/smpi_deployment.cpp diff --git a/src/smpi/internals/smpi_deployment.cpp b/src/smpi/internals/smpi_deployment.cpp index 5e74b69313..93c7e61443 100644 --- a/src/smpi/internals/smpi_deployment.cpp +++ b/src/smpi/internals/smpi_deployment.cpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2004-2019. The SimGrid Team. +/* Copyright (c) 2004-2021. The SimGrid Team. * All rights reserved. */ /* This program is free software; you can redistribute it and/or modify it @@ -10,7 +10,7 @@ #include "smpi_comm.hpp" #include -XBT_LOG_EXTERNAL_CATEGORY(smpi); +XBT_LOG_EXTERNAL_DEFAULT_CATEGORY(smpi); namespace simgrid { namespace smpi { @@ -20,11 +20,10 @@ static int universe_size = 0; class Instance { public: - Instance(const std::string& name, int max_no_processes, MPI_Comm comm) - : name_(name), size_(max_no_processes), comm_world_(comm) + Instance(int max_no_processes, MPI_Comm comm) : size_(max_no_processes), comm_world_(comm) { - MPI_Group group = new simgrid::smpi::Group(size_); - comm_world_ = new simgrid::smpi::Comm(group, nullptr, 0, -1); + auto* group = new simgrid::smpi::Group(size_); + comm_world_ = new simgrid::smpi::Comm(group, nullptr, false, -1); // FIXME : using MPI_Attr_put with MPI_UNIVERSE_SIZE is forbidden and we make it a no-op (which triggers a warning // as MPI_ERR_ARG is returned). Directly calling Comm::attr_put breaks for now, as MPI_UNIVERSE_SIZE,is <0 // instance.comm_world->attr_put(MPI_UNIVERSE_SIZE, reinterpret_cast(instance.size)); @@ -32,9 +31,7 @@ public: universe_size += max_no_processes; } - const std::string name_; unsigned int size_; - std::vector present_processes_; unsigned int finalized_ranks_ = 0; MPI_Comm comm_world_; }; @@ -44,10 +41,10 @@ public: using simgrid::smpi::app::Instance; -static std::map smpi_instances; +static std::map> smpi_instances; /** @ingroup smpi_simulation - * @brief Registers a running instance of a MPI program. + * @brief Registers a running instance of an MPI program. * * @param name the reference name of the function. * @param code either the main mpi function @@ -61,16 +58,15 @@ void SMPI_app_instance_register(const char *name, xbt_main_func_t code, int num_ if (code != nullptr) // When started with smpirun, we will not execute a function simgrid::s4u::Engine::get_instance()->register_function(name, code); - Instance instance(std::string(name), num_processes, MPI_COMM_NULL); + Instance instance(num_processes, MPI_COMM_NULL); smpi_instances.insert(std::pair(name, instance)); } -void smpi_deployment_register_process(const std::string& instance_id, int rank, simgrid::s4u::ActorPtr actor) +void smpi_deployment_register_process(const std::string& instance_id, int rank, const simgrid::s4u::Actor* actor) { - Instance& instance = smpi_instances.at(instance_id); - instance.present_processes_.push_back(actor); - instance.comm_world_->group()->set_mapping(actor, rank); + const Instance& instance = smpi_instances.at(instance_id); + instance.comm_world_->group()->set_mapping(actor->get_pid(), rank); } void smpi_deployment_unregister_process(const std::string& instance_id) @@ -79,7 +75,6 @@ void smpi_deployment_unregister_process(const std::string& instance_id) instance.finalized_ranks_++; if (instance.finalized_ranks_ == instance.size_) { - instance.present_processes_.clear(); simgrid::smpi::Comm::destroy(instance.comm_world_); smpi_instances.erase(instance_id); } @@ -97,9 +92,8 @@ MPI_Comm* smpi_deployment_comm_world(const std::string& instance_id) void smpi_deployment_cleanup_instances(){ for (auto const& item : smpi_instances) { - XBT_CINFO(smpi, "Stalling SMPI instance: %s. Do all your MPI ranks call MPI_Finalize()?", item.first.c_str()); + XBT_INFO("Stalling SMPI instance: %s. Do all your MPI ranks call MPI_Finalize()?", item.first.c_str()); Instance instance = item.second; - instance.present_processes_.clear(); simgrid::smpi::Comm::destroy(instance.comm_world_); } smpi_instances.clear(); @@ -109,3 +103,98 @@ int smpi_get_universe_size() { return simgrid::smpi::app::universe_size; } + +/** @brief Auxiliary method to get list of hosts to deploy app */ +static std::vector smpi_get_hosts(simgrid::s4u::Engine* e, const std::string& hostfile) +{ + if (hostfile == "") { + return e->get_all_hosts(); + } + std::vector hosts; + std::ifstream in(hostfile.c_str()); + xbt_assert(in, "smpirun: Cannot open the host file: %s", hostfile.c_str()); + std::string str; + while (std::getline(in, str)) { + if (not str.empty()) + hosts.emplace_back(e->host_by_name(str)); + } + xbt_assert(not hosts.empty(), "smpirun: the hostfile '%s' is empty", hostfile.c_str()); + return hosts; +} + +/** @brief Read replay configuration from file */ +static std::vector smpi_read_replay(const std::string& replayfile) +{ + std::vector replay; + if (replayfile == "") + return replay; + + std::ifstream in(replayfile.c_str()); + xbt_assert(in, "smpirun: Cannot open the replay file: %s", replayfile.c_str()); + std::string str; + while (std::getline(in, str)) { + if (not str.empty()) + replay.emplace_back(str); + } + + return replay; +} + +/** @brief Build argument vector to pass to process */ +static std::vector smpi_deployment_get_args(int rank_id, const std::vector& replay, int argc, + char* argv[]) +{ + std::vector args{std::to_string(rank_id)}; + // pass arguments to process only if not a replay execution + if (replay.empty()) { + for (int i = 0; i < argc; i++) { + args.push_back(argv[i]); + } + } + /* one trace per process */ + if (replay.size() > 1) { + args.push_back(replay[rank_id]); + } + return args; +} + +/** + * @brief Deploy an SMPI application from a smpirun call + * + * This used to be done at smpirun script, parsing either the hostfile or the platform XML. + * If hostfile isn't provided, get the list of hosts from engine. + */ +int smpi_deployment_smpirun(simgrid::s4u::Engine* e, const std::string& hostfile, int np, const std::string& replayfile, + int map, int argc, char* argv[]) +{ + auto hosts = smpi_get_hosts(e, hostfile); + auto replay = smpi_read_replay(replayfile); + int hosts_size = static_cast(hosts.size()); + if (np == 0) + np = hosts_size; + + xbt_assert(np > 0, "Invalid number of process (np must be > 0). Check your np parameter, platform or hostfile"); + + if (np > hosts_size) { + XBT_INFO("You requested to use %d ranks, but there is only %d processes in your hostfile...", np, hosts_size); + } + + for (int i = 0; i < np; i++) { + simgrid::s4u::Host* host = hosts[i % hosts_size]; + std::string rank_id = std::to_string(i); + auto args = smpi_deployment_get_args(i, replay, argc, argv); + auto actor = simgrid::s4u::Actor::create(rank_id, host, rank_id, args); + /* keeping the same behavior as done in smpirun script, print mapping rank/process */ + if (map != 0) { + XBT_INFO("[rank %d] -> %s", i, host->get_cname()); + } + actor->set_property("instance_id", "smpirun"); + actor->set_property("rank", rank_id); + if (not replay.empty()) + actor->set_property("smpi_replay", "true"); + /* shared trace file, set it to rank 0 */ + if (i == 0 && replay.size() == 1) + actor->set_property("tracefile", replay[0]); + } + return np; +}