From 29d145c3e90c4d7a9dd744dae93e330b10724392 Mon Sep 17 00:00:00 2001 From: Martin Quinson Date: Wed, 9 Mar 2022 09:34:24 +0100 Subject: [PATCH] new: Actor::get_restart_count(): Returns the number of reboots that this actor did Fixes https://framagit.org/simgrid/simgrid/-/issues/11 (again) --- ChangeLog | 1 + docs/source/app_s4u.rst | 1 + .../platform-failures/platform-failures.tesh | 4 +-- .../s4u-platform-failures.tesh | 4 +-- .../platform-failures/platform-failures.tesh | 2 +- include/simgrid/s4u/Actor.hpp | 2 ++ src/kernel/actor/ActorImpl.cpp | 26 ++++++++++++------ src/kernel/actor/ActorImpl.hpp | 9 ++++++- src/s4u/s4u_Actor.cpp | 9 ++++++- src/surf/HostImpl.cpp | 11 +------- src/surf/sg_platf.cpp | 27 +++++-------------- .../s4u/monkey-semaphore/monkey-semaphore.cpp | 16 +++++------ 12 files changed, 57 insertions(+), 55 deletions(-) diff --git a/ChangeLog b/ChangeLog index bdc3888022..4789ea8872 100644 --- a/ChangeLog +++ b/ChangeLog @@ -28,6 +28,7 @@ S4U: - Reimplementation of barriers natively. Previously, they were implemented on top of s4u::Mutex and s4u::ConditionVariable. The new version should be faster (and can be used in the model-checker). + - Actor::get_restart_count(): Returns the number of reboots that this actor did. MSG: - MSG_barrier_destroy now expects a non-const msg_barrier parameter. diff --git a/docs/source/app_s4u.rst b/docs/source/app_s4u.rst index 26043015bb..082c49b97c 100644 --- a/docs/source/app_s4u.rst +++ b/docs/source/app_s4u.rst @@ -598,6 +598,7 @@ Reacting to the end of actors .. doxygenfunction:: simgrid::s4u::Actor::join() const .. doxygenfunction:: simgrid::s4u::Actor::join(double timeout) const .. doxygenfunction:: simgrid::s4u::Actor::set_auto_restart(bool autorestart) + .. doxygenfunction:: simgrid::s4u::Actor::get_restart_count() .. group-tab:: Python diff --git a/examples/c/platform-failures/platform-failures.tesh b/examples/c/platform-failures/platform-failures.tesh index b8be3a441f..54fae56b0e 100644 --- a/examples/c/platform-failures/platform-failures.tesh +++ b/examples/c/platform-failures/platform-failures.tesh @@ -5,7 +5,7 @@ p Testing a simple master/worker example application handling failures TCP cross ! output sort 19 $ ${bindir:=.}/c-platform-failures --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_failures.xml ${srcdir:=.}/../../cpp/platform-failures/s4u-platform-failures_d.xml --cfg=path:${srcdir} --cfg=network/crosstraffic:0 "--log=root.fmt:[%10.6r]%e(%i:%a@%h)%e%m%n" --log=res_cpu.t:verbose > [ 0.000000] (0:maestro@) Cannot launch actor 'worker' on failed host 'Fafard' -> [ 0.000000] (0:maestro@) Deployment includes some initially turned off Hosts ... nevermind. +> [ 0.000000] (0:maestro@) Starting actor worker(Fafard) failed because its host is turned off. > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process > [ 0.000000] (1:master@Tremblay) Send a message to worker-0 > [ 0.010309] (1:master@Tremblay) Send to worker-0 completed @@ -110,7 +110,7 @@ p Testing a simple master/worker example application handling failures. TCP cros ! output sort 19 $ ${bindir:=.}/c-platform-failures --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_failures.xml ${srcdir:=.}/../../cpp/platform-failures/s4u-platform-failures_d.xml --cfg=path:${srcdir} "--log=root.fmt:[%10.6r]%e(%i:%a@%h)%e%m%n" --log=res_cpu.t:verbose > [ 0.000000] (0:maestro@) Cannot launch actor 'worker' on failed host 'Fafard' -> [ 0.000000] (0:maestro@) Deployment includes some initially turned off Hosts ... nevermind. +> [ 0.000000] (0:maestro@) Starting actor worker(Fafard) failed because its host is turned off. > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process > [ 0.000000] (1:master@Tremblay) Send a message to worker-0 > [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 diff --git a/examples/cpp/platform-failures/s4u-platform-failures.tesh b/examples/cpp/platform-failures/s4u-platform-failures.tesh index eb0906d506..c9f5b4f06c 100644 --- a/examples/cpp/platform-failures/s4u-platform-failures.tesh +++ b/examples/cpp/platform-failures/s4u-platform-failures.tesh @@ -5,7 +5,7 @@ p Testing a simple master/worker example application handling failures TCP cross ! output sort 19 $ ${bindir:=.}/s4u-platform-failures --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_failures.xml ${srcdir:=.}/s4u-platform-failures_d.xml --cfg=path:${srcdir} --cfg=network/crosstraffic:0 "--log=root.fmt:[%10.6r]%e(%i:%a@%h)%e%m%n" --log=res_cpu.t:verbose > [ 0.000000] (0:maestro@) Cannot launch actor 'worker' on failed host 'Fafard' -> [ 0.000000] (0:maestro@) Deployment includes some initially turned off Hosts ... nevermind. +> [ 0.000000] (0:maestro@) Starting actor worker(Fafard) failed because its host is turned off. > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process > [ 0.000000] (1:master@Tremblay) Send a message to worker-0 > [ 0.000000] (7:sleeper@Lilibeth) Start sleeping... @@ -120,7 +120,7 @@ p Testing a simple master/worker example application handling failures. TCP cros ! output sort 19 $ ${bindir:=.}/s4u-platform-failures --log=xbt_cfg.thres:critical --log=no_loc ${platfdir}/small_platform_failures.xml ${srcdir:=.}/s4u-platform-failures_d.xml --cfg=path:${srcdir} "--log=root.fmt:[%10.6r]%e(%i:%a@%h)%e%m%n" --log=res_cpu.t:verbose > [ 0.000000] (0:maestro@) Cannot launch actor 'worker' on failed host 'Fafard' -> [ 0.000000] (0:maestro@) Deployment includes some initially turned off Hosts ... nevermind. +> [ 0.000000] (0:maestro@) Starting actor worker(Fafard) failed because its host is turned off. > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process > [ 0.000000] (1:master@Tremblay) Send a message to worker-0 > [ 0.000000] (2:worker@Tremblay) Waiting a message on worker-0 diff --git a/examples/python/platform-failures/platform-failures.tesh b/examples/python/platform-failures/platform-failures.tesh index 35bcba084f..4e7846da61 100644 --- a/examples/python/platform-failures/platform-failures.tesh +++ b/examples/python/platform-failures/platform-failures.tesh @@ -5,7 +5,7 @@ p Testing a simple master/workers example application handling failures ! output sort 19 $ ${pythoncmd:=python3} ${PYTHON_TOOL_OPTIONS:=} ${bindir:=.}/platform-failures.py ${platfdir}/small_platform_failures.xml ${srcdir:=.}/platform-failures_d.xml --log=xbt_cfg.thres:critical --log=no_loc --cfg=path:${srcdir} --cfg=network/crosstraffic:0 "--log=root.fmt:[%10.6r]%e(%i:%a@%h)%e%m%n" --log=res_cpu.t:verbose > [ 0.000000] (0:maestro@) Cannot launch actor 'worker' on failed host 'Fafard' -> [ 0.000000] (0:maestro@) Deployment includes some initially turned off Hosts ... nevermind. +> [ 0.000000] (0:maestro@) Starting actor worker(Fafard) failed because its host is turned off. > [ 0.000000] (1:master@Tremblay) Got 5 workers and 20 tasks to process > [ 0.000000] (1:master@Tremblay) Send a message to worker-0 > [ 0.000000] (7:sleeper@Lilibeth) Start sleeping... diff --git a/include/simgrid/s4u/Actor.hpp b/include/simgrid/s4u/Actor.hpp index 7b8b4291b8..b5a2251eb2 100644 --- a/include/simgrid/s4u/Actor.hpp +++ b/include/simgrid/s4u/Actor.hpp @@ -343,6 +343,8 @@ public: /** If set to true, the actor will automatically restart when its host reboots */ Actor* set_auto_restart(bool autorestart = true); + /** Returns the number of reboots that this actor did. Before the first reboot, this function returns 0. */ + int get_restart_count(); /** Add a function to the list of "on_exit" functions for the current actor. The on_exit functions are the functions * executed when your actor is killed. You should use them to free the data used by your actor. diff --git a/src/kernel/actor/ActorImpl.cpp b/src/kernel/actor/ActorImpl.cpp index 213a54de8b..2676839cf6 100644 --- a/src/kernel/actor/ActorImpl.cpp +++ b/src/kernel/actor/ActorImpl.cpp @@ -329,19 +329,13 @@ s4u::Actor* ActorImpl::restart() XBT_DEBUG("Restarting actor %s on %s", get_cname(), host_->get_cname()); // retrieve the arguments of the old actor - ProcessArg arg(host_, this); + ProcessArg args(host_, this); // kill the old actor context::Context::self()->get_actor()->kill(this); // start the new actor - ActorImplPtr actor = ActorImpl::create(arg.name, arg.code, arg.data, arg.host, nullptr); - actor->set_properties(arg.properties); - *actor->on_exit = std::move(*arg.on_exit); - actor->set_kill_time(arg.kill_time); - actor->set_auto_restart(arg.auto_restart); - - return actor->get_ciface(); + return create(&args)->get_ciface(); } void ActorImpl::suspend() @@ -494,6 +488,22 @@ ActorImplPtr ActorImpl::create(const std::string& name, const ActorCode& code, v return actor; } +ActorImplPtr ActorImpl::create(ProcessArg* args) +{ + actor::ActorImplPtr actor = actor::ActorImpl::create(args->name, args->code, nullptr, args->host, nullptr); + auto* naked_actor = actor.get(); + naked_actor->restart_count_ = args->restart_count_; + actor->set_properties(args->properties); + if (args->on_exit) + *actor->on_exit = *args->on_exit; + if (args->kill_time >= 0) + actor->set_kill_time(args->kill_time); + if (args->auto_restart) + actor->set_auto_restart(args->auto_restart); + if (args->daemon_) + actor->daemonize(); + return actor; +} void create_maestro(const std::function& code) { diff --git a/src/kernel/actor/ActorImpl.hpp b/src/kernel/actor/ActorImpl.hpp index d0a522a44d..e9dc43ff49 100644 --- a/src/kernel/actor/ActorImpl.hpp +++ b/src/kernel/actor/ActorImpl.hpp @@ -19,6 +19,7 @@ namespace simgrid { namespace kernel { namespace actor { +class ProcessArg; class XBT_PUBLIC ActorImpl : public xbt::PropertyHolder { s4u::Host* host_ = nullptr; /* the host on which the actor is running */ @@ -26,6 +27,7 @@ class XBT_PUBLIC ActorImpl : public xbt::PropertyHolder { aid_t ppid_ = -1; bool daemon_ = false; /* Daemon actors are automatically killed when the last non-daemon leaves */ bool auto_restart_ = false; + int restart_count_ = 0; unsigned stacksize_; // set to default value in constructor std::vector mailboxes; @@ -62,6 +64,7 @@ public: / after terminaison) */ bool has_to_auto_restart() const { return auto_restart_; } void set_auto_restart(bool autorestart) { auto_restart_ = autorestart; } + int get_restart_count() { return restart_count_; } void set_stacksize(unsigned stacksize) { stacksize_ = stacksize; } unsigned get_stacksize() const { return stacksize_; } @@ -125,6 +128,7 @@ public: static ActorImplPtr create(const std::string& name, const ActorCode& code, void* data, s4u::Host* host, const ActorImpl* parent_actor); + static ActorImplPtr create(ProcessArg* args); static ActorImplPtr attach(const std::string& name, void* data, s4u::Host* host); static void detach(); void cleanup(); @@ -162,6 +166,7 @@ public: bool daemon_ = false; /* list of functions executed when the actor dies */ const std::shared_ptr>> on_exit; + int restart_count_ = 0; ProcessArg() = delete; ProcessArg(const ProcessArg&) = delete; @@ -169,7 +174,7 @@ public: explicit ProcessArg(const std::string& name, const std::function& code, void* data, s4u::Host* host, double kill_time, const std::unordered_map& properties, - bool auto_restart) + bool auto_restart, int restart_count) : name(name) , code(code) , data(data) @@ -177,6 +182,7 @@ public: , kill_time(kill_time) , properties(properties) , auto_restart(auto_restart) + , restart_count_(restart_count) { } @@ -189,6 +195,7 @@ public: , auto_restart(actor->has_to_auto_restart()) , daemon_(actor->is_daemon()) , on_exit(actor->on_exit) + , restart_count_(actor->get_restart_count() + 1) { } }; diff --git a/src/s4u/s4u_Actor.cpp b/src/s4u/s4u_Actor.cpp index ce00aafa29..751f333476 100644 --- a/src/s4u/s4u_Actor.cpp +++ b/src/s4u/s4u_Actor.cpp @@ -124,8 +124,11 @@ void Actor::join(double timeout) const Actor* Actor::set_auto_restart(bool autorestart) { + if (autorestart == pimpl_->has_to_auto_restart()) // not changed + return this; + kernel::actor::simcall_answered([this, autorestart]() { - xbt_assert(autorestart && not pimpl_->has_to_auto_restart()); // FIXME: handle all cases + xbt_assert(autorestart, "Asking an actor to stop being autorestart is not implemented yet. Ask us if you need it."); pimpl_->set_auto_restart(autorestart); auto* arg = new kernel::actor::ProcessArg(pimpl_->get_host(), pimpl_); @@ -134,6 +137,10 @@ Actor* Actor::set_auto_restart(bool autorestart) }); return this; } +int Actor::get_restart_count() +{ + return pimpl_->get_restart_count(); +} void Actor::on_exit(const std::function& fun) const { diff --git a/src/surf/HostImpl.cpp b/src/surf/HostImpl.cpp index 9dabe15b66..cb63f32b76 100644 --- a/src/surf/HostImpl.cpp +++ b/src/surf/HostImpl.cpp @@ -78,16 +78,7 @@ void HostImpl::turn_on() const { for (auto const& arg : actors_at_boot_) { XBT_DEBUG("Booting Actor %s(%s) right now", arg->name.c_str(), arg->host->get_cname()); - actor::ActorImplPtr actor = actor::ActorImpl::create(arg->name, arg->code, nullptr, arg->host, nullptr); - actor->set_properties(arg->properties); - if (arg->on_exit) - *actor->on_exit = *arg->on_exit; - if (arg->kill_time >= 0) - actor->set_kill_time(arg->kill_time); - if (arg->auto_restart) - actor->set_auto_restart(arg->auto_restart); - if (arg->daemon_) - actor->daemonize(); + actor::ActorImplPtr actor = actor::ActorImpl::create(arg); } } diff --git a/src/surf/sg_platf.cpp b/src/surf/sg_platf.cpp index 251c3ce2df..62ae0fd7cc 100644 --- a/src/surf/sg_platf.cpp +++ b/src/surf/sg_platf.cpp @@ -477,39 +477,26 @@ void sg_platf_new_actor(simgrid::kernel::routing::ActorCreationArgs* actor) simgrid::kernel::actor::ActorCode code = factory(std::move(actor->args)); auto* arg = new simgrid::kernel::actor::ProcessArg(actor_name, code, nullptr, host, kill_time, actor->properties, - auto_restart); + auto_restart, /*restart_count=*/0); host->get_impl()->add_actor_at_boot(arg); if (start_time > simgrid::s4u::Engine::get_clock()) { arg = new simgrid::kernel::actor::ProcessArg(actor_name, code, nullptr, host, kill_time, actor->properties, - auto_restart); + auto_restart, /*restart_count=*/0); - XBT_DEBUG("Process %s@%s will be started at time %f", arg->name.c_str(), arg->host->get_cname(), start_time); + XBT_DEBUG("Actor %s@%s will be started at time %f", arg->name.c_str(), arg->host->get_cname(), start_time); simgrid::kernel::timer::Timer::set(start_time, [arg, auto_restart]() { - simgrid::kernel::actor::ActorImplPtr new_actor = - simgrid::kernel::actor::ActorImpl::create(arg->name.c_str(), arg->code, arg->data, arg->host, nullptr); - new_actor->set_properties(arg->properties); - if (arg->kill_time >= 0) - new_actor->set_kill_time(arg->kill_time); - if (auto_restart) - new_actor->set_auto_restart(auto_restart); + simgrid::kernel::actor::ActorImplPtr new_actor = simgrid::kernel::actor::ActorImpl::create(arg); delete arg; }); } else { // start_time <= simgrid::s4u::Engine::get_clock() - XBT_DEBUG("Starting Process %s(%s) right now", arg->name.c_str(), host->get_cname()); + XBT_DEBUG("Starting actor %s(%s) right now", arg->name.c_str(), host->get_cname()); try { - simgrid::kernel::actor::ActorImplPtr new_actor = nullptr; - new_actor = simgrid::kernel::actor::ActorImpl::create(arg->name.c_str(), code, nullptr, host, nullptr); - new_actor->set_properties(arg->properties); - /* The actor creation will fail if the host is currently dead, but that's fine */ - if (arg->kill_time >= 0) - new_actor->set_kill_time(arg->kill_time); - if (auto_restart) - new_actor->set_auto_restart(auto_restart); + simgrid::kernel::actor::ActorImplPtr new_actor = simgrid::kernel::actor::ActorImpl::create(arg); } catch (simgrid::HostFailureException const&) { - XBT_WARN("Deployment includes some initially turned off Hosts ... nevermind."); + XBT_WARN("Starting actor %s(%s) failed because its host is turned off.", arg->name.c_str(), host->get_cname()); } } } diff --git a/teshsuite/s4u/monkey-semaphore/monkey-semaphore.cpp b/teshsuite/s4u/monkey-semaphore/monkey-semaphore.cpp index 3612aad873..de33292f1f 100644 --- a/teshsuite/s4u/monkey-semaphore/monkey-semaphore.cpp +++ b/teshsuite/s4u/monkey-semaphore/monkey-semaphore.cpp @@ -44,16 +44,14 @@ public: static void producer(SharedBuffer& buf) { - static bool inited = false; static int todo = cfg_item_count; // remaining amount of items to exchange SemStack to_release; - XBT_INFO("Producer %s", inited ? "rebooting" : "booting"); + bool rebooting = sg4::Actor::self()->get_restart_count() > 0; - if (not inited) { + XBT_INFO("Producer %s", rebooting ? "rebooting" : "booting"); + if (not rebooting) // Starting for the first time sg4::this_actor::on_exit( [](bool forcefully) { XBT_INFO("Producer dying %s.", forcefully ? "forcefully" : "peacefully"); }); - inited = true; - } while (todo > 0) { xbt_assert(sg4::Engine::get_clock() < cfg_deadline, @@ -80,14 +78,12 @@ static void producer(SharedBuffer& buf) static void consumer(const SharedBuffer& buf) { SemStack to_release; + bool rebooting = sg4::Actor::self()->get_restart_count() > 0; - static bool inited = false; - XBT_INFO("Consumer %s", inited ? "rebooting" : "booting"); - if (not inited) { + XBT_INFO("Consumer %s", rebooting ? "rebooting" : "booting"); + if (not rebooting) // Starting for the first time sg4::this_actor::on_exit( [](bool forcefully) { XBT_INFO("Consumer dying %s.", forcefully ? "forcefully" : "peacefully"); }); - inited = true; - } int item; do { -- 2.20.1