From 78af750e886b323992c521d8da375023e318aaad Mon Sep 17 00:00:00 2001 From: Martin Quinson Date: Sat, 21 Jan 2023 21:25:46 +0100 Subject: [PATCH] Cosmetics in an example and integrate it to the doc --- docs/source/Modeling_howtos.rst | 4 ++ examples/README.rst | 20 +++++++- .../cpp/comm-failure/s4u-comm-failure.cpp | 50 ++++++------------- .../cpp/comm-failure/s4u-comm-failure.tesh | 10 ++-- examples/python/comm-failure/comm-failure.py | 15 +++--- .../python/comm-failure/comm-failure.tesh | 30 ++++++----- 6 files changed, 61 insertions(+), 68 deletions(-) diff --git a/docs/source/Modeling_howtos.rst b/docs/source/Modeling_howtos.rst index f6589f7c5c..452a513e7e 100644 --- a/docs/source/Modeling_howtos.rst +++ b/docs/source/Modeling_howtos.rst @@ -93,6 +93,10 @@ Another possibility is to use the functions take a profile, that can be a fixed profile exhaustively listing the events, or something else if you wish. +For further reading, you could turn to :ref:`this example <s4u_ex_comm_failure>` +on how to react to communication failures, or :ref:`this one <s4u_ex_platform_state_profile>` +on how to attach a state profile to hosts and react to execution failures. + .. _howto_multicore: Modeling multicore machines diff --git a/examples/README.rst b/examples/README.rst index 16b544b295..e8594842cf 100644 --- a/examples/README.rst +++ b/examples/README.rst @@ -430,6 +430,21 @@ The ``test_any()`` returns whether at least one activity of the set has complete See also :py:func:`simgrid.Comm.test_any()`. +.. _s4u_ex_comm_failure: + +Dealing with network failures +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This example shows how to survive network exceptions that occur when a link is turned off. In this case, any blocking operation +such as ``put``, ``get`` or ``wait`` will raise an exception that you can catch and react to. See also :ref:`howto_churn` and +:ref:`this example <s4u_ex_platform_state_profile>` on how to attach a state profile to hosts and react to execution failures. + +.. tabs:: + + .. 
example-tab:: examples/cpp/comm-failure/s4u-comm-failure.cpp + + .. example-tab:: examples/python/comm-failure/comm-failure.py + .. _s4u_ex_comm_host2host: Direct host-to-host communication @@ -754,11 +769,14 @@ Shows how to filter the actors that match given criteria. Profiles -------- +.. _s4u_ex_platform_state_profile: + Specifying state profiles ^^^^^^^^^^^^^^^^^^^^^^^^^ Shows how to specify when the resources must be turned off and on again, and how to react to such -failures in your code. See also :ref:`howto_churn`. +failures in your code. See also :ref:`howto_churn` and :ref:`this example <s4u_ex_comm_failure>` +on how to react to communication failures. .. tabs:: diff --git a/examples/cpp/comm-failure/s4u-comm-failure.cpp b/examples/cpp/comm-failure/s4u-comm-failure.cpp index 5d3013a373..d7fc941125 100644 --- a/examples/cpp/comm-failure/s4u-comm-failure.cpp +++ b/examples/cpp/comm-failure/s4u-comm-failure.cpp @@ -3,16 +3,8 @@ /* This program is free software; you can redistribute it and/or modify it * under the terms of the license (GNU LGPL) which comes with this package. */ -/* This example shows how to serialize a set of communications going through a link +/* This example shows how to react to a failed communication, which occurs when a link is turned off. * - * As for the other asynchronous examples, the sender initiates all the messages it wants to send and - * pack the resulting simgrid::s4u::CommPtr objects in a vector. - * At the same time, the receiver starts receiving all messages asynchronously. Without serialization, - * all messages would be received at the same timestamp in the receiver. - * - * However, as they will be serialized in a link of the platform, the messages arrive 2 by 2. 
- * - * The sender then blocks until all ongoing communication terminate, using simgrid::s4u::Comm::wait_all() */ #include @@ -49,12 +41,11 @@ public: XBT_INFO("Wait any returned index %ld (comm to %s)", index, pending_comms.at(index)->get_mailbox()->get_cname()); } catch (const simgrid::NetworkFailureException&) { XBT_INFO("Sender has experienced a network failure exception, so it knows that something went wrong"); - XBT_INFO("Now it needs to figure out which of the two comms failed by looking at their state"); + XBT_INFO("Now it needs to figure out which of the two comms failed by looking at their state:"); + XBT_INFO(" Comm to %s has state: %s", comm1->get_mailbox()->get_cname(), comm1->get_state_str()); + XBT_INFO(" Comm to %s has state: %s", comm2->get_mailbox()->get_cname(), comm2->get_state_str()); } - XBT_INFO("Comm to %s has state: %s", comm1->get_mailbox()->get_cname(), comm1->get_state_str()); - XBT_INFO("Comm to %s has state: %s", comm2->get_mailbox()->get_cname(), comm2->get_state_str()); - try { comm1->wait(); } catch (const simgrid::NetworkFailureException& e) { @@ -67,14 +58,13 @@ public: }; class Receiver { - std::string mailbox_name; + sg4::Mailbox* mailbox; public: - explicit Receiver(const std::string& mailbox_name) : mailbox_name(mailbox_name) {} + explicit Receiver(const std::string& mailbox_name) : mailbox(sg4::Mailbox::by_name(mailbox_name)) {} void operator()() const { - auto mailbox = sg4::Mailbox::by_name(mailbox_name); XBT_INFO("Receiver posting a receive..."); try { mailbox->get(); @@ -85,23 +75,6 @@ public: } }; -class LinkKiller { - std::string link_name; - -public: - explicit LinkKiller(const std::string& link_name) : link_name(link_name) {} - - void operator()() const - { - auto link_to_kill = sg4::Link::by_name(link_name); - XBT_INFO("LinkKiller sleeping 10 seconds..."); - sg4::this_actor::sleep_for(10.0); - XBT_INFO("LinkKiller turning off link %s", link_to_kill->get_cname()); - link_to_kill->turn_off(); - XBT_INFO("LinkKiller 
killed. exiting"); - } -}; - int main(int argc, char** argv) { sg4::Engine engine(&argc, argv); @@ -118,9 +91,14 @@ int main(int argc, char** argv) zone->seal(); sg4::Actor::create("Sender", host1, Sender("mailbox2", "mailbox3")); - sg4::Actor::create("Receiver", host2, Receiver("mailbox2"))->daemonize(); - sg4::Actor::create("Receiver", host3, Receiver("mailbox3"))->daemonize(); - sg4::Actor::create("LinkKiller", host1, LinkKiller("linkto2"))->daemonize(); + sg4::Actor::create("Receiver", host2, Receiver("mailbox2")); + sg4::Actor::create("Receiver", host3, Receiver("mailbox3")); + + sg4::Actor::create("LinkKiller", host1, [](){ + sg4::this_actor::sleep_for(10.0); + XBT_INFO("Turning off link 'linkto2'"); + sg4::Link::by_name("linkto2")->turn_off(); + }); engine.run(); diff --git a/examples/cpp/comm-failure/s4u-comm-failure.tesh b/examples/cpp/comm-failure/s4u-comm-failure.tesh index c0b12090f4..f3dcc62823 100644 --- a/examples/cpp/comm-failure/s4u-comm-failure.tesh +++ b/examples/cpp/comm-failure/s4u-comm-failure.tesh @@ -1,19 +1,17 @@ #!/usr/bin/env tesh $ ${bindir:=.}/s4u-comm-failure "--log=root.fmt:[%10.6r]%e(%i:%a@%h)%e%m%n" -> [ 0.000000] (4:LinkKiller@Host1) LinkKiller sleeping 10 seconds... > [ 0.000000] (2:Receiver@Host2) Receiver posting a receive... > [ 0.000000] (3:Receiver@Host3) Receiver posting a receive... > [ 0.000000] (1:Sender@Host1) Initiating asynchronous send to mailbox2 > [ 0.000000] (1:Sender@Host1) Initiating asynchronous send to mailbox3 > [ 0.000000] (1:Sender@Host1) Calling wait_any.. -> [ 10.000000] (4:LinkKiller@Host1) LinkKiller turning off link linkto2 -> [ 10.000000] (4:LinkKiller@Host1) LinkKiller killed. 
exiting +> [ 10.000000] (4:LinkKiller@Host1) Turning off link 'linkto2' > [ 10.000000] (2:Receiver@Host2) Receiver has experience a network failure exception > [ 10.000000] (1:Sender@Host1) Sender has experienced a network failure exception, so it knows that something went wrong -> [ 10.000000] (1:Sender@Host1) Now it needs to figure out which of the two comms failed by looking at their state -> [ 10.000000] (1:Sender@Host1) Comm to mailbox2 has state: FAILED -> [ 10.000000] (1:Sender@Host1) Comm to mailbox3 has state: STARTED +> [ 10.000000] (1:Sender@Host1) Now it needs to figure out which of the two comms failed by looking at their state: +> [ 10.000000] (1:Sender@Host1) Comm to mailbox2 has state: FAILED +> [ 10.000000] (1:Sender@Host1) Comm to mailbox3 has state: STARTED > [ 10.000000] (1:Sender@Host1) Waiting on a FAILED comm raises an exception: 'Cannot wait for a failed communication' > [ 10.000000] (1:Sender@Host1) Wait for remaining comm, just to be nice > [ 16.494845] (3:Receiver@Host3) Receiver has received successfully! 
diff --git a/examples/python/comm-failure/comm-failure.py b/examples/python/comm-failure/comm-failure.py index 7367720379..91ec73d3b5 100644 --- a/examples/python/comm-failure/comm-failure.py +++ b/examples/python/comm-failure/comm-failure.py @@ -25,15 +25,15 @@ def sender(mailbox1_name: str, mailbox2_name: str) -> None: this_actor.info(f"Wait any returned index {index} (comm to {pending_comms[index].mailbox.name})") except NetworkFailureException: this_actor.info(f"Sender has experienced a network failure exception, so it knows that something went wrong") - this_actor.info(f"Now it needs to figure out which of the two comms failed by looking at their state") + this_actor.info(f"Now it needs to figure out which of the two comms failed by looking at their state:") - this_actor.info(f"Comm to {comm1.mailbox.name} has state: {comm1.state_str}") - this_actor.info(f"Comm to {comm2.mailbox.name} has state: {comm2.state_str}") + this_actor.info(f" Comm to {comm1.mailbox.name} has state: {comm1.state_str}") + this_actor.info(f" Comm to {comm2.mailbox.name} has state: {comm2.state_str}") try: comm1.wait() - except NetworkFailureException: - this_actor.info(f"Waiting on a FAILED comm raises an exception") + except NetworkFailureException as err: + this_actor.info(f"Waiting on a FAILED comm raises an exception: '{err}'") this_actor.info("Wait for remaining comm, just to be nice") pending_comms.pop(0) @@ -55,12 +55,9 @@ def receiver(mailbox_name: str) -> None: def link_killer(link_name: str) -> None: link_to_kill = Link.by_name(link_name) - this_actor.info("sleeping 10 seconds...") this_actor.sleep_for(10.0) - this_actor.info(f"turning off link {link_to_kill.name}") + this_actor.info(f"Turning off link '{link_to_kill.name}'") link_to_kill.turn_off() - this_actor.info("link killed. 
exiting") - def main(): e = Engine(sys.argv) diff --git a/examples/python/comm-failure/comm-failure.tesh b/examples/python/comm-failure/comm-failure.tesh index cc78b708b8..ee26ae8d2d 100644 --- a/examples/python/comm-failure/comm-failure.tesh +++ b/examples/python/comm-failure/comm-failure.tesh @@ -1,19 +1,17 @@ #!/usr/bin/env tesh $ ${pythoncmd:=python3} ${PYTHON_TOOL_OPTIONS:=} ${bindir:=.}/comm-failure.py "--log=root.fmt:[%10.6r]%e(%i:%a@%h)%e%m%n" ->[ 0.000000] (4:LinkKiller@Host2) sleeping 10 seconds... ->[ 0.000000] (2:Receiver-1@Host2) Receiver posting a receive (mailbox2)... ->[ 0.000000] (3:Receiver-2@Host3) Receiver posting a receive (mailbox3)... ->[ 0.000000] (1:Sender@Host1) Initiating asynchronous send to mailbox2 ->[ 0.000000] (1:Sender@Host1) Initiating asynchronous send to mailbox3 ->[ 0.000000] (1:Sender@Host1) Calling wait_any.. ->[ 10.000000] (4:LinkKiller@Host2) turning off link link_to_2 ->[ 10.000000] (4:LinkKiller@Host2) link killed. exiting ->[ 10.000000] (2:Receiver-1@Host2) Receiver has experience a network failure exception (mailbox2) ->[ 10.000000] (1:Sender@Host1) Sender has experienced a network failure exception, so it knows that something went wrong ->[ 10.000000] (1:Sender@Host1) Now it needs to figure out which of the two comms failed by looking at their state ->[ 10.000000] (1:Sender@Host1) Comm to mailbox2 has state: FAILED ->[ 10.000000] (1:Sender@Host1) Comm to mailbox3 has state: STARTED ->[ 10.000000] (1:Sender@Host1) Waiting on a FAILED comm raises an exception ->[ 10.000000] (1:Sender@Host1) Wait for remaining comm, just to be nice ->[ 16.494845] (3:Receiver-2@Host3) Receiver has received successfully (mailbox3)! +> [ 0.000000] (2:Receiver-1@Host2) Receiver posting a receive (mailbox2)... +> [ 0.000000] (3:Receiver-2@Host3) Receiver posting a receive (mailbox3)... 
+> [ 0.000000] (1:Sender@Host1) Initiating asynchronous send to mailbox2 +> [ 0.000000] (1:Sender@Host1) Initiating asynchronous send to mailbox3 +> [ 0.000000] (1:Sender@Host1) Calling wait_any.. +> [ 10.000000] (4:LinkKiller@Host2) Turning off link 'link_to_2' +> [ 10.000000] (2:Receiver-1@Host2) Receiver has experience a network failure exception (mailbox2) +> [ 10.000000] (1:Sender@Host1) Sender has experienced a network failure exception, so it knows that something went wrong +> [ 10.000000] (1:Sender@Host1) Now it needs to figure out which of the two comms failed by looking at their state: +> [ 10.000000] (1:Sender@Host1) Comm to mailbox2 has state: FAILED +> [ 10.000000] (1:Sender@Host1) Comm to mailbox3 has state: STARTED +> [ 10.000000] (1:Sender@Host1) Waiting on a FAILED comm raises an exception: 'Cannot wait for a failed communication' +> [ 10.000000] (1:Sender@Host1) Wait for remaining comm, just to be nice +> [ 16.494845] (3:Receiver-2@Host3) Receiver has received successfully (mailbox3)! -- 2.20.1