From 3718b81f92488f30d14590c0b69b13c3843604d5 Mon Sep 17 00:00:00 2001 From: Augustin Degomme Date: Mon, 14 Jun 2021 14:07:41 +0200 Subject: [PATCH] Add option smpi/errors-are-fatal to allow users to bypass MPI errors returned by SMPI. This basically sets the default errhandler to MPI_ERRORS_RETURN if set to false. --- ChangeLog | 9 ++++++--- docs/source/Configuring_SimGrid.rst | 10 ++++++++++ src/smpi/include/smpi_comm.hpp | 3 ++- src/smpi/include/smpi_config.hpp | 1 + src/smpi/include/smpi_win.hpp | 3 ++- src/smpi/internals/smpi_config.cpp | 4 +++- src/smpi/mpi/smpi_comm.cpp | 9 ++++++--- src/smpi/mpi/smpi_file.cpp | 2 +- 8 files changed, 31 insertions(+), 10 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6c779f06a5..861f3ed647 100644 --- a/ChangeLog +++ b/ChangeLog @@ -15,9 +15,12 @@ SMPI: - The default SMPI compiler flags are no more taken from the environment. They can be explicitly set through cmake parameters SMPI_C_FLAGS, SMPI_CXX_FLAGS, or SMPI_Fortran_FLAGS. - - New option: --cfg=smpi/finalization-barrier, which can be used to add - a barrier inside MPI_Finalize. This can help for some codes which cleanup - data attached to a process, but still used in other SMPI processes. + - New options: + --cfg=smpi/finalization-barrier: which can be used to add + a barrier inside MPI_Finalize. This can help for some codes which cleanup + data attached to a process, but still used in other SMPI processes. + --cfg=smpi/errors-are-fatal: True by default, behaves as if MPI_ERRORS_RETURN + is active when set to false, to keep going after a small error LUA: - Lua platform files are deprecated. Their support will be dropped after v3.31. 
diff --git a/docs/source/Configuring_SimGrid.rst b/docs/source/Configuring_SimGrid.rst index b59b43e505..b2718c006b 100644 --- a/docs/source/Configuring_SimGrid.rst +++ b/docs/source/Configuring_SimGrid.rst @@ -150,6 +150,7 @@ Existing Configuration Items - **smpi/cpu-threshold:** :ref:`cfg=smpi/cpu-threshold` - **smpi/display-allocs:** :ref:`cfg=smpi/display-allocs` - **smpi/display-timing:** :ref:`cfg=smpi/display-timing` +- **smpi/errors-are-fatal:** :ref:`cfg=smpi/errors-are-fatal` - **smpi/finalization-barrier:** :ref:`cfg=smpi/finalization-barrier` - **smpi/grow-injected-times:** :ref:`cfg=smpi/grow-injected-times` - **smpi/host-speed:** :ref:`cfg=smpi/host-speed` @@ -1325,6 +1326,15 @@ this option will add an explicit MPI_Barrier(MPI_COMM_WORLD) call inside the MPI_Finalize, so that all processes will terminate at almost the same point. It might affect the total timing by the cost of a barrier. +.. _cfg=smpi/errors-are-fatal: + +**Option** ``smpi/errors-are-fatal`` **default:** on + +By default, SMPI processes will crash if an MPI error code is returned. MPI allows +to explicitly set the MPI_ERRORS_RETURN errhandler to avoid this behaviour. Setting this flag +to off makes MPI_ERRORS_RETURN the default (for all concerned types and errhandlers). +This can ease debugging by letting the run continue past the first reported error. + .. _cfg=smpi/iprobe: Inject constant times for MPI_Iprobe diff --git a/src/smpi/include/smpi_comm.hpp b/src/smpi/include/smpi_comm.hpp index f54a743c8b..34881bc3df 100644 --- a/src/smpi/include/smpi_comm.hpp +++ b/src/smpi/include/smpi_comm.hpp @@ -13,6 +13,7 @@ #include "smpi_keyvals.hpp" #include "smpi_group.hpp" #include "smpi_topo.hpp" +#include "smpi_config.hpp" namespace simgrid{ namespace smpi{ @@ -36,7 +37,7 @@ class Comm : public F2C, public Keyval{ std::string name_; MPI_Info info_ = MPI_INFO_NULL; int id_; - MPI_Errhandler errhandler_ = MPI_ERRORS_ARE_FATAL; + MPI_Errhandler errhandler_ = _smpi_cfg_default_errhandler_is_error ? 
MPI_ERRORS_ARE_FATAL : MPI_ERRORS_RETURN;; MPI_Errhandler* errhandlers_ = nullptr; //for MPI_COMM_WORLD only public: diff --git a/src/smpi/include/smpi_config.hpp b/src/smpi/include/smpi_config.hpp index 903ebdb05c..60d81ed788 100644 --- a/src/smpi/include/smpi_config.hpp +++ b/src/smpi/include/smpi_config.hpp @@ -23,6 +23,7 @@ extern XBT_PRIVATE simgrid::config::Flag _smpi_cfg_iprobe_cpu_usage; extern XBT_PRIVATE simgrid::config::Flag _smpi_cfg_trace_call_use_absolute_path; extern XBT_PRIVATE simgrid::config::Flag _smpi_cfg_trace_call_location; extern XBT_PRIVATE simgrid::config::Flag _smpi_cfg_comp_adjustment_file; +extern XBT_PRIVATE simgrid::config::Flag _smpi_cfg_default_errhandler_is_error; #if HAVE_PAPI extern XBT_PRIVATE simgrid::config::Flag _smpi_cfg_papi_events_file; #endif diff --git a/src/smpi/include/smpi_win.hpp b/src/smpi/include/smpi_win.hpp index d7d1880055..b3994e443c 100644 --- a/src/smpi/include/smpi_win.hpp +++ b/src/smpi/include/smpi_win.hpp @@ -11,6 +11,7 @@ #include "smpi_errhandler.hpp" #include "smpi_f2c.hpp" #include "smpi_keyvals.hpp" +#include "smpi_config.hpp" #include #include @@ -41,7 +42,7 @@ class Win : public F2C, public Keyval { int mode_ = 0; // exclusive or shared lock bool allocated_; bool dynamic_; - MPI_Errhandler errhandler_ = MPI_ERRORS_ARE_FATAL; + MPI_Errhandler errhandler_ = _smpi_cfg_default_errhandler_is_error ? MPI_ERRORS_ARE_FATAL : MPI_ERRORS_RETURN; public: static std::unordered_map keyvals_; diff --git a/src/smpi/internals/smpi_config.cpp b/src/smpi/internals/smpi_config.cpp index 41f771eb47..b7cd540240 100644 --- a/src/smpi/internals/smpi_config.cpp +++ b/src/smpi/internals/smpi_config.cpp @@ -108,7 +108,9 @@ simgrid::config::Flag _smpi_cfg_comp_adjustment_file{"smpi/comp-adj } } }}; - + +simgrid::config::Flag _smpi_cfg_default_errhandler_is_error{ + "smpi/errors-are-fatal", "Whether MPI errors are fatal or just return. 
Default is true", true }; #if HAVE_PAPI simgrid::config::Flag _smpi_cfg_papi_events_file{"smpi/papi-events", "This switch enables tracking the specified counters with PAPI", ""}; diff --git a/src/smpi/mpi/smpi_comm.cpp b/src/smpi/mpi/smpi_comm.cpp index 017d010a79..5c23877104 100644 --- a/src/smpi/mpi/smpi_comm.cpp +++ b/src/smpi/mpi/smpi_comm.cpp @@ -583,9 +583,12 @@ MPI_Errhandler Comm::errhandler() errhandler_->ref(); return errhandler_; } else { - if(errhandlers_==nullptr) - return MPI_ERRORS_ARE_FATAL; - else { + if(errhandlers_==nullptr){ + if (_smpi_cfg_default_errhandler_is_error) + return MPI_ERRORS_ARE_FATAL; + else + return MPI_ERRORS_RETURN; + } else { if(errhandlers_[this->rank()] != MPI_ERRHANDLER_NULL) errhandlers_[this->rank()]->ref(); return errhandlers_[this->rank()]; diff --git a/src/smpi/mpi/smpi_file.cpp b/src/smpi/mpi/smpi_file.cpp index 4ad8f8f3b9..82c5e2fe5f 100644 --- a/src/smpi/mpi/smpi_file.cpp +++ b/src/smpi/mpi/smpi_file.cpp @@ -21,7 +21,7 @@ XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_io, smpi, "Logging specific to SMPI (RMA operations)"); -MPI_Errhandler SMPI_default_File_Errhandler = MPI_ERRORS_RETURN; +MPI_Errhandler SMPI_default_File_Errhandler = _smpi_cfg_default_errhandler_is_error ? MPI_ERRORS_ARE_FATAL : MPI_ERRORS_RETURN;; namespace simgrid{ namespace smpi{ -- 2.20.1