Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Check owner for MPI keyvals, and skip callbacks for dead actors.
author Arnaud Giersch <arnaud.giersch@univ-fcomte.fr>
Thu, 27 May 2021 12:14:46 +0000 (14:14 +0200)
committer Arnaud Giersch <arnaud.giersch@univ-fcomte.fr>
Mon, 27 Nov 2023 08:45:06 +0000 (09:45 +0100)
Fix a crash seen, for example, with PETSc.

This is still not entirely correct according to the MPI standard
which states that attributes and keys are local to the process.

The difficulty is that a single smpi::Comm is shared by all the
actors, and the attributes are cleared by whichever actor destroys
the comm last.

src/smpi/include/smpi_keyvals.hpp
src/smpi/mpi/smpi_keyvals.cpp

index 768bc51..015adfc 100644 (file)
@@ -6,11 +6,14 @@
 #ifndef SMPI_KEYVALS_HPP_INCLUDED
 #define SMPI_KEYVALS_HPP_INCLUDED
 
+#include "simgrid/s4u/Actor.hpp"
 #include "smpi/smpi.h"
 #include "xbt/asserts.h"
 
 #include <unordered_map>
 
+XBT_LOG_EXTERNAL_CATEGORY(smpi);
+
 struct smpi_delete_fn {
   MPI_Comm_delete_attr_function          *comm_delete_fn;
   MPI_Type_delete_attr_function          *type_delete_fn;
@@ -33,6 +36,7 @@ struct smpi_key_elem {
   smpi_copy_fn copy_fn;
   smpi_delete_fn delete_fn;
   void* extra_state;
+  aid_t actor_id;
   int refcount;
   bool deleted;
   bool delete_attr; // if true, xbt_free(attr) on delete: used by Fortran bindings
@@ -70,6 +74,7 @@ int Keyval::keyval_create(const smpi_copy_fn& copy_fn, const smpi_delete_fn& del
   value.copy_fn     = copy_fn;
   value.delete_fn   = delete_fn;
   value.extra_state = extra_state;
+  value.actor_id    = simgrid::s4u::this_actor::get_pid();
   value.refcount    = 0;
   value.deleted     = false;
   value.delete_attr = delete_attr;
@@ -87,6 +92,9 @@ template <typename T> int Keyval::keyval_free(int* keyval){
     return MPI_ERR_ARG;
 
   smpi_key_elem& elem = elem_it->second;
+  if (elem.actor_id != simgrid::s4u::this_actor::get_pid())
+    XBT_CWARN(smpi, "Asked to free keyval '%d' which is owned by actor #%ld", *keyval, elem.actor_id);
+
   elem.deleted        = true;
   if (elem.refcount == 0)
     T::keyvals_.erase(elem_it);
@@ -104,6 +112,9 @@ template <typename T> int Keyval::attr_delete(int keyval){
     return MPI_ERR_ARG;
 
   smpi_key_elem& elem = elem_it->second;
+  if (elem.actor_id != simgrid::s4u::this_actor::get_pid())
+    XBT_CWARN(smpi, "Asked to delete attribute for keyval '%d' which is owned by actor #%ld", keyval, elem.actor_id);
+
   int flag            = 0;
   if (int ret = call_deleter<T>((T*)this, elem, keyval, attr->second, &flag); ret != MPI_SUCCESS)
     return ret;
@@ -120,6 +131,9 @@ template <typename T> int Keyval::attr_get(int keyval, void* attr_value, int* fl
   if (auto elem_it = T::keyvals_.find(keyval); elem_it == T::keyvals_.end() || elem_it->second.deleted)
     return MPI_ERR_ARG;
 
+  if (elem_it->second.actor_id != simgrid::s4u::this_actor::get_pid())
+    XBT_CWARN(smpi, "Asked to get attribute for keyval '%d' which is owned by actor #%ld", keyval, elem_it->second.actor_id);
+
   if (auto attr = attributes().find(keyval); attr != attributes().end()) {
     *static_cast<void**>(attr_value) = attr->second;
     *flag=1;
@@ -135,6 +149,9 @@ template <typename T> int Keyval::attr_put(int keyval, void* attr_value){
     return MPI_ERR_ARG;
 
   smpi_key_elem& elem = elem_it->second;
+  if (elem.actor_id != simgrid::s4u::this_actor::get_pid())
+    XBT_CWARN(smpi, "Asked to put attribute for keyval '%d' which is owned by actor #%ld", keyval, elem.actor_id);
+
   if (auto [attr, inserted] = attributes().try_emplace(keyval, attr_value); inserted) {
     elem.refcount++;
   } else {
@@ -153,6 +170,9 @@ template <typename T> void Keyval::cleanup_attr(){
     auto elem_it = T::keyvals_.find(key);
     xbt_assert(elem_it != T::keyvals_.end());
     smpi_key_elem& elem = elem_it->second;
+    if (elem.actor_id != simgrid::s4u::this_actor::get_pid())
+      XBT_CWARN(smpi, "Delete attribute for keyval '%d' which is owned by actor #%ld", it.first, elem.actor_id);
+
     int flag            = 0;
     call_deleter<T>((T*)this, elem, key, value, &flag);
     elem.refcount--;
index b1c61a3..e60a2b2 100644 (file)
@@ -9,9 +9,17 @@
 
 namespace simgrid::smpi {
 
+#define CHECK_OWNER(aid)                                                                                               \
+  if (not s4u::Actor::by_pid(aid)) {                                                                                   \
+    XBT_CWARN(smpi, "Actor #%ld not found. Skip delete callback.", aid);                                               \
+    return ret;                                                                                                        \
+  } else                                                                                                               \
+  (void)0
+
 template <> int Keyval::call_deleter<Comm>(Comm* obj, const smpi_key_elem& elem, int keyval, void* value, int* /*flag*/)
 {
   int ret = MPI_SUCCESS;
+  CHECK_OWNER(elem.actor_id);
   if (elem.delete_fn.comm_delete_fn != MPI_NULL_DELETE_FN)
     ret = elem.delete_fn.comm_delete_fn(obj, keyval, value, elem.extra_state);
   else if (elem.delete_fn.comm_delete_fn_fort != MPI_NULL_DELETE_FN)
@@ -24,6 +32,7 @@ template <> int Keyval::call_deleter<Comm>(Comm* obj, const smpi_key_elem& elem,
 template <> int Keyval::call_deleter<Win>(Win* obj, const smpi_key_elem& elem, int keyval, void* value, int* /*flag*/)
 {
   int ret = MPI_SUCCESS;
+  CHECK_OWNER(elem.actor_id);
   if (elem.delete_fn.win_delete_fn != MPI_NULL_DELETE_FN)
     ret = elem.delete_fn.win_delete_fn(obj, keyval, value, elem.extra_state);
   else if (elem.delete_fn.win_delete_fn_fort != MPI_NULL_DELETE_FN)
@@ -37,6 +46,7 @@ template <>
 int Keyval::call_deleter<Datatype>(Datatype* obj, const smpi_key_elem& elem, int keyval, void* value, int* /*flag*/)
 {
   int ret = MPI_SUCCESS;
+  CHECK_OWNER(elem.actor_id);
   if (elem.delete_fn.type_delete_fn != MPI_NULL_DELETE_FN)
     ret = elem.delete_fn.type_delete_fn(obj, keyval, value, elem.extra_state);
   else if (elem.delete_fn.type_delete_fn_fort != MPI_NULL_DELETE_FN)