src/smpi/mpi/smpi_win.cpp

   1 /* Copyright (c) 2007-2022. The SimGrid Team. All rights reserved.          */
   2
   3 /* This program is free software; you can redistribute it and/or modify it
   4  * under the terms of the license (GNU LGPL) which comes with this package. */
   5
   6 #include "smpi_win.hpp"
   7
   8 #include "private.hpp"
   9 #include "smpi_coll.hpp"
  10 #include "smpi_comm.hpp"
  11 #include "smpi_datatype.hpp"
  12 #include "smpi_info.hpp"
  13 #include "smpi_keyvals.hpp"
  14 #include "smpi_request.hpp"
  15 #include "src/smpi/include/smpi_actor.hpp"
  16
  17 #include <algorithm>
  18
  19 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_rma, smpi, "Logging specific to SMPI (RMA operations)");
  20
  21 #define CHECK_RMA_REMOTE_WIN(fun, win)\
  22   if(target_count*target_datatype->get_extent()>win->size_){\
  23     XBT_WARN("%s: Trying to move %zd, which exceeds the window size on target process %d : %zd - Bailing out.",\
  24     fun, target_count*target_datatype->get_extent(), target_rank, win->size_);\
  25     simgrid::smpi::utils::set_current_buffer(1,"win_base",win->base_);\
  26     return MPI_ERR_RMA_RANGE;\
  27   }
  28
  29 #define CHECK_WIN_LOCKED(win)                                                                                          \
  30   if (opened_ == 0) { /*check that post/start has been done*/                                                          \
  31     bool locked = std::any_of(begin(win->lockers_), end(win->lockers_), [this](int it) { return it == this->rank_; }); \
  32     if (not locked)                                                                                                    \
  33       return MPI_ERR_WIN;                                                                                              \
  34   }
  35
  36 namespace simgrid{
  37 namespace smpi{
  38 std::unordered_map<int, smpi_key_elem> Win::keyvals_;
  39 int Win::keyval_id_=0;
  40
  41 Win::Win(void* base, MPI_Aint size, int disp_unit, MPI_Info info, MPI_Comm comm, bool allocated, bool dynamic)
  42     : base_(base)
  43     , size_(size)
  44     , disp_unit_(disp_unit)
  45     , info_(info)
  46     , comm_(comm)
  47     , connected_wins_(comm->size())
  48     , rank_(comm->rank())
  49     , allocated_(allocated)
  50     , dynamic_(dynamic)
  51 {
  52   XBT_DEBUG("Creating window");
  53   if(info!=MPI_INFO_NULL)
  54     info->ref();
  55   connected_wins_[rank_] = this;
  56   errhandler_->ref();
  57   comm->add_rma_win(this);
  58   comm->ref();
  59
  60   colls::allgather(&connected_wins_[rank_], sizeof(MPI_Win), MPI_BYTE, connected_wins_.data(), sizeof(MPI_Win),
  61                    MPI_BYTE, comm);
  62
  63   colls::barrier(comm);
  64   this->add_f();
  65 }
  66
  67 Win::~Win(){
  68   //As per the standard, perform a barrier to ensure every async comm is finished
  69   colls::barrier(comm_);
  70   flush_local_all();
  71
  72   if (info_ != MPI_INFO_NULL)
  73     simgrid::smpi::Info::unref(info_);
  74   if (errhandler_ != MPI_ERRHANDLER_NULL)
  75     simgrid::smpi::Errhandler::unref(errhandler_);
  76
  77   comm_->remove_rma_win(this);
  78
  79   colls::barrier(comm_);
  80   Comm::unref(comm_);
  81
  82   if (allocated_)
  83     xbt_free(base_);
  84
  85   F2C::free_f(this->f2c_id());
  86   cleanup_attr<Win>();
  87 }
  88
  89 int Win::attach(void* /*base*/, MPI_Aint size)
  90 {
  91   if (not(base_ == MPI_BOTTOM || base_ == nullptr))
  92     return MPI_ERR_ARG;
  93   base_ = nullptr; // actually the address will be given in the RMA calls, as being the disp.
  94   size_+=size;
  95   return MPI_SUCCESS;
  96 }
  97
  98 int Win::detach(const void* /*base*/)
  99 {
 100   base_=MPI_BOTTOM;
 101   size_=-1;
 102   return MPI_SUCCESS;
 103 }
 104
 105 void Win::get_name(char* name, int* length) const
 106 {
 107   *length = static_cast<int>(name_.length());
 108   if (not name_.empty()) {
 109     name_.copy(name, *length);
 110     name[*length] = '\0';
 111   }
 112 }
 113
 114 void Win::get_group(MPI_Group* group){
 115   if(comm_ != MPI_COMM_NULL){
 116     *group = comm_->group();
 117   } else {
 118     *group = MPI_GROUP_NULL;
 119   }
 120 }
 121
 122 MPI_Info Win::info()
 123 {
 124   return info_;
 125 }
 126
 127 int Win::rank() const
 128 {
 129   return rank_;
 130 }
 131
 132 MPI_Comm Win::comm() const
 133 {
 134   return comm_;
 135 }
 136
 137 MPI_Aint Win::size() const
 138 {
 139   return size_;
 140 }
 141
 142 void* Win::base() const
 143 {
 144   return base_;
 145 }
 146
 147 int Win::disp_unit() const
 148 {
 149   return disp_unit_;
 150 }
 151
 152 bool Win::dynamic() const
 153 {
 154   return dynamic_;
 155 }
 156
 157 void Win::set_info(MPI_Info info)
 158 {
 159   if (info_ != MPI_INFO_NULL)
 160     simgrid::smpi::Info::unref(info_);
 161   info_ = info;
 162   if (info_ != MPI_INFO_NULL)
 163     info_->ref();
 164 }
 165
 166 void Win::set_name(const char* name){
 167   name_ = name;
 168 }
 169
 170 int Win::fence(int assert)
 171 {
 172   XBT_DEBUG("Entering fence");
 173   opened_++;
 174   if (not (assert & MPI_MODE_NOPRECEDE)) {
 175     // This is not the first fence => finalize what came before
 176     colls::barrier(comm_);
 177     flush_local_all();
 178     count_=0;
 179   }
 180
 181   if (assert & MPI_MODE_NOSUCCEED) // there should be no ops after this one, tell we are closed.
 182     opened_=0;
 183   assert_ = assert;
 184   colls::barrier(comm_);
 185   XBT_DEBUG("Leaving fence");
 186
 187   return MPI_SUCCESS;
 188 }
 189
 190 int Win::put(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank,
 191               MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Request* request)
 192 {
 193   //get receiver pointer
 194   Win* recv_win = connected_wins_[target_rank];
 195
 196   CHECK_WIN_LOCKED(recv_win)
 197   CHECK_RMA_REMOTE_WIN("MPI_Put", recv_win)
 198
 199   void* recv_addr = static_cast<char*>(recv_win->base_) + target_disp * recv_win->disp_unit_;
 200
 201   if (target_rank != rank_) { // This is not for myself, so we need to send messages
 202     XBT_DEBUG("Entering MPI_Put to remote rank %d", target_rank);
 203     // prepare send_request
 204     MPI_Request sreq =
 205         Request::rma_send_init(origin_addr, origin_count, origin_datatype, rank_, target_rank, SMPI_RMA_TAG + 1, comm_,
 206                                MPI_OP_NULL);
 207
 208     //prepare receiver request
 209     MPI_Request rreq = Request::rma_recv_init(recv_addr, target_count, target_datatype, rank_, target_rank,
 210                                               SMPI_RMA_TAG + 1, recv_win->comm_, MPI_OP_NULL);
 211
 212     //start send
 213     sreq->start();
 214
 215     if(request!=nullptr){
 216       *request=sreq;
 217     }else{
 218       mut_->lock();
 219       requests_.push_back(sreq);
 220       mut_->unlock();
 221     }
 222
 223     //push request to receiver's win
 224     recv_win->mut_->lock();
 225     recv_win->requests_.push_back(rreq);
 226     rreq->start();
 227     recv_win->mut_->unlock();
 228   } else {
 229     XBT_DEBUG("Entering MPI_Put from myself to myself, rank %d", target_rank);
 230     Datatype::copy(origin_addr, origin_count, origin_datatype, recv_addr, target_count, target_datatype);
 231     if(request!=nullptr)
 232       *request = MPI_REQUEST_NULL;
 233   }
 234
 235   return MPI_SUCCESS;
 236 }
 237
 238 int Win::get( void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank,
 239               MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Request* request)
 240 {
 241   //get sender pointer
 242   Win* send_win = connected_wins_[target_rank];
 243
 244   CHECK_WIN_LOCKED(send_win)
 245   CHECK_RMA_REMOTE_WIN("MPI_Get", send_win)
 246
 247   const void* send_addr = static_cast<void*>(static_cast<char*>(send_win->base_) + target_disp * send_win->disp_unit_);
 248   XBT_DEBUG("Entering MPI_Get from %d", target_rank);
 249
 250   if (target_rank != rank_) {
 251     //prepare send_request
 252     MPI_Request sreq = Request::rma_send_init(send_addr, target_count, target_datatype, target_rank, rank_,
 253                                               SMPI_RMA_TAG + 2, send_win->comm_, MPI_OP_NULL);
 254
 255     //prepare receiver request
 256     MPI_Request rreq = Request::rma_recv_init(origin_addr, origin_count, origin_datatype, target_rank, rank_,
 257                                               SMPI_RMA_TAG + 2, comm_, MPI_OP_NULL);
 258
 259     //start the send, with another process than us as sender.
 260     sreq->start();
 261     // push request to sender's win
 262     send_win->mut_->lock();
 263     send_win->requests_.push_back(sreq);
 264     send_win->mut_->unlock();
 265
 266     //start recv
 267     rreq->start();
 268
 269     if(request!=nullptr){
 270       *request=rreq;
 271     }else{
 272       mut_->lock();
 273       requests_.push_back(rreq);
 274       mut_->unlock();
 275     }
 276   } else {
 277     Datatype::copy(send_addr, target_count, target_datatype, origin_addr, origin_count, origin_datatype);
 278     if(request!=nullptr)
 279       *request=MPI_REQUEST_NULL;
 280   }
 281   return MPI_SUCCESS;
 282 }
 283
 284 int Win::accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank,
 285               MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPI_Request* request)
 286 {
 287   XBT_DEBUG("Entering MPI_Win_Accumulate");
 288   //get receiver pointer
 289   Win* recv_win = connected_wins_[target_rank];
 290
 291   //FIXME: local version
 292   CHECK_WIN_LOCKED(recv_win)
 293   CHECK_RMA_REMOTE_WIN("MPI_Accumulate", recv_win)
 294
 295   void* recv_addr = static_cast<char*>(recv_win->base_) + target_disp * recv_win->disp_unit_;
 296   XBT_DEBUG("Entering MPI_Accumulate to %d", target_rank);
 297   // As the tag will be used for ordering of the operations, subtract count from it (to avoid collisions with other
 298   // SMPI tags, SMPI_RMA_TAG is set below all the other ones we use)
 299   // prepare send_request
 300
 301   MPI_Request sreq = Request::rma_send_init(origin_addr, origin_count, origin_datatype, rank_, target_rank,
 302                                             SMPI_RMA_TAG - 3 - count_, comm_, op);
 303
 304   // prepare receiver request
 305   MPI_Request rreq = Request::rma_recv_init(recv_addr, target_count, target_datatype, rank_, target_rank,
 306                                             SMPI_RMA_TAG - 3 - count_, recv_win->comm_, op);
 307
 308   count_++;
 309
 310   // start send
 311   sreq->start();
 312   // push request to receiver's win
 313   recv_win->mut_->lock();
 314   recv_win->requests_.push_back(rreq);
 315   rreq->start();
 316   recv_win->mut_->unlock();
 317
 318   if (request != nullptr) {
 319     *request = sreq;
 320   } else {
 321     mut_->lock();
 322     requests_.push_back(sreq);
 323     mut_->unlock();
 324   }
 325
 326   // FIXME: The current implementation fails to ensure the correct ordering of the accumulate requests.  The following
 327   // 'flush' is a workaround to fix that.
 328   flush(target_rank);
 329   XBT_DEBUG("Leaving MPI_Win_Accumulate");
 330   return MPI_SUCCESS;
 331 }
 332
 333 int Win::get_accumulate(const void* origin_addr, int origin_count, MPI_Datatype origin_datatype, void* result_addr,
 334                         int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
 335                         int target_count, MPI_Datatype target_datatype, MPI_Op op, MPI_Request*)
 336 {
 337   //get sender pointer
 338   const Win* send_win = connected_wins_[target_rank];
 339
 340   CHECK_WIN_LOCKED(send_win)
 341   CHECK_RMA_REMOTE_WIN("MPI_Get_Accumulate", send_win)
 342
 343   XBT_DEBUG("Entering MPI_Get_accumulate from %d", target_rank);
 344   //need to be sure ops are correctly ordered, so finish request here ? slow.
 345   MPI_Request req = MPI_REQUEST_NULL;
 346   send_win->atomic_mut_->lock();
 347   get(result_addr, result_count, result_datatype, target_rank,
 348               target_disp, target_count, target_datatype, &req);
 349   if (req != MPI_REQUEST_NULL)
 350     Request::wait(&req, MPI_STATUS_IGNORE);
 351   if(op!=MPI_NO_OP)
 352     accumulate(origin_addr, origin_count, origin_datatype, target_rank,
 353               target_disp, target_count, target_datatype, op, &req);
 354   if (req != MPI_REQUEST_NULL)
 355     Request::wait(&req, MPI_STATUS_IGNORE);
 356   send_win->atomic_mut_->unlock();
 357   return MPI_SUCCESS;
 358 }
 359
 360 int Win::compare_and_swap(const void* origin_addr, const void* compare_addr, void* result_addr, MPI_Datatype datatype,
 361                           int target_rank, MPI_Aint target_disp)
 362 {
 363   //get sender pointer
 364   const Win* send_win = connected_wins_[target_rank];
 365
 366   CHECK_WIN_LOCKED(send_win)
 367
 368   XBT_DEBUG("Entering MPI_Compare_and_swap with %d", target_rank);
 369   MPI_Request req = MPI_REQUEST_NULL;
 370   send_win->atomic_mut_->lock();
 371   get(result_addr, 1, datatype, target_rank,
 372               target_disp, 1, datatype, &req);
 373   if (req != MPI_REQUEST_NULL)
 374     Request::wait(&req, MPI_STATUS_IGNORE);
 375   if (not memcmp(result_addr, compare_addr, datatype->get_extent())) {
 376     put(origin_addr, 1, datatype, target_rank,
 377               target_disp, 1, datatype);
 378   }
 379   send_win->atomic_mut_->unlock();
 380   return MPI_SUCCESS;
 381 }
 382
 383 int Win::start(MPI_Group group, int /*assert*/)
 384 {
 385   /* From MPI forum advices
 386   The call to MPI_WIN_COMPLETE does not return until the put call has completed at the origin; and the target window
 387   will be accessed by the put operation only after the call to MPI_WIN_START has matched a call to MPI_WIN_POST by
 388   the target process. This still leaves much choice to implementors. The call to MPI_WIN_START can block until the
 389   matching call to MPI_WIN_POST occurs at all target processes. One can also have implementations where the call to
 390   MPI_WIN_START is nonblocking, but the call to MPI_PUT blocks until the matching call to MPI_WIN_POST occurred; or
 391   implementations where the first two calls are nonblocking, but the call to MPI_WIN_COMPLETE blocks until the call
 392   to MPI_WIN_POST occurred; or even implementations where all three calls can complete before any target process
 393   called MPI_WIN_POST --- the data put must be buffered, in this last case, so as to allow the put to complete at the
 394   origin ahead of its completion at the target. However, once the call to MPI_WIN_POST is issued, the sequence above
 395   must complete, without further dependencies.  */
 396
 397   //naive, blocking implementation.
 398   XBT_DEBUG("Entering MPI_Win_Start");
 399   std::vector<MPI_Request> reqs;
 400   for (int i = 0; i < group->size(); i++) {
 401     int src = comm_->group()->rank(group->actor(i));
 402     xbt_assert(src != MPI_UNDEFINED);
 403     if (src != rank_)
 404       reqs.emplace_back(Request::irecv_init(nullptr, 0, MPI_CHAR, src, SMPI_RMA_TAG + 4, comm_));
 405   }
 406   int size = static_cast<int>(reqs.size());
 407
 408   Request::startall(size, reqs.data());
 409   Request::waitall(size, reqs.data(), MPI_STATUSES_IGNORE);
 410   for (auto& req : reqs)
 411     Request::unref(&req);
 412
 413   group->ref();
 414   dst_group_ = group;
 415   opened_++; // we're open for business !
 416   XBT_DEBUG("Leaving MPI_Win_Start");
 417   return MPI_SUCCESS;
 418 }
 419
 420 int Win::post(MPI_Group group, int /*assert*/)
 421 {
 422   //let's make a synchronous send here
 423   XBT_DEBUG("Entering MPI_Win_Post");
 424   std::vector<MPI_Request> reqs;
 425   for (int i = 0; i < group->size(); i++) {
 426     int dst = comm_->group()->rank(group->actor(i));
 427     xbt_assert(dst != MPI_UNDEFINED);
 428     if (dst != rank_)
 429       reqs.emplace_back(Request::send_init(nullptr, 0, MPI_CHAR, dst, SMPI_RMA_TAG + 4, comm_));
 430   }
 431   int size = static_cast<int>(reqs.size());
 432
 433   Request::startall(size, reqs.data());
 434   Request::waitall(size, reqs.data(), MPI_STATUSES_IGNORE);
 435   for (auto& req : reqs)
 436     Request::unref(&req);
 437
 438   group->ref();
 439   src_group_ = group;
 440   opened_++; // we're open for business !
 441   XBT_DEBUG("Leaving MPI_Win_Post");
 442   return MPI_SUCCESS;
 443 }
 444
 445 int Win::complete(){
 446   xbt_assert(opened_ != 0, "Complete called on already opened MPI_Win");
 447
 448   XBT_DEBUG("Entering MPI_Win_Complete");
 449   std::vector<MPI_Request> reqs;
 450   for (int i = 0; i < dst_group_->size(); i++) {
 451     int dst = comm_->group()->rank(dst_group_->actor(i));
 452     xbt_assert(dst != MPI_UNDEFINED);
 453     if (dst != rank_)
 454       reqs.emplace_back(Request::send_init(nullptr, 0, MPI_CHAR, dst, SMPI_RMA_TAG + 5, comm_));
 455   }
 456   int size = static_cast<int>(reqs.size());
 457
 458   XBT_DEBUG("Win_complete - Sending sync messages to %d processes", size);
 459   Request::startall(size, reqs.data());
 460   Request::waitall(size, reqs.data(), MPI_STATUSES_IGNORE);
 461   for (auto& req : reqs)
 462     Request::unref(&req);
 463
 464   flush_local_all();
 465
 466   opened_--; //we're closed for business !
 467   Group::unref(dst_group_);
 468   dst_group_ = MPI_GROUP_NULL;
 469   return MPI_SUCCESS;
 470 }
 471
 472 int Win::wait(){
 473   //naive, blocking implementation.
 474   XBT_DEBUG("Entering MPI_Win_Wait");
 475   std::vector<MPI_Request> reqs;
 476   for (int i = 0; i < src_group_->size(); i++) {
 477     int src = comm_->group()->rank(src_group_->actor(i));
 478     xbt_assert(src != MPI_UNDEFINED);
 479     if (src != rank_)
 480       reqs.emplace_back(Request::irecv_init(nullptr, 0, MPI_CHAR, src, SMPI_RMA_TAG + 5, comm_));
 481   }
 482   int size = static_cast<int>(reqs.size());
 483
 484   XBT_DEBUG("Win_wait - Receiving sync messages from %d processes", size);
 485   Request::startall(size, reqs.data());
 486   Request::waitall(size, reqs.data(), MPI_STATUSES_IGNORE);
 487   for (auto& req : reqs)
 488     Request::unref(&req);
 489
 490   flush_local_all();
 491
 492   opened_--; //we're closed for business !
 493   Group::unref(src_group_);
 494   src_group_ = MPI_GROUP_NULL;
 495   return MPI_SUCCESS;
 496 }
 497
 498 int Win::lock(int lock_type, int rank, int /*assert*/)
 499 {
 500   MPI_Win target_win = connected_wins_[rank];
 501
 502   if ((lock_type == MPI_LOCK_EXCLUSIVE && target_win->mode_ != MPI_LOCK_SHARED)|| target_win->mode_ == MPI_LOCK_EXCLUSIVE){
 503     target_win->lock_mut_->lock();
 504     target_win->mode_+= lock_type;//add the lock_type to differentiate case when we are switching from EXCLUSIVE to SHARED (no release needed in the unlock)
 505     if(lock_type == MPI_LOCK_SHARED){//the window used to be exclusive, it's now shared.
 506       target_win->lock_mut_->unlock();
 507    }
 508   } else if (not(target_win->mode_ == MPI_LOCK_SHARED && lock_type == MPI_LOCK_EXCLUSIVE))
 509     target_win->mode_ += lock_type; // don't set to exclusive if it's already shared
 510
 511   target_win->lockers_.push_back(rank_);
 512
 513   flush(rank);
 514   return MPI_SUCCESS;
 515 }
 516
 517 int Win::lock_all(int assert){
 518   int retval = MPI_SUCCESS;
 519   for (int i = 0; i < comm_->size(); i++) {
 520     int ret = this->lock(MPI_LOCK_SHARED, i, assert);
 521     if (ret != MPI_SUCCESS)
 522       retval = ret;
 523   }
 524   return retval;
 525 }
 526
 527 int Win::unlock(int rank){
 528   MPI_Win target_win = connected_wins_[rank];
 529   int target_mode = target_win->mode_;
 530   target_win->mode_= 0;
 531   target_win->lockers_.remove(rank_);
 532   if (target_mode==MPI_LOCK_EXCLUSIVE){
 533     target_win->lock_mut_->unlock();
 534   }
 535
 536   flush(rank);
 537   return MPI_SUCCESS;
 538 }
 539
 540 int Win::unlock_all(){
 541   int retval = MPI_SUCCESS;
 542   for (int i = 0; i < comm_->size(); i++) {
 543     int ret = this->unlock(i);
 544     if (ret != MPI_SUCCESS)
 545       retval = ret;
 546   }
 547   return retval;
 548 }
 549
 550 int Win::flush(int rank){
 551   int finished = finish_comms(rank);
 552   XBT_DEBUG("Win_flush on local %d for remote %d - Finished %d RMA calls", rank_, rank, finished);
 553   if (rank != rank_) {
 554     finished = connected_wins_[rank]->finish_comms(rank_);
 555     XBT_DEBUG("Win_flush on remote %d for local %d - Finished %d RMA calls", rank, rank_, finished);
 556   }
 557   return MPI_SUCCESS;
 558 }
 559
 560 int Win::flush_local(int rank){
 561   int finished = finish_comms(rank);
 562   XBT_DEBUG("Win_flush_local on local %d for remote %d - Finished %d RMA calls", rank_, rank, finished);
 563   return MPI_SUCCESS;
 564 }
 565
 566 int Win::flush_all(){
 567   int finished = finish_comms();
 568   XBT_DEBUG("Win_flush_all on local %d - Finished %d RMA calls", rank_, finished);
 569   for (int i = 0; i < comm_->size(); i++) {
 570     if (i != rank_) {
 571       finished = connected_wins_[i]->finish_comms(rank_);
 572       XBT_DEBUG("Win_flush_all on remote %d for local %d - Finished %d RMA calls", i, rank_, finished);
 573     }
 574   }
 575   return MPI_SUCCESS;
 576 }
 577
 578 int Win::flush_local_all(){
 579   int finished = finish_comms();
 580   XBT_DEBUG("Win_flush_local_all on local %d - Finished %d RMA calls", rank_, finished);
 581   return MPI_SUCCESS;
 582 }
 583
 584 Win* Win::f2c(int id){
 585   return static_cast<Win*>(F2C::f2c(id));
 586 }
 587
 588 int Win::finish_comms(){
 589   // This (simulated) mutex ensures that no process pushes to the vector of requests during the waitall.
 590   // Without this, the vector could get redimensioned when another process pushes.
 591   // This would result in the array used by Request::waitall() to be invalidated.
 592   // Another solution would be to copy the data and cleanup the vector *before* Request::waitall
 593   mut_->lock();
 594   //Finish own requests
 595   int size = static_cast<int>(requests_.size());
 596   if (size > 0) {
 597     MPI_Request* treqs = requests_.data();
 598     Request::waitall(size, treqs, MPI_STATUSES_IGNORE);
 599     requests_.clear();
 600   }
 601   mut_->unlock();
 602   return size;
 603 }
 604
 605 int Win::finish_comms(int rank){
 606   // See comment about the mutex in finish_comms() above
 607   mut_->lock();
 608   // Finish own requests
 609   // Let's see if we're either the destination or the sender of this request
 610   // because we only wait for requests that we are responsible for.
 611   // Also use the process id here since the request itself returns from src()
 612   // and dst() the process id, NOT the rank (which only exists in the context of a communicator).
 613   aid_t proc_id = comm_->group()->actor(rank);
 614   auto it     = std::stable_partition(begin(requests_), end(requests_), [proc_id](const MPI_Request& req) {
 615     return (req == MPI_REQUEST_NULL || (req->src() != proc_id && req->dst() != proc_id));
 616   });
 617   std::vector<MPI_Request> myreqqs(it, end(requests_));
 618   requests_.erase(it, end(requests_));
 619   int size = static_cast<int>(myreqqs.size());
 620   if (size > 0) {
 621     MPI_Request* treqs = myreqqs.data();
 622     Request::waitall(size, treqs, MPI_STATUSES_IGNORE);
 623     myreqqs.clear();
 624   }
 625   mut_->unlock();
 626   return size;
 627 }
 628
 629 int Win::shared_query(int rank, MPI_Aint* size, int* disp_unit, void* baseptr) const
 630 {
 631   const Win* target_win = rank != MPI_PROC_NULL ? connected_wins_[rank] : nullptr;
 632   for (int i = 0; not target_win && i < comm_->size(); i++) {
 633     if (connected_wins_[i]->size_ > 0)
 634       target_win = connected_wins_[i];
 635   }
 636   if (target_win) {
 637     *size                         = target_win->size_;
 638     *disp_unit                    = target_win->disp_unit_;
 639     *static_cast<void**>(baseptr) = target_win->base_;
 640   } else {
 641     *size                         = 0;
 642     *static_cast<void**>(baseptr) = nullptr;
 643   }
 644   return MPI_SUCCESS;
 645 }
 646
 647 MPI_Errhandler Win::errhandler()
 648 {
 649   if (errhandler_ != MPI_ERRHANDLER_NULL)
 650     errhandler_->ref();
 651   return errhandler_;
 652 }
 653
 654 void Win::set_errhandler(MPI_Errhandler errhandler)
 655 {
 656   if (errhandler_ != MPI_ERRHANDLER_NULL)
 657     simgrid::smpi::Errhandler::unref(errhandler_);
 658   errhandler_ = errhandler;
 659   if (errhandler_ != MPI_ERRHANDLER_NULL)
 660     errhandler_->ref();
 661 }
 662 } // namespace smpi
 663 } // namespace simgrid