From: Augustin Degomme <26892-adegomme@users.noreply.framagit.org> Date: Thu, 6 Oct 2022 22:18:04 +0000 (+0000) Subject: SMPI: add option to inject a barrier before every collective call, to allow better... X-Git-Tag: v3.34~797^2 X-Git-Url: http://bilbo.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/755cb53034e37f0fce508150b0cea0203db9cb95 SMPI: add option to inject a barrier before every collective call, to allow better detection of dangerous patterns in MPI codes, triggering deadlocks more consistently. Activate it in -analyze mode, or with --cfg=smpi/colls-inject-barrier:true --- diff --git a/src/smpi/bindings/smpi_pmpi_coll.cpp b/src/smpi/bindings/smpi_pmpi_coll.cpp index 5915270a67..a8da1d4501 100644 --- a/src/smpi/bindings/smpi_pmpi_coll.cpp +++ b/src/smpi/bindings/smpi_pmpi_coll.cpp @@ -76,6 +76,9 @@ int PMPI_Ibcast(void* buf, int count, MPI_Datatype datatype, int root, MPI_Comm new simgrid::instr::CollTIData(request == MPI_REQUEST_IGNORED ? "bcast" : "ibcast", root, -1.0, count, 0, simgrid::smpi::Datatype::encode(datatype), "")); + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + if (comm->size() > 1) { if (request == MPI_REQUEST_IGNORED) simgrid::smpi::colls::bcast(buf, count, datatype, root, comm); @@ -135,6 +138,9 @@ int PMPI_Igather(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void const SmpiBenchGuard suspend_bench; + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + aid_t pid = simgrid::s4u::this_actor::get_pid(); TRACE_smpi_comm_in(pid, request == MPI_REQUEST_IGNORED ? "PMPI_Gather" : "PMPI_Igather", @@ -190,6 +196,10 @@ int PMPI_Igatherv(const void* sendbuf, int sendcount, MPI_Datatype sendtype, voi } const SmpiBenchGuard suspend_bench; + + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + const void* real_sendbuf = sendbuf; int real_sendcount = sendcount; MPI_Datatype real_sendtype = sendtype; @@ -260,6 +270,9 @@ int PMPI_Iallgather(const void* sendbuf, int sendcount, MPI_Datatype sendtype, v const SmpiBenchGuard suspend_bench; + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + aid_t pid = simgrid::s4u::this_actor::get_pid(); TRACE_smpi_comm_in(pid, request == MPI_REQUEST_IGNORED ? "PMPI_Allgather" : "PMPI_Iallggather", @@ -306,6 +319,10 @@ int PMPI_Iallgatherv(const void* sendbuf, int sendcount, MPI_Datatype sendtype, CHECK_COLLECTIVE(comm, MPI_REQUEST_IGNORED ? "PMPI_Allgatherv" : "PMPI_Iallgatherv") const SmpiBenchGuard suspend_bench; + + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + if (sendbuf == MPI_IN_PLACE) { sendbuf = static_cast(recvbuf) + recvtype->get_extent() * displs[comm->rank()]; sendcount = recvcounts[comm->rank()]; @@ -374,6 +391,9 @@ int PMPI_Iscatter(const void* sendbuf, int sendcount, MPI_Datatype sendtype, voi const SmpiBenchGuard suspend_bench; + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + aid_t pid = simgrid::s4u::this_actor::get_pid(); TRACE_smpi_comm_in(pid, request == MPI_REQUEST_IGNORED ? "PMPI_Scatter" : "PMPI_Iscatter", @@ -430,6 +450,9 @@ int PMPI_Iscatterv(const void* sendbuf, const int* sendcounts, const int* displs const SmpiBenchGuard suspend_bench; + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + aid_t pid = simgrid::s4u::this_actor::get_pid(); auto trace_sendcounts = std::make_shared>(); @@ -480,6 +503,10 @@ int PMPI_Ireduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype dat op->name() + " and root " + std::to_string(root)) const SmpiBenchGuard suspend_bench; + + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + aid_t pid = simgrid::s4u::this_actor::get_pid(); TRACE_smpi_comm_in(pid, request == MPI_REQUEST_IGNORED ? "PMPI_Reduce" : "PMPI_Ireduce", @@ -532,6 +559,10 @@ int PMPI_Iallreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype " with op " + op->name()) const SmpiBenchGuard suspend_bench; + + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + std::vector tmp_sendbuf; const void* real_sendbuf = smpi_get_in_place_buf(sendbuf, recvbuf, tmp_sendbuf, count, datatype); @@ -571,6 +602,10 @@ int PMPI_Iscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datat std::string(request == MPI_REQUEST_IGNORED ? "PMPI_Scan" : "PMPI_Iscan") + " with op " + op->name()) const SmpiBenchGuard suspend_bench; + + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + aid_t pid = simgrid::s4u::this_actor::get_pid(); std::vector tmp_sendbuf; const void* real_sendbuf = smpi_get_in_place_buf(sendbuf, recvbuf, tmp_sendbuf, count, datatype); @@ -608,6 +643,10 @@ int PMPI_Iexscan(const void *sendbuf, void *recvbuf, int count, MPI_Datatype dat op->name()) const SmpiBenchGuard suspend_bench; + + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + aid_t pid = simgrid::s4u::this_actor::get_pid(); std::vector tmp_sendbuf; const void* real_sendbuf = smpi_get_in_place_buf(sendbuf, recvbuf, tmp_sendbuf, count, datatype); @@ -651,6 +690,10 @@ int PMPI_Ireduce_scatter(const void *sendbuf, void *recvbuf, const int *recvcoun " with op " + op->name()) const SmpiBenchGuard suspend_bench; + + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + aid_t pid = simgrid::s4u::this_actor::get_pid(); auto trace_recvcounts = std::make_shared>(); trace_recvcounts->insert(trace_recvcounts->end(), &recvcounts[0], &recvcounts[comm->size()]); @@ -700,6 +743,10 @@ int PMPI_Ireduce_scatter_block(const void* sendbuf, void* recvbuf, int recvcount " with op " + op->name()) const SmpiBenchGuard suspend_bench; + + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + int count = comm->size(); aid_t pid = simgrid::s4u::this_actor::get_pid(); @@ -767,6 +814,9 @@ int PMPI_Ialltoall(const void* sendbuf, int sendcount, MPI_Datatype sendtype, vo const SmpiBenchGuard suspend_bench; + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + TRACE_smpi_comm_in(pid, request == MPI_REQUEST_IGNORED ? "PMPI_Alltoall" : "PMPI_Ialltoall", new simgrid::instr::CollTIData( request == MPI_REQUEST_IGNORED ? "alltoall" : "ialltoall", -1, -1.0, @@ -819,6 +869,10 @@ int PMPI_Ialltoallv(const void* sendbuf, const int* sendcounts, const int* sendd } const SmpiBenchGuard suspend_bench; + + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + int send_size = 0; int recv_size = 0; auto trace_sendcounts = std::make_shared>(); @@ -914,6 +968,9 @@ int PMPI_Ialltoallw(const void* sendbuf, const int* sendcounts, const int* sendd const SmpiBenchGuard suspend_bench; + if(simgrid::config::get_value("smpi/colls-inject-barrier")) + simgrid::smpi::colls::barrier(comm); + int send_size = 0; int recv_size = 0; auto trace_sendcounts = std::make_shared>(); diff --git a/src/smpi/internals/smpi_config.cpp b/src/smpi/internals/smpi_config.cpp index a0c13a9c12..2a0ec1876b 100644 --- a/src/smpi/internals/smpi_config.cpp +++ b/src/smpi/internals/smpi_config.cpp @@ -152,6 +152,9 @@ simgrid::config::Flag _smpi_cfg_list_leaks("smpi/list-leaks", "Whether we should display the n first MPI handle leaks (addresses and type only) after simulation", -1); +simgrid::config::Flag _smpi_cfg_colls_inject_barrier{ + "smpi/colls-inject-barrier", "Inject a barrier in each colllective operation, to detect some deadlocks in incorrect MPI codes, which may not be triggered in all cases", false }; + double smpi_cfg_host_speed(){ return _smpi_cfg_host_speed; } diff --git a/src/smpi/smpirun.in b/src/smpi/smpirun.in index 953a18e071..311973cb94 100755 --- a/src/smpi/smpirun.in +++ b/src/smpi/smpirun.in @@ -233,7 +233,7 @@ while true; do shift 1 ;; "-analyze") - SIMOPTS="$SIMOPTS --cfg=smpi/display-timing:yes --cfg=smpi/display-allocs:yes --cfg=smpi/list-leaks:50 --cfg=smpi/pedantic:true" + SIMOPTS="$SIMOPTS --cfg=smpi/display-timing:yes --cfg=smpi/display-allocs:yes --cfg=smpi/list-leaks:50 --cfg=smpi/pedantic:true --cfg=smpi/colls-inject-barrier:true" shift 1 ;; "-help" | "--help" | "-h")