X-Git-Url: http://bilbo.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/blobdiff_plain/e553a03ec20f531e48fab905d2034551c58a683f..8e43c615b2475d0174be55f95f930ca30988279e:/src/smpi/smpi_base.cpp

diff --git a/src/smpi/smpi_base.cpp b/src/smpi/smpi_base.cpp
index 9bae432bef..6e4ca6901e 100644
--- a/src/smpi/smpi_base.cpp
+++ b/src/smpi/smpi_base.cpp
@@ -17,12 +17,17 @@
 #include "surf/surf.h"
 #include "simgrid/sg_config.h"
 #include "smpi/smpi_utils.hpp"
+#include "src/smpi/smpi_group.hpp"
 #include "colls/colls.h"
+#include
 #include "src/kernel/activity/SynchroComm.hpp"
 
 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_base, smpi, "Logging specific to SMPI (base)");
 
+extern void (*smpi_comm_copy_data_callback) (smx_activity_t, void*, size_t);
+
+
 static int match_recv(void* a, void* b, smx_activity_t ignored) {
   MPI_Request ref = static_cast<MPI_Request>(a);
   MPI_Request req = static_cast<MPI_Request>(b);
@@ -214,7 +219,7 @@ static MPI_Request build_request(void *buf, int count, MPI_Datatype datatype, in
   request->dst = dst;
   request->tag = tag;
   request->comm = comm;
-  smpi_comm_use(request->comm);
+  request->comm->use();
   request->action = nullptr;
   request->flags = flags;
   request->detached = 0;
@@ -255,7 +260,7 @@ MPI_Request smpi_mpi_send_init(void *buf, int count, MPI_Datatype datatype, int
 {
   MPI_Request request = nullptr; /* MC needs the comm to be set to nullptr during the call */
   request = build_request(buf==MPI_BOTTOM ? nullptr : buf, count, datatype, smpi_process_index(),
-                          smpi_group_index(smpi_comm_group(comm), dst), tag, comm, PERSISTENT | SEND | PREPARED);
+                          comm->group()->index(dst), tag, comm, PERSISTENT | SEND | PREPARED);
   return request;
 }
@@ -263,7 +268,7 @@ MPI_Request smpi_mpi_ssend_init(void *buf, int count, MPI_Datatype datatype, int
 {
   MPI_Request request = nullptr; /* MC needs the comm to be set to nullptr during the call */
   request = build_request(buf==MPI_BOTTOM ? nullptr : buf, count, datatype, smpi_process_index(),
-                          smpi_group_index(smpi_comm_group(comm), dst), tag, comm, PERSISTENT | SSEND | SEND | PREPARED);
+                          comm->group()->index(dst), tag, comm, PERSISTENT | SSEND | SEND | PREPARED);
   return request;
 }
@@ -271,7 +276,7 @@ MPI_Request smpi_mpi_recv_init(void *buf, int count, MPI_Datatype datatype, int
 {
   MPI_Request request = nullptr; /* MC needs the comm to be set to nullptr during the call */
   request = build_request(buf==MPI_BOTTOM ? nullptr : buf, count, datatype,
-                          src == MPI_ANY_SOURCE ? MPI_ANY_SOURCE : smpi_group_index(smpi_comm_group(comm), src),
+                          src == MPI_ANY_SOURCE ? MPI_ANY_SOURCE : comm->group()->index(src),
                           smpi_process_index(), tag, comm, PERSISTENT | RECV | PREPARED);
   return request;
 }
@@ -332,7 +337,7 @@ void smpi_mpi_start(MPI_Request request)
     // we make a copy here, as the size is modified by simix, and we may reuse the request in another receive later
     request->real_size=request->size;
     request->action = simcall_comm_irecv(SIMIX_process_self(), mailbox, request->buf, &request->real_size, &match_recv,
-                                         ! smpi_process_get_replaying()? &smpi_comm_copy_buffer_callback
+                                         ! smpi_process_get_replaying()? smpi_comm_copy_data_callback
                                          : &smpi_comm_null_copy_buffer_callback, request, -1.0);
     XBT_DEBUG("recv simcall posted");
@@ -422,7 +427,7 @@ void smpi_mpi_start(MPI_Request request)
     request->action = simcall_comm_isend(SIMIX_process_from_PID(request->src+1), mailbox, request->size, -1.0,
                                          buf, request->real_size, &match_send,
                                          &xbt_free_f, // how to free the userdata if a detached send fails
-                                         !smpi_process_get_replaying() ? &smpi_comm_copy_buffer_callback
+                                         !smpi_process_get_replaying() ? smpi_comm_copy_data_callback
                                          : &smpi_comm_null_copy_buffer_callback, request,
                                          // detach if msg size < eager/rdv switch limit
                                          request->detached);
@@ -455,7 +460,7 @@ void smpi_mpi_request_free(MPI_Request * request)
     if((*request)->refcount==0){
       smpi_datatype_unuse((*request)->old_type);
-      smpi_comm_unuse((*request)->comm);
+      (*request)->comm->unuse();
       print_request("Destroying", (*request));
       xbt_free(*request);
       *request = MPI_REQUEST_NULL;
@@ -501,7 +506,7 @@ MPI_Request smpi_isend_init(void *buf, int count, MPI_Datatype datatype, int dst
 {
   MPI_Request request = nullptr; /* MC needs the comm to be set to nullptr during the call */
   request = build_request(buf==MPI_BOTTOM ? nullptr : buf , count, datatype, smpi_process_index(),
-                          smpi_group_index(smpi_comm_group(comm), dst), tag,comm, PERSISTENT | ISEND | SEND | PREPARED);
+                          comm->group()->index(dst), tag,comm, PERSISTENT | ISEND | SEND | PREPARED);
   return request;
 }
@@ -509,7 +514,7 @@ MPI_Request smpi_mpi_isend(void *buf, int count, MPI_Datatype datatype, int dst,
 {
   MPI_Request request = nullptr; /* MC needs the comm to be set to nullptr during the call */
   request = build_request(buf==MPI_BOTTOM ? nullptr : buf, count, datatype, smpi_process_index(),
-                          smpi_group_index(smpi_comm_group(comm), dst), tag, comm, NON_PERSISTENT | ISEND | SEND);
+                          comm->group()->index(dst), tag, comm, NON_PERSISTENT | ISEND | SEND);
   smpi_mpi_start(request);
   return request;
 }
@@ -518,7 +523,7 @@ MPI_Request smpi_mpi_issend(void *buf, int count, MPI_Datatype datatype, int dst
 {
   MPI_Request request = nullptr; /* MC needs the comm to be set to nullptr during the call */
   request = build_request(buf==MPI_BOTTOM ? nullptr : buf, count, datatype, smpi_process_index(),
-                          smpi_group_index(smpi_comm_group(comm), dst), tag,comm, NON_PERSISTENT | ISEND | SSEND | SEND);
+                          comm->group()->index(dst), tag,comm, NON_PERSISTENT | ISEND | SSEND | SEND);
   smpi_mpi_start(request);
   return request;
 }
@@ -527,7 +532,7 @@ MPI_Request smpi_irecv_init(void *buf, int count, MPI_Datatype datatype, int src
 {
   MPI_Request request = nullptr; /* MC needs the comm to be set to nullptr during the call */
   request = build_request(buf==MPI_BOTTOM ? nullptr : buf, count, datatype, src == MPI_ANY_SOURCE ? MPI_ANY_SOURCE :
-                          smpi_group_index(smpi_comm_group(comm), src), smpi_process_index(), tag,
+                          comm->group()->index(src), smpi_process_index(), tag,
                           comm, PERSISTENT | RECV | PREPARED);
   return request;
 }
@@ -536,7 +541,7 @@ MPI_Request smpi_mpi_irecv(void *buf, int count, MPI_Datatype datatype, int src,
 {
   MPI_Request request = nullptr; /* MC needs the comm to be set to nullptr during the call */
   request = build_request(buf==MPI_BOTTOM ? nullptr : buf, count, datatype, src == MPI_ANY_SOURCE ? MPI_ANY_SOURCE :
-                          smpi_group_index(smpi_comm_group(comm), src), smpi_process_index(), tag, comm,
+                          comm->group()->index(src), smpi_process_index(), tag, comm,
                           NON_PERSISTENT | RECV);
   smpi_mpi_start(request);
   return request;
@@ -554,7 +559,7 @@ void smpi_mpi_send(void *buf, int count, MPI_Datatype datatype, int dst, int tag
 {
   MPI_Request request = nullptr; /* MC needs the comm to be set to nullptr during the call */
   request = build_request(buf==MPI_BOTTOM ? nullptr : buf, count, datatype, smpi_process_index(),
-                          smpi_group_index(smpi_comm_group(comm), dst), tag, comm, NON_PERSISTENT | SEND);
+                          comm->group()->index(dst), tag, comm, NON_PERSISTENT | SEND);
 
   smpi_mpi_start(request);
   smpi_mpi_wait(&request, MPI_STATUS_IGNORE);
@@ -565,7 +570,7 @@ void smpi_mpi_ssend(void *buf, int count, MPI_Datatype datatype, int dst, int ta
 {
   MPI_Request request = nullptr; /* MC needs the comm to be set to nullptr during the call */
   request = build_request(buf==MPI_BOTTOM ? nullptr : buf, count, datatype, smpi_process_index(),
-                          smpi_group_index(smpi_comm_group(comm), dst), tag, comm, NON_PERSISTENT | SSEND | SEND);
+                          comm->group()->index(dst), tag, comm, NON_PERSISTENT | SSEND | SEND);
 
   smpi_mpi_start(request);
   smpi_mpi_wait(&request, MPI_STATUS_IGNORE);
@@ -579,7 +584,7 @@ void smpi_mpi_sendrecv(void *sendbuf, int sendcount, MPI_Datatype sendtype,int d
   MPI_Request requests[2];
   MPI_Status stats[2];
   int myid=smpi_process_index();
-  if ((smpi_group_index(smpi_comm_group(comm), dst) == myid) && (smpi_group_index(smpi_comm_group(comm), src) == myid)){
+  if ((comm->group()->index(dst) == myid) && (comm->group()->index(src) == myid)){
     smpi_datatype_copy(sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype);
     return;
   }
@@ -608,7 +613,7 @@ static void finish_wait(MPI_Request * request, MPI_Status * status)
   if(!((req->detached != 0) && ((req->flags & SEND) != 0)) && ((req->flags & PREPARED) == 0)){
     if(status != MPI_STATUS_IGNORE) {
       int src = req->src == MPI_ANY_SOURCE ? req->real_src : req->src;
-      status->MPI_SOURCE = smpi_group_rank(smpi_comm_group(req->comm), src);
+      status->MPI_SOURCE = req->comm->group()->rank(src);
       status->MPI_TAG = req->tag == MPI_ANY_TAG ? req->real_tag : req->tag;
       status->MPI_ERROR = req->truncated != 0 ? MPI_ERR_TRUNCATE : MPI_SUCCESS;
       // this handles the case were size in receive differs from size in send
@@ -771,14 +776,18 @@ void smpi_mpi_probe(int source, int tag, MPI_Comm comm, MPI_Status* status){
 void smpi_mpi_iprobe(int source, int tag, MPI_Comm comm, int* flag, MPI_Status* status){
   MPI_Request request = build_request(nullptr, 0, MPI_CHAR, source == MPI_ANY_SOURCE ? MPI_ANY_SOURCE :
-                 smpi_group_index(smpi_comm_group(comm), source), smpi_comm_rank(comm), tag, comm, PERSISTENT | RECV);
+                 comm->group()->index(source), comm->rank(), tag, comm, PERSISTENT | RECV);
 
   // to avoid deadlock, we have to sleep some time here, or the timer won't advance and we will only do iprobe simcalls
   // (especially when used as a break condition, such as while(MPI_Iprobe(...)) ... )
   // multiplier to the sleeptime, to increase speed of execution, each failed iprobe will increase it
   static int nsleeps = 1;
-  if(smpi_iprobe_sleep > 0)
-    simcall_process_sleep(nsleeps*smpi_iprobe_sleep);
+  double speed = simgrid::s4u::Actor::self()->host()->speed();
+  double maxrate = xbt_cfg_get_double("smpi/iprobe-cpu-usage");
+  if (smpi_iprobe_sleep > 0) {
+    smx_activity_t iprobe_sleep = simcall_execution_start("iprobe", /* flops to executek*/nsleeps*smpi_iprobe_sleep*speed*maxrate, /* priority */1.0, /* performance bound */maxrate*speed);
+    simcall_execution_wait(iprobe_sleep);
+  }
   // behave like a receive, but don't do it
   smx_mailbox_t mailbox;
@@ -803,7 +812,7 @@ void smpi_mpi_iprobe(int source, int tag, MPI_Comm comm, int* flag, MPI_Status*
     MPI_Request req = static_cast<MPI_Request>(sync_comm->src_data);
     *flag = 1;
     if(status != MPI_STATUS_IGNORE && (req->flags & PREPARED) == 0) {
-      status->MPI_SOURCE = smpi_group_rank(smpi_comm_group(comm), req->src);
+      status->MPI_SOURCE = comm->group()->rank(req->src);
       status->MPI_TAG = req->tag;
       status->MPI_ERROR = MPI_SUCCESS;
       status->count = req->real_size;
@@ -835,6 +844,11 @@ void smpi_mpi_wait(MPI_Request * request, MPI_Status * status)
     *request = MPI_REQUEST_NULL;
 }
 
+static int sort_accumulates(MPI_Request a, MPI_Request b)
+{
+  return (a->tag < b->tag);
+}
+
 int smpi_mpi_waitany(int count, MPI_Request requests[], MPI_Status * status)
 {
   s_xbt_dynar_t comms; // Keep it on stack to save some extra mallocs
@@ -855,15 +869,15 @@ int smpi_mpi_waitany(int count, MPI_Request requests[], MPI_Status * status)
         xbt_dynar_push(&comms, &requests[i]->action);
         map[size] = i;
         size++;
-      }else{
-        //This is a finished detached request, let's return this one
-        size=0;//so we free the dynar but don't do the waitany call
-        index=i;
-        finish_wait(&requests[i], status);//cleanup if refcount = 0
-        if (requests[i] != MPI_REQUEST_NULL && (requests[i]->flags & NON_PERSISTENT))
-          requests[i]=MPI_REQUEST_NULL;//set to null
-        break;
-      }
+      } else {
+        // This is a finished detached request, let's return this one
+        size = 0; // so we free the dynar but don't do the waitany call
+        index = i;
+        finish_wait(&requests[i], status); // cleanup if refcount = 0
+        if (requests[i] != MPI_REQUEST_NULL && (requests[i]->flags & NON_PERSISTENT))
+          requests[i] = MPI_REQUEST_NULL; // set to null
+        break;
+      }
     }
   }
   if(size > 0) {
@@ -872,11 +886,19 @@ int smpi_mpi_waitany(int count, MPI_Request requests[], MPI_Status * status)
     // not MPI_UNDEFINED, as this is a simix return code
     if (i != -1) {
      index = map[i];
-      finish_wait(&requests[index], status);
-      if (requests[i] != MPI_REQUEST_NULL && (requests[i]->flags & NON_PERSISTENT))
-        requests[index] = MPI_REQUEST_NULL;
+      //in case of an accumulate, we have to wait the end of all requests to apply the operation, ordered correctly.
+      if ((requests[index] == MPI_REQUEST_NULL)
+          || (!((requests[index]->flags & ACCUMULATE) && (requests[index]->flags & RECV)))){
+        finish_wait(&requests[index], status);
+        if (requests[i] != MPI_REQUEST_NULL && (requests[i]->flags & NON_PERSISTENT))
+          requests[index] = MPI_REQUEST_NULL;
+      }else{
+        XBT_WARN("huu?");
+      }
     }
   }
+
+
   xbt_dynar_free_data(&comms);
   xbt_free(map);
 }
@@ -888,13 +910,14 @@
 
 int smpi_mpi_waitall(int count, MPI_Request requests[], MPI_Status status[])
 {
-  int index, c;
+  std::vector<MPI_Request> accumulates;
+  int index;
   MPI_Status stat;
   MPI_Status *pstat = status == MPI_STATUSES_IGNORE ? MPI_STATUS_IGNORE : &stat;
   int retvalue = MPI_SUCCESS;
   //tag invalid requests in the set
   if (status != MPI_STATUSES_IGNORE) {
-    for (c = 0; c < count; c++) {
+    for (int c = 0; c < count; c++) {
      if (requests[c] == MPI_REQUEST_NULL || requests[c]->dst == MPI_PROC_NULL || (requests[c]->flags & PREPARED)) {
        smpi_empty_status(&status[c]);
      } else if (requests[c]->src == MPI_PROC_NULL) {
@@ -903,8 +926,7 @@ int smpi_mpi_waitall(int count, MPI_Request requests[], MPI_Status status[])
      }
    }
  }
-  for(c = 0; c < count; c++) {
-
+  for (int c = 0; c < count; c++) {
    if (MC_is_active() || MC_record_replay_is_active()) {
      smpi_mpi_wait(&requests[c], pstat);
      index = c;
@@ -912,8 +934,13 @@
      index = smpi_mpi_waitany(count, requests, pstat);
      if (index == MPI_UNDEFINED)
        break;
+
+      if (requests[index] != MPI_REQUEST_NULL
+          && (requests[index]->flags & RECV)
+          && (requests[index]->flags & ACCUMULATE))
+        accumulates.push_back(requests[index]);
      if (requests[index] != MPI_REQUEST_NULL && (requests[index]->flags & NON_PERSISTENT))
-        requests[index]=MPI_REQUEST_NULL;
+        requests[index] = MPI_REQUEST_NULL;
    }
    if (status != MPI_STATUSES_IGNORE) {
      status[index] = *pstat;
@@ -922,6 +949,13 @@
    }
  }
 
+  if (!accumulates.empty()) {
+    std::sort(accumulates.begin(), accumulates.end(), sort_accumulates);
+    for (auto req : accumulates) {
+      finish_wait(&req, status);
+    }
+  }
+
  return retvalue;
 }
@@ -996,8 +1030,8 @@ void smpi_mpi_gather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
   MPI_Aint lb = 0;
   MPI_Aint recvext = 0;
 
-  int rank = smpi_comm_rank(comm);
-  int size = smpi_comm_size(comm);
+  int rank = comm->rank();
+  int size = comm->size();
   if(rank != root) {
     // Send buffer to root
     smpi_mpi_send(sendbuf, sendcount, sendtype, root, system_tag, comm);
@@ -1029,10 +1063,10 @@ void smpi_mpi_gather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
 void smpi_mpi_reduce_scatter(void *sendbuf, void *recvbuf, int *recvcounts, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
 {
-  int rank = smpi_comm_rank(comm);
+  int rank = comm->rank();
 
   /* arbitrarily choose root as rank 0 */
-  int size = smpi_comm_size(comm);
+  int size = comm->size();
   int count = 0;
   int *displs = xbt_new(int, size);
   for (int i = 0; i < size; i++) {
@@ -1054,8 +1088,8 @@ void smpi_mpi_gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype, void
   MPI_Aint lb = 0;
   MPI_Aint recvext = 0;
 
-  int rank = smpi_comm_rank(comm);
-  int size = smpi_comm_size(comm);
+  int rank = comm->rank();
+  int size = comm->size();
   if (rank != root) {
     // Send buffer to root
     smpi_mpi_send(sendbuf, sendcount, sendtype, root, system_tag, comm);
@@ -1092,8 +1126,8 @@ void smpi_mpi_allgather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
   MPI_Aint recvext = 0;
   MPI_Request *requests;
 
-  int rank = smpi_comm_rank(comm);
-  int size = smpi_comm_size(comm);
+  int rank = comm->rank();
+  int size = comm->size();
   // FIXME: check for errors
   smpi_datatype_extent(recvtype, &lb, &recvext);
   // Local copy from self
@@ -1127,8 +1161,8 @@ void smpi_mpi_allgatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype, vo
   MPI_Aint lb = 0;
   MPI_Aint recvext = 0;
 
-  int rank = smpi_comm_rank(comm);
-  int size = smpi_comm_size(comm);
+  int rank = comm->rank();
+  int size = comm->size();
   smpi_datatype_extent(recvtype, &lb, &recvext);
   // Local copy from self
   smpi_datatype_copy(sendbuf, sendcount, sendtype,
@@ -1163,8 +1197,8 @@ void smpi_mpi_scatter(void *sendbuf, int sendcount, MPI_Datatype sendtype,
   MPI_Aint sendext = 0;
   MPI_Request *requests;
 
-  int rank = smpi_comm_rank(comm);
-  int size = smpi_comm_size(comm);
+  int rank = comm->rank();
+  int size = comm->size();
   if(rank != root) {
     // Recv buffer from root
     smpi_mpi_recv(recvbuf, recvcount, recvtype, root, system_tag, comm, MPI_STATUS_IGNORE);
@@ -1202,8 +1236,8 @@ void smpi_mpi_scatterv(void *sendbuf, int *sendcounts, int *displs, MPI_Datatype
   MPI_Aint lb = 0;
   MPI_Aint sendext = 0;
 
-  int rank = smpi_comm_rank(comm);
-  int size = smpi_comm_size(comm);
+  int rank = comm->rank();
+  int size = comm->size();
   if(rank != root) {
     // Recv buffer from root
     smpi_mpi_recv(recvbuf, recvcount, recvtype, root, system_tag, comm, MPI_STATUS_IGNORE);
@@ -1243,8 +1277,8 @@ void smpi_mpi_reduce(void *sendbuf, void *recvbuf, int count, MPI_Datatype datat
   char* sendtmpbuf = static_cast<char *>(sendbuf);
 
-  int rank = smpi_comm_rank(comm);
-  int size = smpi_comm_size(comm);
+  int rank = comm->rank();
+  int size = comm->size();
   //non commutative case, use a working algo from openmpi
   if(!smpi_op_is_commute(op)){
     smpi_coll_tuned_reduce_ompi_basic_linear(sendtmpbuf, recvbuf, count, datatype, op, root, comm);
@@ -1313,10 +1347,11 @@ void smpi_mpi_allreduce(void *sendbuf, void *recvbuf, int count, MPI_Datatype da
 void smpi_mpi_scan(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
 {
   int system_tag = -888;
-  MPI_Aint lb = 0, dataext = 0;
+  MPI_Aint lb = 0;
+  MPI_Aint dataext = 0;
 
-  int rank = smpi_comm_rank(comm);
-  int size = smpi_comm_size(comm);
+  int rank = comm->rank();
+  int size = comm->size();
 
   smpi_datatype_extent(datatype, &lb, &dataext);
@@ -1372,10 +1407,11 @@ void smpi_mpi_scan(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatyp
 void smpi_mpi_exscan(void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
 {
   int system_tag = -888;
-  MPI_Aint lb = 0, dataext = 0;
+  MPI_Aint lb = 0;
+  MPI_Aint dataext = 0;
   int recvbuf_is_empty=1;
-  int rank = smpi_comm_rank(comm);
-  int size = smpi_comm_size(comm);
+  int rank = comm->rank();
+  int size = comm->size();
 
   smpi_datatype_extent(datatype, &lb, &dataext);
@@ -1394,6 +1430,7 @@ void smpi_mpi_exscan(void *sendbuf, void *recvbuf, int count, MPI_Datatype datat
   }
   // Wait for completion of all comms.
   smpi_mpi_startall(size - 1, requests);
+
   if(smpi_op_is_commute(op)){
     for (int other = 0; other < size - 1; other++) {
       index = smpi_mpi_waitany(size - 1, requests, MPI_STATUS_IGNORE);
@@ -1414,11 +1451,11 @@ void smpi_mpi_exscan(void *sendbuf, void *recvbuf, int count, MPI_Datatype datat
     for (int other = 0; other < size - 1; other++) {
       smpi_mpi_wait(&(requests[other]), MPI_STATUS_IGNORE);
       if(index < rank) {
-        if(recvbuf_is_empty){
-          smpi_datatype_copy(tmpbufs[other], count, datatype, recvbuf, count, datatype);
-          recvbuf_is_empty=0;
-        } else
-          smpi_op_apply(op, tmpbufs[other], recvbuf, &count, &datatype);
+        if (recvbuf_is_empty) {
+          smpi_datatype_copy(tmpbufs[other], count, datatype, recvbuf, count, datatype);
+          recvbuf_is_empty = 0;
+        } else
+          smpi_op_apply(op, tmpbufs[other], recvbuf, &count, &datatype);
       }
     }
   }
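The waitall/waitany changes above collect every completed receive that carries the ACCUMULATE flag and only apply them once all requests have finished, sorted by tag, so the accumulated result does not depend on the order in which messages happen to complete. A minimal stand-alone sketch of that ordering idea follows (plain C++; the Request struct and its payload field are hypothetical stand-ins for illustration, not SimGrid's MPI_Request):

#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the fields of a request used by the patch.
struct Request {
  int tag;     // message tag, used to order the accumulate operations
  int payload; // value this "accumulate" would apply
};

// Mirrors the intent of sort_accumulates() in the patch: order requests by tag.
static bool sort_accumulates(const Request& a, const Request& b)
{
  return a.tag < b.tag;
}

int main()
{
  // Requests complete in arbitrary order...
  std::vector<Request> accumulates = {{7, 70}, {3, 30}, {5, 50}};

  // ...but the accumulate operations are applied in tag order,
  // which is what makes the final result deterministic.
  std::sort(accumulates.begin(), accumulates.end(), sort_accumulates);
  for (const auto& req : accumulates)
    std::printf("applying accumulate with tag %d (payload %d)\n", req.tag, req.payload);
  return 0;
}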