1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
/*
 * (C) 2013 by Argonne National Laboratory.
 * See COPYRIGHT in top-level directory.
 */
13 #include "mcs-mutex.h"
15 /* TODO: Make these mutex operations no-ops for sequential runs */
17 /** Create an MCS mutex. Collective on comm.
19 * @param[out] comm communicator containing all processes that will use the
21 * @param[out] tail_rank rank of the process in comm that holds the tail
23 * @param[out] hdl handle to the mutex
26 int MCS_Mutex_create(int tail_rank, MPI_Comm comm, MCS_Mutex * hdl_out)
31 hdl = malloc(sizeof(struct mcs_mutex_s));
34 MPI_Comm_dup(comm, &hdl->comm);
36 MPI_Comm_rank(hdl->comm, &rank);
37 MPI_Comm_size(hdl->comm, &nproc);
39 hdl->tail_rank = tail_rank;
42 MPI_Win_allocate_shared(2 * sizeof(int), sizeof(int), MPI_INFO_NULL,
43 hdl->comm, &hdl->base, &hdl->window);
45 #ifdef USE_WIN_ALLOC_SHM
46 MPI_Info_create(&hdl->win_info);
47 MPI_Info_set(hdl->win_info, (char*)"alloc_shm", (char*)"true");
49 MPI_Info_create(&hdl->win_info);
50 MPI_Info_set(hdl->win_info, (char*)"alloc_shm", (char*)"false");
52 MPI_Win_allocate(2 * sizeof(int), sizeof(int), hdl->win_info, hdl->comm,
53 &hdl->base, &hdl->window);
56 MPI_Win_lock_all(0, hdl->window);
58 hdl->base[0] = MPI_PROC_NULL;
59 hdl->base[1] = MPI_PROC_NULL;
61 MPI_Win_sync(hdl->window);
62 MPI_Barrier(hdl->comm);
69 /** Free an MCS mutex. Collective on ranks in the communicator used at the
72 * @param[in] hdl handle to the group that will be freed
75 int MCS_Mutex_free(MCS_Mutex * hdl_ptr)
77 MCS_Mutex hdl = *hdl_ptr;
79 MPI_Win_unlock_all(hdl->window);
81 MPI_Win_free(&hdl->window);
82 MPI_Comm_free(&hdl->comm);
83 #ifndef USE_WIN_SHARED
84 MPI_Info_free(&hdl->win_info);
96 * @param[in] hdl Handle to the mutex
99 int MCS_Mutex_lock(MCS_Mutex hdl)
104 MPI_Comm_rank(hdl->comm, &rank);
105 MPI_Comm_size(hdl->comm, &nproc);
107 /* This store is safe, since it cannot happen concurrently with a remote
109 hdl->base[MCS_MTX_ELEM_DISP] = MPI_PROC_NULL;
110 MPI_Win_sync(hdl->window);
112 MPI_Fetch_and_op(&rank, &prev, MPI_INT, hdl->tail_rank, MCS_MTX_TAIL_DISP,
113 MPI_REPLACE, hdl->window);
114 MPI_Win_flush(hdl->tail_rank, hdl->window);
116 /* If there was a previous tail, update their next pointer and wait for
117 * notification. Otherwise, the mutex was successfully acquired. */
118 if (prev != MPI_PROC_NULL) {
119 /* Wait for notification */
122 MPI_Accumulate(&rank, 1, MPI_INT, prev, MCS_MTX_ELEM_DISP, 1, MPI_INT, MPI_REPLACE,
124 MPI_Win_flush(prev, hdl->window);
126 debug_print("%2d: LOCK - waiting for notification from %d\n", rank, prev);
127 MPI_Recv(NULL, 0, MPI_BYTE, prev, MCS_MUTEX_TAG, hdl->comm, &status);
130 debug_print("%2d: LOCK - lock acquired\n", rank);
136 /** Attempt to acquire a mutex.
138 * @param[in] hdl Handle to the mutex
139 * @param[out] success Indicates whether the mutex was acquired
142 int MCS_Mutex_trylock(MCS_Mutex hdl, int *success)
145 int tail, nil = MPI_PROC_NULL;
147 MPI_Comm_rank(hdl->comm, &rank);
148 MPI_Comm_size(hdl->comm, &nproc);
150 /* This store is safe, since it cannot happen concurrently with a remote
152 hdl->base[MCS_MTX_ELEM_DISP] = MPI_PROC_NULL;
153 MPI_Win_sync(hdl->window);
155 /* Check if the lock is available and claim it if it is. */
156 MPI_Compare_and_swap(&rank, &nil, &tail, MPI_INT, hdl->tail_rank,
157 MCS_MTX_TAIL_DISP, hdl->window);
158 MPI_Win_flush(hdl->tail_rank, hdl->window);
160 /* If the old tail was MPI_PROC_NULL, we have claimed the mutex */
161 *success = (tail == nil);
163 debug_print("%2d: TRYLOCK - %s\n", rank, (*success) ? "Success" : "Non-success");
171 * @param[in] hdl Handle to the mutex
174 int MCS_Mutex_unlock(MCS_Mutex hdl)
176 int rank, nproc, next;
178 MPI_Comm_rank(hdl->comm, &rank);
179 MPI_Comm_size(hdl->comm, &nproc);
181 MPI_Win_sync(hdl->window);
183 /* Read my next pointer. FOP is used since another process may write to
184 * this location concurrent with this read. */
185 MPI_Fetch_and_op(NULL, &next, MPI_INT, rank, MCS_MTX_ELEM_DISP, MPI_NO_OP, hdl->window);
186 MPI_Win_flush(rank, hdl->window);
188 if (next == MPI_PROC_NULL) {
190 int nil = MPI_PROC_NULL;
192 /* Check if we are the at the tail of the lock queue. If so, we're
193 * done. If not, we need to send notification. */
194 MPI_Compare_and_swap(&nil, &rank, &tail, MPI_INT, hdl->tail_rank,
195 MCS_MTX_TAIL_DISP, hdl->window);
196 MPI_Win_flush(hdl->tail_rank, hdl->window);
199 debug_print("%2d: UNLOCK - waiting for next pointer (tail = %d)\n", rank, tail);
200 assert(tail >= 0 && tail < nproc);
205 MPI_Fetch_and_op(NULL, &next, MPI_INT, rank, MCS_MTX_ELEM_DISP,
206 MPI_NO_OP, hdl->window);
208 MPI_Win_flush(rank, hdl->window);
209 if (next != MPI_PROC_NULL)
212 MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);
217 /* Notify the next waiting process */
218 if (next != MPI_PROC_NULL) {
219 debug_print("%2d: UNLOCK - notifying %d\n", rank, next);
220 MPI_Send(NULL, 0, MPI_BYTE, next, MCS_MUTEX_TAG, hdl->comm);
223 debug_print("%2d: UNLOCK - lock released\n", rank);