1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
15 #include "simgrid/modelchecker.h"
16 #include "mc/mc_replay.h"
22 #include <sys/types.h>
25 #include <math.h> // sqrt
/* Alias MAP_ANONYMOUS to MAP_ANON.
 * NOTE(review): no guard is visible around this define here; presumably it is
 * only meant for platforms lacking MAP_ANONYMOUS -- confirm. */
31 #define MAP_ANONYMOUS MAP_ANON
/* Declare the "smpi_bench" logging sub-category under "smpi"; presumably the
 * default category for the XBT_DEBUG/XBT_WARN/... calls of this file. */
34 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
35 "Logging specific to SMPI (benchmarking)");
37 /* Shared allocations are handled through shared memory segments.
38 * Associated data and metadata are used as follows:
41 * `allocs' dict ---- -.
42 * ---------- shared_data_t shared_metadata_t / | | |
43 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
44 * | ---------- | fd of <name> | | | size of mmap | --| | | |
45 * | | count (2) | |-- | data | \ | | |
46 * `----------------- | <name> | | ----------------- ---- |
47 * -------------------- | ^ |
49 * | | `allocs_metadata' dict |
50 * | | ---------------------- |
51 * | `-- | <addr of mmap #1> |<-'
52 * | .-- | <addr of mmap #2> |<-.
53 * | | ---------------------- |
59 * | shared_metadata_t / | |
60 * | ----------------- | | |
61 * | | size of mmap | --| | |
63 * ----------------- | | |
/* Size of a buffer able to hold a pointer printed with "%p": presumably a
 * two-character "0x" prefix, two hex digits per pointer byte and the final NUL. */
68 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
/* Module-wide dictionaries, each created by the code that first needs it:
 *  - allocs:          shared-allocation records, keyed by the hashed allocation-site name
 *  - allocs_metadata: per-mapping metadata, keyed by the "%p" rendering of the mapped address
 *  - samples:         SMPI_SAMPLE_ statistics, keyed by the string built by sample_location()
 *  - calls:           values stored by smpi_shared_set_call(), keyed by "<func>:<input>" */
70 xbt_dict_t allocs = NULL; /* Allocated on first use */
71 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
72 xbt_dict_t samples = NULL; /* Allocated on first use */
73 xbt_dict_t calls = NULL; /* Allocated on first use */
/* Durations below this threshold are not simulated by smpi_execute(). */
75 double smpi_cpu_threshold;
/* Factor used by smpi_execute() to convert a duration into an amount of flops. */
76 double smpi_running_power;
/* Index of the privatization segment currently mapped; -1 until the first switch. */
78 int smpi_loaded_page = -1;
/* Start address of the executable's .data section, as parsed from objdump output. */
79 char* smpi_start_data_exe = NULL;
/* Byte count covering the page-aligned start of .data through the end of .bss; 0 disables switching. */
80 int smpi_size_data_exe = 0;
/* Privatization flag; this file only ever resets it to 0 when privatization cannot be used. */
81 int smpi_privatize_global_variables;
/* Sum of all timer readings accumulated by smpi_bench_end(). */
82 double smpi_total_benched_time = 0;
/* Array of per-process privatization regions (file descriptor + mapped address). */
83 smpi_privatisation_region_t smpi_privatisation_regions;
/* Return the current size, in bytes, of the object behind file descriptor `fd`
 * as reported by fstat(). Aborts through xbt_die() when fstat() fails. */
96 static size_t shm_size(int fd) {
99 if(fstat(fd, &st) < 0) {
100 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
102 return (size_t)st.st_size;
/* Map `size` bytes of the shared-memory object `fd` and register the mapping.
 *
 * The object is first grown with ftruncate() when it is smaller than `size`,
 * then mapped read/write with MAP_SHARED at offset 0. A shared_metadata_t is
 * allocated and stored in `allocs_metadata` under the "%p" rendering of the
 * mapped address. Every failing system call aborts through xbt_die().
 *
 * NOTE(review): `data` is presumably recorded in the metadata entry and the
 * mapped address returned; neither statement is visible here -- confirm. */
106 static void* shm_map(int fd, size_t size, shared_data_t* data) {
108 char loc[PTR_STRLEN];
109 shared_metadata_t* meta;
// Only ever grow the backing object, never shrink it.
111 if(size > shm_size(fd)) {
112 if(ftruncate(fd, (off_t)size) < 0) {
113 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
117 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
118 if(mem == MAP_FAILED) {
119 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
// The metadata dictionary is created on first use; its values are released with xbt_free_f.
121 if(!allocs_metadata) {
122 allocs_metadata = xbt_dict_new_homogeneous(xbt_free_f);
// The textual form of the mapped address is the dictionary key (same scheme as smpi_shared_free()).
124 snprintf(loc, PTR_STRLEN, "%p", mem);
125 meta = xbt_new(shared_metadata_t, 1);
128 xbt_dict_set(allocs_metadata, loc, meta, NULL);
129 XBT_DEBUG("MMAP %zu to %p", size, mem);
/* Release the four module dictionaries: allocs, allocs_metadata, samples and calls. */
134 void smpi_bench_destroy(void)
136 xbt_dict_free(&allocs);
137 xbt_dict_free(&allocs_metadata);
138 xbt_dict_free(&samples);
139 xbt_dict_free(&calls);
/* By-reference variant of smpi_execute_flops(): dereferences `flops` and forwards it.
 * NOTE(review): the trailing underscore and pointer argument suggest a Fortran binding -- confirm. */
142 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
143 void smpi_execute_flops_(double *flops)
145 smpi_execute_flops(*flops);
/* By-reference variant of smpi_execute(): dereferences `duration` and forwards it.
 * NOTE(review): the trailing underscore and pointer argument suggest a Fortran binding -- confirm. */
148 XBT_PUBLIC(void) smpi_execute_(double *duration);
149 void smpi_execute_(double *duration)
151 smpi_execute(*duration);
/* Simulate a computation of `flops` floating-point operations on the calling
 * process's host: start a "computation" execution, tag it with the SMPI tracing
 * category, wait for its completion, then select the caller's data segment again. */
154 void smpi_execute_flops(double flops) {
155 smx_synchro_t action;
157 host = SIMIX_host_self();
158 XBT_DEBUG("Handle real computation time: %f flops", flops);
159 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
160 simcall_set_category (action, TRACE_internal_smpi_get_category());
161 simcall_host_execution_wait(action);
// Presumably other processes ran during the wait, so this process's segment is re-selected.
162 smpi_switch_data_segment(smpi_process_index());
/* Simulate `duration` seconds of computation.
 *
 * When `duration` reaches smpi_cpu_threshold it is converted to flops with
 * smpi_running_power and executed through smpi_execute_flops(), bracketed by
 * the computing-in/out trace events. The second debug message reports a
 * duration that is ignored; presumably it sits in the `else` branch, which is
 * not visible here. */
165 void smpi_execute(double duration)
167 if (duration >= smpi_cpu_threshold) {
168 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
169 double flops = duration * smpi_running_power;
170 int rank = smpi_process_index();
// Zero-initialized trace record describing this computation; handed over to the tracing layer.
171 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
172 extra->type=TRACING_COMPUTING;
173 extra->comp_size=flops;
174 TRACE_smpi_computing_in(rank, extra);
175 smpi_execute_flops(flops);
177 TRACE_smpi_computing_out(rank);
180 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
181 duration, smpi_cpu_threshold);
/* Forward declaration: the definition comes later in this file. */
185 void smpi_switch_data_segment(int dest);
/* Start benchmarking the user code that follows: select the calling process's
 * data segment, then start the per-process thread timer.
 * NOTE(review): the model-checker / record-replay test presumably returns
 * before the timer is started; that statement is not visible here -- confirm. */
187 void smpi_bench_begin(void)
189 smpi_switch_data_segment(smpi_process_index());
191 if (MC_is_active() || MC_record_replay_is_active())
194 xbt_os_threadtimer_start(smpi_process_timer());
/* Stop benchmarking user code: stop the per-process timer, refuse to run
 * inside an SMPI_SAMPLE_ block (aborts through xbt_die()), inject the elapsed
 * time into the simulation when "smpi/simulate_computation" is enabled, and
 * add it to smpi_total_benched_time.
 * NOTE(review): the model-checker / record-replay test presumably returns
 * early; that statement is not visible here -- confirm. */
197 void smpi_bench_end(void)
200 if (MC_is_active() || MC_record_replay_is_active())
203 xbt_os_timer_t timer = smpi_process_timer();
204 xbt_os_threadtimer_stop(timer);
205 // smpi_switch_data_segment(smpi_process_count());
// A set sampling flag means an MPI call was issued from within a sampled block.
206 if (smpi_process_get_sampling()) {
207 XBT_CRITICAL("Cannot do recursive benchmarks.");
208 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
209 xbt_backtrace_display_current();
210 xbt_die("Aborting.");
212 // Simulate the benchmarked computation unless disabled via command-line argument
213 if (sg_cfg_get_boolean("smpi/simulate_computation")) {
214 smpi_execute(xbt_os_timer_elapsed(timer));
217 smpi_total_benched_time += xbt_os_timer_elapsed(timer);
/* Private sleep function used by smpi_sleep() and smpi_usleep().
 * Puts the calling process to sleep for `secs` simulated seconds through
 * simcall_process_sleep(), bracketed by the sleeping-in/out trace events.
 * NOTE(review): the returned value and any bench_end/bench_begin bracketing
 * are not visible here -- confirm. */
221 static unsigned int private_sleep(double secs)
225 XBT_DEBUG("Sleep for: %lf secs", secs);
226 int rank = smpi_comm_rank(MPI_COMM_WORLD);
// Zero-initialized trace record describing this sleep; handed over to the tracing layer.
227 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
228 extra->type=TRACING_SLEEPING;
229 extra->sleep_duration=secs;
230 TRACE_smpi_sleeping_in(rank, extra);
232 simcall_process_sleep(secs);
234 TRACE_smpi_sleeping_out(rank);
/* Simulated sleep of `secs` seconds; returns whatever private_sleep() returns. */
240 unsigned int smpi_sleep(unsigned int secs)
242 return private_sleep((double)secs);
/* Simulated sleep of `usecs` microseconds: the duration is converted to seconds
 * and the result of private_sleep() is returned, cast to int. */
245 int smpi_usleep(useconds_t usecs)
247 return (int)private_sleep((double)usecs / 1000000.0);
/* Fill `tv` from the simulated clock: whole seconds go to tv_sec and the
 * fractional part, scaled to microseconds, goes to tv_usec. `tz` is not used
 * by the visible statements.
 * NOTE(review): the two consecutive tv_usec assignments differ only by the
 * cast; presumably they are alternate preprocessor branches whose directives
 * are not visible here -- confirm. The return value is not visible either. */
251 int smpi_gettimeofday(struct timeval *tv, void* tz)
255 now = SIMIX_get_clock();
257 tv->tv_sec = (time_t)now;
259 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
261 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
/* Precision value defined elsewhere; its inverse is used as the timestamp resolution. */
268 extern double sg_surf_precision;
/* Return the number of timestamp ticks per simulated second, i.e.
 * 1 / sg_surf_precision truncated to an unsigned integer. */
269 unsigned long long smpi_rastro_resolution (void)
272 double resolution = (1/sg_surf_precision);
274 return (unsigned long long)resolution;
/* Return the simulated clock as an integer tick count, using the resolution
 * given by smpi_rastro_resolution(): whole seconds and the fractional part
 * are converted separately, then summed. */
277 unsigned long long smpi_rastro_timestamp (void)
280 double now = SIMIX_get_clock();
// Integer part of the clock, in seconds.
282 unsigned long long sec = (unsigned long long)now;
// Fractional part of the clock, converted to ticks (truncated).
283 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
285 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
288 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/* Members of the per-sample-site statistics record (used as local_data_t by the smpi_sample_* functions). */
290 double threshold; /* maximal relative standard error requested (used only if positive) */
291 double relstderr; /* relative standard error observed so far */
292 double mean; /* mean of benched times, to be used if the block is disabled */
293 double sum; /* sum of benched times (to compute the mean and stderr) */
294 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
295 int iters; /* amount of requested iterations */
296 int count; /* amount of iterations done so far */
297 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
/* Build the dictionary key identifying a sample site: either "<file>:<line>"
 * or "<file>:<line>:<process index>".
 * NOTE(review): the test on `global` selecting between the two forms is not
 * visible here; presumably the global form omits the process index -- confirm.
 * The string comes from bprintf(); presumably the caller must free it. */
300 static char *sample_location(int global, const char *file, int line) {
302 return bprintf("%s:%d", file, line);
304 return bprintf("%s:%d:%d", file, line, smpi_process_index());
/* Decide whether the sample site described by `data` has been benchmarked enough.
 *
 * The starting verdict is "count >= iters". When a positive threshold is set
 * the verdict is reset to 0 while the relative standard error is above it.
 * NOTE(review): the condition guarding the first "not enough data" reset is
 * not visible here, nor is the return statement (presumably `res`) -- confirm. */
307 static int sample_enough_benchs(local_data_t *data) {
308 int res = data->count >= data->iters;
309 if (data->threshold>0.0) {
311 res = 0; // not enough data
312 if (data->relstderr > data->threshold)
313 res = 0; // stderr too high yet
315 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
316 (res?"enough benchs":"need more data"),
317 data->count, data->iters, data->relstderr, data->threshold, data->mean);
/* Entry of a sampled block (first step of the SMPI_SAMPLE_ protocol).
 *
 * Ends the benchmark in progress, raises the per-process sampling flag, and
 * fetches the statistics record of this site from `samples`. A new record is
 * set up with benching enabled and stored under the site key; an existing one
 * is checked against the requested `iters`/`threshold`, and its `benching`
 * flag is recomputed from sample_enough_benchs().
 * NOTE(review): the tests choosing between the creation and re-entry paths,
 * and the release of `loc`, are not visible here -- confirm. */
321 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
323 char *loc = sample_location(global, file, line);
326 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
327 smpi_process_set_sampling(1);
// Values of `samples` are released with free(), unlike allocs_metadata which uses xbt_free_f.
330 samples = xbt_dict_new_homogeneous(free);
332 data = xbt_dict_get_or_null(samples, loc);
334 xbt_assert(threshold>0 || iters>0,
335 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
336 data = (local_data_t *) xbt_new(local_data_t, 1);
339 data->sum_pow2 = 0.0;
341 data->threshold = threshold;
342 data->benching = 1; // If we have no data, we need at least one
344 xbt_dict_set(samples, loc, data, NULL);
345 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
347 if (data->iters != iters || data->threshold != threshold) {
348 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
349 loc, data->iters, data->threshold, iters,threshold);
353 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
354 data->benching = !sample_enough_benchs(data);
355 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
/* Loop test of a sampled block (second step of the SMPI_SAMPLE_ protocol).
 *
 * Looks up the record of this site, which must exist. While its `benching`
 * flag is 1 a new benchmark run is requested; otherwise the recorded mean is
 * injected through smpi_execute(), the sampling flag is cleared and 0 is
 * prepared as the result.
 * NOTE(review): the statements of the benchmarking branch beyond the debug
 * message, the `else`, and the return are not visible here -- confirm. */
360 int smpi_sample_2(int global, const char *file, int line)
362 char *loc = sample_location(global, file, line);
366 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
367 data = xbt_dict_get(samples, loc);
368 XBT_DEBUG("sample2 %s",loc);
371 if (data->benching==1) {
372 // we need to run a new bench
373 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
374 data->count, data->iters, data->relstderr, data->threshold, data->mean);
377 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
378 // Just sleep instead
379 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
380 data->count, data->iters, data->relstderr, data->threshold, data->mean);
381 smpi_execute(data->mean);
382 smpi_process_set_sampling(0);
383 res = 0; // prepare to capture future, unrelated computations
/* End of one sampled iteration (third step of the SMPI_SAMPLE_ protocol).
 *
 * Stops the per-process timer, reads the elapsed time as a new sample, and
 * updates the sum of squares, mean and relative standard error of the site.
 * While more benchmarks are still needed, `mean` is overwritten with the last
 * sample so that smpi_sample_2() simulates exactly this iteration.
 * NOTE(review): the updates of `sum`, `count` and `benching`, and the body of
 * the `benching==0` test, are not visible here -- confirm. */
390 void smpi_sample_3(int global, const char *file, int line)
392 char *loc = sample_location(global, file, line);
395 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
396 data = xbt_dict_get(samples, loc);
397 XBT_DEBUG("sample3 %s",loc);
400 if (data->benching==0) {
404 // ok, benchmarking this loop is over
405 xbt_os_threadtimer_stop(smpi_process_timer());
410 sample = xbt_os_timer_elapsed(smpi_process_timer());
412 data->sum_pow2 += sample * sample;
413 n = (double)data->count;
414 data->mean = data->sum / n;
// Standard error of the mean divided by the mean.
// NOTE(review): a mean of 0 makes this a division by zero (inf/NaN) -- confirm it cannot happen.
415 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
416 if (!sample_enough_benchs(data)) {
417 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
419 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
420 data->mean, data->relstderr, sample);
422 // That's enough for now, prevent sample_2 from running the same code over and over
/* Value destructor registered on the `allocs` dictionary; `p` is a shared_data_t record.
 * NOTE(review): the actual release statements are not visible here. */
427 static void smpi_shared_alloc_free(void *p)
429 shared_data_t *data = p;
/* Replace the allocation-site name `loc` by a short hashed name.
 *
 * `loc` is reallocated to 30 bytes and filled, from index 1 on, with
 * characters picked from a 64-symbol alphabet: the hex digest in `hash` is
 * consumed 6 hex digits (24 bits) at a time and each group yields 4 symbols.
 * NOTE(review): how `hash` is computed, what goes into loc[0], the NUL
 * termination and the return are not visible here -- confirm. */
434 static char *smpi_shared_alloc_hash(char *loc)
444 loc = xbt_realloc(loc, 30);
446 for (i = 0; i < 40; i += 6) { /* base64 encode */
447 memcpy(s, hash + i, 6);
448 val = strtoul(s, NULL, 16);
449 for (j = 0; j < 4; j++) {
// NOTE(review): the shifts are 18, 15, 12, 9, so consecutive 6-bit windows overlap and the
// low 9 bits of `val` never contribute; four disjoint 6-bit groups would need `18 - 6 * j`.
450 unsigned char x = (val >> (18 - 3 * j)) & 0x3f;
// NOTE(review): the alphabet below has 'Z' and 'z' where 'W' and 'w' are expected, so two
// pairs of indices map to the same symbol. This only weakens the uniqueness of the names.
451 loc[1 + 4 * i / 6 + j] =
452 "ABCDEFGHIJKLMNOPQRSTUVZXYZabcdefghijklmnopqrstuvzxyz0123456789-_"[x];
/* Allocate `size` bytes for the allocation site `file`:`line`.
 *
 * With "smpi/use_shared_malloc" enabled, every allocation made from the same
 * site maps the same shared-memory object: the site name (prefixed with the
 * pid) is hashed, looked up in `allocs`, and on a miss a new object is created
 * with shm_open(), mapped through shm_map(), unlinked right away and recorded
 * in `allocs`. On a hit the recorded descriptor is simply mapped again.
 * With the option disabled the memory comes from xbt_malloc().
 * NOTE(review): the hit/miss tests, the reference counting, the release of
 * `loc` and the return are not visible here -- confirm. */
459 void *smpi_shared_malloc(size_t size, const char *file, int line)
462 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
463 char *loc = bprintf("/%zu_%s_%d", (size_t)getpid(), file, line);
466 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
// Values of `allocs` are released through smpi_shared_alloc_free().
469 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
471 data = xbt_dict_get_or_null(allocs, loc);
// O_EXCL makes the creation fail when an object of that name already exists.
473 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
474 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
478 xbt_die("Please cleanup /dev/shm/%s", loc);
480 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
483 data = xbt_new(shared_data_t, 1);
487 mem = shm_map(fd, size, data);
// Early unlink: the name is removed at once; a failure here is only reported as a warning.
488 if (shm_unlink(loc) < 0) {
489 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
491 xbt_dict_set(allocs, loc, data, NULL);
492 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
// Known site: map the already-recorded descriptor once more.
495 mem = shm_map(data->fd, size, data);
498 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
500 mem = xbt_malloc(size);
501 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
/* Release memory obtained from smpi_shared_malloc().
 *
 * With "smpi/use_shared_malloc" enabled, the mapping metadata is looked up in
 * `allocs_metadata` under the "%p" rendering of `ptr`, the region is unmapped
 * with the recorded size, and the shared record is removed from `allocs` once
 * its count is no longer positive. Every inconsistency is only reported with
 * XBT_WARN. Otherwise the pointer is presumably freed normally.
 * NOTE(review): the guarding tests, early returns, count decrement and the
 * classic free call are not visible here -- confirm. */
506 void smpi_shared_free(void *ptr)
508 char loc[PTR_STRLEN];
509 shared_metadata_t* meta;
511 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
514 XBT_WARN("Cannot free: nothing was allocated");
517 if(!allocs_metadata) {
518 XBT_WARN("Cannot free: no metadata was allocated");
// Same key scheme as in shm_map(): textual form of the mapped address.
520 snprintf(loc, PTR_STRLEN, "%p", ptr);
521 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
523 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
528 XBT_WARN("Cannot free: something is broken in the metadata link");
531 if(munmap(ptr, meta->size) < 0) {
532 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
535 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
536 if (data->count <= 0) {
538 xbt_dict_remove(allocs, data->loc);
539 XBT_DEBUG("Shared free - with removal - of %p", ptr);
542 XBT_DEBUG("Classic free of %p", ptr);
/* Tell whether a value was already stored for the pair (`func`, `input`).
 *
 * The pair is looked up in `calls` under the key "<func>:<input>";
 * xbt_dict_get() throws when the key is missing, and the handler tests the
 * exception category against not_found_error.
 * NOTE(review): presumably other exceptions are rethrown and 1/0 is returned;
 * those statements and the release of `loc` are not visible here -- confirm. */
548 int smpi_shared_known_call(const char* func, const char* input)
550 char* loc = bprintf("%s:%s", func, input);
// Values of `calls` are not owned by the dictionary (no free function).
555 calls = xbt_dict_new_homogeneous(NULL);
558 xbt_dict_get(calls, loc); /* Succeed or throw */
565 if (ex.category != not_found_error)
/* Fetch the value stored in `calls` for the pair (`func`, `input`), using the
 * key "<func>:<input>".
 * NOTE(review): the return statement (presumably `data`) and the release of
 * `loc` are not visible here -- confirm. */
572 void* smpi_shared_get_call(const char* func, const char* input) {
573 char* loc = bprintf("%s:%s", func, input);
577 calls = xbt_dict_new_homogeneous(NULL);
579 data = xbt_dict_get(calls, loc);
/* Store `data` in `calls` for the pair (`func`, `input`), using the key
 * "<func>:<input>". The dictionary has no free function, so `data` stays
 * owned by the caller.
 * NOTE(review): the return statement (presumably `data`) and the release of
 * `loc` are not visible here -- confirm. */
584 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
585 char* loc = bprintf("%s:%s", func, input);
588 calls = xbt_dict_new_homogeneous(NULL);
590 xbt_dict_set(calls, loc, data, NULL);
// Round `addr` down to the closest multiple of xbt_pagesize (start of the page holding it).
// NOTE(review): the address goes through `unsigned long`, which assumes that type is as wide as a pointer.
598 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
601 /** Map a given SMPI privatization segment (make a SMPI process active)
603 void smpi_switch_data_segment(int dest){
// Cheap path: nothing to do when the segment of `dest` is already the mapped one.
// NOTE(review): presumably this test returns early; the statement is not visible here.
605 if (smpi_loaded_page==dest)//no need to switch either
609 smpi_really_switch_data_segment(dest);
612 /** Map a given SMPI privatization segment (make a SMPI process active)
613 * even if SMPI thinks it is already active
615 * When doing a state restoration, the state of the restored variables
616 * might not be consistent with the state of the virtual memory.
617 * In this case, we have to change the data segment.
619 void smpi_really_switch_data_segment(int dest) {
// A zero size means there is no data segment to privatize.
621 if(smpi_size_data_exe == 0)//no need to switch
624 #ifdef HAVE_PRIVATIZATION
// First switch ever: seed every region with the current content of the real data segment.
// NOTE(review): the regions array is allocated for smpi_process_count() entries, but this
// loop runs up to SIMIX_process_count() -- confirm that the two counts always agree.
626 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
627 for (i=0; i< SIMIX_process_count(); i++){
628 memcpy(smpi_privatisation_regions[i].address,
629 TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
633 // FIXME, cross-process support (mmap across process when necessary)
634 int current = smpi_privatisation_regions[dest].file_descriptor;
635 XBT_DEBUG("Switching data frame to the one of process %d", dest);
// MAP_FIXED remaps the file of `dest` over the page-aligned address of the data segment.
636 void* tmp = mmap (TOPAGE(smpi_start_data_exe), smpi_size_data_exe,
637 PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
638 if (tmp != TOPAGE(smpi_start_data_exe))
639 xbt_die("Couldn't map the new region");
640 smpi_loaded_page = dest;
/* Return non-zero when `file` starts with "/dev/shm/my-buffer-" (19 characters),
 * the prefix of the temporary files created for the privatization regions. */
644 int smpi_is_privatisation_file(char* file)
646 return strncmp("/dev/shm/my-buffer-", file, 19) == 0;
/* Find the location and size of the executable's global data.
 *
 * Runs "objdump --section-headers" on xbt_binary_name and parses its output
 * line by line: the size (3rd field) and address (5th field) of the ".data"
 * and ".bss" sections are read as hexadecimal numbers. On exit,
 * smpi_start_data_exe holds the address of .data, and smpi_size_data_exe
 * covers the page-aligned start of .data through the end of .bss.
 * NOTE(review): the updates of `found`, the skipping of unwanted lines and
 * the release of `command`, `line` and `fp` are not visible here -- confirm. */
649 void smpi_get_executable_global_size(){
// NOTE(review): sizes are parsed with strtoul() but kept in `int`; values above INT_MAX would not fit.
650 int size_bss_binary=0;
651 int size_data_binary=0;
653 char *line = NULL; /* Temporary storage for each line that is read */
654 ssize_t read; /* Number of bytes read */
655 size_t n = 0; /* Amount of bytes to read by xbt_getline */
// NOTE(review): the binary name is pasted unquoted into a command run by popen().
660 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
662 fp = popen(command, "r");
665 perror("popen failed");
669 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
674 /* Wipe out the new line character */
675 line[read - 1] = '\0';
677 lfields[0] = strtok(line, " ");
679 if(lfields[0] == NULL)
682 if(strcmp(lfields[0], "Sections:") == 0
683 || strcmp(lfields[0], "Idx") == 0
684 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
// Split up to 7 fields; stop filling as soon as strtok() runs out of tokens.
687 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
688 lfields[i] = strtok(NULL, " ");
692 * we are looking for these fields
693 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
694 CONTENTS, ALLOC, LOAD, DATA
695 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
700 if(strcmp(lfields[1], ".data") == 0){
701 size_data_binary = strtoul(lfields[2], NULL, 16);
702 smpi_start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
704 }else if(strcmp(lfields[1], ".bss") == 0){
705 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
706 //TODO : check if this is OK, as some segments may be inserted between them..
707 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (smpi_start_data_exe + size_data_binary))
708 + strtoul(lfields[2], NULL, 16);
// Total = offset of .data within its page + size of .data + (gap to .bss + size of .bss).
716 smpi_size_data_exe = (unsigned long) smpi_start_data_exe
717 - (unsigned long) TOPAGE(smpi_start_data_exe)
718 + size_data_binary+size_bss_binary;
/* Create one private copy of the executable's global data per process.
 *
 * Without HAVE_PRIVATIZATION the feature is switched off. Otherwise the
 * data+bss extent is computed by smpi_get_executable_global_size(); when it
 * is empty the feature is switched off as well. For each process a temporary
 * file is created under /dev/shm, unlinked at once, resized to the segment
 * size, mapped MAP_SHARED, and filled with the current content of the
 * segment; its descriptor and address are recorded in
 * smpi_privatisation_regions. Every failure aborts through xbt_die().
 * NOTE(review): the error tests on `status`, the errno test selecting the
 * long EMFILE message, and the early returns are not visible here. */
725 void smpi_initialize_global_memory_segments(){
727 #ifndef HAVE_PRIVATIZATION
728 smpi_privatize_global_variables=0;
733 smpi_get_executable_global_size();
735 XBT_DEBUG ("bss+data segment found : size %d starting at %p",
736 smpi_size_data_exe, smpi_start_data_exe );
738 if (smpi_size_data_exe == 0){//no need to switch
739 smpi_privatize_global_variables=0;
// NOTE(review): no check of the malloc() result is visible; the array is sized with
// smpi_process_count() while the loop below is bounded by SIMIX_process_count() --
// confirm that the two counts always agree, otherwise the array is overrun.
743 smpi_privatisation_regions = (smpi_privatisation_region_t) malloc(
744 smpi_process_count() * sizeof(struct s_smpi_privatisation_region));
746 for (i=0; i< SIMIX_process_count(); i++){
747 //create SIMIX_process_count() mappings of this size with the same data inside
748 void *address = NULL;
// Template for mkstemp(): the six trailing X are replaced to make the name unique.
749 char path[] = "/dev/shm/my-buffer-XXXXXX";
752 int file_descriptor= mkstemp (path);
753 if (file_descriptor < 0) {
755 xbt_die("Impossible to create temporary file for memory mapping: %s\n\
756 The open() system call failed with the EMFILE error code (too many files). \n\n\
757 This means that you reached the system limits concerning the amount of files per process. \
758 This is not a surprise if you are trying to virtualize many processes on top of SMPI. \
759 Don't panic -- you should simply increase your system limits and try again. \n\n\
760 First, check what your limits are:\n\
761 cat /proc/sys/fs/file-max # Gives you the system-wide limit\n\
762 ulimit -Hn # Gives you the per process hard limit\n\
763 ulimit -Sn # Gives you the per process soft limit\n\
764 cat /proc/self/limits # Displays any per-process limitation (including the one given above)\n\n\
765 If one of these values is less than the amount of MPI processes that you try to run, then you got the explanation of this error. \
766 Ask the Internet about tutorials on how to increase the files limit such as: https://rtcamp.com/tutorials/linux/increase-open-files-limit/",
769 xbt_die("Impossible to create temporary file for memory mapping: %s",
// The name is removed right away; the open descriptor keeps the file usable.
773 status = unlink (path);
775 xbt_die("Impossible to unlink temporary file for memory mapping");
777 status = ftruncate(file_descriptor, smpi_size_data_exe);
779 xbt_die("Impossible to set the size of the temporary file for memory mapping");
781 /* Ask for a free region */
782 address = mmap (NULL, smpi_size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
783 if (address == MAP_FAILED)
784 xbt_die("Couldn't find a free region for memory mapping");
786 //initialize the values
787 memcpy(address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
789 //store the address of the mapping for further switches
790 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
791 smpi_privatisation_regions[i].address = address;
/* Undo smpi_initialize_global_memory_segments(): unmap every privatization
 * region, close its file descriptor, then free the regions array. An munmap()
 * failure is only reported with XBT_WARN. Does nothing useful when the
 * segment size is 0 (presumably an early return, not visible here). */
798 void smpi_destroy_global_memory_segments(){
799 if (smpi_size_data_exe == 0)//no need to switch
801 #ifdef HAVE_PRIVATIZATION
// The bound here is smpi_process_count(), whereas the regions were filled up to SIMIX_process_count().
803 for (i=0; i< smpi_process_count(); i++){
804 if(munmap(smpi_privatisation_regions[i].address, smpi_size_data_exe) < 0) {
805 XBT_WARN("Unmapping of fd %d failed: %s",
806 smpi_privatisation_regions[i].file_descriptor, strerror(errno));
808 close(smpi_privatisation_regions[i].file_descriptor);
810 xbt_free(smpi_privatisation_regions);