1 /* Copyright (c) 2007, 2009-2015. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
15 #include "simgrid/modelchecker.h"
16 #include "mc/mc_replay.h"
22 #include <sys/types.h>
25 #include <math.h> // sqrt
31 #define MAP_ANONYMOUS MAP_ANON
34 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
35 "Logging specific to SMPI (benchmarking)");
37 /* Shared allocations are handled through shared memory segments.
38 * Associated data and metadata are used as follows:
41 * `allocs' dict ---- -.
42 * ---------- shared_data_t shared_metadata_t / | | |
43 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
44 * | ---------- | fd of <name> | | | size of mmap | --| | | |
45 * | | count (2) | |-- | data | \ | | |
46 * `----------------- | <name> | | ----------------- ---- |
47 * -------------------- | ^ |
49 * | | `allocs_metadata' dict |
50 * | | ---------------------- |
51 * | `-- | <addr of mmap #1> |<-'
52 * | .-- | <addr of mmap #2> |<-.
53 * | | ---------------------- |
59 * | shared_metadata_t / | |
60 * | ----------------- | | |
61 * | | size of mmap | --| | |
63 * ----------------- | | |
/* PTR_STRLEN: buffer size needed to format a pointer with "%p"
 * ("0x" prefix + two hex digits per pointer byte + trailing '\0'). */
68 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
/* Lazily-created dictionaries; see shm_map(), smpi_sample_1() and
 * smpi_shared_malloc() for the first-use allocation sites. */
70 xbt_dict_t allocs = NULL; /* Allocated on first use */
71 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
72 xbt_dict_t samples = NULL; /* Allocated on first use */
73 xbt_dict_t calls = NULL; /* Allocated on first use */
/* Tuning read by smpi_execute(): minimal measured duration worth simulating,
 * and the host power used to convert seconds into flops. */
75 double smpi_cpu_threshold;
76 double smpi_running_power;
/* Global-variable privatization state: index of the data segment currently
 * mapped (-1 = none yet), start/size of the executable's data+bss region,
 * and the array of per-process private copies of that region. */
78 int smpi_loaded_page = -1;
79 char* smpi_start_data_exe = NULL;
80 int smpi_size_data_exe = 0;
81 int smpi_privatize_global_variables;
/* Accumulated wall-clock time spent in benchmarked (real) computation. */
82 double smpi_total_benched_time = 0;
83 smpi_privatisation_region_t smpi_privatisation_regions;
/* Return the current size of the file behind `fd' (a shared-memory segment
 * created for SMPI shared allocations). Aborts via xbt_die() if fstat fails. */
96 static size_t shm_size(int fd) {
99 if(fstat(fd, &st) < 0) {
100 xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
102 return (size_t)st.st_size;
/* Map `size' bytes of the shared-memory file `fd' into this process.
 * Grows the backing file with ftruncate() if it is currently smaller than
 * `size', then mmap()s it shared read/write. Registers a shared_metadata_t
 * entry (keyed by the mapped address, printed with "%p") in the lazily
 * created `allocs_metadata' dict so smpi_shared_free() can find the mapping
 * size and owning shared_data_t later. Aborts on ftruncate/mmap failure. */
106 static void* shm_map(int fd, size_t size, shared_data_t* data) {
108 char loc[PTR_STRLEN];
109 shared_metadata_t* meta;
111 if(size > shm_size(fd)) {
112 if(ftruncate(fd, (off_t)size) < 0) {
113 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
117 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
118 if(mem == MAP_FAILED) {
119 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
121 if(!allocs_metadata) {
122 allocs_metadata = xbt_dict_new_homogeneous(xbt_free_f);
/* Key the metadata by the textual address of the new mapping. */
124 snprintf(loc, PTR_STRLEN, "%p", mem);
125 meta = xbt_new(shared_metadata_t, 1);
128 xbt_dict_set(allocs_metadata, loc, meta, NULL);
129 XBT_DEBUG("MMAP %zu to %p", size, mem);
/* Release every module-level dictionary (shared allocations, their metadata,
 * SMPI_SAMPLE_ records and known-call cache). xbt_dict_free() accepts NULL
 * dicts and resets the pointers, so calling this is always safe. */
134 void smpi_bench_destroy(void)
136 xbt_dict_free(&allocs);
137 xbt_dict_free(&allocs_metadata);
138 xbt_dict_free(&samples);
139 xbt_dict_free(&calls);
/* Fortran binding: trailing-underscore wrapper that dereferences the
 * by-reference argument and forwards to smpi_execute_flops(). */
142 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
143 void smpi_execute_flops_(double *flops)
145 smpi_execute_flops(*flops);
/* Fortran binding: trailing-underscore wrapper forwarding to smpi_execute(). */
148 XBT_PUBLIC(void) smpi_execute_(double *duration);
149 void smpi_execute_(double *duration)
151 smpi_execute(*duration);
/* Simulate `flops' floating-point operations as a SIMIX computation action:
 * create the action, tag it with the SMPI tracing category, and block until
 * the simulated execution completes. Afterwards re-map this process's
 * private data segment, since the simcall may have context-switched us. */
154 void smpi_execute_flops(double flops) {
155 smx_synchro_t action;
156 XBT_DEBUG("Handle real computation time: %f flops", flops);
157 action = simcall_process_execute("computation", flops, 1, 0, 0);
158 simcall_set_category (action, TRACE_internal_smpi_get_category());
159 simcall_process_execution_wait(action);
160 smpi_switch_data_segment(smpi_process_index());
/* Inject `duration' seconds of real (benchmarked) computation into the
 * simulation, but only when it exceeds the smpi/cpu_threshold option —
 * shorter durations are considered noise and ignored. The duration is
 * converted to flops using the configured host power, and the computation
 * is wrapped in TRACE_smpi_computing_in/out markers for tracing. */
163 void smpi_execute(double duration)
165 if (duration >= smpi_cpu_threshold) {
166 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
167 double flops = duration * smpi_running_power;
168 int rank = smpi_process_index();
169 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
170 extra->type=TRACING_COMPUTING;
171 extra->comp_size=flops;
172 TRACE_smpi_computing_in(rank, extra);
173 smpi_execute_flops(flops);
175 TRACE_smpi_computing_out(rank);
/* Below threshold: log and skip to avoid flooding the engine with
 * negligible computation actions. */
178 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
179 duration, smpi_cpu_threshold);
/* Forward declaration (defined in the privatization section below). */
183 void smpi_switch_data_segment(int dest);
/* Enter a benchmarked section: re-map this process's private globals and
 * start the per-process thread timer — unless running under the
 * model-checker or a replay, where wall-clock timing is meaningless. */
185 void smpi_bench_begin(void)
187 smpi_switch_data_segment(smpi_process_index());
189 if (MC_is_active() || MC_record_replay_is_active())
192 xbt_os_threadtimer_start(smpi_process_timer());
/* Leave a benchmarked section: stop the thread timer, refuse to run inside
 * an SMPI_SAMPLE_ block (recursive benchmarking is unsupported and fatal),
 * then optionally inject the measured elapsed time into the simulation
 * (controlled by the smpi/simulate_computation option) and accumulate it
 * into smpi_total_benched_time. Skipped entirely under MC/replay. */
195 void smpi_bench_end(void)
198 if (MC_is_active() || MC_record_replay_is_active())
201 xbt_os_timer_t timer = smpi_process_timer();
202 xbt_os_threadtimer_stop(timer);
203 // smpi_switch_data_segment(smpi_process_count());
204 if (smpi_process_get_sampling()) {
205 XBT_CRITICAL("Cannot do recursive benchmarks.");
206 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
207 xbt_backtrace_display_current();
208 xbt_die("Aborting.");
210 // Simulate the benchmarked computation unless disabled via command-line argument
211 if (sg_cfg_get_boolean("smpi/simulate_computation")) {
212 smpi_execute(xbt_os_timer_elapsed(timer));
215 smpi_total_benched_time += xbt_os_timer_elapsed(timer);
218 /* Private sleep function used by smpi_sleep() and smpi_usleep().
 * Sleeps for `secs' simulated seconds via simcall_process_sleep(), bracketed
 * by TRACE_smpi_sleeping_in/out markers for the rank in MPI_COMM_WORLD.
 * Returns an unsigned int in the spirit of sleep(3); the exact return value
 * comes from code outside this excerpt. */
219 static unsigned int private_sleep(double secs)
223 XBT_DEBUG("Sleep for: %lf secs", secs);
224 int rank = smpi_comm_rank(MPI_COMM_WORLD);
225 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
226 extra->type=TRACING_SLEEPING;
227 extra->sleep_duration=secs;
228 TRACE_smpi_sleeping_in(rank, extra);
230 simcall_process_sleep(secs);
232 TRACE_smpi_sleeping_out(rank);
/* sleep(3) replacement for SMPI applications: simulated, not real, sleep. */
238 unsigned int smpi_sleep(unsigned int secs)
240 return private_sleep((double)secs);
/* usleep(3) replacement: converts microseconds to seconds and delegates to
 * private_sleep(). */
243 int smpi_usleep(useconds_t usecs)
245 return (int)private_sleep((double)usecs / 1000000.0);
/* gettimeofday(2) replacement returning the simulated clock instead of the
 * wall clock. The timezone argument is ignored. */
249 int smpi_gettimeofday(struct timeval *tv, void* tz)
253 now = SIMIX_get_clock();
255 tv->tv_sec = (time_t)now;
/* NOTE(review): the next two assignments are the same computation with two
 * different casts (useconds_t vs suseconds_t) — presumably alternative
 * branches of a platform #ifdef (e.g. Win32 vs POSIX) whose directives are
 * not visible in this excerpt; verify against the full source. */
257 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
259 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
/* Clock resolution for the Rastro tracing format: the inverse of the surf
 * timing precision, truncated to an integer number of ticks per second. */
266 extern double sg_surf_precision;
267 unsigned long long smpi_rastro_resolution (void)
270 double resolution = (1/sg_surf_precision);
272 return (unsigned long long)resolution;
/* Current simulated time as a Rastro timestamp: whole seconds scaled by the
 * resolution, plus the fractional part expressed in resolution ticks. */
275 unsigned long long smpi_rastro_timestamp (void)
278 double now = SIMIX_get_clock();
280 unsigned long long sec = (unsigned long long)now;
281 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
283 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
286 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/* Fields of the per-sampling-site record (presumably `local_data_t', whose
 * struct header is outside this excerpt — confirm against the full file).
 * One instance exists per SMPI_SAMPLE_ location, keyed in `samples'. */
288 double threshold; /* maximal stderr requested (if positive) */
289 double relstderr; /* observed stderr so far */
290 double mean; /* mean of benched times, to be used if the block is disabled */
291 double sum; /* sum of benched times (to compute the mean and stderr) */
292 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
293 int iters; /* amount of requested iterations */
294 int count; /* amount of iterations done so far */
295 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
/* Build the dict key identifying a sampling site: "file:line" when the
 * sample is shared by all processes (`global'), otherwise
 * "file:line:process-index" so each rank keeps its own statistics.
 * Returns a malloc'ed string the caller must free. */
298 static char *sample_location(int global, const char *file, int line) {
300 return bprintf("%s:%d", file, line);
302 return bprintf("%s:%d:%d", file, line, smpi_process_index());
/* Decide whether a sampling site has gathered enough measurements:
 * the requested iteration count must be reached, and — when a positive
 * stderr threshold was requested — the observed relative standard error
 * must have dropped below it. Returns non-zero when benching can stop. */
305 static int sample_enough_benchs(local_data_t *data) {
306 int res = data->count >= data->iters;
307 if (data->threshold>0.0) {
309 res = 0; // not enough data
310 if (data->relstderr > data->threshold)
311 res = 0; // stderr too high yet
313 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
314 (res?"enough benchs":"need more data"),
315 data->count, data->iters, data->relstderr, data->threshold, data->mean);
/* First phase of the SMPI_SAMPLE_ macros: called when execution reaches a
 * sampled block. Ends the surrounding benchmark, marks the process as
 * sampling, and looks up (or creates) the site's local_data_t in `samples'.
 * On first visit the record is initialized from `iters'/`threshold'; on
 * later visits mismatched settings are reported as an error, and
 * sample_enough_benchs() decides whether more real benching is needed. */
319 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
321 char *loc = sample_location(global, file, line);
324 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
325 smpi_process_set_sampling(1);
328 samples = xbt_dict_new_homogeneous(free);
330 data = xbt_dict_get_or_null(samples, loc);
332 xbt_assert(threshold>0 || iters>0,
333 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
334 data = (local_data_t *) xbt_new(local_data_t, 1);
337 data->sum_pow2 = 0.0;
339 data->threshold = threshold;
340 data->benching = 1; // If we have no data, we need at least one
342 xbt_dict_set(samples, loc, data, NULL);
343 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
345 if (data->iters != iters || data->threshold != threshold) {
346 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
347 loc, data->iters, data->threshold, iters,threshold);
351 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
352 data->benching = !sample_enough_benchs(data);
353 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
/* Second phase of SMPI_SAMPLE_: loop condition of the sampled block.
 * If the site still needs measurements, let the real code run (benching
 * branch); otherwise replay the recorded mean duration via smpi_execute(),
 * clear the sampling flag, and return 0 so the macro's loop terminates.
 * Aborts if smpi_sample_1() was never called (no `samples' dict). */
358 int smpi_sample_2(int global, const char *file, int line)
360 char *loc = sample_location(global, file, line);
364 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
365 data = xbt_dict_get(samples, loc);
366 XBT_DEBUG("sample2 %s",loc);
369 if (data->benching==1) {
370 // we need to run a new bench
371 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
372 data->count, data->iters, data->relstderr, data->threshold, data->mean);
375 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
376 // Just sleep instead
377 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
378 data->count, data->iters, data->relstderr, data->threshold, data->mean);
379 smpi_execute(data->mean);
380 smpi_process_set_sampling(0);
381 res = 0; // prepare to capture future, unrelated computations
/* Third phase of SMPI_SAMPLE_: bottom of the sampled loop body, reached
 * after one real execution of the benched code. Stops the thread timer and
 * folds the measured sample into the running statistics (sum, sum of
 * squares, mean, relative stderr). While more benches are still needed, the
 * mean is overwritten with this latest sample so that smpi_sample_2()
 * simulates the duration of THIS iteration rather than the historic mean. */
388 void smpi_sample_3(int global, const char *file, int line)
390 char *loc = sample_location(global, file, line);
393 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
394 data = xbt_dict_get(samples, loc);
395 XBT_DEBUG("sample3 %s",loc);
398 if (data->benching==0) {
402 // ok, benchmarking this loop is over
403 xbt_os_threadtimer_stop(smpi_process_timer());
408 sample = xbt_os_timer_elapsed(smpi_process_timer());
410 data->sum_pow2 += sample * sample;
411 n = (double)data->count;
412 data->mean = data->sum / n;
/* relstderr = stderr of the mean divided by the mean itself. */
413 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
414 if (!sample_enough_benchs(data)) {
415 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
417 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
418 data->mean, data->relstderr, sample);
420 // That's enough for now, prevent sample_2 to run the same code over and over
/* Destructor installed on the `allocs' dict: releases one shared_data_t
 * entry (remaining cleanup lines are outside this excerpt). */
425 static void smpi_shared_alloc_free(void *p)
427 shared_data_t *data = p;
/* Shorten a shared-allocation name: hash `loc' (hash computation is outside
 * this excerpt) and base64-encode the digest in place, reusing the
 * reallocated buffer. The input string is consumed (realloc'ed) and the
 * same pointer family is returned to the caller.
 * NOTE(review): the encoding alphabet below reads "...UVZXYZ..." /
 * "...uvzxyz..." — 'W'/'w' are missing and 'Z'/'z' appear twice, so two
 * distinct 6-bit values map to the same character. Uniqueness of the
 * generated names is weakened; compare with upstream which uses the
 * standard "...UVWXYZ..." alphabet — confirm and fix at the string. */
432 static char *smpi_shared_alloc_hash(char *loc)
442 loc = xbt_realloc(loc, 30);
444 for (i = 0; i < 40; i += 6) { /* base64 encode */
445 memcpy(s, hash + i, 6);
446 val = strtoul(s, NULL, 16);
447 for (j = 0; j < 4; j++) {
448 unsigned char x = (val >> (18 - 3 * j)) & 0x3f;
449 loc[1 + 4 * i / 6 + j] =
450 "ABCDEFGHIJKLMNOPQRSTUVZXYZabcdefghijklmnopqrstuvzxyz0123456789-_"[x];
/* SMPI_SHARED_MALLOC backend. When the smpi/use_shared_malloc option is on,
 * allocations made at the same file:line by different ranks share one
 * POSIX shared-memory segment: the name is derived from pid+file+line
 * (then hashed to fit shm_open limits), created with O_EXCL on first use,
 * mapped via shm_map(), early-unlinked so the kernel reclaims it when the
 * last fd closes, and recorded in `allocs'. Subsequent calls at the same
 * site re-map the existing fd. When the option is off, this degrades to a
 * plain xbt_malloc. Returns the usable memory address. */
457 void *smpi_shared_malloc(size_t size, const char *file, int line)
460 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
461 char *loc = bprintf("/%zu_%s_%d", (size_t)getpid(), file, line);
464 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
467 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
469 data = xbt_dict_get_or_null(allocs, loc);
471 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
472 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
/* O_EXCL failed: a stale segment from a previous crashed run exists. */
476 xbt_die("Please cleanup /dev/shm/%s", loc);
478 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
481 data = xbt_new(shared_data_t, 1);
485 mem = shm_map(fd, size, data);
/* Unlink right away: the mapping stays valid, and the segment vanishes
 * automatically once every process closes its fd. */
486 if (shm_unlink(loc) < 0) {
487 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
489 xbt_dict_set(allocs, loc, data, NULL);
490 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
/* Known site: reuse the stored fd for an additional mapping. */
493 mem = shm_map(data->fd, size, data);
496 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
498 mem = xbt_malloc(size);
499 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
/* Free a pointer obtained from smpi_shared_malloc(). In shared mode, looks
 * the address up in `allocs_metadata' (keyed by "%p"), munmap()s the region,
 * decrements the owning shared_data_t's reference count, and removes the
 * dict entry (closing the fd via the dict destructor) once the count drops
 * to zero. Problems are reported with XBT_WARN rather than aborting.
 * In non-shared mode this is an ordinary free (tail outside this excerpt). */
504 void smpi_shared_free(void *ptr)
506 char loc[PTR_STRLEN];
507 shared_metadata_t* meta;
509 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
512 XBT_WARN("Cannot free: nothing was allocated");
515 if(!allocs_metadata) {
516 XBT_WARN("Cannot free: no metadata was allocated");
518 snprintf(loc, PTR_STRLEN, "%p", ptr);
519 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
521 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
526 XBT_WARN("Cannot free: something is broken in the metadata link");
529 if(munmap(ptr, meta->size) < 0) {
530 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
533 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
534 if (data->count <= 0) {
536 xbt_dict_remove(allocs, data->loc);
537 XBT_DEBUG("Shared free - with removal - of %p", ptr);
540 XBT_DEBUG("Classic free of %p", ptr);
/* Return whether a result for (func, input) was already stored with
 * smpi_shared_set_call(). Uses xbt_dict_get(), which throws on a missing
 * key; a not_found_error is treated as "unknown", any other exception is
 * re-raised (exception plumbing partially outside this excerpt). */
546 int smpi_shared_known_call(const char* func, const char* input)
548 char* loc = bprintf("%s:%s", func, input);
553 calls = xbt_dict_new_homogeneous(NULL);
556 xbt_dict_get(calls, loc); /* Succeed or throw */
563 if (ex.category != not_found_error)
/* Fetch the stored result for (func, input). Throws (via xbt_dict_get) if
 * no result was recorded — callers should check smpi_shared_known_call()
 * first. */
570 void* smpi_shared_get_call(const char* func, const char* input) {
571 char* loc = bprintf("%s:%s", func, input);
575 calls = xbt_dict_new_homogeneous(NULL);
577 data = xbt_dict_get(calls, loc);
/* Record `data' as the shared result of calling `func' on `input', so other
 * ranks can skip recomputing it. The dict does not own `data' (NULL free
 * function). */
582 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
583 char* loc = bprintf("%s:%s", func, input);
586 calls = xbt_dict_new_homogeneous(NULL);
588 xbt_dict_set(calls, loc, data, NULL);
/* Round `addr' down to the start of its memory page. */
596 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
599 /** Map a given SMPI privatization segment (make a SMPI process active).
 * Cheap fast path: does nothing when the requested segment is already the
 * one currently mapped; otherwise delegates to the unconditional
 * smpi_really_switch_data_segment(). */
601 void smpi_switch_data_segment(int dest){
603 if (smpi_loaded_page==dest)//no need to switch either
607 smpi_really_switch_data_segment(dest);
610 /** Map a given SMPI privatization segment (make a SMPI process active)
611 * even if SMPI thinks it is already active
613 * When doing a state restoration, the state of the restored variables
614 * might not be consistent with the state of the virtual memory.
615 * In this case, we need to change the data segment.
 *
 * On the very first switch, every process's private region is seeded with a
 * copy of the current (real) data segment. The switch itself mmap()s the
 * destination process's backing file over the executable's data+bss pages
 * with MAP_FIXED, replacing the visible globals in place. No-op when
 * privatization found nothing to switch (smpi_size_data_exe == 0) or when
 * built without HAVE_PRIVATIZATION. */
617 void smpi_really_switch_data_segment(int dest) {
619 if(smpi_size_data_exe == 0)//no need to switch
622 #ifdef HAVE_PRIVATIZATION
624 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
625 for (i=0; i< SIMIX_process_count(); i++){
626 memcpy(smpi_privatisation_regions[i].address,
627 TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
631 // FIXME, cross-process support (mmap across process when necessary)
632 int current = smpi_privatisation_regions[dest].file_descriptor;
633 XBT_DEBUG("Switching data frame to the one of process %d", dest);
634 void* tmp = mmap (TOPAGE(smpi_start_data_exe), smpi_size_data_exe,
635 PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
636 if (tmp != TOPAGE(smpi_start_data_exe))
637 xbt_die("Couldn't map the new region");
638 smpi_loaded_page = dest;
/* Return non-zero when `file' names one of the privatization backing files
 * created by smpi_initialize_global_memory_segments() (the mkstemp template
 * below is "/dev/shm/my-buffer-XXXXXX"; 19 is the length of the fixed
 * prefix). */
642 int smpi_is_privatisation_file(char* file)
644 return strncmp("/dev/shm/my-buffer-", file, 19) == 0;
/* Discover the location and size of the executable's global variables by
 * parsing `objdump --section-headers' output for the .data and .bss
 * sections. Sets smpi_start_data_exe to the .data load address and
 * smpi_size_data_exe to the page-aligned span covering both sections.
 * NOTE(review): relies on objdump being installed and on its exact output
 * format; also assumes .bss directly follows .data (see the TODO below). */
647 void smpi_get_executable_global_size(){
648 int size_bss_binary=0;
649 int size_data_binary=0;
651 char *line = NULL; /* Temporary storage for each line that is read */
652 ssize_t read; /* Number of bytes read */
653 size_t n = 0; /* Amount of bytes to read by xbt_getline */
658 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
660 fp = popen(command, "r");
663 perror("popen failed");
/* Scan until both .data and .bss have been found (found == 2). */
667 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
672 /* Wipeout the new line character */
673 line[read - 1] = '\0';
675 lfields[0] = strtok(line, " ");
677 if(lfields[0] == NULL)
/* Skip objdump's header lines. */
680 if(strcmp(lfields[0], "Sections:") == 0
681 || strcmp(lfields[0], "Idx") == 0
682 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
/* Split the remaining whitespace-separated columns (name, size, VMA, ...). */
685 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
686 lfields[i] = strtok(NULL, " ");
690 * we are looking for these fields
691 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
692 CONTENTS, ALLOC, LOAD, DATA
693 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
/* lfields[1] = section name, lfields[2] = size, lfields[4] = LMA. */
698 if(strcmp(lfields[1], ".data") == 0){
699 size_data_binary = strtoul(lfields[2], NULL, 16);
700 smpi_start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
702 }else if(strcmp(lfields[1], ".bss") == 0){
703 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
704 //TODO : check if this is OK, as some segments may be inserted between them..
705 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (smpi_start_data_exe + size_data_binary))
706 + strtoul(lfields[2], NULL, 16);
/* Total size, extended down to the page boundary containing .data. */
714 smpi_size_data_exe = (unsigned long) smpi_start_data_exe
715 - (unsigned long) TOPAGE(smpi_start_data_exe)
716 + size_data_binary+size_bss_binary;
/* Set up global-variable privatization: measure the executable's data+bss
 * region, then create one anonymous backing file per simulated process
 * (mkstemp in /dev/shm, immediately unlinked), size it with ftruncate, mmap
 * it, and seed it with a copy of the current globals. The fd and address of
 * each region are stored in smpi_privatisation_regions for later switches.
 * Privatization is disabled (flag forced to 0) when unsupported at build
 * time or when no data segment was found. */
723 void smpi_initialize_global_memory_segments(){
725 #ifndef HAVE_PRIVATIZATION
726 smpi_privatize_global_variables=0;
731 smpi_get_executable_global_size();
733 XBT_DEBUG ("bss+data segment found : size %d starting at %p",
734 smpi_size_data_exe, smpi_start_data_exe );
736 if (smpi_size_data_exe == 0){//no need to switch
737 smpi_privatize_global_variables=0;
741 smpi_privatisation_regions = (smpi_privatisation_region_t) malloc(
742 smpi_process_count() * sizeof(struct s_smpi_privatisation_region));
744 for (i=0; i< SIMIX_process_count(); i++){
745 //create SIMIX_process_count() mappings of this size with the same data inside
746 void *address = NULL;
747 char path[] = "/dev/shm/my-buffer-XXXXXX";
750 int file_descriptor= mkstemp (path);
751 if (file_descriptor < 0) {
/* Dedicated diagnostic for EMFILE: running many ranks exhausts the
 * per-process fd limit, a common user-side configuration issue. */
753 xbt_die("Impossible to create temporary file for memory mapping: %s\n\
754 The open() system call failed with the EMFILE error code (too many files). \n\n\
755 This means that you reached the system limits concerning the amount of files per process. \
756 This is not a surprise if you are trying to virtualize many processes on top of SMPI. \
757 Don't panic -- you should simply increase your system limits and try again. \n\n\
758 First, check what your limits are:\n\
759 cat /proc/sys/fs/file-max # Gives you the system-wide limit\n\
760 ulimit -Hn # Gives you the per process hard limit\n\
761 ulimit -Sn # Gives you the per process soft limit\n\
762 cat /proc/self/limits # Displays any per-process limitation (including the one given above)\n\n\
763 If one of these values is less than the amount of MPI processes that you try to run, then you got the explanation of this error. \
764 Ask the Internet about tutorials on how to increase the files limit such as: https://rtcamp.com/tutorials/linux/increase-open-files-limit/",
767 xbt_die("Impossible to create temporary file for memory mapping: %s",
/* Unlink now: the fd keeps the file alive until we close it. */
771 status = unlink (path);
773 xbt_die("Impossible to unlink temporary file for memory mapping");
775 status = ftruncate(file_descriptor, smpi_size_data_exe);
777 xbt_die("Impossible to set the size of the temporary file for memory mapping");
779 /* Ask for a free region */
780 address = mmap (NULL, smpi_size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
781 if (address == MAP_FAILED)
782 xbt_die("Couldn't find a free region for memory mapping");
784 //initialize the values
785 memcpy(address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
787 //store the address of the mapping for further switches
788 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
789 smpi_privatisation_regions[i].address = address;
/* Tear down privatization: munmap and close every per-process region
 * created by smpi_initialize_global_memory_segments(), then free the region
 * array. munmap failures are only warned about. No-op when privatization
 * was never set up (smpi_size_data_exe == 0) or unsupported at build time. */
796 void smpi_destroy_global_memory_segments(){
797 if (smpi_size_data_exe == 0)//no need to switch
799 #ifdef HAVE_PRIVATIZATION
801 for (i=0; i< smpi_process_count(); i++){
802 if(munmap(smpi_privatisation_regions[i].address, smpi_size_data_exe) < 0) {
803 XBT_WARN("Unmapping of fd %d failed: %s",
804 smpi_privatisation_regions[i].file_descriptor, strerror(errno));
806 close(smpi_privatisation_regions[i].file_descriptor);
808 xbt_free(smpi_privatisation_regions);