1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
15 #include "simgrid/modelchecker.h"
21 #include <sys/types.h>
24 #include <math.h> // sqrt
30 #define MAP_ANONYMOUS MAP_ANON
33 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
34 "Logging specific to SMPI (benchmarking)");
/* Shared allocations are handled through shared memory segments.
 * Associated data and metadata are used as follows:
 *
 *                                                                    mmap #1
 *    `allocs' dict                                                     ---- -.
 *    ----------      shared_data_t               shared_metadata_t   / |  |  |
 * .->| <name> | ---> -------------------- <--.   -----------------   | |  |  |
 * |  ----------      | fd of <name>     |    |   | size of mmap  | --| |  |  |
 * |                  | count (2)        |    |-- | data          |   \ |  |  |
 * `----------------- | <name>           |    |   -----------------     ----  |
 *                    --------------------    |   ^                           |
 *                                            |   |                           |
 *                                            |   |   `allocs_metadata' dict  |
 *                                            |   |   ----------------------  |
 *                                            |   `-- | <addr of mmap #1>  |<-'
 *                                            |   .-- | <addr of mmap #2>  |<-.
 *                                            |   |   ----------------------  |
 *                                            |   |                           |
 *                                            |   |                           |
 *                                            |   |                           |
 *                                            |   |                   mmap #2 |
 *                                            |   v                     ---- -'
 *                                            |   shared_metadata_t   / |  |
 *                                            |   -----------------   | |  |
 *                                            |   | size of mmap  | --| |  |
 *                                            `-- | data          |   | |  |
 *                                                -----------------   | |  |
 *                                                                    \ |  |
 *                                                                      ----
 */
67 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
/* Dictionaries used by this file. Each one starts as NULL and is created by
 * the first function that needs it. */
69 xbt_dict_t allocs = NULL; /* Allocated on first use */
70 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
71 xbt_dict_t samples = NULL; /* Allocated on first use */
72 xbt_dict_t calls = NULL; /* Allocated on first use */
/* smpi_execute() compares a measured duration against this threshold. */
74 double smpi_cpu_threshold;
/* Factor by which smpi_execute() multiplies a duration to obtain flops. */
75 double smpi_running_power;
/* Index of the privatisation segment currently mapped; -1 before the first switch. */
77 int smpi_loaded_page = -1;
/* Start address and size of the data area handled by the privatisation code;
 * both are computed by smpi_get_executable_global_size(). */
78 char* start_data_exe = NULL;
79 int size_data_exe = 0;
80 int smpi_privatize_global_variables;
/* Sum of the timer values accumulated by smpi_bench_end(). */
81 double smpi_total_benched_time = 0;
/* Array holding one mapping (file descriptor + address) per process. */
84 smpi_privatisation_region_t smpi_privatisation_regions;
/* Return the current size, in bytes, of the file referred to by `fd`.
 * Aborts through xbt_die() with the errno message when fstat() fails. */
static size_t shm_size(int fd) {
  struct stat st;

  if (fstat(fd, &st) < 0) {
    xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
  }
  return (size_t)st.st_size;
}
107 static void* shm_map(int fd, size_t size, shared_data_t* data) {
109 char loc[PTR_STRLEN];
110 shared_metadata_t* meta;
112 if(size > shm_size(fd)) {
113 if(ftruncate(fd, (off_t)size) < 0) {
114 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
118 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
119 if(mem == MAP_FAILED) {
120 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
122 if(!allocs_metadata) {
123 allocs_metadata = xbt_dict_new_homogeneous(xbt_free_f);
125 snprintf(loc, PTR_STRLEN, "%p", mem);
126 meta = xbt_new(shared_metadata_t, 1);
129 xbt_dict_set(allocs_metadata, loc, meta, NULL);
130 XBT_DEBUG("MMAP %zu to %p", size, mem);
135 void smpi_bench_destroy(void)
137 xbt_dict_free(&allocs);
138 xbt_dict_free(&allocs_metadata);
139 xbt_dict_free(&samples);
140 xbt_dict_free(&calls);
143 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
144 void smpi_execute_flops_(double *flops)
146 smpi_execute_flops(*flops);
149 XBT_PUBLIC(void) smpi_execute_(double *duration);
150 void smpi_execute_(double *duration)
152 smpi_execute(*duration);
/* Simulate `flops` operations on the host of the calling process: start an
 * execution named "computation", wait for it, then switch to the data
 * segment of the calling SMPI process. */
155 void smpi_execute_flops(double flops) {
158 host = SIMIX_host_self();
159 XBT_DEBUG("Handle real computation time: %f flops", flops);
160 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
/* Attach the SMPI tracing category to the execution. */
162 simcall_set_category (action, TRACE_internal_smpi_get_category());
/* Block until the simulated execution is over. */
164 simcall_host_execution_wait(action);
165 smpi_switch_data_segment(smpi_process_index());
/* Inject `duration` seconds of computation into the simulation.
 * When duration >= smpi_cpu_threshold it is converted to flops with
 * smpi_running_power and run through smpi_execute_flops(), bracketed by
 * TRACE_smpi_computing_in/out events. The second debug message covers the
 * other case, where the duration is ignored. */
168 void smpi_execute(double duration)
170 if (duration >= smpi_cpu_threshold) {
171 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
172 double flops = duration * smpi_running_power;
/* Describe the computation for the trace before running it. */
174 int rank = smpi_process_index();
175 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
176 extra->type=TRACING_COMPUTING;
177 extra->comp_size=flops;
178 TRACE_smpi_computing_in(rank, extra);
180 smpi_execute_flops(flops);
183 TRACE_smpi_computing_out(rank);
187 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
188 duration, smpi_cpu_threshold);
192 void smpi_switch_data_segment(int dest);
/* Start timing user code: select the data segment of the calling process,
 * then start that process' thread timer. */
194 void smpi_bench_begin(void)
196 smpi_switch_data_segment(smpi_process_index());
201 xbt_os_threadtimer_start(smpi_process_timer());
/* Stop the thread timer of the calling process and account for the elapsed
 * time: it is injected into the simulation through smpi_execute() when the
 * "smpi/simulate_computation" option is set, and added to
 * smpi_total_benched_time. Aborts when the process is inside a sampling
 * block. */
204 void smpi_bench_end(void)
210 xbt_os_timer_t timer = smpi_process_timer();
211 xbt_os_threadtimer_stop(timer);
212 // smpi_switch_data_segment(smpi_process_count());
/* The sampling flag is set by smpi_sample_1(); reaching this point with it
 * set is treated as a fatal misuse. */
213 if (smpi_process_get_sampling()) {
214 XBT_CRITICAL("Cannot do recursive benchmarks.");
215 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
216 xbt_backtrace_display_current();
217 xbt_die("Aborting.");
219 // Simulate the benchmarked computation unless disabled via command-line argument
220 if (sg_cfg_get_boolean("smpi/simulate_computation")) {
221 smpi_execute(xbt_os_timer_elapsed(timer));
224 smpi_total_benched_time += xbt_os_timer_elapsed(timer);
227 /* Private sleep function used by smpi_sleep() and smpi_usleep() */
/* Makes the calling process sleep for `secs` simulated seconds through
 * simcall_process_sleep(), bracketed by TRACE_smpi_sleeping_in/out events
 * for its rank in MPI_COMM_WORLD. */
228 static unsigned int private_sleep(double secs)
232 XBT_DEBUG("Sleep for: %lf secs", secs);
234 int rank = smpi_comm_rank(MPI_COMM_WORLD);
235 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
236 extra->type=TRACING_SLEEPING;
237 extra->sleep_duration=secs;
238 TRACE_smpi_sleeping_in(rank, extra);
240 simcall_process_sleep(secs);
242 TRACE_smpi_sleeping_out(rank);
/* Sleep for `secs` simulated seconds.
 * Returns whatever private_sleep() returns. */
unsigned int smpi_sleep(unsigned int secs)
{
  return private_sleep((double)secs);
}
254 int smpi_usleep(useconds_t usecs)
256 return (int)private_sleep((double)usecs / 1000000.0);
/* Fill `tv` from the simulated clock (SIMIX_get_clock()): whole seconds go
 * to tv_sec, the fractional part scaled by 1e6 goes to tv_usec.
 * `tz` is not used by the visible code. */
260 int smpi_gettimeofday(struct timeval *tv, void* tz)
264 now = SIMIX_get_clock();
266 tv->tv_sec = (time_t)now;
/* NOTE(review): the two assignments below differ only by the cast type;
 * presumably they are alternative branches of a platform conditional that
 * is not visible here - confirm. */
268 tv->tv_usec = (useconds_t)((now - tv->tv_sec) * 1e6);
270 tv->tv_usec = (suseconds_t)((now - tv->tv_sec) * 1e6);
277 extern double sg_surf_precision;
/* Return the number of clock ticks per second used for timestamps, computed
 * as 1 / sg_surf_precision and truncated to an unsigned integer. */
278 unsigned long long smpi_rastro_resolution (void)
281 double resolution = (1/sg_surf_precision);
283 return (unsigned long long)resolution;
/* Return the simulated clock expressed in ticks of smpi_rastro_resolution():
 * whole seconds times the resolution, plus the fractional part scaled by the
 * resolution. */
286 unsigned long long smpi_rastro_timestamp (void)
289 double now = SIMIX_get_clock();
291 unsigned long long sec = (unsigned long long)now;
/* Fractional part of the clock, converted to ticks (truncated). */
292 unsigned long long pre = (now - sec) * smpi_rastro_resolution();
294 return (unsigned long long)sec * smpi_rastro_resolution() + pre;
297 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
/* Statistics kept for one sampled code location; instances are stored in the
 * `samples' dictionary by smpi_sample_1(). */
typedef struct {
  double threshold; /* maximal stderr requested (if positive) */
  double relstderr; /* observed stderr so far */
  double mean;      /* mean of benched times, to be used if the block is disabled */
  double sum;       /* sum of benched times (to compute the mean and stderr) */
  double sum_pow2;  /* sum of the square of the benched times (to compute the stderr) */
  int iters;        /* amount of requested iterations */
  int count;        /* amount of iterations done so far */
  int benching;     /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
} local_data_t;
/* Build the key identifying a sampled block in the `samples' dictionary.
 *
 * global - non-zero: the key is "file:line";
 *          zero: the index of the calling process is appended
 *          ("file:line:index")
 *
 * Returns a string allocated by bprintf(); the caller owns it.
 *
 * NOTE(review): the `if (global)` test selecting between the two visible
 * return statements was not present in the reviewed listing; it is restored
 * from the otherwise unused `global` parameter - confirm. */
static char *sample_location(int global, const char *file, int line) {
  if (global) {
    return bprintf("%s:%d", file, line);
  } else {
    return bprintf("%s:%d:%d", file, line, smpi_process_index());
  }
}
/* Tell whether enough samples were collected for `data`.
 * The starting verdict is count >= iters; when a positive threshold is set,
 * the verdict is forced to 0 while relstderr is still above that threshold.
 * The decision and the current statistics are logged at debug level. */
316 static int sample_enough_benchs(local_data_t *data) {
317 int res = data->count >= data->iters;
318 if (data->threshold>0.0) {
/* NOTE(review): the condition guarding the next assignment is not visible
 * in this listing. */
320 res = 0; // not enough data
321 if (data->relstderr > data->threshold)
322 res = 0; // stderr too high yet
324 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
325 (res?"enough benchs":"need more data"),
326 data->count, data->iters, data->relstderr, data->threshold, data->mean);
/* Entry of a sampled block.
 * Closes the current benchmark, flags the process as sampling, then looks up
 * the statistics stored for this location in `samples'. On first visit a new
 * record is created (at least one of `iters` / `threshold` must be
 * positive); on later visits the settings are checked against the stored
 * ones and `benching` is recomputed from sample_enough_benchs(). */
330 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
332 char *loc = sample_location(global, file, line);
335 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
336 smpi_process_set_sampling(1);
/* Values stored in `samples' are released with free(). */
339 samples = xbt_dict_new_homogeneous(free);
341 data = xbt_dict_get_or_null(samples, loc);
343 xbt_assert(threshold>0 || iters>0,
344 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
345 data = (local_data_t *) xbt_new(local_data_t, 1);
348 data->sum_pow2 = 0.0;
350 data->threshold = threshold;
351 data->benching = 1; // If we have no data, we need at least one
353 xbt_dict_set(samples, loc, data, NULL);
354 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
356 if (data->iters != iters || data->threshold != threshold) {
357 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
358 loc, data->iters, data->threshold, iters,threshold);
362 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
363 data->benching = !sample_enough_benchs(data);
364 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
/* Loop condition of a sampled block.
 * Looks up the statistics of this location (which must already exist in
 * `samples'). While data->benching is 1 another benchmark run is requested;
 * otherwise the stored mean is injected with smpi_execute(), the sampling
 * flag is cleared and `res` is set to 0. */
369 int smpi_sample_2(int global, const char *file, int line)
371 char *loc = sample_location(global, file, line);
375 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
376 data = xbt_dict_get(samples, loc);
377 XBT_DEBUG("sample2 %s",loc);
380 if (data->benching==1) {
381 // we need to run a new bench
382 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
383 data->count, data->iters, data->relstderr, data->threshold, data->mean);
386 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
387 // Just sleep instead
388 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
389 data->count, data->iters, data->relstderr, data->threshold, data->mean);
390 smpi_execute(data->mean);
391 smpi_process_set_sampling(0);
392 res = 0; // prepare to capture future, unrelated computations
/* End of one iteration of a sampled block.
 * Stops the process timer, folds the elapsed time into the statistics of
 * this location and recomputes
 *   mean      = sum / count
 *   relstderr = sqrt((sum_pow2 / count - mean^2) / count) / mean.
 * While more samples are needed, `mean` is overwritten with the last sample
 * so that smpi_sample_2() injects exactly this iteration's duration. */
399 void smpi_sample_3(int global, const char *file, int line)
401 char *loc = sample_location(global, file, line);
404 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
405 data = xbt_dict_get(samples, loc);
406 XBT_DEBUG("sample3 %s",loc);
409 if (data->benching==0) {
413 // ok, benchmarking this loop is over
414 xbt_os_threadtimer_stop(smpi_process_timer());
419 sample = xbt_os_timer_elapsed(smpi_process_timer());
421 data->sum_pow2 += sample * sample;
422 n = (double)data->count;
423 data->mean = data->sum / n;
424 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
425 if (!sample_enough_benchs(data)) {
426 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
428 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
429 data->mean, data->relstderr, sample);
431 // That's enough for now, prevent sample_2 to run the same code over and over
/* Value destructor registered for the `allocs' dictionary by
 * smpi_shared_malloc(); `p` is a shared_data_t. */
436 static void smpi_shared_alloc_free(void *p)
438 shared_data_t *data = p;
/* Replace the heap string `loc` by a fixed-length name derived from its hash.
 *
 * The 40 hexadecimal digits of the digest are consumed 6 at a time (24 bits)
 * and each group is emitted as four 6-bit symbols of the URL-safe base64
 * alphabet, after a leading '/'. The buffer is reallocated to 30 bytes:
 * '/' + 7 groups * 4 symbols + terminator.
 *
 * Returns the (possibly moved) buffer; the caller keeps ownership.
 *
 * Fixes over the previous code:
 *  - the alphabet contained 'Z'/'z' in place of 'W'/'w', so distinct 6-bit
 *    values mapped to the same character;
 *  - the shift was 18 - 3 * j, which made the four symbols of a group overlap
 *    and dropped the 9 low bits; 6-bit symbols need 18 - 6 * j.
 *
 * NOTE(review): the digest computation (xbt_sha), the leading '/' and the
 * terminators were not visible in the reviewed listing; they are restored
 * from how `hash`, `s` and `loc` are used below - confirm. */
static char *smpi_shared_alloc_hash(char *loc)
{
  static const char alphabet[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
  char hash[42];
  char s[7];
  unsigned val;
  int i, j;

  xbt_sha(loc, hash);
  hash[41] = '\0';
  s[6] = '\0';
  loc = xbt_realloc(loc, 30);
  loc[0] = '/';
  for (i = 0; i < 40; i += 6) { /* base64 encode */
    memcpy(s, hash + i, 6);
    val = strtoul(s, NULL, 16);
    for (j = 0; j < 4; j++) {
      unsigned char x = (val >> (18 - 6 * j)) & 0x3f;
      loc[1 + 4 * i / 6 + j] = alphabet[x];
    }
  }
  loc[29] = '\0';
  return loc;
}
/* Allocate `size` bytes for the call site (`file`, `line`).
 * With the "smpi/use_shared_malloc" option, a name is built from the pid,
 * file and line, then hashed. The first request for that name creates a
 * POSIX shared memory object (unlinked right after mapping) and registers it
 * in `allocs'; later requests map the already-open descriptor again through
 * shm_map(). Without the option, plain xbt_malloc() is used. */
468 void *smpi_shared_malloc(size_t size, const char *file, int line)
471 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
472 char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
475 loc = smpi_shared_alloc_hash(loc); /* hash loc into a fixed-length name */
478 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
480 data = xbt_dict_get_or_null(allocs, loc);
/* O_EXCL: creation fails if an object with this name already exists. */
482 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
483 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
487 xbt_die("Please cleanup /dev/shm/%s", loc);
489 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
492 data = xbt_new(shared_data_t, 1);
496 mem = shm_map(fd, size, data);
/* The name is removed early; the mapping and descriptor stay usable. */
497 if (shm_unlink(loc) < 0) {
498 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
500 xbt_dict_set(allocs, loc, data, NULL);
501 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
/* Known name: map the descriptor recorded in `data` once more. */
504 mem = shm_map(data->fd, size, data);
507 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
509 mem = xbt_malloc(size);
510 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
/* Release memory obtained from smpi_shared_malloc().
 * With the "smpi/use_shared_malloc" option, the metadata recorded for `ptr`
 * (keyed by its "%p" representation in `allocs_metadata') gives the size to
 * munmap(); once the owning shared_data_t's count is <= 0 its entry is
 * removed from `allocs'. Each inconsistency is reported with XBT_WARN. */
515 void smpi_shared_free(void *ptr)
517 char loc[PTR_STRLEN];
518 shared_metadata_t* meta;
520 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
523 XBT_WARN("Cannot free: nothing was allocated");
526 if(!allocs_metadata) {
527 XBT_WARN("Cannot free: no metadata was allocated");
529 snprintf(loc, PTR_STRLEN, "%p", ptr);
530 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
532 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
537 XBT_WARN("Cannot free: something is broken in the metadata link");
540 if(munmap(ptr, meta->size) < 0) {
541 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
544 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
545 if (data->count <= 0) {
547 xbt_dict_remove(allocs, data->loc);
548 XBT_DEBUG("Shared free - with removal - of %p", ptr);
551 XBT_DEBUG("Classic free of %p", ptr);
/* Look up the key "func:input" in the `calls' dictionary.
 * xbt_dict_get() either succeeds or throws; the visible handler code
 * distinguishes exceptions whose category is not not_found_error. */
557 int smpi_shared_known_call(const char* func, const char* input)
559 char* loc = bprintf("%s:%s", func, input);
564 calls = xbt_dict_new_homogeneous(NULL);
567 xbt_dict_get(calls, loc); /* Succeed or throw */
574 if (ex.category != not_found_error)
581 void* smpi_shared_get_call(const char* func, const char* input) {
582 char* loc = bprintf("%s:%s", func, input);
586 calls = xbt_dict_new_homogeneous(NULL);
588 data = xbt_dict_get(calls, loc);
593 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
594 char* loc = bprintf("%s:%s", func, input);
597 calls = xbt_dict_new_homogeneous(NULL);
599 xbt_dict_set(calls, loc, data, NULL);
/* Round `addr` down to a multiple of xbt_pagesize and yield it as a void*.
 * The whole expansion is parenthesized so that the cast cannot bind to an
 * operator written after the macro call. */
#define TOPAGE(addr) ((void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize))
610 /** Map a given SMPI privatization segment (make a SMPI process active)
612 void smpi_switch_data_segment(int dest){
614 if (smpi_loaded_page==dest)//no need to switch either
618 smpi_really_switch_data_segment(dest);
/** Map a given SMPI privatization segment (make a SMPI process active)
 * even if SMPI thinks it is already active
 *
 * When doing a state restoration, the state of the restored variables
 * might not be consistent with the state of the virtual memory.
 * In this case, we have to change the data segment.
 *
 * On the very first switch (smpi_loaded_page == -1) the content found at
 * TOPAGE(start_data_exe) is copied into the region of every process. The
 * file backing region `dest` is then mapped with MAP_FIXED | MAP_SHARED at
 * TOPAGE(start_data_exe), and smpi_loaded_page is updated.
 */
628 void smpi_really_switch_data_segment(int dest) {
630 if(size_data_exe == 0)//no need to switch
635 if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
636 for (i=0; i< SIMIX_process_count(); i++){
637 memcpy(smpi_privatisation_regions[i].address,TOPAGE(start_data_exe),size_data_exe);
641 int current = smpi_privatisation_regions[dest].file_descriptor;
642 XBT_DEBUG("Switching data frame to the one of process %d", dest);
/* MAP_FIXED: the mapping must land exactly at the requested address, which
 * is what the check below verifies. */
643 void* tmp = mmap (TOPAGE(start_data_exe), size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
644 if (tmp != TOPAGE(start_data_exe))
645 xbt_die("Couldn't map the new region");
646 smpi_loaded_page=dest;
/* Compute start_data_exe and size_data_exe for the running binary.
 * Runs "objdump --section-headers <xbt_binary_name>" through popen() and
 * scans its output line by line, splitting each line on spaces. The .data
 * line gives the section size (field 2, hexadecimal) and start address
 * (field 4); the .bss line gives the .bss size, enlarged by the gap between
 * the end of .data and the start of .bss. Finally
 *   size_data_exe = offset of start_data_exe inside its page
 *                   + .data size + .bss size. */
650 void smpi_get_executable_global_size(){
651 int size_bss_binary=0;
652 int size_data_binary=0;
654 char *line = NULL; /* Temporary storage for each line that is read */
655 ssize_t read; /* Number of bytes read */
656 size_t n = 0; /* Amount of bytes to read by xbt_getline */
661 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
663 fp = popen(command, "r");
666 perror("popen failed");
/* Stop at end of output, or once `found` reaches 2. */
670 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
675 /* Wipeout the new line character */
676 line[read - 1] = '\0';
678 lfields[0] = strtok(line, " ");
680 if(lfields[0] == NULL)
/* Lines starting with "Sections:", "Idx" or the binary name are tested
 * separately from the section lines. */
683 if(strcmp(lfields[0], "Sections:") == 0
684 || strcmp(lfields[0], "Idx") == 0
685 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
/* Split up to 6 more space-separated fields; stop at the first NULL. */
688 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
689 lfields[i] = strtok(NULL, " ");
/*
 * we are looking for these fields
 *   23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
 *      CONTENTS, ALLOC, LOAD, DATA
 *   24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
 */
701 if(strcmp(lfields[1], ".data") == 0){
702 size_data_binary = strtoul(lfields[2], NULL, 16);
703 start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
705 }else if(strcmp(lfields[1], ".bss") == 0){
706 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
707 //TODO : check if this is OK, as some segments may be inserted between them..
708 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (start_data_exe + size_data_binary))
709 + strtoul(lfields[2], NULL, 16);
717 size_data_exe =(unsigned long)start_data_exe - (unsigned long)TOPAGE(start_data_exe)+ size_data_binary+size_bss_binary;
/* Create one private copy of the executable's data area per process.
 * After smpi_get_executable_global_size(), privatization is disabled when
 * the computed size is 0. Otherwise, for each process, a temporary file is
 * created under /dev/shm, unlinked, resized to size_data_exe, mapped shared,
 * and filled with the bytes currently at TOPAGE(start_data_exe). The file
 * descriptor and address are stored in smpi_privatisation_regions[i]. */
724 void smpi_initialize_global_memory_segments(){
727 smpi_privatize_global_variables=0;
732 smpi_get_executable_global_size();
734 XBT_DEBUG ("bss+data segment found : size %d starting at %p",size_data_exe, start_data_exe );
736 if(size_data_exe == 0){//no need to switch
737 smpi_privatize_global_variables=0;
/* NOTE(review): the array is sized with smpi_process_count() but the loop
 * below is bounded by SIMIX_process_count(); the malloc() result is not
 * checked; and smpi_destroy_global_memory_segments() releases this block
 * with xbt_free() - confirm these are intended. */
741 smpi_privatisation_regions = (smpi_privatisation_region_t) malloc(
742 smpi_process_count() * sizeof(struct s_smpi_privatisation_region));
744 for (i=0; i< SIMIX_process_count(); i++){
745 //create SIMIX_process_count() mappings of this size with the same data inside
746 void *address = NULL;
747 char path[] = "/dev/shm/my-buffer-XXXXXX";
750 int file_descriptor= mkstemp (path);
751 if (file_descriptor < 0)
752 xbt_die("Impossible to create temporary file for memory mapping");
/* The name is removed at once; the file is then used through its
 * descriptor only. */
754 status = unlink (path);
756 xbt_die("Impossible to unlink temporary file for memory mapping");
758 status = ftruncate(file_descriptor, size_data_exe);
760 xbt_die("Impossible to set the size of the temporary file for memory mapping");
762 /* Ask for a free region */
763 address = mmap (NULL, size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
764 if (address == MAP_FAILED)
765 xbt_die("Couldn't find a free region for memory mapping");
767 //initialize the values
768 memcpy(address,TOPAGE(start_data_exe),size_data_exe);
770 //store the address of the mapping for further switches
771 smpi_privatisation_regions[i].file_descriptor = file_descriptor;
772 smpi_privatisation_regions[i].address = address;
/* Undo smpi_initialize_global_memory_segments(): for each of the
 * smpi_process_count() regions, unmap it (a failure is only reported with
 * XBT_WARN) and close its file descriptor, then release the region array. */
779 void smpi_destroy_global_memory_segments(){
780 if(size_data_exe == 0)//no need to switch
784 for (i=0; i< smpi_process_count(); i++){
785 if(munmap(smpi_privatisation_regions[i].address,size_data_exe) < 0) {
786 XBT_WARN("Unmapping of fd %d failed: %s",
787 smpi_privatisation_regions[i].file_descriptor, strerror(errno));
789 close(smpi_privatisation_regions[i].file_descriptor);
/* NOTE(review): the array is obtained with malloc() in
 * smpi_initialize_global_memory_segments() but released with xbt_free(). */
791 xbt_free(smpi_privatisation_regions);