1 /* Copyright (c) 2007, 2009-2014. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
7 #include "internal_config.h"
10 #include "xbt/sysdep.h"
13 #include "surf/surf.h"
14 #include "simgrid/sg_config.h"
20 #include <sys/types.h>
23 #include <math.h> // sqrt
29 #define MAP_ANONYMOUS MAP_ANON
32 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
33 "Logging specific to SMPI (benchmarking)");
35 /* Shared allocations are handled through shared memory segments.
36 * Associated data and metadata are used as follows:
39 * `allocs' dict ---- -.
40 * ---------- shared_data_t shared_metadata_t / | | |
41 * .->| <name> | ---> -------------------- <--. ----------------- | | | |
42 * | ---------- | fd of <name> | | | size of mmap | --| | | |
43 * | | count (2) | |-- | data | \ | | |
44 * `----------------- | <name> | | ----------------- ---- |
45 * -------------------- | ^ |
47 * | | `allocs_metadata' dict |
48 * | | ---------------------- |
49 * | `-- | <addr of mmap #1> |<-'
50 * | .-- | <addr of mmap #2> |<-.
51 * | | ---------------------- |
57 * | shared_metadata_t / | |
58 * | ----------------- | | |
59 * | | size of mmap | --| | |
61 * ----------------- | | |
66 #define PTR_STRLEN (2 + 2 * sizeof(void*) + 1)
68 xbt_dict_t allocs = NULL; /* Allocated on first use */
69 xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
70 xbt_dict_t samples = NULL; /* Allocated on first use */
71 xbt_dict_t calls = NULL; /* Allocated on first use */
72 __thread int smpi_current_rank = 0; /* Updated after each MPI call */
74 double smpi_cpu_threshold;
75 double smpi_running_power;
80 char* start_data_exe = NULL;
81 int size_data_exe = 0;
82 int smpi_privatize_global_variables;
/* Return the current size, in bytes, of the file behind descriptor `fd'.
 * Dies (xbt_die) when the descriptor cannot be stat'ed. */
static size_t shm_size(int fd)
{
  struct stat info;
  const int rc = fstat(fd, &info);

  if (rc < 0) {
    xbt_die("Could not stat fd %d: %s", fd, strerror(errno));
  }
  return (size_t) info.st_size;
}
105 static void* shm_map(int fd, size_t size, shared_data_t* data) {
107 char loc[PTR_STRLEN];
108 shared_metadata_t* meta;
110 if(size > shm_size(fd)) {
111 if(ftruncate(fd, (off_t)size) < 0) {
112 xbt_die("Could not truncate fd %d to %zu: %s", fd, size, strerror(errno));
116 mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
117 if(mem == MAP_FAILED) {
118 xbt_die("Could not map fd %d: %s", fd, strerror(errno));
120 if(!allocs_metadata) {
121 allocs_metadata = xbt_dict_new_homogeneous(xbt_free);
123 snprintf(loc, PTR_STRLEN, "%p", mem);
124 meta = xbt_new(shared_metadata_t, 1);
127 xbt_dict_set(allocs_metadata, loc, meta, NULL);
128 XBT_DEBUG("MMAP %zu to %p", size, mem);
133 void smpi_bench_destroy(void)
135 xbt_dict_free(&allocs);
136 xbt_dict_free(&allocs_metadata);
137 xbt_dict_free(&samples);
138 xbt_dict_free(&calls);
141 XBT_PUBLIC(void) smpi_execute_flops_(double *flops);
142 void smpi_execute_flops_(double *flops)
144 smpi_execute_flops(*flops);
147 XBT_PUBLIC(void) smpi_execute_(double *duration);
148 void smpi_execute_(double *duration)
150 smpi_execute(*duration);
153 void smpi_execute_flops(double flops) {
156 host = SIMIX_host_self();
157 XBT_DEBUG("Handle real computation time: %f flops", flops);
158 action = simcall_host_execute("computation", host, flops, 1, 0, 0);
160 simcall_set_category (action, TRACE_internal_smpi_get_category());
162 simcall_host_execution_wait(action);
165 void smpi_execute(double duration)
167 if (duration >= smpi_cpu_threshold) {
168 XBT_DEBUG("Sleep for %g to handle real computation time", duration);
169 double flops = duration * smpi_running_power;
171 int rank = smpi_process_index();
172 instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
173 extra->type=TRACING_COMPUTING;
174 extra->comp_size=flops;
175 TRACE_smpi_computing_in(rank, extra);
177 smpi_execute_flops(flops);
180 TRACE_smpi_computing_out(rank);
184 XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
185 duration, smpi_cpu_threshold);
189 void switch_data_segment(int dest);
191 void smpi_bench_begin(void)
193 switch_data_segment(smpi_process_index());
194 xbt_os_threadtimer_start(smpi_process_timer());
195 smpi_current_rank = smpi_process_index();
198 void smpi_bench_end(void)
200 xbt_os_timer_t timer = smpi_process_timer();
201 xbt_os_threadtimer_stop(timer);
202 // switch_data_segment(smpi_process_count());
203 if (smpi_process_get_sampling()) {
204 XBT_CRITICAL("Cannot do recursive benchmarks.");
205 XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
206 xbt_backtrace_display_current();
207 xbt_die("Aborting.");
209 smpi_execute(xbt_os_timer_elapsed(timer));
212 unsigned int smpi_sleep(unsigned int secs)
218 double flops = (double) secs*simcall_host_get_speed(SIMIX_host_self());
219 XBT_DEBUG("Sleep for: %f flops", flops);
220 action = simcall_host_execute("computation", SIMIX_host_self(), flops, 1, 0, 0);
222 simcall_set_category (action, TRACE_internal_smpi_get_category());
224 simcall_host_execution_wait(action);
/* Simulated replacement for gettimeofday(2): fill `tv' (when non-NULL) with
 * the simulated clock, split into whole seconds and microseconds.
 * The surrounding benchmark is suspended for the duration of the call.
 * Always returns 0. */
int smpi_gettimeofday(struct timeval *tv)
{
  double clock;

  smpi_bench_end();
  clock = SIMIX_get_clock();
  if (tv != NULL) {
    tv->tv_sec = (time_t)clock;
    /* The type of tv_usec differs between platforms */
#ifdef WIN32
    tv->tv_usec = (useconds_t)((clock - tv->tv_sec) * 1e6);
#else
    tv->tv_usec = (suseconds_t)((clock - tv->tv_sec) * 1e6);
#endif
  }
  smpi_bench_begin();
  return 0;
}
247 extern double sg_surf_precision;
248 unsigned long long smpi_rastro_resolution (void)
251 double resolution = (1/sg_surf_precision);
253 return (unsigned long long)resolution;
/* Return the simulated clock expressed in ticks of smpi_rastro_resolution():
 * whole seconds times the resolution, plus the fractional part scaled by the
 * resolution.
 *
 * NOTE(review): smpi_rastro_resolution() suspends/resumes the benchmark on
 * its own and is called here both while the benchmark is already suspended
 * and after it was resumed -- confirm this nesting is intended. */
unsigned long long smpi_rastro_timestamp (void)
{
  smpi_bench_end();
  const double clock = SIMIX_get_clock();

  const unsigned long long whole = (unsigned long long)clock;
  const unsigned long long fraction = (clock - whole) * smpi_rastro_resolution();
  smpi_bench_begin();

  return (unsigned long long)whole * smpi_rastro_resolution() + fraction;
}
267 /* ****************************** Functions related to the SMPI_SAMPLE_ macros ************************************/
269 double threshold; /* maximal stderr requested (if positive) */
270 double relstderr; /* observed stderr so far */
271 double mean; /* mean of benched times, to be used if the block is disabled */
272 double sum; /* sum of benched times (to compute the mean and stderr) */
273 double sum_pow2; /* sum of the square of the benched times (to compute the stderr) */
274 int iters; /* amount of requested iterations */
275 int count; /* amount of iterations done so far */
276 int benching; /* 1: we are benchmarking; 0: we have enough data, no bench anymore */
/* Build the dictionary key identifying a sampled block: "file:line" when
 * `global' is set, "file:line:<process index>" otherwise.
 * The returned string is allocated by bprintf() and owned by the caller. */
static char *sample_location(int global, const char *file, int line)
{
  return global ? bprintf("%s:%d", file, line)
                : bprintf("%s:%d:%d", file, line, smpi_process_index());
}
286 static int sample_enough_benchs(local_data_t *data) {
287 int res = data->count >= data->iters;
288 if (data->threshold>0.0) {
290 res = 0; // not enough data
291 if (data->relstderr > data->threshold)
292 res = 0; // stderr too high yet
294 XBT_DEBUG("%s (count:%d iter:%d stderr:%f thres:%f mean:%fs)",
295 (res?"enough benchs":"need more data"),
296 data->count, data->iters, data->relstderr, data->threshold, data->mean);
300 void smpi_sample_1(int global, const char *file, int line, int iters, double threshold)
302 char *loc = sample_location(global, file, line);
305 smpi_bench_end(); /* Take time from previous, unrelated computation into account */
306 smpi_process_set_sampling(1);
309 samples = xbt_dict_new_homogeneous(free);
311 data = xbt_dict_get_or_null(samples, loc);
313 xbt_assert(threshold>0 || iters>0,
314 "You should provide either a positive amount of iterations to bench, or a positive maximal stderr (or both)");
315 data = (local_data_t *) xbt_new(local_data_t, 1);
318 data->sum_pow2 = 0.0;
320 data->threshold = threshold;
321 data->benching = 1; // If we have no data, we need at least one
323 xbt_dict_set(samples, loc, data, NULL);
324 XBT_DEBUG("XXXXX First time ever on benched nest %s.",loc);
326 if (data->iters != iters || data->threshold != threshold) {
327 XBT_ERROR("Asked to bench block %s with different settings %d, %f is not %d, %f. How did you manage to give two numbers at the same line??",
328 loc, data->iters, data->threshold, iters,threshold);
332 // if we already have some data, check whether sample_2 should get one more bench or whether it should emulate the computation instead
333 data->benching = !sample_enough_benchs(data);
334 XBT_DEBUG("XXXX Re-entering the benched nest %s. %s",loc, (data->benching?"more benching needed":"we have enough data, skip computes"));
339 int smpi_sample_2(int global, const char *file, int line)
341 char *loc = sample_location(global, file, line);
345 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
346 data = xbt_dict_get(samples, loc);
347 XBT_DEBUG("sample2 %s",loc);
350 if (data->benching==1) {
351 // we need to run a new bench
352 XBT_DEBUG("benchmarking: count:%d iter:%d stderr:%f thres:%f; mean:%f",
353 data->count, data->iters, data->relstderr, data->threshold, data->mean);
356 // Enough data, no more bench (either we got enough data from previous visits to this benched nest, or we just ran one bench and need to bail out now that our job is done).
357 // Just sleep instead
358 XBT_DEBUG("No benchmark (either no need, or just ran one): count >= iter (%d >= %d) or stderr<thres (%f<=%f). apply the %fs delay instead",
359 data->count, data->iters, data->relstderr, data->threshold, data->mean);
360 smpi_execute(data->mean);
361 smpi_process_set_sampling(0);
362 res = 0; // prepare to capture future, unrelated computations
369 void smpi_sample_3(int global, const char *file, int line)
371 char *loc = sample_location(global, file, line);
374 xbt_assert(samples, "Y U NO use SMPI_SAMPLE_* macros? Stop messing directly with smpi_sample_* functions!");
375 data = xbt_dict_get(samples, loc);
376 XBT_DEBUG("sample3 %s",loc);
379 if (data->benching==0) {
383 // ok, benchmarking this loop is over
384 xbt_os_threadtimer_stop(smpi_process_timer());
389 sample = xbt_os_timer_elapsed(smpi_process_timer());
391 data->sum_pow2 += sample * sample;
392 n = (double)data->count;
393 data->mean = data->sum / n;
394 data->relstderr = sqrt((data->sum_pow2 / n - data->mean * data->mean) / n) / data->mean;
395 if (!sample_enough_benchs(data)) {
396 data->mean = sample; // Still in benching process; We want sample_2 to simulate the exact time of this loop occurrence before leaving, not the mean over the history
398 XBT_DEBUG("Average mean after %d steps is %f, relative standard error is %f (sample was %f)", data->count,
399 data->mean, data->relstderr, sample);
401 // That's enough for now, prevent sample_2 to run the same code over and over
406 static void smpi_shared_alloc_free(void *p)
408 shared_data_t *data = p;
/* Replace `loc' by a short name derived from its SHA hash, so that it can be
 * used as a shared memory object name: a leading '/' followed by the
 * hexadecimal digest re-encoded with the URL-safe base64 alphabet.
 *
 * `loc' must be heap-allocated; it is reallocated to 30 bytes and the
 * (possibly moved) pointer is returned. The caller keeps ownership. */
static char *smpi_shared_alloc_hash(char *loc)
{
  static const char alphabet[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
  char hash[42];
  char s[7];
  unsigned long val;
  int i, j;

  xbt_sha(loc, hash);
  hash[41] = '\0';
  s[6] = '\0';
  loc = xbt_realloc(loc, 30);
  loc[0] = '/';
  for (i = 0; i < 40; i += 6) { /* base64 encode */
    /* 6 hexadecimal digits = 24 bits = 4 symbols of 6 bits each */
    memcpy(s, hash + i, 6);
    val = strtoul(s, NULL, 16);
    for (j = 0; j < 4; j++) {
      /* Consecutive, non-overlapping 6-bit groups, most significant first */
      unsigned char x = (val >> (18 - 6 * j)) & 0x3f;
      loc[1 + 4 * i / 6 + j] = alphabet[x];
    }
  }
  loc[29] = '\0';
  return loc;
}
438 void *smpi_shared_malloc(size_t size, const char *file, int line)
441 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
442 char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
445 loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
448 allocs = xbt_dict_new_homogeneous(smpi_shared_alloc_free);
450 data = xbt_dict_get_or_null(allocs, loc);
452 fd = shm_open(loc, O_RDWR | O_CREAT | O_EXCL,
453 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
457 xbt_die("Please cleanup /dev/shm/%s", loc);
459 xbt_die("An unhandled error occured while opening %s. shm_open: %s", loc, strerror(errno));
462 data = xbt_new(shared_data_t, 1);
466 mem = shm_map(fd, size, data);
467 if (shm_unlink(loc) < 0) {
468 XBT_WARN("Could not early unlink %s. shm_unlink: %s", loc, strerror(errno));
470 xbt_dict_set(allocs, loc, data, NULL);
471 XBT_DEBUG("Mapping %s at %p through %d", loc, mem, fd);
474 mem = shm_map(data->fd, size, data);
477 XBT_DEBUG("Shared malloc %zu in %p (metadata at %p)", size, mem, data);
479 mem = xbt_malloc(size);
480 XBT_DEBUG("Classic malloc %zu in %p", size, mem);
485 void smpi_shared_free(void *ptr)
487 char loc[PTR_STRLEN];
488 shared_metadata_t* meta;
490 if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
493 XBT_WARN("Cannot free: nothing was allocated");
496 if(!allocs_metadata) {
497 XBT_WARN("Cannot free: no metadata was allocated");
499 snprintf(loc, PTR_STRLEN, "%p", ptr);
500 meta = (shared_metadata_t*)xbt_dict_get_or_null(allocs_metadata, loc);
502 XBT_WARN("Cannot free: %p was not shared-allocated by SMPI", ptr);
507 XBT_WARN("Cannot free: something is broken in the metadata link");
510 if(munmap(ptr, meta->size) < 0) {
511 XBT_WARN("Unmapping of fd %d failed: %s", data->fd, strerror(errno));
514 XBT_DEBUG("Shared free - no removal - of %p, count = %d", ptr, data->count);
515 if (data->count <= 0) {
517 xbt_dict_remove(allocs, data->loc);
518 XBT_DEBUG("Shared free - with removal - of %p", ptr);
521 XBT_DEBUG("Classic free of %p", ptr);
527 int smpi_shared_known_call(const char* func, const char* input)
529 char* loc = bprintf("%s:%s", func, input);
534 calls = xbt_dict_new_homogeneous(NULL);
537 xbt_dict_get(calls, loc); /* Succeed or throw */
544 if (ex.category != not_found_error)
551 void* smpi_shared_get_call(const char* func, const char* input) {
552 char* loc = bprintf("%s:%s", func, input);
556 calls = xbt_dict_new_homogeneous(NULL);
558 data = xbt_dict_get(calls, loc);
563 void* smpi_shared_set_call(const char* func, const char* input, void* data) {
564 char* loc = bprintf("%s:%s", func, input);
567 calls = xbt_dict_new_homogeneous(NULL);
569 xbt_dict_set(calls, loc, data, NULL);
577 #define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
581 * - read the executable data+bss section addresses and sizes
582 * - for each process create a copy of these sections with mmap
583 * - store them in a dynar
589 void switch_data_segment(int dest){
591 if(size_data_exe == 0)//no need to switch
594 if (loaded_page==dest)//no need to switch either
600 if(loaded_page==-1){//initial switch, do the copy from the real page here
601 for (i=0; i< SIMIX_process_count(); i++){
602 memcpy(mappings[i],TOPAGE(start_data_exe),size_data_exe);
605 int current= fds[dest];
606 XBT_VERB("Switching data frame to the one of process %d", dest);
607 void* tmp = mmap (TOPAGE(start_data_exe), size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
608 if (tmp != TOPAGE(start_data_exe))
609 xbt_die("Couldn't map the new region");
614 void smpi_get_executable_global_size(){
615 int size_bss_binary=0;
616 int size_data_binary=0;
618 char *line = NULL; /* Temporal storage for each line that is readed */
619 ssize_t read; /* Number of bytes readed */
620 size_t n = 0; /* Amount of bytes to read by xbt_getline */
625 char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
627 fp = popen(command, "r");
630 perror("popen failed");
634 while ((read = xbt_getline(&line, &n, fp)) != -1 && found != 2) {
639 /* Wipeout the new line character */
640 line[read - 1] = '\0';
642 lfields[0] = strtok(line, " ");
644 if(lfields[0] == NULL)
647 if(strcmp(lfields[0], "Sections:") == 0
648 || strcmp(lfields[0], "Idx") == 0
649 || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
652 for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
653 lfields[i] = strtok(NULL, " ");
657 * we are looking for these fields
658 23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
659 CONTENTS, ALLOC, LOAD, DATA
660 24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
665 if(strcmp(lfields[1], ".data") == 0){
666 size_data_binary = strtoul(lfields[2], NULL, 16);
667 start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
669 }else if(strcmp(lfields[1], ".bss") == 0){
670 //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
671 //TODO : check if this is OK, as some segments may be inserted between them..
672 size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (start_data_exe + size_data_binary))
673 + strtoul(lfields[2], NULL, 16);
681 size_data_exe =(unsigned long)start_data_exe - (unsigned long)TOPAGE(start_data_exe)+ size_data_binary+size_bss_binary;
688 void smpi_initialize_global_memory_segments(){
691 smpi_privatize_global_variables=0;
696 smpi_get_executable_global_size();
698 XBT_DEBUG ("bss+data segment found : size %d starting at %p",size_data_exe, start_data_exe );
700 if(size_data_exe == 0){//no need to switch
701 smpi_privatize_global_variables=0;
705 fds= (int*)xbt_malloc((smpi_process_count())*sizeof(int));
706 mappings= (void**)xbt_malloc((smpi_process_count())*sizeof(void*));
709 for (i=0; i< SIMIX_process_count(); i++){
710 //create SIMIX_process_count() mappings of this size with the same data inside
711 void *address = NULL, *tmp = NULL;
712 char path[] = "/dev/shm/my-buffer-XXXXXX";
714 int file_descriptor= mkstemp (path);
715 if (file_descriptor < 0)
716 xbt_die("Impossible to create temporary file for memory mapping");
717 status = unlink (path);
719 xbt_die("Impossible to unlink temporary file for memory mapping");
721 status = ftruncate(file_descriptor, size_data_exe);
723 xbt_die("Impossible to set the size of the temporary file for memory mapping");
725 /* Ask for a free region */
726 address = mmap (NULL, size_data_exe, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
728 if (address == MAP_FAILED)
729 xbt_die("Couldn't find a free region for memory mapping");
731 tmp = mmap (address, size_data_exe, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, file_descriptor, 0);
734 xbt_die("Couldn't obtain the right address");
735 //initialize the values
736 memcpy(address,TOPAGE(start_data_exe),size_data_exe);
738 //store the address of the mapping for further switches
739 fds[i]=file_descriptor;
740 mappings[i]= address;
747 void smpi_destroy_global_memory_segments(){
748 if(size_data_exe == 0)//no need to switch
752 for (i=0; i< smpi_process_count(); i++){
753 if(munmap(mappings[i],size_data_exe) < 0) {
754 XBT_WARN("Unmapping of fd %d failed: %s", fds[i], strerror(errno));