-/* Copyright (c) 2007, 2009-2013. The SimGrid Team.
+/* Copyright (c) 2007, 2009-2015. The SimGrid Team.
* All rights reserved. */
/* This program is free software; you can redistribute it and/or modify it
- * under the terms of the license (GNU LGPL) which comes with this package. */
+ * under the terms of the license (GNU LGPL) which comes with this package. */
+#include "internal_config.h"
#include "private.h"
#include "xbt/dict.h"
#include "xbt/sysdep.h"
#include "xbt/hash.h"
#include "surf/surf.h"
#include "simgrid/sg_config.h"
+#include "simgrid/modelchecker.h"
+#include "mc/mc_replay.h"
#ifndef WIN32
#include <sys/mman.h>
#include <string.h>
#include <stdio.h>
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_bench, smpi,
"Logging specific to SMPI (benchmarking)");
xbt_dict_t allocs_metadata = NULL; /* Allocated on first use */
xbt_dict_t samples = NULL; /* Allocated on first use */
xbt_dict_t calls = NULL; /* Allocated on first use */
-__thread int smpi_current_rank = 0; /* Updated after each MPI call */
double smpi_cpu_threshold; /* smpi_execute() ignores measured durations below this value (option smpi/cpu_threshold) */
double smpi_running_power; /* Factor applied to a measured duration to obtain the flops to simulate */
+int smpi_loaded_page = -1; /* Index of the privatization region currently mapped; -1 until the first switch */
+char* smpi_start_data_exe = NULL; /* Start address of the executable's .data section, as parsed from objdump */
+int smpi_size_data_exe = 0; /* Bytes from the page holding smpi_start_data_exe to the end of .bss; 0 disables switching */
+int smpi_privatize_global_variables; /* Forced to 0 when privatization is unavailable or no data segment is found */
+double smpi_total_benched_time = 0; /* Sum of all timer values measured by smpi_bench_end() */
+smpi_privatisation_region_t smpi_privatisation_regions; /* Table of per-process regions (file descriptor + mapped address) */
+
typedef struct {
int fd;
int count;
xbt_die("Could not map fd %d: %s", fd, strerror(errno));
}
if(!allocs_metadata) {
- allocs_metadata = xbt_dict_new_homogeneous(xbt_free);
+ allocs_metadata = xbt_dict_new_homogeneous(xbt_free_f);
}
snprintf(loc, PTR_STRLEN, "%p", mem);
meta = xbt_new(shared_metadata_t, 1);
}
void smpi_execute_flops(double flops) { /* Starts a "computation" execution of `flops` for the calling process and waits on it */
- smx_action_t action;
- smx_host_t host;
- host = SIMIX_host_self();
+ smx_synchro_t action;
XBT_DEBUG("Handle real computation time: %f flops", flops);
- action = simcall_host_execute("computation", host, flops, 1, 0, 0);
-#ifdef HAVE_TRACING
+ action = simcall_process_execute("computation", flops, 1, 0, 0);
simcall_set_category (action, TRACE_internal_smpi_get_category());
-#endif
- simcall_host_execution_wait(action);
+ simcall_process_execution_wait(action);
+ smpi_switch_data_segment(smpi_process_index()); /* Re-map this process's data segment after the wait; NOTE(review): presumably another process's segment may have been mapped meanwhile -- confirm */
}
void smpi_execute(double duration)
if (duration >= smpi_cpu_threshold) {
XBT_DEBUG("Sleep for %g to handle real computation time", duration);
double flops = duration * smpi_running_power;
-#ifdef HAVE_TRACING
int rank = smpi_process_index();
instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
extra->type=TRACING_COMPUTING;
extra->comp_size=flops;
TRACE_smpi_computing_in(rank, extra);
-#endif
smpi_execute_flops(flops);
-#ifdef HAVE_TRACING
TRACE_smpi_computing_out(rank);
-#endif
} else {
XBT_DEBUG("Real computation took %g while option smpi/cpu_threshold is set to %g => ignore it",
}
}
+void smpi_switch_data_segment(int dest);
+/* Begin a benchmarked section: map the data segment of the calling process,
+ * then start its per-process timer. The timer is left untouched when the
+ * model checker or record/replay is active. */
void smpi_bench_begin(void)
{
+ smpi_switch_data_segment(smpi_process_index());
+
+ if (MC_is_active() || MC_record_replay_is_active())
+ return;
+
xbt_os_threadtimer_start(smpi_process_timer());
- smpi_current_rank = smpi_process_index();
}
void smpi_bench_end(void)
{
+/* End a benchmarked section: stop the per-process timer, refuse to run while
+ * the process is in sampling mode (dies with "Aborting."), hand the measured
+ * time to smpi_execute() when option smpi/simulate_computation is set, and
+ * add the measured time to smpi_total_benched_time. Returns immediately when
+ * the model checker or record/replay is active. */
+ if (MC_is_active() || MC_record_replay_is_active())
+ return;
+
xbt_os_timer_t timer = smpi_process_timer();
xbt_os_threadtimer_stop(timer);
+// smpi_switch_data_segment(smpi_process_count());
if (smpi_process_get_sampling()) {
XBT_CRITICAL("Cannot do recursive benchmarks.");
XBT_CRITICAL("Are you trying to make a call to MPI within a SMPI_SAMPLE_ block?");
xbt_backtrace_display_current();
xbt_die("Aborting.");
}
- smpi_execute(xbt_os_timer_elapsed(timer));
+ // Simulate the benchmarked computation unless disabled via command-line argument
+ if (sg_cfg_get_boolean("smpi/simulate_computation")) {
+ smpi_execute(xbt_os_timer_elapsed(timer));
+ }
+
+ smpi_total_benched_time += xbt_os_timer_elapsed(timer); /* Accumulated whether or not the computation was simulated */
}
-unsigned int smpi_sleep(unsigned int secs)
+/* Private sleep function used by smpi_sleep() and smpi_usleep().
+ * Pauses benchmarking, reports the sleep to the tracing layer using the
+ * caller's rank in MPI_COMM_WORLD, calls simcall_process_sleep(secs), then
+ * resumes benchmarking. `secs` may be fractional. Always returns 0. */
+static unsigned int private_sleep(double secs)
{
- smx_action_t action;
-
smpi_bench_end();
- double flops = (double) secs*simcall_host_get_speed(SIMIX_host_self());
- XBT_DEBUG("Sleep for: %f flops", flops);
- action = simcall_host_execute("computation", SIMIX_host_self(), flops, 1, 0, 0);
- #ifdef HAVE_TRACING
- simcall_set_category (action, TRACE_internal_smpi_get_category());
- #endif
- simcall_host_execution_wait(action);
+ XBT_DEBUG("Sleep for: %lf secs", secs);
+ int rank = smpi_comm_rank(MPI_COMM_WORLD);
+ instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
+ extra->type=TRACING_SLEEPING;
+ extra->sleep_duration=secs;
+ TRACE_smpi_sleeping_in(rank, extra); /* NOTE(review): `extra` is not freed here; presumably the tracing layer takes ownership -- confirm */
+
+ simcall_process_sleep(secs);
+
+ TRACE_smpi_sleeping_out(rank);
smpi_bench_begin();
- return secs;
+ return 0;
+}
+
+unsigned int smpi_sleep(unsigned int secs)
+{
+  /* Whole seconds are forwarded to the shared helper as a double. */
+  double duration = (double)secs;
+  return private_sleep(duration);
}
-int smpi_gettimeofday(struct timeval *tv)
+int smpi_usleep(useconds_t usecs)
+{
+  /* Convert microseconds to fractional seconds for the shared helper. */
+  double duration = (double)usecs / 1000000.0;
+  unsigned int result = private_sleep(duration);
+  return (int)result;
+}
+
+
+int smpi_gettimeofday(struct timeval *tv, void* tz)
{
double now;
smpi_bench_end();
return 0;
}
-extern double sg_maxmin_precision;
+extern double sg_surf_precision;
unsigned long long smpi_rastro_resolution (void) /* Returns 1/sg_surf_precision converted to unsigned long long */
{
smpi_bench_end(); /* Benchmarking is paused while the value is computed */
- double resolution = (1/sg_maxmin_precision);
+ double resolution = (1/sg_surf_precision);
smpi_bench_begin();
return (unsigned long long)resolution; /* The conversion drops any fractional part */
}
{
void* mem;
if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
- char *loc = bprintf("%zu_%s_%d", (size_t)getpid(), file, line);
+ char *loc = bprintf("/%zu_%s_%d", (size_t)getpid(), file, line);
int fd;
shared_data_t *data;
loc = smpi_shared_alloc_hash(loc); /* hash loc, in order to have something
shared_metadata_t* meta;
shared_data_t* data;
if (sg_cfg_get_boolean("smpi/use_shared_malloc")){
-
+
if (!allocs) {
XBT_WARN("Cannot free: nothing was allocated");
return;
free(loc);
return data;
}
+
+
+
+/* Round addr down to a multiple of xbt_pagesize, i.e. to the start of the
+ * page that contains it. The address is converted through unsigned long and
+ * the result is a void pointer. */
+#define TOPAGE(addr) (void *)(((unsigned long)(addr) / xbt_pagesize) * xbt_pagesize)
+
+
+/** Make the privatization segment of process dest the active one.
+ *
+ * Nothing happens when smpi_loaded_page already records dest as loaded;
+ * otherwise the remapping is delegated to smpi_really_switch_data_segment().
+ */
+void smpi_switch_data_segment(int dest){
+  if (smpi_loaded_page != dest) {
+    /* A different segment (or none yet) is loaded: do the real switch. */
+    smpi_really_switch_data_segment(dest);
+  }
+}
+
+/** Map a given SMPI privatization segment (make a SMPI process active)
+ * even if SMPI thinks it is already active
+ *
+ * When doing a state restoration, the state of the restored variables
+ * might not be consistent with the state of the virtual memory.
+ * In this case, we have to change the data segment.
+ *
+ * Does nothing when no data segment was detected (smpi_size_data_exe == 0)
+ * or when HAVE_PRIVATIZATION is not defined. Dies if the region cannot be
+ * mapped at the expected address.
+ *
+ * @param dest index in smpi_privatisation_regions of the segment to map
+ */
+void smpi_really_switch_data_segment(int dest) {
+
+  if(smpi_size_data_exe == 0)//no need to switch
+    return;
+
+#ifdef HAVE_PRIVATIZATION
+  int i;
+  if(smpi_loaded_page==-1){//initial switch, do the copy from the real page here
+    // The region table is allocated with smpi_process_count() entries, so
+    // iterate over exactly that many to stay inside the table.
+    for (i=0; i< smpi_process_count(); i++){
+      memcpy(smpi_privatisation_regions[i].address,
+             TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
+    }
+  }
+
+  // FIXME, cross-process support (mmap across process when necessary)
+  int current = smpi_privatisation_regions[dest].file_descriptor;
+  XBT_DEBUG("Switching data frame to the one of process %d", dest);
+  // MAP_FIXED replaces whatever is mapped at the data segment's page with
+  // the file backing the region of process dest.
+  void* tmp = mmap (TOPAGE(smpi_start_data_exe), smpi_size_data_exe,
+                    PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, current, 0);
+  // This comparison also catches MAP_FAILED.
+  if (tmp != TOPAGE(smpi_start_data_exe))
+    xbt_die("Couldn't map the new region");
+  smpi_loaded_page = dest;
+#endif
+}
+
+/** Tell whether a path starts with the prefix of the privatization
+ * temporary files ("/dev/shm/my-buffer-"). Returns 1 if it does, 0 otherwise.
+ */
+int smpi_is_privatisation_file(char* file)
+{
+  /* Same prefix as the mkstemp() template used for the backing files. */
+  static const char prefix[] = "/dev/shm/my-buffer-";
+  int difference = strncmp(prefix, file, sizeof(prefix) - 1);
+  return difference == 0;
+}
+
+/** Locate the global data of the running executable.
+ *
+ * Runs "objdump --section-headers" on xbt_binary_name and parses the .data
+ * and .bss lines of its output. Afterwards smpi_start_data_exe holds the
+ * address parsed for .data, and smpi_size_data_exe the number of bytes from
+ * the start of the page containing that address up to the end of .bss.
+ * Aborts when popen() fails.
+ */
+void smpi_get_executable_global_size(){
+  int size_bss_binary=0;
+  int size_data_binary=0;
+  FILE *fp;
+  char *line = NULL;            /* Temporary storage for each line that is read */
+  ssize_t read;                 /* Number of bytes read */
+  size_t n = 0;                 /* Size of the line buffer handed to xbt_getline */
+
+  char *lfields[7];
+  int i, found = 0;
+
+  char *command = bprintf("objdump --section-headers %s", xbt_binary_name);
+
+  fp = popen(command, "r");
+
+  if(fp == NULL){
+    perror("popen failed");
+    xbt_abort();
+  }
+
+  /* Test `found` first so that no further line is read once both sections were seen */
+  while (found != 2 && (read = xbt_getline(&line, &n, fp)) != -1) {
+
+    /* Nothing usable in the buffer; also protects the line[read - 1] access below */
+    if(n == 0 || read == 0)
+      continue;
+
+    /* Wipe out the newline character, but only if there is one */
+    if(line[read - 1] == '\n')
+      line[read - 1] = '\0';
+
+    lfields[0] = strtok(line, " ");
+
+    if(lfields[0] == NULL)
+      continue;
+
+    if(strcmp(lfields[0], "Sections:") == 0
+        || strcmp(lfields[0], "Idx") == 0
+        || strncmp(lfields[0], xbt_binary_name, strlen(xbt_binary_name)) == 0)
+      continue;
+
+    /* Split the rest of the line; stops early once a field is missing */
+    for (i = 1; i < 7 && lfields[i - 1] != NULL; i++) {
+      lfields[i] = strtok(NULL, " ");
+    }
+
+    /*
+     * we are looking for these fields
+     23 .data 02625a20 00000000006013e0 00000000006013e0 000013e0 2**5
+     CONTENTS, ALLOC, LOAD, DATA
+     24 .bss 02625a40 0000000002c26e00 0000000002c26e00 02626e00 2**5
+     ALLOC
+     */
+
+    /* i >= 6 guarantees that lfields[1] to lfields[4] are non-NULL */
+    if(i>=6){
+      if(strcmp(lfields[1], ".data") == 0){
+        size_data_binary = strtoul(lfields[2], NULL, 16);
+        smpi_start_data_exe = (char*) strtoul(lfields[4], NULL, 16);
+        found++;
+      }else if(strcmp(lfields[1], ".bss") == 0){
+        //the beginning of bss is not exactly the end of data if not aligned, grow bss reported size accordingly
+        //TODO : check if this is OK, as some segments may be inserted between them..
+        size_bss_binary = ((char*) strtoul(lfields[4], NULL, 16) - (smpi_start_data_exe + size_data_binary))
+                          + strtoul(lfields[2], NULL, 16);
+        found++;
+      }
+
+    }
+
+  }
+
+  /* Offset of .data inside its page, plus the sizes of .data and .bss */
+  smpi_size_data_exe = (unsigned long) smpi_start_data_exe
+                       - (unsigned long) TOPAGE(smpi_start_data_exe)
+                       + size_data_binary+size_bss_binary;
+  xbt_free(command);
+  xbt_free(line);
+  pclose(fp);
+
+}
+
+/** Create one private copy of the executable's global data per SMPI process.
+ *
+ * Without HAVE_PRIVATIZATION, or when no data segment is found, this only
+ * sets smpi_privatize_global_variables to 0. Otherwise it allocates
+ * smpi_privatisation_regions with smpi_process_count() entries and, for each
+ * entry, creates an unlinked temporary file under /dev/shm, sizes it to
+ * smpi_size_data_exe, maps it, and fills it with the current content of the
+ * data segment. Dies on any failure.
+ */
+void smpi_initialize_global_memory_segments(){
+
+#ifndef HAVE_PRIVATIZATION
+  smpi_privatize_global_variables=0;
+  return;
+#else
+
+  int i;
+  smpi_get_executable_global_size();
+
+  XBT_DEBUG ("bss+data segment found : size %d starting at %p",
+             smpi_size_data_exe, smpi_start_data_exe );
+
+  if (smpi_size_data_exe == 0){//no need to switch
+    smpi_privatize_global_variables=0;
+    return;
+  }
+
+  smpi_privatisation_regions = (smpi_privatisation_region_t) malloc(
+    smpi_process_count() * sizeof(struct s_smpi_privatisation_region));
+  if (smpi_privatisation_regions == NULL)
+    xbt_die("Impossible to allocate the table of privatisation regions");
+
+  //the table has smpi_process_count() entries: create exactly that many
+  //mappings of this size with the same data inside
+  for (i=0; i< smpi_process_count(); i++){
+    void *address = NULL;
+    char path[] = "/dev/shm/my-buffer-XXXXXX";
+    int status;
+
+    int file_descriptor= mkstemp (path);
+    if (file_descriptor < 0) {
+      if (errno==EMFILE) {
+        xbt_die("Impossible to create temporary file for memory mapping: %s\n\
+The open() system call failed with the EMFILE error code (too many files). \n\n\
+This means that you reached the system limits concerning the amount of files per process. \
+This is not a surprise if you are trying to virtualize many processes on top of SMPI. \
+Don't panic -- you should simply increase your system limits and try again. \n\n\
+First, check what your limits are:\n\
+ cat /proc/sys/fs/file-max # Gives you the system-wide limit\n\
+ ulimit -Hn # Gives you the per process hard limit\n\
+ ulimit -Sn # Gives you the per process soft limit\n\
+ cat /proc/self/limits # Displays any per-process limitation (including the one given above)\n\n\
+If one of these values is less than the amount of MPI processes that you try to run, then you got the explanation of this error. \
+Ask the Internet about tutorials on how to increase the files limit such as: https://rtcamp.com/tutorials/linux/increase-open-files-limit/",
+                strerror(errno));
+      }
+      xbt_die("Impossible to create temporary file for memory mapping: %s",
+              strerror(errno));
+    }
+
+    //the file stays usable through its descriptor after the name is removed
+    status = unlink (path);
+    if (status)
+      xbt_die("Impossible to unlink temporary file for memory mapping");
+
+    status = ftruncate(file_descriptor, smpi_size_data_exe);
+    if(status)
+      xbt_die("Impossible to set the size of the temporary file for memory mapping");
+
+    /* Ask for a free region */
+    address = mmap (NULL, smpi_size_data_exe, PROT_READ | PROT_WRITE, MAP_SHARED, file_descriptor, 0);
+    if (address == MAP_FAILED)
+      xbt_die("Couldn't find a free region for memory mapping");
+
+    //initialize the values
+    memcpy(address, TOPAGE(smpi_start_data_exe), smpi_size_data_exe);
+
+    //store the address of the mapping for further switches
+    smpi_privatisation_regions[i].file_descriptor = file_descriptor;
+    smpi_privatisation_regions[i].address = address;
+  }
+
+#endif
+
+}
+
+/** Release every privatization region: unmap it (warning on failure), close
+ * its file descriptor, then free the region table. Does nothing when no data
+ * segment was detected or when HAVE_PRIVATIZATION is not defined.
+ */
+void smpi_destroy_global_memory_segments(){
+  if (smpi_size_data_exe != 0) {
+#ifdef HAVE_PRIVATIZATION
+    int rank = 0;
+    while (rank < smpi_process_count()) {
+      smpi_privatisation_region_t region = &smpi_privatisation_regions[rank];
+      int unmap_status = munmap(region->address, smpi_size_data_exe);
+      if (unmap_status < 0) {
+        XBT_WARN("Unmapping of fd %d failed: %s", region->file_descriptor, strerror(errno));
+      }
+      /* The descriptor is closed even when the unmapping failed. */
+      close(region->file_descriptor);
+      rank++;
+    }
+    xbt_free(smpi_privatisation_regions);
+#endif
+  }
+}