ADD_TEST(test-smpi-mpich3-pt2pt-raw ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}/teshsuite/smpi/mpich3-test/pt2pt perl ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/runtests -mpiexec=${CMAKE_BINARY_DIR}/smpi_script/bin/smpirun -srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/pt2pt -tests=testlist -execarg=--cfg=contexts/factory:raw -execarg=--cfg=smpi/privatize_global_variables:yes)
ADD_TEST(test-smpi-mpich3-topo-raw ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}/teshsuite/smpi/mpich3-test/topo perl ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/runtests -mpiexec=${CMAKE_BINARY_DIR}/smpi_script/bin/smpirun -srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/topo -tests=testlist -execarg=--cfg=contexts/factory:raw)
ADD_TEST(test-smpi-mpich3-rma-raw ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}/teshsuite/smpi/mpich3-test/rma perl ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/runtests -mpiexec=${CMAKE_BINARY_DIR}/smpi_script/bin/smpirun -srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/rma -tests=testlist -execarg=--cfg=contexts/factory:raw -execarg=--cfg=smpi/privatize_global_variables:yes)
+ ADD_TEST(test-smpi-mpich3-perf-raw ${CMAKE_COMMAND} -E chdir ${CMAKE_BINARY_DIR}/teshsuite/smpi/mpich3-test/perf perl ${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/runtests -mpiexec=${CMAKE_BINARY_DIR}/smpi_script/bin/smpirun -srcdir=${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/perf -tests=testlist -execarg=--cfg=contexts/factory:raw -execarg=--cfg=smpi/running_power:-1)
SET_TESTS_PROPERTIES(test-smpi-mpich3-attr-raw test-smpi-mpich3-comm-raw test-smpi-mpich3-init-raw test-smpi-mpich3-datatype-raw test-smpi-mpich3-group-raw test-smpi-mpich3-pt2pt-raw test-smpi-mpich3-topo-raw test-smpi-mpich3-rma-raw PROPERTIES PASS_REGULAR_EXPRESSION "tests passed!")
ENDIF()
IF(SMPI_FORTRAN)
teshsuite/smpi/mpich3-test/pt2pt/CMakeLists.txt
teshsuite/smpi/mpich3-test/topo/CMakeLists.txt
teshsuite/smpi/mpich3-test/rma/CMakeLists.txt
+ teshsuite/smpi/mpich3-test/perf/CMakeLists.txt
teshsuite/surf/CMakeLists.txt
teshsuite/surf/lmm_usage/CMakeLists.txt
teshsuite/surf/maxmin_bench/CMakeLists.txt
add_subdirectory(${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/init)
add_subdirectory(${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/pt2pt)
add_subdirectory(${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/rma)
+add_subdirectory(${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/perf)
#add_subdirectory(${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/f77/attr)
add_subdirectory(${CMAKE_HOME_DIRECTORY}/teshsuite/smpi/mpich3-test/f77/util)
--- /dev/null
+cmake_minimum_required(VERSION 2.6)
+
+if(enable_smpi)
+ if(WIN32)
+ set(CMAKE_C_FLAGS "-include ${CMAKE_HOME_DIRECTORY}/include/smpi/smpi_main.h")
+ else()
+ set(CMAKE_C_COMPILER "${CMAKE_BINARY_DIR}/smpi_script/bin/smpicc")
+ set(CMAKE_Fortran_COMPILER "${CMAKE_BINARY_DIR}/smpi_script/bin/smpiff")
+ endif()
+
+ set(EXECUTABLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}")
+
+ include_directories("${CMAKE_HOME_DIRECTORY}/include/smpi")
+ include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../include/")
+
+
+ add_executable(commcreatep commcreatep.c)
+# add_executable(dtpack dtpack.c)
+# add_executable(indexperf indexperf.c)
+# add_executable(manyrma manyrma.c)
+# add_executable(nestvec2 nestvec2.c)
+# add_executable(nestvec nestvec.c)
+ add_executable(non_zero_root non_zero_root.c)
+ add_executable(sendrecvl sendrecvl.c)
+ add_executable(timer timer.c)
+ add_executable(transp-datatype transp-datatype.c)
+ add_executable(twovec twovec.c)
+
+
+ target_link_libraries(commcreatep simgrid mtest_c)
+# target_link_libraries(dtpack simgrid mtest_c)
+# target_link_libraries(indexperf simgrid mtest_c)
+# target_link_libraries(manyrma simgrid mtest_c)
+# target_link_libraries(nestvec2 simgrid mtest_c)
+# target_link_libraries(nestvec simgrid mtest_c)
+ target_link_libraries(non_zero_root simgrid mtest_c)
+ target_link_libraries(sendrecvl simgrid mtest_c)
+ target_link_libraries(timer simgrid mtest_c)
+ target_link_libraries(transp-datatype simgrid mtest_c)
+ target_link_libraries(twovec simgrid mtest_c)
+
+
+
+
+endif()
+
+set(tesh_files
+ ${tesh_files}
+ PARENT_SCOPE
+ )
+set(xml_files
+ ${xml_files}
+ PARENT_SCOPE
+ )
+set(examples_src
+ ${examples_src}
+ ${CMAKE_CURRENT_SOURCE_DIR}/allredtrace.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/commcreatep.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/dtpack.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/indexperf.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/manyrma.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/nestvec2.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/nestvec.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/non_zero_root.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/sendrecvl.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/timer.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/transp-datatype.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/twovec.c
+ PARENT_SCOPE
+ )
+set(bin_files
+ ${bin_files}
+ PARENT_SCOPE
+ )
+set(txt_files
+ ${txt_files}
+ ${CMAKE_CURRENT_SOURCE_DIR}/testlist
+ PARENT_SCOPE
+ )
--- /dev/null
+This directory contains some performance tests. These are not
+general performance tests; rather, they reflect our experience with
+particular performance artifacts that users (or ourselves) have
+reported or experienced. The tests include:
+
+sendrecvl - Send and receive (head to head) large messages.
+mattrans - Matrix transpose example
+
--- /dev/null
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ * (C) 2008 by University of Illinois
+ * See COPYRIGHT in top-level directory.
+ */
+
+/*
+ * This code is intended to test the trace overhead when using an
+ * MPI tracing package. To perform the test, follow these steps:
+ *
+ * 1) Run with the verbose mode selected to determine the delay argument
+ * to use in subsequent tests:
+ * mpiexec -n 4096 allredtrace -v
+ * Assume that the computed delay count is 6237; that value is used in
+ * the following.
+ *
+ * 2) Run with an explicit delay count, without tracing enabled:
+ * mpiexec -n 4096 allredtrace -delaycount 6237
+ *
+ * 3) Build allredtrace with tracing enabled, then run:
+ * mpiexec -n 4096 allredtrace -delaycount 6237
+ *
+ * Compare the total times. The tracing version should take slightly
+ * longer but no more than, for example, 15%.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static int verbose = 0;
+static int lCount = 0;
+void Delay( int );
+void SetupDelay( double );
+
+int main( int argc, char *argv[] )
+{
+ double usecPerCall = 100;
+ double t, t1, tsum;
+ int i, nLoop = 100;
+ int rank;
+
+ MPI_Init( &argc, &argv );
+ MPI_Comm_rank( MPI_COMM_WORLD, &rank );
+
+ /* Process arguments. We allow the delay count to be set from the
+ command line to ensure reproducibility*/
+ for (i=1; i<argc; i++) {
+ if (strcmp( argv[i], "-delaycount" ) == 0) {
+ i++;
+ lCount = atoi( argv[i] );
+ }
+ else if (strcmp( argv[i], "-v" ) == 0) {
+ verbose = 1;
+ }
+ else {
+ fprintf( stderr, "Unrecognized argument %s\n", argv[i] );
+ exit(1);
+ }
+ }
+
+ if (lCount == 0) {
+ SetupDelay( usecPerCall );
+ }
+
+ MPI_Barrier( MPI_COMM_WORLD );
+
+ t = MPI_Wtime();
+ for (i=0; i<nLoop; i++) {
+ MPI_Allreduce( &t1, &tsum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+ Delay( lCount );
+ }
+ t = MPI_Wtime() - t;
+ MPI_Barrier( MPI_COMM_WORLD );
+ if (rank == 0) {
+ printf( "For delay count %d, time is %e\n", lCount, t );
+ }
+
+ MPI_Barrier( MPI_COMM_WORLD );
+
+ MPI_Finalize();
+
+ return 0;
+}
+
+void SetupDelay( double usec )
+{
+ double t, tick;
+ double sec = 1.0e-6 * usec;
+ int nLoop, i, direction;
+
+
+ /* Compute the number of times to run the tests to get an accurate
+ number given the timer resolution. */
+ nLoop = 1;
+ tick = 100 * MPI_Wtick();
+ do {
+ nLoop = 2 * nLoop;
+ t = MPI_Wtime();
+ for (i=0; i<nLoop; i++) {
+ MPI_Wtime();
+ }
+ t = MPI_Wtime() - t;
+ }
+ while ( t < tick && nLoop < 100000 );
+
+ if (verbose) printf( "nLoop = %d\n", nLoop );
+
+ /* Start with an estimated count */
+ lCount = 128;
+ direction = 0;
+ while (1) {
+ t = MPI_Wtime();
+ for (i=0; i<nLoop; i++) {
+ Delay( lCount );
+ }
+ t = MPI_Wtime() - t;
+ t = t / nLoop;
+ if (verbose) printf( "lCount = %d, time = %e\n", lCount, t );
+ if (t > 10 * tick) nLoop = nLoop / 2;
+
+ /* Compare measured delay */
+ if (t > 2*sec) {
+ lCount = lCount / 2;
+ if (direction == 1) break;
+ direction = -1;
+ }
+ else if (t < sec / 2) {
+ lCount = lCount * 2;
+ if (direction == -1) break;
+ direction = 1;
+ }
+ else if (t < sec) {
+ /* sec/2 <= t < sec , so estimate the lCount to hit sec */
+ lCount = (sec/t) * lCount;
+ }
+ else
+ break;
+ }
+
+ if (verbose) printf( "lCount = %d, t = %e\n", lCount, t );
+
+ /* Should coordinate with the other processes - take the max? */
+}
+
+volatile double delayCounter = 0;
+void Delay( int count )
+{
+ int i;
+
+ delayCounter = 0.0;
+ for (i=0; i<count; i++) {
+ delayCounter += 2.73;
+ }
+}
--- /dev/null
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ * (C) 2009 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include "mpitest.h"
+
+#define MAX_LOG_WSIZE 31
+#define MAX_LOOP 20
+
+int main( int argc, char *argv[] )
+{
+ MPI_Group gworld, g;
+ MPI_Comm comm, newcomm[MAX_LOOP];
+ int wsize, wrank, range[1][3], errs=0;
+ double t[MAX_LOG_WSIZE], tf;
+ int maxi, i, k, ts, gsize[MAX_LOG_WSIZE];
+
+ MTest_Init( &argc, &argv );
+
+ MPI_Comm_size( MPI_COMM_WORLD, &wsize );
+ MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
+
+ if (wrank == 0)
+ MTestPrintfMsg( 1, "size\ttime\n" );
+
+ MPI_Comm_group( MPI_COMM_WORLD, &gworld );
+ ts = 1;
+ comm = MPI_COMM_WORLD;
+ for (i=0; ts<=wsize; i++, ts = ts + ts) {
+ /* Create some groups with at most ts members */
+ range[0][0] = ts-1;
+ range[0][1] = 0;
+ range[0][2] = -1;
+ MPI_Group_range_incl( gworld, 1, range, &g );
+
+ MPI_Barrier( MPI_COMM_WORLD );
+ tf = MPI_Wtime();
+ for (k=0; k<MAX_LOOP; k++)
+ MPI_Comm_create( comm, g, &newcomm[k] );
+ tf = MPI_Wtime() - tf;
+ MPI_Allreduce( &tf, &t[i], 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD );
+ t[i] = t[i] / MAX_LOOP;
+ gsize[i] = ts;
+ if (wrank == 0)
+ MTestPrintfMsg( 1, "%d\t%e\n", ts, t[i] );
+ MPI_Group_free( &g );
+ if (newcomm[0] != MPI_COMM_NULL)
+ for (k=0; k<MAX_LOOP; k++)
+ MPI_Comm_free( &newcomm[k] );
+ }
+ MPI_Group_free( &gworld );
+ maxi = i-1;
+
+ /* The cost should be linear or at worst ts*log(ts).
+ We can check this in a number of ways.
+ */
+ if (wrank == 0) {
+ for (i=4; i<=maxi; i++) {
+ double rdiff;
+ if (t[i] > 0) {
+ rdiff = (t[i] - t[i-1]) / t[i];
+ if (rdiff >= 4) {
+ errs++;
+ fprintf( stderr, "Relative difference between group of size %d and %d is %e exceeds 4\n",
+ gsize[i-1], gsize[i], rdiff );
+ }
+ }
+ }
+ }
+
+ MTest_Finalize( errs );
+
+ MPI_Finalize();
+
+ return 0;
+}
--- /dev/null
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ * (C) 2008 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+/*
+ * This code may be used to test the performance of some of the
+ * noncontiguous datatype operations, including vector and indexed
+ * pack and unpack operations. To simplify the use of this code for
+ * tuning an MPI implementation, it uses no communication, just the
+ * MPI_Pack and MPI_Unpack routines. In addition, the individual tests are
+ * in separate routines, making it easier to compare the compiler-generated
+ * code for the user (manual) pack/unpack with the code used by
+ * the MPI implementation. Further, to be fair to the MPI implementation,
+ * the routines are passed the source and destination buffers; this ensures
+ * that the compiler can't optimize for statically allocated buffers.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "mpi.h"
+
+/* Needed for restrict and const definitions */
+#include "mpitestconf.h"
+
+static int verbose = 0;
+
+#define N_REPS 1000
+#define THRESHOLD 0.10
+#define VARIANCE_THRESHOLD ((THRESHOLD * THRESHOLD) / 2)
+#define NTRIALS 10
+
+double mean(double *list, int count);
+double mean(double *list, int count)
+{
+ double retval;
+ int i;
+
+ retval = 0;
+ for (i = 0; i < count; i++)
+ retval += list[i];
+ retval /= count;
+
+ return retval;
+}
+
+double noise(double *list, int count);
+double noise(double *list, int count)
+{
+ double *margin, retval;
+ int i;
+
+ if (!(margin = malloc(count * sizeof(double)))) {
+ printf("Unable to allocate memory\n");
+ return -1;
+ }
+
+ for (i = 0; i < count; i++)
+ margin[i] = list[i] / mean(list, count);
+
+ retval = 0;
+ for (i = 0; i < count; i++) {
+ retval += ((margin[i] - 1) * (margin[i] - 1));
+ }
+ retval /= count;
+ if (retval < 0) retval = -retval;
+
+ return retval;
+}
+
+/* Here are the tests */
+
+/* Test packing a vector of individual doubles */
+/* We don't use restrict in the function args because assignments between
+ restrict pointers is not valid in C and some compilers, such as the
+ IBM xlc compilers, flag that use as an error.*/
+int TestVecPackDouble( int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI,
+ double *dest, const double *src );
+int TestVecPackDouble( int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI,
+ double *dest, const double *src )
+{
+ double *restrict d_dest;
+ const double *restrict d_src;
+ register int i, j;
+ int rep, position;
+ double t1, t2, t[NTRIALS];
+ MPI_Datatype vectype;
+
+ /* User code */
+ if (verbose) printf("TestVecPackDouble (USER): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep=0; rep<N_REPS; rep++) {
+ i = n;
+ d_dest = dest;
+ d_src = src;
+ while (i--) {
+ *d_dest++ = *d_src;
+ d_src += stride;
+ }
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose) printf("%.3f ", t[j]);
+ }
+ if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ return 0;
+ }
+ *avgTimeUser = mean(t, NTRIALS) / N_REPS;
+
+ /* MPI Vector code */
+ MPI_Type_vector( n, 1, stride, MPI_DOUBLE, &vectype );
+ MPI_Type_commit( &vectype );
+
+ if (verbose) printf("TestVecPackDouble (MPI): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep=0; rep<N_REPS; rep++) {
+ position = 0;
+ MPI_Pack( (void *)src, 1, vectype, dest, n*sizeof(double),
+ &position, MPI_COMM_SELF );
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose) printf("%.3f ", t[j]);
+ }
+ if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ }
+ else {
+ *avgTimeMPI = mean(t, NTRIALS) / N_REPS;
+ }
+
+ MPI_Type_free( &vectype );
+
+ return 0;
+}
+
+/* Test unpacking a vector of individual doubles */
+/* See above for why restrict is not used in the function args */
+int TestVecUnPackDouble( int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI,
+ double *dest, const double *src );
+int TestVecUnPackDouble( int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI,
+ double *dest, const double *src )
+{
+ double *restrict d_dest;
+ const double *restrict d_src;
+ register int i, j;
+ int rep, position;
+ double t1, t2, t[NTRIALS];
+ MPI_Datatype vectype;
+
+ /* User code */
+ if (verbose) printf("TestVecUnPackDouble (USER): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep=0; rep<N_REPS; rep++) {
+ i = n;
+ d_dest = dest;
+ d_src = src;
+ while (i--) {
+ *d_dest = *d_src++;
+ d_dest += stride;
+ }
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose) printf("%.3f ", t[j]);
+ }
+ if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ return 0;
+ }
+ *avgTimeUser = mean(t, NTRIALS) / N_REPS;
+
+ /* MPI Vector code */
+ MPI_Type_vector( n, 1, stride, MPI_DOUBLE, &vectype );
+ MPI_Type_commit( &vectype );
+
+ if (verbose) printf("TestVecUnPackDouble (MPI): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep=0; rep<N_REPS; rep++) {
+ position = 0;
+ MPI_Unpack( (void *)src, n*sizeof(double),
+ &position, dest, 1, vectype, MPI_COMM_SELF );
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose) printf("%.3f ", t[j]);
+ }
+ if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ }
+ else {
+ *avgTimeMPI = mean(t, NTRIALS) / N_REPS;
+ }
+
+ MPI_Type_free( &vectype );
+
+ return 0;
+}
+
+/* Test packing a vector of 2-individual doubles */
+/* See above for why restrict is not used in the function args */
+int TestVecPack2Double( int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI,
+ double *dest, const double *src );
+int TestVecPack2Double( int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI,
+ double *dest, const double *src )
+{
+ double *restrict d_dest;
+ const double *restrict d_src;
+ register int i, j;
+ int rep, position;
+ double t1, t2, t[NTRIALS];
+ MPI_Datatype vectype;
+
+ /* User code */
+ if (verbose) printf("TestVecPack2Double (USER): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep=0; rep<N_REPS; rep++) {
+ i = n;
+ d_dest = dest;
+ d_src = src;
+ while (i--) {
+ *d_dest++ = d_src[0];
+ *d_dest++ = d_src[1];
+ d_src += stride;
+ }
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose) printf("%.3f ", t[j]);
+ }
+ if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ return 0;
+ }
+ *avgTimeUser = mean(t, NTRIALS) / N_REPS;
+
+ /* MPI Vector code */
+ MPI_Type_vector( n, 2, stride, MPI_DOUBLE, &vectype );
+ MPI_Type_commit( &vectype );
+
+ if (verbose) printf("TestVecPack2Double (MPI): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep=0; rep<N_REPS; rep++) {
+ position = 0;
+ MPI_Pack( (void *)src, 1, vectype, dest, 2*n*sizeof(double),
+ &position, MPI_COMM_SELF );
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose) printf("%.3f ", t[j]);
+ }
+ if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ }
+ else {
+ *avgTimeMPI = mean(t, NTRIALS) / N_REPS;
+ }
+ MPI_Type_free( &vectype );
+
+ return 0;
+}
+
+/* This creates an indexed type that is like a vector (for simplicity
+ of construction). There is a possibility that the MPI implementation
+ will recognize and simplify this (e.g., in MPI_Type_commit); if so,
+ let us know and we'll add a version that is not as regular
+*/
+/* See above for why restrict is not used in the function args */
+int TestIndexPackDouble( int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI,
+ double *dest, const double *src );
+int TestIndexPackDouble( int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI,
+ double *dest, const double *src )
+{
+ double *restrict d_dest;
+ const double *restrict d_src;
+ register int i, j;
+ int rep, position;
+ int *restrict displs = 0;
+ double t1, t2, t[NTRIALS];
+ MPI_Datatype indextype;
+
+ displs = (int *)malloc( n * sizeof(int) );
+ for (i=0; i<n; i++) displs[i] = i * stride;
+
+ /* User code */
+ if (verbose) printf("TestIndexPackDouble (USER): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep=0; rep<N_REPS; rep++) {
+ i = n;
+ d_dest = dest;
+ d_src = src;
+ for (i=0; i<n; i++) {
+ *d_dest++ = d_src[displs[i]];
+ }
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose) printf("%.3f ", t[j]);
+ }
+ if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ return 0;
+ }
+ *avgTimeUser = mean(t, NTRIALS) / N_REPS;
+
+ /* MPI Index code */
+ MPI_Type_create_indexed_block( n, 1, displs, MPI_DOUBLE, &indextype );
+ MPI_Type_commit( &indextype );
+
+ free( displs );
+
+ if (verbose) printf("TestIndexPackDouble (MPI): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep=0; rep<N_REPS; rep++) {
+ position = 0;
+ MPI_Pack( (void *)src, 1, indextype, dest, n*sizeof(double),
+ &position, MPI_COMM_SELF );
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose) printf("%.3f ", t[j]);
+ }
+ if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ }
+ else {
+ *avgTimeMPI = mean(t, NTRIALS) / N_REPS;
+ }
+ MPI_Type_free( &indextype );
+
+ return 0;
+}
+
+int Report( const char *name, const char *packname,
+ double avgTimeMPI, double avgTimeUser );
+int Report( const char *name, const char *packname,
+ double avgTimeMPI, double avgTimeUser )
+{
+ double diffTime, maxTime;
+ int errs=0;
+
+ /* Move this into a common routine */
+ diffTime = avgTimeMPI - avgTimeUser;
+ if (diffTime < 0) diffTime = - diffTime;
+ if (avgTimeMPI > avgTimeUser) maxTime = avgTimeMPI;
+ else maxTime = avgTimeUser;
+
+ if (verbose) {
+ printf( "%-30s:\t%g\t%g\t(%g%%)\n", name,
+ avgTimeMPI, avgTimeUser,
+ 100 * (diffTime / maxTime) );
+ fflush(stdout);
+ }
+ if (avgTimeMPI > avgTimeUser && (diffTime > THRESHOLD * maxTime)) {
+ errs++;
+ printf( "%s:\tMPI %s code is too slow: MPI %g\t User %g\n",
+ name, packname, avgTimeMPI, avgTimeUser );
+ }
+
+ return errs;
+}
+
+/* Finally, here's the main program */
+int main( int argc, char *argv[] )
+{
+ int n, stride, err, errs = 0;
+ void *dest, *src;
+ double avgTimeUser, avgTimeMPI;
+
+ MPI_Init( &argc, &argv );
+ if (getenv("MPITEST_VERBOSE")) verbose = 1;
+
+ n = 30000;
+ stride = 4;
+ dest = (void *)malloc( n * sizeof(double) );
+ src = (void *)malloc( n * ((1+stride)*sizeof(double)) );
+ /* Touch the source and destination arrays */
+ memset( src, 0, n * (1+stride)*sizeof(double) );
+ memset( dest, 0, n * sizeof(double) );
+
+ err = TestVecPackDouble( n, stride, &avgTimeUser, &avgTimeMPI,
+ dest, src );
+ errs += Report( "VecPackDouble", "Pack", avgTimeMPI, avgTimeUser );
+
+ err = TestVecUnPackDouble( n, stride, &avgTimeUser, &avgTimeMPI,
+ src, dest );
+ errs += Report( "VecUnPackDouble", "Unpack", avgTimeMPI, avgTimeUser );
+
+ err = TestIndexPackDouble( n, stride, &avgTimeUser, &avgTimeMPI,
+ dest, src );
+ errs += Report( "VecIndexDouble", "Pack", avgTimeMPI, avgTimeUser );
+
+ free(dest);
+ free(src);
+
+ dest = (void *)malloc( 2*n * sizeof(double) );
+ src = (void *)malloc( (1 + n) * ((1+stride)*sizeof(double)) );
+ memset( dest, 0, 2*n * sizeof(double) );
+ memset( src, 0, (1+n) * (1+stride)*sizeof(double) );
+ err = TestVecPack2Double( n, stride, &avgTimeUser, &avgTimeMPI,
+ dest, src );
+ errs += Report( "VecPack2Double", "Pack", avgTimeMPI, avgTimeUser );
+
+ free(dest);
+ free(src);
+
+
+
+ if (errs == 0) {
+ printf( " No Errors\n" );
+ }
+ else {
+ printf( " Found %d performance problems\n", errs );
+ }
+
+ fflush(stdout);
+ MPI_Finalize();
+
+ return 0;
+}
--- /dev/null
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ * (C) 2012 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+
+/*
+ * Tests that basic optimizations are performed on indexed datatypes.
+ *
+ * If PACK_IS_NATIVE is defined, MPI_Pack stores exactly the same bytes as the
+ * user would pack manually; in that case, there is a consistency check.
+ */
+
+#ifdef MPICH
+/* MPICH (as of 6/2012) packs the native bytes */
+#define PACK_IS_NATIVE
+#endif
+
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static int verbose = 0;
+
+int main( int argc, char **argv )
+{
+ double *inbuf, *outbuf, *outbuf2;
+ MPI_Aint lb, extent;
+ int *index_displacement;
+ int icount, errs=0;
+ int i, packsize, position, inbufsize;
+ MPI_Datatype itype1, stype1;
+ double t0, t1;
+ double tpack, tspack, tmanual;
+ int ntry;
+
+ MPI_Init( &argc, &argv );
+
+ icount = 2014;
+
+ /* Create a simple block indexed datatype */
+ index_displacement = (int *)malloc( icount * sizeof(int) );
+ if (!index_displacement) {
+ fprintf( stderr, "Unable to allocated index array of size %d\n",
+ icount );
+ MPI_Abort( MPI_COMM_WORLD, 1 );
+ }
+
+ for (i=0; i<icount; i++) {
+ index_displacement[i] = (i * 3 + (i%3));
+ }
+
+ MPI_Type_create_indexed_block( icount, 1, index_displacement, MPI_DOUBLE,
+ &itype1 );
+ MPI_Type_commit( &itype1 );
+
+#if defined(MPICH) && defined(PRINT_DATATYPE_INTERNALS)
+ /* To use MPIDU_Datatype_debug to print the datatype internals,
+ you must configure MPICH with --enable-g=log */
+ if (verbose) {
+ printf( "Block index datatype:\n" );
+ MPIDU_Datatype_debug( itype1, 10 );
+ }
+#endif
+ MPI_Type_get_extent( itype1, &lb, &extent );
+
+ MPI_Pack_size( 1, itype1, MPI_COMM_WORLD, &packsize );
+
+ inbufsize = extent / sizeof(double);
+
+ inbuf = (double *)malloc( extent );
+ outbuf = (double *)malloc( packsize );
+ outbuf2 = (double *)malloc( icount * sizeof(double) );
+ if (!inbuf) {
+ fprintf( stderr, "Unable to allocate %ld for inbuf\n", (long)extent );
+ MPI_Abort( MPI_COMM_WORLD, 1 );
+ }
+ if (!outbuf) {
+ fprintf( stderr, "Unable to allocate %ld for outbuf\n", (long)packsize );
+ MPI_Abort( MPI_COMM_WORLD, 1 );
+ }
+ if (!outbuf2) {
+ fprintf( stderr, "Unable to allocate %ld for outbuf2\n", (long)packsize );
+ MPI_Abort( MPI_COMM_WORLD, 1 );
+ }
+ for (i=0; i<inbufsize; i++) {
+ inbuf[i] = (double)i;
+ }
+ position = 0;
+ /* Warm up the code and data */
+ MPI_Pack( inbuf, 1, itype1, outbuf, packsize, &position, MPI_COMM_WORLD );
+
+ tpack = 1e12;
+ for (ntry = 0; ntry < 5; ntry++) {
+ position = 0;
+ t0 = MPI_Wtime();
+ MPI_Pack( inbuf, 1, itype1, outbuf, packsize, &position,
+ MPI_COMM_WORLD );
+ t1 = MPI_Wtime() - t0;
+ if (t1 < tpack) tpack = t1;
+ }
+
+ { int one = 1; MPI_Aint displ = (MPI_Aint) inbuf;
+ MPI_Type_create_struct( 1, &one, &displ, &itype1, &stype1 );
+ MPI_Type_commit( &stype1 );
+ }
+
+ position = 0;
+ /* Warm up the code and data */
+ MPI_Pack( MPI_BOTTOM, 1, stype1, outbuf, packsize, &position, MPI_COMM_WORLD );
+
+ tspack = 1e12;
+ for (ntry = 0; ntry < 5; ntry++) {
+ position = 0;
+ t0 = MPI_Wtime();
+ MPI_Pack( MPI_BOTTOM, 1, stype1, outbuf, packsize, &position,
+ MPI_COMM_WORLD );
+ t1 = MPI_Wtime() - t0;
+ if (t1 < tspack) tspack = t1;
+ }
+
+ /*
+ Simple manual pack (without explicitly unrolling the index block)
+ */
+ tmanual = 1e12;
+ for (ntry = 0; ntry < 5; ntry++) {
+ const double *ppe = (const double *)inbuf;
+ const int *id = (const int *)index_displacement;
+ int k, j;
+ t0 = MPI_Wtime();
+ position = 0;
+ for (i=0; i<icount; i++) {
+ outbuf2[position++] = ppe[id[i]];
+ }
+ t1 = MPI_Wtime() - t0;
+ if (t1 < tmanual) tmanual = t1;
+ /* Check on correctness */
+#ifdef PACK_IS_NATIVE
+ if (memcmp( outbuf, outbuf2, position ) != 0) {
+ printf( "Panic - pack buffers differ\n" );
+ }
+#endif
+ }
+
+ if (verbose) {
+ printf( "Bytes packed = %d\n", position );
+ printf( "MPI_Pack time = %e, manual pack time = %e\n",
+ tpack, tmanual );
+ printf( "Pack with struct = %e\n", tspack );
+ }
+
+ /* The threshold here permits the MPI datatype to perform at up to
+ only one half the performance of simple user code. Note that the
+ example code above may be made faster through careful use of const,
+ restrict, and unrolling if the compiler doesn't already do that. */
+ if (2 * tmanual < tpack) {
+ errs++;
+ printf( "MPI_Pack (block index) time = %e, manual pack time = %e\n", tpack, tmanual );
+ printf( "MPI_Pack time should be less than 2 times the manual time\n" );
+ printf( "For most informative results, be sure to compile this test with optimization\n" );
+ }
+ if (2 * tmanual < tspack) {
+ errs++;
+ printf( "MPI_Pack (struct of block index)) time = %e, manual pack time = %e\n", tspack, tmanual );
+ printf( "MPI_Pack time should be less than 2 times the manual time\n" );
+ printf( "For most informative results, be sure to compile this test with optimization\n" );
+ }
+ if (errs) {
+ printf( " Found %d errors\n", errs );
+ }
+ else {
+ printf( " No Errors\n" );
+ }
+
+ MPI_Type_free( &itype1 );
+ MPI_Type_free( &stype1 );
+
+ free( inbuf );
+ free( outbuf );
+ free( outbuf2 );
+ free( index_displacement );
+
+ MPI_Finalize();
+ return 0;
+}
--- /dev/null
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ * (C) 2010 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+
+/* This test measures the performance of many rma operations to a single
+ target process.
+ It uses a number of operations (put or accumulate) to different
+ locations in the target window
+ This is one of the ways that RMA may be used, and is used in the
+ reference implementation of the graph500 benchmark.
+*/
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define MAX_COUNT 65536*4
+#define MAX_RMA_SIZE 16
+#define MAX_RUNS 10
+
+typedef enum { SYNC_NONE=0,
+ SYNC_ALL=-1, SYNC_FENCE=1, SYNC_LOCK=2, SYNC_PSCW=4 } sync_t;
+typedef enum { RMA_NONE=0, RMA_ALL=-1, RMA_PUT=1, RMA_ACC=2, RMA_GET=4 } rma_t;
+/* Note GET not yet implemented */
+sync_t syncChoice = SYNC_ALL;
+rma_t rmaChoice = RMA_ALL;
+
+typedef struct {
+ double startOp, endOp, endSync;
+} timing;
+
+static int verbose = 1;
+static int barrierSync = 0;
+static double tickThreshold = 0.0;
+
+void PrintResults( int cnt, timing t[] );
+void RunAccFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
+void RunAccLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
+void RunPutFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
+void RunPutLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
+void RunAccPSCW( MPI_Win win, int destRank, int cnt, int sz,
+ MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] );
+void RunPutPSCW( MPI_Win win, int destRank, int cnt, int sz,
+ MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] );
+
+/* Driver: times MPI_Put and MPI_Accumulate under fence, lock, and
+   post/start/complete/wait (PSCW) synchronization, sweeping power-of-two
+   element counts (up to maxCount) and transfer sizes (up to maxSz).
+   Command-line flags restrict which RMA op / sync method is run.
+   NOTE(review): rmaChoice, syncChoice, barrierSync, tickThreshold and the
+   RMA_*/SYNC_*/MAX_* macros are declared earlier in this file (outside
+   this chunk) — presumably bit-flag globals; verify against the header. */
+int main( int argc, char *argv[] )
+{
+    int arraysize, i, cnt, sz, maxCount=MAX_COUNT, *arraybuffer;
+    int wrank, wsize, destRank, srcRank;
+    MPI_Win win;
+    MPI_Group wgroup, accessGroup, exposureGroup;
+    timing t[MAX_RUNS];
+    int maxSz = MAX_RMA_SIZE;
+
+    MPI_Init( &argc, &argv );
+
+    /* Determine clock accuracy */
+    tickThreshold = 10.0 * MPI_Wtick();
+    MPI_Allreduce( MPI_IN_PLACE, &tickThreshold, 1, MPI_DOUBLE, MPI_MAX,
+                   MPI_COMM_WORLD );
+
+    /* Parse flags; selecting any op/sync explicitly clears the ALL default
+       before OR-ing in the requested bit. */
+    for (i=1; i<argc; i++) {
+        if (strcmp( argv[i], "-put" ) == 0) {
+            if (rmaChoice == RMA_ALL) rmaChoice = RMA_NONE;
+            rmaChoice  |= RMA_PUT;
+        }
+        else if (strcmp( argv[i], "-acc" ) == 0) {
+            if (rmaChoice == RMA_ALL) rmaChoice = RMA_NONE;
+            rmaChoice  |= RMA_ACC;
+        }
+        else if (strcmp( argv[i], "-fence" ) == 0) {
+            if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
+            syncChoice |= SYNC_FENCE;
+        }
+        else if (strcmp( argv[i], "-lock" ) == 0) {
+            if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
+            syncChoice |= SYNC_LOCK;
+        }
+        else if (strcmp( argv[i], "-pscw" ) == 0) {
+            if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
+            syncChoice |= SYNC_PSCW;
+        }
+        else if (strcmp( argv[i], "-maxsz" ) == 0) {
+            i++;
+            maxSz = atoi( argv[i] );
+        }
+        else if (strcmp( argv[i], "-maxcount" ) == 0) {
+            i++;
+            maxCount = atoi( argv[i] );
+        }
+        else if (strcmp( argv[i], "-barrier" ) == 0) {
+            barrierSync = 1;
+        }
+        else {
+            /* NOTE(review): the usage string omits -maxcount even though
+               it is accepted above. */
+            fprintf( stderr, "Unrecognized argument %s\n", argv[i] );
+            fprintf( stderr, "%s [ -put ] [ -acc ] [ -lock ] [ -fence ] [ -pscw ] [ -barrier ] [ -maxsz msgsize ]\n", argv[0] );
+            MPI_Abort( MPI_COMM_WORLD, 1 );
+        }
+    }
+
+    MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
+    MPI_Comm_size( MPI_COMM_WORLD, &wsize );
+    /* Ring pattern: each rank targets its right neighbor and is exposed
+       to its left neighbor (wraps around at the ends). */
+    destRank = wrank + 1;
+    while (destRank >= wsize) destRank = destRank - wsize;
+    srcRank = wrank - 1;
+    if (srcRank < 0) srcRank += wsize;
+
+    /* Create groups for PSCW */
+    MPI_Comm_group( MPI_COMM_WORLD, &wgroup );
+    MPI_Group_incl( wgroup, 1, &destRank, &accessGroup );
+    MPI_Group_incl( wgroup, 1, &srcRank, &exposureGroup );
+    MPI_Group_free( &wgroup );
+
+    /* Window is sized for the compile-time MAX_COUNT, not the (possibly
+       smaller) runtime maxCount, so every run fits. */
+    arraysize = maxSz * MAX_COUNT;
+    arraybuffer = (int*)malloc( arraysize * sizeof(int) );
+    if (!arraybuffer) {
+        fprintf( stderr, "Unable to allocate %d words\n", arraysize );
+        MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+
+    MPI_Win_create( arraybuffer, arraysize*sizeof(int), (int)sizeof(int),
+		    MPI_INFO_NULL, MPI_COMM_WORLD, &win );
+
+    /* FIXME: we need a test on performance consistency.
+       The test needs to have both a relative growth limit and
+       an absolute limit.
+    */
+
+    if (maxCount > MAX_COUNT) {
+        fprintf( stderr, "MaxCount must not exceed %d\n", MAX_COUNT );
+        MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+
+    /* Each of the six sections below sweeps sz (elements per op) and cnt
+       (ops per sync epoch) in powers of two; rank 0 prints the timings. */
+    if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_ACC)) {
+	for (sz=1; sz<=maxSz; sz = sz + sz) {
+	    if (wrank == 0)
+		printf( "Accumulate with fence, %d elements\n", sz );
+	    cnt = 1;
+	    while (cnt <= maxCount) {
+		RunAccFence( win, destRank, cnt, sz, t );
+		if (wrank == 0) {
+		    PrintResults( cnt, t );
+		}
+		cnt = 2 * cnt;
+	    }
+	}
+    }
+
+    if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_ACC)) {
+	for (sz=1; sz<=maxSz; sz = sz + sz) {
+	    if (wrank == 0)
+		printf( "Accumulate with lock, %d elements\n", sz );
+	    cnt = 1;
+	    while (cnt <= maxCount) {
+		RunAccLock( win, destRank, cnt, sz, t );
+		if (wrank == 0) {
+		    PrintResults( cnt, t );
+		}
+		cnt = 2 * cnt;
+	    }
+	}
+    }
+
+    if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_PUT)) {
+	for (sz=1; sz<=maxSz; sz = sz + sz) {
+	    if (wrank == 0)
+		printf( "Put with fence, %d elements\n", sz );
+	    cnt = 1;
+	    while (cnt <= maxCount) {
+		RunPutFence( win, destRank, cnt, sz, t );
+		if (wrank == 0) {
+		    PrintResults( cnt, t );
+		}
+		cnt = 2 * cnt;
+	    }
+	}
+    }
+
+    if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_PUT)) {
+	for (sz=1; sz<=maxSz; sz = sz + sz) {
+	    if (wrank == 0)
+		printf( "Put with lock, %d elements\n", sz );
+	    cnt = 1;
+	    while (cnt <= maxCount) {
+		RunPutLock( win, destRank, cnt, sz, t );
+		if (wrank == 0) {
+		    PrintResults( cnt, t );
+		}
+		cnt = 2 * cnt;
+	    }
+	}
+    }
+
+    if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_PUT)) {
+	for (sz=1; sz<=maxSz; sz = sz + sz) {
+	    if (wrank == 0)
+		printf( "Put with pscw, %d elements\n", sz );
+	    cnt = 1;
+	    while (cnt <= maxCount) {
+		RunPutPSCW( win, destRank, cnt, sz,
+			    exposureGroup, accessGroup, t );
+		if (wrank == 0) {
+		    PrintResults( cnt, t );
+		}
+		cnt = 2 * cnt;
+	    }
+	}
+    }
+
+    if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_ACC)) {
+	for (sz=1; sz<=maxSz; sz = sz + sz) {
+	    if (wrank == 0)
+		printf( "Accumulate with pscw, %d elements\n", sz );
+	    cnt = 1;
+	    while (cnt <= maxCount) {
+		RunAccPSCW( win, destRank, cnt, sz,
+			    exposureGroup, accessGroup, t );
+		if (wrank == 0) {
+		    PrintResults( cnt, t );
+		}
+		cnt = 2 * cnt;
+	    }
+	}
+    }
+
+    MPI_Win_free( &win );
+
+    MPI_Group_free( &accessGroup );
+    MPI_Group_free( &exposureGroup );
+
+    MPI_Finalize();
+    return 0;
+}
+
+
+/* Time cnt MPI_Accumulate ops of sz ints each inside a fence epoch,
+   repeated MAX_RUNS times; per-run op and sync-completion times land in t.
+   NOTE(review): &one is a single int but the op sends sz MPI_INTs — this
+   over-reads the source when sz > 1; presumably tolerated because only
+   timing (not data) matters here. Same pattern in the other Run* helpers. */
+void RunAccFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
+{
+    int k, i, j, one = 1;
+
+    for (k=0; k<MAX_RUNS; k++) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_fence( 0, win );
+	j = 0;
+	t[k].startOp = MPI_Wtime();
+	for (i=0; i<cnt; i++) {
+	    MPI_Accumulate( &one, sz, MPI_INT, destRank,
+			    j, sz, MPI_INT, MPI_SUM, win );
+	    j += sz;
+	}
+	t[k].endOp = MPI_Wtime();
+	if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_fence( 0, win );
+	t[k].endSync = MPI_Wtime();
+    }
+}
+
+/* Time cnt MPI_Accumulate ops of sz ints inside a shared-lock epoch on
+   destRank, repeated MAX_RUNS times; results go in t. */
+void RunAccLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
+{
+    int k, i, j, one = 1;
+
+    for (k=0; k<MAX_RUNS; k++) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_lock( MPI_LOCK_SHARED, destRank, 0, win );
+	j = 0;
+	t[k].startOp = MPI_Wtime();
+	for (i=0; i<cnt; i++) {
+	    MPI_Accumulate( &one, sz, MPI_INT, destRank,
+			    j, sz, MPI_INT, MPI_SUM, win );
+	    j += sz;
+	}
+	t[k].endOp = MPI_Wtime();
+	if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_unlock( destRank, win );
+	t[k].endSync = MPI_Wtime();
+    }
+}
+
+/* Time cnt MPI_Put ops of sz ints inside a fence epoch, repeated
+   MAX_RUNS times; results go in t. */
+void RunPutFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
+{
+    int k, i, j, one = 1;
+
+    for (k=0; k<MAX_RUNS; k++) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_fence( 0, win );
+	j = 0;
+	t[k].startOp = MPI_Wtime();
+	for (i=0; i<cnt; i++) {
+	    MPI_Put( &one, sz, MPI_INT, destRank,
+		     j, sz, MPI_INT, win );
+	    j += sz;
+	}
+	t[k].endOp = MPI_Wtime();
+	if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_fence( 0, win );
+	t[k].endSync = MPI_Wtime();
+    }
+}
+
+/* Time cnt MPI_Put ops of sz ints inside a shared-lock epoch on destRank,
+   repeated MAX_RUNS times; results go in t. */
+void RunPutLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
+{
+    int k, i, j, one = 1;
+
+    for (k=0; k<MAX_RUNS; k++) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_lock( MPI_LOCK_SHARED, destRank, 0, win );
+	j = 0;
+	t[k].startOp = MPI_Wtime();
+	for (i=0; i<cnt; i++) {
+	    MPI_Put( &one, sz, MPI_INT, destRank, j, sz, MPI_INT, win );
+	    j += sz;
+	}
+	t[k].endOp = MPI_Wtime();
+	if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_unlock( destRank, win );
+	t[k].endSync = MPI_Wtime();
+    }
+}
+
+/* Time cnt MPI_Put ops of sz ints inside a post/start..complete/wait
+   (generalized active target) epoch, repeated MAX_RUNS times.  Each rank
+   both exposes its window (to exposureGroup) and accesses its neighbor
+   (accessGroup); endSync includes both complete and wait. */
+void RunPutPSCW( MPI_Win win, int destRank, int cnt, int sz, 
+		 MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] )
+{
+    int k, i, j, one = 1;
+
+    for (k=0; k<MAX_RUNS; k++) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_post( exposureGroup, 0, win );
+	MPI_Win_start( accessGroup, 0, win );
+	j = 0;
+	t[k].startOp = MPI_Wtime();
+	for (i=0; i<cnt; i++) {
+	    MPI_Put( &one, sz, MPI_INT, destRank, j, sz, MPI_INT, win );
+	    j += sz;
+	}
+	t[k].endOp = MPI_Wtime();
+	if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_complete( win );
+	MPI_Win_wait( win );
+	t[k].endSync = MPI_Wtime();
+    }
+}
+
+/* Same as RunPutPSCW but using MPI_Accumulate (MPI_SUM) instead of
+   MPI_Put. */
+void RunAccPSCW( MPI_Win win, int destRank, int cnt, int sz, 
+		 MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] )
+{
+    int k, i, j, one = 1;
+
+    for (k=0; k<MAX_RUNS; k++) {
+	MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_post( exposureGroup, 0, win );
+	MPI_Win_start( accessGroup, 0, win );
+	j = 0;
+	t[k].startOp = MPI_Wtime();
+	for (i=0; i<cnt; i++) {
+	    MPI_Accumulate( &one, sz, MPI_INT, destRank,
+			    j, sz, MPI_INT, MPI_SUM, win );
+	    j += sz;
+	}
+	t[k].endOp = MPI_Wtime();
+	if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
+	MPI_Win_complete( win );
+	MPI_Win_wait( win );
+	t[k].endSync = MPI_Wtime();
+    }
+}
+
+/* Report one line of results for a run of cnt operations: op time, sync
+   time, per-op times, and ops/sec rate.  Both the sum and the minimum
+   over MAX_RUNS are computed, but only the minima are printed (the sums
+   in d1/d2 are overwritten below) — minima are less noisy.  Output only
+   appears when the file-level `verbose` flag is set. */
+void PrintResults( int cnt, timing t[] )
+{
+    int k;
+    double d1=0, d2=0;
+    double minD1 = 1e10, minD2 = 1e10;
+    double tOp, tSync;
+    for (k=0; k<MAX_RUNS; k++) {
+	tOp   = t[k].endOp - t[k].startOp;
+	tSync = t[k].endSync - t[k].endOp;
+	d1 += tOp;
+	d2 += tSync;
+	if (tOp < minD1)   minD1 = tOp;
+	if (tSync < minD2) minD2 = tSync;
+    }
+    if (verbose) {
+	long rate = 0;
+	/* Use the minimum times because they are more stable - if timing
+	   accuracy is an issue, use the min over multiple trials */
+	d1 = minD1;
+	d2 = minD2;
+	/* d1 = d1 / MAX_RUNS; d2 = d2 / MAX_RUNS); */
+	if (d2 > 0) rate = (long)(cnt) / d2;
+	/* count, op, sync, op/each, sync/each, rate */
+	printf( "%d\t%e\t%e\t%e\t%e\t%ld\n", cnt, 
+		d1, d2, 
+		d1 / cnt, d2 / cnt, rate );
+    }
+}
--- /dev/null
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ * (C) 2012 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+
+/*
+ * Tests that basic optimizations are performed on vector of vector datatypes.
+ * As the "leaf" element is a large block (when properly optimized), the
+ * performance of an MPI datatype should be nearly as good (if not better)
+ * than manual packing (the threshold used in this test is *very* forgiving).
+ * This test may be run with one process.
+ *
+ * If PACK_IS_NATIVE is defined, MPI_Pack stores exactly the same bytes as the
+ * user would pack manually; in that case, there is a consistency check.
+ */
+
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "mpitestconf.h"
+
+#ifdef MPICH
+/* MPICH (as of 6/2012) packs the native bytes */
+#define PACK_IS_NATIVE
+#endif
+
+
+static int verbose = 0;
+
+/* Builds the same vector-of-vector layout two ways — (a) a vector of a
+   contiguous(6 x float) leaf and (b) an equivalent vector with the factor
+   of 6 folded into blocklength/stride — then compares MPI_Pack timing of
+   both against a hand-written memcpy pack.  Fails if MPI_Pack is more
+   than 4x slower than the manual copy. */
+int main( int argc, char **argv )
+{
+    int vcount = 16, vblock = vcount*vcount/2, vstride=2*vcount*vblock;
+    int v2stride, typesize, packsize, i, position, errs = 0;
+    char *inbuf, *outbuf, *outbuf2;
+    MPI_Datatype ft1type, ft2type, ft3type;
+    MPI_Datatype ftopttype;
+    MPI_Aint lb, extent;
+    double t0, t1;
+    double tpack, tmanual, tpackopt;
+    int ntry;
+
+    MPI_Init( &argc, &argv );
+
+    /* ft3type = hvector(2, vector(vcount, vblock, vstride, contig(6,float))) */
+    MPI_Type_contiguous( 6, MPI_FLOAT, &ft1type );
+    MPI_Type_size( ft1type, &typesize );
+    v2stride = vcount * vcount * vcount * vcount * typesize;
+    MPI_Type_vector( vcount, vblock, vstride, ft1type, &ft2type );
+    MPI_Type_create_hvector( 2, 1, v2stride, ft2type, &ft3type );
+    MPI_Type_commit( &ft3type );
+    MPI_Type_free( &ft1type );
+    MPI_Type_free( &ft2type );
+#if defined(MPICH) && defined(PRINT_DATATYPE_INTERNALS)
+    /* To use MPIDU_Datatype_debug to print the datatype internals,
+       you must configure MPICH with --enable-g=log */
+    if (verbose) {
+	printf( "Original datatype:\n" );
+	MPIDU_Datatype_debug( ft3type, 10 );
+    }
+#endif
+    /* The same type, but without using the contiguous type */
+    MPI_Type_vector( vcount, 6*vblock, 6*vstride, MPI_FLOAT, &ft2type );
+    MPI_Type_create_hvector( 2, 1, v2stride, ft2type, &ftopttype );
+    MPI_Type_commit( &ftopttype );
+    MPI_Type_free( &ft2type );
+#if defined(MPICH) && defined(PRINT_DATATYPE_INTERNALS)
+    if (verbose) {
+	printf( "\n\nMerged datatype:\n" );
+	MPIDU_Datatype_debug( ftopttype, 10 );
+    }
+#endif
+
+    MPI_Type_get_extent( ft3type, &lb, &extent );
+    MPI_Type_size( ft3type, &typesize );
+
+    MPI_Pack_size( 1, ft3type, MPI_COMM_WORLD, &packsize );
+
+    inbuf   = (char *)malloc( extent );
+    outbuf  = (char *)malloc( packsize );
+    outbuf2 = (char *)malloc( packsize );
+    if (!inbuf) {
+	fprintf( stderr, "Unable to allocate %ld for inbuf\n", (long)extent );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    if (!outbuf) {
+	fprintf( stderr, "Unable to allocate %ld for outbuf\n", (long)packsize );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    if (!outbuf2) {
+	fprintf( stderr, "Unable to allocate %ld for outbuf2\n", (long)packsize );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    /* Deterministic, printable-ish fill so a mismatch is diagnosable */
+    for (i=0; i<extent; i++) {
+	inbuf[i] = i & 0x7f;
+    }
+    position = 0;
+    /* Warm up the code and data */
+    MPI_Pack( inbuf, 1, ft3type, outbuf, packsize, &position, MPI_COMM_WORLD );
+
+    /* Pack using the vector of vector of contiguous; keep the best (min)
+       of 5 trials to reduce timer noise */
+    tpack = 1e12;
+    for (ntry = 0; ntry < 5; ntry++) {
+	position = 0;
+	t0 = MPI_Wtime();
+	MPI_Pack( inbuf, 1, ft3type, outbuf, packsize, &position, 
+		  MPI_COMM_WORLD );
+	t1 = MPI_Wtime() - t0;
+	if (t1 < tpack) tpack = t1;
+    }
+    MPI_Type_free( &ft3type );
+
+    /* Pack using vector of vector with big blocks (same type map) */
+    tpackopt = 1e12;
+    for (ntry = 0; ntry < 5; ntry++) {
+	position = 0;
+	t0 = MPI_Wtime();
+	MPI_Pack( inbuf, 1, ftopttype, outbuf, packsize, &position, 
+		  MPI_COMM_WORLD );
+	t1 = MPI_Wtime() - t0;
+	if (t1 < tpackopt) tpackopt = t1;
+    }
+    MPI_Type_free( &ftopttype );
+
+    /* User (manual) packing code.
+       Note that we exploit the fact that the vector type contains vblock
+       instances of a contiguous type of size 24, or equivalently a 
+       single block of 24*vblock bytes.
+    */
+    tmanual = 1e12;
+    for (ntry = 0; ntry < 5; ntry++) {
+	const char *ppe = (const char *)inbuf;
+	int k, j;
+	t0 = MPI_Wtime();
+	position = 0;
+	for (k=0; k<2; k++) {          /* hvector count; blocksize is 1 */
+	    const char *ptr = ppe;
+	    for (j=0; j<vcount; j++) { /* vector count */
+		memcpy( outbuf2 + position, ptr, 24*vblock );
+		ptr      += vstride * 24; /* 24 = extent of contig(6,float) */
+		position += 24*vblock;
+	    }
+	    ppe += v2stride;
+	}
+	t1 = MPI_Wtime() - t0;
+	if (t1 < tmanual) tmanual = t1;
+
+	/* Check on correctness */
+#ifdef PACK_IS_NATIVE
+	if (memcmp( outbuf, outbuf2, position ) != 0) {
+	    printf( "Panic - pack buffers differ\n" );
+	}
+#endif
+    }
+
+    if (verbose) {
+	printf( "Bytes packed = %d\n", position );
+	printf( "MPI_Pack time = %e, opt version = %e, manual pack time = %e\n", 
+		tpack, tpackopt, tmanual );
+    }
+
+    /* A factor of 4 is extremely generous, especially since the test suite
+       no longer builds any of the tests with optimization */
+    if (4 * tmanual < tpack) {
+	errs++;
+	printf( "MPI_Pack time = %e, manual pack time = %e\n", tpack, tmanual );
+	printf( "MPI_Pack time should be less than 4 times the manual time\n" );
+	printf( "For most informative results, be sure to compile this test with optimization\n" );
+    }
+    if (4 * tmanual < tpackopt) {
+	errs++;
+	printf( "MPI_Pack with opt = %e, manual pack time = %e\n", tpackopt, 
+		tmanual );
+	printf( "MPI_Pack time should be less than 4 times the manual time\n" );
+	printf( "For most informative results, be sure to compile this test with optimization\n" );
+    }
+    if (errs) {
+        printf( " Found %d errors\n", errs );
+    }
+    else {
+        printf( " No Errors\n" );
+    }
+
+    free( inbuf );
+    free( outbuf );
+    free( outbuf2 );
+
+    MPI_Finalize();
+    return 0;
+}
--- /dev/null
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ * (C) 2012 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+
+/*
+ * Tests that the performance of a struct that contains a vector type
+ * exploits the vector type correctly
+ *
+ * If PACK_IS_NATIVE is defined, MPI_Pack stores exactly the same bytes as the
+ * user would pack manually; in that case, there is a consistency check.
+ */
+
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "mpitestconf.h"
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+
+#ifdef MPICH
+/* MPICH (as of 6/2012) packs the native bytes */
+#define PACK_IS_NATIVE
+#endif
+
+
+static int verbose = 0;
+
+/* Times MPI_Pack of a struct {2 x int32, strided double vector} three
+   ways: as one struct type (via MPI_BOTTOM and absolute addresses), as
+   two separate packs (ints then vector), and as a hand-written copy.
+   Fails if the datatype paths are more than 4x slower than manual, or
+   the struct path more than 4x slower than the plain vector path. */
+int main( int argc, char **argv )
+{
+    int vcount, vstride;
+    int32_t counts[2];
+    int v2stride, typesize, packsize, i, position, errs = 0;
+    double *outbuf, *outbuf2;
+    double *vsource;
+    MPI_Datatype vtype, stype;
+    MPI_Aint lb, extent;
+    double t0, t1;
+    double tspack, tvpack, tmanual;
+    int ntry;
+    int blocklengths[2];
+    MPI_Aint displacements[2];
+    MPI_Datatype typesArray[2];
+
+    MPI_Init( &argc, &argv );
+
+    /* Create a struct consisting of a two 32-bit ints, followed by a 
+       vector of stride 3 but count 128k (less than a few MB of data area) */
+    vcount  = 128000;
+    vstride = 3;
+    MPI_Type_vector( vcount, 1, vstride, MPI_DOUBLE, &vtype );
+
+    /* Over-allocate slightly so the strided reads stay in bounds */
+    vsource = (double *)malloc( (vcount + 1) * (vstride + 1) * sizeof(double) );
+    if (!vsource) {
+	fprintf( stderr, "Unable to allocate vsource\n" );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    for (i=0; i<vcount*vstride; i++) {
+	vsource[i] = i;
+    }
+    /* Absolute addresses: the struct is packed from MPI_BOTTOM below */
+    blocklengths[0] = 2; MPI_Get_address( &counts[0], &displacements[0] );
+    blocklengths[1] = 1; MPI_Get_address( vsource, &displacements[1] );
+    if (verbose) {
+	printf( "%p = %p?\n", vsource, (void *)displacements[1] );
+    }
+    typesArray[0] = MPI_INT32_T;
+    typesArray[1] = vtype;
+    MPI_Type_create_struct( 2, blocklengths, displacements, typesArray, 
+			    &stype );
+    MPI_Type_commit( &stype );
+    MPI_Type_commit( &vtype );
+
+#if defined(MPICH) && defined(PRINT_DATATYPE_INTERNALS)
+    /* To use MPIDU_Datatype_debug to print the datatype internals,
+       you must configure MPICH with --enable-g=log */
+    if (verbose) {
+	printf( "Original struct datatype:\n" );
+	MPIDU_Datatype_debug( stype, 10 );
+    }
+#endif
+
+    MPI_Pack_size( 1, stype, MPI_COMM_WORLD, &packsize );
+    outbuf  = (double *)malloc( packsize );
+    outbuf2 = (double *)malloc( packsize );
+    if (!outbuf) {
+	fprintf( stderr, "Unable to allocate %ld for outbuf\n", (long)packsize );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    if (!outbuf2) {
+	fprintf( stderr, "Unable to allocate %ld for outbuf2\n", (long)packsize );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    position = 0;
+    /* Warm up the code and data */
+    MPI_Pack( MPI_BOTTOM, 1, stype, outbuf, packsize, &position, 
+	      MPI_COMM_WORLD );
+
+    /* Struct pack; keep the best (min) of 5 trials */
+    tspack = 1e12;
+    for (ntry = 0; ntry < 5; ntry++) {
+	position = 0;
+	t0 = MPI_Wtime();
+	MPI_Pack( MPI_BOTTOM, 1, stype, outbuf, packsize, &position, 
+		  MPI_COMM_WORLD );
+	t1 = MPI_Wtime() - t0;
+	if (t1 < tspack) tspack = t1;
+    }
+    MPI_Type_free( &stype );
+
+    /* An equivalent packing, using the 2 ints and the vector separately */
+    tvpack = 1e12;
+    for (ntry = 0; ntry < 5; ntry++) {
+	position = 0;
+	t0 = MPI_Wtime();
+	MPI_Pack( counts, 2, MPI_INT32_T, outbuf, packsize, &position, 
+		  MPI_COMM_WORLD );
+	MPI_Pack( vsource, 1, vtype, outbuf, packsize, &position, 
+		  MPI_COMM_WORLD );
+	t1 = MPI_Wtime() - t0;
+	if (t1 < tvpack) tvpack = t1;
+    }
+    MPI_Type_free( &vtype );
+
+    /* Note that we exploit the fact that the vector type contains vblock 
+       instances of a contiguous type of size 24, or a single block of 24*vblock
+       bytes.
+    */
+    tmanual = 1e12;
+    for (ntry = 0; ntry < 5; ntry++) {
+	const double * restrict ppe = (const double *)vsource;
+	double * restrict ppo = outbuf2;
+	int j;
+	t0 = MPI_Wtime();
+	position = 0;
+	/* The two int32s occupy the first double-sized slot of outbuf2 */
+	*(int32_t *)ppo          = counts[0];
+	*( ((int32_t *)ppo) + 1) = counts[1];
+	ppo++;
+	/* Some hand optimization because this file is not normally 
+	   compiled with optimization by the test suite */
+	/* NOTE(review): unrolled by 4 — assumes vcount is a multiple of 4
+	   (true for the hard-wired 128000) */
+	j = vcount;
+	while (j) {
+	    *ppo++ = *ppe;
+	    ppe += vstride;
+	    *ppo++ = *ppe;
+	    ppe += vstride;
+	    *ppo++ = *ppe;
+	    ppe += vstride;
+	    *ppo++ = *ppe;
+	    ppe += vstride;
+	    j -= 4;
+	}
+	position += (1 + vcount);
+	position *= sizeof(double);
+	t1 = MPI_Wtime() - t0;
+	if (t1 < tmanual) tmanual = t1;
+
+	/* Check on correctness */
+#ifdef PACK_IS_NATIVE
+	if (memcmp( outbuf, outbuf2, position ) != 0) {
+	    printf( "Panic(manual) - pack buffers differ\n" );
+	    for (j=0; j<8; j++) {
+		printf( "%d: %llx\t%llx\n", j, (long long unsigned)outbuf[j], 
+			(long long unsigned)outbuf2[j] );
+	    }
+	}
+#endif
+    }
+
+    if (verbose) {
+	printf( "Bytes packed = %d\n", position );
+	printf( "MPI_Pack time = %e (struct), = %e (vector), manual pack time = %e\n", 
+		tspack, tvpack, tmanual );
+    }
+
+    if (4 * tmanual < tspack) {
+	errs++;
+	printf( "MPI_Pack time using struct with vector = %e, manual pack time = %e\n", tspack, tmanual )
+;
+	printf( "MPI_Pack time should be less than 4 times the manual time\n" );
+	printf( "For most informative results, be sure to compile this test with optimization\n" );
+    }
+    if (4 * tmanual < tvpack) {
+	errs++;
+	printf( "MPI_Pack using vector = %e, manual pack time = %e\n", tvpack, 
+		tmanual );
+	printf( "MPI_Pack time should be less than 4 times the manual time\n" );
+	printf( "For most informative results, be sure to compile this test with optimization\n" );
+    }
+    if (4 * tvpack < tspack) {
+	errs++;
+	printf( "MPI_Pack using a vector = %e, using a struct with vector = %e\n", tvpack, tspack );
+	printf( "MPI_Pack time using vector should be about the same as the struct containing the vector\n" );
+	printf( "For most informative results, be sure to compile this test with optimization\n" );
+    }
+
+    if (errs) {
+        printf( " Found %d errors\n", errs );
+    }
+    else {
+        printf( " No Errors\n" );
+    }
+
+    free( vsource );
+    free( outbuf );
+    free( outbuf2 );
+
+    MPI_Finalize();
+    return 0;
+}
--- /dev/null
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ * (C) 2008 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
+#define SIZE 100000
+#define ITER 1000
+
+#define ERROR_MARGIN 0.5
+
+static int verbose = 0;
+
+/* Compares MPI_Gather performance with root 0 vs. root 1; a gather with a
+   non-zero root should not be substantially slower.  Rank 1's time is
+   shipped to rank 0, which reports an error if the ratio exceeds
+   1 + ERROR_MARGIN. */
+int main(int argc, char* argv[])
+{
+    char *sbuf, *rbuf;
+    int i, j;
+    double t1, t2, t, ts;
+    int rank, size;
+    MPI_Status status;
+
+    MPI_Init(&argc,&argv);
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    if (getenv("MPITEST_VERBOSE")) verbose = 1;
+
+    /* Allocate memory regions to communicate */
+    sbuf = (char*) malloc(SIZE);
+    rbuf = (char*) malloc(size * SIZE);
+
+    /* Touch the buffers to make sure they are allocated */
+    for (i = 0; i < SIZE; i++) sbuf[i] = '0';
+    for (i = 0; i < SIZE * size; i++) rbuf[i] = '0';
+
+    /* Time when rank 0 gathers the data */
+    MPI_Barrier(MPI_COMM_WORLD);
+    t1 = MPI_Wtime();
+    for (i = 0; i < ITER; i++) {
+	MPI_Gather(sbuf, SIZE, MPI_BYTE, rbuf, SIZE, MPI_BYTE, 0, MPI_COMM_WORLD);
+	MPI_Barrier(MPI_COMM_WORLD);
+    }
+    t2 = MPI_Wtime();
+    t = (t2-t1)/ITER;
+
+    /* Time when rank 1 gathers the data */
+    MPI_Barrier(MPI_COMM_WORLD);
+    t1 = MPI_Wtime();
+    for (j = 0; j < ITER; j++) {
+	MPI_Gather(sbuf, SIZE, MPI_BYTE, rbuf, SIZE, MPI_BYTE, 1, MPI_COMM_WORLD);
+	MPI_Barrier(MPI_COMM_WORLD);
+    }
+    t2 = MPI_Wtime();
+    ts = (t2-t1)/ITER;
+
+    /* Ship rank 1's timing to rank 0 for the comparison */
+    if (rank == 1)
+	MPI_Send(&ts, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
+    if (rank == 0)
+	MPI_Recv(&ts, 1, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &status);
+
+    /* Print out the results */
+    if (!rank) {
+	if ((ts / t) > (1 + ERROR_MARGIN)) { /* With ERROR_MARGIN = 0.5, more than 50% slower is an error */
+	    printf("%.3f\t%.3f\n", 1000000.0 * ts, 1000000.0 * t);
+	    printf("Too much difference in performance\n");
+	}
+	else printf(" No Errors\n");
+    }
+
+    MPI_Finalize();
+
+    return 0;
+}
--- /dev/null
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ * (C) 2006 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+
+/* This program provides a simple test of send-receive performance between
+ two (or more) processes. This sometimes called head-to-head or
+ ping-ping test, as both processes send at the same time.
+*/
+
+#include "mpi.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+#define MAXTESTS 32
+#define ERROR_MARGIN 1.0 /* FIXME: This number is pretty much randomly chosen */
+
+static int verbose = 0;
+
+/* Head-to-head send/receive benchmark for neighbor pairs (odd rank pairs
+   with the even rank below it).  Three phases — Irecv+Send, Sendrecv, and
+   classic ping-pong — each over message sizes 1..2^19 bytes.  Afterwards
+   rank 0 flags a performance error whenever a doubling of the message
+   size more than (2+ERROR_MARGIN)x-es the time for large messages. */
+int main( int argc, char *argv[] )
+{
+    int wsize, wrank, partner, len, maxlen, k, reps, repsleft;
+    double t1;
+    MPI_Request rreq;
+    char *rbuf, *sbuf;
+    double times[3][MAXTESTS];
+
+    MPI_Init( &argc, &argv );
+    if (getenv("MPITEST_VERBOSE")) verbose = 1;
+
+    MPI_Comm_size( MPI_COMM_WORLD, &wsize );
+    MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
+
+    if (wsize < 2) {
+	fprintf( stderr, "This program requires at least 2 processes\n" );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+    /* Set partner based on whether rank is odd or even */
+    if (wrank & 0x1) {
+	partner = wrank - 1;
+    }
+    else if (wrank < wsize - 1) {
+	partner = wrank + 1;
+    }
+    else 
+	/* Handle wsize odd */
+	partner = MPI_PROC_NULL;
+
+    /* Allocate and initialize buffers */
+    maxlen = 1024*1024;
+    rbuf = (char *)malloc( maxlen );
+    sbuf = (char *)malloc( maxlen );
+    if (!rbuf || !sbuf) {
+	fprintf( stderr, "Could not allocate %d byte buffers\n", maxlen );
+	MPI_Abort( MPI_COMM_WORLD, 2 );
+    }
+    for (k=0; k<maxlen; k++) {
+	rbuf[k] = 0;
+	sbuf[k] = 0;
+    }
+
+    MPI_Barrier( MPI_COMM_WORLD );
+
+    /* Test Irecv and send, head to head */
+    if (wrank == 0 && verbose) {
+	printf( "Irecv-send\n" );
+	printf( "len\ttime    \trate\n" );
+    }
+
+    /* Send powers of 2 bytes */
+    len = 1;
+    for (k=0; k<20; k++) {
+	/* We use a simple linear form for the number of tests to 
+	   reduce the impact of the granularity of the timer */
+	reps     = 50-k;
+	repsleft = reps;
+	/* Make sure that both processes are ready to start */
+	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_BYTE, partner, 0, 
+		      MPI_BOTTOM, 0, MPI_BYTE, partner, 0, MPI_COMM_WORLD, 
+		      MPI_STATUS_IGNORE );
+	t1 = MPI_Wtime();
+	while (repsleft--) {
+	    MPI_Irecv( rbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD, &rreq );
+	    MPI_Send( sbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD );
+	    MPI_Wait( &rreq, MPI_STATUS_IGNORE );
+	}
+	t1 = MPI_Wtime() - t1;
+	times[0][k] = t1 / reps;
+	if (wrank == 0) {
+	    t1 = t1 / reps;
+	    if (t1 > 0) {
+		double rate;
+		/* NOTE(review): `rate` is computed but never printed; the
+		   printed third column is len/t1 with t1 already in usec */
+		rate = (len / t1) / 1.e6;
+		t1   = t1 * 1.e6;
+		if (verbose) 
+		    printf( "%d\t%g\t%g\n", len, t1, len/t1 );
+	    }
+	    else {
+		t1   = t1 * 1.e6;
+		if (verbose)
+		    printf( "%d\t%g\tINF\n", len, t1 );
+	    }
+	    if (verbose)
+		fflush( stdout );
+	}
+
+	len *= 2;
+    }
+
+    MPI_Barrier( MPI_COMM_WORLD );
+
+    /* Test Sendrecv, head to head */
+    if (wrank == 0 && verbose) {
+	printf( "Sendrecv\n" );
+	printf( "len\ttime (usec)\trate (MB/s)\n" );
+    }
+
+    /* Send powers of 2 bytes */
+    len = 1;
+    for (k=0; k<20; k++) {
+	/* We use a simple linear form for the number of tests to 
+	   reduce the impact of the granularity of the timer */
+	reps     = 50-k;
+	repsleft = reps;
+	/* Make sure that both processes are ready to start */
+	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_BYTE, partner, 0, 
+		      MPI_BOTTOM, 0, MPI_BYTE, partner, 0, MPI_COMM_WORLD, 
+		      MPI_STATUS_IGNORE );
+	t1 = MPI_Wtime();
+	while (repsleft--) {
+	    MPI_Sendrecv( sbuf, len, MPI_BYTE, partner, k, 
+			  rbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD, 
+			  MPI_STATUS_IGNORE );
+	}
+	t1 = MPI_Wtime() - t1;
+	times[1][k] = t1 / reps;
+	if (wrank == 0) {
+	    t1 = t1 / reps;
+	    if (t1 > 0) {
+		double rate;
+		rate = (len / t1) / 1.e6;
+		t1   = t1 * 1.e6;
+		if (verbose)
+		    printf( "%d\t%g\t%g\n", len, t1, len/t1 );
+	    }
+	    else {
+		t1   = t1 * 1.e6;
+		if (verbose)
+		    printf( "%d\t%g\tINF\n", len, t1 );
+	    }
+	    if (verbose)
+		fflush( stdout );
+	}
+
+	len *= 2;
+    }
+
+    MPI_Barrier( MPI_COMM_WORLD );
+
+    /* Test Send/recv, ping-pong */
+    if (wrank == 0 && verbose) {
+	printf( "Pingpong\n" );
+	printf( "len\ttime (usec)\trate (MB/s)\n" );
+    }
+
+    /* Send powers of 2 bytes */
+    len = 1;
+    for (k=0; k<20; k++) {
+	/* We use a simple linear form for the number of tests to 
+	   reduce the impact of the granularity of the timer */
+	reps     = 50-k;
+	repsleft = reps;
+	/* Make sure that both processes are ready to start */
+	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_BYTE, partner, 0, 
+		      MPI_BOTTOM, 0, MPI_BYTE, partner, 0, MPI_COMM_WORLD, 
+		      MPI_STATUS_IGNORE );
+	t1 = MPI_Wtime();
+	while (repsleft--) {
+	    if (wrank & 0x1) {
+		MPI_Send( sbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD );
+		MPI_Recv( rbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD, 
+			  MPI_STATUS_IGNORE );
+	    }
+	    else {
+		MPI_Recv( rbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD, 
+			  MPI_STATUS_IGNORE );
+		MPI_Send( sbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD );
+	    }
+	}
+	t1 = MPI_Wtime() - t1;
+	times[2][k] = t1 / reps;
+	if (wrank == 0) {
+	    t1 = t1 / reps;
+	    if (t1 > 0) {
+		double rate;
+		rate = (len / t1) / 1.e6;
+		t1   = t1 * 1.e6;
+		if (verbose)
+		    printf( "%d\t%g\t%g\n", len, t1, len/t1 );
+	    }
+	    else {
+		t1   = t1 * 1.e6;
+		if (verbose)
+		    printf( "%d\t%g\tINF\n", len, t1 );
+	    }
+	    if (verbose)
+		fflush( stdout );
+	}
+
+	len *= 2;
+    }
+
+
+    /* At this point, we could optionally analyze the results and report 
+       success or failure based on some criteria, such as near monotone
+       increases in bandwidth.  This test was created because of a 
+       fall-off in performance noted in the ch3:sock device:channel */
+
+    if (wrank == 0) {
+	int nPerfErrors = 0;
+	len = 1;
+	for (k=0; k<20; k++) {
+	    double T0,T1,T2;
+	    T0 = times[0][k] * 1.e6;
+	    T1 = times[1][k] * 1.e6;
+	    T2 = times[2][k] * 1.e6;
+	    if (verbose)
+		printf( "%d\t%12.2f\t%12.2f\t%12.2f\n", len, T0, T1, T2 );
+	    /* Lets look at long messages only */
+	    if (k > 10) {
+		double T0Old, T1Old, T2Old;
+		T0Old = times[0][k-1] * 1.0e6;
+		T1Old = times[1][k-1] * 1.0e6;
+		T2Old = times[2][k-1] * 1.0e6;
+		/* Doubling len should at worst roughly double the time;
+		   (2+ERROR_MARGIN)x or more counts as a perf error */
+		if (T0 > (2+ERROR_MARGIN) * T0Old) {
+		    nPerfErrors++;
+		    if (verbose)
+			printf( "Irecv-Send:\t%d\t%12.2f\t%12.2f\n", len, T0Old, T0 );
+		}
+		if (T1 > (2+ERROR_MARGIN) * T1Old) {
+		    nPerfErrors++;
+		    if (verbose)
+			printf( "Sendrecv:\t%d\t%12.2f\t%12.2f\n", len, T1Old, T1 );
+		}
+		if (T2 > (2+ERROR_MARGIN) * T2Old) {
+		    nPerfErrors++;
+		    if (verbose)
+			printf( "Pingpong:\t%d\t%12.2f\t%12.2f\n", len, T2Old, T2 );
+		}
+	    }
+	    len *= 2;
+	}
+	if (nPerfErrors > 8) { 
+	    /* Allow for 1-2 errors for eager-rendezvous shifting
+	     * point and cache effects. There should be a better way
+	     * of doing this. */
+	    printf( " Found %d performance errors\n", nPerfErrors );
+	}
+	else {
+	    printf( " No Errors\n" );
+	}
+	fflush( stdout );
+    }
+
+    free( sbuf );
+    free( rbuf );
+
+    MPI_Finalize();
+
+    return 0;
+}
--- /dev/null
+transp-datatype 2
+sendrecvl 2
+twovec 1 xfail=ticket1788
+#Need MPI_Pack
+#dtpack 1 xfail=ticket1789
+#nestvec 1 xfail=ticket1788
+#nestvec2 1 xfail=ticket1788
+#indexperf 1 xfail=ticket1788
+non_zero_root 4
+timer 1
+# The commcreatep test looks at how communicator creation scales with group
+# size.
+commcreatep 64
--- /dev/null
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ * (C) 2006 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+
+/*
+ * Check that the timer produces monotone nondecreasing times and that
+ * the Tick is reasonable
+ */
+
+#include "mpi.h"
+#include <stdio.h>
+#include "mpitest.h"
+
+static int verbose = 0;
+
+#define MAX_TIMER_TEST 5000
+
+/* Sanity-checks the MPI timer: MPI_Wtick must return a constant, and
+   successive MPI_Wtime calls must be nondecreasing.  Only the constant-
+   tick check counts as an error; the Wtime statistics are informational
+   (printed when verbose). */
+int main(int argc, char* argv[])
+{
+    double t1[MAX_TIMER_TEST], tick[MAX_TIMER_TEST], tickval;
+    double minDiff, maxDiff, diff;
+    int i, nZeros = 0;
+    int errs = 0;
+
+    MTest_Init(&argc,&argv);
+
+    /* Sample the clock back-to-back, then sample the tick separately so
+       the two loops don't perturb each other */
+    for (i=0; i<MAX_TIMER_TEST; i++) {
+	t1[i] = MPI_Wtime();
+    }
+
+    for (i=0; i<MAX_TIMER_TEST; i++) {
+	tick[i] = MPI_Wtick();
+    }
+
+    /* Look at the values */
+    /* Look at the tick */
+    tickval = MPI_Wtick();
+    for (i=0; i<MAX_TIMER_TEST; i++) {
+	if (tickval != tick[i]) {
+	    fprintf( stderr, "Nonconstant value for MPI_Wtick: %e != %e\n",
+		     tickval, tick[i] );
+	    errs ++;
+	}
+    }
+
+    /* Look at the timer: min/max of consecutive differences, counting
+       exact-zero differences separately */
+    minDiff = 1.e20;
+    maxDiff = -1.0;
+    nZeros  = 0;
+    for (i=1; i<MAX_TIMER_TEST; i++) {
+	diff = t1[i] - t1[i-1];
+	if (diff == 0.0) nZeros++;
+	else if (diff < minDiff) minDiff = diff;
+	if (diff > maxDiff) maxDiff = diff;
+    }
+
+    /* Are the time diff values and tick values consistent */
+    if (verbose) {
+	printf( "Tick = %e, timer range = [%e,%e]\n", tickval, minDiff, 
+		maxDiff );
+	if (nZeros) printf( "Wtime difference was 0 %d times\n", nZeros );
+    }
+
+    MTest_Finalize(errs);
+    MPI_Finalize();
+    
+    return 0;
+}
--- /dev/null
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ * (C) 2006 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+/* modified 01/23/2011 by Jim Hoekstra - ISU
+ * changed test to follow mtest_init/mtest_finalize convention
+ * The following changes are based on suggestions from Chris Sadlo:
+ * variable row changed to col.
+ * manual transpose - code added to perform 'swap'.
+ * MPI_Send/MPI_Recv involving xpose changed.
+ */
+
+/* This is based on an example in the MPI standard and a bug report submitted
+ by Alexandr Konovalov of Intel */
+
+#include "mpi.h"
+#include <stdio.h>
+#include "mpitest.h"
+
+#define SIZE 100
+#define ITER 100
+
+/* Two-process test: rank 0 sends a SIZE x SIZE matrix; rank 1 receives it
+   (a) with a transpose datatype, (b) as raw bytes, and (c) as raw bytes
+   followed by an in-place manual transpose.  If the datatype receive is
+   more than twice as slow as the manual-transpose path, the datatype
+   handling has a performance problem. */
+int main(int argc, char* argv[])
+{
+    int i, j, k;
+    static double a[SIZE][SIZE],b[SIZE][SIZE];
+    double t1,t2,t,ts,tst;
+    double temp;
+    int myrank, mysize, errs = 0;
+    MPI_Status status;
+    MPI_Aint sizeofreal;
+
+    MPI_Datatype col, xpose;
+    
+    MTest_Init( &argc, &argv );
+    MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
+    MPI_Comm_size( MPI_COMM_WORLD, &mysize );
+    if (mysize != 2) {
+	fprintf( stderr, "This test must be run with 2 processes\n" );
+	MPI_Abort( MPI_COMM_WORLD, 1 );
+    }
+
+    /* NOTE(review): MPI_Type_extent and MPI_Type_hvector are MPI-1
+       functions deprecated in MPI-2 and removed in MPI-3; modern code
+       would use MPI_Type_get_extent / MPI_Type_create_hvector. */
+    MPI_Type_extent(MPI_DOUBLE, &sizeofreal);
+
+    /* xpose receives a row-major matrix transposed: col picks one column,
+       hvector lays SIZE columns side by side one double apart */
+    MPI_Type_vector(SIZE, 1, SIZE, MPI_DOUBLE, &col);
+    MPI_Type_hvector(SIZE, 1, sizeofreal, col, &xpose);
+    MPI_Type_commit(&xpose);
+
+    /* Preset the arrays so that they're in memory */
+    for (i=0; i<SIZE; i++) 
+	for (j=0; j<SIZE; j++) {
+	    a[i][j]=0;
+	    b[i][j]=0;
+	}
+    a[SIZE-1][0] = 1;
+    
+    /* Time the transpose example */
+    MPI_Barrier(MPI_COMM_WORLD);
+    t1=MPI_Wtime();
+    for(i=0;i< ITER; i++) 
+    {
+	if(myrank==0) 
+	    MPI_Send(&a[0][0],SIZE*SIZE,MPI_DOUBLE,1,0,MPI_COMM_WORLD);
+	else 
+	    MPI_Recv(&b[0][0],1,xpose,0,0,MPI_COMM_WORLD,&status);
+    }
+    t2=MPI_Wtime();
+    t=(t2-t1)/ITER;
+    
+    /* Time sending the same amount of data, but without the transpose */
+    MPI_Barrier(MPI_COMM_WORLD);
+    t1=MPI_Wtime();
+    for(i=0; i< ITER; i++){
+	if(myrank==0) 
+	{
+	    MPI_Send(&a[0][0],sizeof(a),MPI_BYTE,1,0,MPI_COMM_WORLD);
+	}
+	else {
+	    MPI_Recv(&b[0][0],sizeof(b),MPI_BYTE,0,0,MPI_COMM_WORLD,&status);
+	}
+    }
+    t2=MPI_Wtime();
+    ts=(t2-t1)/ITER;
+
+    /* Time sending the same amount of data, with the transpose done 
+       as a separate step */
+    MPI_Barrier(MPI_COMM_WORLD);
+    t1=MPI_Wtime();
+    for(k=0; k< ITER; k++){
+	if(myrank==0) 
+	{
+	    MPI_Send(&a[0][0],sizeof(a),MPI_BYTE,1,0,MPI_COMM_WORLD);
+	}
+	else {
+	    MPI_Recv(&b[0][0],sizeof(b),MPI_BYTE,0,0,MPI_COMM_WORLD,&status);
+	    /* In-place transpose: swap across the diagonal (j starts at i
+	       so each pair is swapped exactly once) */
+	    for(i=0;i<SIZE;i++) 
+		for(j=i;j<SIZE;j++) {
+		    temp=b[j][i];
+		    b[j][i]=b[i][j];
+		    b[i][j]=temp;
+		}
+	}
+    }
+    t2=MPI_Wtime();
+    tst=(t2-t1)/ITER;
+
+    /* Print out the results */
+    if (myrank == 1) {
+	/* if t and tst are too different, then there is a performance 
+	   problem in the handling of the datatypes */
+	
+	if (t > 2 * tst) {
+	    errs ++;
+	    fprintf( stderr, "Transpose time with datatypes is more than twice time without datatypes\n" );
+	    fprintf( stderr, "%f\t%f\t%f\n", t, ts, tst );
+	}
+    }
+
+    MPI_Type_free(&col);
+    MPI_Type_free(&xpose);
+    
+    MTest_Finalize( errs );
+    MPI_Finalize();
+    return 0;
+}
--- /dev/null
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ * (C) 2001 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "mpi.h"
+
+/* Make sure datatype creation is independent of data size */
+
+#define SKIP 4
+#define NUM_SIZES 16
+#define FRACTION 0.2
+
+/* Don't make the number of loops too high; we create so many
+ * datatypes before trying to free them */
+#define LOOPS 1024
+
+/* Verify that derived-datatype creation cost is (nearly) independent of
+ * the described data size: build/commit/free LOOPS vector+hvector pairs
+ * for doubling matrix sizes and compare each pass's timing against the
+ * mean; a pass deviating by more than FRACTION of the mean is an error. */
+int main(int argc, char *argv[])
+{
+ MPI_Datatype column[LOOPS], xpose[LOOPS];
+ double t[NUM_SIZES], ttmp, tmean;
+ int size;
+ int i, j, errs = 0, nrows, ncols;
+
+ MPI_Init(&argc, &argv);
+
+ tmean = 0;
+ size = 1;
+ /* First SKIP passes are warm-up: they are not timed and size stays 1;
+  * the remaining NUM_SIZES passes double size each time. */
+ for (i = 0; i < NUM_SIZES + SKIP; i++) {
+ nrows = ncols = size;
+
+ ttmp = MPI_Wtime();
+
+ /* Timed region: create LOOPS transpose-style datatypes for an
+  * nrows x ncols grid of ints. */
+ for (j = 0; j < LOOPS; j++) {
+ MPI_Type_vector(nrows, 1, ncols, MPI_INT, &column[j]);
+ MPI_Type_hvector(ncols, 1, sizeof(int), column[j], &xpose[j]);
+ MPI_Type_commit(&xpose[j]);
+ }
+
+ if (i >= SKIP) {
+ t[i - SKIP] = MPI_Wtime() - ttmp;
+ tmean += t[i - SKIP];
+ }
+
+ /* Freeing happens after the timing is captured, so it is not measured. */
+ for (j = 0; j < LOOPS; j++) {
+ MPI_Type_free(&xpose[j]);
+ MPI_Type_free(&column[j]);
+ }
+
+ if (i >= SKIP)
+ size *= 2;
+ }
+ tmean /= NUM_SIZES;
+
+ /* Now, analyze the times to see that they are nearly independent
+ * of size */
+ for (i = 0; i < NUM_SIZES; i++) {
+ /* The difference between the value and the mean is more than
+ * a "FRACTION" of mean. */
+ if (fabs(t[i] - tmean) > (FRACTION * tmean))
+ errs++;
+ }
+
+ if (errs) {
+ fprintf(stderr, "too much difference in performance: ");
+ for (i = 0; i < NUM_SIZES; i++)
+ fprintf(stderr, "%.3f ", t[i] * 1e6); /* report in microseconds */
+ fprintf(stderr, "\n");
+ }
+ else
+ printf(" No Errors\n");
+
+ MPI_Finalize();
+ return 0;
+}