+ # Put the include/smpi headers ahead of every other include path (BEFORE),
+ # then add the include/ directory that sits one level above this one.
include_directories(BEFORE "${CMAKE_HOME_DIRECTORY}/include/smpi")
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../include/")
- foreach(file commcreatep non_zero_root sendrecvl timer transp-datatype twovec)
- #not compiled files dtpack indexperf manyrma nestvec2 nestvec
+ foreach(file commcreatep non_zero_root sendrecvl timer transp-datatype twovec dtpack indexperf nestvec2 nestvec)
+ # Each listed name builds <name>.c into an executable of the same name,
+ # linked against simgrid and mtest_c. Still not compiled: manyrma.
add_executable(${file} ${file}.c)
target_link_libraries(${file} simgrid mtest_c)
endforeach()
*/
/*
- * This code is intended to test the trace overhead when using an
+ * This code is intended to test the trace overhead when using an
* MPI tracing package. To perform the test, follow these steps:
*
* 1) Run with the verbose mode selected to determine the delay argument
* to use in subsequent tests:
* mpiexec -n 4096 allredtrace -v
- * Assume that the computed delay count is 6237; that value is used in
+ * Assume that the computed delay count is 6237; that value is used in
* the following.
*
* 2) Run with an explicit delay count, without tracing enabled:
* mpiexec -n 4096 allredtrace -delaycount 6237
*
* 3) Build allredtrace with tracing enabled, then run:
* mpiexec -n 4096 allredtrace -delaycount 6237
*
- * Compare the total times. The tracing version should take slightly
+ * Compare the total times. The tracing version should take slightly
* longer but no more than, for example, 15%.
*/
#include "mpi.h"
static int verbose = 0;
static int lCount = 0;
-void Delay( int );
-void SetupDelay( double );
+void Delay(int);
+void SetupDelay(double);
-int main( int argc, char *argv[] )
+/*
+ * Time nLoop (100) rounds of MPI_Allreduce, each followed by Delay(lCount),
+ * and print the elapsed time on rank 0.
+ *
+ * Command line:
+ *   -delaycount N   use N as the delay count instead of calibrating one
+ *   -v              verbose output
+ * Any other argument, or -delaycount without a value, is reported on stderr
+ * and ends the process with exit(1).  When no delay count is given (lCount
+ * stays 0), SetupDelay() picks one for usecPerCall microseconds.
+ */
+int main(int argc, char *argv[])
{
double usecPerCall = 100;
double t, t1, tsum;
int i, nLoop = 100;
int rank;
- MPI_Init( &argc, &argv );
- MPI_Comm_rank( MPI_COMM_WORLD, &rank );
-
- /* Process arguments. We allow the delay count to be set from the
- command line to ensure reproducibility*/
- for (i=1; i<argc; i++) {
- if (strcmp( argv[i], "-delaycount" ) == 0) {
- i++;
- lCount = atoi( argv[i] );
- }
- else if (strcmp( argv[i], "-v" ) == 0) {
- verbose = 1;
- }
- else {
- fprintf( stderr, "Unrecognized argument %s\n", argv[i] );
- exit(1);
- }
+ MPI_Init(&argc, &argv);
+ MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+ /* Process arguments. We allow the delay count to be set from the
+ * command line to ensure reproducibility */
+ for (i = 1; i < argc; i++) {
+ if (strcmp(argv[i], "-delaycount") == 0) {
+ /* The option needs a value; argv[argc] is NULL, so check before
+ * handing it to atoi */
+ if (i + 1 >= argc) {
+ fprintf(stderr, "Missing value for -delaycount\n");
+ exit(1);
+ }
+ i++;
+ lCount = atoi(argv[i]);
+ }
+ else if (strcmp(argv[i], "-v") == 0) {
+ verbose = 1;
+ }
+ else {
+ fprintf(stderr, "Unrecognized argument %s\n", argv[i]);
+ exit(1);
+ }
}
if (lCount == 0) {
- SetupDelay( usecPerCall );
+ SetupDelay(usecPerCall);
}
-
- MPI_Barrier( MPI_COMM_WORLD );
+
+ /* t1 is only the send buffer of the timed MPI_Allreduce; give it a
+ * defined value instead of reducing an indeterminate one */
+ t1 = 0.0;
+ MPI_Barrier(MPI_COMM_WORLD);
t = MPI_Wtime();
- for (i=0; i<nLoop; i++) {
- MPI_Allreduce( &t1, &tsum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
- Delay( lCount );
+ for (i = 0; i < nLoop; i++) {
+ MPI_Allreduce(&t1, &tsum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+ Delay(lCount);
}
t = MPI_Wtime() - t;
- MPI_Barrier( MPI_COMM_WORLD );
+ MPI_Barrier(MPI_COMM_WORLD);
if (rank == 0) {
- printf( "For delay count %d, time is %e\n", lCount, t );
+ printf("For delay count %d, time is %e\n", lCount, t);
}
-
- MPI_Barrier( MPI_COMM_WORLD );
+
+ MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
-
+
return 0;
}
-void SetupDelay( double usec )
+/*
+ * Calibrate the global lCount so that one Delay(lCount) call takes roughly
+ * usec microseconds as measured with MPI_Wtime.
+ *
+ * First nLoop is doubled until nLoop back-to-back MPI_Wtime calls take at
+ * least 100 * MPI_Wtick() seconds, or nLoop has reached 100000.  Then
+ * lCount, starting at 128, is halved, doubled or rescaled until the average
+ * time of one Delay call lies between sec and 2 * sec, or until the search
+ * changes direction.  Progress is printed when verbose is set.
+ */
+void SetupDelay(double usec)
{
double t, tick;
double sec = 1.0e-6 * usec;
int nLoop, i, direction;
-
+
/* Compute the number of times to run the tests to get an accurate
- number given the timer resolution. */
+ * number given the timer resolution. */
nLoop = 1;
tick = 100 * MPI_Wtick();
do {
- nLoop = 2 * nLoop;
- t = MPI_Wtime();
- for (i=0; i<nLoop; i++) {
- MPI_Wtime();
- }
- t = MPI_Wtime() - t;
+ nLoop = 2 * nLoop;
+ t = MPI_Wtime();
+ for (i = 0; i < nLoop; i++) {
+ MPI_Wtime();
+ }
+ t = MPI_Wtime() - t;
}
- while ( t < tick && nLoop < 100000 );
+ while (t < tick && nLoop < 100000);
+
+ if (verbose)
+ printf("nLoop = %d\n", nLoop);
- if (verbose) printf( "nLoop = %d\n", nLoop );
-
/* Start with an estimated count */
lCount = 128;
direction = 0;
while (1) {
- t = MPI_Wtime();
- for (i=0; i<nLoop; i++) {
- Delay( lCount );
- }
- t = MPI_Wtime() - t;
- t = t / nLoop;
- if (verbose) printf( "lCount = %d, time = %e\n", lCount, t );
- if (t > 10 * tick) nLoop = nLoop / 2;
-
- /* Compare measured delay */
- if (t > 2*sec) {
- lCount = lCount / 2;
- if (direction == 1) break;
- direction = -1;
- }
- else if (t < sec / 2) {
- lCount = lCount * 2;
- if (direction == -1) break;
- direction = 1;
- }
- else if (t < sec) {
- /* sec/2 <= t < sec , so estimate the lCount to hit sec */
- lCount = (sec/t) * lCount;
- }
- else
- break;
+ t = MPI_Wtime();
+ for (i = 0; i < nLoop; i++) {
+ Delay(lCount);
+ }
+ t = MPI_Wtime() - t;
+ /* t is now the average time of a single Delay(lCount) call */
+ t = t / nLoop;
+ if (verbose)
+ printf("lCount = %d, time = %e\n", lCount, t);
+ if (t > 10 * tick)
+ nLoop = nLoop / 2;
+
+ /* Compare measured delay */
+ if (t > 2 * sec) {
+ lCount = lCount / 2;
+ /* the previous step doubled lCount: stop once the direction flips */
+ if (direction == 1)
+ break;
+ direction = -1;
+ }
+ else if (t < sec / 2) {
+ lCount = lCount * 2;
+ /* the previous step halved lCount: stop once the direction flips */
+ if (direction == -1)
+ break;
+ direction = 1;
+ }
+ else if (t < sec) {
+ /* sec/2 <= t < sec , so estimate the lCount to hit sec */
+ lCount = (sec / t) * lCount;
+ }
+ else
+ break;
}
- if (verbose) printf( "lCount = %d, t = %e\n", lCount, t );
+ if (verbose)
+ printf("lCount = %d, t = %e\n", lCount, t);
/* Should coordinate with the other processes - take the max? */
}
-double delayCounter = 0;
-void Delay( int count )
+/* Accumulator written by Delay(); declared volatile so the compiler has to
+ * perform every update instead of optimizing the busy loop away. */
+volatile double delayCounter = 0;
+/* Busy-wait: reset delayCounter, then add 2.73 to it count times. */
+void Delay(int count)
{
int i;
delayCounter = 0.0;
- for (i=0; i<count; i++) {
- delayCounter += 2.73;
+ for (i = 0; i < count; i++) {
+ delayCounter += 2.73;
}
}
#define MAX_LOG_WSIZE 31
#define MAX_LOOP 20
-int main( int argc, char *argv[] )
+int main(int argc, char *argv[])
{
MPI_Group gworld, g;
- MPI_Comm comm, newcomm[MAX_LOOP];
- int wsize, wrank, range[1][3], errs=0;
- double t[MAX_LOG_WSIZE], tf;
- int maxi, i, k, ts, gsize[MAX_LOG_WSIZE];
+ MPI_Comm comm, newcomm[MAX_LOOP];
+ int wsize, wrank, range[1][3], errs = 0;
+ double t[MAX_LOG_WSIZE], tf;
+ int maxi, i, k, ts, gsize[MAX_LOG_WSIZE];
- MTest_Init( &argc, &argv );
+ MTest_Init(&argc, &argv);
- MPI_Comm_size( MPI_COMM_WORLD, &wsize );
- MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
+ MPI_Comm_size(MPI_COMM_WORLD, &wsize);
+ MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
if (wrank == 0)
- MTestPrintfMsg( 1, "size\ttime\n" );
+ MTestPrintfMsg(1, "size\ttime\n");
- MPI_Comm_group( MPI_COMM_WORLD, &gworld );
+ MPI_Comm_group(MPI_COMM_WORLD, &gworld);
ts = 1;
comm = MPI_COMM_WORLD;
- for (i=0; ts<=wsize; i++, ts = ts + ts) {
- /* Create some groups with at most ts members */
- range[0][0] = ts-1;
- range[0][1] = 0;
- range[0][2] = -1;
- MPI_Group_range_incl( gworld, 1, range, &g );
-
- MPI_Barrier( MPI_COMM_WORLD );
- tf = MPI_Wtime();
- for (k=0; k<MAX_LOOP; k++)
- MPI_Comm_create( comm, g, &newcomm[k] );
- tf = MPI_Wtime() - tf;
- MPI_Allreduce( &tf, &t[i], 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD );
- t[i] = t[i] / MAX_LOOP;
- gsize[i] = ts;
- if (wrank == 0)
- MTestPrintfMsg( 1, "%d\t%e\n", ts, t[i] );
- MPI_Group_free( &g );
- if (newcomm[0] != MPI_COMM_NULL)
- for (k=0; k<MAX_LOOP; k++)
- MPI_Comm_free( &newcomm[k] );
+ for (i = 0; ts <= wsize; i++, ts = ts + ts) {
+ /* Create some groups with at most ts members */
+ range[0][0] = ts - 1;
+ range[0][1] = 0;
+ range[0][2] = -1;
+ MPI_Group_range_incl(gworld, 1, range, &g);
+
+ MPI_Barrier(MPI_COMM_WORLD);
+ tf = MPI_Wtime();
+ for (k = 0; k < MAX_LOOP; k++)
+ MPI_Comm_create(comm, g, &newcomm[k]);
+ tf = MPI_Wtime() - tf;
+ MPI_Allreduce(&tf, &t[i], 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+ t[i] = t[i] / MAX_LOOP;
+ gsize[i] = ts;
+ if (wrank == 0)
+ MTestPrintfMsg(1, "%d\t%e\n", ts, t[i]);
+ MPI_Group_free(&g);
+ if (newcomm[0] != MPI_COMM_NULL)
+ for (k = 0; k < MAX_LOOP; k++)
+ MPI_Comm_free(&newcomm[k]);
}
- MPI_Group_free( &gworld );
- maxi = i-1;
+ MPI_Group_free(&gworld);
+ maxi = i - 1;
/* The cost should be linear or at worst ts*log(ts).
- We can check this in a number of ways.
+ * We can check this in a number of ways.
*/
if (wrank == 0) {
- for (i=4; i<=maxi; i++) {
- double rdiff;
- if (t[i] > 0) {
- rdiff = (t[i] - t[i-1]) / t[i];
- if (rdiff >= 4) {
- errs++;
- fprintf( stderr, "Relative difference between group of size %d and %d is %e exceeds 4\n",
- gsize[i-1], gsize[i], rdiff );
- }
- }
- }
+ for (i = 4; i <= maxi; i++) {
+ double rdiff;
+ if (t[i] > 0) {
+ rdiff = (t[i] - t[i - 1]) / t[i];
+ if (rdiff >= 4) {
+ errs++;
+ fprintf(stderr,
+ "Relative difference between group of size %d and %d is %e exceeds 4\n",
+ gsize[i - 1], gsize[i], rdiff);
+ }
+ }
+ }
}
- MTest_Finalize( errs );
+ MTest_Finalize(errs);
MPI_Finalize();
* See COPYRIGHT in top-level directory.
*/
/*
- * This code may be used to test the performance of some of the
+ * This code may be used to test the performance of some of the
* noncontiguous datatype operations, including vector and indexed
- * pack and unpack operations. To simplify the use of this code for
+ * pack and unpack operations. To simplify the use of this code for
* tuning an MPI implementation, it uses no communication, just the
* MPI_Pack and MPI_Unpack routines. In addition, the individual tests are
* in separate routines, making it easier to compare the compiler-generated
- * code for the user (manual) pack/unpack with the code used by
+ * code for the user (manual) pack/unpack with the code used by
* the MPI implementation. Further, to be fair to the MPI implementation,
* the routines are passed the source and destination buffers; this ensures
* that the compiler can't optimize for statically allocated buffers.
double mean(double *list, int count);
+/* Return the arithmetic mean of the first count entries of list.
+ * NOTE(review): count is used as the divisor unchecked, so count == 0 is
+ * presumably never passed - confirm against callers. */
double mean(double *list, int count)
{
- double retval;
- int i;
+ double retval;
+ int i;
- retval = 0;
- for (i = 0; i < count; i++)
- retval += list[i];
- retval /= count;
+ retval = 0;
+ for (i = 0; i < count; i++)
+ retval += list[i];
+ retval /= count;
- return retval;
+ return retval;
}
double noise(double *list, int count);
+/*
+ * Return the mean squared relative deviation of list[0..count-1] from its
+ * mean, i.e. the average of (list[i] / mean - 1)^2.
+ * Prints a message and returns -1 if the scratch buffer cannot be allocated.
+ */
double noise(double *list, int count)
{
- double *margin, retval;
- int i;
+ double *margin, retval, avg;
+ int i;
- if (!(margin = malloc(count * sizeof(double)))) {
- printf("Unable to allocate memory\n");
- return -1;
- }
+ if (!(margin = malloc(count * sizeof(double)))) {
+ printf("Unable to allocate memory\n");
+ return -1;
+ }
- for (i = 0; i < count; i++)
- margin[i] = list[i] / mean(list, count);
+ /* The mean does not change inside the loop; compute it once */
+ avg = mean(list, count);
+ for (i = 0; i < count; i++)
+ margin[i] = list[i] / avg;
- retval = 0;
- for (i = 0; i < count; i++) {
- retval += ((margin[i] - 1) * (margin[i] - 1));
- }
- retval /= count;
- if (retval < 0) retval = -retval;
+ retval = 0;
+ for (i = 0; i < count; i++) {
+ retval += ((margin[i] - 1) * (margin[i] - 1));
+ }
+ retval /= count;
+ if (retval < 0)
+ retval = -retval;
- return retval;
+ /* margin is scratch space only; release it before returning */
+ free(margin);
+ return retval;
}
/* Here are the tests */
/* Test packing a vector of individual doubles */
/* We don't use restrict in the function args because assignments between
- restrict pointers is not valid in C and some compilers, such as the
+ restrict pointers is not valid in C and some compilers, such as the
IBM xlc compilers, flag that use as an error.*/
-int TestVecPackDouble( int n, int stride,
- double *avgTimeUser, double *avgTimeMPI,
- double *dest, const double *src );
-int TestVecPackDouble( int n, int stride,
- double *avgTimeUser, double *avgTimeMPI,
- double *dest, const double *src )
+int TestVecPackDouble(int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI, double *dest, const double *src);
+/*
+ * Time packing n doubles, read from src at every stride-th position, into
+ * the contiguous buffer dest in two ways: a hand-written copy loop and
+ * MPI_Pack with an MPI_Type_vector(n, 1, stride, MPI_DOUBLE) datatype.
+ *
+ * Each variant runs NTRIALS trials of N_REPS repetitions.  The mean time per
+ * repetition is stored in *avgTimeUser and *avgTimeMPI.  If noise() over the
+ * trial times exceeds VARIANCE_THRESHOLD for either variant, both outputs
+ * are set to 0 to mark the measurement as discarded.  Always returns 0.
+ */
+int TestVecPackDouble(int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI, double *dest, const double *src)
{
- double *restrict d_dest;
- const double *restrict d_src;
- register int i, j;
- int rep, position;
- double t1, t2, t[NTRIALS];
- MPI_Datatype vectype;
-
- /* User code */
- if (verbose) printf("TestVecPackDouble (USER): ");
- for (j = 0; j < NTRIALS; j++) {
- t1 = MPI_Wtime();
- for (rep=0; rep<N_REPS; rep++) {
- i = n;
- d_dest = dest;
- d_src = src;
- while (i--) {
- *d_dest++ = *d_src;
- d_src += stride;
- }
- }
- t2 = MPI_Wtime() - t1;
- t[j] = t2;
- if (verbose) printf("%.3f ", t[j]);
- }
- if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
- /* If there is too much noise, discard the test */
- if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
- *avgTimeUser = 0;
- *avgTimeMPI = 0;
- if (verbose)
- printf("Too much noise; discarding measurement\n");
- return 0;
- }
- *avgTimeUser = mean(t, NTRIALS) / N_REPS;
-
- /* MPI Vector code */
- MPI_Type_vector( n, 1, stride, MPI_DOUBLE, &vectype );
- MPI_Type_commit( &vectype );
-
- if (verbose) printf("TestVecPackDouble (MPI): ");
- for (j = 0; j < NTRIALS; j++) {
- t1 = MPI_Wtime();
- for (rep=0; rep<N_REPS; rep++) {
- position = 0;
- MPI_Pack( (void *)src, 1, vectype, dest, n*sizeof(double),
- &position, MPI_COMM_SELF );
- }
- t2 = MPI_Wtime() - t1;
- t[j] = t2;
- if (verbose) printf("%.3f ", t[j]);
- }
- if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
- /* If there is too much noise, discard the test */
- if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
- *avgTimeUser = 0;
- *avgTimeMPI = 0;
- if (verbose)
- printf("Too much noise; discarding measurement\n");
- }
- else {
- *avgTimeMPI = mean(t, NTRIALS) / N_REPS;
- }
-
- MPI_Type_free( &vectype );
-
- return 0;
+ double *restrict d_dest;
+ const double *restrict d_src;
+ register int i, j;
+ int rep, position;
+ double t1, t2, t[NTRIALS];
+ MPI_Datatype vectype;
+
+ /* User code */
+ if (verbose)
+ printf("TestVecPackDouble (USER): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep = 0; rep < N_REPS; rep++) {
+ i = n;
+ d_dest = dest;
+ d_src = src;
+ while (i--) {
+ *d_dest++ = *d_src;
+ d_src += stride;
+ }
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose)
+ printf("%.3f ", t[j]);
+ }
+ if (verbose)
+ printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ return 0;
+ }
+ *avgTimeUser = mean(t, NTRIALS) / N_REPS;
+
+ /* MPI Vector code */
+ MPI_Type_vector(n, 1, stride, MPI_DOUBLE, &vectype);
+ MPI_Type_commit(&vectype);
+
+ if (verbose)
+ printf("TestVecPackDouble (MPI): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep = 0; rep < N_REPS; rep++) {
+ position = 0;
+ MPI_Pack((void *) src, 1, vectype, dest, n * sizeof(double), &position, MPI_COMM_SELF);
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose)
+ printf("%.3f ", t[j]);
+ }
+ if (verbose)
+ printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ }
+ else {
+ *avgTimeMPI = mean(t, NTRIALS) / N_REPS;
+ }
+
+ MPI_Type_free(&vectype);
+
+ return 0;
}
/* Test unpacking a vector of individual doubles */
/* See above for why restrict is not used in the function args */
-int TestVecUnPackDouble( int n, int stride,
- double *avgTimeUser, double *avgTimeMPI,
- double *dest, const double *src );
-int TestVecUnPackDouble( int n, int stride,
- double *avgTimeUser, double *avgTimeMPI,
- double *dest, const double *src )
+int TestVecUnPackDouble(int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI, double *dest, const double *src);
+/*
+ * Time unpacking n contiguous doubles from src into dest at every stride-th
+ * position in two ways: a hand-written copy loop and MPI_Unpack with an
+ * MPI_Type_vector(n, 1, stride, MPI_DOUBLE) datatype.
+ *
+ * Each variant runs NTRIALS trials of N_REPS repetitions.  The mean time per
+ * repetition is stored in *avgTimeUser and *avgTimeMPI.  If noise() over the
+ * trial times exceeds VARIANCE_THRESHOLD for either variant, both outputs
+ * are set to 0 to mark the measurement as discarded.  Always returns 0.
+ */
+int TestVecUnPackDouble(int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI, double *dest, const double *src)
{
- double *restrict d_dest;
- const double *restrict d_src;
- register int i, j;
- int rep, position;
- double t1, t2, t[NTRIALS];
- MPI_Datatype vectype;
-
- /* User code */
- if (verbose) printf("TestVecUnPackDouble (USER): ");
- for (j = 0; j < NTRIALS; j++) {
- t1 = MPI_Wtime();
- for (rep=0; rep<N_REPS; rep++) {
- i = n;
- d_dest = dest;
- d_src = src;
- while (i--) {
- *d_dest = *d_src++;
- d_dest += stride;
- }
- }
- t2 = MPI_Wtime() - t1;
- t[j] = t2;
- if (verbose) printf("%.3f ", t[j]);
- }
- if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
- /* If there is too much noise, discard the test */
- if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
- *avgTimeUser = 0;
- *avgTimeMPI = 0;
- if (verbose)
- printf("Too much noise; discarding measurement\n");
- return 0;
- }
- *avgTimeUser = mean(t, NTRIALS) / N_REPS;
-
- /* MPI Vector code */
- MPI_Type_vector( n, 1, stride, MPI_DOUBLE, &vectype );
- MPI_Type_commit( &vectype );
-
- if (verbose) printf("TestVecUnPackDouble (MPI): ");
- for (j = 0; j < NTRIALS; j++) {
- t1 = MPI_Wtime();
- for (rep=0; rep<N_REPS; rep++) {
- position = 0;
- MPI_Unpack( (void *)src, n*sizeof(double),
- &position, dest, 1, vectype, MPI_COMM_SELF );
- }
- t2 = MPI_Wtime() - t1;
- t[j] = t2;
- if (verbose) printf("%.3f ", t[j]);
- }
- if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
- /* If there is too much noise, discard the test */
- if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
- *avgTimeUser = 0;
- *avgTimeMPI = 0;
- if (verbose)
- printf("Too much noise; discarding measurement\n");
- }
- else {
- *avgTimeMPI = mean(t, NTRIALS) / N_REPS;
- }
-
- MPI_Type_free( &vectype );
-
- return 0;
+ double *restrict d_dest;
+ const double *restrict d_src;
+ register int i, j;
+ int rep, position;
+ double t1, t2, t[NTRIALS];
+ MPI_Datatype vectype;
+
+ /* User code */
+ if (verbose)
+ printf("TestVecUnPackDouble (USER): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep = 0; rep < N_REPS; rep++) {
+ i = n;
+ d_dest = dest;
+ d_src = src;
+ while (i--) {
+ *d_dest = *d_src++;
+ d_dest += stride;
+ }
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose)
+ printf("%.3f ", t[j]);
+ }
+ if (verbose)
+ printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ return 0;
+ }
+ *avgTimeUser = mean(t, NTRIALS) / N_REPS;
+
+ /* MPI Vector code */
+ MPI_Type_vector(n, 1, stride, MPI_DOUBLE, &vectype);
+ MPI_Type_commit(&vectype);
+
+ if (verbose)
+ printf("TestVecUnPackDouble (MPI): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep = 0; rep < N_REPS; rep++) {
+ position = 0;
+ MPI_Unpack((void *) src, n * sizeof(double),
+ &position, dest, 1, vectype, MPI_COMM_SELF);
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose)
+ printf("%.3f ", t[j]);
+ }
+ if (verbose)
+ printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ }
+ else {
+ *avgTimeMPI = mean(t, NTRIALS) / N_REPS;
+ }
+
+ MPI_Type_free(&vectype);
+
+ return 0;
}
/* Test packing a vector of 2-individual doubles */
/* See above for why restrict is not used in the function args */
-int TestVecPack2Double( int n, int stride,
- double *avgTimeUser, double *avgTimeMPI,
- double *dest, const double *src );
-int TestVecPack2Double( int n, int stride,
- double *avgTimeUser, double *avgTimeMPI,
- double *dest, const double *src )
+int TestVecPack2Double(int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI, double *dest, const double *src);
+/*
+ * Time packing n blocks of two adjacent doubles, the blocks starting every
+ * stride doubles in src, into the contiguous buffer dest (2 * n doubles) in
+ * two ways: a hand-written copy loop and MPI_Pack with an
+ * MPI_Type_vector(n, 2, stride, MPI_DOUBLE) datatype.
+ *
+ * Each variant runs NTRIALS trials of N_REPS repetitions.  The mean time per
+ * repetition is stored in *avgTimeUser and *avgTimeMPI.  If noise() over the
+ * trial times exceeds VARIANCE_THRESHOLD for either variant, both outputs
+ * are set to 0 to mark the measurement as discarded.  Always returns 0.
+ */
+int TestVecPack2Double(int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI, double *dest, const double *src)
{
- double *restrict d_dest;
- const double *restrict d_src;
- register int i, j;
- int rep, position;
- double t1, t2, t[NTRIALS];
- MPI_Datatype vectype;
-
- /* User code */
- if (verbose) printf("TestVecPack2Double (USER): ");
- for (j = 0; j < NTRIALS; j++) {
- t1 = MPI_Wtime();
- for (rep=0; rep<N_REPS; rep++) {
- i = n;
- d_dest = dest;
- d_src = src;
- while (i--) {
- *d_dest++ = d_src[0];
- *d_dest++ = d_src[1];
- d_src += stride;
- }
- }
- t2 = MPI_Wtime() - t1;
- t[j] = t2;
- if (verbose) printf("%.3f ", t[j]);
- }
- if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
- /* If there is too much noise, discard the test */
- if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
- *avgTimeUser = 0;
- *avgTimeMPI = 0;
- if (verbose)
- printf("Too much noise; discarding measurement\n");
- return 0;
- }
- *avgTimeUser = mean(t, NTRIALS) / N_REPS;
-
- /* MPI Vector code */
- MPI_Type_vector( n, 2, stride, MPI_DOUBLE, &vectype );
- MPI_Type_commit( &vectype );
-
- if (verbose) printf("TestVecPack2Double (MPI): ");
- for (j = 0; j < NTRIALS; j++) {
- t1 = MPI_Wtime();
- for (rep=0; rep<N_REPS; rep++) {
- position = 0;
- MPI_Pack( (void *)src, 1, vectype, dest, 2*n*sizeof(double),
- &position, MPI_COMM_SELF );
- }
- t2 = MPI_Wtime() - t1;
- t[j] = t2;
- if (verbose) printf("%.3f ", t[j]);
- }
- if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
- /* If there is too much noise, discard the test */
- if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
- *avgTimeUser = 0;
- *avgTimeMPI = 0;
- if (verbose)
- printf("Too much noise; discarding measurement\n");
- }
- else {
- *avgTimeMPI = mean(t, NTRIALS) / N_REPS;
- }
- MPI_Type_free( &vectype );
-
- return 0;
+ double *restrict d_dest;
+ const double *restrict d_src;
+ register int i, j;
+ int rep, position;
+ double t1, t2, t[NTRIALS];
+ MPI_Datatype vectype;
+
+ /* User code */
+ if (verbose)
+ printf("TestVecPack2Double (USER): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep = 0; rep < N_REPS; rep++) {
+ i = n;
+ d_dest = dest;
+ d_src = src;
+ while (i--) {
+ *d_dest++ = d_src[0];
+ *d_dest++ = d_src[1];
+ d_src += stride;
+ }
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose)
+ printf("%.3f ", t[j]);
+ }
+ if (verbose)
+ printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ return 0;
+ }
+ *avgTimeUser = mean(t, NTRIALS) / N_REPS;
+
+ /* MPI Vector code */
+ MPI_Type_vector(n, 2, stride, MPI_DOUBLE, &vectype);
+ MPI_Type_commit(&vectype);
+
+ if (verbose)
+ printf("TestVecPack2Double (MPI): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep = 0; rep < N_REPS; rep++) {
+ position = 0;
+ MPI_Pack((void *) src, 1, vectype, dest, 2 * n * sizeof(double),
+ &position, MPI_COMM_SELF);
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose)
+ printf("%.3f ", t[j]);
+ }
+ if (verbose)
+ printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ }
+ else {
+ *avgTimeMPI = mean(t, NTRIALS) / N_REPS;
+ }
+ MPI_Type_free(&vectype);
+
+ return 0;
}
/* This creates an indexed type that is like a vector (for simplicity
- of construction). There is a possibility that the MPI implementation
+ of construction). There is a possibility that the MPI implementation
will recognize and simplify this (e.g., in MPI_Type_commit); if so,
- let us know and we'll add a version that is not as regular
+ let us know and we'll add a version that is not as regular
*/
/* See above for why restrict is not used in the function args */
-int TestIndexPackDouble( int n, int stride,
- double *avgTimeUser, double *avgTimeMPI,
- double *dest, const double *src );
-int TestIndexPackDouble( int n, int stride,
- double *avgTimeUser, double *avgTimeMPI,
- double *dest, const double *src )
+int TestIndexPackDouble(int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI, double *dest, const double *src);
+/*
+ * Time packing n doubles, read from src at displacements i * stride, into
+ * the contiguous buffer dest in two ways: a hand-written loop over a
+ * displacement table and MPI_Pack with an
+ * MPI_Type_create_indexed_block(n, 1, displs, MPI_DOUBLE) datatype.
+ *
+ * Each variant runs NTRIALS trials of N_REPS repetitions.  The mean time per
+ * repetition is stored in *avgTimeUser and *avgTimeMPI.  If noise() over the
+ * trial times exceeds VARIANCE_THRESHOLD for either variant, both outputs
+ * are set to 0 to mark the measurement as discarded.
+ *
+ * Returns 0, or -1 (with both outputs set to 0) when the displacement table
+ * cannot be allocated.
+ */
+int TestIndexPackDouble(int n, int stride,
+ double *avgTimeUser, double *avgTimeMPI, double *dest, const double *src)
{
- double *restrict d_dest;
- const double *restrict d_src;
- register int i, j;
- int rep, position;
- int *restrict displs = 0;
- double t1, t2, t[NTRIALS];
- MPI_Datatype indextype;
-
- displs = (int *)malloc( n * sizeof(int) );
- for (i=0; i<n; i++) displs[i] = i * stride;
-
- /* User code */
- if (verbose) printf("TestIndexPackDouble (USER): ");
- for (j = 0; j < NTRIALS; j++) {
- t1 = MPI_Wtime();
- for (rep=0; rep<N_REPS; rep++) {
- i = n;
- d_dest = dest;
- d_src = src;
- for (i=0; i<n; i++) {
- *d_dest++ = d_src[displs[i]];
- }
- }
- t2 = MPI_Wtime() - t1;
- t[j] = t2;
- if (verbose) printf("%.3f ", t[j]);
- }
- if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
- /* If there is too much noise, discard the test */
- if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
- *avgTimeUser = 0;
- *avgTimeMPI = 0;
- if (verbose)
- printf("Too much noise; discarding measurement\n");
- return 0;
- }
- *avgTimeUser = mean(t, NTRIALS) / N_REPS;
-
- /* MPI Index code */
- MPI_Type_create_indexed_block( n, 1, displs, MPI_DOUBLE, &indextype );
- MPI_Type_commit( &indextype );
-
- free( displs );
-
- if (verbose) printf("TestIndexPackDouble (MPI): ");
- for (j = 0; j < NTRIALS; j++) {
- t1 = MPI_Wtime();
- for (rep=0; rep<N_REPS; rep++) {
- position = 0;
- MPI_Pack( (void *)src, 1, indextype, dest, n*sizeof(double),
- &position, MPI_COMM_SELF );
- }
- t2 = MPI_Wtime() - t1;
- t[j] = t2;
- if (verbose) printf("%.3f ", t[j]);
- }
- if (verbose) printf("[%.3f]\n", noise(t, NTRIALS));
- /* If there is too much noise, discard the test */
- if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
- *avgTimeUser = 0;
- *avgTimeMPI = 0;
- if (verbose)
- printf("Too much noise; discarding measurement\n");
- }
- else {
- *avgTimeMPI = mean(t, NTRIALS) / N_REPS;
- }
- MPI_Type_free( &indextype );
-
- return 0;
+ double *restrict d_dest;
+ const double *restrict d_src;
+ register int i, j;
+ int rep, position;
+ int *restrict displs = 0;
+ double t1, t2, t[NTRIALS];
+ MPI_Datatype indextype;
+
+ displs = (int *) malloc(n * sizeof(int));
+ if (!displs) {
+ /* Nothing can be measured without the displacement table */
+ printf("Unable to allocate memory\n");
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ return -1;
+ }
+ for (i = 0; i < n; i++)
+ displs[i] = i * stride;
+
+ /* User code */
+ if (verbose)
+ printf("TestIndexPackDouble (USER): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep = 0; rep < N_REPS; rep++) {
+ i = n;
+ d_dest = dest;
+ d_src = src;
+ for (i = 0; i < n; i++) {
+ *d_dest++ = d_src[displs[i]];
+ }
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose)
+ printf("%.3f ", t[j]);
+ }
+ if (verbose)
+ printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ /* the table is only freed further down; release it on this path too */
+ free(displs);
+ return 0;
+ }
+ *avgTimeUser = mean(t, NTRIALS) / N_REPS;
+
+ /* MPI Index code */
+ MPI_Type_create_indexed_block(n, 1, displs, MPI_DOUBLE, &indextype);
+ MPI_Type_commit(&indextype);
+
+ free(displs);
+
+ if (verbose)
+ printf("TestIndexPackDouble (MPI): ");
+ for (j = 0; j < NTRIALS; j++) {
+ t1 = MPI_Wtime();
+ for (rep = 0; rep < N_REPS; rep++) {
+ position = 0;
+ MPI_Pack((void *) src, 1, indextype, dest, n * sizeof(double),
+ &position, MPI_COMM_SELF);
+ }
+ t2 = MPI_Wtime() - t1;
+ t[j] = t2;
+ if (verbose)
+ printf("%.3f ", t[j]);
+ }
+ if (verbose)
+ printf("[%.3f]\n", noise(t, NTRIALS));
+ /* If there is too much noise, discard the test */
+ if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
+ *avgTimeUser = 0;
+ *avgTimeMPI = 0;
+ if (verbose)
+ printf("Too much noise; discarding measurement\n");
+ }
+ else {
+ *avgTimeMPI = mean(t, NTRIALS) / N_REPS;
+ }
+ MPI_Type_free(&indextype);
+
+ return 0;
}
-int Report( const char *name, const char *packname,
- double avgTimeMPI, double avgTimeUser );
-int Report( const char *name, const char *packname,
- double avgTimeMPI, double avgTimeUser )
+int Report(const char *name, const char *packname, double avgTimeMPI, double avgTimeUser);
+/*
+ * Compare the MPI time of one test against the hand-written ("user") time.
+ *
+ * name        label of the test, printed at the start of each line
+ * packname    operation name used in the "too slow" message
+ * avgTimeMPI  average time of the MPI variant
+ * avgTimeUser average time of the user variant
+ *
+ * When verbose is set, prints both times and their difference as a
+ * percentage of the larger one.  Returns 1 (and prints a message) when the
+ * MPI variant is slower by more than THRESHOLD times the larger time,
+ * otherwise 0.
+ */
+int Report(const char *name, const char *packname, double avgTimeMPI, double avgTimeUser)
{
- double diffTime, maxTime;
- int errs=0;
-
- /* Move this into a common routine */
- diffTime = avgTimeMPI - avgTimeUser;
- if (diffTime < 0) diffTime = - diffTime;
- if (avgTimeMPI > avgTimeUser) maxTime = avgTimeMPI;
- else maxTime = avgTimeUser;
-
- if (verbose) {
- printf( "%-30s:\t%g\t%g\t(%g%%)\n", name,
- avgTimeMPI, avgTimeUser,
- 100 * (diffTime / maxTime) );
- fflush(stdout);
- }
- if (avgTimeMPI > avgTimeUser && (diffTime > THRESHOLD * maxTime)) {
- errs++;
- printf( "%s:\tMPI %s code is too slow: MPI %g\t User %g\n",
- name, packname, avgTimeMPI, avgTimeUser );
- }
-
- return errs;
+ double diffTime, maxTime;
+ int errs = 0;
+
+ /* Move this into a common routine */
+ diffTime = avgTimeMPI - avgTimeUser;
+ if (diffTime < 0)
+ diffTime = -diffTime;
+ if (avgTimeMPI > avgTimeUser)
+ maxTime = avgTimeMPI;
+ else
+ maxTime = avgTimeUser;
+
+ if (verbose) {
+ /* A discarded measurement reports both times as 0; print 0% for it
+ * instead of the NaN that 0 / 0 would produce */
+ printf("%-30s:\t%g\t%g\t(%g%%)\n", name,
+ avgTimeMPI, avgTimeUser, maxTime > 0 ? 100 * (diffTime / maxTime) : 0.0);
+ fflush(stdout);
+ }
+ if (avgTimeMPI > avgTimeUser && (diffTime > THRESHOLD * maxTime)) {
+ errs++;
+ printf("%s:\tMPI %s code is too slow: MPI %g\t User %g\n",
+ name, packname, avgTimeMPI, avgTimeUser);
+ }
+
+ return errs;
}
/* Finally, here's the main program */
-int main( int argc, char *argv[] )
+int main(int argc, char *argv[])
{
- int n, stride, err, errs = 0;
+ int n, stride, errs = 0;
void *dest, *src;
double avgTimeUser, avgTimeMPI;
- MPI_Init( &argc, &argv );
- if (getenv("MPITEST_VERBOSE")) verbose = 1;
+ MPI_Init(&argc, &argv);
+ if (getenv("MPITEST_VERBOSE"))
+ verbose = 1;
- n = 30000;
+ n = 30000;
stride = 4;
- dest = (void *)malloc( n * sizeof(double) );
- src = (void *)malloc( n * ((1+stride)*sizeof(double)) );
+ dest = (void *) malloc(n * sizeof(double));
+ src = (void *) malloc(n * ((1 + stride) * sizeof(double)));
/* Touch the source and destination arrays */
- memset( src, 0, n * (1+stride)*sizeof(double) );
- memset( dest, 0, n * sizeof(double) );
+ memset(src, 0, n * (1 + stride) * sizeof(double));
+ memset(dest, 0, n * sizeof(double));
- err = TestVecPackDouble( n, stride, &avgTimeUser, &avgTimeMPI,
- dest, src );
- errs += Report( "VecPackDouble", "Pack", avgTimeMPI, avgTimeUser );
+ TestVecPackDouble(n, stride, &avgTimeUser, &avgTimeMPI, dest, src);
+ errs += Report("VecPackDouble", "Pack", avgTimeMPI, avgTimeUser);
- err = TestVecUnPackDouble( n, stride, &avgTimeUser, &avgTimeMPI,
- src, dest );
- errs += Report( "VecUnPackDouble", "Unpack", avgTimeMPI, avgTimeUser );
+ TestVecUnPackDouble(n, stride, &avgTimeUser, &avgTimeMPI, src, dest);
+ errs += Report("VecUnPackDouble", "Unpack", avgTimeMPI, avgTimeUser);
- err = TestIndexPackDouble( n, stride, &avgTimeUser, &avgTimeMPI,
- dest, src );
- errs += Report( "VecIndexDouble", "Pack", avgTimeMPI, avgTimeUser );
+ TestIndexPackDouble(n, stride, &avgTimeUser, &avgTimeMPI, dest, src);
+ errs += Report("VecIndexDouble", "Pack", avgTimeMPI, avgTimeUser);
free(dest);
free(src);
-
- dest = (void *)malloc( 2*n * sizeof(double) );
- src = (void *)malloc( (1 + n) * ((1+stride)*sizeof(double)) );
- memset( dest, 0, 2*n * sizeof(double) );
- memset( src, 0, (1+n) * (1+stride)*sizeof(double) );
- err = TestVecPack2Double( n, stride, &avgTimeUser, &avgTimeMPI,
- dest, src );
- errs += Report( "VecPack2Double", "Pack", avgTimeMPI, avgTimeUser );
+
+ dest = (void *) malloc(2 * n * sizeof(double));
+ src = (void *) malloc((1 + n) * ((1 + stride) * sizeof(double)));
+ memset(dest, 0, 2 * n * sizeof(double));
+ memset(src, 0, (1 + n) * (1 + stride) * sizeof(double));
+ TestVecPack2Double(n, stride, &avgTimeUser, &avgTimeMPI, dest, src);
+ errs += Report("VecPack2Double", "Pack", avgTimeMPI, avgTimeUser);
free(dest);
free(src);
-
+
if (errs == 0) {
- printf( " No Errors\n" );
+ printf(" No Errors\n");
}
else {
- printf( " Found %d performance problems\n", errs );
+ printf(" Found %d performance problems\n", errs);
}
fflush(stdout);
static int verbose = 0;
-int main( int argc, char **argv )
+int main(int argc, char **argv)
{
- double *inbuf, *outbuf, *outbuf2;
- MPI_Aint lb, extent;
- int *index_displacement;
- int icount, errs=0;
- int i, packsize, position, inbufsize;
+ double *inbuf, *outbuf, *outbuf2;
+ MPI_Aint lb, extent;
+ int *index_displacement;
+ int icount, errs = 0;
+ int i, packsize, position, inbufsize;
MPI_Datatype itype1, stype1;
- double t0, t1;
- double tpack, tspack, tmanual;
- int ntry;
+ double t0, t1;
+ double tpack, tspack, tmanual;
+ int ntry;
- MPI_Init( &argc, &argv );
+ MPI_Init(&argc, &argv);
icount = 2014;
/* Create a simple block indexed datatype */
- index_displacement = (int *)malloc( icount * sizeof(int) );
+ index_displacement = (int *) malloc(icount * sizeof(int));
if (!index_displacement) {
- fprintf( stderr, "Unable to allocated index array of size %d\n",
- icount );
- MPI_Abort( MPI_COMM_WORLD, 1 );
+ fprintf(stderr, "Unable to allocated index array of size %d\n", icount);
+ MPI_Abort(MPI_COMM_WORLD, 1);
}
- for (i=0; i<icount; i++) {
- index_displacement[i] = (i * 3 + (i%3));
+ for (i = 0; i < icount; i++) {
+ index_displacement[i] = (i * 3 + (i % 3));
}
- MPI_Type_create_indexed_block( icount, 1, index_displacement, MPI_DOUBLE,
- &itype1 );
- MPI_Type_commit( &itype1 );
-
+ MPI_Type_create_indexed_block(icount, 1, index_displacement, MPI_DOUBLE, &itype1);
+ MPI_Type_commit(&itype1);
+
#if defined(MPICH) && defined(PRINT_DATATYPE_INTERNALS)
/* To use MPIDU_Datatype_debug to print the datatype internals,
- you must configure MPICH with --enable-g=log */
+ * you must configure MPICH with --enable-g=log */
if (verbose) {
- printf( "Block index datatype:\n" );
- MPIDU_Datatype_debug( itype1, 10 );
+ printf("Block index datatype:\n");
+ MPIDU_Datatype_debug(itype1, 10);
}
#endif
- MPI_Type_get_extent( itype1, &lb, &extent );
+ MPI_Type_get_extent(itype1, &lb, &extent);
- MPI_Pack_size( 1, itype1, MPI_COMM_WORLD, &packsize );
+ MPI_Pack_size(1, itype1, MPI_COMM_WORLD, &packsize);
inbufsize = extent / sizeof(double);
- inbuf = (double *)malloc( extent );
- outbuf = (double *)malloc( packsize );
- outbuf2 = (double *)malloc( icount * sizeof(double) );
+ inbuf = (double *) malloc(extent);
+ outbuf = (double *) malloc(packsize);
+ outbuf2 = (double *) malloc(icount * sizeof(double));
if (!inbuf) {
- fprintf( stderr, "Unable to allocate %ld for inbuf\n", (long)extent );
- MPI_Abort( MPI_COMM_WORLD, 1 );
+ fprintf(stderr, "Unable to allocate %ld for inbuf\n", (long) extent);
+ MPI_Abort(MPI_COMM_WORLD, 1);
}
if (!outbuf) {
- fprintf( stderr, "Unable to allocate %ld for outbuf\n", (long)packsize );
- MPI_Abort( MPI_COMM_WORLD, 1 );
+ fprintf(stderr, "Unable to allocate %ld for outbuf\n", (long) packsize);
+ MPI_Abort(MPI_COMM_WORLD, 1);
}
if (!outbuf2) {
- fprintf( stderr, "Unable to allocate %ld for outbuf2\n", (long)packsize );
- MPI_Abort( MPI_COMM_WORLD, 1 );
+ fprintf(stderr, "Unable to allocate %ld for outbuf2\n", (long) packsize);
+ MPI_Abort(MPI_COMM_WORLD, 1);
}
- for (i=0; i<inbufsize; i++) {
- inbuf[i] = (double)i;
+ for (i = 0; i < inbufsize; i++) {
+ inbuf[i] = (double) i;
}
position = 0;
/* Warm up the code and data */
- MPI_Pack( inbuf, 1, itype1, outbuf, packsize, &position, MPI_COMM_WORLD );
+ MPI_Pack(inbuf, 1, itype1, outbuf, packsize, &position, MPI_COMM_WORLD);
tpack = 1e12;
for (ntry = 0; ntry < 5; ntry++) {
- position = 0;
- t0 = MPI_Wtime();
- MPI_Pack( inbuf, 1, itype1, outbuf, packsize, &position,
- MPI_COMM_WORLD );
- t1 = MPI_Wtime() - t0;
- if (t1 < tpack) tpack = t1;
+ position = 0;
+ t0 = MPI_Wtime();
+ MPI_Pack(inbuf, 1, itype1, outbuf, packsize, &position, MPI_COMM_WORLD);
+ t1 = MPI_Wtime() - t0;
+ if (t1 < tpack)
+ tpack = t1;
}
- { int one = 1; MPI_Aint displ = (MPI_Aint) inbuf;
- MPI_Type_create_struct( 1, &one, &displ, &itype1, &stype1 );
- MPI_Type_commit( &stype1 );
+ {
+ int one = 1;
+ MPI_Aint displ = (MPI_Aint) inbuf;
+ MPI_Type_create_struct(1, &one, &displ, &itype1, &stype1);
+ MPI_Type_commit(&stype1);
}
position = 0;
/* Warm up the code and data */
- MPI_Pack( MPI_BOTTOM, 1, stype1, outbuf, packsize, &position, MPI_COMM_WORLD );
+ MPI_Pack(MPI_BOTTOM, 1, stype1, outbuf, packsize, &position, MPI_COMM_WORLD);
tspack = 1e12;
for (ntry = 0; ntry < 5; ntry++) {
- position = 0;
- t0 = MPI_Wtime();
- MPI_Pack( MPI_BOTTOM, 1, stype1, outbuf, packsize, &position,
- MPI_COMM_WORLD );
- t1 = MPI_Wtime() - t0;
- if (t1 < tspack) tspack = t1;
+ position = 0;
+ t0 = MPI_Wtime();
+ MPI_Pack(MPI_BOTTOM, 1, stype1, outbuf, packsize, &position, MPI_COMM_WORLD);
+ t1 = MPI_Wtime() - t0;
+ if (t1 < tspack)
+ tspack = t1;
}
- /*
- Simple manual pack (without explicitly unrolling the index block)
- */
+ /*
+ * Simple manual pack (without explicitly unrolling the index block)
+ */
tmanual = 1e12;
for (ntry = 0; ntry < 5; ntry++) {
- const double *ppe = (const double *)inbuf;
- const int *id = (const int *)index_displacement;
- int k, j;
- t0 = MPI_Wtime();
- position = 0;
- for (i=0; i<icount; i++) {
- outbuf2[position++] = ppe[id[i]];
- }
- t1 = MPI_Wtime() - t0;
- if (t1 < tmanual) tmanual = t1;
- /* Check on correctness */
+ const double *ppe = (const double *) inbuf;
+ const int *id = (const int *) index_displacement;
+ t0 = MPI_Wtime();
+ position = 0;
+ for (i = 0; i < icount; i++) {
+ outbuf2[position++] = ppe[id[i]];
+ }
+ t1 = MPI_Wtime() - t0;
+ if (t1 < tmanual)
+ tmanual = t1;
+ /* Check on correctness */
#ifdef PACK_IS_NATIVE
- if (memcmp( outbuf, outbuf2, position ) != 0) {
- printf( "Panic - pack buffers differ\n" );
- }
+ if (memcmp(outbuf, outbuf2, position) != 0) {
+ printf("Panic - pack buffers differ\n");
+ }
#endif
}
if (verbose) {
- printf( "Bytes packed = %d\n", position );
- printf( "MPI_Pack time = %e, manual pack time = %e\n",
- tpack, tmanual );
- printf( "Pack with struct = %e\n", tspack );
+ printf("Bytes packed = %d\n", position);
+ printf("MPI_Pack time = %e, manual pack time = %e\n", tpack, tmanual);
+ printf("Pack with struct = %e\n", tspack);
}
- /* The threshold here permits the MPI datatype to perform at up to
- only one half the performance of simple user code. Note that the
- example code above may be made faster through careful use of const,
- restrict, and unrolling if the compiler doesn't already do that. */
+ /* The threshold here permits the MPI datatype to perform at up to
+ * only one half the performance of simple user code. Note that the
+ * example code above may be made faster through careful use of const,
+ * restrict, and unrolling if the compiler doesn't already do that. */
if (2 * tmanual < tpack) {
- errs++;
- printf( "MPI_Pack (block index) time = %e, manual pack time = %e\n", tpack, tmanual );
- printf( "MPI_Pack time should be less than 2 times the manual time\n" );
- printf( "For most informative results, be sure to compile this test with optimization\n" );
+ errs++;
+ printf("MPI_Pack (block index) time = %e, manual pack time = %e\n", tpack, tmanual);
+ printf("MPI_Pack time should be less than 2 times the manual time\n");
+ printf("For most informative results, be sure to compile this test with optimization\n");
}
if (2 * tmanual < tspack) {
- errs++;
- printf( "MPI_Pack (struct of block index)) time = %e, manual pack time = %e\n", tspack, tmanual );
- printf( "MPI_Pack time should be less than 2 times the manual time\n" );
- printf( "For most informative results, be sure to compile this test with optimization\n" );
+ errs++;
+ printf("MPI_Pack (struct of block index)) time = %e, manual pack time = %e\n", tspack,
+ tmanual);
+ printf("MPI_Pack time should be less than 2 times the manual time\n");
+ printf("For most informative results, be sure to compile this test with optimization\n");
}
if (errs) {
- printf( " Found %d errors\n", errs );
+ printf(" Found %d errors\n", errs);
}
else {
- printf( " No Errors\n" );
- }
-
- MPI_Type_free( &itype1 );
- MPI_Type_free( &stype1 );
-
- free( inbuf );
- free( outbuf );
- free( outbuf2 );
- free( index_displacement );
+ printf(" No Errors\n");
+ }
+
+ MPI_Type_free(&itype1);
+ MPI_Type_free(&stype1);
+
+ free(inbuf);
+ free(outbuf);
+ free(outbuf2);
+ free(index_displacement);
MPI_Finalize();
return 0;
* See COPYRIGHT in top-level directory.
*/
-/* This test measures the performance of many rma operations to a single
+/* This test measures the performance of many rma operations to a single
target process.
It uses a number of operations (put or accumulate) to different
- locations in the target window
- This is one of the ways that RMA may be used, and is used in the
+ locations in the target window
+ This is one of the ways that RMA may be used, and is used in the
reference implementation of the graph500 benchmark.
*/
#include "mpi.h"
#define MAX_RMA_SIZE 16
#define MAX_RUNS 10
-typedef enum { SYNC_NONE=0,
- SYNC_ALL=-1, SYNC_FENCE=1, SYNC_LOCK=2, SYNC_PSCW=4 } sync_t;
-typedef enum { RMA_NONE=0, RMA_ALL=-1, RMA_PUT=1, RMA_ACC=2, RMA_GET=4 } rma_t;
+typedef enum { SYNC_NONE = 0,
+ SYNC_ALL = -1, SYNC_FENCE = 1, SYNC_LOCK = 2, SYNC_PSCW = 4
+} sync_t;
+typedef enum { RMA_NONE = 0, RMA_ALL = -1, RMA_PUT = 1, RMA_ACC = 2, RMA_GET = 4 } rma_t;
/* Note GET not yet implemented */
sync_t syncChoice = SYNC_ALL;
rma_t rmaChoice = RMA_ALL;
static int barrierSync = 0;
static double tickThreshold = 0.0;
-void PrintResults( int cnt, timing t[] );
-void RunAccFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
-void RunAccLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
-void RunPutFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
-void RunPutLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] );
-void RunAccPSCW( MPI_Win win, int destRank, int cnt, int sz,
- MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] );
-void RunPutPSCW( MPI_Win win, int destRank, int cnt, int sz,
- MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] );
-
-int main( int argc, char *argv[] )
+void PrintResults(int cnt, timing t[]);
+void RunAccFence(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
+void RunAccLock(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
+void RunPutFence(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
+void RunPutLock(MPI_Win win, int destRank, int cnt, int sz, timing t[]);
+void RunAccPSCW(MPI_Win win, int destRank, int cnt, int sz,
+ MPI_Group exposureGroup, MPI_Group accessGroup, timing t[]);
+void RunPutPSCW(MPI_Win win, int destRank, int cnt, int sz,
+ MPI_Group exposureGroup, MPI_Group accessGroup, timing t[]);
+
+int main(int argc, char *argv[])
{
- int arraysize, i, cnt, sz, maxCount=MAX_COUNT, *arraybuffer;
+ int arraysize, i, cnt, sz, maxCount = MAX_COUNT, *arraybuffer;
int wrank, wsize, destRank, srcRank;
MPI_Win win;
MPI_Group wgroup, accessGroup, exposureGroup;
timing t[MAX_RUNS];
- int maxSz = MAX_RMA_SIZE;
+ int maxSz = MAX_RMA_SIZE;
- MPI_Init( &argc, &argv );
+ MPI_Init(&argc, &argv);
/* Determine clock accuracy */
tickThreshold = 10.0 * MPI_Wtick();
- MPI_Allreduce( MPI_IN_PLACE, &tickThreshold, 1, MPI_DOUBLE, MPI_MAX,
- MPI_COMM_WORLD );
-
- for (i=1; i<argc; i++) {
- if (strcmp( argv[i], "-put" ) == 0) {
- if (rmaChoice == RMA_ALL) rmaChoice = RMA_NONE;
- rmaChoice |= RMA_PUT;
- }
- else if (strcmp( argv[i], "-acc" ) == 0) {
- if (rmaChoice == RMA_ALL) rmaChoice = RMA_NONE;
- rmaChoice |= RMA_ACC;
- }
- else if (strcmp( argv[i], "-fence" ) == 0) {
- if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
- syncChoice |= SYNC_FENCE;
- }
- else if (strcmp( argv[i], "-lock" ) == 0) {
- if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
- syncChoice |= SYNC_LOCK;
- }
- else if (strcmp( argv[i], "-pscw" ) == 0) {
- if (syncChoice == SYNC_ALL) syncChoice = SYNC_NONE;
- syncChoice |= SYNC_PSCW;
- }
- else if (strcmp( argv[i], "-maxsz" ) == 0) {
- i++;
- maxSz = atoi( argv[i] );
- }
- else if (strcmp( argv[i], "-maxcount" ) == 0) {
- i++;
- maxCount = atoi( argv[i] );
- }
- else if (strcmp( argv[i], "-barrier" ) == 0) {
- barrierSync = 1;
- }
- else {
- fprintf( stderr, "Unrecognized argument %s\n", argv[i] );
- fprintf( stderr, "%s [ -put ] [ -acc ] [ -lock ] [ -fence ] [ -pscw ] [ -barrier ] [ -maxsz msgsize ]\n", argv[0] );
- MPI_Abort( MPI_COMM_WORLD, 1 );
- }
+ MPI_Allreduce(MPI_IN_PLACE, &tickThreshold, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+
+ for (i = 1; i < argc; i++) {
+ if (strcmp(argv[i], "-put") == 0) {
+ if (rmaChoice == RMA_ALL)
+ rmaChoice = RMA_NONE;
+ rmaChoice |= RMA_PUT;
+ }
+ else if (strcmp(argv[i], "-acc") == 0) {
+ if (rmaChoice == RMA_ALL)
+ rmaChoice = RMA_NONE;
+ rmaChoice |= RMA_ACC;
+ }
+ else if (strcmp(argv[i], "-fence") == 0) {
+ if (syncChoice == SYNC_ALL)
+ syncChoice = SYNC_NONE;
+ syncChoice |= SYNC_FENCE;
+ }
+ else if (strcmp(argv[i], "-lock") == 0) {
+ if (syncChoice == SYNC_ALL)
+ syncChoice = SYNC_NONE;
+ syncChoice |= SYNC_LOCK;
+ }
+ else if (strcmp(argv[i], "-pscw") == 0) {
+ if (syncChoice == SYNC_ALL)
+ syncChoice = SYNC_NONE;
+ syncChoice |= SYNC_PSCW;
+ }
+ else if (strcmp(argv[i], "-maxsz") == 0) {
+ i++;
+ maxSz = atoi(argv[i]);
+ }
+ else if (strcmp(argv[i], "-maxcount") == 0) {
+ i++;
+ maxCount = atoi(argv[i]);
+ }
+ else if (strcmp(argv[i], "-barrier") == 0) {
+ barrierSync = 1;
+ }
+ else {
+ fprintf(stderr, "Unrecognized argument %s\n", argv[i]);
+ fprintf(stderr,
+ "%s [ -put ] [ -acc ] [ -lock ] [ -fence ] [ -pscw ] [ -barrier ] [ -maxsz msgsize ]\n",
+ argv[0]);
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
}
-
- MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
- MPI_Comm_size( MPI_COMM_WORLD, &wsize );
+
+ MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
+ MPI_Comm_size(MPI_COMM_WORLD, &wsize);
destRank = wrank + 1;
- while (destRank >= wsize) destRank = destRank - wsize;
+ while (destRank >= wsize)
+ destRank = destRank - wsize;
srcRank = wrank - 1;
- if (srcRank < 0) srcRank += wsize;
+ if (srcRank < 0)
+ srcRank += wsize;
/* Create groups for PSCW */
- MPI_Comm_group( MPI_COMM_WORLD, &wgroup );
- MPI_Group_incl( wgroup, 1, &destRank, &accessGroup );
- MPI_Group_incl( wgroup, 1, &srcRank, &exposureGroup );
- MPI_Group_free( &wgroup );
+ MPI_Comm_group(MPI_COMM_WORLD, &wgroup);
+ MPI_Group_incl(wgroup, 1, &destRank, &accessGroup);
+ MPI_Group_incl(wgroup, 1, &srcRank, &exposureGroup);
+ MPI_Group_free(&wgroup);
arraysize = maxSz * MAX_COUNT;
- arraybuffer = (int*)malloc( arraysize * sizeof(int) );
+ arraybuffer = (int *) malloc(arraysize * sizeof(int));
if (!arraybuffer) {
- fprintf( stderr, "Unable to allocate %d words\n", arraysize );
- MPI_Abort( MPI_COMM_WORLD, 1 );
+ fprintf(stderr, "Unable to allocate %d words\n", arraysize);
+ MPI_Abort(MPI_COMM_WORLD, 1);
}
- MPI_Win_create( arraybuffer, arraysize*sizeof(int), (int)sizeof(int),
- MPI_INFO_NULL, MPI_COMM_WORLD, &win );
+ MPI_Win_create(arraybuffer, arraysize * sizeof(int), (int) sizeof(int),
+ MPI_INFO_NULL, MPI_COMM_WORLD, &win);
/* FIXME: we need a test on performance consistency.
- The test needs to have both a relative growth limit and
- an absolute limit.
- */
+ * The test needs to have both a relative growth limit and
+ * an absolute limit.
+ */
if (maxCount > MAX_COUNT) {
- fprintf( stderr, "MaxCount must not exceed %d\n", MAX_COUNT );
- MPI_Abort( MPI_COMM_WORLD, 1 );
+ fprintf(stderr, "MaxCount must not exceed %d\n", MAX_COUNT);
+ MPI_Abort(MPI_COMM_WORLD, 1);
}
if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_ACC)) {
- for (sz=1; sz<=maxSz; sz = sz + sz) {
- if (wrank == 0)
- printf( "Accumulate with fence, %d elements\n", sz );
- cnt = 1;
- while (cnt <= maxCount) {
- RunAccFence( win, destRank, cnt, sz, t );
- if (wrank == 0) {
- PrintResults( cnt, t );
- }
- cnt = 2 * cnt;
- }
- }
+ for (sz = 1; sz <= maxSz; sz = sz + sz) {
+ if (wrank == 0)
+ printf("Accumulate with fence, %d elements\n", sz);
+ cnt = 1;
+ while (cnt <= maxCount) {
+ RunAccFence(win, destRank, cnt, sz, t);
+ if (wrank == 0) {
+ PrintResults(cnt, t);
+ }
+ cnt = 2 * cnt;
+ }
+ }
}
if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_ACC)) {
- for (sz=1; sz<=maxSz; sz = sz + sz) {
- if (wrank == 0)
- printf( "Accumulate with lock, %d elements\n", sz );
- cnt = 1;
- while (cnt <= maxCount) {
- RunAccLock( win, destRank, cnt, sz, t );
- if (wrank == 0) {
- PrintResults( cnt, t );
- }
- cnt = 2 * cnt;
- }
- }
+ for (sz = 1; sz <= maxSz; sz = sz + sz) {
+ if (wrank == 0)
+ printf("Accumulate with lock, %d elements\n", sz);
+ cnt = 1;
+ while (cnt <= maxCount) {
+ RunAccLock(win, destRank, cnt, sz, t);
+ if (wrank == 0) {
+ PrintResults(cnt, t);
+ }
+ cnt = 2 * cnt;
+ }
+ }
}
if ((syncChoice & SYNC_FENCE) && (rmaChoice & RMA_PUT)) {
- for (sz=1; sz<=maxSz; sz = sz + sz) {
- if (wrank == 0)
- printf( "Put with fence, %d elements\n", sz );
- cnt = 1;
- while (cnt <= maxCount) {
- RunPutFence( win, destRank, cnt, sz, t );
- if (wrank == 0) {
- PrintResults( cnt, t );
- }
- cnt = 2 * cnt;
- }
- }
+ for (sz = 1; sz <= maxSz; sz = sz + sz) {
+ if (wrank == 0)
+ printf("Put with fence, %d elements\n", sz);
+ cnt = 1;
+ while (cnt <= maxCount) {
+ RunPutFence(win, destRank, cnt, sz, t);
+ if (wrank == 0) {
+ PrintResults(cnt, t);
+ }
+ cnt = 2 * cnt;
+ }
+ }
}
if ((syncChoice & SYNC_LOCK) && (rmaChoice & RMA_PUT)) {
- for (sz=1; sz<=maxSz; sz = sz + sz) {
- if (wrank == 0)
- printf( "Put with lock, %d elements\n", sz );
- cnt = 1;
- while (cnt <= maxCount) {
- RunPutLock( win, destRank, cnt, sz, t );
- if (wrank == 0) {
- PrintResults( cnt, t );
- }
- cnt = 2 * cnt;
- }
- }
+ for (sz = 1; sz <= maxSz; sz = sz + sz) {
+ if (wrank == 0)
+ printf("Put with lock, %d elements\n", sz);
+ cnt = 1;
+ while (cnt <= maxCount) {
+ RunPutLock(win, destRank, cnt, sz, t);
+ if (wrank == 0) {
+ PrintResults(cnt, t);
+ }
+ cnt = 2 * cnt;
+ }
+ }
}
if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_PUT)) {
- for (sz=1; sz<=maxSz; sz = sz + sz) {
- if (wrank == 0)
- printf( "Put with pscw, %d elements\n", sz );
- cnt = 1;
- while (cnt <= maxCount) {
- RunPutPSCW( win, destRank, cnt, sz,
- exposureGroup, accessGroup, t );
- if (wrank == 0) {
- PrintResults( cnt, t );
- }
- cnt = 2 * cnt;
- }
- }
+ for (sz = 1; sz <= maxSz; sz = sz + sz) {
+ if (wrank == 0)
+ printf("Put with pscw, %d elements\n", sz);
+ cnt = 1;
+ while (cnt <= maxCount) {
+ RunPutPSCW(win, destRank, cnt, sz, exposureGroup, accessGroup, t);
+ if (wrank == 0) {
+ PrintResults(cnt, t);
+ }
+ cnt = 2 * cnt;
+ }
+ }
}
if ((syncChoice & SYNC_PSCW) && (rmaChoice & RMA_ACC)) {
- for (sz=1; sz<=maxSz; sz = sz + sz) {
- if (wrank == 0)
- printf( "Accumulate with pscw, %d elements\n", sz );
- cnt = 1;
- while (cnt <= maxCount) {
- RunAccPSCW( win, destRank, cnt, sz,
- exposureGroup, accessGroup, t );
- if (wrank == 0) {
- PrintResults( cnt, t );
- }
- cnt = 2 * cnt;
- }
- }
+ for (sz = 1; sz <= maxSz; sz = sz + sz) {
+ if (wrank == 0)
+ printf("Accumulate with pscw, %d elements\n", sz);
+ cnt = 1;
+ while (cnt <= maxCount) {
+ RunAccPSCW(win, destRank, cnt, sz, exposureGroup, accessGroup, t);
+ if (wrank == 0) {
+ PrintResults(cnt, t);
+ }
+ cnt = 2 * cnt;
+ }
+ }
}
- MPI_Win_free( &win );
+ MPI_Win_free(&win);
+
+ MPI_Group_free(&accessGroup);
+ MPI_Group_free(&exposureGroup);
- MPI_Group_free( &accessGroup );
- MPI_Group_free( &exposureGroup );
-
MPI_Finalize();
return 0;
}
-void RunAccFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
+void RunAccFence(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
int k, i, j, one = 1;
- for (k=0; k<MAX_RUNS; k++) {
- MPI_Barrier( MPI_COMM_WORLD );
- MPI_Win_fence( 0, win );
- j = 0;
- t[k].startOp = MPI_Wtime();
- for (i=0; i<cnt; i++) {
- MPI_Accumulate( &one, sz, MPI_INT, destRank,
- j, sz, MPI_INT, MPI_SUM, win );
- j += sz;
- }
- t[k].endOp = MPI_Wtime();
- if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
- MPI_Win_fence( 0, win );
- t[k].endSync = MPI_Wtime();
+ for (k = 0; k < MAX_RUNS; k++) {
+ MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Win_fence(0, win);
+ j = 0;
+ t[k].startOp = MPI_Wtime();
+ for (i = 0; i < cnt; i++) {
+ MPI_Accumulate(&one, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win);
+ j += sz;
+ }
+ t[k].endOp = MPI_Wtime();
+ if (barrierSync)
+ MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Win_fence(0, win);
+ t[k].endSync = MPI_Wtime();
}
}
-void RunAccLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
+void RunAccLock(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
int k, i, j, one = 1;
- for (k=0; k<MAX_RUNS; k++) {
- MPI_Barrier( MPI_COMM_WORLD );
- MPI_Win_lock( MPI_LOCK_SHARED, destRank, 0, win );
- j = 0;
- t[k].startOp = MPI_Wtime();
- for (i=0; i<cnt; i++) {
- MPI_Accumulate( &one, sz, MPI_INT, destRank,
- j, sz, MPI_INT, MPI_SUM, win );
- j += sz;
- }
- t[k].endOp = MPI_Wtime();
- if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
- MPI_Win_unlock( destRank, win );
- t[k].endSync = MPI_Wtime();
+ for (k = 0; k < MAX_RUNS; k++) {
+ MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Win_lock(MPI_LOCK_SHARED, destRank, 0, win);
+ j = 0;
+ t[k].startOp = MPI_Wtime();
+ for (i = 0; i < cnt; i++) {
+ MPI_Accumulate(&one, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win);
+ j += sz;
+ }
+ t[k].endOp = MPI_Wtime();
+ if (barrierSync)
+ MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Win_unlock(destRank, win);
+ t[k].endSync = MPI_Wtime();
}
}
-void RunPutFence( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
+void RunPutFence(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
int k, i, j, one = 1;
- for (k=0; k<MAX_RUNS; k++) {
- MPI_Barrier( MPI_COMM_WORLD );
- MPI_Win_fence( 0, win );
- j = 0;
- t[k].startOp = MPI_Wtime();
- for (i=0; i<cnt; i++) {
- MPI_Put( &one, sz, MPI_INT, destRank,
- j, sz, MPI_INT, win );
- j += sz;
- }
- t[k].endOp = MPI_Wtime();
- if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
- MPI_Win_fence( 0, win );
- t[k].endSync = MPI_Wtime();
+ for (k = 0; k < MAX_RUNS; k++) {
+ MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Win_fence(0, win);
+ j = 0;
+ t[k].startOp = MPI_Wtime();
+ for (i = 0; i < cnt; i++) {
+ MPI_Put(&one, sz, MPI_INT, destRank, j, sz, MPI_INT, win);
+ j += sz;
+ }
+ t[k].endOp = MPI_Wtime();
+ if (barrierSync)
+ MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Win_fence(0, win);
+ t[k].endSync = MPI_Wtime();
}
}
-void RunPutLock( MPI_Win win, int destRank, int cnt, int sz, timing t[] )
+void RunPutLock(MPI_Win win, int destRank, int cnt, int sz, timing t[])
{
int k, i, j, one = 1;
- for (k=0; k<MAX_RUNS; k++) {
- MPI_Barrier( MPI_COMM_WORLD );
- MPI_Win_lock( MPI_LOCK_SHARED, destRank, 0, win );
- j = 0;
- t[k].startOp = MPI_Wtime();
- for (i=0; i<cnt; i++) {
- MPI_Put( &one, sz, MPI_INT, destRank, j, sz, MPI_INT, win );
- j += sz;
- }
- t[k].endOp = MPI_Wtime();
- if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
- MPI_Win_unlock( destRank, win );
- t[k].endSync = MPI_Wtime();
+ for (k = 0; k < MAX_RUNS; k++) {
+ MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Win_lock(MPI_LOCK_SHARED, destRank, 0, win);
+ j = 0;
+ t[k].startOp = MPI_Wtime();
+ for (i = 0; i < cnt; i++) {
+ MPI_Put(&one, sz, MPI_INT, destRank, j, sz, MPI_INT, win);
+ j += sz;
+ }
+ t[k].endOp = MPI_Wtime();
+ if (barrierSync)
+ MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Win_unlock(destRank, win);
+ t[k].endSync = MPI_Wtime();
}
}
-void RunPutPSCW( MPI_Win win, int destRank, int cnt, int sz,
- MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] )
+void RunPutPSCW(MPI_Win win, int destRank, int cnt, int sz,
+ MPI_Group exposureGroup, MPI_Group accessGroup, timing t[])
{
int k, i, j, one = 1;
- for (k=0; k<MAX_RUNS; k++) {
- MPI_Barrier( MPI_COMM_WORLD );
- MPI_Win_post( exposureGroup, 0, win );
- MPI_Win_start( accessGroup, 0, win );
- j = 0;
- t[k].startOp = MPI_Wtime();
- for (i=0; i<cnt; i++) {
- MPI_Put( &one, sz, MPI_INT, destRank, j, sz, MPI_INT, win );
- j += sz;
- }
- t[k].endOp = MPI_Wtime();
- if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
- MPI_Win_complete( win );
- MPI_Win_wait( win );
- t[k].endSync = MPI_Wtime();
+ for (k = 0; k < MAX_RUNS; k++) {
+ MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Win_post(exposureGroup, 0, win);
+ MPI_Win_start(accessGroup, 0, win);
+ j = 0;
+ t[k].startOp = MPI_Wtime();
+ for (i = 0; i < cnt; i++) {
+ MPI_Put(&one, sz, MPI_INT, destRank, j, sz, MPI_INT, win);
+ j += sz;
+ }
+ t[k].endOp = MPI_Wtime();
+ if (barrierSync)
+ MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Win_complete(win);
+ MPI_Win_wait(win);
+ t[k].endSync = MPI_Wtime();
}
}
-void RunAccPSCW( MPI_Win win, int destRank, int cnt, int sz,
- MPI_Group exposureGroup, MPI_Group accessGroup, timing t[] )
+void RunAccPSCW(MPI_Win win, int destRank, int cnt, int sz,
+ MPI_Group exposureGroup, MPI_Group accessGroup, timing t[])
{
int k, i, j, one = 1;
- for (k=0; k<MAX_RUNS; k++) {
- MPI_Barrier( MPI_COMM_WORLD );
- MPI_Win_post( exposureGroup, 0, win );
- MPI_Win_start( accessGroup, 0, win );
- j = 0;
- t[k].startOp = MPI_Wtime();
- for (i=0; i<cnt; i++) {
- MPI_Accumulate( &one, sz, MPI_INT, destRank,
- j, sz, MPI_INT, MPI_SUM, win );
- j += sz;
- }
- t[k].endOp = MPI_Wtime();
- if (barrierSync) MPI_Barrier( MPI_COMM_WORLD );
- MPI_Win_complete( win );
- MPI_Win_wait( win );
- t[k].endSync = MPI_Wtime();
+ for (k = 0; k < MAX_RUNS; k++) {
+ MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Win_post(exposureGroup, 0, win);
+ MPI_Win_start(accessGroup, 0, win);
+ j = 0;
+ t[k].startOp = MPI_Wtime();
+ for (i = 0; i < cnt; i++) {
+ MPI_Accumulate(&one, sz, MPI_INT, destRank, j, sz, MPI_INT, MPI_SUM, win);
+ j += sz;
+ }
+ t[k].endOp = MPI_Wtime();
+ if (barrierSync)
+ MPI_Barrier(MPI_COMM_WORLD);
+ MPI_Win_complete(win);
+ MPI_Win_wait(win);
+ t[k].endSync = MPI_Wtime();
}
}
-void PrintResults( int cnt, timing t[] )
+void PrintResults(int cnt, timing t[])
{
int k;
- double d1=0, d2=0;
+ double d1 = 0, d2 = 0;
double minD1 = 1e10, minD2 = 1e10;
double tOp, tSync;
- for (k=0; k<MAX_RUNS; k++) {
- tOp = t[k].endOp - t[k].startOp;
- tSync = t[k].endSync - t[k].endOp;
- d1 += tOp;
- d2 += tSync;
- if (tOp < minD1) minD1 = tOp;
- if (tSync < minD2) minD2 = tSync;
+ for (k = 0; k < MAX_RUNS; k++) {
+ tOp = t[k].endOp - t[k].startOp;
+ tSync = t[k].endSync - t[k].endOp;
+ d1 += tOp;
+ d2 += tSync;
+ if (tOp < minD1)
+ minD1 = tOp;
+ if (tSync < minD2)
+ minD2 = tSync;
}
if (verbose) {
- long rate = 0;
- /* Use the minimum times because they are more stable - if timing
- accuracy is an issue, use the min over multiple trials */
- d1 = minD1;
- d2 = minD2;
- /* d1 = d1 / MAX_RUNS; d2 = d2 / MAX_RUNS); */
- if (d2 > 0) rate = (long)(cnt) / d2;
- /* count, op, sync, op/each, sync/each, rate */
- printf( "%d\t%e\t%e\t%e\t%e\t%ld\n", cnt,
- d1, d2,
- d1 / cnt, d2 / cnt, rate );
+ long rate = 0;
+ /* Use the minimum times because they are more stable - if timing
+ * accuracy is an issue, use the min over multiple trials */
+ d1 = minD1;
+ d2 = minD2;
+ /* d1 = d1 / MAX_RUNS; d2 = d2 / MAX_RUNS); */
+ if (d2 > 0)
+ rate = (long) (cnt) / d2;
+ /* count, op, sync, op/each, sync/each, rate */
+ printf("%d\t%e\t%e\t%e\t%e\t%ld\n", cnt, d1, d2, d1 / cnt, d2 / cnt, rate);
}
}
static int verbose = 0;
-int main( int argc, char **argv )
+int main(int argc, char **argv)
{
- int vcount = 16, vblock = vcount*vcount/2, vstride=2*vcount*vblock;
- int v2stride, typesize, packsize, i, position, errs = 0;
- char *inbuf, *outbuf, *outbuf2;
+ int vcount = 16, vblock = vcount * vcount / 2, vstride = 2 * vcount * vblock;
+ int v2stride, typesize, packsize, i, position, errs = 0;
+ char *inbuf, *outbuf, *outbuf2;
MPI_Datatype ft1type, ft2type, ft3type;
MPI_Datatype ftopttype;
- MPI_Aint lb, extent;
- double t0, t1;
- double tpack, tmanual, tpackopt;
- int ntry;
-
- MPI_Init( &argc, &argv );
-
- MPI_Type_contiguous( 6, MPI_FLOAT, &ft1type );
- MPI_Type_size( ft1type, &typesize );
+ MPI_Aint lb, extent;
+ double t0, t1;
+ double tpack, tmanual, tpackopt;
+ int ntry;
+
+ MPI_Init(&argc, &argv);
+
+ MPI_Type_contiguous(6, MPI_FLOAT, &ft1type);
+ MPI_Type_size(ft1type, &typesize);
v2stride = vcount * vcount * vcount * vcount * typesize;
- MPI_Type_vector( vcount, vblock, vstride, ft1type, &ft2type );
- MPI_Type_create_hvector( 2, 1, v2stride, ft2type, &ft3type );
- MPI_Type_commit( &ft3type );
- MPI_Type_free( &ft1type );
- MPI_Type_free( &ft2type );
+ MPI_Type_vector(vcount, vblock, vstride, ft1type, &ft2type);
+ MPI_Type_create_hvector(2, 1, v2stride, ft2type, &ft3type);
+ MPI_Type_commit(&ft3type);
+ MPI_Type_free(&ft1type);
+ MPI_Type_free(&ft2type);
#if defined(MPICH) && defined(PRINT_DATATYPE_INTERNALS)
/* To use MPIDU_Datatype_debug to print the datatype internals,
- you must configure MPICH with --enable-g=log */
+ * you must configure MPICH with --enable-g=log */
if (verbose) {
- printf( "Original datatype:\n" );
- MPIDU_Datatype_debug( ft3type, 10 );
+ printf("Original datatype:\n");
+ MPIDU_Datatype_debug(ft3type, 10);
}
#endif
/* The same type, but without using the contiguous type */
- MPI_Type_vector( vcount, 6*vblock, 6*vstride, MPI_FLOAT, &ft2type );
- MPI_Type_create_hvector( 2, 1, v2stride, ft2type, &ftopttype );
- MPI_Type_commit( &ftopttype );
- MPI_Type_free( &ft2type );
+ MPI_Type_vector(vcount, 6 * vblock, 6 * vstride, MPI_FLOAT, &ft2type);
+ MPI_Type_create_hvector(2, 1, v2stride, ft2type, &ftopttype);
+ MPI_Type_commit(&ftopttype);
+ MPI_Type_free(&ft2type);
#if defined(MPICH) && defined(PRINT_DATATYPE_INTERNALS)
if (verbose) {
- printf( "\n\nMerged datatype:\n" );
- MPIDU_Datatype_debug( ftopttype, 10 );
+ printf("\n\nMerged datatype:\n");
+ MPIDU_Datatype_debug(ftopttype, 10);
}
#endif
- MPI_Type_get_extent( ft3type, &lb, &extent );
- MPI_Type_size( ft3type, &typesize );
+ MPI_Type_get_extent(ft3type, &lb, &extent);
+ MPI_Type_size(ft3type, &typesize);
- MPI_Pack_size( 1, ft3type, MPI_COMM_WORLD, &packsize );
+ MPI_Pack_size(1, ft3type, MPI_COMM_WORLD, &packsize);
- inbuf = (char *)malloc( extent );
- outbuf = (char *)malloc( packsize );
- outbuf2 = (char *)malloc( packsize );
+ inbuf = (char *) malloc(extent);
+ outbuf = (char *) malloc(packsize);
+ outbuf2 = (char *) malloc(packsize);
if (!inbuf) {
- fprintf( stderr, "Unable to allocate %ld for inbuf\n", (long)extent );
- MPI_Abort( MPI_COMM_WORLD, 1 );
+ fprintf(stderr, "Unable to allocate %ld for inbuf\n", (long) extent);
+ MPI_Abort(MPI_COMM_WORLD, 1);
}
if (!outbuf) {
- fprintf( stderr, "Unable to allocate %ld for outbuf\n", (long)packsize );
- MPI_Abort( MPI_COMM_WORLD, 1 );
+ fprintf(stderr, "Unable to allocate %ld for outbuf\n", (long) packsize);
+ MPI_Abort(MPI_COMM_WORLD, 1);
}
if (!outbuf2) {
- fprintf( stderr, "Unable to allocate %ld for outbuf2\n", (long)packsize );
- MPI_Abort( MPI_COMM_WORLD, 1 );
+ fprintf(stderr, "Unable to allocate %ld for outbuf2\n", (long) packsize);
+ MPI_Abort(MPI_COMM_WORLD, 1);
}
- for (i=0; i<extent; i++) {
- inbuf[i] = i & 0x7f;
+ for (i = 0; i < extent; i++) {
+ inbuf[i] = i & 0x7f;
}
position = 0;
/* Warm up the code and data */
- MPI_Pack( inbuf, 1, ft3type, outbuf, packsize, &position, MPI_COMM_WORLD );
+ MPI_Pack(inbuf, 1, ft3type, outbuf, packsize, &position, MPI_COMM_WORLD);
/* Pack using the vector of vector of contiguous */
tpack = 1e12;
for (ntry = 0; ntry < 5; ntry++) {
- position = 0;
- t0 = MPI_Wtime();
- MPI_Pack( inbuf, 1, ft3type, outbuf, packsize, &position,
- MPI_COMM_WORLD );
- t1 = MPI_Wtime() - t0;
- if (t1 < tpack) tpack = t1;
+ position = 0;
+ t0 = MPI_Wtime();
+ MPI_Pack(inbuf, 1, ft3type, outbuf, packsize, &position, MPI_COMM_WORLD);
+ t1 = MPI_Wtime() - t0;
+ if (t1 < tpack)
+ tpack = t1;
}
- MPI_Type_free( &ft3type );
+ MPI_Type_free(&ft3type);
/* Pack using vector of vector with big blocks (same type map) */
tpackopt = 1e12;
for (ntry = 0; ntry < 5; ntry++) {
- position = 0;
- t0 = MPI_Wtime();
- MPI_Pack( inbuf, 1, ftopttype, outbuf, packsize, &position,
- MPI_COMM_WORLD );
- t1 = MPI_Wtime() - t0;
- if (t1 < tpackopt) tpackopt = t1;
+ position = 0;
+ t0 = MPI_Wtime();
+ MPI_Pack(inbuf, 1, ftopttype, outbuf, packsize, &position, MPI_COMM_WORLD);
+ t1 = MPI_Wtime() - t0;
+ if (t1 < tpackopt)
+ tpackopt = t1;
}
- MPI_Type_free( &ftopttype );
+ MPI_Type_free(&ftopttype);
/* User (manual) packing code.
- Note that we exploit the fact that the vector type contains vblock
- instances of a contiguous type of size 24, or equivalently a
- single block of 24*vblock bytes.
- */
+ * Note that we exploit the fact that the vector type contains vblock
+ * instances of a contiguous type of size 24, or equivalently a
+ * single block of 24*vblock bytes.
+ */
tmanual = 1e12;
for (ntry = 0; ntry < 5; ntry++) {
- const char *ppe = (const char *)inbuf;
- int k, j;
- t0 = MPI_Wtime();
- position = 0;
- for (k=0; k<2; k++) { /* hvector count; blocksize is 1 */
- const char *ptr = ppe;
- for (j=0; j<vcount; j++) { /* vector count */
- memcpy( outbuf2 + position, ptr, 24*vblock );
- ptr += vstride * 24;
- position += 24*vblock;
- }
- ppe += v2stride;
- }
- t1 = MPI_Wtime() - t0;
- if (t1 < tmanual) tmanual = t1;
-
- /* Check on correctness */
+ const char *ppe = (const char *) inbuf;
+ int k, j;
+ t0 = MPI_Wtime();
+ position = 0;
+ for (k = 0; k < 2; k++) { /* hvector count; blocksize is 1 */
+ const char *ptr = ppe;
+ for (j = 0; j < vcount; j++) { /* vector count */
+ memcpy(outbuf2 + position, ptr, 24 * vblock);
+ ptr += vstride * 24;
+ position += 24 * vblock;
+ }
+ ppe += v2stride;
+ }
+ t1 = MPI_Wtime() - t0;
+ if (t1 < tmanual)
+ tmanual = t1;
+
+ /* Check on correctness */
#ifdef PACK_IS_NATIVE
- if (memcmp( outbuf, outbuf2, position ) != 0) {
- printf( "Panic - pack buffers differ\n" );
- }
+ if (memcmp(outbuf, outbuf2, position) != 0) {
+ printf("Panic - pack buffers differ\n");
+ }
#endif
}
if (verbose) {
- printf( "Bytes packed = %d\n", position );
- printf( "MPI_Pack time = %e, opt version = %e, manual pack time = %e\n",
- tpack, tpackopt, tmanual );
+ printf("Bytes packed = %d\n", position);
+ printf("MPI_Pack time = %e, opt version = %e, manual pack time = %e\n",
+ tpack, tpackopt, tmanual);
}
/* A factor of 4 is extremely generous, especially since the test suite
- no longer builds any of the tests with optimization */
+ * no longer builds any of the tests with optimization */
if (4 * tmanual < tpack) {
- errs++;
- printf( "MPI_Pack time = %e, manual pack time = %e\n", tpack, tmanual );
- printf( "MPI_Pack time should be less than 4 times the manual time\n" );
- printf( "For most informative results, be sure to compile this test with optimization\n" );
+ errs++;
+ printf("MPI_Pack time = %e, manual pack time = %e\n", tpack, tmanual);
+ printf("MPI_Pack time should be less than 4 times the manual time\n");
+ printf("For most informative results, be sure to compile this test with optimization\n");
}
if (4 * tmanual < tpackopt) {
- errs++;
- printf( "MPI_Pack with opt = %e, manual pack time = %e\n", tpackopt,
- tmanual );
- printf( "MPI_Pack time should be less than 4 times the manual time\n" );
- printf( "For most informative results, be sure to compile this test with optimization\n" );
+ errs++;
+ printf("MPI_Pack with opt = %e, manual pack time = %e\n", tpackopt, tmanual);
+ printf("MPI_Pack time should be less than 4 times the manual time\n");
+ printf("For most informative results, be sure to compile this test with optimization\n");
}
if (errs) {
- printf( " Found %d errors\n", errs );
+ printf(" Found %d errors\n", errs);
}
else {
- printf( " No Errors\n" );
- }
-
- free( inbuf );
- free( outbuf );
- free( outbuf2 );
+ printf(" No Errors\n");
+ }
+
+ free(inbuf);
+ free(outbuf);
+ free(outbuf2);
MPI_Finalize();
return 0;
static int verbose = 0;
-int main( int argc, char **argv )
+int main(int argc, char **argv)
{
- int vcount, vstride;
- int32_t counts[2];
- int v2stride, typesize, packsize, i, position, errs = 0;
- double *outbuf, *outbuf2;
- double *vsource;
+ int vcount, vstride;
+ int32_t counts[2];
+ int packsize, i, position, errs = 0;
+ double *outbuf, *outbuf2;
+ double *vsource;
MPI_Datatype vtype, stype;
- MPI_Aint lb, extent;
- double t0, t1;
- double tspack, tvpack, tmanual;
- int ntry;
- int blocklengths[2];
- MPI_Aint displacements[2];
+ double t0, t1;
+ double tspack, tvpack, tmanual;
+ int ntry;
+ int blocklengths[2];
+ MPI_Aint displacements[2];
MPI_Datatype typesArray[2];
- MPI_Init( &argc, &argv );
-
- /* Create a struct consisting of a two 32-bit ints, followed by a
- vector of stride 3 but count 128k (less than a few MB of data area) */
- vcount = 128000;
+ MPI_Init(&argc, &argv);
+
+ /* Create a struct consisting of a two 32-bit ints, followed by a
+ * vector of stride 3 but count 128k (less than a few MB of data area) */
+ vcount = 128000;
vstride = 3;
- MPI_Type_vector( vcount, 1, vstride, MPI_DOUBLE, &vtype );
+ MPI_Type_vector(vcount, 1, vstride, MPI_DOUBLE, &vtype);
- vsource = (double *)malloc( (vcount + 1) * (vstride + 1) * sizeof(double) );
+ vsource = (double *) malloc((vcount + 1) * (vstride + 1) * sizeof(double));
if (!vsource) {
- fprintf( stderr, "Unable to allocate vsource\n" );
- MPI_Abort( MPI_COMM_WORLD, 1 );
+ fprintf(stderr, "Unable to allocate vsource\n");
+ MPI_Abort(MPI_COMM_WORLD, 1);
}
- for (i=0; i<vcount*vstride; i++) {
- vsource[i] = i;
+ for (i = 0; i < vcount * vstride; i++) {
+ vsource[i] = i;
}
- blocklengths[0] = 2; MPI_Get_address( &counts[0], &displacements[0] );
- blocklengths[1] = 1; MPI_Get_address( vsource, &displacements[1] );
+ blocklengths[0] = 2;
+ MPI_Get_address(&counts[0], &displacements[0]);
+ blocklengths[1] = 1;
+ MPI_Get_address(vsource, &displacements[1]);
if (verbose) {
- printf( "%p = %p?\n", vsource, (void *)displacements[1] );
+ printf("%p = %p?\n", vsource, (void *) displacements[1]);
}
typesArray[0] = MPI_INT32_T;
typesArray[1] = vtype;
- MPI_Type_create_struct( 2, blocklengths, displacements, typesArray,
- &stype );
- MPI_Type_commit( &stype );
- MPI_Type_commit( &vtype );
+ MPI_Type_create_struct(2, blocklengths, displacements, typesArray, &stype);
+ MPI_Type_commit(&stype);
+ MPI_Type_commit(&vtype);
#if defined(MPICH) && defined(PRINT_DATATYPE_INTERNALS)
/* To use MPIDU_Datatype_debug to print the datatype internals,
- you must configure MPICH with --enable-g=log */
+ * you must configure MPICH with --enable-g=log */
if (verbose) {
- printf( "Original struct datatype:\n" );
- MPIDU_Datatype_debug( stype, 10 );
+ printf("Original struct datatype:\n");
+ MPIDU_Datatype_debug(stype, 10);
}
#endif
- MPI_Pack_size( 1, stype, MPI_COMM_WORLD, &packsize );
- outbuf = (double *)malloc( packsize );
- outbuf2 = (double *)malloc( packsize );
+ MPI_Pack_size(1, stype, MPI_COMM_WORLD, &packsize);
+ outbuf = (double *) malloc(packsize);
+ outbuf2 = (double *) malloc(packsize);
if (!outbuf) {
- fprintf( stderr, "Unable to allocate %ld for outbuf\n", (long)packsize );
- MPI_Abort( MPI_COMM_WORLD, 1 );
+ fprintf(stderr, "Unable to allocate %ld for outbuf\n", (long) packsize);
+ MPI_Abort(MPI_COMM_WORLD, 1);
}
if (!outbuf2) {
- fprintf( stderr, "Unable to allocate %ld for outbuf2\n", (long)packsize );
- MPI_Abort( MPI_COMM_WORLD, 1 );
+ fprintf(stderr, "Unable to allocate %ld for outbuf2\n", (long) packsize);
+ MPI_Abort(MPI_COMM_WORLD, 1);
}
position = 0;
/* Warm up the code and data */
- MPI_Pack( MPI_BOTTOM, 1, stype, outbuf, packsize, &position,
- MPI_COMM_WORLD );
+ MPI_Pack(MPI_BOTTOM, 1, stype, outbuf, packsize, &position, MPI_COMM_WORLD);
tspack = 1e12;
for (ntry = 0; ntry < 5; ntry++) {
- position = 0;
- t0 = MPI_Wtime();
- MPI_Pack( MPI_BOTTOM, 1, stype, outbuf, packsize, &position,
- MPI_COMM_WORLD );
- t1 = MPI_Wtime() - t0;
- if (t1 < tspack) tspack = t1;
+ position = 0;
+ t0 = MPI_Wtime();
+ MPI_Pack(MPI_BOTTOM, 1, stype, outbuf, packsize, &position, MPI_COMM_WORLD);
+ t1 = MPI_Wtime() - t0;
+ if (t1 < tspack)
+ tspack = t1;
}
- MPI_Type_free( &stype );
+ MPI_Type_free(&stype);
/* An equivalent packing, using the 2 ints and the vector separately */
tvpack = 1e12;
for (ntry = 0; ntry < 5; ntry++) {
- position = 0;
- t0 = MPI_Wtime();
- MPI_Pack( counts, 2, MPI_INT32_T, outbuf, packsize, &position,
- MPI_COMM_WORLD );
- MPI_Pack( vsource, 1, vtype, outbuf, packsize, &position,
- MPI_COMM_WORLD );
- t1 = MPI_Wtime() - t0;
- if (t1 < tvpack) tvpack = t1;
+ position = 0;
+ t0 = MPI_Wtime();
+ MPI_Pack(counts, 2, MPI_INT32_T, outbuf, packsize, &position, MPI_COMM_WORLD);
+ MPI_Pack(vsource, 1, vtype, outbuf, packsize, &position, MPI_COMM_WORLD);
+ t1 = MPI_Wtime() - t0;
+ if (t1 < tvpack)
+ tvpack = t1;
}
- MPI_Type_free( &vtype );
+ MPI_Type_free(&vtype);
- /* Note that we exploit the fact that the vector type contains vblock
- instances of a contiguous type of size 24, or a single block of 24*vblock
- bytes.
- */
+ /* Note that we exploit the fact that the vector type contains vblock
+ * instances of a contiguous type of size 24, or a single block of 24*vblock
+ * bytes.
+ */
tmanual = 1e12;
for (ntry = 0; ntry < 5; ntry++) {
- const double * restrict ppe = (const double *)vsource;
- double * restrict ppo = outbuf2;
- int j;
- t0 = MPI_Wtime();
- position = 0;
- *(int32_t *)ppo = counts[0];
- *( ((int32_t *)ppo) + 1) = counts[1];
- ppo++;
- /* Some hand optimization because this file is not normally
- compiled with optimization by the test suite */
- j = vcount;
- while (j) {
- *ppo++ = *ppe;
- ppe += vstride;
- *ppo++ = *ppe;
- ppe += vstride;
- *ppo++ = *ppe;
- ppe += vstride;
- *ppo++ = *ppe;
- ppe += vstride;
- j -= 4;
- }
- position += (1 + vcount);
- position *= sizeof(double);
- t1 = MPI_Wtime() - t0;
- if (t1 < tmanual) tmanual = t1;
-
- /* Check on correctness */
+ const double *restrict ppe = (const double *) vsource;
+ double *restrict ppo = outbuf2;
+ int j;
+ t0 = MPI_Wtime();
+ position = 0;
+ *(int32_t *) ppo = counts[0];
+ *(((int32_t *) ppo) + 1) = counts[1];
+ ppo++;
+ /* Some hand optimization because this file is not normally
+ * compiled with optimization by the test suite */
+ j = vcount;
+ while (j) {
+ *ppo++ = *ppe;
+ ppe += vstride;
+ *ppo++ = *ppe;
+ ppe += vstride;
+ *ppo++ = *ppe;
+ ppe += vstride;
+ *ppo++ = *ppe;
+ ppe += vstride;
+ j -= 4;
+ }
+ position += (1 + vcount);
+ position *= sizeof(double);
+ t1 = MPI_Wtime() - t0;
+ if (t1 < tmanual)
+ tmanual = t1;
+
+ /* Check on correctness */
#ifdef PACK_IS_NATIVE
- if (memcmp( outbuf, outbuf2, position ) != 0) {
- printf( "Panic(manual) - pack buffers differ\n" );
- for (j=0; j<8; j++) {
- printf( "%d: %llx\t%llx\n", j, (long long unsigned)outbuf[j],
- (long long unsigned)outbuf2[j] );
- }
- }
+ if (memcmp(outbuf, outbuf2, position) != 0) {
+ printf("Panic(manual) - pack buffers differ\n");
+ for (j = 0; j < 8; j++) {
+ printf("%d: %llx\t%llx\n", j, (long long unsigned) outbuf[j],
+ (long long unsigned) outbuf2[j]);
+ }
+ }
#endif
}
if (verbose) {
- printf( "Bytes packed = %d\n", position );
- printf( "MPI_Pack time = %e (struct), = %e (vector), manual pack time = %e\n",
- tspack, tvpack, tmanual );
+ printf("Bytes packed = %d\n", position);
+ printf("MPI_Pack time = %e (struct), = %e (vector), manual pack time = %e\n",
+ tspack, tvpack, tmanual);
}
if (4 * tmanual < tspack) {
- errs++;
- printf( "MPI_Pack time using struct with vector = %e, manual pack time = %e\n", tspack, tmanual )
-;
- printf( "MPI_Pack time should be less than 4 times the manual time\n" );
- printf( "For most informative results, be sure to compile this test with optimization\n" );
+ errs++;
+ printf("MPI_Pack time using struct with vector = %e, manual pack time = %e\n", tspack,
+ tmanual);
+ printf("MPI_Pack time should be less than 4 times the manual time\n");
+ printf("For most informative results, be sure to compile this test with optimization\n");
}
if (4 * tmanual < tvpack) {
- errs++;
- printf( "MPI_Pack using vector = %e, manual pack time = %e\n", tvpack,
- tmanual );
- printf( "MPI_Pack time should be less than 4 times the manual time\n" );
- printf( "For most informative results, be sure to compile this test with optimization\n" );
+ errs++;
+ printf("MPI_Pack using vector = %e, manual pack time = %e\n", tvpack, tmanual);
+ printf("MPI_Pack time should be less than 4 times the manual time\n");
+ printf("For most informative results, be sure to compile this test with optimization\n");
}
if (4 * tvpack < tspack) {
- errs++;
- printf( "MPI_Pack using a vector = %e, using a struct with vector = %e\n", tvpack, tspack );
- printf( "MPI_Pack time using vector should be about the same as the struct containing the vector\n" );
- printf( "For most informative results, be sure to compile this test with optimization\n" );
+ errs++;
+ printf("MPI_Pack using a vector = %e, using a struct with vector = %e\n", tvpack, tspack);
+ printf
+ ("MPI_Pack time using vector should be about the same as the struct containing the vector\n");
+ printf("For most informative results, be sure to compile this test with optimization\n");
}
if (errs) {
- printf( " Found %d errors\n", errs );
+ printf(" Found %d errors\n", errs);
}
else {
- printf( " No Errors\n" );
- }
-
- free( vsource );
- free( outbuf );
- free( outbuf2 );
+ printf(" No Errors\n");
+ }
+
+ free(vsource);
+ free(outbuf);
+ free(outbuf2);
MPI_Finalize();
return 0;
static int verbose = 0;
-int main(int argc, char* argv[])
+/*
+ * Time ITER MPI_Gather calls rooted at rank 0, then ITER calls rooted at
+ * rank 1, and have rank 0 report an error when the rank-1 average exceeds
+ * the rank-0 average by more than a factor of (1 + ERROR_MARGIN).
+ */
+int main(int argc, char *argv[])
{
- char *sbuf, *rbuf;
- int i, j;
- double t1, t2, t, ts;
- int rank, size;
- MPI_Status status;
+ char *sbuf, *rbuf;
+ int i, j;
+ double t1, t2, t, ts;
+ int rank, size;
+ MPI_Status status;
- MPI_Init(&argc,&argv);
- MPI_Comm_rank(MPI_COMM_WORLD,&rank);
- MPI_Comm_size(MPI_COMM_WORLD, &size);
+ MPI_Init(&argc, &argv);
+ MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+ MPI_Comm_size(MPI_COMM_WORLD, &size);
- if (getenv("MPITEST_VERBOSE")) verbose = 1;
+ if (getenv("MPITEST_VERBOSE"))
+ verbose = 1;
- /* Allocate memory regions to communicate */
- sbuf = (char*) malloc(SIZE);
- rbuf = (char*) malloc(size * SIZE);
+ /* Allocate memory regions to communicate */
+ sbuf = (char *) malloc(SIZE);
+ rbuf = (char *) malloc(size * SIZE);
+ /* Abort before the touch loops below write through a NULL pointer */
+ if (!sbuf || !rbuf) {
+ fprintf(stderr, "Could not allocate communication buffers\n");
+ MPI_Abort(MPI_COMM_WORLD, 1);
+ }
- /* Touch the buffers to make sure they are allocated */
- for (i = 0; i < SIZE; i++) sbuf[i] = '0';
- for (i = 0; i < SIZE * size; i++) rbuf[i] = '0';
+ /* Touch the buffers to make sure they are allocated */
+ for (i = 0; i < SIZE; i++)
+ sbuf[i] = '0';
+ for (i = 0; i < SIZE * size; i++)
+ rbuf[i] = '0';
- /* Time when rank 0 gathers the data */
- MPI_Barrier(MPI_COMM_WORLD);
- t1 = MPI_Wtime();
- for (i = 0; i < ITER; i++) {
- MPI_Gather(sbuf, SIZE, MPI_BYTE, rbuf, SIZE, MPI_BYTE, 0, MPI_COMM_WORLD);
- MPI_Barrier(MPI_COMM_WORLD);
- }
- t2 = MPI_Wtime();
- t = (t2-t1)/ITER;
+ /* Time when rank 0 gathers the data */
+ MPI_Barrier(MPI_COMM_WORLD);
+ t1 = MPI_Wtime();
+ for (i = 0; i < ITER; i++) {
+ MPI_Gather(sbuf, SIZE, MPI_BYTE, rbuf, SIZE, MPI_BYTE, 0, MPI_COMM_WORLD);
+ MPI_Barrier(MPI_COMM_WORLD);
+ }
+ t2 = MPI_Wtime();
+ t = (t2 - t1) / ITER;
- /* Time when rank 1 gathers the data */
- MPI_Barrier(MPI_COMM_WORLD);
- t1 = MPI_Wtime();
- for (j = 0; j < ITER; j++) {
- MPI_Gather(sbuf, SIZE, MPI_BYTE, rbuf, SIZE, MPI_BYTE, 1, MPI_COMM_WORLD);
- MPI_Barrier(MPI_COMM_WORLD);
- }
- t2 = MPI_Wtime();
- ts = (t2-t1)/ITER;
+ /* Time when rank 1 gathers the data */
+ MPI_Barrier(MPI_COMM_WORLD);
+ t1 = MPI_Wtime();
+ for (j = 0; j < ITER; j++) {
+ MPI_Gather(sbuf, SIZE, MPI_BYTE, rbuf, SIZE, MPI_BYTE, 1, MPI_COMM_WORLD);
+ MPI_Barrier(MPI_COMM_WORLD);
+ }
+ t2 = MPI_Wtime();
+ ts = (t2 - t1) / ITER;
- if (rank == 1)
- MPI_Send(&ts, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
- if (rank == 0)
- MPI_Recv(&ts, 1, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &status);
+ if (rank == 1)
+ MPI_Send(&ts, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
+ if (rank == 0)
+ MPI_Recv(&ts, 1, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &status);
- /* Print out the results */
- if (!rank) {
- if ((ts / t) > (1 + ERROR_MARGIN)) { /* If the difference is more than 10%, it's an error */
- printf("%.3f\t%.3f\n", 1000000.0 * ts, 1000000.0 * t);
- printf("Too much difference in performance\n");
- }
- else printf(" No Errors\n");
- }
-
- MPI_Finalize();
- free(sbuf);
- free(rbuf);
- return 0;
+ /* Print out the results */
+ if (!rank) {
+ if ((ts / t) > (1 + ERROR_MARGIN)) { /* If the difference is more than 10%, it's an error */
+ printf("%.3f\t%.3f\n", 1000000.0 * ts, 1000000.0 * t);
+ printf("Too much difference in performance\n");
+ }
+ else
+ printf(" No Errors\n");
+ }
+
+ MPI_Finalize();
+ /* Release the gather buffers, as the pre-reformat code did */
+ free(sbuf);
+ free(rbuf);
+
+ return 0;
}
#include <stdlib.h>
#define MAXTESTS 32
-#define ERROR_MARGIN 1.0 /* FIXME: This number is pretty much randomly chosen */
+#define ERROR_MARGIN 1.0 /* FIXME: This number is pretty much randomly chosen */
static int verbose = 0;
-int main( int argc, char *argv[] )
+int main(int argc, char *argv[])
{
int wsize, wrank, partner, len, maxlen, k, reps, repsleft;
double t1;
char *rbuf, *sbuf;
double times[3][MAXTESTS];
- MPI_Init( &argc, &argv );
- if (getenv("MPITEST_VERBOSE")) verbose = 1;
+ MPI_Init(&argc, &argv);
+ if (getenv("MPITEST_VERBOSE"))
+ verbose = 1;
+
+ MPI_Comm_size(MPI_COMM_WORLD, &wsize);
+ MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
- MPI_Comm_size( MPI_COMM_WORLD, &wsize );
- MPI_Comm_rank( MPI_COMM_WORLD, &wrank );
-
if (wsize < 2) {
- fprintf( stderr, "This program requires at least 2 processes\n" );
- MPI_Abort( MPI_COMM_WORLD, 1 );
+ fprintf(stderr, "This program requires at least 2 processes\n");
+ MPI_Abort(MPI_COMM_WORLD, 1);
}
/* Set partner based on whether rank is odd or even */
if (wrank & 0x1) {
- partner = wrank - 1;
+ partner = wrank - 1;
}
else if (wrank < wsize - 1) {
- partner = wrank + 1;
+ partner = wrank + 1;
}
- else
- /* Handle wsize odd */
- partner = MPI_PROC_NULL;
+ else
+ /* Handle wsize odd */
+ partner = MPI_PROC_NULL;
/* Allocate and initialize buffers */
- maxlen = 1024*1024;
- rbuf = (char *)malloc( maxlen );
- sbuf = (char *)malloc( maxlen );
+ maxlen = 1024 * 1024;
+ rbuf = (char *) malloc(maxlen);
+ sbuf = (char *) malloc(maxlen);
if (!rbuf || !sbuf) {
- fprintf( stderr, "Could not allocate %d byte buffers\n", maxlen );
- MPI_Abort( MPI_COMM_WORLD, 2 );
+ fprintf(stderr, "Could not allocate %d byte buffers\n", maxlen);
+ MPI_Abort(MPI_COMM_WORLD, 2);
}
- for (k=0; k<maxlen; k++) {
- rbuf[k] = 0;
- sbuf[k] = 0;
+ for (k = 0; k < maxlen; k++) {
+ rbuf[k] = 0;
+ sbuf[k] = 0;
}
-
- MPI_Barrier( MPI_COMM_WORLD );
+
+ MPI_Barrier(MPI_COMM_WORLD);
/* Test Irecv and send, head to head */
if (wrank == 0 && verbose) {
- printf( "Irecv-send\n" );
- printf( "len\ttime \trate\n" );
+ printf("Irecv-send\n");
+ printf("len\ttime \trate\n");
}
/* Send powers of 2 bytes */
len = 1;
- for (k=0; k<20; k++) {
- /* We use a simple linear form for the number of tests to
- reduce the impact of the granularity of the timer */
- reps = 50-k;
- repsleft = reps;
- /* Make sure that both processes are ready to start */
- MPI_Sendrecv( MPI_BOTTOM, 0, MPI_BYTE, partner, 0,
- MPI_BOTTOM, 0, MPI_BYTE, partner, 0, MPI_COMM_WORLD,
- MPI_STATUS_IGNORE );
- t1 = MPI_Wtime();
- while (repsleft--) {
- MPI_Irecv( rbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD, &rreq );
- MPI_Send( sbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD );
- MPI_Wait( &rreq, MPI_STATUS_IGNORE );
- }
- t1 = MPI_Wtime() - t1;
- times[0][k] = t1 / reps;
- if (wrank == 0) {
- t1 = t1 / reps;
- if (t1 > 0) {
- t1 = t1 * 1.e6;
- if (verbose)
- printf( "%d\t%g\t%g\n", len, t1, len/t1 );
- }
- else {
- t1 = t1 * 1.e6;
- if (verbose)
- printf( "%d\t%g\tINF\n", len, t1 );
- }
- if (verbose)
- fflush( stdout );
- }
+ for (k = 0; k < 20; k++) {
+ /* We use a simple linear form for the number of tests to
+ * reduce the impact of the granularity of the timer */
+ reps = 50 - k;
+ repsleft = reps;
+ /* Make sure that both processes are ready to start */
+ MPI_Sendrecv(MPI_BOTTOM, 0, MPI_BYTE, partner, 0,
+ MPI_BOTTOM, 0, MPI_BYTE, partner, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+ t1 = MPI_Wtime();
+ while (repsleft--) {
+ MPI_Irecv(rbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD, &rreq);
+ MPI_Send(sbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD);
+ MPI_Wait(&rreq, MPI_STATUS_IGNORE);
+ }
+ t1 = MPI_Wtime() - t1;
+ times[0][k] = t1 / reps;
+ if (wrank == 0) {
+ t1 = t1 / reps;
+ if (t1 > 0) {
+ t1 = t1 * 1.e6;
+ if (verbose)
+ printf("%d\t%g\t%g\n", len, t1, len / t1);
+ }
+ else {
+ t1 = t1 * 1.e6;
+ if (verbose)
+ printf("%d\t%g\tINF\n", len, t1);
+ }
+ if (verbose)
+ fflush(stdout);
+ }
- len *= 2;
+ len *= 2;
}
- MPI_Barrier( MPI_COMM_WORLD );
+ MPI_Barrier(MPI_COMM_WORLD);
/* Test Sendrecv, head to head */
if (wrank == 0 && verbose) {
- printf( "Sendrecv\n" );
- printf( "len\ttime (usec)\trate (MB/s)\n" );
+ printf("Sendrecv\n");
+ printf("len\ttime (usec)\trate (MB/s)\n");
}
/* Send powers of 2 bytes */
len = 1;
- for (k=0; k<20; k++) {
- /* We use a simple linear form for the number of tests to
- reduce the impact of the granularity of the timer */
- reps = 50-k;
- repsleft = reps;
- /* Make sure that both processes are ready to start */
- MPI_Sendrecv( MPI_BOTTOM, 0, MPI_BYTE, partner, 0,
- MPI_BOTTOM, 0, MPI_BYTE, partner, 0, MPI_COMM_WORLD,
- MPI_STATUS_IGNORE );
- t1 = MPI_Wtime();
- while (repsleft--) {
- MPI_Sendrecv( sbuf, len, MPI_BYTE, partner, k,
- rbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD,
- MPI_STATUS_IGNORE );
- }
- t1 = MPI_Wtime() - t1;
- times[1][k] = t1 / reps;
- if (wrank == 0) {
- t1 = t1 / reps;
- if (t1 > 0) {
- t1 = t1 * 1.e6;
- if (verbose)
- printf( "%d\t%g\t%g\n", len, t1, len/t1 );
- }
- else {
- t1 = t1 * 1.e6;
- if (verbose)
- printf( "%d\t%g\tINF\n", len, t1 );
- }
- if (verbose)
- fflush( stdout );
- }
+ for (k = 0; k < 20; k++) {
+ /* We use a simple linear form for the number of tests to
+ * reduce the impact of the granularity of the timer */
+ reps = 50 - k;
+ repsleft = reps;
+ /* Make sure that both processes are ready to start */
+ MPI_Sendrecv(MPI_BOTTOM, 0, MPI_BYTE, partner, 0,
+ MPI_BOTTOM, 0, MPI_BYTE, partner, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+ t1 = MPI_Wtime();
+ while (repsleft--) {
+ MPI_Sendrecv(sbuf, len, MPI_BYTE, partner, k,
+ rbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+ }
+ t1 = MPI_Wtime() - t1;
+ times[1][k] = t1 / reps;
+ if (wrank == 0) {
+ t1 = t1 / reps;
+ if (t1 > 0) {
+ t1 = t1 * 1.e6;
+ if (verbose)
+ printf("%d\t%g\t%g\n", len, t1, len / t1);
+ }
+ else {
+ t1 = t1 * 1.e6;
+ if (verbose)
+ printf("%d\t%g\tINF\n", len, t1);
+ }
+ if (verbose)
+ fflush(stdout);
+ }
- len *= 2;
+ len *= 2;
}
- MPI_Barrier( MPI_COMM_WORLD );
+ MPI_Barrier(MPI_COMM_WORLD);
/* Test Send/recv, ping-pong */
if (wrank == 0 && verbose) {
- printf( "Pingpong\n" );
- printf( "len\ttime (usec)\trate (MB/s)\n" );
+ printf("Pingpong\n");
+ printf("len\ttime (usec)\trate (MB/s)\n");
}
/* Send powers of 2 bytes */
len = 1;
- for (k=0; k<20; k++) {
- /* We use a simple linear form for the number of tests to
- reduce the impact of the granularity of the timer */
- reps = 50-k;
- repsleft = reps;
- /* Make sure that both processes are ready to start */
- MPI_Sendrecv( MPI_BOTTOM, 0, MPI_BYTE, partner, 0,
- MPI_BOTTOM, 0, MPI_BYTE, partner, 0, MPI_COMM_WORLD,
- MPI_STATUS_IGNORE );
- t1 = MPI_Wtime();
- while (repsleft--) {
- if (wrank & 0x1) {
- MPI_Send( sbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD );
- MPI_Recv( rbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD,
- MPI_STATUS_IGNORE );
- }
- else {
- MPI_Recv( rbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD,
- MPI_STATUS_IGNORE );
- MPI_Send( sbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD );
- }
- }
- t1 = MPI_Wtime() - t1;
- times[2][k] = t1 / reps;
- if (wrank == 0) {
- t1 = t1 / reps;
- if (t1 > 0) {
- t1 = t1 * 1.e6;
- if (verbose)
- printf( "%d\t%g\t%g\n", len, t1, len/t1 );
- }
- else {
- t1 = t1 * 1.e6;
- if (verbose)
- printf( "%d\t%g\tINF\n", len, t1 );
- }
- if (verbose)
- fflush( stdout );
- }
+ for (k = 0; k < 20; k++) {
+ /* We use a simple linear form for the number of tests to
+ * reduce the impact of the granularity of the timer */
+ reps = 50 - k;
+ repsleft = reps;
+ /* Make sure that both processes are ready to start */
+ MPI_Sendrecv(MPI_BOTTOM, 0, MPI_BYTE, partner, 0,
+ MPI_BOTTOM, 0, MPI_BYTE, partner, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+ t1 = MPI_Wtime();
+ while (repsleft--) {
+ if (wrank & 0x1) {
+ MPI_Send(sbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD);
+ MPI_Recv(rbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+ }
+ else {
+ MPI_Recv(rbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+ MPI_Send(sbuf, len, MPI_BYTE, partner, k, MPI_COMM_WORLD);
+ }
+ }
+ t1 = MPI_Wtime() - t1;
+ times[2][k] = t1 / reps;
+ if (wrank == 0) {
+ t1 = t1 / reps;
+ if (t1 > 0) {
+ t1 = t1 * 1.e6;
+ if (verbose)
+ printf("%d\t%g\t%g\n", len, t1, len / t1);
+ }
+ else {
+ t1 = t1 * 1.e6;
+ if (verbose)
+ printf("%d\t%g\tINF\n", len, t1);
+ }
+ if (verbose)
+ fflush(stdout);
+ }
- len *= 2;
+ len *= 2;
}
-
-
+
+
/* At this point, we could optionally analyze the results and report
- success or failure based on some criteria, such as near monotone
- increases in bandwidth. This test was created because of a
- fall-off in performance noted in the ch3:sock device:channel */
+ * success or failure based on some criteria, such as near monotone
+ * increases in bandwidth. This test was created because of a
+ * fall-off in performance noted in the ch3:sock device:channel */
if (wrank == 0) {
- int nPerfErrors = 0;
- len = 1;
- for (k=0; k<20; k++) {
- double T0,T1,T2;
- T0 = times[0][k] * 1.e6;
- T1 = times[1][k] * 1.e6;
- T2 = times[2][k] * 1.e6;
- if (verbose)
- printf( "%d\t%12.2f\t%12.2f\t%12.2f\n", len, T0, T1, T2 );
- /* Lets look at long messages only */
- if (k > 10) {
- double T0Old, T1Old, T2Old;
- T0Old = times[0][k-1] * 1.0e6;
- T1Old = times[1][k-1] * 1.0e6;
- T2Old = times[2][k-1] * 1.0e6;
- if (T0 > (2+ERROR_MARGIN) * T0Old) {
- nPerfErrors++;
- if (verbose)
- printf( "Irecv-Send:\t%d\t%12.2f\t%12.2f\n", len, T0Old, T0 );
- }
- if (T1 > (2+ERROR_MARGIN) * T1Old) {
- nPerfErrors++;
- if (verbose)
- printf( "Sendrecv:\t%d\t%12.2f\t%12.2f\n", len, T1Old, T1 );
- }
- if (T2 > (2+ERROR_MARGIN) * T2Old) {
- nPerfErrors++;
- if (verbose)
- printf( "Pingpong:\t%d\t%12.2f\t%12.2f\n", len, T2Old, T2 );
- }
- }
- len *= 2;
- }
- if (nPerfErrors > 8) {
- /* Allow for 1-2 errors for eager-rendezvous shifting
- * point and cache effects. There should be a better way
- * of doing this. */
- printf( " Found %d performance errors\n", nPerfErrors );
- }
- else {
- printf( " No Errors\n" );
- }
- fflush( stdout );
+ int nPerfErrors = 0;
+ len = 1;
+ for (k = 0; k < 20; k++) {
+ double T0, T1, T2;
+ T0 = times[0][k] * 1.e6;
+ T1 = times[1][k] * 1.e6;
+ T2 = times[2][k] * 1.e6;
+ if (verbose)
+ printf("%d\t%12.2f\t%12.2f\t%12.2f\n", len, T0, T1, T2);
+ /* Let's look at long messages only */
+ if (k > 10) {
+ double T0Old, T1Old, T2Old;
+ T0Old = times[0][k - 1] * 1.0e6;
+ T1Old = times[1][k - 1] * 1.0e6;
+ T2Old = times[2][k - 1] * 1.0e6;
+ if (T0 > (2 + ERROR_MARGIN) * T0Old) {
+ nPerfErrors++;
+ if (verbose)
+ printf("Irecv-Send:\t%d\t%12.2f\t%12.2f\n", len, T0Old, T0);
+ }
+ if (T1 > (2 + ERROR_MARGIN) * T1Old) {
+ nPerfErrors++;
+ if (verbose)
+ printf("Sendrecv:\t%d\t%12.2f\t%12.2f\n", len, T1Old, T1);
+ }
+ if (T2 > (2 + ERROR_MARGIN) * T2Old) {
+ nPerfErrors++;
+ if (verbose)
+ printf("Pingpong:\t%d\t%12.2f\t%12.2f\n", len, T2Old, T2);
+ }
+ }
+ len *= 2;
+ }
+ if (nPerfErrors > 8) {
+ /* Allow for 1-2 errors for eager-rendezvous shifting
+ * point and cache effects. There should be a better way
+ * of doing this. */
+ printf(" Found %d performance errors\n", nPerfErrors);
+ }
+ else {
+ printf(" No Errors\n");
+ }
+ fflush(stdout);
}
- free( sbuf );
- free( rbuf );
+ free(sbuf);
+ free(rbuf);
MPI_Finalize();
sendrecvl 2
twovec 1 xfail=ticket1788
#Need MPI_Pack
-#dtpack 1 xfail=ticket1789
-#nestvec 1 xfail=ticket1788
-#nestvec2 1 xfail=ticket1788
-#indexperf 1 xfail=ticket1788
+dtpack 1 xfail=ticket1789
+nestvec 1 xfail=ticket1788
+nestvec2 1 xfail=ticket1788
+indexperf 1 xfail=ticket1788
non_zero_root 4
timer 1
# The commcreatep test looks at how communicator creation scales with group
* See COPYRIGHT in top-level directory.
*/
-/*
+/*
* Check that the timer produces monotone nondecreasing times and that
* the Tick is reasonable
*/
#define MAX_TIMER_TEST 5000
-int main(int argc, char* argv[])
+int main(int argc, char *argv[])
{
double t1[MAX_TIMER_TEST], tick[MAX_TIMER_TEST], tickval;
double minDiff, maxDiff, diff;
int i, nZeros = 0;
int errs = 0;
- MTest_Init(&argc,&argv);
+ MTest_Init(&argc, &argv);
- for (i=0; i<MAX_TIMER_TEST; i++) {
- t1[i] = MPI_Wtime();
+ for (i = 0; i < MAX_TIMER_TEST; i++) {
+ t1[i] = MPI_Wtime();
}
- for (i=0; i<MAX_TIMER_TEST; i++) {
- tick[i] = MPI_Wtick();
+ for (i = 0; i < MAX_TIMER_TEST; i++) {
+ tick[i] = MPI_Wtick();
}
/* Look at the values */
/* Look at the tick */
tickval = MPI_Wtick();
- for (i=0; i<MAX_TIMER_TEST; i++) {
- if (tickval != tick[i]) {
- fprintf( stderr, "Nonconstant value for MPI_Wtick: %e != %e\n",
- tickval, tick[i] );
- errs ++;
- }
+ for (i = 0; i < MAX_TIMER_TEST; i++) {
+ if (tickval != tick[i]) {
+ fprintf(stderr, "Nonconstant value for MPI_Wtick: %e != %e\n", tickval, tick[i]);
+ errs++;
+ }
}
/* Look at the timer */
minDiff = 1.e20;
maxDiff = -1.0;
- nZeros = 0;
- for (i=1; i<MAX_TIMER_TEST; i++) {
- diff = t1[i] - t1[i-1];
- if (diff == 0.0) nZeros++;
- else if (diff < minDiff) minDiff = diff;
- if (diff > maxDiff) maxDiff = diff;
+ nZeros = 0;
+ for (i = 1; i < MAX_TIMER_TEST; i++) {
+ diff = t1[i] - t1[i - 1];
+ if (diff == 0.0)
+ nZeros++;
+ else if (diff < minDiff)
+ minDiff = diff;
+ if (diff > maxDiff)
+ maxDiff = diff;
}
/* Are the time diff values and tick values consistent */
if (verbose) {
- printf( "Tick = %e, timer range = [%e,%e]\n", tickval, minDiff,
- maxDiff );
- if (nZeros) printf( "Wtime difference was 0 %d times\n", nZeros );
- }
+ printf("Tick = %e, timer range = [%e,%e]\n", tickval, minDiff, maxDiff);
+ if (nZeros)
+ printf("Wtime difference was 0 %d times\n", nZeros);
+ }
MTest_Finalize(errs);
MPI_Finalize();
#define SIZE 100
#define ITER 100
-int main(int argc, char* argv[])
+int main(int argc, char *argv[])
{
int i, j, k;
- static double a[SIZE][SIZE],b[SIZE][SIZE];
- double t1,t2,t,ts,tst;
+ static double a[SIZE][SIZE], b[SIZE][SIZE];
+ double t1, t2, t, ts, tst;
double temp;
int myrank, mysize, errs = 0;
MPI_Status status;
MPI_Datatype col, xpose;
- MTest_Init( &argc, &argv );
- MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
- MPI_Comm_size( MPI_COMM_WORLD, &mysize );
+ MTest_Init(&argc, &argv);
+ MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+ MPI_Comm_size(MPI_COMM_WORLD, &mysize);
if (mysize != 2) {
- fprintf( stderr, "This test must be run with 2 processes\n" );
- MPI_Abort( MPI_COMM_WORLD, 1 );
+ fprintf(stderr, "This test must be run with 2 processes\n");
+ MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_Type_extent(MPI_DOUBLE, &sizeofreal);
-
+
MPI_Type_vector(SIZE, 1, SIZE, MPI_DOUBLE, &col);
MPI_Type_hvector(SIZE, 1, sizeofreal, col, &xpose);
MPI_Type_commit(&xpose);
/* Preset the arrays so that they're in memory */
- for (i=0; i<SIZE; i++)
- for (j=0; j<SIZE; j++) {
- a[i][j]=0;
- b[i][j]=0;
- }
- a[SIZE-1][0] = 1;
+ for (i = 0; i < SIZE; i++)
+ for (j = 0; j < SIZE; j++) {
+ a[i][j] = 0;
+ b[i][j] = 0;
+ }
+ a[SIZE - 1][0] = 1;
/* Time the transpose example */
MPI_Barrier(MPI_COMM_WORLD);
- t1=MPI_Wtime();
- for(i=0;i< ITER; i++)
- {
- if(myrank==0)
- MPI_Send(&a[0][0],SIZE*SIZE,MPI_DOUBLE,1,0,MPI_COMM_WORLD);
- else
- MPI_Recv(&b[0][0],1,xpose,0,0,MPI_COMM_WORLD,&status);
- }
- t2=MPI_Wtime();
- t=(t2-t1)/ITER;
+ t1 = MPI_Wtime();
+ for (i = 0; i < ITER; i++) {
+ if (myrank == 0)
+ MPI_Send(&a[0][0], SIZE * SIZE, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD);
+ else
+ MPI_Recv(&b[0][0], 1, xpose, 0, 0, MPI_COMM_WORLD, &status);
+ }
+ t2 = MPI_Wtime();
+ t = (t2 - t1) / ITER;
/* Time sending the same amount of data, but without the transpose */
MPI_Barrier(MPI_COMM_WORLD);
- t1=MPI_Wtime();
- for(i=0; i< ITER; i++){
- if(myrank==0)
- {
- MPI_Send(&a[0][0],sizeof(a),MPI_BYTE,1,0,MPI_COMM_WORLD);
- }
- else {
- MPI_Recv(&b[0][0],sizeof(b),MPI_BYTE,0,0,MPI_COMM_WORLD,&status);
- }
+ t1 = MPI_Wtime();
+ for (i = 0; i < ITER; i++) {
+ if (myrank == 0) {
+ MPI_Send(&a[0][0], sizeof(a), MPI_BYTE, 1, 0, MPI_COMM_WORLD);
+ }
+ else {
+ MPI_Recv(&b[0][0], sizeof(b), MPI_BYTE, 0, 0, MPI_COMM_WORLD, &status);
+ }
}
- t2=MPI_Wtime();
- ts=(t2-t1)/ITER;
+ t2 = MPI_Wtime();
+ ts = (t2 - t1) / ITER;
/* Time sending the same amount of data, with the transpose done
- as a separate step */
+ * as a separate step */
MPI_Barrier(MPI_COMM_WORLD);
- t1=MPI_Wtime();
- for(k=0; k< ITER; k++){
- if(myrank==0)
- {
- MPI_Send(&a[0][0],sizeof(a),MPI_BYTE,1,0,MPI_COMM_WORLD);
- }
- else {
- MPI_Recv(&b[0][0],sizeof(b),MPI_BYTE,0,0,MPI_COMM_WORLD,&status);
- for(i=0;i<SIZE;i++)
- for(j=i;j<SIZE;j++) {
- temp=b[j][i];
- b[j][i]=b[i][j];
- b[i][j]=temp;
- }
- }
+ t1 = MPI_Wtime();
+ for (k = 0; k < ITER; k++) {
+ if (myrank == 0) {
+ MPI_Send(&a[0][0], sizeof(a), MPI_BYTE, 1, 0, MPI_COMM_WORLD);
+ }
+ else {
+ MPI_Recv(&b[0][0], sizeof(b), MPI_BYTE, 0, 0, MPI_COMM_WORLD, &status);
+ for (i = 0; i < SIZE; i++)
+ for (j = i; j < SIZE; j++) {
+ temp = b[j][i];
+ b[j][i] = b[i][j];
+ b[i][j] = temp;
+ }
+ }
}
- t2=MPI_Wtime();
- tst=(t2-t1)/ITER;
+ t2 = MPI_Wtime();
+ tst = (t2 - t1) / ITER;
/* Print out the results */
if (myrank == 1) {
- /* if t and tst are too different, then there is a performance
- problem in the handling of the datatypes */
-
- if (t > 2 * tst) {
- errs ++;
- fprintf( stderr, "Transpose time with datatypes is more than twice time without datatypes\n" );
- fprintf( stderr, "%f\t%f\t%f\n", t, ts, tst );
- }
+ /* if t and tst are too different, then there is a performance
+ * problem in the handling of the datatypes */
+
+ if (t > 2 * tst) {
+ errs++;
+ fprintf(stderr,
+ "Transpose time with datatypes is more than twice time without datatypes\n");
+ fprintf(stderr, "%f\t%f\t%f\n", t, ts, tst);
+ }
}
MPI_Type_free(&col);
MPI_Type_free(&xpose);
- MTest_Finalize( errs );
+ MTest_Finalize(errs);
MPI_Finalize();
return 0;
}
*/
#define SKIP 4
-#define NUM_SIZES 15
+#define NUM_SIZES 16
#define FRACTION 1.0
/* Don't make the number of loops too high; we create so many
MPI_Init(&argc, &argv);
tmean = 0;
- size = 1;
+ size = 1;
for (i = -SKIP; i < NUM_SIZES; i++) {
nrows = ncols = size;
t[i] = MPI_Wtime() - ttmp;
if (t[i] < 100 * MPI_Wtick()) {
/* Time is too inaccurate to use. Set to zero.
- Consider increasing the LOOPS value to make this
- time large enough */
+ * Consider increasing the LOOPS value to make this
+ * time large enough */
t[i] = 0;
}
tmean += t[i];
tmean /= NUM_SIZES;
/* Now, analyze the times to see that they do not grow too fast
- as a function of size. As that is a vague criteria, we do the
- following as a simple test:
- Compute the mean of the first half and the second half of the
- data
- Compare the two means
- If the mean of the second half is more than FRACTION times the
- mean of the first half, then the time may be growing too fast.
+ * as a function of size. As that is a vague criterion, we do the
+ * following as a simple test:
+ * Compute the mean of the first half and the second half of the
+ * data
+ * Compare the two means
+ * If the mean of the second half is more than FRACTION times the
+ * mean of the first half, then the time may be growing too fast.
*/
tMeanLower = tMeanHigher = 0;
- for (i=0; i<NUM_SIZES/2; i++)
+ for (i = 0; i < NUM_SIZES / 2; i++)
tMeanLower += t[i];
- tMeanLower /= (NUM_SIZES/2);
- for (i=NUM_SIZES/2; i<NUM_SIZES; i++)
+ tMeanLower /= (NUM_SIZES / 2);
+ for (i = NUM_SIZES / 2; i < NUM_SIZES; i++)
tMeanHigher += t[i];
- tMeanHigher /= (NUM_SIZES - NUM_SIZES/2);
+ tMeanHigher /= (NUM_SIZES - NUM_SIZES / 2);
/* A large value (even 1 or greater) is a good choice for
- FRACTION here - the goal is to detect significant growth in
- execution time as the size increases, and there is no MPI
- standard requirement here to meet.
-
- If the times were too small, then the test also passes - the
- goal is to find implementation problems that lead to excessive
- time in these routines.
- */
- if (tMeanLower > 0 && tMeanHigher > (1 + FRACTION) * tMeanLower) errs++;
+ * FRACTION here - the goal is to detect significant growth in
+ * execution time as the size increases, and there is no MPI
+ * standard requirement here to meet.
+ *
+ * If the times were too small, then the test also passes - the
+ * goal is to find implementation problems that lead to excessive
+ * time in these routines.
+ */
+ if (tMeanLower > 0 && tMeanHigher > (1 + FRACTION) * tMeanLower)
+ errs++;
if (errs) {
fprintf(stderr, "too much difference in performance: ");