Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
SMPI colls is not really C++. But cleaner than before.
[simgrid.git] / src / smpi / colls / smpi_mvapich2_selector_stampede.h
1 /* selector for collective algorithms based on mvapich decision logic, with calibration from Stampede cluster at TACC*/
2 /* This is the tuning used by MVAPICH for Stampede platform based on (MV2_ARCH_INTEL_XEON_E5_2680_16, MV2_HCA_MLX_CX_FDR) */
3
4 /* Copyright (c) 2009-2017. The SimGrid Team. All rights reserved.          */
5
6 /* This program is free software; you can redistribute it and/or modify it
7  * under the terms of the license (GNU LGPL) which comes with this package. */
8
9 /************ Alltoall variables and initializers                        */
10
11 #define MV2_MAX_NB_THRESHOLDS  32
12
13 using namespace simgrid::smpi;
14
15 typedef struct {
16   int min;
17   int max;
18   int (*MV2_pt_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
19       void *recvbuf, int recvcount, MPI_Datatype recvtype,
20       MPI_Comm comm_ptr );
21 } mv2_alltoall_tuning_element;
22
23 typedef struct {
24   int numproc;
25   int size_table;
26   mv2_alltoall_tuning_element algo_table[MV2_MAX_NB_THRESHOLDS];
27   mv2_alltoall_tuning_element in_place_algo_table[MV2_MAX_NB_THRESHOLDS];
28 } mv2_alltoall_tuning_table;
29
30 int (*MV2_Alltoall_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm_ptr)=NULL;
31
32 /* Indicates number of processes per node */
33 int *mv2_alltoall_table_ppn_conf = NULL;
34 /* Indicates total number of configurations */
35 int mv2_alltoall_num_ppn_conf = 1;
36 int *mv2_size_alltoall_tuning_table = NULL;
37 mv2_alltoall_tuning_table **mv2_alltoall_thresholds_table = NULL;
38
39
40 #define MPIR_Alltoall_bruck_MV2 Coll_alltoall_bruck::alltoall
41 #define MPIR_Alltoall_RD_MV2 Coll_alltoall_rdb::alltoall
42 #define MPIR_Alltoall_Scatter_dest_MV2 Coll_alltoall_mvapich2_scatter_dest::alltoall
43 #define MPIR_Alltoall_pairwise_MV2 Coll_alltoall_pair::alltoall
44 #define MPIR_Alltoall_inplace_MV2 Coll_alltoall_ring::alltoall 
45
46
47 static void init_mv2_alltoall_tables_stampede(){
48   int i;
49   int agg_table_sum = 0;
50   mv2_alltoall_tuning_table **table_ptrs = NULL;
51   mv2_alltoall_num_ppn_conf = 3;
52   if(smpi_coll_cleanup_callback==NULL)
53     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
54   mv2_alltoall_thresholds_table = static_cast<mv2_alltoall_tuning_table**>(xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
55       * mv2_alltoall_num_ppn_conf));
56   table_ptrs = static_cast<mv2_alltoall_tuning_table**>(xbt_malloc(sizeof(mv2_alltoall_tuning_table *)
57       * mv2_alltoall_num_ppn_conf));
58   mv2_size_alltoall_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
59       mv2_alltoall_num_ppn_conf));
60   mv2_alltoall_table_ppn_conf = static_cast<int*>(xbt_malloc(mv2_alltoall_num_ppn_conf * sizeof(int)));
61   mv2_alltoall_table_ppn_conf[0] = 1;
62   mv2_size_alltoall_tuning_table[0] = 6;
63   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_1ppn[] = {
64       {2,
65           1,
66           {{0, -1, &MPIR_Alltoall_pairwise_MV2},
67           },
68
69           {{0, -1, &MPIR_Alltoall_inplace_MV2},
70           },
71       },
72
73       {4,
74           2,
75           {{0, 262144, &MPIR_Alltoall_Scatter_dest_MV2},
76               {262144, -1, &MPIR_Alltoall_pairwise_MV2},
77           },
78
79           {{0, -1, &MPIR_Alltoall_inplace_MV2},
80           },
81       },
82
83       {8,
84           2,
85           {{0, 8, &MPIR_Alltoall_RD_MV2},
86               {8, -1, &MPIR_Alltoall_Scatter_dest_MV2},
87           },
88
89           {{0, -1, &MPIR_Alltoall_inplace_MV2},
90           },
91       },
92
93       {16,
94           3,
95           {{0, 64, &MPIR_Alltoall_RD_MV2},
96               {64, 512, &MPIR_Alltoall_bruck_MV2},
97               {512, -1, &MPIR_Alltoall_Scatter_dest_MV2},
98           },
99
100           {{0,-1, &MPIR_Alltoall_inplace_MV2},
101           },
102       },
103
104       {32,
105           3,
106           {{0, 32, &MPIR_Alltoall_RD_MV2},
107               {32, 2048, &MPIR_Alltoall_bruck_MV2},
108               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
109           },
110
111           {{0, -1, &MPIR_Alltoall_inplace_MV2},
112           },
113       },
114
115       {64,
116           3,
117           {{0, 8, &MPIR_Alltoall_RD_MV2},
118               {8, 1024, &MPIR_Alltoall_bruck_MV2},
119               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
120           },
121
122           {{0, -1, &MPIR_Alltoall_inplace_MV2},
123           },
124       },
125   };
126   table_ptrs[0] = mv2_tmp_alltoall_thresholds_table_1ppn;
127   mv2_alltoall_table_ppn_conf[1] = 2;
128   mv2_size_alltoall_tuning_table[1] = 6;
129   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_2ppn[] = {
130       {4,
131           2,
132           {{0, 32, &MPIR_Alltoall_RD_MV2},
133               {32, -1, &MPIR_Alltoall_Scatter_dest_MV2},
134           },
135
136           {{0, -1, &MPIR_Alltoall_inplace_MV2},
137           },
138       },
139
140       {8,
141           2,
142           {{0, 64, &MPIR_Alltoall_RD_MV2},
143               {64, -1, &MPIR_Alltoall_Scatter_dest_MV2},
144           },
145
146           {{0, -1, &MPIR_Alltoall_inplace_MV2},
147           },
148       },
149
150       {16,
151           3,
152           {{0, 64, &MPIR_Alltoall_RD_MV2},
153               {64, 2048, &MPIR_Alltoall_bruck_MV2},
154               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
155           },
156
157           {{0,-1, &MPIR_Alltoall_inplace_MV2},
158           },
159       },
160
161       {32,
162           3,
163           {{0, 16, &MPIR_Alltoall_RD_MV2},
164               {16, 2048, &MPIR_Alltoall_bruck_MV2},
165               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
166           },
167
168           {{0, -1, &MPIR_Alltoall_inplace_MV2},
169           },
170       },
171
172       {64,
173           3,
174           {{0, 8, &MPIR_Alltoall_RD_MV2},
175               {8, 1024, &MPIR_Alltoall_bruck_MV2},
176               {1024, -1, &MPIR_Alltoall_Scatter_dest_MV2},
177           },
178
179           {{0, -1, &MPIR_Alltoall_inplace_MV2},
180           },
181       },
182
183       {128,
184           3,
185           {{0, 4, &MPIR_Alltoall_RD_MV2},
186               {4, 2048, &MPIR_Alltoall_bruck_MV2},
187               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
188           },
189
190           {{0, -1, &MPIR_Alltoall_inplace_MV2},
191           },
192       },
193   };
194   table_ptrs[1] = mv2_tmp_alltoall_thresholds_table_2ppn;
195   mv2_alltoall_table_ppn_conf[2] = 16;
196   mv2_size_alltoall_tuning_table[2] = 7;
197   mv2_alltoall_tuning_table mv2_tmp_alltoall_thresholds_table_16ppn[] = {
198       {16,
199           2,
200           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
201               {2048, -1,  &MPIR_Alltoall_Scatter_dest_MV2},
202           },
203
204           {{32768, -1, &MPIR_Alltoall_inplace_MV2},
205           },
206       },
207
208       {32,
209           2,
210           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
211               {2048, -1, &MPIR_Alltoall_Scatter_dest_MV2},
212           },
213
214           {{16384, -1, &MPIR_Alltoall_inplace_MV2},
215           },
216       },
217
218       {64,
219           3,
220           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
221               {2048, 16384, &MPIR_Alltoall_Scatter_dest_MV2},
222               {16384, -1, &MPIR_Alltoall_pairwise_MV2},
223           },
224
225           {{32768, 131072, &MPIR_Alltoall_inplace_MV2},
226           },
227       },
228
229       {128,
230           2,
231           {{0, 2048, &MPIR_Alltoall_bruck_MV2},
232               {2048, -1, &MPIR_Alltoall_pairwise_MV2},
233           },
234
235           {{16384,65536, &MPIR_Alltoall_inplace_MV2},
236           },
237       },
238
239       {256,
240           2,
241           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
242               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
243           },
244
245           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
246           },
247       },
248
249       {512,
250           2,
251           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
252               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
253           },
254
255           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
256           },
257       },
258       {1024,
259           2,
260           {{0, 1024, &MPIR_Alltoall_bruck_MV2},
261               {1024, -1, &MPIR_Alltoall_pairwise_MV2},
262           },
263
264           {{16384, 65536, &MPIR_Alltoall_inplace_MV2},
265           },
266       },
267
268   };
269   table_ptrs[2] = mv2_tmp_alltoall_thresholds_table_16ppn;
270   agg_table_sum = 0;
271   for (i = 0; i < mv2_alltoall_num_ppn_conf; i++) {
272       agg_table_sum += mv2_size_alltoall_tuning_table[i];
273   }
274   mv2_alltoall_thresholds_table[0] =
275       static_cast<mv2_alltoall_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_alltoall_tuning_table)));
276   memcpy(mv2_alltoall_thresholds_table[0], table_ptrs[0],
277       (sizeof(mv2_alltoall_tuning_table)
278           * mv2_size_alltoall_tuning_table[0]));
279   for (i = 1; i < mv2_alltoall_num_ppn_conf; i++) {
280       mv2_alltoall_thresholds_table[i] =
281           mv2_alltoall_thresholds_table[i - 1]
282                                         + mv2_size_alltoall_tuning_table[i - 1];
283       memcpy(mv2_alltoall_thresholds_table[i], table_ptrs[i],
284           (sizeof(mv2_alltoall_tuning_table)
285               * mv2_size_alltoall_tuning_table[i]));
286   }
287   xbt_free(table_ptrs);
288
289
290 }
291
292
293 /************ Allgather variables and initializers                        */
294
295 typedef struct {
296   int min;
297   int max;
298   int (*MV2_pt_Allgatherction)(void *sendbuf,
299       int sendcount,
300       MPI_Datatype sendtype,
301       void *recvbuf,
302       int recvcount,
303       MPI_Datatype recvtype, MPI_Comm comm_ptr);
304 } mv2_allgather_tuning_element;
305
306 typedef struct {
307   int numproc;
308   int two_level[MV2_MAX_NB_THRESHOLDS];
309   int size_inter_table;
310   mv2_allgather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
311 } mv2_allgather_tuning_table;
312
313 int (*MV2_Allgatherction)(void *sendbuf,
314     int sendcount,
315     MPI_Datatype sendtype,
316     void *recvbuf,
317     int recvcount,
318     MPI_Datatype recvtype, MPI_Comm comm);
319
320 int *mv2_allgather_table_ppn_conf = NULL;
321 int mv2_allgather_num_ppn_conf = 1;
322 int *mv2_size_allgather_tuning_table = NULL;
323 mv2_allgather_tuning_table **mv2_allgather_thresholds_table = NULL;
324
325 static int MPIR_Allgather_RD_Allgather_Comm_MV2( void *sendbuf,
326                                  int sendcount,
327                                  MPI_Datatype sendtype,
328                                  void *recvbuf,
329                                  int recvcount,
330                                  MPI_Datatype recvtype, MPI_Comm comm_ptr)
331 {
332     return 0;
333 }
334
/* MVAPICH2-style allgather names, each aliased to the Coll_* implementation used in this file. */
#define MPIR_Allgather_Bruck_MV2 Coll_allgather_bruck::allgather
#define MPIR_Allgather_RD_MV2    Coll_allgather_rdb::allgather
#define MPIR_Allgather_Ring_MV2  Coll_allgather_ring::allgather
#define MPIR_2lvl_Allgather_MV2  Coll_allgather_mvapich2_smp::allgather
339
340 static void init_mv2_allgather_tables_stampede(){
341   int i;
342   int agg_table_sum = 0;
343
344   if(smpi_coll_cleanup_callback==NULL)
345     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
346   mv2_allgather_tuning_table **table_ptrs = NULL;
347   mv2_allgather_num_ppn_conf = 3;
348   mv2_allgather_thresholds_table
349   = static_cast<mv2_allgather_tuning_table**>(xbt_malloc(sizeof(mv2_allgather_tuning_table *)
350       * mv2_allgather_num_ppn_conf));
351   table_ptrs = static_cast<mv2_allgather_tuning_table**>(xbt_malloc(sizeof(mv2_allgather_tuning_table *)
352       * mv2_allgather_num_ppn_conf));
353   mv2_size_allgather_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
354       mv2_allgather_num_ppn_conf));
355   mv2_allgather_table_ppn_conf
356   = static_cast<int*>(xbt_malloc(mv2_allgather_num_ppn_conf * sizeof(int)));
357   mv2_allgather_table_ppn_conf[0] = 1;
358   mv2_size_allgather_tuning_table[0] = 6;
359   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_1ppn[] = {
360       {
361           2,
362           {0},
363           1,
364           {
365               {0, -1, &MPIR_Allgather_Ring_MV2},
366           },
367       },
368       {
369           4,
370           {0,0},
371           2,
372           {
373               {0, 262144, &MPIR_Allgather_RD_MV2},
374               {262144, -1, &MPIR_Allgather_Ring_MV2},
375           },
376       },
377       {
378           8,
379           {0,0},
380           2,
381           {
382               {0, 131072, &MPIR_Allgather_RD_MV2},
383               {131072, -1, &MPIR_Allgather_Ring_MV2},
384           },
385       },
386       {
387           16,
388           {0,0},
389           2,
390           {
391               {0, 131072, &MPIR_Allgather_RD_MV2},
392               {131072, -1, &MPIR_Allgather_Ring_MV2},
393           },
394       },
395       {
396           32,
397           {0,0},
398           2,
399           {
400               {0, 65536, &MPIR_Allgather_RD_MV2},
401               {65536, -1, &MPIR_Allgather_Ring_MV2},
402           },
403       },
404       {
405           64,
406           {0,0},
407           2,
408           {
409               {0, 32768, &MPIR_Allgather_RD_MV2},
410               {32768, -1, &MPIR_Allgather_Ring_MV2},
411           },
412       },
413   };
414   table_ptrs[0] = mv2_tmp_allgather_thresholds_table_1ppn;
415   mv2_allgather_table_ppn_conf[1] = 2;
416   mv2_size_allgather_tuning_table[1] = 6;
417   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_2ppn[] = {
418       {
419           4,
420           {0,0},
421           2,
422           {
423               {0, 524288, &MPIR_Allgather_RD_MV2},
424               {524288, -1, &MPIR_Allgather_Ring_MV2},
425           },
426       },
427       {
428           8,
429           {0,1,0},
430           2,
431           {
432               {0, 32768, &MPIR_Allgather_RD_MV2},
433               {32768, 524288, &MPIR_Allgather_Ring_MV2},
434               {524288, -1, &MPIR_Allgather_Ring_MV2},
435           },
436       },
437       {
438           16,
439           {0,1,0},
440           2,
441           {
442               {0, 16384, &MPIR_Allgather_RD_MV2},
443               {16384, 524288, &MPIR_Allgather_Ring_MV2},
444               {524288, -1, &MPIR_Allgather_Ring_MV2},
445           },
446       },
447       {
448           32,
449           {1,1,0},
450           2,
451           {
452               {0, 65536, &MPIR_Allgather_RD_MV2},
453               {65536, 524288, &MPIR_Allgather_Ring_MV2},
454               {524288, -1, &MPIR_Allgather_Ring_MV2},
455           },
456       },
457       {
458           64,
459           {1,1,0},
460           2,
461           {
462               {0, 32768, &MPIR_Allgather_RD_MV2},
463               {32768, 524288, &MPIR_Allgather_Ring_MV2},
464               {524288, -1, &MPIR_Allgather_Ring_MV2},
465           },
466       },
467       {
468           128,
469           {1,1,0},
470           2,
471           {
472               {0, 65536, &MPIR_Allgather_RD_MV2},
473               {65536, 524288, &MPIR_Allgather_Ring_MV2},
474               {524288, -1, &MPIR_Allgather_Ring_MV2},
475           },
476       },
477   };
478   table_ptrs[1] = mv2_tmp_allgather_thresholds_table_2ppn;
479   mv2_allgather_table_ppn_conf[2] = 16;
480   mv2_size_allgather_tuning_table[2] = 6;
481   mv2_allgather_tuning_table mv2_tmp_allgather_thresholds_table_16ppn[] = {
482       {
483           16,
484           {0,0},
485           2,
486           {
487               {0, 1024, &MPIR_Allgather_RD_MV2},
488               {1024, -1, &MPIR_Allgather_Ring_MV2},
489           },
490       },
491       {
492           32,
493           {0,0},
494           2,
495           {
496               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
497               {1024, -1, &MPIR_Allgather_Ring_MV2},
498           },
499       },
500       {
501           64,
502           {0,0},
503           2,
504           {
505               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
506               {1024, -1, &MPIR_Allgather_Ring_MV2},
507           },
508       },
509       {
510           128,
511           {0,0},
512           2,
513           {
514               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
515               {1024, -1, &MPIR_Allgather_Ring_MV2},
516           },
517       },
518       {
519           256,
520           {0,0},
521           2,
522           {
523               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
524               {1024, -1, &MPIR_Allgather_Ring_MV2},
525           },
526       },
527       {
528           512,
529           {0,0},
530           2,
531           {
532               {0, 1024, &MPIR_Allgather_RD_Allgather_Comm_MV2},
533               {1024, -1, &MPIR_Allgather_Ring_MV2},
534           },
535       },
536
537   };
538   table_ptrs[2] = mv2_tmp_allgather_thresholds_table_16ppn;
539   agg_table_sum = 0;
540   for (i = 0; i < mv2_allgather_num_ppn_conf; i++) {
541       agg_table_sum += mv2_size_allgather_tuning_table[i];
542   }
543   mv2_allgather_thresholds_table[0] =
544       static_cast<mv2_allgather_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_allgather_tuning_table)));
545   memcpy(mv2_allgather_thresholds_table[0], table_ptrs[0],
546       (sizeof(mv2_allgather_tuning_table)
547           * mv2_size_allgather_tuning_table[0]));
548   for (i = 1; i < mv2_allgather_num_ppn_conf; i++) {
549       mv2_allgather_thresholds_table[i] =
550           mv2_allgather_thresholds_table[i - 1]
551                                          + mv2_size_allgather_tuning_table[i - 1];
552       memcpy(mv2_allgather_thresholds_table[i], table_ptrs[i],
553           (sizeof(mv2_allgather_tuning_table)
554               * mv2_size_allgather_tuning_table[i]));
555   }
556   xbt_free(table_ptrs);
557 }
558
559
560 /************ Gather variables and initializers                        */
561
562 typedef struct {
563   int min;
564   int max;
565   int (*MV2_pt_Gather_function)(void *sendbuf, int sendcnt,
566       MPI_Datatype sendtype, void *recvbuf, int recvcnt,
567       MPI_Datatype recvtype, int root, MPI_Comm  comm_ptr);
568 } mv2_gather_tuning_element;
569
570
571 typedef struct {
572   int numproc;
573   int size_inter_table;
574   mv2_gather_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
575   int size_intra_table;
576   mv2_gather_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
577 } mv2_gather_tuning_table;
578
579 int mv2_size_gather_tuning_table=7;
580 mv2_gather_tuning_table * mv2_gather_thresholds_table=NULL; 
581
582 typedef int (*MV2_Gather_function_ptr) (void *sendbuf,
583     int sendcnt,
584     MPI_Datatype sendtype,
585     void *recvbuf,
586     int recvcnt,
587     MPI_Datatype recvtype,
588     int root, MPI_Comm comm);
589
590 MV2_Gather_function_ptr MV2_Gather_inter_leader_function = NULL;
591 MV2_Gather_function_ptr MV2_Gather_intra_node_function = NULL;
592
593
594
595 #define MPIR_Gather_MV2_Direct Coll_gather_ompi_basic_linear::gather
596 #define MPIR_Gather_MV2_two_level_Direct Coll_gather_mvapich2_two_level::gather
597 #define MPIR_Gather_intra Coll_gather_mpich::gather
598
599
600 static void init_mv2_gather_tables_stampede(){
601
602   if(smpi_coll_cleanup_callback==NULL)
603     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
604   mv2_size_gather_tuning_table=7;
605   mv2_gather_thresholds_table = static_cast<mv2_gather_tuning_table*>(xbt_malloc(mv2_size_gather_tuning_table*
606       sizeof (mv2_gather_tuning_table)));
607   mv2_gather_tuning_table mv2_tmp_gather_thresholds_table[]={
608       {16,
609           2,{{0, 524288, &MPIR_Gather_MV2_Direct},
610               {524288, -1, &MPIR_Gather_intra}},
611               1,{{0, -1, &MPIR_Gather_MV2_Direct}}},
612               {32,
613                   3,{{0, 16384, &MPIR_Gather_MV2_Direct},
614                       {16384, 131072, &MPIR_Gather_intra},
615                       {131072, -1, &MPIR_Gather_MV2_two_level_Direct}},
616                       1,{{0, -1, &MPIR_Gather_intra}}},
617                       {64,
618                           3,{{0, 256, &MPIR_Gather_MV2_two_level_Direct},
619                               {256, 16384, &MPIR_Gather_MV2_Direct},
620                               {256, -1, &MPIR_Gather_MV2_two_level_Direct}},
621                               1,{{0, -1, &MPIR_Gather_intra}}},
622                               {128,
623                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
624                                       {512, 16384, &MPIR_Gather_MV2_Direct},
625                                       {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
626                                       1,{{0, -1, &MPIR_Gather_intra}}},
627                                       {256,
628                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
629                                               {512, 16384, &MPIR_Gather_MV2_Direct},
630                                               {16384, -1, &MPIR_Gather_MV2_two_level_Direct}},
631                                               1,{{0, -1, &MPIR_Gather_intra}}},
632                                               {512,
633                                                   3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
634                                                       {512, 16384, &MPIR_Gather_MV2_Direct},
635                                                       {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
636                                                       1,{{0, -1, &MPIR_Gather_intra}}},
637                                                       {1024,
638                                                           3,{{0, 512, &MPIR_Gather_MV2_two_level_Direct},
639                                                               {512, 16384, &MPIR_Gather_MV2_Direct},
640                                                               {8196, -1, &MPIR_Gather_MV2_two_level_Direct}},
641                                                               1,{{0, -1, &MPIR_Gather_intra}}},
642   };
643
644   memcpy(mv2_gather_thresholds_table, mv2_tmp_gather_thresholds_table,
645       mv2_size_gather_tuning_table * sizeof (mv2_gather_tuning_table));
646
647 }
648
649
650 /************ Allgatherv variables and initializers                        */
651
652 typedef struct {
653   int min;
654   int max;
655   int (*MV2_pt_Allgatherv_function)(void *sendbuf,
656       int sendcount,
657       MPI_Datatype sendtype,
658       void *recvbuf,
659       int *recvcounts,
660       int *displs,
661       MPI_Datatype recvtype,
662       MPI_Comm commg);
663 } mv2_allgatherv_tuning_element;
664
665 typedef struct {
666   int numproc;
667   int size_inter_table;
668   mv2_allgatherv_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
669 } mv2_allgatherv_tuning_table;
670
671 int (*MV2_Allgatherv_function)(void *sendbuf,
672     int sendcount,
673     MPI_Datatype sendtype,
674     void *recvbuf,
675     int *recvcounts,
676     int *displs,
677     MPI_Datatype recvtype,
678     MPI_Comm comm);
679
680 int mv2_size_allgatherv_tuning_table = 0;
681 mv2_allgatherv_tuning_table *mv2_allgatherv_thresholds_table = NULL;
682
683 #define MPIR_Allgatherv_Rec_Doubling_MV2 Coll_allgatherv_mpich_rdb::allgatherv
684 #define MPIR_Allgatherv_Bruck_MV2 Coll_allgatherv_ompi_bruck::allgatherv
685 #define MPIR_Allgatherv_Ring_MV2 Coll_allgatherv_mpich_ring::allgatherv
686
687
688 static void init_mv2_allgatherv_tables_stampede(){
689   if(smpi_coll_cleanup_callback==NULL)
690     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
691   mv2_size_allgatherv_tuning_table = 6;
692   mv2_allgatherv_thresholds_table = static_cast<mv2_allgatherv_tuning_table*>(xbt_malloc(mv2_size_allgatherv_tuning_table *
693       sizeof (mv2_allgatherv_tuning_table)));
694   mv2_allgatherv_tuning_table mv2_tmp_allgatherv_thresholds_table[] = {
695       {
696           16,
697           2,
698           {
699               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
700               {512, -1, &MPIR_Allgatherv_Ring_MV2},
701           },
702       },
703       {
704           32,
705           2,
706           {
707               {0, 512, &MPIR_Allgatherv_Rec_Doubling_MV2},
708               {512, -1, &MPIR_Allgatherv_Ring_MV2},
709           },
710       },
711       {
712           64,
713           2,
714           {
715               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
716               {256, -1, &MPIR_Allgatherv_Ring_MV2},
717           },
718       },
719       {
720           128,
721           2,
722           {
723               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
724               {256, -1, &MPIR_Allgatherv_Ring_MV2},
725           },
726       },
727       {
728           256,
729           2,
730           {
731               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
732               {256, -1, &MPIR_Allgatherv_Ring_MV2},
733           },
734       },
735       {
736           512,
737           2,
738           {
739               {0, 256, &MPIR_Allgatherv_Rec_Doubling_MV2},
740               {256, -1, &MPIR_Allgatherv_Ring_MV2},
741           },
742       },
743
744   };
745   memcpy(mv2_allgatherv_thresholds_table, mv2_tmp_allgatherv_thresholds_table,
746       mv2_size_allgatherv_tuning_table * sizeof (mv2_allgatherv_tuning_table));
747 }
748
749
750 /************ Allreduce variables and initializers                        */
751
752 typedef struct {
753   int min;
754   int max;
755   int (*MV2_pt_Allreducection)(void *sendbuf,
756       void *recvbuf,
757       int count,
758       MPI_Datatype datatype,
759       MPI_Op op, MPI_Comm comm);
760 } mv2_allreduce_tuning_element;
761
762 typedef struct {
763   int numproc;
764   int mcast_enabled;
765   int is_two_level_allreduce[MV2_MAX_NB_THRESHOLDS];
766   int size_inter_table;
767   mv2_allreduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
768   int size_intra_table;
769   mv2_allreduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
770 } mv2_allreduce_tuning_table;
771
772
773 int (*MV2_Allreducection)(void *sendbuf,
774     void *recvbuf,
775     int count,
776     MPI_Datatype datatype,
777     MPI_Op op, MPI_Comm comm)=NULL;
778
779
780 int (*MV2_Allreduce_intra_function)( void *sendbuf,
781     void *recvbuf,
782     int count,
783     MPI_Datatype datatype,
784     MPI_Op op, MPI_Comm comm)=NULL;
785
786 int mv2_size_allreduce_tuning_table = 0;
787 mv2_allreduce_tuning_table *mv2_allreduce_thresholds_table = NULL;
788
789
790
791
792
793 static int MPIR_Allreduce_mcst_reduce_two_level_helper_MV2( void *sendbuf,
794     void *recvbuf,
795     int count,
796     MPI_Datatype datatype,
797     MPI_Op op, MPI_Comm comm)
798
799   return 0;
800 }
801
802 static  int MPIR_Allreduce_mcst_reduce_redscat_gather_MV2( void *sendbuf,
803     void *recvbuf,
804     int count,
805     MPI_Datatype datatype,
806     MPI_Op op, MPI_Comm  comm)
807 {
808   return 0;
809 }
810
811 static  int MPIR_Allreduce_reduce_p2p_MV2( void *sendbuf,
812     void *recvbuf,
813     int count,
814     MPI_Datatype datatype,
815     MPI_Op op, MPI_Comm  comm)
816 {
817   Colls::reduce(sendbuf,recvbuf,count,datatype,op,0,comm);
818   return MPI_SUCCESS;
819 }
820
821 static  int MPIR_Allreduce_reduce_shmem_MV2( void *sendbuf,
822     void *recvbuf,
823     int count,
824     MPI_Datatype datatype,
825     MPI_Op op, MPI_Comm  comm)
826 {
827   Colls::reduce(sendbuf,recvbuf,count,datatype,op,0,comm);
828   return MPI_SUCCESS;
829 }
830
/* MVAPICH2-style allreduce names, each aliased to the Coll_* implementation used in this file. */
#define MPIR_Allreduce_pt2pt_rd_MV2  Coll_allreduce_rdb::allreduce
#define MPIR_Allreduce_pt2pt_rs_MV2  Coll_allreduce_mvapich2_rs::allreduce
#define MPIR_Allreduce_two_level_MV2 Coll_allreduce_mvapich2_two_level::allreduce
834
835
836 static void init_mv2_allreduce_tables_stampede(){
837   if(smpi_coll_cleanup_callback==NULL)
838     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
839   mv2_size_allreduce_tuning_table = 8;
840   mv2_allreduce_thresholds_table = static_cast<mv2_allreduce_tuning_table*>(xbt_malloc(mv2_size_allreduce_tuning_table *
841       sizeof (mv2_allreduce_tuning_table)));
842   mv2_allreduce_tuning_table mv2_tmp_allreduce_thresholds_table[] = {
843       {
844           16,
845           0,
846           {1, 0},
847           2,
848           {
849               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
850               {1024, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
851           },
852           2,
853           {
854               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
855               {1024, -1, &MPIR_Allreduce_reduce_p2p_MV2},
856           },
857       },
858       {
859           32,
860           0,
861           {1, 1, 0},
862           3,
863           {
864               {0, 1024, &MPIR_Allreduce_pt2pt_rd_MV2},
865               {1024, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
866               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
867           },
868           2,
869           {
870               {0, 1024, &MPIR_Allreduce_reduce_shmem_MV2},
871               {1024, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
872           },
873       },
874       {
875           64,
876           0,
877           {1, 1, 0},
878           3,
879           {
880               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
881               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
882               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
883           },
884           2,
885           {
886               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
887               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
888           },
889       },
890       {
891           128,
892           0,
893           {1, 1, 0},
894           3,
895           {
896               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
897               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
898               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
899           },
900           2,
901           {
902               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
903               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
904           },
905       },
906       {
907           256,
908           0,
909           {1, 1, 0},
910           3,
911           {
912               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
913               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
914               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
915           },
916           2,
917           {
918               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
919               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
920           },
921       },
922       {
923           512,
924           0,
925           {1, 1, 0},
926           3,
927           {
928               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
929               {512, 16384, &MPIR_Allreduce_pt2pt_rd_MV2},
930               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
931           },
932           2,
933           {
934               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
935               {512, 16384, &MPIR_Allreduce_reduce_p2p_MV2},
936           },
937       },
938       {
939           1024,
940           0,
941           {1, 1, 1, 0},
942           4,
943           {
944               {0, 512, &MPIR_Allreduce_pt2pt_rd_MV2},
945               {512, 8192, &MPIR_Allreduce_pt2pt_rd_MV2},
946               {8192, 65536, &MPIR_Allreduce_pt2pt_rs_MV2},
947               {65536, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
948           },
949           2,
950           {
951               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
952               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
953           },
954       },
955       {
956           2048,
957           0,
958           {1, 1, 1, 0},
959           4,
960           {
961               {0, 64, &MPIR_Allreduce_pt2pt_rd_MV2},
962               {64, 512, &MPIR_Allreduce_reduce_p2p_MV2},
963               {512, 4096, &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2},
964               {4096, 16384, &MPIR_Allreduce_pt2pt_rs_MV2},
965               {16384, -1, &MPIR_Allreduce_pt2pt_rs_MV2},
966           },
967           2,
968           {
969               {0, 512, &MPIR_Allreduce_reduce_shmem_MV2},
970               {512, -1, &MPIR_Allreduce_reduce_p2p_MV2},
971           },
972       },
973
974   };
975   memcpy(mv2_allreduce_thresholds_table, mv2_tmp_allreduce_thresholds_table,
976       mv2_size_allreduce_tuning_table * sizeof (mv2_allreduce_tuning_table));
977 }
978
979
980
981
982 typedef struct {
983     int min;
984     int max;
985     int (*MV2_pt_Bcast_function) (void *buf, int count, MPI_Datatype datatype,
986                                   int root, MPI_Comm comm_ptr);
987     int zcpy_pipelined_knomial_factor;
988 } mv2_bcast_tuning_element;
989
990 typedef struct {
991     int numproc;
992     int bcast_segment_size;
993     int intra_node_knomial_factor;
994     int inter_node_knomial_factor;
995     int is_two_level_bcast[MV2_MAX_NB_THRESHOLDS];
996     int size_inter_table;
997     mv2_bcast_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
998     int size_intra_table;
999     mv2_bcast_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1000 } mv2_bcast_tuning_table;
1001
1002 int mv2_size_bcast_tuning_table = 0;
1003 mv2_bcast_tuning_table *mv2_bcast_thresholds_table = NULL;
1004
1005
1006 int (*MV2_Bcast_function) (void *buffer, int count, MPI_Datatype datatype,
1007                            int root, MPI_Comm comm_ptr) = NULL;
1008
1009 int (*MV2_Bcast_intra_node_function) (void *buffer, int count, MPI_Datatype datatype,
1010                                       int root, MPI_Comm comm_ptr) = NULL;
1011
1012 int zcpy_knomial_factor = 2;
1013 int mv2_pipelined_zcpy_knomial_factor = -1;
1014 int bcast_segment_size = 8192;
1015 int mv2_inter_node_knomial_factor = 4;
1016 int mv2_intra_node_knomial_factor = 4;
/*
 * Broadcast threshold constants.
 *
 * Every expansion is parenthesized so the macros behave as single values in
 * any expression: the former `512*1024` expansion gave a wrong result when
 * used as the right operand of `/` or `%`.
 */
#define mv2_bcast_two_level_system_size  (64)
#define mv2_bcast_short_msg              (16384)
#define mv2_bcast_large_msg              (512 * 1024)

/* Value used for the intra-node root. */
#define INTRA_NODE_ROOT (0)
1022
/*
 * MVAPICH2 broadcast routine names mapped onto the SMPI collectives that
 * stand in for them. Several MVAPICH2 names share one SMPI implementation.
 */

/* Aliases of Coll_bcast_mpich */
#define MPIR_Pipelined_Bcast_Zcpy_MV2              Coll_bcast_mpich::bcast
#define MPIR_Pipelined_Bcast_MV2                   Coll_bcast_mpich::bcast
#define MPIR_Shmem_Bcast_MV2                       Coll_bcast_mpich::bcast

/* Tree and scatter/allgather based broadcasts */
#define MPIR_Bcast_binomial_MV2                    Coll_bcast_binomial_tree::bcast
#define MPIR_Bcast_scatter_ring_allgather_shm_MV2  Coll_bcast_scatter_LR_allgather::bcast
#define MPIR_Bcast_scatter_ring_allgather_MV2      Coll_bcast_scatter_LR_allgather::bcast
#define MPIR_Bcast_scatter_doubling_allgather_MV2  Coll_bcast_scatter_rdb_allgather::bcast

/* MVAPICH2-specific inter-node / intra-node helpers */
#define MPIR_Bcast_tune_inter_node_helper_MV2      Coll_bcast_mvapich2_inter_node::bcast
#define MPIR_Bcast_inter_node_helper_MV2           Coll_bcast_mvapich2_inter_node::bcast
#define MPIR_Knomial_Bcast_intra_node_MV2          Coll_bcast_mvapich2_knomial_intra_node::bcast
#define MPIR_Bcast_intra_MV2                       Coll_bcast_mvapich2_intra_node::bcast
1034
1035 static void init_mv2_bcast_tables_stampede(){
1036  //Stampede,
1037   if(smpi_coll_cleanup_callback==NULL)
1038     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1039   mv2_size_bcast_tuning_table=8;
1040   mv2_bcast_thresholds_table = static_cast<mv2_bcast_tuning_table*>(xbt_malloc(mv2_size_bcast_tuning_table *
1041   sizeof (mv2_bcast_tuning_table)));
1042
1043   mv2_bcast_tuning_table mv2_tmp_bcast_thresholds_table[]={
1044     {
1045             16,
1046             8192, 4, 4,
1047             {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
1048             11,
1049             {
1050               {0, 8, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1051               {8, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1052               {16, 1024, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1053               {1024, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1054               {8192, 16384, &MPIR_Bcast_binomial_MV2, -1},
1055               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1056               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1057               {65536, 131072, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1058               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_MV2, -1},
1059               {262144, 524288, &MPIR_Bcast_scatter_doubling_allgather_MV2, -1},
1060               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1061             },
1062             11,
1063             {
1064               {0, 8, &MPIR_Shmem_Bcast_MV2, 2},
1065               {8, 16, &MPIR_Shmem_Bcast_MV2, 4},
1066               {16, 1024, &MPIR_Shmem_Bcast_MV2, 2},
1067               {1024, 8192, &MPIR_Shmem_Bcast_MV2, 4},
1068               {8192, 16384, &MPIR_Shmem_Bcast_MV2, -1},
1069               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 4},
1070               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1071               {65536, 131072, &MPIR_Shmem_Bcast_MV2, -1},
1072               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1073               {262144, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1074               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1075             }
1076     },
1077     {
1078             32,
1079             8192, 4, 4,
1080             {1, 1, 1, 1, 1, 1, 1, 1},
1081             8,
1082             {
1083               {0, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1084               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1085               {256, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1086               {32768, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1087               {65536, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1088               {131072, 262144, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1089               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1090               {524288, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8}
1091             },
1092             8,
1093             {
1094               {0, 128, &MPIR_Shmem_Bcast_MV2, 2},
1095               {128, 256, &MPIR_Shmem_Bcast_MV2, 4},
1096               {256, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1097               {32768, 65536, &MPIR_Shmem_Bcast_MV2, 4},
1098               {65536, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1099               {131072, 262144, &MPIR_Shmem_Bcast_MV2, 8},
1100               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1101               {524288, -1, &MPIR_Shmem_Bcast_MV2, 8}
1102             }
1103     },
1104     {
1105             64,
1106             8192, 4, 4,
1107             {1, 1, 1, 1, 1, 1, 1, 1, 1},
1108             9,
1109             {
1110               {0, 2, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1111               {2, 4, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1112               {4, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1113               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1114               {32, 128, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1115               {128, 256, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1116               {256, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1117               {4096, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1118               {32768, -1, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2}
1119             },
1120             9,
1121             {
1122               {0, 2, &MPIR_Shmem_Bcast_MV2, 4},
1123               {2, 4, &MPIR_Shmem_Bcast_MV2, 8},
1124               {4, 16, &MPIR_Shmem_Bcast_MV2, 4},
1125               {16, 32, &MPIR_Shmem_Bcast_MV2, 8},
1126               {32, 128, &MPIR_Shmem_Bcast_MV2, 4},
1127               {128, 256, &MPIR_Shmem_Bcast_MV2, 8},
1128               {256, 4096, &MPIR_Shmem_Bcast_MV2, 4},
1129               {4096, 32768, &MPIR_Shmem_Bcast_MV2, 8},
1130               {32768, -1, &MPIR_Shmem_Bcast_MV2, 2}
1131             }
1132     },
1133     {
1134             128,
1135             8192, 4, 4,
1136             {1, 1, 1, 0},
1137             4,
1138             {
1139               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1140               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1141               {16384, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1142               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_MV2, -1}
1143             },
1144             4,
1145             {
1146               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1147               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1148               {16384, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1149               {524288, -1, NULL, -1}
1150             }
1151     },
1152     {
1153             256,
1154             8192, 4, 4,
1155             {1, 1, 1, 1, 1},
1156             5,
1157             {
1158               {0, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1159               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1160               {131072, 262144, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1},
1161               {262144, 524288, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1162               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1163             },
1164             5,
1165             {
1166               {0, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1167               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1168               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1169               {262144, 524288, &MPIR_Shmem_Bcast_MV2, 2},
1170               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1171             }
1172     },
1173     {
1174             512,
1175             8192, 4, 4,
1176             {1, 1, 1, 1, 1},
1177             5,
1178             {
1179               {0, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1180               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1181               {16384, 131072, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1182               {131072, 262144, &MPIR_Pipelined_Bcast_MV2, -1},
1183               {262144, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1184             },
1185             5,
1186             {
1187               {0, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1188               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1189               {16384, 131072, &MPIR_Shmem_Bcast_MV2, 2},
1190               {131072, 262144, &MPIR_Shmem_Bcast_MV2, -1},
1191               {262144, -1, &MPIR_Shmem_Bcast_MV2, -1}
1192             }
1193     },
1194     {
1195             1024,
1196             8192, 4, 4,
1197             {1, 1, 1, 1, 1},
1198             5,
1199             {
1200               {0, 8192, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1201               {8192, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1202               {16384, 65536, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1203               {65536, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1204               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1205             },
1206             5,
1207             {
1208               {0, 8192, &MPIR_Shmem_Bcast_MV2, 8},
1209               {8192, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1210               {16384, 65536, &MPIR_Shmem_Bcast_MV2, 2},
1211               {65536, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1212               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1213             }
1214     },
1215     {
1216             2048,
1217             8192, 4, 4,
1218             {1, 1, 1, 1, 1, 1, 1},
1219             7,
1220             {
1221               {0, 16, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1222               {16, 32, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1223               {32, 4096, &MPIR_Pipelined_Bcast_Zcpy_MV2, 8},
1224               {4096, 16384, &MPIR_Pipelined_Bcast_Zcpy_MV2, 4},
1225               {16384, 32768, &MPIR_Pipelined_Bcast_Zcpy_MV2, 2},
1226               {32768, 524288, &MPIR_Pipelined_Bcast_MV2, -1},
1227               {524288, -1, &MPIR_Bcast_scatter_ring_allgather_shm_MV2, -1}
1228             },
1229             7,
1230             {
1231               {0, 16, &MPIR_Shmem_Bcast_MV2, 8},
1232               {16, 32, &MPIR_Shmem_Bcast_MV2, 4},
1233               {32, 4096, &MPIR_Shmem_Bcast_MV2, 8},
1234               {4096, 16384, &MPIR_Shmem_Bcast_MV2, 4},
1235               {16384, 32768, &MPIR_Shmem_Bcast_MV2, 2},
1236               {32768, 524288, &MPIR_Shmem_Bcast_MV2, -1},
1237               {524288, -1, &MPIR_Shmem_Bcast_MV2, -1}
1238             }
1239     }
1240   };
1241
1242         memcpy(mv2_bcast_thresholds_table, mv2_tmp_bcast_thresholds_table,
1243                     mv2_size_bcast_tuning_table * sizeof (mv2_bcast_tuning_table));
1244 }
1245
1246
1247 /************ Reduce variables and initializers                        */
1248
1249 typedef struct {
1250   int min;
1251   int max;
1252   int (*MV2_pt_Reduce_function)(void *sendbuf,
1253       void *recvbuf,
1254       int count,
1255       MPI_Datatype datatype,
1256       MPI_Op op,
1257       int root,
1258       MPI_Comm  comm_ptr);
1259 } mv2_reduce_tuning_element;
1260
1261 typedef struct {
1262   int numproc;
1263   int inter_k_degree;
1264   int intra_k_degree;
1265   int is_two_level_reduce[MV2_MAX_NB_THRESHOLDS];
1266   int size_inter_table;
1267   mv2_reduce_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1268   int size_intra_table;
1269   mv2_reduce_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1270 } mv2_reduce_tuning_table;
1271
1272 int mv2_size_reduce_tuning_table = 0;
1273 mv2_reduce_tuning_table *mv2_reduce_thresholds_table = NULL;
1274
1275
1276 int mv2_reduce_intra_knomial_factor = -1;
1277 int mv2_reduce_inter_knomial_factor = -1;
1278
1279 int (*MV2_Reduce_function)( void *sendbuf,
1280     void *recvbuf,
1281     int count,
1282     MPI_Datatype datatype,
1283     MPI_Op op,
1284     int root,
1285     MPI_Comm  comm_ptr)=NULL;
1286
1287 int (*MV2_Reduce_intra_function)( void *sendbuf,
1288     void *recvbuf,
1289     int count,
1290     MPI_Datatype datatype,
1291     MPI_Op op,
1292     int root,
1293     MPI_Comm  comm_ptr)=NULL;
1294
1295
/*
 * MVAPICH2 reduce routine names mapped onto the SMPI collectives that stand
 * in for them. The inter- and intra-node k-nomial wrappers share one
 * implementation.
 */
#define MPIR_Reduce_inter_knomial_wrapper_MV2  Coll_reduce_mvapich2_knomial::reduce
#define MPIR_Reduce_intra_knomial_wrapper_MV2  Coll_reduce_mvapich2_knomial::reduce
#define MPIR_Reduce_two_level_helper_MV2       Coll_reduce_mvapich2_two_level::reduce
#define MPIR_Reduce_binomial_MV2               Coll_reduce_binomial::reduce
#define MPIR_Reduce_redscat_gather_MV2         Coll_reduce_scatter_gather::reduce
#define MPIR_Reduce_shmem_MV2                  Coll_reduce_ompi_basic_linear::reduce
1302
1303
1304 static void init_mv2_reduce_tables_stampede(){
1305   if(smpi_coll_cleanup_callback==NULL)
1306     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1307   /*Stampede*/
1308   mv2_size_reduce_tuning_table = 8;
1309   mv2_reduce_thresholds_table = static_cast<mv2_reduce_tuning_table*>(xbt_malloc(mv2_size_reduce_tuning_table *
1310       sizeof (mv2_reduce_tuning_table)));
1311   mv2_reduce_tuning_table mv2_tmp_reduce_thresholds_table[] = {
1312       {
1313           16,
1314           4,
1315           4,
1316           {1, 0, 0},
1317           3,
1318           {
1319               {0, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1320               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1321               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1322           },
1323           2,
1324           {
1325               {0, 65536, &MPIR_Reduce_shmem_MV2},
1326               {65536,-1,  &MPIR_Reduce_binomial_MV2},
1327           },
1328       },
1329       {
1330           32,
1331           4,
1332           4,
1333           {1, 1, 1, 1, 0, 0, 0},
1334           7,
1335           {
1336               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1337               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1338               {16384, 32768, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1339               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1340               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1341               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1342               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1343           },
1344           6,
1345           {
1346               {0, 8192, &MPIR_Reduce_shmem_MV2},
1347               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1348               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1349               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1350               {65536, 262144, &MPIR_Reduce_shmem_MV2},
1351               {262144,-1,  &MPIR_Reduce_binomial_MV2},
1352           },
1353       },
1354       {
1355           64,
1356           4,
1357           4,
1358           {1, 1, 1, 1, 0},
1359           5,
1360           {
1361               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1362               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1363               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1364               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1365               {262144, -1, &MPIR_Reduce_redscat_gather_MV2},
1366           },
1367           5,
1368           {
1369               {0, 8192, &MPIR_Reduce_shmem_MV2},
1370               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1371               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1372               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1373               {262144, -1, &MPIR_Reduce_binomial_MV2},
1374           },
1375       },
1376       {
1377           128,
1378           4,
1379           4,
1380           {1, 0, 1, 0, 1, 0},
1381           6,
1382           {
1383               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1384               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1385               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1386               {65536, 262144, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1387               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1388               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1389           },
1390           5,
1391           {
1392               {0, 8192, &MPIR_Reduce_shmem_MV2},
1393               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1394               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1395               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1396               {262144, -1, &MPIR_Reduce_binomial_MV2},
1397           },
1398       },
1399       {
1400           256,
1401           4,
1402           4,
1403           {1, 1, 1, 0, 1, 1, 0},
1404           7,
1405           {
1406               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1407               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1408               {16384, 32768, &MPIR_Reduce_binomial_MV2},
1409               {32768, 65536, &MPIR_Reduce_binomial_MV2},
1410               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1411               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1412               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1413           },
1414           6,
1415           {
1416               {0, 8192, &MPIR_Reduce_shmem_MV2},
1417               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1418               {16384, 32768, &MPIR_Reduce_shmem_MV2},
1419               {32768, 65536, &MPIR_Reduce_shmem_MV2},
1420               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1421               {262144, -1, &MPIR_Reduce_binomial_MV2},
1422           },
1423       },
1424       {
1425           512,
1426           4,
1427           4,
1428           {1, 0, 1, 1, 1, 0},
1429           6,
1430           {
1431               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1432               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1433               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1434               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1435               {262144, 1048576, &MPIR_Reduce_binomial_MV2},
1436               {1048576, -1, &MPIR_Reduce_redscat_gather_MV2},
1437           },
1438           5,
1439           {
1440               {0, 8192, &MPIR_Reduce_shmem_MV2},
1441               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1442               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1443               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1444               {262144, -1, &MPIR_Reduce_binomial_MV2},
1445           },
1446       },
1447       {
1448           1024,
1449           4,
1450           4,
1451           {1, 0, 1, 1, 1},
1452           5,
1453           {
1454               {0, 8192, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1455               {8192, 16384, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1456               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1457               {65536, 262144, &MPIR_Reduce_binomial_MV2},
1458               {262144, -1, &MPIR_Reduce_binomial_MV2},
1459           },
1460           5,
1461           {
1462               {0, 8192, &MPIR_Reduce_shmem_MV2},
1463               {8192, 16384, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1464               {16384, 65536, &MPIR_Reduce_shmem_MV2},
1465               {65536, 262144, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1466               {262144, -1, &MPIR_Reduce_binomial_MV2},
1467           },
1468       },
1469       {
1470           2048,
1471           4,
1472           4,
1473           {1, 0, 1, 1, 1,1},
1474           6,
1475           {
1476               {0, 2048, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1477               {2048, 4096, &MPIR_Reduce_inter_knomial_wrapper_MV2},
1478               {4096, 16384, &MPIR_Reduce_binomial_MV2},
1479               {16384, 65536, &MPIR_Reduce_binomial_MV2},
1480               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1481               {131072, -1, &MPIR_Reduce_binomial_MV2},
1482           },
1483           6,
1484           {
1485               {0, 2048, &MPIR_Reduce_shmem_MV2},
1486               {2048, 4096, &MPIR_Reduce_shmem_MV2},
1487               {4096, 16384, &MPIR_Reduce_shmem_MV2},
1488               {16384, 65536, &MPIR_Reduce_intra_knomial_wrapper_MV2},
1489               {65536, 131072, &MPIR_Reduce_binomial_MV2},
1490               {131072, -1, &MPIR_Reduce_shmem_MV2},
1491           },
1492       },
1493
1494   };
1495   memcpy(mv2_reduce_thresholds_table, mv2_tmp_reduce_thresholds_table,
1496       mv2_size_reduce_tuning_table * sizeof (mv2_reduce_tuning_table));
1497 }
1498
1499 /************ Reduce scatter variables and initializers                        */
1500
1501 typedef struct {
1502   int min;
1503   int max;
1504   int (*MV2_pt_Red_scat_function)(void *sendbuf,
1505       void *recvbuf,
1506       int *recvcnts,
1507       MPI_Datatype datatype,
1508       MPI_Op op,
1509       MPI_Comm comm_ptr);
1510 } mv2_red_scat_tuning_element;
1511
1512 typedef struct {
1513   int numproc;
1514   int size_inter_table;
1515   mv2_red_scat_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1516 } mv2_red_scat_tuning_table;
1517
1518 int mv2_size_red_scat_tuning_table = 0;
1519 mv2_red_scat_tuning_table *mv2_red_scat_thresholds_table = NULL;
1520
1521
1522 int (*MV2_Red_scat_function)(void *sendbuf,
1523     void *recvbuf,
1524     int *recvcnts,
1525     MPI_Datatype datatype,
1526     MPI_Op op,
1527     MPI_Comm comm_ptr);
1528
1529
1530
1531 static  int MPIR_Reduce_Scatter_Basic_MV2(void *sendbuf,
1532     void *recvbuf,
1533     int *recvcnts,
1534     MPI_Datatype datatype,
1535     MPI_Op op,
1536     MPI_Comm comm)
1537 {
1538   Coll_reduce_scatter_default::reduce_scatter(sendbuf,recvbuf,recvcnts,datatype,op,comm);
1539   return MPI_SUCCESS;
1540 }
/* MVAPICH2 reduce-scatter routine names mapped onto the SMPI collectives that stand in for them. */
#define MPIR_Reduce_scatter_non_comm_MV2     Coll_reduce_scatter_mpich_noncomm::reduce_scatter
#define MPIR_Reduce_scatter_Pair_Wise_MV2    Coll_reduce_scatter_mpich_pair::reduce_scatter
#define MPIR_Reduce_scatter_Rec_Halving_MV2  Coll_reduce_scatter_ompi_basic_recursivehalving::reduce_scatter
1544
1545
1546
1547
1548 static void init_mv2_reduce_scatter_tables_stampede(){
1549   if(smpi_coll_cleanup_callback==NULL)
1550     smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1551   mv2_size_red_scat_tuning_table = 6;
1552   mv2_red_scat_thresholds_table = static_cast<mv2_red_scat_tuning_table*>(xbt_malloc(mv2_size_red_scat_tuning_table *
1553       sizeof (mv2_red_scat_tuning_table)));
1554   mv2_red_scat_tuning_table mv2_tmp_red_scat_thresholds_table[] = {
1555       {
1556           16,
1557           3,
1558           {
1559               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1560               {64, 65536, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1561               {65536, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1562           },
1563       },
1564       {
1565           32,
1566           3,
1567           {
1568               {0, 64, &MPIR_Reduce_Scatter_Basic_MV2},
1569               {64, 131072, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1570               {131072, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1571           },
1572       },
1573       {
1574           64,
1575           3,
1576           {
1577               {0, 1024, &MPIR_Reduce_Scatter_Basic_MV2},
1578               {1024, 262144, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1579               {262144, -1, &MPIR_Reduce_scatter_Pair_Wise_MV2},
1580           },
1581       },
1582       {
1583           128,
1584           2,
1585           {
1586               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1587               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1588           },
1589       },
1590       {
1591           256,
1592           2,
1593           {
1594               {0, 128, &MPIR_Reduce_Scatter_Basic_MV2},
1595               {128, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1596           },
1597       },
1598       {
1599           512,
1600           2,
1601           {
1602               {0, 256, &MPIR_Reduce_Scatter_Basic_MV2},
1603               {256, -1, &MPIR_Reduce_scatter_Rec_Halving_MV2},
1604           },
1605       },
1606
1607   };
1608   memcpy(mv2_red_scat_thresholds_table, mv2_tmp_red_scat_thresholds_table,
1609       mv2_size_red_scat_tuning_table * sizeof (mv2_red_scat_tuning_table));
1610 }
1611
1612 /************ Scatter variables and initializers                        */
1613
1614 typedef struct {
1615   int min;
1616   int max;
1617   int (*MV2_pt_Scatter_function)(void *sendbuf,
1618       int sendcnt,
1619       MPI_Datatype sendtype,
1620       void *recvbuf,
1621       int recvcnt,
1622       MPI_Datatype recvtype,
1623       int root, MPI_Comm comm);
1624 } mv2_scatter_tuning_element;
1625
1626 typedef struct {
1627   int numproc;
1628   int size_inter_table;
1629   mv2_scatter_tuning_element inter_leader[MV2_MAX_NB_THRESHOLDS];
1630   int size_intra_table;
1631   mv2_scatter_tuning_element intra_node[MV2_MAX_NB_THRESHOLDS];
1632 } mv2_scatter_tuning_table;
1633
1634
1635 int *mv2_scatter_table_ppn_conf = NULL;
1636 int mv2_scatter_num_ppn_conf = 1;
1637 int *mv2_size_scatter_tuning_table = NULL;
1638 mv2_scatter_tuning_table **mv2_scatter_thresholds_table = NULL;
1639
1640 int (*MV2_Scatter_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1641     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1642     int root, MPI_Comm comm)=NULL;
1643
1644 int (*MV2_Scatter_intra_function) (void *sendbuf, int sendcount, MPI_Datatype sendtype,
1645     void *recvbuf, int recvcount, MPI_Datatype recvtype,
1646     int root, MPI_Comm comm)=NULL;
1647 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1648     int sendcnt,
1649     MPI_Datatype sendtype,
1650     void *recvbuf,
1651     int recvcnt,
1652     MPI_Datatype recvtype,
1653     int root, MPI_Comm comm_ptr);
1654
1655 int MPIR_Scatter_mcst_wrap_MV2(void *sendbuf,
1656     int sendcnt,
1657     MPI_Datatype sendtype,
1658     void *recvbuf,
1659     int recvcnt,
1660     MPI_Datatype recvtype,
1661     int root, MPI_Comm comm_ptr)
1662 {
1663   return 0;
1664 }
1665
/* MVAPICH2 scatter routine names mapped onto the SMPI collectives that stand in for them. */
#define MPIR_Scatter_MV2_Direct              Coll_scatter_ompi_basic_linear::scatter
#define MPIR_Scatter_MV2_Binomial            Coll_scatter_ompi_binomial::scatter
#define MPIR_Scatter_MV2_two_level_Direct    Coll_scatter_mvapich2_two_level_direct::scatter
#define MPIR_Scatter_MV2_two_level_Binomial  Coll_scatter_mvapich2_two_level_binomial::scatter
1670
1671
1672
1673
1674 static void init_mv2_scatter_tables_stampede(){
1675     if(smpi_coll_cleanup_callback==NULL)
1676       smpi_coll_cleanup_callback=&smpi_coll_cleanup_mvapich2;
1677
1678     int agg_table_sum = 0;
1679     int i;
1680     mv2_scatter_tuning_table **table_ptrs = NULL;
1681     mv2_scatter_num_ppn_conf = 3;
1682     mv2_scatter_thresholds_table
1683     = static_cast<mv2_scatter_tuning_table**>(xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1684         * mv2_scatter_num_ppn_conf));
1685     table_ptrs = static_cast<mv2_scatter_tuning_table**>(xbt_malloc(sizeof(mv2_scatter_tuning_table *)
1686         * mv2_scatter_num_ppn_conf));
1687     mv2_size_scatter_tuning_table = static_cast<int*>(xbt_malloc(sizeof(int) *
1688         mv2_scatter_num_ppn_conf));
1689     mv2_scatter_table_ppn_conf
1690     = static_cast<int*>(xbt_malloc(mv2_scatter_num_ppn_conf * sizeof(int)));
1691     mv2_scatter_table_ppn_conf[0] = 1;
1692     mv2_size_scatter_tuning_table[0] = 6;
1693     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_1ppn[] = {
1694         {2,
1695             1,
1696             {
1697                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1698             },
1699             1,
1700             {
1701                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1702             },
1703         },
1704
1705         {4,
1706             1,
1707             {
1708                 {0, -1, &MPIR_Scatter_MV2_Direct},
1709             },
1710             1,
1711             {
1712                 {0, -1, &MPIR_Scatter_MV2_Direct},
1713             },
1714         },
1715
1716         {8,
1717             1,
1718             {
1719                 {0, -1, &MPIR_Scatter_MV2_Direct},
1720             },
1721             1,
1722             {
1723                 {0, -1, &MPIR_Scatter_MV2_Direct},
1724             },
1725         },
1726
1727         {16,
1728             1,
1729             {
1730                 {0, -1, &MPIR_Scatter_MV2_Direct},
1731             },
1732             1,
1733             {
1734                 {0, -1, &MPIR_Scatter_MV2_Direct},
1735             },
1736         },
1737
1738         {32,
1739             1,
1740             {
1741                 {0, -1, &MPIR_Scatter_MV2_Direct},
1742             },
1743             1,
1744             {
1745                 {0, -1, &MPIR_Scatter_MV2_Direct},
1746             },
1747         },
1748
1749         {64,
1750             2,
1751             {
1752                 {0, 32, &MPIR_Scatter_MV2_Binomial},
1753                 {32, -1, &MPIR_Scatter_MV2_Direct},
1754             },
1755             1,
1756             {
1757                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1758             },
1759         },
1760     };
1761     table_ptrs[0] = mv2_tmp_scatter_thresholds_table_1ppn;
1762     mv2_scatter_table_ppn_conf[1] = 2; /* Configuration slot 1 is tagged with the value 2, matching the "2ppn" suffix of the table below. */
1763     mv2_size_scatter_tuning_table[1] = 6; /* Entry count recorded for slot 1; the initializer below lists 6 entries (4, 8, 16, 32, 64, 128). */
1764     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_2ppn[] = { /* Entry layout: leading integer, row count, row list, second row count, second row list. NOTE(review): the struct is declared outside this excerpt; rows are presumably {min, max, function} with -1 meaning "no upper bound" - confirm. */
1765         {4,
1766             2,
1767             {
1768                 {0, 4096, &MPIR_Scatter_MV2_Binomial},
1769                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1770             },
1771             1,
1772             {
1773                 {0, -1, &MPIR_Scatter_MV2_Direct},
1774             },
1775         },
1776
1777         {8,
1778             2,
1779             {
1780                 {0, 512, &MPIR_Scatter_MV2_two_level_Direct},
1781                 {512, -1, &MPIR_Scatter_MV2_Direct},
1782             },
1783             1,
1784             {
1785                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1786             },
1787         },
1788
1789         {16,
1790             2,
1791             {
1792                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1793                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1794             },
1795             1,
1796             {
1797                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1798             },
1799         },
1800
1801         {32,
1802             2,
1803             {
1804                 {0, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1805                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1806             },
1807             1,
1808             {
1809                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1810             },
1811         },
1812
1813         {64,
1814             2,
1815             {
1816                 {0, 8192, &MPIR_Scatter_MV2_two_level_Direct},
1817                 {8192, -1, &MPIR_Scatter_MV2_Direct},
1818             },
1819             1,
1820             {
1821                 {0, -1, &MPIR_Scatter_MV2_Binomial},
1822             },
1823         },
1824
1825         {128,
1826             4,
1827             {
1828                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1829                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1830                 {128, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1831                 {16384, -1, &MPIR_Scatter_MV2_Direct},
1832             },
1833             1, /* NOTE(review): this count is 1 but the list below has 2 rows; in every other entry of this table the count equals the rows listed. Confirm whether the second row is meant to be used. */
1834             {
1835                 {0, 128, &MPIR_Scatter_MV2_Direct},
1836                 {128, -1, &MPIR_Scatter_MV2_Binomial},
1837             },
1838         },
1839     };
1840     table_ptrs[1] = mv2_tmp_scatter_thresholds_table_2ppn; /* Stage the local table; the memcpy calls at the end of this function copy it into mv2_scatter_thresholds_table. */
1841     mv2_scatter_table_ppn_conf[2] = 16; /* Configuration slot 2 is tagged with the value 16, matching the "16ppn" suffix of the table below. */
1842     mv2_size_scatter_tuning_table[2] = 8; /* Entry count recorded for slot 2; the initializer below lists 8 entries (16 through 2048). */
1843     mv2_scatter_tuning_table mv2_tmp_scatter_thresholds_table_16ppn[] = { /* Entry layout: leading integer, row count, row list, second row count, second row list. NOTE(review): the struct is declared outside this excerpt; rows are presumably {min, max, function} with -1 meaning "no upper bound" - confirm. */
1844         {
1845             16,
1846             2,
1847             {
1848                 {0, 256, &MPIR_Scatter_MV2_Binomial},
1849                 {256, -1, &MPIR_Scatter_MV2_Direct},
1850             },
1851             1,
1852             {
1853                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1854             },
1855         },
1856
1857         {
1858             32,
1859             2,
1860             {
1861                 {0, 512, &MPIR_Scatter_MV2_Binomial},
1862                 {512, -1, &MPIR_Scatter_MV2_Direct},
1863             },
1864             1,
1865             {
1866                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1867             },
1868         },
1869
1870         {
1871             64,
1872             2,
1873             {
1874                 {0, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1875                 {1024, -1, &MPIR_Scatter_MV2_Direct},
1876             },
1877             1,
1878             {
1879                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1880             },
1881         },
1882
1883         {
1884             128,
1885             4,
1886             {
1887                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2}, /* From this entry on, the first two rows both end at 16: the mcst wrapper row, then a second row naming another function. */
1888                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1889                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1890                 {2048, -1, &MPIR_Scatter_MV2_Direct},
1891             },
1892             1,
1893             {
1894                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1895             },
1896         },
1897
1898         {
1899             256,
1900             4,
1901             {
1902                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1903                 {0, 16, &MPIR_Scatter_MV2_two_level_Direct},
1904                 {16, 2048, &MPIR_Scatter_MV2_two_level_Direct},
1905                 {2048, -1,  &MPIR_Scatter_MV2_Direct},
1906             },
1907             1,
1908             {
1909                 { 0, -1, &MPIR_Scatter_MV2_Direct},
1910             },
1911         },
1912
1913         {
1914             512,
1915             4,
1916             {
1917                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1918                 {16, 16, &MPIR_Scatter_MV2_two_level_Direct}, /* NOTE(review): first value is 16 here, whereas the matching row in the 128, 256, 1024 and 2048 entries starts at 0 - confirm which is intended. */
1919                 {16, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1920                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1921             },
1922             1,
1923             {
1924                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1925             },
1926         },
1927         {
1928             1024,
1929             5,
1930             {
1931                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1932                 {0, 16,  &MPIR_Scatter_MV2_Binomial},
1933                 {16, 32, &MPIR_Scatter_MV2_Binomial},
1934                 {32, 4096, &MPIR_Scatter_MV2_two_level_Direct},
1935                 {4096, -1, &MPIR_Scatter_MV2_Direct},
1936             },
1937             1,
1938             {
1939                 { 0, -1, &MPIR_Scatter_MV2_Binomial},
1940             },
1941         },
1942         {
1943             2048,
1944             7,
1945             {
1946                 {0, 16, &MPIR_Scatter_mcst_wrap_MV2},
1947                 {0, 16,  &MPIR_Scatter_MV2_two_level_Binomial},
1948                 {16, 128, &MPIR_Scatter_MV2_two_level_Binomial},
1949                 {128, 1024, &MPIR_Scatter_MV2_two_level_Direct},
1950                 {1024, 16384, &MPIR_Scatter_MV2_two_level_Direct},
1951                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1952                 {65536, -1, &MPIR_Scatter_MV2_two_level_Direct},
1953             },
1954             6, /* The only entry in this table whose second list has more than one row. */
1955             {
1956                 {0, 16, &MPIR_Scatter_MV2_Binomial},
1957                 {16, 128, &MPIR_Scatter_MV2_Binomial},
1958                 {128, 1024, &MPIR_Scatter_MV2_Binomial},
1959                 {1024, 16384, &MPIR_Scatter_MV2_Direct},
1960                 {16384, 65536, &MPIR_Scatter_MV2_Direct},
1961                 {65536, -1, &MPIR_Scatter_MV2_Direct},
1962             },
1963         },
1964     };
1965     table_ptrs[2] = mv2_tmp_scatter_thresholds_table_16ppn; /* Stage the local table; the memcpy calls at the end of this function copy it into mv2_scatter_thresholds_table. */
1966     agg_table_sum = 0; /* Accumulates the total number of entries over all configuration slots. */
1967     for (i = 0; i < mv2_scatter_num_ppn_conf; i++) {
1968         agg_table_sum += mv2_size_scatter_tuning_table[i];
1969     }
1970     mv2_scatter_thresholds_table[0] =
1971         static_cast<mv2_scatter_tuning_table*>(xbt_malloc(agg_table_sum * sizeof (mv2_scatter_tuning_table))); /* One allocation sized for every entry of every slot; slot 0 points at its start. */
1972     memcpy(mv2_scatter_thresholds_table[0], table_ptrs[0], /* Copy slot 0's staged entries to the start of the allocation. */
1973         (sizeof(mv2_scatter_tuning_table)
1974             * mv2_size_scatter_tuning_table[0]));
1975     for (i = 1; i < mv2_scatter_num_ppn_conf; i++) { /* Each later slot points into the same allocation, immediately after the previous slot's entries. */
1976         mv2_scatter_thresholds_table[i] =
1977             mv2_scatter_thresholds_table[i - 1]
1978                                          + mv2_size_scatter_tuning_table[i - 1];
1979         memcpy(mv2_scatter_thresholds_table[i], table_ptrs[i], /* Copy slot i's staged entries into its region. */
1980             (sizeof(mv2_scatter_tuning_table)
1981                 * mv2_size_scatter_tuning_table[i]));
1982     }
1983     xbt_free(table_ptrs); /* NOTE(review): table_ptrs is allocated outside this excerpt; presumably a temporary array holding only the pointers to the local tables - confirm. */
1984   
1985 }
1986