1 /*************************************************************************
3 * N A S P A R A L L E L B E N C H M A R K S 3.3 *
7 *************************************************************************
9 * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. *
11 * Permission to use, copy, distribute and modify this software *
12 * for any purpose with or without fee is hereby granted. We *
13 * request, however, that all derived work reference the NAS *
14 * Parallel Benchmarks 3.3. This software is provided "as is" *
15 * without express or implied warranty. *
17 * Information on NPB 3.3, including the technical report, the *
18 * original specifications, source code, results and information *
19 * on how to submit new results, is available at: *
21 * http: www.nas.nasa.gov/Software/NPB *
23 * Send comments or suggestions to npb@nas.nasa.gov *
24 * Send bug reports to npb-bugs@nas.nasa.gov *
26 * NAS Parallel Benchmarks Group *
27 * NASA Ames Research Center *
29 * Moffett Field, CA 94035-1000 *
31 * E-mail: npb@nas.nasa.gov *
32 * Fax: (650) 604-3957 *
34 *************************************************************************
36 * Author: M. Frumkin * *
38 *************************************************************************/
45 #include "npbparams.h"
47 #include "instr/instr.h" //TRACE_
54 //int passed_verification;
55 extern double randlc( double *X, double *A );
57 void c_print_results( char *name,
68 int passed_verification,
78 void timer_clear( int n );
79 void timer_start( int n );
80 void timer_stop( int n );
81 double timer_read( int n );
82 int timer_on=0,timers_tot=64;
84 int verify(char *bmname,double rnm2){
85 double verify_value=0.0;
86 double epsilon=1.0E-8;
91 if(strstr(bmname,"BH")){
92 verify_value=30892725.0;
93 }else if(strstr(bmname,"WH")){
94 verify_value=67349758.0;
95 }else if(strstr(bmname,"SH")){
96 verify_value=58875767.0;
98 fprintf(stderr,"No such benchmark as %s.\n",bmname);
102 if(strstr(bmname,"BH")){
103 verify_value = 4102461.0;
104 }else if(strstr(bmname,"WH")){
105 verify_value = 204280762.0;
106 }else if(strstr(bmname,"SH")){
107 verify_value = 186944764.0;
109 fprintf(stderr,"No such benchmark as %s.\n",bmname);
113 if(strstr(bmname,"BH")){
114 verify_value = 17809491.0;
115 }else if(strstr(bmname,"WH")){
116 verify_value = 1289925229.0;
117 }else if(strstr(bmname,"SH")){
118 verify_value = 610856482.0;
120 fprintf(stderr,"No such benchmark as %s.\n",bmname);
124 if(strstr(bmname,"BH")){
125 verify_value = 4317114.0;
126 }else if(strstr(bmname,"WH")){
127 verify_value = 7877279917.0;
128 }else if(strstr(bmname,"SH")){
129 verify_value = 1836863082.0;
131 fprintf(stderr,"No such benchmark as %s.\n",bmname);
135 if(strstr(bmname,"BH")){
137 }else if(strstr(bmname,"WH")){
139 }else if(strstr(bmname,"SH")){
142 fprintf(stderr,"No such benchmark as %s.\n",bmname);
146 if(strstr(bmname,"BH")){
148 }else if(strstr(bmname,"WH")){
150 }else if(strstr(bmname,"SH")){
153 fprintf(stderr,"No such benchmark as %s.\n",bmname);
157 fprintf(stderr,"No such class as %c.\n",cls);
159 fprintf(stderr," %s L2 Norm = %f\n",bmname,rnm2);
161 fprintf(stderr," No verification was performed.\n");
162 }else if( rnm2 - verify_value < epsilon &&
163 rnm2 - verify_value > -epsilon) { /* abs here does not work on ALTIX */
165 fprintf(stderr," Deviation = %f\n",(rnm2 - verify_value));
168 fprintf(stderr," The correct verification value = %f\n",verify_value);
169 fprintf(stderr," Got value = %f\n",rnm2);
177 int ipowMod(int a,long long int n,int md){
180 fprintf(stderr,"ipowMod: exponent must be nonnegative exp=%lld\n",n);
185 fprintf(stderr,"ipowMod: module must be positive mod=%d",md);
206 DGraph *buildSH(char cls){
208 Nodes of the graph must be topologically sorted
209 to avoid MPI deadlock.
212 int numSources=NUM_SOURCES; /* must be power of 2 */
213 int numOfLayers=0,tmpS=numSources>>1;
214 int firstLayerNode=0;
217 int mask=0x0,ndid=0,ndoff=0;
221 sprintf(nm,"DT_SH.%c",cls);
228 for(i=0;i<numSources;i++){
229 sprintf(nm,"Source.%d",i);
233 for(j=0;j<numOfLayers;j++){
235 for(i=0;i<numSources;i++){
236 sprintf(nm,"Comparator.%d",(i+j*firstLayerNode));
240 ndid=firstLayerNode+ndoff;
241 ar=newArc(dg->node[ndid],nd);
244 ndid=firstLayerNode+ndoff;
245 ar=newArc(dg->node[ndid],nd);
248 firstLayerNode+=numSources;
250 mask=0x00000001<<numOfLayers;
251 for(i=0;i<numSources;i++){
252 sprintf(nm,"Sink.%d",i);
256 ndid=firstLayerNode+ndoff;
257 ar=newArc(dg->node[ndid],nd);
260 ndid=firstLayerNode+ndoff;
261 ar=newArc(dg->node[ndid],nd);
266 DGraph *buildWH(char cls){
268 Nodes of the graph must be topologically sorted
269 to avoid MPI deadlock.
272 int numSources=NUM_SOURCES,maxInDeg=4;
273 int numLayerNodes=numSources,firstLayerNode=0;
274 int totComparators=0;
275 int numPrevLayerNodes=numLayerNodes;
278 DGNode *nd=NULL,*source=NULL,*tmp=NULL,*snd=NULL;
282 sprintf(nm,"DT_WH.%c",cls);
285 for(i=0;i<numSources;i++){
286 sprintf(nm,"Sink.%d",i);
291 numPrevLayerNodes=numLayerNodes;
292 while(numLayerNodes>maxInDeg){
293 numLayerNodes=numLayerNodes/maxInDeg;
294 if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
295 for(i=0;i<numLayerNodes;i++){
296 sprintf(nm,"Comparator.%d",totComparators);
299 id=AttachNode(dg,nd);
300 for(j=0;j<maxInDeg;j++){
302 if(sid>=numPrevLayerNodes) break;
303 snd=dg->node[firstLayerNode+sid];
304 ar=newArc(dg->node[id],snd);
308 firstLayerNode+=numPrevLayerNodes;
309 numPrevLayerNodes=numLayerNodes;
311 source=newNode("Source");
312 AttachNode(dg,source);
313 for(i=0;i<numPrevLayerNodes;i++){
314 nd=dg->node[firstLayerNode+i];
315 ar=newArc(source,nd);
319 for(i=0;i<dg->numNodes/2;i++){ /* Topological sorting */
321 dg->node[i]=dg->node[dg->numNodes-1-i];
323 dg->node[dg->numNodes-1-i]=tmp;
324 dg->node[dg->numNodes-1-i]->id=dg->numNodes-1-i;
328 DGraph *buildBH(char cls){
330 Nodes of the graph must be topologically sorted
331 to avoid MPI deadlock.
334 int numSources=NUM_SOURCES,maxInDeg=4;
335 int numLayerNodes=numSources,firstLayerNode=0;
337 DGNode *nd=NULL, *snd=NULL, *sink=NULL;
339 int totComparators=0;
340 int numPrevLayerNodes=numLayerNodes;
344 sprintf(nm,"DT_BH.%c",cls);
347 for(i=0;i<numSources;i++){
348 sprintf(nm,"Source.%d",i);
352 while(numLayerNodes>maxInDeg){
353 numLayerNodes=numLayerNodes/maxInDeg;
354 if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
355 for(i=0;i<numLayerNodes;i++){
356 sprintf(nm,"Comparator.%d",totComparators);
359 id=AttachNode(dg,nd);
360 for(j=0;j<maxInDeg;j++){
362 if(sid>=numPrevLayerNodes) break;
363 snd=dg->node[firstLayerNode+sid];
364 ar=newArc(snd,dg->node[id]);
368 firstLayerNode+=numPrevLayerNodes;
369 numPrevLayerNodes=numLayerNodes;
371 sink=newNode("Sink");
373 for(i=0;i<numPrevLayerNodes;i++){
374 nd=dg->node[firstLayerNode+i];
385 Arr *newArr(int len){
386 Arr *arr=(Arr *)SMPI_SHARED_MALLOC(sizeof(Arr));
388 arr->val=(double *)SMPI_SHARED_MALLOC(len*sizeof(double));
391 void arrShow(Arr* a){
392 if(!a) fprintf(stderr,"-- NULL array\n");
394 fprintf(stderr,"-- length=%d\n",a->len);
397 double CheckVal(Arr *feat){
400 for(i=0;i<feat->len;i++){
401 csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since
402 result will be 0 for large len */
406 int GetFNumDPar(int* mean, int* stdev){
408 *stdev=STD_DEVIATION;
411 int GetFeatureNum(char *mbname,int id){
412 double tran=314159265.0;
414 double denom=randlc(&tran,&A);
416 int mean=NUM_SAMPLES,stdev=128;
418 GetFNumDPar(&mean,&stdev);
419 rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev);
420 if(rtfs<0) rtfs=-rtfs;
424 Arr* RandomFeatures(char *bmname,int fdim,int id){
425 int len=GetFeatureNum(bmname,id)*fdim;
426 Arr* feat=newArr(len);
427 int nxg=2,nyg=2,nzg=2,nfg=5;
428 int nx=421,ny=419,nz=1427,nf=3527;
429 long long int expon=(len*(id+1))%3141592;
430 int seedx=ipowMod(nxg,expon,nx),
431 seedy=ipowMod(nyg,expon,ny),
432 seedz=ipowMod(nzg,expon,nz),
433 seedf=ipowMod(nfg,expon,nf);
439 for(i=0;i<len;i+=fdim){
440 seedx=(seedx*nxg)%nx;
441 seedy=(seedy*nyg)%ny;
442 seedz=(seedz*nzg)%nz;
443 seedf=(seedf*nfg)%nf;
445 feat->val[i+1]=seedy;
446 feat->val[i+2]=seedz;
447 feat->val[i+3]=seedf;
451 fprintf(stderr,"** RandomFeatures time in node %d = %f\n",id,timer_read(id+1));
455 void Resample(Arr *a,int blen){
456 long long int i=0,j=0,jlo=0,jhi=0;
458 double *nval=(double *)SMPI_SHARED_MALLOC(blen*sizeof(double));
460 for(i=0;i<blen;i++) nval[i]=0.0;
461 for(i=1;i<a->len-1;i++){
462 jlo=(int)(0.5*(2*i-1)*(blen/a->len));
463 jhi=(int)(0.5*(2*i+1)*(blen/a->len));
465 avval=a->val[i]/(jhi-jlo+1);
466 for(j=jlo;j<=jhi;j++){
471 nval[blen-1]=a->val[a->len-1];
472 SMPI_SHARED_FREE(a->val);
477 Arr* WindowFilter(Arr *a, Arr* b,int w){
479 double rms0=0.0,rms1=0.0,rmsm1=0.0;
480 double weight=((double) (w+1))/(w+2);
487 if(a->len<b->len) Resample(a,b->len);
488 if(a->len>b->len) Resample(b,a->len);
489 for(i=fielddim;i<a->len-fielddim;i+=fielddim){
490 rms0=(a->val[i]-b->val[i])*(a->val[i]-b->val[i])
491 +(a->val[i+1]-b->val[i+1])*(a->val[i+1]-b->val[i+1])
492 +(a->val[i+2]-b->val[i+2])*(a->val[i+2]-b->val[i+2])
493 +(a->val[i+3]-b->val[i+3])*(a->val[i+3]-b->val[i+3]);
495 rms1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
496 +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
497 +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
498 +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
500 rmsm1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
501 +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
502 +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
503 +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
512 a->val[i]=weight*b->val[i];
513 a->val[i+1]=weight*b->val[i+1];
514 a->val[i+2]=weight*b->val[i+2];
515 a->val[i+3]=weight*b->val[i+3];
518 a->val[i]=weight*b->val[j];
519 a->val[i+1]=weight*b->val[j+1];
520 a->val[i+2]=weight*b->val[j+2];
521 a->val[i+3]=weight*b->val[j+3];
522 }else { /*if(k==-1)*/
524 a->val[i]=weight*b->val[j];
525 a->val[i+1]=weight*b->val[j+1];
526 a->val[i+2]=weight*b->val[j+2];
527 a->val[i+3]=weight*b->val[j+3];
532 fprintf(stderr,"** WindowFilter time in node %d = %f\n",(w-1),timer_read(w));
537 int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
542 for(i=0;i<nd->outDegree;i++){
544 if(ar->tail!=nd) continue;
547 if(head->address!=nd->address){
548 MPI_Send(&feat->len,1,MPI_INT,head->address,tag,MPI_COMM_WORLD);
549 MPI_Send(feat->val,feat->len,MPI_DOUBLE,head->address,tag,MPI_COMM_WORLD);
554 Arr* CombineStreams(DGraph *dg,DGNode *nd){
555 Arr *resfeat=newArr(NUM_SAMPLES*fielddim);
560 Arr *feat=NULL,*featp=NULL;
562 if(nd->inDegree==0) return NULL;
563 for(i=0;i<nd->inDegree;i++){
565 if(ar->head!=nd) continue;
567 if(tail->address!=nd->address){
570 MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
572 MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
573 resfeat=WindowFilter(resfeat,feat,nd->id);
574 SMPI_SHARED_FREE(feat);
576 featp=(Arr *)tail->feat;
577 feat=newArr(featp->len);
578 memcpy(feat->val,featp->val,featp->len*sizeof(double));
579 resfeat=WindowFilter(resfeat,feat,nd->id);
580 SMPI_SHARED_FREE(feat);
583 for(i=0;i<resfeat->len;i++) resfeat->val[i]=((int)resfeat->val[i])/nd->inDegree;
587 double Reduce(Arr *a,int w){
593 retv=(int)(w*CheckVal(a));/* The casting needed for node
594 and array dependent verifcation */
597 fprintf(stderr,"** Reduce time in node %d = %f\n",(w-1),timer_read(w));
602 double ReduceStreams(DGraph *dg,DGNode *nd){
610 for(i=0;i<nd->inDegree;i++){
612 if(ar->head!=nd) continue;
614 if(tail->address!=nd->address){
618 MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
620 MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
621 csum+=Reduce(feat,(nd->id+1));
622 SMPI_SHARED_FREE(feat);
624 csum+=Reduce(tail->feat,(nd->id+1));
627 if(nd->inDegree>0)csum=(((long long int)csum)/nd->inDegree);
628 retv=(nd->id+1)*csum;
632 int ProcessNodes(DGraph *dg,int me){
635 int i=0,verified=0,tag;
640 for(i=0;i<dg->numNodes;i++){
642 if(nd->address!=me) continue;
643 if(strstr(nd->name,"Source")){
644 nd->feat=RandomFeatures(dg->name,fielddim,nd->id);
645 SendResults(dg,nd,nd->feat);
646 }else if(strstr(nd->name,"Sink")){
647 chksum=ReduceStreams(dg,nd);
648 tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
649 MPI_Send(&chksum,1,MPI_DOUBLE,0,tag,MPI_COMM_WORLD);
651 feat=CombineStreams(dg,nd);
652 SendResults(dg,nd,feat);
655 if(me==0){ /* Report node */
658 for(i=0;i<dg->numNodes;i++){
660 if(!strstr(nd->name,"Sink")) continue;
661 tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
662 MPI_Recv(&rchksum,1,MPI_DOUBLE,nd->address,tag,MPI_COMM_WORLD,&status);
665 verified=verify(dg->name,chksum);
670 int main(int argc,char **argv ){
671 int my_rank,comm_size;
674 int verified=0, featnum=0;
675 double bytes_sent=2.0,tot_time=0.0;
679 MPI_Init( &argc, &argv );
680 MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
681 MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
682 TRACE_smpi_set_category ("start");
685 ( strncmp(argv[1],"BH",2)!=0
686 &&strncmp(argv[1],"WH",2)!=0
687 &&strncmp(argv[1],"SH",2)!=0
691 fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
692 fprintf(stderr,"** Where \n - N is integer number of MPI processes\n");
693 fprintf(stderr," - S is the class S, W, or A \n");
694 fprintf(stderr," - GraphName is the communication graph name BH, WH, or SH.\n");
695 fprintf(stderr," - the number of MPI processes N should not be be less than \n");
696 fprintf(stderr," the number of nodes in the graph\n");
701 if(strncmp(argv[1],"BH",2)==0){
703 }else if(strncmp(argv[1],"WH",2)==0){
705 }else if(strncmp(argv[1],"SH",2)==0){
709 if(timer_on&&dg->numNodes+1>timers_tot){
712 fprintf(stderr,"Not enough timers. Node timeing is off. \n");
714 if(dg->numNodes>comm_size){
716 fprintf(stderr,"** The number of MPI processes should not be less than \n");
717 fprintf(stderr,"** the number of nodes in the graph\n");
718 fprintf(stderr,"** Number of MPI processes = %d\n",comm_size);
719 fprintf(stderr,"** Number nodes in the graph = %d\n",dg->numNodes);
724 for(i=0;i<dg->numNodes;i++){
725 dg->node[i]->address=i;
728 printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" );
734 TRACE_smpi_set_category ("dt");
735 verified=ProcessNodes(dg,my_rank);
736 TRACE_smpi_set_category ("finalize");
738 featnum=NUM_SAMPLES*fielddim;
739 bytes_sent=featnum*dg->numArcs;
743 tot_time=timer_read(0);
744 c_print_results( dg->name,