1 !-------------------------------------------------------------------------!
3 ! N A S P A R A L L E L B E N C H M A R K S 3.3 !
7 !-------------------------------------------------------------------------!
9 ! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
10 ! It is described in NAS Technical Reports 95-020 and 02-007 !
12 ! Permission to use, copy, distribute and modify this software !
13 ! for any purpose with or without fee is hereby granted. We !
14 ! request, however, that all derived work reference the NAS !
15 ! Parallel Benchmarks 3.3. This software is provided "as is" !
16 ! without express or implied warranty. !
18 ! Information on NPB 3.3, including the technical report, the !
19 ! original specifications, source code, results and information !
20 ! on how to submit new results, is available at: !
22 ! http://www.nas.nasa.gov/Software/NPB/ !
24 ! Send comments or suggestions to npb@nas.nasa.gov !
26 ! NAS Parallel Benchmarks Group !
27 ! NASA Ames Research Center !
29 ! Moffett Field, CA 94035-1000 !
31 ! E-mail: npb@nas.nasa.gov !
32 ! Fax: (650) 604-3957 !
34 !-------------------------------------------------------------------------!
36 !TO REDUCE THE AMOUNT OF MEMORY REQUIRED BY THE BENCHMARK WE NO LONGER
37 !STORE THE ENTIRE TIME EVOLUTION ARRAY "EX" FOR ALL TIME STEPS, BUT
38 !JUST FOR THE FIRST. ALSO, IT IS STORED ONLY FOR THE PART OF THE GRID
39 !FOR WHICH THE CALLING PROCESSOR IS RESPONSIBLE, SO THAT THE MEMORY
40 !USAGE BECOMES SCALABLE. THIS NEW ARRAY IS CALLED "TWIDDLE" (SEE
43 !TO AVOID PROBLEMS WITH VERY LARGE ARRAY SIZES THAT ARE COMPUTED BY
44 !MULTIPLYING GRID DIMENSIONS (CAUSING INTEGER OVERFLOW IN THE VARIABLE
45 !NTOTAL) AND SUBSEQUENTLY DIVIDING BY THE NUMBER OF PROCESSORS, WE
46 !COMPUTE THE SIZE OF ARRAY PARTITIONS MORE CONSERVATIVELY AS
47 !((NX*NY)/NP)*NZ, WHERE NX, NY, AND NZ ARE GRID DIMENSIONS AND NP IS
48 !THE NUMBER OF PROCESSORS, THE RESULT IS STORED IN "NTDIVNP". FOR THE
49 !PERFORMANCE CALCULATION WE STORE THE TOTAL NUMBER OF GRID POINTS IN A
50 !FLOATING POINT NUMBER "NTOTAL_F" INSTEAD OF AN INTEGER.
51 !THIS FIX WILL FAIL IF THE NUMBER OF PROCESSORS IS SMALL.
53 !UGLY HACK OF SUBROUTINE IPOW46: FOR VERY LARGE GRIDS THE SINGLE EXPONENT
54 !FROM NPB2.3 MAY NOT FIT IN A 32-BIT INTEGER. HOWEVER, WE KNOW THAT THE
55 !"EXPONENT" ARGUMENT OF THIS ROUTINE CAN ALWAYS BE FACTORED INTO A TERM
56 !DIVISIBLE BY NX (EXP_1) AND ANOTHER TERM (EXP_2). NX IS USUALLY A POWER
57 !OF TWO, SO WE CAN KEEP HALVING IT UNTIL THE PRODUCT OF EXP_1
58 !AND EXP_2 IS SMALL ENOUGH (NAMELY EXP_2 ITSELF). THIS UPDATED VERSION
59 !OF IPWO46, WHICH NOW TAKES THE TWO FACTORS OF "EXPONENT" AS SEPARATE
60 !ARGUMENTS, MAY BREAK DOWN IF EXP_1 DOES NOT CONTAIN A LARGE POWER OF TWO.
62 c---------------------------------------------------------------------
66 c R. F. Van der Wijngaart
68 c---------------------------------------------------------------------
70 c---------------------------------------------------------------------
72 c---------------------------------------------------------------------
74 c---------------------------------------------------------------------
76 c---------------------------------------------------------------------
77 c---------------------------------------------------------------------
81 c---------------------------------------------------------------------
82 c---------------------------------------------------------------------
90 c---------------------------------------------------------------------
91 c u0, u1, u2 are the main arrays in the problem.
92 c Depending on the decomposition, these arrays will have different
93 c dimensions. To accomodate all possibilities, we allocate them as
94 c one-dimensional arrays and pass them to subroutines for different
96 c - u0 contains the initial (transformed) initial condition
97 c - u1 and u2 are working arrays
98 c---------------------------------------------------------------------
100 double complex u0(ntdivnp),
103 double precision twiddle(ntdivnp)
104 c---------------------------------------------------------------------
105 c Large arrays are in common so that they are allocated on the
106 c heap rather than the stack. This common block is not
107 c referenced directly anywhere else. Padding is to avoid accidental
108 c cache problems, since all array sizes are powers of two.
109 c---------------------------------------------------------------------
111 double complex pad1(3), pad2(3), pad3(3)
112 common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle
115 double precision total_time, mflops
121 c---------------------------------------------------------------------
122 c Run the entire problem once to make sure all data is touched.
123 c This reduces variable startup costs, which is important for such a
124 c short benchmark. The other NPB 2 implementations are similar.
125 c---------------------------------------------------------------------
131 call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
132 call compute_initial_conditions(u1, dims(1,1), dims(2,1),
134 call fft_init (dims(1,1))
137 c---------------------------------------------------------------------
138 c Start over from the beginning. Note that all operations must
139 c be timed, in contrast to other benchmarks.
140 c---------------------------------------------------------------------
144 call MPI_Barrier(MPI_COMM_WORLD, ierr)
146 call timer_start(T_total)
147 if (timers_enabled) call timer_start(T_setup)
149 call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
150 call compute_initial_conditions(u1, dims(1,1), dims(2,1),
152 call fft_init (dims(1,1))
154 if (timers_enabled) call synchup()
155 if (timers_enabled) call timer_stop(T_setup)
157 if (timers_enabled) call timer_start(T_fft)
159 if (timers_enabled) call timer_stop(T_fft)
162 if (timers_enabled) call timer_start(T_evolve)
163 call evolve(u0, u1, twiddle, dims(1,1), dims(2,1), dims(3,1))
164 if (timers_enabled) call timer_stop(T_evolve)
165 if (timers_enabled) call timer_start(T_fft)
167 if (timers_enabled) call timer_stop(T_fft)
168 if (timers_enabled) call synchup()
169 if (timers_enabled) call timer_start(T_checksum)
170 call checksum(iter, u2, dims(1,1), dims(2,1), dims(3,1))
171 if (timers_enabled) call timer_stop(T_checksum)
174 call verify(nx, ny, nz, niter, verified, class)
175 call timer_stop(t_total)
176 if (np .ne. np_min) verified = .false.
177 total_time = timer_read(t_total)
179 if( total_time .ne. 0. ) then
180 mflops = 1.0d-6*ntotal_f *
181 > (14.8157+7.19641*log(ntotal_f)
182 > + (5.23518+7.21113*log(ntotal_f))*niter)
188 call print_results('FT', class, nx, ny, nz, niter, np_min, np,
189 > total_time, mflops, ' floating point', verified,
190 > npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
192 if (timers_enabled) call print_timers()
193 call MPI_Finalize(ierr)
196 c---------------------------------------------------------------------
197 c---------------------------------------------------------------------
199 subroutine evolve(u0, u1, twiddle, d1, d2, d3)
201 c---------------------------------------------------------------------
202 c---------------------------------------------------------------------
204 c---------------------------------------------------------------------
205 c evolve u0 -> u1 (t time steps) in fourier space
206 c---------------------------------------------------------------------
212 double complex u0(d1,d2,d3)
213 double complex u1(d1,d2,d3)
214 double precision twiddle(d1,d2,d3)
220 u0(i,j,k) = u0(i,j,k)*(twiddle(i,j,k))
221 u1(i,j,k) = u0(i,j,k)
230 c---------------------------------------------------------------------
231 c---------------------------------------------------------------------
233 subroutine compute_initial_conditions(u0, d1, d2, d3)
235 c---------------------------------------------------------------------
236 c---------------------------------------------------------------------
238 c---------------------------------------------------------------------
239 c Fill in array u0 with initial conditions from
240 c random number generator
241 c---------------------------------------------------------------------
245 double complex u0(d1, d2, d3)
247 double precision x0, start, an, dummy
249 c---------------------------------------------------------------------
250 c 0-D and 1-D layouts are easy because each processor gets a contiguous
251 c chunk of the array, in the Fortran ordering sense.
252 c For a 2-D layout, it's a bit more complicated. We always
253 c have entire x-lines (contiguous) in processor.
254 c We can do ny/np1 of them at a time since we have
255 c ny/np1 contiguous in y-direction. But then we jump
256 c by z-planes (nz/np2 of them, total).
257 c For the 0-D and 1-D layouts we could do larger chunks, but
258 c this turns out to have no measurable impact on performance.
259 c---------------------------------------------------------------------
263 c---------------------------------------------------------------------
264 c Jump to the starting element for our first plane.
265 c---------------------------------------------------------------------
266 call ipow46(a, 2*nx, (zstart(1)-1)*ny + (ystart(1)-1), an)
267 dummy = randlc(start, an)
268 call ipow46(a, 2*nx, ny, an)
270 c---------------------------------------------------------------------
271 c Go through by z planes filling in one square at a time.
272 c---------------------------------------------------------------------
273 do k = 1, dims(3, 1) ! nz/np2
275 call vranlc(2*nx*dims(2, 1), x0, a, u0(1, 1, k))
276 if (k .ne. dims(3, 1)) dummy = randlc(start, an)
283 c---------------------------------------------------------------------
284 c---------------------------------------------------------------------
286 subroutine ipow46(a, exp_1, exp_2, result)
288 c---------------------------------------------------------------------
289 c---------------------------------------------------------------------
291 c---------------------------------------------------------------------
292 c compute a^exponent mod 2^46
293 c---------------------------------------------------------------------
296 double precision a, result, dummy, q, r
297 integer exp_1, exp_2, n, n2, ierr
299 double precision randlc
301 c---------------------------------------------------------------------
303 c a^n = a^(n/2)*a^(n/2) if n even else
304 c a^n = a*a^(n-1) if n odd
305 c---------------------------------------------------------------------
307 if (exp_2 .eq. 0 .or. exp_1 .eq. 0) return
315 if (n2 * 2 .eq. n) then
326 if (n2 * 2 .eq. n) then
340 c---------------------------------------------------------------------
341 c---------------------------------------------------------------------
345 c---------------------------------------------------------------------
346 c---------------------------------------------------------------------
352 integer ierr, i, j, fstatus
355 call MPI_Comm_size(MPI_COMM_WORLD, np, ierr)
356 call MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)
358 if (.not. convertdouble) then
359 dc_type = MPI_DOUBLE_COMPLEX
361 dc_type = MPI_COMPLEX
367 open (unit=2,file='inputft.data',status='old', iostat=fstatus)
369 if (fstatus .eq. 0) then
371 233 format(' Reading from input file inputft.data')
373 read (2,*) layout_type
377 c---------------------------------------------------------------------
378 c check to make sure input data is consistent
379 c---------------------------------------------------------------------
382 c---------------------------------------------------------------------
383 c 1. product of processor grid dims must equal number of processors
384 c---------------------------------------------------------------------
386 if (np1 * np2 .ne. np) then
388 238 format(' np1 and np2 given in input file are not valid.')
389 write(*, 239) np1*np2, np
390 239 format(' Product is ', i5, ' and should be ', i5)
391 call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
395 c---------------------------------------------------------------------
396 c 2. layout type must be valid
397 c---------------------------------------------------------------------
399 if (layout_type .ne. layout_0D .and.
400 > layout_type .ne. layout_1D .and.
401 > layout_type .ne. layout_2D) then
403 240 format(' Layout type specified in inputft.data is
405 call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
409 c---------------------------------------------------------------------
410 c 3. 0D layout must be 1x1 grid
411 c---------------------------------------------------------------------
413 if (layout_type .eq. layout_0D .and.
414 > (np1 .ne.1 .or. np2 .ne. 1)) then
416 241 format(' For 0D layout, both np1 and np2 must be 1 ')
417 call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
420 c---------------------------------------------------------------------
421 c 4. 1D layout must be 1xN grid
422 c---------------------------------------------------------------------
424 if (layout_type .eq. layout_1D .and. np1 .ne. 1) then
426 242 format(' For 1D layout, np1 must be 1 ')
427 call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
433 niter = niter_default
437 layout_type = layout_0D
438 else if (np .le. nz) then
441 layout_type = layout_1D
445 layout_type = layout_2D
449 if (np .lt. np_min) then
451 10 format(' Error: Compiled for ', I5, ' processors. ')
453 11 format(' Only ', i5, ' processors found ')
454 call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
458 234 format(' No input file inputft.data. Using compiled defaults')
459 write(*, 1001) nx, ny, nz
462 write(*, 1005) np1, np2
463 if (np .ne. np_min) write(*, 1006) np_min
465 if (layout_type .eq. layout_0D) then
467 else if (layout_type .eq. layout_1D) then
473 1000 format(//,' NAS Parallel Benchmarks 3.3 -- FT Benchmark',/)
474 1001 format(' Size : ', i4, 'x', i4, 'x', i4)
475 1002 format(' Iterations : ', 7x, i7)
476 1004 format(' Number of processes : ', 7x, i7)
477 1005 format(' Processor array : ', 5x, i4, 'x', i4)
478 1006 format(' WARNING: compiled for ', i5, ' processes. ',
479 > ' Will not verify. ')
480 1010 format(' Layout type : ', 9x, A5)
484 c---------------------------------------------------------------------
485 c Since np1, np2 and layout_type are in a common block,
486 c this sends all three.
487 c---------------------------------------------------------------------
488 call MPI_BCAST(np1, 3, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
489 call MPI_BCAST(niter, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
491 if (np1 .eq. 1 .and. np2 .eq. 1) then
492 layout_type = layout_0D
493 else if (np1 .eq. 1) then
494 layout_type = layout_1D
496 layout_type = layout_2D
499 if (layout_type .eq. layout_0D) then
505 else if (layout_type .eq. layout_1D) then
517 else if (layout_type .eq. layout_2D) then
532 dims(2, i) = dims(2, i) / np1
533 dims(3, i) = dims(3, i) / np2
537 c---------------------------------------------------------------------
538 c Determine processor coordinates of this processor
539 c Processor grid is np1xnp2.
540 c Arrays are always (n1, n2/np1, n3/np2)
541 c Processor coords are zero-based.
542 c---------------------------------------------------------------------
543 me2 = mod(me, np2) ! goes from 0...np2-1
544 me1 = me/np2 ! goes from 0...np1-1
545 c---------------------------------------------------------------------
546 c Communicators for rows/columns of processor grid.
547 c commslice1 is communicator of all procs with same me1, ranked as me2
548 c commslice2 is communicator of all procs with same me2, ranked as me1
549 c mpi_comm_split(comm, color, key, ...)
550 c---------------------------------------------------------------------
551 call MPI_Comm_split(MPI_COMM_WORLD, me1, me2, commslice1, ierr)
552 call MPI_Comm_split(MPI_COMM_WORLD, me2, me1, commslice2, ierr)
553 if (timers_enabled) call synchup()
555 if (debug) print *, 'proc coords: ', me, me1, me2
557 c---------------------------------------------------------------------
558 c Determine which section of the grid is owned by this
560 c---------------------------------------------------------------------
561 if (layout_type .eq. layout_0d) then
572 else if (layout_type .eq. layout_1d) then
578 zstart(1) = 1 + me2 * nz/np2
579 zend(1) = (me2+1) * nz/np2
585 zstart(2) = 1 + me2 * nz/np2
586 zend(2) = (me2+1) * nz/np2
590 ystart(3) = 1 + me2 * ny/np2
591 yend(3) = (me2+1) * ny/np2
595 else if (layout_type .eq. layout_2d) then
599 ystart(1) = 1 + me1 * ny/np1
600 yend(1) = (me1+1) * ny/np1
601 zstart(1) = 1 + me2 * nz/np2
602 zend(1) = (me2+1) * nz/np2
604 xstart(2) = 1 + me1 * nx/np1
605 xend(2) = (me1+1)*nx/np1
608 zstart(2) = zstart(1)
611 xstart(3) = xstart(2)
613 ystart(3) = 1 + me2 *ny/np2
614 yend(3) = (me2+1)*ny/np2
619 c---------------------------------------------------------------------
620 c Set up info for blocking of ffts and transposes. This improves
621 c performance on cache-based systems. Blocking involves
622 c working on a chunk of the problem at a time, taking chunks
623 c along the first, second, or third dimension.
625 c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
626 c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
628 c Since 1st dim is always in processor, we'll assume it's long enough
629 c (default blocking factor is 16 so min size for 1st dim is 16)
630 c The only case we have to worry about is cffts1 in a 2d decomposition.
631 c so the blocking factor should not be larger than the 2nd dimension.
632 c---------------------------------------------------------------------
634 fftblock = fftblock_default
635 fftblockpad = fftblockpad_default
637 if (layout_type .eq. layout_2d) then
638 if (dims(2, 1) .lt. fftblock) fftblock = dims(2, 1)
639 if (dims(2, 2) .lt. fftblock) fftblock = dims(2, 2)
640 if (dims(2, 3) .lt. fftblock) fftblock = dims(2, 3)
643 if (fftblock .ne. fftblock_default) fftblockpad = fftblock+3
649 c---------------------------------------------------------------------
650 c---------------------------------------------------------------------
652 subroutine compute_indexmap(twiddle, d1, d2, d3)
654 c---------------------------------------------------------------------
655 c---------------------------------------------------------------------
657 c---------------------------------------------------------------------
658 c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
659 c for time evolution exponent.
660 c---------------------------------------------------------------------
666 integer i, j, k, ii, ii2, jj, ij2, kk
667 double precision ap, twiddle(d1, d2, d3)
669 c---------------------------------------------------------------------
670 c this function is very different depending on whether
671 c we are in the 0d, 1d or 2d layout. Compute separately.
672 c basically we want to convert the fortran indices
675 c 0 1 2 3 -4 -3 -2 -1
676 c The following magic formula does the trick:
677 c mod(i-1+n/2, n) - n/2
678 c---------------------------------------------------------------------
680 ap = - 4.d0 * alpha * pi *pi
682 if (layout_type .eq. layout_0d) then ! xyz layout
684 ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2
687 jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
690 kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
691 twiddle(i,j,k) = dexp(ap*dfloat(kk*kk+ij2))
695 else if (layout_type .eq. layout_1d) then ! zxy layout
697 ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2
700 jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
703 kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
704 twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
708 else if (layout_type .eq. layout_2d) then ! zxy layout
710 ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2
713 jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
716 kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
717 twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
722 print *, ' Unknown layout type ', layout_type
731 c---------------------------------------------------------------------
732 c---------------------------------------------------------------------
734 subroutine print_timers()
736 c---------------------------------------------------------------------
737 c---------------------------------------------------------------------
742 character*25 tstrings(T_max)
743 data tstrings / ' total ',
751 > ' transpose1_loc ',
752 > ' transpose1_glo ',
753 > ' transpose1_fin ',
754 > ' transpose2_loc ',
755 > ' transpose2_glo ',
756 > ' transpose2_fin ',
759 if (me .ne. 0) return
761 if (timer_read(i) .ne. 0.0d0) then
762 write(*, 100) i, tstrings(i), timer_read(i)
765 100 format(' timer ', i2, '(', A16, ') :', F10.6)
771 c---------------------------------------------------------------------
772 c---------------------------------------------------------------------
774 subroutine fft(dir, x1, x2)
776 c---------------------------------------------------------------------
777 c---------------------------------------------------------------------
782 double complex x1(ntdivnp), x2(ntdivnp)
784 double complex scratch(fftblockpad_default*maxdim*2)
786 c---------------------------------------------------------------------
787 c note: args x1, x2 must be different arrays
788 c note: args for cfftsx are (direction, layout, xin, xout, scratch)
789 c xin/xout may be the same and it can be somewhat faster
791 c note: args for transpose are (layout1, layout2, xin, xout)
792 c xin/xout must be different
793 c---------------------------------------------------------------------
796 if (layout_type .eq. layout_0d) then
797 call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
799 call cffts2(1, dims(1,2), dims(2,2), dims(3,2),
801 call cffts3(1, dims(1,3), dims(2,3), dims(3,3),
803 else if (layout_type .eq. layout_1d) then
804 call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
806 call cffts2(1, dims(1,2), dims(2,2), dims(3,2),
808 if (timers_enabled) call timer_start(T_transpose)
809 call transpose_xy_z(2, 3, x1, x2)
810 if (timers_enabled) call timer_stop(T_transpose)
811 call cffts1(1, dims(1,3), dims(2,3), dims(3,3),
813 else if (layout_type .eq. layout_2d) then
814 call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
816 if (timers_enabled) call timer_start(T_transpose)
817 call transpose_x_y(1, 2, x1, x2)
818 if (timers_enabled) call timer_stop(T_transpose)
819 call cffts1(1, dims(1,2), dims(2,2), dims(3,2),
821 if (timers_enabled) call timer_start(T_transpose)
822 call transpose_x_z(2, 3, x2, x1)
823 if (timers_enabled) call timer_stop(T_transpose)
824 call cffts1(1, dims(1,3), dims(2,3), dims(3,3),
828 if (layout_type .eq. layout_0d) then
829 call cffts3(-1, dims(1,3), dims(2,3), dims(3,3),
831 call cffts2(-1, dims(1,2), dims(2,2), dims(3,2),
833 call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
835 else if (layout_type .eq. layout_1d) then
836 call cffts1(-1, dims(1,3), dims(2,3), dims(3,3),
838 if (timers_enabled) call timer_start(T_transpose)
839 call transpose_x_yz(3, 2, x1, x2)
840 if (timers_enabled) call timer_stop(T_transpose)
841 call cffts2(-1, dims(1,2), dims(2,2), dims(3,2),
843 call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
845 else if (layout_type .eq. layout_2d) then
846 call cffts1(-1, dims(1,3), dims(2,3), dims(3,3),
848 if (timers_enabled) call timer_start(T_transpose)
849 call transpose_x_z(3, 2, x1, x2)
850 if (timers_enabled) call timer_stop(T_transpose)
851 call cffts1(-1, dims(1,2), dims(2,2), dims(3,2),
853 if (timers_enabled) call timer_start(T_transpose)
854 call transpose_x_y(2, 1, x2, x1)
855 if (timers_enabled) call timer_stop(T_transpose)
856 call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
865 c---------------------------------------------------------------------
866 c---------------------------------------------------------------------
868 subroutine cffts1(is, d1, d2, d3, x, xout, y)
870 c---------------------------------------------------------------------
871 c---------------------------------------------------------------------
876 integer is, d1, d2, d3, logd1
877 double complex x(d1,d2,d3)
878 double complex xout(d1,d2,d3)
879 double complex y(fftblockpad, d1, 2)
885 do jj = 0, d2 - fftblock, fftblock
886 if (timers_enabled) call timer_start(T_fftcopy)
889 y(j,i,1) = x(i,j+jj,k)
892 if (timers_enabled) call timer_stop(T_fftcopy)
894 if (timers_enabled) call timer_start(T_fftlow)
895 call cfftz (is, logd1, d1, y, y(1,1,2))
896 if (timers_enabled) call timer_stop(T_fftlow)
898 if (timers_enabled) call timer_start(T_fftcopy)
901 xout(i,j+jj,k) = y(j,i,1)
904 if (timers_enabled) call timer_stop(T_fftcopy)
912 c---------------------------------------------------------------------
913 c---------------------------------------------------------------------
915 subroutine cffts2(is, d1, d2, d3, x, xout, y)
917 c---------------------------------------------------------------------
918 c---------------------------------------------------------------------
923 integer is, d1, d2, d3, logd2
924 double complex x(d1,d2,d3)
925 double complex xout(d1,d2,d3)
926 double complex y(fftblockpad, d2, 2)
932 do ii = 0, d1 - fftblock, fftblock
933 if (timers_enabled) call timer_start(T_fftcopy)
936 y(i,j,1) = x(i+ii,j,k)
939 if (timers_enabled) call timer_stop(T_fftcopy)
941 if (timers_enabled) call timer_start(T_fftlow)
942 call cfftz (is, logd2, d2, y, y(1, 1, 2))
943 if (timers_enabled) call timer_stop(T_fftlow)
945 if (timers_enabled) call timer_start(T_fftcopy)
948 xout(i+ii,j,k) = y(i,j,1)
951 if (timers_enabled) call timer_stop(T_fftcopy)
959 c---------------------------------------------------------------------
960 c---------------------------------------------------------------------
962 subroutine cffts3(is, d1, d2, d3, x, xout, y)
964 c---------------------------------------------------------------------
965 c---------------------------------------------------------------------
970 integer is, d1, d2, d3, logd3
971 double complex x(d1,d2,d3)
972 double complex xout(d1,d2,d3)
973 double complex y(fftblockpad, d3, 2)
979 do ii = 0, d1 - fftblock, fftblock
980 if (timers_enabled) call timer_start(T_fftcopy)
983 y(i,k,1) = x(i+ii,j,k)
986 if (timers_enabled) call timer_stop(T_fftcopy)
988 if (timers_enabled) call timer_start(T_fftlow)
989 call cfftz (is, logd3, d3, y, y(1, 1, 2))
990 if (timers_enabled) call timer_stop(T_fftlow)
992 if (timers_enabled) call timer_start(T_fftcopy)
995 xout(i+ii,j,k) = y(i,k,1)
998 if (timers_enabled) call timer_stop(T_fftcopy)
1006 c---------------------------------------------------------------------
1007 c---------------------------------------------------------------------
1009 subroutine fft_init (n)
1011 c---------------------------------------------------------------------
1012 c---------------------------------------------------------------------
1014 c---------------------------------------------------------------------
1015 c compute the roots-of-unity array that will be used for subsequent FFTs.
1016 c---------------------------------------------------------------------
1021 integer m,n,nu,ku,i,j,ln
1022 double precision t, ti
1025 c---------------------------------------------------------------------
1026 c Initialize the U array with sines and cosines in a manner that permits
1027 c stride one access at each FFT iteration.
1028 c---------------------------------------------------------------------
1040 u(i+ku) = dcmplx (cos (ti), sin(ti))
1050 c---------------------------------------------------------------------
1051 c---------------------------------------------------------------------
1053 subroutine cfftz (is, m, n, x, y)
1055 c---------------------------------------------------------------------
1056 c---------------------------------------------------------------------
1058 c---------------------------------------------------------------------
1059 c Computes NY N-point complex-to-complex FFTs of X using an algorithm due
1060 c to Swarztrauber. X is both the input and the output array, while Y is a
1061 c scratch array. It is assumed that N = 2^M. Before calling CFFTZ to
1062 c perform FFTs, the array U must be initialized by calling CFFTZ with IS
1063 c set to 0 and M set to MX, where MX is the maximum value of M for any
1065 c---------------------------------------------------------------------
1070 integer is,m,n,i,j,l,mx
1073 dimension x(fftblockpad,n), y(fftblockpad,n)
1075 c---------------------------------------------------------------------
1076 c Check if input parameters are invalid.
1077 c---------------------------------------------------------------------
1079 if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx)
1081 write (*, 1) is, m, mx
1082 1 format ('CFFTZ: Either U has not been initialized, or else'/
1083 > 'one of the input parameters is invalid', 3I5)
1087 c---------------------------------------------------------------------
1088 c Perform one variant of the Stockham FFT.
1089 c---------------------------------------------------------------------
1091 call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
1092 if (l .eq. m) goto 160
1093 call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
1098 c---------------------------------------------------------------------
1100 c---------------------------------------------------------------------
1112 c---------------------------------------------------------------------
1113 c---------------------------------------------------------------------
1115 subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)
1117 c---------------------------------------------------------------------
1118 c---------------------------------------------------------------------
1120 c---------------------------------------------------------------------
1121 c Performs the L-th iteration of the second variant of the Stockham FFT.
1122 c---------------------------------------------------------------------
1126 integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
1127 double complex u,x,y,u1,x11,x21
1128 dimension u(n), x(ny1,n), y(ny1,n)
1131 c---------------------------------------------------------------------
1132 c Set initial parameters.
1133 c---------------------------------------------------------------------
1149 u1 = dconjg (u(ku+i))
1152 c---------------------------------------------------------------------
1153 c This loop is vectorizable.
1154 c---------------------------------------------------------------------
1159 y(j,i21+k) = x11 + x21
1160 y(j,i22+k) = u1 * (x11 - x21)
1168 c---------------------------------------------------------------------
1171 c---------------------------------------------------------------------
1172 c---------------------------------------------------------------------
1174 integer function ilog2(n)
1176 c---------------------------------------------------------------------
1177 c---------------------------------------------------------------------
1187 do while (nn .lt. n)
1196 c---------------------------------------------------------------------
1197 c---------------------------------------------------------------------
1199 subroutine transpose_x_yz(l1, l2, xin, xout)
1201 c---------------------------------------------------------------------
1202 c---------------------------------------------------------------------
1207 double complex xin(ntdivnp), xout(ntdivnp)
1209 call transpose2_local(dims(1,l1),dims(2, l1)*dims(3, l1),
1212 call transpose2_global(xout, xin)
1214 call transpose2_finish(dims(1,l1),dims(2, l1)*dims(3, l1),
1221 c---------------------------------------------------------------------
1222 c---------------------------------------------------------------------
1224 subroutine transpose_xy_z(l1, l2, xin, xout)
1226 c---------------------------------------------------------------------
1227 c---------------------------------------------------------------------
1232 double complex xin(ntdivnp), xout(ntdivnp)
1234 call transpose2_local(dims(1,l1)*dims(2, l1),dims(3, l1),
1236 call transpose2_global(xout, xin)
1237 call transpose2_finish(dims(1,l1)*dims(2, l1),dims(3, l1),
1245 c---------------------------------------------------------------------
1246 c---------------------------------------------------------------------
1248 subroutine transpose2_local(n1, n2, xin, xout)
1250 c---------------------------------------------------------------------
1251 c---------------------------------------------------------------------
1257 double complex xin(n1, n2), xout(n2, n1)
1259 double complex z(transblockpad, transblock)
1261 integer i, j, ii, jj
1263 if (timers_enabled) call timer_start(T_transxzloc)
1265 c---------------------------------------------------------------------
1266 c If possible, block the transpose for cache memory systems.
1267 c How much does this help? Example: R8000 Power Challenge (90 MHz)
1268 c Blocked version decreases time spend in this routine
1269 c from 14 seconds to 5.2 seconds on 8 nodes class A.
1270 c---------------------------------------------------------------------
1272 if (n1 .lt. transblock .or. n2 .lt. transblock) then
1273 if (n1 .ge. n2) then
1276 xout(j, i) = xin(i, j)
1282 xout(j, i) = xin(i, j)
1287 do j = 0, n2-1, transblock
1288 do i = 0, n1-1, transblock
1290 c---------------------------------------------------------------------
1291 c Note: compiler should be able to take j+jj out of inner loop
1292 c---------------------------------------------------------------------
1293 do jj = 1, transblock
1294 do ii = 1, transblock
1295 z(jj,ii) = xin(i+ii, j+jj)
1299 do ii = 1, transblock
1300 do jj = 1, transblock
1301 xout(j+jj, i+ii) = z(jj,ii)
1308 if (timers_enabled) call timer_stop(T_transxzloc)
1314 c---------------------------------------------------------------------
1315 c---------------------------------------------------------------------
1317 subroutine transpose2_global(xin, xout)
1319 c---------------------------------------------------------------------
1320 c---------------------------------------------------------------------
1325 double complex xin(ntdivnp)
1326 double complex xout(ntdivnp)
1329 if (timers_enabled) call synchup()
1331 if (timers_enabled) call timer_start(T_transxzglo)
1332 call mpi_alltoall(xin, ntdivnp/np, dc_type,
1333 > xout, ntdivnp/np, dc_type,
1335 if (timers_enabled) call timer_stop(T_transxzglo)
1342 c---------------------------------------------------------------------
1343 c---------------------------------------------------------------------
1345 subroutine transpose2_finish(n1, n2, xin, xout)
1347 c---------------------------------------------------------------------
1348 c---------------------------------------------------------------------
1352 integer n1, n2, ioff
1353 double complex xin(n2, n1/np2, 0:np2-1), xout(n2*np2, n1/np2)
1357 if (timers_enabled) call timer_start(T_transxzfin)
1362 xout(i+ioff, j) = xin(i, j, p)
1366 if (timers_enabled) call timer_stop(T_transxzfin)
1372 c---------------------------------------------------------------------
1373 c---------------------------------------------------------------------
1375 subroutine transpose_x_z(l1, l2, xin, xout)
1377 c---------------------------------------------------------------------
1378 c---------------------------------------------------------------------
1383 double complex xin(ntdivnp), xout(ntdivnp)
1385 call transpose_x_z_local(dims(1,l1),dims(2,l1),dims(3,l1),
1387 call transpose_x_z_global(dims(1,l1),dims(2,l1),dims(3,l1),
1389 call transpose_x_z_finish(dims(1,l2),dims(2,l2),dims(3,l2),
1395 c---------------------------------------------------------------------
1396 c---------------------------------------------------------------------
1398 subroutine transpose_x_z_local(d1, d2, d3, xin, xout)
1400 c---------------------------------------------------------------------
1401 c---------------------------------------------------------------------
1406 double complex xin(d1,d2,d3)
1407 double complex xout(d3,d2,d1)
1408 integer block1, block3
1409 integer i, j, k, kk, ii, i1, k1
1411 double complex buf(transblockpad, maxdim)
1412 if (timers_enabled) call timer_start(T_transxzloc)
1413 if (d1 .lt. 32) goto 100
1415 if (block3 .eq. 1) goto 100
1416 if (block3 .gt. transblock) block3 = transblock
1418 if (block1*block3 .gt. transblock*transblock)
1419 > block1 = transblock*transblock/block3
1420 c---------------------------------------------------------------------
1422 c---------------------------------------------------------------------
1424 do kk = 0, d3-block3, block3
1425 do ii = 0, d1-block1, block1
1430 buf(k, i) = xin(i+ii, j, k1)
1437 xout(k+kk, j, i1) = buf(k, i)
1447 c---------------------------------------------------------------------
1449 c---------------------------------------------------------------------
1455 xout(k, j, i) = xin(i, j, k)
1460 c---------------------------------------------------------------------
1462 c---------------------------------------------------------------------
1465 if (timers_enabled) call timer_stop(T_transxzloc)
1470 c---------------------------------------------------------------------
1471 c---------------------------------------------------------------------
1473 subroutine transpose_x_z_global(d1, d2, d3, xin, xout)
1475 c---------------------------------------------------------------------
1476 c---------------------------------------------------------------------
1482 double complex xin(d3,d2,d1)
1483 double complex xout(d3,d2,d1) ! not real layout, but right size
1486 if (timers_enabled) call synchup()
1488 c---------------------------------------------------------------------
1489 c do transpose among all processes with same 1-coord (me1)
1490 c---------------------------------------------------------------------
1491 if (timers_enabled)call timer_start(T_transxzglo)
1492 call mpi_alltoall(xin, d1*d2*d3/np2, dc_type,
1493 > xout, d1*d2*d3/np2, dc_type,
1495 if (timers_enabled) call timer_stop(T_transxzglo)
1500 c---------------------------------------------------------------------
1501 c---------------------------------------------------------------------
1503 subroutine transpose_x_z_finish(d1, d2, d3, xin, xout)
1505 c---------------------------------------------------------------------
1506 c---------------------------------------------------------------------
1511 double complex xin(d1/np2, d2, d3, 0:np2-1)
1512 double complex xout(d1,d2,d3)
1513 integer i, j, k, p, ioff
1514 if (timers_enabled) call timer_start(T_transxzfin)
1515 c---------------------------------------------------------------------
1516 c this is the most straightforward way of doing it. the
1517 c calculation in the inner loop doesn't help.
1523 c xout(ii, j, k) = xin(i, j, k, p)
1528 c---------------------------------------------------------------------
1535 xout(i+ioff, j, k) = xin(i, j, k, p)
1540 if (timers_enabled) call timer_stop(T_transxzfin)
1545 c---------------------------------------------------------------------
1546 c---------------------------------------------------------------------
1548 subroutine transpose_x_y(l1, l2, xin, xout)
1550 c---------------------------------------------------------------------
1551 c---------------------------------------------------------------------
1556 double complex xin(ntdivnp), xout(ntdivnp)
1558 c---------------------------------------------------------------------
1559 c xy transpose is a little tricky, since we don't want
1560 c to touch 3rd axis. But alltoall must involve 3rd axis (most
1561 c slowly varying) to be efficient. So we do
1562 c (nx, ny/np1, nz/np2) -> (ny/np1, nz/np2, nx) (local)
1563 c (ny/np1, nz/np2, nx) -> ((ny/np1*nz/np2)*np1, nx/np1) (global)
1564 c then local finish.
1565 c---------------------------------------------------------------------
1568 call transpose_x_y_local(dims(1,l1),dims(2,l1),dims(3,l1),
1570 call transpose_x_y_global(dims(1,l1),dims(2,l1),dims(3,l1),
1572 call transpose_x_y_finish(dims(1,l2),dims(2,l2),dims(3,l2),
1579 c---------------------------------------------------------------------
1580 c---------------------------------------------------------------------
1582 subroutine transpose_x_y_local(d1, d2, d3, xin, xout)
1584 c---------------------------------------------------------------------
1585 c---------------------------------------------------------------------
1590 double complex xin(d1, d2, d3)
1591 double complex xout(d2, d3, d1)
1593 if (timers_enabled) call timer_start(T_transxyloc)
1598 xout(j,k,i)=xin(i,j,k)
1602 if (timers_enabled) call timer_stop(T_transxyloc)
1607 c---------------------------------------------------------------------
1608 c---------------------------------------------------------------------
1610 subroutine transpose_x_y_global(d1, d2, d3, xin, xout)
1612 c---------------------------------------------------------------------
1613 c---------------------------------------------------------------------
1619 c---------------------------------------------------------------------
1620 c array is in form (ny/np1, nz/np2, nx)
1621 c---------------------------------------------------------------------
1622 double complex xin(d2,d3,d1)
1623 double complex xout(d2,d3,d1) ! not real layout but right size
1626 if (timers_enabled) call synchup()
1628 c---------------------------------------------------------------------
1629 c do transpose among all processes with same 1-coord (me1)
1630 c---------------------------------------------------------------------
1631 if (timers_enabled) call timer_start(T_transxyglo)
1632 call mpi_alltoall(xin, d1*d2*d3/np1, dc_type,
1633 > xout, d1*d2*d3/np1, dc_type,
1635 if (timers_enabled) call timer_stop(T_transxyglo)
1641 c---------------------------------------------------------------------
1642 c---------------------------------------------------------------------
1644 subroutine transpose_x_y_finish(d1, d2, d3, xin, xout)
1646 c---------------------------------------------------------------------
1647 c---------------------------------------------------------------------
1652 double complex xin(d1/np1, d3, d2, 0:np1-1)
1653 double complex xout(d1,d2,d3)
1654 integer i, j, k, p, ioff
1655 if (timers_enabled) call timer_start(T_transxyfin)
1656 c---------------------------------------------------------------------
1657 c this is the most straightforward way of doing it. the
1658 c calculation in the inner loop doesn't help.
1664 c note order is screwy bcz we have (ny/np1, nz/np2, nx) -> (ny, nx/np1, nz/np2)
1665 c xout(ii, j, k) = xin(i, k, j, p)
1670 c---------------------------------------------------------------------
1677 xout(i+ioff, j, k) = xin(i, k, j, p)
1682 if (timers_enabled) call timer_stop(T_transxyfin)
1687 c---------------------------------------------------------------------
1688 c---------------------------------------------------------------------
1690 subroutine checksum(i, u1, d1, d2, d3)
1692 c---------------------------------------------------------------------
1693 c---------------------------------------------------------------------
1698 integer i, d1, d2, d3
1699 double complex u1(d1, d2, d3)
1700 integer j, q,r,s, ierr
1701 double complex chk,allchk
1706 if (q .ge. xstart(1) .and. q .le. xend(1)) then
1708 if (r .ge. ystart(1) .and. r .le. yend(1)) then
1710 if (s .ge. zstart(1) .and. s .le. zend(1)) then
1711 chk=chk+u1(q-xstart(1)+1,r-ystart(1)+1,s-zstart(1)+1)
1718 call MPI_Reduce(chk, allchk, 1, dc_type, MPI_SUM,
1719 > 0, MPI_COMM_WORLD, ierr)
1721 write (*, 30) i, allchk
1722 30 format (' T =',I5,5X,'Checksum =',1P2D22.12)
1726 c If we compute the checksum for diagnostic purposes, we let i be
1727 c negative, so the result will not be stored in an array
1728 if (i .gt. 0) sums(i) = allchk
1733 c---------------------------------------------------------------------
1734 c---------------------------------------------------------------------
1738 c---------------------------------------------------------------------
1739 c---------------------------------------------------------------------
1745 call timer_start(T_synch)
1746 call mpi_barrier(MPI_COMM_WORLD, ierr)
1747 call timer_stop(T_synch)
1752 c---------------------------------------------------------------------
1753 c---------------------------------------------------------------------
1755 subroutine verify (d1, d2, d3, nt, verified, class)
1757 c---------------------------------------------------------------------
1758 c---------------------------------------------------------------------
1763 integer d1, d2, d3, nt
1766 integer ierr, size, i
1767 double precision err, epsilon
1769 c---------------------------------------------------------------------
1770 c Reference checksums
1771 c---------------------------------------------------------------------
1772 double complex csum_ref(25)
1777 if (me .ne. 0) return
1782 if (d1 .eq. 64 .and.
1786 c---------------------------------------------------------------------
1787 c Sample size reference checksums
1788 c---------------------------------------------------------------------
1790 csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
1791 csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
1792 csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
1793 csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
1794 csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
1795 csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)
1797 else if (d1 .eq. 128 .and.
1801 c---------------------------------------------------------------------
1802 c Class W size reference checksums
1803 c---------------------------------------------------------------------
1805 csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
1806 csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
1807 csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
1808 csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
1809 csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
1810 csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)
1812 else if (d1 .eq. 256 .and.
1816 c---------------------------------------------------------------------
1817 c Class A size reference checksums
1818 c---------------------------------------------------------------------
1820 csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
1821 csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
1822 csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
1823 csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
1824 csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
1825 csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)
1827 else if (d1 .eq. 512 .and.
1831 c---------------------------------------------------------------------
1832 c Class B size reference checksums
1833 c---------------------------------------------------------------------
1835 csum_ref(1) = dcmplx(5.177643571579D+02, 5.077803458597D+02)
1836 csum_ref(2) = dcmplx(5.154521291263D+02, 5.088249431599D+02)
1837 csum_ref(3) = dcmplx(5.146409228649D+02, 5.096208912659D+02)
1838 csum_ref(4) = dcmplx(5.142378756213D+02, 5.101023387619D+02)
1839 csum_ref(5) = dcmplx(5.139626667737D+02, 5.103976610617D+02)
1840 csum_ref(6) = dcmplx(5.137423460082D+02, 5.105948019802D+02)
1841 csum_ref(7) = dcmplx(5.135547056878D+02, 5.107404165783D+02)
1842 csum_ref(8) = dcmplx(5.133910925466D+02, 5.108576573661D+02)
1843 csum_ref(9) = dcmplx(5.132470705390D+02, 5.109577278523D+02)
1844 csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
1845 csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
1846 csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
1847 csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
1848 csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
1849 csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
1850 csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
1851 csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
1852 csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
1853 csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
1854 csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)
1856 else if (d1 .eq. 512 .and.
1860 c---------------------------------------------------------------------
1861 c Class C size reference checksums
1862 c---------------------------------------------------------------------
1864 csum_ref(1) = dcmplx(5.195078707457D+02, 5.149019699238D+02)
1865 csum_ref(2) = dcmplx(5.155422171134D+02, 5.127578201997D+02)
1866 csum_ref(3) = dcmplx(5.144678022222D+02, 5.122251847514D+02)
1867 csum_ref(4) = dcmplx(5.140150594328D+02, 5.121090289018D+02)
1868 csum_ref(5) = dcmplx(5.137550426810D+02, 5.121143685824D+02)
1869 csum_ref(6) = dcmplx(5.135811056728D+02, 5.121496764568D+02)
1870 csum_ref(7) = dcmplx(5.134569343165D+02, 5.121870921893D+02)
1871 csum_ref(8) = dcmplx(5.133651975661D+02, 5.122193250322D+02)
1872 csum_ref(9) = dcmplx(5.132955192805D+02, 5.122454735794D+02)
1873 csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
1874 csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
1875 csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
1876 csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
1877 csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
1878 csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
1879 csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
1880 csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
1881 csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
1882 csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
1883 csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)
1885 else if (d1 .eq. 2048 .and.
1886 > d2 .eq. 1024 .and.
1887 > d3 .eq. 1024 .and.
1889 c---------------------------------------------------------------------
1890 c Class D size reference checksums
1891 c---------------------------------------------------------------------
1893 csum_ref(1) = dcmplx(5.122230065252D+02, 5.118534037109D+02)
1894 csum_ref(2) = dcmplx(5.120463975765D+02, 5.117061181082D+02)
1895 csum_ref(3) = dcmplx(5.119865766760D+02, 5.117096364601D+02)
1896 csum_ref(4) = dcmplx(5.119518799488D+02, 5.117373863950D+02)
1897 csum_ref(5) = dcmplx(5.119269088223D+02, 5.117680347632D+02)
1898 csum_ref(6) = dcmplx(5.119082416858D+02, 5.117967875532D+02)
1899 csum_ref(7) = dcmplx(5.118943814638D+02, 5.118225281841D+02)
1900 csum_ref(8) = dcmplx(5.118842385057D+02, 5.118451629348D+02)
1901 csum_ref(9) = dcmplx(5.118769435632D+02, 5.118649119387D+02)
1902 csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
1903 csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
1904 csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
1905 csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
1906 csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
1907 csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
1908 csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
1909 csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
1910 csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
1911 csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
1912 csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
1913 csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
1914 csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
1915 csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
1916 csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
1917 csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)
1919 else if (d1 .eq. 4096 .and.
1920 > d2 .eq. 2048 .and.
1921 > d3 .eq. 2048 .and.
1923 c---------------------------------------------------------------------
1924 c Class E size reference checksums
1925 c---------------------------------------------------------------------
1927 csum_ref(1) = dcmplx(5.121601045346D+02, 5.117395998266D+02)
1928 csum_ref(2) = dcmplx(5.120905403678D+02, 5.118614716182D+02)
1929 csum_ref(3) = dcmplx(5.120623229306D+02, 5.119074203747D+02)
1930 csum_ref(4) = dcmplx(5.120438418997D+02, 5.119345900733D+02)
1931 csum_ref(5) = dcmplx(5.120311521872D+02, 5.119551325550D+02)
1932 csum_ref(6) = dcmplx(5.120226088809D+02, 5.119720179919D+02)
1933 csum_ref(7) = dcmplx(5.120169296534D+02, 5.119861371665D+02)
1934 csum_ref(8) = dcmplx(5.120131225172D+02, 5.119979364402D+02)
1935 csum_ref(9) = dcmplx(5.120104767108D+02, 5.120077674092D+02)
1936 csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
1937 csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
1938 csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
1939 csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
1940 csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
1941 csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
1942 csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
1943 csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
1944 csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
1945 csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
1946 csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
1947 csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
1948 csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
1949 csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
1950 csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
1951 csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)
1956 if (class .ne. 'U') then
1959 err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
1960 if (.not.(err .le. epsilon)) goto 100
1967 call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
1968 if (size .ne. np) then
1972 c---------------------------------------------------------------------
1973 c multiple statements because some Fortran compilers have
1974 c problems with long strings.
1975 c---------------------------------------------------------------------
1976 4010 format( ' Warning: benchmark was compiled for ', i5,
1978 4011 format( ' Must be run on this many processors for official',
1980 4012 format( ' so memory access is repeatable')
1984 if (class .ne. 'U') then
1987 2000 format(' Result verification successful')
1990 2001 format(' Result verification failed')
1993 print *, 'class = ', class