examples/smpi/NAS/SP/y_solve.f

   1
   2 c---------------------------------------------------------------------
   3 c---------------------------------------------------------------------
   4
   5        subroutine y_solve
   6
   7 c---------------------------------------------------------------------
   8 c---------------------------------------------------------------------
   9
  10 c---------------------------------------------------------------------
  11 c this function performs the solution of the approximate factorization
  12 c step in the y-direction for all five matrix components
  13 c simultaneously. The Thomas algorithm is employed to solve the
  14 c systems for the y-lines. Boundary conditions are non-periodic
  15 c---------------------------------------------------------------------
  16
  17        include 'header.h'
  18        include 'mpinpb.h'
  19
  20        integer i, j, k, stage, ip, kp, n, isize, jend, ksize, j1, j2,
  21      >         buffer_size, c, m, p, jstart, error,
  22      >         requests(2), statuses(MPI_STATUS_SIZE, 2)
  23        double precision  r1, r2, d, e, s(5), sm1, sm2,
  24      >                   fac1, fac2
  25
  26
  27 c---------------------------------------------------------------------
  28 c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
  29 c on this node in the direction of increasing i for the forward sweep,
  30 c and after that reversing the direction for the backsubstitution
  31 c---------------------------------------------------------------------
  32
  33 c---------------------------------------------------------------------
  34 c                          FORWARD ELIMINATION
  35 c---------------------------------------------------------------------
  36        do    stage = 1, ncells
  37           c      = slice(2,stage)
  38
  39           jstart = 0
  40           jend   = cell_size(2,c)-1
  41
  42           isize     = cell_size(1,c)
  43           ksize     = cell_size(3,c)
  44           ip        = cell_coord(1,c)-1
  45           kp        = cell_coord(3,c)-1
  46
  47           buffer_size = (isize-start(1,c)-end(1,c)) *
  48      >                  (ksize-start(3,c)-end(3,c))
  49
  50           if ( stage .ne. 1) then
  51
  52 c---------------------------------------------------------------------
  53 c            if this is not the first processor in this row of cells,
  54 c            receive data from predecessor containing the right hand
  55 c            sides and the upper diagonal elements of the previous two rows
  56 c---------------------------------------------------------------------
  57
  58              call mpi_irecv(in_buffer, 22*buffer_size,
  59      >                      dp_type, predecessor(2),
  60      >                      DEFAULT_TAG, comm_solve,
  61      >                      requests(1), error)
  62
  63 c---------------------------------------------------------------------
  64 c            communication has already been started.
  65 c            compute the left hand side while waiting for the msg
  66 c---------------------------------------------------------------------
  67              call lhsy(c)
  68
  69 c---------------------------------------------------------------------
  70 c            wait for pending communication to complete
  71 c            This waits on the current receive and on the send
  72 c            from the previous stage. They always come in pairs.
  73 c---------------------------------------------------------------------
  74              call mpi_waitall(2, requests, statuses, error)
  75
  76 c---------------------------------------------------------------------
  77 c            unpack the buffer
  78 c---------------------------------------------------------------------
  79              j  = jstart
  80              j1 = jstart + 1
  81              n = 0
  82 c---------------------------------------------------------------------
  83 c            create a running pointer
  84 c---------------------------------------------------------------------
  85              p = 0
  86              do    k = start(3,c), ksize-end(3,c)-1
  87                 do    i = start(1,c), isize-end(1,c)-1
  88                    lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
  89      >                       in_buffer(p+1) * lhs(i,j,k,n+1,c)
  90                    lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
  91      >                       in_buffer(p+2) * lhs(i,j,k,n+1,c)
  92                    do    m = 1, 3
  93                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
  94      >                       in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
  95                    end do
  96                    d            = in_buffer(p+6)
  97                    e            = in_buffer(p+7)
  98                    do    m = 1, 3
  99                       s(m) = in_buffer(p+7+m)
 100                    end do
 101                    r1 = lhs(i,j,k,n+2,c)
 102                    lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
 103                    lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
 104                    do    m = 1, 3
 105                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
 106                    end do
 107                    r2 = lhs(i,j1,k,n+1,c)
 108                    lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2
 109                    lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2
 110                    do    m = 1, 3
 111                       rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - s(m) * r2
 112                    end do
 113                    p = p + 10
 114                 end do
 115              end do
 116
 117              do    m = 4, 5
 118                 n = (m-3)*5
 119                 do    k = start(3,c), ksize-end(3,c)-1
 120                    do    i = start(1,c), isize-end(1,c)-1
 121                       lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
 122      >                          in_buffer(p+1) * lhs(i,j,k,n+1,c)
 123                       lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
 124      >                          in_buffer(p+2) * lhs(i,j,k,n+1,c)
 125                       rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) -
 126      >                          in_buffer(p+3) * lhs(i,j,k,n+1,c)
 127                       d                = in_buffer(p+4)
 128                       e                = in_buffer(p+5)
 129                       s(m)             = in_buffer(p+6)
 130                       r1 = lhs(i,j,k,n+2,c)
 131                       lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
 132                       lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
 133                       rhs(i,j,k,m,c)   = rhs(i,j,k,m,c) - s(m) * r1
 134                       r2 = lhs(i,j1,k,n+1,c)
 135                       lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2
 136                       lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2
 137                       rhs(i,j1,k,m,c)   = rhs(i,j1,k,m,c) - s(m) * r2
 138                       p = p + 6
 139                    end do
 140                 end do
 141              end do
 142
 143           else
 144
 145 c---------------------------------------------------------------------
 146 c            if this IS the first cell, we still compute the lhs
 147 c---------------------------------------------------------------------
 148              call lhsy(c)
 149           endif
 150
 151 c---------------------------------------------------------------------
 152 c         perform the Thomas algorithm; first, FORWARD ELIMINATION
 153 c---------------------------------------------------------------------
 154           n = 0
 155
 156           do    k = start(3,c), ksize-end(3,c)-1
 157              do    j = jstart, jend-2
 158                 do    i = start(1,c), isize-end(1,c)-1
 159                    j1 = j  + 1
 160                    j2 = j  + 2
 161                    fac1               = 1.d0/lhs(i,j,k,n+3,c)
 162                    lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
 163                    lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
 164                    do    m = 1, 3
 165                       rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
 166                    end do
 167                    lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
 168      >                         lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
 169                    lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
 170      >                         lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
 171                    do    m = 1, 3
 172                       rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
 173      >                         lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
 174                    end do
 175                    lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) -
 176      >                         lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c)
 177                    lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) -
 178      >                         lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c)
 179                    do    m = 1, 3
 180                       rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) -
 181      >                         lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c)
 182                    end do
 183                 end do
 184              end do
 185           end do
 186
 187 c---------------------------------------------------------------------
 188 c         The last two rows in this grid block are a bit different,
 189 c         since they do not have two more rows available for the
 190 c         elimination of off-diagonal entries
 191 c---------------------------------------------------------------------
 192
 193           j  = jend - 1
 194           j1 = jend
 195           do    k = start(3,c), ksize-end(3,c)-1
 196              do    i = start(1,c), isize-end(1,c)-1
 197                 fac1               = 1.d0/lhs(i,j,k,n+3,c)
 198                 lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
 199                 lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
 200                 do    m = 1, 3
 201                    rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
 202                 end do
 203                 lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
 204      >                      lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
 205                 lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
 206      >                      lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
 207                 do    m = 1, 3
 208                    rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
 209      >                      lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
 210                 end do
 211 c---------------------------------------------------------------------
 212 c               scale the last row immediately (some of this is
 213 c               overkill in case this is the last cell)
 214 c---------------------------------------------------------------------
 215                 fac2               = 1.d0/lhs(i,j1,k,n+3,c)
 216                 lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c)
 217                 lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c)
 218                 do    m = 1, 3
 219                    rhs(i,j1,k,m,c) = fac2*rhs(i,j1,k,m,c)
 220                 end do
 221              end do
 222           end do
 223
 224 c---------------------------------------------------------------------
 225 c         do the u+c and the u-c factors
 226 c---------------------------------------------------------------------
 227           do    m = 4, 5
 228              n = (m-3)*5
 229              do    k = start(3,c), ksize-end(3,c)-1
 230                 do    j = jstart, jend-2
 231                    do    i = start(1,c), isize-end(1,c)-1
 232                    j1 = j  + 1
 233                    j2 = j  + 2
 234                    fac1               = 1.d0/lhs(i,j,k,n+3,c)
 235                    lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
 236                    lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
 237                    rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
 238                    lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
 239      >                         lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
 240                    lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
 241      >                         lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
 242                    rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
 243      >                         lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
 244                    lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) -
 245      >                         lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c)
 246                    lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) -
 247      >                         lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c)
 248                    rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) -
 249      >                         lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c)
 250                 end do
 251              end do
 252           end do
 253
 254 c---------------------------------------------------------------------
 255 c            And again the last two rows separately
 256 c---------------------------------------------------------------------
 257              j  = jend - 1
 258              j1 = jend
 259              do    k = start(3,c), ksize-end(3,c)-1
 260                 do    i = start(1,c), isize-end(1,c)-1
 261                 fac1               = 1.d0/lhs(i,j,k,n+3,c)
 262                 lhs(i,j,k,n+4,c)   = fac1*lhs(i,j,k,n+4,c)
 263                 lhs(i,j,k,n+5,c)   = fac1*lhs(i,j,k,n+5,c)
 264                 rhs(i,j,k,m,c)     = fac1*rhs(i,j,k,m,c)
 265                 lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
 266      >                      lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
 267                 lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
 268      >                      lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
 269                 rhs(i,j1,k,m,c)   = rhs(i,j1,k,m,c) -
 270      >                      lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
 271 c---------------------------------------------------------------------
 272 c               Scale the last row immediately (some of this is overkill
 273 c               if this is the last cell)
 274 c---------------------------------------------------------------------
 275                 fac2               = 1.d0/lhs(i,j1,k,n+3,c)
 276                 lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c)
 277                 lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c)
 278                 rhs(i,j1,k,m,c)   = fac2*rhs(i,j1,k,m,c)
 279
 280              end do
 281           end do
 282        end do
 283
 284 c---------------------------------------------------------------------
 285 c         send information to the next processor, except when this
 286 c         is the last grid block;
 287 c---------------------------------------------------------------------
 288
 289           if (stage .ne. ncells) then
 290
 291 c---------------------------------------------------------------------
 292 c            create a running pointer for the send buffer
 293 c---------------------------------------------------------------------
 294              p = 0
 295              n = 0
 296              do    k = start(3,c), ksize-end(3,c)-1
 297                 do    i = start(1,c), isize-end(1,c)-1
 298                    do    j = jend-1, jend
 299                       out_buffer(p+1) = lhs(i,j,k,n+4,c)
 300                       out_buffer(p+2) = lhs(i,j,k,n+5,c)
 301                       do    m = 1, 3
 302                          out_buffer(p+2+m) = rhs(i,j,k,m,c)
 303                       end do
 304                       p = p+5
 305                    end do
 306                 end do
 307              end do
 308
 309              do    m = 4, 5
 310                 n = (m-3)*5
 311                 do    k = start(3,c), ksize-end(3,c)-1
 312                    do    i = start(1,c), isize-end(1,c)-1
 313                       do    j = jend-1, jend
 314                          out_buffer(p+1) = lhs(i,j,k,n+4,c)
 315                          out_buffer(p+2) = lhs(i,j,k,n+5,c)
 316                          out_buffer(p+3) = rhs(i,j,k,m,c)
 317                          p = p + 3
 318                       end do
 319                    end do
 320                 end do
 321              end do
 322
 323 c---------------------------------------------------------------------
 324 c            pack and send the buffer
 325 c---------------------------------------------------------------------
 326              call mpi_isend(out_buffer, 22*buffer_size,
 327      >                     dp_type, successor(2),
 328      >                     DEFAULT_TAG, comm_solve,
 329      >                     requests(2), error)
 330
 331           endif
 332        end do
 333
 334 c---------------------------------------------------------------------
 335 c      now go in the reverse direction
 336 c---------------------------------------------------------------------
 337
 338 c---------------------------------------------------------------------
 339 c                         BACKSUBSTITUTION
 340 c---------------------------------------------------------------------
 341        do    stage = ncells, 1, -1
 342           c = slice(2,stage)
 343
 344           jstart = 0
 345           jend   = cell_size(2,c)-1
 346
 347           isize = cell_size(1,c)
 348           ksize = cell_size(3,c)
 349           ip    = cell_coord(1,c)-1
 350           kp    = cell_coord(3,c)-1
 351
 352           buffer_size = (isize-start(1,c)-end(1,c)) *
 353      >                  (ksize-start(3,c)-end(3,c))
 354
 355           if (stage .ne. ncells) then
 356
 357 c---------------------------------------------------------------------
 358 c            if this is not the starting cell in this row of cells,
 359 c            wait for a message to be received, containing the
 360 c            solution of the previous two stations
 361 c---------------------------------------------------------------------
 362
 363              call mpi_irecv(in_buffer, 10*buffer_size,
 364      >                      dp_type, successor(2),
 365      >                      DEFAULT_TAG, comm_solve,
 366      >                      requests(1), error)
 367
 368
 369 c---------------------------------------------------------------------
 370 c            communication has already been started
 371 c            while waiting, do the block-diagonal inversion for the
 372 c            cell that was just finished
 373 c---------------------------------------------------------------------
 374
 375              call pinvr(slice(2,stage+1))
 376
 377 c---------------------------------------------------------------------
 378 c            wait for pending communication to complete
 379 c---------------------------------------------------------------------
 380              call mpi_waitall(2, requests, statuses, error)
 381
 382 c---------------------------------------------------------------------
 383 c            unpack the buffer for the first three factors
 384 c---------------------------------------------------------------------
 385              n = 0
 386              p = 0
 387              j  = jend
 388              j1 = j - 1
 389              do    m = 1, 3
 390                 do   k = start(3,c), ksize-end(3,c)-1
 391                    do   i = start(1,c), isize-end(1,c)-1
 392                       sm1 = in_buffer(p+1)
 393                       sm2 = in_buffer(p+2)
 394                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
 395      >                        lhs(i,j,k,n+4,c)*sm1 -
 396      >                        lhs(i,j,k,n+5,c)*sm2
 397                       rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
 398      >                        lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) -
 399      >                        lhs(i,j1,k,n+5,c) * sm1
 400                       p = p + 2
 401                    end do
 402                 end do
 403              end do
 404
 405 c---------------------------------------------------------------------
 406 c            now unpack the buffer for the remaining two factors
 407 c---------------------------------------------------------------------
 408              do    m = 4, 5
 409                 n = (m-3)*5
 410                 do   k = start(3,c), ksize-end(3,c)-1
 411                    do   i = start(1,c), isize-end(1,c)-1
 412                       sm1 = in_buffer(p+1)
 413                       sm2 = in_buffer(p+2)
 414                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
 415      >                        lhs(i,j,k,n+4,c)*sm1 -
 416      >                        lhs(i,j,k,n+5,c)*sm2
 417                       rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
 418      >                        lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) -
 419      >                        lhs(i,j1,k,n+5,c) * sm1
 420                       p = p + 2
 421                    end do
 422                 end do
 423              end do
 424
 425           else
 426 c---------------------------------------------------------------------
 427 c            now we know this is the first grid block on the back sweep,
 428 c            so we don't need a message to start the substitution.
 429 c---------------------------------------------------------------------
 430
 431              j  = jend - 1
 432              j1 = jend
 433              n = 0
 434              do   m = 1, 3
 435                 do   k = start(3,c), ksize-end(3,c)-1
 436                    do   i = start(1,c), isize-end(1,c)-1
 437                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
 438      >                             lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c)
 439                    end do
 440                 end do
 441              end do
 442
 443              do    m = 4, 5
 444                 n = (m-3)*5
 445                 do   k = start(3,c), ksize-end(3,c)-1
 446                    do   i = start(1,c), isize-end(1,c)-1
 447                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
 448      >                             lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c)
 449                    end do
 450                 end do
 451              end do
 452           endif
 453
 454 c---------------------------------------------------------------------
 455 c         Whether or not this is the last processor, we always have
 456 c         to complete the back-substitution
 457 c---------------------------------------------------------------------
 458
 459 c---------------------------------------------------------------------
 460 c         The first three factors
 461 c---------------------------------------------------------------------
 462           n = 0
 463           do   m = 1, 3
 464              do   k = start(3,c), ksize-end(3,c)-1
 465                 do   j = jend-2, jstart, -1
 466                    do    i = start(1,c), isize-end(1,c)-1
 467                       j1 = j  + 1
 468                       j2 = j  + 2
 469                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
 470      >                          lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) -
 471      >                          lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c)
 472                    end do
 473                 end do
 474              end do
 475           end do
 476
 477 c---------------------------------------------------------------------
 478 c         And the remaining two
 479 c---------------------------------------------------------------------
 480           do    m = 4, 5
 481              n = (m-3)*5
 482              do   k = start(3,c), ksize-end(3,c)-1
 483                 do   j = jend-2, jstart, -1
 484                    do    i = start(1,c), isize-end(1,c)-1
 485                       j1 = j  + 1
 486                       j2 = j1 + 1
 487                       rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
 488      >                          lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) -
 489      >                          lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c)
 490                    end do
 491                 end do
 492              end do
 493           end do
 494
 495 c---------------------------------------------------------------------
 496 c         send on information to the previous processor, if needed
 497 c---------------------------------------------------------------------
 498           if (stage .ne.  1) then
 499              j  = jstart
 500              j1 = jstart + 1
 501              p = 0
 502              do    m = 1, 5
 503                 do    k = start(3,c), ksize-end(3,c)-1
 504                    do    i = start(1,c), isize-end(1,c)-1
 505                       out_buffer(p+1) = rhs(i,j,k,m,c)
 506                       out_buffer(p+2) = rhs(i,j1,k,m,c)
 507                       p = p + 2
 508                    end do
 509                 end do
 510              end do
 511
 512 c---------------------------------------------------------------------
 513 c            pack and send the buffer
 514 c---------------------------------------------------------------------
 515
 516              call mpi_isend(out_buffer, 10*buffer_size,
 517      >                     dp_type, predecessor(2),
 518      >                     DEFAULT_TAG, comm_solve,
 519      >                     requests(2), error)
 520
 521           endif
 522
 523 c---------------------------------------------------------------------
 524 c         If this was the last stage, do the block-diagonal inversion
 525 c---------------------------------------------------------------------
 526           if (stage .eq. 1) call pinvr(c)
 527
 528        end do
 529
 530        return
 531        end
 532
 533
 534
 535
 536
 537
 538