2 c---------------------------------------------------------------------
3 c---------------------------------------------------------------------
7 c---------------------------------------------------------------------
8 c---------------------------------------------------------------------
10 c---------------------------------------------------------------------
11 c this function performs the solution of the approximate factorization
12 c step in the y-direction for all five matrix components
13 c simultaneously. The Thomas algorithm is employed to solve the
14 c systems for the y-lines. Boundary conditions are non-periodic
15 c---------------------------------------------------------------------
20 integer i, j, k, stage, ip, kp, n, isize, jend, ksize, j1, j2,
21 > buffer_size, c, m, p, jstart, error,
22 > requests(2), statuses(MPI_STATUS_SIZE, 2)
23 double precision r1, r2, d, e, s(5), sm1, sm2,
27 c---------------------------------------------------------------------
28 c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
29 c on this node in the direction of increasing i for the forward sweep,
30 c and after that reversing the direction for the backsubstitution
31 c---------------------------------------------------------------------
33 c---------------------------------------------------------------------
35 c---------------------------------------------------------------------
40 jend = cell_size(2,c)-1
42 isize = cell_size(1,c)
43 ksize = cell_size(3,c)
44 ip = cell_coord(1,c)-1
45 kp = cell_coord(3,c)-1
47 buffer_size = (isize-start(1,c)-end(1,c)) *
48 > (ksize-start(3,c)-end(3,c))
50 if ( stage .ne. 1) then
52 c---------------------------------------------------------------------
53 c if this is not the first processor in this row of cells,
54 c receive data from predecessor containing the right hand
55 c sides and the upper diagonal elements of the previous two rows
56 c---------------------------------------------------------------------
58 call mpi_irecv(in_buffer, 22*buffer_size,
59 > dp_type, predecessor(2),
60 > DEFAULT_TAG, comm_solve,
63 c---------------------------------------------------------------------
64 c communication has already been started.
65 c compute the left hand side while waiting for the msg
66 c---------------------------------------------------------------------
69 c---------------------------------------------------------------------
70 c wait for pending communication to complete
71 c This waits on the current receive and on the send
72 c from the previous stage. They always come in pairs.
73 c---------------------------------------------------------------------
74 call mpi_waitall(2, requests, statuses, error)
76 c---------------------------------------------------------------------
78 c---------------------------------------------------------------------
82 c---------------------------------------------------------------------
83 c create a running pointer
84 c---------------------------------------------------------------------
86 do k = start(3,c), ksize-end(3,c)-1
87 do i = start(1,c), isize-end(1,c)-1
88 lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
89 > in_buffer(p+1) * lhs(i,j,k,n+1,c)
90 lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
91 > in_buffer(p+2) * lhs(i,j,k,n+1,c)
93 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
94 > in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
99 s(m) = in_buffer(p+7+m)
101 r1 = lhs(i,j,k,n+2,c)
102 lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
103 lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
105 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
107 r2 = lhs(i,j1,k,n+1,c)
108 lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2
109 lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2
111 rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - s(m) * r2
119 do k = start(3,c), ksize-end(3,c)-1
120 do i = start(1,c), isize-end(1,c)-1
121 lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
122 > in_buffer(p+1) * lhs(i,j,k,n+1,c)
123 lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
124 > in_buffer(p+2) * lhs(i,j,k,n+1,c)
125 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
126 > in_buffer(p+3) * lhs(i,j,k,n+1,c)
129 s(m) = in_buffer(p+6)
130 r1 = lhs(i,j,k,n+2,c)
131 lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
132 lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
133 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
134 r2 = lhs(i,j1,k,n+1,c)
135 lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2
136 lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2
137 rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - s(m) * r2
145 c---------------------------------------------------------------------
146 c if this IS the first cell, we still compute the lhs
147 c---------------------------------------------------------------------
151 c---------------------------------------------------------------------
152 c perform the Thomas algorithm; first, FORWARD ELIMINATION
153 c---------------------------------------------------------------------
156 do k = start(3,c), ksize-end(3,c)-1
157 do j = jstart, jend-2
158 do i = start(1,c), isize-end(1,c)-1
161 fac1 = 1.d0/lhs(i,j,k,n+3,c)
162 lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
163 lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
165 rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
167 lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
168 > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
169 lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
170 > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
172 rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
173 > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
175 lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) -
176 > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c)
177 lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) -
178 > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c)
180 rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) -
181 > lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c)
187 c---------------------------------------------------------------------
188 c The last two rows in this grid block are a bit different,
189 c since they do not have two more rows available for the
190 c elimination of off-diagonal entries
191 c---------------------------------------------------------------------
195 do k = start(3,c), ksize-end(3,c)-1
196 do i = start(1,c), isize-end(1,c)-1
197 fac1 = 1.d0/lhs(i,j,k,n+3,c)
198 lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
199 lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
201 rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
203 lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
204 > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
205 lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
206 > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
208 rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
209 > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
211 c---------------------------------------------------------------------
212 c scale the last row immediately (some of this is
213 c overkill in case this is the last cell)
214 c---------------------------------------------------------------------
215 fac2 = 1.d0/lhs(i,j1,k,n+3,c)
216 lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c)
217 lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c)
219 rhs(i,j1,k,m,c) = fac2*rhs(i,j1,k,m,c)
224 c---------------------------------------------------------------------
225 c do the u+c and the u-c factors
226 c---------------------------------------------------------------------
229 do k = start(3,c), ksize-end(3,c)-1
230 do j = jstart, jend-2
231 do i = start(1,c), isize-end(1,c)-1
234 fac1 = 1.d0/lhs(i,j,k,n+3,c)
235 lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
236 lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
237 rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
238 lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
239 > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
240 lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
241 > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
242 rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
243 > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
244 lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) -
245 > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c)
246 lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) -
247 > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c)
248 rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) -
249 > lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c)
254 c---------------------------------------------------------------------
255 c And again the last two rows separately
256 c---------------------------------------------------------------------
259 do k = start(3,c), ksize-end(3,c)-1
260 do i = start(1,c), isize-end(1,c)-1
261 fac1 = 1.d0/lhs(i,j,k,n+3,c)
262 lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
263 lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
264 rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
265 lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
266 > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
267 lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
268 > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
269 rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
270 > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
271 c---------------------------------------------------------------------
272 c Scale the last row immediately (some of this is overkill
273 c if this is the last cell)
274 c---------------------------------------------------------------------
275 fac2 = 1.d0/lhs(i,j1,k,n+3,c)
276 lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c)
277 lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c)
278 rhs(i,j1,k,m,c) = fac2*rhs(i,j1,k,m,c)
284 c---------------------------------------------------------------------
285 c send information to the next processor, except when this
286 c is the last grid block;
287 c---------------------------------------------------------------------
289 if (stage .ne. ncells) then
291 c---------------------------------------------------------------------
292 c create a running pointer for the send buffer
293 c---------------------------------------------------------------------
296 do k = start(3,c), ksize-end(3,c)-1
297 do i = start(1,c), isize-end(1,c)-1
299 out_buffer(p+1) = lhs(i,j,k,n+4,c)
300 out_buffer(p+2) = lhs(i,j,k,n+5,c)
302 out_buffer(p+2+m) = rhs(i,j,k,m,c)
311 do k = start(3,c), ksize-end(3,c)-1
312 do i = start(1,c), isize-end(1,c)-1
314 out_buffer(p+1) = lhs(i,j,k,n+4,c)
315 out_buffer(p+2) = lhs(i,j,k,n+5,c)
316 out_buffer(p+3) = rhs(i,j,k,m,c)
323 c---------------------------------------------------------------------
324 c pack and send the buffer
325 c---------------------------------------------------------------------
326 call mpi_isend(out_buffer, 22*buffer_size,
327 > dp_type, successor(2),
328 > DEFAULT_TAG, comm_solve,
329 > requests(2), error)
334 c---------------------------------------------------------------------
335 c now go in the reverse direction
336 c---------------------------------------------------------------------
338 c---------------------------------------------------------------------
340 c---------------------------------------------------------------------
341 do stage = ncells, 1, -1
345 jend = cell_size(2,c)-1
347 isize = cell_size(1,c)
348 ksize = cell_size(3,c)
349 ip = cell_coord(1,c)-1
350 kp = cell_coord(3,c)-1
352 buffer_size = (isize-start(1,c)-end(1,c)) *
353 > (ksize-start(3,c)-end(3,c))
355 if (stage .ne. ncells) then
357 c---------------------------------------------------------------------
358 c if this is not the starting cell in this row of cells,
359 c wait for a message to be received, containing the
360 c solution of the previous two stations
361 c---------------------------------------------------------------------
363 call mpi_irecv(in_buffer, 10*buffer_size,
364 > dp_type, successor(2),
365 > DEFAULT_TAG, comm_solve,
366 > requests(1), error)
369 c---------------------------------------------------------------------
370 c communication has already been started
371 c while waiting, do the block-diagonal inversion for the
372 c cell that was just finished
373 c---------------------------------------------------------------------
375 call pinvr(slice(2,stage+1))
377 c---------------------------------------------------------------------
378 c wait for pending communication to complete
379 c---------------------------------------------------------------------
380 call mpi_waitall(2, requests, statuses, error)
382 c---------------------------------------------------------------------
383 c unpack the buffer for the first three factors
384 c---------------------------------------------------------------------
390 do k = start(3,c), ksize-end(3,c)-1
391 do i = start(1,c), isize-end(1,c)-1
394 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
395 > lhs(i,j,k,n+4,c)*sm1 -
396 > lhs(i,j,k,n+5,c)*sm2
397 rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
398 > lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) -
399 > lhs(i,j1,k,n+5,c) * sm1
405 c---------------------------------------------------------------------
406 c now unpack the buffer for the remaining two factors
407 c---------------------------------------------------------------------
410 do k = start(3,c), ksize-end(3,c)-1
411 do i = start(1,c), isize-end(1,c)-1
414 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
415 > lhs(i,j,k,n+4,c)*sm1 -
416 > lhs(i,j,k,n+5,c)*sm2
417 rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
418 > lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) -
419 > lhs(i,j1,k,n+5,c) * sm1
426 c---------------------------------------------------------------------
427 c now we know this is the first grid block on the back sweep,
428 c so we don't need a message to start the substitution.
429 c---------------------------------------------------------------------
435 do k = start(3,c), ksize-end(3,c)-1
436 do i = start(1,c), isize-end(1,c)-1
437 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
438 > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c)
445 do k = start(3,c), ksize-end(3,c)-1
446 do i = start(1,c), isize-end(1,c)-1
447 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
448 > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c)
454 c---------------------------------------------------------------------
455 c Whether or not this is the last processor, we always have
456 c to complete the back-substitution
457 c---------------------------------------------------------------------
459 c---------------------------------------------------------------------
460 c The first three factors
461 c---------------------------------------------------------------------
464 do k = start(3,c), ksize-end(3,c)-1
465 do j = jend-2, jstart, -1
466 do i = start(1,c), isize-end(1,c)-1
469 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
470 > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) -
471 > lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c)
477 c---------------------------------------------------------------------
478 c And the remaining two
479 c---------------------------------------------------------------------
482 do k = start(3,c), ksize-end(3,c)-1
483 do j = jend-2, jstart, -1
484 do i = start(1,c), isize-end(1,c)-1
487 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
488 > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) -
489 > lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c)
495 c---------------------------------------------------------------------
496 c send on information to the previous processor, if needed
497 c---------------------------------------------------------------------
498 if (stage .ne. 1) then
503 do k = start(3,c), ksize-end(3,c)-1
504 do i = start(1,c), isize-end(1,c)-1
505 out_buffer(p+1) = rhs(i,j,k,m,c)
506 out_buffer(p+2) = rhs(i,j1,k,m,c)
512 c---------------------------------------------------------------------
513 c pack and send the buffer
514 c---------------------------------------------------------------------
516 call mpi_isend(out_buffer, 10*buffer_size,
517 > dp_type, predecessor(2),
518 > DEFAULT_TAG, comm_solve,
519 > requests(2), error)
523 c---------------------------------------------------------------------
524 c If this was the last stage, do the block-diagonal inversion
525 c---------------------------------------------------------------------
526 if (stage .eq. 1) call pinvr(c)