new

[book_gpu.git] / BookGPU / Chapters / chapter13 / ch13.tex
diff --git a/BookGPU/Chapters/chapter13/ch13.tex b/BookGPU/Chapters/chapter13/ch13.tex

index 90dd94649b135a21f31083cd392c7674b442114e..ce2c7c281c5162b463f25a3dea8c0d23760a2937 100755 (executable)
--- a/BookGPU/Chapters/chapter13/ch13.tex
+++ b/BookGPU/Chapters/chapter13/ch13.tex
@@ -322,7 +322,7 @@ each component of the vector must be relaxed an infinite number of times. The ch
  relaxed components to be used in the computational process may be guided by any criterion,
  and in particular, a natural criterion is to pick up the most recently available
  values of the components computed by the processors. Furthermore, the asynchronous
  relaxed components to be used in the computational process may be guided by any criterion,
  and in particular, a natural criterion is to pick up the most recently available
  values of the components computed by the processors. Furthermore, the asynchronous
-iterations are implemented by means of nonblocking MPI communication subroutines\index{MPI subroutines!nonblocking}
+iterations are implemented by means of nonblocking MPI communication subroutines\index{MPI!nonblocking}
  (asynchronous communications).
  
  The important property ensuring the convergence of the parallel projected Richardson
  (asynchronous communications).
  
  The important property ensuring the convergence of the parallel projected Richardson
@@ -486,13 +486,13 @@ in the synchronous algorithm and the asynchronous ones: \verb+cublasSetVectorAsy
  and \verb+cublasGetVectorAsync()+ in the asynchronous algorithm. Moreover, we
  use the communication routines of the MPI library to carry out the data exchanges
  between the neighboring nodes. We use the following communication routines: \verb+MPI_Isend()+
  and \verb+cublasGetVectorAsync()+ in the asynchronous algorithm. Moreover, we
  use the communication routines of the MPI library to carry out the data exchanges
  between the neighboring nodes. We use the following communication routines: \verb+MPI_Isend()+
-and \verb+MPI_Irecv()+ to perform nonblocking\index{MPI subroutines!nonblocking}
+and \verb+MPI_Irecv()+ to perform nonblocking\index{MPI!nonblocking}
  sends and receives, respectively. For the synchronous algorithm, we use the MPI
  routine \verb+MPI_Waitall()+ which puts the MPI process of a computing node in
  blocking status until all data exchanges with neighboring nodes (sends and receives)
  are completed. In contrast, for the asynchronous algorithms, we use the MPI routine
  \verb+MPI_Test()+ which tests the completion of a data exchange (send or receive)
  sends and receives, respectively. For the synchronous algorithm, we use the MPI
  routine \verb+MPI_Waitall()+ which puts the MPI process of a computing node in
  blocking status until all data exchanges with neighboring nodes (sends and receives)
  are completed. In contrast, for the asynchronous algorithms, we use the MPI routine
  \verb+MPI_Test()+ which tests the completion of a data exchange (send or receive)
-without putting the MPI process in blocking status\index{MPI subroutines!blocking}.   
+without putting the MPI process in blocking status\index{MPI!blocking}.   
  
  The function $Compute\_New\_Vector\_Elements()$ (line~$6$ in Algorithm~\ref{ch13:alg:02})
  computes, at each iteration, the new elements of the iterate vector $U$. Its general code
  
  The function $Compute\_New\_Vector\_Elements()$ (line~$6$ in Algorithm~\ref{ch13:alg:02})
  computes, at each iteration, the new elements of the iterate vector $U$. Its general code
@@ -598,7 +598,7 @@ AllReduce(error,\hspace{0.1cm}maxerror,\hspace{0.1cm}MAX); \\
  conv \leftarrow true;
  \end{array}
  $$
  conv \leftarrow true;
  \end{array}
  $$
-where the function $AllReduce()$ uses the MPI global reduction subroutine\index{MPI subroutines!global}
+where the function $AllReduce()$ uses the MPI global reduction subroutine\index{MPI!global}
  \verb+MPI_Allreduce()+ to compute the maximal value, $maxerror$, among the local
  absolute errors, $error$, of all computing nodes, and $p$ (in Algorithm~\ref{ch13:alg:02})
  is used as a counter of the local relaxations carried out by a computing node. In
  \verb+MPI_Allreduce()+ to compute the maximal value, $maxerror$, among the local
  absolute errors, $error$, of all computing nodes, and $p$ (in Algorithm~\ref{ch13:alg:02})
  is used as a counter of the local relaxations carried out by a computing node. In
@@ -698,6 +698,7 @@ $800^{3}$                     & $222,108.09$       & $1,769,232$      & $188,790
  
  \begin{table}
  \centering
  
  \begin{table}
  \centering
+\begin{scriptsize}
  \begin{tabular}{|c|c|c|c|c|c|c|c|}
  \hline
  \multirow{2}{*}{\bf Pb. size} & \multicolumn{3}{c|}{\bf Synchronous}                 & \multicolumn{3}{c|}{\bf Asynchronous}                & \multirow{2}{*}{\bf Gain\%}  \\ \cline{2-7}
  \begin{tabular}{|c|c|c|c|c|c|c|c|}
  \hline
  \multirow{2}{*}{\bf Pb. size} & \multicolumn{3}{c|}{\bf Synchronous}                 & \multicolumn{3}{c|}{\bf Asynchronous}                & \multirow{2}{*}{\bf Gain\%}  \\ \cline{2-7}
@@ -712,6 +713,7 @@ $768^{3}$                    & $4,112.68$         & $831,144$        & $50.13$
  
  $800^{3}$                    & $3,950.87$         & $899,088$        & $56.22$         & $3,636.57$        & $834,900$        & $51.91$     & $7.95$ \\ \hline 
  \end{tabular}
  
  $800^{3}$                    & $3,950.87$         & $899,088$        & $56.22$         & $3,636.57$        & $834,900$        & $51.91$     & $7.95$ \\ \hline 
  \end{tabular}
+\end{scriptsize}
  \vspace{0.5cm}
  \caption{Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.}
  \label{ch13:tab:02}
  \vspace{0.5cm}
  \caption{Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.}
  \label{ch13:tab:02}
@@ -745,7 +747,7 @@ consequently it also depends on the number of computing nodes.
  %%--------------------------%%
  \section{Red-black ordering technique}
  \label{ch13:sec:06}
  %%--------------------------%%
  \section{Red-black ordering technique}
  \label{ch13:sec:06}
-As is wellknown, the Jacobi method\index{iterative method!Jacobi} is characterized
+As is well known, the Jacobi method\index{iterative method!Jacobi} is characterized
  by a slow convergence\index{convergence} rate compared to some iterative methods\index{iterative method}
  (for example, the Gauss-Seidel method\index{iterative method!Gauss-Seidel}). So, in this
  section, we present some solutions to reduce the execution time and the number of
  by a slow convergence\index{convergence} rate compared to some iterative methods\index{iterative method}
  (for example, the Gauss-Seidel method\index{iterative method!Gauss-Seidel}). So, in this
  section, we present some solutions to reduce the execution time and the number of
@@ -776,7 +778,7 @@ vector elements leads to using twice the initial number of memory transactions.
  we apply the point red-black ordering\index{iterative method!red-black ordering}
  according to the $y$-coordinate, as is shown in Figure~\ref{ch13:fig:06.02}. In
  this case, the vector elements having even $y$-coordinate are computed in parallel
  we apply the point red-black ordering\index{iterative method!red-black ordering}
  according to the $y$-coordinate, as is shown in Figure~\ref{ch13:fig:06.02}. In
  this case, the vector elements having even $y$-coordinate are computed in parallel
-using the values of those having odd $y$-coordinate and then viceversa. Moreover,
+using the values of those having odd $y$-coordinate and then vice versa. Moreover,
  in the GPU implementation of the parallel projected Richardson method (Section~\ref{ch13:sec:04}),
  we have shown that a subproblem of size $(NX\times ny\times nz)$ is decomposed into
  $nz$ grids of size $(NX\times ny)$. Then, each kernel is executed in parallel by
  in the GPU implementation of the parallel projected Richardson method (Section~\ref{ch13:sec:04}),
  we have shown that a subproblem of size $(NX\times ny\times nz)$ is decomposed into
  $nz$ grids of size $(NX\times ny)$. Then, each kernel is executed in parallel by