new

[book_gpu.git] / BookGPU / Chapters / chapter13 / ch13.tex
diff --git a/BookGPU/Chapters/chapter13/ch13.tex b/BookGPU/Chapters/chapter13/ch13.tex

index b60105d8580976c9f64d28c020b804c23aa2dbb8..db2ab78b8b97321189b32af7ee3d556ccd772cbe 100755 (executable)
--- a/BookGPU/Chapters/chapter13/ch13.tex
+++ b/BookGPU/Chapters/chapter13/ch13.tex
@@ -134,7 +134,7 @@ where $b=\{b_{1},b_{2},b_{3}\}$, $\|b\|_{2}$ denotes the Euclidean norm of $b$,
  $v=e^{-a}.u$ represents the general change of variables such that $a=\frac{b^{t}(x,y,z)}{2\eta}$.
  Consequently, the numerical resolution of the diffusion problem (the self-adjoint
  operator~(\ref{ch13:eq:04})) is done by optimization algorithms, in contrast to that
  $v=e^{-a}.u$ represents the general change of variables such that $a=\frac{b^{t}(x,y,z)}{2\eta}$.
  Consequently, the numerical resolution of the diffusion problem (the self-adjoint
  operator~(\ref{ch13:eq:04})) is done by optimization algorithms, in contrast to that
-of the convection-diffusion problem (non self-adjoint operator~(\ref{ch13:eq:03}))
+of the convection-diffusion problem (nonself-adjoint operator~(\ref{ch13:eq:03}))
  which is done by relaxation algorithms. In the case of our studied algorithm, the convergence\index{convergence}
  is ensured by the M-matrix property; then, the performance is linked to the magnitude of
  the spectral radius of the iteration matrix, which is independent of the condition
  which is done by relaxation algorithms. In the case of our studied algorithm, the convergence\index{convergence}
  is ensured by the M-matrix property; then, the performance is linked to the magnitude of
  the spectral radius of the iteration matrix, which is independent of the condition
@@ -698,6 +698,7 @@ $800^{3}$                     & $222,108.09$       & $1,769,232$      & $188,790
  
  \begin{table}
  \centering
  
  \begin{table}
  \centering
+\begin{scriptsize}
  \begin{tabular}{|c|c|c|c|c|c|c|c|}
  \hline
  \multirow{2}{*}{\bf Pb. size} & \multicolumn{3}{c|}{\bf Synchronous}                 & \multicolumn{3}{c|}{\bf Asynchronous}                & \multirow{2}{*}{\bf Gain\%}  \\ \cline{2-7}
  \begin{tabular}{|c|c|c|c|c|c|c|c|}
  \hline
  \multirow{2}{*}{\bf Pb. size} & \multicolumn{3}{c|}{\bf Synchronous}                 & \multicolumn{3}{c|}{\bf Asynchronous}                & \multirow{2}{*}{\bf Gain\%}  \\ \cline{2-7}
@@ -712,6 +713,7 @@ $768^{3}$                    & $4,112.68$         & $831,144$        & $50.13$
  
  $800^{3}$                    & $3,950.87$         & $899,088$        & $56.22$         & $3,636.57$        & $834,900$        & $51.91$     & $7.95$ \\ \hline 
  \end{tabular}
  
  $800^{3}$                    & $3,950.87$         & $899,088$        & $56.22$         & $3,636.57$        & $834,900$        & $51.91$     & $7.95$ \\ \hline 
  \end{tabular}
+\end{scriptsize}
  \vspace{0.5cm}
  \caption{Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.}
  \label{ch13:tab:02}
  \vspace{0.5cm}
  \caption{Execution times in seconds of the parallel projected Richardson method implemented on a cluster of 12 GPUs.}
  \label{ch13:tab:02}
@@ -745,7 +747,7 @@ consequently it also depends on the number of computing nodes.
  %%--------------------------%%
  \section{Red-black ordering technique}
  \label{ch13:sec:06}
  %%--------------------------%%
  \section{Red-black ordering technique}
  \label{ch13:sec:06}
-As is wellknown, the Jacobi method\index{iterative method!Jacobi} is characterized
+As is well known, the Jacobi method\index{iterative method!Jacobi} is characterized
  by a slow convergence\index{convergence} rate compared to some iterative methods\index{iterative method}
  (for example, the Gauss-Seidel method\index{iterative method!Gauss-Seidel}). So, in this
  section, we present some solutions to reduce the execution time and the number of
  by a slow convergence\index{convergence} rate compared to some iterative methods\index{iterative method}
  (for example, the Gauss-Seidel method\index{iterative method!Gauss-Seidel}). So, in this
  section, we present some solutions to reduce the execution time and the number of
@@ -776,7 +778,7 @@ vector elements leads to using twice the initial number of memory transactions.
  we apply the point red-black ordering\index{iterative method!red-black ordering}
  according to the $y$-coordinate, as is shown in Figure~\ref{ch13:fig:06.02}. In
  this case, the vector elements having even $y$-coordinate are computed in parallel
  we apply the point red-black ordering\index{iterative method!red-black ordering}
  according to the $y$-coordinate, as is shown in Figure~\ref{ch13:fig:06.02}. In
  this case, the vector elements having even $y$-coordinate are computed in parallel
-using the values of those having odd $y$-coordinate and then viceversa. Moreover,
+using the values of those having odd $y$-coordinate and then vice versa. Moreover,
  in the GPU implementation of the parallel projected Richardson method (Section~\ref{ch13:sec:04}),
  we have shown that a subproblem of size $(NX\times ny\times nz)$ is decomposed into
  $nz$ grids of size $(NX\times ny)$. Then, each kernel is executed in parallel by
  in the GPU implementation of the parallel projected Richardson method (Section~\ref{ch13:sec:04}),
  we have shown that a subproblem of size $(NX\times ny\times nz)$ is decomposed into
  $nz$ grids of size $(NX\times ny)$. Then, each kernel is executed in parallel by