31-01-2014

[GMRES_For_Journal.git] / GMRES_Journal.tex
diff --git a/GMRES_Journal.tex b/GMRES_Journal.tex

index e34ae84ed72fb2954916df54d598d0ebde0d5d16..be3eccf1303e69e6b2552dee1921d04ff5fb6e07 100644 (file)
--- a/GMRES_Journal.tex
+++ b/GMRES_Journal.tex
@@ -201,7 +201,7 @@ Algorithm~\ref{alg:01} illustrates the main key points of the GMRES method with
  %%% END %%%
  
  \begin{algorithm}[!h]
-  \SetAlgoLined
+  %\SetAlgoLined
    \Entree{$A$ (matrix), $b$ (vector), $M$ (preconditioning matrix),
  $x_{0}$ (initial guess), $\varepsilon$ (tolerance threshold), $max$ (maximum number of iterations),
  $m$ (number of iterations of the Arnoldi process)}
@@ -843,11 +843,48 @@ torso3                  & 183 863 292      & 25 682 514       & 613 250
  \end{center}
  \end{table}
  
+
+
+
+
+
+
+
+
+
+
  %%% MODIF %%%
+\textcolor{red}{\bf In order to show the influence of the communications on a GPU cluster compared to a CPU cluster,
+we compute in Table~\ref{tab:09} the ratios of the computation time over the communication time.}
+
+\begin{table}%[!h]
+\begin{center}
+\begin{tabular}{|c|c|c|c|} 
+\hline
+Matrix            & Computation time & Communication time & Ratio \\ \hline \hline
+2cubes\_sphere    & 37.067 s         & 1434.512 s         & 0.026 \\
+ecology2          & 4.116 s          & 501.327 s          & 0.008 \\
+finan512          & 7.170 s          & 386.742 s          & 0.019 \\
+G3\_circuit       & 4.797 s          & 537.343 s          & 0.009 \\
+shallow\_water2   & 3.620 s          & 411.208 s          & 0.009 \\ 
+thermal2          & 6.902 s          & 511.618 s          & 0.013 \\ \hline \hline
+cage13            & 12.837 s         & 625.175 s          & 0.021 \\
+crashbasis        & 48.532 s         & 3195.183 s         & 0.015 \\
+FEM\_3D\_thermal2 & 37.211 s         & 1584.650 s         & 0.023 \\
+language          & 22.912 s         & 2242.897 s         & 0.010 \\
+poli\_large       & 13.618 s         & 1722.304 s         & 0.008 \\
+torso3            & 74.194 s         & 4454.936 s         & 0.017 \\ \hline       
+\end{tabular}
+\caption{Ratios of the computation time over the communication time.}
+\label{tab:09}
+\end{center}
+\end{table}
+
+
  \textcolor{red}{\bf Finally, the parallel solving of a linear system can be easy to optimize when the associated matrix is regular. This is unfortunately not the case for many real-world applications. When the matrix has an irregular structure, the amount of communication between processors is not the same. Another important parameter is the size of the matrix bandwidth which has a huge influence on the amount of communications. In this work, we have generated different kinds of matrices in order to analyze different difficulties. With as large a bandwidth as possible involving communications between all processors, which is the most difficult situation, we proposed to use two heuristics. Unfortunately, there is no fast method that optimizes the communication in any situation. For systems of non linear equations, there are different algorithms but most of them consist in linearizing the system of equations. In this case, a linear system needs to be solved. The main advantage is that the matrix is the same at each step of the non linear system solving, so the partitioning method, which is a time consuming step, is performed only once.
  }
  
-\textcolor{red}{
+\textcolor{red}{\bf 
  Another very important issue is that the communications have a greater influence on a cluster of GPUs than on a cluster of CPUs. There are two reasons for this. The first one comes from the fact that with a cluster of GPUs, the CPU/GPU data transfers slow down communications between two GPUs that are not on the same machine. The second one is due to the fact that with GPUs, the ratio of the computation time over the communication time decreases since the computation time is reduced. So the impact of the communications between GPUs might be a very important issue that can limit the scalability of a parallel algorithm.}
  %%% END %%%