\documentclass[11pt]{article}
%\documentclass{acmconf}
+\usepackage{multicol}
\usepackage[paper=a4paper,dvips,top=1.5cm,left=1.5cm,right=1.5cm,foot=1cm,bottom=1.5cm]{geometry}
\usepackage{times}
+
+
+
+
%%% MODIF %%%
-\textcolor{red}{\bf In order to show the influence of the communications on a GPU cluster
-In tables, we compute the ratios of the computation time over the communication time to show the influence of the communications on a GPU cluster compared to a CPU cluster}
+\textcolor{red}{\bf Hereafter, we show the influence of the communications on a GPU cluster compared to a CPU cluster. In Tables~\ref{tab:10},~\ref{tab:11} and~\ref{tab:12}, we compute the ratios of the computation time over the communication time of three versions of the parallel GMRES algorithm for solving sparse linear systems associated with the matrices of Table~\ref{tab:06}. These tables show that the hypergraph partitioning and the compressed format of the vectors increase the ratios both on the GPU cluster and on the CPU cluster. This means that the two optimization techniques allow the minimization of the total communication volume between the computing nodes. However, we can notice that the ratios obtained on the GPU cluster are lower than those obtained on the CPU cluster. Indeed, GPUs compute faster than CPUs and communications are more time-consuming while the computation time remains unchanged.}
-\begin{table}%[!h]
+\begin{table}
\begin{center}
-\begin{tabular}{|c|c|c|c|}
+\begin{tabular}{|c||c|c|c||c|c|c|}
\hline
-Matrix & Computation time & Communication time & Ratio \\ \hline \hline
-2cubes\_sphere & 37.067 s & 1434.512 s & 0.026 \\
-ecology2 & 4.116 s & 501.327 s & 0.008 \\
-finan512 & 7.170 s & 386.742 s & 0.019 \\
-G3\_circuit & 4.797 s & 537.343 s & 0.009 \\
-shallow\_water2 & 3.620 s & 411.208 s & 0.009 \\
-thermal2 & 6.902 s & 511.618 s & 0.013 \\ \hline \hline
-cage13 & 12.837 s & 625.175 s & 0.021 \\
-crashbasis & 48.532 s & 3195.183 s & 0.015 \\
-FEM\_3D\_thermal2 & 37.211 s & 1584.650 s & 0.023 \\
-language & 22.912 s & 2242.897 s & 0.010 \\
-poli\_large & 13.618 s & 1722.304 s & 0.008 \\
-torso3 & 74.194 s & 4454.936 s & 0.017 \\ \hline
+\multirow{2}{*}{Matrix} & \multicolumn{3}{c||}{GPU version} & \multicolumn{3}{c|}{CPU version} \\ \cline{2-7}
+ & $Time_{comput}$ & $Time_{comm}$ & $Ratio$ & $Time_{comput}$ & $Time_{comm}$ & $Ratio$ \\ \hline \hline
+2cubes\_sphere & 37.067 s & 1434.512 s & {\bf 0.026} & 312.061 s & 3453.931 s & {\bf 0.090}\\
+ecology2 & 4.116 s & 501.327 s & {\bf 0.008} & 60.776 s & 1216.607 s & {\bf 0.050}\\
+finan512 & 7.170 s & 386.742 s & {\bf 0.019} & 72.464 s & 932.538 s & {\bf 0.078}\\
+G3\_circuit & 4.797 s & 537.343 s & {\bf 0.009} & 66.011 s & 1407.378 s & {\bf 0.047}\\
+shallow\_water2 & 3.620 s & 411.208 s & {\bf 0.009} & 51.294 s & 973.446 s & {\bf 0.053}\\
+thermal2 & 6.902 s & 511.618 s & {\bf 0.013} & 77.255 s & 1281.979 s & {\bf 0.060}\\ \hline \hline
+cage13 & 12.837 s & 625.175 s & {\bf 0.021} & 139.178 s & 1518.349 s & {\bf 0.092}\\
+crashbasis & 48.532 s & 3195.183 s & {\bf 0.015} & 623.686 s & 7741.777 s & {\bf 0.081}\\
+FEM\_3D\_thermal2 & 37.211 s & 1584.650 s & {\bf 0.023} & 370.297 s & 3810.255 s & {\bf 0.097}\\
+language & 22.912 s & 2242.897 s & {\bf 0.010} & 286.682 s & 5348.733 s & {\bf 0.054}\\
+poli\_large & 13.618 s & 1722.304 s & {\bf 0.008} & 190.302 s & 4059.642 s & {\bf 0.047}\\
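+torso3 & 74.194 s & 4454.936 s & {\bf 0.017} & 190.302 s & 10800.787 s & {\bf 0.083}\\ \hline % TODO(review): the CPU computation time 190.302 s repeats the poli_large value of the previous row and does not match the ratio 0.083 (0.083 x 10800.787 s is about 896 s); verify against the measurements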
\end{tabular}
-\caption{}
-\label{tab:09}
+\caption{Ratios of the computation time over the communication time obtained from the parallel GMRES algorithm using row-by-row partitioning on 12 GPUs and 24 CPUs.}
+\label{tab:10}
+\end{center}
+\end{table}
+
+
+\begin{table}
+\begin{center}
+\begin{tabular}{|c||c|c|c||c|c|c|}
+\hline
+\multirow{2}{*}{Matrix} & \multicolumn{3}{c||}{GPU version} & \multicolumn{3}{c|}{CPU version} \\ \cline{2-7}
+ & $Time_{comput}$ & $Time_{comm}$ & $Ratio$ & $Time_{comput}$ & $Time_{comm}$ & $Ratio$ \\ \hline \hline
+2cubes\_sphere & 27.386 s & 154.861 s & {\bf 0.177} & 342.255 s & 42.100 s & {\bf 8.130}\\
+ecology2 & 3.822 s & 53.131 s & {\bf 0.072} & 69.956 s & 15.019 s & {\bf 4.658}\\
+finan512 & 6.366 s & 41.155 s & {\bf 0.155} & 79.592 s & 8.604 s & {\bf 9.251}\\
+G3\_circuit & 4.543 s & 63.132 s & {\bf 0.072} & 76.540 s & 27.371 s & {\bf 2.796}\\
+shallow\_water2 & 3.282 s & 43.080 s & {\bf 0.076} & 58.348 s & 8.088 s & {\bf 7.214}\\
+thermal2 & 5.986 s & 57.100 s & {\bf 0.105} & 87.682 s & 28.544 s & {\bf 3.072}\\ \hline \hline
+cage13 & 10.227 s & 70.388 s & {\bf 0.145} & 152.718 s & 30.785 s & {\bf 4.961}\\
+crashbasis & 41.527 s & 369.071 s & {\bf 0.113} & 701.040 s & 158.916 s & {\bf 4.411}\\
+FEM\_3D\_thermal2 & 28.691 s & 167.140 s & {\bf 0.172} & 403.510 s & 50.935 s & {\bf 7.922}\\
+language & 22.408 s & 242.589 s & {\bf 0.092} & 333.119 s & 64.409 s & {\bf 5.172}\\
+poli\_large & 13.710 s & 179.208 s & {\bf 0.077} & 215.934 s & 30.903 s & {\bf 6.987}\\
+torso3 & 58.455 s & 480.315 s & {\bf 0.122} & 993.609 s & 152.173 s & {\bf 6.529}\\ \hline
+\end{tabular}
+\caption{Ratios of the computation time over the communication time obtained from the parallel GMRES algorithm using row-by-row partitioning and compressed format for vectors on 12 GPUs and 24 CPUs.}
+\label{tab:11}
+\end{center}
+\end{table}
+
+
+\begin{table}
+\begin{center}
+\begin{tabular}{|c||c|c|c||c|c|c|}
+\hline
+\multirow{2}{*}{Matrix} & \multicolumn{3}{c||}{GPU version} & \multicolumn{3}{c|}{CPU version} \\ \cline{2-7}
+ & $Time_{comput}$ & $Time_{comm}$ & $Ratio$ & $Time_{comput}$ & $Time_{comm}$ & $Ratio$ \\ \hline \hline
+2cubes\_sphere & 28.440 s & 7.768 s & {\bf 3.661} & 327.109 s & 63.788 s & {\bf 5.128}\\
+ecology2 & 3.652 s & 0.757 s & {\bf 4.823} & 63.632 s & 13.520 s & {\bf 4.707}\\
+finan512 & 7.579 s & 4.569 s & {\bf 1.659} & 74.120 s & 22.505 s & {\bf 3.294}\\
+G3\_circuit & 4.876 s & 8.745 s & {\bf 0.558} & 72.280 s & 28.395 s & {\bf 2.546}\\
+shallow\_water2 & 3.146 s & 0.606 s & {\bf 5.191} & 52.903 s & 11.177 s & {\bf 4.733}\\
+thermal2 & 6.473 s & 4.325 s & {\bf 1.497} & 81.171 s & 20.907 s & {\bf 3.882}\\ \hline \hline
+cage13 & 11.676 s & 7.723 s & {\bf 1.512} & 145.755 s & 46.547 s & {\bf 3.131}\\
+crashbasis & 42.799 s & 29.399 s & {\bf 1.456} & 650.386 s & 203.918 s & {\bf 3.189}\\
+FEM\_3D\_thermal2 & 29.875 s & 8.915 s & {\bf 3.351} & 382.887 s & 93.252 s & {\bf 4.106}\\
+language & 20.991 s & 11.197 s & {\bf 1.875} & 310.679 s & 82.480 s & {\bf 3.767}\\
+poli\_large & 13.817 s & 102.760 s & {\bf 0.134} & 197.508 s & 151.672 s & {\bf 1.302}\\
+torso3 & 57.469 s & 16.828 s & {\bf 3.415} & 926.588 s & 242.721 s & {\bf 3.817}\\ \hline
+\end{tabular}
+\caption{Ratios of the computation time over the communication time obtained from the parallel GMRES algorithm using hypergraph partitioning and compressed format for vectors on 12 GPUs and 24 CPUs.}
+\label{tab:12}
\end{center}
\end{table}