From c7a029391cac485cbc1a5f05eb222557290891a2 Mon Sep 17 00:00:00 2001 From: lilia Date: Sat, 1 Feb 2014 01:10:29 +0100 Subject: [PATCH 1/1] 01-02-2014 --- GMRES_Journal.tex | 93 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 74 insertions(+), 19 deletions(-) diff --git a/GMRES_Journal.tex b/GMRES_Journal.tex index be3eccf..2b077f1 100644 --- a/GMRES_Journal.tex +++ b/GMRES_Journal.tex @@ -1,5 +1,6 @@ \documentclass[11pt]{article} %\documentclass{acmconf} +\usepackage{multicol,multirow} \usepackage[paper=a4paper,dvips,top=1.5cm,left=1.5cm,right=1.5cm,foot=1cm,bottom=1.5cm]{geometry} \usepackage{times} @@ -853,30 +854,84 @@ torso3 & 183 863 292 & 25 682 514 & 613 250 + + + + %%% MODIF %%% -\textcolor{red}{\bf In order to show the influence of the communications on a GPU cluster -In tables, we compute the ratios of the computation time over the communication time to show the influence of the communications on a GPU cluster compared to a CPU cluster} +\textcolor{red}{\bf Hereafter, we show the influence of the communications on a GPU cluster compared to a CPU cluster. In Tables~\ref{tab:10},~\ref{tab:11} and~\ref{tab:12}, we compute the ratios of the computation time over the communication time of three versions of the parallel GMRES algorithm for solving sparse linear systems associated with the matrices of Table~\ref{tab:06}. These tables show that the hypergraph partitioning and the compressed format of the vectors increase the ratios on both the GPU cluster and the CPU cluster. This means that the two optimization techniques allow the minimization of the total communication volume between the computing nodes. However, we can notice that the ratios obtained on the GPU cluster are generally lower than those obtained on the CPU cluster. 
Indeed, GPUs compute faster than CPUs and communications are more time-consuming while the computation time remains unchanged.} -\begin{table}%[!h] +\begin{table} \begin{center} -\begin{tabular}{|c|c|c|c|} +\begin{tabular}{|c||c|c|c||c|c|c|} \hline -Matrix & Computation time & Communication time & Ratio \\ \hline \hline -2cubes\_sphere & 37.067 s & 1434.512 s & 0.026 \\ -ecology2 & 4.116 s & 501.327 s & 0.008 \\ -finan512 & 7.170 s & 386.742 s & 0.019 \\ -G3\_circuit & 4.797 s & 537.343 s & 0.009 \\ -shallow\_water2 & 3.620 s & 411.208 s & 0.009 \\ -thermal2 & 6.902 s & 511.618 s & 0.013 \\ \hline \hline -cage13 & 12.837 s & 625.175 s & 0.021 \\ -crashbasis & 48.532 s & 3195.183 s & 0.015 \\ -FEM\_3D\_thermal2 & 37.211 s & 1584.650 s & 0.023 \\ -language & 22.912 s & 2242.897 s & 0.010 \\ -poli\_large & 13.618 s & 1722.304 s & 0.008 \\ -torso3 & 74.194 s & 4454.936 s & 0.017 \\ \hline +\multirow{2}{*}{Matrix} & \multicolumn{3}{c||}{GPU version} & \multicolumn{3}{c|}{CPU version} \\ \cline{2-7} + & $Time_{comput}$ & $Time_{comm}$ & $Ratio$ & $Time_{comput}$ & $Time_{comm}$ & $Ratio$ \\ \hline \hline +2cubes\_sphere & 37.067 s & 1434.512 s & {\bf 0.026} & 312.061 s & 3453.931 s & {\bf 0.090}\\ +ecology2 & 4.116 s & 501.327 s & {\bf 0.008} & 60.776 s & 1216.607 s & {\bf 0.050}\\ +finan512 & 7.170 s & 386.742 s & {\bf 0.019} & 72.464 s & 932.538 s & {\bf 0.078}\\ +G3\_circuit & 4.797 s & 537.343 s & {\bf 0.009} & 66.011 s & 1407.378 s & {\bf 0.047}\\ +shallow\_water2 & 3.620 s & 411.208 s & {\bf 0.009} & 51.294 s & 973.446 s & {\bf 0.053}\\ +thermal2 & 6.902 s & 511.618 s & {\bf 0.013} & 77.255 s & 1281.979 s & {\bf 0.060}\\ \hline \hline +cage13 & 12.837 s & 625.175 s & {\bf 0.021} & 139.178 s & 1518.349 s & {\bf 0.092}\\ +crashbasis & 48.532 s & 3195.183 s & {\bf 0.015} & 623.686 s & 7741.777 s & {\bf 0.081}\\ +FEM\_3D\_thermal2 & 37.211 s & 1584.650 s & {\bf 0.023} & 370.297 s & 3810.255 s & {\bf 0.097}\\ +language & 22.912 s & 2242.897 s & {\bf 0.010} & 286.682 s & 
5348.733 s & {\bf 0.054}\\ +poli\_large & 13.618 s & 1722.304 s & {\bf 0.008} & 190.302 s & 4059.642 s & {\bf 0.047}\\ +torso3 & 74.194 s & 4454.936 s & {\bf 0.017} & 190.302 s & 10800.787 s & {\bf 0.083}\\ \hline \end{tabular} -\caption{} -\label{tab:09} +\caption{Ratios of the computation time over the communication time obtained from the parallel GMRES algorithm using row-by-row partitioning on 12 GPUs and 24 CPUs.} +\label{tab:10} +\end{center} +\end{table} + + +\begin{table} +\begin{center} +\begin{tabular}{|c||c|c|c||c|c|c|} +\hline +\multirow{2}{*}{Matrix} & \multicolumn{3}{c||}{GPU version} & \multicolumn{3}{c|}{CPU version} \\ \cline{2-7} + & $Time_{comput}$ & $Time_{comm}$ & $Ratio$ & $Time_{comput}$ & $Time_{comm}$ & $Ratio$ \\ \hline \hline +2cubes\_sphere & 27.386 s & 154.861 s & {\bf 0.177} & 342.255 s & 42.100 s & {\bf 8.130}\\ +ecology2 & 3.822 s & 53.131 s & {\bf 0.072} & 69.956 s & 15.019 s & {\bf 4.658}\\ +finan512 & 6.366 s & 41.155 s & {\bf 0.155} & 79.592 s & 8.604 s & {\bf 9.251}\\ +G3\_circuit & 4.543 s & 63.132 s & {\bf 0.072} & 76.540 s & 27.371 s & {\bf 2.796}\\ +shallow\_water2 & 3.282 s & 43.080 s & {\bf 0.076} & 58.348 s & 8.088 s & {\bf 7.214}\\ +thermal2 & 5.986 s & 57.100 s & {\bf 0.105} & 87.682 s & 28.544 s & {\bf 3.072}\\ \hline \hline +cage13 & 10.227 s & 70.388 s & {\bf 0.145} & 152.718 s & 30.785 s & {\bf 4.961}\\ +crashbasis & 41.527 s & 369.071 s & {\bf 0.113} & 701.040 s & 158.916 s & {\bf 4.411}\\ +FEM\_3D\_thermal2 & 28.691 s & 167.140 s & {\bf 0.172} & 403.510 s & 50.935 s & {\bf 7.922}\\ +language & 22.408 s & 242.589 s & {\bf 0.092} & 333.119 s & 64.409 s & {\bf 5.172}\\ +poli\_large & 13.710 s & 179.208 s & {\bf 0.077} & 215.934 s & 30.903 s & {\bf 6.987}\\ +torso3 & 58.455 s & 480.315 s & {\bf 0.122} & 993.609 s & 152.173 s & {\bf 6.529}\\ \hline +\end{tabular} +\caption{Ratios of the computation time over the communication time obtained from the parallel GMRES algorithm using row-by-row partitioning and compressed 
format for vectors on 12 GPUs and 24 CPUs.} +\label{tab:11} +\end{center} +\end{table} + + +\begin{table} +\begin{center} +\begin{tabular}{|c||c|c|c||c|c|c|} +\hline +\multirow{2}{*}{Matrix} & \multicolumn{3}{c||}{GPU version} & \multicolumn{3}{c|}{CPU version} \\ \cline{2-7} + & $Time_{comput}$ & $Time_{comm}$ & $Ratio$ & $Time_{comput}$ & $Time_{comm}$ & $Ratio$ \\ \hline \hline +2cubes\_sphere & 28.440 s & 7.768 s & {\bf 3.661} & 327.109 s & 63.788 s & {\bf 5.128}\\ +ecology2 & 3.652 s & 0.757 s & {\bf 4.823} & 63.632 s & 13.520 s & {\bf 4.707}\\ +finan512 & 7.579 s & 4.569 s & {\bf 1.659} & 74.120 s & 22.505 s & {\bf 3.294}\\ +G3\_circuit & 4.876 s & 8.745 s & {\bf 0.558} & 72.280 s & 28.395 s & {\bf 2.546}\\ +shallow\_water2 & 3.146 s & 0.606 s & {\bf 5.191} & 52.903 s & 11.177 s & {\bf 4.733}\\ +thermal2 & 6.473 s & 4.325 s & {\bf 1.497} & 81.171 s & 20.907 s & {\bf 3.882}\\ \hline \hline +cage13 & 11.676 s & 7.723 s & {\bf 1.512} & 145.755 s & 46.547 s & {\bf 3.131}\\ +crashbasis & 42.799 s & 29.399 s & {\bf 1.456} & 650.386 s & 203.918 s & {\bf 3.189}\\ +FEM\_3D\_thermal2 & 29.875 s & 8.915 s & {\bf 3.351} & 382.887 s & 93.252 s & {\bf 4.106}\\ +language & 20.991 s & 11.197 s & {\bf 1.875} & 310.679 s & 82.480 s & {\bf 3.767}\\ +poli\_large & 13.817 s & 102.760 s & {\bf 0.134} & 197.508 s & 151.672 s & {\bf 1.302}\\ +torso3 & 57.469 s & 16.828 s & {\bf 3.415} & 926.588 s & 242.721 s & {\bf 3.817}\\ \hline +\end{tabular} +\caption{Ratios of the computation time over the communication time obtained from the parallel GMRES algorithm using hypergraph partitioning and compressed format for vectors on 12 GPUs and 24 CPUs.} +\label{tab:12} \end{center} \end{table} -- 2.39.5