X-Git-Url: https://bilbo.iut-bm.univ-fcomte.fr/and/gitweb/ThesisAhmed.git/blobdiff_plain/adc90dd4355e95417b713b836614fd9a4edd3dd1..c0e283963e036526dd575326d4f257b270ac03e9:/thesis-presentation/AhmedSlides.tex diff --git a/thesis-presentation/AhmedSlides.tex b/thesis-presentation/AhmedSlides.tex index 7e7d97b..6dbbd81 100644 --- a/thesis-presentation/AhmedSlides.tex +++ b/thesis-presentation/AhmedSlides.tex @@ -22,7 +22,7 @@ \usepackage{fixltx2e} %% used to put some subscripts lower, and make them more legible \newcommand{\fxheight}[1]{\ifx#1\relax\relax\else\rule{0pt}{1.52ex}#1\fi} - +\usepackage{ragged2e} \newcommand{\CL}{\Xsub{C}{L}} \newcommand{\Dist}{\mathit{Dist}} \newcommand{\EdNew}{\Xsub{E}{dNew}} @@ -75,8 +75,8 @@ %Iterations using CPU Frequency Scaling} \vspace{2cm} -\title{ \textbf{Energy Consumption Optimization of Parallel Applications with Iterations using CPU Frequency Scaling} \\ \vspace{0.2cm} \hspace{1.8cm}\textbf{\textcolor{cyan}{\small PhD Dissertation Defense}}}\vspace{-1cm} -\author{ \textbf{Ahmed Badri Muslim Fanfakh} \\ \vspace{0.5cm}\small Under Supervision: \textcolor{cyan}{\small Raphaël COUTURIER and Jean-Claude CHARR} \\\vspace{0.1cm} \textcolor{blue}{ University of Bourgogne Franche-Comté - FEMTO-ST - DISC Dept. - AND Team} \\ ~~~~~~~~~~~~~~~~~~~~~ \textbf{\textcolor{blue}{ 17 October 2016 }}} +\title{ \textbf{Energy Consumption Optimization of Parallel Applications with Iterations using CPU Frequency Scaling} \\ \vspace{0.2cm} \hspace{1.8cm}\textbf{\textcolor{cyan}{\small PhD Dissertation Defense}}}\vspace{-0.5cm} +\author{ \textbf{Ahmed Badri Muslim Fanfakh} \\ \vspace{0.5cm}\small Under the supervision of: \\ \textcolor{cyan}{\small Raphaël COUTURIER and Jean-Claude CHARR} \\\vspace{0.1cm} \textcolor{blue}{ UBFC - FEMTO-ST - DISC Dept. - AND Team} \\ ~~~~~~~~~~~~~~~~~~~~~ \textbf{\textcolor{blue}{ 17 October 2016 }}} \date{} \vspace{-3cm} @@ -109,15 +109,57 @@ \tableofcontents \end{frame} +%%%%%%%%%%%%%%%%%%%% +%% SLIDE 03 %% +%%%%%%%%%%%%%%%%%%%% +\begin{frame}{Definition of parallel computing} +\section{\small {Introduction and Problem definition}} + \centering + \includegraphics[width=0.99\textwidth]{para.pdf} +\end{frame} + + + +\begin{frame}{Execution of synchronous parallel tasks} +\vspace{-0.5 cm} +\begin{figure} + \centering + \subfloat[Synchronous imbalanced communications]{% + \includegraphics[scale=0.49]{c1/commtasks}\label{fig:h1}} + \subfloat[Synchronous imbalanced computations]{% + \includegraphics[scale=0.49]{c1/compt}\label{fig:h2}} + % \caption{Parallel tasks on homogeneous platform} + \label{fig:homo} +\end{figure} + \end{frame} + + %%%%%%%%%%%%%%%%%%%% +%% SLIDE 07 %% +%%%%%%%%%%%%%%%%%%%% + + +\begin{frame}{\large Synchronous and asynchronous iterative methods } +\vspace{-0.5 cm} +\begin{figure} + +\includegraphics[scale=0.42]{syn_tasks.pdf} +\vspace{0.6 cm} +\includegraphics[scale=0.42]{Asyn_tasks.pdf} +\end{figure} + + + \end{frame} + + %%%%%%%%%%%%%%%%%%%% %% SLIDE 03 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{Introduction and problem definition} - \section{\small {Introduction and Problem definition}} - \bf \textcolor{blue}{Approaches to increase the computing power of the parallel platform :} +\begin{frame}{Approaches to get more computing power} + + %\bf \textcolor{blue}{} \begin{minipage}{0.5\textwidth} - \textcolor{blue}{1)} \small \bf \textcolor{black}{Increasing the frequency of a processor.} + \textcolor{blue}{1)} \small \bf \textcolor{black}{Increase the frequency of a processor.\\ (limited due to overheating)} \end{minipage}% \begin{minipage}{0.6\textwidth} @@ -128,9 +170,10 @@ \end{minipage}% \vspace{0.2cm} \begin{minipage}{0.5\textwidth} - \textcolor{blue}{2)} \small \bf \textcolor{black}{Increasing the number of nodes.} + \textcolor{blue}{2)} \small \bf \textcolor{black}{Increase the number of computing + units.} - \tiny \textcolor{blue}{Recently, Tianhe-2 supercomputer had more than 3 million cores while consuming around 17.8 megawatts.} + \textcolor{black}{The supercomputer Tianhe-2 has more than 3 million cores and consumes around 17.8 megawatts.} \end{minipage}% \begin{minipage}{0.6\textwidth} @@ -142,49 +185,47 @@ - %%%%%%%%%%%%%%%%%%% %% SLIDE 04 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{Introduction and problem definition} - \vspace{0.1cm} - \bf \textcolor{blue}{Techniques for energy consumption reduction} - +\begin{frame}{Techniques for energy consumption reduction} + \textcolor{blue}{1)} \bf \textcolor{black}{Switch-off idle nodes method} \vspace{-0.9cm} \begin{figure} - \animategraphics[autopause,loop,controls,scale=0.25,buttonsize=0.2cm]{200}{on-off/a-}{0}{69} + \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{200}{on-off/a-}{0}{111} %\includegraphics[width=0.6\textwidth]{on-off/a-69} \end{figure} \end{frame} %%%%%%%%%%%%%%%%%%%% -%% SLIDE 06 %% +%% SLIDE 05 %% %%%%%%%%%%%%%%%%%%%% \begin{frame}{Techniques for energy consumption reduction} - \textcolor{blue}{2)} \bf \textcolor{black}{Dynamic voltage and frequency Scaling (DVFS)} - \vspace{-0.5cm} + \textcolor{blue}{2)} \bf \textcolor{black}{Dynamic Voltage and Frequency Scaling (DVFS)} + \vspace{-0.9cm} \begin{figure} - \animategraphics[autopause,controls,scale=0.25,buttonsize=0.2cm]{10}{DVFS-meq/a-}{0}{109} + \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{10}{DVFS-meq/a-}{0}{175} %\includegraphics[width=0.6\textwidth]{DVFS-meq/a-109} \end{figure} \end{frame} - - +%%%%%%%%%%%%%%%%%%%% +%% SLIDE 06 %% +%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%% %% SLIDE 07 %% %%%%%%%%%%%%%%%%%%%% \begin{frame}{Motivations} \vspace{0.05cm} \section{\small {Motivations}} -\textcolor{blue}{Why we used DVFS method:} +\textcolor{blue}{Why we used the DVFS method:} \vspace{-0.49cm} \begin{minipage}{0.5\textwidth} \vspace{-0.49cm} \begin{itemize} - \item \small \textcolor{black}{The biggest power consumption is consumed by the processor \textsuperscript{1}. } + \item \small \textcolor{black}{ The CPU is the component that consumes the highest amount of energy in a node \textsuperscript{1}. } \end{itemize} @@ -197,14 +238,15 @@ \end{figure} \end{minipage}% - \begin{itemize} \item \small \textcolor{black}{It uses to reduce the energy consumption while keeping all the nodes working, thus it is more adapted to parallel computing.} - \item \small \textcolor{black}{It has a very small overhead compared to switching-off the idle nodes method.} \end{itemize} + \begin{itemize} \item \small \textcolor{black}{DVFS reduces the energy consumption while + keeping all the nodes working.} + \item \small \textcolor{black}{It has a very small overhead compared to switching-off the idle nodes.} \end{itemize} \vspace{-0.12cm} \begin{block}{\textcolor{white}{Challenge and Objective}} - \small \textcolor{blue}{Challenge:} \textcolor{black}{DVFS is used to reduce the energy consumption, \textcolor{blue}{but} it degrades the performance simultaneously.} + \small \textcolor{blue}{Challenge:} \textcolor{black}{DVFS is used to reduce the energy consumption, \textcolor{blue}{but} it also degrades the performance of the CPU.} \vspace{0.1cm} \small \textcolor{blue}{Objective:} \textcolor{black}{Applying the DVFS to minimize the energy consumption while maintaining the performance of the parallel application.} @@ -222,9 +264,12 @@ for a warehouse-sized computer. %%%%%%%%%%%%%%%%%%%% -\begin{frame}{Contribution} +\begin{frame}{The first contribution} + +\section{\small {Energy optimization of a homogeneous platform}} +%\vspace{-3cm} + % \includegraphics[width=0.6\textwidth]{white.pdf} -\section{\small {Energy optimization of homogeneous platform}} \begin{center} \bf \Large \textcolor{blue}{Energy optimization of a parallel application with iterations running over a homogeneous platform} \end{center} @@ -237,44 +282,59 @@ for a warehouse-sized computer. %%%%%%%%%%%%%%%%%%%% \begin{frame}{Objectives} - \begin{femtoBlock}{} \vspace{-12 mm} - \begin{itemize} \small - \item Study the effect of the scaling factor $S$ on \textbf{energy consumption and performance } of parallel applications with iterations such as NAS - Benchmarks. \includegraphics[width=.06\textwidth]{c1/nasa.pdf} \medskip + + \begin{itemize} \small \justifying + + \item Studying the effect of the frequency scaling on the \textbf{energy consumption and performance } of parallel applications with iterations. \medskip - \item Discovering the \textbf{energy-performance trade-off relation} when changing the frequency of the processor.\medskip - \item Proposing an algorithm for selecting the scaling factor $S$ producing \textbf {the optimal trade-off} between the energy consumption and the performance. \medskip - \item Comparing the proposed algorithm to existing methods. + \item Discovering the \textbf{energy-performance trade-off relation} when changing the frequency of the processor.\medskip + \item Proposing an algorithm for selecting the scaling factor that produces \textbf {the good trade-off} between the energy consumption and the performance. \medskip + \item Comparing the proposed algorithm to existing methods. %\footnote{\tiny Thomas Rauber and Gudula Rünger. Analytical modeling and simulation of the %energy consumption \\ \quad ~ ~\quad of independent tasks. In Proceedings of the Winter Simulation Conference, 2012.} method that our method best on. \end{itemize} %\let\thefootnote\relax\footnote{} - \vspace{-10 mm} - \end{femtoBlock} + + \end{frame} + %%%%%%%%%%%%%%%%%%%% -%% SLIDE 10 %% +%% SLIDE 13 %% %%%%%%%%%%%%%%%%%%%% +\begin{frame}{Performance evaluation of MPI programs} + +\small The frequency scaling factor is the ratio between the maximum and the new frequency, \textcolor{blue}{$S = \frac{F_{max}}{F_{new}}$}. + \vspace{5 mm} + + \begin{femtoBlock}{} + \vspace{-5 mm} + \begin{block}{\small Execution time prediction model} + \centering{ $ \textcolor{red}{T_{new}} = \textcolor{blue}{T_{Max Comp Old} \cdot S + T_{{Min Comm Old}}}$} + \end{block} + \vspace{5 mm} + \centering{\includegraphics[width=.4\textwidth]{c1/cg_per} + \quad% + \includegraphics[width=.4\textwidth]{c1/lu_pre}} + \vspace{1 mm} + + \small The maximum normalized error for CG=0.0073 \textbf{(the smallest)} and LU=0.031 \textbf{(the worst)}. + \end{femtoBlock} +\end{frame} -\begin{frame}{Execution of synchronous parallel tasks} -\vspace{-0.5 cm} -\begin{figure} - \centering - \subfloat[Sync. imbalanced communications]{% - \includegraphics[scale=0.49]{c1/commtasks}\label{fig:h1}} - \subfloat[Sync. imbalanced computations]{% - \includegraphics[scale=0.49]{c1/compt}\label{fig:h2}} - % \caption{Parallel tasks on homogeneous platform} - \label{fig:homo} -\end{figure} - \end{frame} + + + + + + + @@ -282,69 +342,56 @@ for a warehouse-sized computer. %%%%%%%%%%%%%%%%%%%% %% SLIDE 11 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{Energy model for homogeneous platform} - The power consumed by a processor divided into two power metrics: the dynamic (\textcolor{red}{$P_d$}) and static - (\textcolor{red}{$P_s$}) power. +\begin{frame}{Energy model for a homogeneous platform} + The power consumed by a processor is divided into two power metrics: the dynamic (\textcolor{red}{$P_d$}) and the static + (\textcolor{red}{$P_s$}) powers. \begin{equation} \label{eq:pd} \textcolor{red}{ P_d} = \textcolor{blue}{\alpha \cdot CL \cdot V^2 \cdot F} \end{equation} \scriptsize \underline{Where}: \\ - \scriptsize {\textcolor{blue}{$\alpha$}: switching activity \hspace{15 mm} \textcolor{blue}{$CL$}: load capacitance\\ - \textcolor{blue}{$V$} the supply voltage \hspace{14 mm} \textcolor{blue}{$F$}: operational frequency} + \scriptsize {\textcolor{blue}{$\alpha$}: switching activity. \hspace{15 mm} \textcolor{blue}{$CL$}: load capacitance [F].\\ + \textcolor{blue}{$V$}: the supply voltage [V]. \hspace{8 mm} \textcolor{blue}{$F$}: operational frequency [Hz].} \begin{equation} \label{eq:ps} \small \textcolor{red}{P_s} = \textcolor{blue}{V \cdot N_{trans} \cdot K_{design} \cdot I_{Leak}} \end{equation} \underline{Where}:\\ - \scriptsize{ \textcolor{blue}{$V$}: the supply voltage. \hspace{28 mm} \textcolor{blue}{$N_{trans}$}: number of transistors. \\ - \textcolor{blue}{$K_{design}$}: design dependent parameter. \hspace{8 mm} \textcolor{blue}{$I_{leak}$}: technology dependent - parameter.} + \scriptsize{ \textcolor{blue}{$V$}: the supply voltage [V]. \hspace{19 mm} \textcolor{blue}{$N_{trans}$}: number of transistors. \\ + \textcolor{blue}{$K_{design}$}: design dependent parameter. \hspace{3 mm} \textcolor{blue}{$I_{leak}$}: technology dependent + parameter [A].} + + \end{frame} + + %%%%%%%%%%%%%%%%%%%% %% SLIDE 12 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{Energy model for homogeneous platform} - - The frequency scaling factor is the ratio between the maximum and the new frequency, \textcolor{blue}{$S = \frac{F_{max}}{F_{new}}$}. \medskip - - +\begin{frame}{Energy model for a homogeneous platform} + \vspace{-0.77cm} + \begin{figure} + \animategraphics[autopause,controls,scale=0.3,buttonsize=0.2cm]{10}{homo-model/a-}{0}{441} + %\includegraphics[width=0.6\textwidth]{homo-model/a-356} + \end{figure} - \begin{block}{\small Rauber and Rünger's energy model} - $ E = P_{d} \cdot S_1^{-2} \cdot - \left( T_1 + \sum_{i=2}^{N} \frac{T_i^3}{T_1^2} \right) + - P_{s} \cdot S_1 \cdot T_1 \cdot N$ - \end{block} - \textcolor{blue}{$S_1$}: the max. scaling factor\\ - \textcolor{blue}{$P_{d}$}: the dynamic power\\ - \textcolor{blue}{$P_{s}$}: the static power\\ - \textcolor{blue}{$T_I$}: the time of the slower task\\ - \textcolor{blue}{$T_i$}: the time of the other tasks\\ - \textcolor{blue}{$N$}: the number of nodes + % \begin{block}{\small Rauber and Rünger's energy model} + %$ E = P_{d} \cdot S_1^{-2} \cdot + %\left( T_1 + \sum_{i=2}^{N} \frac{T_i^3}{T_1^2} \right) + + % P_{s} \cdot S_1 \cdot T_1 \cdot N$ + %\end{block} + % \textcolor{blue}{$S_1$}: the maximum scaling factor.\\ + % \textcolor{blue}{$P_{d}$}: the dynamic power.\\ + % \textcolor{blue}{$P_{s}$}: the static power.\\ + % \textcolor{blue}{$T_I$}: the execution time of the slower task.\\ + % \textcolor{blue}{$T_i$}: the execution time of task i.\\ + % \textcolor{blue}{$N$}: the number of nodes. + + \end{frame} - - -%%%%%%%%%%%%%%%%%%%% -%% SLIDE 13 %% -%%%%%%%%%%%%%%%%%%%% -\begin{frame}{Performance evaluation of MPI programs} - \begin{femtoBlock}{} - \vspace{-5 mm} - \begin{block}{\small Execution time prediction model} - \centering{ $ \textcolor{red}{T_{new}} = \textcolor{blue}{T_{Max Comp Old} \cdot S + T_{{Min Comm Old}}}$} - \end{block} - \vspace{10 mm} - \centering{\includegraphics[width=.4\textwidth]{c1/cg_per} - \quad% - \includegraphics[width=.4\textwidth]{c1/lu_pre}} - \vspace{5 mm} - - \small The maximum normalized error for CG=0.0073 \textbf{(the smallest)} and LU=0.031 \textbf{(the worst)}. - \end{femtoBlock} -\end{frame} @@ -382,23 +429,23 @@ for a warehouse-sized computer. %%%%%%%%%%%%%%%%%%%% %% SLIDE 15 %% %%%%%%%%%%%%%%%%%%%% - \begin{frame}{Scaling factor selection algorithm} -\vspace{-0.75cm} - \begin{center} - \includegraphics[width=.56 \textwidth]{c1/algo-homo} - \end{center} + %\begin{frame}{Scaling factor selection algorithm} +%\vspace{-0.75cm} + % \begin{center} + %\includegraphics[width=.56 \textwidth]{c1/algo-homo} + %\end{center} -\end{frame} +%\end{frame} %%%%%%%%%%%%%%%%%%%% %% SLIDE 16 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{Scaling algorithm example} +\begin{frame}{Scaling factor selection algorithm} \vspace{-0.75cm} \begin{figure} - \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{dvfs-homo/a-}{0}{159} + \animategraphics[autopause,controls,scale=0.29,buttonsize=0.2cm]{10}{dvfs-homo/a-}{0}{335} %\includegraphics[width=0.6\textwidth]{dvfs-homo/a-159} \end{figure} \end{frame} @@ -406,14 +453,14 @@ for a warehouse-sized computer. %%%%%%%%%%%%%%%%%%%% %% SLIDE 17 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{Experimental results } +\begin{frame}{Experiment over SimGrid } \begin{femtoBlock}{} \begin{itemize} \small \item The experiments were executed on the simulator SimGrid/SMPI v3.10.\medskip \item The proposed algorithm was applied to the NAS parallel benchmarks.\medskip \item Each node in the cluster has 18 frequency values from \textbf{2.5$GHz$} to \textbf{800$MHz$}.\medskip - \item The proposed algorithm was evaluated over the A, B, C classes of the benchmarks using 4, 8 or 9 and 16 nodes respectively. \medskip + \item The proposed algorithm was evaluated over the A, B and C classes of the benchmarks using 4, 8 or 9 and 16 nodes respectively. \medskip \item $P_d=20W$, $P_s=4W$. \end{itemize} \end{femtoBlock} @@ -430,6 +477,8 @@ for a warehouse-sized computer. \includegraphics[width=.35\textwidth]{c1/cg} \includegraphics[width=.35\textwidth]{c1/bt}} +\hspace{0.5cm} + \centering {\includegraphics[width=.55\textwidth]{c1/results.pdf}} \end{femtoBlock} \end{frame} @@ -439,43 +488,46 @@ for a warehouse-sized computer. %% SLIDE 19 %% %%%%%%%%%%%%%%%%%%%% \begin{frame}{Results comparison} - \begin{block}{\small Rauber and Rünger's optimal scaling factor} - $S_{opt} = \sqrt[3]{\frac{2}{N} \cdot \frac{P_{dyn}}{P_{static}} \cdot - \left( 1 + \sum_{i=2}^{N} \frac{T_i^3}{T_1^3}\right) } $ - \end{block} - \centering { - %\includegraphics[width=.33\textwidth]{c1/c1.pdf} - %\qquad - %\includegraphics[width=.33\textwidth]{c1/c2.pdf}} - + \small \textcolor{blue}{Rauber and Rünger's scaling factor \textcolor{black}{ \tiny \textsuperscript{2}}} - \includegraphics[width=.55\textwidth]{c1/compare_c.pdf}} + \vspace{2 mm} + + $S_{opt} = \sqrt[3]{\frac{2}{N} \cdot \frac{P_{dyn}}{P_{static}} \cdot + \left( 1 + \sum_{i=2}^{N} \frac{T_i^3}{T_1^3}\right) } $ + + \begin{center} + \includegraphics[width=.55\textwidth]{c1/compare-c.pdf} + \end{center} + + +\vspace{-2 mm} + \tiny \textsuperscript{2} Thomas Rauber and Gudula Rünger. Analytical modeling and simulation of the energy consumption of independent tasks. In Proceedings of the Winter Simulation Conference, 2012. \end{frame} %%%%%%%%%%%%%%%%%%%% %% SLIDE 20 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{The proposed new energy model} - \vspace{-0.75cm} - \begin{figure} - \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{homo-model/a-}{0}{356} +%\begin{frame}{The proposed new energy model} + % \vspace{-0.75cm} + %\begin{figure} + % \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{homo-model/a-}{0}{356} %\includegraphics[width=0.6\textwidth]{homo-model/a-356} - \end{figure} -\end{frame} + % \end{figure} +%\end{frame} %%%%%%%%%%%%%%%%%%%% %% SLIDE 21 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{Comparing the new model with Rauber model } - \vspace{0.1cm} - \centering - \includegraphics[width=.45\textwidth]{c1/energy_con} +%\begin{frame}{\large Comparing the new model with Rauber's model } +% \vspace{0.1cm} +% \centering + %\includegraphics[width=.45\textwidth]{c1/energy_con} - \includegraphics[width=.5\textwidth]{c1/compare-scales} -\end{frame} + %\includegraphics[width=.5\textwidth]{c1/compare-scales} +%\end{frame} @@ -502,9 +554,9 @@ for a warehouse-sized computer. %%%%%%%%%%%%%%%%%%%% -\begin{frame}{Contribution} +\begin{frame}{The second contribution} -\section{\small {Energy optimization of heterogeneous platform}} +\section{\small {Energy optimization of a heterogeneous platform}} \begin{center} @@ -522,11 +574,11 @@ for a warehouse-sized computer. \begin{femtoBlock}{} \vspace{-12 mm} \begin{itemize} \small \item Proposing \textcolor{blue}{new energy and performance models} for message passing applications with iterations running - over a heterogeneous platform (cluster and Grid). \medskip + over a heterogeneous platform (cluster or Grid). \medskip \item Studying the effect of the scaling factor $S$ on both the \textcolor{blue}{energy consumption and the performance} of message passing iterative applications. \medskip - \item Computing the vector of scaling factors ($S_1, S_2, ..., S_n$) producing \textcolor{blue} {the optimal trade-off} between + \item Computing the vector of scaling factors ($S_1, S_2, ..., S_n$) producing \textcolor{blue} {the good trade-off} between the energy consumption and the performance. \end{itemize} @@ -561,27 +613,27 @@ for a warehouse-sized computer. %%%%%%%%%%%%%%%%%%%% %% SLIDE 25 %% %%%%%%%%%%%%%%%%%%%% - \begin{frame}{The energy consumption model} - The overall energy consumption of a message passing synchronous application executed over - a heterogeneous platform can be computed as follows: - \begin{multline} - \label{eq:energy} - \textcolor{red}{E} = \textcolor{blue}{\sum_{i=1}^{N} {(S_i^{-2} \cdot Pd_i \cdot Tcp_i)}} + {} \\ - \textcolor{blue}{\sum_{i=1}^{N} (Ps_i \cdot (\max_{i=1,2,\dots,N} (Tcp_i \cdot S_{i}) + {\min_{i=1,2,\dots,N} (Tcm_i))}} - \hspace{10 mm} - \end{multline} - \underline{where}:\\ - \textcolor{blue}{N} : is the number of nodes. -\end{frame} + %\begin{frame}{The energy consumption model} + % The overall energy consumption of a message passing synchronous application executed over + % a heterogeneous platform can be computed as follows: + % \begin{multline} + % \label{eq:energy} + % \textcolor{red}{E} = \textcolor{blue}{\sum_{i=1}^{N} {(S_i^{-2} \cdot Pd_i \cdot Tcp_i)}} + {} \\ + % \textcolor{blue}{\sum_{i=1}^{N} (Ps_i \cdot (\max_{i=1,2,\dots,N} (Tcp_i \cdot S_{i}) + {\min_{i=1,2,\dots,N} (Tcm_i))}} + % \hspace{10 mm} + % \end{multline} + % \underline{where}:\\ + % \textcolor{blue}{N} : is the number of nodes. +%\end{frame} %%%%%%%%%%%%%%%%%%%% %% SLIDE 26 %% %%%%%%%%%%%%%%%%%%%% - \begin{frame}{The energy model example for heter. cluster} - \vspace{-0.5cm} + \begin{frame}{The energy model for heterogeneous cluster} + \vspace{-0.77cm} \begin{figure} - \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{heter-model/a-}{0}{272} + \animategraphics[autopause,controls,scale=0.3,buttonsize=0.2cm]{10}{heter-model/a-}{0}{350} %\includegraphics[width=0.6\textwidth]{heter-model/a-272} \end{figure} \end{frame} @@ -617,22 +669,22 @@ for a warehouse-sized computer. %%%%%%%%%%%%%%%%%%%% %% SLIDE 28 %% %%%%%%%%%%%%%%%%%%%% - \begin{frame}{The scaling algorithm for heter. cluster} + %\begin{frame}{The scaling algorithm for heter. cluster} - \centering - \includegraphics[width=.52\textwidth]{algo-heter} - \end{frame} + %\centering + %\includegraphics[width=.52\textwidth]{algo-heter} + %\end{frame} %%%%%%%%%%%%%%%%%%%% %% SLIDE 29 %% %%%%%%%%%%%%%%%%%%%% - \begin{frame}{The scaling algorithm example} - \vspace{-0.5cm} + \begin{frame}{The scaling algorithm for heter. cluster} + \vspace{-0.77cm} \centering \begin{figure} - \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{dvfs-heter/a-}{0}{650} + \animategraphics[autopause,controls,scale=0.3,buttonsize=0.2cm]{10}{dvfs-heter/a-}{0}{836} % \includegraphics[width=0.6\textwidth]{dvfs-heter/a-650} \end{figure} \end{frame} @@ -643,85 +695,57 @@ for a warehouse-sized computer. %%%%%%%%%%%%%%%%%%%% %% SLIDE 30 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{Experiments over a heterogeneous cluster } - \begin{itemize} - \small - \item The experiments executed on the simulator SimGrid/SMPI v3.10.\medskip - \item The scaling algorithm was applied to the NAS parallel benchmarks class C.\medskip - \item Four types of processors with different computing powers were used.\medskip - \item We ran the benchmarks on different number of nodes ranging from 4 to 144 nodes.\medskip - \item The total power consumption of the chosen CPUs assumed to be composed of $80\%$ for the dynamic power and $20\%$ for the static power. - \medskip +%\begin{frame}{Experiments over a heterogeneous cluster } + % \begin{itemize} + % \small + % \item The experiments were executed on the simulator SimGrid/SMPI v3.10.\medskip + % \item The scaling algorithm was applied to the NAS parallel benchmarks class C.\medskip + % \item Four types of processors with different computing powers were used.\medskip + % \item The benchmarks were executed with different number of nodes ranging from 4 to 144 nodes.\medskip + % \item It was assumed that the total power consumption of the CPU consist of 80\% dynamic power and 20\% static power. + % \medskip - \end{itemize} + %\end{itemize} -\end{frame} +%\end{frame} %%%%%%%%%%%%%%%%%%%% %% SLIDE 31 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{The experimental results} - \vspace{-5 mm} - \begin{figure}[!t] - \centering - \includegraphics[width=0.8\textwidth]{c2/energy_saving.pdf} +%\begin{frame}{The simulation results} + % \vspace{-5 mm} + % \begin{figure}[!t] + %\centering + %\includegraphics[width=0.8\textwidth]{c2/energy_saving.pdf} - \textcolor{blue}{On average, it reduces the energy consumption by \textcolor{red}{29\%} - for the class C of the NAS Benchmarks executed over 8 nodes} + % \textcolor{blue}{On average, it reduces the energy consumption by \textcolor{red}{29\%} + %for the class C of the NAS Benchmarks executed over 8 nodes} - \end{figure} -\end{frame} + % \end{figure} +%\end{frame} %%%%%%%%%%%%%%%%%%%% %% SLIDE 32 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{The experimental results} - \vspace{-5 mm} - \begin{figure}[!t] - \centering +%\begin{frame}{The simulation results} + % \vspace{-5 mm} + % \begin{figure}[!t] + % \centering - \includegraphics[width=.8\textwidth]{c2/perf_degra.pdf} + % \includegraphics[width=.8\textwidth]{c2/perf_degra.pdf} - \textcolor{blue}{On average, it degrades by \textcolor{red}{3.8\%} the performance - of NAS Benchmarks class C executed over 8 nodes} - \end{figure} -\end{frame} + % \textcolor{blue}{On average, it degrades by \textcolor{red}{3.8\%} the performance + % of NAS Benchmarks class C executed over 8 nodes} + % \end{figure} +%\end{frame} -%%%%%%%%%%%%%%%%%%%% -%% SLIDE 33 %% -%%%%%%%%%%%%%%%%%%%% -\begin{frame}{The results of the three power scenarios} - \vspace{-5 mm} - \begin{figure}[!t] - \centering - \includegraphics[width=.55\textwidth]{c2/three_power.pdf} - \vspace{10 mm} - \includegraphics[width=.55\textwidth]{c2/three_scenarios.pdf} - \end{figure} -\end{frame} - -%%%%%%%%%%%%%%%%%%%% -%% SLIDE 34 %% -%%%%%%%%%%%%%%%%%%%% -\begin{frame}{Comparing the objective function to EDP} - - EDP is the products between the energy consumption and the delay. - \vspace{-5 mm} - \begin{figure}[!t] - \centering - \includegraphics[width=.55\textwidth]{c2/avg_compare.pdf} - - \includegraphics[width=.55\textwidth]{c2/compare_with_EDP.pdf} - \end{figure} -\end{frame} - @@ -741,10 +765,10 @@ for a warehouse-sized computer. %%%%%%%%%%%%%%%%%%%% %% SLIDE 36 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{The grid architecture} -\begin{center} -\includegraphics[width=.8\textwidth]{c2/init_freq.pdf} -\end{center} +%\begin{frame}{The grid architecture} +%\begin{center} +%\includegraphics[width=.8\textwidth]{c2/init_freq.pdf} +%\end{center} %\begin{frame}{Performance, Energy and trade-off models} \small %\begin{block}{\small The performance model of grid} @@ -776,7 +800,7 @@ for a warehouse-sized computer. % \end{block} - \end{frame} + %\end{frame} @@ -784,19 +808,21 @@ for a warehouse-sized computer. %% SLIDE 37 %% %%%%%%%%%%%%%%%%%%%% \begin{frame}{Experiments over Grid'5000} - \centering - + + \textcolor{blue}{The experiments were conducted using three + clusters distributed over one or two sites.} + \vspace{-7 mm} + \begin{center} \includegraphics[width=.5\textwidth]{c2/grid5000-2.pdf} - - \vspace{-3 mm} - \textcolor{blue}{Two experiments were conducted: over one site and two sites - each one with three clusters } - - \vspace{1mm} - + \end{center} + \vspace{-10 mm} + \textcolor{blue}{Grid'5000 power measurement tools were used.} + \vspace{-9 mm} + \begin{center} \includegraphics[width=.5\textwidth]{c2/power_consumption.pdf} + \end{center} - \textcolor{blue}{Grid'5000 power measurement tools were used} + \end{frame} @@ -810,7 +836,7 @@ for a warehouse-sized computer. \begin{minipage}{0.4\textwidth} %\textcolor{blue}{Execution the NAS class D on 16 nodes saves the energy by %\textcolor{red}{30\%}} - \textcolor{blue}{The energy saving = \textcolor{red}{30\%}} + \small \textcolor{blue}{The average energy saving = \textcolor{red}{30\%}} \end{minipage} \begin{minipage}{0.55\textwidth} \begin{figure}[h!] @@ -821,7 +847,7 @@ for a warehouse-sized computer. \begin{minipage}{0.4\textwidth} %\textcolor{blue}{Execution the NAS class D on 16 nodes degrades the %performance by \textcolor{red}{3.2\%}} - \textcolor{blue}{The performance degradation = \textcolor{red}{3.2\%}} + \small \textcolor{blue}{The average performance degradation = \textcolor{red}{3.2\%}} \end{minipage} \begin{minipage}{0.55\textwidth} \begin{figure}[h!] @@ -832,11 +858,32 @@ for a warehouse-sized computer. +%%%%%%%%%%%%%%%%%%%% +%% SLIDE 33 %% +%%%%%%%%%%%%%%%%%%%% +\begin{frame}{The results of the three power scenarios} + \vspace{-5 mm} + \begin{figure}[!t] + \centering + \includegraphics[width=.45\textwidth]{c2/eng_pow.eps} + \hspace{0.3cm} + \includegraphics[width=.45\textwidth]{c2/per_pow.eps} + \vspace{4 mm} + \includegraphics[width=.7\textwidth]{c2/three_scenarios.pdf} + \end{figure} +\end{frame} + + + + + + + %%%%%%%%%%%%%%%%%%%% %% SLIDE 39 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{Experiments over Grid'5000} - \textcolor{blue}{One core and Multi-cores per node results:} +\begin{frame}{One core and Multi-cores per node results} + %\textcolor{blue}{One core and Multi-cores per node results:} \begin{figure}[h!] \includegraphics[width=.48\textwidth]{c2/eng_s_mc.eps} @@ -848,7 +895,22 @@ for a warehouse-sized computer. \end{frame} - +%%%%%%%%%%%%%%%%%%%% +%% SLIDE 34 %% +%%%%%%%%%%%%%%%%%%%% +\begin{frame}{Comparing the objective function to EDP} + + EDP is the product between the energy consumption and the delay \tiny\textsuperscript{3}. + \vspace{-5 mm} + \begin{figure}[!t] + \centering + \includegraphics[width=.6\textwidth]{c2/edp_dist.eps} + + + \end{figure} + + \tiny \textsuperscript{3} Spiliopoulos et al, Green governors: A framework for continuously adaptive dvfs, in International Green Computing Conference and Workshops (IGCC), 2011. +\end{frame} %\begin{frame}{Summary} %\begin{itemize} % \small @@ -872,7 +934,7 @@ for a warehouse-sized computer. %%%%%%%%%%%%%%%%%%%% %% SLIDE 40 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{Contribution} +\begin{frame}{The third contribution} \section{\small {Energy optimization of asynchronous applications}} \begin{center} \bf \Large \textcolor{blue}{Energy optimization of asynchronous iterative message passing applications} @@ -888,7 +950,7 @@ for a warehouse-sized computer. \textcolor{blue}{The execution of a synchronous parallel iterative application over a grid } \vspace{-8 mm} \begin{figure} - \animategraphics[autopause,controls,scale=0.25,buttonsize=0.2cm]{10}{syn/a-}{0}{503} + \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{10}{syn/a-}{0}{647} %\includegraphics[width=0.6\textwidth]{syn/a-503} \end{figure} \end{frame} @@ -902,7 +964,7 @@ for a warehouse-sized computer. \textcolor{blue}{The execution of an asynchronous parallel iterative application over a grid } \vspace{-8 mm} \begin{figure} - \animategraphics[autopause,controls,scale=0.25,buttonsize=0.2cm]{10}{asyn/a-}{0}{440} + \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{10}{asyn/a-}{0}{556} %\includegraphics[width=0.6\textwidth]{asyn/a-440} \end{figure} \end{frame} @@ -916,7 +978,7 @@ for a warehouse-sized computer. \textcolor{blue}{Using asynchronous communications with DVFS } \vspace{-8 mm} \begin{figure} - \animategraphics[autopause,controls,scale=0.25,buttonsize=0.2cm]{10}{asyn+dvfs/a-}{0}{314} + \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{10}{asyn+dvfs/a-}{0}{344} %\includegraphics[width=0.6\textwidth]{asyn+dvfs/a-314} \end{figure} \end{frame} @@ -990,11 +1052,11 @@ for a warehouse-sized computer. %%%%%%%%%%%%%%%%%%%% %% SLIDE 46 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{The scaling algorithm for Asynch. applications} -\vspace{-0.1 mm} -\centering -\includegraphics[width=0.55\textwidth]{algo-hybrid.pdf} -\end{frame} +%\begin{frame}{The scaling algorithm for Asynch. applications} +%\vspace{-0.1 mm} +%\centering +%\includegraphics[width=0.55\textwidth]{algo-hybrid.pdf} +%\end{frame} @@ -1022,27 +1084,27 @@ for a warehouse-sized computer. %%%%%%%%%%%%%%%%%%%% %% SLIDE 48 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{The simulation results} -\centering \small \textcolor{blue}{The best scenario in terms of energy and performance is the Async. MS with Sync. DVFS} +%\begin{frame}{The simulation results} +%\centering \small \textcolor{blue}{The best scenario in terms of energy and performance is %the Async. MS with Sync. DVFS} -\centering - \includegraphics[scale=0.42]{c3/energy_saving.eps} +%\centering + % \includegraphics[scale=0.42]{c3/energy_saving.eps} - \centering The average of energy saving = \textcolor{red}{22\%} -\end{frame} + %\centering The average energy saving = \textcolor{red}{22\%} +%\end{frame} %%%%%%%%%%%%%%%%%%%% %% SLIDE 49 %% %%%%%%%%%%%%%%%%%%%% -\begin{frame}{The simulation results} -\centering +%\begin{frame}{The simulation results} +%\centering - \includegraphics[scale=0.42]{c3/perf_degra.eps} + % \includegraphics[scale=0.42]{c3/perf_degra.eps} - \centering The average speed-up = \textcolor{red}{5.72\%} -\end{frame} +%\centering The average speed-up = \textcolor{red}{5.72\%} +%\end{frame} @@ -1050,7 +1112,7 @@ for a warehouse-sized computer. %% SLIDE 50 %% %%%%%%%%%%%%%%%%%%%% \begin{frame}{The Grid'5000 results} - \vspace{-20 mm} + \vspace{-10 mm} \begin{figure}[!t] \centering \hspace{-8 mm} @@ -1058,8 +1120,11 @@ for a warehouse-sized computer. \includegraphics[width=0.53\textwidth]{c3/perf-deg-compare.eps} \end{figure} \vspace{-5 mm} - \centering -The energy saving = \textcolor{red}{26.93\%}, the average speed-up = \textcolor{red}{21.48\%} + \centering \footnotesize + + %\small \textcolor{blue}{The best scenario in terms of energy and performance is the Async. MS with Sync. DVFS} + +The average energy saving = \textcolor{red}{26.93\%}, the average speed-up = \textcolor{red}{21.48\%} \end{frame} @@ -1083,8 +1148,8 @@ The energy saving = \textcolor{red}{26.93\%}, the average speed-up = \textcolor \section{Conclusions and Perspectives} \begin{itemize} -\small \barrow Three \textcolor{blue}{ new energy consumption and performance} models were proposed for synchronous and asynchronous parallel applications with iterations running over -\textcolor{blue}{homogeneous and heterogeneous clusters and grids}. +\small \barrow Three \textcolor{blue}{ new energy consumption and performance} models were proposed for synchronous or asynchronous parallel applications with iterations running over +\textcolor{blue}{homogeneous and heterogeneous clusters or grids}. @@ -1095,9 +1160,9 @@ The energy saving = \textcolor{red}{26.93\%}, the average speed-up = \textcolor \small \barrow The proposed algorithms were applied to the \textcolor{blue}{NAS parallel benchmarks} and \textcolor{blue}{the Multi-splitting} method. -\small \barrow The proposed algorithms were evaluated over the \textcolor{blue}{SimGrid simulator} and over \textcolor{blue}{Grid'5000 testbed}. +\small \barrow The proposed algorithms were evaluated over the \textcolor{blue}{SimGrid simulator} and over the \textcolor{blue}{Grid'5000 testbed}. -\small \barrow All the proposed methods were compared to either \textcolor{blue}{Rauber and Rünger's method} or \textcolor{blue}{the EDP objective function}. +\small \barrow All the proposed methods were compared to either \textcolor{blue}{Rauber and Rünger's method} or to the \textcolor{blue}{EDP objective function}. \end{itemize} @@ -1117,7 +1182,7 @@ Multi-splitting} method. Science}, 2016. \item Ahmed Fanfakh, Jean-Claude Charr, Raphaël Couturier, Arnaud Giersch. Energy Consumption Reduction for - Asynchronous Message Passing Applications. \textit{Journal of Supercomputing}, 2016, (Submitted) + Asynchronous Message Passing Applications. \textit{Journal of Supercomputing}, 2016, (Accepted with minor revisions) \end{enumerate} \end{block} @@ -1160,6 +1225,9 @@ Multi-splitting} method. \small \barrow The proposed algorithms for heterogeneous platforms should be applied to heterogeneous platforms composed of \textcolor{blue}{CPUs and GPUs}. \small \barrow Comparing the results returned by the energy models to the values given by \textcolor{blue}{real instruments that measure the energy consumptions} of CPUs during the execution time. +\small \barrow Considering the power consumed by the other devices in the node such as +\textcolor{blue}{the memory and the hard drive} in the energy consumption model. + \end{itemize} \end{frame} @@ -1169,7 +1237,7 @@ Multi-splitting} method. %%%%%%%%%%%%%%%%%%%% \begin{frame}{Fin} \vspace{-10 mm} - \centering \Large \textcolor{blue}{Thank you for your listening} + \centering \Large \textcolor{blue}{Thank you for your attention} \vspace{2cm} \centering \textcolor{blue}{ {\Large Questions?}}