% Beamer presentation on selecting a DVFS frequency scaling factor that
% trades off energy consumption against performance for parallel MPI programs.
%
% Layout of this file:
%   1. Preamble (packages, title metadata)
%   2. Title and outline frames
%   3. One \section per topic, each holding one or more frames
%
% NOTE: "%" comments run to the end of the physical line, so every comment
% below sits on its own line (or at the very end of one).  Joining lines
% would silently comment out the markup that follows.

\documentclass{beamer}

% ---------------------------------------------------------------------------
% Preamble
% ---------------------------------------------------------------------------
\usepackage{beamerthemefemto}
% Accented letters in this file are written with accent macros (\"e, \"u),
% so the source is pure ASCII and does not depend on the declared encoding.
\usepackage[latin1]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{algorithm,algorithmicx,algpseudocode}
\usepackage{graphicx,graphics}
% Declared after graphicx so the command is guaranteed to be defined.
\DeclareGraphicsExtensions{.jpg,.png,.pdf,.bmp,.pdftex}
\usepackage{subfig}
\usepackage{listings}
\usepackage{colortbl}
\usepackage{amsmath}
\usepackage{xspace}
\usepackage[textsize=footnotesize]{todonotes}

\title{Optimal Dynamic Frequency Scaling for Energy - Performance of Parallel MPI Programs}
\author{Jean-Claude Charr, Rapha\"el Couturier, Ahmed Fanfakh and Arnaud Giersch}
\institute[DISC Department - AND Team]{FEMTO-ST - DISC Department - AND Team}
\date{August 29th,~2014}

%  ____  _____ ____  _   _ _____
% |  _ \| ____| __ )| | | |_   _|
% | | | |  _| |  _ \| | | | | |
% | |_| | |___| |_) | |_| | | |
% |____/|_____|____/ \___/  |_|
%
% (DEBUT = start of the document body)

\begin{document}

% Title page uses the theme's title background.
\setbeamertemplate{background}{\titrefemto}

\begin{frame}[plain]
  \titlepage
\end{frame}

% All remaining frames use the theme's regular page background.
\setbeamertemplate{background}{\pagefemto}

\begin{frame}{Outline}
  \setbeamertemplate{section in toc}[sections numbered]
  \tableofcontents
\end{frame}

% ---------------------------------------------------------------------------
\section{Definitions and objectives }
% ---------------------------------------------------------------------------

\begin{frame}{Definitions}
  \begin{femtoBlock} {}
    % The text inside the femtoBlock goes here.
    \begin{itemize}
      \small
      \item Modern processors provide the
        \textbf{Dynamic Voltage and Frequency Scaling (DVFS)} technique.
        \medskip
      \item DVFS is used to reduce the frequency and thus to
        \textbf{reduce the energy consumption} by a CPU while computing.\medskip
      \item Energy consumption by an \textbf{individual processor} of a
        synchronous parallel program:
        $E_{ind} = P_{dyn} \cdot T_{Comp} + P_{static} \cdot (T_{Comp}+T_{Comm})$.\medskip
      \item The frequency scaling factor is the ratio between the maximum and
        the new frequency, $S = \frac{F_{max}}{F_{new}}$.
        \medskip
    \end{itemize}
  \end{femtoBlock}
\end{frame}

\begin{frame}{Objectives}
  \begin{femtoBlock}{}
    \vspace{-12 mm}
    \begin{itemize}
      \small
      \item Study the effect of the scaling factor $S$ on
        \textbf{energy consumption} of parallel iterative applications such
        as NAS Benchmarks.
        \includegraphics[width=.06\textwidth]{fig/nasa.pdf} \medskip
      \item Study the effect of the scaling factor $S$ on
        \textbf{performance} of these benchmarks.\medskip
      \item Discovering the \textbf{energy-performance trade-off relation}
        when changing the frequency.\medskip
      \item We propose an algorithm for selecting the scaling factor $S$
        producing the \textbf{optimal trade-off} between the energy and
        performance. \medskip
      \item Improving Rauber and R\"unger's\footnote{\tiny Thomas Rauber and Gudula R\"unger. Analytical modeling and simulation of the energy consumption \\ \quad ~ ~\quad of independent tasks. In Proceedings of the Winter Simulation Conference, 2012.}
        method that our method is based on.
    \end{itemize}
    % Empty, unnumbered footnote kept from the original layout.
    \let\thefootnote\relax\footnote{}
    \vspace{-10 mm}
  \end{femtoBlock}
\end{frame}

% ---------------------------------------------------------------------------
\section{Energy and performance models}
% ---------------------------------------------------------------------------

\begin{frame}{Energy model for homogeneous platform}
  \begin{femtoBlock}{}\small
    The dynamic power is \textbf{exponentially} related to the scaling
    factor $S$ and the static consumed energy is \textbf{linearly} related
    to this factor.
    \begin{block}{\small Rauber and R\"unger's energy model}
      $ E = P_{dyn} \cdot S_1^{-2} \cdot
        \left( T_1 + \sum_{i=2}^{N} \frac{T_i^3}{T_1^2} \right)
        + P_{static} \cdot S_1 \cdot T_1 \cdot N$
    \end{block}
    % T_1 (digit one) matches the symbol used in both formulas of this frame.
    $S_1$: is the max.\ scaling factor,
    $T_1$: is the time of the slowest task,
    $T_i$: is the time of the other tasks and
    $N$: is the number of nodes.
    \begin{block}{\small Rauber and R\"unger's optimal scaling factor}
      $S_{opt} = \sqrt[3]{\frac{2}{N} \cdot \frac{P_{dyn}}{P_{static}} \cdot
        \left( 1 + \sum_{i=2}^{N} \frac{T_i^3}{T_1^3}\right) } $
    \end{block}
    They reduce degradation of the performance by
    \textbf{setting the highest frequency to the slowest task}.
  \end{femtoBlock}
\end{frame}

\begin{frame}{Performance evaluation of MPI programs}
  \begin{femtoBlock}{}
    \vspace{-5 mm}
    \begin{block}{\small Execution time prediction model}
      % \centering is a declaration (it takes no argument); it applies to
      % the rest of the enclosing block.
      \centering
      $ T_{new} = T_{Max Comp Old} \cdot S + T_{{Max Comm Old}}$
    \end{block}
    \vspace{10 mm}
    % Applies to the rest of the femtoBlock, including the sentence below.
    \centering
    \includegraphics[width=.35\textwidth]{fig/cg_per}
    \quad%
    \includegraphics[width=.35\textwidth]{fig/lu_pre}
    \vspace{5 mm}
    \small
    The maximum normalized error for CG=0.0073 \textbf{(the smallest)} and
    LU=0.031 \textbf{(the worst)}.
  \end{femtoBlock}
\end{frame}

% ---------------------------------------------------------------------------
\section{Performance and energy reduction trade-off}
% ---------------------------------------------------------------------------

\begin{frame}{Performance and energy reduction trade-off}
  \begin{femtoBlock}{}
    \vspace{-15 mm}
    \begin{figure}
      \centering
      \subfloat[\small Real relation.]{%
        \includegraphics[width=.4\textwidth]{fig/file3}\label{fig:r2}}
      \quad%
      \subfloat[\small Converted relation.]{%
        \includegraphics[width=.4\textwidth]{fig/file}\label{fig:r1}}%
      \label{fig:rel}
      % Caption intentionally disabled:
      % \caption{The energy and performance relation}
    \end{figure}
    % Words inside math are set with \text so they are not typeset as a
    % product of single-letter variables.
    $\text{Performance}=\frac{1}{\text{execution time}}$
    \small
    \begin{block}{\small Our objective function}
      \centering
      $\textbf{\emph {MaxDist}} = \max_{j=1,2,\dots ,F}
        (\overbrace{P_{Norm}(S_j)}^{{Maximize}} -
         \overbrace{E_{Norm}(S_j)}^{{Minimize}} )$
    \end{block}
  \end{femtoBlock}
\end{frame}

% ---------------------------------------------------------------------------
\section{Experimental results and comparison}
% ---------------------------------------------------------------------------

\begin{frame}{Experimental results }
  \begin{femtoBlock}{}
    \begin{itemize}
      \small
      \item Our experiments are executed on the simulator SimGrid/SMPI v3.10.\medskip
      \item Our algorithm is applied to NAS parallel benchmarks.\medskip
      \item Each node in the cluster has 18 frequency values from
        \textbf{2.5~GHz} to \textbf{800~MHz}.\medskip
      \item We run the classes A, B and C on 4, 8 or 9 and 16 nodes respectively.\medskip
      \item The dynamic power with the highest frequency is equal to
        \textbf{20~W} and the static power is equal to \textbf{4~W}.
    \end{itemize}
  \end{femtoBlock}
\end{frame}

\begin{frame}{Experimental results}
  \begin{femtoBlock}{}
    \centering
    \includegraphics[width=.35\textwidth]{fig/ep}
    \includegraphics[width=.35\textwidth]{fig/cg}
    \includegraphics[width=.35\textwidth]{fig/bt}
    \includegraphics[width=.55\textwidth]{fig/results.pdf}
  \end{femtoBlock}
\end{frame}

\begin{frame}{Results comparison}
  \begin{femtoBlock}{}
    \centering
    \includegraphics[width=.33\textwidth]{fig/c1.pdf}
    \qquad
    \includegraphics[width=.33\textwidth]{fig/c2.pdf}
    \includegraphics[width=.45\textwidth]{fig/compare_c.pdf}
  \end{femtoBlock}
\end{frame}

% ---------------------------------------------------------------------------
\section{Conclusions}
% ---------------------------------------------------------------------------

\begin{frame}{Conclusions}
  \begin{femtoBlock}{}
    \begin{itemize}
      \small
      \item We have presented a new online scaling factor selection method
        that \textbf{optimizes simultaneously the energy and performance}.\medskip
      \item It predicts \textbf{the energy consumption and the performance}
        of the parallel applications. \medskip
      \item Our algorithm \textbf{saves more energy} when the communication
        and the other slack times are big. \medskip
      \item It gives the
        \textbf{best trade-off between energy reduction and performance}.
        \medskip
      \item Our method \textbf{outperforms Rauber and R\"unger's method} in
        terms of energy-performance ratio.
    \end{itemize}
  \end{femtoBlock}
\end{frame}

\begin{frame}{Thanks for Listening}
  \vspace{-10 mm}
  \begin{femtoBlock}{}
    \begin{block}{\small Appeared}
      This work has appeared in ISPA conference proceedings, 26--28 August 2014
    \end{block}
    \medskip
    \medskip
    \medskip
    \medskip
    \centering
    {\Large Questions?}
  \end{femtoBlock}
\end{frame}

\end{document}

%  _____ ___ _   _
% |  ___|_ _| \ | |
% | |_   | ||  \| |
% |  _|  | || |\  |
% |_|   |___|_| \_|
%
% (FIN = end of the document)