2 \usepackage{beamerthemefemto}
3 \usepackage[latin1]{inputenc}
4 \usepackage[T1]{fontenc}
5 \DeclareGraphicsExtensions{.jpg, .png , .pdf, .bmp, .pdftex}
6 \usepackage{algorithm,algorithmicx,algpseudocode}
7 \usepackage{graphicx,graphics}
16 \newcommand{\AG}[2][inline]{% Review note labelled "AG:"; #1 = extra \todo options (default: inline), #2 = note text
17 \todo[color=green!50,#1]{\sffamily\textbf{AG:} #2}\xspace} % passes color=green!50 plus #1 to \todo; the note is followed by \xspace
18 \newcommand{\JC}[2][inline]{% Review note labelled "JC:"; #1 = extra \todo options (default: inline), #2 = note text
19 \todo[color=red!10,#1]{\sffamily\textbf{JC:} #2}\xspace} % passes color=red!10 plus #1 to \todo; the note is followed by \xspace
20 \definecolor{myblue}{RGB}{0,29,119}
21 \newcommand{\Xsub}[2]{{\ensuremath{#1_{\mathit{#2}}}}} % #1 with the word #2 as an italic subscript, usable in text or math (\ensuremath); the subscript is braced so it does not depend on \mathit expanding to a group
23 %% Used to put some subscripts lower, and make them more legible: a non-empty #1 is preceded by a zero-width rule of height 1.52ex, an empty #1 produces nothing. Emptiness is tested with \detokenize so that arguments starting with two identical tokens (e.g. "ii" or "11") are not mistaken for empty.
24 \newcommand{\fxheight}[1]{\if\relax\detokenize{#1}\relax\else\rule{0pt}{1.52ex}#1\fi}
26 \newcommand{\CL}{\Xsub{C}{L}} % Notation shorthands: most macros below typeset a base letter with an italic-word subscript via \Xsub
27 \newcommand{\Dist}{\mathit{Dist}} % bare \mathit, no \ensuremath wrapper
28 \newcommand{\EdNew}{\Xsub{E}{dNew}}
29 \newcommand{\Eind}{\Xsub{E}{ind}}
30 \newcommand{\Enorm}{\Xsub{E}{Norm}}
31 \newcommand{\Eoriginal}{\Xsub{E}{Original}}
32 \newcommand{\Ereduced}{\Xsub{E}{Reduced}}
33 \newcommand{\Es}{\Xsub{E}{S}}
34 \newcommand{\Fdiff}[1][]{\Xsub{F}{diff}_{\!#1}} % optional #1 becomes a second subscript, preceded by \!
35 \newcommand{\Fmax}[1][]{\Xsub{F}{max}_{\fxheight{#1}}} % optional #1 becomes a second subscript, passed through \fxheight
36 \newcommand{\Fnew}{\Xsub{F}{new}}
37 \newcommand{\Vnew}{\Xsub{V}{new}}
38 \newcommand{\Vmax}{\Xsub{V}{max}}
39 \newcommand{\Ileak}{\Xsub{I}{leak}}
40 \newcommand{\Kdesign}{\Xsub{K}{design}}
41 \newcommand{\MaxDist}{\mathit{Max}\Dist} % italic "Max" immediately followed by \Dist
42 \newcommand{\MinTcm}{\mathit{Min}\Tcm} % \Tcm is defined further down; that is fine because it is only expanded at use time
43 \newcommand{\Ntrans}{\Xsub{N}{trans}}
44 \newcommand{\Pd}[1][]{\Xsub{P}{d}_{\fxheight{#1}}}
45 \newcommand{\PdNew}{\Xsub{P}{dNew}}
47 \newcommand{\PdOld}{\Xsub{P}{dOld}}
48 \newcommand{\Pnorm}{\Xsub{P}{Norm}}
49 \newcommand{\Tnorm}{\Xsub{T}{Norm}}
50 \newcommand{\Ps}[1][]{\Xsub{P}{s}_{\fxheight{#1}}}
51 \newcommand{\Scp}[1][]{\Xsub{S}{cp}_{#1}} % optional #1 becomes a second subscript, used as-is (no \fxheight)
52 \newcommand{\Sopt}[1][]{\Xsub{S}{opt}_{#1}}
53 \newcommand{\Tcm}[1][]{\Xsub{T}{cm}_{\fxheight{#1}}}
54 \newcommand{\Tcp}[1][]{\Xsub{T}{cp}_{#1}}
55 \newcommand{\TcpOld}[1][]{\Xsub{T}{cpOld}_{#1}}
56 \newcommand{\Tnew}{\Xsub{T}{New}}
57 \newcommand{\Told}{\Xsub{T}{Old}}
58 \newcommand{\Ltcm}[1][]{\Xsub{L}{tcm}_{\fxheight{#1}}}
59 \newcommand{\Etcm}[1][]{\Xsub{E}{tcm}_{\fxheight{#1}}}
60 \newcommand{\Niter}[1][]{\Xsub{N}{iter}_{\fxheight{#1}}}
61 \newcommand{\Pmax}[1][]{\Xsub{P}{max}_{\fxheight{#1}}}
62 \newcommand{\Pidle}[1][]{\Xsub{P}{idle}_{\fxheight{#1}}}
65 \definecolor{myblue}{RGB}{0,29,119}
66 \usepackage[textsize=footnotesize]{todonotes}
67 \newcommand{\bsquare}{\item[\color{myblue}\ding{110}]} % list item whose label is \ding{110} coloured myblue
68 \newcommand{\barrow}{\item[\color{myblue}\ding{228}]} % list item whose label is \ding{228} coloured myblue
69 \newcommand{\bwarrow}{\item[\color{myblue}\ding{227}]} % list item whose label is \ding{227} coloured myblue
70 \DeclareGraphicsExtensions{.jpg, .png , .pdf, .bmp, .pdftex}
74 %\title{Energy Consumption Optimization of Parallel Applications with
75 %Iterations using CPU Frequency Scaling}
78 \title{ \textbf{Energy Consumption Optimization of Parallel Applications with Iterations using CPU Frequency Scaling} \\ \vspace{0.2cm} \hspace{1.8cm}\textbf{\textcolor{cyan}{\small PhD Dissertation Defense}}}\vspace{-0.5cm}
79 \author{ \textbf{Ahmed Badri Muslim Fanfakh} \\ \vspace{0.5cm}\small Under the supervision of: \\ \textcolor{cyan}{\small Raphaël COUTURIER and Jean-Claude CHARR} \\\vspace{0.1cm} \textcolor{blue}{ UBFC - FEMTO-ST - DISC Dept. - AND Team} \\ ~~~~~~~~~~~~~~~~~~~~~ \textbf{\textcolor{blue}{ 17 October 2016 }}}
83 % ____ _____ ____ _ _ _____
84 % | _ \| ____| __ )| | | |_ _|
85 % | | | | _| | _ \| | | | | |
86 % | |_| | |___| |_) | |_| | | |
87 % |____/|_____|____/ \___/ |_|
90 \setbeamertemplate{background}{\titrefemto}
105 \setbeamertemplate{background}{\pagefemto}
106 \begin{frame}{Outline}
108 \setbeamertemplate{section in toc}[sections numbered]
115 \begin{frame}{Definition of parallel computing}
116 \section{\small {Introduction and Problem definition}}
118 \includegraphics[width=0.99\textwidth]{para.pdf}
123 \begin{frame}{Execution of synchronous parallel tasks}
127 \subfloat[Synchronous imbalanced communications]{%
128 \includegraphics[scale=0.49]{c1/commtasks}\label{fig:h1}}
129 \subfloat[Synchronous imbalanced computations]{%
130 \includegraphics[scale=0.49]{c1/compt}\label{fig:h2}}
131 % \caption{Parallel tasks on homogeneous platform}
143 \begin{frame}{\large Synchronous and asynchronous iterative methods }
147 \includegraphics[scale=0.42]{syn_tasks.pdf}
149 \includegraphics[scale=0.42]{Asyn_tasks.pdf}
158 \begin{frame}{Approaches to get more computing power}
160 %\bf \textcolor{blue}{}
161 \begin{minipage}{0.5\textwidth}
162 \textcolor{blue}{1)} \small \bf \textcolor{black}{Increase the frequency of a processor.\\ (limited due to overheating)}
164 \begin{minipage}{0.6\textwidth}
168 \includegraphics[width=0.7\textwidth]{fig/freq-years}
172 \begin{minipage}{0.5\textwidth}
173 \textcolor{blue}{2)} \small \bf \textcolor{black}{Increase the number of computing
176 \textcolor{black}{The supercomputer Tianhe-2 has more than 3 million cores and consumes around 17.8 megawatts.}
179 \begin{minipage}{0.6\textwidth}
181 \includegraphics[width=0.7\textwidth]{fig/clusters}
191 \begin{frame}{Techniques for energy consumption reduction}
193 \textcolor{blue}{1)} \bf \textcolor{black}{Switch-off idle nodes method}
196 \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{200}{on-off/a-}{0}{111}
197 %\includegraphics[width=0.6\textwidth]{on-off/a-69}
204 \begin{frame}{Techniques for energy consumption reduction}
206 \textcolor{blue}{2)} \bf \textcolor{black}{Dynamic Voltage and Frequency Scaling (DVFS)}
209 \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{10}{DVFS-meq/a-}{0}{175}
210 %\includegraphics[width=0.6\textwidth]{DVFS-meq/a-109}
220 \begin{frame}{Motivations}
222 \section{\small {Motivations}}
223 \textcolor{blue}{Why we used the DVFS method:}
225 \begin{minipage}{0.5\textwidth}
228 \item \small \textcolor{black}{ The CPU is the component that consumes the highest amount of energy in a node \textsuperscript{1}. }
233 \begin{minipage}{0.5\textwidth}
236 \includegraphics[width=0.85\textwidth]{fig/node-power}
241 \begin{itemize} \item \small \textcolor{black}{DVFS reduces the energy consumption while
242 keeping all the nodes working.}
243 \item \small \textcolor{black}{It has a very small overhead compared to switching off the idle nodes.} \end{itemize}
247 \begin{block}{\textcolor{white}{Challenge and Objective}}
249 \small \textcolor{blue}{Challenge:} \textcolor{black}{DVFS is used to reduce the energy consumption, \textcolor{blue}{but} it also degrades the performance of the CPU.}
252 \small \textcolor{blue}{Objective:} \textcolor{black}{Applying the DVFS to minimize the energy consumption while maintaining the performance of the parallel application.}
255 \tiny \textsuperscript{1} Fan, X., Weber, W., and Barroso, L. A. 2007. Power provisioning
256 for a warehouse-sized computer.
267 \begin{frame}{The first contribution}
269 \section{\small {Energy optimization of a homogeneous platform}}
271 % \includegraphics[width=0.6\textwidth]{white.pdf}
274 \bf \Large \textcolor{blue}{Energy optimization of a parallel application with iterations running over a homogeneous platform}
284 \begin{frame}{Objectives}
286 \begin{itemize} \small \justifying
288 \item Studying the effect of the frequency scaling on the \textbf{energy consumption and performance } of parallel applications with iterations. \medskip
290 \item Discovering the \textbf{energy-performance trade-off relation} when changing the frequency of the processor.\medskip
291 \item Proposing an algorithm for selecting the scaling factor that produces \textbf{a good trade-off} between the energy consumption and the performance. \medskip
292 \item Comparing the proposed algorithm to existing methods.
295 %\footnote{\tiny Thomas Rauber and Gudula Rünger. Analytical modeling and simulation of the
296 %energy consumption \\ \quad ~ ~\quad of independent tasks. In Proceedings of the Winter Simulation Conference, 2012.} method that our method best on.
298 %\let\thefootnote\relax\footnote{}
309 \begin{frame}{Performance evaluation of MPI programs}
311 \small The frequency scaling factor is the ratio between the maximum and the new frequency, \textcolor{blue}{$S = \frac{F_{\mathit{max}}}{F_{\mathit{new}}}$}.
316 \begin{block}{\small Execution time prediction model}
317 \centering{ $ \textcolor{red}{T_{\mathit{new}}} = \textcolor{blue}{T_{\mathit{MaxCompOld}} \cdot S + T_{\mathit{MinCommOld}}}$}
320 \centering{\includegraphics[width=.4\textwidth]{c1/cg_per}
322 \includegraphics[width=.4\textwidth]{c1/lu_pre}}
325 \small The maximum normalized error ranges from 0.0073 for CG \textbf{(the smallest)} to 0.031 for LU \textbf{(the largest)}.
345 \begin{frame}{Energy model for a homogeneous platform}
346 The power consumed by a processor is divided into two power metrics: the dynamic (\textcolor{red}{$P_d$}) and the static
347 (\textcolor{red}{$P_s$}) powers.
350 \textcolor{red}{ P_d} = \textcolor{blue}{\alpha \cdot \CL \cdot V^2 \cdot F}
352 \scriptsize \underline{Where}: \\
353 \scriptsize {\textcolor{blue}{$\alpha$}: switching activity. \hspace{15 mm} \textcolor{blue}{$\CL$}: load capacitance [F].\\
354 \textcolor{blue}{$V$}: the supply voltage [V]. \hspace{8 mm} \textcolor{blue}{$F$}: operational frequency [Hz].}
357 \small \textcolor{red}{P_s} = \textcolor{blue}{V \cdot N_{trans} \cdot K_{design} \cdot I_{leak}}
360 \scriptsize{ \textcolor{blue}{$V$}: the supply voltage [V]. \hspace{19 mm} \textcolor{blue}{$N_{trans}$}: number of transistors. \\
361 \textcolor{blue}{$K_{design}$}: design dependent parameter. \hspace{3 mm} \textcolor{blue}{$I_{leak}$}: technology dependent
373 \begin{frame}{Energy model for a homogeneous platform}
376 \animategraphics[autopause,controls,scale=0.3,buttonsize=0.2cm]{10}{homo-model/a-}{0}{441}
377 %\includegraphics[width=0.6\textwidth]{homo-model/a-356}
380 % \begin{block}{\small Rauber and Rünger's energy model}
381 %$ E = P_{d} \cdot S_1^{-2} \cdot
382 %\left( T_1 + \sum_{i=2}^{N} \frac{T_i^3}{T_1^2} \right) +
383 % P_{s} \cdot S_1 \cdot T_1 \cdot N$
385 % \textcolor{blue}{$S_1$}: the maximum scaling factor.\\
386 % \textcolor{blue}{$P_{d}$}: the dynamic power.\\
387 % \textcolor{blue}{$P_{s}$}: the static power.\\
388 % \textcolor{blue}{$T_I$}: the execution time of the slower task.\\
389 % \textcolor{blue}{$T_i$}: the execution time of task i.\\
390 % \textcolor{blue}{$N$}: the number of nodes.
402 \begin{frame}{Performance and energy reduction trade-off}
403 \begin{femtoBlock}{} \vspace{-15 mm}
406 \subfloat[\small Real relation.]{%
407 \includegraphics[width=.43\textwidth]{c1/file3}\label{fig:r2}}
409 \subfloat[\small Converted relation.]{%
410 \includegraphics[width=.43\textwidth]{c1/file}\label{fig:r1}}%
412 % \caption{The energy and performance relation}
415 Where:~~~ $\textcolor{blue}{Performance} = execution~time^{-1}$
419 \begin{block}{\small Our objective function}
420 \centering{$\textbf{\emph {\textcolor{red}{MaxDist}}} = \max_{j=1,2,\dots ,F}
421 (\overbrace{P_{Norm}(S_j)}^{{\textcolor{blue}{Maximize}}} -
422 \overbrace{E_{Norm}(S_j)}^{{\textcolor{blue}{Minimize}}} )$}
432 %\begin{frame}{Scaling factor selection algorithm}
435 %\includegraphics[width=.56 \textwidth]{c1/algo-homo}
444 \begin{frame}{Scaling factor selection algorithm}
448 \animategraphics[autopause,controls,scale=0.29,buttonsize=0.2cm]{10}{dvfs-homo/a-}{0}{335}
449 %\includegraphics[width=0.6\textwidth]{dvfs-homo/a-159}
456 \begin{frame}{Experiments over SimGrid}
460 \item The experiments were executed on the simulator SimGrid/SMPI v3.10.\medskip
461 \item The proposed algorithm was applied to the NAS parallel benchmarks.\medskip
462 \item Each node in the cluster has 18 frequency values from \textbf{2.5~GHz} to \textbf{800~MHz}.\medskip
463 \item The proposed algorithm was evaluated over the A, B and C classes of the benchmarks using 4, 8 or 9 and 16 nodes respectively. \medskip
464 \item $P_d = 20$~W, $P_s = 4$~W.
473 \begin{frame}{Experimental results}
476 \includegraphics[width=.35\textwidth]{c1/ep}
477 \includegraphics[width=.35\textwidth]{c1/cg}
478 \includegraphics[width=.35\textwidth]{c1/bt}}
482 \centering {\includegraphics[width=.55\textwidth]{c1/results.pdf}}
490 \begin{frame}{Results comparison}
491 \small \textcolor{blue}{Rauber and Rünger's scaling factor \textcolor{black}{ \tiny \textsuperscript{2}}}
495 $S_{opt} = \sqrt[3]{\frac{2}{N} \cdot \frac{P_{dyn}}{P_{static}} \cdot
496 \left( 1 + \sum_{i=2}^{N} \frac{T_i^3}{T_1^3}\right) } $
500 \includegraphics[width=.55\textwidth]{c1/compare-c.pdf}
505 \tiny \textsuperscript{2} Thomas Rauber and Gudula Rünger. Analytical modeling and simulation of the energy consumption of independent tasks. In Proceedings of the Winter Simulation Conference, 2012.
512 %\begin{frame}{The proposed new energy model}
515 % \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{homo-model/a-}{0}{356}
516 %\includegraphics[width=0.6\textwidth]{homo-model/a-356}
524 %\begin{frame}{\large Comparing the new model with Rauber's model }
527 %\includegraphics[width=.45\textwidth]{c1/energy_con}
529 %\includegraphics[width=.5\textwidth]{c1/compare-scales}
535 % \begin{frame}{Summary}
536 % \begin{femtoBlock}{}
539 %\item We have presented a new online scaling factor selection method that \textcolor{blue}{optimizes simultaneously the energy and performance}.\medskip
540 % \item It predicts \textcolor{blue}{ the energy consumption and the performance} of the parallel applications. \medskip
541 %\item Our algorithm \textcolor{blue}{saves more energy} when the communication and the other slacks times are big. \medskip
542 %\item It gives the \textcolor{blue}{best trade-off between energy reduction and
543 % performance}. \medskip
544 %\item Our method \ \textcolor{blue}{outperforms Rauber and Rünger's method} in terms of energy-performance ratio.
545 %\item The proposed new energy model is \textcolor{blue}{more accurate} than Rauber and Rünger's energy model.
557 \begin{frame}{The second contribution}
559 \section{\small {Energy optimization of a heterogeneous platform}}
563 \bf \Large \textcolor{blue}{Energy optimization of a parallel application with iterations running over a heterogeneous platform}
573 \begin{frame}{Objectives}
574 \begin{femtoBlock}{} \vspace{-12 mm}
575 \begin{itemize} \small
576 \item Proposing \textcolor{blue}{new energy and performance models} for message passing applications with iterations running
577 over a heterogeneous platform (cluster or Grid). \medskip
578 \item Studying the effect of the scaling factor $S$ on both the \textcolor{blue}{energy consumption and the performance} of
579 message passing iterative applications. \medskip
581 \item Computing the vector of scaling factors ($S_1, S_2, \dots, S_n$) producing \textcolor{blue}{a good trade-off} between
582 the energy consumption and the performance.
593 \begin{frame}{The execution time model}
597 \includegraphics[scale=0.5]{c2/commtasks}
603 \begin{block}{\small The execution time prediction model}
606 \small\textcolor{red}{ T_{\mathit{new}}} = \textcolor{blue}{\max_{i=1,2,\dots,N} (\TcpOld[i] \cdot S_{i}) + \min_{i=1,2,\dots,N} (\Tcm[i])}
609 \small Where: $ \textcolor{red}{\Tcm} = \textcolor{blue}{\mathit{communication\ times} + \mathit{slack\ times}}$
616 %\begin{frame}{The energy consumption model}
617 % The overall energy consumption of a message passing synchronous application executed over
618 % a heterogeneous platform can be computed as follows:
621 % \textcolor{red}{E} = \textcolor{blue}{\sum_{i=1}^{N} {(S_i^{-2} \cdot Pd_i \cdot Tcp_i)}} + {} \\
622 % \textcolor{blue}{\sum_{i=1}^{N} (Ps_i \cdot (\max_{i=1,2,\dots,N} (Tcp_i \cdot S_{i}) + {\min_{i=1,2,\dots,N} (Tcm_i))}}
625 % \underline{where}:\\
626 % \textcolor{blue}{N} : is the number of nodes.
633 \begin{frame}{The energy model for heterogeneous cluster}
636 \animategraphics[autopause,controls,scale=0.3,buttonsize=0.2cm]{10}{heter-model/a-}{0}{350}
637 %\includegraphics[width=0.6\textwidth]{heter-model/a-272}
647 %\begin{frame}{The trade-off between energy and performance}
650 % \centering{ \includegraphics[width=.4\textwidth]{c2/heter}}
653 % \textcolor{red}{\underline{Step1}}: computing the normalized energy \textcolor{blue}%{$E_{norm} = \frac{E_{reduced}}
655 % \textcolor{red}{\underline{Step2}}: computing the normalized performance \textcolor{blue}{$P_{norm} = \frac{T_{Max}}{T_{new}}$}.
657 % \begin{block}{\small The tradeoff model}
660 % \textcolor{red}{MaxDist} =
661 % \mathop {\max_{i=1,\dots F}}_{j=1,\dots,N}
662 % (\overbrace{P_{norm}(S_{ij})}^{\text{\textcolor{blue}{Maximize}}} -
663 % \overbrace{E_{norm}(S_{ij})}^{\text{\textcolor{blue}{Minimize}}} )
672 %\begin{frame}{The scaling algorithm for heter. cluster}
675 %\includegraphics[width=.52\textwidth]{algo-heter}
682 \begin{frame}{The scaling algorithm for heter. cluster}
687 \animategraphics[autopause,controls,scale=0.3,buttonsize=0.2cm]{10}{dvfs-heter/a-}{0}{836}
688 % \includegraphics[width=0.6\textwidth]{dvfs-heter/a-650}
698 %\begin{frame}{Experiments over a heterogeneous cluster }
701 % \item The experiments were executed on the simulator SimGrid/SMPI v3.10.\medskip
702 % \item The scaling algorithm was applied to the NAS parallel benchmarks class C.\medskip
703 % \item Four types of processors with different computing powers were used.\medskip
704 % \item The benchmarks were executed with different number of nodes ranging from 4 to 144 nodes.\medskip
705 % \item It was assumed that the total power consumption of the CPU consist of 80\% dynamic power and 20\% static power.
716 %\begin{frame}{The simulation results}
720 %\includegraphics[width=0.8\textwidth]{c2/energy_saving.pdf}
722 % \textcolor{blue}{On average, it reduces the energy consumption by \textcolor{red}{29\%}
723 %for the class C of the NAS Benchmarks executed over 8 nodes}
733 %\begin{frame}{The simulation results}
738 % \includegraphics[width=.8\textwidth]{c2/perf_degra.pdf}
740 % \textcolor{blue}{On average, it degrades by \textcolor{red}{3.8\%} the performance
741 % of NAS Benchmarks class C executed over 8 nodes}
755 %\begin{frame}{Energy optimization of grid platform}
758 % \includegraphics[width=.6\textwidth]{c2/grid5000.pdf}
760 % \small 10 sites distributed over France and Luxembourg
768 %\begin{frame}{The grid architecture}
770 %\includegraphics[width=.8\textwidth]{c2/init_freq.pdf}
773 %\begin{frame}{Performance, Energy and trade-off models} \small
774 %\begin{block}{\small The performance model of grid}
777 %\Tnew = \mathop{\max_{i=1,\dots N}}_{j=1,\dots,M_i}({\TcpOld[ij]} \cdot S_{ij})
778 % +\mathop{\min_{j=1,\dots,M_h}} (\Tcm[hj])
783 %\begin{block}{\small The energy model of grid}\small
786 %E = \sum_{i=1}^{N} \sum_{i=1}^{M_i} {(S_{ij}^{-2} \cdot \Pd[ij] \cdot \Tcp[ij])} +
787 % \sum_{i=1}^{N} \sum_{j=1}^{M_i} (\Ps[ij] \cdot \Tnew)
791 %\begin{block}{\small The trade-off model of grid}
796 %\mathop{ \mathop{\max_{i=1,\dots N}}_{j=1,\dots,M_i}}_{k=1,\dots,F_j}
797 % (\overbrace{\Pnorm(S_{ijk})}^{\text{Maximize}} -
798 % \overbrace{\Enorm(S_{ijk})}^{\text{Minimize}} )
810 \begin{frame}{Experiments over Grid'5000}
812 \textcolor{blue}{The experiments were conducted using three
813 clusters distributed over one or two sites.}
816 \includegraphics[width=.5\textwidth]{c2/grid5000-2.pdf}
819 \textcolor{blue}{Grid'5000 power measurement tools were used.}
822 \includegraphics[width=.5\textwidth]{c2/power_consumption.pdf}
834 \begin{frame}{Experiments over Grid'5000}
836 \begin{minipage}{0.4\textwidth}
837 %\textcolor{blue}{Execution the NAS class D on 16 nodes saves the energy by
838 %\textcolor{red}{30\%}}
839 \small \textcolor{blue}{The average energy saving = \textcolor{red}{30\%}}
841 \begin{minipage}{0.55\textwidth}
843 \includegraphics[width=0.83 \textwidth]{c2/eng_s.eps}
847 \begin{minipage}{0.4\textwidth}
848 %\textcolor{blue}{Execution the NAS class D on 16 nodes degrades the
849 %performance by \textcolor{red}{3.2\%}}
850 \small \textcolor{blue}{The average performance degradation = \textcolor{red}{3.2\%}}
852 \begin{minipage}{0.55\textwidth}
854 \includegraphics[width=.83\textwidth]{c2/per_d.eps}
864 \begin{frame}{The results of the three power scenarios}
868 \includegraphics[width=.45\textwidth]{c2/eng_pow.eps}
870 \includegraphics[width=.45\textwidth]{c2/per_pow.eps}
872 \includegraphics[width=.7\textwidth]{c2/three_scenarios.pdf}
885 \begin{frame}{One core and Multi-cores per node results}
886 %\textcolor{blue}{One core and Multi-cores per node results:}
889 \includegraphics[width=.48\textwidth]{c2/eng_s_mc.eps}
891 \includegraphics[width=.48\textwidth]{c2/per_d_mc.eps}
894 \centering \small \textcolor{blue}{Using multiple cores per node decreases the computation-to-communication ratio.}
901 \begin{frame}{Comparing the objective function to EDP}
903 EDP (energy-delay product) is the product of the energy consumption and the delay \tiny\textsuperscript{3}.
907 \includegraphics[width=.6\textwidth]{c2/edp_dist.eps}
912 \tiny \textsuperscript{3} Spiliopoulos et al., Green governors: A framework for continuously adaptive DVFS, in International Green Computing Conference and Workshops (IGCC), 2011.
914 %\begin{frame}{Summary}
917 % \item Two scaling algorithm were applies to \textcolor{blue}{heterogeneous %cluster} and \textcolor{blue}{grid}.
918 % \item A new \textcolor{blue}{energy} and \textcolor{blue}{performance} models were proposed.
919 % \item The experimental results ere conducted over \textcolor{blue}{SimGrid} simulators and real
920 %test-bed \textcolor{blue}{Grid'5000}.
922 %\item The algorithm saves the energy by \textcolor{blue}{29\%} and only
923 % degrades the performance by \textcolor{blue}{3.8\%} for simulated heterogeneous
926 %\item The algorithm saves the energy by \textcolor{blue}{30\%} and only
927 % degrades the performance by \textcolor{blue}{3.2\%} for Grid'5000 results.
929 % \item The proposed method \textcolor{blue}{outperforms the EDP method} in terms of energy-performance ratio.
937 \begin{frame}{The third contribution}
938 \section{\small {Energy optimization of asynchronous applications}}
940 \bf \Large \textcolor{blue}{Energy optimization of asynchronous iterative message passing applications}
949 \begin{frame}{Problem definition}\vspace{0.8 mm}
950 \textcolor{blue}{The execution of a synchronous parallel iterative application over a grid }
953 \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{10}{syn/a-}{0}{647}
954 %\includegraphics[width=0.6\textwidth]{syn/a-503}
963 \begin{frame}{Problem definition}\vspace{0.8 mm}
964 \textcolor{blue}{The execution of an asynchronous parallel iterative application over a grid }
967 \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{10}{asyn/a-}{0}{556}
968 %\includegraphics[width=0.6\textwidth]{asyn/a-440}
977 \begin{frame}{Solution}\vspace{0.8mm}
978 \textcolor{blue}{Using asynchronous communications with DVFS }
981 \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{10}{asyn+dvfs/a-}{0}{344}
982 %\includegraphics[width=0.6\textwidth]{asyn+dvfs/a-314}
992 %\begin{frame}{The performance models}
994 %\begin{block}{\small The performance model of Asynch. Applications}\small
996 %\label{eq:asyn_time}
997 %\Tnew = \frac{\sum_{i=1}^{N} \sum_{j=1}^{M_i}({\TcpOld[ij]} \cdot S_{ij})} {N \cdot M_i }
1002 %\begin{block}{\small The performance model of Hybrid Applications}\small
1004 %\label{eq:asyn_perf}
1005 %\Tnew = \frac{\sum_{i=1}^{N} (\max_{j=1,\dots, M_i} ({\TcpOld[ij]} \cdot S_{ij}) +
1006 %\min_{j=1,\dots,M_i} ({\Ltcm[ij]}))}{N}
1015 %%%%%%%%%%%%%%%%%%%%
1017 %%%%%%%%%%%%%%%%%%%%
1018 %\begin{frame}{The energy consumption models}
1020 %\begin{block}{\small The energy model of Asynch. Applications}\small
1022 %\label{eq:asyn_energy1}
1023 % E = \sum_{i=1}^{N} \sum_{j=1}^{M_i} {(S_{ij}^{-2} \cdot \Tcp[ij] \cdot (\Pd[ij]+\Ps[ij]) )}
1028 %\begin{block}{\small The energy model of Hybrid Applications}\small
1030 %\label{eq:asyn_energy}
1031 %E = \sum_{i=1}^{N} \sum_{j=1}^{M_i} {(S_{ij}^{-2} \cdot \Pd[ij] \cdot \Tcp[ij])} + \sum_{i=1}^{N} \sum_{j=1}^{M_i} (\Ps[ij] \cdot \\
1032 % ( \mathop{\max_{j=1,\dots,M_i}} ({\Tcp[ij]} \cdot S_{ij}) + \mathop{\min_{j=1,\dots,M_i}} ({\Ltcm[ij]})))
1039 %%%%%%%%%%%%%%%%%%%%
1041 %%%%%%%%%%%%%%%%%%%%
1042 \begin{frame}{The performance and the energy models }
1045 \includegraphics[width=0.9\textwidth]{syn-vs-asyn.pdf}
1052 %%%%%%%%%%%%%%%%%%%%
1054 %%%%%%%%%%%%%%%%%%%%
1055 %\begin{frame}{The scaling algorithm for Asynch. applications}
1058 %\includegraphics[width=0.55\textwidth]{algo-hybrid.pdf}
1063 %%%%%%%%%%%%%%%%%%%%
1065 %%%%%%%%%%%%%%%%%%%%
1066 \begin{frame}{The experiments}
1071 \item The architecture of the grid:
1073 \includegraphics[width=0.5\textwidth]{c3/hybrid-model.pdf}
1077 \item Applying the proposed algorithm to the asynchronous iterative message passing multi-splitting method.
1078 \item Evaluating the application over the simulator and Grid'5000.
1084 %%%%%%%%%%%%%%%%%%%%
1086 %%%%%%%%%%%%%%%%%%%%
1087 %\begin{frame}{The simulation results}
1088 %\centering \small \textcolor{blue}{The best scenario in terms of energy and performance is %the Async. MS with Sync. DVFS}
1091 % \includegraphics[scale=0.42]{c3/energy_saving.eps}
1093 %\centering The average energy saving = \textcolor{red}{22\%}
1098 %%%%%%%%%%%%%%%%%%%%
1100 %%%%%%%%%%%%%%%%%%%%
1101 %\begin{frame}{The simulation results}
1104 % \includegraphics[scale=0.42]{c3/perf_degra.eps}
1106 %\centering The average speed-up = \textcolor{red}{5.72\%}
1111 %%%%%%%%%%%%%%%%%%%%
1113 %%%%%%%%%%%%%%%%%%%%
1114 \begin{frame}{The Grid'5000 results}
1119 \includegraphics[width=0.53\textwidth]{c3/energy-s-compare.eps}
1120 \includegraphics[width=0.53\textwidth]{c3/perf-deg-compare.eps}
1123 \centering \footnotesize
1125 %\small \textcolor{blue}{The best scenario in terms of energy and performance is the Async. MS with Sync. DVFS}
1127 The average energy saving = \textcolor{red}{26.93\%}, the average speed-up = \textcolor{red}{21.48\%}
1131 %%%%%%%%%%%%%%%%%%%%
1133 %%%%%%%%%%%%%%%%%%%%
1134 \begin{frame}{The comparison results}
1136 \includegraphics[width=.5\textwidth]{c3/compare.eps}
1138 \includegraphics[width=.5\textwidth]{c3/compare_scales.eps}
1144 %%%%%%%%%%%%%%%%%%%%
1146 %%%%%%%%%%%%%%%%%%%%
1147 \begin{frame}{Conclusions}
1148 \section{Conclusions and Perspectives}
1151 \small \barrow Three \textcolor{blue}{ new energy consumption and performance} models were proposed for synchronous or asynchronous parallel applications with iterations running over
1152 \textcolor{blue}{homogeneous and heterogeneous clusters or grids}.
1156 \small \barrow \textcolor{blue}{A new objective function} to optimize both the energy consumption and the performance was proposed.
1158 \small \barrow \textcolor{blue}{New online frequency selecting algorithms} for clusters and grids were developed.
1160 \small \barrow The proposed algorithms were applied to the \textcolor{blue}{NAS parallel benchmarks} and \textcolor{blue}{the
1161 Multi-splitting} method.
1163 \small \barrow The proposed algorithms were evaluated over the \textcolor{blue}{SimGrid simulator} and over the \textcolor{blue}{Grid'5000 testbed}.
1165 \small \barrow All the proposed methods were compared to either \textcolor{blue}{Rauber and Rünger's method} or to the \textcolor{blue}{EDP objective function}.
1173 %%%%%%%%%%%%%%%%%%%%
1175 %%%%%%%%%%%%%%%%%%%%
1176 \begin{frame}{Publications}
1178 \begin{block}{\small Journal Articles }\scriptsize
1179 \begin{enumerate}[$\lbrack$1$\rbrack$]
1181 \item Ahmed Fanfakh, Jean-Claude Charr, Raphaël Couturier, Arnaud Giersch. Optimizing the energy consumption of message passing applications with iterations executed over grids. \textit{Journal of Computational
1184 \item Ahmed Fanfakh, Jean-Claude Charr, Raphaël Couturier, Arnaud Giersch. Energy Consumption Reduction for
1185 Asynchronous Message Passing Applications. \textit{Journal of Supercomputing}, 2016, (Accepted with minor revisions)
1191 \begin{block}{\small Conference Articles }\scriptsize
1193 \begin{enumerate}[$\lbrack$1$\rbrack$]
1195 \item Jean-Claude Charr, Raphaël Couturier, Ahmed Fanfakh, Arnaud Giersch. Dynamic Frequency Scaling for
1196 Energy Consumption Reduction in Distributed MPI Programs. \textit{ISPA 2014}, pp.
1197 225-230. IEEE Computer Society, Milan, Italy (2014).
1199 \item Jean-Claude Charr, Raphaël Couturier, Ahmed Fanfakh, Arnaud Giersch. Energy Consumption Reduction
1200 with DVFS for Message Passing Iterative Applications on Heterogeneous Architectures.
1201 \textit{The 16\textsuperscript{th} PDSEC}. pp. 922-931. IEEE Computer Society, India (2015).
1203 \item Ahmed Fanfakh, Jean-Claude Charr, Raphaël Couturier, Arnaud Giersch. CPUs Energy Consumption
1204 Reduction for Asynchronous Parallel Methods Running over Grids. \textit{The 19\textsuperscript{th} CSE conference}. IEEE Computer Society,
1213 %%%%%%%%%%%%%%%%%%%%
1215 %%%%%%%%%%%%%%%%%%%%
1216 \begin{frame}{Perspectives}
1220 \small \barrow The proposed algorithms should take into consideration the
1221 \textcolor{blue}{variability between some iterations}.
1223 \small \barrow The proposed algorithms should be applied to \textcolor{blue}{other message passing methods with iterations} in order to see how they adapt to the characteristics of these methods.
1225 \small \barrow The proposed algorithms for heterogeneous platforms should be applied to heterogeneous platforms composed of \textcolor{blue}{CPUs and GPUs}.
1227 \small \barrow Comparing the results returned by the energy models to the values given by \textcolor{blue}{real instruments that measure the energy consumption} of CPUs during execution.
1228 \small \barrow Considering the power consumed by the other devices in the node such as
1229 \textcolor{blue}{the memory and the hard drive} in the energy consumption model.
1235 %%%%%%%%%%%%%%%%%%%%
1237 %%%%%%%%%%%%%%%%%%%%
1238 \begin{frame}{Fin} \vspace{-10 mm}
1240 \centering \Large \textcolor{blue}{Thank you for your attention}
1243 \centering \textcolor{blue}{ {\Large Questions?}}