2 \usepackage{beamerthemefemto}
3 \usepackage[latin1]{inputenc}
4 \usepackage[T1]{fontenc}
5 \DeclareGraphicsExtensions{.jpg, .png , .pdf, .bmp, .pdftex}
6 \usepackage{algorithm,algorithmicx,algpseudocode}
7 \usepackage{graphicx,graphics}
% Inline reviewer notes.  \AG[<todo options>]{<text>} and \JC[<todo options>]{<text>}
% each wrap \todo with a per-author background colour (green!50 for AG,
% red!10 for JC), set the note in sans-serif and prefix it with a bold author tag.
% The optional argument defaults to "inline" and is appended to the \todo options.
% NOTE(review): \todo is presumably provided by todonotes (loaded further down in
% this preamble); \xspace presumably comes from the xspace package, which is not
% loaded in the visible part of the file -- confirm.
16 \newcommand{\AG}[2][inline]{%
17 \todo[color=green!50,#1]{\sffamily\textbf{AG:} #2}\xspace}
18 \newcommand{\JC}[2][inline]{%
19 \todo[color=red!10,#1]{\sffamily\textbf{JC:} #2}\xspace}
20 \definecolor{myblue}{RGB}{0,29,119}
% \Xsub{<base>}{<word>}: typeset <base> with <word> as an italic (\mathit)
% subscript.  \ensuremath lets the macro be used in text as well as in math, and
% the outer brace group makes the result a single atom, so a further _{...}
% written after it attaches to the whole symbol.
% The subscript is wrapped in braces so that it does not rely on how \mathit
% expands when it directly follows "_".
21 \newcommand{\Xsub}[2]{{\ensuremath{#1_{\mathit{#2}}}}}
23 %% \fxheight{<sub>}: used to put some subscripts lower, and make them more legible.
%% Typesets <sub> preceded by an invisible zero-width rule of height 1.52ex;
%% expands to nothing when <sub> is empty.
%% The emptiness test goes through \detokenize so that it only looks at whether
%% the argument is empty.  A bare \ifx#1\relax would compare the first two tokens
%% of the argument with each other and drop indices such as "ii" or "11".
24 \newcommand{\fxheight}[1]{\if\relax\detokenize{#1}\relax\else\rule{0pt}{1.52ex}#1\fi}
% Notation shortcuts used in the formulas.
% Most macros expand to \Xsub{<letter>}{<word>}, i.e. a base letter with an
% italic word subscript; \Dist, \MaxDist and \MinTcm are plain \mathit words.
% Macros declared with "[1][]" accept an optional index (empty by default) that
% is attached as a further subscript after the \Xsub group:
%   - \Fmax, \Pd, \Ps, \Tcm, \Ltcm, \Etcm, \Niter, \Pmax, \Pidle pass it through
%     \fxheight;
%   - \Fdiff prefixes it with a negative thin space (\!);
%   - \Scp, \Sopt, \Tcp, \TcpOld attach it unchanged.
% NOTE(review): the trailing _{...} sits outside \ensuremath, so the indexed
% macros presumably have to be used inside math mode -- confirm at the call sites.
% \MinTcm refers to \Tcm before \Tcm is defined; this is fine because the
% replacement text is only expanded when \MinTcm is used.
26 \newcommand{\CL}{\Xsub{C}{L}}
27 \newcommand{\Dist}{\mathit{Dist}}
28 \newcommand{\EdNew}{\Xsub{E}{dNew}}
29 \newcommand{\Eind}{\Xsub{E}{ind}}
30 \newcommand{\Enorm}{\Xsub{E}{Norm}}
31 \newcommand{\Eoriginal}{\Xsub{E}{Original}}
32 \newcommand{\Ereduced}{\Xsub{E}{Reduced}}
33 \newcommand{\Es}{\Xsub{E}{S}}
34 \newcommand{\Fdiff}[1][]{\Xsub{F}{diff}_{\!#1}}
35 \newcommand{\Fmax}[1][]{\Xsub{F}{max}_{\fxheight{#1}}}
36 \newcommand{\Fnew}{\Xsub{F}{new}}
37 \newcommand{\Vnew}{\Xsub{V}{new}}
38 \newcommand{\Vmax}{\Xsub{V}{max}}
39 \newcommand{\Ileak}{\Xsub{I}{leak}}
40 \newcommand{\Kdesign}{\Xsub{K}{design}}
41 \newcommand{\MaxDist}{\mathit{Max}\Dist}
42 \newcommand{\MinTcm}{\mathit{Min}\Tcm}
43 \newcommand{\Ntrans}{\Xsub{N}{trans}}
44 \newcommand{\Pd}[1][]{\Xsub{P}{d}_{\fxheight{#1}}}
45 \newcommand{\PdNew}{\Xsub{P}{dNew}}
47 \newcommand{\PdOld}{\Xsub{P}{dOld}}
48 \newcommand{\Pnorm}{\Xsub{P}{Norm}}
49 \newcommand{\Tnorm}{\Xsub{T}{Norm}}
50 \newcommand{\Ps}[1][]{\Xsub{P}{s}_{\fxheight{#1}}}
51 \newcommand{\Scp}[1][]{\Xsub{S}{cp}_{#1}}
52 \newcommand{\Sopt}[1][]{\Xsub{S}{opt}_{#1}}
53 \newcommand{\Tcm}[1][]{\Xsub{T}{cm}_{\fxheight{#1}}}
54 \newcommand{\Tcp}[1][]{\Xsub{T}{cp}_{#1}}
55 \newcommand{\TcpOld}[1][]{\Xsub{T}{cpOld}_{#1}}
56 \newcommand{\Tnew}{\Xsub{T}{New}}
57 \newcommand{\Told}{\Xsub{T}{Old}}
58 \newcommand{\Ltcm}[1][]{\Xsub{L}{tcm}_{\fxheight{#1}}}
59 \newcommand{\Etcm}[1][]{\Xsub{E}{tcm}_{\fxheight{#1}}}
60 \newcommand{\Niter}[1][]{\Xsub{N}{iter}_{\fxheight{#1}}}
61 \newcommand{\Pmax}[1][]{\Xsub{P}{max}_{\fxheight{#1}}}
62 \newcommand{\Pidle}[1][]{\Xsub{P}{idle}_{\fxheight{#1}}}
65 \definecolor{myblue}{RGB}{0,29,119}
66 \usepackage[textsize=footnotesize]{todonotes}
% Custom list bullets: each macro expands to an \item whose label is a dingbat
% (\ding{110}, \ding{228} and \ding{227} respectively) coloured with "myblue".
% They are meant to be used in place of \item inside a list environment.
% NOTE(review): \ding is presumably provided by pifont, which is not loaded in
% the visible part of the preamble -- confirm.
67 \newcommand{\bsquare}{\item[\color{myblue}\ding{110}]}
68 \newcommand{\barrow}{\item[\color{myblue}\ding{228}]}
69 \newcommand{\bwarrow}{\item[\color{myblue}\ding{227}]}
70 \DeclareGraphicsExtensions{.jpg, .png , .pdf, .bmp, .pdftex}
74 %\title{Energy Consumption Optimization of Parallel Applications with
75 %Iterations using CPU Frequency Scaling}
78 \title{ \textbf{Energy Consumption Optimization of Parallel Applications with Iterations using CPU Frequency Scaling} \\ \vspace{0.2cm} \hspace{1.8cm}\textbf{\textcolor{cyan}{\small PhD Dissertation Defense}}}\vspace{-1cm}
79 \author{ \textbf{Ahmed Badri Muslim Fanfakh} \\ \vspace{0.5cm}\small Under Supervision: \textcolor{cyan}{\small Raphaël COUTURIER and Jean-Claude CHARR} \\\vspace{0.1cm} \textcolor{blue}{ University of Bourgogne Franche-Comté - FEMTO-ST - DISC Dept. - AND Team} \\ ~~~~~~~~~~~~~~~~~~~~~ \textbf{\textcolor{blue}{ 17 October 2016 }}}
83 % ____ _____ ____ _ _ _____
84 % | _ \| ____| __ )| | | |_ _|
85 % | | | | _| | _ \| | | | | |
86 % | |_| | |___| |_) | |_| | | |
87 % |____/|_____|____/ \___/ |_|
90 \setbeamertemplate{background}{\titrefemto}
105 \setbeamertemplate{background}{\pagefemto}
106 \begin{frame}{Outline}
108 \setbeamertemplate{section in toc}[sections numbered]
116 \begin{frame}{Introduction and problem definition}
117 \section{\small {Introduction and Problem definition}}
118 \bf \textcolor{blue}{Approaches to increase the computing power of the parallel platform :}
119 \begin{minipage}{0.5\textwidth}
120 \textcolor{blue}{1)} \small \bf \textcolor{black}{Increasing the frequency of a processor.}
122 \begin{minipage}{0.6\textwidth}
126 \includegraphics[width=0.7\textwidth]{fig/freq-years}
130 \begin{minipage}{0.5\textwidth}
131 \textcolor{blue}{2)} \small \bf \textcolor{black}{Increasing the number of nodes.}
133 \tiny \textcolor{blue}{Recently, the Tianhe-2 supercomputer had more than 3 million cores while consuming around 17.8 megawatts.}
136 \begin{minipage}{0.6\textwidth}
138 \includegraphics[width=0.7\textwidth]{fig/clusters}
149 \begin{frame}{Introduction and problem definition}
151 \bf \textcolor{blue}{Techniques for energy consumption reduction}
153 \textcolor{blue}{1)} \bf \textcolor{black}{Switch-off idle nodes method}
156 \animategraphics[autopause,loop,controls,scale=0.25,buttonsize=0.2cm]{200}{on-off/a-}{0}{69}
157 %\includegraphics[width=0.6\textwidth]{on-off/a-69}
164 \begin{frame}{Techniques for energy consumption reduction}
166 \textcolor{blue}{2)} \bf \textcolor{black}{Dynamic Voltage and Frequency Scaling (DVFS)}
169 \animategraphics[autopause,controls,scale=0.25,buttonsize=0.2cm]{10}{DVFS-meq/a-}{0}{109}
170 %\includegraphics[width=0.6\textwidth]{DVFS-meq/a-109}
179 \begin{frame}{Motivations}
181 \section{\small {Motivations}}
182 \textcolor{blue}{Why we used the DVFS method:}
184 \begin{minipage}{0.5\textwidth}
187 \item \small \textcolor{black}{The largest share of the power is consumed by the processor\textsuperscript{1}.}
192 \begin{minipage}{0.5\textwidth}
195 \includegraphics[width=0.85\textwidth]{fig/node-power}
200 \begin{itemize} \item \small \textcolor{black}{It is used to reduce the energy consumption while keeping all the nodes working; thus it is better suited to parallel computing.}
201 \item \small \textcolor{black}{It has a very small overhead compared to the method of switching off idle nodes.} \end{itemize}
205 \begin{block}{\textcolor{white}{Challenge and Objective}}
207 \small \textcolor{blue}{Challenge:} \textcolor{black}{DVFS is used to reduce the energy consumption, \textcolor{blue}{but} it degrades the performance simultaneously.}
210 \small \textcolor{blue}{Objective:} \textcolor{black}{Applying the DVFS to minimize the energy consumption while maintaining the performance of the parallel application.}
213 \tiny \textsuperscript{1} Fan, X., Weber, W., and Barroso, L. A. 2007. Power provisioning
214 for a warehouse-sized computer.
225 \begin{frame}{Contribution}
227 \section{\small {Energy optimization of homogeneous platform}}
229 \bf \Large \textcolor{blue}{Energy optimization of a parallel application with iterations running over a homogeneous platform}
239 \begin{frame}{Objectives}
240 \begin{femtoBlock}{} \vspace{-12 mm}
241 \begin{itemize} \small
242 \item Study the effect of the scaling factor $S$ on \textbf{energy consumption and performance } of parallel applications with iterations such as NAS
243 Benchmarks. \includegraphics[width=.06\textwidth]{c1/nasa.pdf} \medskip
245 \item Discovering the \textbf{energy-performance trade-off relation} when changing the frequency of the processor.\medskip
246 \item Proposing an algorithm for selecting the scaling factor $S$ producing \textbf {the optimal trade-off} between the energy consumption and the performance. \medskip
247 \item Comparing the proposed algorithm to existing methods.
250 %\footnote{\tiny Thomas Rauber and Gudula Rünger. Analytical modeling and simulation of the
251 %energy consumption \\ \quad ~ ~\quad of independent tasks. In Proceedings of the Winter Simulation Conference, 2012.} method that our method best on.
253 %\let\thefootnote\relax\footnote{}
265 \begin{frame}{Execution of synchronous parallel tasks}
269 \subfloat[Sync. imbalanced communications]{%
270 \includegraphics[scale=0.49]{c1/commtasks}\label{fig:h1}}
271 \subfloat[Sync. imbalanced computations]{%
272 \includegraphics[scale=0.49]{c1/compt}\label{fig:h2}}
273 % \caption{Parallel tasks on homogeneous platform}
285 \begin{frame}{Energy model for homogeneous platform}
286 The power consumed by a processor is divided into two power metrics: the dynamic (\textcolor{red}{$P_d$}) and static
287 (\textcolor{red}{$P_s$}) power.
290 \textcolor{red}{ P_d} = \textcolor{blue}{\alpha \cdot CL \cdot V^2 \cdot F}
292 \scriptsize \underline{Where}: \\
293 \scriptsize {\textcolor{blue}{$\alpha$}: switching activity \hspace{15 mm} \textcolor{blue}{$CL$}: load capacitance\\
294 \textcolor{blue}{$V$}: the supply voltage \hspace{14 mm} \textcolor{blue}{$F$}: operational frequency}
297 \small \textcolor{red}{P_s} = \textcolor{blue}{V \cdot N_{trans} \cdot K_{design} \cdot I_{leak}}
300 \scriptsize{ \textcolor{blue}{$V$}: the supply voltage. \hspace{28 mm} \textcolor{blue}{$N_{trans}$}: number of transistors. \\
301 \textcolor{blue}{$K_{design}$}: design dependent parameter. \hspace{8 mm} \textcolor{blue}{$I_{leak}$}: technology dependent
309 \begin{frame}{Energy model for homogeneous platform}
311 The frequency scaling factor is the ratio between the maximum and the new frequency, \textcolor{blue}{$S = \frac{F_{max}}{F_{new}}$}. \medskip
315 \begin{block}{\small Rauber and Rünger's energy model}
316 $ E = P_{d} \cdot S_1^{-2} \cdot
317 \left( T_1 + \sum_{i=2}^{N} \frac{T_i^3}{T_1^2} \right) +
318 P_{s} \cdot S_1 \cdot T_1 \cdot N$
320 \textcolor{blue}{$S_1$}: the max. scaling factor\\
321 \textcolor{blue}{$P_{d}$}: the dynamic power\\
322 \textcolor{blue}{$P_{s}$}: the static power\\
323 \textcolor{blue}{$T_1$}: the time of the slowest task\\
324 \textcolor{blue}{$T_i$}: the time of the other tasks\\
325 \textcolor{blue}{$N$}: the number of nodes
333 \begin{frame}{Performance evaluation of MPI programs}
336 \begin{block}{\small Execution time prediction model}
337 \centering{ $ \textcolor{red}{T_{new}} = \textcolor{blue}{T_{Max Comp Old} \cdot S + T_{{Min Comm Old}}}$}
340 \centering{\includegraphics[width=.4\textwidth]{c1/cg_per}
342 \includegraphics[width=.4\textwidth]{c1/lu_pre}}
345 \small The maximum normalized error for CG=0.0073 \textbf{(the smallest)} and LU=0.031 \textbf{(the worst)}.
355 \begin{frame}{Performance and energy reduction trade-off}
356 \begin{femtoBlock}{} \vspace{-15 mm}
359 \subfloat[\small Real relation.]{%
360 \includegraphics[width=.43\textwidth]{c1/file3}\label{fig:r2}}
362 \subfloat[\small Converted relation.]{%
363 \includegraphics[width=.43\textwidth]{c1/file}\label{fig:r1}}%
365 % \caption{The energy and performance relation}
368 Where:~~~ $\textcolor{blue}{Performance} = execution~time^{-1}$
372 \begin{block}{\small Our objective function}
373 \centering{$\textbf{\emph {\textcolor{red}{MaxDist}}} = \max_{j=1,2,\dots ,F}
374 (\overbrace{P_{Norm}(S_j)}^{{\textcolor{blue}{Maximize}}} -
375 \overbrace{E_{Norm}(S_j)}^{{\textcolor{blue}{Minimize}}} )$}
385 \begin{frame}{Scaling factor selection algorithm}
388 \includegraphics[width=.56 \textwidth]{c1/algo-homo}
397 \begin{frame}{Scaling algorithm example}
401 \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{dvfs-homo/a-}{0}{159}
402 %\includegraphics[width=0.6\textwidth]{dvfs-homo/a-159}
409 \begin{frame}{Experimental results }
413 \item The experiments were executed on the simulator SimGrid/SMPI v3.10.\medskip
414 \item The proposed algorithm was applied to the NAS parallel benchmarks.\medskip
415 \item Each node in the cluster has 18 frequency values from \textbf{2.5\,GHz} to \textbf{800\,MHz}.\medskip
416 \item The proposed algorithm was evaluated over the A, B, C classes of the benchmarks using 4, 8 or 9 and 16 nodes respectively. \medskip
417 \item $P_d=20\,\mathrm{W}$, $P_s=4\,\mathrm{W}$.
426 \begin{frame}{Experimental results}
429 \includegraphics[width=.35\textwidth]{c1/ep}
430 \includegraphics[width=.35\textwidth]{c1/cg}
431 \includegraphics[width=.35\textwidth]{c1/bt}}
433 \centering {\includegraphics[width=.55\textwidth]{c1/results.pdf}}
441 \begin{frame}{Results comparison}
442 \begin{block}{\small Rauber and Rünger's optimal scaling factor}
443 $S_{opt} = \sqrt[3]{\frac{2}{N} \cdot \frac{P_{dyn}}{P_{static}} \cdot
444 \left( 1 + \sum_{i=2}^{N} \frac{T_i^3}{T_1^3}\right) } $
447 %\includegraphics[width=.33\textwidth]{c1/c1.pdf}
449 %\includegraphics[width=.33\textwidth]{c1/c2.pdf}}
452 \includegraphics[width=.55\textwidth]{c1/compare_c.pdf}}
460 \begin{frame}{The proposed new energy model}
463 \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{homo-model/a-}{0}{356}
464 %\includegraphics[width=0.6\textwidth]{homo-model/a-356}
472 \begin{frame}{Comparing the new model with Rauber model }
475 \includegraphics[width=.45\textwidth]{c1/energy_con}
477 \includegraphics[width=.5\textwidth]{c1/compare-scales}
483 % \begin{frame}{Summary}
484 % \begin{femtoBlock}{}
487 %\item We have presented a new online scaling factor selection method that \textcolor{blue}{optimizes simultaneously the energy and performance}.\medskip
488 % \item It predicts \textcolor{blue}{ the energy consumption and the performance} of the parallel applications. \medskip
489 %\item Our algorithm \textcolor{blue}{saves more energy} when the communication and the other slacks times are big. \medskip
490 %\item It gives the \textcolor{blue}{best trade-off between energy reduction and
491 % performance}. \medskip
492 %\item Our method \ \textcolor{blue}{outperforms Rauber and Rünger's method} in terms of energy-performance ratio.
493 %\item The proposed new energy model is \textcolor{blue}{more accurate} then Rauber energy model.
505 \begin{frame}{Contribution}
507 \section{\small {Energy optimization of heterogeneous platform}}
511 \bf \Large \textcolor{blue}{Energy optimization of a parallel application with iterations running over a heterogeneous platform}
521 \begin{frame}{Objectives}
522 \begin{femtoBlock}{} \vspace{-12 mm}
523 \begin{itemize} \small
524 \item Proposing \textcolor{blue}{new energy and performance models} for message passing applications with iterations running
525 over a heterogeneous platform (cluster and Grid). \medskip
526 \item Studying the effect of the scaling factor $S$ on both the \textcolor{blue}{energy consumption and the performance} of
527 message passing iterative applications. \medskip
529 \item Computing the vector of scaling factors ($S_1, S_2, \dots, S_N$) producing \textcolor{blue} {the optimal trade-off} between
530 the energy consumption and the performance.
541 \begin{frame}{The execution time model}
545 \includegraphics[scale=0.5]{c2/commtasks}
551 \begin{block}{\small The execution time prediction model}
554 \small\textcolor{red}{ T_{new}} = \textcolor{blue}{\max_{i=1,2,\dots,N} ({TcpOld_i} \cdot S_{i}) + \min_{i=1,2,\dots,N} (Tcm_i)}
557 \small Where: $ \textcolor{red}{Tcm} = \textcolor{blue}{communication~times + slack~times}$
564 \begin{frame}{The energy consumption model}
565 The overall energy consumption of a message passing synchronous application executed over
566 a heterogeneous platform can be computed as follows:
569 \textcolor{red}{E} = \textcolor{blue}{\sum_{i=1}^{N} {(S_i^{-2} \cdot Pd_i \cdot Tcp_i)}} + {} \\
570 \textcolor{blue}{\sum_{i=1}^{N} (Ps_i \cdot (\max_{i=1,2,\dots,N} (Tcp_i \cdot S_{i}) + {\min_{i=1,2,\dots,N} (Tcm_i)}))}
574 \textcolor{blue}{N}: the number of nodes.
581 \begin{frame}{The energy model example for heter. cluster}
584 \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{heter-model/a-}{0}{272}
585 %\includegraphics[width=0.6\textwidth]{heter-model/a-272}
595 %\begin{frame}{The trade-off between energy and performance}
598 % \centering{ \includegraphics[width=.4\textwidth]{c2/heter}}
601 % \textcolor{red}{\underline{Step1}}: computing the normalized energy \textcolor{blue}%{$E_{norm} = \frac{E_{reduced}}
603 % \textcolor{red}{\underline{Step2}}: computing the normalized performance \textcolor{blue}{$P_{norm} = \frac{T_{Max}}{T_{new}}$}.
605 % \begin{block}{\small The tradeoff model}
608 % \textcolor{red}{MaxDist} =
609 % \mathop {\max_{i=1,\dots F}}_{j=1,\dots,N}
610 % (\overbrace{P_{norm}(S_{ij})}^{\text{\textcolor{blue}{Maximize}}} -
611 % \overbrace{E_{norm}(S_{ij})}^{\text{\textcolor{blue}{Minimize}}} )
620 \begin{frame}{The scaling algorithm for heter. cluster}
623 \includegraphics[width=.52\textwidth]{algo-heter}
630 \begin{frame}{The scaling algorithm example}
635 \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{dvfs-heter/a-}{0}{650}
636 % \includegraphics[width=0.6\textwidth]{dvfs-heter/a-650}
646 \begin{frame}{Experiments over a heterogeneous cluster }
649 \item The experiments were executed on the simulator SimGrid/SMPI v3.10.\medskip
650 \item The scaling algorithm was applied to the NAS parallel benchmarks class C.\medskip
651 \item Four types of processors with different computing powers were used.\medskip
652 \item We ran the benchmarks on different numbers of nodes, ranging from 4 to 144.\medskip
653 \item The total power consumption of the chosen CPUs is assumed to be composed of $80\%$ for the dynamic power and $20\%$ for the static power.
664 \begin{frame}{The experimental results}
668 \includegraphics[width=0.8\textwidth]{c2/energy_saving.pdf}
670 \textcolor{blue}{On average, it reduces the energy consumption by \textcolor{red}{29\%}
671 for the class C of the NAS Benchmarks executed over 8 nodes}
681 \begin{frame}{The experimental results}
686 \includegraphics[width=.8\textwidth]{c2/perf_degra.pdf}
688 \textcolor{blue}{On average, it degrades by \textcolor{red}{3.8\%} the performance
689 of NAS Benchmarks class C executed over 8 nodes}
698 \begin{frame}{The results of the three power scenarios}
702 \includegraphics[width=.55\textwidth]{c2/three_power.pdf}
704 \includegraphics[width=.55\textwidth]{c2/three_scenarios.pdf}
713 \begin{frame}{Comparing the objective function to EDP}
715 EDP (energy-delay product) is the product of the energy consumption and the delay.
719 \includegraphics[width=.55\textwidth]{c2/avg_compare.pdf}
721 \includegraphics[width=.55\textwidth]{c2/compare_with_EDP.pdf}
731 %\begin{frame}{Energy optimization of grid platform}
734 % \includegraphics[width=.6\textwidth]{c2/grid5000.pdf}
736 % \small 10 sites distributed over France and Luxembourg
744 \begin{frame}{The grid architecture}
746 \includegraphics[width=.8\textwidth]{c2/init_freq.pdf}
749 %\begin{frame}{Performance, Energy and trade-off models} \small
750 %\begin{block}{\small The performance model of grid}
753 %\Tnew = \mathop{\max_{i=1,\dots N}}_{j=1,\dots,M_i}({\TcpOld[ij]} \cdot S_{ij})
754 % +\mathop{\min_{j=1,\dots,M_h}} (\Tcm[hj])
759 %\begin{block}{\small The energy model of grid}\small
762 %E = \sum_{i=1}^{N} \sum_{i=1}^{M_i} {(S_{ij}^{-2} \cdot \Pd[ij] \cdot \Tcp[ij])} +
763 % \sum_{i=1}^{N} \sum_{j=1}^{M_i} (\Ps[ij] \cdot \Tnew)
767 %\begin{block}{\small The trade-off model of grid}
772 %\mathop{ \mathop{\max_{i=1,\dots N}}_{j=1,\dots,M_i}}_{k=1,\dots,F_j}
773 % (\overbrace{\Pnorm(S_{ijk})}^{\text{Maximize}} -
774 % \overbrace{\Enorm(S_{ijk})}^{\text{Minimize}} )
786 \begin{frame}{Experiments over Grid'5000}
789 \includegraphics[width=.5\textwidth]{c2/grid5000-2.pdf}
792 \textcolor{blue}{Two experiments were conducted: over one site and two sites
793 each one with three clusters }
797 \includegraphics[width=.5\textwidth]{c2/power_consumption.pdf}
799 \textcolor{blue}{Grid'5000 power measurement tools were used}
808 \begin{frame}{Experiments over Grid'5000}
810 \begin{minipage}{0.4\textwidth}
811 %\textcolor{blue}{Execution the NAS class D on 16 nodes saves the energy by
812 %\textcolor{red}{30\%}}
813 \textcolor{blue}{The energy saving = \textcolor{red}{30\%}}
815 \begin{minipage}{0.55\textwidth}
817 \includegraphics[width=0.83 \textwidth]{c2/eng_s.eps}
821 \begin{minipage}{0.4\textwidth}
822 %\textcolor{blue}{Execution the NAS class D on 16 nodes degrades the
823 %performance by \textcolor{red}{3.2\%}}
824 \textcolor{blue}{The performance degradation = \textcolor{red}{3.2\%}}
826 \begin{minipage}{0.55\textwidth}
828 \includegraphics[width=.83\textwidth]{c2/per_d.eps}
838 \begin{frame}{Experiments over Grid'5000}
839 \textcolor{blue}{One core and Multi-cores per node results:}
842 \includegraphics[width=.48\textwidth]{c2/eng_s_mc.eps}
844 \includegraphics[width=.48\textwidth]{c2/per_d_mc.eps}
847 \centering \small \textcolor{blue}{Using multiple cores per node decreases the computation-to-communication ratio}.
852 %\begin{frame}{Summary}
855 % \item Two scaling algorithm were applies to \textcolor{blue}{heterogeneous %cluster} and \textcolor{blue}{grid}.
856 % \item A new \textcolor{blue}{energy} and \textcolor{blue}{performance} models were proposed.
857 % \item The experimental results ere conducted over \textcolor{blue}{SimGrid} simulators and real
858 %test-bed \textcolor{blue}{Grid'5000}.
860 %\item The algorithm saves the energy by \textcolor{blue}{29\%} and only
861 % degrades the performance by \textcolor{blue}{3.8\%} for simulated heterogeneous
864 %\item The algorithm saves the energy by \textcolor{blue}{30\%} and only
865 % degrades the performance by \textcolor{blue}{3.2\%} for Grid'5000 results.
867 % \item The proposed method \textcolor{blue}{outperforms the EDP method} in terms of energy-performance ratio.
875 \begin{frame}{Contribution}
876 \section{\small {Energy optimization of asynchronous applications}}
878 \bf \Large \textcolor{blue}{Energy optimization of asynchronous iterative message passing applications}
887 \begin{frame}{Problem definition}\vspace{0.8 mm}
888 \textcolor{blue}{The execution of a synchronous parallel iterative application over a grid }
891 \animategraphics[autopause,controls,scale=0.25,buttonsize=0.2cm]{10}{syn/a-}{0}{503}
892 %\includegraphics[width=0.6\textwidth]{syn/a-503}
901 \begin{frame}{Problem definition}\vspace{0.8 mm}
902 \textcolor{blue}{The execution of an asynchronous parallel iterative application over a grid }
905 \animategraphics[autopause,controls,scale=0.25,buttonsize=0.2cm]{10}{asyn/a-}{0}{440}
906 %\includegraphics[width=0.6\textwidth]{asyn/a-440}
915 \begin{frame}{Solution}\vspace{0.8mm}
916 \textcolor{blue}{Using asynchronous communications with DVFS }
919 \animategraphics[autopause,controls,scale=0.25,buttonsize=0.2cm]{10}{asyn+dvfs/a-}{0}{314}
920 %\includegraphics[width=0.6\textwidth]{asyn+dvfs/a-314}
930 %\begin{frame}{The performance models}
932 %\begin{block}{\small The performance model of Asynch. Applications}\small
934 %\label{eq:asyn_time}
935 %\Tnew = \frac{\sum_{i=1}^{N} \sum_{j=1}^{M_i}({\TcpOld[ij]} \cdot S_{ij})} {N \cdot M_i }
940 %\begin{block}{\small The performance model of Hybrid Applications}\small
942 %\label{eq:asyn_perf}
943 %\Tnew = \frac{\sum_{i=1}^{N} (\max_{j=1,\dots, M_i} ({\TcpOld[ij]} \cdot S_{ij}) +
944 %\min_{j=1,\dots,M_i} ({\Ltcm[ij]}))}{N}
956 %\begin{frame}{The energy consumption models}
958 %\begin{block}{\small The energy model of Asynch. Applications}\small
960 %\label{eq:asyn_energy1}
961 % E = \sum_{i=1}^{N} \sum_{j=1}^{M_i} {(S_{ij}^{-2} \cdot \Tcp[ij] \cdot (\Pd[ij]+\Ps[ij]) )}
966 %\begin{block}{\small The energy model of Hybrid Applications}\small
968 %\label{eq:asyn_energy}
969 %E = \sum_{i=1}^{N} \sum_{j=1}^{M_i} {(S_{ij}^{-2} \cdot \Pd[ij] \cdot \Tcp[ij])} + \sum_{i=1}^{N} \sum_{j=1}^{M_i} (\Ps[ij] \cdot \\
970 % ( \mathop{\max_{j=1,\dots,M_i}} ({\Tcp[ij]} \cdot S_{ij}) + \mathop{\min_{j=1,\dots,M_i}} ({\Ltcm[ij]})))
980 \begin{frame}{The performance and the energy models }
983 \includegraphics[width=0.9\textwidth]{syn-vs-asyn.pdf}
993 \begin{frame}{The scaling algorithm for Asynch. applications}
996 \includegraphics[width=0.55\textwidth]{algo-hybrid.pdf}
1001 %%%%%%%%%%%%%%%%%%%%
1003 %%%%%%%%%%%%%%%%%%%%
1004 \begin{frame}{The experiments}
1009 \item The architecture of the grid:
1011 \includegraphics[width=0.5\textwidth]{c3/hybrid-model.pdf}
1015 \item Applying the proposed algorithm to the asynchronous iterative message passing multi-splitting method.
1016 \item Evaluating the application over the simulator and Grid'5000.
1022 %%%%%%%%%%%%%%%%%%%%
1024 %%%%%%%%%%%%%%%%%%%%
1025 \begin{frame}{The simulation results}
1026 \centering \small \textcolor{blue}{The best scenario in terms of energy and performance is the Async. MS with Sync. DVFS}
1029 \includegraphics[scale=0.42]{c3/energy_saving.eps}
1031 \centering The average of energy saving = \textcolor{red}{22\%}
1036 %%%%%%%%%%%%%%%%%%%%
1038 %%%%%%%%%%%%%%%%%%%%
1039 \begin{frame}{The simulation results}
1042 \includegraphics[scale=0.42]{c3/perf_degra.eps}
1044 \centering The average speed-up = \textcolor{red}{5.72\%}
1049 %%%%%%%%%%%%%%%%%%%%
1051 %%%%%%%%%%%%%%%%%%%%
1052 \begin{frame}{The Grid'5000 results}
1057 \includegraphics[width=0.53\textwidth]{c3/energy-s-compare.eps}
1058 \includegraphics[width=0.53\textwidth]{c3/perf-deg-compare.eps}
1062 The energy saving = \textcolor{red}{26.93\%}, the average speed-up = \textcolor{red}{21.48\%}
1066 %%%%%%%%%%%%%%%%%%%%
1068 %%%%%%%%%%%%%%%%%%%%
1069 \begin{frame}{The comparison results}
1071 \includegraphics[width=.5\textwidth]{c3/compare.eps}
1073 \includegraphics[width=.5\textwidth]{c3/compare_scales.eps}
1079 %%%%%%%%%%%%%%%%%%%%
1081 %%%%%%%%%%%%%%%%%%%%
1082 \begin{frame}{Conclusions}
1083 \section{Conclusions and Perspectives}
1086 \small \barrow Three \textcolor{blue}{ new energy consumption and performance} models were proposed for synchronous and asynchronous parallel applications with iterations running over
1087 \textcolor{blue}{homogeneous and heterogeneous clusters and grids}.
1091 \small \barrow \textcolor{blue}{A new objective function} to optimize both the energy consumption and the performance was proposed.
1093 \small \barrow \textcolor{blue}{New online frequency selecting algorithms} for clusters and grids were developed.
1095 \small \barrow The proposed algorithms were applied to the \textcolor{blue}{NAS parallel benchmarks} and \textcolor{blue}{the
1096 Multi-splitting} method.
1098 \small \barrow The proposed algorithms were evaluated over the \textcolor{blue}{SimGrid simulator} and over \textcolor{blue}{Grid'5000 testbed}.
1100 \small \barrow All the proposed methods were compared to either \textcolor{blue}{Rauber and Rünger's method} or \textcolor{blue}{the EDP objective function}.
1108 %%%%%%%%%%%%%%%%%%%%
1110 %%%%%%%%%%%%%%%%%%%%
1111 \begin{frame}{Publications}
1113 \begin{block}{\small Journal Articles }\scriptsize
1114 \begin{enumerate}[$\lbrack$1$\rbrack$]
1116 \item Ahmed Fanfakh, Jean-Claude Charr, Raphaël Couturier, Arnaud Giersch. Optimizing the energy consumption of message passing applications with iterations executed over grids. \textit{Journal of Computational
1119 \item Ahmed Fanfakh, Jean-Claude Charr, Raphaël Couturier, Arnaud Giersch. Energy Consumption Reduction for
1120 Asynchronous Message Passing Applications. \textit{Journal of Supercomputing}, 2016, (Submitted)
1126 \begin{block}{\small Conference Articles }\scriptsize
1128 \begin{enumerate}[$\lbrack$1$\rbrack$]
1130 \item Jean-Claude Charr, Raphaël Couturier, Ahmed Fanfakh, Arnaud Giersch. Dynamic Frequency Scaling for
1131 Energy Consumption Reduction in Distributed MPI Programs. \textit{ISPA 2014}, pp.
1132 225-230. IEEE Computer Society, Milan, Italy (2014).
1134 \item Jean-Claude Charr, Raphaël Couturier, Ahmed Fanfakh, Arnaud Giersch. Energy Consumption Reduction
1135 with DVFS for Message Passing Iterative Applications on Heterogeneous Architectures.
1136 \textit{The 16\textsuperscript{th} PDSEC}. pp. 922--931. IEEE Computer Society, India (2015).
1138 \item Ahmed Fanfakh, Jean-Claude Charr, Raphaël Couturier, Arnaud Giersch. CPUs Energy Consumption
1139 Reduction for Asynchronous Parallel Methods Running over Grids. \textit{The 19\textsuperscript{th} CSE conference}. IEEE Computer Society,
1148 %%%%%%%%%%%%%%%%%%%%
1150 %%%%%%%%%%%%%%%%%%%%
1151 \begin{frame}{Perspectives}
1155 \small \barrow The proposed algorithms should take into consideration the
1156 \textcolor{blue}{variability between some iterations}.
1158 \small \barrow The proposed algorithms should be applied to \textcolor{blue}{other message passing methods with iterations} in order to see how they adapt to the characteristics of these methods.
1160 \small \barrow The proposed algorithms for heterogeneous platforms should be applied to heterogeneous platforms composed of \textcolor{blue}{CPUs and GPUs}.
1162 \small \barrow The results returned by the energy models should be compared to the values given by \textcolor{blue}{real instruments that measure the energy consumption} of CPUs during execution.
1167 %%%%%%%%%%%%%%%%%%%%
1169 %%%%%%%%%%%%%%%%%%%%
1170 \begin{frame}{Fin} \vspace{-10 mm}
1172 \centering \Large \textcolor{blue}{Thank you for listening}
1175 \centering \textcolor{blue}{ {\Large Questions?}}