2 \usepackage{beamerthemefemto}
3 \usepackage[latin1]{inputenc}
4 \usepackage[T1]{fontenc}
5 \DeclareGraphicsExtensions{.jpg, .png , .pdf, .bmp, .pdftex}
6 \usepackage{algorithm,algorithmicx,algpseudocode}
7 \usepackage{graphicx,graphics}
% Reviewer annotation macros: \AG[<todo options>]{<text>} and \JC[<todo options>]{<text>}.
% Each one wraps \todo with a per-author background colour and prefixes the
% note with a bold sans-serif author tag. The optional argument defaults to
% `inline` and is appended to the \todo option list; \xspace is issued after the note.
% NOTE(review): \todo comes from todonotes, which is loaded later in this preamble;
% \xspace presumably comes from the xspace package, whose \usepackage is not visible here -- confirm.
16 \newcommand{\AG}[2][inline]{%
17 \todo[color=green!50,#1]{\sffamily\textbf{AG:} #2}\xspace}
18 \newcommand{\JC}[2][inline]{%
19 \todo[color=red!10,#1]{\sffamily\textbf{JC:} #2}\xspace}
20 \definecolor{myblue}{RGB}{0,29,119}
% \Xsub{<base>}{<label>}: typesets <base> with <label> as an italic
% multi-letter subscript (\mathit keeps the label from being spaced as a
% product of single-letter variables). \ensuremath lets the macro be used in
% text as well as in math mode. The subscript is explicitly braced so the
% result does not depend on how \mathit happens to expand after `_`.
21 \newcommand{\Xsub}[2]{{\ensuremath{#1_{\mathit{#2}}}}}
23 %% used to put some subscripts lower, and make them more legible
% \fxheight{<sub>}: if <sub> is empty, produces nothing; otherwise prepends an
% invisible strut (zero width, 1.52ex high) to <sub>.
% The emptiness test uses \detokenize so that it looks at the whole argument:
% the former `\ifx#1\relax\relax` compared the first two tokens of <sub> with
% each other and therefore discarded indices such as `11` or `ii`.
24 \newcommand{\fxheight}[1]{\if\relax\detokenize{#1}\relax\else\rule{0pt}{1.52ex}#1\fi}
% Notation shorthands, most of them built on \Xsub{<base>}{<label>}.
% - Macros without arguments expand to a single labelled symbol.
% - Macros declared with `[1][]` take an optional index that is appended as a
%   further subscript after the \Xsub group; several of them pass the index
%   through \fxheight, \Fdiff puts a negative thin space (\!) before it.
%   The `_` of that extra subscript sits outside \ensuremath, so these macros
%   have to be used in math mode.
% - \Dist, \MaxDist and \MinTcm use \mathit directly (no \ensuremath) and are
%   therefore math-mode only as well.
26 \newcommand{\CL}{\Xsub{C}{L}}
27 \newcommand{\Dist}{\mathit{Dist}}
28 \newcommand{\EdNew}{\Xsub{E}{dNew}}
29 \newcommand{\Eind}{\Xsub{E}{ind}}
30 \newcommand{\Enorm}{\Xsub{E}{Norm}}
31 \newcommand{\Eoriginal}{\Xsub{E}{Original}}
32 \newcommand{\Ereduced}{\Xsub{E}{Reduced}}
33 \newcommand{\Es}{\Xsub{E}{S}}
34 \newcommand{\Fdiff}[1][]{\Xsub{F}{diff}_{\!#1}}
35 \newcommand{\Fmax}[1][]{\Xsub{F}{max}_{\fxheight{#1}}}
36 \newcommand{\Fnew}{\Xsub{F}{new}}
37 \newcommand{\Vnew}{\Xsub{V}{new}}
38 \newcommand{\Vmax}{\Xsub{V}{max}}
39 \newcommand{\Ileak}{\Xsub{I}{leak}}
40 \newcommand{\Kdesign}{\Xsub{K}{design}}
41 \newcommand{\MaxDist}{\mathit{Max}\Dist}
42 \newcommand{\MinTcm}{\mathit{Min}\Tcm}
43 \newcommand{\Ntrans}{\Xsub{N}{trans}}
44 \newcommand{\Pd}[1][]{\Xsub{P}{d}_{\fxheight{#1}}}
45 \newcommand{\PdNew}{\Xsub{P}{dNew}}
47 \newcommand{\PdOld}{\Xsub{P}{dOld}}
48 \newcommand{\Pnorm}{\Xsub{P}{Norm}}
49 \newcommand{\Tnorm}{\Xsub{T}{Norm}}
50 \newcommand{\Ps}[1][]{\Xsub{P}{s}_{\fxheight{#1}}}
51 \newcommand{\Scp}[1][]{\Xsub{S}{cp}_{#1}}
52 \newcommand{\Sopt}[1][]{\Xsub{S}{opt}_{#1}}
53 \newcommand{\Tcm}[1][]{\Xsub{T}{cm}_{\fxheight{#1}}}
54 \newcommand{\Tcp}[1][]{\Xsub{T}{cp}_{#1}}
55 \newcommand{\TcpOld}[1][]{\Xsub{T}{cpOld}_{#1}}
56 \newcommand{\Tnew}{\Xsub{T}{New}}
57 \newcommand{\Told}{\Xsub{T}{Old}}
58 \newcommand{\Ltcm}[1][]{\Xsub{L}{tcm}_{\fxheight{#1}}}
59 \newcommand{\Etcm}[1][]{\Xsub{E}{tcm}_{\fxheight{#1}}}
60 \newcommand{\Niter}[1][]{\Xsub{N}{iter}_{\fxheight{#1}}}
61 \newcommand{\Pmax}[1][]{\Xsub{P}{max}_{\fxheight{#1}}}
62 \newcommand{\Pidle}[1][]{\Xsub{P}{idle}_{\fxheight{#1}}}
65 \definecolor{myblue}{RGB}{0,29,119}
66 \usepackage[textsize=footnotesize]{todonotes}
% Custom list bullets: each macro expands to an \item whose label is a \ding
% glyph (110, 228 or 227) coloured with myblue. They are meant to be used in
% place of a plain \item inside a list environment.
% NOTE(review): \ding is presumably provided by pifont; its \usepackage is not visible here -- confirm.
67 \newcommand{\bsquare}{\item[\color{myblue}\ding{110}]}
68 \newcommand{\barrow}{\item[\color{myblue}\ding{228}]}
69 \newcommand{\bwarrow}{\item[\color{myblue}\ding{227}]}
70 \DeclareGraphicsExtensions{.jpg, .png , .pdf, .bmp, .pdftex}
74 %\title{Energy Consumption Optimization of Parallel Applications with
75 %Iterations using CPU Frequency Scaling}
78 \title{ \textbf{Energy Consumption Optimization of Parallel Applications with Iterations using CPU Frequency Scaling} \\ \vspace{0.2cm} \hspace{1.8cm}\textbf{\textcolor{cyan}{\small PhD Dissertation Defense}}}\vspace{-0.5cm}
79 \author{ \textbf{Ahmed Badri Muslim Fanfakh} \\ \vspace{0.5cm}\small Under the supervision of: \\ \textcolor{cyan}{\small Raphaël COUTURIER and Jean-Claude CHARR} \\\vspace{0.1cm} \textcolor{blue}{ UBFC - FEMTO-ST - DISC Dept. - AND Team} \\ ~~~~~~~~~~~~~~~~~~~~~ \textbf{\textcolor{blue}{ 17 October 2016 }}}
83 % ____ _____ ____ _ _ _____
84 % | _ \| ____| __ )| | | |_ _|
85 % | | | | _| | _ \| | | | | |
86 % | |_| | |___| |_) | |_| | | |
87 % |____/|_____|____/ \___/ |_|
90 \setbeamertemplate{background}{\titrefemto}
105 \setbeamertemplate{background}{\pagefemto}
106 \begin{frame}{Outline}
108 \setbeamertemplate{section in toc}[sections numbered]
116 \begin{frame}{Introduction and problem definition}
117 \section{\small {Introduction and problem definition}}
118 \bf \textcolor{blue}{To get more computing power:}
119 \begin{minipage}{0.5\textwidth}
120 \textcolor{blue}{1)} \small \bf \textcolor{black}{Increase the frequency of a processor.\\ (limited due to overheating)}
122 \begin{minipage}{0.6\textwidth}
126 \includegraphics[width=0.7\textwidth]{fig/freq-years}
130 \begin{minipage}{0.5\textwidth}
131 \textcolor{blue}{2)} \small \bf \textcolor{black}{Use more nodes.}
133 \textcolor{black}{The supercomputer Tianhe-2 has more than 3 million cores and consumes around 17.8 megawatts.}
136 \begin{minipage}{0.6\textwidth}
138 \includegraphics[width=0.7\textwidth]{fig/clusters}
149 \begin{frame}{Techniques for energy consumption reduction}
151 \textcolor{blue}{1)} \bf \textcolor{black}{Switch-off idle nodes method}
154 \animategraphics[autopause,loop,controls,scale=0.25,buttonsize=0.2cm]{200}{on-off/a-}{0}{69}
155 %\includegraphics[width=0.6\textwidth]{on-off/a-69}
162 \begin{frame}{Techniques for energy consumption reduction}
164 \textcolor{blue}{2)} \bf \textcolor{black}{Dynamic Voltage and Frequency Scaling (DVFS)}
167 \animategraphics[autopause,controls,scale=0.25,buttonsize=0.2cm]{10}{DVFS-meq/a-}{0}{109}
168 %\includegraphics[width=0.6\textwidth]{DVFS-meq/a-109}
177 \begin{frame}{Motivations}
179 \section{\small {Motivations}}
180 \textcolor{blue}{Why we used the DVFS method:}
182 \begin{minipage}{0.5\textwidth}
185 \item \small \textcolor{black}{The CPU is the component that consumes the highest amount of energy in a node\textsuperscript{1}.}
190 \begin{minipage}{0.5\textwidth}
193 \includegraphics[width=0.85\textwidth]{fig/node-power}
198 \begin{itemize} \item \small \textcolor{black}{DVFS reduces the energy consumption while
199 keeping all the nodes working.}
200 \item \small \textcolor{black}{It has a very small overhead compared to switching-off the idle nodes.} \end{itemize}
204 \begin{block}{\textcolor{white}{Challenge and Objective}}
206 \small \textcolor{blue}{Challenge:} \textcolor{black}{DVFS is used to reduce the energy consumption, \textcolor{blue}{but} it also degrades the performance of the CPU.}
209 \small \textcolor{blue}{Objective:} \textcolor{black}{Applying the DVFS to minimize the energy consumption while maintaining the performance of the parallel application.}
212 \tiny \textsuperscript{1} Fan, X., Weber, W., and Barroso, L. A. 2007. Power provisioning
213 for a warehouse-sized computer.
224 \begin{frame}{The first contribution}
226 \section{\small {Energy optimization of a homogeneous platform}}
228 % \includegraphics[width=0.6\textwidth]{white.pdf}
231 \bf \Large \textcolor{blue}{Energy optimization of a parallel application with iterations running over a homogeneous platform}
241 \begin{frame}{Objectives}
243 \begin{itemize} \small \justifying
245 \item Studying the effect of the scaling factor on the \textbf{energy consumption and performance} of parallel applications with iterations. \medskip
247 \item Discovering the \textbf{energy-performance trade-off relation} when changing the frequency of the processor.\medskip
248 \item Proposing an algorithm for selecting the scaling factor that produces \textbf {the optimal trade-off} between the energy consumption and the performance. \medskip
249 \item Comparing the proposed algorithm to existing methods.
252 %\footnote{\tiny Thomas Rauber and Gudula Rünger. Analytical modeling and simulation of the
253 %energy consumption \\ \quad ~ ~\quad of independent tasks. In Proceedings of the Winter Simulation Conference, 2012.} method that our method best on.
255 %\let\thefootnote\relax\footnote{}
267 \begin{frame}{Execution of synchronous parallel tasks}
271 \subfloat[Synchronous imbalanced communications]{%
272 \includegraphics[scale=0.49]{c1/commtasks}\label{fig:h1}}
273 \subfloat[Synchronous imbalanced computations]{%
274 \includegraphics[scale=0.49]{c1/compt}\label{fig:h2}}
275 % \caption{Parallel tasks on homogeneous platform}
287 \begin{frame}{Energy model for a homogeneous platform}
288 The power consumed by a processor is divided into two power metrics: the dynamic (\textcolor{red}{$P_d$}) and static
289 (\textcolor{red}{$P_s$}) power.
292 \textcolor{red}{ P_d} = \textcolor{blue}{\alpha \cdot C_L \cdot V^2 \cdot F}
294 \scriptsize \underline{Where}: \\
295 \scriptsize {\textcolor{blue}{$\alpha$}: switching activity \hspace{15 mm} \textcolor{blue}{$C_L$}: load capacitance\\
296 \textcolor{blue}{$V$}: the supply voltage \hspace{14 mm} \textcolor{blue}{$F$}: operational frequency}
299 \small \textcolor{red}{P_s} = \textcolor{blue}{V \cdot N_{trans} \cdot K_{design} \cdot I_{leak}}
302 \scriptsize{ \textcolor{blue}{$V$}: the supply voltage. \hspace{28 mm} \textcolor{blue}{$N_{trans}$}: number of transistors. \\
303 \textcolor{blue}{$K_{design}$}: design dependent parameter. \hspace{8 mm} \textcolor{blue}{$I_{leak}$}: technology dependent
311 \begin{frame}{Energy model for a homogeneous platform}
313 The frequency scaling factor is the ratio between the maximum and the new frequency, \textcolor{blue}{$S = \frac{F_{max}}{F_{new}}$}. \medskip
317 \begin{block}{\small Rauber and Rünger's energy model}
318 $ E = P_{d} \cdot S_1^{-2} \cdot
319 \left( T_1 + \sum_{i=2}^{N} \frac{T_i^3}{T_1^2} \right) +
320 P_{s} \cdot S_1 \cdot T_1 \cdot N$
322 \textcolor{blue}{$S_1$}: the maximum scaling factor.\\
323 \textcolor{blue}{$P_{d}$}: the dynamic power.\\
324 \textcolor{blue}{$P_{s}$}: the static power.\\
325 \textcolor{blue}{$T_1$}: the execution time of the slowest task.\\
326 \textcolor{blue}{$T_i$}: the execution time of task i.\\
327 \textcolor{blue}{$N$}: the number of nodes.
335 \begin{frame}{Performance evaluation of MPI programs}
338 \begin{block}{\small Execution time prediction model}
339 \centering{ $ \textcolor{red}{T_{new}} = \textcolor{blue}{T_{Max Comp Old} \cdot S + T_{{Min Comm Old}}}$}
342 \centering{\includegraphics[width=.4\textwidth]{c1/cg_per}
344 \includegraphics[width=.4\textwidth]{c1/lu_pre}}
347 \small The maximum normalized error for CG=0.0073 \textbf{(the smallest)} and LU=0.031 \textbf{(the worst)}.
357 \begin{frame}{Performance and energy reduction trade-off}
358 \begin{femtoBlock}{} \vspace{-15 mm}
361 \subfloat[\small Real relation.]{%
362 \includegraphics[width=.43\textwidth]{c1/file3}\label{fig:r2}}
364 \subfloat[\small Converted relation.]{%
365 \includegraphics[width=.43\textwidth]{c1/file}\label{fig:r1}}%
367 % \caption{The energy and performance relation}
370 Where:~~~ $\textcolor{blue}{Performance} = execution~time^{-1}$
374 \begin{block}{\small Our objective function}
375 \centering{$\textbf{\emph {\textcolor{red}{MaxDist}}} = \max_{j=1,2,\dots ,F}
376 (\overbrace{P_{Norm}(S_j)}^{{\textcolor{blue}{Maximize}}} -
377 \overbrace{E_{Norm}(S_j)}^{{\textcolor{blue}{Minimize}}} )$}
387 \begin{frame}{Scaling factor selection algorithm}
390 \includegraphics[width=.56 \textwidth]{c1/algo-homo}
399 \begin{frame}{Scaling algorithm example}
403 \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{dvfs-homo/a-}{0}{159}
404 %\includegraphics[width=0.6\textwidth]{dvfs-homo/a-159}
411 \begin{frame}{Experimental results }
415 \item The experiments were executed on the simulator SimGrid/SMPI v3.10.\medskip
416 \item The proposed algorithm was applied to the NAS parallel benchmarks.\medskip
417 \item Each node in the cluster has 18 frequency values from \textbf{2.5\,GHz} to \textbf{800\,MHz}.\medskip
418 \item The proposed algorithm was evaluated over the A, B and C classes of the benchmarks using 4, 8 or 9 and 16 nodes respectively. \medskip
419 \item $P_d=20\,\mathrm{W}$, $P_s=4\,\mathrm{W}$.
428 \begin{frame}{Experimental results}
431 \includegraphics[width=.35\textwidth]{c1/ep}
432 \includegraphics[width=.35\textwidth]{c1/cg}
433 \includegraphics[width=.35\textwidth]{c1/bt}}
435 \centering {\includegraphics[width=.55\textwidth]{c1/results.pdf}}
443 \begin{frame}{Results comparison}
444 \begin{block}{\small Rauber and Rünger's optimal scaling factor}
445 $S_{opt} = \sqrt[3]{\frac{2}{N} \cdot \frac{P_{dyn}}{P_{static}} \cdot
446 \left( 1 + \sum_{i=2}^{N} \frac{T_i^3}{T_1^3}\right) } $
451 %\includegraphics[width=.33\textwidth]{c1/c1.pdf}
453 %\includegraphics[width=.33\textwidth]{c1/c2.pdf}}
456 \includegraphics[width=.55\textwidth]{c1/compare-c.pdf}}
464 \begin{frame}{The proposed new energy model}
467 \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{homo-model/a-}{0}{356}
468 %\includegraphics[width=0.6\textwidth]{homo-model/a-356}
476 \begin{frame}{\large Comparing the new model with Rauber's model }
479 \includegraphics[width=.45\textwidth]{c1/energy_con}
481 \includegraphics[width=.5\textwidth]{c1/compare-scales}
487 % \begin{frame}{Summary}
488 % \begin{femtoBlock}{}
491 %\item We have presented a new online scaling factor selection method that \textcolor{blue}{optimizes simultaneously the energy and performance}.\medskip
492 % \item It predicts \textcolor{blue}{ the energy consumption and the performance} of the parallel applications. \medskip
493 %\item Our algorithm \textcolor{blue}{saves more energy} when the communication and the other slacks times are big. \medskip
494 %\item It gives the \textcolor{blue}{best trade-off between energy reduction and
495 % performance}. \medskip
496 %\item Our method \ \textcolor{blue}{outperforms Rauber and Rünger's method} in terms of energy-performance ratio.
497 %\item The proposed new energy model is \textcolor{blue}{more accurate} than Rauber's energy model.
509 \begin{frame}{The second contribution}
511 \section{\small {Energy optimization of a heterogeneous platform}}
515 \bf \Large \textcolor{blue}{Energy optimization of a parallel application with iterations running over a heterogeneous platform}
525 \begin{frame}{Objectives}
526 \begin{femtoBlock}{} \vspace{-12 mm}
527 \begin{itemize} \small
528 \item Proposing \textcolor{blue}{new energy and performance models} for message passing applications with iterations running
529 over a heterogeneous platform (cluster or Grid). \medskip
530 \item Studying the effect of the scaling factor $S$ on both the \textcolor{blue}{energy consumption and the performance} of
531 message passing iterative applications. \medskip
533 \item Computing the vector of scaling factors ($S_1, S_2, \dots, S_n$) producing \textcolor{blue}{the optimal trade-off} between
534 the energy consumption and the performance.
545 \begin{frame}{The execution time model}
549 \includegraphics[scale=0.5]{c2/commtasks}
555 \begin{block}{\small The execution time prediction model}
558 \small\textcolor{red}{ T_{new}} = \textcolor{blue}{\max_{i=1,2,\dots,N} (\TcpOld[i] \cdot S_{i}) + \min_{i=1,2,\dots,N} (\Tcm[i])}
561 \small Where: $ \textcolor{red}{\Tcm} = \textcolor{blue}{communication~times + slack~times}$
568 %\begin{frame}{The energy consumption model}
569 % The overall energy consumption of a message passing synchronous application executed over
570 % a heterogeneous platform can be computed as follows:
573 % \textcolor{red}{E} = \textcolor{blue}{\sum_{i=1}^{N} {(S_i^{-2} \cdot Pd_i \cdot Tcp_i)}} + {} \\
574 % \textcolor{blue}{\sum_{i=1}^{N} (Ps_i \cdot (\max_{i=1,2,\dots,N} (Tcp_i \cdot S_{i}) + {\min_{i=1,2,\dots,N} (Tcm_i))}}
577 % \underline{where}:\\
578 % \textcolor{blue}{N} : is the number of nodes.
585 \begin{frame}{The energy model for heterogeneous cluster}
588 \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{heter-model/a-}{0}{272}
589 %\includegraphics[width=0.6\textwidth]{heter-model/a-272}
599 %\begin{frame}{The trade-off between energy and performance}
602 % \centering{ \includegraphics[width=.4\textwidth]{c2/heter}}
605 % \textcolor{red}{\underline{Step1}}: computing the normalized energy \textcolor{blue}%{$E_{norm} = \frac{E_{reduced}}
607 % \textcolor{red}{\underline{Step2}}: computing the normalized performance \textcolor{blue}{$P_{norm} = \frac{T_{Max}}{T_{new}}$}.
609 % \begin{block}{\small The tradeoff model}
612 % \textcolor{red}{MaxDist} =
613 % \mathop {\max_{i=1,\dots F}}_{j=1,\dots,N}
614 % (\overbrace{P_{norm}(S_{ij})}^{\text{\textcolor{blue}{Maximize}}} -
615 % \overbrace{E_{norm}(S_{ij})}^{\text{\textcolor{blue}{Minimize}}} )
624 \begin{frame}{The scaling algorithm for heter. cluster}
627 \includegraphics[width=.52\textwidth]{algo-heter}
634 \begin{frame}{The scaling algorithm example}
639 \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{dvfs-heter/a-}{0}{650}
640 % \includegraphics[width=0.6\textwidth]{dvfs-heter/a-650}
650 %\begin{frame}{Experiments over a heterogeneous cluster }
653 % \item The experiments were executed on the simulator SimGrid/SMPI v3.10.\medskip
654 % \item The scaling algorithm was applied to the NAS parallel benchmarks class C.\medskip
655 % \item Four types of processors with different computing powers were used.\medskip
656 % \item The benchmarks were executed with different number of nodes ranging from 4 to 144 nodes.\medskip
657 % \item It was assumed that the total power consumption of the CPU consist of 80\% dynamic power and 20\% static power.
668 \begin{frame}{The simulation results}
672 \includegraphics[width=0.8\textwidth]{c2/energy_saving.pdf}
674 \textcolor{blue}{On average, it reduces the energy consumption by \textcolor{red}{29\%}
675 for the class C of the NAS Benchmarks executed over 8 nodes}
685 \begin{frame}{The simulation results}
690 \includegraphics[width=.8\textwidth]{c2/perf_degra.pdf}
692 \textcolor{blue}{On average, it degrades the performance
693 of the NAS Benchmarks class C executed over 8 nodes by \textcolor{red}{3.8\%}}
702 \begin{frame}{The results of the three power scenarios}
706 \includegraphics[width=.55\textwidth]{c2/three_power.pdf}
708 \includegraphics[width=.55\textwidth]{c2/three_scenarios.pdf}
717 \begin{frame}{Comparing the objective function to EDP}
719 EDP (energy-delay product) is the product of the energy consumption and the delay.
723 \includegraphics[width=.55\textwidth]{c2/avg_compare.pdf}
725 \includegraphics[width=.55\textwidth]{c2/compare_with_EDP.pdf}
735 %\begin{frame}{Energy optimization of grid platform}
738 % \includegraphics[width=.6\textwidth]{c2/grid5000.pdf}
740 % \small 10 sites distributed over France and Luxembourg
748 \begin{frame}{The grid architecture}
750 \includegraphics[width=.8\textwidth]{c2/init_freq.pdf}
753 %\begin{frame}{Performance, Energy and trade-off models} \small
754 %\begin{block}{\small The performance model of grid}
757 %\Tnew = \mathop{\max_{i=1,\dots N}}_{j=1,\dots,M_i}({\TcpOld[ij]} \cdot S_{ij})
758 % +\mathop{\min_{j=1,\dots,M_h}} (\Tcm[hj])
763 %\begin{block}{\small The energy model of grid}\small
766 %E = \sum_{i=1}^{N} \sum_{j=1}^{M_i} {(S_{ij}^{-2} \cdot \Pd[ij] \cdot \Tcp[ij])} +
767 % \sum_{i=1}^{N} \sum_{j=1}^{M_i} (\Ps[ij] \cdot \Tnew)
771 %\begin{block}{\small The trade-off model of grid}
776 %\mathop{ \mathop{\max_{i=1,\dots N}}_{j=1,\dots,M_i}}_{k=1,\dots,F_j}
777 % (\overbrace{\Pnorm(S_{ijk})}^{\text{Maximize}} -
778 % \overbrace{\Enorm(S_{ijk})}^{\text{Minimize}} )
790 \begin{frame}{Experiments over Grid'5000}
792 \textcolor{blue}{The experiments were conducted using three
793 clusters distributed over one or two sites.}
796 \includegraphics[width=.5\textwidth]{c2/grid5000-2.pdf}
799 \textcolor{blue}{Grid'5000 power measurement tools were used.}
802 \includegraphics[width=.5\textwidth]{c2/power_consumption.pdf}
814 \begin{frame}{Experiments over Grid'5000}
816 \begin{minipage}{0.4\textwidth}
817 %\textcolor{blue}{Execution the NAS class D on 16 nodes saves the energy by
818 %\textcolor{red}{30\%}}
819 \small \textcolor{blue}{The average energy saving = \textcolor{red}{30\%}}
821 \begin{minipage}{0.55\textwidth}
823 \includegraphics[width=0.83 \textwidth]{c2/eng_s.eps}
827 \begin{minipage}{0.4\textwidth}
828 %\textcolor{blue}{Execution the NAS class D on 16 nodes degrades the
829 %performance by \textcolor{red}{3.2\%}}
830 \small \textcolor{blue}{The average performance degradation = \textcolor{red}{3.2\%}}
832 \begin{minipage}{0.55\textwidth}
834 \includegraphics[width=.83\textwidth]{c2/per_d.eps}
844 \begin{frame}{Experiments over Grid'5000}
845 \textcolor{blue}{One core and Multi-cores per node results:}
848 \includegraphics[width=.48\textwidth]{c2/eng_s_mc.eps}
850 \includegraphics[width=.48\textwidth]{c2/per_d_mc.eps}
853 \centering \small \textcolor{blue}{Using the multi-core per node scenario decreases the computation to communication ratio.}
858 %\begin{frame}{Summary}
861 % \item Two scaling algorithms were applied to \textcolor{blue}{heterogeneous cluster} and \textcolor{blue}{grid}.
862 % \item New \textcolor{blue}{energy} and \textcolor{blue}{performance} models were proposed.
863 % \item The experiments were conducted over the \textcolor{blue}{SimGrid} simulator and the real
864 %test-bed \textcolor{blue}{Grid'5000}.
866 %\item The algorithm saves the energy by \textcolor{blue}{29\%} and only
867 % degrades the performance by \textcolor{blue}{3.8\%} for simulated heterogeneous
870 %\item The algorithm saves the energy by \textcolor{blue}{30\%} and only
871 % degrades the performance by \textcolor{blue}{3.2\%} for Grid'5000 results.
873 % \item The proposed method \textcolor{blue}{outperforms the EDP method} in terms of energy-performance ratio.
881 \begin{frame}{The third contribution}
882 \section{\small {Energy optimization of asynchronous applications}}
884 \bf \Large \textcolor{blue}{Energy optimization of asynchronous iterative message passing applications}
893 \begin{frame}{Problem definition}\vspace{0.8 mm}
894 \textcolor{blue}{The execution of a synchronous parallel iterative application over a grid }
897 \animategraphics[autopause,controls,scale=0.25,buttonsize=0.2cm]{10}{syn/a-}{0}{503}
898 %\includegraphics[width=0.6\textwidth]{syn/a-503}
907 \begin{frame}{Problem definition}\vspace{0.8 mm}
908 \textcolor{blue}{The execution of an asynchronous parallel iterative application over a grid }
911 \animategraphics[autopause,controls,scale=0.25,buttonsize=0.2cm]{10}{asyn/a-}{0}{440}
912 %\includegraphics[width=0.6\textwidth]{asyn/a-440}
921 \begin{frame}{Solution}\vspace{0.8mm}
922 \textcolor{blue}{Using asynchronous communications with DVFS }
925 \animategraphics[autopause,controls,scale=0.25,buttonsize=0.2cm]{10}{asyn+dvfs/a-}{0}{314}
926 %\includegraphics[width=0.6\textwidth]{asyn+dvfs/a-314}
936 %\begin{frame}{The performance models}
938 %\begin{block}{\small The performance model of Asynch. Applications}\small
940 %\label{eq:asyn_time}
941 %\Tnew = \frac{\sum_{i=1}^{N} \sum_{j=1}^{M_i}({\TcpOld[ij]} \cdot S_{ij})} {N \cdot M_i }
946 %\begin{block}{\small The performance model of Hybrid Applications}\small
948 %\label{eq:asyn_perf}
949 %\Tnew = \frac{\sum_{i=1}^{N} (\max_{j=1,\dots, M_i} ({\TcpOld[ij]} \cdot S_{ij}) +
950 %\min_{j=1,\dots,M_i} ({\Ltcm[ij]}))}{N}
962 %\begin{frame}{The energy consumption models}
964 %\begin{block}{\small The energy model of Asynch. Applications}\small
966 %\label{eq:asyn_energy1}
967 % E = \sum_{i=1}^{N} \sum_{j=1}^{M_i} {(S_{ij}^{-2} \cdot \Tcp[ij] \cdot (\Pd[ij]+\Ps[ij]) )}
972 %\begin{block}{\small The energy model of Hybrid Applications}\small
974 %\label{eq:asyn_energy}
975 %E = \sum_{i=1}^{N} \sum_{j=1}^{M_i} {(S_{ij}^{-2} \cdot \Pd[ij] \cdot \Tcp[ij])} + \sum_{i=1}^{N} \sum_{j=1}^{M_i} (\Ps[ij] \cdot \\
976 % ( \mathop{\max_{j=1,\dots,M_i}} ({\Tcp[ij]} \cdot S_{ij}) + \mathop{\min_{j=1,\dots,M_i}} ({\Ltcm[ij]})))
986 \begin{frame}{The performance and the energy models }
989 \includegraphics[width=0.9\textwidth]{syn-vs-asyn.pdf}
999 \begin{frame}{The scaling algorithm for Asynch. applications}
1002 \includegraphics[width=0.55\textwidth]{algo-hybrid.pdf}
1007 %%%%%%%%%%%%%%%%%%%%
1009 %%%%%%%%%%%%%%%%%%%%
1010 \begin{frame}{The experiments}
1015 \item The architecture of the grid:
1017 \includegraphics[width=0.5\textwidth]{c3/hybrid-model.pdf}
1021 \item Applying the proposed algorithm to the asynchronous iterative message passing multi-splitting method.
1022 \item Evaluating the application over the simulator and Grid'5000.
1028 %%%%%%%%%%%%%%%%%%%%
1030 %%%%%%%%%%%%%%%%%%%%
1031 \begin{frame}{The simulation results}
1032 \centering \small \textcolor{blue}{The best scenario in terms of energy and performance is the Async. MS with Sync. DVFS}
1035 \includegraphics[scale=0.42]{c3/energy_saving.eps}
1037 \centering The average energy saving = \textcolor{red}{22\%}
1042 %%%%%%%%%%%%%%%%%%%%
1044 %%%%%%%%%%%%%%%%%%%%
1045 \begin{frame}{The simulation results}
1048 \includegraphics[scale=0.42]{c3/perf_degra.eps}
1050 \centering The average speed-up = \textcolor{red}{5.72\%}
1055 %%%%%%%%%%%%%%%%%%%%
1057 %%%%%%%%%%%%%%%%%%%%
1058 \begin{frame}{The Grid'5000 results}
1063 \includegraphics[width=0.53\textwidth]{c3/energy-s-compare.eps}
1064 \includegraphics[width=0.53\textwidth]{c3/perf-deg-compare.eps}
1067 \centering \footnotesize
1068 The average energy saving = \textcolor{red}{26.93\%}, the average speed-up = \textcolor{red}{21.48\%}
1072 %%%%%%%%%%%%%%%%%%%%
1074 %%%%%%%%%%%%%%%%%%%%
1075 \begin{frame}{The comparison results}
1077 \includegraphics[width=.5\textwidth]{c3/compare.eps}
1079 \includegraphics[width=.5\textwidth]{c3/compare_scales.eps}
1085 %%%%%%%%%%%%%%%%%%%%
1087 %%%%%%%%%%%%%%%%%%%%
1088 \begin{frame}{Conclusions}
1089 \section{Conclusions and Perspectives}
1092 \small \barrow Three \textcolor{blue}{ new energy consumption and performance} models were proposed for synchronous or asynchronous parallel applications with iterations running over
1093 \textcolor{blue}{homogeneous and heterogeneous clusters or grids}.
1097 \small \barrow \textcolor{blue}{A new objective function} to optimize both the energy consumption and the performance was proposed.
1099 \small \barrow \textcolor{blue}{New online frequency selection algorithms} for clusters and grids were developed.
1101 \small \barrow The proposed algorithms were applied to the \textcolor{blue}{NAS parallel benchmarks} and \textcolor{blue}{the
1102 Multi-splitting} method.
1104 \small \barrow The proposed algorithms were evaluated over the \textcolor{blue}{SimGrid simulator} and over the \textcolor{blue}{Grid'5000 testbed}.
1106 \small \barrow All the proposed methods were compared to either \textcolor{blue}{Rauber and Rünger's method} or to the \textcolor{blue}{EDP objective function}.
1114 %%%%%%%%%%%%%%%%%%%%
1116 %%%%%%%%%%%%%%%%%%%%
1117 \begin{frame}{Publications}
1119 \begin{block}{\small Journal Articles }\scriptsize
1120 \begin{enumerate}[$\lbrack$1$\rbrack$]
1122 \item Ahmed Fanfakh, Jean-Claude Charr, Raphaël Couturier, Arnaud Giersch. Optimizing the energy consumption of message passing applications with iterations executed over grids. \textit{Journal of Computational
1125 \item Ahmed Fanfakh, Jean-Claude Charr, Raphaël Couturier, Arnaud Giersch. Energy Consumption Reduction for
1126 Asynchronous Message Passing Applications. \textit{Journal of Supercomputing}, 2016, (Submitted)
1132 \begin{block}{\small Conference Articles }\scriptsize
1134 \begin{enumerate}[$\lbrack$1$\rbrack$]
1136 \item Jean-Claude Charr, Raphaël Couturier, Ahmed Fanfakh, Arnaud Giersch. Dynamic Frequency Scaling for
1137 Energy Consumption Reduction in Distributed MPI Programs. \textit{ISPA 2014}, pp.
1138 225-230. IEEE Computer Society, Milan, Italy (2014).
1140 \item Jean-Claude Charr, Raphaël Couturier, Ahmed Fanfakh, Arnaud Giersch. Energy Consumption Reduction
1141 with DVFS for Message Passing Iterative Applications on Heterogeneous Architectures.
1142 \textit{The 16\textsuperscript{th} PDSEC}. pp. 922-931. IEEE Computer Society, India (2015).
1144 \item Ahmed Fanfakh, Jean-Claude Charr, Raphaël Couturier, Arnaud Giersch. CPUs Energy Consumption
1145 Reduction for Asynchronous Parallel Methods Running over Grids. \textit{The 19\textsuperscript{th} CSE conference}. IEEE Computer Society,
1154 %%%%%%%%%%%%%%%%%%%%
1156 %%%%%%%%%%%%%%%%%%%%
1157 \begin{frame}{Perspectives}
1161 \small \barrow The proposed algorithms should take into consideration the
1162 \textcolor{blue}{variability between some iterations}.
1164 \small \barrow The proposed algorithms should be applied to \textcolor{blue}{other message passing methods with iterations} in order to see how they adapt to the characteristics of these methods.
1166 \small \barrow The proposed algorithms for heterogeneous platforms should be applied to heterogeneous platforms composed of \textcolor{blue}{CPUs and GPUs}.
1168 \small \barrow The results returned by the energy models should be compared to the values given by \textcolor{blue}{real instruments that measure the energy consumption} of CPUs during the execution.
1173 %%%%%%%%%%%%%%%%%%%%
1175 %%%%%%%%%%%%%%%%%%%%
1176 \begin{frame}{Fin} \vspace{-10 mm}
1178 \centering \Large \textcolor{blue}{Thank you for your attention}
1181 \centering \textcolor{blue}{ {\Large Questions?}}