2 \usepackage{beamerthemefemto}
3 \usepackage[latin1]{inputenc}
4 \usepackage[T1]{fontenc}
5 \DeclareGraphicsExtensions{.jpg, .png , .pdf, .bmp, .pdftex}
6 \usepackage{algorithm,algorithmicx,algpseudocode}
7 \usepackage{graphicx,graphics}
% Review-note macros: \AG[<todo options>]{<text>} and \JC[<todo options>]{<text>}.
% Each wraps \todo with a fixed background color and a bold sans-serif author tag
% in front of <text>.  The optional argument defaults to `inline' and is passed
% to \todo after the color key.
% NOTE(review): \todo is provided by todonotes, which is loaded further down the
% preamble (fine, because these bodies are only expanded when the macros are used).
% \xspace needs the xspace package, which is not loaded in the visible lines -- confirm.
16 \newcommand{\AG}[2][inline]{%
17 \todo[color=green!50,#1]{\sffamily\textbf{AG:} #2}\xspace}
18 \newcommand{\JC}[2][inline]{%
19 \todo[color=red!10,#1]{\sffamily\textbf{JC:} #2}\xspace}
% Dark blue used by the custom bullet macros (\bsquare, \barrow, \bwarrow).
% The same definition is repeated verbatim later in the preamble.
20 \definecolor{myblue}{RGB}{0,29,119}
% \Xsub{<base>}{<label>}: typesets <base> with <label> as an italic, word-like
% subscript (\mathit keeps a multi-letter label from being spaced like a product
% of variables).  \ensuremath makes the macro usable in text and in math mode;
% the outer braces make the result a single atom, so callers may attach a
% further subscript after it.
% The subscript is braced explicitly: the unbraced form `_\mathit{...}' only
% works because \mathit happens to open a group when it is expanded.
21 \newcommand{\Xsub}[2]{{\ensuremath{#1_{\mathit{#2}}}}}
23 %% used to put some subscripts lower, and make them more legible
%% \fxheight{<index>}: if <index> is empty, expands to nothing; otherwise emits a
%% zero-width rule of height 1.52ex followed by <index>.
%% Emptiness is tested on the detokenized argument.  The previous test,
%% \ifx#1\relax, compared the first two tokens of <index>, so an index whose
%% first two tokens are equal (e.g. `ii' or `11') was taken for empty and dropped.
%% \detokenize requires an e-TeX engine.
24 \newcommand{\fxheight}[1]{\if\relax\detokenize{#1}\relax\else\rule{0pt}{1.52ex}#1\fi}
% Notation macros for the symbols used in the energy and performance formulas.
% - Macros without an argument expand to \Xsub{<letter>}{<label>}: a base letter
%   with an italic word subscript.
% - Macros declared with [1][] take an optional index that is set as a second
%   subscript after the \Xsub group.  That `_' sits outside \Xsub's \ensuremath,
%   so these macros must be used in math mode; the same holds for \Dist,
%   \MaxDist and \MinTcm, which call \mathit directly.
% - Most indexed macros pass the index through \fxheight; \Scp, \Sopt, \Tcp and
%   \TcpOld pass it unchanged, and \Fdiff puts a negative thin space before it.
%   NOTE(review): unclear whether that difference is intentional -- confirm.
26 \newcommand{\CL}{\Xsub{C}{L}}
27 \newcommand{\Dist}{\mathit{Dist}}
28 \newcommand{\EdNew}{\Xsub{E}{dNew}}
29 \newcommand{\Eind}{\Xsub{E}{ind}}
30 \newcommand{\Enorm}{\Xsub{E}{Norm}}
31 \newcommand{\Eoriginal}{\Xsub{E}{Original}}
32 \newcommand{\Ereduced}{\Xsub{E}{Reduced}}
33 \newcommand{\Es}{\Xsub{E}{S}}
34 \newcommand{\Fdiff}[1][]{\Xsub{F}{diff}_{\!#1}}
35 \newcommand{\Fmax}[1][]{\Xsub{F}{max}_{\fxheight{#1}}}
36 \newcommand{\Fnew}{\Xsub{F}{new}}
37 \newcommand{\Vnew}{\Xsub{V}{new}}
38 \newcommand{\Vmax}{\Xsub{V}{max}}
39 \newcommand{\Ileak}{\Xsub{I}{leak}}
40 \newcommand{\Kdesign}{\Xsub{K}{design}}
% \MaxDist and \MinTcm prefix an italic word to \Dist / \Tcm (the latter called
% without an index).
41 \newcommand{\MaxDist}{\mathit{Max}\Dist}
42 \newcommand{\MinTcm}{\mathit{Min}\Tcm}
43 \newcommand{\Ntrans}{\Xsub{N}{trans}}
44 \newcommand{\Pd}[1][]{\Xsub{P}{d}_{\fxheight{#1}}}
45 \newcommand{\PdNew}{\Xsub{P}{dNew}}
47 \newcommand{\PdOld}{\Xsub{P}{dOld}}
48 \newcommand{\Pnorm}{\Xsub{P}{Norm}}
49 \newcommand{\Tnorm}{\Xsub{T}{Norm}}
50 \newcommand{\Ps}[1][]{\Xsub{P}{s}_{\fxheight{#1}}}
51 \newcommand{\Scp}[1][]{\Xsub{S}{cp}_{#1}}
52 \newcommand{\Sopt}[1][]{\Xsub{S}{opt}_{#1}}
53 \newcommand{\Tcm}[1][]{\Xsub{T}{cm}_{\fxheight{#1}}}
54 \newcommand{\Tcp}[1][]{\Xsub{T}{cp}_{#1}}
55 \newcommand{\TcpOld}[1][]{\Xsub{T}{cpOld}_{#1}}
56 \newcommand{\Tnew}{\Xsub{T}{New}}
57 \newcommand{\Told}{\Xsub{T}{Old}}
58 \newcommand{\Ltcm}[1][]{\Xsub{L}{tcm}_{\fxheight{#1}}}
59 \newcommand{\Etcm}[1][]{\Xsub{E}{tcm}_{\fxheight{#1}}}
60 \newcommand{\Niter}[1][]{\Xsub{N}{iter}_{\fxheight{#1}}}
61 \newcommand{\Pmax}[1][]{\Xsub{P}{max}_{\fxheight{#1}}}
62 \newcommand{\Pidle}[1][]{\Xsub{P}{idle}_{\fxheight{#1}}}
% Dark blue used for the custom list bullets below.
65 \definecolor{myblue}{RGB}{0,29,119}
% todonotes provides \todo, which the \AG and \JC review-note macros call.
66 \usepackage[textsize=footnotesize]{todonotes}
% \mybluebullet{<n>}: starts a list item whose label is dingbat number <n>,
% drawn in myblue.
% NOTE(review): \ding is presumably provided by pifont, which is not loaded in
% the visible lines -- confirm.
\newcommand{\mybluebullet}[1]{\item[\color{myblue}\ding{#1}]}
% Bullet shortcuts; each one stands in for \item inside a list.
67 \newcommand{\bsquare}{\mybluebullet{110}}
68 \newcommand{\barrow}{\mybluebullet{228}}
69 \newcommand{\bwarrow}{\mybluebullet{227}}
% Extensions tried, in this order, when \includegraphics is given a file name
% without one.
70 \DeclareGraphicsExtensions{.jpg, .png , .pdf, .bmp, .pdftex}
74 %\title{Energy Consumption Optimization of Parallel Applications with
75 %Iterations using CPU Frequency Scaling}
78 \title{ \textbf{Energy Consumption Optimization of Parallel Applications with Iterations using CPU Frequency Scaling} \\ \vspace{0.2cm} \hspace{1.8cm}\textbf{\textcolor{cyan}{\small PhD Dissertation Defense}}}\vspace{-0.5cm}
79 \author{ \textbf{Ahmed Badri Muslim Fanfakh} \\ \vspace{0.5cm}\small Under the supervision of: \\ \textcolor{cyan}{\small Raphaël COUTURIER and Jean-Claude CHARR} \\\vspace{0.1cm} \textcolor{blue}{ UBFC - FEMTO-ST - DISC Dept. - AND Team} \\ ~~~~~~~~~~~~~~~~~~~~~ \textbf{\textcolor{blue}{ 17 October 2016 }}}
83 % ____ _____ ____ _ _ _____
84 % | _ \| ____| __ )| | | |_ _|
85 % | | | | _| | _ \| | | | | |
86 % | |_| | |___| |_) | |_| | | |
87 % |____/|_____|____/ \___/ |_|
90 \setbeamertemplate{background}{\titrefemto}
105 \setbeamertemplate{background}{\pagefemto}
106 \begin{frame}{Outline}
108 \setbeamertemplate{section in toc}[sections numbered]
115 \begin{frame}{Introduction and problem definition}
116 \section{\small {Introduction and Problem definition}}
118 \includegraphics[width=0.99\textwidth]{para.pdf}
128 \begin{frame}{Execution of synchronous parallel tasks}
132 \subfloat[Synchronous imbalanced communications]{%
133 \includegraphics[scale=0.49]{c1/commtasks}\label{fig:h1}}
134 \subfloat[Synchronous imbalanced computations]{%
135 \includegraphics[scale=0.49]{c1/compt}\label{fig:h2}}
136 % \caption{Parallel tasks on homogeneous platform}
148 \begin{frame}{\large Synchronous and asynchronous iterative methods }
152 \includegraphics[scale=0.42]{syn_tasks.pdf}
154 \includegraphics[scale=0.42]{Asyn_tasks.pdf}
163 \begin{frame}{Approaches to get more computing power}
165 %\bf \textcolor{blue}{}
166 \begin{minipage}{0.5\textwidth}
167 \textcolor{blue}{1)} \small \bf \textcolor{black}{Increase the frequency of a processor.\\ (limited due to overheating)}
169 \begin{minipage}{0.6\textwidth}
173 \includegraphics[width=0.7\textwidth]{fig/freq-years}
177 \begin{minipage}{0.5\textwidth}
178 \textcolor{blue}{2)} \small \bf \textcolor{black}{Increase the number of nodes.}
180 \textcolor{black}{The supercomputer Tianhe-2 has more than 3 million cores and consumes around 17.8 megawatts.}
183 \begin{minipage}{0.6\textwidth}
185 \includegraphics[width=0.7\textwidth]{fig/clusters}
195 \begin{frame}{Techniques for energy consumption reduction}
197 \textcolor{blue}{1)} \bf \textcolor{black}{Switch-off idle nodes method}
200 \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{200}{on-off/a-}{0}{111}
201 %\includegraphics[width=0.6\textwidth]{on-off/a-69}
208 \begin{frame}{Techniques for energy consumption reduction}
210 \textcolor{blue}{2)} \bf \textcolor{black}{Dynamic Voltage and Frequency Scaling (DVFS)}
213 \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{10}{DVFS-meq/a-}{0}{175}
214 %\includegraphics[width=0.6\textwidth]{DVFS-meq/a-109}
224 \begin{frame}{Motivations}
226 \section{\small {Motivations}}
227 \textcolor{blue}{Why we used the DVFS method:}
229 \begin{minipage}{0.5\textwidth}
232 \item \small \textcolor{black}{ The CPU is the component that consumes the highest amount of energy in a node\textsuperscript{1}. }
237 \begin{minipage}{0.5\textwidth}
240 \includegraphics[width=0.85\textwidth]{fig/node-power}
245 \begin{itemize} \item \small \textcolor{black}{DVFS reduces the energy consumption while
246 keeping all the nodes working.}
247 \item \small \textcolor{black}{It has a very small overhead compared to switching off the idle nodes.} \end{itemize}
251 \begin{block}{\textcolor{white}{Challenge and Objective}}
253 \small \textcolor{blue}{Challenge:} \textcolor{black}{DVFS is used to reduce the energy consumption, \textcolor{blue}{but} it also degrades the performance of the CPU.}
256 \small \textcolor{blue}{Objective:} \textcolor{black}{Applying the DVFS to minimize the energy consumption while maintaining the performance of the parallel application.}
259 \tiny \textsuperscript{1} Fan, X., Weber, W., and Barroso, L. A. 2007. Power provisioning
260 for a warehouse-sized computer.
271 \begin{frame}{The first contribution}
273 \section{\small {Energy optimization of a homogeneous platform}}
275 % \includegraphics[width=0.6\textwidth]{white.pdf}
278 \bf \Large \textcolor{blue}{Energy optimization of a parallel application with iterations running over a homogeneous platform}
288 \begin{frame}{Objectives}
290 \begin{itemize} \small \justifying
292 \item Studying the effect of the scaling factor on the \textbf{energy consumption and performance} of parallel applications with iterations. \medskip
294 \item Discovering the \textbf{energy-performance trade-off relation} when changing the frequency of the processor.\medskip
295 \item Proposing an algorithm for selecting the scaling factor that produces \textbf {the optimal trade-off} between the energy consumption and the performance. \medskip
296 \item Comparing the proposed algorithm to existing methods.
299 %\footnote{\tiny Thomas Rauber and Gudula Rünger. Analytical modeling and simulation of the
300 %energy consumption \\ \quad ~ ~\quad of independent tasks. In Proceedings of the Winter Simulation Conference, 2012.} method that our method best on.
302 %\let\thefootnote\relax\footnote{}
317 \begin{frame}{Energy model for a homogeneous platform}
318 The power consumed by a processor is divided into two power metrics: the dynamic (\textcolor{red}{$P_d$}) and the static
319 (\textcolor{red}{$P_s$}) power.
322 \textcolor{red}{ P_d} = \textcolor{blue}{\alpha \cdot C_L \cdot V^2 \cdot F}
324 \scriptsize \underline{Where}: \\
325 \scriptsize {\textcolor{blue}{$\alpha$}: switching activity \hspace{15 mm} \textcolor{blue}{$C_L$}: load capacitance\\
326 \textcolor{blue}{$V$}: the supply voltage \hspace{14 mm} \textcolor{blue}{$F$}: operational frequency}
329 \small \textcolor{red}{P_s} = \textcolor{blue}{V \cdot N_{trans} \cdot K_{design} \cdot I_{leak}}
332 \scriptsize{ \textcolor{blue}{$V$}: the supply voltage. \hspace{28 mm} \textcolor{blue}{$N_{trans}$}: number of transistors. \\
333 \textcolor{blue}{$K_{design}$}: design dependent parameter. \hspace{8 mm} \textcolor{blue}{$I_{leak}$}: technology dependent
336 The frequency scaling factor is the ratio between the maximum and the new frequency, \textcolor{blue}{$S = \frac{F_{max}}{F_{new}}$}.
343 \begin{frame}{Energy model for a homogeneous platform}
346 \animategraphics[autopause,controls,scale=0.3,buttonsize=0.2cm]{10}{homo-model/a-}{0}{441}
347 %\includegraphics[width=0.6\textwidth]{homo-model/a-356}
350 % \begin{block}{\small Rauber and Rünger's energy model}
351 %$ E = P_{d} \cdot S_1^{-2} \cdot
352 %\left( T_1 + \sum_{i=2}^{N} \frac{T_i^3}{T_1^2} \right) +
353 % P_{s} \cdot S_1 \cdot T_1 \cdot N$
355 % \textcolor{blue}{$S_1$}: the maximum scaling factor.\\
356 % \textcolor{blue}{$P_{d}$}: the dynamic power.\\
357 % \textcolor{blue}{$P_{s}$}: the static power.\\
358 % \textcolor{blue}{$T_I$}: the execution time of the slower task.\\
359 % \textcolor{blue}{$T_i$}: the execution time of task i.\\
360 % \textcolor{blue}{$N$}: the number of nodes.
370 \begin{frame}{Performance evaluation of MPI programs}
373 \begin{block}{\small Execution time prediction model}
374 \centering{ $ \textcolor{red}{T_{new}} = \textcolor{blue}{T_{Max Comp Old} \cdot S + T_{{Min Comm Old}}}$}
377 \centering{\includegraphics[width=.4\textwidth]{c1/cg_per}
379 \includegraphics[width=.4\textwidth]{c1/lu_pre}}
382 \small The maximum normalized error for CG=0.0073 \textbf{(the smallest)} and LU=0.031 \textbf{(the worst)}.
392 \begin{frame}{Performance and energy reduction trade-off}
393 \begin{femtoBlock}{} \vspace{-15 mm}
396 \subfloat[\small Real relation.]{%
397 \includegraphics[width=.43\textwidth]{c1/file3}\label{fig:r2}}
399 \subfloat[\small Converted relation.]{%
400 \includegraphics[width=.43\textwidth]{c1/file}\label{fig:r1}}%
402 % \caption{The energy and performance relation}
405 Where:~~~ $\textcolor{blue}{Performance} = execution~time^{-1}$
409 \begin{block}{\small Our objective function}
410 \centering{$\textbf{\emph {\textcolor{red}{MaxDist}}} = \max_{j=1,2,\dots ,F}
411 (\overbrace{P_{Norm}(S_j)}^{{\textcolor{blue}{Maximize}}} -
412 \overbrace{E_{Norm}(S_j)}^{{\textcolor{blue}{Minimize}}} )$}
422 %\begin{frame}{Scaling factor selection algorithm}
425 %\includegraphics[width=.56 \textwidth]{c1/algo-homo}
434 \begin{frame}{Scaling factor selection algorithm}
438 \animategraphics[autopause,controls,scale=0.29,buttonsize=0.2cm]{10}{dvfs-homo/a-}{0}{335}
439 %\includegraphics[width=0.6\textwidth]{dvfs-homo/a-159}
446 \begin{frame}{Experimental results }
450 \item The experiments were executed on the simulator SimGrid/SMPI v3.10.\medskip
451 \item The proposed algorithm was applied to the NAS parallel benchmarks.\medskip
452 \item Each node in the cluster has 18 frequency values from \textbf{2.5\,GHz} to \textbf{800\,MHz}.\medskip
453 \item The proposed algorithm was evaluated over the A, B and C classes of the benchmarks using 4, 8 or 9 and 16 nodes respectively. \medskip
454 \item $P_d=20\,\mathrm{W}$, $P_s=4\,\mathrm{W}$.
463 \begin{frame}{Experimental results}
466 \includegraphics[width=.35\textwidth]{c1/ep}
467 \includegraphics[width=.35\textwidth]{c1/cg}
468 \includegraphics[width=.35\textwidth]{c1/bt}}
472 \centering {\includegraphics[width=.55\textwidth]{c1/results.pdf}}
480 \begin{frame}{Results comparison}
481 \begin{block}{\small Rauber and Rünger's optimal scaling factor}
482 $S_{opt} = \sqrt[3]{\frac{2}{N} \cdot \frac{P_{dyn}}{P_{static}} \cdot
483 \left( 1 + \sum_{i=2}^{N} \frac{T_i^3}{T_1^3}\right) } $
488 %\includegraphics[width=.33\textwidth]{c1/c1.pdf}
490 %\includegraphics[width=.33\textwidth]{c1/c2.pdf}}
493 \includegraphics[width=.55\textwidth]{c1/compare-c.pdf}}
501 %\begin{frame}{The proposed new energy model}
504 % \animategraphics[autopause,controls,scale=0.28,buttonsize=0.2cm]{10}{homo-model/a-}{0}{356}
505 %\includegraphics[width=0.6\textwidth]{homo-model/a-356}
513 %\begin{frame}{\large Comparing the new model with Rauber's model }
516 %\includegraphics[width=.45\textwidth]{c1/energy_con}
518 %\includegraphics[width=.5\textwidth]{c1/compare-scales}
524 % \begin{frame}{Summary}
525 % \begin{femtoBlock}{}
528 %\item We have presented a new online scaling factor selection method that \textcolor{blue}{optimizes simultaneously the energy and performance}.\medskip
529 % \item It predicts \textcolor{blue}{ the energy consumption and the performance} of the parallel applications. \medskip
530 %\item Our algorithm \textcolor{blue}{saves more energy} when the communication and the other slacks times are big. \medskip
531 %\item It gives the \textcolor{blue}{best trade-off between energy reduction and
532 % performance}. \medskip
533 %\item Our method \ \textcolor{blue}{outperforms Rauber and Rünger's method} in terms of energy-performance ratio.
534 %\item The proposed new energy model is \textcolor{blue}{more accurate} than Rauber's energy model.
546 \begin{frame}{The second contribution}
548 \section{\small {Energy optimization of a heterogeneous platform}}
552 \bf \Large \textcolor{blue}{Energy optimization of a parallel application with iterations running over a heterogeneous platform}
562 \begin{frame}{Objectives}
563 \begin{femtoBlock}{} \vspace{-12 mm}
564 \begin{itemize} \small
565 \item Proposing \textcolor{blue}{new energy and performance models} for message passing applications with iterations running
566 over a heterogeneous platform (cluster or Grid). \medskip
567 \item Studying the effect of the scaling factor $S$ on both the \textcolor{blue}{energy consumption and the performance} of
568 message passing iterative applications. \medskip
570 \item Computing the vector of scaling factors ($S_1, S_2, \dots, S_n$) producing \textcolor{blue} {the optimal trade-off} between
571 the energy consumption and the performance.
582 \begin{frame}{The execution time model}
586 \includegraphics[scale=0.5]{c2/commtasks}
592 \begin{block}{\small The execution time prediction model}
595 \small\textcolor{red}{ T_{new}} = \textcolor{blue}{\max_{i=1,2,\dots,N} ({TcpOld_i} \cdot S_{i}) + \min_{i=1,2,\dots,N} (Tcm_i)}
598 \small Where: $ \textcolor{red}{Tcm} = \textcolor{blue}{communication~times + slack~times}$
605 %\begin{frame}{The energy consumption model}
606 % The overall energy consumption of a message passing synchronous application executed over
607 % a heterogeneous platform can be computed as follows:
610 % \textcolor{red}{E} = \textcolor{blue}{\sum_{i=1}^{N} {(S_i^{-2} \cdot Pd_i \cdot Tcp_i)}} + {} \\
611 % \textcolor{blue}{\sum_{i=1}^{N} (Ps_i \cdot (\max_{i=1,2,\dots,N} (Tcp_i \cdot S_{i}) + {\min_{i=1,2,\dots,N} (Tcm_i))}}
614 % \underline{where}:\\
615 % \textcolor{blue}{N} : is the number of nodes.
622 \begin{frame}{The energy model for heterogeneous cluster}
625 \animategraphics[autopause,controls,scale=0.3,buttonsize=0.2cm]{10}{heter-model/a-}{0}{350}
626 %\includegraphics[width=0.6\textwidth]{heter-model/a-272}
636 %\begin{frame}{The trade-off between energy and performance}
639 % \centering{ \includegraphics[width=.4\textwidth]{c2/heter}}
642 % \textcolor{red}{\underline{Step1}}: computing the normalized energy \textcolor{blue}%{$E_{norm} = \frac{E_{reduced}}
644 % \textcolor{red}{\underline{Step2}}: computing the normalized performance \textcolor{blue}{$P_{norm} = \frac{T_{Max}}{T_{new}}$}.
646 % \begin{block}{\small The tradeoff model}
649 % \textcolor{red}{MaxDist} =
650 % \mathop {\max_{i=1,\dots F}}_{j=1,\dots,N}
651 % (\overbrace{P_{norm}(S_{ij})}^{\text{\textcolor{blue}{Maximize}}} -
652 % \overbrace{E_{norm}(S_{ij})}^{\text{\textcolor{blue}{Minimize}}} )
661 %\begin{frame}{The scaling algorithm for heter. cluster}
664 %\includegraphics[width=.52\textwidth]{algo-heter}
671 \begin{frame}{The scaling algorithm for heter. cluster}
676 \animategraphics[autopause,controls,scale=0.3,buttonsize=0.2cm]{10}{dvfs-heter/a-}{0}{836}
677 % \includegraphics[width=0.6\textwidth]{dvfs-heter/a-650}
687 %\begin{frame}{Experiments over a heterogeneous cluster }
690 % \item The experiments were executed on the simulator SimGrid/SMPI v3.10.\medskip
691 % \item The scaling algorithm was applied to the NAS parallel benchmarks class C.\medskip
692 % \item Four types of processors with different computing powers were used.\medskip
693 % \item The benchmarks were executed with different number of nodes ranging from 4 to 144 nodes.\medskip
694 % \item It was assumed that the total power consumption of the CPU consist of 80\% dynamic power and 20\% static power.
705 %\begin{frame}{The simulation results}
709 %\includegraphics[width=0.8\textwidth]{c2/energy_saving.pdf}
711 % \textcolor{blue}{On average, it reduces the energy consumption by \textcolor{red}{29\%}
712 %for the class C of the NAS Benchmarks executed over 8 nodes}
722 %\begin{frame}{The simulation results}
727 % \includegraphics[width=.8\textwidth]{c2/perf_degra.pdf}
729 % \textcolor{blue}{On average, it degrades by \textcolor{red}{3.8\%} the performance
730 % of NAS Benchmarks class C executed over 8 nodes}
744 %\begin{frame}{Energy optimization of grid platform}
747 % \includegraphics[width=.6\textwidth]{c2/grid5000.pdf}
749 % \small 10 sites distributed over France and Luxembourg
757 %\begin{frame}{The grid architecture}
759 %\includegraphics[width=.8\textwidth]{c2/init_freq.pdf}
762 %\begin{frame}{Performance, Energy and trade-off models} \small
763 %\begin{block}{\small The performance model of grid}
766 %\Tnew = \mathop{\max_{i=1,\dots N}}_{j=1,\dots,M_i}({\TcpOld[ij]} \cdot S_{ij})
767 % +\mathop{\min_{j=1,\dots,M_h}} (\Tcm[hj])
772 %\begin{block}{\small The energy model of grid}\small
775 %E = \sum_{i=1}^{N} \sum_{i=1}^{M_i} {(S_{ij}^{-2} \cdot \Pd[ij] \cdot \Tcp[ij])} +
776 % \sum_{i=1}^{N} \sum_{j=1}^{M_i} (\Ps[ij] \cdot \Tnew)
780 %\begin{block}{\small The trade-off model of grid}
785 %\mathop{ \mathop{\max_{i=1,\dots N}}_{j=1,\dots,M_i}}_{k=1,\dots,F_j}
786 % (\overbrace{\Pnorm(S_{ijk})}^{\text{Maximize}} -
787 % \overbrace{\Enorm(S_{ijk})}^{\text{Minimize}} )
799 \begin{frame}{Experiments over Grid'5000}
801 \textcolor{blue}{The experiments were conducted using three
802 clusters distributed over one or two sites.}
805 \includegraphics[width=.5\textwidth]{c2/grid5000-2.pdf}
808 \textcolor{blue}{Grid'5000 power measurement tools were used.}
811 \includegraphics[width=.5\textwidth]{c2/power_consumption.pdf}
823 \begin{frame}{Experiments over Grid'5000}
825 \begin{minipage}{0.4\textwidth}
826 %\textcolor{blue}{Execution the NAS class D on 16 nodes saves the energy by
827 %\textcolor{red}{30\%}}
828 \small \textcolor{blue}{The average energy saving = \textcolor{red}{30\%}}
830 \begin{minipage}{0.55\textwidth}
832 \includegraphics[width=0.83 \textwidth]{c2/eng_s.eps}
836 \begin{minipage}{0.4\textwidth}
837 %\textcolor{blue}{Execution the NAS class D on 16 nodes degrades the
838 %performance by \textcolor{red}{3.2\%}}
839 \small \textcolor{blue}{The average performance degradation = \textcolor{red}{3.2\%}}
841 \begin{minipage}{0.55\textwidth}
843 \includegraphics[width=.83\textwidth]{c2/per_d.eps}
853 \begin{frame}{The results of the three power scenarios}
857 \includegraphics[width=.45\textwidth]{c2/eng_pow.eps}
859 \includegraphics[width=.45\textwidth]{c2/per_pow.eps}
861 \includegraphics[width=.7\textwidth]{c2/three_scenarios.pdf}
874 \begin{frame}{One core and Multi-cores per node results}
875 %\textcolor{blue}{One core and Multi-cores per node results:}
878 \includegraphics[width=.48\textwidth]{c2/eng_s_mc.eps}
880 \includegraphics[width=.48\textwidth]{c2/per_d_mc.eps}
883 \centering \small \textcolor{blue}{Using the multi-core per node scenario decreases the computation to communication ratio}.
890 \begin{frame}{Comparing the objective function to EDP}
892 EDP is the product of the energy consumption and the delay.
896 \includegraphics[width=.6\textwidth]{c2/edp_dist.eps}
901 %\begin{frame}{Summary}
904 % \item Two scaling algorithms were applied to \textcolor{blue}{heterogeneous clusters} and \textcolor{blue}{grids}.
905 % \item New \textcolor{blue}{energy} and \textcolor{blue}{performance} models were proposed.
906 % \item The experiments were conducted over the \textcolor{blue}{SimGrid} simulator and the real
907 %test-bed \textcolor{blue}{Grid'5000}.
909 %\item The algorithm saves the energy by \textcolor{blue}{29\%} and only
910 % degrades the performance by \textcolor{blue}{3.8\%} for simulated heterogeneous
913 %\item The algorithm saves the energy by \textcolor{blue}{30\%} and only
914 % degrades the performance by \textcolor{blue}{3.2\%} for Grid'5000 results.
916 % \item The proposed method \textcolor{blue}{outperforms the EDP method} in terms of energy-performance ratio.
924 \begin{frame}{The third contribution}
925 \section{\small {Energy optimization of asynchronous applications}}
927 \bf \Large \textcolor{blue}{Energy optimization of asynchronous iterative message passing applications}
936 \begin{frame}{Problem definition}\vspace{0.8 mm}
937 \textcolor{blue}{The execution of a synchronous parallel iterative application over a grid }
940 \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{10}{syn/a-}{0}{647}
941 %\includegraphics[width=0.6\textwidth]{syn/a-503}
950 \begin{frame}{Problem definition}\vspace{0.8 mm}
951 \textcolor{blue}{The execution of an asynchronous parallel iterative application over a grid }
954 \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{10}{asyn/a-}{0}{556}
955 %\includegraphics[width=0.6\textwidth]{asyn/a-440}
964 \begin{frame}{Solution}\vspace{0.8mm}
965 \textcolor{blue}{Using asynchronous communications with DVFS }
968 \animategraphics[autopause,controls,scale=0.26,buttonsize=0.2cm]{10}{asyn+dvfs/a-}{0}{344}
969 %\includegraphics[width=0.6\textwidth]{asyn+dvfs/a-314}
979 %\begin{frame}{The performance models}
981 %\begin{block}{\small The performance model of Asynch. Applications}\small
983 %\label{eq:asyn_time}
984 %\Tnew = \frac{\sum_{i=1}^{N} \sum_{j=1}^{M_i}({\TcpOld[ij]} \cdot S_{ij})} {N \cdot M_i }
989 %\begin{block}{\small The performance model of Hybrid Applications}\small
991 %\label{eq:asyn_perf}
992 %\Tnew = \frac{\sum_{i=1}^{N} (\max_{j=1,\dots, M_i} ({\TcpOld[ij]} \cdot S_{ij}) +
993 %\min_{j=1,\dots,M_i} ({\Ltcm[ij]}))}{N}
1002 %%%%%%%%%%%%%%%%%%%%
1004 %%%%%%%%%%%%%%%%%%%%
1005 %\begin{frame}{The energy consumption models}
1007 %\begin{block}{\small The energy model of Asynch. Applications}\small
1009 %\label{eq:asyn_energy1}
1010 % E = \sum_{i=1}^{N} \sum_{j=1}^{M_i} {(S_{ij}^{-2} \cdot \Tcp[ij] \cdot (\Pd[ij]+\Ps[ij]) )}
1015 %\begin{block}{\small The energy model of Hybrid Applications}\small
1017 %\label{eq:asyn_energy}
1018 %E = \sum_{i=1}^{N} \sum_{j=1}^{M_i} {(S_{ij}^{-2} \cdot \Pd[ij] \cdot \Tcp[ij])} + \sum_{i=1}^{N} \sum_{j=1}^{M_i} (\Ps[ij] \cdot \\
1019 % ( \mathop{\max_{j=1,\dots,M_i}} ({\Tcp[ij]} \cdot S_{ij}) + \mathop{\min_{j=1,\dots,M_i}} ({\Ltcm[ij]})))
1026 %%%%%%%%%%%%%%%%%%%%
1028 %%%%%%%%%%%%%%%%%%%%
1029 \begin{frame}{The performance and the energy models }
1032 \includegraphics[width=0.9\textwidth]{syn-vs-asyn.pdf}
1039 %%%%%%%%%%%%%%%%%%%%
1041 %%%%%%%%%%%%%%%%%%%%
1042 \begin{frame}{The scaling algorithm for Asynch. applications}
1045 \includegraphics[width=0.55\textwidth]{algo-hybrid.pdf}
1050 %%%%%%%%%%%%%%%%%%%%
1052 %%%%%%%%%%%%%%%%%%%%
1053 \begin{frame}{The experiments}
1058 \item The architecture of the grid:
1060 \includegraphics[width=0.5\textwidth]{c3/hybrid-model.pdf}
1064 \item Applying the proposed algorithm to the asynchronous iterative message passing multi-splitting method.
1065 \item Evaluating the application over the simulator and Grid'5000.
1071 %%%%%%%%%%%%%%%%%%%%
1073 %%%%%%%%%%%%%%%%%%%%
1074 %\begin{frame}{The simulation results}
1075 %\centering \small \textcolor{blue}{The best scenario in terms of energy and performance is %the Async. MS with Sync. DVFS}
1078 % \includegraphics[scale=0.42]{c3/energy_saving.eps}
1080 %\centering The average energy saving = \textcolor{red}{22\%}
1085 %%%%%%%%%%%%%%%%%%%%
1087 %%%%%%%%%%%%%%%%%%%%
1088 %\begin{frame}{The simulation results}
1091 % \includegraphics[scale=0.42]{c3/perf_degra.eps}
1093 %\centering The average speed-up = \textcolor{red}{5.72\%}
1098 %%%%%%%%%%%%%%%%%%%%
1100 %%%%%%%%%%%%%%%%%%%%
1101 \begin{frame}{The Grid'5000 results}
1106 \includegraphics[width=0.53\textwidth]{c3/energy-s-compare.eps}
1107 \includegraphics[width=0.53\textwidth]{c3/perf-deg-compare.eps}
1110 \centering \footnotesize
1112 %\small \textcolor{blue}{The best scenario in terms of energy and performance is the Async. MS with Sync. DVFS}
1114 The average energy saving = \textcolor{red}{26.93\%}, the average speed-up = \textcolor{red}{21.48\%}
1118 %%%%%%%%%%%%%%%%%%%%
1120 %%%%%%%%%%%%%%%%%%%%
1121 \begin{frame}{The comparison results}
1123 \includegraphics[width=.5\textwidth]{c3/compare.eps}
1125 \includegraphics[width=.5\textwidth]{c3/compare_scales.eps}
1131 %%%%%%%%%%%%%%%%%%%%
1133 %%%%%%%%%%%%%%%%%%%%
1134 \begin{frame}{Conclusions}
1135 \section{Conclusions and Perspectives}
1138 \small \barrow Three \textcolor{blue}{ new energy consumption and performance} models were proposed for synchronous or asynchronous parallel applications with iterations running over
1139 \textcolor{blue}{homogeneous and heterogeneous clusters or grids}.
1143 \small \barrow \textcolor{blue}{A new objective function} to optimize both the energy consumption and the performance was proposed.
1145 \small \barrow \textcolor{blue}{New online frequency selection algorithms} for clusters and grids were developed.
1147 \small \barrow The proposed algorithms were applied to the \textcolor{blue}{NAS parallel benchmarks} and \textcolor{blue}{the
1148 Multi-splitting} method.
1150 \small \barrow The proposed algorithms were evaluated over the \textcolor{blue}{SimGrid simulator} and over the \textcolor{blue}{Grid'5000 testbed}.
1152 \small \barrow All the proposed methods were compared to either \textcolor{blue}{Rauber and Rünger's method} or to the \textcolor{blue}{EDP objective function}.
1160 %%%%%%%%%%%%%%%%%%%%
1162 %%%%%%%%%%%%%%%%%%%%
1163 \begin{frame}{Publications}
1165 \begin{block}{\small Journal Articles }\scriptsize
1166 \begin{enumerate}[$\lbrack$1$\rbrack$]
1168 \item Ahmed Fanfakh, Jean-Claude Charr, Raphaël Couturier, Arnaud Giersch. Optimizing the energy consumption of message passing applications with iterations executed over grids. \textit{Journal of Computational
1171 \item Ahmed Fanfakh, Jean-Claude Charr, Raphaël Couturier, Arnaud Giersch. Energy Consumption Reduction for
1172 Asynchronous Message Passing Applications. \textit{Journal of Supercomputing}, 2016, (Submitted)
1178 \begin{block}{\small Conference Articles }\scriptsize
1180 \begin{enumerate}[$\lbrack$1$\rbrack$]
1182 \item Jean-Claude Charr, Raphaël Couturier, Ahmed Fanfakh, Arnaud Giersch. Dynamic Frequency Scaling for
1183 Energy Consumption Reduction in Distributed MPI Programs. \textit{ISPA 2014}, pp.
1184 225--230. IEEE Computer Society, Milan, Italy (2014).
1186 \item Jean-Claude Charr, Raphaël Couturier, Ahmed Fanfakh, Arnaud Giersch. Energy Consumption Reduction
1187 with DVFS for Message Passing Iterative Applications on Heterogeneous Architectures.
1188 \textit{The 16\textsuperscript{th} PDSEC}, pp.~922--931. IEEE Computer Society, India (2015).
1190 \item Ahmed Fanfakh, Jean-Claude Charr, Raphaël Couturier, Arnaud Giersch. CPUs Energy Consumption
1191 Reduction for Asynchronous Parallel Methods Running over Grids. \textit{The 19\textsuperscript{th} CSE conference}. IEEE Computer Society,
1200 %%%%%%%%%%%%%%%%%%%%
1202 %%%%%%%%%%%%%%%%%%%%
1203 \begin{frame}{Perspectives}
1207 \small \barrow The proposed algorithms should take into consideration the
1208 \textcolor{blue}{variability between some iterations}.
1210 \small \barrow The proposed algorithms should be applied to \textcolor{blue}{other message passing methods with iterations} in order to see how they adapt to the characteristics of these methods.
1212 \small \barrow The proposed algorithms for heterogeneous platforms should be applied to heterogeneous platforms composed of \textcolor{blue}{CPUs and GPUs}.
1214 \small \barrow The results returned by the energy models should be compared to the values given by \textcolor{blue}{real instruments that measure the energy consumption} of CPUs during the execution.
1219 %%%%%%%%%%%%%%%%%%%%
1221 %%%%%%%%%%%%%%%%%%%%
1222 \begin{frame}{Fin} \vspace{-10 mm}
1224 \centering \Large \textcolor{blue}{Thank you for your attention}
1227 \centering \textcolor{blue}{ {\Large Questions?}}