% /solutions/conference-talks/conference-ornate-20min.fr.tex, 22/02/2006 De Sousa \documentclass{beamer} %\documentclass[red]{beamer} \usepackage{pgf,pgfarrows,pgfnodes,pgfautomata,pgfheaps,pgfshade} \usepackage{amsfonts,amsmath,amssymb,stmaryrd} \usepackage[latin1]{inputenc} \usepackage[T1]{fontenc} %\usepackage[frenchb]{babel} \usepackage{aeguill} \usepackage{graphicx} \usepackage{amsmath,amssymb} \usepackage{multirow} \usepackage{multicol} \usepackage{colortbl} \usepackage{graphicx} %\usepackage{figlatex} \usepackage{multirow} \usepackage{times} \usepackage{marvosym} \usepackage{pifont} %\usepackage[vlined,longend,algoruled,titlenumbered,french]{algorithm2e} \usepackage[vlined,french,algoruled]{algorithm2e} \usepackage[gen]{eurosym} % _ _ _ _ % ___| |_ _ _| | ___ ___| |_ ___ ___ _ _| | ___ _ _ _ __ ___ % / __| __| | | | |/ _ \ / _ \ __| / __/ _ \| | | | |/ _ \ | | | '__/ __| % \__ \ |_| |_| | | __/ | __/ |_ | (_| (_) | |_| | | __/ |_| | | \__ \ % |___/\__|\__, |_|\___| \___|\__| \___\___/ \__,_|_|\___|\__,_|_| |___/ % |___/ % \setbeamercovered{transparent} \usetheme{Warsaw} % enlever [red] dans le \documentclass si utilisation de la suite : \usecolortheme[RGB={202,31,39}]{structure} % rouge %\setbeamertemplate{background}{\includegraphics[height=96mm,width=128mm]{fond.png}} % definition de la couleur du fond des blocks \definecolor{ArrierePlanBodyW}{rgb}{0.5,0.5,0.5} % si ecriture blanche \definecolor{ArrierePlanBodyB}{rgb}{0.85,0.85,0.85} % si ecriture noire % definition de la couleur des slides \definecolor{ArrierePlan}{rgb}{0.9,0.9,0.9} % selection de la couleur du fond des slides \setbeamercolor{background canvas}{bg=ArrierePlan} % selection de la couleur du fond des block avec ecriture noire %\setbeamercolor{block body}{use=structure,fg=black,bg=ArrierePlanBodyB} % selection de la couleur du fond des block avec ecriture blanche \setbeamercolor{block body}{use=structure,fg=white,bg=ArrierePlanBodyW} \DeclareGraphicsExtensions{pdf,eps,jpg,png} \graphicspath{{.}{./fig/}{img/}} \defbeamertemplate*{footline}{section in head/foot} { \leavevmode% \hbox{\begin{beamercolorbox}[wd=.5\paperwidth,ht=2.5ex,dp=1.125ex,leftskip=.3cm plus1fill,rightskip=.3cm]{author in head/foot}% \usebeamerfont{author in head/foot}\insertshortauthor \end{beamercolorbox}% \begin{beamercolorbox}[wd=.5\paperwidth,ht=2.5ex,dp=1.125ex,leftskip=.3cm,rightskip=.3cm plus1fil]{title in head/foot}% \usebeamerfont{title in head/foot}\insertshorttitle \hfill \insertframenumber{} / \inserttotalframenumber\hspace*{2ex}% \end{beamercolorbox}}% \vskip0pt% } \setbeamercovered{dynamic} %\logo{\includegraphics[height=1cm]{./img/logo_petit-tr.png}} % \defbeamertemplate*{headline}{split theme} % {% % \begin{beamercolorbox}{section in head/foot} % \insertsectionnavigationhorizontal{2cm}{\includegraphics[height=2cm]{./img/logo_petit-tr.png}}{} % \end{beamercolorbox}% % } \defbeamertemplate*{headline}{page title}{ \leavevmode% \begin{beamercolorbox}[wd=.08\paperwidth,ht=1.45cm]{section in head/foot}% \vbox to 1.45cm{\vfill\hspace*{-0.1mm}\includegraphics[height=1.45cm]{./img/logo_fresnel.pdf}\vfill}% \end{beamercolorbox}% \begin{beamercolorbox}[wd=.655\paperwidth,ht=1.45cm]{section in head/foot}% \vbox to 1.45cm{\vfill\hspace*{-0.1mm}\includegraphics[height=1.45cm]{./img/UPC.jpg}\vfill}% \end{beamercolorbox}% \begin{beamercolorbox}[wd=.1\paperwidth,ht=1.45cm]{section in head/foot}% \vbox to 1.45cm{\vfill\hspace*{1mm}\includegraphics[height=1.25cm]{./img/lifc.pdf}\vfill}% \end{beamercolorbox}% \begin{beamercolorbox}[wd=.17\paperwidth,ht=1.45cm]{section in head/foot}% \vbox to 1.45cm{\vfill\hspace*{-0.1mm}\includegraphics[height=1.45cm]{./img/ufc2.pdf}\vfill}% \end{beamercolorbox}% } \defbeamertemplate*{headline}{section in head/foot} { \leavevmode% \begin{beamercolorbox}[wd=.1\paperwidth,ht=1.45cm]{section in head/foot}% \vbox to 1.45cm{\vfill\hspace*{-0.1mm}\includegraphics[height=1.25cm]{./img/lifc.pdf}\vfill}% \end{beamercolorbox}% \begin{beamercolorbox}[wd=.39\paperwidth,ht=1.45cm]{section in head/foot}% \vbox to 1.45cm{\vfil\insertsectionnavigation{.4\paperwidth}\vfil}% \end{beamercolorbox}% \begin{beamercolorbox}[wd=.43\paperwidth,ht=1.45cm]{subsection in head/foot}% \vbox to 1.45cm{\vfil\insertsubsectionnavigation{.4\paperwidth}\vfil}% \end{beamercolorbox}% %\begin{beamercolorbox}[wd=.17\paperwidth,ht=1.45cm]{section in head/foot}% % \vbox to 1.45cm{\vfill\hspace*{-0.1mm}\includegraphics[height=1.45cm]{./img/ufc2.pdf}\vfill}% %\end{beamercolorbox}% %\begin{beamercolorbox}[wd=.235\paperwidth,ht=1.45cm]{section in head/foot}% % \vbox to 1.45cm{\vfill\hspace*{-0.1mm}\includegraphics[height=1.45cm]{./img/UPC.jpg}\vfill}% %\end{beamercolorbox}% \begin{beamercolorbox}[wd=.1\paperwidth,ht=1.45cm]{section in head/foot}% \vbox to 1.45cm{\vfill\includegraphics[height=1.45cm]{./img/logo_fresnel.pdf}\vfill}% \end{beamercolorbox}% } % CUSTOM JMN %%%%%%%%%%%%% % EXERCICES % %%%%%%%%%%%%% \newcounter{exo} \setcounter{exo}{0} \newcounter{labelExo} \newcommand{\Exos}[1]{\stepcounter{exo}{\vspace{0.5cm} {\textbf{Exercice \arabic{exo}~:\ #1}}}} \newcommand{\Solution}{\vspace{0.5cm} {\textbf{\emph{Solution de l'exercice \arabic{exo}}}}} %%%%%%%%%%%%%%% % DEFINITIONS % %%%%%%%%%%%%%%% \newcounter{def} \setcounter{def}{0} \newcounter{labelDef} \renewcommand{\Definition}[1]{\stepcounter{def}{%\vspace{0.5cm} {\textbf{Définition \arabic{def}~:\ #1 }}}} \def\N{{ I\!\!N}} \def\R{{ I\!\!R}} \def\Z{{ Z\!\!\! Z}} \def\Q{I\kern-0.60em Q} \def\C{I\kern-0.60em C} \def\F{I\kern-0.20em F} \def\K{I\kern-0.20em K} % fleche d'affectation pour les algo \def\plv{$ \leftarrow$\hspace{1ex}} \titlegraphic{% \centerline{% %% \parbox{.165\textwidth}{% %% \includegraphics[width=0.165\textwidth]{logo-2lignes-moyen-tr.png}} %%\hfill %%\parbox{.22\textwidth}{% %% \includegraphics[width=0.11\textwidth]{logo_tutelle4.jpg}} %\hfill %\parbox{.22\textwidth}{ % \includegraphics[width=0.11\textwidth]{logo_petit-tr.png}} \hfill \parbox{.5\textwidth}{% \includegraphics[width=0.5\textwidth]{CITLogo2.jpg}} \hfill } } \title[Snake GPU] % (facultatif, à utiliser uniquement si le titre de l'article est trop long) {\scshape GPU-accelerated snake} \subtitle {GPU implementation of a region-based segmentation algorithm (snake) for large images} \author[G. Perrot] % (facultatif, à utiliser seulement avec plusieurs auteurs) {Gilles Perrot$¹$, Stéphane Domas$¹$, Raphaël Couturier$¹$, Nicolas Bertaux$²$.} % - Composez les noms dans l'ordre dans lequel ils apparaîtrons dans l'article % - Utilisez la commande \inst{?} uniquement si les auteurs ont des affiliations % différentes. \institute[] % (facultatif mais généralement nécessaire) { $¹$University of Franche-Comté - Distributed Numerical Algorithmics group (AND),\\ $²$Fresnel Institute - Physics and Image Computing group (PhyTI) } % - Utilisez la commande \inst uniquement s'il y a plusieurs affectations % - Faîtes quelque chose de simple, personne ne s'intéresse à votre adresse. \date[] % (facultatif) {\small 31 august - 1 september} %\subject{} % Inséré uniquement dans la page d'information du fichier PDF. Peut être % supprimé. % Si vous avez un fichier nommé "université-logo-nomfichier.xxx", où xxx % est un format graphique accepté par latex ou pdflatex (comme par exemple .png), % alors vous pouvez insérer votre logo ainsi : %\pgfdeclareimage[height=1cm]{logo-lifc}{img/logo-lifc.jpg} %\logo{\pgfuseimage{logo-lifc}} % À supprimer si vous ne voulez pas que la table des matières apparaisse % au début de chaque sous-section : % % \AtBeginSubsection[] { % \begin{frame}{Lignes directrices} % \tableofcontents[currentsection,currentsubsection] % \end{frame} % } % Si vous souhaitez recouvrir vos transparents un à un, % utilisez la commande suivante (pour plus d'info, voir la page 74 du manuel % d'utilisation de Beamer (version 3.06) par Till Tantau) : %\beamerdefaultoverlayspecification{<+->} \colorlet{textvert}{green} \begin{document} \setbeamertemplate{headline}[page title] \setbeamertemplate{footline}[default] \begin{frame}[default] \titlepage \end{frame} \begin{frame}{Table of contents} \tableofcontents % Vous pouvez, si vous le souhaitez ajouter l'option [pausesections] \end{frame} \setbeamertemplate{headline}[section in head/foot] \setbeamertemplate{footline}[section in head/foot] \section{Context} \begin{frame}{Image segmentation} \begin{block}{Definition, goal} \begin{itemize} \item Dividing an image in two homogeneous regions. \item Reducing the amount of data needed to code information. \item Helping the human perception in certain cases. \end{itemize} \end{block} \begin{block}{Image characteristics} \begin{itemize} \item 16 bit-coded gray levels, \item From 10~Mpixels to more than 100~Mpixels, \item Very noisy. \end{itemize} \end{block} \end{frame} % \begin{frame}{Images of our interest} % \begin{block}{Origins} % \begin{itemize} % \item Synthetic Aperture RADAR (S.A.R.), % \item Ultrasonic (medical imaging), % \item Photographic (IR, nightshots). % \end{itemize} % \end{block} % \begin{block}{Characteristics} % \begin{itemize} % \item 16 bit-coded gray levels, % \item From 10~Mpixels to more than 100~Mpixels, % \item Very noisy. % \end{itemize} % \end{block} % \end{frame} \section{Snake algorithm} % \begin{frame}{Active contour hypothesis} % \begin{itemize} % \item A background region $B$ and a target region $T$, containing $n_T$ and $n_B$ pixels, compose a $L x H$ pixel image. % \item Probability density functions (PDF) $p^B$ and $p^T$ are of the same family (Gaussian in this example). % % inutile ds les hypotheses, on ne dit pas qu'on ls connait % %\item Vectors of parameters $\Theta^B$ and $\Theta^T$ of the PDFs $p^B$ and $p^T$ are unknown. % \item The contour $\varGamma$ which surrounds $T$ is chosen polygonal. % \item [\ding{253}] Implementation of a region-based algorithm. % \end{itemize} % \end{frame} \begin{frame}{Algorithm basics : criterion} \begin{columns} \begin{column}[l]{5cm} \resizebox{5cm}{5cm}{\input{img/fig_target.pdf_t}} \end{column} \begin{column}[l]{7.5cm} \begin{itemize} \item The goal is to find the most likely contour $\varGamma$ (number and positions of nodes). \item The criterion used is a \emph{Generalized Likelihood}.\\In the Gaussian case, it is given by $$GL = \dfrac{1}{2}\left[ n_B.log\left({\widehat{\sigma_B}}^2\right) + n_T.log\left({\widehat{\sigma_T}}^2\right)\right]$$ where $\widehat{\sigma_\Omega}$ is the estimation of the deviation $\sigma$ for the region $\Omega$. \end{itemize} \end{column} \end{columns} \end{frame} \begin{frame}{Algorithm basics : criterion} \begin{columns} \begin{column}[l]{5cm} \resizebox{5cm}{5cm}{\input{img/fig_target.pdf_t}} \end{column} \begin{column}[l]{7.5cm} \begin{itemize} \item Parameters estimation is done by 1-D sums on along the contour. \item Every pixel coordinates are needed. \end{itemize} \end{column} \end{columns} \end{frame} % \begin{frame}{Algorithm basics : parameters estimation} % \begin{itemize} % \item In the Gaussian case, $p_{\Omega}$ has two parameters, average and deviation, which are estimated by maximum likelihood.\\ % If $z$ is the gray level of the pixel of coordinates $(i,j)$: % $$ % \left( % \begin{array}{l} % \widehat{\mu_{\Omega}} = \frac{1}{n_{\Omega}} \displaystyle\sum_{(i,j)\in \Omega} z(i,j) \\ % \widehat{\sigma^2_{\Omega}} = \frac{1}{n_{\Omega}} \displaystyle\sum_{(i,j)\in \Omega} \left(z(i,j) - \widehat{\mu_{\Omega}}\right)^2 \\ % \end{array} % \right. % $$ % \item These estimations have to be computed for each test state of the contour $\varGamma$: time-consuming. % \end{itemize} % \end{frame} % \begin{frame}{Algorithm basics : parameters estimation} % \begin{itemize} % %peut-être illustrer ici % \item Based on the Green-Ostogradsky theorem, Chesnaud has shown how to replace those 2-dimensions sums inside the contour % by 1-dimension sums along the contour. % \item This optimization implies: % \begin{itemize} % \item the precomputation of a few matrices (called cumulated images) containing the potential \emph{contributions} of each pixel of the image, % \item the use of constant lookup tables of weighting coefficients to determine the \emph{contributions} of each segment of pixels. % \end{itemize} % \end{itemize} % \end{frame} % \begin{frame}{CPU sequential implementation : outlines} % \begin{enumerate} % \item Precomputation of the cumulated images. % \item Choice of the initial contour $\varGamma_0$: rectangle near the borders. % \item Changes made to the contour in order to find the most likely one: % \begin{enumerate}[i] % \item Move every node as long as it gives a better criterion. % \item Add nodes in the middle of each big enough segment. % \end{enumerate} % \end{enumerate} % \end{frame} \begin{frame}{Snake algorithm in action} \begin{columns} \begin{column}[T]{7cm} \begin{overprint} \onslide<1> \includegraphics[width=7cm]{./img/sequences/cochon_it0.png} \onslide<2> \includegraphics[width=7cm]{./img/sequences/cochon_it1.png} \onslide<3> \includegraphics[width=7cm]{./img/sequences/cochon_it21.png} \onslide<4> \includegraphics[width=7cm]{./img/sequences/cochon_it22.png} \onslide<5> \includegraphics[width=7cm]{./img/sequences/cochon_it4.png} \end{overprint} \end{column} \begin{column}[T]{5cm} \only<1> { \begin{itemize} \item 150~Mpixels image. \item Initial contour: 4 nodes. \end{itemize} } \only<2> { \begin{itemize} \item End of first iteration: no more move can be of interest. \end{itemize} } \only<3> { \begin{itemize} \item Nodes added in the middle of segments. \end{itemize} } \only<4> { \begin{itemize} \item End of second iteration. \end{itemize} } \only<5> { \begin{itemize} \item End of fifth iteration\\(36 nodes). \item Fast segmentation. \item Efficient with noise. \end{itemize} } \end{column} \end{columns} \end{frame} \section{GPU Implementation} \begin{frame}{GPU design : key points} \begin{itemize} \item The parallelism of a modern GPU lays on a SIMT paradigm (Single Instruction Multiple Threads): the same instruction is processed by a great number of threads at a time (up to $2^{16}$). \item Threads are compounded in independants blocks with no possible synchronization between blocks. \item Threads in a block share a small amount of shared memory (16-48~KBytes). \item There are restrictive conditions to be fullfilled in order to make efficent accesses to global and shared memory. \item Data transfers between CPU and GPU are slow. \end{itemize} \end{frame} % \begin{frame}{GPU implementation: precomputations} % \begin{itemize} % \item One of the cumulated images is not to be computed anymore: values are evaluated on the fly. % \item An inclusive parallel prefixsum is performed on each row of the image for each matrix to be processed ($z$,$z^2$). % \item[\ding{253}] Speedup is around x7 for images larger than 100~MPixels. Comparison is done with the SSE/CPU implementation of the PhyTI group. % \item[\ding{253}] Higher speedups (x15) are obtained with specific versions for constant image sizes. % \end{itemize} % \end{frame} \begin{frame}{GPU implementation} When modifying the shape of the contour, we have to compute: \begin{itemize} \item Parameters of the corresponding regions. \item The value of the criterion. \item The parallelization needs reside essentially in the parameters estimation.\\Two possible parallelism levels: \begin{itemize} \item One contour per thread. \item One pixel per thread. \item[\ding{253}] The \emph{one pixel per thread} rule is far more efficient, due to memory access constraints. \end{itemize} \end{itemize} \end{frame} \begin{frame}{GPU implementation: parallelization} \begin{columns} \begin{column}[T]{6cm} \resizebox{6cm}{6cm}{\input{./img/topologie.pdf_t}} \end{column} \begin{column}[T]{6.5cm} \begin{itemize} \item Every 16 segments for every node are processed in parallel. \item Fits GPU specific parallelism: each segment pixel is processed by a thread. \item Criterion values are obtained after several reduction stages. \item All nodes are possibly moved in one step. \end{itemize} \end{column} \end{columns} \end{frame} % \begin{frame}{GPU implementation: data structure} % The main idea is to organize, in a single array, every pixels of every segments to be processed.\\ % Thus, for a given state of the contour ($N$ nodes), we: % \begin{enumerate} % \item Find the largest segment to be processed. It gives: % \begin{itemize} % \item the block size $bs$ of the computing grid, % \item the number of blocks needed for each segment ($N_{TB}$). % \end{itemize} % \item Compute in parallel, the coordinates of every pixels of the $16.N$ segments to be considered, % \item Make some parallel reductions to finally obtain parameters estimation. % \end{enumerate} % \end{frame} % \begin{frame}{GPU implementation: data structure} % \begin{center} % \begin{overprint} % \onslide<1> \resizebox{8cm}{1.43cm}{\input{img/contribs_segments_seg.pdf_t}} % \onslide<2> \resizebox{8cm}{3.78cm}{\input{img/contribs_segments_16seg.pdf_t}} % \onslide<3> \resizebox{8cm}{6cm}{\input{img/contribs_segments_even.pdf_t}} % \onslide<4> \resizebox{8cm}{6cm}{\input{img/contribs_segments.pdf_t}} % \end{overprint} % \end{center} % \end{frame} \begin{frame}{GPU implementation: first results} \begin{itemize} \item Global speedup around x7-x8 for image sizes from 15 to 150~Mpixels. \item First iterations have higher speedups: \begin{itemize} \item several large segments, \item few inactive threads in the grid. \end{itemize} % \item Last iterations are sometimes slower than on CPU: % \begin{itemize} % \item a lot of small segments, % \item more inactive threads in the grid. % \end{itemize} \end{itemize} \end{frame} \begin{frame}{GPU implementation: smart init (reasons)} \begin{itemize} \item The target shape is often far from initial contour, \item It causes the very first iteration to be much more time-consuming than the other ones. %\item Horizontal segments contributions are null. %\item Vertical segments contributions computations can be fast, through a specific process. \item[\ding{253}] It's fast on GPU to find a rectangle near the target. But it needs to overrule the 'one thread/one pixel' principle. \end{itemize} \end{frame} \begin{frame}{GPU implementation: smart init (process)} \begin{columns} \begin{column}[l]{7cm} \resizebox{7cm}{6.75cm}{\input{img/smart_init1.pdf_t}} \end{column} \begin{column}[l]{5.5cm} \begin{itemize} \item Realize a periodic sampling of a few hundreds of J-coordinates. \item Evaluate in parallel every possible rectangle of diagonal $(0 , j_L)-(H , j_H)$. \item Select the one with the best GL criterion. \item $j_L$ and $j_H$ are now considered as constants. \end{itemize} \end{column} \end{columns} \end{frame} \begin{frame}{GPU implementation: smart init (process)} \begin{columns} \begin{column}[l]{7cm} \resizebox{7cm}{6.75cm}{\input{img/smart_init2.pdf_t}} \end{column} \begin{column}[l]{5.5cm} \begin{itemize} \item Given $j_L$ and $j_H$. \item Realize a periodic sampling of a few hundreds of I-coordinates. \item Evaluate in parallel every possible rectangle of diagonal $(i_L , j_L)-(i_H , j_H)$. \item Select the one with the best GL criterion. \end{itemize} \end{column} \end{columns} \end{frame} \begin{frame}{GPU implementation: improvement} \begin{itemize} \item Global speedup around x10 for image sizes from 15 to 150~Mpixels and a small enough target (as in the example) \item Less than 0.6 second for the 150~Mpixels image of this example. \end{itemize} \end{frame} \section{Conclusion} \begin{frame}{Conclusion, future works} \begin{itemize} \item Interesting speedups \item Original algorithm is not GPU-friendly \item Future works: \begin{itemize} \item Finding a more suited structure to describe the contour. \item Switching to a statistical model independant from a PDF: the potts model. \item Benefit from recent features of CUDA v4 (overlapping, multiple kernels). \item Extend to a multiple targets algorithm, based on this single target elementary piece of code. \end{itemize} \end{itemize} \end{frame} %\begin{frame}[allowframebreaks]{References} %\bibliographystyle{plain} %\bibliography{biblio} %\nocite{*} %\end{frame} \end{document}