DOCS/paper_snake_gpu/snake_gpu_final.tex

   1
   2 %% bare_conf.tex
   3 %% V1.3
   4 %% 2007/01/11
   5 %% by Michael Shell
   6 %% See:
   7 %% http://www.michaelshell.org/
   8 %% for current contact information.
   9 %%
  10 %% This is a skeleton file demonstrating the use of IEEEtran.cls
  11 %% (requires IEEEtran.cls version 1.7 or later) with an IEEE conference paper.
  12 %%
  13 %% Support sites:
  14 %% http://www.michaelshell.org/tex/ieeetran/
  15 %% http://www.ctan.org/tex-archive/macros/latex/contrib/IEEEtran/
  16 %% and
  17 %% http://www.ieee.org/
  18
  19 %%*************************************************************************
  20 %% Legal Notice:
  21 %% This code is offered as-is without any warranty either expressed or
  22 %% implied; without even the implied warranty of MERCHANTABILITY or
  23 %% FITNESS FOR A PARTICULAR PURPOSE!
  24 %% User assumes all risk.
  25 %% In no event shall IEEE or any contributor to this code be liable for
  26 %% any damages or losses, including, but not limited to, incidental,
  27 %% consequential, or any other damages, resulting from the use or misuse
  28 %% of any information contained here.
  29 %%
  30 %% All comments are the opinions of their respective authors and are not
  31 %% necessarily endorsed by the IEEE.
  32 %%
  33 %% This work is distributed under the LaTeX Project Public License (LPPL)
  34 %% ( http://www.latex-project.org/ ) version 1.3, and may be freely used,
  35 %% distributed and modified. A copy of the LPPL, version 1.3, is included
  36 %% in the base LaTeX documentation of all distributions of LaTeX released
  37 %% 2003/12/01 or later.
  38
  39
  40 %% Retain all contribution notices and credits.
  41 %% ** Modified files should be clearly indicated as such, including  **
  42 %% ** renaming them and changing author support contact information. **
  43 %%
  44 %% File list of work: IEEEtran.cls, IEEEtran_HOWTO.pdf, bare_adv.tex,
  45 %%                    bare_conf.tex, bare_jrnl.tex, bare_jrnl_compsoc.tex
  46 %%*************************************************************************
  47
  48 % *** Authors should verify (and, if needed, correct) their LaTeX system  ***
  49 % *** with the testflow diagnostic prior to trusting their LaTeX platform ***
  50 % *** with production work. IEEE's font choices can trigger bugs that do  ***
  51 % *** not appear when using other class files.                            ***
  52 % The testflow support page is at:
  53 % http://www.michaelshell.org/tex/testflow/
  54
  55
  56
  57 % Note that the a4paper option is mainly intended so that authors in
  58 % countries using A4 can easily print to A4 and see how their papers will
  59 % look in print - the typesetting of the document will not typically be
  60 % affected with changes in paper size (but the bottom and side margins will).
  61 % Use the testflow package mentioned above to verify correct handling of
  62 % both paper sizes by the user's LaTeX system.
  63 %
  64 % Also note that the "draftcls" or "draftclsnofoot", not "draft", option
  65 % should be used if it is desired that the figures are to be displayed in
  66 % draft mode.
  67 %
  68 \documentclass[10pt, conference, compsocconf]{IEEEtran}
  69 % Add the compsocconf option for Computer Society conferences.
  70 %
  71 % If IEEEtran.cls has not been installed into the LaTeX system files,
  72 % manually specify the path to it like:
  73 % \documentclass[conference]{../sty/IEEEtran}
  74
  75
  76 %  \usepackage[latin1]{inputenc}
  77 %  \usepackage[cyr]{aeguill}
  78 %  \usepackage[francais]{babel}
  79
  80
  81 % Some very useful LaTeX packages include:
  82 % (uncomment the ones you want to load)
  83
  84
  85 % *** MISC UTILITY PACKAGES ***
  86 %
  87 %\usepackage{ifpdf}
  88 % Heiko Oberdiek's ifpdf.sty is very useful if you need conditional
  89 % compilation based on whether the output is pdf or dvi.
  90 % usage:
  91 % \ifpdf
  92 %   % pdf code
  93 % \else
  94 %   % dvi code
  95 % \fi
  96 % The latest version of ifpdf.sty can be obtained from:
  97 % http://www.ctan.org/tex-archive/macros/latex/contrib/oberdiek/
  98 % Also, note that IEEEtran.cls V1.7 and later provides a builtin
  99 % \ifCLASSINFOpdf conditional that works the same way.
 100 % When switching from latex to pdflatex and vice-versa, the compiler may
 101 % have to be run twice to clear warning/error messages.
 102
 103
 104
 105
 106
 107
 108 % *** CITATION PACKAGES ***
 109 %
 110 \usepackage{cite}
 111 % cite.sty was written by Donald Arseneau
 112 % V1.6 and later of IEEEtran pre-defines the format of the cite.sty package
 113 % \cite{} output to follow that of IEEE. Loading the cite package will
 114 % result in citation numbers being automatically sorted and properly
 115 % "compressed/ranged". e.g., [1], [9], [2], [7], [5], [6] without using
 116 % cite.sty will become [1], [2], [5]--[7], [9] using cite.sty. cite.sty's
 117 % \cite will automatically add leading space, if needed. Use cite.sty's
 118 % noadjust option (cite.sty V3.8 and later) if you want to turn this off.
 119 % cite.sty is already installed on most LaTeX systems. Be sure and use
 120 % version 4.0 (2003-05-27) and later if using hyperref.sty. cite.sty does
 121 % not currently provide for hyperlinked citations.
 122 % The latest version can be obtained at:
 123 % http://www.ctan.org/tex-archive/macros/latex/contrib/cite/
 124 % The documentation is contained in the cite.sty file itself.
 125
 126
 127
 128
 129
 130
 131 % *** GRAPHICS RELATED PACKAGES ***
 132 %
 133 \ifCLASSINFOpdf
 134   \usepackage[pdftex]{graphicx,color}
 135   % declare the path(s) where your graphic files are
 136   \graphicspath{{img/}}
 137   % and their extensions so you won't have to specify these with
 138   % every instance of \includegraphics
 139   \DeclareGraphicsExtensions{.pdf,.jpeg,.png}
 140 \else
 141   % or other class option (dvipsone, dvipdf, if not using dvips). graphicx
 142   % will default to the driver specified in the system graphics.cfg if no
 143   % driver is specified.
 144   % \usepackage[dvips]{graphicx}
 145   % declare the path(s) where your graphic files are
 146   % \graphicspath{{../eps/}}
 147   % and their extensions so you won't have to specify these with
 148   % every instance of \includegraphics
 149   % \DeclareGraphicsExtensions{.eps}
 150 \fi
 151 % graphicx was written by David Carlisle and Sebastian Rahtz. It is
 152 % required if you want graphics, photos, etc. graphicx.sty is already
 153 % installed on most LaTeX systems. The latest version and documentation can
 154 % be obtained at:
 155 % http://www.ctan.org/tex-archive/macros/latex/required/graphics/
 156 % Another good source of documentation is "Using Imported Graphics in
 157 % LaTeX2e" by Keith Reckdahl which can be found as epslatex.ps or
 158 % epslatex.pdf at: http://www.ctan.org/tex-archive/info/
 159 %
 160 % latex, and pdflatex in dvi mode, support graphics in encapsulated
 161 % postscript (.eps) format. pdflatex in pdf mode supports graphics
 162 % in .pdf, .jpeg, .png and .mps (metapost) formats. Users should ensure
 163 % that all non-photo figures use a vector format (.eps, .pdf, .mps) and
 164 % not a bitmapped formats (.jpeg, .png). IEEE frowns on bitmapped formats
 165 % which can result in "jaggedy"/blurry rendering of lines and letters as
 166 % well as large increases in file sizes.
 167 %
 168 % You can find documentation about the pdfTeX application at:
 169 % http://www.tug.org/applications/pdftex
 170
 171
 172
 173
 174
 175 % *** MATH PACKAGES ***
 176 %
 177 %\usepackage[cmex10]{amsmath}
 178 % A popular package from the American Mathematical Society that provides
 179 % many useful and powerful commands for dealing with mathematics. If using
 180 % it, be sure to load this package with the cmex10 option to ensure that
 181 % only type 1 fonts will utilized at all point sizes. Without this option,
 182 % it is possible that some math symbols, particularly those within
 183 % footnotes, will be rendered in bitmap form which will result in a
 184 % document that can not be IEEE Xplore compliant!
 185 %
 186 % Also, note that the amsmath package sets \interdisplaylinepenalty to 10000
 187 % thus preventing page breaks from occurring within multiline equations. Use:
 188 %\interdisplaylinepenalty=2500
 189 % after loading amsmath to restore such page breaks as IEEEtran.cls normally
 190 % does. amsmath.sty is already installed on most LaTeX systems. The latest
 191 % version and documentation can be obtained at:
 192 % http://www.ctan.org/tex-archive/macros/latex/required/amslatex/math/
 193
 194
 195
 196
 197
 198 % *** SPECIALIZED LIST PACKAGES ***
 199 %
 200 \usepackage[ruled,lined,linesnumbered]{algorithm2e}
 201 %\usepackage{algorithmic}
 202 % algorithmic.sty was written by Peter Williams and Rogerio Brito.
 203 % This package provides an algorithmic environment fo describing algorithms.
 204 % You can use the algorithmic environment in-text or within a figure
 205 % environment to provide for a floating algorithm. Do NOT use the algorithm
 206 % floating environment provided by algorithm.sty (by the same authors) or
 207 % algorithm2e.sty (by Christophe Fiorio) as IEEE does not use dedicated
 208 % algorithm float types and packages that provide these will not provide
 209 % correct IEEE style captions. The latest version and documentation of
 210 % algorithmic.sty can be obtained at:
 211 % http://www.ctan.org/tex-archive/macros/latex/contrib/algorithms/
 212 % There is also a support site at:
 213 % http://algorithms.berlios.de/index.html
 214 % Also of interest may be the (relatively newer and more customizable)
 215 % algorithmicx.sty package by Szasz Janos:
 216 % http://www.ctan.org/tex-archive/macros/latex/contrib/algorithmicx/
 217
 218
 219
 220
 221 % *** ALIGNMENT PACKAGES ***
 222 %
 223 \usepackage{array}
 224 % Frank Mittelbach's and David Carlisle's array.sty patches and improves
 225 % the standard LaTeX2e array and tabular environments to provide better
 226 % appearance and additional user controls. As the default LaTeX2e table
 227 % generation code is lacking to the point of almost being broken with
 228 % respect to the quality of the end results, all users are strongly
 229 % advised to use an enhanced (at the very least that provided by array.sty)
 230 % set of table tools. array.sty is already installed on most systems. The
 231 % latest version and documentation can be obtained at:
 232 % http://www.ctan.org/tex-archive/macros/latex/required/tools/
 233
 234
 235 \usepackage{mdwmath}
 236 \usepackage{mdwtab}
 237 % Also highly recommended is Mark Wooding's extremely powerful MDW tools,
 238 % especially mdwmath.sty and mdwtab.sty which are used to format equations
 239 % and tables, respectively. The MDWtools set is already installed on most
 240 % LaTeX systems. The lastest version and documentation is available at:
 241 % http://www.ctan.org/tex-archive/macros/latex/contrib/mdwtools/
 242
 243
 244 % IEEEtran contains the IEEEeqnarray family of commands that can be used to
 245 % generate multiline equations as well as matrices, tables, etc., of high
 246 % quality.
 247
 248
 249 %\usepackage{eqparbox}
 250 % Also of notable interest is Scott Pakin's eqparbox package for creating
 251 % (automatically sized) equal width boxes - aka "natural width parboxes".
 252 % Available at:
 253 % http://www.ctan.org/tex-archive/macros/latex/contrib/eqparbox/
 254
 255
 256
 257
 258
 259 % *** SUBFIGURE PACKAGES ***
 260 %\usepackage[tight,footnotesize]{subfigure}
 261 % subfigure.sty was written by Steven Douglas Cochran. This package makes it
 262 % easy to put subfigures in your figures. e.g., "Figure 1a and 1b". For IEEE
 263 % work, it is a good idea to load it with the tight package option to reduce
 264 % the amount of white space around the subfigures. subfigure.sty is already
 265 % installed on most LaTeX systems. The latest version and documentation can
 266 % be obtained at:
 267 % http://www.ctan.org/tex-archive/obsolete/macros/latex/contrib/subfigure/
 268 % subfigure.sty has been superceeded by subfig.sty.
 269
 270
 271
 272 %\usepackage[caption=false]{caption}
 273 %\usepackage[font=footnotesize]{subfig}
 274 % subfig.sty, also written by Steven Douglas Cochran, is the modern
 275 % replacement for subfigure.sty. However, subfig.sty requires and
 276 % automatically loads Axel Sommerfeldt's caption.sty which will override
 277 % IEEEtran.cls handling of captions and this will result in nonIEEE style
 278 % figure/table captions. To prevent this problem, be sure and preload
 279 % caption.sty with its "caption=false" package option. This is will preserve
 280 % IEEEtran.cls handing of captions. Version 1.3 (2005/06/28) and later
 281 % (recommended due to many improvements over 1.2) of subfig.sty supports
 282 % the caption=false option directly:
 283 \usepackage[caption=false,font=footnotesize]{subfig}
 284 %
 285 % The latest version and documentation can be obtained at:
 286 % http://www.ctan.org/tex-archive/macros/latex/contrib/subfig/
 287 % The latest version and documentation of caption.sty can be obtained at:
 288 % http://www.ctan.org/tex-archive/macros/latex/contrib/caption/
 289
 290
 291
 292
 293 % *** FLOAT PACKAGES ***
 294 %
 295 \usepackage{fixltx2e}
 296 % fixltx2e, the successor to the earlier fix2col.sty, was written by
 297 % Frank Mittelbach and David Carlisle. This package corrects a few problems
 298 % in the LaTeX2e kernel, the most notable of which is that in current
 299 % LaTeX2e releases, the ordering of single and double column floats is not
 300 % guaranteed to be preserved. Thus, an unpatched LaTeX2e can allow a
 301 % single column figure to be placed prior to an earlier double column
 302 % figure. The latest version and documentation can be found at:
 303 % http://www.ctan.org/tex-archive/macros/latex/base/
 304
 305
 306
 307 %\usepackage{stfloats}
 308 % stfloats.sty was written by Sigitas Tolusis. This package gives LaTeX2e
 309 % the ability to do double column floats at the bottom of the page as well
 310 % as the top. (e.g., "\begin{figure*}[!b]" is not normally possible in
 311 % LaTeX2e). It also provides a command:
 312 %\fnbelowfloat
 313 % to enable the placement of footnotes below bottom floats (the standard
 314 % LaTeX2e kernel puts them above bottom floats). This is an invasive package
 315 % which rewrites many portions of the LaTeX2e float routines. It may not work
 316 % with other packages that modify the LaTeX2e float routines. The latest
 317 % version and documentation can be obtained at:
 318 % http://www.ctan.org/tex-archive/macros/latex/contrib/sttools/
 319 % Documentation is contained in the stfloats.sty comments as well as in the
 320 % presfull.pdf file. Do not use the stfloats baselinefloat ability as IEEE
 321 % does not allow \baselineskip to stretch. Authors submitting work to the
 322 % IEEE should note that IEEE rarely uses double column equations and
 323 % that authors should try to avoid such use. Do not be tempted to use the
 324 % cuted.sty or midfloat.sty packages (also by Sigitas Tolusis) as IEEE does
 325 % not format its papers in such ways.
 326
 327
 328
 329 % correct bad hyphenation here
 330 % \hyphenation{op-tical net-works semi-conduc-tor}
 331
 332
 333 \begin{document}
 334 %
 335 % paper title
 336 % can use linebreaks \\ within to get better formatting as desired
 337 \title{GPU implementation of a region based algorithm \\ for large images segmentation}
 338
 339
 340 % author names and affiliations
 341 % use a multiple column layout for up to two different
 342 % affiliations
 343
 344 \author{
 345 \IEEEauthorblockN{Gilles Perrot$^1$, St\'{e}phane Domas$^1$, Rapha\"{e}l Couturier$^1$, Nicolas Bertaux$^2$}
 346 \IEEEauthorblockA{$^1$Distributed Numerical Algorithmics team (AND), Laboratoire d'Informatique de Franche-comt\'{e}\\
 347 Rue Engel Gros, 90000 Belfort, France.\\
 348 forename.name@univ-fcomte.fr}
 349 %\IEEEauthorblockN{Nicolas Bertaux}
 350 \IEEEauthorblockA{$^2$ Institut Fresnel, CNRS, Aix-Marseille Universit\'e, Ecole Centrale Marseille,\\
 351 Campus de Saint-J\'er\^ome, 13013 Marseille, France.\\
 352 nicolas.bertaux@ec-marseille.fr}
 353 }
 354
 355
 356
 357 % use for special paper notices
 358 %\IEEEspecialpapernotice{(Invited Paper)}
 359
 360
 361 % make the title area
 362 \maketitle
 363
 364 \begin{abstract}
 365 Image segmentation is one of the most challenging issues in image computing.
 366 In this work, we focus on region-based active contour techniques (snakes) as they seem to achieve a high level of robustness and fit with a large range of
 367 applications. Some algorithmic optimizations provide significant speedups, but even so, execution times are still non-neglectable
 368 with the continuing increase of image sizes. Moreover, these algorithms are not well suited for running on multi-core CPU's.
 369 At the same time, recent developments of Graphical Processing Units (GPU) suggest that higher speedups could be obtained
 370 by use of their specific design. We have managed to adapt a specially efficient snake algorithm that fits recent Nvidia GPU architecture
 371 and takes advantage of its massive multithreaded execution capabilities. The speedup obtained is most often around 7.
 372 \end{abstract}
 373
 374 \begin{IEEEkeywords}
 375  GPU; segmentation; snake;
 376 \end{IEEEkeywords}
 377
 378 \section{Introduction}
 379 Segmentation and shape detection are still key issues in image computing. These techniques are used in numerous fields ranging from medical imaging to video tracking, shape recognition or localization.
 380 Since 1988, the active contours (snakes) introduced by and  Kass et al. \cite{KassWT88}, have proved to be efficient and robust, especially against noise, for a wide range of image types.
 381
 382 The main shortcoming of these algorithms is often their high dependence on the initial contour, though several contributions have lowered this dependency and also brought
 383 more accurate segmentation of non convex shape \cite{ChesnaudRB99}.
 384
 385 The information that drives a contour model comes either from the contour itself or from the characteristics of the regions it defines.
 386 For noisy images, the second option is often more suitable as it takes into account the statistical fluctuations of the pixels.
 387 One approach \cite{ChesnaudRB99} proposes a geometric (polygonal) region-based snake driven by the minimization of the likelihood (ML).
 388
 389 An important issue of image processing, especially segmentation, has always been the computation time of most algorithms. Over the years, the increase of CPU computing capabilities,
 390 although quite impressive, has not been able to fulfill the combined needs of growing resolution and real-time computation.
 391 Since having been introduced in the early 1980's, the capabilities and speed of graphics accelerators have always been increasing. So much so that the recent GPGPU
 392 (General Purpose Graphic Processing Units) currently benefit by a massively parallel architecture for general purpose programming, especially when dealing with large matrices
 393 or vectors. On the other hand, their specific design obviously imposes a number of limitations and constraints.
 394
 395 \subsection{\label{related}Related work}
 396 Since the main issue with most of the segmentation methods is the computational effort it implies, researchers have recently tried to benefit from the GPGPU architecture
 397 to reduce time costs. In \cite{catanB09}  authors achieve impressive speed-ups on a range of contour detection algorithms on the condition that they are applied
 398 to images with good contrast and SNR (Signal-to-Noise Ratio). Others, like \cite{cremD09}, have focused on the related issues of segmentation and tracking:
 399 this proves efficient in processing low-contrast images, but imposes limits as to the size of images that cannot be of big dimensions (several million pixels).
 400 One third option, parametric snakes, has also been investigated with some success, as in \cite{brunett}, although the principle of computation per small tile is not suited
 401 to the algorithm we have implemented. In the medical imaging field, some researchers have implement efficient parallel segmentation algorithms.
 402 For example in \cite{KauffmannP08}, auhtors implement a GPU watershed-based algorithm, but source images are considered with a good contrast.
 403
 404 \subsection{\label{contrib}Contribution}
 405  The geometric snake we have focused on has proved to be efficient processing “real-life” images, with poor SNR and contrast. However,
 406 the required computational effort still imposes limits to the size of a wide range of images to be processed, such as the output of SAR (Synthetic Aperture Radar).
 407 Our goal was then to propose a way to fit such a region-based snake algorithm to the Nvidia Tesla$^{\textcopyright}$ GPU architecture.
 408  The remainder of this paper exposes the principles of the algorithm and notations in section \ref{secCPUalgooutlines}.
 409  Section \ref{secCPUalgodetails} deals with the details of sequential CPU implementation. Section \ref{GPUgeneralites} summarizes the main features of
 410  the Nvidia$^{\textcopyright}$ GPU and explains how to deal with them efficiently. Then sections \ref{GPUimplementation} and \ref{secSpeedups} detail our GPU implementation
 411 and timing results. In our conclusion \ref{secConclusion} we attempt to evaluate the pros and cons of this implementation, and suggest further tracks to be investigated
 412 in future research.
 413
 414
 415 \section{\label{secCPUalgooutlines}Sequential algorithm: outlines}
 416 The goal of the active contour segmentation method (snake) we studied \cite{ChesnaudRB99} is to distinguish, inside an image $I$, a target region $T$ from the background region
 417 $B$. The size of $I$ is L x H pixels of coordinates $(i,j)$ and gray level $z(i,j)$. $Z$ represents the gray levels data of $I$.
 418 We assume that the gray levels of $T$ and $B$ are vectors of independent and identically distributed values, each with a probability density function (PDF)
 419  $p^{\Omega}$ $(\Omega \in \{T ; B\})$.
 420 The present implementation uses a Gaussian PDFw, but another one can easily be used as Gamma or Poisson (Cf. \cite{ChesnaudRB99}).
 421
 422 The \textit{active contour} $S$, which defines the shape of $T$ is chosen as polygonal.
 423 The purpose of the segmentation is then to determine the shape that optimizes a generalized log-likelihood-based criterion (GL).
 424 This is done by an iterative process which is initialized with an arbitrary shape, then at each step:
 425 \begin{enumerate}
 426   \item it modifies the shape
 427   \item it estimates the parameters of the Gaussian functions for the two regions and evaluates the criterion.
 428   \item it validates the new shape if the criterion has a better value.
 429 \end{enumerate}
 430 A simplified description of it is given in \emph{Algorithm \ref{cpualgosimple}} which features two nested loops: the main one, on iteration level, is
 431 responsible for tuning the number of nodes; the inner one, on step level, takes care of finding the best shape for a given number of nodes.
 432 \emph{Figure \ref{images_algo}} shows intermediate results at iteration level. Sub-figure \emph{\ref{fig:labelinit}} shows the initial rectangular shape, \emph{\ref{fig:labelit1}}
 433  shows the best four-node shape that ends
 434 the first iteration. Sub-figures \emph{\ref{fig:labelit2}} and \emph{\ref{fig:labelit4}} show the best shape for an eight-node contour (resp. 29-node)
 435 which occurs at the end of the second  iteration (resp. fourth).
 436
 437 \begin{algorithm}[h]
 438 \label{cpualgosimple}
 439 \caption{Sequential algorithm: outlines}
 440 \SetNlSty{textbf}{}{:}
 441
 442   %compute\_cumulated\_images()\;
 443   begin with a rectangular 4 nodes contour\;
 444   \Repeat(\tcc*[f]{iteration level}){no more node can be added}{
 445       \Repeat(\tcc*[f]{step level}){no more node can be moved}{
 446           Test some other positions for each node, near its current position\;
 447           Find the best GL and adjust the node's position\;
 448       }
 449   Add a node in the middle of each \emph{long enough} segment\;
 450   }
 451 \end{algorithm}
 452
 453
 454 \begin{figure}[h]
 455  \centering
 456 \subfloat[Initial contour]{\label{fig:labelinit} \includegraphics[width=0.4\linewidth]{./img/cochon_petit_init.jpg}}\qquad
 457 \subfloat[End of first iteration (4 nodes) ]{\label{fig:labelit1} \includegraphics[width=0.4\linewidth]{./img/cochon_petit_it1.jpg}}\\
 458 \subfloat[End of second iteration (8 nodes)]{\label{fig:labelit2} \includegraphics[width=0.4\linewidth]{./img/cochon_petit_it2.jpg}}\qquad
 459 \subfloat[End of fourth iteration (29 nodes)]{\label{fig:labelit4} \includegraphics[width=0.4\linewidth]{./img/cochon_petit_it4.jpg}}
 460 %\subfloat[width=0.4\linewidth]{./img/cochon_b_entier.jpg}
 461  % cochon_b_entier.jpg: 3960x2970 pixel, 72dpi, 139.70x104.78 cm, bb=0 0 3960 2970
 462  \caption{segmentation of a noisy image}
 463   \label{images_algo}
 464 \end{figure}
 465
 466
 467 \section{\label{secCPUalgodetails}Sequential algorithm: details}
 468 \subsection{Criterion}
 469 Let $p^{\Omega}$ be a Gaussian PDF. Its vector of parameters $\Theta_{\Omega}$ ($\Omega \in \{T ; B\}$) has two components,
 470 the average value $\mu$ and the standard deviation $\sigma$.
 471 The likelihood for the regions $\Omega$ $(\Omega \in \{T ; B\})$ is given by
 472 $$ P[Z|T, B, \Theta_T, \Theta_B] = P(Z|T, \Theta_T)P(Z|B, \Theta_B)$$
 473 where
 474 $$P(Z|\Omega,  \Theta_{\Omega}) = \prod_{(i,j)\in \Omega} p^{\Omega}[z(i,j), \Theta_{\Omega}] ~~~~(\Omega \in \{T ; B\})$$
 475 The log-likelihood of the region $\Omega$ is then
 476 $$-N_{\Omega}\log\left(\sqrt{2\pi}\right) -N_{\Omega}.log\left(\sigma\right) - \frac{1}{2\sigma^2}\sum_{(i,j)\in \Omega} \left( z(i,j)-\mu \right)^2 $$
 477 inside which the vectors of parameters $\Theta_{\Omega}$ are determined by ML estimation
 478 $$
 479 \widehat{\Theta_{\Omega}} \left(
 480 \begin{array}{l}
 481 \widehat{\mu_{\Omega}} = \frac{1}{N_{\Omega}} \displaystyle\sum_{(i,j)\in \Omega} z(i,j) \\
 482 \widehat{\sigma^2_{\Omega}} = \frac{1}{N_{\Omega}} \displaystyle\sum_{(i,j)\in \Omega} \left(z(i,j) - \widehat{\mu_{\Omega}}\right)^2 \\
 483   \end{array}
 484 \right.
 485 $$
 486
 487 Considering the two regions, the criterion to be optimized is then, up to a constant, the Generalized Likelihood (GL):
 488 $$GL = \frac{1}{2}\left( N_B\log\left(\widehat{\sigma_B}^2\right) + N_T\log\left(\widehat{\sigma_T}^2\right)\right)$$
 489
 490 \subsection{CPU implementation}
 491 Let $S_{n,l}$ be the polygonal contour state at step $l$ of iteration $n$, and $S_{n,l}^i$ the node $i$ of $S_{n,l}$ ($i \in [0;N_n]$).
 492 $S_{n,l}^{i,w}$ is the neighbor of index $\omega$ of the node $S_{n,l}^i$ in a 8-connexity meaning with $d$ pixels scope.
 493 Each segment of $S_{n,l}$ is considered as an oriented list of discrete points.
 494 Chesnaud \& R\'{e}fr\'{e}gier, based on the Green-Ostogradski theorem,  have shown how to replace the 2 dimensions (2D) sums needed to estimate $\Theta_{\Omega}$ by 1 dimension sums along $S_{n,l}$ \cite{ChesnaudRB99}.
 495 This approach leads to compute a pair of transformed images, called cumulated images, at the very beginning of the process, which are then used as lookup tables.
 496 It also involves weighting coefficients for pixels and segments of the contour. See \cite{ChesnaudRB99} for details.
 497 Therefore, beyond this point, we will talk about the \emph{contribution} of each point to the 1D sums. By extension, we also talk about the \emph{contribution} of each segment to the 1D sums.
 498
 499 A more detailed description of the sequential algorithm is given by \emph{Algorithm \ref{cpualgo}}.
 500 The process starts with the computation of cumulated images; an initialization stage  takes place from line \ref{debinit} to line \ref{fininit}.
 501 Then we recognize the two nested loops (line \ref{loopnewnodes} and line \ref{loopmovenodes}) and finally the heart of the algorithm stands on line \ref{kernelGL} which represents
 502 the main part of the calculations to be done:
 503 \begin{enumerate}
 504  \item compute the various sums without the contributions of both segments connected to current node  $S_{n,l}^i$.
 505   \item \label{CPUcontrib_segments} compute the contributions of both segments, which requires:
 506   \begin{itemize}
 507    \item \label{CPUbresenham} To determine the coordinates of every discrete pixel of both segments connected to $S_{n,l}^{i,w}$.
 508     \item \label{CPUcontrib_pixels} To compute every pixel contribution.
 509     \item To sum pixel contributions to obtain segment contributions.
 510   \end{itemize}
 511    \item compute the GL given the contribution of each segment of the tested contour.
 512 \end{enumerate}
 513
 514 \begin{algorithm}[h]
 515 \SetNlSty{textbf}{}{:}
 516 \caption{Sequential simplified algorithm}
 517 \label{cpualgo}
 518    read image from source (Hard Disk Drive)\;
 519    compute\_cumulated\_images()\label{cumuls}\;
 520    iteration $n \leftarrow 0$\label{debinit}\;
 521    $N_0 \leftarrow 4$\;
 522    $S_{n,l} \leftarrow S_{0,0}$\;
 523    step $d \leftarrow d_{max}$ an arbitrary power of 2 value\;
 524    current node $S_{0,0}^i \leftarrow S_{0,0}^0$\;
 525    $l \leftarrow 0$\;
 526    compute $GL_{ref}$, the GL of $S_{n,0}$\label{fininit}\;
 527   \Repeat(\tcc*[f]{iteration level, n index}){no new node added}{\label{loopnewnodes}
 528     \Repeat(\tcc*[f]{step level, l index}){no node move occured}{\label{loopmovenodes}
 529       \For{$i=0$ to $N_n$}{
 530         $S_{n,l}^{i,w}$ ($w \in [0;7]$) are the neighbors of $S_{n,l}^i$ by $d$ pixels\;
 531         \For{$w=0$ to $7$}{
 532           compute $GL_w$ for $S_{n,l}$ when $S_{n,l}^{i,w}$ replaces $S_{n,l}^i$ \label{kernelGL}\;
 533           \lIf{$GL_w$ is better than $GL_{ref}$}{
 534             $GL_{ref} \leftarrow GL_w$\;
 535             move node $S_{n,l}^i \leftarrow S_{n,l}^{i,w}$\;
 536           }
 537         }
 538       }
 539      $l \leftarrow l+1$\;
 540    }
 541     add new nodes, $N_n \leftarrow N_n + N_{newnodes}$\;
 542    \lIf{$d > 1$}{ $d \leftarrow d/2$ } \lElse{ $d=1$ }\;
 543     $n \leftarrow n+1$\;
 544     compute $GL_{ref}$, the GL of $S_{n,0}$ \;
 545   }
 546 \end{algorithm}
 547
 548
 549 The profiling results of the CPU implementation shown in \emph{Figure \ref{CPUprofile}} display the relative costs of the most time-consuming functions.
 550 It appears that more than 80\% of the total execution time is always spent by only three functions:
 551 \begin{itemize}
 552  \item \texttt{compute\_segment\_contribution()} which is responsible for point \ref{CPUcontrib_segments} above,
 553   \item \texttt{compute\_cumulated\_images()} which computes the 3 lookup tables at the very beginning,
 554   \item \texttt{compute\_pixels\_coordinate()} which is called by \texttt{compute\_segment\_contribution()}.
 555 \end{itemize}
 556
 557 \begin{figure}[h]
 558  \centering
 559  \includegraphics[width=1.0\linewidth, height=0.5\linewidth]{./img/cpu_profil.png}
 560  \caption{\label{CPUprofile}the three most-consumming functions for various image sizes}
 561 \end{figure}
 562
 563 Measurements have been performed for several image sizes from 15~MPixels (about 3900 x 3900)
 564 to 144 MPixels (about 12000 x 12000). On the one hand, we can notice that function \texttt{compute\_segment\_contribution()} always lasts more than 45\% of the total running time, and even
 565 more when the image gets larger.
 566 On the other hand, the function \texttt{compute\_cumulated\_images()} costs more than 23\%, decreasing with image size, while function \texttt{compute\_pixels\_coordinate()} always takes around 6\%.
 567 It confirms that the need for parallelization resides in line \ref{kernelGL} and line \ref{cumuls} of Algorithm \ref{cpualgo} as they contain every call to those three functions.
 568
 569 The following sections detail how we managed to implement these time-consumming functions in parallel, but
 570 a brief reminder on GPU's recent architecture is presented first.
 571
 572
 573
 574 \section{\label{GPUgeneralites}NVidia's GPU architecture}
 575 GPUs are multi-core, multi-threaded processors, optimized for highly parallel computation. Their design focuses on a Single Instruction Multiple Threads (SIMT) model by devoting
 576 more transistors to data processing rather than data-caching and flow control \cite{CUDAPG}.
 577 For example, a C2050 card features 3GB of global memory and a total of 448 cores bundled in several Streaming Multiprocessors (SM). An amount of
 578 shared memory, much faster than the global memory, is avalaible on each SM (from 16~KB to 487~KB)
 579 % \begin{itemize}
 580 %  \item 8 Scalar Processors (SP)
 581 %   \item two Floating Point Units (FPU)
 582 %   \item two parallel execution units (SIMT) that runs threads by half-warps of 16.
 583 %  \item 48KB (max) of shared memory, organized in 32 banks of 32 bits words
 584 % \end{itemize}
 585 % Nvidia uses a parameter called the \emph{compute capability} of each GPU model. Its value is composed of a major number and a minor number ; for example the C1060 is a sm13 GPU (major=1 minor=3)
 586 %
 587 % \begin{figure*}[htbp]
 588 %  \centering
 589 %  \includegraphics[width=0.7\linewidth]{./img/GPU_block.png}
 590 %  \caption{\label{GPUC1060}schematic diagram of GPU's internal architecture}
 591 % \end{figure*}
 592
 593 % The recent Fermi cards (eg. C2050,) have improved performances by supplying more shared memory in a 32 banks array, a second execution
 594 % unit and several managing capabilities on both the shared memory and level 1 cache memory ( \cite{CUDAPG}, \cite{CUDAFT}, \cite{CUDAFC}.
 595 Writing efficient code for such architectures is not obvious, as re-serialization must be avoided as much as possible. Thus, when designing, one must
 596 keep a few key points in mind:
 597 \begin{itemize}
 598  \item the CUDA model organizes threads by a) threads blocks in which synchronization is possible, b) a grid of blocks with no possible synchronization
 599       between blocks.
 600   \item there is no way to know in what order the blocks are to be scheduled during one single kernel execution.
 601   \item data must be kept in GPU memory, to reduce the overhead due to copying between CPU and GPU memories.
 602   \item the total amount of threads running the same computation must be maximized.
 603   \item the number of execution branches inside a block should be reduced as much as possible.
 604   \item global memory accesses should be coalescent, \emph{ie}. memory accesses done by physically parallel threads (16 at a time) must be consecutive and contained in a 128 Bytes range.
 605   \item shared memory is organized by 16 x 32 bits wide banks. To avoid bank conflicts, each parallel thread (16 at a time) must access a different bank.
 606 \end{itemize}
 607
 608 All the above charasteristics make it always a quite constrained problem to solve when designing a GPU code.
 609 %detailler
 610 Moreover, a non suited code would probably run even slower on GPU than on CPU due to the automatic serialization which would be done at run time.
 611 %%
 612 \section{\label{GPUimplementation}GPU implementation}
 613 In the implementation described below, pre-computations and proper segmentation are discussed separately.
 614 To keep data in GPU memory, the whole computation is assigned to the GPU. CPU still hosts:
 615 \begin{itemize}
 616  \item data reading from HDD
 617   \item data writing on HDD if needed
 618   \item main loops control (corresponding to lines \ref{loopnewnodes} and \ref{loopmovenodes} of Algorithm \ref{cpualgo})
 619 \end{itemize}
 620
 621 It must be noticed that controlling these loops is achieved with only a very small amount of data being transferred between host (CPU) and device (GPU),
 622 which does not produce high overhead. \\
 623 Morever, the structures described below need 20 Bytes per pixel of the image to process (plus an offset of about 50~MByte).
 624 It defines the maximum image size we can accept: approximately 150 M Pixels.
 625
 626 \subsection{Pre-computations}
 627 To replace 2D sums by 1D sums, Chesnaud \textit{et al.} \cite{ChesnaudRB99}  have shown that the three matrices below should be computed:
 628 $$C_1(i,j) = \sum_{k=0}^{k=j} (1+k)$$
 629 $$C_z(i,j) = \sum_{k=0}^{k=j} z(i,k)$$ and
 630 $$C_{z^2}(i,j) = \sum_{k=0}^{k=j} z^2(i,k)$$
 631 Where $z(i,k)$ is the gray level of pixel of coordinate $(i,j)$, so that $C_1$, $C_z$ and $C_{z^2}$ are the same size as image $I$.
 632
 633 \begin{figure*}[htbp]
 634   \centering
 635  \resizebox{0.8\linewidth}{0.3\linewidth}{\input{./img/GPUcumuls.pdf_t}}
 636   \caption{\label{GPUcumuls}\texttt{compute\_blocks\_prefixes()} details.}
 637 \end{figure*}
 638
 639 \medskip
 640 First, we chose not to generate $C_1(i,j)$, which requires that values should be computed when needed, but saves global memory and does not lead to any overhead.
 641 The computation of $C_{z}$ and $C_{z^2}$ easily decomposes into series of  \emph{inclusive prefixsums} \cite{Harris07}.
 642 However, by keeping the \emph{1 thread per pixel} rule, as the total number of threads that can be run in a grid cannot exceed $2^{25}$ (Cf. \cite{CUDAPG}),
 643 slicing is necessary for images exceeding a size threshold which can vary according to the GPU model (e.g. 33 MPix for sm13 GPU, eg. C1060).
 644 It's quite easy to do, but it leads to a small overhead as the process requires multiple calls to one kernel.
 645 Slicing can be done in two ways:
 646 \begin{itemize}
 647  \item all slices are of the same size (balanced)
 648   \item slices fit the maximum size allowed by the GPU, leaving one smaller slice at the end of the process (full-sized).
 649 \end{itemize}
 650 The balanced slice option has proved to run faster.\\
 651 For example: if a given image has 9000 lines and the GPU can process up to 4000 lines at a time, it's faster to run 3 times with 3000 lines rather than twice with
 652 4000 and once with 1000.
 653
 654 As the sums in $C_z$ and $C_{z^2}$ are row-wide, it is easy to see that every block-wide sum will be needed before being able to use it in the global sum.
 655 But as mentioned earlier, the scheduling of blocks must be considered as random.
 656 So, in order to ensure synchronizations, each row of the original image is then treated by three different kernels:
 657 \begin{itemize}
 658  \item \texttt{compute\_blocks\_prefixes()}.
 659   \item \texttt{scan\_blocksums()}.
 660   \item \texttt{add\_sums2prefixes()}.
 661 \end{itemize}
 662 Figures \ref{GPUcumuls}, \ref{GPUscansomblocs} and \ref{GPUaddsoms2cumuls} show relevant data structures for a given row $i$ of $I$.
 663 We assume that each thread block runs $bs$ threads in parallel and each row of $C_z$ needs $n$ blocks to cover its $L$ pixels.
 664
 665 Figure \ref{GPUcumuls} shows the details of the process for row $i$ of the original image $I$, already stored in GPU global memory.
 666 Operands are first copied into GPU shared memory for efficiency reasons.
 667 An inclusive prefixsum is then performed inside each independant thread block.
 668 At this point, only the first shared memory block contains the final values. Its last element contains the sum of all
 669 elements in the corresponding block of $I$.
 670 In order to obtain the right values for the row $i$ of $C_z$, every element value in the other blocks must then be summed with an offset value.
 671 This offset value is the sum of all element values in every corresponding previous block of row $i$.
 672
 673 As the scheduling of blocks is fully unpredictable, the necessary intermediate results have to be stored in GPU global memory before exiting from kernel.
 674 Each element of the prefixsums in GPU shared memory has been stored in its corresponding position in $C_z$ (GPU global mem),
 675 along with the vector of block sums which will be passed later to the next kernel \texttt{scan\_blocksums()}.
 676
 677 The kernel \texttt{scan\_blocksums()} (Figure \ref{GPUscansomblocs}) only makes an exclusive prefixsum on the vector of block sums described above.
 678 The result is a vector containing, at index $x$, the value to be added to every element of block $x$ in each line of $C_z$.
 679
 680 This summing is done in shared memory by kernel \texttt{add\_sums2prefixes()} as described by Figure \ref{GPUaddsoms2cumuls}.
 681
 682 The values of $C_{z^2}$ are obtained together with those of $C_{z}$ and in exactly the same way.
 683 For publishing reasons, figures do not show the $C_{z^2}$ part of structures.
 684
 685
 686
 687 \begin{figure*}[htbp]
 688   \centering
 689  \resizebox{0.6\linewidth}{0.2\linewidth}{\input{./img/GPUscansomblocs.pdf_t}}
 690   \caption{\label{GPUscansomblocs}\texttt{scan\_blocksums()} details.}
 691 \end{figure*}
 692
 693 \begin{figure*}[htbp]
 694   \centering
 695  \resizebox{0.7\linewidth}{0.4\linewidth}{\input{./img/GPUaddsoms2cumuls.pdf_t}}
 696   \caption{\label{GPUaddsoms2cumuls}\texttt{add\_sums2prefixes()} details.}
 697 \end{figure*}
 698
 699 With this implementation, speedups are quite significant (Table \ref{tabresults}). Moreover, the larger the image,
 700 the higher the speedup is, as the step-complexity of the sequential algorithm is of $O(N^2)$ and  $O(N\log(N))$ for the parallel version.
 701 Even higher speedups are achieved by adapting the code to specific-size images, especially when the number of columns is a power of 2. This avoids
 702 inactive threads in the grid, and thus improves efficiency.
 703 However, since the use of 64-bit sums is imposed by image sizes (up to 12000 pixel wide) and 16-bit pixel coding,
 704 computations are made with a 2-way bank conflict as sums are based on 64-bit words, thus creating overhead.
 705
 706
 707 \subsection{Segment contributions}
 708 The choice made for this implementation has been to keep the \emph{1 thread per pixel} rule for the main kernels.
 709 Of course, some reduction stages need to override this principle and will be pointed out.
 710
 711 As each of the $N_n$ nodes of the contour $S_{n,l}$ may move to one of the eight neighbor positions as shown in \emph{Figure \ref{GPUtopo}},
 712 there is $16 N_n$ segments whose contribution has to be estimated.
 713 The best combination is then chosen to  obtain $S_{n,l+1}$ (Figure \ref{GPUtopo}).
 714 Segment contributions are computed in parallel by kernel \texttt{GPU\_compute\_segments\_contrib()}.
 715
 716 \begin{figure}[h]
 717   \centering
 718  \resizebox{0.9\linewidth}{0.81\linewidth}{\input{./img/topologie.pdf_t}}
 719   \caption{\label{GPUtopo}Optimization of node locations using 8 position tests around each node.}
 720 \end{figure}
 721
 722 The grid parameters for this kernel are determined according to the size of the longest segment $npix_{max}$.
 723 If $bs_{max}$ is the maximum theoritical blocksize that a GPU can accept,
 724 \begin{itemize}
 725  \item the block size $bs$ is taken as
 726   \begin{itemize}
 727    \item $npix_{max}$'s next power of two if \\${npix_{max} \in [33 ; bs_{max} ] }$
 728     \item 32 if ${npix_{max} < 32 }$
 729     \item $bs_{max}$ if ${npix_{max} > 256 }$
 730   \end{itemize}
 731   \item the number of threads blocks assigned to each segment, $N_{TB} = \frac{npix_{max} + bs -1 }{bs}$
 732 \end{itemize}
 733 Our implementation makes intensive use of shared memory and does not allow the use of the maximum theoritical blocksizes
 734 (512 for sm13, 1024 for sm20, see \cite{CUDAFT} and \cite{CUDAPG}).
 735 Instead we set $bs_{max}^{sm13} = 256$ and $bs_{max}^{sm20} = 512$.
 736 Anyway, testing has shown that most often, the best value is 256 for both \textit{sm13} and \textit{sm20} GPU's.
 737
 738 \begin{figure*}[htbp]
 739   \centering
 740  \resizebox{0.6\linewidth}{0.35\linewidth}{\input{./img/contribs_segments.pdf_t}}
 741   \caption{\label{contribs_segments}structure for segments contributions computation. Gray symbols help to locate inactive threads as opposed to black
 742           ones that figure active threads.}
 743 \end{figure*}
 744
 745 Then \texttt{GPU\_compute\_segments\_contrib()} computes in parallel:
 746 \begin{itemize}
 747  \item each pixel coordinates for all $16 N_n$ segments. Since the contour is only read in one direction, we have been able
 748   to use a very simple parallel algorithm instead of Bresenham's.
 749 It is based on the slope $k$ of each segment: one pixel per row if $|k|>1$, one pixel per column otherwise.
 750   \item each pixel contribution by reading the corresponding values in the lookup tables.
 751   \item each thread-block sum of individual pixel contributions by running a \textit{reduction} stage for each block.
 752 \end{itemize}
 753 The top line of Figure \ref{contribs_segments} shows the base data structure in GPU shared memory which is relative to one segment.
 754 We concatenate the single segment structure as much as necessary to create a large vector representing every pixel of every test segment.
 755 As each segment has a different size (most often different from any power of two), there is a non-neglectable number of inactive threads scattered in the whole structure.
 756 Two stages are processed separately: one for all even nodes and another one for odd nodes,
 757 as shown in the two bottom lines of Figure \ref{contribs_segments}.
 758
 759
 760 The process is entirely done in shared memory; only a small amount of data needs to be stored in global memory for each segment:
 761 \begin{itemize}
 762  \item the coordinates of its middle point, in order to be able to add nodes easily if needed.
 763   \item the coordinates of its first and last two points, to compute the slope at each end of the segment.
 764 \end{itemize}
 765 The five values above are part of the weighting coefficients determination for each segment and node.
 766
 767  The \texttt{GPU\_sum\_contribs()} takes the blocks sums obtained by \texttt{GPU\_compute\_segments\_contrib()} and computes a second stage parallel summing to provide
 768 the $16 N_n$ segment contributions.
 769
 770 \subsection{Segments with a slope $k$ such as $|k|\leq1$}
 771 Such a segment is treated with 1 thread per column and consequently, it often has more than one pixel per row as shown by Figure \ref{tripix}.
 772 In an image row, consecutive pixels which belong to the target define an interval which can only have one low and one high ends.
 773 That's why, on each row, we choose to consider only the contributions of the innermost pixels.
 774 This selection is also done inside \texttt{GPU\_compute\_segments\_contrib()} when reading the lookup tables for each pixel contribution.
 775 We simply set a null contribution for pixels that need to be ignored.
 776 \begin{figure}[h]
 777   \centering
 778  \resizebox{0.75\linewidth}{0.35\linewidth}{\input{./img/tripix.pdf_t}}
 779   \caption{\label{tripix}zoom on part a of segment with $|k| < 1$,  at pixel level.}
 780 \end{figure}
 781
 782
 783 \subsection{Parameters estimation}
 784 A \texttt{GPU\_compute\_GL()} kernel computes in parallel:
 785 \begin{itemize}
 786  \item every $8N_n$ vector of parameters values corresponding to each possible next state of the contour. Summing is done in shared memory but relevant
 787   data for these operations are stored in global memory.
 788   \item every associated pseudo likelihood value.
 789   \item every node substitution when better GL have been found and if it does not lead to segments crossing.
 790 \end{itemize}
 791
 792 \subsection{End of segmentation}
 793 Segmentation is considered achieved out when no other node can be added to the contour (Algorithm \ref{gpualgosimple}).
 794 A very simple GPU kernel adds every possible node and returns the number it added.
 795
 796 \begin{algorithm}[h]
 797 \label{gpualgosimple}
 798 \caption{Parralel GPU algorithm: outlines. \texttt{<<<...>>>} indicates a GPU kernel parallel process.}
 799 \SetNlSty{textbf}{}{:}
 800   load images\;
 801   transfer image from CPU to GPU\;
 802   \texttt{<<<}compute the 2 cumulated images\texttt{>>>}\;
 803   \texttt{<<<}initialize the contour\texttt{>>>}\;
 804   \Repeat(\tcc*[f]{iteration level}){no more node can be added}{
 805       \Repeat(\tcc*[f]{step level}){no more node can be moved}{
 806           \texttt{<<<}find best neighbor contour\texttt{>>>}\;
 807           \texttt{<<<}adjust node's positions\texttt{>>>}\;
 808           transfer the number of moves achieved from GPU memory to CPU memory.
 809       }
 810   \texttt{<<<}Add nodes\texttt{>>>}\;
 811   transfert the number of nodes added from GPU memory to CPU memory.
 812   }
 813 \end{algorithm}
 814
 815 \section{\label{secSpeedups}Speedups}
 816 Results are given in Table \ref{tabresults}.
 817 CPU timings were measured on an Intel Xeon E5530-2.4GHz with 12Go RAM (LIFC cluster).
 818 GPU timings were obtained on a C2050 GPU with 3GB RAM (adonis-11.grenoble.grid5000.fr).\\
 819 Execution times reported are means on ten executions.
 820 %Measurements on CPU may vary more than on GPU. So CPU results given in \ref{tabresults} are near the fastest values we observed.
 821 The image of figure \ref{fig:labelinit} (scaled down for printing reasons) is based on a real noisy image (800 x 800), 16-bit gray level.
 822 Contrast has been enhanced for better viewing; its various sizes have been obtained by interpolation and addition of gaussian noise.
 823
 824 We separately give the timings of pre-computations as they are a very general purpose piece of code.
 825 Segmentations have been performed with strictly the same parameters (initial shape, threshold length).
 826 The neighborhood distance for the first iteration is 32 pixels. It has a slight influence on the
 827 time process, but it leads to similar speedups values of approximately 7 times faster than CPU.
 828
 829 Though it does not appear in Table \ref{tabresults}, we observed that during segmentation stage, higher speedups are obtained in the very first iterations, when segments are made of a lot of pixels, leading to a higher parallelism ratio.\\
 830 Several parameters prevent from achieving higher speedups:
 831 \begin{itemize}
 832  \item accesses in the lookup tables in global memory cannot be coalescent. It would imply that the pixel contributions of a segment are stored in consecutive spaces in $C_z$ and $C_{z^2}$.
 833    This is only the case for horizontal segments.
 834   \item the use of 64-bit words for computations in shared memory often leads to 2-way bank conflicts.
 835   \item the level of parallelism is not so high, ie. the total number of pixel is not large enough to achieve impressive speedups. For example, on C2050 GPU, a grid can
 836   run about 66 million of threads, but a contour in a 10000 x 10000 image would be less than 0.1 million pixel long.
 837 \end{itemize}
 838
 839
 840 \begin{table}
 841  \begin{center}
 842 \begin{tabular}{|l| r|r r r|}
 843 \hline
 844 && CPU & GPU & Speedup\\\cline{3-5}
 845 Image 15MP      & \bf total     & \bf0.51 s     &  \bf0.06 s & \bf x8.5 \\
 846                 & pre-comp.     & 0.13 s        &  0.02 s & x6.5\\
 847                 & segment.      & 0.46 s        &  0.04 s & x11.5\\\hline
 848 Image 100MP     & \bf total     & \bf 4.08 s    &  \bf 0.59 s & \bf x6.9\\
 849                 & pre-comp.     & 0.91 s        &  0.13 s & x6.9\\
 850                 & segment.      & 3.17 s        &  0.46 s & x6.9\\\hline
 851 Image 150Mp     & \bf total     & \bf 5.7 s     &  \bf 0.79 s & \bf x7.2\\
 852                 & pre-comp.     & 1.4 s         &  0.20 s & x7.0\\
 853                 & segment.      & 4.3 s         &  0.59 s & x7.3\\\hline
 854  \end{tabular}
 855  \end{center}
 856 \caption{GPU (C2050) vs CPU timings.}\label{tabresults}
 857 \end{table}
 858
 859 \IEEEpeerreviewmaketitle
 860
 861 \vspace{1cm}
 862
 863 \section{\label{secConclusion}Conclusion}
 864 The algorithm we have focused on is not easy to adapt for high speedups on GPGPU, though we managed to make it work quite faster than on CPU.
 865 The main drawback is clearly its relative low level of parallelism. Nevertheless, we proposed different kernels that allowed us to take advantage of the computation power of  GPUs.
 866  In future works, we plan  to try and manage to benefit from larger computing grids of thread blocks. Among the possible solutions, we plan to work on:
 867 \begin{itemize}
 868   \item slicing the image and processing the parts in parallel. This is made possible since sm20 GPU provide multi kernel capabilities.
 869   \item slicing the image and processing the parts on two different GPUs, hosted by the same CPU.
 870 \end{itemize}
 871 To extend the scope of this work beyond our present hypothesis (based on \emph{single} target segmentation), we are also going to investigate achieving speedups
 872 in \emph{multiple} target segmentation of large images. This might be useful in a wide range of applications.
 873
 874
 875 % trigger a \newpage just before the given reference
 876 % number - used to balance the cumns on the last page
 877 % adjust value as needed - may need to be readjusted if
 878 % the document is modified later
 879 %\IEEEtriggeratref{8}
 880 % The "triggered" command can be changed if desired:
 881 %\IEEEtriggercmd{\enlargethispage{-5in}}
 882
 883 % references section
 884
 885
 886 \bibliographystyle{IEEEtran}
 887
 888 \bibliography{IEEEabrv,biblio}
 889
 890
 891 % that's all folks
 892 \end{document}
 893
 894