%% bare_conf.tex
%% V1.3
%% 2007/01/11
%% by Michael Shell
%% See:
%% http://www.michaelshell.org/
%% for current contact information.
%%
%% This is a skeleton file demonstrating the use of IEEEtran.cls
%% (requires IEEEtran.cls version 1.7 or later) with an IEEE conference paper.
%%
%% Support sites:
%% http://www.michaelshell.org/tex/ieeetran/
%% http://www.ctan.org/tex-archive/macros/latex/contrib/IEEEtran/
%% and
%% http://www.ieee.org/

%%*************************************************************************
%% Legal Notice:
%% This code is offered as-is without any warranty either expressed or
%% implied; without even the implied warranty of MERCHANTABILITY or
%% FITNESS FOR A PARTICULAR PURPOSE! 
%% User assumes all risk.
%% In no event shall IEEE or any contributor to this code be liable for
%% any damages or losses, including, but not limited to, incidental,
%% consequential, or any other damages, resulting from the use or misuse
%% of any information contained here.
%%
%% All comments are the opinions of their respective authors and are not
%% necessarily endorsed by the IEEE.
%%
%% This work is distributed under the LaTeX Project Public License (LPPL)
%% ( http://www.latex-project.org/ ) version 1.3, and may be freely used,
%% distributed and modified. A copy of the LPPL, version 1.3, is included
%% in the base LaTeX documentation of all distributions of LaTeX released
%% 2003/12/01 or later.


%% Retain all contribution notices and credits.
%% ** Modified files should be clearly indicated as such, including  **
%% ** renaming them and changing author support contact information. **
%%
%% File list of work: IEEEtran.cls, IEEEtran_HOWTO.pdf, bare_adv.tex,
%%                    bare_conf.tex, bare_jrnl.tex, bare_jrnl_compsoc.tex
%%*************************************************************************

% *** Authors should verify (and, if needed, correct) their LaTeX system  ***
% *** with the testflow diagnostic prior to trusting their LaTeX platform ***
% *** with production work. IEEE's font choices can trigger bugs that do  ***
% *** not appear when using other class files.                            ***
% The testflow support page is at:
% http://www.michaelshell.org/tex/testflow/


% Note that the a4paper option is mainly intended so that authors in
% countries using A4 can easily print to A4 and see how their papers will
% look in print - the typesetting of the document will not typically be
% affected with changes in paper size (but the bottom and side margins will).
% Use the testflow package mentioned above to verify correct handling of
% both paper sizes by the user's LaTeX system.
%
% Also note that the "draftcls" or "draftclsnofoot", not "draft", option
% should be used if it is desired that the figures are to be displayed in
% draft mode.
%
\documentclass[10pt, conference, compsocconf]{IEEEtran}
% Add the compsocconf option for Computer Society conferences.
%
% If IEEEtran.cls has not been installed into the LaTeX system files,
% manually specify the path to it like:
% \documentclass[conference]{../sty/IEEEtran}


%  \usepackage[latin1]{inputenc}
%  \usepackage[cyr]{aeguill}
%  \usepackage[francais]{babel}


% Some very useful LaTeX packages include:
% (uncomment the ones you want to load)


% *** MISC UTILITY PACKAGES ***
%
%\usepackage{ifpdf}
% Heiko Oberdiek's ifpdf.sty is very useful if you need conditional
% compilation based on whether the output is pdf or dvi.
% usage:
% \ifpdf
%   % pdf code
% \else
%   % dvi code
% \fi
% The latest version of ifpdf.sty can be obtained from:
% http://www.ctan.org/tex-archive/macros/latex/contrib/oberdiek/
% Also, note that IEEEtran.cls V1.7 and later provides a builtin
% \ifCLASSINFOpdf conditional that works the same way.
% When switching from latex to pdflatex and vice-versa, the compiler may
% have to be run twice to clear warning/error messages.


% *** CITATION PACKAGES ***
%
\usepackage{cite}
% cite.sty was written by Donald Arseneau
% V1.6 and later of IEEEtran pre-defines the format of the cite.sty package
% \cite{} output to follow that of IEEE. Loading the cite package will
% result in citation numbers being automatically sorted and properly
% "compressed/ranged". e.g., [1], [9], [2], [7], [5], [6] without using
% cite.sty will become [1], [2], [5]--[7], [9] using cite.sty. cite.sty's
% \cite will automatically add leading space, if needed. Use cite.sty's
% noadjust option (cite.sty V3.8 and later) if you want to turn this off.
% cite.sty is already installed on most LaTeX systems. Be sure and use
% version 4.0 (2003-05-27) and later if using hyperref.sty. cite.sty does
% not currently provide for hyperlinked citations.
% The latest version can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/cite/
% The documentation is contained in the cite.sty file itself.


% *** GRAPHICS RELATED PACKAGES ***
%
\ifCLASSINFOpdf
  \usepackage[pdftex]{graphicx,color}
  % declare the path(s) where your graphic files are
  \graphicspath{{img/}}
  % and their extensions so you won't have to specify these with
  % every instance of \includegraphics
  \DeclareGraphicsExtensions{.pdf,.jpeg,.png}
\else
  % or other class option (dvipsone, dvipdf, if not using dvips). graphicx
  % will default to the driver specified in the system graphics.cfg if no
  % driver is specified.
  % \usepackage[dvips]{graphicx}
  % declare the path(s) where your graphic files are
  % \graphicspath{{../eps/}}
  % and their extensions so you won't have to specify these with
  % every instance of \includegraphics
  % \DeclareGraphicsExtensions{.eps}
\fi
% graphicx was written by David Carlisle and Sebastian Rahtz. It is
% required if you want graphics, photos, etc. graphicx.sty is already
% installed on most LaTeX systems. The latest version and documentation can
% be obtained at: 
% http://www.ctan.org/tex-archive/macros/latex/required/graphics/
% Another good source of documentation is "Using Imported Graphics in
% LaTeX2e" by Keith Reckdahl which can be found as epslatex.ps or
% epslatex.pdf at: http://www.ctan.org/tex-archive/info/
%
% latex, and pdflatex in dvi mode, support graphics in encapsulated
% postscript (.eps) format. pdflatex in pdf mode supports graphics
% in .pdf, .jpeg, .png and .mps (metapost) formats. Users should ensure
% that all non-photo figures use a vector format (.eps, .pdf, .mps) and
% not a bitmapped formats (.jpeg, .png). IEEE frowns on bitmapped formats
% which can result in "jaggedy"/blurry rendering of lines and letters as
% well as large increases in file sizes.
%
% You can find documentation about the pdfTeX application at:
% http://www.tug.org/applications/pdftex


% *** MATH PACKAGES ***
%
%\usepackage[cmex10]{amsmath}
% A popular package from the American Mathematical Society that provides
% many useful and powerful commands for dealing with mathematics. If using
% it, be sure to load this package with the cmex10 option to ensure that
% only type 1 fonts will utilized at all point sizes. Without this option,
% it is possible that some math symbols, particularly those within
% footnotes, will be rendered in bitmap form which will result in a
% document that can not be IEEE Xplore compliant!
%
% Also, note that the amsmath package sets \interdisplaylinepenalty to 10000
% thus preventing page breaks from occurring within multiline equations. Use:
%\interdisplaylinepenalty=2500
% after loading amsmath to restore such page breaks as IEEEtran.cls normally
% does. amsmath.sty is already installed on most LaTeX systems. The latest
% version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/required/amslatex/math/


% *** SPECIALIZED LIST PACKAGES ***
%
\usepackage[ruled,lined,linesnumbered]{algorithm2e}
%\usepackage{algorithmic}
% algorithmic.sty was written by Peter Williams and Rogerio Brito.
% This package provides an algorithmic environment fo describing algorithms.
% You can use the algorithmic environment in-text or within a figure
% environment to provide for a floating algorithm. Do NOT use the algorithm
% floating environment provided by algorithm.sty (by the same authors) or
% algorithm2e.sty (by Christophe Fiorio) as IEEE does not use dedicated
% algorithm float types and packages that provide these will not provide
% correct IEEE style captions. The latest version and documentation of
% algorithmic.sty can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/algorithms/
% There is also a support site at:
% http://algorithms.berlios.de/index.html
% Also of interest may be the (relatively newer and more customizable)
% algorithmicx.sty package by Szasz Janos:
% http://www.ctan.org/tex-archive/macros/latex/contrib/algorithmicx/


% *** ALIGNMENT PACKAGES ***
%
\usepackage{array}
% Frank Mittelbach's and David Carlisle's array.sty patches and improves
% the standard LaTeX2e array and tabular environments to provide better
% appearance and additional user controls. As the default LaTeX2e table
% generation code is lacking to the point of almost being broken with
% respect to the quality of the end results, all users are strongly
% advised to use an enhanced (at the very least that provided by array.sty)
% set of table tools. array.sty is already installed on most systems. The
% latest version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/required/tools/


\usepackage{mdwmath}
\usepackage{mdwtab}
% Also highly recommended is Mark Wooding's extremely powerful MDW tools,
% especially mdwmath.sty and mdwtab.sty which are used to format equations
% and tables, respectively. The MDWtools set is already installed on most
% LaTeX systems. The lastest version and documentation is available at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/mdwtools/


% IEEEtran contains the IEEEeqnarray family of commands that can be used to
% generate multiline equations as well as matrices, tables, etc., of high
% quality.


%\usepackage{eqparbox}
% Also of notable interest is Scott Pakin's eqparbox package for creating
% (automatically sized) equal width boxes - aka "natural width parboxes".
% Available at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/eqparbox/


% *** SUBFIGURE PACKAGES ***
%\usepackage[tight,footnotesize]{subfigure}
% subfigure.sty was written by Steven Douglas Cochran. This package makes it
% easy to put subfigures in your figures. e.g., "Figure 1a and 1b". For IEEE
% work, it is a good idea to load it with the tight package option to reduce
% the amount of white space around the subfigures. subfigure.sty is already
% installed on most LaTeX systems. The latest version and documentation can
% be obtained at:
% http://www.ctan.org/tex-archive/obsolete/macros/latex/contrib/subfigure/
% subfigure.sty has been superceeded by subfig.sty.


%\usepackage[caption=false]{caption}
%\usepackage[font=footnotesize]{subfig}
% subfig.sty, also written by Steven Douglas Cochran, is the modern
% replacement for subfigure.sty. However, subfig.sty requires and
% automatically loads Axel Sommerfeldt's caption.sty which will override
% IEEEtran.cls handling of captions and this will result in nonIEEE style
% figure/table captions. To prevent this problem, be sure and preload
% caption.sty with its "caption=false" package option. This is will preserve
% IEEEtran.cls handing of captions. Version 1.3 (2005/06/28) and later 
% (recommended due to many improvements over 1.2) of subfig.sty supports
% the caption=false option directly:
\usepackage[caption=false,font=footnotesize]{subfig}
%
% The latest version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/subfig/
% The latest version and documentation of caption.sty can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/caption/


% *** FLOAT PACKAGES ***
%
\usepackage{fixltx2e}
% fixltx2e, the successor to the earlier fix2col.sty, was written by
% Frank Mittelbach and David Carlisle. This package corrects a few problems
% in the LaTeX2e kernel, the most notable of which is that in current
% LaTeX2e releases, the ordering of single and double column floats is not
% guaranteed to be preserved. Thus, an unpatched LaTeX2e can allow a
% single column figure to be placed prior to an earlier double column
% figure. The latest version and documentation can be found at:
% http://www.ctan.org/tex-archive/macros/latex/base/


%\usepackage{stfloats}
% stfloats.sty was written by Sigitas Tolusis. This package gives LaTeX2e
% the ability to do double column floats at the bottom of the page as well
% as the top. (e.g., "\begin{figure*}[!b]" is not normally possible in
% LaTeX2e). It also provides a command:
%\fnbelowfloat
% to enable the placement of footnotes below bottom floats (the standard
% LaTeX2e kernel puts them above bottom floats). This is an invasive package
% which rewrites many portions of the LaTeX2e float routines. It may not work
% with other packages that modify the LaTeX2e float routines. The latest
% version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/sttools/
% Documentation is contained in the stfloats.sty comments as well as in the
% presfull.pdf file. Do not use the stfloats baselinefloat ability as IEEE
% does not allow \baselineskip to stretch. Authors submitting work to the
% IEEE should note that IEEE rarely uses double column equations and
% that authors should try to avoid such use. Do not be tempted to use the
% cuted.sty or midfloat.sty packages (also by Sigitas Tolusis) as IEEE does
% not format its papers in such ways.


% correct bad hyphenation here
% \hyphenation{op-tical net-works semi-conduc-tor}


\begin{document}
%
% paper title
% can use linebreaks \\ within to get better formatting as desired
\title{GPU implementation of a region based algorithm \\ for large images segmentation}


% author names and affiliations
% use a multiple column layout for up to two different
% affiliations

\author{
\IEEEauthorblockN{Gilles Perrot, St\'{e}phane Domas, Rapha\"{e}l Couturier}
\IEEEauthorblockA{Distributed Numerical Algorithmics team (AND), Laboratoire d'Informatique de Franche-comt\'{e}\\
Rue Engel Gros, 90000 Belfort, France\\
forename.name@univ-fcomte.fr}
}


% use for special paper notices
%\IEEEspecialpapernotice{(Invited Paper)}


% make the title area
\maketitle

\begin{abstract}
Image segmentation is one of the most challenging issues in image computing.
In this work, we focus on region-based active contour techniques (snakes) as they seem to achieve a high level of robustness and fit with a large range of
applications. Some algorithmic optimizations provide significant speedups, but even so, execution times are still non-neglectable  
with the continuing increase of image sizes. Moreover, these algorithms are not well suited for running on multi-core CPU's.
At the same time, recent developments of Graphical Processing Units (GPU) suggest that higher speedups could be obtained 
by use of their specific design. We have managed to adapt a specially efficient snake algorithm that fits recent Nvidia GPU architecture 
and takes advantage of its massive multithreaded execution capabilities. The speedup obtained is most often around 7.
\end{abstract}

\begin{IEEEkeywords}
 GPU; segmentation; snake;
\end{IEEEkeywords}

\section{Introduction}
Segmentation and shape detection are still key issues in image computing. These techniques are used in numerous fields ranging from medical imaging to video tracking, shape recognition or localization.
Since 1988, the active contours (snakes) introduced by and  Kass et al. \cite{KassWT88}, have proved to be efficient and robust, especially against noise, for a wide range of image types. 

The main shortcoming of these algorithms is often their high dependence on the initial shape, though several contributions have lowered this dependency and also brought 
more accurate segmentation of non convex shapes \cite{Ruch01} \cite{XuP98}. 

The information that drives a snake model comes either from the contour itself or from the characteristics of the regions it defines. 
For noisy images, the second option is often more suitable as it takes into account the statistical fluctuations of the pixels. 
One approach \cite{ChesnaudRB99,AllainBG08} proposes a geometric (polygonal) region-based snake driven by the minimization of the stochastic complexity. One significant
advantage is that it runs without any free parameter which can be helpful when dealing with image sequences or slices (3D).

An important issue of image processing, especially segmentation, has always been the computation time of most algorithms. Over the years, the increase of CPU computing capabilities, 
although quite impressive, has not been able to fulfill the combined needs of growing resolution and real-time computation.
Since having been introduced in the early 1980's, the capabilities and speed of graphics accelerators have always been increasing. So much so that the recent GPGPU 
(General Purpose Graphic Processing Units) currently benefit by a massively parallel architecture for general purpose programming, especially when dealing with large matrices 
or vectors. On the other hand, their specific design obviously imposes a number of limitations and constraints. 
Some implementations of parametric snakes have already been tested,  such as \cite{Brunett}. However, a similar solution (computation per small tile)
is not suited for the algorithm we have implemented.

Our goal, in collaboration with the PhyTI team\footnote{Physics and Image Processing Group, Fresnel Institute, Ecole Centrale de Marseille (France)}, was to propose a way to fit their algorithm 
to the Nvidia$^{\textcopyright}$ Tesla GPU architecture.
The remainder of this paper presents the principles of the algorithm and notations in section \ref{secCPUalgooutlines}. In section \ref{secCPUalgodetails}, the details of
the sequential CPU implementation are explained. Section \ref{GPUgeneralites} summarizes Nvidia's GPU 
important characteristics and how to deal with them efficiently. Then sections \ref{GPUimplementation} and \ref{secSpeedups} detail our GPU implementation and timing results. 
Finally, the conclusion of section \ref{secConclusion} evaluates the pros and cons of this implementation and then gives a few direction to be followed in future works.
 

\section{\label{secCPUalgooutlines}Sequential algorithm : outlines} 
The goal of the active contour segmentation (snake) method we studied \cite{Ruch01} is to distinguish, inside an image $I$, a target region $T$ from the background region
$B$. The size of $I$ is L x H pixels of coordinates $(i,j)$ and gray level $z(i,j)$.
We assume that the gray levels of $T$ and $B$ are independent random vectors, each with a distribution $p^{\Omega}$ of its components $(\Omega \in \{T ; B\})$. 
The present implementation uses a Gaussian distribution, but another one can easily be used as Gamma, Poisson,...(Cf. \cite{ChesnaudRB99})\dots 

The \textit{active contour} $S$, which represents the shape of $T$ is chosen as polygonal.
The purpose of the segmentation is then to determine the shape that optimizes a pseudo log-likelihood-based criterion (PLH).
This is done by a very simple iterative process which is initialized with an arbitrary shape, then at each step :
\begin{enumerate}
  \item it modifies the shape
  \item it estimates the parameters of the Gaussian functions for the two regions and evaluates the criterion.
  \item it validates the new shape if the criterion has a better value.
\end{enumerate}
A simplified description of it is given in \emph{Algorithm \ref{cpualgosimple}} which features two nested loops : the main one, on iteration level, is 
responsible for tuning the number of nodes ; the inner one, on step level, takes care of finding the best shape for a given number of nodes.
\emph{Figure \ref{images_algo}} shows intermediate results at iteration level. Sub-figure \emph{\ref{fig:labelinit}} shows the initial rectangular shape, \emph{\ref{fig:labelit1}}
 shows the best four-node shape that ends
the first iteration. Sub-figures \emph{\ref{fig:labelit2}} and \emph{\ref{fig:labelit4}} show the best shape for an eight-node snake (resp. 29-node) 
which occurs at the end of the second  iteration (resp. fourth). 

\begin{algorithm}[h]
\label{cpualgosimple}
\caption{Sequential algorithm : outlines}
\SetNlSty{textbf}{}{:}

  %compute\_cumulated\_images()\;
  begin with a rectangular 4 nodes snake\;
  \Repeat(\tcc*[f]{iteration level}){no more node can be added}{
      \Repeat(\tcc*[f]{step level}){no more node can be moved}{
	  Test some other positions for each node, near its current position\;
	  Find the best PLH and adjust the node's position\;
      }
  Add a node in the middle of each \emph{long enough} segment\;
  }
\end{algorithm}


\begin{figure}[h]
 \centering
\subfloat[Initial snake ]{\label{fig:labelinit} \includegraphics[width=0.4\linewidth]{./img/cochon_petit_init.jpg}}\qquad
\subfloat[End of first iteration (4 nodes) ]{\label{fig:labelit1} \includegraphics[width=0.4\linewidth]{./img/cochon_petit_it1.jpg}}\\
\subfloat[End of second iteration (8 nodes)]{\label{fig:labelit2} \includegraphics[width=0.4\linewidth]{./img/cochon_petit_it2.jpg}}\qquad
\subfloat[End of fourth iteration (29 nodes)]{\label{fig:labelit4} \includegraphics[width=0.4\linewidth]{./img/cochon_petit_it4.jpg}}
%\subfloat[width=0.4\linewidth]{./img/cochon_b_entier.jpg}
 % cochon_b_entier.jpg: 3960x2970 pixel, 72dpi, 139.70x104.78 cm, bb=0 0 3960 2970
 \caption{segmentation of a noisy image}
  \label{images_algo}
\end{figure}


\section{\label{secCPUalgodetails}Sequential algorithm : details}
\subsection{Criterion}
For $p^{\Omega}$ a Gaussian function, $\Theta_{\Omega}$ ($\Omega \in \{T ; B\}$) has two components, the average value $\mu$ and the deviation $\sigma$ which are estimated by 
$$
\widehat{\Theta_{\Omega}} \left(
\begin{array}{l}
\widehat{\mu} = \frac{1}{N_{\Omega}} \displaystyle\sum_{(i,j)\in \Omega} z(i,j) \\
\widehat{\sigma^2} = \frac{1}{N_{\Omega}} \displaystyle\sum_{(i,j)\in \Omega} z^2(i,j) - \mu^2 \\
  \end{array}
\right.
$$
The likelihood of a region is given by
$$ P[I|S_{n,l}, \Theta_T, \Theta_B] = P(\chi_T | \Theta_T)P(\chi_B | \Theta_B)$$ 
where
$$P(\chi_{\Omega} | \Theta_{\Omega}) = \prod_{(i,j)\in \Omega} p^{\Omega}[z(i,j)] ~~~~(\Omega \in \{T ; B\})$$ 
And then the log-likelihood by 
$$-N_{\Omega}\log\left(\sqrt{2\pi}\right) -N_{\Omega}.log\left(\sigma\right) - \frac{1}{2\sigma^2}\sum_{(i,j)\in \Omega} \left( z(i,j)-\mu \right)^2 $$
Considering the two regions, the criterion to be optimized is then :
$$C = \frac{1}{2}\left( N_B\log\left(\widehat{\sigma_B}^2\right) + N_T\log\left(\widehat{\sigma_T}^2\right)\right)$$ 

\subsection{CPU implementation}
Let $S_{n,l}$ be the snake state at step $l$ of iteration $n$, and $S_{n,l}^i$ the node $i$ of $S_{n,l}$ ($i \in [0;N_n]$).
Each segment of $S_{n,l}$ is considered as an oriented list of discrete points.
Chesnaud \& Refregier \cite{ChesnaudRB99} have shown how to replace the 2 dimensions sums needed to estimate $\Theta_{\Omega}$ by 1 dimension sums along $S_{n,l}$.
However, this approach involves weighting coefficients for every single point of $S_{n,l}$ which leads to compute a pair of transformed images, at the very 
beginning of the process. Such images are called cumulated images and will be used as lookup tables.
Therefore, beyond this point, we will talk about the \emph{contribution} of each point to the 1D sums. By extension, we also talk about the \emph{contribution} of each segment to the 1D sums.

A more detailed description of the sequential algorithm is given by \emph{Algorithm \ref{cpualgo}}. 
The process starts with the computation of cumulated images ; an initialization stage  takes place from line \ref{debinit} to line \ref{fininit}. 
Then we recognize the two nested loops (line \ref{loopnewnodes} and line \ref{loopmovenodes}) and finally the heart of the algorithm stands on line \ref{kernelPLH} which represents
the main part of the calculations to be done :
\begin{enumerate}
 \item compute the various sums without the contributions of both segments connected to current node  $S_{n,l}^i$.
  \item \label{CPUcontrib_segments} compute the contributions of both segments, which requires :
  \begin{itemize}
   \item \label{CPUbresenham} To determine the coordinates of every discrete pixel of both segments connected to $S_{n,l}^{i,w}$.
    \item \label{CPUcontrib_pixels} To compute every pixel contribution.
    \item To sum pixel contributions to obtain segment contributions.
  \end{itemize}
   \item compute the PLH given the contribution of each segment of the tested snake.
\end{enumerate}

\begin{algorithm}[h]
\SetNlSty{textbf}{}{:}
\caption{Sequential simplified algorithm} 
\label{cpualgo}
   read image from HDD\;
   compute\_cumulated\_images()\label{cumuls}\;
   iteration $n \leftarrow 0$\label{debinit}\;
   $N_0 \leftarrow 4$\;
   $S_{n,l} \leftarrow S_{0,0}$\;
   step $d \leftarrow d_{max} = 2^q$\;
   current node $S_{0,0}^i \leftarrow S_{0,0}^0$\;
   $l \leftarrow 0$\;
   compute $PLH_{ref}$, the PLH of $S_{n,0}$\label{fininit}\;
  \Repeat(\tcc*[f]{iteration level}){no new node added}{\label{loopnewnodes}
    \Repeat(\tcc*[f]{step level}){no node move occured}{\label{loopmovenodes}
      \For{$i=0$ to $N_n$}{
	$S_{n,l}^{i,w}$ ($w \in [0;7]$) are the neighbors of $S_{n,l}^i$ by distance $d$\;
	\For{$w=0$ to $7$}{
	  compute $PLH_w$ for $S_{n,l}$ when $S_{n,l}^{i,w}$ replaces $S_{n,l}^i$ \label{kernelPLH}\;
	  \lIf{$PLH_w$ is better than $PLH_{ref}$}{
	    $PLH_{ref} \leftarrow PLH_w$\;
	    move node $S_{n,l}^i \leftarrow S_{n,l}^{i,w}$\;
	  }
	}
      }
     $l \leftarrow l+1$\;
   }
    add new nodes, $N_n \leftarrow N_n + N_{newnodes}$\;
   \lIf{$d > 1$}{ $d \leftarrow d/2$ } \lElse{ $d=1$ }\;
    $n \leftarrow n+1$\;
    compute $PLH_{ref}$, the PLH of $S_{n,0}$ \;
  }
\end{algorithm}


The profiling results of the CPU implementation shown in \emph{Figure \ref{CPUprofile}} display the relative costs of the most time-consumming functions.
It appears that more than 80\% of the total execution time is always spent by only three functions~:
\begin{itemize}
 \item \texttt{compute\_segment\_contribution()} which is responsible for point \ref{CPUcontrib_segments} above,
  \item \texttt{compute\_cumulated\_images()} which computes the 3 lookup tables at the very beginning,
  \item \texttt{compute\_pixels\_coordinate()} which is called by \texttt{compute\_segment\_contribution()}.
\end{itemize}

\begin{figure}[h]
 \centering
 \includegraphics[width=0.9\linewidth, height=0.5\linewidth]{./img/data_profile_cpu.png}
 \caption{\label{CPUprofile}the three most-consumming functions for various image sizes}
\end{figure}

Measurements have been performed for several image sizes from 15~MPixels (about 3900 x 3900) 
to 144 MPixels (about 12000 x 12000). On the one hand, we can notice that function \texttt{compute\_segment\_contribution()} always lasts more than 45\% of the total running time, and even 
more when the image gets larger. 
On the other hand, the function \texttt{compute\_cumulated\_images()} costs more than 23\%, decreasing with image size, while function \texttt{compute\_pixels\_coordinate()} always takes around 6\%.
It confirms that the need for parallelization resides in line \ref{kernelPLH} and line \ref{cumuls} of Algorithm \ref{cpualgo} as they contain every call to those three functions.

The following sections detail how we managed to implement these time-consumming functions in parallel, but
a brief reminder on GPU's recent architecture is presented first. 
 

\section{\label{GPUgeneralites}NVidia's GPU architecture}
GPUs are multi-core, multi-threaded processors, optimized for highly parallel computation. Their design focuses on SIMT model by devoting
more transistors to data processing rather than data-caching and flow control \cite{CUDAPG}.
 
For example, Figure \ref{GPUC1060} shows a Tesla C1060  with its 4GB of global memory and 30 SM processors, each including  :
\begin{itemize}
 \item 8 Scalar Processors (SP)
  \item a Floating Point Unit (FPU)
  \item a parallel execution unit (SIMT) that runs threads by warps of 32. 
 \item 16KB of shared memory, organized in 16 banks of 32 bits words
\end{itemize}
Nvidia uses a parameter called the \emph{compute capability} of each GPU model. Its value is composed of a major number and a minor number ; for example the C1060 is a sm13 GPU (major=1 minor=3) 
and C2050 is a sm20 GPU.

\begin{figure*}[htbp]
 \centering
 \includegraphics[width=0.7\linewidth]{./img/GPU_block.png} 
 \caption{\label{GPUC1060}schematic diagram of GPU's internal architecture}
\end{figure*}

The recent Fermi cards (eg. C2050,) have improved performances by supplying more shared memory in a 32 banks array, a second execution 
unit and several managing 
capabilities on both the shared memory and level 1 cache memory ( \cite{CUDAPG}, \cite{CUDAFT}, \cite{CUDAFC}. 
However, writing efficient code for such architectures is not obvious, as re-serialization must be avoided as much as possible. Thus, when designing, one must 
keep a few key points in mind :
\begin{itemize}
 \item CUDA model organizes threads by a) threads blocks in which synchronization is possible, b) a grid of blocks with no possible synchronization
      between blocks.
  \item there is no way to know in what order the blocks are to be scheduled during one single kernel execution.
  \item data must be kept in GPU memory, to reduce the overhead due to copying between CPU and GPU memories.
  \item the total amount of threads running the same computation must be maximized.
  \item the number of execution branches inside a block should be reduced as much as possible.
  \item global memory accesses should be coalescent, \emph{ie}. memory accesses done by physically parallel threads (16 at a time) must be consecutive and contained in a 128 Bytes range.
  \item shared memory is organized by 16 x 32 bits wide banks. To avoid bank conflicts, each parallel thread (16 at a time) must access a different bank.
\end{itemize}

All the above charasteristics make it always a quite constrained problem to solve when designing a GPU code. 
%detailler
Moreover, a non suited code would probably run even slower on GPU than on CPU due to the automatic serialization which would be done at run time.
%%
\section{\label{GPUimplementation}GPU implementation}
In the implementation described below, pre-computations and proper segmentation are discussed separately.
To keep data in GPU memory, the whole computation is assigned to the GPU. CPU still hosts :
\begin{itemize}
 \item data reading from HDD
  \item data writing on HDD if needed
  \item main loops control (corresponding to lines \ref{loopnewnodes} and \ref{loopmovenodes} of Algorithm \ref{cpualgo})
\end{itemize}

It must be noticed that controlling these loops is achieved with only a very small amount of data being transferred between host (CPU) and device (GPU),
which does not produce high overhead. \\
Morever, the structures described below need 20 Bytes per pixel of the image to process (plus an offset of about 50~MByte). 
It defines the maximum image size we can accept : approximately 150 M Pixels.

\subsection{Pre-computations}
To replace 2D sums by 1D sums, Chesnaud \textit{et al.} \cite{ChesnaudRB99}  have shown that the three matrices below should be computed :
$$C_1(i,j) = \sum_{k=0}^{k=j} (1+k)$$
$$C_z(i,j) = \sum_{k=0}^{k=j} z(i,k)$$ and
$$C_{z^2}(i,j) = \sum_{k=0}^{k=j} z^2(i,k)$$
Where $z(i,k)$ is the gray level of pixel of coordinate $(i,j)$, so that $C_1$, $C_z$ and $C_{z^2}$ are the same size as image $I$.

\begin{figure*}[htbp]
  \centering
 \resizebox{0.8\linewidth}{0.3\linewidth}{\input{./img/GPUcumuls.pdf_t}}
  \caption{\label{GPUcumuls}\texttt{compute\_blocks\_prefixes()} details.}
\end{figure*}

\medskip 
First, we chose not to generate $C_1(i,j)$, which requires that values should be computed when needed, but saves global memory and does not lead to any overhead.
The computation of $C_{z}$ and $C_{z^2}$ easily decomposes into series of  \emph{inclusive prefixsums} \cite{Harris07}.
However, by keeping the \emph{1 thread per pixel} rule, as the total number of threads that can be run in a grid cannot exceed $2^{25}$ (Cf. \cite{CUDAPG}),
slicing is necessary for images exceeding a size threshold which can vary according to the GPU model (e.g. 33 MPix for sm13 GPU, eg. C1060).
It's quite easy to do, but it leads to a small overhead as the process requires multiple calls to one kernel.
Slicing can be done in two ways :
\begin{itemize}
 \item all slices are of the same size (balanced)
  \item slices fit the maximum size allowed by the GPU, leaving one smaller slice at the end of the process (full-sized).
\end{itemize}
The balanced slice option has proved to run faster.\\
For example : if a given image has 9000 lines and the GPU can process up to 4000 lines at a time, it's faster to run 3 times with 3000 lines rather than twice with 
4000 and once with 1000. 

As the sums in $C_z$ and $C_{z^2}$ are row-wide, it is easy to see that every block-wide sum will be needed before being able to use it in the global sum.
But as mentioned earlier, the scheduling of blocks must be considered as random.
So, in order to ensure synchronizations, each row of the original image is then treated by three different kernels : 
\begin{itemize}
 \item \texttt{compute\_blocks\_prefixes()}.
  \item \texttt{scan\_blocksums()}.
  \item \texttt{add\_sums2prefixes()}.   
\end{itemize}
Figures \ref{GPUcumuls}, \ref{GPUscansomblocs} and \ref{GPUaddsoms2cumuls} show relevant data structures for a given row $i$ of $I$. 
We assume that each thread block runs $bs$ threads in parallel and each row of $C_z$ needs $n$ blocks to cover its $L$ pixels.

Figure \ref{GPUcumuls} shows the details of the process for row $i$ of the original image $I$, already stored in GPU global memory.
Operands are first copied into GPU shared memory for efficiency reasons.
An inclusive prefixsum is then performed inside each independant thread block. 
At this point, only the first shared memory block contains the final values. Its last element contains the sum of all
elements in the corresponding block of $I$.
In order to obtain the right values for the row $i$ of $C_z$, every element value in the other blocks must then be summed with an offset value. 
This offset value is the sum of all element values in every corresponding previous block of row $i$.

As the scheduling of blocks is fully unpredictable, the necessary intermediate results have to be stored in GPU global memory before exiting from kernel.
Each element of the prefixsums in GPU shared memory has been stored in its corresponding position in $C_z$ (GPU global mem),
along with the vector of block sums which will be passed later to the next kernel \texttt{scan\_blocksums()}.

The kernel \texttt{scan\_blocksums()} (Figure \ref{GPUscansomblocs}) only makes an exclusive prefixsum on the vector of block sums described above.
The result is a vector containing, at index $x$, the value to be added to every element of block $x$ in each line of $C_z$.

This summing is done in shared memory by kernel \texttt{add\_sums2prefixes()} as described by Figure \ref{GPUaddsoms2cumuls}.

The values of $C_{z^2}$ are obtained together with those of $C_{z}$ and in exactly the same way.
For publishing reasons, figures do not show the $C_{z^2}$ part of structures. 


\begin{figure*}[htbp]
  \centering
 \resizebox{0.6\linewidth}{0.2\linewidth}{\input{./img/GPUscansomblocs.pdf_t}}
  \caption{\label{GPUscansomblocs}\texttt{scan\_blocksums()} details.}
\end{figure*}

\begin{figure*}[htbp]
  \centering
 \resizebox{0.7\linewidth}{0.4\linewidth}{\input{./img/GPUaddsoms2cumuls.pdf_t}}
  \caption{\label{GPUaddsoms2cumuls}\texttt{add\_sums2prefixes()} details.}
\end{figure*}

With this implementation, speedups are quite significant (Table \ref{tabresults}). Moreover, the larger the image, 
the higher the speedup is, as the step-complexity of the sequential algorithm is of $O(N^2)$ and  $O(N\log(N))$ for the parallel version.
Even higher speedups are achieved by adapting the code to specific-size images, especially when the number of columns is a power of 2. This avoids 
inactive threads in the grid, and thus improves efficiency.
However, on sm13 GPUs, these computations are made with a 2-way bank conflict as sums are based on 64-bit words, thus creating overhead.


\subsection{Segment contributions}
The choice made for this implementation has been to keep the \emph{1 thread per pixel} rule for the main kernels.
Of course, some reduction stages need to override this principle and will be pointed out.

As each of the $N_n$ nodes of the snake $S_{n,l}$ may move to one of the eight neighbor positions as shown in \emph{Figure \ref{GPUtopo}}, 
there is $16 N_n$ segments whose contribution has to be estimated.
The best combination is then chosen to  obtain $S_{n,l+1}$ (Figure \ref{GPUtopo}).
Segment contributions are computed in parallel by kernel \texttt{GPU\_compute\_segments\_contrib()}.

\begin{figure}[h]
  \centering
 \resizebox{0.9\linewidth}{0.81\linewidth}{\input{./img/topologie.pdf_t}}
  \caption{\label{GPUtopo}topology around nodes}
\end{figure}

The grid parameters for this kernel are determined according to the size of the longest segment $npix_{max}$.
If $bs_{max}$ is the maximum theoritical blocksize that a GPU can accept,
\begin{itemize}
 \item the block size $bs$ is taken as 
  \begin{itemize}
   \item $npix_{max}$'s next power of two if \\${npix_{max} \in [33 ; bs_{max} ] }$
    \item 32 if ${npix_{max} < 32 }$
    \item $bs_{max}$ if ${npix_{max} > 256 }$
  \end{itemize}
  \item the number of threads blocks assigned to each segment, $N_{TB} = \frac{npix_{max} + bs -1 }{bs}$ 
\end{itemize}
Our implementation makes intensive use of shared memory and does not allow the use of the maximum theoritical blocksizes 
(512 for sm13, 1024 for sm20, see \cite{CUDAFT} and \cite{CUDAPG}).
Instead we set $bs_{max}^{sm13} = 256$ and $bs_{max}^{sm20} = 512$. 
Anyway, testing has shown that most often, the best value is 256 for both \textit{sm13} and \textit{sm20} GPU's.

\begin{figure*}[htbp]
  \centering
 \resizebox{0.6\linewidth}{0.35\linewidth}{\input{./img/contribs_segments.pdf_t}}
  \caption{\label{contribs_segments}structure for segments contributions computation. Gray symbols help to locate inactive threads as opposed to black 
	  ones that figure active threads.}
\end{figure*}

Then \texttt{GPU\_compute\_segments\_contrib()} computes in parallel :
\begin{itemize}
 \item each pixel coordinates for all $16 N_n$ segments. Since the snake is only read in one direction, we have been able 
  to use a very simple parallel algorithm instead of Bresenham's.
It is based on the slope $k$ of each segment~: one pixel per row if $|k|>1$, one pixel per column otherwise.
  \item each pixel contribution by reading the corresponding values in the lookup tables.
  \item each thread-block sum of individual pixel contributions by running a \textit{reduction} stage for each block.
\end{itemize}
The top line of Figure \ref{contribs_segments} shows the base data structure in GPU shared memory which is relative to one segment.
We concatenate the single segment structure as much as necessary to create a large vector representing every pixel of every test segment.
As each segment has a different size (most often different from any power of two), there is a non-neglectable number of inactive threads scattered in the whole structure.
Two stages are processed separately : one for all even nodes and another one for odd nodes,
as shown in the two bottom lines of Figure \ref{contribs_segments}. 


The process is entirely done in shared memory ; only a small amount of data needs to be stored in global memory for each segment~:
\begin{itemize}
 \item the coordinates of its middle point, in order to be able to add nodes easily if needed.
  \item the coordinates of its first and last two points, to compute the slope at each end of the segment.
\end{itemize}
The five values above are part of the weighting coefficients determination for each segment and node.

 The \texttt{GPU\_sum\_contribs()} takes the blocks sums obtained by \texttt{GPU\_compute\_segments\_contrib()} and computes a second stage parallel summing to provide 
the $16 N_n$ segment contributions.

\subsection{Segments with a slope $k$ such as $|k|\leq1$}
Such a segment is treated with 1 thread per column and consequently, it often has more than one pixel per row as shown by Figure \ref{tripix}.
In an image row, consecutive pixels which belong to the target define an interval which can only have one low and one high ends 
That's why, on each row, we choose to consider only the contributions of the innermost pixels.
This selection is also done inside \texttt{GPU\_compute\_segments\_contrib()} when reading the lookup tables for each pixel contribution. 
We simply set a null contribution for pixels that need to be ignored.
\begin{figure}[h]
  \centering
 \resizebox{0.75\linewidth}{0.35\linewidth}{\input{./img/tripix.pdf_t}}
  \caption{\label{tripix}Zoom on part a of segment with $|k| < 1$,  at pixel level.}
\end{figure}


\subsection{Parameters estimation}
A \texttt{GPU\_compute\_PLH()} kernel computes in parallel :
\begin{itemize}
 \item every $8N_n$ vector of parameters values corresponding to each possible next state of the snake. Summing is done in shared memory but relevant 
  data for these operations are stored in global memory.
  \item every associated pseudo likelihood value.
  \item every node substitution when better PLH have been found and if it does not lead to segments crossing.
\end{itemize}

\subsection{End of segmentation}
Segmentation is considered achieved out when no other node can be added to the snake (Algorithm \ref{gpualgosimple}).
A very simple GPU kernel adds every possible node and returns the number it added.

\begin{algorithm}[h]
\label{gpualgosimple}
\caption{Parralel GPU algorithm : outlines. \texttt{<<<...>>>} indicates a GPU kernel parallel process.}
\SetNlSty{textbf}{}{:}
  load images\;
  transfer image from CPU to GPU\;
  \texttt{<<<}compute the 2 cumulated images\texttt{>>>}\;
  \texttt{<<<}initialize the snake\texttt{>>>}\;
  \Repeat(\tcc*[f]{iteration level}){no more node can be added}{
      \Repeat(\tcc*[f]{step level}){no more node can be moved}{
	  \texttt{<<<}find best neighbor snake\texttt{>>>}\;
	  \texttt{<<<}adjust node's positions\texttt{>>>}\;
	  transfer the number of moves achieved from GPU memory to CPU memory.
      }
  \texttt{<<<}Add nodes\texttt{>>>}\;
  transfert the number of nodes added from GPU memory to CPU memory.
  }
\end{algorithm}

\section{\label{secSpeedups}Speedups}
The CPU (SSE) implementation by N. Bertaux from the PhyTI team, based on \cite{AllainBG08} has been our reference to ensure segmentation's quality and to estimate speedups. 
Results are given in Table \ref{tabresults}. 
CPU timings were measured on an Intel Xeon E5530-2.4GHz with 12Go RAM (LIFC cluster).
GPU timings were obtained on a C2050 GPU with 3GB RAM (adonis-11.grenoble.grid5000.fr).\\
Execution times reported are means on ten executions.
%Measurements on CPU may vary more than on GPU. So CPU results given in \ref{tabresults} are near the fastest values we observed.
The image of figure \ref{fig:labelinit} (scaled down for printing reasons) is a 16-bit gray level photo from PhyTI team,
voluntarily noisy for testing reasons. The contrast has been enhanced for better viewing.

We separately give the timings of pre-computations as they are a very general purpose piece of code.
Segmentations have been performed with strictly the same parameters (initial shape, threshold length).
The neighborhood distance for the first iteration is 32 pixels. It has a slight influence on the 
time process, but it leads to similar speedups values of approximately 7 times faster than CPU.

Though it does not appear in Table \ref{tabresults}, we observed that during segmentation stage, higher speedups are obtained in the very first iterations, when segments are made of a lot of pixels, leading to a higher parallelism ratio.\\
Several parameters prevent from achieving higher speedups~:
\begin{itemize}
 \item accesses in the lookup tables in global memory cannot be coalescent. It would imply that the pixel contributions of a segment are stored in consecutive spaces in $C_z$ and $C_{z^2}$. 
   This is only the case for horizontal segments.
  \item the use of 64-bit words for computations in shared memory often leads to 2-way bank conflicts.
  \item the level of parallelism is not so high, ie. the total number of pixel is not large enough to achieve impressive speedups. For example, on C2050 GPU, a grid can 
  run about 66 million of threads, but a snake in a 10000 x 10000 image would be less than 0.1 million pixel long.
\end{itemize}


% \begin{table}
%  \begin{center}
% \begin{tabular}{|l| r|r r r|}
% \hline
% && CPU & GPU & Speedup\\\cline{3-5}
% Image 15MP 	& \bf total 	& \bf0.51 s 	&  \bf0.06 s & \bf x8.5 \\
% 		& pre-comp. 	& 0.21 s 	&  0.02 s & x10\\
% 		& segment. 	& 0.34 s 	&  0.04 s & x8.5\\\hline
% Image 100MP 	& \bf total 	& \bf 4.33 s 	&  \bf 0.59 s & \bf x7.3\\
% 		& pre-comp. 	& 1.49 s 	&  0.13 s & x11\\
% 		& segment. 	& 2.84 s 	&  0.46 s & x6.1\\\hline
% Image 150Mp	& \bf total 	& \bf 26.4 s 	&  \bf 0.79 s & \bf x33\\
% 		& pre-comp. 	& 8.4 s 	&  0.20 s & x42\\
% 		& segment. 	& 18.0 s 	&  0.59 s & x30\\\hline
%  \end{tabular}
%  \end{center}
% 
% \caption{\label{tabresults} GPU (C2050, sm20) vs CPU timings.}
% \end{table} 


\begin{table}
 \begin{center}
\begin{tabular}{|l| r|r r r|}
\hline
&& CPU & GPU & Speedup\\\cline{3-5}
Image 15MP 	& \bf total 	& \bf0.51 s 	&  \bf0.06 s & \bf x8.5 \\
		& pre-comp. 	& 0.13 s 	&  0.02 s & x6.5\\
		& segment. 	& 0.46 s 	&  0.04 s & x11.5\\\hline
Image 100MP 	& \bf total 	& \bf 4.08 s 	&  \bf 0.59 s & \bf x6.9\\
		& pre-comp. 	& 0.91 s 	&  0.13 s & x6.9\\
		& segment. 	& 3.17 s 	&  0.46 s & x6.9\\\hline
Image 150Mp	& \bf total 	& \bf 5.7 s 	&  \bf 0.79 s & \bf x7.2\\
		& pre-comp. 	& 1.4 s 	&  0.20 s & x7.0\\
		& segment. 	& 4.3 s 	&  0.59 s & x7.3\\\hline
 \end{tabular}
 \end{center}

\caption{\label{tabresults} GPU (C2050, sm20) vs CPU timings.}
\end{table} 

\IEEEpeerreviewmaketitle

\vspace{1cm}

\section{\label{secConclusion}Conclusion}
The algorithm we have focused on is not easy to adapt for high speedups on GPGPU, though we managed to make it work faster than on CPU.
The main drawback is clearly its relative low level of parallelism. Nevertheless, we proposed different kernels that allowed us to take advantage of the computation power of  GPUs.
 In future works, we plan  to try and manage to benefit from larger computing grids of thread blocks. Among the possible solutions, we plan to work on:
\begin{itemize}
  \item slicing the image and proceeding the parts in parallel. This is made possible since sm20 GPU provide multi kernel capabilities.
  \item slicing the image and proceeding the parts on two different GPUs, hosted by the same CPU. 
   \item translating the parallelism from pixel level (\emph{1 thread per pixel}) to snake level (\emph{1 thread per snake}), at least during the first iteration, which 
is often the longest lasting one. 
  \item designing an algorithm, in a GPU way of thinking, instead of adapting the existing CPU-designed algorithm to GPU constraints as we did.
\end{itemize}


%%RAPH
%%Est ce qu'on parle du fait qu'on va également réfléchir à repenser l'algo en gpu?


% trigger a \newpage just before the given reference
% number - used to balance the columns on the last page
% adjust value as needed - may need to be readjusted if
% the document is modified later
%\IEEEtriggeratref{8}
% The "triggered" command can be changed if desired:
%\IEEEtriggercmd{\enlargethispage{-5in}}

% references section


\bibliographystyle{IEEEtran}

\bibliography{IEEEabrv,biblio}


% that's all folks
\end{document}