paper_lniv_gpu/paper_lniv_gpu.tex~

   1 %% bare_conf.tex
   2 %% V1.3
   3 %% 2007/01/11
   4 %% by Michael Shell
   5 %% See:
   6 %% http://www.michaelshell.org/
   7 %% for current contact information.
   8 %%
   9 %% This is a skeleton file demonstrating the use of IEEEtran.cls
  10 %% (requires IEEEtran.cls version 1.7 or later) with an IEEE conference paper.
  11 %%
  12 %% Support sites:
  13 %% http://www.michaelshell.org/tex/ieeetran/
  14 %% http://www.ctan.org/tex-archive/macros/latex/contrib/IEEEtran/
  15 %% and
  16 %% http://www.ieee.org/
  17
  18 %%*************************************************************************
  19 %% Legal Notice:
  20 %% This code is offered as-is without any warranty either expressed or
  21 %% implied; without even the implied warranty of MERCHANTABILITY or
  22 %% FITNESS FOR A PARTICULAR PURPOSE!
  23 %% User assumes all risk.
  24 %% In no event shall IEEE or any contributor to this code be liable for
  25 %% any damages or losses, including, but not limited to, incidental,
  26 %% consequential, or any other damages, resulting from the use or misuse
  27 %% of any information contained here.
  28 %%
  29 %% All comments are the opinions of their respective authors and are not
  30 %% necessarily endorsed by the IEEE.
  31 %%
  32 %% This work is distributed under the LaTeX Project Public License (LPPL)
  33 %% ( http://www.latex-project.org/ ) version 1.3, and may be freely used,
  34 %% distributed and modified. A copy of the LPPL, version 1.3, is included
  35 %% in the base LaTeX documentation of all distributions of LaTeX released
  36 %% 2003/12/01 or later.
  37
  38
  39 %% Retain all contribution notices and credits.
  40 %% ** Modified files should be clearly indicated as such, including  **
  41 %% ** renaming them and changing author support contact information. **
  42 %%
  43 %% File list of work: IEEEtran.cls, IEEEtran_HOWTO.pdf, bare_adv.tex,
  44 %%                    bare_conf.tex, bare_jrnl.tex, bare_jrnl_compsoc.tex
  45 %%*************************************************************************
  46
  47 % *** Authors should verify (and, if needed, correct) their LaTeX system  ***
  48 % *** with the testflow diagnostic prior to trusting their LaTeX platform ***
  49 % *** with production work. IEEE's font choices can trigger bugs that do  ***
  50 % *** not appear when using other class files.                            ***
  51 % The testflow support page is at:
  52 % http://www.michaelshell.org/tex/testflow/
  53
  54
  55
  56 % Note that the a4paper option is mainly intended so that authors in
  57 % countries using A4 can easily print to A4 and see how their papers will
  58 % look in print - the typesetting of the document will not typically be
  59 % affected with changes in paper size (but the bottom and side margins will).
  60 % Use the testflow package mentioned above to verify correct handling of
  61 % both paper sizes by the user's LaTeX system.
  62 %
  63 % Also note that the "draftcls" or "draftclsnofoot", not "draft", option
  64 % should be used if it is desired that the figures are to be displayed in
  65 % draft mode.
  66 %
  67 \documentclass[10pt, journal]{IEEEtran}
  68 %\documentclass[12pt, journal, drafcls, onecolumn]{IEEEtran}
  69 % Add the compsocconf option for Computer Society conferences.
  70 %
  71 % If IEEEtran.cls has not been installed into the LaTeX system files,
  72 % manually specify the path to it like:
  73 % \documentclass[conference]{../sty/IEEEtran}
  74
  75
  76 %  \usepackage[latin1]{inputenc}
  77 %  \usepackage[cyr]{aeguill}
  78 %  \usepackage[francais]{babel}
  79
  80
  81 % Some very useful LaTeX packages include:
  82 % (uncomment the ones you want to load)
  83
  84
  85 % *** MISC UTILITY PACKAGES ***
  86 %
  87 %\usepackage{ifpdf}
  88 % Heiko Oberdiek's ifpdf.sty is very useful if you need conditional
  89 % compilation based on whether the output is pdf or dvi.
  90 % usage:
  91 % \ifpdf
  92 %   % pdf code
  93 % \else
  94 %   % dvi code
  95 % \fi
  96 % The latest version of ifpdf.sty can be obtained from:
  97 % http://www.ctan.org/tex-archive/macros/latex/contrib/oberdiek/
  98 % Also, note that IEEEtran.cls V1.7 and later provides a builtin
  99 % \ifCLASSINFOpdf conditional that works the same way.
 100 % When switching from latex to pdflatex and vice-versa, the compiler may
 101 % have to be run twice to clear warning/error messages.
 102
 103
 104
 105
 106
 107
 108 % *** CITATION PACKAGES ***
 109 %
 110 \usepackage{cite}
 111 % cite.sty was written by Donald Arseneau
 112 % V1.6 and later of IEEEtran pre-defines the format of the cite.sty package
 113 % \cite{} output to follow that of IEEE. Loading the cite package will
 114 % result in citation numbers being automatically sorted and properly
 115 % "compressed/ranged". e.g., [1], [9], [2], [7], [5], [6] without using
 116 % cite.sty will become [1], [2], [5]--[7], [9] using cite.sty. cite.sty's
 117 % \cite will automatically add leading space, if needed. Use cite.sty's
 118 % noadjust option (cite.sty V3.8 and later) if you want to turn this off.
 119 % cite.sty is already installed on most LaTeX systems. Be sure and use
 120 % version 4.0 (2003-05-27) and later if using hyperref.sty. cite.sty does
 121 % not currently provide for hyperlinked citations.
 122 % The latest version can be obtained at:
 123 % http://www.ctan.org/tex-archive/macros/latex/contrib/cite/
 124 % The documentation is contained in the cite.sty file itself.
 125
 126
 127
 128
 129
 130
 131 % *** GRAPHICS RELATED PACKAGES ***
 132 %
 133 \usepackage{transparent}
 134 \ifCLASSINFOpdf
 135   \usepackage[pdftex]{graphicx,color}
 136   % declare the path(s) where your graphic files are
 137   \graphicspath{{img/}}
 138   % and their extensions so you won't have to specify these with
 139   % every instance of \includegraphics
 140   \DeclareGraphicsExtensions{.pdf,.jpeg,.png}
 141 \else
 142   % or other class option (dvipsone, dvipdf, if not using dvips). graphicx
 143   % will default to the driver specified in the system graphics.cfg if no
 144   % driver is specified.
 145    \usepackage[dvips]{graphicx}
 146   % declare the path(s) where your graphic files are
 147    \graphicspath{{imgfs/}}
 148   % and their extensions so you won't have to specify these with
 149   % every instance of \includegraphics
 150    \DeclareGraphicsExtensions{.png,.jpg}
 151 \fi
 152 % graphicx was written by David Carlisle and Sebastian Rahtz. It is
 153 % required if you want graphics, photos, etc. graphicx.sty is already
 154 % installed on most LaTeX systems. The latest version and documentation can
 155 % be obtained at:
 156 % http://www.ctan.org/tex-archive/macros/latex/required/graphics/
 157 % Another good source of documentation is "Using Imported Graphics in
 158 % LaTeX2e" by Keith Reckdahl which can be found as epslatex.ps or
 159 % epslatex.pdf at: http://www.ctan.org/tex-archive/info/
 160 %
 161 % latex, and pdflatex in dvi mode, support graphics in encapsulated
 162 % postscript (.eps) format. pdflatex in pdf mode supports graphics
 163 % in .pdf, .jpeg, .png and .mps (metapost) formats. Users should ensure
 164 % that all non-photo figures use a vector format (.eps, .pdf, .mps) and
 165 % not a bitmapped formats (.jpeg, .png). IEEE frowns on bitmapped formats
 166 % which can result in "jaggedy"/blurry rendering of lines and letters as
 167 % well as large increases in file sizes.
 168 %
 169 % You can find documentation about the pdfTeX application at:
 170 % http://www.tug.org/applications/pdftex
 171
 172
 173
 174
 175
 176 % *** MATH PACKAGES ***
 177 %
 178 \usepackage[cmex10]{amsmath}
 179 % A popular package from the American Mathematical Society that provides
 180 % many useful and powerful commands for dealing with mathematics. If using
 181 % it, be sure to load this package with the cmex10 option to ensure that
 182 % only type 1 fonts will utilized at all point sizes. Without this option,
 183 % it is possible that some math symbols, particularly those within
 184 % footnotes, will be rendered in bitmap form which will result in a
 185 % document that can not be IEEE Xplore compliant!
 186 %
 187 % Also, note that the amsmath package sets \interdisplaylinepenalty to 10000
 188 % thus preventing page breaks from occurring within multiline equations. Use:
 189 %\interdisplaylinepenalty=2500
 190 % after loading amsmath to restore such page breaks as IEEEtran.cls normally
 191 % does. amsmath.sty is already installed on most LaTeX systems. The latest
 192 % version and documentation can be obtained at:
 193 % http://www.ctan.org/tex-archive/macros/latex/required/amslatex/math/
 194
 195
 196
 197
 198
 199 % *** SPECIALIZED LIST PACKAGES ***
 200 %
 201 \usepackage[ruled,lined,linesnumbered]{algorithm2e}
 202 % pour les refs dans les environnements
 203 %\usepackage{algorithmic}
 204 % algorithmic.sty was written by Peter Williams and Rogerio Brito.
 205 % This package provides an algorithmic environment fo describing algorithms.
 206 % You can use the algorithmic environment in-text or within a figure
 207 % environment to provide for a floating algorithm. Do NOT use the algorithm
 208 % floating environment provided by algorithm.sty (by the same authors) or
 209 % algorithm2e.sty (by Christophe Fiorio) as IEEE does not use dedicated
 210 % algorithm float types and packages that provide these will not provide
 211 % correct IEEE style captions. The latest version and documentation of
 212 % algorithmic.sty can be obtained at:
 213 % http://www.ctan.org/tex-archive/macros/latex/contrib/algorithms/
 214 % There is also a support site at:
 215 % http://algorithms.berlios.de/index.html
 216 % Also of interest may be the (relatively newer and more customizable)
 217 % algorithmicx.sty package by Szasz Janos:
 218 % http://www.ctan.org/tex-archive/macros/latex/contrib/algorithmicx/
 219
 220
 221
 222
 223 % *** ALIGNMENT PACKAGES ***
 224 %
 225 \usepackage{array}
 226 % Frank Mittelbach's and David Carlisle's array.sty patches and improves
 227 % the standard LaTeX2e array and tabular environments to provide better
 228 % appearance and additional user controls. As the default LaTeX2e table
 229 % generation code is lacking to the point of almost being broken with
 230 % respect to the quality of the end results, all users are strongly
 231 % advised to use an enhanced (at the very least that provided by array.sty)
 232 % set of table tools. array.sty is already installed on most systems. The
 233 % latest version and documentation can be obtained at:
 234 % http://www.ctan.org/tex-archive/macros/latex/required/tools/
 235
 236
 237 \usepackage{mdwmath}
 238 \usepackage{mdwtab}
 239 % Also highly recommended is Mark Wooding's extremely powerful MDW tools,
 240 % especially mdwmath.sty and mdwtab.sty which are used to format equations
 241 % and tables, respectively. The MDWtools set is already installed on most
 242 % LaTeX systems. The lastest version and documentation is available at:
 243 % http://www.ctan.org/tex-archive/macros/latex/contrib/mdwtools/
 244
 245
 246 % IEEEtran contains the IEEEeqnarray family of commands that can be used to
 247 % generate multiline equations as well as matrices, tables, etc., of high
 248 % quality.
 249
 250
 251 %\usepackage{eqparbox}
 252 % Also of notable interest is Scott Pakin's eqparbox package for creating
 253 % (automatically sized) equal width boxes - aka "natural width parboxes".
 254 % Available at:
 255 % http://www.ctan.org/tex-archive/macros/latex/contrib/eqparbox/
 256
 257
 258
 259
 260
 261 % *** SUBFIGURE PACKAGES ***
 262 %\usepackage[tight,footnotesize]{subfigure}
 263 % subfigure.sty was written by Steven Douglas Cochran. This package makes it
 264 % easy to put subfigures in your figures. e.g., "Figure 1a and 1b". For IEEE
 265 % work, it is a good idea to load it with the tight package option to reduce
 266 % the amount of white space around the subfigures. subfigure.sty is already
 267 % installed on most LaTeX systems. The latest version and documentation can
 268 % be obtained at:
 269 % http://www.ctan.org/tex-archive/obsolete/macros/latex/contrib/subfigure/
 270 % subfigure.sty has been superceeded by subfig.sty.
 271
 272
 273
 274 %\usepackage[caption=false]{caption}
 275 %\usepackage[font=footnotesize]{subfig}
 276 % subfig.sty, also written by Steven Douglas Cochran, is the modern
 277 % replacement for subfigure.sty. However, subfig.sty requires and
 278 % automatically loads Axel Sommerfeldt's caption.sty which will override
 279 % IEEEtran.cls handling of captions and this will result in nonIEEE style
 280 % figure/table captions. To prevent this problem, be sure and preload
 281 % caption.sty with its "caption=false" package option. This is will preserve
 282 % IEEEtran.cls handing of captions. Version 1.3 (2005/06/28) and later
 283 % (recommended due to many improvements over 1.2) of subfig.sty supports
 284 % the caption=false option directly:
 285 \usepackage[caption=false,font=footnotesize]{subfig}
 286 %
 287 % The latest version and documentation can be obtained at:
 288 % http://www.ctan.org/tex-archive/macros/latex/contrib/subfig/
 289 % The latest version and documentation of caption.sty can be obtained at:
 290 % http://www.ctan.org/tex-archive/macros/latex/contrib/caption/
 291
 292
 293
 294
 295 % *** FLOAT PACKAGES ***
 296 %
 297 \usepackage{fixltx2e}
 298 % fixltx2e, the successor to the earlier fix2col.sty, was written by
 299 % Frank Mittelbach and David Carlisle. This package corrects a few problems
 300 % in the LaTeX2e kernel, the most notable of which is that in current
 301 % LaTeX2e releases, the ordering of single and double column floats is not
 302 % guaranteed to be preserved. Thus, an unpatched LaTeX2e can allow a
 303 % single column figure to be placed prior to an earlier double column
 304 % figure. The latest version and documentation can be found at:
 305 % http://www.ctan.org/tex-archive/macros/latex/base/
 306
 307
 308
 309 %\usepackage{stfloats}
 310 % stfloats.sty was written by Sigitas Tolusis. This package gives LaTeX2e
 311 % the ability to do double column floats at the bottom of the page as well
 312 % as the top. (e.g., "\begin{figure*}[!b]" is not normally possible in
 313 % LaTeX2e). It also provides a command:
 314 %\fnbelowfloat
 315 % to enable the placement of footnotes below bottom floats (the standard
 316 % LaTeX2e kernel puts them above bottom floats). This is an invasive package
 317 % which rewrites many portions of the LaTeX2e float routines. It may not work
 318 % with other packages that modify the LaTeX2e float routines. The latest
 319 % version and documentation can be obtained at:
 320 % http://www.ctan.org/tex-archive/macros/latex/contrib/sttools/
 321 % Documentation is contained in the stfloats.sty comments as well as in the
 322 % presfull.pdf file. Do not use the stfloats baselinefloat ability as IEEE
 323 % does not allow \baselineskip to stretch. Authors submitting work to the
 324 % IEEE should note that IEEE rarely uses double column equations and
 325 % that authors should try to avoid such use. Do not be tempted to use the
 326 % cuted.sty or midfloat.sty packages (also by Sigitas Tolusis) as IEEE does
 327 % not format its papers in such ways.
 328
 329
 330
 331 % correct bad hyphenation here
 332 % \hyphenation{op-tical net-works semi-conduc-tor}
 333
 334
 335 \begin{document}
 336 %
 337 % paper title
 338 % can use linebreaks \\ within to get better formatting as desired
 339 \title{Fast GPU-based denoising filter using isoline levels}
 340
 341
 342 % author names and affiliations
 343 % use a multiple column layout for up to two different
 344 % affiliations
 345
 346 \author{
 347 \IEEEauthorblockN{Gilles Perrot$^1$, St\'{e}phane Domas$^1$, Rapha\"{e}l Couturier$^1$, Nicolas Bertaux$^2$}
 348
 349 \IEEEauthorblockA{$^1$FEMTO-ST institute\\
 350 Rue Engel Gros, 90000 Belfort, France.\\
 351 forename.name@univ-fcomte.fr}
 352
 353 \IEEEauthorblockA{$^2$ Institut Fresnel, CNRS, Aix-Marseille Universit\'e, Ecole Centrale Marseille,\\
 354 Campus de Saint-J\'er\^ome, 13013 Marseille, France.\\
 355 nicolas.bertaux@ec-marseille.fr}
 356 }
 357
 358
 359
 360 % use for special paper notices
 361 %\IEEEspecialpapernotice{(Invited Paper)}
 362
 363
 364 % make the title area
 365 \maketitle
 366
 367 \begin{abstract}
 368 In this study, we propose to address the issue of image denoising by means of a GPU-based filter, able to achieve high-speed processing by taking advantage of the parallel computation capabilities of modern GPUs. Our approach is based on the level sets theory first introduced by \cite{matheron75} in 1975 but little implemented because of its high computation costs. What we actually do is try to guess the best isoline shapes inside the noisy image. At first, our method involved the polyline modelling of isolines; then we found an optimization heuristics which very closely fits the capabilities of GPUs. So far, though our proposed hybrid PI-PD filter has not achieved the best denoising levels, it is nonetheless able to process a 512x512 image in about 11~ms.
 369 \end{abstract}
 370
 371 \begin{IEEEkeywords}
 372  GPU; denoising; filter; isoline;level line;
 373 \end{IEEEkeywords}
 374
 375 \section{Introduction}
 376 Denoising has been a much studied research issue since electronic transmission was first used. The wide range of applications that involve denoising makes it uneasy to propose a universal filtering method. Among them, digital image processing is a major field of interest as the number of digital devices able to take pictures or make movies is growing fast and shooting is rarely done in optimal conditions. Moreover, the increase in pixel density of the CCD or CMOS sensors used to measure light intensity leads to higher noise effects and imposes high output flow rates to the various processing algorithms.
 377
 378 In addition, it is difficult to quantify the quality of an image processing algorithm, as visual perception is subject to high variation from one human to another.
 379 So far, the advent of GPUs has brought high speedups to a lot of algorithms, and many researchers and developpers have successfully adressed the issue of implementing existing algorithms on such devices. For example in \cite{mcguire2008median},\cite{chen09} and \cite{sanchezICASSP12}, authors managed to design quite fast median filters. Bilateral filtering has also been successfully proposed in \cite{YangTA09}.
 380 Still, most high quality algorithms, like NL-means \cite{ipol.2011.bcm_nlm} or BM3D \cite{Dabov09bm3dimage} make use of non-local similarities and/or frequency domain transforms. However, speedups achieved by their current GPU implementations, though quite sigificant (as shown for example with NL-means in \cite{PALHANOXAVIERDEFONTES}), do not come near those achieved by local methods such as gaussian, median or neighborhood filters, as they have not originally been designed against GPU architecture.
 381 In order to fully benefit from the capabilities of GPUs, it is important that the approach to designing algorithms be more hardware-oriented, keeping in mind, from the very beginning, the intrinsic constraints of the device which is actually going to run those algorithms. Consequently, this often results in unusual options and even apparently sub-optimal solutions, but the considerable speed benefits obtained would possibly make it at least a good compromise or even the only current way to real-time high-definition image processing.
 382
 383
 384
 385 \section{\label{contrib}Contribution}
 386 As early as 1975 \cite{matheron75}, it was found that, under the conditions mentioned in section \ref{isolines}, an image can be decomposed into a set of level lines. Accordingly, real-life images fulfill the above conditions and since then, with the increase of computing capabilities, researchers have succeded in implementing such level-lines based algorithms as in \cite{caselles97} and \cite{springerlink:10.1007/3-540-48236-9_16}.
 387 A few years ago, in \cite{bertaux:04}, authors  proposed an original method which significantly reduces speckle noise inside coherent images, using the level lines in the image to constrain the minimization process.
 388 Those level lines are actually \textit{iso-gray-level} lines, which are called \textit{isolines}. In \cite{bertaux:04}, isolines consist in neighborhoods of polyline shapes determined by maximum likelihood optimization. This method proved not only to bring good enhancement but also to preserve edges between regions. Nevertheless, the costs in computation time, though not prohibitive, did not allow real-time image processing; as an example, the authors of \cite{bertaux:04} managed to process an almost 2Mpixel image within a minute on an old PIII-1GHz.
 389
 390 Our work started by designing a set of GPU implementations with various optimization heuristics, in order to find out which tracks could be followed  towards minimizing loss in quality and preserve admissible execution times. Those algorithms have been tested with reference images taken from \cite{denoiselab} for which various processing results have been published. Some of the more interesting ones are listed and compared in  \cite{denoisereview}.
 391 Statistical observations (to be detailed below) made on the output images produced by the method proposed in \cite{bertaux:04}, led us to propose a very fast and simple parallel denoising method which gives good results in terms of average gray-level error, but also avoids the blurring of edges.
 392
 393 On the basis of the BM3D timings listed in \cite{Dabov09bm3dimage} and with our own measurements, our proposed GPU-based filter runs around 350 times faster and thus is able to process high definition images at over 16fps. It also achieves good denoising quality.
 394
 395 \section{Plan}
 396 In the following, section \ref{GPUgeneralites} briefly focuses on recent Nvidia GPU characteristics. Section \ref{isolines} will introduce the theory and notations used to define isolines. Then, in section \ref{lniv0}, we will describe the two isoline based models that led to the final hybrid model, while section \ref{lniv} details the parallel implementation of the proposed algorithm. Finally, we present our results in section \ref{results} before drawing our conclusions and outlining our future works in section \ref{conclusion}.
 397
 398
 399 \section{\label{GPUgeneralites}NVidia's GPU architecture}
 400 GPUs are multi-core, multi-threaded processors, optimized for highly parallel computation. Their design focuses on a SIMT model (Single Instruction Multiple Threads) that devotes more transistors to data processing rather than data-caching and flow control (see \cite{CUDAPG} for more details).
 401 For example, a C2070 card features 6~GBytes global memory and a total of 448 cores bundled in several Streaming Multiprocessors (SM). An amount of shared memory, much faster than global memory, is avalaible on each SM (up to 48~KB for a C20xx card)
 402
 403 Writing efficient code for such architectures is not obvious, as re-serialization must be avoided as much as possible. Thus, code design requires one pays attention to a number of points, among which:
 404 \begin{itemize}
 405  \item the CUDA model organizes threads by a) thread blocks in which synchronization is possible, b) a grid of blocks with no possible synchronization between them.
 406   \item there is no way to know how blocks are scheduled during one single kernel execution.
 407   \item data must be kept in GPU memory, to reduce the overhead generated by copying between CPU and GPU.
 408   \item the total amount of threads running the same computation must be as large as possible.
 409   \item the number of execution branches inside one block should be as small as possible.
 410   \item global memory accesses should be coalescent, \emph{i.e.} memory accesses done by physically parallel threads (2 x 16 at a time) must be consecutive and contained in a 128 Bytes range.
 411   \item shared memory is organized in 32x32 bit-wide banks. To avoid bank conflicts, each parallel thread (2 x 16 at a time) must access a different bank.
 412 \end{itemize}
 413
 414 All the above characteristics always make designing efficient GPU code all the more constraining as non-suited code would probably run even slower on GPU than on CPU.
 415
 416 \section{\label{isolines}Isolines}
 417 In the following, let $I$ be the reference noiseless image (assuming we have one), $I'$ the noisy acquired image corrupted by Independent and Identically Distributed (IID) additive white gaussian noise of zero mean value and standard deviation $\sigma$. Let $\widehat{I}$ be the denoised image. Each pixel of $I'$ of coordinates $(i,j)$ has its own gray level $z(i,j)$.
 418
 419 As introduced above and since most common images are continuous and contain few edges, they can be decomposed into a set of constant gray level lines called \textit{isolines}. Then our goal is to find, for each single pixel of a noisy image, the isoline it belongs to.
 420 The generalized likelihood criterion (GL) is used to select the best isoline among all the considered ones, all of which must have the same number of pixels in order to be compared.
 421
 422 \subsection{Fixed-length isolines}
 423 For each pixel $(i,j)$ of the corrupted image, we look for the gray level of the isoline it belongs to, inside a rectangular window $\omega$ centered on $(i,j)$. Inside $\omega$, let $S^n$ be the isoline part which the center pixel belongs to. $S^n$ is a set of $n$ pixel positions $(i_q,j_q)$ ($q \in [0;n[$).\\
 424 The gray levels $z$ along $S^n$ follow a gaussian probability density function whose parameters $\mu_{S^n}$ (mean value of isoline part) and $\sigma$ (standard deviation brought by gaussian noise ) are unknown.\\
 425 Let $\overline{S^n}$ be defined by $\omega = S^n \cup \overline{S^n}$.\\
 426 For each pixel, the mean values $\mu_{ij}$ of gray levels $z$ over $\overline{S^n}$ are unknown and supposed independant .\\
 427 Let $Z$ be the gray levels of pixels in $\omega$ and $\left\{\mu_{ij}\right\}_{\overline{S^n}}$ the mean values of pixels in $\overline{S^n}$.
 428 The likelihood is given by:
 429 $$
 430 \displaystyle P \left[Z | S^n, \mu_{S^n}, \left\{\mu_{ij}\right\}_{\overline{S^n}}, \sigma \right]
 431 $$
 432 When separating contributions from regions $S^n$ and $\overline{S^n}$, it becomes:
 433 \begin{eqnarray}
 434 \displaystyle \prod_{(i,j)\in S^n}{P\left[z(i,j) | \mu_{S^n}, \sigma \right]} . \displaystyle\prod_{(i,j)\in \overline{S^n}}{P\left[z(i,j) | \left\{\mu_{ij}\right\}_{\overline{S^n}}, \sigma \right]}
 435 \label{LL2}
 436 \end{eqnarray}
 437 The goal is then to estimate the value of the above expression, in order to find the boundaries of $S^n$ that maximize expression \eqref{LL2}.\\
 438 Let us consider that, on $\overline{S^n}$, the values $z(i,j)$ are the likelihood estimations $\widehat{\mu_{ij}}$ for $\mu_{ij}$. The second term of expression \eqref{LL2} becomes:
 439 \begin{eqnarray}
 440 \displaystyle\prod_{(i,j)\in \overline{S^n}}{P\left[z(i,j) | \left\{\widehat{\mu_{ij}}\right\}_{\overline{S^n}}, \sigma \right]=1}
 441 \end{eqnarray}
 442 which leads to the generalized likelihood expression:
 443 \begin{eqnarray}
 444 \displaystyle \prod_{(i,j)\in S^n}{P\left[z(i,j) | \mu_{S^n}, \sigma \right]}
 445 \label{GL}
 446 \end{eqnarray}
 447 As we know the probability density function on $S^n$, \eqref{GL} can then be developped as
 448 \begin{eqnarray}
 449 \displaystyle \prod_{(i,j)\in S^n}\frac{1}{\sqrt{2\pi\sigma^2}}e^{-\frac{\left(z(i,j)-\mu_{S^n}\right)^2}{2\sigma^2}}
 450 \label{GL2}
 451 \end{eqnarray}
 452
 453 The log-likelihood is then given by:
 454 \begin{eqnarray}
 455 \displaystyle -\frac{n}{2}log\left(2\pi\right) - \frac{n}{2}log\left(\sigma^2\right) - \frac{n}{2}
 456 \label{LL1}
 457 \end{eqnarray}
 458 inside which the vector of parameters $(\mu_{S^n}, \sigma)$ is determined by maximum likelihood estimation
 459 $$
 460 \left(
 461 \begin{array}{l}
 462 \widehat{\mu_{S^n}} = \displaystyle\frac{1}{n} \sum_{(i,j)\in S^n} z(i,j) \\
 463 \widehat{\sigma^2} = \displaystyle\frac{1}{n} \sum_{(i,j)\in S^n} \left(z(i,j) - \widehat{\mu_{S^n}}\right)^2 \\
 464 \end{array}
 465 \right.
 466 $$
 467 The selection of the best isoline is done by searching which one maximizes the expression of equation \eqref{LL1}.
 468
 469 \begin{figure}[h]
 470 \includegraphics{isolines_1.ps}
 471 \caption{Determination and lengthening of an isoline: The gray level $z$ of each pixel is seen as an elevation value. $S^n$ is the $n$ pixel length isoline for pixel of coordinates $(i, j)$. The elongation of $S^n$ by $S^p$ ($p$ pixel length) is submitted to the GLRT condition (see eq. \eqref{GLRT}).}
 472 \label{si3}
 473 \end{figure}
 474
 475
 476 \subsection{Lengthenable  isolines}
 477 Searching for larger isolines should lead to better filtering as a larger number of pixels would be involved.
 478 However, processing all possible isolines starting from each pixel would be too costly in computing time, even in the case of a small GPU-processed 512x512 pixel image. Therefore, we chose to build large isolines inside an iterative process including a mandatory validation stage between each lengthening iteration, so as to reduce the number of pixel combinations to be examined and keep the estimation of deviation $\sigma$ within a satisfactory range of values.
 479
 480 Let $S^n$ be a previously selected isoline part and $S^p$ connected to $S^n$ in such a way that $S^p$ could be seen as an addition to $S^n$ so as to define a possible valid isoline $S^{n+p}$.
 481 Figure \ref{si3} illustrates this situation with a very simple example image. In this figure, the gray level of each pixel is used as its corresponding height ($z$) in order to visualize isolines easily. Some of the orthogonal isoline projections have been drawn in dotted line in the $(\vec{i},\vec{j})$ plane. Both labeled parts $S^p$ and $S^n$ are represented in the $(\vec{i},\vec{j})$ plane and in the 3D associated plot.
 482
 483 In order to decide whether $S^{n+p}$ can be considered as an actual isoline, we compare the log-likelihood of both hypothesis below by using GLRT (Generalized Likelihood Ratio Test):
 484
 485 First, assuming that $S^{n+p}$ is an isoline, the gray levels of its pixels share the same mean value $\mu_{n+p}$. According to \eqref{LL1}, its log-likelihood is
 486 \begin{eqnarray}
 487 \displaystyle -\frac{(n+p)}{2}\left(log\left(2\pi\right)+1\right) - \frac{(n+p)}{2}log\left(\widehat{\sigma_1}^2\right)
 488 \label{LLNP}
 489 \end{eqnarray}
 490 where $\widehat{\sigma_1}$ is the estimation of the standard deviation along $S^n$.
 491
 492 Second, considering $S^n$ and $S^p$ as two separate isoline parts connected together, the gray levels of their pixels have two different mean values $\mu_n$ and $\mu_p$. The log-likelihood is the sum of both log-likelihoods, given by
 493 \begin{eqnarray}
 494 \displaystyle -\frac{(n+p)}{2}\left(log\left(2\pi\right)+1\right) - \frac{n}{2}log\left(\widehat{\sigma_2}^2\right) - \frac{p}{2}log\left(\widehat{\sigma_2}^2\right)
 495 \label{LLNP2}
 496 \end{eqnarray}
 497 where $\widehat{\sigma_2}$ is the estimation of the standard deviation along $S^n$ and $S^p$.
 498
 499 The difference between \eqref{LLNP} and \eqref{LLNP2} leads to the expression of $GLRT(S^{n+p},S^n, S^p, T_{max})$:
 500 \begin{eqnarray}
 501 T_{max}- (n+p).\left[log\left(\widehat{\sigma_1}^2\right) - log\left(\widehat{\sigma_2}^2\right) \right]
 502 \label{GLRT}
 503 \end{eqnarray}
 504
 505 The decision to validate lengthening from $S^n$ to $S^{n+p}$ depends whether $GLRT(S^{n+p},S^n, S^p, T_{max})$ is higher or lower than $0$. Value $T_{max}$ is the GLRT threshold.
 506
 507 \section{\label{lniv0}Isoline models}
 508 The most obvious model considers isolines as polylines. Each isoline can then be curved by allowing a direction change at the end of each segment; we shall call such isolines \textit{poly-isolines}.
 509
 510 In order to keep the number of candidate isolines within reasonable range, we chose to build them by combinating segments described by simple pre-computed patterns. Each pattern $p_{l,d}$ describes a segment of length $l$ and direction $d$. For one given $l$ value, all $p_{l,d}$ patterns are grouped into a matrix denoted $P_l$. Figure \ref{p5q1} shows an example of such a pattern matrix for $l=5$.
 511
 512 To fit the GPU-specific architecture, we define regularly distributed $D$ primary directions ($D=32$ in our examples).
 513
 514
 515 \subsection{\label{lniv2}Poly-isolines with limited deviation angle (PI-LD)}
 516 At one stage we implemented an algorithm parsing the tree of all possible polyline configurations, but the process proved far too slow regarding our goal, even on GPU, because of the amount of memory  involved (and consequent memory accesses) and because of the necessary reduction stage for which GPU efficiency is not maximum.
 517 So we focused on a variant inspired by \cite{bertaux:04} in which the selected direction of the next segment depends on the whole of the previously built and validated poly-isoline.
 518
 519 Let us consider a poly-isoline $S^n$ under construction, starting from pixel $(i,j)$ and made of $K$ validated segments $s_k~(k \in [1;K])$ of length $l$, each of them having its own direction $d_k$. The coordinates of the ending pixel of each segment $s_k$ are denoted $(i_k, j_k)$.
 520 Both of the following sums
 521 \begin{eqnarray}
 522 C_x\left(Z(S^n)\right) &=& \displaystyle\sum_{(i,j)\in S^n}z(i,j)\label{cx}\\
 523 and~~C_{x^2}\left(Z(S^n)\right)&=& \displaystyle\sum_{(i,j)\in S^n}z(i,j)^2\label{cx2}
 524 \end{eqnarray}
 525 have been obtained during the previous lengthening steps.
 526
 527 Let us examine now how to decide wether to add a new segment to $S^n$ or to stop the lenghtening process.
 528 The main idea is to apply each pattern $p_{l,d}$ to the ending pixel $(i_k,j_k)$, on the condition that its direction is contained within the limits of maximum deviation $\Delta d_{max}$.
 529 Maximum deviation $\Delta d_{max}$ prevents poly-isolines from beeing of circular shape (or backward-oriented) which would possibly generate supplementary artefacts in the output image. Another of its benefits is to reduce the number of combinations to be evaluated.
 530
 531 For each allowed pattern, GLRT is performed in order to decide if the corresponding segment could likely be added to the end of the poly-isoline $S^n$.
 532 If none is validated by GLRT, the poly-isoline $S^n$ is stopped.
 533
 534 If at least one segment has been accepted by GLRT, the one that leads to the maximum likelihood (ML) value of the lengthened poly-isoline $S^{n+l}$ is selected and integrated to $S^{n+l}$ as $s_{K+1}$.
 535
 536 In order to avoid critical situations where the first selected segment would not share the primary direction of the actual poly-isoline,
 537 no selection is performed on the level of the first segment; $D$ poly-isolines are kept and submitted to the lengthening process. To ensure isotropy, each of them shares the direction of one pattern $p_{l,d}~ (d \in[0;D])$.
 538
 539 Eventually, the poly-isoline with the maximum likelihood value is selected among the longest ones.
 540
 541 Figure \ref{pild} illustrates one stage of the lengthening process with the example of a two-segment poly-isoline  at the beginning of stage ($l=5$ and $\Delta d_{max}=2$).
 542
 543
 544
 545 \begin{figure}[h]
 546  \centering
 547 \subfloat[Isoline with two validated segments $s_1$ and $s_2$.]{\label{fig:debut} \includegraphics{./imgfs/PI-LD_detail_3.ps}}\qquad
 548 \subfloat[First evaluated segment, corresponding to pattern $p_{5,0}$.]{\label{fig:sub1} \includegraphics{./imgfs/PI-LD_detail_sub1.ps}}\\
 549 \subfloat[Second evaluated segment, corresponding to pattern $p_{5,1}$.]{\label{fig:sub2} \includegraphics{./imgfs/PI-LD_detail_sub2.ps}}\qquad
 550 \subfloat[Third evaluated segment, corresponding to pattern $p_{5,2}$.]{\label{fig:sub3} \includegraphics{./imgfs/PI-LD_detail_sub3.ps}}\\
 551 \subfloat[Fourth evaluated segment, corresponding to pattern $p_{5,3}$.]{\label{fig:sub4} \includegraphics{./imgfs/PI-LD_detail_sub4.ps}}\qquad
 552 \subfloat[Fifth evaluated segment, corresponding to pattern $p_{5,4}$.]{\label{fig:sub5} \includegraphics{./imgfs/PI-LD_detail_sub5.ps}}\\
 553 \caption{Example of lengthening process starting with a two-segment poly-isoline ($l=5$, $\Delta d_{max}=2$).
 554 The initial situation is shown in \ref{fig:debut}, while \ref{fig:sub1} to \ref{fig:sub5} represent the successive candidate segments.
 555 The direction index of the last validated segment is $d_2=2$ (\ref{fig:debut}). It implies that direction indexes allowed for the third segment range from $d_2-\Delta d{max}=0$ to $d_2+\Delta d{max}=4$ (\ref{fig:sub1} to \ref{fig:sub5}).
 556 The lengthening of the poly-isoline is accepted if at least one segment has a positive GLRT. If there are several, the one which minimizes the standard deviation of the whole poly-isoline is selected.
 557 }
 558 \label{pild}
 559 \end{figure}
 560
 561
 562 \renewcommand{\labelenumi}{\alph{enumi})}
 563 \renewcommand{\theenumi}{\alph{enumi})}
 564
 565 \subsection{\label{lniv3}Poly-isolines with precomputed directions (PI-PD)}
 566 Though much faster, the PI-LD-based filter may be considered a bit weak compared to \textit{state-of-the-art} filters like BM3D family algorithms \cite{Dabov09bm3dimage}. Furthermore, we saw that this way of building poly-isolines  requires the alternate use of two different types of validation at each lengthening stage: GLRT and maximum likelihood minimization. In order to be performed, each of them generates numerous branches during kernel execution, which does not fit GPU architecture well and leads to execution times that we hoped would be more impressive.
 567
 568 Within the PI-LD model, at each pixel $(i,j)$, as no selection is done at the first stage, $D$ poly-isolines are computed and kept as candidate though, obviously, only one follows the actual isoline at $(i,j)$. So, if we assume we can achieve a robust determination of the direction at any given pixel of this isoline, it becomes unnecessary to perform the selection at each lenghtening step.
 569 Thus, at each pixel $(i,j)$, only the first segment has to be determined in order to obtain the local direction of the isoline. This leads to an important reduction of the work complexity: the above PI-LD model needs to evaluate $D.\left(2.\Delta_{dmax}+1\right)^{K-1}$ segments at each pixel position, while only $D.K$ evaluations are needed in the second case. For example, with a maximum of $K=5$ segments and a maximum deviation of $\Delta_{dmax}=2$, the PI-LD needs to evaluate up to 20000 segments per pixel where only 160 should be enough.
 570
 571
 572 On the basis of these observations, we propose a new model that we shall call PI-PD, that completely separates the validation stages performed in the PI-LD model implementation mentioned above:
 573
 574 A first computation stage selects the best first segment $s_1$ starting at each pixel $(i,j)$ of the input image. Its direction index $d_1(i,j)$ is then stored in a reference matrix denoted $I_\Theta$; sums $C_x$ and $C_{x2}$ along $s_1(i,j)$ are also computed and stored in a dedicated matrix $I_\Sigma$. It can be noticed that this selection method of $s_1$ segments is a degraded version of PI-LD constrained by $K=1$.
 575
 576 A second stage manages the now independant lengthening process. For one given state of a poly-isoline where the last added segment has been $s_K$, the pattern whose direction index is given by $d=I_\Theta(i_K,j_K)$ defines the only segment to be evaluated.
 577 Both corresponding sums  $C_x$ and $C_{x2}$ are read from matrix $I_\Sigma$ and used in GLRT evaluation.
 578 The last point is to prevent poly-isolines from turning back.
 579
 580 Figure \ref{pipd} details this process, starting from the same initial state as in figure \ref{pild} with the noticeable difference that no deviation limit is needed.
 581
 582 Thus, as introduced above, work complexity is considerably reduced, as each pattern is only applied once at one given pixel $(i,j)$, and   associated values are computed only once; they are re-used every time one poly-isoline's segment ends at pixel $(i,j)$. Also, this fits GPU constraints better, as it avoids multiple branches during kernel execution.
 583 It remains that, the building of poly-isolines is done without global likelihood optimization.
 584
 585 Eventually, the model has been improved by adding to it the ability to thicken poly-isolines from one pixel up to three which allows to achieve higher PSNR values by increasing the number of pixels of poly-isolines in addition to the lengthening process.
 586 This may apply to large images which do not contain small relevant details, as it may blur small significant details or objects present in the noisy image.
 587 Still, this feature makes PI-PD more versatile than our reference BM3D, which has prohibitive computation times when processing large images (over 5 minutes for a 4096x4096 pixel image) and thus should require a slicing stage prior to processing them, causing some overhead.
 588
 589 \begin{figure}[h]
 590  \centering
 591 \subfloat[Poly-isoline with two validated segments.]{\label{fig:pipd1} \includegraphics{./imgfs/PI-PD_detail_sub1.ps}}\qquad
 592 \subfloat[Next direction is read from element $(i_2,j_2)$ of $I_{\Theta}$.]{\label{fig:pipd2} \includegraphics{./imgfs/PI-PD_detail_sub2.ps}}\\
 593 \subfloat[Pattern $p_{l,d_3}$ is then applied at $(i_2,j_2)$ and GLRT is performed. Both sums needed to perform GLRT are read from element $(i_2,j_2)$ of $I_{\Sigma}$.]{\label{fig:pipd3} \includegraphics{./imgfs/PI-PD_detail_sub3.ps}}\qquad
 594 \subfloat[If accepted by GLRT, segment $s_3$ is added to poly-isoline.]{\label{fig:pipd4} \includegraphics{./imgfs/PI-PD_detail_sub4.ps}}\\
 595 \caption{Example of PI-PD lengthening process starting with a two-segment poly-isoline ($l=5$).
 596 The initial situation is represented in \ref{fig:pipd1}, while \ref{fig:pipd1} to \ref{fig:pipd4} represent the successive processing steps.
 597 The end pixel of the last validated segment is $(i_2,j_2)$ (\ref{fig:pipd1}). Reference matrices \(I_{\Theta}\) and \(I_{\Sigma}\) provide the values needed to select the pattern to be applied on \((i_2,j_2)\) (\ref{fig:pipd2} and \ref{fig:pipd3}). GLRT is performed to validated lengthening or not. This process goes on until one submitted segment does not comply with GLRT.
 598 }
 599 \label{pipd}
 600 \end{figure}
 601
 602
 603
 604 \subsection{\label{pipd_plan}Hybrid PI-PD}
 605 As the determination of each segment's direction only involves a few pixels,
 606 the PI-PD model may not be robust enough in regions where the surface associated with $Z$ has a low local slope value
 607 regarding power of noise $\sigma^2$.
 608 We shall call those regions Low Slope Regions (LSR).
 609 Figure \ref{img_plans} shows this lack of robustness with an example of two drawings of additive white gaussian noise applied on the same reference image (Figure \ref{img_window}). Within this image, we focused on a small 11x11 pixel window containing two LSR with one sharp edge between them.
 610
 611 Figures \ref{fig:dir1} and \ref{fig:dir2} show that the directions computed by PI-PD are identical from one drawing to the other near the edge (lines 5-7), while they vary in LSR (lines 1-4, 8-11).
 612
 613 \begin{figure}[h]
 614  \centering
 615 \subfloat[Reference image]{\label{fig:ref} \includegraphics{./imgfs/zoom_edge_ref2.ps}}\\
 616 \subfloat[Image corrupted by random drawing $n^{\circ}1$]{\label{fig:noisy1} \includegraphics{./imgfs/zoom_edge_bruit.ps}}\qquad
 617 \subfloat[Image corrupted by random drawing $n^{\circ}2$]{\label{fig:noisy2} \includegraphics{./imgfs/zoom_edge2_bruit.ps}}\\
 618 \subfloat[Isoline directions for random drawing $n^{\circ}1$]{\label{fig:dir1} \includegraphics{./imgfs/zoom_edge1_2D_superpose3.ps}}\qquad
 619 \subfloat[Isoline directions for random drawing $n^{\circ}2$]{\label{fig:dir2} \includegraphics{./imgfs/zoom_edge2_2D_superpose2.ps}}\\
 620  \caption{Zoom on a small square window of the airplane image. \ref{fig:ref} reproduce the zoom on the window, taken from the reference image of Figure \ref{img_window}. \ref{fig:noisy1}, \ref{fig:noisy2} and \ref{fig:ref} and are 3D views where each bar represents a pixel whose gray-level corresponds to the height of the bar. Figures \ref{fig:dir1} and \ref{fig:dir2} are 2D top views of the window. The chosen window shows an edge between two regions of low slope. The images \ref{fig:noisy1} and \ref{fig:noisy2} are corrupted with two different random drawings of the same additive  white gaussian noise (AWGN) of power $\sigma^2$ and mean value $0$. \ref{fig:dir1} and \ref{fig:dir2} show, for each pixel of the window, the direction of the isoline found by PI-PD. In regions of low slope (the two regions at the top and the bottom), the determination of the direction is not robust. But near the edge, directions do not vary from one drawing to another.}
 621   \label{img_plans}
 622 \end{figure}
 623
 624 Within such regions, our speed goals forbid us to compute isoline directions with the PI-LD model, more robust
 625 but far too slow.
 626 Instead we propose a fast solution which implies designing an edge detector whose
 627 principle is to re-use the segment patterns defined in section \ref{lniv0} and to combine them by pairs in order to detect any possible LSR around the center pixel.
 628 If a LSR is detected, the output gray-level value is the average value computed on the current square window, otherwise, the PI-PD output value is used.
 629
 630 In order to further simplify computation, only the patterns that do not share any pixel are used.
 631 These patterns have a direction which is a multiple of $45^{\circ}$.
 632
 633 Each base direction $(\Theta_i)$ and its opposite $(\Theta_i + \pi) \left[2\pi\right]$ define a line that separates the square
 634 window in two regions (top and and bottom regions, denoted T and B).
 635 We assume that segments on the limit belong to the T region which includes pixels of orientation from $\Theta_i$ to $\Theta_i+\pi$. This region comprises three more segments of directions $(\Theta_i+\frac{\pi}{4})$, $(\Theta_i+\frac{2\pi}{4})$ and $(\Theta_i+\frac{3\pi}{4})$.
 636 The other region (B) only includes three segments of directions $(\Theta_i+\frac{5\pi}{4})$, $(\Theta_i+\frac{6\pi}{4})$ and $(\Theta_i+\frac{7\pi}{4})$.
 637
 638 Figure \ref{detect_plans} illustrates this organization for $\Theta_i=\Theta_4=45^{\circ}$. Each bar represents a pixel in the detector's window. Pixels with null height are not involved in the GLRT. Pixels represented by higher bars define the T region and those represented by shorter bars define the B region.
 639
 640 \begin{figure}[h]
 641 \begin{center}
 642  \includegraphics{./imgfs/pattern_detecteur.ps}
 643  \caption{\label{detect_plans}Edge detector. 3D view representing an example square 11x11 pixel window ($l=5$) used in the edge detector for $\Theta_4=45^{\circ}$ around a center pixel colored in black. Each pixel is represented by a bar. Bars of height value 0 are for pixels that are not involved in the detector. Top region is defined by five pattern segments and includes the center pixel. Bottomp region only includes three pattern segments. The different height values are meant to distinguish between each of the three different sets of pixels and their role. }
 644 \end{center}
 645
 646 \end{figure}
 647
 648 For each $\Theta_i$, one GLRT is performed in order to decide whether the two regions T and B defined above are likely to be seen as a single region or as two different ones, separated by an edge as shown in figure \ref{detect_plans}. The center pixel is located on the edge.
 649 Equations \eqref{LLNP}, \eqref{LLNP2} and \eqref{GLRT} lead to a similar GLRT expression:
 650 \begin{eqnarray}
 651 T2_{max}- (8.l+1).\left[log\left(\widehat{\sigma_3}^2\right) - log\left(\widehat{\sigma_4}^2\right) \right]
 652 \label{GLRT2}
 653 \end{eqnarray}
 654 where $\sigma_3$ is the standard deviation considering that the two regions are likely to define a single one and $\sigma_4$ the standard deviation if an edge is more likely to separate the two regions. $T2_{max}$ is the decision threshold.
 655 With equation \eqref{GLRT2}, a negative result leads to an edge detection, oriented towards direction $\Theta_i$.
 656 When GLRT is known for each $\Theta_i$, we apply the following hybridation policy:
 657 \begin{enumerate}
 658 \item more than one negative GLRT: the PI-PD output value is used.
 659 \item only one negative GLRT: the center pixel is likely to be on a well-defined edge, and only the region it belongs to is considered. The average value of its pixel gray levels is then used.\label{halfplane}
 660 \item no negative GLRT: the window around the center pixel is likely to be a LSR. The average value on the whole square window is used (11x11 pixels in the example of Figure \ref{detect_plans}).
 661 \end{enumerate}
 662
 663 \begin{figure}[h]
 664  \centering
 665 \subfloat[Reference noiseless airplane image]{\label{img_window:ref} \includegraphics{./imgfs/airplane.ps}}\qquad
 666 \subfloat[Location of the example window in the reference image.]{\label{img_window:win} \includegraphics{./imgfs/zoom_windows_A.ps}}\\
 667  \caption{Location of the example window inside the reference image. Figure \ref{img_window:ref} shows the whole reference image and \ref{img_window:win} zooms on the part where the example 11x11 pixel window is.}
 668   \label{img_window}
 669 \end{figure}
 670
 671 It must be noticed that point \ref{halfplane} has been introduced in order to achieve smoother transitions between regions to which  PI-PD is applied and those in which the plain average value is used.
 672 Figure \ref{exbords} shows an example of such a classification achieved by the edge detector. The detector has been applied on the top noisy airplane image with a GLRT threshold value $T2_{max}=2$. Black pixels represent pixel classified as \textit{on an edge}, while white ones are those which belong to LSR.
 673
 674 \begin{figure}[h]
 675  \centering
 676 \subfloat[Noisy airplane image]{\label{exbords:noisy} \includegraphics{./imgfs/airplane_noisy_small.ps}}\qquad
 677 \subfloat[Pixel classification performed by the edge detector. ]{\label{exbords:bords} \includegraphics{./imgfs/img_bords_T2_small.ps}}\\
 678  \caption{Pixel classification inside the noisy image. Figure \ref{exbords:noisy} shows the noisy input image and \ref{exbords:bords} reproduces the output classification of pixels, as a black and white image, obtained with threshold value  $T2_{max}=2$. Black pixels are supposed to be near an edge, while white pixels belong to Low Slope Regions.}
 679   \label{exbords}
 680 \end{figure}
 681
 682
 683 \section{\label{lniv} Hybrid PI-PD filter Implementation: details}
 684 All implementation details that will be given here are relative to the proposed PI-PD models and Nvidia$^\copyright~$ GPU devices.
 685
 686 \subsection{\label{genPaths}Segment patterns}
 687 The first kernel to be run is \texttt{kernel\_genPaths()} which generates matrix $P_l$. Its elements $(\Delta i; \Delta j)$ are the relative coordinates of the pixels which define segment patterns $p_{l,d}$. The dimensions of matrix $P_l$ are $D$ rows $\times$  $l$ columns.
 688 To fit GPU architecture as closely as possible, we chose $D=32$ patterns.
 689 Each segment $s_k$ of a poly-isoline can then be seen as a pattern $p_{l,d}$ applied on the starting pixel $(i,j)$ of this segment, denoted $p_{l,d}(i,j)$.
 690
 691 The example in figure \ref{p5q1} shows the first quarter of $P_5$ and the corresponding eight discrete segment patterns in the first quadrant. The three remaining quarters of the matrix are easily deduced by applying successive rotations of angle $\frac{\pi}{2}$ to the above elements.
 692
 693 \begin{figure}[h]
 694 \begin{center}
 695 \includegraphics[width=0.65\linewidth]{./imgfs/P5Q1.ps}
 696 \end{center}
 697 \tiny{
 698 $$
 699 P_5 =
 700 \begin{bmatrix}
 701 (0,1)&(0,2)&(0,3)&(0,4)&(0,5)\\
 702 (0,1)&(0,2)&(-1,3)&(-1,4)&(-1,5)\\
 703 (0,1)&(-1,2)&(-1,3)&(-2,4)&(-2,5)\\
 704 (-1,1)&(-1,2)&(-2,3)&(-3,4)&(-3,5)\\
 705 (-1,1)&(-2,2)&(-3,3)&(-4,4)&(-5,5)\\
 706 (-1,1)&(-2,1)&(-3,2)&(-4,3)&(-5,3)\\
 707 (-1,0)&(-2,1)&(-3,1)&(-4,2)&(-5,2)\\
 708 (-1,0)&(-2,0)&(-3,1)&(-4,1)&(-5,1)\\
 709 \ldots&\ldots&\ldots&\ldots&\ldots\\
 710 \end{bmatrix}
 711 $$
 712 }
 713 \caption{\label{p5q1}Top: example segment patterns $p_{5,d}$ for $d\in[0;7]$; the black pixel represents the center pixel $(i,j)$, which does not belong to the pattern. The gray ones define the actual pattern segments. Bottom: the first 8 lines of corresponding matrix $P_5$ whose elements are the positions of segment pixels with respect to the center pixel.}
 714 \end{figure}
 715
 716 \subsection{\label{sipd}Generation of reference matrices $I_{\Sigma}$ and $I_{\Theta}$}
 717 In order to generate both matrices, a GPU kernel \texttt{kernel\_precomp()} computes, in parallel for each pixel $(i,j)$:
 718 \begin{itemize}
 719 \item the direction $\delta$ of the most likely segment $s_1 = p_{l,\delta}(i,j)$ among the $D$ possible ones. This value is stored in matrix $I_{\Theta}$ at position $(i,j)$.
 720 \item values $C_x(s_1)$ and $C_{x^2}(s_1)$ defined in equations \eqref{cx} and \eqref{cx2}. This vector of values is stored in matrix $I_{\Sigma}$ at position $(i,j)$.
 721 \end{itemize}
 722
 723 In order to reduce processing time, the input image is first copied into texture memory (see algorithm \ref{algoinit} for initializations and memory transfer details), thus taking advantage of the 2D optimized caching mechanism.
 724
 725 This kernel follows the \textit{one thread per pixel} rule. Consequently, each value of $P_l$ has to be accessed by every thread of a block. That led us to load it from texture memory first, then copy it into all shared memory blocks. This has proved to be the fastest scheme.
 726
 727 Algorithm \ref{algoprecomp} summarizes the computations achieved by \texttt{kernel\_precomp()}.
 728 Vector $(C_x, C_{x2})$ stores the values of $C_x(s_1)$ and $C_{x^2}(s_1)$ associated with the current tested pattern.
 729 Vector $(C_{x-best}, C_{x2-best})$ stores the values of $C_x(s_1)$ and $C_{x^2}(s_1)$ associated with the best previously tested pattern.
 730
 731 In the same manner, $\sigma$ and $\sigma_{best}$ are deviation values for current and best tested patterns.
 732
 733  The selection of the best pattern is driven by the value of the standard deviation of candidate isolines.
 734 Lines 2 and 3 compute both sums for the first pattern to be evaluated. Line 4 computes its standard deviation.
 735 Then, lines 5 to 14 loop on each pattern and keep values associated with the best pattern found. These values are eventually stored in matrices $I_{\Theta}$ and $I_{\Sigma}$ on lines 16 and 17.
 736
 737 \begin{algorithm}[htb]
 738   \SetNlSty{textbf}{}{:}
 739 \caption{Initializations in GPU memory}
 740 \label{algoinit}
 741 $l \leftarrow$ step size\;
 742 $D \leftarrow$ number of primary directions\;
 743 $I_n \leftarrow$ noisy image\;
 744 $I_{n tex} \leftarrow I_n $\tcc*[r]{copy to texture mem.}
 745 $P_l \leftarrow$ kernel\_genPaths \tcc*[r]{pattern matrix}
 746 $P_{l tex} \leftarrow P_l $\tcc*[r]{copy to texture mem.}
 747 $T_{max} \leftarrow$ GLRT threshold (lengthening)\;
 748 $T2_{max} \leftarrow$ GLRT threshold (edge detection)\;
 749 \end{algorithm}
 750
 751 \begin{algorithm}
 752   \SetNlSty{textbf}{}{:}
 753   \SetKwComment{Videcomment}{}{}
 754 \caption{generation of reference matrices, kernel \texttt{kernel\_precomp()}}
 755 \label{algoprecomp}
 756 \ForEach(\tcc*[f]{\textbf{in parallel}}){pixel $(i,j)$}{
 757   $C_{x-best} \leftarrow  \displaystyle\sum_{(y,x)\in p_{l,0}(i,j)} I_{n tex}(i+y,j+x)$ \;
 758   $C_{x2-best} \leftarrow  \displaystyle\sum_{(y,x)\in p_{l,0}(i,j)} I_{n tex}^2(i+y,j+x)$ \;
 759   $\sigma_{best} \leftarrow$ standard deviation along $p_{l,0}(i,j)$ \;
 760   %\Videcomment{}
 761   \tcc{loop on each pattern}
 762   \ForEach{$d \in [1;D-1]$}{
 763     $C_x \leftarrow \displaystyle\sum_{(y,x)\in p_{l,d}(i,j)} I_{n tex}(i+y,j+x)$\;
 764     $C_{x2} \leftarrow \displaystyle\sum_{(y,x)\in p_{l,d}(i,j)} I_{n tex}^2(i+y,j+x)$\;
 765     $\sigma \leftarrow$ standard deviation along $p_{l,d}(i,j)$\;
 766     \If(\tcc*[f]{keep the best}){$\sigma_d < \sigma_{best}$}{
 767       $C_{x-best} \leftarrow C_x$ \;
 768       $C_{x2-best} \leftarrow C_{x2}$ \;
 769       $\Theta_{best} \leftarrow d$ \;
 770     }
 771   }
 772   $I_{\Sigma}(i,j) \leftarrow \left[ C_{x-best}, C_{x2-best}\right]$ \tcc*[r]{stores}
 773   $I_{\Theta}(i,j) \leftarrow \Theta_{best}$ \tcc*[r]{in matrices}
 774 }
 775 \end{algorithm}
 776
 777 \begin{algorithm}[ht]
 778 \SetNlSty{textbf}{}{:}
 779 \caption{PI-PD lengthening process \texttt{kernel\_PIPD()}}
 780 \label{algoPIPD}
 781 \ForEach(\tcc*[f]{\textbf{in parallel}}){pixel $(i,j)$}{
 782  % \tcc{in parallel}
 783  % \tcc{starting pixel $(i,j)$ and first segment without GLRT}
 784   $(C_x^1, C_{x2}^1) \leftarrow z(i,j)$ \tcc*[r]{starting pixel}
 785   $(i_1, j_1) \leftarrow (i, j)$ \tcc*[r]{first segment}
 786   $(C_x^1, C_{x2}^1) \leftarrow I_{\Sigma}(i_1,j_1)$ \tcc*[r]{read matrix}
 787   $d_1 \leftarrow I_{\Theta}(i,j)$ \tcc*[r]{read matrix}
 788   $l_1 \leftarrow l$ \tcc*[r]{isoline length}
 789   $\sigma_1 \leftarrow (C_{x2}^1/l_1 - C_x^1)/l_1$\;
 790   $(i_2, j_2) \leftarrow end~of~first~segment$\;
 791   $(C_{x}^2, C_{x2}^2) \leftarrow I_{\Sigma}(i_2,j_2) $ \tcc*[r]{2$^{nd}$ segment}
 792   $d_2 \leftarrow I_{\Theta}(i_2,j_2)$\;
 793   $\sigma_2 \leftarrow (C_{x2}^2/l - C_x^2)/l$ \;
 794   %
 795   \While{$GLRT(\sigma_1, \sigma_2, l_1, l) < T_{max}$}{
 796     $l_1 \leftarrow l_1 + l$ \tcc*[r]{lengthening}
 797     $(C_x^1, C_{x2}^1) \leftarrow (C_x^1, C_{x2}^1)+(C_x^2, C_{x2}^2)$\;
 798     $\sigma_1 \leftarrow (C_{x2}^1/l_1 - C_x^1)/l_1$ \tcc*[r]{update}
 799     $(i_1,j_1) \leftarrow (i_2, j_2)$ \tcc*[r]{step forward}
 800     $d_1 \leftarrow d_2$\;
 801     $(i_2, j_2) \leftarrow end~of~next~segment$\;
 802     \tcc*[f]{next segment}
 803     $(C_{x}^2, C_{x2}^2) \leftarrow I_{\Sigma}(i_2,j_2) $\;
 804     $d_2 \leftarrow I_{\Theta}(i_2,j_2)$\;
 805     $\sigma_2 \leftarrow (C_{s2}^2/l - C_s^2)/l$ \;
 806     }
 807   }
 808   $\widehat{I}(i, j) \leftarrow C_x^1/l_1$ \tcc*[r]{isoline value}
 809 \end{algorithm}
 810
 811
 812 \subsection{\label{sipd}PI-PD lengthening process:  \texttt{kernel\_PIPD()} }
 813 This parallel kernel is run in order to obtain the image of the \emph{isolines}. It is detailed in algorithm \ref{algoPIPD}, (see section \ref{lniv3} for process description).
 814
 815 Lines from 2 to 11 perform allocations for the first lengthening to evaluate.
 816 More precisely, $(i_1, j_1)$ represents the starting pixel of the current segment; $(i_2, j_2)$ is both its ending pixel and the starting pixel of the next segment; $d_1$ and $d_2$ are their directions, read from precomputed matrix $I_{\Theta}$.
 817 $C_x^1$ and $C_{x2}^1$ are the gray-level sums along the current poly-isoline; $C_x^2$ and $C_{x2}^2$ are the gray-level sums of the candidate segment.
 818 The current poly-isoline ends at $(i_1, j_1)$ and is made of $l_1$ pixels (already accepted segments); its standard deviation is $\sigma_1$.
 819 The loop extending from lines 12 to 21 performs the allocations needed to proceed one segment forward, as long as GLRT is true.
 820 If the lengthening has been accepted, the length of the poly-isoline is updated in line 13, and the same is done with $C_x$ and $C_{x2}$ which are read from precomputed matrix $I_{\Sigma}$ (see equations \eqref{cx} and \eqref{cx2} for definition).
 821 Finally, using direction value $d_2$, it translates the coordinates $(i_1, j_1)$ to the end of the newly elongated poly-isoline, and $(i_2, j_2)$ to the end of the next segment to be tested.
 822 As soon as the GLRT condition becomes false, line 23 eventually produces the output value of the denoised image at pixel $(i,j)$, that is, the average gray-level value along the poly-isoline.
 823
 824
 825
 826
 827 \subsection{\label{sipd}Hybrid PI-PD :  \texttt{kernel\_edge\_detector()} }
 828 As introduced in section \ref{pipd_plan}, the aim of kernel \texttt{kernel\_edge\_detector()} is to divide pixels into two classes according to their belonging to a LSR or not. Algorithm \ref{algoDetect} explains the detailled procedure. Lines 2 to 6 initialize values of the direction index ($\Theta$), the number of edges detected ($edgeCount$), the gray-level sum along the pixels that defines the H half-plane ($sumEdge$) and the number of pixels that defines both half-planes H and L ($nH$, $nL$). Then the loop starting at line 7 performs the GLRT for every considered direction index $\Theta$.
 829 Values $sumH$ and $sumL$ are vectors of two parameters $x$ and $y$, parameter $x$ being the sum of gray-level values and $y$ the sum of square gray-level values. Value $sumH$ is computed along the pixels of half-plane H and is obtained by loop at lines 10 to 14; Value $sumL$ is computed along the pixels of half-plane L and is obtained by loop at lines 15 to 19. Value $I_{ntex}(i,j)$ refers to the gray-level value at pixel (i,j) previously stored in texture memory. Eventually, the isoline level value is output at line 27, 30 or 33 depending on the situation (see \ref{pipd_plan} for details about the decision process).
 830
 831 \begin{algorithm}[ht]
 832 \SetNlSty{textbf}{}{:}
 833 \caption{edge detector and pixel classifier \texttt{kernel\_edge\_detector()}}
 834 \label{algoDetect}
 835 \ForEach(\tcc*[f]{\textbf{in parallel}}){pixel $(i,j)$}{
 836   $\Theta \leftarrow 0$\tcc*[r]{direction index}
 837   $edgeCount \leftarrow 0$\;
 838   $sumEdge \leftarrow 0$\;
 839   $nH \leftarrow 5l+1$\;
 840   $nL \leftarrow 3l$\;
 841   \While{($\Theta < 32$) }{
 842     $sumH \leftarrow (I_{ntex}(i,j), I_{ntex}^2(i,j))$\;
 843     $sumL \leftarrow (0, 0)$\;
 844     \For{($\alpha=\Theta$ to $\alpha=\Theta+16$ by step $4$)}{
 845       $sPat \leftarrow \displaystyle\sum_{(y,x)\in P_{l,\alpha}(i,j)} I_{n tex}(i+y,j+x)$\;
 846       $sPat2 \leftarrow \displaystyle\sum_{(y,x)\in P_{l,\alpha}(i,j)} I^2_{n tex}(i+y,j+x)$\;
 847       $sumH \leftarrow sumH + (sPat, sPat2)$\;
 848     }
 849     \For{($\alpha=\Theta+20$ to $\alpha=\Theta+28$ by step $4$)}{
 850       $sPat \leftarrow \displaystyle\sum_{(y,x)\in P_{l,\alpha}(i,j)} I_{n tex}(i+y,j+x)$\;
 851       $sPat2 \leftarrow \displaystyle\sum_{(y,x)\in P_{l,\alpha}(i,j)} I^2_{n tex}(i+y,j+x)$\;
 852       $sumL \leftarrow sumL + (sPat, sPat2)$\;
 853     }
 854     \If{($GLRT(sumH, nH, sumL, nL) > T2_{max}$)}{
 855       $edgeCount \leftarrow edgeCount + 1$\;
 856       $sumEdge \leftarrow sumH.x$\;
 857     }
 858     $\Theta \leftarrow \Theta + 4$\;
 859     }
 860     \tcc{outputs isoline value}
 861     \If{($edgeCount == 0$)}{
 862       $\widehat{I}(i,j) \leftarrow \dfrac{(sumH.x + sumL.x)}{nH+nL}$ \tcc*[r]{LSR}
 863     }
 864     \If{($edgeCount == 1$)}{
 865       $\widehat{I}(i,j) \leftarrow \dfrac{(sumEdge)}{nH}$
 866     }
 867     \If{($edgeCount > 1$)}{
 868       $\widehat{I}(i,j) \leftarrow \widehat{I_{PIPD}}(i,j)$\tcc*[r]{PI-PD}
 869     }
 870   }
 871 \end{algorithm}
 872
 873 \section{\label{results}Results}
 874  The proposed hybrid PI-PD model has been evaluated with the 512x512 pixel sample images used by \cite{denoiselab} in order to make relevant comparisons with other filtering techniques. As we aim to address image processing in very noisy conditions (as in \cite{6036776}), we focused on the noisiest versions, degraded by AWGN of standard deviation $\sigma=25$.
 875
 876  % First, as an experimental verification, we analyzed ouput images produced by PI-LD and it reveals that the primary direction of a poly-isoline (\textit{i.e.} the direction of its first segment) most often reflects the direction of the most likely segment which could have been selected by a constrained PI-LD model where $K=1$ (denoted PI-LD$_{K=1}$). It mostly confirms our assumption that computing the primary direction at each pixel can be considered sufficient provided it can be done with enough robustness.
 877
 878  % Figure \ref{histodir} shows an example histogram of the differences in primary directions according to the PI-LD model between maximum number of segments $K=1$ (\textit{i.e.} PI-PD) and $K=8$ ($l=4$ in both cases).
 879  % It appears that more than 60\% of the isolines share the same direction and more than 80\% differ by less than 22.5 degrees.
 880  % The same histogram's shape would be obtained with all thirteen images in the panel, with little variation.
 881  % The estimation of isoline directions of the remaining 40\% pixels is not robust and leads to different direction values as pixels involved belong to low slope regions. In such regions, PI-PD model is not relevant and is replaced by the average value of all pixels involved in edge detection, which represent the best compromise, as long as the detector window is small and the slope value is low.
 882
 883
 884 % \begin{figure}[h]
 885 % \centering\includegraphics{./imgfs/histodir.ps}
 886 % \caption{\label{histodir}Example of angular differences between the results of \textit{PI-LD}$_{K=1}$ and \textit{PI-LD}$_{K=8}$ variants of the model ($l=4$). The above histogram represents the number of pixels as a function of the angular difference (in percentage of the total number of pixels of the image). The mandrill image is the input image processed in the example.}
 887 % \end{figure}
 888
 889
 890 Quality measurements of the denoised images in comparison with reference images have been obtained by the
 891 evaluation of:
 892 \begin{enumerate}
 893 \item Peak Signal to Noise Ratio (PSNR) that quantifies the mean square error between denoised and reference images:
 894  $MSE(I,\widehat{I})$. We used the following expression: $$PSNR =10.log_{10}\left(\frac{max(\widehat{I})}{MSE(I,\widehat{I})}\right)$$
 895  PSNR values are given in dB and highest values mean best PSNR.
 896 \item The Mean Structure Similarity Index (MSSIM, defined in \cite{Wang04imagequality}), which quantifies local similarities between denoised and reference images inside a sliding window. MSSIM values belong to an interval $[0; 1]$; the closer to 1 the better.
 897 \end{enumerate}
 898 PSNR is widely used to measure image quality but can be misleading when used by itself: as demonstrated in \cite{Wang04imagequality}, the processing of noisy images can bring a high PSNR value but very bad visual quality. This is avoided by the use of the MSSIM index along with the PSNR value: when both of them show high values, the overall quality can be considered high.
 899
 900 Result figure \ref{tablePI} provides the PSNR and MSSIM of every image, denoised with three different filters: average 5x5, hybrid PI-PD and BM3D. The \emph{noisy} column shows the values for each image before denoising. BM3D (\cite{Dabov09bm3dimage}) is taken as a reference in terms of denoising quality, while the average filter is taken as a reference in terms of processing time. The window size of 5x5 pixels has been choosen to achieve PSNR values similar to those obtained by PI-PD.
 901
 902 BM3D code is run on a quad-core Xeon E31245 at 3.3GHz and 8GByte RAM under linux kernel 3.2 (64bits), while PI-PD as well as average filter codes is run on a Nvidia C2070 GPU hosted by a PC running linux kernel 2.6.18 (64bits).
 903 The average filter used is an efficient parallel GPU implementation that we developped. It is a generic and versatile separable convolution kernel that outputs more than 700MPixels per second in the 5x5 averaging configuration.
 904
 905 Hybrid PI-PD measurements were performed with $n=25$, $l=5$, $T_{max}=1$ and $T2_{max}=2$. BM3D measurements have been performed with the freely available BM3D software proposed in \cite{Dabov09bm3dimage}.
 906
 907 The hybrid PI-PD model proves much faster than BM3D and better than the average 5x5 filter.
 908 Processing the thirteen images of the database reveals that hybrid PI-PD brings an average improvement of 1.5dB (PSNR) and 7.2\% (MSSIM) against the average filter at the cost of 35 times its computational duration. Against hybrid PI-PD, BM3D achieves an average improvement of 2.4dB and 4.6\%  at the cost of 350 times as much duration.
 909 Actually, the 5x5 average filter takes around \textbf{0.35~ms} to process an image while hybrid PI-PD needs around \textbf{11~ms} and BM3D around \textbf{4.3~s}.
 910
 911 It must be noticed that experimental optimization show that the vector of parameter values $T_{max}=1$ and $T2_{max}=2$ is optimal for 11 of the 13 images of the database. Better results are obtained with a slightly different value of $T2_{max}$ for \textit{peppers} or \textit{zelda} whose denoised images can obtain a MSSIM index of 0.90.
 912 Most of the computational time of hybrid PI-PD is spent by the edge detector, which clearly does not fit GPU requirements to achieve good performance. For information, the simple PI-PD model runs in less than 4~ms in the same conditions.
 913
 914 Figure \ref{comparimg} shows denoised images produced by hybrid PI-PD model compared with the output of the BM3D and the average 5x5 filters. The figure illustrates the merits and drawbacks of each model: edges are well preserved by hybrid PI-PD, but a \textit{staircase} effect is visible, a well-known artefact inherent to this type of neighborhood filters. Our recent GPU-implementation of the regression method proposed in \cite{BuadesCM06} brings a mean improvement of 1dB at the cost of 0.4~ms.
 915
 916
 917
 918 \begin{figure}
 919 %\begin{table}[h]
 920 \footnotesize
 921  \begin{center}
 922 \begin{tabular}{|c|r|r|r|r|}\hline
 923 \bf Image&\bf Noisy  &\bf average &\bf hybrid&\bf BM3D \\
 924          &           &\bf 5x5     &\bf PI-PD &         \\\hline\hline
 925 airplane &   19.49dB &   26.39dB  &  28.46dB& 30.88dB  \\
 926          &    0.58   &    0.84    &  0.88  &   0.93    \\\hline
 927 barbara  &   20.04dB &   22.76dB  & 24.26dB&  30.60dB  \\
 928          &    0.70   &    0.76    &  0.83  &   0.94    \\\hline
 929 boat     &   20.33dB &   25.58dB  & 27.54dB&  30.02dB  \\
 930          &    0.66   &    0.81    &  0.87  &   0.91    \\\hline
 931 couple   &   20.28dB &   25.25dB  & 27.33dB&  29.77dB  \\
 932          &    0.69   &    0.79    &  0.87  &   0.91    \\\hline
 933 elaine   &   19.85dB &   28.71dB  & 28.94dB&  30.60dB  \\
 934          &    0.59   &    0.86    &  0.87  &   0.91    \\\hline
 935 fingerprint &20.34dB &   23.33dB  & 26.07dB&  27.93dB  \\
 936          &     0.93  &    0.87    &  0.95  &   0.96    \\\hline
 937 goldhill &    19.59dB&   26.47dB  & 27.43dB&  29.22dB  \\
 938          &     0.67  &    0.82    &  0.87  &   0.88    \\\hline
 939 lena     &    19.92dB&   27.99dB  & 29.14dB&  31.80dB  \\
 940          &     0.60  &    0.84    &  0.88  &   0.93    \\\hline
 941 man      &    20.38dB&   24.74dB  & 26.74dB&  28.14dB  \\
 942          &     0.71  &    0.80    &  0.86  &   0.87    \\\hline
 943 mandrill &    19.34dB&   20.34dB  & 22.38dB&  24.75dB  \\
 944          &     0.77  &    0.69    &  0.83  &   0.88    \\\hline
 945 peppers  &    19.53dB&   27.30dB  & 28.68dB&  30.87dB  \\
 946          &     0.61  &    0.86    &  0.87  &   0.92    \\\hline
 947 stream   &    20.35dB&   23.23dB  & 25.35dB&  26.34dB  \\
 948          &     0.80  &    0.78    &  0.87  &   0.88    \\\hline
 949 zelda    &    17.71dB&   23.13dB  & 27.71dB&  30.49dB  \\
 950          &     0.58  &    0.87    &  0.88  &   0.93    \\\hline
 951  \end{tabular}
 952  \end{center}
 953 \caption{Comparison between hybrid PI-PD, average and BM3D filters. PI-PD parameter values: $n=25$, $l=5$, $T_{max}=1$ and $T2_{max}=2$. The \emph{noisy} column correspond to the noisy input images, before denoising. \newline Timings: average filter in around 0.35~ms hybrid PI-PD in around 11.0~ms and BM3D in around 4.3~s}
 954 \label{tablePI}
 955 %\end{table}
 956 \end{figure}
 957
 958 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 959 %  table avec PIPD simple, non present dans version 2
 960 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 961 % \begin{figure}
 962 % %\begin{table}[h]
 963 % \footnotesize
 964 %  \begin{center}
 965 % \begin{tabular}{|c|r|r|r|r|}\hline
 966 % \bf Image&\bf Noisy  &\bf median&\bf mean \\
 967 %          &           &\bf 5x5   &    5x5  \\\hline\hline
 968 % airplane &   19.49dB & 28.46dB  & 28.47dB    \\
 969 %          &    0.58   &  0.88    &  0.87      \\\hline
 970 % barbara  &   20.04dB & 24.26dB  & 24.22dB    \\
 971 %          &    0.70   &  0.83    &  0.82      \\\hline
 972 % boat     &   20.33dB & 27.54dB  & 27.55dB    \\
 973 %          &    0.66   &  0.87    &  0.87      \\\hline
 974 % couple   &   20.28dB & 27.33dB  & 27.37dB    \\
 975 %          &    0.69   &  0.87    &  0.87      \\\hline
 976 % elaine   &   19.85dB & 28.94dB  & 28.86dB    \\
 977 %          &    0.59   &  0.87    &  0.86      \\\hline
 978 % fingerprint &20.34dB & 26.07dB  & 26.65dB    \\
 979 %          &     0.93  &  0.95    &  0.95      \\\hline
 980 % goldhill &    19.59dB& 27.43dB  & 27.46dB    \\
 981 %          &     0.67  &  0.87    &  0.87      \\\hline
 982 % lena     &    19.92dB& 29.14dB  & 29.09dB    \\
 983 %          &     0.60  &  0.88    &  0.87      \\\hline
 984 % man      &    20.38dB& 26.74dB  & 26.80dB    \\
 985 %          &     0.71  &  0.86    &  0.86      \\\hline
 986 % mandrill &    19.34dB& 22.38dB  & 22.44dB    \\
 987 %          &     0.77  &  0.83    &  0.84      \\\hline
 988 % peppers  &    19.53dB& 28.68dB  & 28.60dB    \\
 989 %          &     0.61  &  0.87    &  0.86      \\\hline
 990 % stream   &    20.35dB& 25.35dB  & 25.41dB    \\
 991 %          &     0.80  &  0.87    &  0.88      \\\hline
 992 % zelda    &    17.71dB& 27.71dB  & 27.64dB    \\
 993 %          &     0.58  &  0.88    &  0.87      \\\hline
 994 %  \end{tabular}
 995 %  \end{center}
 996 % \caption{Comparison between two fast and simple reference filters: median and mean. Both filters share the same kernel size of 5x5 pixels.\newline Timings: median filter in around 4.0~ms,  mean filter in around 12.0~ms.}
 997 % \label{tablePI}
 998 % %\end{table}
 999
1000 % \begin{figure}
1001 % %\begin{table}[h]
1002 % \footnotesize
1003 %  \begin{center}
1004 % \begin{tabular}{|c c|r|r|r|}\hline
1005 %   \multicolumn{2}{|r|}{\bf size (pixels)$\rightarrow$}  &  512x512    &  1024x1024 &  4096x4096 \\
1006 %   \multicolumn{2}{|l|}{\bf filter $\downarrow}$         &             &            &            \\\hline\hline
1007 % \bf mean 5x5     & psnr                                 & 26.39dB     & 27.39dB    &            \\\cline{3-5}
1008 %                  & mssim                                &  0.84       &  0.75      &            \\\cline{3-5}
1009 %                  & time                                 &  0.35ms     & 1.25ms     &            \\\hline
1010 % \bf hybrid PI-PD & psnr                                 & 28.46dB     & 26.31dB    &  32.17dB   \\\cline{3-5}
1011 %                  & mssim                                &  0.88       & 0.87       &  0.99      \\\cline{3-5}
1012 %                  & time                                 &  4.0ms      & 85ms       &  1.6s      \\\hline
1013 % \bf BM3D         & psnr                                 & 30.88dB     & 26.09dB    &   38.72dB  \\\cline{3-5}
1014 %                  & mssim                                &  0.93       &  0.72      &    0.99    \\\cline{3-5}
1015 %                  & time                                 &  4.3s       & 16.5s      &  320.0s    \\\hline
1016 %  \end{tabular}
1017 %  \end{center}
1018 % \caption{Comparison between hybrid PI-PD and BM3D filters for three different size of the noisy airplane input image ($\sigma=25$). PI-PD parameter values: $T_{max}=1$ and $T2_{max}=2$ for all three images, $l=5$ and $n=25$ for size 512$^2$ ( $l=12$, $n=100$ and $l=32$, $n=200$ respectively for sizes 1024$^2$ and 4096$^2$)}.\label{compimgsizes}
1019 % %\end{table}
1020 % \end{figure}
1021
1022 \begin{figure}[h]
1023 \centering
1024 \subfloat[Noisy image $\sigma=25$]{\label{fig:noisy} \includegraphics{./imgfs/airplane_25_noisy_zoom.ps}}\qquad
1025 \subfloat[Average 5x5 filter, in $0.35~ms$]{\label{fig:pipd} \includegraphics{./imgfs/airplane_25_mean5_zoom.ps}}\\
1026 \subfloat[PI-PD hybrid filter, $n=25$, $l=5$, $T_{max}=1$, $T2_{max}=2$, in $11~ms$ ]{\label{fig:hpipd} \includegraphics{./imgfs/airplane_zoom_hybrid_6_r50_T10_P2.ps}}\qquad
1027 \subfloat[BM3D filter, in $4.3s$]{\label{fig:bm3d} \includegraphics{./imgfs/airplane_bm3d_zoom.ps}}
1028 \caption{Comparison of 512x512 images denoised from noisy airplane image (\ref{fig:noisy}) with a PI-PD filter (\ref{fig:pipd}), PI-PD hybrid filter (\ref{fig:hpipd}) and BM3D filter (\ref{fig:bm3d}). Only zoomed parts of images are shown in order to ensure better viewing.}
1029 \label{comparimg}
1030 \end{figure}
1031
1032 \section{\label{conclusion}Conclusion, future work}
1033 From the start, our approach, unlike quite a few others, has been to base this study on the conception and characteristics of the targeted hardware (Nvidia Graphic cards).
1034
1035 So as to get high execution speeds, we chose, for example, to find a method that remains local (concentrating on the immediate neighborhood of the center pixel), but still provides very significant benefits, using our technique of progressive lengthening.
1036
1037 Nevertheless, our method has proved slightly sub-optimal and lacking robustness in \textit{flat} regions (see above, Low Slope Regions), even if the actual visual effect may be considered quite satisfactory.
1038
1039 As a first step to address the above drawbacks, we have devised a hybrid method that detects and applies distinct processing to LSR regions (see above). Processing speeds remain fast, and much higher than the BM3D implementation taken as quality reference. This is very promising, and opens the perspective of real-time high definition  image sequence processing at 25 fps, provided we improve the edge detector, which currently limits the HD frame rate at 16fps (High Definition: 1920x1080 pixels).
1040
1041 To further improve the quality of output images, we also implemented a efficient parallel implementation of the staircase effect reduction technique presented in \cite{BuadesCM06}. With this method, searching for best improvement factors leads to different parameters values for each image processed, which prompts to studying some way of overriding such parameters.
1042
1043 Our study so far has been based on additive noise; we are currently working on transposing criteria to various multiplicative noise types. We also extended the process to color images with very interesting visual results to be confirmed by the experimental measurement currently in progress.
1044
1045
1046 %\vspace{0.5cm}
1047 %\textbf{Aknowledgements:} We thank Ingrid Couturier and Daniel Cuney for their help in refining the use of English in this paper.
1048
1049 % trigger a \newpage just before the given reference
1050 % number - used to balance the cumns on the last page
1051 % adjust value as needed - may need to be readjusted if
1052 % the document is modified later
1053 %\IEEEtriggeratref{8}
1054 % The "triggered" command can be changed if desired:
1055 %\IEEEtriggercmd{\enlargethispage{-5in}}
1056
1057 % references section
1058 \IEEEpeerreviewmaketitle
1059 \bibliographystyle{IEEEtran}
1060
1061 %\bibliography{IEEEabrv,/home/perrot/Documents/biblio}
1062 \bibliography{IEEEabrv,biblio}
1063
1064
1065 % Gilles Perrot is a Ph.D student at the university of Franche-comt\'{e} within the FEMTO-ST Laboratory. His research interests currently focus on GPU-oriented, high performance image processing algorithms, especially in very noisy conditions.
1066 % His university studies have been conducted at University of Orsay (Paris X) and at \'{E}cole Normale Sup\'{e}rieure (ENS).
1067 % He received the Agr\'{e}gation of electrical engineering (EE) in 1994 (the highest qualification of teachers in France, obtained through a prestigious examination).
1068 % In 1995 he presented a MA thesis in signal processing on the topic of modeling the jaw-larynx-hyoid dynamics.
1069 % Since then, he has been a EE teacher in several high-schools, technical advisor and manager for several projects on Communication and Information technologies.
1070 % that's all folks
1071 \end{document}
1072