9ème :

[dmems12.git] / dmems12.tex
diff --git a/dmems12.tex b/dmems12.tex

index 806e5098f69359b8b9436f61f3f5cf542ffccab1..314368e8ff8e9cad225c946751adb95aa857a125 100644 (file)
--- a/dmems12.tex
+++ b/dmems12.tex
@@ -1,7 +1,9 @@
-\documentclass[12pt]{article}
+
+\documentclass[10pt, peerreview, compsocconf]{IEEEtran}
  %\usepackage{latex8}
  %\usepackage{times}
  %\usepackage{latex8}
  %\usepackage{times}
-\usepackage[latin1]{inputenc}
+\usepackage[utf8]{inputenc}
+%\usepackage[cyr]{aeguill}
  %\usepackage{pstricks,pst-node,pst-text,pst-3d}
  %\usepackage{babel}
  \usepackage{amsmath}
  %\usepackage{pstricks,pst-node,pst-text,pst-3d}
  %\usepackage{babel}
  \usepackage{amsmath}
@@ -16,36 +18,47 @@
  \usepackage{fullpage}
  \usepackage{fancybox}
  
  \usepackage{fullpage}
  \usepackage{fancybox}
  
+\usepackage[ruled,lined,linesnumbered]{algorithm2e}
+
  %%%%%%%%%%%%%%%%%%%%%%%%%%%% LyX specific LaTeX commands.
  \newcommand{\noun}[1]{\textsc{#1}}
  
  \newcommand{\tab}{\ \ \ }
  
  %%%%%%%%%%%%%%%%%%%%%%%%%%%% LyX specific LaTeX commands.
  \newcommand{\noun}[1]{\textsc{#1}}
  
  \newcommand{\tab}{\ \ \ }
  
-%%%%%%%%%%%%%%%%%%%%%%%%%%%% my bib path.
+
+
+\begin{document}
+
+
+%% \author{\IEEEauthorblockN{Authors Name/s per 1st Affiliation (Author)}
+%% \IEEEauthorblockA{line 1 (of Affiliation): dept. name of organization\\
+%% line 2: name of organization, acronyms acceptable\\
+%% line 3: City, Country\\
+%% line 4: Email: name@xyz.com}
+%% \and
+%% \IEEEauthorblockN{Authors Name/s per 2nd Affiliation (Author)}
+%% \IEEEauthorblockA{line 1 (of Affiliation): dept. name of organization\\
+%% line 2: name of organization, acronyms acceptable\\
+%% line 3: City, Country\\
+%% line 4: Email: name@xyz.com}
+%% }
+
  
  
  \title{Using FPGAs for high speed and real time cantilever deflection estimation}
  
  
  \title{Using FPGAs for high speed and real time cantilever deflection estimation}
+\author{\IEEEauthorblockN{Raphaël Couturier\IEEEauthorrefmark{1}, Stéphane Domas\IEEEauthorrefmark{1}, Gwenhaël Goavec-Merou\IEEEauthorrefmark{2} and Michel Lenczner\IEEEauthorrefmark{2}}
+\IEEEauthorblockA{\IEEEauthorrefmark{1}FEMTO-ST, DISC, University of Franche-Comte, Belfort, France\\
+\{raphael.couturier,stephane.domas\}@univ-fcomte.fr}
+\IEEEauthorblockA{\IEEEauthorrefmark{2}FEMTO-ST, Time-Frequency, University of Franche-Comte, Besançon, France\\
+\{michel.lenczner@utbm.fr,gwenhael.goavec@trabucayre.com}
+}
+
  
  
-\author{ Raphaël COUTURIER\\
-Laboratoire d'Informatique 
-de l'Universit\'e de  Franche-Comt\'e, \\
-BP 527, \\
-90016~Belfort CEDEX, France\\
- \and Stéphane Domas\\
-Laboratoire d'Informatique 
-de l'Universit\'e de  Franche-Comt\'e, \\
-BP 527, \\
-90016~Belfort CEDEX, France\\
- \and Gwenhaël Goavec\\
-??
-?? \\
-??, \\
-??\\}
  
  
  
  
-\begin{document}
  
  
-\maketitle
+
+%\maketitle
  
  \thispagestyle{empty}
  
  
  \thispagestyle{empty}
  
@@ -53,39 +66,112 @@ BP 527, \\
  
    
  
  
    
  
-{\it keywords}: FPGA, cantilever, interferometry.
+
  \end{abstract}
  
  \end{abstract}
  
+\begin{IEEEkeywords}
+FPGA, cantilever, interferometry.
+\end{IEEEkeywords}
+
+
+\IEEEpeerreviewmaketitle
+
  \section{Introduction}
  
  \section{Introduction}
  
-%% blabla +
+Cantilevers  are  used  inside  atomic  force  microscope (AFM) which  provides  high
+resolution images of  surfaces.  Several technics have been  used to measure the
+displacement  of cantilevers  in litterature.   For example,  it is  possible to
+determine  accurately  the  deflection  with different  mechanisms. 
+In~\cite{CantiPiezzo01},   authors  used   piezoresistor  integrated   into  the
+cantilever.   Nevertheless this  approach  suffers from  the  complexity of  the
+microfabrication  process needed  to  implement the  sensor  in the  cantilever.
+In~\cite{CantiCapacitive03},  authors  have  presented an  cantilever  mechanism
+based on  capacitive sensing. This kind  of technic also  involves to instrument
+the cantiliver which result in a complex fabrication process.
+
+In this  paper our attention is focused  on a method based  on interferometry to
+measure cantilevers' displacements.  In  this method cantilevers are illuminated
+by  an optic  source. The  interferometry produces  fringes on  each cantilevers
+which enables to  compute the cantilever displacement.  In  order to analyze the
+fringes a  high speed camera  is used. Images  need to be processed  quickly and
+then  a estimation  method is  required to  determine the  displacement  of each
+cantilever.  In~\cite{AFMCSEM11},  the authors have  used an algorithm  based on
+spline to estimate the cantilevers' positions.
+
+   The overall  process gives
+accurate results  but all the computation  are performed on  a standard computer
+using labview.  Consequently,  the main drawback of this  implementation is that
+the computer is a bootleneck in the overall process. In this paper we propose to
+use a  method based on least  square and to  implement all the computation  on a
+FGPA.
+
+The remainder  of the paper  is organized as  follows. Section~\ref{sec:measure}
+describes  more precisely  the measurement  process. Our  solution based  on the
+least  square   method  and   the  implementation  on   FPGA  is   presented  in
+Section~\ref{sec:solus}.       Experimentations      are       described      in
+Section~\ref{sec:results}.  Finally  a  conclusion  and  some  perspectives  are
+presented.
+
+
+
  %% quelques ref commentées sur les calculs basés sur l'interférométrie
  
  %% quelques ref commentées sur les calculs basés sur l'interférométrie
  
-\section{Measurement architecture}
-\label{sec:measure-archi}
+\section{Measurement principles}
+\label{sec:measure}
+
+
+
+
+
+
+
  
  
+\subsection{Architecture}
+\label{sec:archi}
  %% description de l'architecture générale de l'acquisition d'images
  %% avec au milieu une unité de traitement dont on ne précise pas ce
  %% qu'elle est.
  
  %% description de l'architecture générale de l'acquisition d'images
  %% avec au milieu une unité de traitement dont on ne précise pas ce
  %% qu'elle est.
  
-%% image tirée des expériences.
-
-\section{Design goals}
-\label{sec:goals}
-
-%% objectifs en terme de rapidité et de précision, avec en vue l'ajout
-%% du contrôle => l'unité de traitement qui s'impose est un FPGA =>
-%% algo adapté au FPGA.
+In order to develop simple,  cost effective and user-friendly cantilever arrays,
+authors   of    ~\cite{AFMCSEM11}   have   developped   a    system   based   of
+interferometry. In opposition to other optical based systems, using a laser beam
+deflection scheme and  sentitive to the angular displacement  of the cantilever,
+interferometry  is sensitive  to  the  optical path  difference  induced by  the
+vertical displacement of the cantilever.
+
+The system build  by authors of~\cite{AFMCSEM11} has been  developped based on a
+Linnick     interferomter~\cite{Sinclair:05}.    It     is     illustrated    in
+Figure~\ref{fig:AFM}.  A  laser diode  is first split  (by the splitter)  into a
+reference beam and a sample beam  that reachs the cantilever array.  In order to
+be  able to  move  the cantilever  array, it  is  mounted on  a translation  and
+rotational hexapod  stage with  five degrees of  freedom. The optical  system is
+also fixed to the stage.  Thus,  the cantilever array is centered in the optical
+system which  can be adjusted accurately.   The beam illuminates the  array by a
+microscope objective  and the  light reflects on  the cantilevers.  Likewise the
+reference beam  reflects on a  movable mirror.  A  CMOS camera chip  records the
+reference and  sample beams which  are recombined in  the beam splitter  and the
+interferogram.   At the  beginning of  each  experiment, the  movable mirror  is
+fitted  manually in  order to  align the  interferometric  fringes approximately
+parallel  to the cantilevers.   When cantilevers  move due  to the  surface, the
+bending of  cantilevers produce  movements in the  fringes that can  be detected
+with    the    CMOS    camera.     Finally    the    fringes    need    to    be
+analyzed. In~\cite{AFMCSEM11}, the authors used a LabView program to compute the
+cantilevers' movements from the fringes.
+
+\begin{figure}    
+\begin{center}
+\includegraphics[width=\columnwidth]{AFM}
+\end{center}
+\caption{schema of the AFM}
+\label{fig:AFM}   
+\end{figure}
  
  
-%% peut etre que cette section peut être déplacée en intro ... à voir.
  
  
-\section{Proposed solution}
-\label{sec:solus}
+%% image tirée des expériences.
  
  \subsection{Cantilever deflection estimation}
  
  \subsection{Cantilever deflection estimation}
+\label{sec:deflest}
  
  
-%% => faire de l'interpolation de signal sinusoidal
-%% descriptif rapide des deux méthodes : splines et moindres carrés
  As shown on image \ref{img:img-xp}, each cantilever is covered by
  interferometric fringes. The fringes will distort when cantilevers are
  deflected. Estimating the deflection is done by computing this
  As shown on image \ref{img:img-xp}, each cantilever is covered by
  interferometric fringes. The fringes will distort when cantilevers are
  deflected. Estimating the deflection is done by computing this
@@ -113,19 +199,151 @@ where $x$ is the position of a pixel in its associated segment.
  
  The global method consists in two main sequences. The first one aims
  to determin the frequency $f$ of each profile with an algorithm based
  
  The global method consists in two main sequences. The first one aims
  to determin the frequency $f$ of each profile with an algorithm based
-on spline interpolation (see below). It also computes the coefficient
-used for unwrapping the phase. The second one is the acquisition loop,
-while which images are taken at regular time steps. For each image,
-the phase $\theta$ of all profiles is computed to obtain, after
-unwrapping, the deflection of cantilevers.
-
-This phase computation is obviously the bottle-neck of the whole
-process. For example, if we consider the camera actually in use, an
-exposition time of 2.5ms for $1024\times 1204$ pixels seems the
-minimum that can be reached. For a $10\times 10$ cantilever array, if
-we neglect the time to extract pixels, it implies that computing the
-deflection of a single cantilever should take less than 25$µ$s, which is
-quite small.
+on spline interpolation (see section \ref{algo-spline}). It also
+computes the coefficient used for unwrapping the phase. The second one
+is the acquisition loop, while which images are taken at regular time
+steps. For each image, the phase $\theta$ of all profiles is computed
+to obtain, after unwrapping, the deflection of
+cantilevers. Originally, this computation was also done with an
+algorithm based on spline. This article proposes a new version based
+on a least square method.
+
+\subsection{Design goals}
+\label{sec:goals}
+
+The main goal is to implement a computing unit to estimate the
+deflection of about $10\times10$ cantilevers, faster than the stream of
+images coming from the camera. The accuracy of results must be close
+to the maximum precision ever obtained experimentally on the
+architecture, i.e. 0.3nm. Finally, the latency between an image
+entering in the unit and the deflections must be as small as possible
+(NB : future works plan to add some control on the cantilevers).\\
+
+If we put aside some hardware issues like the speed of the link
+between the camera and the computation unit, the time to deserialize
+pixels and to store them in memory, ... the phase computation is
+obviously the bottle-neck of the whole process. For example, if we
+consider the camera actually in use, an exposition time of 2.5ms for
+$1024\times 1204$ pixels seems the minimum that can be reached. For
+100 cantilevers, if we neglect the time to extract pixels, it implies
+that computing the deflection of a single
+cantilever should take less than 25$\mu$s, thus 12.5$\mu$s by phase.\\
+
+In fact, this timing is a very hard constraint. Let consider a very
+small programm that initializes twenty million of doubles in memory
+and then does 1000000 cumulated sums on 20 contiguous values
+(experimental profiles have about this size). On an intel Core 2 Duo
+E6650 at 2.33GHz, this program reaches an average of 155Mflops. 
+
+%%Itimplies that the phase computation algorithm should not take more than
+%%$155\times 12.5 = 1937$ floating operations. For integers, it gives $3000$ operations. 
+
+Obviously, some cache effects and optimizations on
+huge amount of computations can drastically increase these
+performances : peak efficiency is about 2.5Gflops for the considered
+CPU. But this is not the case for phase computation that used only few
+tenth of values.\\
+
+In order to evaluate the original algorithm, we translated it in C
+language. As said further, for 20 pixels, it does about 1550
+operations, thus an estimated execution time of $1550/155
+=$10$\mu$s. For a more realistic evaluation, we constructed a file of
+1Mo containing 200 profiles of 20 pixels, equally scattered. This file
+is equivalent to an image stored in a device file representing the
+camera. We obtained an average of 10.5$\mu$s by profile (including I/O
+accesses). It is under are requirements but close to the limit. In
+case of an occasional load of the system, it could be largely
+overtaken. A solution would be to use a real-time operating system but
+another one to search for a more efficient algorithm.
+
+But the main drawback is the latency of such a solution : since each
+profile must be treated one after another, the deflection of 100
+cantilevers takes about $200\times 10.5 = 2.1$ms, which is inadequate
+for an efficient control. An obvious solution is to parallelize the
+computations, for example on a GPU. Nevertheless, the cost to transfer
+profile in GPU memory and to take back results would be prohibitive
+compared to computation time. It is certainly more efficient to
+pipeline the computation. For example, supposing that 200 profiles of
+20 pixels can be pushed sequentially in the pipelined unit cadenced at
+a 100MHz (i.e. a pixel enters in the unit each 10ns), all profiles
+would be treated in $200\times 20\times 10.10^{-9} =$ 40$\mu$s plus
+the latency of the pipeline. This is about 500 times faster than
+actual results.\\
+
+For these reasons, an FPGA as the computation unit is the best choice
+to achieve the required performance. Nevertheless, passing from
+a C code to a pipelined version in VHDL is not obvious at all. As
+explained in the next section, it can even be impossible because of
+some hardware constraints specific to FPGAs.
+
+
+\section{Proposed solution}
+\label{sec:solus}
+
+Project Oscar aims  to provide a hardware and  software architecture to estimate
+and  control the  deflection of  cantilevers. The  hardware part  consists  in a
+high-speed camera,  linked on an embedded  board hosting FPGAs. By  the way, the
+camera output stream can be pushed  directly into the FPGA. The software part is
+mostly the VHDL  code that deserializes the camera  stream, extracts profile and
+computes  the deflection. Before  focusing on  our work  to implement  the phase
+computation, we give some general information about FPGAs and the board we use.
+
+\subsection{FPGAs}
+
+A field-programmable gate array (FPGA) is an integrated circuit
+designed to be configured by the customer. FGPAs are composed of
+programmable logic components, called configurable logic blocks
+(CLB). These blocks mainly contains look-up tables (LUT), flip/flops
+(F/F) and latches, organized in one or more slices connected
+together. Each CLB can be configured to perform simple (AND, XOR, ...)
+or complex combinational functions. They are interconnected by
+reconfigurable links. Modern FPGAs contain memory elements and
+multipliers which enable to simplify the design and to increase the
+performance. Nevertheless, all other complex operations, like
+division, trigonometric functions, $\ldots$ are not available and must
+be done by configuring a set of CLBs.
+
+Since this configuration is not obvious at all, it can be done via a
+framework that synthetize a design written in an hardware description
+language (HDL), and after, that place and route 
+
+ is used to configure a FPGA.
+FGPAs programming  is very different  from classic processors  programming. When
+logic blocks are  programmed and linked to perform an  operation, they cannot be
+reused anymore.  FPGAs are cadenced more slowly than classic processors but they
+can perform pipeline  as well as parallel operations. A  pipeline provides a way
+to  manipulate  data  quickly  since  at   each  clock  top  it  handles  a  new
+data. However, using  a pipeline consumes more logics  and components since they
+are not  reusable. Nevertheless it is  probably the most  efficient technique on
+FPGA.   Parallel operations  can be  used in  order to  manipulate  several data
+simultaneously. When  it is  possible, using  a pipeline is  a good  solution to
+manipulate  new  data  at  each  clock  top  and  using  parallelism  to  handle
+simultaneously several pipelines in order to handle multiple data streams.
+
+%% parler du VHDL, synthèse et bitstream
+\subsection{The board}
+
+The board we use is designed by the Armadeus compagny, under the name
+SP Vision. It consists in a development board hosting a i.MX27 ARM
+processor (from Freescale). The board includes all classical
+connectors : USB, Ethernet, ... A Flash memory contains a Linux kernel
+that can be launched after booting the board via u-Boot.
+
+The processor is directly connected to a Spartan3A FPGA (from Xilinx)
+via its special interface called WEIM. The Spartan3A is itself
+connected to a Spartan6 FPGA. Thus, it is possible to develop programs
+that communicate between i.MX and Spartan6, using Spartan3 as a
+tunnel. By default, the WEIM interface provides a clock signal at
+100MHz that is connected to dedicated FPGA pins.
+
+The Spartan6 is an LX100 version. It has 15822 slices, equivalent to
+101261 logic cells. There are 268 internal block RAM of 18Kbits, and
+180 dedicated multiply-adders (named DSP48), which is largely enough
+for our project.
+
+Some I/O pins of Spartan6 are connected to two $2\times 17$ headers
+that can be used as user wants. For the project, they will be
+connected to the interface card of the camera.
  
  \subsection{Considered algorithms}
  
  
  \subsection{Considered algorithms}
  
@@ -137,7 +355,7 @@ classical least square method but suppose that frequency is already
  known.
  
  \subsubsection{Spline algorithm}
  known.
  
  \subsubsection{Spline algorithm}
-
+\label{sec:algo-spline}
  Let consider a profile $P$, that is a segment of $M$ pixels with an
  intensity in gray levels. Let call $I(x)$ the intensity of profile in $x
  \in [0,M[$. 
  Let consider a profile $P$, that is a segment of $M$ pixels with an
  intensity in gray levels. Let call $I(x)$ the intensity of profile in $x
  \in [0,M[$. 
@@ -146,7 +364,7 @@ At first, only $M$ values of $I$ are known, for $x = 0, 1,
  \ldots,M-1$. A normalisation allows to scale known intensities into
  $[-1,1]$. We compute splines that fit at best these normalised
  intensities. Splines are used to interpolate $N = k\times M$ points
  \ldots,M-1$. A normalisation allows to scale known intensities into
  $[-1,1]$. We compute splines that fit at best these normalised
  intensities. Splines are used to interpolate $N = k\times M$ points
-(typically $k=3$ is sufficient), within $[0,M[$. Let call $x^s$ the
+(typically $k=4$ is sufficient), within $[0,M[$. Let call $x^s$ the
  coordinates of these $N$ points and $I^s$ their intensities.
  
  In order to have the frequency, the mean line $a.x+b$ (see equation \ref{equ:profile}) of $I^s$ is
  coordinates of these $N$ points and $I^s$ their intensities.
  
  In order to have the frequency, the mean line $a.x+b$ (see equation \ref{equ:profile}) of $I^s$ is
@@ -158,12 +376,15 @@ The phase is computed via the equation :
  \theta = atan \left[ \frac{\sum_{i=0}^{N-1} sin(2\pi f x^s_i) \times I^s(x^s_i)}{\sum_{i=0}^{N-1} cos(2\pi f x^s_i) \times I^s(x^s_i)} \right]
  \end{equation}
  
  \theta = atan \left[ \frac{\sum_{i=0}^{N-1} sin(2\pi f x^s_i) \times I^s(x^s_i)}{\sum_{i=0}^{N-1} cos(2\pi f x^s_i) \times I^s(x^s_i)} \right]
  \end{equation}
  
-Two things can be noticed. Firstly, the frequency could also be
-obtained using the derivates of spline equations, which only implies
-to solve quadratic equations. Secondly, frequency of each profile is
-computed a single time, before the acquisition loop. Thus, $sin(2\pi f
-x^s_i)$ and $cos(2\pi f x^s_i)$ could also be computed before the loop, which leads to a
-much faster computation of $\theta$.
+Two things can be noticed :
+\begin{itemize}
+\item the frequency could also be obtained using the derivates of
+  spline equations, which only implies to solve quadratic equations.
+\item frequency of each profile is computed a single time, before the
+  acquisition loop. Thus, $sin(2\pi f x^s_i)$ and $cos(2\pi f x^s_i)$
+  could also be computed before the loop, which leads to a much faster
+  computation of $\theta$.
+\end{itemize}
  
  \subsubsection{Least square algorithm}
  
  
  \subsubsection{Least square algorithm}
  
@@ -218,32 +439,261 @@ Several points can be noticed :
  computed.
  
  \item The simplest method to find the good $\theta$ is to discretize
  computed.
  
  \item The simplest method to find the good $\theta$ is to discretize
-  $[-\pi,\pi]$ in $N$ steps, and to search which step leads to the
+  $[-\pi,\pi]$ in $nb_s$ steps, and to search which step leads to the
    result closest to zero. By the way, three other lookup tables can
    also be computed before the loop :
  
    result closest to zero. By the way, three other lookup tables can
    also be computed before the loop :
  
-\[ sin \theta, cos \theta, \left[ cos 2\theta \sum_{i=0}^{M-1} sin(4\pi f.i) + sin 2\theta \sum_{i=0}^{M-1} cos(4\pi f.i)\right] \]
-
-\item This search can be very fast using a dichotomous process in $log_2(N)$ 
-
-\end{itemize}
+\[ sin \theta, cos \theta, \]
  
  
-\subsubsection{Comparison}
+\[ \left[ cos 2\theta \sum_{i=0}^{M-1} sin(4\pi f.i) + sin 2\theta \sum_{i=0}^{M-1} cos(4\pi f.i)\right] \]
  
  
-\subsection{FPGA constraints}
+\item This search can be very fast using a dichotomous process in $log_2(nb_s)$ 
  
  
-%% contraintes imposées par le FPGA : algo pipeline/parallele, pas d'op math complexe, ...
+\end{itemize}
  
  
-\subsection{Least square algorithm}
+Finally, the whole summarizes in an algorithm (called LSQ in the following) in two parts, one before and one during the acquisition loop :
+\begin{algorithm}[h]
+\caption{LSQ algorithm - before acquisition loop.}
+\label{alg:lsq-before}
+
+   $M \leftarrow $ number of pixels of the profile\\
+   I[] $\leftarrow $ intensities of pixels\\
+   $f \leftarrow $ frequency of the profile\\
+   $s4i \leftarrow \sum_{i=0}^{M-1} sin(4\pi f.i)$\\
+   $c4i \leftarrow \sum_{i=0}^{M-1} cos(4\pi f.i)$\\
+   $nb_s \leftarrow $ number of discretization steps of $[-\pi,\pi]$\\
+
+   \For{$i=0$ to $nb_s $}{
+     $\theta  \leftarrow -\pi + 2\pi\times \frac{i}{nb_s}$\\
+     lut$_s$[$i$] $\leftarrow sin \theta$\\
+     lut$_c$[$i$] $\leftarrow cos \theta$\\
+     lut$_A$[$i$] $\leftarrow cos 2 \theta \times s4i + sin 2 \theta \times c4i$\\
+     lut$_{sfi}$[$i$] $\leftarrow sin (2\pi f.i)$\\
+     lut$_{cfi}$[$i$] $\leftarrow cos (2\pi f.i)$\\
+   }
+\end{algorithm}
+
+\begin{algorithm}[ht]
+\caption{LSQ algorithm - during acquisition loop.}
+\label{alg:lsq-during}
+
+   $\bar{x} \leftarrow \frac{M-1}{2}$\\
+   $\bar{y} \leftarrow 0$, $x_{var} \leftarrow 0$, $xy_{covar} \leftarrow 0$\\
+   \For{$i=0$ to $M-1$}{
+     $\bar{y} \leftarrow \bar{y} + $ I[$i$]\\
+     $x_{var} \leftarrow x_{var} + (i-\bar{x})^2$\\
+   }
+   $\bar{y} \leftarrow \frac{\bar{y}}{M}$\\
+   \For{$i=0$ to $M-1$}{
+     $xy_{covar} \leftarrow xy_{covar} + (i-\bar{x}) \times (I[i]-\bar{y})$\\
+   }
+   $slope \leftarrow \frac{xy_{covar}}{x_{var}}$\\
+   $start \leftarrow y_{moy} - slope\times \bar{x}$\\
+   \For{$i=0$ to $M-1$}{
+     $I[i] \leftarrow I[i] - start - slope\times i$\\
+   }
+   
+   $I_{max} \leftarrow max_i(I[i])$, $I_{min} \leftarrow min_i(I[i])$\\
+   $amp \leftarrow \frac{I_{max}-I_{min}}{2}$\\
+
+   $Is \leftarrow 0$, $Ic \leftarrow 0$\\
+   \For{$i=0$ to $M-1$}{
+     $Is \leftarrow Is + I[i]\times $ lut$_{sfi}$[$i$]\\
+     $Ic \leftarrow Ic + I[i]\times $ lut$_{cfi}$[$i$]\\
+   }
+
+   $\delta \leftarrow \frac{nb_s}{2}$, $b_l \leftarrow 0$, $b_r \leftarrow \delta$\\
+   $v_l \leftarrow -2.I_s - amp.$lut$_A$[$b_l$]\\
+
+   \While{$\delta >= 1$}{
+
+     $v_r \leftarrow 2.[ Is.$lut$_c$[$b_r$]$ + Ic.$lut$_s$[$b_r$]$ ] - amp.$lut$_A$[$b_r$]\\
+
+     \If{$!(v_l < 0$ and $v_r >= 0)$}{
+       $v_l \leftarrow v_r$ \\
+       $b_l \leftarrow b_r$ \\
+     }
+     $\delta \leftarrow \frac{\delta}{2}$\\
+     $b_r \leftarrow b_l + \delta$\\
+   }
+   \uIf{$!(v_l < 0$ and $v_r >= 0)$}{
+     $v_l \leftarrow v_r$ \\
+     $b_l \leftarrow b_r$ \\
+     $b_r \leftarrow b_l + 1$\\
+     $v_r \leftarrow 2.[ Is.$lut$_c$[$b_r$]$ + Ic.$lut$_s$[$b_r$]$ ] - amp.$lut$_A$[$b_r$]\\
+   }
+   \Else {
+     $b_r \leftarrow b_l + 1$\\
+   }
+
+   \uIf{$ abs(v_l) < v_r$}{
+     $b_{\theta} \leftarrow b_l$ \\
+   }
+   \Else {
+     $b_{\theta} \leftarrow b_r$ \\
+   }
+   $\theta \leftarrow \pi\times \left[\frac{2.b_{ref}}{nb_s}-1\right]$\\
+
+\end{algorithm}
  
  
-%% description précise
-%% avantage sur FPGA
+\subsubsection{Comparison}
  
  
-\subsection{VDHL design paradigms}
+We compared the two algorithms on the base of three criterions :
+\begin{itemize}
+\item precision of results on a cosinus profile, distorted with noise,
+\item number of operations,
+\item complexity to implement an FPGA version.
+\end{itemize}
  
  
-\subsection{VDHL implementation}
+For the first item, we produced a matlab version of each algorithm,
+running with double precision values. The profile was generated for
+about 34000 different values of period ($\in [3.1, 6.1]$, step = 0.1),
+phase ($\in [-3.1 , 3.1]$, step = 0.062) and slope ($\in [-2 , 2]$,
+step = 0.4). For LSQ, $nb_s = 1024$, which leads to a maximal error of
+$\frac{\pi}{1024}$ on phase computation. Current A. Meister and
+M. Favre experiments show a ratio of 50 between variation of phase and
+the deflection of a lever. Thus, the maximal error due to
+discretization correspond to an error of 0.15nm on the lever
+deflection, which is smaller than the best precision they achieved,
+i.e. 0.3nm.
+
+For each test, we add some noise to the profile : each group of two
+pixels has its intensity added to a random number picked in $[-N,N]$
+(NB: it should be noticed that picking a new value for each pixel does
+not distort enough the profile). The absolute error on the result is
+evaluated by comparing the difference between the reference and
+computed phase, out of $2\pi$, expressed in percents. That is : $err =
+100\times \frac{|\theta_{ref} - \theta_{comp}|}{2\pi}$.
+
+Table \ref{tab:algo_prec} gives the maximum and average error for the two algorithms and increasing values of $N$.
+
+\begin{table}[ht]
+  \begin{center}
+    \begin{tabular}{|c|c|c|c|c|}
+      \hline
+  & \multicolumn{2}{c|}{SPL} & \multicolumn{2}{c|}{LSQ} \\ \cline{2-5}
+  noise & max. err. & aver. err. & max. err. & aver. err. \\ \hline
+  0 & 2.46 & 0.58 & 0.49 & 0.1 \\ \hline
+  2.5 & 2.75 & 0.62 & 1.16 & 0.22 \\ \hline
+  5 & 3.77 & 0.72 & 2.47 & 0.41 \\ \hline
+  7.5 & 4.72 & 0.86 & 3.33 & 0.62 \\ \hline
+  10 & 5.62 & 1.03 & 4.29 & 0.81 \\ \hline
+  15 & 7.96 & 1.38 & 6.35 & 1.21 \\ \hline
+  30 & 17.06 & 2.6 & 13.94 & 2.45 \\ \hline
+
+\end{tabular}
+\caption{Error (in \%) for cosinus profiles, with noise.}
+\label{tab:algo_prec}
+\end{center}
+\end{table}
+
+These results show that the two algorithms are very close, with a
+slight advantage for LSQ. Furthemore, both behave very well against
+noise. Assuming the experimental ratio of 50 (see above), an error of
+1 percent on phase correspond to an error of 0.5nm on the lever
+deflection, which is very close to the best precision.
+
+Obviously, it is very hard to predict which level of noise will be
+present in real experiments and how it will distort the
+profiles. Nevertheless, we can see on figure \ref{fig:noise20} the
+profile with $N=10$ that leads to the biggest error. It is a bit
+distorted, with pikes and straight/rounded portions, and relatively
+close to most of that come from experiments. Figure \ref{fig:noise60}
+shows a sample of worst profile for $N=30$. It is completly distorted,
+largely beyond the worst experimental ones. 
+
+\begin{figure}[ht]
+\begin{center}
+  \includegraphics[width=9cm]{intens-noise20}
+\end{center}
+\caption{Sample of worst profile for N=10}
+\label{fig:noise20}
+\end{figure}
+
+\begin{figure}[ht]
+\begin{center}
+  \includegraphics[width=9cm]{intens-noise60}
+\end{center}
+\caption{Sample of worst profile for N=30}
+\label{fig:noise60}
+\end{figure}
+
+The second criterion is relatively easy to estimate for LSQ and harder
+for SPL because of $atan$ operation. In both cases, it is proportional
+to numbers of pixels $M$. For LSQ, it also depends on $nb_s$ and for
+SPL on $N = k\times M$, i.e. the number of interpolated points. 
+
+We assume that $M=20$, $nb_s=1024$, $k=4$, all possible parts are
+already in lookup tables and a limited set of operations (+, -, *, /,
+$<$, $>$) is taken account. Translating the two algorithms in C code, we
+obtain about 430 operations for LSQ and 1550 (plus few tenth for
+$atan$) for SPL. This result is largely in favor of LSQ. Nevertheless,
+considering the total number of operations is not really pertinent for
+an FPGA implementation : it mainly depends on the type of operations
+and their
+ordering. The final decision is thus driven by the third criterion.\\
+
+The Spartan 6 used in our architecture has hard constraint : it has no
+built-in floating point units. Obviously, it is possible to use some
+existing "black-boxes" for double precision operations. But they have
+a quite long latency. It is much simpler to exclusively use integers,
+with a quantization of all double precision values. Obviously, this
+quantization should not decrease too much the precision of
+results. Furthermore, it should not lead to a design with a huge
+latency because of operations that could not complete during a single
+or few clock cycles. Divisions are in this case and, moreover, they
+need an varying number of clock cycles to complete. Even
+multiplications can be a problem : DSP48 take inputs of 18 bits
+maximum. For larger multiplications, several DSP must be combined,
+increasing the latency.
+
+Nevertheless, the hardest constraint does not come from the FPGA
+characteristics but from the algorithms. Their VHDL implentation will
+be efficient only if they can be fully (or near) pipelined. By the
+way, the choice is quickly done : only a small part of SPL can be.
+Indeed, the computation of spline coefficients implies to solve a
+tridiagonal system $A.m = b$. Values in $A$ and $b$ can be computed
+from incoming pixels intensity but after, the back-solve starts with
+the lastest values, which breaks the pipeline. Moreover, SPL relies on
+interpolating far more points than profile size. Thus, the end
+of SPL works on a larger amount of data than the beginning, which
+also breaks the pipeline.
+
+LSQ has not this problem : all parts except the dichotomial search
+work on the same amount of data, i.e. the profile size. Furthermore,
+LSQ needs less operations than SPL, implying a smaller output
+latency. Consequently, it is the best candidate for phase
+computation. Nevertheless, obtaining a fully pipelined version
+supposes that operations of different parts complete in a single clock
+cycle. It is the case for simulations but it completely fails when
+mapping and routing the design on the Spartan6. By the way,
+extra-latency is generated and there must be idle times between two
+profiles entering into the pipeline.
+
+%%Before obtaining the least bitstream, the crucial question is : how to
+%%translate the C code the LSQ into VHDL ?
+
+
+%\subsection{VHDL design paradigms}
+
+\section{Experimental tests}
+
+\subsection{VHDL implementation}
+
+% - ecriture d'un code en C avec integer
+% - calcul de la taille max en bit de chaque variable en fonction de la quantization.
+% - tests de quantization : équilibre entre précision et contraintes FPGA
+% - en parallèle : simulink et VHDL à la main
+%
+\subsection{Simulation}
+
+% ghdl + gtkwave
+% au mieux : une phase tous les 33 cycles, latence de 95 cycles.
+% mais routage/placement impossible.
+\subsection{Bitstream creation}
+
+% pas fait mais prévision d'une sortie tous les 480ns avec une latence de 1120
  
  
-\section{Experimental results}
  \label{sec:results}
  
  
  \label{sec:results}