5ème commit :

[dmems12.git] / dmems12.tex
diff --git a/dmems12.tex b/dmems12.tex

index 51d7344b629665bd4e9e4592ba16f4769b3ca251..f559661e358c7e6f2041166b10f9a7a69daf7081 100644 (file)
--- a/dmems12.tex
+++ b/dmems12.tex
@@ -1,8 +1,9 @@
-\documentclass[12pt]{article}
+
+\documentclass[10pt, conference, compsocconf]{IEEEtran}
  %\usepackage{latex8}
  %\usepackage{times}
-\usepackage[latin1]{inputenc}
-\usepackage[cyr]{aeguill}
+\usepackage[utf8]{inputenc}
+%\usepackage[cyr]{aeguill}
  %\usepackage{pstricks,pst-node,pst-text,pst-3d}
  %\usepackage{babel}
  \usepackage{amsmath}
@@ -24,29 +25,38 @@
  
  \newcommand{\tab}{\ \ \ }
  
-%%%%%%%%%%%%%%%%%%%%%%%%%%%% my bib path.
+
+
+\begin{document}
+
+
+%% \author{\IEEEauthorblockN{Authors Name/s per 1st Affiliation (Author)}
+%% \IEEEauthorblockA{line 1 (of Affiliation): dept. name of organization\\
+%% line 2: name of organization, acronyms acceptable\\
+%% line 3: City, Country\\
+%% line 4: Email: name@xyz.com}
+%% \and
+%% \IEEEauthorblockN{Authors Name/s per 2nd Affiliation (Author)}
+%% \IEEEauthorblockA{line 1 (of Affiliation): dept. name of organization\\
+%% line 2: name of organization, acronyms acceptable\\
+%% line 3: City, Country\\
+%% line 4: Email: name@xyz.com}
+%% }
+
  
  
  \title{Using FPGAs for high speed and real time cantilever deflection estimation}
+\author{\IEEEauthorblockN{Raphaël Couturier\IEEEauthorrefmark{1}, Stéphane Domas\IEEEauthorrefmark{1}, Gwenhaël Goavec-Merou\IEEEauthorrefmark{2} and Michel Lenczner\IEEEauthorrefmark{2}}
+\IEEEauthorblockA{\IEEEauthorrefmark{1}FEMTO-ST, DISC, University of Franche-Comte, Belfort, France\\
+\{raphael.couturier,stephane.domas\}@univ-fcomte.fr}
+\IEEEauthorblockA{\IEEEauthorrefmark{2}FEMTO-ST, Time-Frequency, University of Franche-Comte, Besançon, France\\
+\{michel.lenczner@utbm.fr,gwenhael.goavec@trabucayre.com}
+}
+
+
  
-\author{ Raphaël COUTURIER\\
-Laboratoire d'Informatique 
-de l'Universit\'e de  Franche-Comt\'e, \\
-BP 527, \\
-90016~Belfort CEDEX, France\\
- \and Stéphane Domas\\
-Laboratoire d'Informatique 
-de l'Universit\'e de  Franche-Comt\'e, \\
-BP 527, \\
-90016~Belfort CEDEX, France\\
- \and Gwenhaël Goavec\\
-??
-?? \\
-??, \\
-??\\}
  
  
-\begin{document}
  
  \maketitle
  
@@ -61,7 +71,35 @@ BP 527, \\
  
  \section{Introduction}
  
-%% blabla +
+Cantilevers  are  used  inside  atomic  force  microscope  which  provides  high
+resolution images of  surfaces.  Several technics have been  used to measure the
+displacement  of cantilevers  in litterature.   For example,  it is  possible to
+determine        accurately        the        deflection       with        optic
+interferometer~\cite{CantiOptic89},     pizeoresistor~\cite{CantiPiezzo01}    or
+capacitive  sensing~\cite{CantiCapacitive03}.  In  this paper  our  attention is
+focused   on  a  method   based  on   interferometry  to   measure  cantilevers'
+displacements.   In  this  method   cantilevers  are  illiminated  by  an  optic
+source. The interferometry produces fringes on each cantilevers which enables to
+compute the  cantilever displacement.   In order to  analyze the fringes  a high
+speed camera is used. Images need  to be processed quickly and then a estimation
+method  is   required  to  determine   the  displacement  of   each  cantilever.
+In~\cite{AFMCSEM11} {\bf verifier ref}, the authors have used an algorithm based
+on spline  to estimate  the cantilevers' positions.   The overall  process gives
+accurate results  but all the computation  are performed on  a standard computer
+using labview.  Consequently,  the main drawback of this  implementation is that
+the computer is a bootleneck in the overall process. In this paper we propose to
+use a  method based on least  square and to  implement all the computation  on a
+FGPA.
+
+The remainder  of the paper  is organized as  follows. Section~\ref{sec:measure}
+describes  more precisely  the measurement  process. Our  solution based  on the
+least  square   method  and   the  implementation  on   FPGA  is   presented  in
+Section~\ref{sec:solus}.       Experimentations      are       described      in
+Section~\ref{sec:results}.  Finally  a  conclusion  and  some  perspectives  are
+presented.
+
+
+
  %% quelques ref commentées sur les calculs basés sur l'interférométrie
  
  \section{Measurement principles}
@@ -122,7 +160,7 @@ consider the camera actually in use, an exposition time of 2.5ms for
  $1024\times 1204$ pixels seems the minimum that can be reached. For a
  $10\times 10$ cantilever array, if we neglect the time to extract
  pixels, it implies that computing the deflection of a single
-cantilever should take less than 25$µ$s, thus 12.5$µ$s by phase.\\
+cantilever should take less than 25$\mu$s, thus 12.5$\mu$s by phase.\\
  
  In fact, this timing is a very hard constraint. Let consider a very
  small programm that initializes twenty million of doubles in memory
@@ -135,7 +173,7 @@ $3000$ operations.
  
  %% to be continued ...
  
-%% à faire : timing de l'algo spline en C avec atan et tout le bordel.
+%% � faire : timing de l'algo spline en C avec atan et tout le bordel.
  
  
  
@@ -168,7 +206,7 @@ At first, only $M$ values of $I$ are known, for $x = 0, 1,
  \ldots,M-1$. A normalisation allows to scale known intensities into
  $[-1,1]$. We compute splines that fit at best these normalised
  intensities. Splines are used to interpolate $N = k\times M$ points
-(typically $k=3$ is sufficient), within $[0,M[$. Let call $x^s$ the
+(typically $k=4$ is sufficient), within $[0,M[$. Let call $x^s$ the
  coordinates of these $N$ points and $I^s$ their intensities.
  
  In order to have the frequency, the mean line $a.x+b$ (see equation \ref{equ:profile}) of $I^s$ is
@@ -180,12 +218,15 @@ The phase is computed via the equation :
  \theta = atan \left[ \frac{\sum_{i=0}^{N-1} sin(2\pi f x^s_i) \times I^s(x^s_i)}{\sum_{i=0}^{N-1} cos(2\pi f x^s_i) \times I^s(x^s_i)} \right]
  \end{equation}
  
-Two things can be noticed. Firstly, the frequency could also be
-obtained using the derivates of spline equations, which only implies
-to solve quadratic equations. Secondly, frequency of each profile is
-computed a single time, before the acquisition loop. Thus, $sin(2\pi f
-x^s_i)$ and $cos(2\pi f x^s_i)$ could also be computed before the loop, which leads to a
-much faster computation of $\theta$.
+Two things can be noticed :
+\begin{itemize}
+\item the frequency could also be obtained using the derivates of
+  spline equations, which only implies to solve quadratic equations.
+\item frequency of each profile is computed a single time, before the
+  acquisition loop. Thus, $sin(2\pi f x^s_i)$ and $cos(2\pi f x^s_i)$
+  could also be computed before the loop, which leads to a much faster
+  computation of $\theta$.
+\end{itemize}
  
  \subsubsection{Least square algorithm}
  
@@ -240,13 +281,15 @@ Several points can be noticed :
  computed.
  
  \item The simplest method to find the good $\theta$ is to discretize
-  $[-\pi,\pi]$ in $N$ steps, and to search which step leads to the
+  $[-\pi,\pi]$ in $nb_s$ steps, and to search which step leads to the
    result closest to zero. By the way, three other lookup tables can
    also be computed before the loop :
  
-\[ sin \theta, cos \theta, \left[ cos 2\theta \sum_{i=0}^{M-1} sin(4\pi f.i) + sin 2\theta \sum_{i=0}^{M-1} cos(4\pi f.i)\right] \]
+\[ sin \theta, cos \theta, \]
+
+\[ \left[ cos 2\theta \sum_{i=0}^{M-1} sin(4\pi f.i) + sin 2\theta \sum_{i=0}^{M-1} cos(4\pi f.i)\right] \]
  
-\item This search can be very fast using a dichotomous process in $log_2(N)$ 
+\item This search can be very fast using a dichotomous process in $log_2(nb_s)$ 
  
  \end{itemize}
  
@@ -264,15 +307,15 @@ Finally, the whole summarizes in an algorithm (called LSQ in the following) in t
  
     \For{$i=0$ to $nb_s $}{
       $\theta  \leftarrow -\pi + 2\pi\times \frac{i}{nb_s}$\\
-     lut\_sin[$i$] $\leftarrow sin \theta$\\
-     lut\_cos[$i$] $\leftarrow cos \theta$\\
-     lut\_A[$i$] $\leftarrow cos 2 \theta \times s4i + sin 2 \theta \times c4i$\\
-     lut\_sinfi[$i$] $\leftarrow sin (2\pi f.i)$\\
-     lut\_cosfi[$i$] $\leftarrow cos (2\pi f.i)$\\
+     lut$_s$[$i$] $\leftarrow sin \theta$\\
+     lut$_c$[$i$] $\leftarrow cos \theta$\\
+     lut$_A$[$i$] $\leftarrow cos 2 \theta \times s4i + sin 2 \theta \times c4i$\\
+     lut$_{sfi}$[$i$] $\leftarrow sin (2\pi f.i)$\\
+     lut$_{cfi}$[$i$] $\leftarrow cos (2\pi f.i)$\\
     }
  \end{algorithm}
  
-\begin{algorithm}[h]
+\begin{algorithm}[ht]
  \caption{LSQ algorithm - during acquisition loop.}
  \label{alg:lsq-during}
  
@@ -289,7 +332,7 @@ Finally, the whole summarizes in an algorithm (called LSQ in the following) in t
     $slope \leftarrow \frac{xy_{covar}}{x_{var}}$\\
     $start \leftarrow y_{moy} - slope\times \bar{x}$\\
     \For{$i=0$ to $M-1$}{
-     $I[i] \leftarrow I[i] - start - slope\times i$\tcc*[f]{slope removal}\\
+     $I[i] \leftarrow I[i] - start - slope\times i$\\
     }
     
     $I_{max} \leftarrow max_i(I[i])$, $I_{min} \leftarrow min_i(I[i])$\\
@@ -297,30 +340,149 @@ Finally, the whole summarizes in an algorithm (called LSQ in the following) in t
  
     $Is \leftarrow 0$, $Ic \leftarrow 0$\\
     \For{$i=0$ to $M-1$}{
-     $Is \leftarrow Is + I[i]\times $ lut\_sinfi[$i$]\\
-     $Ic \leftarrow Ic + I[i]\times $ lut\_cosfi[$i$]\\
+     $Is \leftarrow Is + I[i]\times $ lut$_{sfi}$[$i$]\\
+     $Ic \leftarrow Ic + I[i]\times $ lut$_{cfi}$[$i$]\\
     }
  
-   $\theta \leftarrow -\pi$\\
-   $val_1 \leftarrow 2\times \left[ Is.\cos(\theta) + Ic.\sin(\theta) \right] - amp\times \left[ c4i.\sin(2\theta) + s4i.\cos(2\theta) \right]$\\
-   \For{$i=1-n_s$ to $n_s$}{
-     $\theta \leftarrow \frac{i.\pi}{n_s}$\\
-     $val_2 \leftarrow 2\times \left[ Is.\cos(\theta) + Ic.\sin(\theta) \right] - amp\times \left[ c4i.\sin(2\theta) + s4i.\cos(2\theta) \right]$\\
+   $\delta \leftarrow \frac{nb_s}{2}$, $b_l \leftarrow 0$, $b_r \leftarrow \delta$\\
+   $v_l \leftarrow -2.I_s - amp.$lut$_A$[$b_l$]\\
+
+   \While{$\delta >= 1$}{
  
-     \lIf{$val_1 < 0$ et $val_2 >= 0$}{
-       $\theta_s \leftarrow \theta - \left[ \frac{val_2}{val_2-val_1}\times \frac{\pi}{n_s} \right]$\\
+     $v_r \leftarrow 2.[ Is.$lut$_c$[$b_r$]$ + Ic.$lut$_s$[$b_r$]$ ] - amp.$lut$_A$[$b_r$]\\
+
+     \If{$!(v_l < 0$ and $v_r >= 0)$}{
+       $v_l \leftarrow v_r$ \\
+       $b_l \leftarrow b_r$ \\
       }
-     $val_1 \leftarrow val_2$\\
+     $\delta \leftarrow \frac{\delta}{2}$\\
+     $b_r \leftarrow b_l + \delta$\\
+   }
+   \uIf{$!(v_l < 0$ and $v_r >= 0)$}{
+     $v_l \leftarrow v_r$ \\
+     $b_l \leftarrow b_r$ \\
+     $b_r \leftarrow b_l + 1$\\
+     $v_r \leftarrow 2.[ Is.$lut$_c$[$b_r$]$ + Ic.$lut$_s$[$b_r$]$ ] - amp.$lut$_A$[$b_r$]\\
+   }
+   \Else {
+     $b_r \leftarrow b_l + 1$\\
     }
  
-\end{algorithm}
+   \uIf{$ abs(v_l) < v_r$}{
+     $b_{\theta} \leftarrow b_l$ \\
+   }
+   \Else {
+     $b_{\theta} \leftarrow b_r$ \\
+   }
+   $\theta \leftarrow \pi\times \left[\frac{2.b_{ref}}{nb_s}-1\right]$\\
  
+\end{algorithm}
  
  \subsubsection{Comparison}
  
-\subsection{VDHL design paradigms}
+We compared the two algorithms on the base of three criterions :
+\begin{itemize}
+\item precision of results on a cosinus profile, distorted with noise,
+\item number of operations,
+\item complexity to implement an FPGA version.
+\end{itemize}
  
-\subsection{VDHL implementation}
+For the first item, we produced a matlab version of each algorithm,
+running with double precision values. The profile was generated for
+about 34000 different values of period ($\in [3.1, 6.1]$, step = 0.1),
+phase ($\in [-3.1 , 3.1]$, step = 0.062) and slope ($\in [-2 , 2]$,
+step = 0.4). For LSQ, $nb_s = 1024$, which leads to a maximal error of
+$\frac{\pi}{1024}$ on phase computation. Current A. Meister and
+M. Favre experiments show a ratio of 50 between variation of phase and
+the deflection of a lever. Thus, the maximal error due to
+discretization correspond to an error of 0.15nm on the lever
+deflection, which is smaller than the best precision they achieved,
+i.e. 0.3nm.
+
+For each test, we add some noise to the profile : each group of two
+pixels has its intensity added to a random number picked in $[-N,N]$
+(NB: it should be noticed that picking a new value for each pixel does
+not distort enough the profile). The absolute error on the result is
+evaluated by comparing the difference between the reference and
+computed phase, out of $2\pi$, expressed in percents. That is : $err =
+100\times \frac{|\theta_{ref} - \theta_{comp}|}{2\pi}$.
+
+Table \ref{tab:algo_prec} gives the maximum and average error for the two algorithms and increasing values of $N$.
+
+\begin{table}[ht]
+  \begin{center}
+    \begin{tabular}{|c|c|c|c|c|}
+      \hline
+  & \multicolumn{2}{c|}{SPL} & \multicolumn{2}{c|}{LSQ} \\ \cline{2-5}
+  noise & max. err. & aver. err. & max. err. & aver. err. \\ \hline
+  0 & 2.46 & 0.58 & 0.49 & 0.1 \\ \hline
+  2.5 & 2.75 & 0.62 & 1.16 & 0.22 \\ \hline
+  5 & 3.77 & 0.72 & 2.47 & 0.41 \\ \hline
+  7.5 & 4.72 & 0.86 & 3.33 & 0.62 \\ \hline
+  10 & 5.62 & 1.03 & 4.29 & 0.81 \\ \hline
+  15 & 7.96 & 1.38 & 6.35 & 1.21 \\ \hline
+  30 & 17.06 & 2.6 & 13.94 & 2.45 \\ \hline
+
+\end{tabular}
+\caption{Error (in \%) for cosinus profiles, with noise.}
+\label{tab:algo_prec}
+\end{center}
+\end{table}
+
+These results show that the two algorithms are very close, with a
+slight advantage for LSQ. Furthemore, both behave very well against
+noise. Assuming the experimental ratio of 50 (see above), an error of
+1 percent on phase correspond to an error of 0.5nm on the lever
+deflection, which is very close to the best precision.
+
+Obviously, it is very hard to predict which level of noise will be
+present in real experiments and how it will distort the
+profiles. Nevertheless, we can see on figure \ref{fig:noise20} the
+profile with $N=10$ that leads to the biggest error. It is a bit
+distorted, with pikes and straight/rounded portions, and relatively
+close to most of that come from experiments. Figure \ref{fig:noise60}
+shows a sample of worst profile for $N=30$. It is completly distorted,
+largely beyond the worst experimental ones. 
+
+\begin{figure}[ht]
+\begin{center}
+  \includegraphics[width=9cm]{intens-noise20-spl}
+\end{center}
+\caption{Sample of worst profile for N=10}
+\label{fig:noise20}
+\end{figure}
+
+\begin{figure}[ht]
+\begin{center}
+  \includegraphics[width=9cm]{intens-noise60-lsq}
+\end{center}
+\caption{Sample of worst profile for N=30}
+\label{fig:noise60}
+\end{figure}
+
+The second criterion is relatively easy to estimate for LSQ and harder
+for SPL because of $atan$ operation. In both cases, it is proportional
+to numbers of pixels $M$. For LSQ, it also depends on $nb_s$ and for
+SPL on $N = k\times M$, i.e. the number of interpolated points. 
+
+We assume that $M=20$, $nb_s=1024$, $k=4$, all possible parts are
+already in lookup tables and only arithmetic operations (+, -, *, /)
+are taken account. Translating the two algorithms in C code, we obtain
+about 400 operations for LSQ and 1340 (plus the unknown for $atan$)
+for SPL. Even if the result is largely in favor of LSQ, we can notice
+that executing the C code (compiled with \tt{-O3}) of SPL on an
+2.33GHz Core 2 Duo only takes 6.5µs in average, which is under our
+desing goals. The final decision is thus driven by the third criterion.\\
+
+The Spartan 6 used in our architecture has hard constraint : it has no
+floating point units. Thus, all computations have to be done with
+integers. 
+
+
+
+\subsection{VHDL design paradigms}
+
+\subsection{VHDL implementation}
  
  \section{Experimental results}
  \label{sec:results}