From 68ef101fa4f71c2911e9ffa93ceb5e07afb4af88 Mon Sep 17 00:00:00 2001
From: =?utf8?q?St=C3=A9phane=20Domas?=
 <sdomas@prodigy.iut-bm.univ-fcomte.fr>
Date: Fri, 14 Oct 2011 17:33:29 +0200
Subject: [PATCH] =?utf8?q?deuxi=C3=A8me=20commit=20:=20-=20changement=20da?=
 =?utf8?q?ns=20l'ordre=20de=20certaines=20sections=20-=20algo=20LSQ=20quas?=
 =?utf8?q?i=20fini=20d'=C3=A9crire=20=C3=A0=20part=20la=20fin=20(=3Ddichot?=
 =?utf8?q?omie)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

---
 dmems12.tex | 161 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 120 insertions(+), 41 deletions(-)

diff --git a/dmems12.tex b/dmems12.tex
index 806e509..51d7344 100644
--- a/dmems12.tex
+++ b/dmems12.tex
@@ -2,6 +2,7 @@
 %\usepackage{latex8}
 %\usepackage{times}
 \usepackage[latin1]{inputenc}
+\usepackage[cyr]{aeguill}
 %\usepackage{pstricks,pst-node,pst-text,pst-3d}
 %\usepackage{babel}
 \usepackage{amsmath}
@@ -16,6 +17,8 @@
 \usepackage{fullpage}
 \usepackage{fancybox}
 
+\usepackage[ruled,lined,linesnumbered]{algorithm2e}
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%% LyX specific LaTeX commands.
 \newcommand{\noun}[1]{\textsc{#1}}
 
@@ -26,17 +29,17 @@
 
 \title{Using FPGAs for high speed and real time cantilever deflection estimation}
 
-\author{ RaphaÃ«l COUTURIER\\
+\author{ Raphaël COUTURIER\\
 Laboratoire d'Informatique 
 de l'Universit\'e de  Franche-Comt\'e, \\
 BP 527, \\
 90016~Belfort CEDEX, France\\
- \and StÃ©phane Domas\\
+ \and Stéphane Domas\\
 Laboratoire d'Informatique 
 de l'Universit\'e de  Franche-Comt\'e, \\
 BP 527, \\
 90016~Belfort CEDEX, France\\
- \and GwenhaÃ«l Goavec\\
+ \and Gwenhaël Goavec\\
 ??
 ?? \\
 ??, \\
@@ -61,31 +64,20 @@ BP 527, \\
 %% blabla +
 %% quelques ref commentÃ©es sur les calculs basÃ©s sur l'interfÃ©romÃ©trie
 
-\section{Measurement architecture}
-\label{sec:measure-archi}
+\section{Measurement principles}
+\label{sec:measure}
 
+\subsection{Architecture}
+\label{sec:archi}
 %% description de l'architecture gÃ©nÃ©rale de l'acquisition d'images
 %% avec au milieu une unitÃ© de traitement dont on ne prÃ©cise pas ce
 %% qu'elle est.
 
 %% image tirÃ©e des expÃ©riences.
 
-\section{Design goals}
-\label{sec:goals}
-
-%% objectifs en terme de rapiditÃ© et de prÃ©cision, avec en vue l'ajout
-%% du contrÃ´le => l'unitÃ© de traitement qui s'impose est un FPGA =>
-%% algo adaptÃ© au FPGA.
-
-%% peut etre que cette section peut Ãªtre dÃ©placÃ©e en intro ... Ã  voir.
-
-\section{Proposed solution}
-\label{sec:solus}
-
 \subsection{Cantilever deflection estimation}
+\label{sec:deflest}
 
-%% => faire de l'interpolation de signal sinusoidal
-%% descriptif rapide des deux mÃ©thodes : splines et moindres carrÃ©s
 As shown on image \ref{img:img-xp}, each cantilever is covered by
 interferometric fringes. The fringes will distort when cantilevers are
 deflected. Estimating the deflection is done by computing this
@@ -113,19 +105,49 @@ where $x$ is the position of a pixel in its associated segment.
 
 The global method consists in two main sequences. The first one aims
 to determin the frequency $f$ of each profile with an algorithm based
-on spline interpolation (see below). It also computes the coefficient
-used for unwrapping the phase. The second one is the acquisition loop,
-while which images are taken at regular time steps. For each image,
-the phase $\theta$ of all profiles is computed to obtain, after
-unwrapping, the deflection of cantilevers.
-
-This phase computation is obviously the bottle-neck of the whole
-process. For example, if we consider the camera actually in use, an
-exposition time of 2.5ms for $1024\times 1204$ pixels seems the
-minimum that can be reached. For a $10\times 10$ cantilever array, if
-we neglect the time to extract pixels, it implies that computing the
-deflection of a single cantilever should take less than 25$Âµ$s, which is
-quite small.
+on spline interpolation (see section \ref{algo-spline}). It also
+computes the coefficient used for unwrapping the phase. The second one
+is the acquisition loop, while which images are taken at regular time
+steps. For each image, the phase $\theta$ of all profiles is computed
+to obtain, after unwrapping, the deflection of cantilevers.
+
+\subsection{Design goals}
+\label{sec:goals}
+
+If we put aside some hardware issues like the speed of the link
+between the camera and the computation unit, the time to deserialize
+pixels and to store them in memory, ... the phase computation is
+obviously the bottle-neck of the whole process. For example, if we
+consider the camera actually in use, an exposition time of 2.5ms for
+$1024\times 1204$ pixels seems the minimum that can be reached. For a
+$10\times 10$ cantilever array, if we neglect the time to extract
+pixels, it implies that computing the deflection of a single
+cantilever should take less than 25$µ$s, thus 12.5$µ$s by phase.\\
+
+In fact, this timing is a very hard constraint. Let consider a very
+small programm that initializes twenty million of doubles in memory
+and then does 1000000 cumulated sums on 20 contiguous values
+(experimental profiles have about this size). On an intel Core 2 Duo
+E6650 at 2.33GHz, this program reaches an average of 155Mflops. It
+implies that the phase computation algorithm should not take more than
+$240\times 12.5 = 1937$ floating operations. For integers, it gives
+$3000$ operations.
+
+%% to be continued ...
+
+%% à faire : timing de l'algo spline en C avec atan et tout le bordel.
+
+
+
+
+\section{Proposed solution}
+\label{sec:solus}
+
+
+\subsection{FPGA constraints}
+
+%% contraintes imposÃ©es par le FPGA : algo pipeline/parallele, pas d'op math complexe, ...
+
 
 \subsection{Considered algorithms}
 
@@ -137,7 +159,7 @@ classical least square method but suppose that frequency is already
 known.
 
 \subsubsection{Spline algorithm}
-
+\label{sec:algo-spline}
 Let consider a profile $P$, that is a segment of $M$ pixels with an
 intensity in gray levels. Let call $I(x)$ the intensity of profile in $x
 \in [0,M[$. 
@@ -228,16 +250,73 @@ computed.
 
 \end{itemize}
 
-\subsubsection{Comparison}
-
-\subsection{FPGA constraints}
-
-%% contraintes imposÃ©es par le FPGA : algo pipeline/parallele, pas d'op math complexe, ...
+Finally, the whole summarizes in an algorithm (called LSQ in the following) in two parts, one before and one during the acquisition loop :
+\begin{algorithm}[h]
+\caption{LSQ algorithm - before acquisition loop.}
+\label{alg:lsq-before}
+
+   $M \leftarrow $ number of pixels of the profile\\
+   I[] $\leftarrow $ intensities of pixels\\
+   $f \leftarrow $ frequency of the profile\\
+   $s4i \leftarrow \sum_{i=0}^{M-1} sin(4\pi f.i)$\\
+   $c4i \leftarrow \sum_{i=0}^{M-1} cos(4\pi f.i)$\\
+   $nb_s \leftarrow $ number of discretization steps of $[-\pi,\pi]$\\
+
+   \For{$i=0$ to $nb_s $}{
+     $\theta  \leftarrow -\pi + 2\pi\times \frac{i}{nb_s}$\\
+     lut\_sin[$i$] $\leftarrow sin \theta$\\
+     lut\_cos[$i$] $\leftarrow cos \theta$\\
+     lut\_A[$i$] $\leftarrow cos 2 \theta \times s4i + sin 2 \theta \times c4i$\\
+     lut\_sinfi[$i$] $\leftarrow sin (2\pi f.i)$\\
+     lut\_cosfi[$i$] $\leftarrow cos (2\pi f.i)$\\
+   }
+\end{algorithm}
+
+\begin{algorithm}[h]
+\caption{LSQ algorithm - during acquisition loop.}
+\label{alg:lsq-during}
+
+   $\bar{x} \leftarrow \frac{M-1}{2}$\\
+   $\bar{y} \leftarrow 0$, $x_{var} \leftarrow 0$, $xy_{covar} \leftarrow 0$\\
+   \For{$i=0$ to $M-1$}{
+     $\bar{y} \leftarrow \bar{y} + $ I[$i$]\\
+     $x_{var} \leftarrow x_{var} + (i-\bar{x})^2$\\
+   }
+   $\bar{y} \leftarrow \frac{\bar{y}}{M}$\\
+   \For{$i=0$ to $M-1$}{
+     $xy_{covar} \leftarrow xy_{covar} + (i-\bar{x}) \times (I[i]-\bar{y})$\\
+   }
+   $slope \leftarrow \frac{xy_{covar}}{x_{var}}$\\
+   $start \leftarrow y_{moy} - slope\times \bar{x}$\\
+   \For{$i=0$ to $M-1$}{
+     $I[i] \leftarrow I[i] - start - slope\times i$\tcc*[f]{slope removal}\\
+   }
+   
+   $I_{max} \leftarrow max_i(I[i])$, $I_{min} \leftarrow min_i(I[i])$\\
+   $amp \leftarrow \frac{I_{max}-I_{min}}{2}$\\
+
+   $Is \leftarrow 0$, $Ic \leftarrow 0$\\
+   \For{$i=0$ to $M-1$}{
+     $Is \leftarrow Is + I[i]\times $ lut\_sinfi[$i$]\\
+     $Ic \leftarrow Ic + I[i]\times $ lut\_cosfi[$i$]\\
+   }
+
+   $\theta \leftarrow -\pi$\\
+   $val_1 \leftarrow 2\times \left[ Is.\cos(\theta) + Ic.\sin(\theta) \right] - amp\times \left[ c4i.\sin(2\theta) + s4i.\cos(2\theta) \right]$\\
+   \For{$i=1-n_s$ to $n_s$}{
+     $\theta \leftarrow \frac{i.\pi}{n_s}$\\
+     $val_2 \leftarrow 2\times \left[ Is.\cos(\theta) + Ic.\sin(\theta) \right] - amp\times \left[ c4i.\sin(2\theta) + s4i.\cos(2\theta) \right]$\\
+
+     \lIf{$val_1 < 0$ et $val_2 >= 0$}{
+       $\theta_s \leftarrow \theta - \left[ \frac{val_2}{val_2-val_1}\times \frac{\pi}{n_s} \right]$\\
+     }
+     $val_1 \leftarrow val_2$\\
+   }
+
+\end{algorithm}
 
-\subsection{Least square algorithm}
 
-%% description prÃ©cise
-%% avantage sur FPGA
+\subsubsection{Comparison}
 
 \subsection{VDHL design paradigms}
 
-- 
2.39.5