From 0f2f8faba6ce0dbfd8f3e8df9d31f67756cff6c1 Mon Sep 17 00:00:00 2001
From: Michel Salomon <salomon@caseb.iut-bm.univ-fcomte.fr>
Date: Tue, 19 Nov 2013 11:24:06 +0100
Subject: [PATCH] Modifications made in the two first sections

---
 abstract.tex   | 34 ++++++++++++------------
 annotated.tex  | 24 ++++++++++++++++-
 classEquiv.tex | 70 +++++++++++++++++++++++++++++++++-----------------
 intro.tex      |  7 +++++
 main.tex       | 32 ++++++++---------------
 5 files changed, 103 insertions(+), 64 deletions(-)

diff --git a/abstract.tex b/abstract.tex
index 9b3c438..3c789e3 100644
--- a/abstract.tex
+++ b/abstract.tex
@@ -1,20 +1,20 @@
 \begin{abstract}
-DNA analysis techniques have received  a lot of attention  these last 
-years,  because they play an  important role  in understanding 
-genomes evolution over time, and in phylogenetic and genetic analyses. Various models 
-of genomes evolution  are based on the analysis  of DNA sequences, SNPs, 
-mutations, and so on. We have recently investigated the use of
-core (\emph{i.e.}, common  genes) and pan genomes  to infer evolutionary information 
-on a collection of 107 chloroplasts. In particular,
-we have regarded methods to  build a genes content evolutionary tree  using  
-distances to core  genome. However,  
-the production of reliable core and pan genomes is not an easy task,
-due to error annotations of the NCBI. The presentation will then
-consist in various compared approaches to construct such a tree using
-fully annotated genomes by NCBI and Dogma, followed by a gene quality 
-control among the  common genes. We will finally explain how, by comparing 
-sequences from Dogma with NCBI contents, we achieved to identify the 
-genes that play a key role in the dynamics of genomes evolution. \\
+DNA analysis  techniques have received  a lot of attention  these last
+years, because  they play an  important role in  understanding genomes
+evolution over time, and in phylogenetic and genetic analyses. Various
+models  of  genomes  evolution  are  based  on  the  analysis  of  DNA
+sequences, SNPs,  mutations, and so on. We  have recently investigated
+the use of  core (\emph{i.e.}, common genes) and  pan genomes to infer
+evolutionary  information on  a  collection of  107 chloroplasts.   In
+particular,  we  have  regarded  methods  to  build  a  genes  content
+evolutionary  tree  using  distances  to core  genome.   However,  the
+production of reliable  core and pan genomes is not  an easy task, due
+to annotation errors.  We will first compare different approaches to
+construct such a tree using fully annotated genomes provided by NCBI and
+Dogma, followed by a gene quality control among the common genes. Then
+we  will explain  how, by  comparing  sequences from  Dogma with  NCBI
+contents, we  managed to identify the  genes that play a  key role in
+the dynamics of genomes evolution.
 
 \textbf{Keywords:} genome evolution, phylogenetic tree, core genes, evolution tree, genome annotation  
-\end{abstract}
\ No newline at end of file
+\end{abstract}
diff --git a/annotated.tex b/annotated.tex
index 762d50c..3a7fbd1 100644
--- a/annotated.tex
+++ b/annotated.tex
@@ -1,4 +1,26 @@
-The field of genome annotation pays a lot of attentions where the ability to collect and analysis genomical data can provide strong indicators for the study of life\cite{Eisen2007}. Four of genome annotation centers (such as, \textit{NCBI\cite{Sayers01012011}, Dogma \cite{RDogma}, cpBase \cite{de2002comparative}, CpGAVAS \cite{liu2012cpgavas}, and CEGMA\cite{parra2007cegma}}) present various types of annotation tools (\emph{i.e.} cost-effective sequencing methods\cite{Bakke2009}) on different annotation levels. Generally, previous studies used one of three methods for gene finding in annotated genome using these centers: \textit{alignment-based, composition based, or combination of both\cite{parra2007cegma}}. The alignment-based method is used when we try to predict a coding gene (\emph{i.e.}. genes that produce proteins) by aligning DNA sequence of gene to the protein of cDNA sequence of homology\cite{parra2007cegma}. This approach also is used in GeneWise\cite{birney2004genewise}. Composition-based method (known as \textit{ab initio}) is based on a probabilistic model of gene structure to find genes according to the gene value probability (GeneID\cite{parra2000geneid}). In this section, we consider a new method of finding core genes from large amount of chloroplast genomes, as a solution of the problem resulting from the method stated in section two. This method is based on extracting gene features. A general overview of the system is illustrated in Figure \ref{Fig1}.\\
+The field of genome annotation receives a lot of attention, since the
+ability  to collect  and analyze  genomic data  can  provide  strong
+indicators for  the study of life~\cite{Eisen2007}.   Several genome
+annotation   centers   (such  as,   \textit{NCBI\cite{Sayers01012011},
+Dogma       \cite{RDogma},       cpBase      \cite{de2002comparative},
+CpGAVAS    \cite{liu2012cpgavas},   and   CEGMA\cite{parra2007cegma}})
+present various types  of annotation tools (\emph{i.e.} cost-effective
+sequencing    methods\cite{Bakke2009})    on   different    annotation
+levels. Generally, previous studies used one of three methods for gene
+finding       in        annotated       genome       using       these
+centers: \textit{alignment-based, composition-based, or a combination of
+both\cite{parra2007cegma}}. The alignment-based method is used when we
+try  to  predict  a  coding  gene  (\emph{i.e.},  genes  that  produce
+proteins)  by aligning DNA  sequence of  gene to  the protein  of cDNA
+sequence of homology~\cite{parra2007cegma}.  This approach is also used
+in GeneWise\cite{birney2004genewise}.  Composition-based method (known
+as  \textit{ab initio})  is based  on  a probabilistic  model of  gene
+structure  to  find genes  according  to  the  gene value  probability
+(GeneID\cite{parra2000geneid}).  In this  section, we  consider  a new
+method of finding core genes from a large number of chloroplast genomes,
+as  a solution  of the  problem resulting  from the  method  stated in
+section  two. This  method is  based  on extracting  gene features.  A
+general overview of the system is illustrated in Figure~\ref{Fig1}.\\
 
 \begin{figure}[H]  
   \centering
diff --git a/classEquiv.tex b/classEquiv.tex
index 41abda4..00b227d 100644
--- a/classEquiv.tex
+++ b/classEquiv.tex
@@ -1,28 +1,50 @@
-Identifying  core genes  is important  to understand  evolutionary and
-functional phylogenies. Therefore, in this work we present two methods
-to build a  genes content evolutionary tree. More  precisely, we focus
-on   the    following   questions   considering    a   collection   of
-99~chloroplasts  annotated from  NCBI \cite{Sayers01012011} and  Dogma
-\cite{RDogma} : how can we identify the best core genome and what
-is the evolutionary scenario of these chloroplasts.
-Two methods are considered here. The first one is based on NCBI annotation, it is explained below.
-We start by the following definition.
+
+The first method, described below, considers NCBI annotations and uses
+a distance-based similarity measure. We start with the following
+preliminary definition:
+
 \begin{definition}
 \label{def1}
-Let $A=\{A,T,C,G\}$ be the nucleotides alphabet, and $A^\ast$ be the set of finite words on $A$ (\emph{i.e.}, of DNA sequences). Let $d:A^{\ast}\times A^{\ast}\rightarrow[0,1]$ be a distance on $A^{\ast}$. Consider a given value $T\in[0,1]$ called a threshold. For all $x,y\in A^{\ast}$, we will say that $x\sim_{d,T}y$ if $d(x,y)\leqslant T$. 
+Let $A=\{A,T,C,G\}$  be the nucleotides alphabet, and  $A^\ast$ be the
+set  of finite  words on  $A$  (\emph{i.e.}, of  DNA sequences).   Let
+$d:A^{\ast}\times   A^{\ast}\rightarrow[0,1]$   be   a   distance   on
+$A^{\ast}$. Consider a given value $T\in[0,1]$ called a threshold. For
+all   $x,y\in  A^{\ast}$,   we   will  say   that  $x\sim_{d,T}y$   if
+$d(x,y)\leqslant T$.
 \end{definition}
 
-\noindent$\sim_{d,T}$ is obviously an equivalence relation. When $d=1-\Delta$, where $\Delta$ is the similarity scoring function embedded into the emboss package (Needleman-Wunch released by EMBL), we will simply denote $\sim_{d,0.1}$ by $\sim$. The method starts by building an undirected graph based on
-the similarity rates $r_{ij}$  between sequences $g_{i}$ and $g_{j}$ (\emph{i.e.}, $r_{ij}=\Delta(g_{i},g_{j})$).
-In this latter, nodes are constituted by all the coding sequences of the set of genomes under consideration, and there is an edge between $g_{i}$ and $g_{j}$ if the 
-similarity rate $r_{ij}$ is
-greater than the given similarity threshold. The Connected Components
-(CC) of the ``similarity'' graph are thus computed.
-This produces an equivalence 
-relation between sequences in the same CC based on Definition~\ref{def1}.
-Any class for this relation is called ``gene'' here, where its representatives (DNA sequences) are the ``alleles'' of this gene. Thus this first method produces for each genome $G$, which is a set $\{g_{1}^G,...,g_{m_G}^G\}$ of $m_{G}$ DNA coding sequences, the projection of each sequence according to $\pi$, where $\pi$ maps each sequence
-into its gene (class) according to $\sim$. In other words, $G$ is mapped into $\{\pi(g_{1}^G),...,\pi(g_{m_G}^G)\}$.  
-Remark that a projected genome has no duplicated gene, as it is a set. The core  genome (resp. the pan genome) of $G_{1}$ and $G_{2}$ is defined thus as the intersection (resp. as the union) of these projected genomes.\\
-We then consider the intersection of all the projected genomes, which is the set of all the genes $\dot{x}$
-such that each genome has at least one allele in $\dot{x}$. The pan genome is computed similarly as the union of all the projected genomes. However such approach suffers from producing too small core genomes, 
-for any chosen similarity threshold, compared to what is usually waited by biologists regarding these chloroplasts. We are then left with the following questions: how can we improve the confidence put in the produced core? Can we thus guess the evolution scenario of these genomes?
\ No newline at end of file
+\noindent $\sim_{d,T}$ is obviously an equivalence relation and, when $d=1-\Delta$, where $\Delta$ is the similarity scoring function embedded into the EMBOSS package (Needleman--Wunsch released by EMBL), we will simply denote $\sim_{d,0.1}$ by $\sim$.
+
+The method begins by building  an undirected graph based on similarity
+rates $r_{ij}$ between DNA~sequences $g_{i}$ and $g_{j}$ (\emph{i.e.},
+$r_{ij}=\Delta\left(g_{i},g_{j}\right)$).  In this latter graph, nodes
+are  constituted by all  the coding  sequences of  the set  of genomes
+under consideration, and there is  an edge between $g_{i}$ and $g_{j}$
+if the  similarity rate  $r_{ij}$ is greater  than a  given similarity
+threshold. The  Connected Components (CC) of  the ``similarity'' graph
+are thus computed.
+
+This process also results in an equivalence relation between sequences
+in the  same CC  based on Definition~\ref{def1}.   Any class  for this
+relation   is  called   ``gene''  here,   where   its  representatives
+(DNA~sequences)  are the ``alleles''  of this  gene.  Thus  this first
+method   produces   for   each    genome   $G$,   which   is   a   set
+$\left\{g_{1}^G,\ldots,g_{m_G}^G\right\}$  of   $m_{G}$    DNA   coding
+sequences, the  projection of each sequence according  to $\pi$, where
+$\pi$ maps each sequence into its gene (class) according to $\sim$. In
+other     words,      a     genome     $G$      is     mapped     into
+$\left\{\pi(g_{1}^G),\ldots,\pi(g_{m_G}^G)\right\}$.  Note    that    a
+projected genome has no duplicated gene since it is a set.
+
+Consequently, the core  genome (resp.  the pan genome)  of two genomes
+$G_{1}$  and $G_{2}$  is defined  as  the intersection  (resp. as  the
+union) of their projected  genomes.  We then consider the intersection
+of  all the  projected genomes,  which  is the  set of  all the  genes
+$\dot{x}$  such  that   each  genome  has  at  least   one  allele  in
+$\dot{x}$. The  pan genome is computed  similarly as the  union of all
+the projected genomes. However, such an approach suffers from producing
+too small core genomes,  for any chosen similarity threshold, compared
+to   what  is   usually   expected  by   biologists  regarding   these
+chloroplasts. We are  then left with the following  questions: how can
+we improve the confidence put in  the produced core? Can we thus guess
+the evolution scenario of these genomes?
diff --git a/intro.tex b/intro.tex
index e69de29..84c9ed8 100644
--- a/intro.tex
+++ b/intro.tex
@@ -0,0 +1,7 @@
+Identifying  core genes  is important  to understand  evolutionary and
+functional phylogenies. Therefore, in  this work we present methods to
+build a genes  content evolutionary tree. More precisely,  we focus on
+the following  questions considering a  collection of 107~chloroplasts
+annotated from NCBI \cite{Sayers01012011} and Dogma \cite{RDogma}: how
+can  we identify the  best core  genome and  what is  the evolutionary
+scenario of these chloroplasts?
diff --git a/main.tex b/main.tex
index 921954b..65e70d4 100755
--- a/main.tex
+++ b/main.tex
@@ -16,30 +16,23 @@
 \usepackage{tikz}
 \usetikzlibrary{shapes,arrows}
 
-
 % correct bad hyphenation here
 \hyphenation{op-tical net-works semi-conduc-tor}
-
  
 \begin{document}
 
 \title{Finding the core-genes of Chloroplast Species}
-
-
 \author{
-Bassam AlKindy,
-Jean-Fran\c{c}ois Couchot,  
-Christophe Guyeux,
-and Michel Salomon*\\
-FEMTO-ST Institute, UMR 6174 CNRS\\
-Computer Science Laboratory DISC,
-University of Franche-Comt\'{e},
-Besan\c con, France.\\
-$*:$ Authors in alphabetic order.\\
+Bassam AlKindy\footnote{email: balkindy@femto-st.fr} \and Jean-Fran\c{c}ois Couchot 
+\and Christophe Guyeux \and Michel Salomon \and\\
+FEMTO-ST Institute, UMR 6174 CNRS, \\
+Computer Science Department DISC, \\
+University of Franche-Comt\'{e}, France \\
+{\small\itshape Authors in alphabetical order}
 }
+
 \newcommand{\JFC}[1]{\begin{color}{green}\textit{}\end{color}}
-\newcommand{\CG}[1]{\begin{color}
-{blue}\textit{}\end{color}}
+\newcommand{\CG}[1]{\begin{color}{blue}\textit{}\end{color}}
 % make the title area
 \maketitle
 
@@ -49,22 +42,17 @@ $*:$ Authors in alphabetic order.\\
 \section{Introduction}\label{sec:intro}
 \input{intro.tex}
 
-
-\section{Similarity-based Approach}
-%input 
+\section{Similarity-based approach}
 % Main author : jfc
 \input{classEquiv}
 
-
-
-\section{Annotated-based Approaches}
+\section{Annotations-based approaches}
 % Main author : bassam
 \input{annotated}
 
 % \section{Second Stage: to Find Closed Genomes}
 % \input{closedgenomes}
 
-
 \section{Conclusion}\label{sec:concl}
 
 
-- 
2.39.5