add discussion file

[chloroplast13.git] / annotated.tex
diff --git a/annotated.tex b/annotated.tex

index 3a97542e12805305f0e0939e760ea57f80a3626b..46619ab9a42a64153ccd0cd2133a800e3363d40c 100644 (file)
--- a/annotated.tex
+++ b/annotated.tex
@@ -47,11 +47,11 @@ that stores annotated and/or unannotated chloroplast genomes.  We have
  considered the GenBank-NCBI \cite{Sayers01012011} database as sequence
  database:  99~genomes of chloroplasts  were retrieved.   These genomes
  lie in  the eleven type  of chloroplast families and  Table \ref{Tab2}
-summarizes their distribution in our dataset.\\
+summarizes their distribution in our dataset.
  
  \begin{figure}[h]  
    \centering
-    \includegraphics[width=0.8\textwidth]{generalView}
+    \includegraphics[width=0.75\textwidth]{generalView}
  \caption{A general overview of the annotation-based approach}\label{Fig1}
  \end{figure}
  
@@ -190,9 +190,9 @@ to extract core genes, as explained in Algorithm \ref{Alg3:thirdM}.
  \STATE $geneList=\text{empty list}$
  \STATE $common=set(dir(NCBI\_Genes)) \cap set(dir(Dogma\_Genes))$
  \FOR{$\text{gene in common}$}
-       \STATE $gen1 \leftarrow open(NCBI\_Genes(gene)).read()$         
-       \STATE $gen2 \leftarrow open(Dogma\_Genes(gene)).read()$
-       \STATE $score \leftarrow geneChk(gen1,gen2)$
+       \STATE $g1 \leftarrow open(NCBI\_Genes(gene)).read()$   
+       \STATE $g2 \leftarrow open(Dogma\_Genes(gene)).read()$
+       \STATE $score \leftarrow geneChk(g1,g2)$
         \IF {$score > Threshold$}
                 \STATE $geneList \leftarrow gene$
         \ENDIF 
@@ -210,18 +210,16 @@ geneChk subroutine.
  \caption{Find the Maximum Similarity Score between two sequences}
  \label{Alg3:genechk}
  \begin{algorithmic} 
-\REQUIRE $gen1,gen2 \leftarrow \text{NCBI gene sequence, Dogma gene sequence}$
+\REQUIRE $g1,g2 \leftarrow \text{NCBI gene sequence, Dogma gene sequence}$
  \ENSURE $\text{Maximum similarity score}$
-\STATE $Score1 \leftarrow needle(gen1,gen2)$
-\STATE $Score2 \leftarrow needle(gen1,Reverse(gen2))$
-\STATE $Score3 \leftarrow needle(gen1,Complement(gen2))$
-\STATE $Score4 \leftarrow needle(gen1,Reverse(Complement(gen2)))$
-\RETURN $max(Score1, Score2, Score3, Score4)$
+\STATE $score1 \leftarrow needle(g1,g2)$
+\STATE $score2 \leftarrow needle(g1,Reverse(g2))$
+\STATE $score3 \leftarrow needle(g1,Complement(g2))$
+\STATE $score4 \leftarrow needle(g1,Reverse(Complement(g2)))$
+\RETURN $max(score1,score2,score3,score4)$
  \end{algorithmic}
  \end{algorithm}  
  
-% THIS SUBSECTION MUST BE IMPROVED 
-
  \subsubsection{Intersection Core Matrix (\textit{ICM})}
  
  To extract  core genes, we  iteratively collect the maximum  number of
@@ -240,36 +238,25 @@ score_{ij}=\vert g_i \cap g_j\vert
  \label{Eq1}
  \end{equation}
  \noindent where $1 \leq i \leq n$, $1 \leq j \leq n$, and $g_i, g_j$ are 
-genomes. The  generation of a new  core gene depends obviously on the
-value of intersection scores $score_{ij}$:
-
-% TO BE CONTINUED
-
-$$
-\text{new Core} = 
-\begin{cases}
-\text{Ignored} & \text{if $\textit{score}=0$;} \\
-\text{new Core id} & \text{if $\textit{Score}>0$.}
-\end{cases}
-$$
-
-if     $\textit{Score}=0$     then     we    have     \textit{disjoint
-relation} \emph{i.e.},  no common genes between two  genomes.  In this
-case  the  system  ignores  the   genome  that  annul  the  core  gene
-size. Otherwise, The system removes these two genomes from ICM and add
-new  core  genome  with a  \textit{coreID}  of  them  to ICM  for  the
-calculation in  next iteration. This  process reduces the size  of ICM
-and repeats until all genomes  are treated \emph{i.e.} ICM has no more
-genomes.  We observe  that ICM is very large because  of the amount of
-data that it stores. This results  to be time and memory consuming for
-calculating  the  intersection  scores.   To  increase  the  speed  of
-calculations, it  is sufficient to  only calculate the  upper triangle
-scores. The time complexity for this process after enhancement is thus
-$O(\frac{n.(n-1)}{2})$.   Algorithm   \ref{Alg1:ICM}  illustrates  the
-construction of  the ICM matrix and  the extraction of  the core genes
-where \textit{GenomeList},  represents the database  where all genomes
-data are stored. At each iteration, it computes the maximum core genes
-with its two genomes parents.
+genomes. The  generation of a new  core gene depends  obviously on the
+value  of the  intersection scores  $score_{ij}$. More  precisely, the
+idea is  to consider a  pair of genomes  such that their score  is the
+largest element in ICM. These two genomes are then removed from the matrix
+and the  resulting new  core genome is  added for the  next iteration.
+The ICM is then updated to take into account the new core gene: new
+intersection scores are computed for it. This process is repeated until no new core
+gene can be obtained.
+
+We  can observe  that  the ICM  is very  large  due to  the amount  of
+data. As a consequence, the  computation of the intersection scores is
+both  time and  memory consuming.  However,  since ICM  is a symmetric
+matrix, we can reduce the  computation overhead by considering only its
+upper triangular  part.  The  time complexity  for this  process after
+enhancement is thus $O(\frac{n(n-1)}{2})$.  Algorithm~\ref{Alg1:ICM}
+illustrates the construction  of the ICM matrix and  the extraction of
+the  core  genes, where  \textit{GenomeList}  represents the  database
+storing all genome  data.  At each iteration, it  computes the maximum
+core genes together with its two parent genomes.
  
  % ALGORITHM HAS BEEN REWRITTEN
  
@@ -277,8 +264,8 @@ with its two genomes parents.
  \caption{Extract Maximum Intersection Score}
  \label{Alg1:ICM}
  \begin{algorithmic} 
-\REQUIRE $L \leftarrow \text{genomes vectors}$
-\ENSURE $B1 \leftarrow Max Core Vector$ 
+\REQUIRE $L \leftarrow \text{genomes sets}$
+\ENSURE $B1 \leftarrow \text{Max Core set}$ 
  \FOR{$i \leftarrow 0:len(L)-1$}
          \STATE $score \leftarrow 0$
         \STATE $core1 \leftarrow set(GenomeList[L[i]])$
@@ -308,7 +295,7 @@ names\_Accession number)}, while an edge is labeled with the number of
  lost  genes from  a leaf  genome or  an intermediate  core  gene. Such
  numbers are  very interesting because  they give an  information about
  the evolution:  how many genes  were lost between two  species whether
-they  belong  to  the  same  familie  or not.   By  the  principle  of
+they  belong  to  the  same  family  or not.   By  the  principle  of
  classification, a  small number of genes lost  among species indicates
  that those species are close to  each other and belong to same family,
  while a  large lost  means that we  have an  evolutionary relationship
@@ -332,7 +319,7 @@ to align these sequences with each others.
  \end{enumerate} 
  
  \begin{figure}[H]
-  \centering \includegraphics[width=0.8\textwidth]{Whole_system}
+  \centering \includegraphics[width=0.75\textwidth]{Whole_system}
    \caption{Overview of the pipeline}\label{wholesystem}
  \end{figure}