Paper2/discussion.tex

   1
   2
   3
   4
   5 %\subsection{Implementation}
   6 %\label{sec:implem}
   7 %All the  algorithms detailed in this article have
   8 %been implemented using  Python~2.7 on a personal computer (Ubuntu~12.04 with 6~GiB memory,  quad-core Intel~i5 with  an operating  frequency of
   9 %2.5~GHz). %All programs can be downloaded at \begin{color}{red} \url{http://......} \end{color}.
  10 %%genes  from large  amount of  chloroplast  genomes.
  11 %
  12 %%\begin{center}
  13 %\begin{table}[H]
  14 %\centering
  15 %\caption{Type of annotations and execution time}\label{Etime}
  16 %{%\scriptsize
  17 %%\begin{tabular}{p{2.3cm}p{0.5cm}p{0.25cm}p{0.5cm}p{0.25cm}p{0.5cm}p{0.25cm}}%p{0.5cm}p{0.25cm}p{0.5cm}p{0.2cm}}
  18 %\begin{tabular}{ccccccc}
  19 %\hline\hline
  20 % Method & \multicolumn{2}{c}{Annotation} & \multicolumn{2}{c}{Features} & \multicolumn{2}{c}{Exec. time (min.)} \\%& \multicolumn{2}{c}{Core genes} & \multicolumn{2}{c}{Bad genomes} \\
  21 %~ & N & D & Name & Seq & N & D \\%& N & D & N & D \\
  22 %\hline
  23 %First approach & $\surd$ & - & - & $\surd$ & 1.7 & -\\% & ? & - & 0 & -\\[0.5ex]
  24 %Second approach & $\surd$ & $\surd$ & $\surd$ & - & 4.98 & 1.52\\% & 28 & 10 & 1 & 0\\[0.5ex]
  25 %Third approach & $\surd$ & $\surd$ & $\surd$ & $\surd$ & \multicolumn{2}{c}{$\simeq$3 days + 1.29} \\%& \multicolumn{2}{c}{4} & \multicolumn{2}{c}{1}\\[1ex]
  26 %\hline
  27 %\end{tabular}
  28 %}
  29 %\end{table}
  30 %%\end{center}
  31 %
  32 %%\vspace{-1cm}
  33 %
  34 %Table~\ref{Etime}  presents   the  annotation  type,
  35 %execution time,  and the  number of core  genes  for  each proposed  method. The following
  36 %notations have been used:  \textbf{N}  denotes NCBI and  \textbf{D} means  DOGMA,
  37 %while \textbf{Seq}  stands for sequence. The  two first {\it Annotation} columns
  38 %represent the algorithm used to annotate chloroplast genomes. The next two {\it
  39 %Features} columns mean  the kind  of gene feature used to extract core
  40 %genes: gene name, gene sequence, or  both of them.
  41 %
  42 %It can be seen that
  43 %almost all methods need low execution time  to extract core genes
  44 %from the large set of chloroplast genomes. Only the third method requires
  45 %more than one day of computation (about 3-4 days) for sequence comparisons. However,
  46 %once the quality genomes are well constructed, it only takes 1.29~minutes to
  47 %extract the core genes. Such low execution times allow us  to use these
  48 %methods to extract all core genomes  on a personal computer.
  49 %The lowest execution time (1.52~minutes)
  50 %is obtained with the second method using DOGMA annotations.
  51 %
  52 %
  53 %The second important computational factor is the amount of memory necessary for each
  54 %methodology.   Table~\ref{mem}  shows  the  memory   usage  of  each
  55 %method. In this table, the values are  presented in megabyte
  56 %unit, while \textit{gV} means  geneVision~file~format. We can notice that
  57 %the quantity  of required memory  is relatively low  for all methods,
  58 %and is available  on any personal computer. The  different values also
  59 %show that the gene features  method based on DOGMA annotations has the
  60 %most   reasonable   memory   usage,   except  when   extracting   core
  61 %sequences. The third method gives the lowest values if we already have
  62 %the   ``quality   genomes'',   otherwise   it  will   consume   far   more
  63 %memory. Remark that the  amount of memory  used by the third method also
  64 %depends on the size of each genome.
  65 %
  66 %
  67 %\begin{table}[H]
  68 %\centering
  69 %\caption{Memory usages for each methodology (in MB)}\label{mem}
  70 %\tabcolsep=0.11cm
  71 %{\scriptsize
  72 %\begin{tabular}{p{2.5cm}@{\hskip 0.1mm}p{1.5cm}@{\hskip 0.1mm}p{1cm}@{\hskip 0.1mm}p{1cm}@{\hskip 0.1mm}p{1cm}@{\hskip 0.1mm}p{1cm}@{\hskip 0.1mm}p{1cm}@{\hskip 0.1mm}p{1cm}}
  73 %\hline\hline
  74 %Method& & Load Gen. & Conv. gV & Read gV & ICM & Core tree & Core Seq. \\
  75 %\hline
  76 %Gene prediction & NCBI & 108 & - & - & - & - & -\\
  77 %\multirow{2}{*}{Gene Features} & NCBI & 15.4 & 18.9 & 17.5 & 18 & 18 & 28.1\\
  78 %              & DOGMA& 15.3 & 15.3 & 16.8 & 17.8 & 17.9 & 31.2\\
  79 %Gene Quality  & ~ & 15.3 & $\le$3G & 16.1 & 17 & 17.1 & 24.4\\
  80 %\hline
  81 %\end{tabular}
  82 %}
  83 %\end{table}
  84 %
  85 %
  86 %\subsection{Results comparison}
  87 %
  88 %Method 2 has indicated to us that two genomes must be removed from the
  89 %set of chloroplasts, namely \textit{Epifagus virginiana} (NC\_001568.1)
  90 %and \textit{Cuscuta gronovii} (NC\_009765.1). The reason to
  91 %be of this update is that (1) these chloroplastic genomes are non functional ones,
  92 %and (2) considering them leads to a too small final core genome.
  93 %Additionally, we have been forced to remove \textit{NC\_012568.1  Micromonas pusilla}
  94 %from the NCBI study, as its wrong annotations lead to an empty
  95 %final core genome.
  96 %
  97 %The number
  98 %of {\it  Core genes} in Table~\ref{Etime} represents the  amount of genes in  the last core
  99 %genome (the core genes shared by all the chloroplasts).
 100 %%The main goal is to  find the maximum core genes that simulate
 101 %%biological background of chloroplasts.
 102 %With  NCBI we obtained 28 genes for
 103 %96   genomes, while DOGMA approach produces 10   genes for  the whole 97  genomes.
 104 % However we will see that the distribution of  genomes
 105 %in the NCBI core tree is less relevant, biologically speaking, than the one obtained
 106 %by using DOGMA naming process (see Section~\ref{sec:discuss}).
 107 %
 108 %%\begin{sidewaystable}
 109 %\begin{table}
 110 %\centering
 111 %    \begin{tabular}{llllllllll}
 112 %    \hline
 113 %    Method         & \multicolumn{2}{l}{Connected Components} & \multicolumn{6}{l}{ICM-Genes' names} & ICM-Quality test \\ \hline
 114 %    Annotation     & NCBI                 & DOGMA & \multicolumn{3}{l}{NCBI} & \multicolumn{3}{l}{DOGMA} & NCBI and DOGMA   \\
 115 %    Nb. of genomes & 99                   & 99    & 99*              & 97  & 96  & 99    & 97  & 96  & 99               \\
 116 %    Core genome    & 5                    & 3     & 9                & 0   & 28  & 2     & 10  & 28  & 5                \\
 117 %    Pan genome     & 761                  & 445   & 766              & 764 & 737 & 297   & 297 & 297 & 245              \\ \hline
 118 %    \end{tabular}
 119 %\end{table}
 120 %
  121 %% Please remember to add \usepackage{multirow} to your document preamble in order to support multirow cells
 122 %\begin{table}[h]
 123 %\begin{tabular}{ccccl}
 124 %\hline
 125 %Nb. of       & Methods                   & Type of & Size of & \multicolumn{1}{c}{Names of core genes}                                                                                                                                                              \\
 126 % genomes&&annotation&core genome&\\ \hline
 127 %\multirow{3}{*}{97}  & \multirow{2}{*}{Method 2} & NCBI               & 0                   & -                                                                                                                                                                                                    \\
 128 %\multicolumn{1}{l}{} & \multicolumn{1}{l}{}      & DOGMA              & 10                  & ATPI, PSAA, PSAB, PSBA,  \\
 129 %&&&&PSBE, PSBF, PSBL, RPL2,\\
 130 %&&&& TRNC-GCA, TRNH-GUG \\
 131 %\multicolumn{1}{l}{} & Method 3                  & Both    & 5                   & ATPI, ATPA, ATPH, PSBJ, PSBE                                                                                                                                                                         \\
 132 %\multirow{3}{*}{96}  & \multirow{2}{*}{Method 2} & NCBI               & 28                  & ATPA, ATPB, ATPE, ATPH,\\
 133 %&&&& ATPI, PETG, PSAA, PSAB,\\
 134 %&&&& PSAC, PSBA, PSBD, PSBE,\\
 135 %&&&& PSBF, PSBH, PSBJ, PSBK,\\
 136 %&&&& PSBN, RBCL, RPL14, RPL16,\\
 137 %&&&& RPL20, RPL36, RPS18, RPS3,\\
 138 %&&&& RPS4, RPS7, RPS8, RPS11                         \\
 139 %\multicolumn{1}{l}{} &  & DOGMA & 28                  & ATPA, ATPB, ATPI, ATPH, PETB, PETG, PSAA, PSAB, PSAC, PSBA, PSBD, PSBE, PSBF, PSBC, PSBJ, PSBI, PSBL, PSBT, RBCL, RPL2, RRN16, TRND-GUC, TRNFM-CAU, TRNH-GUG, TRNI-GAU, TRNN-GUU, TRNQ-UUG, TRNC-GCA \\
 140 %\multicolumn{1}{l}{} & \multicolumn{1}{l}{Method 3}      & Both     & 5                   & ATPI, ATPA, ATPH, PSBJ, PSBE                                                                                                                                                                         \\ \hline
 141 %\end{tabular}
 142 %\end{table}
 143 %%\end{sidewaystable}
 144
 145
 146
 147 \subsection{Biological evaluation}\label{sec:discuss}
  148 It is well known that the first plants' endosymbiosis resulted in a great diversification of
 149 lineages comprising \textit{Red Algae}, \textit{Green Algae}, and \textit{Land Plants} (terrestrial).
  150 Several secondary endosymbioses then occurred: two involving a \textit{Red Algae}
 151 and other heterotrophic eucaryotes and giving birth to both \textit{Brown Algae}
 152 and \textit{Dinoflagellates} lineages; another involving a \textit{Green Algae} and
 153 a heterotrophic eucaryote and giving birth to \textit{Euglens}~\cite{mcfadden2001primary}.
 154
 155 The interesting point with the produced core trees (especially the one
 156 obtained with DOGMA, see \url{http://members.femto-st.fr/christophe-guyeux/en/chloroplasts}) is
 157 that organisms resulting from the first endosymbiosis are distributed in
 158 each of the lineages found in the chloroplast genome structure
 159 evolution. More precisely, all \textit{Red Algae} chloroplasts are grouped together in one lineage, while
 160 \textit{Green Algae} and \textit{Land Plants} chloroplasts are all in a second lineage.
  161 Furthermore, organisms resulting from the secondary endosymbioses are well localized in
 162 the tree: both the chloroplasts of \textit{Brown Algae} and \textit{Dinoflagellates}
 163 representatives are found exclusively in the lineage also comprising the
 164 \textit{Red Algae} chloroplasts from which they evolved, while the \textit{Euglens}
 165 chloroplasts are related to the \textit{Green Algae} chloroplasts from which they
 166 evolved.
 167 This makes sense in terms of biology, history of lineages, and
  168 theories of chloroplast origins (and thus of photosynthetic ability) in
 169 different Eucaryotic lineages~\cite{mcfadden2001primary}.
 170
 171 Interestingly, the sole organisms under consideration that possess a
 172 chloroplast (and so a chloroplastic genome) but that have lost the
  173 photosynthetic ability (being parasitic plants) are found at the base of
 174 the tree, and not together with their phylogenetically related species.
  175 This means that functional chloroplast genes are evolutionarily constrained
  176 when used in the photosynthetic process, but rapidly lose their efficiency
  177 when not used, as recently observed for a species of Angiosperms~\cite{li2013complete}.
  178 These species are \textit{Cuscuta gronovii}, an Angiosperm (flowering plant)
  179 at the base of the DOGMA Angiosperm-Conifers branch, and
  180 \textit{Epifagus virginiana}, also an Angiosperm, at the very base of this tree.
 181
  182 Another interesting result is that \textit{Land Plants}, which
  183 represent a single sublineage originating from the large and diverse
  184 lineage of \textit{Green Algae} in the history of Eucaryotes, are present in two different
 185 branches of the DOGMA tree, both associated with \textit{Green Algae}: one branch
 186 comprising the basal grade of \textit{Land Plants} (mosses and ferns) and the second one
 187 containing the most internal lineages of \textit{Land Plants} (Conifers and flowering plants).
 188 But independently of their split in two distinct branches of the DOGMA
 189 tree, the \textit{Land Plants} always show a higher number of functional genes in
 190 their chloroplasts than the \textit{Green Algae} from which they emerged, probably meaning that the
 191 terrestrial way of life necessitates more functional genes for an
 192 optimal photosynthesis than the marine one. However, a more detailed
 193 analysis of selected genes is necessary to better understand the reasons why
 194 such a distribution has been obtained.
  195 Finally, note that all these biologically interesting results are apparent
 196 only in the core tree based on DOGMA, while they are not so obvious in the NCBI one.
 197
 198
 199 %\begin{figure}
 200 %\centering
 201 %\includegraphics[scale=0.37]{core}
 202 %\caption{Core}
 203 %\end{figure}
 204 %
 205 %
 206 %\begin{figure}
 207 %\centering
 208 %\includegraphics[scale=0.37]{pan}
 209 %\caption{Pan}
 210 %\end{figure}