From 05b9ee01c9c471e85dd17b5a4740b30dbc418df5 Mon Sep 17 00:00:00 2001 From: bassam al-kindy Date: Thu, 24 Oct 2013 16:59:54 +0200 Subject: [PATCH] Update the section of dogma annotation --- annotated.tex | 74 ++++++++++++++++++++++++++++++--------------------- biblio.bib | 41 ++++++++++++++++++++++++++++ main.tex | 5 ++-- 3 files changed, 87 insertions(+), 33 deletions(-) diff --git a/annotated.tex b/annotated.tex index f4ed0cd..55c4324 100644 --- a/annotated.tex +++ b/annotated.tex @@ -1,4 +1,4 @@ -The field of Genome annotation pay a lot of attentions where the ability to collect and analysis genomical data can provide strong indicator for the study of life\cite{Eisen2007}. A lot of genome annotation centres present various types of annotation tools (i.e cost-effective sequencing methods\cite{Bakke2009}) on different annotation levels. Methods of gene finding in annotated genome can be categorized as: Alignment-based, composition based or combination of both\cite{parra2007cegma}. The Alignment-based method is used when we try to predict a coding gene (i.e. Genes that produce proteins) by aligning DNA sequence of gene to the protein of cDNA sequence of homolog\cite{parra2007cegma}, this approache used in GeneWise\cite{birney2004genewise} with known splicing signals. Composition-based mothod (known as \textit{ab initio} is based on a probabilistic model of gene structure to find genes and/or new genes accoding to the probility gene value, this method like GeneID\cite{parra2000geneid}. In this section, we will consider a new method of finding core genes from large amount of chloroplast genomes, as a solution of the previous method where stated in section two. This method is based on extracting gene features. The question now is how can we have good annotation genome? To answer this question, we need to focusing on studying the annotation's accuracy (systematically\cite{Bakke2009}) of the genome. 
The general overview of the system is illustrated in Figure \ref{Fig1}.\\ +The field of genome annotation receives a lot of attention, since the ability to collect and analyse genomic data can provide strong indicators for the study of life\cite{Eisen2007}. A lot of genome annotation centres present various types of annotation tools (i.e.\ cost-effective sequencing methods\cite{Bakke2009}) on different annotation levels. Methods of gene finding in annotated genomes can be categorized as alignment-based, composition-based, or a combination of both\cite{parra2007cegma}. The alignment-based method is used when we try to predict a coding gene (i.e.\ a gene that produces a protein) by aligning the DNA sequence of the gene to the protein or cDNA sequence of a homolog\cite{parra2007cegma}. This approach is also used in GeneWise\cite{birney2004genewise} with known splicing signals. The composition-based method (known as \textit{ab initio}) is based on a probabilistic model of gene structure to find genes and/or new genes according to the gene probability value; this method is used in tools like GeneID\cite{parra2000geneid}. In this section, we will consider a new method of finding core genes from a large number of chloroplast genomes, as a solution to the problems of the previous method stated in section two. This method is based on extracting gene features. The question now is: how can we obtain a good genome annotation? To answer this question, we need to focus on studying the annotation's accuracy (systematically\cite{Bakke2009}) of the genome. The general overview of the system is illustrated in Figure \ref{Fig1}.\\ \begin{figure}[H] \caption{A general overview of the system} @@ -11,15 +11,17 @@ In Figure 1, we illustrate the general overview of the system. In this system, t The output from each stage in our system will be considered to be an input to the second stage and so on. 
The rest of this section, in section 3.1, we will introduce some annotation problem with NCBI chloroplast genomes and we will discuss our method for how can we extract useful data. Section 3.2 we will present here our system for calculating evolutionary core genome based on another annotation tool than NCBI. \subsection{Genomes Samples} -In this research, we retrieved 107 genomes of Chloroplasts from NCBI where 9 genomes considered as not good. These 99 genomes lies in the 11 types of chloroplast families, divided as 11 for Algues Brunes, 3 Algue Rouges, 17 Algues Vertes, 45 Angiospermes, 3 Brypoytes, 2 Dinoflagelles, 2 Euglenes, 5 Filicophytes, 7 Gymnosperms, 2 Lycophytes, and 1 Haptophytes, as show in Table 1. +In this research, we retrieved 107 genomes of chloroplasts from NCBI, of which 9 genomes were considered as not good. These 99 genomes lie in the 11 types of chloroplast families, divided as 11 for Algues Brunes, 3 Algues Rouges, 17 Algues Vertes, 45 Angiospermes, 3 Bryophytes, 2 Dinoflagelles, 2 Euglenes, 5 Filicophytes, 7 Gymnosperms, 2 Lycophytes, and 1 Haptophytes, as shown in Table \ref{Tab1}.\pagebreak -\footnotesize % Switch from 12pt to 11pt; otherwise, table won't fit -\setlength\LTleft{-30pt} % default: \fill -\setlength\LTright{-30pt} % default: \fill +\footnotesize +\setlength\LTleft{-30pt} +\setlength\LTright{-30pt} \begin{longtable}{@{\extracolsep{\fill}}llllllllll@{}} + +\caption[NCBI Genomes Families]{List of family groups of Chloroplast Genomes from NCBI\label{Tab1}}\\ % Heading \hline\hline - Category & Counts & Accession No & Scientific Name\\ + {\textbf{Category}} & {\textbf{Counts}} & {\textbf{Accession No}} & {\textbf{Scientific Name}} \\ \hline %Entering First line & & NC\_001713.1 & Odontella sinensis \\ @@ -27,8 +29,7 @@ In this research, we retrieved 107 gen & & NC\_010772.1 & Heterosigma akashiwo \\ & & NC\_011600.1 & Vaucheria litorea \\ & & NC\_012903.1 & Aureoumbra lagunensis \\ - Algues Brunes & 
11 - & NC\_014808.1 & Thalassiosira oceanica \\ + Algues Brunes & 11 & NC\_014808.1 & Thalassiosira oceanica \\ & & NC\_015403.1 & Fistulifera sp \\ & & NC\_016731.1 & Synedra acus \\ & & NC\_016735.1 & Fucus vesiculosus \\ @@ -36,8 +37,7 @@ In this research, we retrieved 107 genomes of Chloroplasts from NCBI where 9 gen & & NC\_020014.1 & Nannochloropsis gadtina \\ [1ex] %Entering second group & & NC\_000925.1 & Porphyra purpurea \\ - Algues Rouges & 3 - & NC\_001840.1 & Cyanidium caldarium \\ + Algues Rouges & 3 & NC\_001840.1 & Cyanidium caldarium \\ & & NC\_006137.1 & Gracilaria tenuistipitata \\ [1ex] %Entering third group & & NC\_000927.1 & Nephroselmis olivacea \\ @@ -48,8 +48,7 @@ In this research, we retrieved 107 genomes of Chloroplasts from NCBI where 9 gen & & NC\_008114.1 & Pseudoclonium akinetum \\ & & NC\_008289.1 & Ostreococcus tauri \\ & & NC\_008372.1 & Stigeoclonium helveticum \\ - Algues Vertes & 17 - & NC\_008822.1 & Chlorokybus atmophyticus \\ + Algues Vertes & 17 & NC\_008822.1 & Chlorokybus atmophyticus \\ & & NC\_011031.1 & Oedogonium cardiacum \\ & & NC\_012097.1 & Pycnococcus provaseolii \\ & & NC\_012099.1 & Pyramimonas parkeae \\ @@ -60,8 +59,7 @@ In this research, we retrieved 107 genomes of Chloroplasts from NCBI where 9 gen & & NC\_016733.1 & Pedinomonas minor \\ [1ex] %Entering fourth group & & NC\_001319.1 & Marchantia polymorpha \\ - Bryophytes & 3 - & NC\_004543.1 & Anthoceros formosae \\ + Bryophytes & 3 & NC\_004543.1 & Anthoceros formosae \\ & & NC\_005087.1 & Physcomitrella patens \\ [1ex] %Entering fifth group & & NC\_014267.1 & Kryptoperidinium foliaceum \\ @@ -69,13 +67,11 @@ In this research, we retrieved 107 genomes of Chloroplasts from NCBI where 9 gen & NC\_014287.1 & Durinskia baltica \\ [1ex] %Entering sixth group & & NC\_001603.2 & Euglena gracilis \\ - Euglenes & 2 - & NC\_020018.1 & Monomorphina aenigmatica \\ [1ex] + Euglenes & 2 & NC\_020018.1 & Monomorphina aenigmatica \\ [1ex] %Entering seventh group & & 
NC\_003386.1 & Psilotum nudum \\ - & & NC\_008829.1 & Angiopteris evecta \\ [1ex] - Filicophytes & 5 - & NC\_014348.1 & Pteridium aquilinum \\ + & & NC\_008829.1 & Angiopteris evecta \\ + Filicophytes & 5 & NC\_014348.1 & Pteridium aquilinum \\ & & NC\_014699.1 & Equisetum arvense \\ & & NC\_017006.1 & Mankyua chejuensis \\ [1ex] % Entering eighth group @@ -96,8 +92,7 @@ In this research, we retrieved 107 gen & & NC\_009601.1 & Dioscorea elephantipes \\ & & NC\_009765.1 & Cuscuta gronovii \\ & & NC\_009808.1 & Ipomea purpurea \\ - Angiospermes & 45 - & NC\_010361.1 & Oenothera biennis \\ + Angiospermes & 45 & NC\_010361.1 & Oenothera biennis \\ & & NC\_010433.1 & Manihot esculenta \\ & & NC\_010442.1 & Trachelium caeruleum \\ & & NC\_013707.2 & Olea europea \\ @@ -127,7 +122,12 @@ In this research, we retrieved 107 gen Gymnosperms & 7 & NC\_016063.1 & Cephalotaxus wilsoniana \\ & & NC\_016065.1 & Taiwania cryptomerioides \\ & & NC\_016069.1 & Picea morrisonicola \\ - & & NC\_016986.1 & Gingko biloba \\ + & & NC\_016986.1 & Ginkgo biloba \\ [1ex] + %Entering tenth group + Haptophytes & 1 & NC\_007288.1 & Emiliania huxleyi \\ [1ex] + %Entering eleventh group + Lycophytes & 2 & NC\_014675.1 & Isoetes flaccida \\ + & & NC\_006861.1 & Huperzia lucidula \\ \hline \end{longtable} @@ -138,8 +138,8 @@ With NCBI, the idea is to use the existing annotations of NCBI with chloroplast The trivial and simple idea to construct the core genome is based on the extraction of Genes names (as gene presence or absence). For instant, in this stage neither sequence comparison nor new annotation were made, we just want to extract all gene counts stored in each chloroplast genome then find the intersection core genes based on gene names.\\ \textbf{Step I: pre-processing}\\ -The objective from this step is to organize, solve genes duplications, and generate sets of genes for each genome. 
The input to the system is a list genomes from NCBI stored as a \textit{.fasta} file that include a collection of coding genes\cite{parra2007cegma}(genes that produce protein) with its coding sequences. -As a preparation step to achieve the set of core genes, we need to translate these genomes and extracting all information needed to find the core genes. This is not an easy job. The output from this operation is a lists of genes stored in a local database for genomes, their genes names and genes counts. In this stage, we will accumulate some Gene duplications with each genome treated. In other words, duplication in gene name can comes from genes fragments as long as chloroplast DNA sequences. Identical state, which it is the state that each gene present only one time in a genome (i.e Gene has no copy) without considering the position or gene orientation can be reached by filtering the database from redundant gene name. To do this, we have two solutions: first, we made an orthography checking. Orthography checking is used to merge fragments of a gene to be one gene so that we can solve a duplication. +The objective of this step is to organize the data, resolve gene duplications, and generate sets of genes for each genome. The input to the system is a list of genomes from NCBI stored as a \textit{.fasta} file that includes a collection of protein-coding genes\cite{parra2007cegma,RDogma} (genes that produce proteins) with their coding sequences. +As a preparation step to obtain the set of core genes, we need to translate these genomes using the \textit{BioPython} package\cite{chapman2000biopython} and extract all information needed to find the core genes. The process starts by converting each genome from fasta format to the GenVision\cite{geneVision} format from DNASTAR, which is not an easy job. The output of this operation is a list of genes stored in a local database for genomes, together with their gene names and gene counts. 
In this stage, we will accumulate some Gene duplications with each genome treated. In other words, duplication in gene name can comes from genes fragments as long as chloroplast DNA sequences. Identical state, which it is the state that each gene present only one time in a genome (i.e Gene has no copy) without considering the position or gene orientation can be reached by filtering the database from redundant gene name. To do this, we have two solutions: first, we made an orthography checking. Orthography checking is used to merge fragments of a gene to be one gene so that we can solve a duplication. Second, we convert the list of genes names for each genome (i.e. after orthography check) in the database to be a set of genes names. Mathematically speaking, if $g=\left[g_1,g_2,g_3,g_1,g_3,g_4\right]$ is a list of genes names, by using the definition of a set in mathematics, we will have $set(g)=\{g_1,g_2,g_3,g_4\}$, where each gene represented only ones. With NCBI genomes, we do not have a problem of genes fragments because they already treated it, but there are a problem of genes orthography. This can generate the problem of gene lost in our method and effect in turn the core genes. The whole process of extracting core genome based on genes names and counts among genomes is illustrate in Figure \ref{Fig2}. @@ -190,29 +190,43 @@ The Algorithm of construction the matrix and extracting maximum core genes where \textit{GenomeList} represents the database.\\ \textbf{Step III: Drawing the Tree}\\ -The main objective here is to the results for visualizing a tree of evolution. We use here a directed graph from Dot graph package\cite{gansner2002drawing} from Graphviz library. The system can produce this tree automatically by using all information available in a database. Core genes generated with their genes can be very important information in the tree, because they can be represented as an ancestor information for two genomes or more. 
Further more, each node represents a genome or a core as \textit{(Genes count:Family name,Scientific name,Accession number)}, Edges represent the number of lost genes from genomes-core or core-core relationship. The number of lost genes here can be an important factor for evolution, it represents how much lost of genes for the species in same or different families. By the principle of classification, small genes lost among species can say that these species are closely together and belongs to same family, while big genes lost means that species is far to be in the same family. To see the picture clearly, Phylogenetic tree is an evolutionary tree generated also by the system. Generating this tree is based on the distances among genes sequences. There are many resources to build such tree (for example: PHYML\cite{guindon2005phyml}, RAxML{\cite{stamatakis2008raxml,stamatakis2005raxml}, BioNJ , and TNT\cite{goloboff2008tnt}}. We consider to use RAxML\cite{stamatakis2008raxml,stamatakis2005raxml} to generate this tree. +The main objective here is to use the results for visualizing a tree of evolution. We use here a directed graph from the Dot graph package\cite{gansner2002drawing} of the Graphviz library. The system can produce this tree automatically by using all information available in a database. Core genes generated with their genes can be very important information in the tree, because they can be represented as ancestor information for two genomes or more. Furthermore, each node represents a genome or core as \textit{(Genes count:Family name, Scientific names, Accession number)}, and edges represent the numbers of lost genes in genome-core or core-core relationships. The number of lost genes here can be an important factor for evolution: it represents how many genes were lost for the species in the same or different families. 
By the principle of classification, a small number of lost genes among species indicates that those species are closely related and belong to the same family, while a large number of lost genes means that the species are unlikely to be in the same family. To see the picture clearly, a phylogenetic tree, which is an evolutionary tree, is also generated by the system. Generating this tree is based on the distances among gene sequences. There are many resources to build such a tree (for example: PHYML\cite{guindon2005phyml}, RAxML\cite{stamatakis2008raxml,stamatakis2005raxml}, BioNJ, and TNT\cite{goloboff2008tnt}). We consider using RAxML\cite{stamatakis2008raxml,stamatakis2005raxml} to generate this tree. The main drawback from this method is that we can not depending only on genes names because of three causes: first, the genome may have not totally named (This can be found in early versions of NCBI genomes), so we will have some lost sequences. Second, we may have two genes sharing the same name, while their sequences are different. Third, we need to annotate all the genomes. \subsubsection{Extracting Core genome from NCBI gene contents} - +{to do later} \subsection{Core genes from Dogma Annotation tool} +In the previous section, extracting core genes based on the NCBI annotation caused some loss of genes due to the annotation process of NCBI. The annotation tool plays an important role in these losses, because it represents the first stage of gene identification. Building a good annotation tool is still a challenging subject. Genis Parra published a paper in 2007 stating that accurate genome and/or gene annotation is still an open problem: even in the best-case scenario, where a project has all the expert biologist resources needed to annotate gene structures, the catalogues of genes can still be unclear and less accurate than experts. The authors of \cite{Bakke2009} also stated: ``Errors in the annotations are routinely deposited in databases such as NCBI and used to validate subsequent annotation errors.'' 
So, good core genes still need a good annotation tool. Many software tools have been developed to extract core genes for eukaryotic and prokaryotic organisms, such as CEGMA\cite{parra2007cegma}, CoreGenes 3.0\cite{zafar2002coregenes}, and Dogma\cite{RDogma}. The most appropriate annotation tool for chloroplast and mitochondrial genomes is Dogma. + +\subsubsection{Why Dogma rather than NCBI annotation?} +Dogma is an annotation tool developed at the University of Texas in 2004\cite{RDogma}. Dogma is an abbreviation of \textit{Dual Organellar GenoMe Annotator}\cite{RDogma}, for plant chloroplast and animal mitochondrial genomes. +It translates the input genome in all six reading frames and queries its own amino acid sequence database using Blast\cite{altschul1990basic} (i.e.\ Blastx) with various parameters, in order to identify protein-coding genes\cite{parra2007cegma,RDogma} in the input genome based on sequence similarity with genes in the Dogma database. Furthermore, it can produce the \textit{Transfer RNAs (tRNA)}\cite{RDogma} and the \textit{Ribosomal RNAs (rRNA)}\cite{RDogma} and verify their start and end positions, rather than the NCBI annotation tool. + +\subsubsection{Core genes based on Dogma Genes names and counts} +The method for extracting core genes using the Dogma annotation can be more reliable and trustworthy than the one using NCBI. This is because the NCBI annotation carries some annotation errors\cite{Bakke2009}. The method of extracting core genes can be summarized in the following steps:\\ + +\textbf{Step I: Annotate Genomes}\\ + +In this step, we annotate the same population of chloroplast genomes; this is done manually for the 99 genomes. The output from the Dogma annotation website is a file containing the collection of coding genes for each genome, in GenVision\cite{geneVision} file format.\\ + +\textbf{Step II: Genes Defragmentation}\\ +As a result of the annotation, Dogma produces coding genes with fragments. These fragments are not considered as gene duplications. 
-\subsubsection{Core genes based on Genes names and count} \begin{figure}[H] -\caption{Extracting Core genome based on Gene Name} +\caption{Core genome based on Dogma Gene Name and count} \centering \includegraphics[width=0.7\textwidth]{Dogma_GeneName} \end{figure} \subsubsection{Core genome from Dogma gene contents} - +[To do Later] \begin{figure}[H] -\caption{Extracting Core genome based on Gene Name} +\caption{Core genes based on the comparison of Dogma Genes Sequences} \centering \includegraphics[width=0.7\textwidth]{Dogma_GeneContent} \end{figure} \ No newline at end of file diff --git a/biblio.bib b/biblio.bib index 92db33a..e349702 100644 --- a/biblio.bib +++ b/biblio.bib @@ -53,6 +53,18 @@ DOI={10.1089/cmb.2010.0092} year={2004}, publisher={Cold Spring Harbor Lab} } + +@article{chapman2000biopython, + title={Biopython: Python tools for computational biology}, + author={Chapman, Brad and Chang, Jeffrey}, + journal={ACM SIGBIO Newsletter}, + volume={20}, + number={2}, + pages={15--19}, + year={2000}, + publisher={ACM} +} + @incollection{FI09, year={2009}, isbn={978-3-642-04743-5}, @@ -91,6 +103,18 @@ pages={1-12} number = {12}, doi = {10.1371/journal.pone.0052841} } + +@article{zafar2002coregenes, + title={CoreGenes: A computational tool for identifying and cataloging ``core'' genes in a set of small genomes}, + author={Zafar, Nikhat and Mazumder, Raja and Seto, Donald}, + journal={BMC bioinformatics}, + volume={3}, + number={1}, + pages={12}, + year={2002}, + publisher={BioMed Central Ltd} +} + @Article{17623808, AUTHOR = {Gomez-Valero, Laura and Rocha, Eduardo P C and Latorre, Amparo and Silva, Francisco J}, TITLE = {Reconstructing the ancestor of Mycobacterium leprae: the dynamics of gene loss and genome reduction.}, @@ -186,3 +210,20 @@ URL={http://www.biosci.utexas.edu/ib/faculty/jansen/pubs/Wyman%20et%20al.%202004 number = {7}, doi = {10.1371/journal.pone.0006291} } + +@article{altschul1990basic, + title={Basic local alignment search tool}, + author={Altschul, Stephen F and Gish, Warren and 
Miller, Webb and Myers, Eugene W and Lipman, David J}, + journal={Journal of molecular biology}, + volume={215}, + number={3}, + pages={403--410}, + year={1990}, + publisher={Elsevier} +} + +@misc{geneVision, + title={DNASTAR -- GenVision Software for Genomic Visualizations}, + author={{DNASTAR}}, + url = {http://www.dnastar.com/products/genvision.php} +} \ No newline at end of file diff --git a/main.tex b/main.tex index 6a6c3d5..611fff1 100755 --- a/main.tex +++ b/main.tex @@ -1,4 +1,4 @@ -\documentclass[12pt]{article} +\documentclass{article} \usepackage{subfig} \usepackage{color} \usepackage{graphicx} @@ -7,7 +7,6 @@ \usepackage{algorithm} \usepackage{algorithmic} \usepackage{longtable} -\usepackage{pdflscape} % correct bad hyphenation here @@ -28,7 +27,7 @@ FEMTO-ST Institute, UMR 6174 CNRS\\ Computer Science Laboratory DISC, University of Franche-Comt\'{e}, Besan\c con, France.\\ - $*:$ Authors in alphabetic order.\\ +$*:$ Authors in alphabetic order.\\ } \newcommand{\JFC}[1]{\begin{color}{green}\textit{}\end{color}} \newcommand{\CG}[1]{\begin{color} -- 2.39.5