• Facebook
  • Twitter
  • Reddit
  • StumbleUpon
  • Digg
  • email

"""Provide a (semi) automated mapping between GenBank and ontologies.
 
The goal is to provide an ontology namespace and term for each item
of a standard GenBank file.
"""
from __future__ import with_statement
import os
import collections
import csv
 
from rdflib.Graph import Graph
#from Mgh.SemanticLiterature import sparta
import sparta
 
def main(ft_file, so_ft_map_file, so_file, dc_file, rdfs_file):
    # -- get all of the keys we need to map from GenBank
    # pulled by hand from the Biopython parser
    header_keys = ['ACCESSION', 'AUTHORS', 'COMMENT', 'CONSRTM', 'DBLINK',
            'DBSOURCE', 'DEFINITION', 'JOURNAL', 'KEYWORDS', 'MEDLINE', 'NID',
            'ORGANISM', 'PID', 'PROJECT', 'PUBMED',
            'REMARK', 'SEGMENT', 'SOURCE', 'TITLE', 'VERSION',
            'VERSION;gi', 'LOCUS;date']
    feature_keys, qual_keys = parse_feature_table(ft_file)
 
    # -- terms to map them two, along with dictionaries for the existing or
    # hand defined mappings
    dc_terms = parse_dc_terms(dc_file)
    dc_ontology = OntologyGroup("http://purl.org/dc/terms/", dc_terms)
    so_terms = parse_so_terms(so_file)
    so_ontology = OntologyGroup("http://purl.org/obo/owl/SO", so_terms)
    so_ft_map = parse_so_ft_map(so_ft_map_file)
    so_ontology.add_map('feature', 
            'http://www.sequenceontology.org/mappings/FT_SO_map.txt', so_ft_map)
    rdfs_terms = parse_rdfs_terms(rdfs_file)
    rdfs_ontology = OntologyGroup('http://www.w3.org/2000/01/rdf-schema#',
            rdfs_terms)
    me_ft_map = dict(
            rep_origin = 'origin_of_replication',
            unsure = 'sequence_uncertainty',
            conflict = 'sequence_conflict',
            GC_signal = 'GC_rich_promoter_region',
            mat_peptide = 'mature_protein_region',
            C_region = 'C_cluster',
            J_segment = 'J_gene',
            N_region = '',
            S_region = '',
            V_region = 'V_cluster',
            V_segment = 'V_gene'
            )
    me_ft_dc_map = dict(
            old_sequence = 'replaces')
    me_hd_map = {'ACCESSION': 'databank_entry'}
    me_hd_dc_map = {'DEFINITION' : 'description',
                    'VERSION' : 'hasVersion',
                    'VERSION;gi' : 'identifier',
                    'KEYWORDS' : 'subject',
                    'LOCUS;date' : 'created',
                    'PUBMED' : 'relation',
                    'JOURNAL' : 'source',
                    'AUTHORS' : 'contributor',
                    'CONSRTM' : 'creator',
                    'ORGANISM' : '',
                    'SEGMENT' : 'isPartOf',
                    'DBLINK' : 'relation',
                    'DBSOURCE' : 'relation',
                    'MEDLINE' : 'relation',
                    'NID' : 'relation',
                    'PID' : 'relation',
                    'PROJECT' : 'relation'}
    me_ql_map = dict(
            bio_material = 'biomaterial_region',
            bound_moiety = 'bound_by_factor',
            codon_start = 'coding_start',
            direction = 'direction_attribute',
            experiment = 'experimental_result_region',
            macronuclear = 'macronuclear_sequence',
            map = 'fragment_assembly',
            mobile_element = 'integrated_mobile_genetic_element',
            mod_base = 'modified_base_site',
            mol_type = 'sequence_attribute',
            ncRNA_class = 'ncRNA',
            organelle = 'organelle_sequence',
            PCR_primers = 'primer',
            proviral = 'proviral_region',
            pseudo = 'pseudogene',
            rearranged = 'rearranged_at_DNA_level',
            satellite = 'satellite_DNA',
            segment = 'gene_segment',
            rpt_family = 'repeat_family',
            rpt_type = 'repeat_unit',
            rpt_unit_range = 'repeat_region',
            rpt_unit_seq = 'repeat_component',
            tag_peptide = 'cleaved_peptide_region',
            trans_splicing = 'trans_spliced',
            translation = 'polypeptide',
            )
    me_ql_dc_map = dict(
            citation = 'bibliographicCitation',
            gene_synonym = 'alternative',
            identified_by = 'creator',
            label = 'alternative',
            locus_tag = 'alternative',
            old_locus_tag = 'replaces',
            replace = 'isReplacedBy',
            db_xref = 'relation',
            compare = 'relation',
            EC_number = 'relation',
            protein_id = 'relation',
            product = 'alternative',
            standard_name = 'alternative',
            number = 'coverage',
            function = 'description',
            )
    me_ql_rdfs_map = dict(
            note = 'comment',
            )
    ql_make_no_sense_list = ['number']
    so_ontology.add_map('header', 'Brad', me_hd_map)
    so_ontology.add_map('feature', 'Brad', me_ft_map)
    so_ontology.add_map('qualifier', 'Brad', me_ql_map)
    dc_ontology.add_map('header', 'Brad', me_hd_dc_map)
    dc_ontology.add_map('feature', 'Brad', me_ft_dc_map)
    dc_ontology.add_map('qualifier', 'Brad', me_ql_dc_map)
    rdfs_ontology.add_map('qualifier', 'Brad', me_ql_rdfs_map)
 
    # -- write out the mappings in each of the categories
    with open("genbank_ontology_map.txt", "w") as out_handle:
        out_writer = csv.writer(out_handle, delimiter="\t")
        out_writer.writerow(['gb section', 'identifier', 'ontology',
            'namespace', 'evidence'])
        match_keys_to_ontology('header', header_keys, [so_ontology, dc_ontology,
            rdfs_ontology], out_writer)
        match_keys_to_ontology('feature', feature_keys, [so_ontology,
            dc_ontology, rdfs_ontology], out_writer)
        match_keys_to_ontology('qualifier', qual_keys, [so_ontology,
            dc_ontology, rdfs_ontology], out_writer)
 
class OntologyGroup:
    def __init__(self, namespace, terms):
        self.ns = namespace
        self.terms = terms
        self._maps = collections.defaultdict(lambda: [])
 
    def add_map(self, key_type, origin, key_map):
        """Add a mapping of keys to terms within this ontology.
        """
        self._maps[key_type].append((origin, key_map))
 
    def normalized_terms(self):
        """Retrieve the terms all lower cased and with extra items removed.
        """
        lower_so_terms = {}
        for term in self.terms:
            lower_so_terms[self._normal_term(term)] = term
        return lower_so_terms
 
    def _normal_term(self, term):
        return term.replace("_", "").lower() 
 
    def match_key_to_ontology(self, key_type, cur_key):
        normal_terms = self.normalized_terms()
        # try to get it from a dictionary
        for map_origin, check_map in self._maps[key_type]:
            try:
                match_key = check_map[cur_key]
            except KeyError:
                match_key = None
            if (match_key and
                    self._normal_term(match_key) in normal_terms.keys()):
                return match_key, map_origin
        # try to match it by name
        if self._normal_term(cur_key) in normal_terms.keys():
            match_key = normal_terms[self._normal_term(cur_key)]
            return match_key, 'name_match'
        #in_terms = [x for x in normal_terms.keys() if
        #        x.find(self._normal_term(cur_key)) != -1]
        #if len(in_terms) > 0:
        #    print '***', cur_key, in_terms, self.ns
        # could not find a match
        return None, ''
 
def match_keys_to_ontology(key_type, keys, ontologies, out_writer):
    no_matches = []
    for cur_key in keys:
        found_match = False
        for ontology in ontologies:
            match_key, origin = ontology.match_key_to_ontology(key_type, cur_key)
            if match_key is not None:
                out_writer.writerow([key_type, cur_key, match_key, ontology.ns,
                    origin])
                found_match = True
                break
        if not found_match:
            no_matches.append(cur_key)
    for no_key in no_matches:
        out_writer.writerow([key_type, no_key])
 
def _parse_terms_from_rdf(in_file, prefix):
    graph = Graph()
    graph.parse(in_file)
    sparta_store = sparta.ThingFactory(graph)
    subjects = []
    for subj in graph.subjects():
        if subj not in subjects:
            subjects.append(subj)
    terms = []
    for subj in subjects:
        rdf_item = sparta_store(subj)
        str_item = str(rdf_item.get_id().replace(prefix + "_", ""))
        if str_item:
            terms.append(str_item)
    terms.sort()
    #print terms
    return terms
 
def parse_rdfs_terms(rdfs_file):
    return _parse_terms_from_rdf(rdfs_file, "rdfs")
 
def parse_dc_terms(dc_file):
    """Retrieve a list of Dublin core terms from the RDF file.
    """
    return _parse_terms_from_rdf(dc_file, "dcterms")
 
def parse_so_terms(so_file):
    """Retrieve all available Sequence Ontology terms from the file.
    """
    so_terms = []
    with open(so_file) as in_handle:
        for line in in_handle:
            if line.find('name:') == 0:
                name = line[5:].strip()
                so_terms.append(name)
    return so_terms
 
def parse_so_ft_map(so_ft_map_file):
    """Parse out mappings between feature keys and SO.
    """
    so_ft_map = {}
    with open(so_ft_map_file) as in_handle:
        in_handle.readline()
        for line in in_handle:
            parts = line.split()
            if parts[1] not in ['undefined']:
                so_ft_map[parts[0]] = parts[1]
    return so_ft_map
 
def parse_feature_table(ft_file):
    """Parse all available features and qualifiers from the FT definition.
 
    This is ugly and parses it straight out of the HTML but this is much easier
    than trying to get it from the specs.
    """
    feature_keys = []
    qual_keys = []
    with open(ft_file) as ft_handle:
        in_feature_region = False
        for line in ft_handle:
            if in_feature_region:
                if line.strip() == "":
                    in_feature_region = False
                else:
                    qual_key, feature_key = line.strip().split()
                    qual_keys.append(qual_key)
                    feature_keys.append(feature_key)
            elif line.find('QUALIFIER FEATURE KEY') == 0:
                in_feature_region = True
    qual_keys = list(set(qual_keys))
    qual_keys = [k.replace('/', '') for k in qual_keys]
    feature_keys = list(set(feature_keys))
    qual_keys.sort()
    feature_keys.sort()
    return feature_keys, qual_keys
 
if __name__ == "__main__":
    ft_file = os.path.join("feature_table", "FT_index.html")
    so_ft_map_file = os.path.join("feature_table", "FT_SO_map.txt")
    so_file = os.path.join("ontologies", "so.obo")
    dc_file = os.path.join("ontologies", "dcterms.rdf")
    rdfs_file = os.path.join("ontologies", "rdf-schema.rdf")
    main(ft_file, so_ft_map_file, so_file, dc_file, rdfs_file)