Source code for ontobio.io.gafparser

import re
import logging
import json

from typing import List, Tuple, Set, Dict
from dataclasses import dataclass

from prefixcommons import curie_util

from ontobio.io import assocparser
from ontobio.io import parser_version_regex
from ontobio.io.assocparser import ENTITY, EXTENSION, ANNOTATION, Report
from ontobio.io import qc
from ontobio.io import entityparser
from ontobio.io import entitywriter
from ontobio.model import association
from ontobio.model import collections
from ontobio.ecomap import EcoMap
from ontobio.rdfgen import relations
from ontobio.ontol import Ontology

import dateutil.parser
import functools

import click

logger = logging.getLogger(__name__)




gaf_line_validators = {
    "default": assocparser.ColumnValidator(),
    "qualifier2_1": assocparser.Qualifier2_1(),
    "qualifier2_2": assocparser.Qualifier2_2(),
    "curie": assocparser.CurieValidator(),
    "taxon": assocparser.TaxonValidator()
}


def subclass_closure(ontology: Ontology, curieNamespace: str, curieIdentity: str)-> Set[str]:
    curie = association.Curie(namespace=curieNamespace, identity=curieIdentity)
    children_of_curie = set(ontology.descendants(str(curie), relations=["subClassOf"], reflexive=True))
    return children_of_curie

def protein_complex_sublcass_closure(ontology: Ontology) -> Set[str]:
    return subclass_closure(ontology, "GO", "0032991")

def cellular_anatomical_entity_subclass_closure(ontology: Ontology) -> Set[str]:
    return subclass_closure(ontology, "GO", "0110165")

def virion_component_subclass_closure(ontology: Ontology) -> Set[str]:
    return subclass_closure(ontology, "GO", "0044423")


[docs]class GafParser(assocparser.AssocParser): """ Parser for GO GAF format """ ANNOTATION_CLASS_COLUMN=4 def __init__(self, config=None, group="unknown", dataset="unknown", bio_entities=None): """ Arguments: --------- config : a AssocParserConfig object """ self.config = config self.group = group self.version = None self.default_version = "2.1" self.bio_entities = bio_entities if self.bio_entities is None: self.bio_entities = collections.BioEntities(dict()) self.cell_component_descendants_closure = None if config is None: self.config = assocparser.AssocParserConfig() self.report = assocparser.Report(group=group, dataset=dataset, config=self.config) # self.gpi = None if self.config.gpi_authority_path is not None: gpi_paths = self.config.gpi_authority_path if isinstance(gpi_paths, str): gpi_paths = [gpi_paths] for gpi_path in gpi_paths: gpi_bio_entities = collections.BioEntities.load_from_file(gpi_path) self.bio_entities.merge(gpi_bio_entities) print("Loaded {} entities from {}".format(len(gpi_bio_entities.entities.keys()), gpi_path)) def gaf_version(self) -> str: if self.version: return self.version else: return self.default_version def qualifier_parser(self) -> assocparser.ColumnValidator: if self.gaf_version() == "2.2": return assocparser.Qualifier2_2() return assocparser.Qualifier2_1()
[docs] def skim(self, file): file = self._ensure_file(file) tuples = [] for line in file: if line.startswith("!"): continue vals = line.split("\t") if len(vals) < 15: logger.error("Unexpected number of vals: {}. GAFv1 has 15, GAFv2 has 17.".format(vals)) split_line = assocparser.SplitLine(line=line, values=vals, taxon=vals[12]) negated, relation, _ = self._parse_qualifier(vals[3], vals[8]) # never include NOTs in a skim if negated: continue if self._is_exclude_relation(relation): continue id = self._pair_to_id(vals[0], vals[1]) if not self._validate_id(id, split_line, context=ENTITY): continue n = vals[2] t = vals[4] tuples.append( (id,n,t) ) return tuples
def make_internal_cell_component_closure(self): if self.config.ontology: self.cell_component_descendants_closure = protein_complex_sublcass_closure(self.config.ontology)
[docs] def parse_line(self, line): """ Parses a single line of a GAF Return a tuple `(processed_line, associations)`. Typically there will be a single association, but in some cases there may be none (invalid line) or multiple (disjunctive clause in annotation extensions) Note: most applications will only need to call this directly if they require fine-grained control of parsing. For most purposes, :method:`parse_file` can be used over the whole file Arguments --------- line : str A single tab-seperated line from a GAF file """ # Returns assocparser.ParseResult parsed = super().validate_line(line) if parsed: return parsed if self.is_header(line): # Save off version info here if self.version is None: # We are still looking parsed = parser_version_regex.findall(line) if len(parsed) == 1: filetype, version, _ = parsed[0] if version == "2.2": logger.info("Detected GAF version 2.2") self.version = version else: logger.info("Detected GAF version {}, so using 2.1".format(version)) self.version = self.default_version # Compute the cell component subclass closure self.make_internal_cell_component_closure() return assocparser.ParseResult(line, [{ "header": True, "line": line.strip() }], False) # At this point, we should have gone through all the header, and a version number should be established if self.version is None: logger.warning("No version number found for this file so we will assume GAF version: {}".format(self.default_version)) self.version = self.default_version self.make_internal_cell_component_closure() vals = [el.strip() for el in line.split("\t")] # GAF v1 is defined as 15 cols, GAF v2 as 17. # We treat everything as GAF2 by adding two blank columns. # TODO: check header metadata to see if columns corresponds to declared dataformat version parsed = to_association(list(vals), report=self.report, qualifier_parser=self.qualifier_parser(), bio_entities=self.bio_entities) if parsed.associations == []: return parsed assoc = parsed.associations[0] # Qualifier is index 3 # If we are 2.1, and qualifier has no relation # Also must have an ontology # Then upgrade # For https://github.com/geneontology/go-site/issues/1558 if self.gaf_version() == "2.1" and (vals[3] == "" or vals[3] == "NOT") and self.config.ontology: assoc = self.upgrade_empty_qualifier(assoc) ## Run GO Rules, save split values into individual variables # print("Config is {}".format(self.config.__dict__.keys())) go_rule_results = qc.test_go_rules(assoc, self.config, group=self.group) for rule, result in go_rule_results.all_results.items(): if result.result_type == qc.ResultType.WARNING: self.report.warning(line, assocparser.Report.VIOLATES_GO_RULE, "", msg="{id}: {message}".format(id=rule.id, message=result.message), rule=int(rule.id.split(":")[1])) if result.result_type == qc.ResultType.ERROR: self.report.error(line, assocparser.Report.VIOLATES_GO_RULE, "", msg="{id}: {message}".format(id=rule.id, message=result.message), rule=int(rule.id.split(":")[1])) # Skip the annotation return assocparser.ParseResult(line, [], True) if result.result_type == qc.ResultType.PASS: self.report.message(assocparser.Report.INFO, line, Report.RULE_PASS, "", msg="Passing Rule", rule=int(rule.id.split(":")[1])) assoc = go_rule_results.annotation # type: association.GoAssociation split_line = assocparser.SplitLine(line=line, values=vals, taxon=str(assoc.object.taxon)) if self.config.group_idspace is not None and assoc.provided_by not in self.config.group_idspace: self.report.warning(line, Report.INVALID_ID, assoc.provided_by, "GORULE:0000027: assigned_by is not present in groups reference", taxon=str(assoc.object.taxon), rule=27) db = assoc.subject.id.namespace if self.config.entity_idspaces is not None and db not in self.config.entity_idspaces: # Are we a synonym? upgrade = self.config.entity_idspaces.reverse(db) if upgrade is not None: # If we found a synonym self.report.warning(line, Report.INVALID_ID_DBXREF, db, "GORULE:0000027: {} is a synonym for the correct ID {}, and has been updated".format(db, upgrade), taxon=str(assoc.object.taxon), rule=27) assoc.subject.id.namespace = upgrade ## -- ## db + db_object_id. CARD=1 ## --assigned_by if not self._validate_id(str(assoc.subject.id), split_line, allowed_ids=self.config.entity_idspaces): return assocparser.ParseResult(line, [], True) # Using a given gpi file to validate the gene object # if self.gpi is not None: # entity = self.gpi.get(str(assoc.subject.id), None) # if entity is not None: # assoc.subject.label = entity["symbol"] # assoc.subject.fullname = entity["name"] # assoc.subject.synonyms = entity["synonyms"].split("|") # assoc.subject.type = entity["type"] if not self._validate_id(str(assoc.object.id), split_line, context=ANNOTATION): print("skipping because {} not validated!".format(assoc.object.id)) return assocparser.ParseResult(line, [], True) valid_goid = self._validate_ontology_class_id(str(assoc.object.id), split_line) if valid_goid is None: return assocparser.ParseResult(line, [], True) assoc.object.id = association.Curie.from_str(valid_goid) references = self.validate_curie_ids(assoc.evidence.has_supporting_reference, split_line) if references is None: # Reporting occurs in above function call return assocparser.ParseResult(line, [], True) # With/From for wf in assoc.evidence.with_support_from: validated = self.validate_curie_ids(wf.elements, split_line) if validated is None: return assocparser.ParseResult(line, [], True) with_support_from = self._unroll_withfrom_and_replair_obsoletes(split_line, 'gaf') if with_support_from is None: return assocparser.ParseResult(line, [], True) assoc.evidence.with_support_from = with_support_from # Extension # repair, if possible any GO terms in the extensions that may be obsolete if (0 < len(assoc.object_extensions)): for ext in assoc.object_extensions: validated = self.validate_curie_ids([e.term for e in ext.elements], split_line) if validated is None: return assocparser.ParseResult(line, [], True) repaired = self._repair_extensions(assoc.object_extensions, split_line) if repaired is None: assoc.object_extensions = [] return assocparser.ParseResult(line, [], True) assoc.object_extensions = repaired # validation self._validate_symbol(assoc.subject.label, split_line) ## -- ## taxon CARD={1,2} ## -- ## if a second value is specified, this is the interacting taxon ## We do not use the second value valid_taxon = self._validate_taxon(str(assoc.object.taxon), split_line) valid_interacting = self._validate_taxon(str(assoc.interacting_taxon), split_line) if assoc.interacting_taxon else True if not valid_taxon: self.report.error(line, assocparser.Report.INVALID_TAXON, str(assoc.object.taxon), "Taxon ID is invalid", rule=27) if not valid_interacting: self.report.error(line, assocparser.Report.INVALID_TAXON, str(assoc.interacting_taxon), "Taxon ID is invalid", rule=27) if (not valid_taxon) or (not valid_interacting): return assocparser.ParseResult(line, [], True) return assocparser.ParseResult(line, [assoc], False, vals[6])
[docs] def upgrade_empty_qualifier(self, assoc: association.GoAssociation) -> association.GoAssociation: """ From https://github.com/geneontology/go-site/issues/1558 For GAF 2.1 we will apply an algorithm to find a best fit relation if the qualifier column is empty. If the qualifiers field is empty, then: If the GO Term is exactly GO:008150 Biological Process, then the qualifier should be `involved_in` If the GO Term is exactly GO:0008372 Cellular Component, then the qualifer should be `is_active_in` If the GO Term is a Molecular Function, then the new qualifier should be `enables` If the GO Term is a Biological Process, then the new qualifier should be `acts_upstream_or_within Otherwise for Cellular Component, if it's subclass of anatomical structure, than use `located_in` and if it's a protein-containing complexes, use `part_of` :param assoc: GoAssociation :return: the possibly upgraded GoAssociation """ term = str(assoc.object.id) namespace = self.config.ontology.obo_namespace(term) if term == "GO:0008150": involved_in = association.Curie(namespace="RO", identity="0002331") assoc.qualifiers = [involved_in] assoc.relation = involved_in elif term == "GO:0008372": is_active_in = association.Curie(namespace="RO", identity="0002432") assoc.qualifiers = [is_active_in] assoc.relation = is_active_in elif namespace == "molecular_function": enables = association.Curie(namespace="RO", identity="0002327") assoc.qualifiers = [enables] assoc.relation = enables elif namespace == "biological_process": acts_upstream_or_within = association.Curie(namespace="RO", identity="0002264") assoc.qualifiers = [acts_upstream_or_within] assoc.relation = acts_upstream_or_within elif namespace == "cellular_component": if term in self.cell_component_descendants_closure: part_of = association.Curie(namespace="BFO", identity="0000050") assoc.qualifiers = [part_of] assoc.relation = part_of else: located_in = association.Curie(namespace="RO", identity="0001025") assoc.qualifiers = [located_in] assoc.relation = located_in self.report.warning(assoc.source_line, Report.INVALID_QUALIFIER, "EMPTY", "GORULE:0000059 Upgrading qualifier/relation to {} when reading GAF 2.1".format(assoc.relation), taxon=str(assoc.subject.taxon), rule=59) return assoc
ecomap = EcoMap() ecomap.mappings() def to_association(gaf_line: List[str], report=None, group="unknown", dataset="unknown", qualifier_parser=assocparser.Qualifier2_1(), bio_entities=None) -> assocparser.ParseResult: report = Report(group=group, dataset=dataset) if report is None else report bio_entities = collections.BioEntities(dict()) if bio_entities is None else bio_entities source_line = "\t".join(gaf_line) if source_line == "": report.error(source_line, "Blank Line", "EMPTY", "Blank lines are not allowed", rule=1) return assocparser.ParseResult(source_line, [], True, report=report) if len(gaf_line) > 17: # If we see more than 17 columns, we will just cut off the columns after column 17 report.warning(source_line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "", msg="There were more than 17 columns in this line. Proceeding by cutting off extra columns after column 17.", rule=1) gaf_line = gaf_line[:17] if 17 > len(gaf_line) >= 15: gaf_line += [""] * (17 - len(gaf_line)) if len(gaf_line) != 17: report.error(source_line, assocparser.Report.WRONG_NUMBER_OF_COLUMNS, "", msg="There were {columns} columns found in this line, and there should be 15 (for GAF v1) or 17 (for GAF v2)".format(columns=len(gaf_line)), rule=1) return assocparser.ParseResult(source_line, [], True, report=report) ## check for missing columns ## We use indeces here because we run GO RULES before we split the vals into individual variables DB_INDEX = 0 DB_OBJECT_INDEX = 1 DB_OBJECT_SYMBOL = 2 TAXON_INDEX = 12 REFERENCE_INDEX = 5 if gaf_line[DB_INDEX] == "": report.error(source_line, Report.INVALID_IDSPACE, "EMPTY", "col1 is empty", taxon=gaf_line[TAXON_INDEX], rule=1) return assocparser.ParseResult(source_line, [], True, report=report) if gaf_line[DB_OBJECT_INDEX] == "": report.error(source_line, Report.INVALID_ID, "EMPTY", "col2 is empty", taxon=gaf_line[TAXON_INDEX], rule=1) return assocparser.ParseResult(source_line, [], True, report=report) if gaf_line[DB_OBJECT_SYMBOL] == "": report.error(source_line, Report.INVALID_ID, "EMPTY", "col3 is empty", taxon=gaf_line[TAXON_INDEX], rule=1) return assocparser.ParseResult(source_line, [], True, report=report) if gaf_line[REFERENCE_INDEX] == "": report.error(source_line, Report.INVALID_ID, "EMPTY", "reference column 6 is empty", taxon=gaf_line[TAXON_INDEX], rule=1) return assocparser.ParseResult(source_line, [], True, report=report) parsed_taxons_result = gaf_line_validators["taxon"].validate(gaf_line[TAXON_INDEX]) # type: assocparser.ValidateResult if not parsed_taxons_result.valid: report.error(source_line, Report.INVALID_TAXON, parsed_taxons_result.original, parsed_taxons_result.message, taxon=parsed_taxons_result.original, rule=1) return assocparser.ParseResult(source_line, [], True, report=report) taxon = parsed_taxons_result.parsed[0] if taxon.identity is None or taxon.identity == '0': report.error(source_line, Report.INVALID_TAXON, parsed_taxons_result.original, parsed_taxons_result.message, taxon=parsed_taxons_result.original, rule=1) return assocparser.ParseResult(source_line, [], True, report=report) date = assocparser.parse_date(gaf_line[13], report, source_line) if date is None: return assocparser.ParseResult(source_line, [], True, report=report) interacting_taxon = parsed_taxons_result.parsed[1] if len(parsed_taxons_result.parsed) == 2 else None subject_curie = association.Curie(gaf_line[0], gaf_line[1]) subject = association.Subject(subject_curie, gaf_line[2], [gaf_line[9]], gaf_line[10].split("|"), [association.map_gp_type_label_to_curie(gaf_line[11])], taxon) gpi_entity = bio_entities.get(subject_curie) if gpi_entity is not None and subject != gpi_entity: subject = gpi_entity # column 4 is qualifiers -> index 3 # For allowed, see http://geneontology.org/docs/go-annotations/#annotation-qualifiers # We use the below validate to check validaty if qualifiers, not as much to *parse* them into the GoAssociation object. # For GoAssociation we will use the above qualifiers list. This is fine because the above does not include `NOT`, etc # This is confusing, and we can fix later on by consolidating qualifier and relation in GoAssociation. parsed_qualifiers = qualifier_parser.validate(gaf_line[3]) if not parsed_qualifiers.valid: report.error(source_line, Report.INVALID_QUALIFIER, parsed_qualifiers.original, parsed_qualifiers.message, taxon=gaf_line[TAXON_INDEX], rule=1) return assocparser.ParseResult(source_line, [], True, report=report) aspect = gaf_line[8] negated, relation_label, qualifiers = assocparser._parse_qualifier(gaf_line[3], aspect) # Note: Relation label is grabbed from qualifiers, if any exist in _parse_qualifier qualifiers = [association.Curie.from_str(curie_util.contract_uri(relations.lookup_label(q), strict=False)[0]) for q in qualifiers] object = association.Term(association.Curie.from_str(gaf_line[4]), taxon) if isinstance(object, association.Error): report.error(source_line, Report.INVALID_SYMBOL, gaf_line[4], "Problem parsing GO Term", taxon=gaf_line[TAXON_INDEX], rule=1) # References references = [association.Curie.from_str(e) for e in gaf_line[5].split("|") if e] for r in references: if isinstance(r, association.Error): report.error(source_line, Report.INVALID_SYMBOL, gaf_line[5], "Problem parsing references", taxon=gaf_line[TAXON_INDEX], rule=1) return assocparser.ParseResult(source_line, [], True, report=report) gorefs = [ref for ref in references if ref.namespace == "GO_REF"] + [None] eco_curie = ecomap.coderef_to_ecoclass(gaf_line[6], reference=gorefs[0]) if eco_curie is None: report.error(source_line, Report.UNKNOWN_EVIDENCE_CLASS, gaf_line[6], msg="Expecting a known ECO GAF code, e.g ISS", rule=1) return assocparser.ParseResult(source_line, [], True, report=report) withfroms = association.ConjunctiveSet.str_to_conjunctions(gaf_line[7]) if isinstance(withfroms, association.Error): report.error(source_line, Report.INVALID_SYMBOL, gaf_line[7], "Problem parsing with/from", taxon=gaf_line[TAXON_INDEX], rule=1) return assocparser.ParseResult(source_line, [], True, report=report) evidence_type = association.Curie.from_str(eco_curie) if isinstance(evidence_type, association.Error): report.error(source_line, Report.INVALID_SYMBOL, gaf_line[6], "Problem parsing evidence type", taxon=gaf_line[TAXON_INDEX], rule=1) evidence = association.Evidence(association.Curie.from_str(eco_curie), references, withfroms) if any([isinstance(e, association.Error) for e in evidence.has_supporting_reference]): first_error = [e for e in evidence.has_supporting_reference if isinstance(e, association.Error)][0] report.error(source_line, Report.INVALID_SYMBOL, gaf_line[5], first_error.info, taxon=str(taxon), rule=1) return assocparser.ParseResult(source_line, [], True, report=report) subject_extensions = [] if gaf_line[16]: subject_filler = association.Curie.from_str(gaf_line[16]) if isinstance(subject_filler, association.Error): report.error(source_line, assocparser.Report.INVALID_ID, gaf_line[16], subject_filler.info, taxon=str(taxon), rule=1) return assocparser.ParseResult(source_line, [], True, report=report) # filler is not an Error, so keep moving subject_extensions.append(association.ExtensionUnit(association.Curie.from_str("rdfs:subClassOf"), subject_filler)) conjunctions = [] if gaf_line[15]: conjunctions = association.ConjunctiveSet.str_to_conjunctions( gaf_line[15], conjunct_element_builder=lambda el: association.ExtensionUnit.from_str(el)) if isinstance(conjunctions, association.Error): report.error(source_line, Report.EXTENSION_SYNTAX_ERROR, conjunctions.info, "extensions should be relation(curie) and relation should have corresponding URI", taxon=str(taxon), rule=1) return assocparser.ParseResult(source_line, [], True, report=report) relation_uri = relations.lookup_label(relation_label) if relation_uri is None: report.error(source_line, assocparser.Report.INVALID_QUALIFIER, relation_label, "Could not find CURIE for relation `{}`".format(relation_label), taxon=str(taxon), rule=1) return assocparser.ParseResult(source_line, [], True, report=report) # We don't have to check that this is well formed because we're grabbing it from the known relations URI map. relation_curie = association.Curie.from_str(curie_util.contract_uri(relation_uri)[0]) a = association.GoAssociation( source_line="\t".join(gaf_line), subject=subject, relation=relation_curie, object=object, negated=negated, qualifiers=qualifiers, aspect=aspect, interacting_taxon=interacting_taxon, evidence=evidence, subject_extensions=subject_extensions, object_extensions=conjunctions, provided_by=gaf_line[14], date=date, properties={}) return assocparser.ParseResult(source_line, [a], False, report=report)