Source code for ontobio.golr.golr_query

"""
A query wrapper for a Golr instance

Intended to work with:

* Monarch golr instance
* AmiGO/GO golr instance (including both GO and Planteome)

Conventions
-----------

Documents follow either entity or association patterns.

Associations
------------

Connects some kind of *subject* to an *object* via a *relation*, this
should be read as any RDF triple.

The subject may be a molecular biological entity such as a gene, or an
ontology class. The distinction between these two may be malleable.

The object is typically an ontology class, but not
always. E.g. gene-gene interactions or homology for exceptions.

An association also has evidence plus various provenance metadata.

In Monarch, the evidence is modeled as a graph encoded as a JSON blob;

In AmiGO, we follow the GAF data model where it is assumed evidence is
simple as does not follow chains, there is assumed to be one evidence
object for the intermediate entity.

### Entities

TODO
"""

import json
import logging
import pysolr
import re
from dataclasses import asdict
from typing import Dict, List
import xml.etree.ElementTree as ET
from collections import OrderedDict
from ontobio.vocabulary.relations import HomologyTypes
from ontobio.model.GolrResults import SearchResults, AutocompleteResult, Highlight
from ontobio.util.user_agent import get_user_agent
from prefixcommons.curie_util import expand_uri
from ontobio.util.curie_map import get_curie_map
from ontobio import ecomap

INVOLVED_IN="involved_in"
ACTS_UPSTREAM_OF_OR_WITHIN="acts_upstream_of_or_within"

ISA_PARTOF_CLOSURE="isa_partof_closure"
REGULATES_CLOSURE="regulates_closure"

ecomapping = ecomap.EcoMap()
iea_eco = ecomapping.coderef_to_ecoclass("IEA")

logger = logging.getLogger(__name__)


class GolrFields:
    """
    Enumeration of fields in Golr.
    Note the Monarch golr schema is taken as canonical here
    """

    ID='id'
    ASSOCIATION_TYPE='association_type'
    SOURCE='source'
    OBJECT_CLOSURE='object_closure'
    SOURCE_CLOSURE_MAP='source_closure_map'
    SUBJECT_TAXON_CLOSURE_LABEL='subject_taxon_closure_label'
    OBJECT_TAXON_CLOSURE_LABEL = 'object_taxon_closure_label'
    SUBJECT_GENE_CLOSURE_MAP='subject_gene_closure_map'
    SUBJECT_TAXON_LABEL_SEARCHABLE='subject_taxon_label_searchable'
    OBJECT_TAXON_LABEL_SEARCHABLE = 'object_taxon_label_searchable'
    IS_DEFINED_BY='is_defined_by'
    SUBJECT_GENE_CLOSURE_LABEL='subject_gene_closure_label'
    SUBJECT_TAXON_CLOSURE='subject_taxon_closure'
    OBJECT_TAXON_CLOSURE = 'object_taxon_closure'
    OBJECT_LABEL='object_label'
    SUBJECT_CATEGORY='subject_category'
    SUBJECT_GENE_LABEL='subject_gene_label'
    SUBJECT_TAXON_CLOSURE_LABEL_SEARCHABLE='subject_taxon_closure_label_searchable'
    OBJECT_TAXON_CLOSURE_LABEL_SEARCHABLE = 'object_taxon_closure_label_searchable'
    SUBJECT_GENE_CLOSURE='subject_gene_closure'
    SUBJECT_GENE_LABEL_SEARCHABLE='subject_gene_label_searchable'
    OBJECT_GENE_LABEL_SEARCHABLE = 'object_gene_label_searchable'
    SUBJECT='subject'
    SUBJECT_LABEL='subject_label'
    SUBJECT_CLOSURE_LABEL_SEARCHABLE='subject_closure_label_searchable'
    OBJECT_CLOSURE_LABEL_SEARCHABLE='object_closure_label_searchable'
    OBJECT_CLOSURE_LABEL='object_closure_label'
    SUBJECT_CLOSURE_LABEL='subject_closure_label'
    SUBJECT_GENE='subject_gene'
    SUBJECT_TAXON='subject_taxon'
    OBJECT_TAXON = 'object_taxon'
    OBJECT_LABEL_SEARCHABLE='object_label_searchable'
    OBJECT_CATEGORY='object_category'
    SUBJECT_TAXON_CLOSURE_MAP='subject_taxon_closure_map'
    OBJECT_TAXON_CLOSURE_MAP = 'object_taxon_closure_map'
    QUALIFIER='qualifier'
    SUBJECT_TAXON_LABEL='subject_taxon_label'
    OBJECT_TAXON_LABEL = 'object_taxon_label'
    SUBJECT_CLOSURE_MAP='subject_closure_map'
    SUBJECT_ORTHOLOG_CLOSURE='subject_ortholog_closure'
    SUBJECT_CLOSURE='subject_closure'
    OBJECT='object'
    OBJECT_CLOSURE_MAP='object_closure_map'
    SUBJECT_LABEL_SEARCHABLE='subject_label_searchable'
    EVIDENCE_OBJECT='evidence_object'
    EVIDENCE_OBJECT_CLOSURE_MAP='evidence_object_closure_map'
    EVIDENCE_OBJECT_LABEL='evidence_object_label'
    EVIDENCE_OBJECT_CLOSURE='evidence_object_closure'
    EVIDENCE_OBJECT_CLOSURE_LABEL='evidence_object_closure_label'
    EVIDENCE='evidence'
    EVIDENCE_LABEL='evidence_label'
    EVIDENCE_CLOSURE_MAP = 'evidence_closure_map'
    EVIDENCE_GRAPH = 'evidence_graph'
    _VERSION_='_version_'
    SUBJECT_GENE_CLOSURE_LABEL_SEARCHABLE='subject_gene_closure_label_searchable'
    ASPECT='aspect'
    RELATION='relation'
    RELATION_LABEL='relation_label'
    FREQUENCY='frequency'
    FREQUENCY_LABEL='frequency_label'
    ONSET='onset'
    ONSET_LABEL='onset_label'

    # This is a temporary fix until
    # https://github.com/biolink/ontobio/issues/126 is resolved.

    # AmiGO specific fields
    AMIGO_SPECIFIC_FIELDS = [
        'reference',
        'qualifier',
        'is_redundant_for',
        'type',
        'evidence',
        'evidence_label',
        'evidence_type',
        'evidence_type_label',
        'evidence_with',
        'evidence_closure',
        'evidence_closure_label',
        'evidence_subset_closure',
        'evidence_subset_closure_label',
        'evidence_type_closure',
        'evidence_type_closure_label',
        'aspect'
    ]

    # golr convention: for any entity FOO, the id is denoted 'foo'
    # and the label FOO_label
    def label_field(self, f):
        return f + "_label"

    # golr convention: for any class FOO, the id is denoted 'foo'
    # and the cosure FOO_closure. Other closures may exist
    def closure_field(self, f):
        return f + "_closure"

# create an instance
M=GolrFields()

# fields in the result docs that are to be inverted when 'invert_subject_object' is True
INVERT_FIELDS_MAP = {
    M.SUBJECT: M.OBJECT,
    M.SUBJECT_CLOSURE: M.OBJECT_CLOSURE,
    M.SUBJECT_TAXON: M.OBJECT_TAXON,
    M.SUBJECT_CLOSURE_LABEL: M.OBJECT_CLOSURE_LABEL,
    M.SUBJECT_TAXON_CLOSURE_LABEL: M.OBJECT_TAXON_CLOSURE_LABEL,
    M.SUBJECT_TAXON_LABEL_SEARCHABLE: M.OBJECT_TAXON_LABEL_SEARCHABLE,
    M.SUBJECT_TAXON_CLOSURE: M.OBJECT_TAXON_CLOSURE,
    M.SUBJECT_LABEL: M.OBJECT_LABEL,
    M.SUBJECT_TAXON_CLOSURE_LABEL_SEARCHABLE: M.OBJECT_TAXON_CLOSURE_LABEL_SEARCHABLE,
    M.SUBJECT_CLOSURE_LABEL_SEARCHABLE: M.OBJECT_CLOSURE_LABEL_SEARCHABLE,
    M.SUBJECT_LABEL_SEARCHABLE: M.OBJECT_LABEL_SEARCHABLE,
    M.SUBJECT_CATEGORY: M.OBJECT_CATEGORY,
    M.SUBJECT_TAXON_CLOSURE_MAP: M.OBJECT_TAXON_CLOSURE_MAP,
    M.SUBJECT_TAXON_LABEL: M.OBJECT_TAXON_LABEL,
    M.SUBJECT_CLOSURE_MAP: M.OBJECT_CLOSURE_MAP,
}

ASPECT_MAP = {
    'F': 'molecular_activity',
    'P': 'biological_process',
    'C': 'cellular_component'
}


# normalize to what Monarch uses
PREFIX_NORMALIZATION_MAP = {
    'MGI:MGI' : 'MGI',
    'FB' : 'FlyBase',
}

def flip(d, x, y):
    dx = d.get(x)
    dy = d.get(y)
    d[x] = dy
    d[y] = dx


def solr_quotify(v, operator="OR"):
    if isinstance(v, list):
        if len(v) == 1:
            return solr_quotify(v[0], operator)
        else:
            return '({})'.format(" {} ".format(operator).join([solr_quotify(x) for x in v]))
    else:
        # TODO - escape quotes
        return '"{}"'.format(v)


def translate_facet_field(fcs, invert_subject_object = False):
    """
    Translates solr facet_fields results into something easier to manipulate

    A solr facet field looks like this: [field1, count1, field2, count2, ..., fieldN, countN]

    We translate this to a dict {f1: c1, ..., fn: cn}

    This has slightly higher overhead for sending over the wire, but is easier to use
    """
    if 'facet_fields' not in fcs:
        return {}
    ffs = fcs['facet_fields']
    rs={}
    for (facet, facetresults) in ffs.items():
        if invert_subject_object:
            for (k,v) in INVERT_FIELDS_MAP.items():
                if facet == k:
                    facet = v
                    break
                elif facet == v:
                    facet = k
                    break

        pairs = {}
        rs[facet] = pairs
        for i in range(int(len(facetresults)/2)):
            (fv,fc) = (facetresults[i*2],facetresults[i*2+1])
            pairs[fv] = fc
    return rs




### GO-SPECIFIC CODE

def goassoc_fieldmap(relationship_type=ACTS_UPSTREAM_OF_OR_WITHIN):
    """
    Returns a mapping of canonical monarch fields to amigo-golr.

    See: https://github.com/geneontology/amigo/blob/master/metadata/ann-config.yaml

    """
    return {
        M.SUBJECT: 'bioentity',
        M.SUBJECT_CLOSURE: 'bioentity',
        ## In the GO AmiGO instance, the type field is not correctly populated
        ## See above in the code for hack that restores this for planteome instance
        ## M.SUBJECT_CATEGORY: 'type',
        M.SUBJECT_CATEGORY: None,
        M.SUBJECT_LABEL: 'bioentity_label',
        M.SUBJECT_TAXON: 'taxon',
        M.SUBJECT_TAXON_LABEL: 'taxon_label',
        M.SUBJECT_TAXON_CLOSURE: 'taxon_closure',
        M.RELATION: 'qualifier',
        M.OBJECT: 'annotation_class',
        M.OBJECT_CLOSURE: REGULATES_CLOSURE if relationship_type == ACTS_UPSTREAM_OF_OR_WITHIN else ISA_PARTOF_CLOSURE,
        M.OBJECT_LABEL: 'annotation_class_label',
        M.OBJECT_TAXON: 'taxon',
        M.OBJECT_TAXON_LABEL: 'taxon_label',
        M.OBJECT_TAXON_CLOSURE: 'taxon_closure',
        M.OBJECT_CATEGORY: None,
        M.EVIDENCE_OBJECT_CLOSURE: 'evidence_subset_closure',
        M.IS_DEFINED_BY: 'assigned_by'
    }

def map_field(fn, m) :
    """
    Maps a field name, given a mapping file.
    Returns input if fieldname is unmapped.
    """
    if m is None:
        return fn
    if fn in m:
        return m[fn]
    else:
        return fn

### CLASSES

class GolrServer():
    pass

class GolrAbstractQuery():
    def get_config(self):
        if self.config is None:
            from ontobio.config import Config, get_config
            self.config = get_config()
        return self.config

    def _set_solr(self, url, timeout=2):
        self.solr = pysolr.Solr(url=url, timeout=timeout)
        return self.solr

    def _set_user_agent(self, user_agent):
        self.solr.get_session().headers['User-Agent'] = user_agent

    def _use_amigo_schema(self, object_category):
        if object_category is not None and object_category == 'function':
            return True
        ds = self.get_config().default_solr_schema
        if ds is not None and ds == 'amigo':
            return True
        return False


[docs]class GolrSearchQuery(GolrAbstractQuery): """ Controller for monarch and go solr search cores Queries over a search document """ def __init__(self, term=None, category=None, is_go=False, url=None, solr=None, config=None, fq=None, fq_string=None, hl=True, facet_fields=None, facet=True, search_fields=None, taxon_map=True, rows=100, start=None, prefix=None, boost_fx=None, boost_q=None, highlight_class=None, taxon=None, min_match=None, minimal_tokenizer=False, include_eqs=False, exclude_groups=False, user_agent=None): self.term = term self.category = category self.is_go = is_go self.url = url self.solr = solr self.config = config self.hl = hl self.facet = facet self.facet_fields = facet_fields self.search_fields = search_fields self.taxon_map = taxon_map self.rows = rows self.start = start # test if client explicitly passes a URL; do not override self.is_explicit_url = url is not None # Raw fq param string self.fq_string = fq_string if fq_string is not None else [] # fq as dictionary where key:values get converted # to fq="(key1:value1 OR key2:value2)" self.fq = fq if fq is not None else {} self.prefix = prefix self.boost_fx = boost_fx self.boost_q = boost_q self.highlight_class = highlight_class self.taxon = taxon self.min_match = min_match self.include_eqs = include_eqs self.exclude_groups = exclude_groups self.minimal_tokenizer = minimal_tokenizer self.user_agent = get_user_agent(modules=[requests, pysolr], caller_name=__name__) if user_agent is not None: self.user_agent += " {}".format(user_agent) if self.search_fields is None: self.search_fields = dict(id=3, label=2, synonym=1, definition=1, taxon_label=1, taxon_label_synonym=1, equivalent_curie=1) if self.is_go: if self.url is None: endpoint = self.get_config().amigo_solr_search solr_config = {'url': endpoint.url, 'timeout': endpoint.timeout} else: solr_config = {'url': self.url, 'timeout': 2} else: if self.url is None: endpoint = self.get_config().solr_search solr_config = {'url': endpoint.url, 'timeout': endpoint.timeout} else: solr_config = {'url': self.url, 'timeout': 2} self._set_solr(**solr_config) self._set_user_agent(self.user_agent) def update_solr_url(self, url, timeout=2): self.url = url solr_config = {'url': url, 'timeout': timeout} self._set_solr(**solr_config) self._set_user_agent(self.user_agent) def solr_params(self, mode=None): if self.facet_fields is None and self.facet: self.facet_fields = ['category', 'taxon', 'taxon_label'] if self.category is not None: self.fq['category'] = self.category suffixes = ['std', 'kw', 'eng'] if self.is_go: self.search_fields=dict(entity_label=3, general_blob=3) self.hl = False # TODO: formal mapping if 'taxon_label' in self.facet_fields: self.facet_fields.remove('taxon_label') suffixes = ['searchable'] self.fq['document_category'] = "general" qf = self._format_query_filter(self.search_fields, suffixes) if mode == 'search': # Decrease ngram weight and increase keyword and standard tokenizer for field, weight in qf.items(): if '_kw' in field: qf[field] += 2 elif '_std' in field: qf[field] += 1 if self.term is not None and ":" in self.term: qf["id_kw"] = 20 qf["equivalent_curie_kw"] = 20 if self.minimal_tokenizer: # Split text using a minimal set of word boundaries # useful for variants and genotypes where typical # word boundaries are part of the nomenclature tokens = re.split(r'[\s|\'\",]+', self.term) if tokens[-1] == '': del tokens[-1] tokenized = "".join(['"{}"'.format(token) for token in tokens]) else: # Solr will run through the Standard Tokenizer tokenized = self.term select_fields = ["*", "score"] params = { 'q': '{0} "{1}"'.format(tokenized, self.term), "qt": "standard", 'fl': ",".join(list(filter(None, select_fields))), "defType": "edismax", "qf": ["{}^{}".format(field, weight) for field, weight in qf.items()], 'rows': self.rows } if self.facet: params['facet'] = 'on' params['facet.field'] = self.facet_fields params['facet.limit'] = 25 params['facet.mincount'] = 1 if self.taxon_map: params["facet.pivot.mincount"] =1 params["facet.pivot"] = "taxon,taxon_label" if self.start is not None: params['start'] = self.start if self.hl: params['hl.simple.pre'] = "<em class=\"hilite\">" params['hl.snippets'] = "1000" params['hl'] = 'on' if self.fq is not None: filter_queries = ['{}:{}'.format(k,solr_quotify(v)) for (k,v) in self.fq.items()] params['fq'] = filter_queries else: params['fq'] = [] for fq in self.fq_string: params['fq'].append(fq) if self.prefix is not None: negative_filter = [p_filt[1:] for p_filt in self.prefix if p_filt.startswith('-')] positive_filter = [p_filt for p_filt in self.prefix if not p_filt.startswith('-')] if negative_filter: if self.include_eqs: single_filts = [ f'(-prefix:"{prefix}" OR -equivalent_curie:{prefix}\:*)' for prefix in negative_filter ] for filt in single_filts: params['fq'].append(filt) else: neg_filter = '({})'.format(" OR ".join([filt for filt in negative_filter])) params['fq'].append('-prefix:{}'.format(solr_quotify(negative_filter))) if positive_filter: if self.include_eqs: # fq=((prefix:HP OR equivalent_curie:HP) OR (prefix:MONDO OR equivalent_curie:MONDO)) single_filts = [ f'(prefix:"{prefix}" OR equivalent_curie:{prefix}\:*)' for prefix in positive_filter ] pos_filter = '({})'.format(" OR ".join([filt for filt in single_filts])) params['fq'].append(pos_filter) else: params['fq'].append('prefix:{}'.format(solr_quotify(positive_filter))) if self.boost_fx is not None: params['bf'] = [] for boost in self.boost_fx: params['bf'].append(boost) if self.boost_q is not None: params['bq'] = [] for boost in self.boost_q: params['bq'].append(boost) if self.taxon is not None: for tax in self.taxon: params['fq'].append('taxon:"{}"'.format(tax)) if self.exclude_groups: params['fq'].append('leaf:1') if self.min_match is not None: params['mm'] = self.min_match if self.highlight_class is not None: params['hl.simple.pre'] = \ '<em class=\"{}\">'.format(self.highlight_class) return params
[docs] def search(self): """ Execute solr search query """ params = self.solr_params(mode='search') logger.info("PARAMS=" + str(params)) results = self.solr.search(**params) logger.info("Docs found: {}".format(results.hits)) return self._process_search_results(results)
[docs] def autocomplete(self): """ Execute solr autocomplete """ self.facet = False params = self.solr_params() logger.info("PARAMS=" + str(params)) results = self.solr.search(**params) logger.info("Docs found: {}".format(results.hits)) return self._process_autocomplete_results(results)
def _process_search_results(self, results: pysolr.Results) -> SearchResults: """ Convert solr docs to biolink object :param results: pysolr.Results :return: model.GolrResults.SearchResults """ # map go-golr fields to standard for doc in results.docs: if 'entity' in doc: doc['id'] = doc['entity'] doc['label'] = doc['entity_label'] translated_facets = translate_facet_field(results.facets) # inject the taxon map (aka a facet pivot) into the returned facets if self.taxon_map: translated_facets['_taxon_map'] = [ { 'id': taxon['value'], 'label': taxon['pivot'][0]['value'], 'count': taxon['pivot'][0]['count'] } for taxon in results.facets['facet_pivot']['taxon,taxon_label'] ] highlighting = { doc['id']: asdict(self._process_highlight(results, doc)) for doc in results.docs if results.highlighting } payload = SearchResults( facet_counts=translated_facets, highlighting=highlighting, docs=results.docs, numFound=results.hits ) logger.debug('Docs: {}'.format(len(results.docs))) return payload def _process_autocomplete_results( self, results: pysolr.Results) -> Dict[str, List[AutocompleteResult]]: """ Convert results to biolink autocomplete object :param results: pysolr.Results :return: {'docs': List[AutocompleteResult]} """ # map go-golr fields to standard for doc in results.docs: if 'entity' in doc: doc['id'] = doc['entity'] doc['label'] = doc['entity_label'] docs = [] for doc in results.docs: if results.highlighting: hl = self._process_highlight(results, doc) else: hl = Highlight(None, None, None) # In some cases a node does not have a category category = doc['category'] if 'category' in doc else [] doc['taxon'] = doc['taxon'] if 'taxon' in doc else "" doc['taxon_label'] = doc['taxon_label'] if 'taxon_label' in doc else "" doc['equivalent_curie'] = doc['equivalent_curie'] if 'equivalent_curie' in doc else [] doc = AutocompleteResult( id=doc['id'], label=doc['label'], match=hl.match, category=category, taxon=doc['taxon'], taxon_label=doc['taxon_label'], highlight=hl.highlight, has_highlight=hl.has_highlight, equivalent_ids=doc['equivalent_curie'] ) docs.append(doc) payload = { 'docs': docs } logger.debug('Docs: {}'.format(len(results.docs))) return payload def _process_highlight(self, results: pysolr.Results, doc) -> Highlight: hl = results.highlighting[doc['id']] highlights = [] primary_label_matches = [] # Store all primary label for field, hl_list in hl.items(): if field.startswith('label'): primary_label_matches.extend(hl_list) highlights.extend(hl_list) # If we've matched on the primary label, get the longest # from the list, else use other fields if primary_label_matches: highlights = primary_label_matches try: highlight = Highlight( highlight=self._get_longest_hl(highlights), match=self._hl_as_string(self._get_longest_hl(highlights)), has_highlight=True ) except ET.ParseError: highlight = Highlight( highlight=doc['label'][0], match=doc['label'][0], has_highlight=False ) return highlight @staticmethod def _format_query_filter(search_fields, suffixes): qf = {} for (field, relevancy) in search_fields.items(): for suffix in suffixes: field_filter = "{}_{}".format(field, suffix) qf[field_filter] = relevancy return qf def _get_longest_hl(self, highlights): """ Given a list of highlighted text, returns the longest highlight For example: [ "<em>Muscle</em> <em>atrophy</em>, generalized", "Generalized <em>muscle</em> degeneration", "Diffuse skeletal <em>">muscle</em> wasting" ] and returns: <em>Muscle</em> <em>atrophy</em>, generalized If there are mutliple matches of the same length, returns the top (arbitrary) highlight :return: """ len_dict = OrderedDict() for hl in highlights: # dummy tags to make it valid xml dummy_xml = "<p>" + hl + "</p>" try: element_tree = ET.fromstring(dummy_xml) hl_length = 0 for emph in element_tree.findall('em'): hl_length += len(emph.text) len_dict[hl] = hl_length except ET.ParseError: raise ET.ParseError return max(len_dict, key=len_dict.get) def _hl_as_string(self, highlight): """ Given a solr string of highlighted text, returns the str representations For example: "Foo <em>Muscle</em> bar <em>atrophy</em>, generalized" Returns: "Foo Muscle bar atrophy, generalized" :return: str """ # dummy tags to make it valid xml dummy_xml = "<p>" + highlight + "</p>" try: element_tree = ET.fromstring(dummy_xml) except ET.ParseError: raise ET.ParseError return "".join(list(element_tree.itertext()))
class GolrLayPersonSearch(GolrSearchQuery): """ Controller for the HPO lay person index, see https://github.com/monarch-initiative/hpo-plain-index """ def __init__(self, term=None, **kwargs): super().__init__(term, **kwargs) self.facet = False endpoint = self.get_config().lay_person_search self._set_solr(endpoint.url, endpoint.timeout) self._set_user_agent(self.user_agent) def set_lay_params(self): params = self.solr_params() suffixes = ['std', 'kw', 'eng'] qf = self._get_default_weights(suffixes) params['qf'] = ["{}^{}".format(field, weight) for field, weight in qf.items()] return params def autocomplete(self): """ Execute solr query for autocomplete """ params = self.set_lay_params() logger.info("PARAMS="+str(params)) results = self.solr.search(**params) logger.info("Docs found: {}".format(results.hits)) return self._process_layperson_results(results) def _process_layperson_results(self, results): """ Convert pysolr.Results to biolink object :param results: :return: """ payload = { 'results': [] } for doc in results.docs: hl = self._process_highlight(results, doc) highlight = { 'id': doc['id'], 'highlight': hl.highlight, 'label': doc['label'], 'matched_synonym': hl.match } payload['results'].append(highlight) logger.debug('Docs: {}'.format(len(results.docs))) return payload @staticmethod def _get_default_weights(suffixes): """ Defaults for the plain language index :param suffixes: list of suffixes (eng (ngram), std,) :return: """ weights = { "exact_synonym": "5", "related_synonym": "2", "broad_synonym": "1", "narrow_synonym": "3" } qf = GolrLayPersonSearch._format_query_filter(weights, suffixes) return qf
[docs]class GolrAssociationQuery(GolrAbstractQuery): """ A Query object providing a higher level of abstraction over either GO or Monarch Solr indexes Fields ------ All of these can be set when creating a new object fetch_objects : bool we frequently want a list of distinct association objects (in the RDF sense). for example, when querying for all phenotype associations for a gene, it is convenient to get a list of distinct phenotype terms. Although this can be obtained by iterating over the list of associations, it can be expensive to obtain all associations. Results are in the 'objects' field fetch_subjects : bool This is the analog of the fetch_objects field. Note that due to an inherent asymmetry by which the list of subjects can be very large (e.g. all genes in all species for "metabolic process" or "metabolic phenotype") it's necessary to combine this with subject_category and subject_taxon filters Results are in the 'subjects' field slim : List a list of either class ids (or in future subset ids), used to map up (slim) objects in associations. This will populate an additional 'slim' field in each association object corresponding to the slimmed-up value(s) from the direct objects. If fetch_objects is passed, this will be populated with slimmed IDs. evidence: String Evidence class from ECO. Inference is used. exclude_automatic_assertions : bool If true, then any annotations with ECO evidence code for IEA or subclasses will be excluded. use_compact_associations : bool If true, then the associations list will be false, instead compact_associations contains a more compact representation consisting of objects with (subject, relation and objects) config : Config See :ref:`Config` for details. The config object can be used to set values for the solr instance to be queried TODO - Extract params into their own object """ def __init__(self, subject_category=None, object_category=None, relation=None, relationship_type=None, subject_or_object_ids=None, subject_or_object_category=None, subject=None, subjects=None, object=None, objects=None, subject_direct=False, object_direct=False, subject_taxon=None, subject_taxon_direct=False, object_taxon=None, object_taxon_direct=False, invert_subject_object=None, evidence=None, exclude_automatic_assertions=False, q=None, id=None, use_compact_associations=False, include_raw=False, field_mapping=None, solr=None, config=None, url=None, select_fields=None, fetch_objects=False, fetch_subjects=False, fq=None, slim=None, json_facet=None, iterate=False, map_identifiers=None, facet_fields=None, facet_field_limits=None, facet_limit=25, facet_mincount=1, facet_pivot_fields=None, stats=False, stats_field=None, facet=True, pivot_subject_object=False, unselect_evidence=False, rows=10, start=None, homology_type=None, non_null_fields=None, user_agent=None, association_type=None, sort=None, **kwargs): """Fetch a set of association objects based on a query. """ self.subject_category = subject_category self.object_category = object_category self.relation = relation self.relationship_type = relationship_type self.subject_or_object_ids = subject_or_object_ids self.subject_or_object_category = subject_or_object_category self.subject = subject self.subjects = subjects self.subject_direct = subject_direct self.object = object self.objects = objects self.object_direct = object_direct self.subject_taxon = subject_taxon self.subject_taxon_direct = subject_taxon_direct self.object_taxon = object_taxon self.object_taxon_direct = object_taxon_direct self.invert_subject_object = invert_subject_object self.evidence = evidence self.exclude_automatic_assertions = exclude_automatic_assertions self.id = id self.q = q self.use_compact_associations = use_compact_associations self.include_raw = include_raw self.field_mapping = field_mapping self.solr = solr self.config = config self.select_fields = select_fields self.fetch_objects = fetch_objects self.fetch_subjects = fetch_subjects self.fq = fq if fq is not None else {} self.slim = slim if slim is not None else [] self.json_facet = json_facet self.iterate = iterate self.map_identifiers = map_identifiers self.facet_fields = facet_fields self.facet_field_limits = facet_field_limits self.facet_limit = facet_limit self.facet_mincount = facet_mincount self.facet_pivot_fields = facet_pivot_fields self.stats = stats self.stats_field = stats_field self.facet = facet self.pivot_subject_object = pivot_subject_object self.unselect_evidence = unselect_evidence self.max_rows = 100000 self.rows = rows self.start = start self.homology_type = homology_type self.url = url # test if client explicitly passes a URL; do not override self.is_explicit_url = url is not None self.non_null_fields = non_null_fields self.association_type = association_type self.sort = sort self.user_agent = get_user_agent(modules=[requests, pysolr], caller_name=__name__) if user_agent is not None: self.user_agent += " {}".format(user_agent) if self.facet_pivot_fields is None: self.facet_pivot_fields = [] if self.non_null_fields is None: self.non_null_fields = [] if self.facet_fields is None: if self.facet: self.facet_fields = [ M.SUBJECT_TAXON, M.SUBJECT_TAXON_LABEL, M.OBJECT_CLOSURE ] if self.sort is None and not self._use_amigo_schema(object_category): # Make default descending by count of publications for monarch self.sort = 'source_count desc' if self.solr is None: if self.url is None: endpoint = self.get_config().solr_assocs solr_config = {'url': endpoint.url, 'timeout': endpoint.timeout} else: solr_config = {'url': self.url, 'timeout': 5} self.update_solr_url(**solr_config) def update_solr_url(self, url, timeout=2): self.url = url solr_config = {'url': url, 'timeout': timeout} self._set_solr(**solr_config) self._set_user_agent(self.user_agent) def adjust(self): pass
[docs] def solr_params(self): """ Generate HTTP parameters for passing to Solr. In general you should not need to call this directly, calling exec() on a query object will transparently perform this step for you. """ ## Main query params for solr fq=self.fq if fq is None: fq = {} logger.info("TEMPx FQ={}".format(fq)) # subject_or_object_ids is a list of identifiers that can be matched to either subjects or objects subject_or_object_ids = self.subject_or_object_ids if subject_or_object_ids is not None: subject_or_object_ids = [self.make_canonical_identifier(c) for c in subject_or_object_ids] # canonical form for MGI is a CURIE MGI:nnnn #if subject is not None and subject.startswith('MGI:MGI:'): # logger.info('Unhacking MGI ID presumably from GO:'+str(subject)) # subject = subject.replace("MGI:MGI:","MGI") subject = self.subject if subject is not None: subject = self.make_canonical_identifier(subject) subjects = self.subjects if subjects is not None: subjects = [self.make_canonical_identifier(s) for s in subjects] subject_direct = self.subject_direct # temporary: for querying go solr, map fields. TODO object_category = self.object_category logger.info("Object category: {}".format(object_category)) object = self.object objects = self.objects object_direct = self.object_direct if object_category is None and object is not None and object.startswith('GO:'): # Infer category object_category = 'function' logger.info("Inferring Object category: {} from {}". format(object_category, object)) # URL to use for querying solr if self._use_amigo_schema(object_category): # Override solr config and use go solr endpoint = self.get_config().amigo_solr_assocs solr_config = {'url': endpoint.url, 'timeout': endpoint.timeout} self.update_solr_url(**solr_config) self.field_mapping=goassoc_fieldmap(self.relationship_type) # awkward hack: we want to avoid typing on the amigo golr gene field, # UNLESS this is a planteome golr if "planteome" in self.get_config().amigo_solr_assocs.url: self.field_mapping[M.SUBJECT_CATEGORY] = 'type' fq['document_category'] = 'annotation' if subject is not None: subject = self.make_gostyle_identifier(subject) if subjects is not None: subjects = [self.make_gostyle_identifier(s) for s in subjects] # the AmiGO schema lacks an object_category field; # we could use the 'aspect' field but instead we use a mapping of # the category to a root class if object_category is not None: cc = self.get_config().get_category_class(object_category) if cc is not None and object is None: object = cc ## subject params subject_taxon = self.subject_taxon subject_taxon_direct = self.subject_taxon_direct subject_category = self.subject_category # heuristic procedure to guess unspecified subject_category if subject_category is None and subject is not None: subject_category = self.infer_category(subject) if subject_category is not None and subject_category == 'disease': if subject_taxon is not None and subject_taxon=='NCBITaxon:9606': logger.info("Unsetting taxon, until indexed correctly") subject_taxon = None if self.invert_subject_object is None: # TODO: consider placing in a separate lookup p = (subject_category, object_category) if p == ('disease', 'gene'): self.invert_subject_object = True elif p == ('disease', 'model'): self.invert_subject_object = True else: self.invert_subject_object = False if self.invert_subject_object: logger.info("Inferred that subject/object should be inverted for {}".format(p)) ## taxon of object of triple object_taxon=self.object_taxon object_taxon_direct = self.object_taxon_direct # typically information is stored one-way, e.g. model-disease; # sometimes we want associations from perspective of object if self.invert_subject_object: (subject, object) = (object,subject) (subject_category, object_category) = (object_category,subject_category) (subject_taxon, object_taxon) = (object_taxon,subject_taxon) (object_direct, subject_direct) = (subject_direct, object_direct) (object_taxon_direct, subject_taxon_direct) = (subject_taxon_direct, object_taxon_direct) ## facet fields facet_fields=self.facet_fields facet=self.facet facet_limit=self.facet_limit select_fields=self.select_fields if self.use_compact_associations: facet_fields = [] facet = False facet_limit = 0 select_fields = [ M.SUBJECT, M.SUBJECT_LABEL, M.RELATION, M.OBJECT] if subject_category is not None: fq['subject_category'] = subject_category if object_category is not None: fq['object_category'] = object_category if subject is not None: # note: by including subject closure by default, # we automaticaly get equivalent nodes if subject_direct: fq['subject_eq'] = subject else: fq['subject_closure'] = subject if subjects is not None: # lists are assumed to be disjunctive if subject_direct: fq['subject'] = subjects else: fq['subject_closure'] = subjects if object is not None: if object_direct: fq['object_eq'] = object else: fq['object_closure'] = object if objects is not None: # lists are assumed to be disjunctive if object_direct: fq['object_eq'] = objects else: fq['object_eq'] = objects objects=self.objects if objects is not None: # lists are assumed to be disjunctive fq['object_closure'] = objects relation=self.relation if relation is not None: fq['relation_closure'] = relation if subject_taxon is not None: if subject_taxon_direct: fq['subject_taxon'] = subject_taxon else: fq['subject_taxon_closure'] = subject_taxon if object_taxon is not None: if object_taxon_direct: fq['object_taxon'] = object_taxon else: fq['object_taxon_closure'] = object_taxon if self.id is not None: fq['id'] = self.id if self.evidence is not None: e = self.evidence if e.startswith("-"): fq['-evidence_object_closure'] = e.replace("-","") else: fq['evidence_object_closure'] = e if self.exclude_automatic_assertions: fq['-evidence_object_closure'] = iea_eco # Homolog service params # TODO can we sync with argparse.choices? if self.homology_type is not None: if self.homology_type == 'O': fq['relation_closure'] = HomologyTypes.Ortholog.value elif self.homology_type == 'P': fq['relation_closure'] = HomologyTypes.Paralog.value elif self.homology_type == 'LDO': fq['relation_closure'] = \ HomologyTypes.LeastDivergedOrtholog.value ## Association type, monarch only if self.association_type is not None: fq['association_type'] = self.association_type ## pivots facet_pivot_fields=self.facet_pivot_fields if self.pivot_subject_object: facet_pivot_fields = [M.SUBJECT, M.OBJECT] # Map solr field names for fq. The generic Monarch schema is # canonical, GO schema is mapped to this using # field_mapping dictionary if self.field_mapping is not None: for (k,v) in self.field_mapping.items(): # map fq[k] -> fq[k] if k in fq: if v is None: del fq[k] else: fq[v] = fq[k] del fq[k] # in solr, the fq field can be # a negated expression, e.g. -evidence_object_closure:"ECO:0000501" # ideally we would have a higher level representation rather than # relying on string munging... negk = '-' + k if negk in fq: if v is None: del fq[negk] else: negv = '-' + v fq[negv] = fq[negk] del fq[negk] filter_queries = [] qstr = "*:*" if self.q is not None: qstr = self.q filter_queries = [ '{}:{}'.format(k,solr_quotify(v)) for (k,v) in fq.items()] # We want to match all associations that have either a subject or object # with an ID that is contained in subject_or_object_ids. if subject_or_object_ids is not None: quotified_ids = solr_quotify(subject_or_object_ids) subject_id_filter = '{}:{}'.format('subject_closure', quotified_ids) object_id_filter = '{}:{}'.format('object_closure', quotified_ids) # If subject_or_object_category is provided, we add it to the filter. if self.subject_or_object_category is not None: quotified_categories = solr_quotify(self.subject_or_object_category) subject_category_filter = '{}:{}'.format('subject_category', quotified_categories) object_category_filter = '{}:{}'.format('object_category', quotified_categories) filter_queries.append( '(' + subject_id_filter + ' AND ' + object_category_filter + ')' \ ' OR ' \ '(' + object_id_filter + ' AND ' + subject_category_filter + ')' ) else: filter_queries.append(subject_id_filter + ' OR ' + object_id_filter) # unless caller specifies a field list, use default if select_fields is None: select_fields = [ M.ID, M.IS_DEFINED_BY, M.SOURCE, M.SUBJECT, M.SUBJECT_LABEL, M.SUBJECT_TAXON, M.SUBJECT_TAXON_LABEL, M.RELATION, M.RELATION_LABEL, M.OBJECT, M.OBJECT_LABEL, M.OBJECT_TAXON, M.OBJECT_TAXON_LABEL, M.EVIDENCE, M.EVIDENCE_CLOSURE_MAP, M.FREQUENCY, M.FREQUENCY_LABEL, M.ONSET, M.ONSET_LABEL ] if not self.unselect_evidence: select_fields += [ M.EVIDENCE_GRAPH ] if not self._use_amigo_schema(object_category): select_fields.append(M.SUBJECT_CATEGORY) select_fields.append(M.OBJECT_CATEGORY) if self.map_identifiers is not None: select_fields.append(M.SUBJECT_CLOSURE) if self.slim is not None and len(self.slim) > 0: select_fields.append(M.OBJECT_CLOSURE) if self.field_mapping is not None: logger.info("Applying field mapping to SELECT: {}".format(self.field_mapping)) select_fields = [ map_field(fn, self.field_mapping) for fn in select_fields ] if facet_pivot_fields is not None: logger.info("Applying field mapping to PIV: {}".format(facet_pivot_fields)) facet_pivot_fields = [ map_field(fn, self.field_mapping) for fn in facet_pivot_fields ] logger.info("APPLIED field mapping to PIV: {}".format(facet_pivot_fields)) if facet_fields: facet_fields = [ map_field(fn, self.field_mapping) for fn in facet_fields ] if self._use_amigo_schema(object_category): select_fields += [x for x in M.AMIGO_SPECIFIC_FIELDS if x not in select_fields] ## true if iterate in windows of max_size until all results found iterate=self.iterate #logger.info('FL'+str(select_fields)) is_unlimited = False rows=self.rows if rows < 0: is_unlimited = True iterate = True rows = self.max_rows for field in self.non_null_fields: filter_queries.append(field + ":['' TO *]") search_fields = None if self.q is not None and not self._use_amigo_schema(object_category): search_fields = [ M.SUBJECT_LABEL_SEARCHABLE, M.OBJECT_LABEL_SEARCHABLE, M.SUBJECT_TAXON_LABEL_SEARCHABLE, M.OBJECT_TAXON_LABEL_SEARCHABLE, M.SUBJECT_GENE_LABEL_SEARCHABLE, M.OBJECT_GENE_LABEL_SEARCHABLE, ] params = { 'q': qstr, 'fq': filter_queries, 'facet': 'on' if facet else 'off', 'facet.field': facet_fields if facet_fields else [], 'facet.limit': facet_limit, 'facet.mincount': self.facet_mincount, 'fl': ",".join(list(filter(None, select_fields))), 'rows': rows, "defType": "edismax" } if self.start is not None: params['start'] = self.start json_facet = self.json_facet if json_facet: params['json.facet'] = json.dumps(json_facet) facet_field_limits = self.facet_field_limits if facet_field_limits is not None: for (f,flim) in facet_field_limits.items(): params["f."+f+".facet.limit"] = flim if len(facet_pivot_fields) > 0: params['facet.pivot'] = ",".join(facet_pivot_fields) params['facet.pivot.mincount'] = 1 if self.stats_field: self.stats = True params['stats.field'] = self.stats_field params['stats'] = json.dumps(self.stats) if self.sort is not None: params['sort'] = self.sort if search_fields: params['qf'] = search_fields return params
[docs] def exec(self, **kwargs): """ Execute solr query Result object is a dict with the following keys: - raw - associations : list - compact_associations : list - facet_counts - facet_pivot """ params = self.solr_params() logger.info("PARAMS="+str(params)) results = self.solr.search(**params) n_docs = len(results.docs) logger.info("Docs found: {}".format(results.hits)) if self.iterate: docs = results.docs start = n_docs while n_docs >= self.rows: logger.info("Iterating; start={}".format(start)) next_results = self.solr.search(start=start, **params) next_docs = next_results.docs n_docs = len(next_docs) docs += next_docs start += self.rows results.docs = docs fcs = results.facets payload = { 'facet_counts': translate_facet_field(fcs, self.invert_subject_object), 'pagination': {}, 'numFound': results.hits } include_raw = self.include_raw if include_raw: # note: this is not JSON serializable, do not send via REST payload['raw'] = results # TODO - check if truncated logger.info("COMPACT={} INV={}".format(self.use_compact_associations, self.invert_subject_object)) if self.use_compact_associations: payload['compact_associations'] = self.translate_docs_compact(results.docs, field_mapping=self.field_mapping, slim=self.slim, invert_subject_object=self.invert_subject_object, map_identifiers=self.map_identifiers, **kwargs) else: payload['associations'] = self.translate_docs(results.docs, field_mapping=self.field_mapping, map_identifiers=self.map_identifiers, **kwargs) if 'facet_pivot' in fcs: payload['facet_pivot'] = fcs['facet_pivot'] if 'facets' in results.raw_response: payload['facets'] = results.raw_response['facets'] # For solr, we implement this by finding all facets # TODO: no need to do 2nd query, see https://wiki.apache.org/solr/SimpleFacetParameters#Parameters fetch_objects=self.fetch_objects if fetch_objects: core_object_field = M.OBJECT if self.slim is not None and len(self.slim)>0: core_object_field = M.OBJECT_CLOSURE object_field = map_field(core_object_field, self.field_mapping) if self.invert_subject_object: object_field = map_field(M.SUBJECT, self.field_mapping) oq_params = params.copy() oq_params['fl'] = [] oq_params['facet.field'] = [object_field] oq_params['facet.limit'] = -1 oq_params['rows'] = 0 oq_params['facet.mincount'] = 1 oq_results = self.solr.search(**oq_params) if self.facet: ff = oq_results.facets['facet_fields'] ofl = ff.get(object_field) # solr returns facets counts as list, every 2nd element is number, we don't need the numbers here payload['objects'] = ofl[0::2] fetch_subjects=self.fetch_subjects if fetch_subjects: core_subject_field = M.SUBJECT if self.slim is not None and len(self.slim)>0: core_subject_field = M.SUBJECT_CLOSURE subject_field = map_field(core_subject_field, self.field_mapping) if self.invert_subject_object: subject_field = map_field(M.SUBJECT, self.field_mapping) oq_params = params.copy() oq_params['fl'] = [] oq_params['facet.field'] = [subject_field] oq_params['facet.limit'] = self.max_rows oq_params['rows'] = 0 oq_params['facet.mincount'] = 1 oq_results = self.solr.search(**oq_params) if self.facet: ff = oq_results.facets['facet_fields'] ofl = ff.get(subject_field) # solr returns facets counts as list, every 2nd element is number, we don't need the numbers here payload['subjects'] = ofl[0::2] if len(payload['subjects']) == self.max_rows: payload['is_truncated'] = True if self.slim is not None and len(self.slim)>0: if 'objects' in payload: payload['objects'] = [x for x in payload['objects'] if x in self.slim] if 'associations' in payload: for a in payload['associations']: a['slim'] = [x for x in a['object_closure'] if x in self.slim] del a['object_closure'] return payload
[docs] def infer_category(self, id): """ heuristic to infer a category from an id, e.g. DOID:nnn --> disease """ logger.info("Attempting category inference on id={}".format(id)) toks = id.split(":") idspace = toks[0] c = None if idspace == 'DOID': c='disease' if c is not None: logger.info("Inferred category: {} based on id={}".format(c, id)) return c
[docs] def make_canonical_identifier(self,id): """ E.g. MGI:MGI:nnnn --> MGI:nnnn """ if id is not None: for (k,v) in PREFIX_NORMALIZATION_MAP.items(): s = k+':' if id.startswith(s): return id.replace(s,v+':') return id
[docs] def make_gostyle_identifier(self,id): """ E.g. MGI:nnnn --> MGI:MGI:nnnn """ if id is not None: for (k,v) in PREFIX_NORMALIZATION_MAP.items(): s = v+':' if id.startswith(s): return id.replace(s,k+':') return id
[docs] def translate_objs(self, d, fname, default=None): """ Translate a field whose value is expected to be a list """ if fname not in d: # TODO: consider adding arg for failure on null return default #lf = M.label_field(fname) v = d[fname] if not isinstance(v,list): v = [v] objs = [{'id': idval} for idval in v] # todo - labels return objs
[docs] def translate_obj(self,d,fname): """ Translate a field value from a solr document. This includes special logic for when the field value denotes an object, here we nest it """ if fname not in d: # TODO: consider adding arg for failure on null return None lf = M.label_field(fname) id = d[fname] id = self.make_canonical_identifier(id) #if id.startswith('MGI:MGI:'): # id = id.replace('MGI:MGI:','MGI:') obj = {'id': id} if id: if self._use_amigo_schema(self.object_category): iri = expand_uri(id) else: iri = expand_uri(id, [get_curie_map('{}/cypher/curies'.format(self.config.scigraph_data.url))]) obj['iri'] = iri if lf in d: obj['label'] = d[lf] cf = fname + "_category" if cf in d: obj['category'] = [d[cf]] if 'aspect' in d and id.startswith('GO:'): obj['category'] = [ASPECT_MAP[d['aspect']]] del d['aspect'] return obj
def map_doc(self, d, field_mapping, invert_subject_object=False): if field_mapping is not None: for (k,v) in field_mapping.items(): if v is not None and k is not None: #logger.debug("TESTING FOR:"+v+" IN "+str(d)) if v in d: #logger.debug("Setting field {} to {} // was in {}".format(k,d[v],v)) d[k] = d[v] if invert_subject_object: for field in INVERT_FIELDS_MAP: flip(d, field, INVERT_FIELDS_MAP[field]) return d
[docs] def translate_doc(self, d, field_mapping=None, map_identifiers=None, **kwargs): """ Translate a solr document (i.e. a single result row) """ if field_mapping is not None: self.map_doc(d, field_mapping) subject = self.translate_obj(d, M.SUBJECT) obj = self.translate_obj(d, M.OBJECT) # TODO: use a more robust method; we need equivalence as separate field in solr if map_identifiers is not None: if M.SUBJECT_CLOSURE in d: subject['id'] = self.map_id(subject, map_identifiers, d[M.SUBJECT_CLOSURE]) else: logger.info("NO SUBJECT CLOSURE IN: "+str(d)) if M.SUBJECT_TAXON in d: subject['taxon'] = self.translate_obj(d,M.SUBJECT_TAXON) if M.OBJECT_TAXON in d: obj['taxon'] = self.translate_obj(d, M.OBJECT_TAXON) qualifiers = [] if M.RELATION in d and isinstance(d[M.RELATION],list): # GO overloads qualifiers and relation relation = None for rel in d[M.RELATION]: if rel.lower() == 'not': qualifiers.append(rel) else: relation = rel if relation is not None: d[M.RELATION] = relation else: d[M.RELATION] = None negated = 'not' in qualifiers assoc = {'id':d.get(M.ID), 'subject': subject, 'object': obj, 'negated': negated, 'relation': self.translate_obj(d,M.RELATION), 'publications': self.translate_objs(d, M.SOURCE, []), # note 'source' is used in the golr schema } if self.invert_subject_object and assoc['relation'] is not None: assoc['relation']['inverse'] = True if len(qualifiers) > 0: assoc['qualifiers'] = qualifiers evidence_types = [] if M.EVIDENCE in d: evidence_label_map = json.loads(d[M.EVIDENCE_CLOSURE_MAP]) if self._use_amigo_schema(self.object_category): evidence_codes = [d[M.EVIDENCE]] else: evidence_codes = d[M.EVIDENCE] for evidence_code in evidence_codes: evidence_label = None if evidence_code in evidence_label_map: evidence_label = evidence_label_map[evidence_code] evidence_types.append({ 'id': evidence_code, 'label': evidence_label }) assoc['evidence_types'] = evidence_types if M.OBJECT_CLOSURE in d: assoc['object_closure'] = d.get(M.OBJECT_CLOSURE) if M.IS_DEFINED_BY in d: if isinstance(d[M.IS_DEFINED_BY],list): assoc['provided_by'] = d[M.IS_DEFINED_BY] else: # hack for GO Golr instance assoc['provided_by'] = [d[M.IS_DEFINED_BY]] # solr does not allow nested objects, so evidence graph is json-encoded if M.EVIDENCE_GRAPH in d: assoc[M.EVIDENCE_GRAPH] = json.loads(d[M.EVIDENCE_GRAPH]) if M.FREQUENCY in d: assoc[M.FREQUENCY] = { 'id': d[M.FREQUENCY] } if M.FREQUENCY_LABEL in d: assoc[M.FREQUENCY]['label'] = d[M.FREQUENCY_LABEL] if M.ONSET in d: assoc[M.ONSET] = { 'id': d[M.ONSET] } if M.ONSET_LABEL in d: assoc[M.ONSET]['label'] = d[M.ONSET_LABEL] if M.ASSOCIATION_TYPE in d: assoc['type'] = d[M.ASSOCIATION_TYPE] if self._use_amigo_schema(self.object_category): for f in M.AMIGO_SPECIFIC_FIELDS: if f in d: assoc[f] = d[f] return assoc
[docs] def translate_docs(self, ds, **kwargs): """ Translate a set of solr results """ for d in ds: self.map_doc(d, {}, self.invert_subject_object) return [self.translate_doc(d, **kwargs) for d in ds]
[docs] def translate_docs_compact(self, ds, field_mapping=None, slim=None, map_identifiers=None, invert_subject_object=False, **kwargs): """ Translate golr association documents to a compact representation """ amap = {} logger.info("Translating docs to compact form. Slim={}".format(slim)) for d in ds: self.map_doc(d, field_mapping, invert_subject_object=invert_subject_object) subject = d[M.SUBJECT] subject_label = d[M.SUBJECT_LABEL] # TODO: use a more robust method; we need equivalence as separate field in solr if map_identifiers is not None: if M.SUBJECT_CLOSURE in d: subject = self.map_id(subject, map_identifiers, d[M.SUBJECT_CLOSURE]) else: logger.debug("NO SUBJECT CLOSURE IN: "+str(d)) rel = d.get(M.RELATION) skip = False # TODO if rel == 'not' or rel == 'NOT': skip = True # this is a list in GO if isinstance(rel,list): if 'not' in rel or 'NOT' in rel: skip = True if len(rel) > 1: logger.warning(">1 relation: {}".format(rel)) rel = ";".join(rel) if skip: logger.debug("Skipping: {}".format(d)) continue subject = self.make_canonical_identifier(subject) #if subject.startswith('MGI:MGI:'): # subject = subject.replace('MGI:MGI:','MGI:') k = (subject,rel) if k not in amap: amap[k] = {'subject':subject, 'subject_label':subject_label, 'relation':rel, 'objects': []} if slim is not None and len(slim)>0: mapped_objects = [x for x in d[M.OBJECT_CLOSURE] if x in slim] logger.debug("Mapped objects: {}".format(mapped_objects)) amap[k]['objects'] += mapped_objects else: amap[k]['objects'].append(d[M.OBJECT]) for k in amap.keys(): amap[k]['objects'] = list(set(amap[k]['objects'])) return list(amap.values())
[docs] def map_id(self,id, prefix, closure_list): """ Map identifiers based on an equivalence closure list. """ prefixc = prefix + ':' ids = [eid for eid in closure_list if eid.startswith(prefixc)] # TODO: add option to fail if no mapping, or if >1 mapping if len(ids) == 0: # default to input return id return ids[0]
### This may quite possibly be a temporary code, but it looks a lot simpler than the above for more customizable Solr queries import requests from enum import Enum ## Should take those URLs from config.yaml class ESOLR(Enum): GOLR = "http://golr-aux.geneontology.io/solr/" MOLR = "https://solr.monarchinitiative.org/solr/search" class ESOLRDoc(Enum): ONTOLOGY = "ontology_class" ANNOTATION = "annotation" BIOENTITY = "bioentity" ## Respect the method name for run_sparql_on with enums def run_solr_on(solrInstance, category, id, fields): """ Return the result of a solr query on the given solrInstance (Enum ESOLR), for a certain document_category (ESOLRDoc) and id """ query = solrInstance.value + "select?q=*:*&fq=document_category:\"" + category.value + "\"&fq=id:\"" + id + "\"&fl=" + fields + "&wt=json&indent=on" response = requests.get(query) return response.json()['response']['docs'][0] def run_solr_text_on(solrInstance, category, q, qf, fields, optionals): """ Return the result of a solr query on the given solrInstance (Enum ESOLR), for a certain document_category (ESOLRDoc) and id """ if optionals == None: optionals = "" query = solrInstance.value + "select?q=" + q + "&qf=" + qf + "&fq=document_category:\"" + category.value + "\"&fl=" + fields + "&wt=json&indent=on" + optionals # print("QUERY: ", query) response = requests.get(query) return response.json()['response']['docs'] ### Those utility functions should find their place in a common utils.py if any exists ## Utility function to merge two field of a json def merge(json, firstField, secondField): """ merge two fields of a json into an array of { firstField : secondField } """ merged = [] for i in range(0, len(json[firstField])): merged.append({ json[firstField][i] : json[secondField][i] }) return merged ## Utility function to filter out two fields of a json and give it each a new label def mergeWithLabels(json, firstField, firstFieldLabel, secondField, secondFieldLabel): """ merge two fields of a json into an array of { firstFieldLabel : firstFieldLabel, secondFieldLabel : secondField } """ merged = [] for i in range(0, len(json[firstField])): merged.append({ firstFieldLabel : json[firstField][i], secondFieldLabel : json[secondField][i] }) return merged ## Utility function to replace in a specific <field> an <old> string by a <new> string def replace(json, field, old, new): for i in range(0, len(json)): if json[i][field]: json[i][field] = json[i][field].replace(old, new) return json