"""
A query wrapper for a Golr instance
Intended to work with:
* Monarch golr instance
* AmiGO/GO golr instance (including both GO and Planteome)
Conventions
-----------
Documents follow either entity or association patterns.
Associations
------------
Connects some kind of *subject* to an *object* via a *relation*, this
should be read as any RDF triple.
The subject may be a molecular biological entity such as a gene, or an
ontology class. The distinction between these two may be malleable.
The object is typically an ontology class, but not
always. E.g. gene-gene interactions or homology for exceptions.
An association also has evidence plus various provenance metadata.
In Monarch, the evidence is modeled as a graph encoded as a JSON blob;
In AmiGO, we follow the GAF data model, where evidence is assumed to be
simple and does not form chains; there is assumed to be one evidence
object for the intermediate entity.
### Entities
TODO
"""
import json
import logging
import re
import xml.etree.ElementTree as ET
from collections import OrderedDict
from dataclasses import asdict
from typing import Dict, List

import pysolr
import requests
from prefixcommons.curie_util import expand_uri

from ontobio import ecomap
from ontobio.model.GolrResults import SearchResults, AutocompleteResult, Highlight
from ontobio.util.curie_map import get_curie_map
from ontobio.util.user_agent import get_user_agent
from ontobio.vocabulary.relations import HomologyTypes
# Relation / closure field names used when querying the GO (AmiGO) schema.
INVOLVED_IN="involved_in"
ACTS_UPSTREAM_OF_OR_WITHIN="acts_upstream_of_or_within"
ISA_PARTOF_CLOSURE="isa_partof_closure"
REGULATES_CLOSURE="regulates_closure"
# Shared ECO mapping; used to translate the "IEA" code into its ECO class ID.
ecomapping = ecomap.EcoMap()
iea_eco = ecomapping.coderef_to_ecoclass("IEA")
logger = logging.getLogger(__name__)
class GolrFields:
    """
    Enumeration of fields in Golr.
    Note the Monarch golr schema is taken as canonical here
    """
    ID='id'
    ASSOCIATION_TYPE='association_type'
    SOURCE='source'
    OBJECT_CLOSURE='object_closure'
    SOURCE_CLOSURE_MAP='source_closure_map'
    SUBJECT_TAXON_CLOSURE_LABEL='subject_taxon_closure_label'
    OBJECT_TAXON_CLOSURE_LABEL = 'object_taxon_closure_label'
    SUBJECT_GENE_CLOSURE_MAP='subject_gene_closure_map'
    SUBJECT_TAXON_LABEL_SEARCHABLE='subject_taxon_label_searchable'
    OBJECT_TAXON_LABEL_SEARCHABLE = 'object_taxon_label_searchable'
    IS_DEFINED_BY='is_defined_by'
    SUBJECT_GENE_CLOSURE_LABEL='subject_gene_closure_label'
    SUBJECT_TAXON_CLOSURE='subject_taxon_closure'
    OBJECT_TAXON_CLOSURE = 'object_taxon_closure'
    OBJECT_LABEL='object_label'
    SUBJECT_CATEGORY='subject_category'
    SUBJECT_GENE_LABEL='subject_gene_label'
    SUBJECT_TAXON_CLOSURE_LABEL_SEARCHABLE='subject_taxon_closure_label_searchable'
    OBJECT_TAXON_CLOSURE_LABEL_SEARCHABLE = 'object_taxon_closure_label_searchable'
    SUBJECT_GENE_CLOSURE='subject_gene_closure'
    SUBJECT_GENE_LABEL_SEARCHABLE='subject_gene_label_searchable'
    OBJECT_GENE_LABEL_SEARCHABLE = 'object_gene_label_searchable'
    SUBJECT='subject'
    SUBJECT_LABEL='subject_label'
    SUBJECT_CLOSURE_LABEL_SEARCHABLE='subject_closure_label_searchable'
    OBJECT_CLOSURE_LABEL_SEARCHABLE='object_closure_label_searchable'
    OBJECT_CLOSURE_LABEL='object_closure_label'
    SUBJECT_CLOSURE_LABEL='subject_closure_label'
    SUBJECT_GENE='subject_gene'
    SUBJECT_TAXON='subject_taxon'
    OBJECT_TAXON = 'object_taxon'
    OBJECT_LABEL_SEARCHABLE='object_label_searchable'
    OBJECT_CATEGORY='object_category'
    SUBJECT_TAXON_CLOSURE_MAP='subject_taxon_closure_map'
    OBJECT_TAXON_CLOSURE_MAP = 'object_taxon_closure_map'
    QUALIFIER='qualifier'
    SUBJECT_TAXON_LABEL='subject_taxon_label'
    OBJECT_TAXON_LABEL = 'object_taxon_label'
    SUBJECT_CLOSURE_MAP='subject_closure_map'
    SUBJECT_ORTHOLOG_CLOSURE='subject_ortholog_closure'
    SUBJECT_CLOSURE='subject_closure'
    OBJECT='object'
    OBJECT_CLOSURE_MAP='object_closure_map'
    SUBJECT_LABEL_SEARCHABLE='subject_label_searchable'
    # evidence-related fields
    EVIDENCE_OBJECT='evidence_object'
    EVIDENCE_OBJECT_CLOSURE_MAP='evidence_object_closure_map'
    EVIDENCE_OBJECT_LABEL='evidence_object_label'
    EVIDENCE_OBJECT_CLOSURE='evidence_object_closure'
    EVIDENCE_OBJECT_CLOSURE_LABEL='evidence_object_closure_label'
    EVIDENCE='evidence'
    EVIDENCE_LABEL='evidence_label'
    EVIDENCE_CLOSURE_MAP = 'evidence_closure_map'
    EVIDENCE_GRAPH = 'evidence_graph'
    _VERSION_='_version_'
    SUBJECT_GENE_CLOSURE_LABEL_SEARCHABLE='subject_gene_closure_label_searchable'
    ASPECT='aspect'
    RELATION='relation'
    RELATION_LABEL='relation_label'
    FREQUENCY='frequency'
    FREQUENCY_LABEL='frequency_label'
    ONSET='onset'
    ONSET_LABEL='onset_label'

    # This is a temporary fix until
    # https://github.com/biolink/ontobio/issues/126 is resolved.
    # AmiGO specific fields
    AMIGO_SPECIFIC_FIELDS = [
        'reference',
        'qualifier',
        'is_redundant_for',
        'type',
        'evidence',
        'evidence_label',
        'evidence_type',
        'evidence_type_label',
        'evidence_with',
        'evidence_closure',
        'evidence_closure_label',
        'evidence_subset_closure',
        'evidence_subset_closure_label',
        'evidence_type_closure',
        'evidence_type_closure_label',
        'aspect'
    ]

    # golr convention: for any entity FOO, the id is denoted 'foo'
    # and the label FOO_label
    def label_field(self, f):
        """Return the golr label field name for entity field *f* (f + '_label')."""
        return f + "_label"

    # golr convention: for any class FOO, the id is denoted 'foo'
    # and the closure FOO_closure. Other closures may exist
    def closure_field(self, f):
        """Return the golr closure field name for class field *f* (f + '_closure')."""
        return f + "_closure"
# create an instance (used module-wide as a shorthand for GolrFields)
M=GolrFields()

# fields in the result docs that are to be inverted when 'invert_subject_object' is True
INVERT_FIELDS_MAP = {
    M.SUBJECT: M.OBJECT,
    M.SUBJECT_CLOSURE: M.OBJECT_CLOSURE,
    M.SUBJECT_TAXON: M.OBJECT_TAXON,
    M.SUBJECT_CLOSURE_LABEL: M.OBJECT_CLOSURE_LABEL,
    M.SUBJECT_TAXON_CLOSURE_LABEL: M.OBJECT_TAXON_CLOSURE_LABEL,
    M.SUBJECT_TAXON_LABEL_SEARCHABLE: M.OBJECT_TAXON_LABEL_SEARCHABLE,
    M.SUBJECT_TAXON_CLOSURE: M.OBJECT_TAXON_CLOSURE,
    M.SUBJECT_LABEL: M.OBJECT_LABEL,
    M.SUBJECT_TAXON_CLOSURE_LABEL_SEARCHABLE: M.OBJECT_TAXON_CLOSURE_LABEL_SEARCHABLE,
    M.SUBJECT_CLOSURE_LABEL_SEARCHABLE: M.OBJECT_CLOSURE_LABEL_SEARCHABLE,
    M.SUBJECT_LABEL_SEARCHABLE: M.OBJECT_LABEL_SEARCHABLE,
    M.SUBJECT_CATEGORY: M.OBJECT_CATEGORY,
    M.SUBJECT_TAXON_CLOSURE_MAP: M.OBJECT_TAXON_CLOSURE_MAP,
    M.SUBJECT_TAXON_LABEL: M.OBJECT_TAXON_LABEL,
    M.SUBJECT_CLOSURE_MAP: M.OBJECT_CLOSURE_MAP,
}

# single-letter GO aspect codes mapped to their long-form names
ASPECT_MAP = {
    'F': 'molecular_activity',
    'P': 'biological_process',
    'C': 'cellular_component'
}

# normalize to what Monarch uses
PREFIX_NORMALIZATION_MAP = {
    'MGI:MGI' : 'MGI',
    'FB' : 'FlyBase',
}
def flip(d, x, y):
    """Swap the values stored under keys *x* and *y* of dict *d*, in place.

    A missing key is treated as ``None``, so both keys exist afterwards.
    """
    d[x], d[y] = d.get(y), d.get(x)
def solr_quotify(v, operator="OR"):
    """
    Quote a value (or list of values) for use in a solr filter query.

    A single value is returned as a double-quoted phrase; a list is rendered
    as '("v1" OR "v2" ...)', with the boolean operator configurable.

    Embedded backslashes and double quotes are escaped so values cannot
    break out of the quoted phrase (resolves the former TODO).
    """
    if isinstance(v, list):
        if len(v) == 1:
            return solr_quotify(v[0], operator)
        else:
            # propagate the operator so nested lists are joined consistently
            joined = " {} ".format(operator).join(solr_quotify(x, operator) for x in v)
            return '({})'.format(joined)
    else:
        # escape backslashes first, then quotes
        escaped = str(v).replace('\\', '\\\\').replace('"', '\\"')
        return '"{}"'.format(escaped)
def translate_facet_field(fcs, invert_subject_object=False):
    """
    Translate solr facet_fields results into something easier to manipulate.

    Solr returns each facet field as a flat list
    [value1, count1, value2, count2, ..., valueN, countN]; this is converted
    to a dict {value1: count1, ..., valueN: countN}. Slightly more verbose
    over the wire, but far easier to consume.

    When invert_subject_object is True, subject-side facet names are swapped
    for their object-side counterparts (and vice versa).
    """
    if 'facet_fields' not in fcs:
        return {}
    translated = {}
    for facet_name, flat_counts in fcs['facet_fields'].items():
        if invert_subject_object:
            # rename the facet according to the inversion map, either direction
            for subj_field, obj_field in INVERT_FIELDS_MAP.items():
                if facet_name == subj_field:
                    facet_name = obj_field
                    break
                if facet_name == obj_field:
                    facet_name = subj_field
                    break
        # pair up the flat [v1, c1, v2, c2, ...] list into a dict
        translated[facet_name] = dict(zip(flat_counts[0::2], flat_counts[1::2]))
    return translated
### GO-SPECIFIC CODE
def goassoc_fieldmap(relationship_type=ACTS_UPSTREAM_OF_OR_WITHIN):
    """
    Returns a mapping of canonical monarch fields to amigo-golr.
    See: https://github.com/geneontology/amigo/blob/master/metadata/ann-config.yaml

    :param relationship_type: when ACTS_UPSTREAM_OF_OR_WITHIN, the broader
        regulates closure is used for the object; otherwise the is_a/part_of
        closure is used.
    :return: dict mapping monarch field names to amigo-golr field names
        (a value of None means the field has no amigo equivalent)
    """
    return {
        M.SUBJECT: 'bioentity',
        M.SUBJECT_CLOSURE: 'bioentity',
        ## In the GO AmiGO instance, the type field is not correctly populated
        ## See above in the code for hack that restores this for planteome instance
        ## M.SUBJECT_CATEGORY: 'type',
        M.SUBJECT_CATEGORY: None,
        M.SUBJECT_LABEL: 'bioentity_label',
        M.SUBJECT_TAXON: 'taxon',
        M.SUBJECT_TAXON_LABEL: 'taxon_label',
        M.SUBJECT_TAXON_CLOSURE: 'taxon_closure',
        M.RELATION: 'qualifier',
        M.OBJECT: 'annotation_class',
        # closure choice depends on the relationship being queried
        M.OBJECT_CLOSURE: REGULATES_CLOSURE if relationship_type == ACTS_UPSTREAM_OF_OR_WITHIN else ISA_PARTOF_CLOSURE,
        M.OBJECT_LABEL: 'annotation_class_label',
        M.OBJECT_TAXON: 'taxon',
        M.OBJECT_TAXON_LABEL: 'taxon_label',
        M.OBJECT_TAXON_CLOSURE: 'taxon_closure',
        M.OBJECT_CATEGORY: None,
        M.EVIDENCE_OBJECT_CLOSURE: 'evidence_subset_closure',
        M.IS_DEFINED_BY: 'assigned_by'
    }
def map_field(fn, m):
    """
    Map a field name *fn* through the mapping dict *m*.

    Returns the input field name unchanged when there is no mapping
    (m is None) or when the name is not present in the mapping.
    """
    if m is None:
        return fn
    return m.get(fn, fn)
### CLASSES
class GolrServer():
    """Placeholder abstraction for a Golr server; defines no behavior yet."""
    pass
class GolrAbstractQuery():
    """
    Shared behavior for Golr query controllers: lazy config access,
    pysolr client construction, and schema selection.

    NOTE(review): assumes subclasses set self.config and self.solr in
    their __init__ before these helpers are called — confirm.
    """
    def get_config(self):
        """Return the ontobio Config, loading the global default on first use."""
        if self.config is None:
            # deferred import to avoid a module-level import cycle
            from ontobio.config import Config, get_config
            self.config = get_config()
        return self.config

    def _set_solr(self, url, timeout=2):
        """Create (and remember) a pysolr client for the given endpoint."""
        self.solr = pysolr.Solr(url=url, timeout=timeout)
        return self.solr

    def _set_user_agent(self, user_agent):
        # pysolr exposes the underlying requests session; tag its headers
        self.solr.get_session().headers['User-Agent'] = user_agent

    def _use_amigo_schema(self, object_category):
        """
        Return True when queries should target the AmiGO (GO) schema:
        either the object category is 'function' or the configured
        default solr schema is 'amigo'.
        """
        if object_category is not None and object_category == 'function':
            return True
        ds = self.get_config().default_solr_schema
        if ds is not None and ds == 'amigo':
            return True
        return False
class GolrSearchQuery(GolrAbstractQuery):
    """
    Controller for monarch and go solr search cores
    Queries over a search document
    """
    def __init__(self,
                 term=None,
                 category=None,
                 is_go=False,
                 url=None,
                 solr=None,
                 config=None,
                 fq=None,
                 fq_string=None,
                 hl=True,
                 facet_fields=None,
                 facet=True,
                 search_fields=None,
                 taxon_map=True,
                 rows=100,
                 start=None,
                 prefix=None,
                 boost_fx=None,
                 boost_q=None,
                 highlight_class=None,
                 taxon=None,
                 min_match=None,
                 minimal_tokenizer=False,
                 include_eqs=False,
                 exclude_groups=False,
                 user_agent=None):
        """
        Build a search query controller.

        :param term: query string
        :param category: restricts the 'category' filter query
        :param is_go: when True, query the AmiGO search core instead of monarch
        :param url: explicit solr endpoint; overrides configured endpoints
        :param fq: dict of field->value(s) converted to solr fq parameters
        :param fq_string: raw fq parameter strings passed through unmodified
        :param prefix: list of CURIE prefixes to include ('-' prefix excludes)
        """
        self.term = term
        self.category = category
        self.is_go = is_go
        self.url = url
        self.solr = solr
        self.config = config
        self.hl = hl
        self.facet = facet
        self.facet_fields = facet_fields
        self.search_fields = search_fields
        self.taxon_map = taxon_map
        self.rows = rows
        self.start = start
        # test if client explicitly passes a URL; do not override
        self.is_explicit_url = url is not None
        # Raw fq param string
        self.fq_string = fq_string if fq_string is not None else []
        # fq as dictionary where key:values get converted
        # to fq="(key1:value1 OR key2:value2)"
        self.fq = fq if fq is not None else {}
        self.prefix = prefix
        self.boost_fx = boost_fx
        self.boost_q = boost_q
        self.highlight_class = highlight_class
        self.taxon = taxon
        self.min_match = min_match
        self.include_eqs = include_eqs
        self.exclude_groups = exclude_groups
        self.minimal_tokenizer = minimal_tokenizer
        # NOTE(review): relies on `requests` being imported at module level — confirm
        self.user_agent = get_user_agent(modules=[requests, pysolr], caller_name=__name__)
        if user_agent is not None:
            self.user_agent += " {}".format(user_agent)
        # default field boosts for the monarch search core
        if self.search_fields is None:
            self.search_fields = dict(id=3,
                                      label=2,
                                      synonym=1,
                                      definition=1,
                                      taxon_label=1,
                                      taxon_label_synonym=1,
                                      equivalent_curie=1)
        # pick the solr endpoint: explicit URL wins, else configured core
        if self.is_go:
            if self.url is None:
                endpoint = self.get_config().amigo_solr_search
                solr_config = {'url': endpoint.url, 'timeout': endpoint.timeout}
            else:
                solr_config = {'url': self.url, 'timeout': 2}
        else:
            if self.url is None:
                endpoint = self.get_config().solr_search
                solr_config = {'url': endpoint.url, 'timeout': endpoint.timeout}
            else:
                solr_config = {'url': self.url, 'timeout': 2}
        self._set_solr(**solr_config)
        self._set_user_agent(self.user_agent)
def update_solr_url(self, url, timeout=2):
self.url = url
solr_config = {'url': url, 'timeout': timeout}
self._set_solr(**solr_config)
self._set_user_agent(self.user_agent)
def solr_params(self, mode=None):
if self.facet_fields is None and self.facet:
self.facet_fields = ['category', 'taxon', 'taxon_label']
if self.category is not None:
self.fq['category'] = self.category
suffixes = ['std', 'kw', 'eng']
if self.is_go:
self.search_fields=dict(entity_label=3, general_blob=3)
self.hl = False
# TODO: formal mapping
if 'taxon_label' in self.facet_fields:
self.facet_fields.remove('taxon_label')
suffixes = ['searchable']
self.fq['document_category'] = "general"
qf = self._format_query_filter(self.search_fields, suffixes)
if mode == 'search':
# Decrease ngram weight and increase keyword and standard tokenizer
for field, weight in qf.items():
if '_kw' in field:
qf[field] += 2
elif '_std' in field:
qf[field] += 1
if self.term is not None and ":" in self.term:
qf["id_kw"] = 20
qf["equivalent_curie_kw"] = 20
if self.minimal_tokenizer:
# Split text using a minimal set of word boundaries
# useful for variants and genotypes where typical
# word boundaries are part of the nomenclature
tokens = re.split(r'[\s|\'\",]+', self.term)
if tokens[-1] == '':
del tokens[-1]
tokenized = "".join(['"{}"'.format(token) for token in tokens])
else:
# Solr will run through the Standard Tokenizer
tokenized = self.term
select_fields = ["*", "score"]
params = {
'q': '{0} "{1}"'.format(tokenized, self.term),
"qt": "standard",
'fl': ",".join(list(filter(None, select_fields))),
"defType": "edismax",
"qf": ["{}^{}".format(field, weight) for field, weight in qf.items()],
'rows': self.rows
}
if self.facet:
params['facet'] = 'on'
params['facet.field'] = self.facet_fields
params['facet.limit'] = 25
params['facet.mincount'] = 1
if self.taxon_map:
params["facet.pivot.mincount"] =1
params["facet.pivot"] = "taxon,taxon_label"
if self.start is not None:
params['start'] = self.start
if self.hl:
params['hl.simple.pre'] = "<em class=\"hilite\">"
params['hl.snippets'] = "1000"
params['hl'] = 'on'
if self.fq is not None:
filter_queries = ['{}:{}'.format(k,solr_quotify(v))
for (k,v) in self.fq.items()]
params['fq'] = filter_queries
else:
params['fq'] = []
for fq in self.fq_string:
params['fq'].append(fq)
if self.prefix is not None:
negative_filter = [p_filt[1:] for p_filt in self.prefix
if p_filt.startswith('-')]
positive_filter = [p_filt for p_filt in self.prefix
if not p_filt.startswith('-')]
if negative_filter:
if self.include_eqs:
single_filts = [
f'(-prefix:"{prefix}" OR -equivalent_curie:{prefix}\:*)'
for prefix in negative_filter
]
for filt in single_filts:
params['fq'].append(filt)
else:
neg_filter = '({})'.format(" OR ".join([filt for filt in negative_filter]))
params['fq'].append('-prefix:{}'.format(solr_quotify(negative_filter)))
if positive_filter:
if self.include_eqs:
# fq=((prefix:HP OR equivalent_curie:HP) OR (prefix:MONDO OR equivalent_curie:MONDO))
single_filts = [
f'(prefix:"{prefix}" OR equivalent_curie:{prefix}\:*)'
for prefix in positive_filter
]
pos_filter = '({})'.format(" OR ".join([filt for filt in single_filts]))
params['fq'].append(pos_filter)
else:
params['fq'].append('prefix:{}'.format(solr_quotify(positive_filter)))
if self.boost_fx is not None:
params['bf'] = []
for boost in self.boost_fx:
params['bf'].append(boost)
if self.boost_q is not None:
params['bq'] = []
for boost in self.boost_q:
params['bq'].append(boost)
if self.taxon is not None:
for tax in self.taxon:
params['fq'].append('taxon:"{}"'.format(tax))
if self.exclude_groups:
params['fq'].append('leaf:1')
if self.min_match is not None:
params['mm'] = self.min_match
if self.highlight_class is not None:
params['hl.simple.pre'] = \
'<em class=\"{}\">'.format(self.highlight_class)
return params
[docs] def search(self):
"""
Execute solr search query
"""
params = self.solr_params(mode='search')
logger.info("PARAMS=" + str(params))
results = self.solr.search(**params)
logger.info("Docs found: {}".format(results.hits))
return self._process_search_results(results)
[docs] def autocomplete(self):
"""
Execute solr autocomplete
"""
self.facet = False
params = self.solr_params()
logger.info("PARAMS=" + str(params))
results = self.solr.search(**params)
logger.info("Docs found: {}".format(results.hits))
return self._process_autocomplete_results(results)
    def _process_search_results(self,
                                results: pysolr.Results) -> SearchResults:
        """
        Convert solr docs to biolink object
        :param results: pysolr.Results
        :return: model.GolrResults.SearchResults
        """
        # map go-golr fields to standard
        for doc in results.docs:
            if 'entity' in doc:
                doc['id'] = doc['entity']
                doc['label'] = doc['entity_label']
        translated_facets = translate_facet_field(results.facets)
        # inject the taxon map (aka a facet pivot) into the returned facets
        if self.taxon_map:
            # NOTE(review): assumes the query requested the taxon,taxon_label
            # facet pivot (facet=True path) — raises KeyError otherwise; confirm
            translated_facets['_taxon_map'] = [
                {
                    'id': taxon['value'],
                    'label': taxon['pivot'][0]['value'],
                    'count': taxon['pivot'][0]['count']
                }
                for taxon in results.facets['facet_pivot']['taxon,taxon_label']
            ]
        # one Highlight per doc, keyed by id (empty when solr returned no highlighting)
        highlighting = {
            doc['id']: asdict(self._process_highlight(results, doc))
            for doc in results.docs if results.highlighting
        }
        payload = SearchResults(
            facet_counts=translated_facets,
            highlighting=highlighting,
            docs=results.docs,
            numFound=results.hits
        )
        logger.debug('Docs: {}'.format(len(results.docs)))
        return payload
def _process_autocomplete_results(
self,
results: pysolr.Results) -> Dict[str, List[AutocompleteResult]]:
"""
Convert results to biolink autocomplete object
:param results: pysolr.Results
:return: {'docs': List[AutocompleteResult]}
"""
# map go-golr fields to standard
for doc in results.docs:
if 'entity' in doc:
doc['id'] = doc['entity']
doc['label'] = doc['entity_label']
docs = []
for doc in results.docs:
if results.highlighting:
hl = self._process_highlight(results, doc)
else:
hl = Highlight(None, None, None)
# In some cases a node does not have a category
category = doc['category'] if 'category' in doc else []
doc['taxon'] = doc['taxon'] if 'taxon' in doc else ""
doc['taxon_label'] = doc['taxon_label'] if 'taxon_label' in doc else ""
doc['equivalent_curie'] = doc['equivalent_curie'] if 'equivalent_curie' in doc else []
doc = AutocompleteResult(
id=doc['id'],
label=doc['label'],
match=hl.match,
category=category,
taxon=doc['taxon'],
taxon_label=doc['taxon_label'],
highlight=hl.highlight,
has_highlight=hl.has_highlight,
equivalent_ids=doc['equivalent_curie']
)
docs.append(doc)
payload = {
'docs': docs
}
logger.debug('Docs: {}'.format(len(results.docs)))
return payload
    def _process_highlight(self, results: pysolr.Results, doc) -> Highlight:
        """
        Build a Highlight object for one doc from solr highlighting data.

        Matches on the primary label field are preferred over other fields.
        Falls back to the raw document label when the highlight markup
        cannot be parsed as XML.
        """
        hl = results.highlighting[doc['id']]
        highlights = []
        primary_label_matches = []  # Store all primary label
        for field, hl_list in hl.items():
            if field.startswith('label'):
                primary_label_matches.extend(hl_list)
            highlights.extend(hl_list)
        # If we've matched on the primary label, get the longest
        # from the list, else use other fields
        if primary_label_matches:
            highlights = primary_label_matches
        try:
            highlight = Highlight(
                highlight=self._get_longest_hl(highlights),
                match=self._hl_as_string(self._get_longest_hl(highlights)),
                has_highlight=True
            )
        except ET.ParseError:
            # malformed highlight markup: fall back to the document's own label
            highlight = Highlight(
                highlight=doc['label'][0],
                match=doc['label'][0],
                has_highlight=False
            )
        return highlight
@staticmethod
def _format_query_filter(search_fields, suffixes):
qf = {}
for (field, relevancy) in search_fields.items():
for suffix in suffixes:
field_filter = "{}_{}".format(field, suffix)
qf[field_filter] = relevancy
return qf
def _get_longest_hl(self, highlights):
"""
Given a list of highlighted text, returns the
longest highlight
For example:
[
"<em>Muscle</em> <em>atrophy</em>, generalized",
"Generalized <em>muscle</em> degeneration",
"Diffuse skeletal <em>">muscle</em> wasting"
]
and returns:
<em>Muscle</em> <em>atrophy</em>, generalized
If there are mutliple matches of the same length, returns
the top (arbitrary) highlight
:return:
"""
len_dict = OrderedDict()
for hl in highlights:
# dummy tags to make it valid xml
dummy_xml = "<p>" + hl + "</p>"
try:
element_tree = ET.fromstring(dummy_xml)
hl_length = 0
for emph in element_tree.findall('em'):
hl_length += len(emph.text)
len_dict[hl] = hl_length
except ET.ParseError:
raise ET.ParseError
return max(len_dict, key=len_dict.get)
def _hl_as_string(self, highlight):
"""
Given a solr string of highlighted text, returns the
str representations
For example:
"Foo <em>Muscle</em> bar <em>atrophy</em>, generalized"
Returns:
"Foo Muscle bar atrophy, generalized"
:return: str
"""
# dummy tags to make it valid xml
dummy_xml = "<p>" + highlight + "</p>"
try:
element_tree = ET.fromstring(dummy_xml)
except ET.ParseError:
raise ET.ParseError
return "".join(list(element_tree.itertext()))
class GolrLayPersonSearch(GolrSearchQuery):
    """
    Controller for the HPO lay person index,
    see https://github.com/monarch-initiative/hpo-plain-index
    """
    def __init__(self, term=None, **kwargs):
        super().__init__(term, **kwargs)
        # faceting is never used against the lay-person index
        self.facet = False
        endpoint = self.get_config().lay_person_search
        self._set_solr(endpoint.url, endpoint.timeout)
        self._set_user_agent(self.user_agent)

    def set_lay_params(self):
        """Build solr params, swapping in the lay-person synonym field weights."""
        params = self.solr_params()
        weights = self._get_default_weights(['std', 'kw', 'eng'])
        params['qf'] = ["{}^{}".format(name, boost) for name, boost in weights.items()]
        return params

    def autocomplete(self):
        """
        Execute solr query for autocomplete
        """
        params = self.set_lay_params()
        logger.info("PARAMS="+str(params))
        results = self.solr.search(**params)
        logger.info("Docs found: {}".format(results.hits))
        return self._process_layperson_results(results)

    def _process_layperson_results(self, results):
        """
        Convert pysolr.Results into the lay-person autocomplete payload.
        :param results: pysolr.Results
        :return: {'results': [{id, highlight, label, matched_synonym}, ...]}
        """
        matches = []
        for doc in results.docs:
            hl = self._process_highlight(results, doc)
            matches.append({
                'id': doc['id'],
                'highlight': hl.highlight,
                'label': doc['label'],
                'matched_synonym': hl.match
            })
        logger.debug('Docs: {}'.format(len(results.docs)))
        return {'results': matches}

    @staticmethod
    def _get_default_weights(suffixes):
        """
        Default relevancy boosts for the plain-language synonym fields.
        :param suffixes: list of suffixes (eng (ngram), std,)
        :return: dict of field_suffix -> boost
        """
        synonym_boosts = {
            "exact_synonym": "5",
            "related_synonym": "2",
            "broad_synonym": "1",
            "narrow_synonym": "3"
        }
        return GolrLayPersonSearch._format_query_filter(synonym_boosts, suffixes)
class GolrAssociationQuery(GolrAbstractQuery):
    """
    A Query object providing a higher level of abstraction over either GO or Monarch Solr indexes
    Fields
    ------
    All of these can be set when creating a new object
    fetch_objects : bool
        we frequently want a list of distinct association objects (in
        the RDF sense). for example, when querying for all phenotype
        associations for a gene, it is convenient to get a list of
        distinct phenotype terms. Although this can be obtained by
        iterating over the list of associations, it can be expensive
        to obtain all associations.
        Results are in the 'objects' field
    fetch_subjects : bool
        This is the analog of the fetch_objects field. Note that due
        to an inherent asymmetry by which the list of subjects can be
        very large (e.g. all genes in all species for "metabolic
        process" or "metabolic phenotype") it's necessary to combine
        this with subject_category and subject_taxon filters
        Results are in the 'subjects' field
    slim : List
        a list of either class ids (or in future subset ids), used to
        map up (slim) objects in associations. This will populate
        an additional 'slim' field in each association object corresponding
        to the slimmed-up value(s) from the direct objects.
        If fetch_objects is passed, this will be populated with slimmed IDs.
    evidence: String
        Evidence class from ECO. Inference is used.
    exclude_automatic_assertions : bool
        If true, then any annotations with ECO evidence code for IEA or
        subclasses will be excluded.
    use_compact_associations : bool
        If true, then the associations list will be false, instead
        compact_associations contains a more compact representation
        consisting of objects with (subject, relation and objects)
    config : Config
        See :ref:`Config` for details. The config object can be used
        to set values for the solr instance to be queried
    TODO - Extract params into their own object
    """
    def __init__(self,
                 subject_category=None,
                 object_category=None,
                 relation=None,
                 relationship_type=None,
                 subject_or_object_ids=None,
                 subject_or_object_category=None,
                 subject=None,
                 subjects=None,
                 object=None,
                 objects=None,
                 subject_direct=False,
                 object_direct=False,
                 subject_taxon=None,
                 subject_taxon_direct=False,
                 object_taxon=None,
                 object_taxon_direct=False,
                 invert_subject_object=None,
                 evidence=None,
                 exclude_automatic_assertions=False,
                 q=None,
                 id=None,
                 use_compact_associations=False,
                 include_raw=False,
                 field_mapping=None,
                 solr=None,
                 config=None,
                 url=None,
                 select_fields=None,
                 fetch_objects=False,
                 fetch_subjects=False,
                 fq=None,
                 slim=None,
                 json_facet=None,
                 iterate=False,
                 map_identifiers=None,
                 facet_fields=None,
                 facet_field_limits=None,
                 facet_limit=25,
                 facet_mincount=1,
                 facet_pivot_fields=None,
                 stats=False,
                 stats_field=None,
                 facet=True,
                 pivot_subject_object=False,
                 unselect_evidence=False,
                 rows=10,
                 start=None,
                 homology_type=None,
                 non_null_fields=None,
                 user_agent=None,
                 association_type=None,
                 sort=None,
                 **kwargs):
        """Fetch a set of association objects based on a query.
        """
        # NOTE(review): extra **kwargs are accepted and silently ignored — confirm intended
        self.subject_category = subject_category
        self.object_category = object_category
        self.relation = relation
        self.relationship_type = relationship_type
        self.subject_or_object_ids = subject_or_object_ids
        self.subject_or_object_category = subject_or_object_category
        self.subject = subject
        self.subjects = subjects
        self.subject_direct = subject_direct
        self.object = object
        self.objects = objects
        self.object_direct = object_direct
        self.subject_taxon = subject_taxon
        self.subject_taxon_direct = subject_taxon_direct
        self.object_taxon = object_taxon
        self.object_taxon_direct = object_taxon_direct
        self.invert_subject_object = invert_subject_object
        self.evidence = evidence
        self.exclude_automatic_assertions = exclude_automatic_assertions
        self.id = id
        self.q = q
        self.use_compact_associations = use_compact_associations
        self.include_raw = include_raw
        self.field_mapping = field_mapping
        self.solr = solr
        self.config = config
        self.select_fields = select_fields
        self.fetch_objects = fetch_objects
        self.fetch_subjects = fetch_subjects
        self.fq = fq if fq is not None else {}
        self.slim = slim if slim is not None else []
        self.json_facet = json_facet
        self.iterate = iterate
        self.map_identifiers = map_identifiers
        self.facet_fields = facet_fields
        self.facet_field_limits = facet_field_limits
        self.facet_limit = facet_limit
        self.facet_mincount = facet_mincount
        self.facet_pivot_fields = facet_pivot_fields
        self.stats = stats
        self.stats_field = stats_field
        self.facet = facet
        self.pivot_subject_object = pivot_subject_object
        self.unselect_evidence = unselect_evidence
        # hard cap on rows fetched in a single request
        self.max_rows = 100000
        self.rows = rows
        self.start = start
        self.homology_type = homology_type
        self.url = url
        # test if client explicitly passes a URL; do not override
        self.is_explicit_url = url is not None
        self.non_null_fields = non_null_fields
        self.association_type = association_type
        self.sort = sort
        # NOTE(review): relies on `requests` being imported at module level — confirm
        self.user_agent = get_user_agent(modules=[requests, pysolr], caller_name=__name__)
        if user_agent is not None:
            self.user_agent += " {}".format(user_agent)
        if self.facet_pivot_fields is None:
            self.facet_pivot_fields = []
        if self.non_null_fields is None:
            self.non_null_fields = []
        if self.facet_fields is None:
            if self.facet:
                self.facet_fields = [
                    M.SUBJECT_TAXON,
                    M.SUBJECT_TAXON_LABEL,
                    M.OBJECT_CLOSURE
                ]
        if self.sort is None and not self._use_amigo_schema(object_category):
            # Make default descending by count of publications for monarch
            self.sort = 'source_count desc'
        # NOTE(review): when a solr client is passed in, the user agent header
        # is not applied to it — confirm callers set it themselves
        if self.solr is None:
            if self.url is None:
                endpoint = self.get_config().solr_assocs
                solr_config = {'url': endpoint.url, 'timeout': endpoint.timeout}
            else:
                solr_config = {'url': self.url, 'timeout': 5}
            self.update_solr_url(**solr_config)
def update_solr_url(self, url, timeout=2):
self.url = url
solr_config = {'url': url, 'timeout': timeout}
self._set_solr(**solr_config)
self._set_user_agent(self.user_agent)
    def adjust(self):
        """Hook for subclasses to adjust query state before execution; no-op here."""
        pass
def solr_params(self):
    """
    Generate HTTP parameters for passing to Solr.

    In general you should not need to call this directly; calling exec() on a
    query object will transparently perform this step for you.

    Returns
    -------
    dict
        Keyword parameters suitable for the solr client's ``search`` call.

    Side effects: when the AmiGO schema is in use this re-points the solr
    client at the AmiGO endpoint and sets ``self.field_mapping``; it may also
    set ``self.invert_subject_object`` when it was left unspecified.
    """
    ## Main query params for solr
    fq = self.fq
    if fq is None:
        fq = {}
    logger.info("TEMPx FQ={}".format(fq))

    # subject_or_object_ids is a list of identifiers that can be matched to
    # either subjects or objects
    subject_or_object_ids = self.subject_or_object_ids
    if subject_or_object_ids is not None:
        subject_or_object_ids = [self.make_canonical_identifier(c) for c in subject_or_object_ids]

    # canonical form for MGI is a CURIE MGI:nnnn
    subject = self.subject
    if subject is not None:
        subject = self.make_canonical_identifier(subject)
    subjects = self.subjects
    if subjects is not None:
        subjects = [self.make_canonical_identifier(s) for s in subjects]
    subject_direct = self.subject_direct

    # temporary: for querying go solr, map fields. TODO
    object_category = self.object_category
    logger.info("Object category: {}".format(object_category))
    object = self.object
    objects = self.objects
    object_direct = self.object_direct
    if object_category is None and object is not None and object.startswith('GO:'):
        # Infer category
        object_category = 'function'
        logger.info("Inferring Object category: {} from {}".
                    format(object_category, object))

    # URL to use for querying solr
    if self._use_amigo_schema(object_category):
        # Override solr config and use go solr
        endpoint = self.get_config().amigo_solr_assocs
        self.update_solr_url(url=endpoint.url, timeout=endpoint.timeout)
        self.field_mapping = goassoc_fieldmap(self.relationship_type)
        # awkward hack: we want to avoid typing on the amigo golr gene field,
        # UNLESS this is a planteome golr
        if "planteome" in self.get_config().amigo_solr_assocs.url:
            self.field_mapping[M.SUBJECT_CATEGORY] = 'type'
        fq['document_category'] = 'annotation'
        if subject is not None:
            subject = self.make_gostyle_identifier(subject)
        if subjects is not None:
            subjects = [self.make_gostyle_identifier(s) for s in subjects]
        # the AmiGO schema lacks an object_category field;
        # we could use the 'aspect' field but instead we use a mapping of
        # the category to a root class
        if object_category is not None:
            cc = self.get_config().get_category_class(object_category)
            if cc is not None and object is None:
                object = cc

    ## subject params
    subject_taxon = self.subject_taxon
    subject_taxon_direct = self.subject_taxon_direct
    subject_category = self.subject_category

    # heuristic procedure to guess unspecified subject_category
    if subject_category is None and subject is not None:
        subject_category = self.infer_category(subject)

    if subject_category is not None and subject_category == 'disease':
        if subject_taxon is not None and subject_taxon == 'NCBITaxon:9606':
            logger.info("Unsetting taxon, until indexed correctly")
            subject_taxon = None

    if self.invert_subject_object is None:
        # TODO: consider placing in a separate lookup
        p = (subject_category, object_category)
        if p == ('disease', 'gene') or p == ('disease', 'model'):
            self.invert_subject_object = True
        else:
            self.invert_subject_object = False
        if self.invert_subject_object:
            logger.info("Inferred that subject/object should be inverted for {}".format(p))

    ## taxon of object of triple
    object_taxon = self.object_taxon
    object_taxon_direct = self.object_taxon_direct

    # typically information is stored one-way, e.g. model-disease;
    # sometimes we want associations from perspective of object
    if self.invert_subject_object:
        (subject, object) = (object, subject)
        (subject_category, object_category) = (object_category, subject_category)
        (subject_taxon, object_taxon) = (object_taxon, subject_taxon)
        (object_direct, subject_direct) = (subject_direct, object_direct)
        (object_taxon_direct, subject_taxon_direct) = (subject_taxon_direct, object_taxon_direct)

    ## facet fields
    facet_fields = self.facet_fields
    facet = self.facet
    facet_limit = self.facet_limit
    select_fields = self.select_fields

    if self.use_compact_associations:
        # compact mode: no facets, minimal field list
        facet_fields = []
        facet = False
        facet_limit = 0
        select_fields = [
            M.SUBJECT,
            M.SUBJECT_LABEL,
            M.RELATION,
            M.OBJECT]

    if subject_category is not None:
        fq['subject_category'] = subject_category
    if object_category is not None:
        fq['object_category'] = object_category

    if subject is not None:
        # note: by including subject closure by default,
        # we automatically get equivalent nodes
        if subject_direct:
            fq['subject_eq'] = subject
        else:
            fq['subject_closure'] = subject
    if subjects is not None:
        # lists are assumed to be disjunctive
        if subject_direct:
            fq['subject'] = subjects
        else:
            fq['subject_closure'] = subjects

    if object is not None:
        if object_direct:
            fq['object_eq'] = object
        else:
            fq['object_closure'] = object
    if objects is not None:
        # lists are assumed to be disjunctive.
        # BUGFIX: previously both branches wrote 'object_eq', and a duplicate
        # follow-up block unconditionally overwrote 'object_closure'; this now
        # mirrors the singular-object logic above.
        if object_direct:
            fq['object_eq'] = objects
        else:
            fq['object_closure'] = objects

    relation = self.relation
    if relation is not None:
        fq['relation_closure'] = relation

    if subject_taxon is not None:
        if subject_taxon_direct:
            fq['subject_taxon'] = subject_taxon
        else:
            fq['subject_taxon_closure'] = subject_taxon
    if object_taxon is not None:
        if object_taxon_direct:
            fq['object_taxon'] = object_taxon
        else:
            fq['object_taxon_closure'] = object_taxon

    if self.id is not None:
        fq['id'] = self.id
    if self.evidence is not None:
        # a leading "-" negates the evidence filter
        e = self.evidence
        if e.startswith("-"):
            fq['-evidence_object_closure'] = e.replace("-", "")
        else:
            fq['evidence_object_closure'] = e

    if self.exclude_automatic_assertions:
        # IEA = inferred from electronic annotation
        fq['-evidence_object_closure'] = iea_eco

    # Homolog service params
    # TODO can we sync with argparse.choices?
    if self.homology_type is not None:
        if self.homology_type == 'O':
            fq['relation_closure'] = HomologyTypes.Ortholog.value
        elif self.homology_type == 'P':
            fq['relation_closure'] = HomologyTypes.Paralog.value
        elif self.homology_type == 'LDO':
            fq['relation_closure'] = \
                HomologyTypes.LeastDivergedOrtholog.value

    ## Association type, monarch only
    if self.association_type is not None:
        fq['association_type'] = self.association_type

    ## pivots
    facet_pivot_fields = self.facet_pivot_fields
    if self.pivot_subject_object:
        facet_pivot_fields = [M.SUBJECT, M.OBJECT]

    # Map solr field names for fq. The generic Monarch schema is
    # canonical, GO schema is mapped to this using
    # field_mapping dictionary
    if self.field_mapping is not None:
        for (k, v) in self.field_mapping.items():
            # map fq[k] -> fq[v] (a None value means "drop the filter")
            if k in fq:
                if v is None:
                    del fq[k]
                else:
                    fq[v] = fq[k]
                    del fq[k]
            # in solr, the fq field can be
            # a negated expression, e.g. -evidence_object_closure:"ECO:0000501"
            # ideally we would have a higher level representation rather than
            # relying on string munging...
            negk = '-' + k
            if negk in fq:
                if v is None:
                    del fq[negk]
                else:
                    negv = '-' + v
                    fq[negv] = fq[negk]
                    del fq[negk]

    qstr = "*:*"
    if self.q is not None:
        qstr = self.q
    filter_queries = ['{}:{}'.format(k, solr_quotify(v)) for (k, v) in fq.items()]

    # We want to match all associations that have either a subject or object
    # with an ID that is contained in subject_or_object_ids.
    if subject_or_object_ids is not None:
        quotified_ids = solr_quotify(subject_or_object_ids)
        subject_id_filter = '{}:{}'.format('subject_closure', quotified_ids)
        object_id_filter = '{}:{}'.format('object_closure', quotified_ids)

        # If subject_or_object_category is provided, we add it to the filter.
        if self.subject_or_object_category is not None:
            quotified_categories = solr_quotify(self.subject_or_object_category)
            subject_category_filter = '{}:{}'.format('subject_category', quotified_categories)
            object_category_filter = '{}:{}'.format('object_category', quotified_categories)

            filter_queries.append(
                '(' + subject_id_filter + ' AND ' + object_category_filter + ')' \
                ' OR ' \
                '(' + object_id_filter + ' AND ' + subject_category_filter + ')'
            )
        else:
            filter_queries.append(subject_id_filter + ' OR ' + object_id_filter)

    # unless caller specifies a field list, use default
    if select_fields is None:
        select_fields = [
            M.ID,
            M.IS_DEFINED_BY,
            M.SOURCE,
            M.SUBJECT,
            M.SUBJECT_LABEL,
            M.SUBJECT_TAXON,
            M.SUBJECT_TAXON_LABEL,
            M.RELATION,
            M.RELATION_LABEL,
            M.OBJECT,
            M.OBJECT_LABEL,
            M.OBJECT_TAXON,
            M.OBJECT_TAXON_LABEL,
            M.EVIDENCE,
            M.EVIDENCE_CLOSURE_MAP,
            M.FREQUENCY,
            M.FREQUENCY_LABEL,
            M.ONSET,
            M.ONSET_LABEL
        ]
        if not self.unselect_evidence:
            select_fields += [
                M.EVIDENCE_GRAPH
            ]

    if not self._use_amigo_schema(object_category):
        # category fields exist only in the Monarch schema
        select_fields.append(M.SUBJECT_CATEGORY)
        select_fields.append(M.OBJECT_CATEGORY)

    if self.map_identifiers is not None:
        # we need the closure to re-map IDs onto the requested prefix
        select_fields.append(M.SUBJECT_CLOSURE)
    if self.slim is not None and len(self.slim) > 0:
        select_fields.append(M.OBJECT_CLOSURE)

    if self.field_mapping is not None:
        logger.info("Applying field mapping to SELECT: {}".format(self.field_mapping))
        select_fields = [map_field(fn, self.field_mapping) for fn in select_fields]
        if facet_pivot_fields is not None:
            logger.info("Applying field mapping to PIV: {}".format(facet_pivot_fields))
            facet_pivot_fields = [map_field(fn, self.field_mapping) for fn in facet_pivot_fields]
            logger.info("APPLIED field mapping to PIV: {}".format(facet_pivot_fields))
        if facet_fields:
            facet_fields = [map_field(fn, self.field_mapping) for fn in facet_fields]

    if self._use_amigo_schema(object_category):
        select_fields += [x for x in M.AMIGO_SPECIFIC_FIELDS if x not in select_fields]

    ## true if we iterate in windows of max_size until all results found
    iterate = self.iterate
    rows = self.rows
    if rows < 0:
        # negative rows means "unlimited": page through in windows of max_rows.
        # NOTE(review): only the *local* iterate flag is set here; exec()
        # re-reads self.iterate, so rows=-1 does not by itself trigger
        # iteration there — confirm intended behavior.
        iterate = True
        rows = self.max_rows

    # restrict to documents where the listed fields are populated
    for field in self.non_null_fields:
        filter_queries.append(field + ":['' TO *]")

    search_fields = None
    if self.q is not None and not self._use_amigo_schema(object_category):
        # edismax query fields used for free-text search on the Monarch schema
        search_fields = [
            M.SUBJECT_LABEL_SEARCHABLE,
            M.OBJECT_LABEL_SEARCHABLE,
            M.SUBJECT_TAXON_LABEL_SEARCHABLE,
            M.OBJECT_TAXON_LABEL_SEARCHABLE,
            M.SUBJECT_GENE_LABEL_SEARCHABLE,
            M.OBJECT_GENE_LABEL_SEARCHABLE,
        ]

    params = {
        'q': qstr,
        'fq': filter_queries,
        'facet': 'on' if facet else 'off',
        'facet.field': facet_fields if facet_fields else [],
        'facet.limit': facet_limit,
        'facet.mincount': self.facet_mincount,
        'fl': ",".join(list(filter(None, select_fields))),
        'rows': rows,
        "defType": "edismax"
    }
    if self.start is not None:
        params['start'] = self.start

    json_facet = self.json_facet
    if json_facet:
        params['json.facet'] = json.dumps(json_facet)

    facet_field_limits = self.facet_field_limits
    if facet_field_limits is not None:
        # per-field facet limit override: f.<field>.facet.limit
        for (f, flim) in facet_field_limits.items():
            params["f." + f + ".facet.limit"] = flim

    if len(facet_pivot_fields) > 0:
        params['facet.pivot'] = ",".join(facet_pivot_fields)
        params['facet.pivot.mincount'] = 1
    if self.stats_field:
        self.stats = True
        params['stats.field'] = self.stats_field
    params['stats'] = json.dumps(self.stats)
    if self.sort is not None:
        params['sort'] = self.sort
    if search_fields:
        params['qf'] = search_fields
    return params
def exec(self, **kwargs):
    """
    Execute solr query

    Result object is a dict with the following keys:

    - raw
    - associations : list
    - compact_associations : list
    - facet_counts
    - facet_pivot
    """
    params = self.solr_params()
    logger.info("PARAMS="+str(params))
    results = self.solr.search(**params)
    n_docs = len(results.docs)
    logger.info("Docs found: {}".format(results.hits))

    if self.iterate:
        # page through the full result set in windows of self.rows;
        # stop once a page comes back smaller than the window
        docs = results.docs
        start = n_docs
        while n_docs >= self.rows:
            logger.info("Iterating; start={}".format(start))
            next_results = self.solr.search(start=start, **params)
            next_docs = next_results.docs
            n_docs = len(next_docs)
            docs += next_docs
            start += self.rows
        results.docs = docs

    fcs = results.facets

    payload = {
        'facet_counts': translate_facet_field(fcs, self.invert_subject_object),
        'pagination': {},
        'numFound': results.hits
    }

    include_raw = self.include_raw
    if include_raw:
        # note: this is not JSON serializable, do not send via REST
        payload['raw'] = results

    # TODO - check if truncated
    logger.info("COMPACT={} INV={}".format(self.use_compact_associations, self.invert_subject_object))
    if self.use_compact_associations:
        payload['compact_associations'] = self.translate_docs_compact(results.docs, field_mapping=self.field_mapping,
                                                                      slim=self.slim, invert_subject_object=self.invert_subject_object,
                                                                      map_identifiers=self.map_identifiers, **kwargs)
    else:
        payload['associations'] = self.translate_docs(results.docs, field_mapping=self.field_mapping, map_identifiers=self.map_identifiers, **kwargs)

    if 'facet_pivot' in fcs:
        payload['facet_pivot'] = fcs['facet_pivot']
    if 'facets' in results.raw_response:
        payload['facets'] = results.raw_response['facets']

    # For solr, we implement this by finding all facets
    # TODO: no need to do 2nd query, see https://wiki.apache.org/solr/SimpleFacetParameters#Parameters
    fetch_objects = self.fetch_objects
    if fetch_objects:
        # second query: facet on the object field only (rows=0) to collect
        # the distinct set of object IDs matching the same filters
        core_object_field = M.OBJECT
        if self.slim is not None and len(self.slim)>0:
            core_object_field = M.OBJECT_CLOSURE
        object_field = map_field(core_object_field, self.field_mapping)
        if self.invert_subject_object:
            object_field = map_field(M.SUBJECT, self.field_mapping)
        oq_params = params.copy()
        oq_params['fl'] = []
        oq_params['facet.field'] = [object_field]
        oq_params['facet.limit'] = -1  # -1 = no limit on facet values
        oq_params['rows'] = 0
        oq_params['facet.mincount'] = 1
        oq_results = self.solr.search(**oq_params)
        if self.facet:
            ff = oq_results.facets['facet_fields']
            ofl = ff.get(object_field)
            # solr returns facets counts as list, every 2nd element is number, we don't need the numbers here
            payload['objects'] = ofl[0::2]

    fetch_subjects = self.fetch_subjects
    if fetch_subjects:
        # analogous second query for the distinct subject IDs; note the
        # facet limit is max_rows here, so the list can be truncated
        core_subject_field = M.SUBJECT
        if self.slim is not None and len(self.slim)>0:
            core_subject_field = M.SUBJECT_CLOSURE
        subject_field = map_field(core_subject_field, self.field_mapping)
        if self.invert_subject_object:
            subject_field = map_field(M.SUBJECT, self.field_mapping)
        oq_params = params.copy()
        oq_params['fl'] = []
        oq_params['facet.field'] = [subject_field]
        oq_params['facet.limit'] = self.max_rows
        oq_params['rows'] = 0
        oq_params['facet.mincount'] = 1
        oq_results = self.solr.search(**oq_params)
        if self.facet:
            ff = oq_results.facets['facet_fields']
            ofl = ff.get(subject_field)
            # solr returns facets counts as list, every 2nd element is number, we don't need the numbers here
            payload['subjects'] = ofl[0::2]
            if len(payload['subjects']) == self.max_rows:
                payload['is_truncated'] = True

    if self.slim is not None and len(self.slim)>0:
        # restrict objects to the slim set; each association's closure is
        # replaced by a 'slim' list (object_closure is removed in place)
        if 'objects' in payload:
            payload['objects'] = [x for x in payload['objects'] if x in self.slim]
        if 'associations' in payload:
            for a in payload['associations']:
                a['slim'] = [x for x in a['object_closure'] if x in self.slim]
                del a['object_closure']

    return payload
def infer_category(self, id):
    """
    heuristic to infer a category from an id, e.g. DOID:nnn --> disease
    """
    logger.info("Attempting category inference on id={}".format(id))
    prefix = id.split(":")[0]
    category = 'disease' if prefix == 'DOID' else None
    if category is not None:
        logger.info("Inferred category: {} based on id={}".format(category, id))
    return category
def make_canonical_identifier(self, id):
    """
    E.g. MGI:MGI:nnnn --> MGI:nnnn

    Normalizes any prefix found in PREFIX_NORMALIZATION_MAP; any other
    identifier (including None) is returned unchanged.
    """
    if id is not None:
        for (verbose_prefix, canonical_prefix) in PREFIX_NORMALIZATION_MAP.items():
            marker = verbose_prefix + ':'
            if id.startswith(marker):
                return id.replace(marker, canonical_prefix + ':')
    return id
def make_gostyle_identifier(self, id):
    """
    E.g. MGI:nnnn --> MGI:MGI:nnnn

    Inverse of make_canonical_identifier: restores the GO-style doubled
    prefix for entries in PREFIX_NORMALIZATION_MAP.
    """
    if id is not None:
        for (verbose_prefix, canonical_prefix) in PREFIX_NORMALIZATION_MAP.items():
            marker = canonical_prefix + ':'
            if id.startswith(marker):
                return id.replace(marker, verbose_prefix + ':')
    return id
def translate_objs(self, d, fname, default=None):
    """
    Translate a field whose value is expected to be a list.

    Returns ``default`` when the field is absent; otherwise wraps each
    value (scalar values are treated as one-element lists) as {'id': value}.
    """
    if fname not in d:
        # TODO: consider adding arg for failure on null
        return default
    raw = d[fname]
    values = raw if isinstance(raw, list) else [raw]
    # todo - labels
    return [{'id': idval} for idval in values]
def translate_obj(self,d,fname):
    """
    Translate a field value from a solr document.

    This includes special logic for when the field value
    denotes an object, here we nest it

    Returns a dict with 'id' plus, when available, 'iri', 'label' and
    'category'; returns None when the field is absent from the document.
    NOTE: may mutate d (removes the 'aspect' key once consumed).
    """
    if fname not in d:
        # TODO: consider adding arg for failure on null
        return None
    lf = M.label_field(fname)

    id = d[fname]
    id = self.make_canonical_identifier(id)
    #if id.startswith('MGI:MGI:'):
    #    id = id.replace('MGI:MGI:','MGI:')
    obj = {'id': id}

    if id:
        # AmiGO IDs expand with the default curie map; Monarch IDs use the
        # curie map served by the configured scigraph instance
        if self._use_amigo_schema(self.object_category):
            iri = expand_uri(id)
        else:
            iri = expand_uri(id, [get_curie_map('{}/cypher/curies'.format(self.config.scigraph_data.url))])
        obj['iri'] = iri

    if lf in d:
        obj['label'] = d[lf]

    cf = fname + "_category"
    if cf in d:
        obj['category'] = [d[cf]]

    # GO terms carry their ontology aspect (e.g. P/F/C) in 'aspect';
    # map it to a category and drop the key so it is consumed only once
    if 'aspect' in d and id.startswith('GO:'):
        obj['category'] = [ASPECT_MAP[d['aspect']]]
        del d['aspect']

    return obj
def map_doc(self, d, field_mapping, invert_subject_object=False):
    """
    Rewrite a solr document in place using a field-name mapping, optionally
    swapping subject/object-oriented fields. Returns the (mutated) document.
    """
    if field_mapping is not None:
        for canonical_name, schema_name in field_mapping.items():
            if schema_name is not None and canonical_name is not None and schema_name in d:
                # copy the schema-specific field onto its canonical name
                d[canonical_name] = d[schema_name]
    if invert_subject_object:
        for subj_field, obj_field in INVERT_FIELDS_MAP.items():
            flip(d, subj_field, obj_field)
    return d
def translate_doc(self, d, field_mapping=None, map_identifiers=None, **kwargs):
    """
    Translate a solr document (i.e. a single result row)

    Returns an association dict with nested subject/object/relation
    objects, evidence, provenance and (optionally) frequency/onset.
    NOTE: mutates d (field mapping, relation normalization).
    """
    if field_mapping is not None:
        self.map_doc(d, field_mapping)
    subject = self.translate_obj(d, M.SUBJECT)
    obj = self.translate_obj(d, M.OBJECT)

    # TODO: use a more robust method; we need equivalence as separate field in solr
    if map_identifiers is not None:
        if M.SUBJECT_CLOSURE in d:
            subject['id'] = self.map_id(subject, map_identifiers, d[M.SUBJECT_CLOSURE])
        else:
            logger.info("NO SUBJECT CLOSURE IN: "+str(d))

    if M.SUBJECT_TAXON in d:
        subject['taxon'] = self.translate_obj(d,M.SUBJECT_TAXON)
    if M.OBJECT_TAXON in d:
        obj['taxon'] = self.translate_obj(d, M.OBJECT_TAXON)

    qualifiers = []
    if M.RELATION in d and isinstance(d[M.RELATION],list):
        # GO overloads qualifiers and relation; split the list into the
        # actual relation (last non-'not' entry wins) and 'not' qualifiers
        relation = None
        for rel in d[M.RELATION]:
            if rel.lower() == 'not':
                qualifiers.append(rel)
            else:
                relation = rel
        if relation is not None:
            d[M.RELATION] = relation
        else:
            d[M.RELATION] = None

    # NOTE(review): qualifiers keep their original case, so an uppercase
    # 'NOT' qualifier will not set negated here — confirm intended
    negated = 'not' in qualifiers

    assoc = {'id':d.get(M.ID),
             'subject': subject,
             'object': obj,
             'negated': negated,
             'relation': self.translate_obj(d,M.RELATION),
             'publications': self.translate_objs(d, M.SOURCE, []),  # note 'source' is used in the golr schema
    }
    if self.invert_subject_object and assoc['relation'] is not None:
        assoc['relation']['inverse'] = True
    if len(qualifiers) > 0:
        assoc['qualifiers'] = qualifiers

    evidence_types = []
    if M.EVIDENCE in d:
        # evidence labels are stored as a JSON-encoded map in the doc;
        # AmiGO stores a single code, Monarch a list
        evidence_label_map = json.loads(d[M.EVIDENCE_CLOSURE_MAP])
        if self._use_amigo_schema(self.object_category):
            evidence_codes = [d[M.EVIDENCE]]
        else:
            evidence_codes = d[M.EVIDENCE]
        for evidence_code in evidence_codes:
            evidence_label = None
            if evidence_code in evidence_label_map:
                evidence_label = evidence_label_map[evidence_code]
            evidence_types.append({
                'id': evidence_code,
                'label': evidence_label
            })
    assoc['evidence_types'] = evidence_types

    if M.OBJECT_CLOSURE in d:
        assoc['object_closure'] = d.get(M.OBJECT_CLOSURE)
    if M.IS_DEFINED_BY in d:
        if isinstance(d[M.IS_DEFINED_BY],list):
            assoc['provided_by'] = d[M.IS_DEFINED_BY]
        else:
            # hack for GO Golr instance
            assoc['provided_by'] = [d[M.IS_DEFINED_BY]]

    # solr does not allow nested objects, so evidence graph is json-encoded
    if M.EVIDENCE_GRAPH in d:
        assoc[M.EVIDENCE_GRAPH] = json.loads(d[M.EVIDENCE_GRAPH])

    if M.FREQUENCY in d:
        assoc[M.FREQUENCY] = {
            'id': d[M.FREQUENCY]
        }
        if M.FREQUENCY_LABEL in d:
            assoc[M.FREQUENCY]['label'] = d[M.FREQUENCY_LABEL]
    if M.ONSET in d:
        assoc[M.ONSET] = {
            'id': d[M.ONSET]
        }
        if M.ONSET_LABEL in d:
            assoc[M.ONSET]['label'] = d[M.ONSET_LABEL]

    if M.ASSOCIATION_TYPE in d:
        assoc['type'] = d[M.ASSOCIATION_TYPE]

    if self._use_amigo_schema(self.object_category):
        # pass AmiGO-only fields straight through
        for f in M.AMIGO_SPECIFIC_FIELDS:
            if f in d:
                assoc[f] = d[f]
    return assoc
def translate_docs(self, ds, **kwargs):
    """
    Translate a set of solr results.

    Applies subject/object inversion in place first, then translates each
    document into an association dict.
    """
    for doc in ds:
        self.map_doc(doc, {}, self.invert_subject_object)
    return [self.translate_doc(doc, **kwargs) for doc in ds]
def translate_docs_compact(self, ds, field_mapping=None, slim=None, map_identifiers=None, invert_subject_object=False, **kwargs):
    """
    Translate golr association documents to a compact representation

    Groups documents by (subject, relation) into entries of the form
    {'subject', 'subject_label', 'relation', 'objects'}; negated ('not')
    associations are skipped and duplicate objects are removed.
    NOTE: mutates each document in ds via map_doc.
    """
    amap = {}
    logger.info("Translating docs to compact form. Slim={}".format(slim))
    for d in ds:
        self.map_doc(d, field_mapping, invert_subject_object=invert_subject_object)

        subject = d[M.SUBJECT]
        subject_label = d[M.SUBJECT_LABEL]

        # TODO: use a more robust method; we need equivalence as separate field in solr
        if map_identifiers is not None:
            if M.SUBJECT_CLOSURE in d:
                subject = self.map_id(subject, map_identifiers, d[M.SUBJECT_CLOSURE])
            else:
                logger.debug("NO SUBJECT CLOSURE IN: "+str(d))

        rel = d.get(M.RELATION)
        skip = False

        # TODO
        if rel == 'not' or rel == 'NOT':
            skip = True

        # this is a list in GO; a multi-valued relation is joined with ';'
        if isinstance(rel,list):
            if 'not' in rel or 'NOT' in rel:
                skip = True
            if len(rel) > 1:
                logger.warning(">1 relation: {}".format(rel))
            rel = ";".join(rel)

        if skip:
            logger.debug("Skipping: {}".format(d))
            continue

        subject = self.make_canonical_identifier(subject)
        #if subject.startswith('MGI:MGI:'):
        #    subject = subject.replace('MGI:MGI:','MGI:')

        k = (subject,rel)
        if k not in amap:
            amap[k] = {'subject':subject,
                       'subject_label':subject_label,
                       'relation':rel,
                       'objects': []}

        if slim is not None and len(slim)>0:
            # keep only closure terms that fall in the slim set
            mapped_objects = [x for x in d[M.OBJECT_CLOSURE] if x in slim]
            logger.debug("Mapped objects: {}".format(mapped_objects))
            amap[k]['objects'] += mapped_objects
        else:
            amap[k]['objects'].append(d[M.OBJECT])

    # dedupe objects per group (order is not preserved)
    for k in amap.keys():
        amap[k]['objects'] = list(set(amap[k]['objects']))

    return list(amap.values())
def map_id(self, id, prefix, closure_list):
    """
    Map identifiers based on an equivalence closure list.

    Returns the first closure member carrying the requested prefix,
    falling back to the input identifier when none matches.
    """
    # TODO: add option to fail if no mapping, or if >1 mapping
    wanted = prefix + ':'
    candidates = [eid for eid in closure_list if eid.startswith(wanted)]
    return candidates[0] if candidates else id
### This may quite possibly be a temporary code, but it looks a lot simpler than the above for more customizable Solr queries
import requests
from enum import Enum
## Should take those URLs from config.yaml
class ESOLR(Enum):
    """Well-known public Solr instances this module can query directly."""
    GOLR = "http://golr-aux.geneontology.io/solr/"
    MOLR = "https://solr.monarchinitiative.org/solr/search"
class ESOLRDoc(Enum):
    """Values of the golr 'document_category' field used to filter queries."""
    ONTOLOGY = "ontology_class"
    ANNOTATION = "annotation"
    BIOENTITY = "bioentity"
## Named to mirror run_sparql_on, which follows the same enum-based convention
def run_solr_on(solrInstance, category, id, fields):
    """
    Return the result of a solr query on the given solrInstance (Enum ESOLR),
    for a certain document_category (ESOLRDoc) and id.

    Returns the first matching document as a dict (raises IndexError when
    nothing matches).
    """
    query = '{}select?q=*:*&fq=document_category:"{}"&fq=id:"{}"&fl={}&wt=json&indent=on'.format(
        solrInstance.value, category.value, id, fields)
    response = requests.get(query)
    return response.json()['response']['docs'][0]
def run_solr_text_on(solrInstance, category, q, qf, fields, optionals):
    """
    Return the result of a solr text query on the given solrInstance
    (Enum ESOLR), for a certain document_category (ESOLRDoc).

    Parameters
    ----------
    q : str
        free-text query string
    qf : str
        solr query-field spec
    fields : str
        comma-separated field list for fl
    optionals : str or None
        extra raw query-string fragment appended verbatim; None is treated
        as the empty string

    Returns the list of matching documents.
    """
    # idiom fix: compare to None with "is", not "=="
    if optionals is None:
        optionals = ""
    query = solrInstance.value + "select?q=" + q + "&qf=" + qf + "&fq=document_category:\"" + category.value + "\"&fl=" + fields + "&wt=json&indent=on" + optionals
    response = requests.get(query)
    return response.json()['response']['docs']
### These utility functions belong in a shared utils.py module, if one exists
## Utility function to merge two fields of a json dict
def merge(json, firstField, secondField):
    """
    merge two fields of a json into an array of { firstField : secondField }

    Pairs elements positionally; iteration is driven by the length of
    the first field's list.
    """
    return [{json[firstField][idx]: json[secondField][idx]}
            for idx in range(len(json[firstField]))]
## Utility function to filter out two fields of a json and give it each a new label
def mergeWithLabels(json, firstField, firstFieldLabel, secondField, secondFieldLabel):
    """
    merge two fields of a json into an array of
    { firstFieldLabel : firstField[i], secondFieldLabel : secondField[i] }

    Pairs elements positionally; iteration is driven by the length of
    the first field's list.
    """
    return [{firstFieldLabel: json[firstField][idx],
             secondFieldLabel: json[secondField][idx]}
            for idx in range(len(json[firstField]))]
## Utility function to replace in a specific <field> an <old> string by a <new> string
def replace(json, field, old, new):
    """
    Replace, in the given <field> of every entry, occurrences of the <old>
    substring with <new>. Entries whose field value is falsy (e.g. empty
    string) are left untouched. Mutates and returns the input list.
    """
    for entry in json:
        if entry[field]:
            entry[field] = entry[field].replace(old, new)
    return json