# Source code for search.services.index.results

"""
Functions for processing search results (after execution).

The primary public function in this module is :func:`.to_documentset`.
"""

import re
from datetime import datetime
from math import floor
from typing import Any, Dict, Union

from elasticsearch_dsl.response import Response, Hit
from elasticsearch_dsl.utils import AttrList, AttrDict
from search.domain import Document, Query, DocumentSet, Classification, Person
from arxiv.base import logging

from .util import MAX_RESULTS, TEXISM
from .highlighting import add_highlighting, preview

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Turn off propagation for this logger. NOTE(review): assuming
# ``arxiv.base.logging`` hands back a standard ``logging.Logger``, this keeps
# records from also being passed to handlers of ancestor loggers -- confirm.
logger.propagate = False


def _to_author(author_data: dict) -> Person:
    """
    Build a :class:`.Person` from raw author data, keeping only safe fields.

    The ``email`` entry is always discarded, ``name`` is stored under
    ``full_name``, and any key that is not one of ``Person.fields()`` is
    dropped, so that e-mail and other extraneous data cannot escape.
    """
    person_kwargs = {}
    for raw_key, raw_value in author_data.items():
        # E-mail addresses are never passed on, whatever Person declares.
        if raw_key == 'email':
            continue
        field_name = 'full_name' if raw_key == 'name' else raw_key
        if field_name in Person.fields():
            person_kwargs[field_name] = raw_value
    return Person(**person_kwargs)   # type: ignore


def _coerce_field(key: str, value: Any) -> Any:
    """Convert one raw field value into the form passed to ``Document``."""
    # We want to prevent ES-specific data types from escaping the module
    # API.
    if isinstance(value, AttrList):
        value = value._l_
    elif isinstance(value, AttrDict):
        value = value.to_dict()

    if key == 'primary_classification':
        return Classification(**value)  # type: ignore
    if key == 'secondary_classification':
        return [Classification(**v) for v in value]  # type: ignore
    if key in ('authors', 'owners'):
        return [_to_author(au) for au in value]
    if key == 'submitter':
        return _to_author(value)
    if key == 'announced_date_first' and value and isinstance(value, str):
        return datetime.strptime(value, '%Y-%m').date()
    if key in ('submitted_date', 'submitted_date_first',
               'submitted_date_latest'):
        try:
            return datetime.strptime(value, '%Y-%m-%dT%H:%M:%S%z')
        except (ValueError, TypeError):
            # Best effort: keep the unparsed value rather than failing the
            # whole document.
            logger.warning('Could not parse %s: %s as datetime', key, value)
            return value
    if key in ('acm_class', 'msc_class') and value:
        return '; '.join(value)
    return value


def to_document(raw: Union[Hit, dict], highlight: bool = True) -> Document:
    """
    Transform an ES search result back into a :class:`.Document`.

    Parameters
    ----------
    raw : :class:`.Hit` or dict
        A single search hit, or a plain dict with the same fields. Only keys
        listed in ``Document.fields()`` are read. Any other type of ``raw``
        contributes no fields.
    highlight : bool
        If ``True`` (default), an abstract preview is generated when the
        abstract is a string, and highlighting is added for ES results.

    Returns
    -------
    :class:`.Document`

    """
    result: Dict[str, Any] = {}
    result['match'] = {}  # Hit on field, but no highlighting.
    result['truncated'] = {}    # Preview is truncated.

    # isinstance (rather than an exact type comparison) so that subclasses of
    # Hit or dict are not silently turned into an empty document.
    is_hit = isinstance(raw, Hit)
    is_dict = isinstance(raw, dict)
    if is_hit or is_dict:
        for key in Document.fields():
            if is_hit:
                if not hasattr(raw, key):
                    continue
                value = getattr(raw, key)
            else:
                if key not in raw:
                    continue
                value = raw[key]
            result[key] = _coerce_field(key, value)

    # The relevance score lives on the hit's ``meta``. Iterating a Response
    # yields Hit objects, so checking only for Response never copied it.
    # A missing or null score is skipped so Document keeps its own default.
    if isinstance(raw, (Response, Hit)):
        score = getattr(getattr(raw, 'meta', None), 'score', None)
        if score is not None:
            result['score'] = score

    if highlight and isinstance(result.get('abstract'), str):
        previews = result.setdefault('preview', {})
        previews['abstract'] = preview(result['abstract'])
        # A trailing ellipsis marks a preview that was cut short.
        if previews['abstract'].endswith('…'):
            result['truncated']['abstract'] = True

    if highlight and isinstance(raw, (Response, Hit)):
        result['highlight'] = {}
        logger.debug('%s: add highlighting to result',
                     raw.paper_id)  # type: ignore
        result = add_highlighting(result, raw)

    # See https://github.com/python/mypy/issues/3937
    return Document(**result)   # type: ignore
def to_documentset(query: Query, response: Response, highlight: bool = True) \
        -> DocumentSet:
    """
    Transform a response from ES to a :class:`.DocumentSet`.

    Parameters
    ----------
    query : :class:`.Query`
        The original search query.
    response : :class:`.Response`
        The response from Elasticsearch.
    highlight : bool
        Passed through to :func:`.to_document` for every result (default
        ``True``).

    Returns
    -------
    :class:`.DocumentSet`
        The set of :class:`.Document`s responding to the query on the current
        page, along with pagination metadata.

    """
    total = response['hits']['total']
    max_pages = int(MAX_RESULTS/query.size)
    # Ceiling division: a partially filled last page still counts as a page,
    # while an exact multiple of the page size must not gain an extra one
    # (e.g. 50 results at 25 per page is 2 pages, not 3).
    total_pages = (total + query.size - 1) // query.size
    logger.debug('got %i results', total)
    # See https://github.com/python/mypy/issues/3937
    return DocumentSet(**{  # type: ignore
        'metadata': {
            'start': query.page_start,
            'end': min(query.page_start + query.size, total),
            'total': total,
            'current_page': query.page,
            'total_pages': total_pages,
            'size': query.size,
            'max_pages': max_pages
        },
        'results': [to_document(raw, highlight=highlight)
                    for raw in response]
    })