Source code for search.services.index.util

"""Helpers for building ES queries."""

import re
from typing import Any, Optional, Tuple, Union, List
from string import punctuation

from elasticsearch_dsl import Search, Q, SF

from search.domain import Query
from .exceptions import QueryError


# We'll compile this ahead of time, since it gets called quite a lot.
# The pattern matches a double-quoted span, quotes included. The whole match
# is a capture group, so splitting on this pattern keeps the literals in the
# resulting list.
STRING_LITERAL = re.compile(r"([\"][^\"]*[\"])")
"""Pattern for string literals (quoted) in search queries."""

# Matches a span delimited by ``$$...$$`` or ``$...$`` that contains no ``$``
# in between.
TEXISM = re.compile(r'(([\$]{2}[^\$]+[\$]{2})|([\$]{1}[^\$]+[\$]{1}))')

# TODO: make this configurable.
MAX_RESULTS = 10_000
"""This is the maximum result offset for pagination."""

# Tokens consulted by ``escape()`` when deciding what to backslash-escape.
SPECIAL_CHARACTERS = ['+', '=', '&&', '||', '>', '<', '!', '(', ')', '{',
                      '}', '[', ']', '^', '~', ':', '\\', '/', '-']
# Sort parameters that ``sort()`` applies when the query specifies no order.
DEFAULT_SORT = ['-announced_date_first', '_doc']

# Two capture groups: a two-digit year, then a month in the range 01-12. The
# four digits must be delimited by whitespace or the start/end of the string.
DATE_PARTIAL = r"(?:^|[\s])(\d{2})((?:0[1-9]{1})|(?:1[0-2]{1}))(?:$|[\s])"
"""Used to match parts of paper IDs that encode the announcement date."""

# First group: the YYMM prefix, limited to 9107 through 0703.
# Second group: the three-digit sequence number, 001 through 999.
OLD_ID_NUMBER = \
   r'(910[7-9]|911[0-2]|9[2-9](0[1-9]|1[0-2])|0[0-6](0[1-9]|1[0-2])|070[1-3])'\
   r'(00[1-9]|0[1-9][0-9]|[1-9][0-9][0-9])'
"""
The number part of the old arXiv identifier looks like YYMMNNN.

The old arXiv identifier scheme was used between 1991-07 and 2007-03
(inclusive).
"""


def wildcard_escape(querystring: str) -> Tuple[str, bool]:
    """
    Escape wildcards found in quoted chunks and report any that remain.

    The query string is split on ``STRING_LITERAL``; every resulting chunk
    that begins with a quote character has its ``*`` and ``?`` characters
    backslash-escaped. All other chunks are left untouched.

    Parameters
    ----------
    querystring : str
        Raw query string.

    Returns
    -------
    str
        The reassembled query string with wildcards in quoted chunks
        escaped.
    bool
        ``True`` if at least one unescaped ``*`` or ``?`` is still present.

    Raises
    ------
    QueryError
        If the query string begins with ``*`` or ``?``.

    """
    # Form validation in the controller is expected to reject this already;
    # the check is repeated here as a safety net.
    if querystring.startswith(('?', '*')):
        raise QueryError('Query cannot start with a wildcard')

    # Splitting on the (capturing) literal pattern keeps the literals in the
    # list, interleaved with the text between them.
    # NOTE(review): chunks starting with a single quote are escaped as well,
    # although STRING_LITERAL only recognises double quotes -- confirm that
    # this is intended.
    pieces = []
    for chunk in STRING_LITERAL.split(querystring):
        if chunk.startswith(('"', "'")):
            chunk = chunk.replace('*', r'\*').replace('?', r'\?')
        pieces.append(chunk)
    rebuilt = "".join(pieces)

    # Anything not preceded by a backslash at this point is a live wildcard.
    unescaped = re.search(r'(?<!\\)([\*\?])', rebuilt)
    return rebuilt, unescaped is not None
def has_wildcard(term: str) -> bool:
    """
    Report whether ``term`` contains a wildcard that is not in first position.

    A term that starts with ``*`` or ``?`` is never considered to have a
    wildcard, regardless of what follows.
    """
    if term.startswith(('*', '?')):
        return False
    return any(symbol in term for symbol in ('*', '?'))
def is_literal_query(term: str) -> bool:
    """
    Decide whether ``term`` should be handled as a literal.

    Any term containing at least one double-quote character qualifies; the
    quotes do not have to be balanced or enclose the whole term.
    """
    return term.find('"') != -1
def is_tex_query(term: str) -> bool:
    """
    Decide whether ``term`` is intended as a TeX query.

    The check is anchored at the start of ``term``: it succeeds only when the
    term begins with a fragment matched by ``TEXISM``.
    """
    leading_tex = TEXISM.match(term)
    return leading_tex is not None
def is_old_papernum(term: str) -> bool:
    """
    Check whether ``term`` is a 7-digit old-style arXiv ID number.

    The whole of ``term`` must match ``OLD_ID_NUMBER``; a match on only part
    of the string is not enough.
    """
    whole_match = re.compile(OLD_ID_NUMBER).fullmatch(term)
    return whole_match is not None
def strip_tex(term: str) -> str:
    """
    Delete every ``TEXISM`` match from ``term``.

    Leading and trailing whitespace is stripped from the result.
    """
    without_tex = TEXISM.sub('', term)
    return without_tex.strip()
def Q_(qtype: str, field: str, value: str, operator: str = 'or') -> Q:
    """
    Build a :class:`.Q` for ``field``, giving wildcards priority.

    Parameters
    ----------
    qtype : str
        Query type to construct when ``value`` holds no live wildcard.
    field : str
        Name of the field to query.
    value : str
        Query value; passed through :func:`wildcard_escape` first.
    operator : str
        Forwarded to the query only when ``qtype`` does not contain
        ``'match'`` and no wildcard is involved.

    Returns
    -------
    :class:`.Q`
        A ``wildcard`` query on the lower-cased value if an unescaped
        wildcard is present; otherwise a query of type ``qtype``.

    """
    escaped, is_wildcard = wildcard_escape(value)
    if is_wildcard:
        return Q('wildcard', **{field: {'value': escaped.lower()}})

    field_clause = {field: escaped}
    if 'match' not in qtype:
        return Q(qtype, **field_clause, operator=operator)
    # Query types containing 'match' are built without the operator.
    return Q(qtype, **field_clause)
def escape(term: str, quotes: bool = False) -> str:
    """
    Backslash-escape the special tokens that occur in ``term``.

    Every entry of ``SPECIAL_CHARACTERS`` is honoured, including the
    multi-character operators (``&&`` and ``||``). A character-by-character
    membership test can never match those, so the term is scanned token-wise
    instead, trying longer tokens first. Each character of a matched token
    receives its own backslash, so that no unescaped operator can be formed
    together with the characters that follow it.

    Parameters
    ----------
    term : str
        Text to escape.
    quotes : bool
        If ``True``, double-quote characters are escaped as well.

    Returns
    -------
    str
        ``term`` with a backslash inserted before every character that
        belongs to a special token (and before ``"`` when ``quotes`` is set).

    """
    # Longest tokens first so '&&' is recognised as a unit; empty entries are
    # dropped because they would match everywhere without advancing.
    tokens = sorted((token for token in SPECIAL_CHARACTERS if token),
                    key=len, reverse=True)
    escaped = []
    position = 0
    while position < len(term):
        char = term[position]
        if quotes and char == '"':
            escaped.append('\\' + char)
            position += 1
            continue
        for token in tokens:
            if term.startswith(token, position):
                escaped.extend('\\' + part for part in token)
                position += len(token)
                break
        else:
            # Not the start of any special token: copy through unchanged.
            escaped.append(char)
            position += 1
    return "".join(escaped)
def strip_punctuation(s: str) -> str:
    """
    Return ``s`` with every character found in ``string.punctuation`` removed.

    Characters outside that set, including whitespace and non-ASCII symbols,
    are kept as they are.
    """
    drop_punctuation = str.maketrans('', '', punctuation)
    return s.translate(drop_punctuation)
def remove_single_characters(term: str) -> str:
    """
    Drop whitespace-separated parts that are at most one character long.

    The length of each part is measured after its punctuation has been
    removed, so parts made up only of punctuation are dropped too. The
    surviving parts are returned unmodified, joined by single spaces.
    """
    kept = []
    for word in term.split():
        if len(strip_punctuation(word)) > 1:
            kept.append(word)
    return ' '.join(kept)
def sort(query: Query, search: Search) -> Search:
    """
    Apply sorting to a :class:`.Search`.

    When the query carries an order, results are sorted by it and then by
    ``paper_id_v`` in the same direction (a leading ``-`` on the order is
    mirrored onto the secondary key). Otherwise ``DEFAULT_SORT`` is used.

    Parameters
    ----------
    query : :class:`.Query`
        Its ``order`` attribute selects the sort.
    search : :class:`.Search`
        Search to which the sort is applied.

    Returns
    -------
    :class:`.Search`
        The result of ``search.sort(...)``, or ``search`` itself if there
        are no sort parameters.

    """
    order = query.order
    if order:
        prefix = '-' if order.startswith('-') else ''
        sort_params = [order, f'{prefix}paper_id_v']
    else:
        sort_params = DEFAULT_SORT

    if sort_params is None:
        return search
    return search.sort(*sort_params)
def parse_date(term: str) -> Tuple[str, str]:
    """
    Extract the first date-like fragment from a search term.

    A ``NNNN-NN`` (year-month) fragment is looked for first, then a bare
    four-digit year. The fragment must be delimited by whitespace or by the
    start/end of the string.

    Parameters
    ----------
    term : str
        Search term.

    Returns
    -------
    tuple
        The date fragment, followed by what is left of ``term`` once the
        fragment has been cut out (stripped of surrounding whitespace).

    Raises
    ------
    ValueError
        Raised if no date-related information is found in `term`.

    """
    # Ordered from most to least specific: year-month, then year only.
    patterns = (
        r'(?:^|[\s]+)([0-9]{4}-[0-9]{2})(?:$|[\s]+)',
        r'(?:^|[\s]+)([0-9]{4})(?:$|[\s]+)',
    )
    for pattern in patterns:
        found = re.search(pattern, term)
        if found is None:
            continue
        # The match swallows the surrounding whitespace, so a single space is
        # put back between the two halves before trimming.
        before, after = term[:found.start()], term[found.end():]
        return found.group(1), f"{before} {after}".strip()
    raise ValueError('No date info detected')
def parse_date_partial(term: str) -> Optional[str]:
    """
    Expand a 4-digit ID date partial (``YYMM``) into a year-month value.

    This can be used to search for papers by announcement date.

    Parameters
    ----------
    term : str
        Search term.

    Returns
    -------
    str or None
        Date in `yyyy-MM` format for the first ``DATE_PARTIAL`` match in
        ``term``; ``None`` if there is no match.

    """
    found = re.search(DATE_PARTIAL, term)
    if found is None:
        return None

    year, month = found.groups()
    # Two-digit years from 91 upwards belong to the 1900s, the rest to the
    # 2000s; this stays unambiguous until 2091.
    century = 20 if int(year) < 91 else 19
    # year_month format in ES.
    return f"{century}{year}-{month}"