# Source code for the ES query-building helpers (page title truncated in extraction).

"""Helpers for building ES queries."""

import re
from typing import Any, Optional, Tuple, Union, List
from string import punctuation

from elasticsearch_dsl import Search, Q, SF

from search.domain import Query
from .exceptions import QueryError

# We'll compile this ahead of time, since it gets called quite a lot.
# Matches a double-quoted span with no embedded double quote. The whole span
# is a capturing group, so ``re.split`` on this pattern keeps the quoted
# chunks (quotes included) in its result.
STRING_LITERAL = re.compile(r"([\"][^\"]*[\"])")
"""Pattern for string literals (quoted) in search queries."""

# Matches a ``$$...$$`` span or a ``$...$`` span whose interior is non-empty
# and contains no ``$``.
TEXISM = re.compile(r'(([\$]{2}[^\$]+[\$]{2})|([\$]{1}[^\$]+[\$]{1}))')

# TODO: make this configurable.
MAX_RESULTS = 10_000
"""This is the maximum result offset for pagination."""

# Tokens that ``escape`` prefixes with a backslash.
# NOTE(review): ``escape`` tests membership one character at a time, so the
# two-character entries '&&' and '||' can never match there -- confirm whether
# that is intended.
SPECIAL_CHARACTERS = ['+', '=', '&&', '||', '>', '<', '!', '(', ')', '{',
                      '}', '[', ']', '^', '~', ':', '\\', '/', '-']
# Sort parameters applied by ``sort`` when the query specifies no order.
DEFAULT_SORT = ['-announced_date_first', '_doc']

# Two digits followed by a month number (01-12), delimited on each side by
# whitespace or the start/end of the string. Captures two groups: the
# two-digit year and the two-digit month.
DATE_PARTIAL = r"(?:^|[\s])(\d{2})((?:0[1-9]{1})|(?:1[0-2]{1}))(?:$|[\s])"
"""Used to match parts of paper IDs that encode the announcement date."""

# The number part of the old arXiv identifier looks like YYMMNNN.
#
# The old arXiv identifier scheme was used between 1991-07 and 2007-03.

def wildcard_escape(querystring: str) -> Tuple[str, bool]:
    """
    Detect wildcard characters, and escape any that occur within a literal.

    Parameters
    ----------
    querystring : str

    Returns
    -------
    str
        Query string with wildcard characters enclosed in literals escaped.
    bool
        If a non-literal wildcard character is present, returns True.

    Raises
    ------
    QueryError
        If ``querystring`` starts with ``?`` or ``*``.

    """
    # This should get caught by the controller (form validation), but just
    # in case we should check for it here.
    if querystring.startswith(('?', '*')):
        raise QueryError('Query cannot start with a wildcard')

    # Escape wildcard characters within string literals.
    # re.sub() can't handle the complexity, sadly... Splitting on the
    # capturing STRING_LITERAL pattern keeps the quoted chunks in ``parts``,
    # so they can be recognised by their leading quote.
    parts = re.split(STRING_LITERAL, querystring)
    parts = [
        part.replace('*', r'\*').replace('?', r'\?')
        if part.startswith(('"', "'")) else part
        for part in parts
    ]
    querystring = "".join(parts)

    # Only unescaped wildcard characters should remain: look for a ``*`` or
    # ``?`` that is not immediately preceded by a backslash.
    wildcard = re.search(r'(?<!\\)([\*\?])', querystring) is not None
    return querystring, wildcard
def has_wildcard(term: str) -> bool:
    """
    Report whether ``term`` contains a usable wildcard.

    Returns True when ``term`` contains ``*`` or ``?`` and does not begin
    with either of them; False otherwise.
    """
    # A leading wildcard disqualifies the term outright.
    if term.startswith(('*', '?')):
        return False
    return '*' in term or '?' in term
def is_literal_query(term: str) -> bool:
    """
    Report whether ``term`` should be handled as a literal query.

    A single double-quote character anywhere in ``term`` is enough; the
    quotes are not required to be balanced or to enclose the whole term.
    """
    return term.find('"') != -1
def is_tex_query(term: str) -> bool:
    """
    Report whether ``term`` is intended as a TeX query.

    The TEXISM pattern is matched at the start of ``term`` only, so a TeX
    span that appears later in the string does not count.
    """
    leading_tex = TEXISM.match(term)
    return leading_tex is not None
def is_old_papernum(term: str) -> bool:
    """
    Report whether ``term`` is an old-style arXiv ID number.

    True only when the whole of ``term`` matches ``OLD_ID_NUMBER``.
    """
    whole_match = re.fullmatch(OLD_ID_NUMBER, term)
    return whole_match is not None
def strip_tex(term: str) -> str:
    """
    Return ``term`` with its TeX spans deleted.

    Every substring matched by TEXISM is removed, then leading and trailing
    whitespace is stripped from what is left.
    """
    without_tex = TEXISM.sub('', term)
    return without_tex.strip()
def Q_(qtype: str, field: str, value: str, operator: str = 'or') -> Q:
    """
    Build a :class:`.Q` for ``field``, giving wildcards priority.

    ``value`` is first passed through :func:`wildcard_escape`. If it holds an
    unescaped wildcard, a ``wildcard`` query on the lower-cased value is
    returned and ``qtype`` is ignored. Otherwise a ``qtype`` query is built;
    ``operator`` is forwarded only when ``qtype`` does not contain ``match``.
    """
    escaped, is_wildcard = wildcard_escape(value)
    if is_wildcard:
        return Q('wildcard', **{field: {'value': escaped.lower()}})

    clause = {field: escaped}
    if 'match' in qtype:
        return Q(qtype, **clause)
    return Q(qtype, operator=operator, **clause)
def escape(term: str, quotes: bool = False) -> str:
    """
    Escape special characters.

    Parameters
    ----------
    term : str
        Text to escape.
    quotes : bool
        If True, double-quote characters are escaped as well.

    Returns
    -------
    str
        ``term`` with a backslash inserted before every character that is in
        ``SPECIAL_CHARACTERS`` (and before ``"`` when ``quotes`` is set).

    """
    # NOTE(review): membership is tested per character, so multi-character
    # entries of SPECIAL_CHARACTERS cannot match here -- confirm intent.
    return "".join(
        f"\\{char}"
        if (char in SPECIAL_CHARACTERS or (quotes and char == '"'))
        else char
        for char in term
    )
def strip_punctuation(s: str) -> str:
    """
    Return ``s`` with every punctuation character deleted.

    "Punctuation" means the characters in :data:`string.punctuation`; all
    other characters are kept in their original order.
    """
    # A translation table with only a deletion set drops those characters.
    deletion_table = str.maketrans('', '', punctuation)
    return s.translate(deletion_table)
def remove_single_characters(term: str) -> str:
    """
    Drop whitespace-separated tokens that are a single character or less.

    A token's length is measured after its punctuation is removed, but
    surviving tokens are returned unmodified, joined by single spaces.
    """
    def _has_substance(token: str) -> bool:
        # More than one character must remain once punctuation is gone.
        return len(strip_punctuation(token)) > 1

    return ' '.join(filter(_has_substance, term.split()))
def sort(query: Query, search: Search) -> Search:
    """
    Apply sorting to a :class:`.Search`.

    Parameters
    ----------
    query : :class:`.Query`
        Its ``order`` attribute selects the sort field; a leading ``-`` on
        that value is carried over to the secondary sort key.
    search : :class:`.Search`

    Returns
    -------
    :class:`.Search`
        The result of calling ``search.sort`` with ``DEFAULT_SORT`` when
        ``query.order`` is empty, otherwise with ``query.order`` followed by
        ``paper_id_v`` in the same direction.

    """
    order = query.order
    if not order:
        return search.sort(*DEFAULT_SORT)

    # Secondary key follows the direction of the requested order.
    direction = '-' if order.startswith('-') else ''
    return search.sort(order, f'{direction}paper_id_v')
def parse_date(term: str) -> Tuple[str, str]:
    """
    Attempt to find date-related information in the query.

    Parameters
    ----------
    term : str
        Search term.

    Returns
    -------
    tuple
        First element is the corresponding date-related fragment
        (``YYYY-MM`` or ``YYYY``), second element is the remainder of
        `term` (without the date).

    Raises
    ------
    ValueError
        Raised if no date-related information is found in `term`.

    """
    # Tried in order; each candidate must be delimited by whitespace or by
    # the start/end of the string.
    patterns = (
        r'(?:^|[\s]+)([0-9]{4}-[0-9]{2})(?:$|[\s]+)',  # Year and month.
        r'(?:^|[\s]+)([0-9]{4})(?:$|[\s]+)',           # Looks like a year.
    )
    for pattern in patterns:
        match = re.search(pattern, term)
        if match:
            # Cut out the date together with its surrounding whitespace and
            # put a single space back so the neighbouring words stay apart.
            remainder = term[:match.start()] + " " + term[match.end():]
            return match.group(1), remainder.strip()
    raise ValueError('No date info detected')
def parse_date_partial(term: str) -> Optional[str]:
    """
    Convert a 4-digit ID date partial into a full year-month value.

    This can be used to search for papers by announcement date.

    Parameters
    ----------
    term : str
        Search term.

    Returns
    -------
    str
        Date in `yyyy-MM` format, if found; otherwise ``None``.

    """
    match = re.search(DATE_PARTIAL, term)
    if match is None:
        return None

    year, month = match.groups()
    # Two-digit years from 91 upward are read as 19xx, the rest as 20xx.
    # This should be fine until 2091.
    century = 19 if int(year) >= 91 else 20
    return f"{century}{year}-{month}"  # year_month format in ES.