"""Helpers for building ES queries."""
import re
from typing import Any, Optional, Tuple, Union, List
from string import punctuation
from elasticsearch_dsl import Search, Q, SF
from search.domain import Query
from .exceptions import QueryError
# We'll compile this ahead of time, since it gets called quite a lot.
STRING_LITERAL = re.compile(r"([\"][^\"]*[\"])")
"""Pattern for string literals (quoted) in search queries."""
TEXISM = re.compile(r'(([\$]{2}[^\$]+[\$]{2})|([\$]{1}[^\$]+[\$]{1}))')
# TODO: make this configurable.
MAX_RESULTS = 10_000
"""This is the maximum result offset for pagination."""
SPECIAL_CHARACTERS = ['+', '=', '&&', '||', '>', '<', '!', '(', ')', '{',
'}', '[', ']', '^', '~', ':', '\\', '/', '-']
DEFAULT_SORT = ['-announced_date_first', '_doc']
DATE_PARTIAL = r"(?:^|[\s])(\d{2})((?:0[1-9]{1})|(?:1[0-2]{1}))(?:$|[\s])"
"""Used to match parts of paper IDs that encode the announcement date."""
OLD_ID_NUMBER = \
r'(910[7-9]|911[0-2]|9[2-9](0[1-9]|1[0-2])|0[0-6](0[1-9]|1[0-2])|070[1-3])'\
r'(00[1-9]|0[1-9][0-9]|[1-9][0-9][0-9])'
"""
The number part of the old arXiv identifier looks like YYMMNNN.
The old arXiv identifier scheme was used between 1991-07 and 2007-03
(inclusive).
"""
[docs]def wildcard_escape(querystring: str) -> Tuple[str, bool]:
"""
Detect wildcard characters, and escape any that occur within a literal.
Parameters
----------
querystring : str
Returns
-------
str
Query string with wildcard characters enclosed in literals escaped.
bool
If a non-literal wildcard character is present, returns True.
"""
# This should get caught by the controller (form validation), but just
# in case we should check for it here.
if querystring.startswith('?') or querystring.startswith('*'):
raise QueryError('Query cannot start with a wildcard')
# Escape wildcard characters within string literals.
# re.sub() can't handle the complexity, sadly...
parts = re.split(STRING_LITERAL, querystring)
parts = [part.replace('*', r'\*').replace('?', r'\?')
if part.startswith('"') or part.startswith("'") else part
for part in parts]
querystring = "".join(parts)
# Only unescaped wildcard characters should remain.
wildcard = re.search(r'(?<!\\)([\*\?])', querystring) is not None
return querystring, wildcard
[docs]def has_wildcard(term: str) -> bool:
"""Determine whether or not ``term`` contains a wildcard."""
return (('*' in term or '?' in term) and not
(term.startswith('*') or term.startswith('?')))
[docs]def is_literal_query(term: str) -> bool:
"""Determine whether the term is intended to be treated as a literal."""
# return re.match('"[^"]+"', term) is not None
return '"' in term
[docs]def is_tex_query(term: str) -> bool:
"""Determine whether the term is intended as a TeX query."""
return re.match(TEXISM, term) is not None
[docs]def is_old_papernum(term: str) -> bool:
"""Check whether term matches 7-digit pattern for old arXiv ID numbers."""
return re.fullmatch(OLD_ID_NUMBER, term) is not None
[docs]def strip_tex(term: str) -> str:
"""Remove TeX-isms from a term."""
return re.sub(TEXISM, '', term).strip()
[docs]def Q_(qtype: str, field: str, value: str, operator: str = 'or') -> Q:
"""Construct a :class:`.Q`, but handle wildcards first."""
value, wildcard = wildcard_escape(value)
if wildcard:
return Q('wildcard', **{field: {'value': value.lower()}})
if 'match' in qtype:
return Q(qtype, **{field: value})
return Q(qtype, **{field: value}, operator=operator)
[docs]def escape(term: str, quotes: bool = False) -> str:
"""Escape special characters."""
escaped = []
for i, char in enumerate(term):
if char in SPECIAL_CHARACTERS or quotes and char == '"':
escaped.append("\\")
escaped.append(char)
return "".join(escaped)
[docs]def strip_punctuation(s: str) -> str:
"""Remove all punctuation characters from a string."""
return ''.join([c for c in s if c not in punctuation])
[docs]def remove_single_characters(term: str) -> str:
"""Remove any single characters in the search string."""
return ' '.join([part for part in term.split()
if len(strip_punctuation(part)) > 1])
[docs]def sort(query: Query, search: Search) -> Search:
"""Apply sorting to a :class:`.Search`."""
if not query.order:
sort_params = DEFAULT_SORT
else:
direction = '-' if query.order.startswith('-') else ''
sort_params = [query.order, f'{direction}paper_id_v']
if sort_params is not None:
search = search.sort(*sort_params)
return search
[docs]def parse_date(term: str) -> Tuple[str, str]:
"""
Attempt to find date-related information in the query.
Parameters
----------
term : str
Search term.
Returns
-------
tuple
First element is the responding date-related fragment, second element
is the remainder of `term` (without the date).
Raises
------
ValueError
Raised if no date-related information is found in `term`.
"""
match = re.search(r'(?:^|[\s]+)([0-9]{4}-[0-9]{2})(?:$|[\s]+)', term)
if match:
remainder = term[:match.start()] + " " + term[match.end():]
return match.group(1), remainder.strip()
match = re.search(r'(?:^|[\s]+)([0-9]{4})(?:$|[\s]+)', term)
if match: # Looks like a year:
remainder = term[:match.start()] + " " + term[match.end():]
return match.group(1), remainder.strip()
raise ValueError('No date info detected')
[docs]def parse_date_partial(term: str) -> Optional[str]:
"""
Convert a 4-digit ID date partial into a full year-month value.
This can be used to search for papers by announcement date.
Parameters
----------
term : str
Search term.
Returns
-------
str
Date in `yyyy-MM` format, if found.
"""
match = re.search(DATE_PARTIAL, term)
if match:
year, month = match.groups()
# This should be fine until 2091.
century = 19 if int(year) >= 91 else 20
date_partial = f"{century}{year}-{month}" # year_month format in ES.
return date_partial
return None