Source code for search.controllers.simple

"""
Handle requests to support the simple search feature.

The primary entrypoint to this module is :func:`.search`, which handles
GET requests to the base search endpoint. It uses :class:`.SimpleSearchForm`
to generate form HTML, validate request parameters, and produce informative
error messages for the user.
"""

from typing import Tuple, Dict, Any, Optional, List

from werkzeug.exceptions import InternalServerError, NotFound, BadRequest
from werkzeug import MultiDict, ImmutableMultiDict
from flask import url_for

from arxiv import status, identifier, taxonomy

from arxiv.base import logging
from search.services import index, fulltext, metadata
from search.domain import Query, SimpleQuery, asdict, Classification, \
    ClassificationList
from search.controllers.util import paginate, catch_underscore_syntax

from .forms import SimpleSearchForm
# from search.routes.ui import external_url_builder

logger = logging.getLogger(__name__)

Response = Tuple[Dict[str, Any], int, Dict[str, Any]]


[docs]def search(request_params: MultiDict,
           archives: Optional[List[str]] = None) -> Response:
    """
    Perform a simple search.

    This supports requests from both the form-based view (provided here) AND
    from the mini search widget displayed on all arXiv.org pages.

    At a minimum, expects the parameter ``value`` in the GET request. This may
    be a match value for a search query, or an arXiv ID.

    Parameters
    ----------
    request_params : :class:`.MultiDict`
    archives : list
        A list of archives within which the search should be performed.

    Returns
    -------
    dict
        Search result response data.
    int
        HTTP status code.
    dict
        Headers to add to the response.

    Raises
    ------
    :class:`.InternalServerError`
        Raised when there is a problem communicating with ES, or there was an
        unexpected problem executing the query.

    """
    if archives is not None and len(archives) == 0:
        raise NotFound('No such archive')

    # We may need to intervene on the request parameters, so we'll
    # reinstantiate as a mutable MultiDict.
    if isinstance(request_params, ImmutableMultiDict):
        request_params = MultiDict(request_params.items(multi=True))

    logger.debug('simple search form')
    response_data = {}  # type: Dict[str, Any]

    logger.debug('simple search request')
    if 'query' in request_params:
        try:
            # first check if the URL includes an arXiv ID
            arxiv_id: Optional[str] = identifier.parse_arxiv_id(
                request_params['query']
            )
            # If so, redirect.
            logger.debug(f"got arXiv ID: {arxiv_id}")
        except ValueError as e:
            logger.debug('No arXiv ID detected; fall back to form')
            arxiv_id = None
    else:
        arxiv_id = None

    if arxiv_id:
        headers = {'Location': url_for('abs_by_id', paper_id=arxiv_id)}
        return {}, status.HTTP_301_MOVED_PERMANENTLY, headers

    # Here we intervene on the user's query to look for holdouts from the
    # classic search system's author indexing syntax (surname_f). We
    # rewrite with a comma, and show a warning to the user about the
    # change.
    response_data['has_classic_format'] = False
    if 'searchtype' in request_params and 'query' in request_params:
        if request_params['searchtype'] in ['author', 'all']:
            _query, _classic = catch_underscore_syntax(request_params['query'])
            response_data['has_classic_format'] = _classic
            request_params['query'] = _query

    # Fall back to form-based search.
    form = SimpleSearchForm(request_params)

    if form.query.data:
        # Temporary workaround to support classic help search
        if form.searchtype.data == 'help':
            return {}, status.HTTP_301_MOVED_PERMANENTLY,\
                {'Location': 'https://arxiv.org/help/search?method=and'
                 f'&format=builtin-short&sort=score&words={form.query.data}'}

        # Support classic "expeirmental" search
        elif form.searchtype.data == 'full_text':
            return {}, status.HTTP_301_MOVED_PERMANENTLY,\
                {'Location': 'http://search.arxiv.org:8081/'
                             f'?in=&query={form.query.data}'}

    q: Optional[Query]
    if form.validate():
        logger.debug('form is valid')
        q = _query_from_form(form)

        if archives is not None:
            q = _update_with_archives(q, archives)

        # Pagination is handled outside of the form.
        q = paginate(q, request_params)

        try:
            # Execute the search. We'll use the results directly in
            #  template rendering, so they get added directly to the
            #  response content.
            response_data.update(asdict(index.search(q)))
        except index.IndexConnectionError as e:
            # There was a (hopefully transient) connection problem. Either
            #  this will clear up relatively quickly (next request), or
            #  there is a more serious outage.
            logger.error('IndexConnectionError: %s', e)
            raise InternalServerError(
                "There was a problem connecting to the search index. This is "
                "quite likely a transient issue, so please try your search "
                "again. If this problem persists, please report it to "
                "help@arxiv.org."
            ) from e
        except index.QueryError as e:
            # Base exception routers should pick this up and show bug page.
            logger.error('QueryError: %s', e)
            raise InternalServerError(
                "There was a problem executing your query. Please try your "
                "search again.  If this problem persists, please report it to "
                "help@arxiv.org."
            ) from e
        except index.OutsideAllowedRange as e:
            raise BadRequest(
                "Hello clever friend. You can't get results in that range"
                " right now."
            ) from e

        except Exception as e:
            logger.error('Unhandled exception: %s', str(e))
            raise
    else:
        logger.debug('form is invalid: %s', str(form.errors))
        if 'order' in form.errors or 'size' in form.errors:
            # It's likely that the user tried to set these parameters manually,
            # or that the search originated from somewhere else (and was
            # configured incorrectly).
            simple_url = url_for('ui.search')
            raise BadRequest(
                f"It looks like there's something odd about your search"
                f" request. Please try <a href='{simple_url}'>starting"
                f" over</a>.")
        q = None
    response_data['query'] = q
    response_data['form'] = form
    return response_data, status.HTTP_200_OK, {}


[docs]def retrieve_document(document_id: str) -> Response:
    """
    Retrieve an arXiv paper by ID.

    Parameters
    ----------
    document_id : str
        arXiv identifier for the paper.

    Returns
    -------
    dict
        Metadata about the paper.
    int
        HTTP status code.
    dict
        Headers to add to the response.

    Raises
    ------
    InternalServerError
        Encountered error in search query.
    NotFound
        No such document

    """
    try:
        result = index.get_document(document_id)
    except index.IndexConnectionError as e:
        # There was a (hopefully transient) connection problem. Either
        #  this will clear up relatively quickly (next request), or
        #  there is a more serious outage.
        logger.error('IndexConnectionError: %s', e)
        raise InternalServerError(
            "There was a problem connecting to the search index. This is "
            "quite likely a transient issue, so please try your search "
            "again. If this problem persists, please report it to "
            "help@arxiv.org."
        ) from e
    except index.QueryError as e:
        # Base exception routers should pick this up and show bug page.
        logger.error('QueryError: %s', e)
        raise InternalServerError(
            "There was a problem executing your query. Please try your "
            "search again.  If this problem persists, please report it to "
            "help@arxiv.org."
        ) from e
    except index.DocumentNotFound as e:
        logger.error('DocumentNotFound: %s', e)
        raise NotFound(f"Could not find a paper with id {document_id}") from e
    return {'document': result}, status.HTTP_200_OK, {}


def _update_with_archives(q: SimpleQuery, archives: List[str]) -> SimpleQuery:
    """
    Search within a group or archive.

    Parameters
    ----------
    q : :class:`SimpleQuery`
    groups_or_archives : str

    Returns
    -------
    :class:`SimpleQuery`
    """
    logger.debug('Search within %s', archives)
    q.classification = ClassificationList([
        Classification(archive={'id': archive})    # type: ignore
        for archive in archives
    ])
    return q


def _query_from_form(form: SimpleSearchForm) -> SimpleQuery:
    """
    Generate a :class:`.SimpleQuery` from valid :class:`.SimpleSearchForm`.

    Parameters
    ----------
    form : :class:`.SimpleSearchForm`
        Presumed to be filled and valid.

    Returns
    -------
    :class:`.SimpleQuery`

    """
    q = SimpleQuery()
    q.search_field = form.searchtype.data
    q.value = form.query.data
    q.hide_abstracts = form.abstracts.data == form.HIDE_ABSTRACTS
    order = form.order.data
    if order and order != 'None':
        q.order = order
    return q
Source code for search.controllers.simple

arXiv search

Navigation

Related Topics