Source code for search.services.index.highlighting

"""
Provide hit highlighting to the search.

Highlighting requires amending the query as well as post-processing of
the returned results. :func:`.highlight` adds a highlighting part to the query
in the Elasticsearch DSL. :func:`.add_highlighting` performs post-processing
of the search results. :func:`.preview` generates a TeX-safe snippet for
abridged display in the search results.
"""

import re
from typing import Any, Union

from elasticsearch_dsl import Search, Q, SF
from elasticsearch_dsl.response import Response, Hit
import bleach
from flask import escape
from arxiv.base import logging

from .util import TEXISM

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Markup wrapped around each highlighted hit. These exact strings are sent to
# the search as ``pre_tags``/``post_tags`` and are what the post-processing
# helpers in this module look for in the returned text.
HIGHLIGHT_TAG_OPEN = '<span class="search-hit mathjax">'
HIGHLIGHT_TAG_CLOSE = '</span>'


def highlight(search: Search) -> Search:
    """
    Apply hit highlighting to the search, before execution.

    Parameters
    ----------
    search : :class:`.Search`
        The search to which highlighting requests should be added.

    Returns
    -------
    :class:`.Search`
        The search object returned by the chained ``highlight`` calls,
        updated to include requests for hit highlighting.

    """
    # Highlight class .search-hit defined in search.sass
    search = search.highlight_options(
        pre_tags=[HIGHLIGHT_TAG_OPEN],
        post_tags=[HIGHLIGHT_TAG_CLOSE]
    )

    # One entry per highlight request, in the order they are issued.
    # Setting number_of_fragments to 0 tells ES to highlight the entire
    # field. A trailing ``*`` matches any field whose name begins with the
    # given prefix (e.g. ``author*``).
    whole_plain = {'type': 'plain', 'number_of_fragments': 0}
    whole = {'number_of_fragments': 0}
    plain = {'type': 'plain'}
    requests = (
        ('title', whole_plain),
        ('title.english', whole_plain),
        ('title.tex', whole_plain),
        ('comments', whole),
        ('author*', {}),
        ('owner*', {}),
        ('announced_date_first', {}),
        ('submitter*', {}),
        ('journal_ref', plain),
        ('acm_class', whole),
        ('msc_class', whole),
        ('doi', plain),
        ('report_num', plain),
        ('abstract', whole),
        ('abstract.tex', whole_plain),
        ('abstract.english', whole),
    )
    for field, options in requests:
        # ``**options`` hands each call its own fresh keyword dict.
        search = search.highlight(field, **options)
    return search


def preview(value: str, fragment_size: int = 400,
            start_tag: str = HIGHLIGHT_TAG_OPEN,
            end_tag: str = HIGHLIGHT_TAG_CLOSE) -> str:
    """
    Generate a snippet preview that doesn't break TeXisms or highlighting.

    The snippet is centred on the first highlighted hit when there is one;
    otherwise it starts at the beginning of ``value``.

    Parameters
    ----------
    value : str
        The full text of the field, which we assume contains TeXisms and/or
        hit highlighting tags.
    fragment_size : int
        The desired size of the preview (number of characters). The actual
        preview may be smaller or larger than this target, depending on where
        the TeXisms and highlight tags are located.
    start_tag : str
        The opening tag used for hit highlighting.
    end_tag : str
        The closing tag used for hit highlighting.

    Returns
    -------
    str
        A preview that is approximately ``fragment_size`` long. It is
        prefixed and/or suffixed with ``&hellip;`` when text was cut off at
        the corresponding end.

    """
    # Locate the first highlighted hit: the first opening tag and the first
    # closing tag that comes after it.
    hit_start = value.find(start_tag)
    hit_end = -1
    if hit_start >= 0:
        hit_end = value.find(end_tag, hit_start + len(start_tag))

    if hit_start >= 0 and hit_end >= 0:
        start = hit_start
        end = hit_end + len(end_tag)
        # Roll back the start until we hit a TeXism or HTML tag, or we get
        # roughly half the target fragment size.
        lead_budget = round((fragment_size - (end - start)) / 2)
        while (start > 0 and hit_start - start < lead_budget
               # This may or may not be an actual HTML tag or TeX, but it
               # doesn't hurt to play it safe.
               and value[start - 1] not in '$>'):
            start -= 1
        if start > 0:
            # Move the start forward slightly, to find a word boundary. Never
            # move past the hit itself, otherwise the opening tag would be
            # cut. A preceding ``>`` already is a safe place to start.
            while (start < hit_start
                   and value[start - 1] not in '.,!? \t\n$<>'):
                start += 1
    else:
        # There is no highlighting; we'll start at the beginning, and find
        # a safe place to end. Scanning from offset 0 keeps a leading TeXism
        # or tag visible to ``_end_safely``.
        start = 0
        end = 0

    # Jump the end forward until we consume (as much as possible of) the
    # rest of the target fragment size.
    remaining = max(0, fragment_size - (end - start))
    end += _end_safely(value[end:], remaining, start_tag=start_tag,
                       end_tag=end_tag)
    snippet = value[start:end].strip()

    # If the snippet ends inside a highlighted span, close that span. Use the
    # tags we were given rather than the module defaults.
    if snippet.rfind(start_tag) > snippet.rfind(end_tag):
        snippet += end_tag

    leading = '&hellip;' if start > 0 else ''
    trailing = '&hellip;' if end < len(value) else ''
    return leading + snippet + trailing


def add_highlighting(result: dict, raw: Union[Response, Hit]) -> dict:
    """
    Add hit highlighting to a search result.

    Parameters
    ----------
    result : dict
        Contains processed search result data destined for the caller. The
        ``highlight``, ``match`` and ``preview`` items are created as empty
        dicts if they are not already present.
    raw : :class:`.Response` or :class:`.Hit`
        A response from Elasticsearch; only its ``meta`` attribute is read.

    Returns
    -------
    dict
        The ``result`` object, updated with ``highlight``, ``match`` and
        ``preview`` items.

    """
    highlights = result.setdefault('highlight', {})
    matches = result.setdefault('match', {})
    previews = result.setdefault('preview', {})

    # There may or may not be highlighting in the result set.
    highlighted_fields = getattr(raw.meta, 'highlight', None)

    # ``meta.matched_queries`` contains a list of query ``_name``s that
    # matched. This is nice for non-string fields.
    matched_fields = getattr(raw.meta, 'matched_queries', [])

    # These are from hits within child documents, e.g.
    # secondary_classification.
    inner_hits = getattr(raw.meta, 'inner_hits', None)

    # The values here will (almost) always be list-like. So we need to stitch
    # them together. If there are no highlights from ES, ``dir(None)`` yields
    # only underscore-prefixed names, all of which are skipped below.
    for field in dir(highlighted_fields):
        if field.startswith('_'):
            continue

        # A hit on authors may originate in several different fields, most
        # of which are not displayed. And in any case, author names may be
        # truncated. So instead of highlighting author names themselves, we
        # set a 'flag' that can get picked up in the template and highlight
        # the entire author field.
        if field.startswith(('author', 'owner', 'submitter')):
            matches['author'] = True
            continue

        value = getattr(highlighted_fields, field)
        # A bare string is iterable too; joining it would put an ellipsis
        # between every character, so only stitch real collections.
        if not isinstance(value, str) and hasattr(value, '__iter__'):
            value = '&hellip;'.join(value)

        # Non-TeX searches may hit inside of TeXisms. Highlighting those
        # fragments (i.e. inserting HTML) will break MathJax rendering.
        # To guard against this while preserving highlighting, we move
        # any highlighting tags from within TeXisms to encapsulate the
        # entire TeXism.
        # NOTE(review): fields outside this list (including the ``.tex``
        # variants) are stored without escaping -- confirm that whatever
        # renders them handles untrusted markup.
        if field in ('title', 'title.english',
                     'abstract', 'abstract.english'):
            value = _highlight_whole_texism(value)
            value = _escape(value)

        highlights[field] = value

    for field in matched_fields:
        if field not in highlights:
            matches[field] = True

    # We're using inner_hits to see which category in particular responded to
    # the query.
    if hasattr(inner_hits, 'secondary_classification'):
        matches['secondary_classification'] = [
            ih.category.id for ih in inner_hits.secondary_classification
        ]

    # We just want to know whether there was a hit on the announcement date.
    matches['announced_date_first'] = 'announced_date_first' in matched_fields

    # Collapse the per-analyzer variants of each field onto the base name.
    # If there is a hit in a TeX field, we prefer highlighting on that
    # field, since other tokenizers will clobber the TeX; next comes the
    # English-analyzed variant, and finally the plain field.
    for field in ('abstract', 'title'):
        tex_field = f'{field}.tex'
        for candidate in (tex_field, f'{field}.english', field):
            if candidate in highlights:
                highlights[field] = highlights[candidate]
                break
        # The TeX variant is never exposed under its own name.
        highlights.pop(tex_field, None)

    if 'abstract' in highlights:
        previews['abstract'] = preview(highlights['abstract'])
    return result


def _strip_highlight_and_enclose(match: Any) -> str:
    """
    Re-wrap a matched TeXism so that highlighting surrounds all of it.

    Intended as the replacement callback for a regex substitution: ``match``
    is a match object whose whole match (group 0) is one TeXism. A TeXism
    containing no highlight markup is returned untouched; otherwise every
    highlight tag inside it is removed and a single open/close pair is put
    around the result.
    """
    texism: str = match.group(0)
    contains_markup = (HIGHLIGHT_TAG_OPEN in texism
                       or HIGHLIGHT_TAG_CLOSE in texism)
    if not contains_markup:
        return texism

    # Drop the inner tags, then enclose the bare TeXism in one pair.
    bare = texism.replace(HIGHLIGHT_TAG_OPEN, "")
    bare = bare.replace(HIGHLIGHT_TAG_CLOSE, "")
    return HIGHLIGHT_TAG_OPEN + bare + HIGHLIGHT_TAG_CLOSE


def _highlight_whole_texism(value: str) -> str:
    """
    Widen highlighting that falls inside a TeXism to cover the whole TeXism.

    Every ``TEXISM`` match in ``value`` is passed through
    :func:`_strip_highlight_and_enclose`; text outside of matches is left
    as it is.
    """
    texism_pattern = re.compile(TEXISM)
    return texism_pattern.sub(_strip_highlight_and_enclose, value)


def _escape(value: str) -> str:
    """
    Escape anything that isn't part of highlighting.

    Ideally, we'd use bleach.clean to do this for us. Unfortunately, it just
    gets too tripped up on equation content to use it reliably. Sometimes it
    throws exceptions when it hits equations that look like (but are not)
    HTML, and other times it panics. Since we really only have one tag-pair
    that we care to preserve, this approach works well enough for our purposes.

    Parameters
    ----------
    value : str
        Text that may contain ``HIGHLIGHT_TAG_OPEN``/``HIGHLIGHT_TAG_CLOSE``.

    Returns
    -------
    str
        ``value`` with every stretch of text between highlight tags passed
        through ``escape``, and the highlight tags themselves left verbatim.

    """
    # Split on either tag, keeping the tags (capturing group). The result
    # alternates text, tag, text, tag, ..., text, so odd positions are tags.
    tag_pattern = '({}|{})'.format(re.escape(HIGHLIGHT_TAG_OPEN),
                                   re.escape(HIGHLIGHT_TAG_CLOSE))
    pieces = re.split(tag_pattern, value)
    return ''.join(
        piece if position % 2 else str(escape(piece))
        for position, piece in enumerate(pieces)
    )


def _start_safely(value: str, start: int, end: int, fragment_size: int,
                  tolerance: int = 0, start_tag: str = HIGHLIGHT_TAG_OPEN,
                  end_tag: str = HIGHLIGHT_TAG_CLOSE) -> int:
    """
    Pick a fragment start that avoids beginning inside a tag or a TeXism.

    Parameters
    ----------
    value : str
        The full text being abridged.
    start : int
        Index in ``value`` of the current fragment start.
    end : int
        Index in ``value`` of the current fragment end.
    fragment_size : int
        How far back from ``start`` to look.
    tolerance : int
        Extra characters, beyond ``fragment_size``, that may be considered.
    start_tag : str
        The opening tag used for hit highlighting.
    end_tag : str
        The closing tag used for hit highlighting.

    Returns
    -------
    int
        An index into ``value`` at which to start the fragment.

    Notes
    -----
    NOTE(review): nothing in the visible part of this module calls this
    helper, and several of its index computations look suspect (see the
    inline notes). Confirm the intended behaviour before relying on it.

    """
    # Try to maximize the length of the fragment up to the fragment_size, but
    # avoid starting in the middle of a tag or a TeXism.
    space_remaining = (fragment_size + tolerance) - (end - start)

    # NOTE(review): when ``start`` is smaller than the amount subtracted, the
    # lower slice bounds below are negative and therefore count from the end
    # of ``value`` -- presumably not intended.
    remainder = value[start - fragment_size:start]
    acceptable = value[start - fragment_size - tolerance:start]
    if end_tag in remainder:
        # Relative index of the first end tag.
        # NOTE(review): membership is tested on the ``fragment_size`` window
        # but ``index`` runs on the ``space_remaining`` window, so ``index``
        # raises ValueError if the tag lies only in the former.
        first_end_tag = value[start - space_remaining:start].index(end_tag)
        # NOTE(review): ``first_end_tag`` is relative to the window, yet it
        # is used as an absolute upper bound in the two slices below.
        if start_tag in value[start - space_remaining:first_end_tag]:
            target_area = value[start - space_remaining:first_end_tag]
            first_start_tag = target_area.index(start_tag)
            return (start - space_remaining) + first_start_tag
    elif '$' in remainder:
        # Look for a complete TeXism within the tolerated window.
        m = TEXISM.search(acceptable)
        if m is None:   # Can't get to opening
            # Position derived from the last ``$`` before ``start``.
            return start - remainder[::-1].index('$') + 1
        return (start - fragment_size - tolerance) + m.start()

    # Ideally, we hit the fragment size without entering a tag or TeXism.
    # NOTE(review): this can be negative when ``start < fragment_size``.
    return start - fragment_size


def _end_safely(value: str, remaining: int,
                start_tag: str = HIGHLIGHT_TAG_OPEN,
                end_tag: str = HIGHLIGHT_TAG_CLOSE) -> int:
    """
    Find a fragment end that doesn't break TeXisms or HTML.

    Parameters
    ----------
    value : str
        The text that follows the part of the fragment already selected.
    remaining : int
        The ideal number of characters of ``value`` to include.
    start_tag : str
        The opening tag used for hit highlighting.
    end_tag : str
        The closing tag used for hit highlighting.

    Returns
    -------
    int
        The number of characters of ``value`` to include. This is
        ``remaining`` itself when the ideal end does not fall inside a
        TeXism or highlighted span (even if ``value`` is shorter than that);
        otherwise it is the offset at which the offending unit begins.

    """
    # The tags are literal text, not regex syntax.
    open_re = re.escape(start_tag)
    close_re = re.escape(end_tag)
    # Should match on either a TeXism or a TeXism enclosed in highlight tags.
    # TeXisms may be enclosed in pairs of $$ or $.
    protected = re.compile(r'|'.join([
        r'([\$]{2}[^\$]+[\$]{2})',
        r'([\$]{1}[^\$]+[\$]{1})',
        r'(%s[\$]{2}[^\$]+[\$]{2}%s)' % (open_re, close_re),
        r'(%s[\$]{1}[^\$]+[\$]{1}%s)' % (open_re, close_re),
        # Non-greedy, so that each highlighted span is its own unit rather
        # than everything from the first opening tag to the last closing tag.
        r'(%s[^\$]+?%s)' % (open_re, close_re)
    ]))

    position = 0
    while True:
        unit = protected.search(value, position)
        if unit is None:
            # Nothing to worry about; the coast is clear.
            return remaining
        if remaining <= unit.start():
            # The ideal end falls before the next TeX/tag.
            return remaining
        if unit.end() <= remaining:
            # The whole unit fits; keep looking beyond it.
            position = unit.end()
            continue
        # We can't make it past the end of the next TeX/tag without
        # exceeding the target fragment size, so we will end at the
        # beginning of the match.
        return unit.start()
Source code for search.services.index.highlighting

arXiv search

Navigation

Related Topics