Source code for search.services.fulltext
"""Provides access to fulltext content for arXiv papers."""
from functools import wraps
import os
from urllib.parse import urljoin
import json
import requests
from arxiv import status
from search.context import get_application_config, get_application_global
from search.domain import Fulltext
class FulltextSession(object):
    """
    An HTTP session with the fulltext endpoint.

    Parameters
    ----------
    endpoint : str
        Base URL for the fulltext endpoint. A trailing slash is appended if
        it is missing.

    Raises
    ------
    ValueError
        Raised when ``endpoint`` is empty.

    """

    def __init__(self, endpoint: str) -> None:
        """Validate the endpoint and initialize an HTTP session."""
        # Validate before touching the network stack so that a bad
        # configuration fails fast with a clear message (indexing an empty
        # string with ``[-1]`` would raise an opaque IndexError instead).
        if not endpoint:
            raise ValueError('Invalid value for endpoint')
        # ``urljoin`` replaces the last path segment of a base URL that
        # does not end in a slash, so make sure the base always ends in one.
        if not endpoint.endswith('/'):
            endpoint += '/'
        self.endpoint = endpoint

        self._session = requests.Session()
        # Retry failed HTTPS requests up to two times.
        self._adapter = requests.adapters.HTTPAdapter(max_retries=2)
        self._session.mount('https://', self._adapter)

    def retrieve(self, document_id: str) -> 'Fulltext':
        """
        Retrieve fulltext content for an arXiv paper.

        Parameters
        ----------
        document_id : str
            arXiv identifier, including version tag. E.g. ``"1234.56787v3"``.

        Returns
        -------
        :class:`.Fulltext`
            Includes the content itself, creation (extraction) date, and
            extractor version.

        Raises
        ------
        ValueError
            Raised when ``document_id`` is not a valid arXiv paper identifier.
        IOError
            Raised when unable to retrieve fulltext content.

        """
        if not document_id:    # This could use further elaboration.
            raise ValueError('Invalid value for document_id')

        try:
            # Go through the session configured in ``__init__`` so that the
            # mounted retry adapter is actually used; a bare
            # ``requests.get`` would bypass it entirely.
            response = self._session.get(urljoin(self.endpoint, document_id))
        except requests.exceptions.SSLError as e:
            raise IOError('SSL failed: %s' % e) from e

        if response.status_code != status.HTTP_200_OK:
            raise IOError('%s: could not retrieve fulltext: %i' %
                          (document_id, response.status_code))
        try:
            data = response.json()
        except ValueError as e:
            # ``json.JSONDecodeError`` is a ``ValueError`` subclass, and so
            # is the decode error raised when ``requests`` is backed by
            # ``simplejson``; catching the base class covers both.
            raise IOError('%s: could not decode response: %s' %
                          (document_id, e)) from e
        return Fulltext(**data)    # type: ignore
        # See https://github.com/python/mypy/issues/3937
def init_app(app: object = None) -> None:
    """
    Register the default fulltext configuration for an application.

    Looks up the configuration for ``app`` and calls ``setdefault`` for
    ``FULLTEXT_ENDPOINT``, so a value that is already configured is left
    alone.

    Parameters
    ----------
    app : object
        Application instance passed through to
        :func:`get_application_config`; defaults to ``None``.

    """
    get_application_config(app).setdefault(
        'FULLTEXT_ENDPOINT',
        'https://fulltext.arxiv.org/fulltext/',
    )
def get_session(app: object = None) -> FulltextSession:
    """
    Create a new session with the fulltext endpoint.

    The endpoint is read from the ``FULLTEXT_ENDPOINT`` key of the
    configuration for ``app``, falling back to the public arXiv fulltext
    URL when the key is absent.

    Parameters
    ----------
    app : object
        Application instance passed through to
        :func:`get_application_config`; defaults to ``None``.

    Returns
    -------
    :class:`.FulltextSession`
        A freshly constructed session; nothing is cached here.

    """
    settings = get_application_config(app)
    base_url = settings.get(
        'FULLTEXT_ENDPOINT',
        'https://fulltext.arxiv.org/fulltext/',
    )
    return FulltextSession(base_url)
def current_session() -> FulltextSession:
    """
    Get or create the :class:`.FulltextSession` for this context.

    When :func:`get_application_global` returns a truthy object, the
    session is stored on it as ``fulltext`` the first time it is needed
    and reused afterwards. Otherwise a new session is built on every call.

    Returns
    -------
    :class:`.FulltextSession`

    """
    app_global = get_application_global()
    if app_global:
        # Create the session once and keep it on the application global.
        if 'fulltext' not in app_global:
            app_global.fulltext = get_session()    # type: ignore
        return app_global.fulltext    # type: ignore
    # Nowhere to cache the session, so hand back a throwaway one.
    return get_session()
@wraps(FulltextSession.retrieve)
def retrieve(document_id: str) -> Fulltext:
    """Retrieve fulltext for an arXiv paper using the current session."""
    # Module-level convenience wrapper: resolve the session for the current
    # context, then delegate to its ``retrieve`` method.
    session = current_session()
    return session.retrieve(document_id)