# Source code for arxiv.canonical.domain.content

"""Core concepts for characterizing bitstream/version content."""

from enum import Enum
from typing import List, Optional

from .identifier import VersionedIdentifier


class SourceFileType(Enum):
    """
    Source file types are represented by single-character codes.

    Each member's value is the one-letter code for that type, so a member
    can be looked up from its code with ``SourceFileType('D')``.
    """

    Ignore = 'I'
    """All files auto ignore. No paper available."""

    SourceEncrypted = 'S'
    """Source is encrypted and should not be made available."""

    PostscriptOnly = 'P'
    """
    Multi-file PS submission.

    It is not necessary to indicate P with single file PS since in this case
    the source file has .ps.gz extension.
    """

    PDFLaTeX = 'D'
    """A TeX submission that must be processed with PDFlatex."""

    HTML = 'H'
    """Multi-file HTML submission."""

    Ancillary = 'A'
    """Submission includes ancillary files in the /anc directory."""

    DCPilot = 'B'
    """Submission has associated data in the DC pilot system."""

    DOCX = 'X'
    """Submission in Microsoft DOCX (Office Open XML) format."""

    ODF = 'O'
    """Submission in Open Document Format."""

    PDFOnly = 'F'
    """PDF-only with .tar.gz package (likely because of anc files)."""


class SourceType(str):
    """
    Characterizes a version source package.

    The string value is a sequence of single-character codes. Each character
    (compared case-insensitively) must be a valid :class:`.SourceFileType`
    value.
    """

    def __init__(self, value: str) -> None:
        """
        Initialize with source file type codes.

        Raises :class:`ValueError` if any character of ``value`` is not a
        known :class:`.SourceFileType` code.
        """
        # The raw string itself is stored by ``str.__new__``; here we only
        # decode each (upper-cased) character into its SourceFileType.
        self._types = [SourceFileType(code) for code in value.upper()]

    @property
    def has_docx(self) -> bool:
        """Indicate whether the source has DOCX content."""
        return SourceFileType.DOCX in self._types

    @property
    def has_encrypted_source(self) -> bool:
        """Indicate whether the source is encrypted."""
        return SourceFileType.SourceEncrypted in self._types

    @property
    def has_html(self) -> bool:
        """Indicate whether the source has HTML content."""
        return SourceFileType.HTML in self._types

    @property
    def has_ignore(self) -> bool:
        """Indicate whether the source content should be ignored."""
        return SourceFileType.Ignore in self._types

    @property
    def has_odf(self) -> bool:
        """Indicate whether the source has ODF content."""
        return SourceFileType.ODF in self._types

    @property
    def has_pdf_only(self) -> bool:
        """Indicate whether the source contains only a PDF."""
        return SourceFileType.PDFOnly in self._types

    @property
    def has_pdflatex(self) -> bool:
        """Indicate whether the source has PDFLaTeX content."""
        return SourceFileType.PDFLaTeX in self._types

    @property
    def has_ps_only(self) -> bool:
        """Indicate whether the source has PostScript content only."""
        return SourceFileType.PostscriptOnly in self._types

    @property
    def available_formats(self) -> List['ContentType']:
        """
        List the available dissemination formats for this source type.

        Depending on the original source type, we may not be able to provide
        all supported formats.

        This does not include the source format. Note also that this does
        **not** enforce rules about what should be displayed as an option
        or provided to end users.

        The checks are applied in priority order and only the first matching
        rule contributes formats; a source with none of the special codes
        gets PDF, PS and DVI.
        """
        formats: List['ContentType'] = []
        if self.has_ignore and not self.has_encrypted_source:
            # Ignored, unencrypted source: no formats are offered.
            # NOTE: a source that is both ignored *and* encrypted does not
            # stop here; it falls through to the remaining checks below.
            pass
        elif self.has_ps_only:
            formats.extend([ContentType.pdf, ContentType.ps])
        elif self.has_pdflatex:
            formats.append(ContentType.pdf)
        elif self.has_pdf_only:
            formats.append(ContentType.pdf)
        elif self.has_html:
            formats.append(ContentType.html)
        elif self.has_docx or self.has_odf:
            formats.append(ContentType.pdf)
        else:
            formats.extend([
                ContentType.pdf,
                ContentType.ps,
                ContentType.dvi,
            ])
        return formats


class ContentType(Enum):
    """Characterization of the content type of an individual bitstream."""

    pdf = 'pdf'
    tar = 'tar'
    json = 'json'
    abs = 'abs'
    html = 'html'
    dvi = 'dvi'
    ps = 'ps'
    tex = 'tex'

    @property
    def mime_type(self) -> str:
        """The MIME content type for this :class:`.ContentType`."""
        return _mime_types[self]

    @property
    def ext(self) -> str:
        """The preferred filename extension for this :class:`.ContentType`."""
        return _extensions[self]

    @classmethod
    def from_filename(cls, filename: str) -> 'ContentType':
        """
        Infer the :class:`.ContentType` of a file from its filename.

        A filename matches a content type when it ends with ``.<ext>`` or
        ``.<ext>.gz``, where ``<ext>`` is that type's preferred extension.

        Raises :class:`ValueError` if no known extension matches.
        """
        for ctype, ext in _extensions.items():
            # Anchor the match on the dot separator so that names which
            # merely end in the same letters (e.g. ``maps`` vs. ``ps``,
            # ``x.latex`` vs. ``tex``) are not misclassified.
            if filename.endswith((f'.{ext}', f'.{ext}.gz')):
                return ctype
        raise ValueError(f'Unrecognized extension: {filename}')

    @classmethod
    def from_mimetype(cls, mime: str) -> 'ContentType':
        """
        Infer the :class:`.ContentType` of a file from its MIME type.

        Raises :class:`KeyError` if ``mime`` is not one of the known MIME
        types.
        """
        # Invert the ContentType -> MIME type table for the reverse lookup.
        return {v: k for k, v in _mime_types.items()}[mime]

    def make_filename(self, identifier: VersionedIdentifier,
                      is_gzipped: bool = False) -> str:
        """
        Make a filename for a bitstream with this :class:`.ContentType`.

        For an old-style identifier the name is built from its numeric part
        and version (``<numeric_part>v<version>.<ext>``); otherwise the
        string form of the identifier is used as the stem. If ``is_gzipped``
        is true, ``.gz`` is appended.
        """
        if identifier.is_old_style:
            fn = f'{identifier.numeric_part}v{identifier.version}.{self.ext}'
        else:
            fn = f'{identifier}.{self.ext}'
        if is_gzipped:
            fn = f'{fn}.gz'
        return fn


# MIME type for each ContentType member. Read by ``ContentType.mime_type``
# and inverted by ``ContentType.from_mimetype`` for the reverse lookup, so
# the values should stay unique.
_mime_types = {
    ContentType.pdf: 'application/pdf',
    ContentType.tar: 'application/x-tar',
    ContentType.json: 'application/json',
    ContentType.abs: 'text/plain',
    ContentType.html: 'text/html',
    ContentType.dvi: 'application/x-dvi',
    ContentType.ps: 'application/postscript',
    ContentType.tex: 'application/x-tex',
}

# Preferred filename extension (without a leading dot) for each ContentType
# member. Read by ``ContentType.ext`` and scanned in insertion order by
# ``ContentType.from_filename``.
_extensions = {
    ContentType.pdf: 'pdf',
    ContentType.tar: 'tar',
    ContentType.json: 'json',
    ContentType.abs: 'abs',
    ContentType.html: 'html',
    ContentType.dvi: 'dvi',
    ContentType.ps: 'ps',
    ContentType.tex: 'tex'
}


# Ordered (filename suffix, formats) pairs. ``available_formats_by_ext``
# returns the formats of the FIRST suffix that matches, so compound suffixes
# such as '.tar.gz' or '.ps.gz' must stay ahead of the catch-all '.gz'.
# A formats value of None presumably means the formats cannot be inferred
# from the extension alone -- TODO confirm against callers.
DISSEMINATION_FORMATS_BY_SOURCE_EXT = [
    ('.tar.gz', None),
    ('.tar', None),
    ('.dvi.gz', None),
    ('.dvi', None),
    ('.pdf', [ContentType.pdf]),
    ('.ps.gz', [ContentType.pdf, ContentType.ps]),
    ('.ps', [ContentType.pdf, ContentType.ps]),
    ('.html.gz', [ContentType.html]),
    ('.html', [ContentType.html]),
    ('.gz', None),
]
"""
Dissemination formats that can be inferred from source file extension.

.. note::
    This is largely to support format discovery in classic. In the NG
    canonical record, this should all be explicit.
"""


def available_formats_by_ext(filename: str) -> Optional[List[ContentType]]:
    """
    Attempt to determine the available dissemination formats by file extension.

    It is sometimes (but not always) possible to infer the available
    dissemination formats based on the filename extension of the source
    package.

    The entries of ``DISSEMINATION_FORMATS_BY_SOURCE_EXT`` are checked in
    order and the formats of the first suffix that ``filename`` ends with are
    returned. ``None`` is returned both when the matching entry carries no
    formats and when no suffix matches at all.

    .. note::
        This is largely to support format discovery in classic. In the NG
        canonical record, this should all be explicit.

    """
    for ext, formats in DISSEMINATION_FORMATS_BY_SOURCE_EXT:
        if filename.endswith(ext):
            return formats
    return None


def list_source_extensions() -> List[str]:
    """
    List all of the known filename extensions for source files.

    The extensions include their leading dot and are returned in the same
    order as they appear in ``DISSEMINATION_FORMATS_BY_SOURCE_EXT``.
    """
    return [ext for ext, _ in DISSEMINATION_FORMATS_BY_SOURCE_EXT]
# Source code for arxiv.canonical.domain.content
#
# arXiv Canonical Record
#
# Navigation
#
# Related Topics