"""Core concepts for characterizing bitstream/version content."""
from enum import Enum
from typing import List, Optional
from .identifier import VersionedIdentifier
class SourceFileType(Enum):
    """
    Source file types are represented by single-character codes.

    Each member's value is the one-character code; a member can be looked up
    from its code with ``SourceFileType('D')``.
    """

    Ignore = 'I'
    """All files auto ignore. No paper available."""

    SourceEncrypted = 'S'
    """Source is encrypted and should not be made available."""

    PostscriptOnly = 'P'
    """
    Multi-file PS submission.

    It is not necessary to indicate P with single file PS since in this case
    the source file has .ps.gz extension.
    """

    PDFLaTeX = 'D'
    """A TeX submission that must be processed with PDFlatex."""

    HTML = 'H'
    """Multi-file HTML submission."""

    Ancillary = 'A'
    """Submission includes ancillary files in the /anc directory."""

    DCPilot = 'B'
    """Submission has associated data in the DC pilot system."""

    DOCX = 'X'
    """Submission in Microsoft DOCX (Office Open XML) format."""

    ODF = 'O'
    """Submission in Open Document Format."""

    PDFOnly = 'F'
    """PDF-only with .tar.gz package (likely because of anc files)."""
class SourceType(str):
    """
    Characterizes a version source package.

    The string value is a sequence of single-character codes, each of which
    is parsed (case-insensitively) as a :class:`.SourceFileType`.
    """

    def __init__(self, value: str) -> None:
        """
        Initialize with source file type codes.

        Raises :class:`ValueError` if any character of ``value`` is not a
        valid :class:`.SourceFileType` code.
        """
        # Codes are upper-cased before lookup, so 'd' and 'D' are equivalent.
        self._types = [SourceFileType(code) for code in value.upper()]

    @property
    def has_docx(self) -> bool:
        """Indicate whether the source has DOCX content."""
        return SourceFileType.DOCX in self._types

    @property
    def has_encrypted_source(self) -> bool:
        """Indicate whether the source is encrypted."""
        return SourceFileType.SourceEncrypted in self._types

    @property
    def has_html(self) -> bool:
        """Indicate whether the source has HTML content."""
        return SourceFileType.HTML in self._types

    @property
    def has_ignore(self) -> bool:
        """Indicate whether the source content should be ignored."""
        return SourceFileType.Ignore in self._types

    @property
    def has_odf(self) -> bool:
        """Indicate whether the source has ODF content."""
        return SourceFileType.ODF in self._types

    @property
    def has_pdf_only(self) -> bool:
        """Indicate whether the source contains only a PDF."""
        return SourceFileType.PDFOnly in self._types

    @property
    def has_pdflatex(self) -> bool:
        """Indicate whether the source has PDFLaTeX content."""
        return SourceFileType.PDFLaTeX in self._types

    @property
    def has_ps_only(self) -> bool:
        """Indicate whether the source has postscript content only."""
        return SourceFileType.PostscriptOnly in self._types

    @property
    def available_formats(self) -> List['ContentType']:
        """
        List the available dissemination formats for this source type.

        Depending on the original source type, we may not be able to provide
        all supported formats.

        This does not include the source format. Note also that this does
        **not** enforce rules about what should be displayed as an option
        or provided to end users.
        """
        formats: List['ContentType'] = []
        # The branches are mutually exclusive and checked in priority order;
        # only the first matching source characteristic contributes formats.
        if self.has_ignore and not self.has_encrypted_source:
            # Ignored (and unencrypted) source: no dissemination formats.
            pass
        elif self.has_ps_only:
            formats.extend([ContentType.pdf, ContentType.ps])
        elif self.has_pdflatex:
            formats.append(ContentType.pdf)
        elif self.has_pdf_only:
            formats.append(ContentType.pdf)
        elif self.has_html:
            formats.append(ContentType.html)
        elif self.has_docx or self.has_odf:
            formats.append(ContentType.pdf)
        else:
            # No special-case code applies: offer PDF, PS and DVI.
            formats.extend([
                ContentType.pdf,
                ContentType.ps,
                ContentType.dvi,
            ])
        return formats
class ContentType(Enum):
    """Characterization of the content type of an individual bitstream."""

    pdf = 'pdf'
    tar = 'tar'
    json = 'json'
    abs = 'abs'
    html = 'html'
    dvi = 'dvi'
    ps = 'ps'
    tex = 'tex'

    @property
    def mime_type(self) -> str:
        """The MIME content type for this :class:`.ContentType`."""
        return _mime_types[self]

    @property
    def ext(self) -> str:
        """The preferred filename extension for this :class:`.ContentType`."""
        return _extensions[self]

    @classmethod
    def from_filename(cls, filename: str) -> 'ContentType':
        """
        Infer the :class:`.ContentType` of a file from its filename.

        A filename matches a content type when it ends with ``.<ext>`` or
        ``.<ext>.gz``. Raises :class:`ValueError` if no known extension
        matches.
        """
        for ctype, ext in _extensions.items():
            # Require the separating dot: a bare suffix test would classify
            # e.g. ``figure.eps`` as ``ps`` or ``notes.latex`` as ``tex``.
            if filename.endswith((f'.{ext}', f'.{ext}.gz')):
                return ctype
        raise ValueError(f'Unrecognized extension: {filename}')

    @classmethod
    def from_mimetype(cls, mime: str) -> 'ContentType':
        """
        Infer the :class:`.ContentType` of a file from its MIME type.

        Raises :class:`KeyError` if ``mime`` is not one of the known MIME
        types.
        """
        # Invert the ContentType -> MIME table for the reverse lookup.
        by_mime = {mime_type: ctype for ctype, mime_type in _mime_types.items()}
        return by_mime[mime]

    def make_filename(self, identifier: VersionedIdentifier,
                      is_gzipped: bool = False) -> str:
        """
        Make a filename for a bitstream with this :class:`.ContentType`.

        Old-style identifiers contribute only their numeric part and version;
        other identifiers are rendered with ``str()``. When ``is_gzipped`` is
        true, ``.gz`` is appended after the content-type extension.
        """
        if identifier.is_old_style:
            fn = f'{identifier.numeric_part}v{identifier.version}.{self.ext}'
        else:
            fn = f'{identifier}.{self.ext}'
        if is_gzipped:
            fn = f'{fn}.gz'
        return fn
# MIME type for each ContentType; backs ``ContentType.mime_type`` and is
# inverted by ``ContentType.from_mimetype``.
_mime_types = {
    ContentType.pdf: 'application/pdf',
    ContentType.tar: 'application/x-tar',
    ContentType.json: 'application/json',
    ContentType.abs: 'text/plain',
    ContentType.html: 'text/html',
    ContentType.dvi: 'application/x-dvi',
    ContentType.ps: 'application/postscript',
    ContentType.tex: 'application/x-tex',
}
# Preferred filename extension for each ContentType, stored without the
# leading dot; backs ``ContentType.ext`` and ``ContentType.from_filename``.
_extensions = {
    ContentType.pdf: 'pdf',
    ContentType.tar: 'tar',
    ContentType.json: 'json',
    ContentType.abs: 'abs',
    ContentType.html: 'html',
    ContentType.dvi: 'dvi',
    ContentType.ps: 'ps',
    ContentType.tex: 'tex',
}
# Ordered (extension, formats) pairs. Compound suffixes such as ``.tar.gz``
# come before the bare ``.gz``.
# NOTE(review): presumably consumers take the first matching suffix, and
# ``None`` means the formats cannot be inferred from the extension alone --
# confirm against callers.
DISSEMINATION_FORMATS_BY_SOURCE_EXT = [
    ('.tar.gz', None),
    ('.tar', None),
    ('.dvi.gz', None),
    ('.dvi', None),
    ('.pdf', [ContentType.pdf]),
    ('.ps.gz', [ContentType.pdf, ContentType.ps]),
    ('.ps', [ContentType.pdf, ContentType.ps]),
    ('.html.gz', [ContentType.html]),
    ('.html', [ContentType.html]),
    ('.gz', None),
]
"""
Dissemination formats that can be inferred from source file extension.

.. note::
   This is largely to support format discovery in classic. In the NG
   canonical record, this should all be explicit.
"""
def list_source_extensions() -> List[str]:
    """
    List all of the known filename extensions for source files.

    Extensions are returned in the order in which they appear in
    ``DISSEMINATION_FORMATS_BY_SOURCE_EXT``, each including its leading dot.
    """
    return [ext for ext, _ in DISSEMINATION_FORMATS_BY_SOURCE_EXT]