Source code for search.process.tests
"""Tests for :mod:`search.transform`."""
from unittest import TestCase
import json
import jsonschema
from datetime import datetime, date
from search.process import transform
from search.domain import Document, DocMeta
class TestTransformMetdata(TestCase):
    """
    Test transformations for each of the metadata fields.

    Every test builds a :class:`DocMeta` carrying ``paper_id`` plus the field
    under test, passes it through :func:`transform.to_search_document`, and
    checks the corresponding attribute of the resulting document.
    """

    def test_id(self):
        """Field ``id`` is populated from ``paper_id``."""
        meta = DocMeta(**{'paper_id': '1234.56789'})
        doc = transform.to_search_document(meta)
        # No version is supplied, yet a ``v1`` suffix is expected; presumably
        # the version defaults to 1 -- TODO confirm against DocMeta.
        self.assertEqual(doc.id, '1234.56789v1')

    def test_abstract(self):
        """Field ``abstract`` is populated from ``abstract_utf8``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'abstract_utf8': 'abstract!'
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.abstract, 'abstract!')

    def test_authors(self):
        """Field ``authors`` is populated from ``authors_parsed``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'authors_parsed': [
                {
                    'first_name': 'B. Ivan',
                    'last_name': 'Dole'
                }
            ]
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.authors[0]['first_name'], 'B. Ivan')
        self.assertEqual(doc.authors[0]['last_name'], 'Dole')
        self.assertEqual(doc.authors[0]['full_name'], 'B. Ivan Dole',
                         "full_name should be generated from first_name and"
                         " last_name")
        self.assertEqual(doc.authors[0]['initials'], "B I",
                         "initials should be generated from first name")

    def test_authors_freeform(self):
        """Field ``authors_freeform`` is populated from ``authors_utf8``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'authors_utf8': 'authors!'
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.authors_freeform, 'authors!')

    def test_owners(self):
        """Field ``owners`` is populated from ``author_owners``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'author_owners': [
                {
                    'first_name': 'B. Ivan',
                    'last_name': 'Dole'
                }
            ]
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.owners[0]['first_name'], 'B. Ivan')
        self.assertEqual(doc.owners[0]['last_name'], 'Dole')
        self.assertEqual(doc.owners[0]['full_name'], 'B. Ivan Dole',
                         "full_name should be generated from first_name and"
                         " last_name")
        self.assertEqual(doc.owners[0]['initials'], "B I",
                         "initials should be generated from first name")

    def test_submitted_date(self):
        """Field ``submitted_date`` is populated from ``submitted_date``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'submitted_date': '2007-04-25T16:06:50-0400'
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.submitted_date, '2007-04-25T16:06:50-0400')

    def test_submitted_date_all(self):
        """``submitted_date_all`` is populated from ``submitted_date_all``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            "submitted_date_all": [
                "2007-04-25T15:58:28-0400", "2007-04-25T16:06:50-0400"
            ],
            'is_current': True
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.submitted_date_all[0], '2007-04-25T15:58:28-0400')
        self.assertEqual(doc.submitted_date_all[1], '2007-04-25T16:06:50-0400')
        # The first and last entries of the list are also expected to be
        # exposed as separate fields on the document.
        self.assertEqual(doc.submitted_date_first, '2007-04-25T15:58:28-0400',
                         "Should be populated from submitted_date_all")
        self.assertEqual(doc.submitted_date_latest, "2007-04-25T16:06:50-0400",
                         "Should be populated from submitted_date_all")

    def test_modified_date(self):
        """Field ``modified_date`` is populated from ``modified_date``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'modified_date': '2007-04-25T16:06:50-0400'
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.modified_date, '2007-04-25T16:06:50-0400')

    def test_updated_date(self):
        """Field ``updated_date`` is populated from ``updated_date``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'updated_date': '2007-04-25T16:06:50-0400'
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.updated_date, '2007-04-25T16:06:50-0400')

    def test_announced_date_first(self):
        """``announced_date_first`` populated from ``announced_date_first``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'announced_date_first': '2007-04'
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.announced_date_first, '2007-04')

    def test_is_withdrawn(self):
        """Field ``is_withdrawn`` is populated from ``is_withdrawn``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'is_withdrawn': False
        })
        doc = transform.to_search_document(meta)
        self.assertFalse(doc.is_withdrawn)

    def test_license(self):
        """Field ``license`` is populated from ``license``."""
        _license = {
            "label": "arXiv.org perpetual, non-exclusive license to"
                     " distribute this article",
            "uri": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/"
        }
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'license': _license
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.license['uri'], _license['uri'])
        self.assertEqual(doc.license['label'], _license['label'])

        # A license whose URI and label are both ``None`` should be replaced
        # by the module-level default.
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'license': {'uri': None, 'label': None}
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.license['uri'], transform.DEFAULT_LICENSE['uri'],
                         "The default license should be used")
        self.assertEqual(doc.license['label'],
                         transform.DEFAULT_LICENSE['label'],
                         "The default license should be used")

    def test_paper_version(self):
        """Field ``paper_id_v`` combines ``paper_id`` and ``version``."""
        meta = DocMeta(**{'paper_id': '1234.56789', 'version': 4})
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.paper_id_v, '1234.56789v4')

    def test_primary_classification(self):
        """``primary_classification`` set from ``primary_classification``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'primary_classification': {
                "group": {
                    "name": "Physics",
                    "id": "physics"
                },
                "archive": {
                    "name": "Astrophysics",
                    "id": "astro-ph"
                },
                "category": {
                    "name": "Astrophysics",
                    "id": "astro-ph"
                }
            }
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.primary_classification,
                         meta.primary_classification)

    def test_secondary_classification(self):
        """``secondary_classification`` from ``secondary_classification``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'secondary_classification': [{
                "group": {
                    "name": "Physics",
                    "id": "physics"
                },
                "archive": {
                    "name": "Astrophysics",
                    "id": "astro-ph"
                },
                "category": {
                    "name": "Astrophysics",
                    "id": "astro-ph"
                }
            }]
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.secondary_classification,
                         meta.secondary_classification)

    def test_title(self):
        """Field ``title`` is populated from ``title_utf8``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'title_utf8': 'foo title'
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.title, 'foo title')

    def test_title_utf8(self):
        """Non-ASCII characters in ``title_utf8`` are kept in ``title``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'title_utf8': 'foö title'
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.title, 'foö title')

    def test_source(self):
        """Field ``source`` is populated from ``source``."""
        _source = {"flags": "1", "format": "pdf", "size_bytes": 1230119}
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'source': _source
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.source, _source)

    def test_version(self):
        """Field ``version`` is populated from ``version``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'version': 25
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.version, 25)

    def test_submitter(self):
        """Field ``submitter`` is populated from ``submitter``."""
        _submitter = {
            "email": "s.mitter@cornell.edu",
            "name": "Sub Mitter",
            "name_utf8": "Süb Mitter"
        }
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'submitter': _submitter
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.submitter, _submitter)

    def test_report_num(self):
        """Field ``report_num`` is populated from ``report_num``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'report_num': "Physica A, 245 (1997) 181"
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.report_num, "Physica A, 245 (1997) 181")

    def test_proxy(self):
        """Field ``proxy`` is populated from ``proxy``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'proxy': True
        })
        doc = transform.to_search_document(meta)
        self.assertTrue(doc.proxy)

    def test_metadata_id(self):
        """Field ``metadata_id`` is populated from ``metadata_id``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'metadata_id': '690776'
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.metadata_id, '690776')

    def test_msc_class(self):
        """Field ``msc_class`` is populated from ``msc_class``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'msc_class': "03B70,68Q60"
        })
        doc = transform.to_search_document(meta)
        # The comma-separated string is expected to come back as a list.
        self.assertEqual(doc.msc_class, ["03B70", "68Q60"])

    def test_acm_class(self):
        """Field ``acm_class`` is populated from ``acm_class``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'acm_class': "F.4.1; D.2.4"
        })
        doc = transform.to_search_document(meta)
        # The semicolon-separated string is expected to come back as a list.
        self.assertEqual(doc.acm_class, ["F.4.1", "D.2.4"])

    def test_doi(self):
        """Field ``doi`` is populated from ``doi``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'doi': '10.1103/PhysRevD.76.104043'
        })
        doc = transform.to_search_document(meta)
        # A single DOI string is expected to come back as a one-item list.
        self.assertEqual(doc.doi, ['10.1103/PhysRevD.76.104043'])

    # Named ``test_comments`` so that it does not shadow ``test_metadata_id``
    # above; two methods sharing one name means only the last one ever runs.
    def test_comments(self):
        """Field ``comments`` is populated from ``comments_utf8``."""
        meta = DocMeta(**{
            'paper_id': '1234.56789',
            'comments_utf8': 'comments!'
        })
        doc = transform.to_search_document(meta)
        self.assertEqual(doc.comments, 'comments!')
class TestTransformBulkDocmeta(TestCase):
    """Test transformation of docmeta retrieved from bulk endpoint."""

    def test_transform(self):
        """All of the paper ID and version fields should be set correctly."""
        # JSON text is UTF-8; name the encoding explicitly so that reading
        # the fixture does not depend on the platform's default encoding.
        with open('tests/data/docmeta_bulk.json', encoding='utf-8') as f:
            data = json.load(f)

        docmeta = [DocMeta(**datum) for datum in data]
        documents = [transform.to_search_document(meta) for meta in docmeta]

        for doc in documents:
            # Identifier fields must be present and non-empty; only the
            # versioned identifier may carry the ``v`` marker.
            self.assertIsNotNone(doc.id)
            self.assertGreater(len(doc.id), 0)
            self.assertIsNotNone(doc.paper_id)
            self.assertGreater(len(doc.paper_id), 0)
            self.assertNotIn('v', doc.paper_id)
            self.assertIsNotNone(doc.paper_id_v)
            self.assertGreater(len(doc.paper_id_v), 0)
            self.assertIn('v', doc.paper_id_v)
            self.assertIsNotNone(doc.version)
            self.assertGreater(doc.version, 0)

            # NOTE(review): these checks treat version 2 as the latest
            # version of every paper in the fixture -- confirm against
            # tests/data/docmeta_bulk.json.
            if doc.version == 2:
                self.assertEqual(doc.latest, f"{doc.paper_id}v2")
                self.assertTrue(doc.is_current)
            else:
                self.assertFalse(doc.is_current)

            # Whichever version this is, the document ID is the versioned
            # paper ID.
            self.assertEqual(doc.id, doc.paper_id_v)
            self.assertEqual(doc.latest_version, 2)