From eefc9f3d1e632e5c3b8dd15b267f471fd4db9a9d Mon Sep 17 00:00:00 2001 From: thomasallen Date: Wed, 11 Mar 2026 12:45:54 -0700 Subject: [PATCH 1/3] add resolved_reference columns --- adsrefpipe/app.py | 11 +++- adsrefpipe/models.py | 14 +++-- adsrefpipe/tests/unittests/test_app.py | 72 +++++++++++++++++++++++++- 3 files changed, 91 insertions(+), 6 deletions(-) diff --git a/adsrefpipe/app.py b/adsrefpipe/app.py index fd9b22b..de5317f 100755 --- a/adsrefpipe/app.py +++ b/adsrefpipe/app.py @@ -433,6 +433,9 @@ def update_resolved_reference_records(self, session: object, resolved_list: List "score": r.score, "reference_raw": r.reference_raw, "external_identifier": _ensure_list(getattr(r, "external_identifier", None)) or [], + "scix_id": getattr(r, "scix_id", None), + "publication_year": getattr(r, "publication_year", None), + "refereed_status": getattr(r, "refereed_status", 0), }) session.bulk_update_mappings(ResolvedReference, mappings) @@ -474,7 +477,9 @@ def populate_resolved_reference_records_pre_resolved(self, references: List, his scix_id = '0000', score=-1, reference_raw=ref.get('refraw', None), - external_identifier=_ensure_list(ref.get('external_identifier', None)) or []) + external_identifier=_ensure_list(ref.get('external_identifier', None)) or [], + publication_year=ref.get('publication_year', None), + refereed_status=int(ref.get('refereed_status', 0) or 0)) resolved_records.append(resolved_record) # add the id and remove xml_reference that is now in database ref['id'] = 'H%dI%d' % (history_id, item_num) @@ -578,7 +583,9 @@ def populate_tables_post_resolved(self, resolved_reference: List, source_bibcode scix_id=ref.get('scix_id',None), score=ref.get('score', None), reference_raw=ref.get('refstring', None), - external_identifier=_ensure_list(ref.get('external_identifier', None)) or []) + external_identifier=_ensure_list(ref.get('external_identifier', None)) or [], + publication_year=ref.get('publication_year', None), + refereed_status=int(ref.get('refereed_status', 0) or 0)) resolved_records.append(resolved_record) if resolved_classic: compare_record = CompareClassic(history_id=history_id, diff --git a/adsrefpipe/models.py b/adsrefpipe/models.py index 0c6ecd9..9c05612 100755 --- a/adsrefpipe/models.py +++ b/adsrefpipe/models.py @@ -215,8 +215,11 @@ class ResolvedReference(Base): reference_raw = Column(String) external_identifier = Column(ARRAY(String)) scix_id = Column(String) + publication_year = Column(Integer) + refereed_status = Column(Integer) - def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str, external_identifier: list = None, scix_id: str = None): + def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str, + external_identifier: list = None, scix_id: str = None, publication_year: int = None, refereed_status: int = 0): """ initializes a resolved reference object @@ -228,6 +231,8 @@ def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: :param score: confidence score of the resolved reference :param reference_raw: raw reference string :param external_identifier: list of external identifiers associated with the reference, e.g. ["doi:...", "arxiv:...", "ascl:..."] + :param publication_year: publication year + :param refereed_status: refereed status flag (0 or 1) """ self.history_id = history_id self.item_num = item_num @@ -237,6 +242,8 @@ def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: self.reference_raw = reference_raw self.external_identifier = external_identifier or [] self.scix_id = scix_id + self.publication_year = publication_year + self.refereed_status = refereed_status def toJSON(self) -> dict: """ @@ -252,7 +259,9 @@ def toJSON(self) -> dict: 'item_num': self.item_num, **({'reference_raw': self.reference_raw} if self.reference_raw else {}), 'external_identifier': self.external_identifier, - **({'scix_id': self.scix_id} if self.scix_id else {}) + **({'scix_id': self.scix_id} if self.scix_id else {}), + **({'publication_year': self.publication_year} if self.publication_year is not None else {}), + **({'refereed_status': self.refereed_status} if self.refereed_status is not None else {}), } @@ -299,4 +308,3 @@ def toJSON(self) -> dict: 'score': self.score, 'state': self.state, } - diff --git a/adsrefpipe/tests/unittests/test_app.py b/adsrefpipe/tests/unittests/test_app.py index de57411..5cc678e 100644 --- a/adsrefpipe/tests/unittests/test_app.py +++ b/adsrefpipe/tests/unittests/test_app.py @@ -53,6 +53,28 @@ def _get_scix_id(rec): return getattr(rec, "scix_id", None) +def _get_publication_year(rec): + """ + Works whether rec is a dict (bulk mappings) or an ORM object. + """ + if rec is None: + return None + if isinstance(rec, dict): + return rec.get("publication_year") + return getattr(rec, "publication_year", None) + + +def _get_refereed_status(rec): + """ + Works whether rec is a dict (bulk mappings) or an ORM object. + """ + if rec is None: + return None + if isinstance(rec, dict): + return rec.get("refereed_status") + return getattr(rec, "refereed_status", None) + + def _make_session_scope_cm(session): """ Return a context manager mock that behaves like app.session_scope() @@ -717,6 +739,8 @@ def test_populate_tables_post_resolved_with_classic(self): 'score': 1.0, 'external_identifier': ['doi:10.1234/abc', 'arxiv:2301.00001'], 'scix_id': 'scix:ABCD-1234-ref1', + 'publication_year': 2023, + 'refereed_status': 1, }, { 'id': 'H1I2', @@ -725,6 +749,8 @@ def test_populate_tables_post_resolved_with_classic(self): 'score': 0.8, 'external_identifier': ['ascl:2301.001', 'doi:10.9999/xyz'], 'scix_id': 'scix:ABCD-1234-ref2', + 'publication_year': 2021, + 'refereed_status': 0, } ] @@ -756,6 +782,10 @@ def test_populate_tables_post_resolved_with_classic(self): self.assertEqual(_get_scix_id(resolved_records[0]), 'scix:ABCD-1234-ref1') self.assertEqual(_get_scix_id(resolved_records[1]), 'scix:ABCD-1234-ref2') + self.assertEqual(_get_publication_year(resolved_records[0]), 2023) + self.assertEqual(_get_publication_year(resolved_records[1]), 2021) + self.assertEqual(_get_refereed_status(resolved_records[0]), 1) + self.assertEqual(_get_refereed_status(resolved_records[1]), 0) @patch("adsrefpipe.app.ProcessedHistory") @patch("adsrefpipe.app.ResolvedReference") @@ -1058,6 +1088,8 @@ def test_resolved_reference_toJSON_includes_scix_id(self): reference_raw="Some ref raw", external_identifier=["doi:10.1234/xyz"], scix_id="scix:ABCD-1234-0004", + publication_year=2020, + refereed_status=1, ) got = rr.toJSON() self.assertEqual(got["history_id"], 123) @@ -1065,6 +1097,8 @@ def test_resolved_reference_toJSON_includes_scix_id(self): self.assertEqual(got["bibcode"], "2020A&A...000A...1X") self.assertEqual(got["external_identifier"], ["doi:10.1234/xyz"]) self.assertEqual(got["scix_id"], "scix:ABCD-1234-0004") + self.assertEqual(got["publication_year"], 2020) + self.assertEqual(got["refereed_status"], 1) def test_resolved_reference_toJSON_omits_scix_id_when_none(self): """Test ResolvedReference.toJSON omits scix_id when not set""" @@ -1077,9 +1111,13 @@ def test_resolved_reference_toJSON_omits_scix_id_when_none(self): reference_raw="Some ref raw", external_identifier=["doi:10.1234/xyz"], scix_id=None, + publication_year=None, + refereed_status=0, ) got = rr.toJSON() self.assertTrue("scix_id" not in got) + self.assertTrue("publication_year" not in got) + self.assertEqual(got["refereed_status"], 0) class TestDatabaseNoStubdata(unittest.TestCase): @@ -1126,6 +1164,31 @@ def test_app(self): assert self.app._config.get('SQLALCHEMY_URL') == 'postgresql://mock/mock' assert self.app.conf.get('SQLALCHEMY_URL') == 'postgresql://mock/mock' + def test_update_resolved_reference_records_includes_new_columns(self): + """Verify bulk update payload includes publication_year and refereed_status.""" + rr = ResolvedReference( + history_id=1, + item_num=2, + reference_str="Some reference", + bibcode="2023A&A...657A...1X", + score=1.0, + reference_raw="Some reference", + external_identifier=["doi:10.1234/example"], + scix_id="scix:ABCD-1234-9999", + publication_year=2023, + refereed_status=1, + ) + + result = self.app.update_resolved_reference_records(self.mock_session, [rr]) + self.assertTrue(result) + + self.mock_session.bulk_update_mappings.assert_called_once() + called_model, called_mappings = self.mock_session.bulk_update_mappings.call_args[0] + self.assertIs(called_model, ResolvedReference) + self.assertEqual(len(called_mappings), 1) + self.assertEqual(called_mappings[0]["publication_year"], 2023) + self.assertEqual(called_mappings[0]["refereed_status"], 1) + def test_query_reference_tbl_when_empty(self): """ verify reference_source table being empty """ self.app.diagnostic_query = MagicMock(return_value=[]) @@ -1164,6 +1227,8 @@ def test_populate_tables(self): "id": "H1I1", "external_identifier": ["arxiv:1009.5514", "doi:10.1234/abc"], "scix_id": "scix:ABCD-1234-0005", + "publication_year": 2011, + "refereed_status": 1, }, { "score": "1.0", @@ -1173,6 +1238,8 @@ def test_populate_tables(self): "id": "H1I2", "external_identifier": ["arxiv:1709.02923", "ascl:2301.001"], "scix_id": "scix:ABCD-1234-0006", + "publication_year": 2017, + "refereed_status": 0, } ] @@ -1221,6 +1288,10 @@ def test_populate_tables(self): self.assertEqual(got[1]["external_identifier"], ["arxiv:1709.02923", "ascl:2301.001"]) self.assertEqual(got[0]["scix_id"], "scix:ABCD-1234-0005") self.assertEqual(got[1]["scix_id"], "scix:ABCD-1234-0006") + self.assertEqual(got[0]["publication_year"], 2011) + self.assertEqual(got[1]["publication_year"], 2017) + self.assertEqual(got[0]["refereed_status"], 1) + self.assertEqual(got[1]["refereed_status"], 0) def test_get_parser_error(self): """ test get_parser when it errors for unrecognized source filename """ @@ -1242,4 +1313,3 @@ def _fake_get_parser(path): if __name__ == '__main__': unittest.main() - From ffcc13613f890518d5d88cc4d09288b032d8ee36 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Wed, 11 Mar 2026 12:56:26 -0700 Subject: [PATCH 2/3] alembic upgrade --- alembic/versions/835999dfb9e3_add_scix_id.py | 39 ++++++++++++++++ ...dd_publication_year_and_refereed_status.py | 45 +++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 alembic/versions/835999dfb9e3_add_scix_id.py create mode 100644 alembic/versions/9a4b1e8b6c7d_add_publication_year_and_refereed_status.py diff --git a/alembic/versions/835999dfb9e3_add_scix_id.py b/alembic/versions/835999dfb9e3_add_scix_id.py new file mode 100644 index 0000000..962b133 --- /dev/null +++ b/alembic/versions/835999dfb9e3_add_scix_id.py @@ -0,0 +1,39 @@ +"""add scix_id + +Revision ID: 835999dfb9e3 +Revises: 08ca70bd6f5f +Create Date: 2026-02-11 12:45:45.441650 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '835999dfb9e3' +down_revision = '08ca70bd6f5f' +branch_labels = None +depends_on = None + + +def upgrade(): + bind = op.get_bind() + inspector = sa.inspect(bind) + if not inspector.has_table("resolved_reference"): + raise RuntimeError( + "Migration 835999dfb9e3 requires table `resolved_reference`, " + "but it does not exist. Database schema and alembic_version are out of sync." + ) + columns = {c["name"] for c in inspector.get_columns("resolved_reference")} + if "scix_id" not in columns: + op.add_column("resolved_reference", sa.Column("scix_id", sa.String(), nullable=True)) + + +def downgrade(): + bind = op.get_bind() + inspector = sa.inspect(bind) + if not inspector.has_table("resolved_reference"): + return + columns = {c["name"] for c in inspector.get_columns("resolved_reference")} + if "scix_id" in columns: + op.drop_column("resolved_reference", "scix_id") diff --git a/alembic/versions/9a4b1e8b6c7d_add_publication_year_and_refereed_status.py b/alembic/versions/9a4b1e8b6c7d_add_publication_year_and_refereed_status.py new file mode 100644 index 0000000..1279978 --- /dev/null +++ b/alembic/versions/9a4b1e8b6c7d_add_publication_year_and_refereed_status.py @@ -0,0 +1,45 @@ +"""add publication_year and refereed_status + +Revision ID: 9a4b1e8b6c7d +Revises: 835999dfb9e3 +Create Date: 2026-03-11 00:00:00.000000 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "9a4b1e8b6c7d" +down_revision = "835999dfb9e3" +branch_labels = None +depends_on = None + + +def upgrade(): + bind = op.get_bind() + inspector = sa.inspect(bind) + if not inspector.has_table("resolved_reference"): + raise RuntimeError( + "Migration 9a4b1e8b6c7d requires table `resolved_reference`, " + "but it does not exist. Database schema and alembic_version are out of sync." + ) + + columns = {c["name"] for c in inspector.get_columns("resolved_reference")} + if "publication_year" not in columns: + op.add_column("resolved_reference", sa.Column("publication_year", sa.Integer(), nullable=True)) + if "refereed_status" not in columns: + op.add_column("resolved_reference", sa.Column("refereed_status", sa.Integer(), nullable=True)) + + +def downgrade(): + bind = op.get_bind() + inspector = sa.inspect(bind) + if not inspector.has_table("resolved_reference"): + return + + columns = {c["name"] for c in inspector.get_columns("resolved_reference")} + if "refereed_status" in columns: + op.drop_column("resolved_reference", "refereed_status") + if "publication_year" in columns: + op.drop_column("resolved_reference", "publication_year") From df54299d9c004c12a54ecfeac9f83a3d029e43ac Mon Sep 17 00:00:00 2001 From: thomasallen Date: Wed, 11 Mar 2026 13:03:56 -0700 Subject: [PATCH 3/3] refereed_status default to None --- adsrefpipe/app.py | 6 +++--- adsrefpipe/models.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/adsrefpipe/app.py b/adsrefpipe/app.py index de5317f..5ca8c33 100755 --- a/adsrefpipe/app.py +++ b/adsrefpipe/app.py @@ -435,7 +435,7 @@ def update_resolved_reference_records(self, session: object, resolved_list: List "external_identifier": _ensure_list(getattr(r, "external_identifier", None)) or [], "scix_id": getattr(r, "scix_id", None), "publication_year": getattr(r, "publication_year", None), - "refereed_status": getattr(r, "refereed_status", 0), + "refereed_status": getattr(r, "refereed_status", None), }) session.bulk_update_mappings(ResolvedReference, mappings) @@ -479,7 +479,7 @@ def populate_resolved_reference_records_pre_resolved(self, references: List, his reference_raw=ref.get('refraw', None), external_identifier=_ensure_list(ref.get('external_identifier', None)) or [], publication_year=ref.get('publication_year', None), - refereed_status=int(ref.get('refereed_status', 0) or 0)) + refereed_status=ref.get('refereed_status', None)) resolved_records.append(resolved_record) # add the id and remove xml_reference that is now in database ref['id'] = 'H%dI%d' % (history_id, item_num) @@ -585,7 +585,7 @@ def populate_tables_post_resolved(self, resolved_reference: List, source_bibcode reference_raw=ref.get('refstring', None), external_identifier=_ensure_list(ref.get('external_identifier', None)) or [], publication_year=ref.get('publication_year', None), - refereed_status=int(ref.get('refereed_status', 0) or 0)) + refereed_status=ref.get('refereed_status', None)) resolved_records.append(resolved_record) if resolved_classic: compare_record = CompareClassic(history_id=history_id, diff --git a/adsrefpipe/models.py b/adsrefpipe/models.py index 9c05612..55ddb2e 100755 --- a/adsrefpipe/models.py +++ b/adsrefpipe/models.py @@ -219,7 +219,7 @@ class ResolvedReference(Base): refereed_status = Column(Integer) def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str, - external_identifier: list = None, scix_id: str = None, publication_year: int = None, refereed_status: int = 0): + external_identifier: list = None, scix_id: str = None, publication_year: int = None, refereed_status: int = None): """ initializes a resolved reference object