From 3607ae32d1c253fcb7c4a2316eeec013b8d7ccf0 Mon Sep 17 00:00:00 2001 From: Oscar Arbelaez Date: Sun, 2 Feb 2025 12:33:50 +0000 Subject: [PATCH 1/2] Label is actually required --- src/bibx/_entities/article.py | 27 +++++++---- .../_entities/collection_builders/openalex.py | 3 +- .../collection_builders/scopus_bib.py | 47 +++++++++--------- .../collection_builders/scopus_ris.py | 48 ++++++++++--------- src/bibx/_entities/collection_builders/wos.py | 43 +++++++++-------- tests/entities/test_collection.py | 22 ++++----- 6 files changed, 105 insertions(+), 85 deletions(-) diff --git a/src/bibx/_entities/article.py b/src/bibx/_entities/article.py index 7da9782..e5372c1 100644 --- a/src/bibx/_entities/article.py +++ b/src/bibx/_entities/article.py @@ -9,8 +9,13 @@ def _keep(a: T, b: T) -> T: return a if a is not None else b +def _keep_longest(a: str, b: str) -> str: + return a if len(a) > len(b) else b + + @dataclass class Article: + label: str ids: set[str] authors: list[str] = field(default_factory=list) year: Optional[int] = None @@ -20,7 +25,6 @@ class Article: issue: Optional[str] = None page: Optional[str] = None doi: Optional[str] = None - _label: Optional[str] = None _permalink: Optional[str] = None times_cited: Optional[int] = None references: list["Article"] = field(default_factory=list) @@ -31,6 +35,7 @@ class Article: def merge(self, other: "Article") -> "Article": """Merge two articles into a new one.""" return Article( + label=_keep_longest(self.label, other.label), ids=self.ids.union(other.ids), authors=self.authors if self.authors else other.authors, year=_keep(self.year, other.year), @@ -40,7 +45,6 @@ def merge(self, other: "Article") -> "Article": issue=_keep(self.issue, other.issue), page=_keep(self.page, other.page), doi=_keep(self.doi, other.doi), - _label=_keep(self._label, other._label), _permalink=_keep(self._permalink, other._permalink), times_cited=_keep(self.times_cited, other.times_cited), references=self.references or other.references, @@ -54,17 +58,17 @@ def key(self) -> str: return next(iter(sorted(self.ids))) @property - def label(self) -> str: - if self._label is not None: - return self._label + def simple_label(self) -> Optional[str]: pieces = { - "AU": self.authors[0].replace(",", "") if self.authors else "anonymous", + "AU": self.authors[0].replace(",", "") if self.authors else None, "PY": str(self.year) if self.year else None, "J9": str(self.journal) if self.journal else None, "VL": f"V{self.volume}" if self.volume else None, "BP": f"P{self.page}" if self.page else None, "DI": f"DOI {self.doi}" if self.doi else None, } + if not any(pieces.values()): + return None return ", ".join(value for value in pieces.values() if value) @property @@ -85,10 +89,17 @@ def simple_id(self) -> Optional[str]: def __repr__(self) -> str: return f"Article(ids={self.ids!r}, authors={self.authors!r})" - def add_simple_id(self) -> None: + def add_simple_id(self) -> "Article": if self.simple_id is None: - return + return self self.ids.add(f"simple:{self.simple_id}") + return self + + def set_simple_label(self) -> "Article": + if self.simple_label is None: + return self + self.label = self.simple_label + return self def info( self, diff --git a/src/bibx/_entities/collection_builders/openalex.py b/src/bibx/_entities/collection_builders/openalex.py index 8d9f748..9be957f 100644 --- a/src/bibx/_entities/collection_builders/openalex.py +++ b/src/bibx/_entities/collection_builders/openalex.py @@ -75,6 +75,7 @@ def _extract_doi(url: str) -> str: @classmethod def _work_to_article(cls, work: Work) -> Article: article = Article( + label=work.id, ids={ f"{source}:{id_}" if source != "doi" @@ -93,7 +94,6 @@ def _work_to_article(cls, work: Work) -> Article: issue=work.biblio.issue, page=work.biblio.first_page, doi=cls._extract_doi(work.doi) if work.doi else None, - _label=work.id, _permalink=work.primary_location and work.primary_location.landing_page_url, times_cited=work.cited_by_count, references=[cls._reference_to_article(r) for r in work.referenced_works], @@ -108,6 +108,7 @@ def _work_to_article(cls, work: Work) -> Article: @staticmethod def _reference_to_article(reference: str) -> Article: return Article( + label=reference, ids={f"openalex:{reference}"}, _permalink=reference, sources={"openalex"}, diff --git a/src/bibx/_entities/collection_builders/scopus_bib.py b/src/bibx/_entities/collection_builders/scopus_bib.py index eec8610..3714348 100644 --- a/src/bibx/_entities/collection_builders/scopus_bib.py +++ b/src/bibx/_entities/collection_builders/scopus_bib.py @@ -41,24 +41,29 @@ def _article_from_entry(self, entry: dict) -> Article: doi = entry.get("doi") if doi is not None: ids.add(f"doi:{doi}") - article = Article( - ids=ids, - authors=entry["author"].split(" and "), - year=int(entry["year"]), - title=entry.get("title"), - journal=entry.get("journal"), - volume=entry.get("volume"), - issue=entry.get("issue"), - page=entry.get("art_number"), - doi=entry.get("doi"), - references=list(self._articles_from_references(entry.get("references"))), - keywords=entry.get("keywords", "").split("; "), - extra=entry, - sources={json.dumps(entry)}, - times_cited=times_cited, + return ( + Article( + label=doi or entry.get("title", "replaceme"), + ids=ids, + authors=entry["author"].split(" and "), + year=int(entry["year"]), + title=entry.get("title"), + journal=entry.get("journal"), + volume=entry.get("volume"), + issue=entry.get("issue"), + page=entry.get("art_number"), + doi=entry.get("doi"), + references=list( + self._articles_from_references(entry.get("references")) + ), + keywords=entry.get("keywords", "").split("; "), + extra=entry, + sources={json.dumps(entry)}, + times_cited=times_cited, + ) + .add_simple_id() + .set_simple_label() ) - article.add_simple_id() - return article def _articles_from_references(self, references: Optional[str]) -> Iterable[Article]: if references is None: @@ -76,13 +81,11 @@ def _article_from_reference(reference: str) -> Article: author = reference.split(",", maxsplit=2)[0].strip() match = re.search(r"(10.\d{4,9}/[-._;()/:A-Z0-9]+)", reference) doi = match.groups()[0] if match else None - article = Article( + return Article( + label=reference, ids=set() if doi is None else {f"doi:{doi}"}, authors=[author], year=year, - _label=reference, doi=doi, sources={reference}, - ) - article.add_simple_id() - return article + ).add_simple_id() diff --git a/src/bibx/_entities/collection_builders/scopus_ris.py b/src/bibx/_entities/collection_builders/scopus_ris.py index 788791a..b1ac019 100644 --- a/src/bibx/_entities/collection_builders/scopus_ris.py +++ b/src/bibx/_entities/collection_builders/scopus_ris.py @@ -102,7 +102,8 @@ def _article_form_reference(cls, scopusref: str) -> Article: doi, _ = cls._find_doi(scopusref) if not authors or not year: raise MissingCriticalInformationError() - article = Article( + return Article( + label=scopusref, ids=set() if doi is None else {f"doi:{doi}"}, authors=[f"{first_name} {last_name.replace(' ', '').replace('.', '')}"], year=int(year), @@ -114,9 +115,7 @@ def _article_form_reference(cls, scopusref: str) -> Article: volume=volume_info.get("volume"), page=volume_info.get("page"), doi=doi, - ) - article.add_simple_id() - return article + ).add_simple_id() @classmethod def _parse_references(cls, refs: list[str]) -> list[Article]: @@ -134,7 +133,6 @@ def _parse_references(cls, refs: list[str]) -> list[Article]: def _ris_to_dict(record: str) -> dict[str, list[str]]: parsed = defaultdict(list) current = None - for line in record.split("\n"): match = _RIS_PATTERN.match(line) if not match: @@ -163,25 +161,29 @@ def _article_from_record(cls, record: str) -> Article: authors = data.get("AU", []) if not authors or not year: raise MissingCriticalInformationError() - doi = data.get("DO") - article = Article( - ids=set() if doi is None else {f"doi:{doi}"}, - title=_joined(data.get("TI")), - authors=authors, - year=year, - journal=_joined(data.get("J2")), - volume=_joined(data.get("VL")), - issue=_joined(data.get("IS")), - page=_joined(data.get("SP")), - doi=_joined(data.get("DO")), - keywords=data.get("KW", []), - references=cls._parse_references(data.get("N1:References", [])), - sources={"scopus"}, - extra=data, - times_cited=times_cited, + doi_list = data.get("DO") + doi = doi_list[0] if doi_list else None + return ( + Article( + label=doi or "replaceme", + ids=set() if doi is None else {f"doi:{doi}"}, + title=_joined(data.get("TI")), + authors=authors, + year=year, + journal=_joined(data.get("J2")), + volume=_joined(data.get("VL")), + issue=_joined(data.get("IS")), + page=_joined(data.get("SP")), + doi=doi, + keywords=data.get("KW", []), + references=cls._parse_references(data.get("N1:References", [])), + sources={"scopus"}, + extra=data, + times_cited=times_cited, + ) + .add_simple_id() + .set_simple_label() ) - article.add_simple_id() - return article @classmethod def _parse_file(cls, file: TextIO) -> Iterable[Article]: diff --git a/src/bibx/_entities/collection_builders/wos.py b/src/bibx/_entities/collection_builders/wos.py index 872bdd5..01d0c38 100644 --- a/src/bibx/_entities/collection_builders/wos.py +++ b/src/bibx/_entities/collection_builders/wos.py @@ -297,26 +297,29 @@ def _parse_article_from_str(cls, article_as_str: str) -> Article: article_data[field].append(parsed["value"]) processed = cls._parse_all(dict(article_data)) doi = processed.get("DOI") - article = Article( - ids=set() if doi is None else {f"doi:{doi}"}, - authors=processed.get("authors", []), - year=processed.get("year"), - title=processed.get("title"), - journal=processed.get("source_abbreviation"), - volume=processed.get("volume"), - issue=processed.get("issue"), - page=processed.get("beginning_page"), - doi=doi, - times_cited=processed.get("times_cited"), - references=list( - cls._get_articles_from_references(processed.get("references")) - ), - keywords=processed.get("keywords", []), - extra=processed, - sources={article_as_str}, + return ( + Article( + label=doi or "replaceme", + ids=set() if doi is None else {f"doi:{doi}"}, + authors=processed.get("authors", []), + year=processed.get("year"), + title=processed.get("title"), + journal=processed.get("source_abbreviation"), + volume=processed.get("volume"), + issue=processed.get("issue"), + page=processed.get("beginning_page"), + doi=doi, + times_cited=processed.get("times_cited"), + references=list( + cls._get_articles_from_references(processed.get("references")) + ), + keywords=processed.get("keywords", []), + extra=processed, + sources={article_as_str}, + ) + .add_simple_id() + .set_simple_label() ) - article.add_simple_id() - return article @classmethod def _parse_reference_from_str(cls, reference: str) -> Article: @@ -327,8 +330,8 @@ def _parse_reference_from_str(cls, reference: str) -> Article: processed = cls._parse_all(data) doi = processed.get("DOI") article = Article( + label=reference, ids=set() if doi is None else {f"doi:{doi}"}, - _label=reference, title=processed.get("title"), authors=processed.get("authors", []), # FIXME: Year is required here diff --git a/tests/entities/test_collection.py b/tests/entities/test_collection.py index c831821..d65a083 100644 --- a/tests/entities/test_collection.py +++ b/tests/entities/test_collection.py @@ -3,6 +3,7 @@ articles = [ Article( + label="doi:1", ids={"doi:1"}, authors=["A"], year=2010, @@ -16,9 +17,9 @@ keywords=[], sources=set(), extra={}, - _label=None, ), Article( + label="doi:12", ids={"doi:12"}, authors=["B"], year=2000, @@ -32,9 +33,9 @@ keywords=[], sources=set(), extra={}, - _label=None, ), Article( + label="doi:13", ids={"doi:13"}, authors=["C"], year=2021, @@ -48,9 +49,9 @@ keywords=[], sources=set(), extra={}, - _label=None, ), Article( + label="doi:14", ids={"doi:14"}, authors=["D"], year=2022, @@ -64,9 +65,9 @@ keywords=[], sources=set(), extra={}, - _label=None, ), Article( + label="doi:15", ids={"doi:15"}, authors=["E"], year=2005, @@ -80,9 +81,9 @@ keywords=[], sources=set(), extra={}, - _label=None, ), Article( + label="doi:16", ids={"doi:16"}, authors=["F"], year=2005, @@ -96,9 +97,9 @@ keywords=[], sources=set(), extra={}, - _label=None, ), Article( + label="doi:17", ids={"doi:17"}, authors=["J"], year=2010, @@ -112,9 +113,9 @@ keywords=[], sources=set(), extra={}, - _label=None, ), Article( + label="doi:18", ids={"doi:18"}, authors=["H"], year=2000, @@ -128,9 +129,9 @@ keywords=[], sources=set(), extra={}, - _label=None, ), Article( + label="doi:19", ids={"doi:19"}, authors=["I"], year=2021, @@ -143,9 +144,9 @@ keywords=[], sources=set(), extra={}, - _label=None, ), Article( + label="doi:19", ids={"doi:19"}, authors=["I"], year=None, @@ -158,9 +159,9 @@ keywords=[], sources=set(), extra={}, - _label=None, ), Article( + label="doi:19", ids={"doi:19"}, authors=["I"], title="Ii", @@ -172,7 +173,6 @@ keywords=[], sources=set(), extra={}, - _label=None, ), ] From 49f6a2e889b6db10a98b425fa751352d807843f6 Mon Sep 17 00:00:00 2001 From: Oscar Arbelaez Date: Sun, 2 Feb 2025 12:37:20 +0000 Subject: [PATCH 2/2] Update version --- src/bibx/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bibx/__init__.py b/src/bibx/__init__.py index 93422b7..96b8a4a 100644 --- a/src/bibx/__init__.py +++ b/src/bibx/__init__.py @@ -25,7 +25,7 @@ "read_wos", ] -__version__ = "0.3.0" +__version__ = "0.3.1" def query_openalex(query: str, limit: int = 600) -> Collection: