coreofscience · odarbelaeze · Feb 2, 2025 · Jan 30, 2025 · Feb 2, 2025 · Feb 2, 2025
diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
@@ -20,12 +20,11 @@ jobs:
         uses: astral-sh/setup-uv@v2
         with:
           enable-cache: true
-          version: "0.5.7"
+          version: "0.5.26"
 
       - name: Install dependencies
         run: |
-          uv venv .venv
-          uv pip install ".[dev]"
+          uv sync
 
       - name: Make sure we didn't forget anything in pre-commit
         run: |

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -30,11 +30,10 @@ jobs:
         uses: astral-sh/setup-uv@v2
         with:
           enable-cache: true
-          version: "0.5.7"
+          version: "0.5.26"
 
       - name: Run tests for ${{ matrix.python-version }}
         run: |
-          uv venv .venv
-          uv pip install ".[dev]"
+          uv sync
           uv run pre-commit run --all
           uv run pytest
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -11,10 +11,11 @@ repos:
     hooks:
       # Run the linter.
       - id: ruff
-        args: [ --fix ]
+        args: [--fix]
       # Run the formatter.
       - id: ruff-format
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: 'v1.13.0'
+    rev: "v1.13.0"
     hooks:
-    -   id: mypy
+      - id: mypy
+        additional_dependencies: ["types-requests>=2.32.0.20241016"]
diff --git a/pyproject.toml b/pyproject.toml
@@ -2,39 +2,108 @@
 name = "bibx"
 description = "Python bibliometric tools."
 authors = [
-    {name = "Core of Science Team", email = "technology@coreofscience.org"},
+    { name = "Core of Science Team", email = "technology@coreofscience.org" },
 ]
 license = "MIT"
 readme = "README.md"
-keywords = [
-    "bibliometrics",
-    "science",
-    "text mining",
-]
+keywords = ["bibliometrics", "science", "text mining"]
 dynamic = ["version"]
 dependencies = [
     "bibtexparser~=1.4.0",
     "networkx~=3.0",
+    "pydantic~=2.10.6",
+    "requests~=2.32.3",
     "typer[all]~=0.9.0",
     "xlsxwriter~=3.2.0",
 ]
 requires-python = ">=3.9"
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Natural Language :: English",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3 :: Only",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Scientific/Engineering :: Information Analysis",
+    "Topic :: Text Processing",
+    "Typing :: Typed",
+]
 
-
-[project.optional-dependencies]
+[dependency-groups]
 dev = [
-    "pytest~=7.2.0",
+    "pytest~=8.3.4",
     "pre-commit~=2.20.0",
-    "ruff~=0.3.3",
+    "ruff~=0.8.2",
     "mypy~=1.9.0",
+    "types-requests>=2.32.0.20241016",
+    "ipython>=8.18.1",
 ]
 
 [project.scripts]
 bibx = "bibx.cli:app"
 
 [tool.ruff.lint]
-select = ["I", "E", "F", "UP", "W"]
-ignore = ["E501"]
+select = [
+    "F",   # Pyflakes
+    "W",
+    "E",   # pycodestyle
+    "C90", # mccabe
+    "I",   # isort
+    "D",   # pydocstyle
+    "UP",  # pyupgrade
+    "N",   # pep8-naming
+    "YTT", # flake8-2020
+    "ANN", # flake8-annotations
+    "S",   # flake8-bandit
+    "BLE", # flake8-blind-except
+    "FBT", # flake8-boolean-trap
+    "B",   # flake8-bugbear
+    "A",   # flake8-builtins
+    "C4",  # flake8-comprehensions
+    "T10", # flake8-debugger
+    "EM",  # flake8-errmsg
+    "ISC", # flake8-implicit-str-concat
+    "ICN", # flake8-import-conventions
+    "G",   # flake8-logging-format
+    "T20", # flake8-print
+    "Q",   # flake8-quotes
+    "RET", # flake8-return
+    "SIM", # flake8-simplify
+    "TID", # flake8-tidy-imports
+    "TID", # flake8-tidy-imports
+    "DTZ", # flake8-datetimez
+    "ARG", # flake8-unused-arguments
+    "PGH", # pygrep-hooks
+    "PLC",
+    "PLE",
+    "PLR",
+    "PLW", # Pylint
+    "RUF", # Ruff-specific rules
+]
+ignore = [
+    "A002",
+    "B008",
+    "D100",
+    "D106",
+    "D107",
+    "D203",
+    "D213",
+    "D406",
+    "D407",
+    "DTZ003",
+    "FBT001",
+    "FBT003",
+    "ISC001",
+    "N815",
+    "PGH003",
+    "S101",
+    "T201",
+]
 
 [tool.mypy]
 mypy_path = "./stubs/"

diff --git a/src/bibx/__init__.py b/src/bibx/__init__.py
@@ -1,8 +1,11 @@
+"""BibX is a library to work with bibliographic data."""
+
 import logging
 from typing import TextIO
 
 from bibx._entities.article import Article
 from bibx._entities.collection import Collection
+from bibx._entities.collection_builders.openalex import OpenAlexCollectionBuilder
 from bibx._entities.collection_builders.scopus_bib import ScopusBibCollectionBuilder
 from bibx._entities.collection_builders.scopus_ris import ScopusRisCollectionBuilder
 from bibx._entities.collection_builders.wos import WosCollectionBuilder
@@ -15,18 +18,23 @@
     "Article",
     "Collection",
     "Sap",
+    "query_openalex",
+    "read_any",
     "read_scopus_bib",
     "read_scopus_ris",
     "read_wos",
-    "read_any",
 ]
 
-__version__ = "0.2.0"
+__version__ = "0.3.0"
+
+
+def query_openalex(query: str, limit: int = 600) -> Collection:
+    """Query OpenAlex and return a collection."""
+    return OpenAlexCollectionBuilder(query, limit).build()
 
 
 def read_scopus_bib(*files: TextIO) -> Collection:
-    """
-    Takes any number of bibtex files from scopus and generates a collection.
+    """Take any number of bibtex files from Scopus and generate a collection.
 
     :param files: Scopus bib files open.
     :return: the collection
@@ -35,8 +43,7 @@ def read_scopus_bib(*files: TextIO) -> Collection:
 
 
 def read_scopus_ris(*files: TextIO) -> Collection:
-    """
-    Takes any number of ris files from scopus and generates a collection.
+    """Take any number of ris files from Scopus and generate a collection.
 
     :param files: Scopus bib files open.
     :return: the collection
@@ -45,8 +52,7 @@ def read_scopus_ris(*files: TextIO) -> Collection:
 
 
 def read_wos(*files: TextIO) -> Collection:
-    """
-    Takes any number of wos text files and returns a collection.
+    """Take any number of WoS text files and return a collection.
 
     :param files: WoS files open.
     :return: the collection
@@ -55,16 +61,15 @@ def read_wos(*files: TextIO) -> Collection:
 
 
 def read_any(file: TextIO) -> Collection:
-    """
-    Tries to read a file with the supported formats.
-    """
+    """Try to read a file with the supported formats."""
     for handler in (read_wos, read_scopus_ris, read_scopus_bib):
         try:
             return handler(file)
         except BibXError as e:
-            logger.debug(f"Error: {e}")
+            logger.debug("Error: %s", e)
         except ValueError:
             logger.debug(
-                f"Error: the {handler.__name__} function does not support this file"
+                "Error: the %s function does not support this file", handler.__name__
             )
-    raise ValueError("Unsupported file type")
+    message = "Unsupported file type"
+    raise ValueError(message)
diff --git a/src/bibx/_entities/article.py b/src/bibx/_entities/article.py
@@ -1,10 +1,17 @@
 from collections.abc import Mapping
 from dataclasses import dataclass, field
-from typing import Optional
+from typing import Optional, TypeVar, Union
+
+T = TypeVar("T")
+
+
+def _keep(a: T, b: T) -> T:
+    return a if a is not None else b
 
 
 @dataclass
 class Article:
+    ids: set[str]
     authors: list[str] = field(default_factory=list)
     year: Optional[int] = None
     title: Optional[str] = None
@@ -14,23 +21,40 @@ class Article:
     page: Optional[str] = None
     doi: Optional[str] = None
     _label: Optional[str] = None
+    _permalink: Optional[str] = None
     times_cited: Optional[int] = None
     references: list["Article"] = field(default_factory=list)
     keywords: list[str] = field(default_factory=list)
     sources: set[str] = field(default_factory=set)
     extra: Mapping = field(default_factory=dict)
 
+    def merge(self, other: "Article") -> "Article":
+        """Merge two articles into a new one."""
+        return Article(
+            ids=self.ids.union(other.ids),
+            authors=self.authors if self.authors else other.authors,
+            year=_keep(self.year, other.year),
+            title=_keep(self.title, other.title),
+            journal=_keep(self.journal, other.journal),
+            volume=_keep(self.volume, other.volume),
+            issue=_keep(self.issue, other.issue),
+            page=_keep(self.page, other.page),
+            doi=_keep(self.doi, other.doi),
+            _label=_keep(self._label, other._label),
+            _permalink=_keep(self._permalink, other._permalink),
+            times_cited=_keep(self.times_cited, other.times_cited),
+            references=self.references or other.references,
+            keywords=self.keywords or other.keywords,
+            sources=self.sources.union(other.sources),
+            extra={**self.extra, **other.extra},
+        )
+
     @property
-    def key(self):
-        if self.authors:
-            author = self.authors[0].split(" ")[0].replace(",", "")
-        else:
-            author = "anonymous"
-        year = self.year
-        return f"{author}{year}".lower()
+    def key(self) -> str:
+        return next(iter(sorted(self.ids)))
 
     @property
-    def label(self):
+    def label(self) -> str:
         if self._label is not None:
             return self._label
         pieces = {
@@ -42,3 +66,45 @@ def label(self):
             "DI": f"DOI {self.doi}" if self.doi else None,
         }
         return ", ".join(value for value in pieces.values() if value)
+
+    @property
+    def permalink(self) -> Optional[str]:
+        if self._permalink is not None:
+            return self._permalink
+        if self.doi is not None:
+            return f"https://doi.org/{self.doi}"
+        return None
+
+    @property
+    def simple_id(self) -> Optional[str]:
+        if self.authors and self.year is not None:
+            author = self.authors[0].split(" ")[0].replace(",", "")
+            return f"{author}{self.year}".lower()
+        return None
+
+    def __repr__(self) -> str:
+        return f"Article(ids={self.ids!r}, authors={self.authors!r})"
+
+    def add_simple_id(self) -> None:
+        if self.simple_id is None:
+            return
+        self.ids.add(f"simple:{self.simple_id}")
+
+    def info(
+        self,
+    ) -> dict[str, Union[str, int, list[str], None]]:
+        return {
+            "permalink": self.permalink,
+            "label": self.label,
+            "authors": self.authors,
+            "year": self.year,
+            "title": self.title,
+            "journal": self.journal,
+            "volume": self.volume,
+            "issue": self.issue,
+            "page": self.page,
+            "doi": self.doi,
+            "times_cited": self.times_cited,
+            "keywords": self.keywords,
+            "sources": list(self.sources),
+        }