diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 8795b97..9793b9d 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -20,12 +20,11 @@ jobs: uses: astral-sh/setup-uv@v2 with: enable-cache: true - version: "0.5.7" + version: "0.5.26" - name: Install dependencies run: | - uv venv .venv - uv pip install ".[dev]" + uv sync - name: Make sure we didn't forget anything in pre-commit run: | diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 005f2a8..f8b405e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,11 +30,10 @@ jobs: uses: astral-sh/setup-uv@v2 with: enable-cache: true - version: "0.5.7" + version: "0.5.26" - name: Run tests for ${{ matrix.python-version }} run: | - uv venv .venv - uv pip install ".[dev]" + uv sync uv run pre-commit run --all uv run pytest diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c8021f1..46efde0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,10 +11,11 @@ repos: hooks: # Run the linter. - id: ruff - args: [ --fix ] + args: [--fix] # Run the formatter. - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.13.0' + rev: "v1.13.0" hooks: - - id: mypy + - id: mypy + additional_dependencies: ["types-requests>=2.32.0.20241016"] diff --git a/pyproject.toml b/pyproject.toml index 365eedb..13b49ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,39 +2,108 @@ name = "bibx" description = "Python bibliometric tools." 
authors = [ - {name = "Core of Science Team", email = "technology@coreofscience.org"}, + { name = "Core of Science Team", email = "technology@coreofscience.org" }, ] license = "MIT" readme = "README.md" -keywords = [ - "bibliometrics", - "science", - "text mining", -] +keywords = ["bibliometrics", "science", "text mining"] dynamic = ["version"] dependencies = [ "bibtexparser~=1.4.0", "networkx~=3.0", + "pydantic~=2.10.6", + "requests~=2.32.3", "typer[all]~=0.9.0", "xlsxwriter~=3.2.0", ] requires-python = ">=3.9" +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Text Processing", + "Typing :: Typed", +] - -[project.optional-dependencies] +[dependency-groups] dev = [ - "pytest~=7.2.0", + "pytest~=8.3.4", "pre-commit~=2.20.0", - "ruff~=0.3.3", + "ruff~=0.8.2", "mypy~=1.9.0", + "types-requests>=2.32.0.20241016", + "ipython>=8.18.1", ] [project.scripts] bibx = "bibx.cli:app" [tool.ruff.lint] -select = ["I", "E", "F", "UP", "W"] -ignore = ["E501"] +select = [ + "F", # Pyflakes + "W", + "E", # pycodestyle + "C90", # mccabe + "I", # isort + "D", # pydocstyle + "UP", # pyupgrade + "N", # pep8-naming + "YTT", # flake8-2020 + "ANN", # flake8-annotations + "S", # flake8-bandit + "BLE", # flake8-blind-except + "FBT", # flake8-boolean-trap + "B", # flake8-bugbear + "A", # flake8-builtins + "C4", # flake8-comprehensions + "T10", # flake8-debugger + "EM", # flake8-errmsg + "ISC", # flake8-implicit-str-concat + "ICN", # flake8-import-conventions + "G", # flake8-logging-format + "T20", # 
flake8-print + "Q", # flake8-quotes + "RET", # flake8-return + "SIM", # flake8-simplify + "TID", # flake8-tidy-imports + "TID", # flake8-tidy-imports + "DTZ", # flake8-datetimez + "ARG", # flake8-unused-arguments + "PGH", # pygrep-hooks + "PLC", + "PLE", + "PLR", + "PLW", # Pylint + "RUF", # Ruff-specific rules +] +ignore = [ + "A002", + "B008", + "D100", + "D106", + "D107", + "D203", + "D213", + "D406", + "D407", + "DTZ003", + "FBT001", + "FBT003", + "ISC001", + "N815", + "PGH003", + "S101", + "T201", +] [tool.mypy] mypy_path = "./stubs/" diff --git a/src/bibx/__init__.py b/src/bibx/__init__.py index 4f36be1..93422b7 100644 --- a/src/bibx/__init__.py +++ b/src/bibx/__init__.py @@ -1,8 +1,11 @@ +"""BibX is a library to work with bibliographic data.""" + import logging from typing import TextIO from bibx._entities.article import Article from bibx._entities.collection import Collection +from bibx._entities.collection_builders.openalex import OpenAlexCollectionBuilder from bibx._entities.collection_builders.scopus_bib import ScopusBibCollectionBuilder from bibx._entities.collection_builders.scopus_ris import ScopusRisCollectionBuilder from bibx._entities.collection_builders.wos import WosCollectionBuilder @@ -15,18 +18,23 @@ "Article", "Collection", "Sap", + "query_openalex", + "read_any", "read_scopus_bib", "read_scopus_ris", "read_wos", - "read_any", ] -__version__ = "0.2.0" +__version__ = "0.3.0" + + +def query_openalex(query: str, limit: int = 600) -> Collection: + """Query OpenAlex and return a collection.""" + return OpenAlexCollectionBuilder(query, limit).build() def read_scopus_bib(*files: TextIO) -> Collection: - """ - Takes any number of bibtex files from scopus and generates a collection. + """Take any number of bibtex files from scopus and generates a collection. :param files: Scopus bib files open. 
:return: the collection @@ -35,8 +43,7 @@ def read_scopus_bib(*files: TextIO) -> Collection: def read_scopus_ris(*files: TextIO) -> Collection: - """ - Takes any number of ris files from scopus and generates a collection. + """Take any number of ris files from scopus and generates a collection. :param files: Scopus bib files open. :return: the collection @@ -45,8 +52,7 @@ def read_scopus_ris(*files: TextIO) -> Collection: def read_wos(*files: TextIO) -> Collection: - """ - Takes any number of wos text files and returns a collection. + """Take any number of wos text files and returns a collection. :param files: WoS files open. :return: the collection @@ -55,16 +61,15 @@ def read_wos(*files: TextIO) -> Collection: def read_any(file: TextIO) -> Collection: - """ - Tries to read a file with the supported formats. - """ + """Try to read a file with the supported formats.""" for handler in (read_wos, read_scopus_ris, read_scopus_bib): try: return handler(file) except BibXError as e: - logger.debug(f"Error: {e}") + logger.debug("Error: %s", e) except ValueError: logger.debug( - f"Error: the {handler.__name__} function does not support this file" + "Error: the %s function does not support this file", handler.__name__ ) - raise ValueError("Unsupported file type") + message = "Unsupported file type" + raise ValueError(message) diff --git a/src/bibx/_entities/article.py b/src/bibx/_entities/article.py index 535b394..7da9782 100644 --- a/src/bibx/_entities/article.py +++ b/src/bibx/_entities/article.py @@ -1,10 +1,17 @@ from collections.abc import Mapping from dataclasses import dataclass, field -from typing import Optional +from typing import Optional, TypeVar, Union + +T = TypeVar("T") + + +def _keep(a: T, b: T) -> T: + return a if a is not None else b @dataclass class Article: + ids: set[str] authors: list[str] = field(default_factory=list) year: Optional[int] = None title: Optional[str] = None @@ -14,23 +21,40 @@ class Article: page: Optional[str] = None doi: Optional[str] 
= None _label: Optional[str] = None + _permalink: Optional[str] = None times_cited: Optional[int] = None references: list["Article"] = field(default_factory=list) keywords: list[str] = field(default_factory=list) sources: set[str] = field(default_factory=set) extra: Mapping = field(default_factory=dict) + def merge(self, other: "Article") -> "Article": + """Merge two articles into a new one.""" + return Article( + ids=self.ids.union(other.ids), + authors=self.authors if self.authors else other.authors, + year=_keep(self.year, other.year), + title=_keep(self.title, other.title), + journal=_keep(self.journal, other.journal), + volume=_keep(self.volume, other.volume), + issue=_keep(self.issue, other.issue), + page=_keep(self.page, other.page), + doi=_keep(self.doi, other.doi), + _label=_keep(self._label, other._label), + _permalink=_keep(self._permalink, other._permalink), + times_cited=_keep(self.times_cited, other.times_cited), + references=self.references or other.references, + keywords=self.keywords or other.keywords, + sources=self.sources.union(other.sources), + extra={**self.extra, **other.extra}, + ) + @property - def key(self): - if self.authors: - author = self.authors[0].split(" ")[0].replace(",", "") - else: - author = "anonymous" - year = self.year - return f"{author}{year}".lower() + def key(self) -> str: + return next(iter(sorted(self.ids))) @property - def label(self): + def label(self) -> str: if self._label is not None: return self._label pieces = { @@ -42,3 +66,45 @@ def label(self): "DI": f"DOI {self.doi}" if self.doi else None, } return ", ".join(value for value in pieces.values() if value) + + @property + def permalink(self) -> Optional[str]: + if self._permalink is not None: + return self._permalink + if self.doi is not None: + return f"https://doi.org/{self.doi}" + return None + + @property + def simple_id(self) -> Optional[str]: + if self.authors and self.year is not None: + author = self.authors[0].split(" ")[0].replace(",", "") + return 
f"{author}{self.year}".lower() + return None + + def __repr__(self) -> str: + return f"Article(ids={self.ids!r}, authors={self.authors!r})" + + def add_simple_id(self) -> None: + if self.simple_id is None: + return + self.ids.add(f"simple:{self.simple_id}") + + def info( + self, + ) -> dict[str, Union[str, int, list[str], None]]: + return { + "permalink": self.permalink, + "label": self.label, + "authors": self.authors, + "year": self.year, + "title": self.title, + "journal": self.journal, + "volume": self.volume, + "issue": self.issue, + "page": self.page, + "doi": self.doi, + "times_cited": self.times_cited, + "keywords": self.keywords, + "sources": list(self.sources), + } diff --git a/src/bibx/_entities/collection.py b/src/bibx/_entities/collection.py index 92784c5..d03a085 100644 --- a/src/bibx/_entities/collection.py +++ b/src/bibx/_entities/collection.py @@ -1,7 +1,11 @@ import datetime import logging +from collections import defaultdict from collections.abc import Iterable from dataclasses import dataclass +from functools import reduce + +import networkx as nx from bibx._entities.article import Article @@ -13,48 +17,106 @@ class Collection: articles: list[Article] def merge(self, other: "Collection") -> "Collection": - """ - Creates a new collection merging the articles by key. + """Create a new collection merging the articles by key. :param other: collection to merge to. :return: a new collection object. 
""" - keys = {a.key for a in self.articles} - merged = self.articles[:] - merged.extend(a for a in other.articles if a.key not in keys) - return Collection(merged) + all_articles = self.articles + other.articles + return Collection(self.deduplicate_articles(all_articles)) + + @staticmethod + def _all_articles(articles: list[Article]) -> Iterable[Article]: + seen = set() + for article in articles: + if id(article) in seen: + continue + yield article + seen.add(id(article)) + for reference in article.references: + if id(reference) in seen: + continue + yield reference + seen.add(id(reference)) + + @classmethod + def _uniqe_articles_by_id(cls, articles: list[Article]) -> dict[str, Article]: + graph = nx.Graph() + id_to_article: defaultdict[str, list[Article]] = defaultdict(list) + for article in cls._all_articles(articles): + first, *rest = article.ids + # Add a loop edge so that the unique articles are included + graph.add_edge(first, first) + id_to_article[first].append(article) + for id_ in rest: + graph.add_edge(first, id_) + id_to_article[id_].append(article) + components = list(nx.connected_components(graph)) + biggest = max(components, key=len) + smallest = min(components, key=len) + logger.info( + "Found %d components, biggest has %d articles, smallest has %d", + len(components), + len(biggest), + len(smallest), + ) - @property - def all_articles(self) -> Iterable[Article]: - """ - Yields all articles and references + article_by_id: dict[str, Article] = {} + for ids in components: + visited = set() + articles = [] + for id_ in ids: + for article in id_to_article[id_]: + if id(article) in visited: + continue + articles.append(article) + visited.add(id(article)) + merged = reduce(Article.merge, articles) + article_by_id.update({id_: merged for id_ in ids}) + + return article_by_id + + @classmethod + def deduplicate_articles( + cls, + articles: list[Article], + ) -> list[Article]: + article_by_id = cls._uniqe_articles_by_id(articles) + + unique_articles: 
list[Article] = [] + seen = set() + for article in articles: + if not article.ids: + continue + id_ = next(iter(article.ids)) + unique = article_by_id[id_] + if id(unique) in seen: + continue + unique_articles.append(unique) + seen.add(id(unique)) - :return: an iterable over `Article`. - """ - cache = {article.key: article for article in self.articles} - for article in self.articles: - for reference in article.references: - cache.setdefault(reference.key, reference) - yield from cache.values() + for article in unique_articles: + new_references = [] + for ref in article.references: + if not ref.ids: + continue + id_ = next(iter(ref.ids)) + new_references.append(article_by_id.get(id_, ref)) + article.references = new_references + + return unique_articles @property def citation_pairs(self) -> Iterable[tuple[Article, Article]]: - cache = {article.key: article for article in self.articles} for article in self.articles: if not article.references: continue for reference in article.references: - # Yield a full article if we have the metadata for a citation - if article.key != reference.key and reference.key in cache: - logger.debug("Found a cache hit for key %s", reference.key) - yield article, cache[reference.key] - else: - yield article, reference + yield article, reference @property def _first_year(self) -> int: - """ - Returns the year of the first article in the collection. + """Returns the year of the first article in the collection. :return: an integer. """ @@ -63,14 +125,16 @@ def _first_year(self) -> int: ) def published_by_year(self) -> dict[int, int]: - """ - Returns a dictionary where the key is the year of publication and the value is the number of articles - published that year. The dictionary starts from the oldest article to the current year consecutively. - If a year has no publications the value will be zero. + """Return a dictionary with the publication count by year. 
+ + Returns a dictionary where the key is the year of publication and the + value is the number of articles published that year. The dictionary + starts from the oldest article to the current year consecutively. If a + year has no publications the value will be zero. :return: a dictionary with the number of articles published each year. """ - current_year = datetime.date.today().year + current_year = datetime.datetime.now(datetime.timezone.utc).year years = {} for year in range(self._first_year, current_year + 1): years[year] = 0 @@ -85,14 +149,16 @@ def published_by_year(self) -> dict[int, int]: return years def cited_by_year(self) -> dict[int, int]: - """ - Returns a dictionary where the key is the year of publication and the value is the number of - citations in that year. The dictionary starts from the oldest article to the current year consecutively. - If a year has no citations the value will be zero. + """Return a dictionary with the citation count by year. + + Returns a dictionary where the key is the year of publication and the + value is the number of citations in that year. The dictionary starts + from the oldest article to the current year consecutively. If a year has + no citations the value will be zero. :return: a dictionary with the number of citations each year. 
""" - current_year = datetime.date.today().year + current_year = datetime.datetime.now(datetime.timezone.utc).year cited_items_per_year = {} for year in range(self._first_year, current_year + 1): cited_items_per_year[year] = 0 diff --git a/src/bibx/_entities/collection_builders/cross_ref.py b/src/bibx/_entities/collection_builders/cross_ref.py index 8cb9e40..0239d0b 100644 --- a/src/bibx/_entities/collection_builders/cross_ref.py +++ b/src/bibx/_entities/collection_builders/cross_ref.py @@ -3,11 +3,11 @@ class CrossRefCollectionBuilder(CollectionBuilder): - def __init__(self, query: str, count: int = 100): + def __init__(self, query: str, count: int = 100) -> None: self._query = query self._count = count - def with_count(self, count: int): + def with_count(self, count: int) -> "CrossRefCollectionBuilder": self._count = count return self diff --git a/src/bibx/_entities/collection_builders/generic.py b/src/bibx/_entities/collection_builders/generic.py index 1e84bcb..4a62245 100644 --- a/src/bibx/_entities/collection_builders/generic.py +++ b/src/bibx/_entities/collection_builders/generic.py @@ -3,7 +3,7 @@ class GenericCollectionBuilder(CollectionBuilder): - def __init__(self, *collections: Collection): + def __init__(self, *collections: Collection) -> None: self._collections = collections def build(self) -> Collection: diff --git a/src/bibx/_entities/collection_builders/openalex.py b/src/bibx/_entities/collection_builders/openalex.py new file mode 100644 index 0000000..8d9f748 --- /dev/null +++ b/src/bibx/_entities/collection_builders/openalex.py @@ -0,0 +1,114 @@ +import logging +from enum import Enum +from typing import Optional +from urllib.parse import urlparse + +from bibx._entities.article import Article +from bibx._entities.collection import Collection +from bibx.clients.openalex import OpenAlexClient, Work + +from .base import CollectionBuilder + +logger = logging.getLogger(__name__) + + +class HandleReferences(Enum): + BASIC = "basic" + FULL = "full" + + 
+class OpenAlexCollectionBuilder(CollectionBuilder): + def __init__( + self, + query: str, + limit: int = 600, + references: HandleReferences = HandleReferences.BASIC, + client: Optional[OpenAlexClient] = None, + ) -> None: + self.query = query + self.limit = limit + self.references = references + self.client = client or OpenAlexClient() + + def build(self) -> Collection: + """Build a collection of articles from the OpenAlex API.""" + logger.info("building collection for query %s", self.query) + works = self.client.list_recent_articles(self.query, self.limit) + cache = {work.id: work for work in works} + if self.references == HandleReferences.FULL: + references: list[str] = [] + for work in works: + references.extend(work.referenced_works) + missing = set(references) - set(cache.keys()) + logger.info("fetching %d missing references", len(missing)) + missing_works = self.client.list_articles_by_openalex_id(list(missing)) + cache.update({work.id: work for work in missing_works}) + article_cache = { + openalexid: self._work_to_article(work) + for openalexid, work in cache.items() + } + logger.info("enriching references") + articles = [] + for work in works: + article = article_cache[work.id] + article.references = [ + article_cache.get(reference, self._reference_to_article(reference)) + for reference in work.referenced_works + if reference != work.id + ] + articles.append(article) + return Collection(Collection.deduplicate_articles(articles)) + + @staticmethod + def _invert_name(name: str) -> str: + *first, last = name.split() + return f"{last}, {' '.join(first)}" + + @staticmethod + def _extract_doi(url: str) -> str: + parsed = urlparse(url) + if parsed.scheme != "https": + # maybe it was actually a DOI + return url + return parsed.path.lstrip("/") + + @classmethod + def _work_to_article(cls, work: Work) -> Article: + article = Article( + ids={ + f"{source}:{id_}" + if source != "doi" + else f"{source}:{cls._extract_doi(id_)}" + for source, id_ in work.ids.items() + 
}, + authors=[cls._invert_name(a.author.display_name) for a in work.authorships], + year=work.publication_year, + title=work.title, + journal=( + work.primary_location + and work.primary_location.source + and work.primary_location.source.display_name + ), + volume=work.biblio.volume, + issue=work.biblio.issue, + page=work.biblio.first_page, + doi=cls._extract_doi(work.doi) if work.doi else None, + _label=work.id, + _permalink=work.primary_location and work.primary_location.landing_page_url, + times_cited=work.cited_by_count, + references=[cls._reference_to_article(r) for r in work.referenced_works], + keywords=[k.display_name for k in work.keywords], + sources={"openalex"}, + extra={}, + ) + if article.simple_id: + article.ids.add(f"simple:{article.simple_id}") + return article + + @staticmethod + def _reference_to_article(reference: str) -> Article: + return Article( + ids={f"openalex:{reference}"}, + _permalink=reference, + sources={"openalex"}, + ) diff --git a/src/bibx/_entities/collection_builders/scopus_bib.py b/src/bibx/_entities/collection_builders/scopus_bib.py index 0455126..eec8610 100644 --- a/src/bibx/_entities/collection_builders/scopus_bib.py +++ b/src/bibx/_entities/collection_builders/scopus_bib.py @@ -9,38 +9,40 @@ from bibx._entities.article import Article from bibx._entities.collection import Collection from bibx._entities.collection_builders.base import CollectionBuilder -from bibx.exceptions import MissingCriticalInformation +from bibx.exceptions import MissingCriticalInformationError class ScopusBibCollectionBuilder(CollectionBuilder): - def __init__(self, *scopus_files: TextIO): + def __init__(self, *scopus_files: TextIO) -> None: self._files = scopus_files for file in self._files: file.seek(0) def build(self) -> Collection: articles = self._get_articles_from_files() - return Collection(list(articles)) + return Collection(Collection.deduplicate_articles(list(articles))) def _get_articles_from_files(self) -> Iterable[Article]: for file in 
self._files: db = bibtexparser.load(file) for entry in db.entries: - with suppress(MissingCriticalInformation): + with suppress(MissingCriticalInformationError): yield self._article_from_entry(entry) def _article_from_entry(self, entry: dict) -> Article: if "author" not in entry or "year" not in entry: - raise MissingCriticalInformation() + raise MissingCriticalInformationError() if "note" in entry: match = re.search(r"cited By (\d+)", entry["note"], re.IGNORECASE) - if match: - times_cited = int(match.groups()[0]) - else: - times_cited = None + times_cited = int(match.groups()[0]) if match else None else: times_cited = None - return Article( + ids = set() + doi = entry.get("doi") + if doi is not None: + ids.add(f"doi:{doi}") + article = Article( + ids=ids, authors=entry["author"].split(" and "), year=int(entry["year"]), title=entry.get("title"), @@ -55,26 +57,32 @@ def _article_from_entry(self, entry: dict) -> Article: sources={json.dumps(entry)}, times_cited=times_cited, ) + article.add_simple_id() + return article def _articles_from_references(self, references: Optional[str]) -> Iterable[Article]: if references is None: references = "" for reference in references.split("; "): - with suppress(MissingCriticalInformation): + with suppress(MissingCriticalInformationError): yield self._article_from_reference(reference) @staticmethod def _article_from_reference(reference: str) -> Article: match = re.search(r"\((\d{4})\)", reference) if not match: - raise MissingCriticalInformation() + raise MissingCriticalInformationError() year = int(match.groups()[0]) author = reference.split(",", maxsplit=2)[0].strip() - doi = re.search(r"(10.\d{4,9}/[-._;()/:A-Z0-9]+)", reference) - return Article( + match = re.search(r"(10.\d{4,9}/[-._;()/:A-Z0-9]+)", reference) + doi = match.groups()[0] if match else None + article = Article( + ids=set() if doi is None else {f"doi:{doi}"}, authors=[author], year=year, _label=reference, - doi=doi.groups()[0] if doi else None, + doi=doi, 
sources={reference}, ) + article.add_simple_id() + return article diff --git a/src/bibx/_entities/collection_builders/scopus_ris.py b/src/bibx/_entities/collection_builders/scopus_ris.py index ed8aa08..788791a 100644 --- a/src/bibx/_entities/collection_builders/scopus_ris.py +++ b/src/bibx/_entities/collection_builders/scopus_ris.py @@ -7,12 +7,14 @@ from bibx._entities.article import Article from bibx._entities.collection import Collection from bibx._entities.collection_builders.base import CollectionBuilder -from bibx.exceptions import InvalidScopusFile, MissingCriticalInformation +from bibx.exceptions import InvalidScopusFileError, MissingCriticalInformationError logger = logging.getLogger(__name__) +_RIS_PATTERN = re.compile(r"^(((?P[A-Z0-9]{2}))[ ]{2}-[ ]{1})?(?P(.*))$") -def _size(file) -> int: + +def _size(file: TextIO) -> int: file.seek(0, 2) size = file.tell() file.seek(0) @@ -35,14 +37,14 @@ def _joined(raw: Optional[list[str]]) -> Optional[str]: class ScopusRisCollectionBuilder(CollectionBuilder): - def __init__(self, *ris_files: TextIO): + def __init__(self, *ris_files: TextIO) -> None: self._files = ris_files for file in self._files: file.seek(0) def build(self) -> Collection: articles = self._get_articles_from_files() - return Collection(list(articles)) + return Collection(Collection.deduplicate_articles(list(articles))) def _get_articles_from_files(self) -> Iterable[Article]: for file in self._files: @@ -72,9 +74,9 @@ def _find_volume_info(ref: str) -> tuple[dict[str, str], str]: if volume: data.update(volume.groupdict()) - if "volume" in data and data["volume"]: + if data.get("volume"): data["volume"] = f"V{data['volume']}" - if "page" in data and data["page"]: + if data.get("page"): data["page"] = f"P{data['page']}" return data, ref[last_index:] @@ -99,8 +101,9 @@ def _article_form_reference(cls, scopusref: str) -> Article: volume_info, rest = cls._find_volume_info(rest) doi, _ = cls._find_doi(scopusref) if not authors or not year: - raise 
MissingCriticalInformation() - return Article( + raise MissingCriticalInformationError() + article = Article( + ids=set() if doi is None else {f"doi:{doi}"}, authors=[f"{first_name} {last_name.replace(' ', '').replace('.', '')}"], year=int(year), journal=( @@ -112,6 +115,8 @@ def _article_form_reference(cls, scopusref: str) -> Article: page=volume_info.get("page"), doi=doi, ) + article.add_simple_id() + return article @classmethod def _parse_references(cls, refs: list[str]) -> list[Article]: @@ -122,21 +127,18 @@ def _parse_references(cls, refs: list[str]) -> list[Article]: try: result.append(cls._article_form_reference(ref)) except (KeyError, IndexError, TypeError, ValueError): - logging.debug(f"Ignoring invalid reference {ref}") + logging.debug("Ignoring invalid reference %s", ref) return result @staticmethod def _ris_to_dict(record: str) -> dict[str, list[str]]: - RIS_PATTERN = re.compile( - r"^(((?P[A-Z0-9]{2}))[ ]{2}-[ ]{1})?(?P(.*))$" - ) parsed = defaultdict(list) current = None for line in record.split("\n"): - match = RIS_PATTERN.match(line) + match = _RIS_PATTERN.match(line) if not match: - raise InvalidScopusFile() + raise InvalidScopusFileError() data = match.groupdict() key = data.get("key") value = data.get("value") @@ -159,7 +161,11 @@ def _article_from_record(cls, record: str) -> Article: year = _int_or_nothing(data.get("PY", [])) times_cited = _int_or_nothing(data.get("TC")) authors = data.get("AU", []) - return Article( + if not authors or not year: + raise MissingCriticalInformationError() + doi = data.get("DO") + article = Article( + ids=set() if doi is None else {f"doi:{doi}"}, title=_joined(data.get("TI")), authors=authors, year=year, @@ -174,6 +180,8 @@ def _article_from_record(cls, record: str) -> Article: extra=data, times_cited=times_cited, ) + article.add_simple_id() + return article @classmethod def _parse_file(cls, file: TextIO) -> Iterable[Article]: @@ -185,5 +193,5 @@ def _parse_file(cls, file: TextIO) -> Iterable[Article]: try: 
article = cls._article_from_record(item.strip()) yield article - except MissingCriticalInformation: + except MissingCriticalInformationError: logger.info("Missing critical information for record %s", item) diff --git a/src/bibx/_entities/collection_builders/simple.py b/src/bibx/_entities/collection_builders/simple.py index 4a6c3cd..742b3fc 100644 --- a/src/bibx/_entities/collection_builders/simple.py +++ b/src/bibx/_entities/collection_builders/simple.py @@ -4,8 +4,8 @@ class SimpleCollectionBuilder(CollectionBuilder): - def __init__(self, articles: list[Article]): + def __init__(self, articles: list[Article]) -> None: self.articles = articles def build(self) -> Collection: - return Collection(self.articles[:]) + return Collection(Collection.deduplicate_articles(self.articles)) diff --git a/src/bibx/_entities/collection_builders/wos.py b/src/bibx/_entities/collection_builders/wos.py index 4534f0a..872bdd5 100644 --- a/src/bibx/_entities/collection_builders/wos.py +++ b/src/bibx/_entities/collection_builders/wos.py @@ -5,15 +5,15 @@ from collections.abc import Iterable, Mapping from contextlib import suppress from dataclasses import dataclass -from typing import Any, Callable, Optional, TextIO +from typing import Any, Callable, ClassVar, Optional, TextIO, Union from bibx._entities.article import Article from bibx._entities.collection import Collection from bibx._entities.collection_builders.base import CollectionBuilder from bibx.exceptions import ( InvalidIsiLineError, - InvalidIsiReference, - MissingCriticalInformation, + InvalidIsiReferenceError, + MissingCriticalInformationError, ) logger = logging.getLogger(__name__) @@ -38,7 +38,8 @@ def _delimited(values: list[str], delimiter: str = "; ") -> list[str]: def _integer(values: list[str]) -> int: if len(values) > 1: - raise ValueError(f"Expected no more than one item and got {len(values)}") + message = f"Expected no more than one item and got {len(values)}" + raise ValueError(message) first, *_ = values return 
int(first.strip()) @@ -51,7 +52,7 @@ class IsiField: parser: Callable aliases: list[str] - def parse(self, value: list[str]): + def parse(self, value: list[str]) -> Union[str, int, list[str]]: return self.parser(value) @@ -70,7 +71,7 @@ class WosCollectionBuilder(CollectionBuilder): re.X, ) - FIELDS = { + FIELDS: ClassVar = { "AB": IsiField("AB", "Abstract", _joined, ["abstract"]), "AF": IsiField("AF", "Author Full Names", _ident, ["author_full_names"]), "AR": IsiField("AR", "Article Number", _joined, ["article_number"]), @@ -247,7 +248,7 @@ class WosCollectionBuilder(CollectionBuilder): ), } - def __init__(self, *isi_files: TextIO): + def __init__(self, *isi_files: TextIO) -> None: self._files = isi_files for file in self._files: file.seek(0) @@ -260,13 +261,13 @@ def _get_articles_as_str_from_files(self) -> Iterable[str]: for file in self._files: articles_as_str = file.read().split("\n\n") for article_as_str in articles_as_str: - if article_as_str != "ER" and article_as_str: + if article_as_str.strip() not in ("ER", "EF") and article_as_str: # Strip `\n` at the end of the article so we don't trip yield article_as_str.strip() def _get_articles_from_files(self) -> Iterable[Article]: for article_as_str in self._get_articles_as_str_from_files(): - with suppress(MissingCriticalInformation): + with suppress(MissingCriticalInformationError): article = self._parse_article_from_str(article_as_str) yield article @@ -277,7 +278,7 @@ def _get_articles_from_references( if not references: return for ref_str in references: - with suppress(InvalidIsiReference): + with suppress(InvalidIsiReferenceError): yield cls._parse_reference_from_str(ref_str) @classmethod @@ -294,10 +295,10 @@ def _parse_article_from_str(cls, article_as_str: str) -> Article: if not field or "value" not in parsed or parsed["value"] is None: continue article_data[field].append(parsed["value"]) - processed = cls._parse_all(dict(article_data)) - - return Article( + doi = processed.get("DOI") + article = 
Article( + ids=set() if doi is None else {f"doi:{doi}"}, authors=processed.get("authors", []), year=processed.get("year"), title=processed.get("title"), @@ -305,7 +306,7 @@ def _parse_article_from_str(cls, article_as_str: str) -> Article: volume=processed.get("volume"), issue=processed.get("issue"), page=processed.get("beginning_page"), - doi=processed.get("DOI"), + doi=doi, times_cited=processed.get("times_cited"), references=list( cls._get_articles_from_references(processed.get("references")) @@ -314,15 +315,19 @@ def _parse_article_from_str(cls, article_as_str: str) -> Article: extra=processed, sources={article_as_str}, ) + article.add_simple_id() + return article @classmethod def _parse_reference_from_str(cls, reference: str) -> Article: match = cls.ISI_CITATION_PATTERN.match(reference) if not match: - raise InvalidIsiReference(reference) + raise InvalidIsiReferenceError(reference) data = {key: [value] for key, value in match.groupdict().items() if value} processed = cls._parse_all(data) - return Article( + doi = processed.get("DOI") + article = Article( + ids=set() if doi is None else {f"doi:{doi}"}, _label=reference, title=processed.get("title"), authors=processed.get("authors", []), @@ -336,6 +341,8 @@ def _parse_reference_from_str(cls, reference: str) -> Article: sources={reference}, times_cited=processed.get("times_cited"), ) + article.add_simple_id() + return article @classmethod def _parse_all(cls, article_data: dict[str, list[str]]) -> Mapping[str, Any]: @@ -355,5 +362,5 @@ def _parse(cls, key: str, value: list[str]) -> dict: parsed_value = field.parse(value) return {new_key: parsed_value for new_key in [field.key, *field.aliases]} - logger.debug(f"Found an unknown field with key {key} and value {value}") + logger.debug("Found an unknown field with key %s and value %s", key, value) return {key: _ident(value)} diff --git a/src/bibx/algorithms/__init__.py b/src/bibx/algorithms/__init__.py index e69de29..8360516 100644 --- a/src/bibx/algorithms/__init__.py 
+++ b/src/bibx/algorithms/__init__.py @@ -0,0 +1 @@ +"""Algorithm implementations.""" diff --git a/src/bibx/algorithms/preprocess.py b/src/bibx/algorithms/preprocess.py index 0915e8f..cfba3a8 100644 --- a/src/bibx/algorithms/preprocess.py +++ b/src/bibx/algorithms/preprocess.py @@ -1,6 +1,4 @@ -""" -Algorithms for preprocessing the data. -""" +"""Algorithms for preprocessing the data.""" from typing import Optional @@ -13,18 +11,22 @@ class Preprocess: + """Preprocess the data.""" + def __init__(self, wos: Collection, scopus: Collection) -> None: self.wos = wos self.scopus = scopus self._merged: Optional[Collection] = None @property - def merged(self): + def merged(self) -> Collection: + """Merge the collections.""" if self._merged is None: self._merged = self.wos.merge(self.scopus) return self._merged def __repr__(self) -> str: + """Return the representation of the object.""" return f"{self.__class__.__name__}(wos={self.wos}, scopus={self.scopus})" def _write_collection_to_workseet( @@ -61,15 +63,19 @@ def _write_collection_to_workseet( workseet.write(i, 9, row.label) def write_merged_information(self, workseet: Worksheet) -> None: + """Write the merged information to a worksheet.""" self._write_collection_to_workseet(self.merged, workseet) def write_wos_information(self, workseet: Worksheet) -> None: + """Write the WOS information to a worksheet.""" self._write_collection_to_workseet(self.wos, workseet) def write_scopus_information(self, workseet: Worksheet) -> None: + """Write the Scopus information to a worksheet.""" self._write_collection_to_workseet(self.scopus, workseet) def write_reference_information(self, workseet: Worksheet) -> None: + """Write the reference information to a worksheet.""" for i, title in enumerate( [ "SR", @@ -95,6 +101,7 @@ def write_reference_information(self, workseet: Worksheet) -> None: row += 1 def write_journal_information(self, workseet: Worksheet) -> None: + """Write the journal information to the worksheet.""" for i, title in 
enumerate( [ "Label", @@ -116,6 +123,7 @@ def write_journal_information(self, workseet: Worksheet) -> None: row += 1 def write_author_information(self, workseet: Worksheet) -> None: + """Write the author information to the worksheet.""" for i, title in enumerate( [ "Author", @@ -137,6 +145,7 @@ def write_author_information(self, workseet: Worksheet) -> None: row += 1 def write_times_cited_information(self, workseet: Worksheet) -> None: + """Write the times cited information to the worksheet.""" for i, title in enumerate( [ "Year", @@ -156,15 +165,16 @@ def write_times_cited_information(self, workseet: Worksheet) -> None: def _get_tos(data: dict) -> str: if data[ROOT] > 0: return "Root" - elif data[TRUNK] > 0: + if data[TRUNK] > 0: return "Trunk" - elif data[LEAF] > 0: + if data[LEAF] > 0: return "Leaf" - elif data[BRANCH] > 0: + if data[BRANCH] > 0: return f"Branch {data[BRANCH]}" return "_" def write_tree_of_science_information(self, workseet: Worksheet) -> None: + """Write the tree of science information to the worksheet.""" s = Sap() g = s.create_graph(self.merged) g = s.clean_graph(g) @@ -208,6 +218,7 @@ def write_tree_of_science_information(self, workseet: Worksheet) -> None: i += 1 def create_workbook(self, filename: str) -> None: + """Create a workbook with the information.""" workbook = Workbook(filename) self.write_merged_information(workbook.add_worksheet("Merged")) self.write_wos_information(workbook.add_worksheet("WOS")) diff --git a/src/bibx/algorithms/sap.py b/src/bibx/algorithms/sap.py index 79264fb..e789638 100644 --- a/src/bibx/algorithms/sap.py +++ b/src/bibx/algorithms/sap.py @@ -1,4 +1,3 @@ -import dataclasses import logging from typing import Any, cast @@ -17,19 +16,22 @@ ELABORATE_SAP = "_elaborate_sap" ROOT_CONNECTIONS = "_root_connections" RAW_SAP = "_raw_sap" +MIN_LEAF_CONNECTIONS = 3 +MAX_LEAF_AGE_YEARS = 7 + logger = logging.getLogger(__name__) -def _limit(attribute: list[tuple[Any, int]], _max: int): +def _limit(attribute: 
list[tuple[Any, int]], _max: int) -> list[tuple[Any, int]]: if _max is not None: sorted_attribute = sorted(attribute, key=lambda x: x[1], reverse=True) attribute = sorted_attribute[:_max] return attribute -def _add_article_info(g: nx.DiGraph, article: Article): - for key, val in dataclasses.asdict(article).items(): +def _add_article_info(g: nx.DiGraph, article: Article) -> None: + for key, val in article.info().items(): if key in ("sources", "references") or key.startswith("_"): continue try: @@ -39,38 +41,37 @@ def _add_article_info(g: nx.DiGraph, article: Article): class Sap: + """Sap algorithm to classify nodes in a graph.""" + def __init__( self, max_roots: int = 20, max_leaves: int = 50, max_trunk: int = 20, - min_leaf_connections: int = 3, - max_leaf_age: int = 7, max_branch_size: int = 15, - ): - """ - Create a Sap instance with the given parameters. + ) -> None: + """Create a Sap instance with the given parameters. :param max_roots: maximum number of roots on the tree :param max_leaves: maximum number of leaves on the tree :param max_trunk: maximum number of trunk nodes in the tree - :param min_leaf_connections: minimum number of connections between leaves and roots + :param min_leaf_connections: minimum number of connections between + leaves and roots :param max_leaf_age: maximum age for a leaf """ self.max_roots = max_roots self.max_leaves = max_leaves self.max_trunk = max_trunk - self.min_leaf_connections = min_leaf_connections - self.max_leaf_age = max_leaf_age self.max_branch_size = max_branch_size + self.min_leaf_connections = MIN_LEAF_CONNECTIONS + self.max_leaf_age = MAX_LEAF_AGE_YEARS @staticmethod def create_graph(collection: Collection) -> nx.DiGraph: - """ - Creates a `networkx.DiGraph` from a `Collection`. + """Create a `networkx.DiGraph` from a `Collection`. - It uses the article label as a key and adds all the properties of the article to - the graph. 
+ It uses the article label as a key and adds all the properties of the + article to the graph. :param collection: a `bibx.Collection` instance. :return: a `networkx.DiGraph` instance. @@ -87,8 +88,7 @@ def create_graph(collection: Collection) -> nx.DiGraph: @staticmethod def clean_graph(g: nx.DiGraph) -> nx.DiGraph: - """ - Clean a graph to make it ready for the sap algorithm. + """Clean a graph to make it ready for the sap algorithm. :param g: graph with unnecessary nodes :return: cleaned up giant component @@ -99,11 +99,7 @@ def clean_graph(g: nx.DiGraph) -> nx.DiGraph: # Remove nodes that cite one element and are never cited themselves giant.remove_nodes_from( - [ - n - for n in giant - if giant.in_degree(n) == 1 and giant.out_degree(n) == 0 # noqa # noqa - ] + [n for n in giant if giant.in_degree(n) == 1 and giant.out_degree(n) == 0] ) # Break loops @@ -115,52 +111,47 @@ def clean_graph(g: nx.DiGraph) -> nx.DiGraph: return giant - def tree(self, graph: nx.DiGraph, clear: bool = False) -> nx.DiGraph: - """ - Computes the whole tree. - """ + def tree(self, graph: nx.DiGraph) -> nx.DiGraph: + """Compute the whole tree.""" graph = cast(nx.DiGraph, graph.copy()) graph = self._compute_root(graph) graph = self._compute_leaves(graph) graph = self._compute_sap(graph) graph = self._compute_trunk(graph) - graph = self._compute_branches(graph) - if clear: - graph = self._clear(graph) - return graph + return self._compute_branches(graph) def _compute_root(self, graph: nx.DiGraph) -> nx.DiGraph: - """ - Takes in a connected graph and returns it labeled with a `root` property. + """Label a graph with the root property. + :return: Labeled graph with the root property. 
""" g = cast(nx.DiGraph, graph.copy()) valid_roots = [ - (n, cast(int, g.in_degree(n))) - for n in g.nodes - if g.out_degree(n) == 0 # noqa + (n, cast(int, g.in_degree(n))) for n in g.nodes if g.out_degree(n) == 0 ] sorted_roots = _limit(valid_roots, self.max_roots) - nx.set_node_attributes(g, 0, ROOT) + nx.set_node_attributes(g, 0, ROOT) # type: ignore for node, degree in sorted_roots: g.nodes[node][ROOT] = degree return g def _compute_leaves(self, graph: nx.DiGraph) -> nx.DiGraph: - """ - Takes in a connected graph and returns it labeled with a `leaf` property. + """Label a graph with the leaf property. + :param graph: Connected and filtered graph to work with. :return: Labeled graph with the leaf property. """ g = cast(nx.DiGraph, graph.copy()) try: roots = [n for n, d in g.nodes.items() if d[ROOT] > 0] - except AttributeError: - raise TypeError("It's necessary to have some roots") + except AttributeError as e: + message = "It's necessary to have a 'root' attribute" + raise TypeError(message) from e if not roots: - raise TypeError("It's necessary to have some roots") + message = "It's necessary to have some roots" + raise TypeError(message) - nx.set_node_attributes(g, 0, ROOT_CONNECTIONS) + nx.set_node_attributes(g, 0, ROOT_CONNECTIONS) # type: ignore for node in roots: g.nodes[node][ROOT_CONNECTIONS] = 1 topological_order = list(nx.topological_sort(g)) @@ -174,7 +165,7 @@ def _compute_leaves(self, graph: nx.DiGraph) -> nx.DiGraph: potential_leaves = [ (node, g.nodes[node][ROOT_CONNECTIONS]) for node in g.nodes - if g.in_degree(node) == 0 # noqa + if g.in_degree(node) == 0 ] extended_leaves = potential_leaves[:] @@ -206,39 +197,56 @@ def _compute_leaves(self, graph: nx.DiGraph) -> nx.DiGraph: potential_leaves = extended_leaves potential_leaves = _limit(potential_leaves, self.max_leaves) - nx.set_node_attributes(g, 0, LEAF) + nx.set_node_attributes(g, 0, LEAF) # type: ignore for node, c in potential_leaves: g.nodes[node][LEAF] = c return g @staticmethod - def 
_compute_sap(graph: nx.DiGraph) -> nx.DiGraph: - """ - Computes the sap of each node. - """ + def _raw_sap(graph: nx.DiGraph) -> nx.DiGraph: + """Compute the raw sap of each node.""" g = cast(nx.DiGraph, graph.copy()) try: valid_root = [n for n, d in g.nodes.items() if d[ROOT] > 0] - valid_leaves = [n for n, d in g.nodes.items() if d[LEAF] > 0] - except AttributeError: - raise TypeError("The graph needs to have a 'root' and a 'leaf' attribute") - if not valid_root or not valid_leaves: - raise TypeError("The graph needs to have at least some roots and leafs") - - nx.set_node_attributes(g, 0, RAW_SAP) - nx.set_node_attributes(g, 0, ROOT_CONNECTIONS) + except AttributeError as e: + message = "The graph needs to have a 'root' attribute" + raise TypeError(message) from e + if not valid_root: + message = "The graph needs to have at least some roots" + raise TypeError(message) + + nx.set_node_attributes(g, 0, ROOT_CONNECTIONS) # type: ignore + nx.set_node_attributes(g, 0, RAW_SAP) # type: ignore + for node in valid_root: g.nodes[node][RAW_SAP] = g.nodes[node][ROOT] g.nodes[node][ROOT_CONNECTIONS] = 1 for node in reversed(list(nx.topological_sort(g))): neighbors = list(g.successors(node)) - if neighbors: - for attr in (RAW_SAP, ROOT_CONNECTIONS): - g.nodes[node][attr] = sum(g.nodes[nb][attr] for nb in neighbors) + if not neighbors: + continue + for attr in (RAW_SAP, ROOT_CONNECTIONS): + g.nodes[node][attr] = sum(g.nodes[nb][attr] for nb in neighbors) + + return g + + @staticmethod + def _elaborate_sap(graph: nx.DiGraph) -> nx.DiGraph: + """Compute the elaborate sap of each node.""" + g = Sap._raw_sap(graph) - nx.set_node_attributes(g, 0, ELABORATE_SAP) - nx.set_node_attributes(g, 0, LEAF_CONNECTIONS) - for node in valid_leaves: + try: + valid_leaf = [n for n, d in g.nodes.items() if d[LEAF] > 0] + except AttributeError as e: + message = "The graph needs to have a 'leaf' attribute" + raise TypeError(message) from e + if not valid_leaf: + message = "The graph needs to have 
at least some leaves" + raise TypeError(message) + + nx.set_node_attributes(g, 0, ELABORATE_SAP) # type: ignore + nx.set_node_attributes(g, 0, LEAF_CONNECTIONS) # type: ignore + for node in valid_leaf: g.nodes[node][ELABORATE_SAP] = g.nodes[node][LEAF] g.nodes[node][LEAF_CONNECTIONS] = 1 for node in nx.topological_sort(g): @@ -247,7 +255,15 @@ def _compute_sap(graph: nx.DiGraph) -> nx.DiGraph: for attr in (ELABORATE_SAP, LEAF_CONNECTIONS): g.nodes[node][attr] = sum(g.nodes[nb][attr] for nb in neighbors) - nx.set_node_attributes(g, 0, SAP) + return g + + @staticmethod + def _compute_sap(graph: nx.DiGraph) -> nx.DiGraph: + """Compute the sap of each node.""" + g = Sap._raw_sap(graph) + g = Sap._elaborate_sap(g) + + nx.set_node_attributes(g, 0, SAP) # type: ignore for node in g.nodes: g.nodes[node][SAP] = ( g.nodes[node][LEAF_CONNECTIONS] * g.nodes[node][RAW_SAP] @@ -257,9 +273,7 @@ def _compute_sap(graph: nx.DiGraph) -> nx.DiGraph: return g def _compute_trunk(self, graph: nx.DiGraph) -> nx.DiGraph: - """ - Tags leaves. 
- """ + """Tags leaves.""" g = cast(nx.DiGraph, graph.copy()) try: potential_trunk = [ @@ -267,33 +281,33 @@ def _compute_trunk(self, graph: nx.DiGraph) -> nx.DiGraph: for n, d in g.nodes.items() if d[ROOT] == 0 and d[LEAF] == 0 and d[SAP] > 0 ] - except AttributeError: - raise TypeError( - "The graph needs to have a 'root', 'leaf' and 'sap' attributes" - ) + except AttributeError as e: + message = "The graph needs to have a 'root', 'leaf' and 'sap' attributes" + raise TypeError(message) from e if not potential_trunk: - raise TypeError("The graph needs to have at least some nodes with sap") + message = "The graph needs to have at least some nodes with sap" + raise TypeError(message) potential_trunk = _limit(potential_trunk, self.max_trunk) - nx.set_node_attributes(g, 0, TRUNK) + nx.set_node_attributes(g, 0, TRUNK) # type: ignore for node, sap in potential_trunk: g.nodes[node][TRUNK] = sap return g def _compute_branches(self, graph: nx.DiGraph) -> nx.DiGraph: - """ - Tags branches. - """ + """Tags branches.""" g = cast(nx.DiGraph, graph.copy()) undirected = g.to_undirected() communities: list[set] = louvain_communities(undirected) - branches = list(sorted(communities, key=len))[:3] - nx.set_node_attributes(g, 0, BRANCH) + branches = sorted(communities, key=len)[:3] + nx.set_node_attributes(g, 0, BRANCH) # type: ignore for i, branch in enumerate(branches, start=1): potential_branch = [ (n, g.nodes[n][YEAR]) for n in branch - if g.nodes[n][ROOT] == 0 and g.nodes[n][TRUNK] == 0 + if g.nodes[n][ROOT] == 0 + and g.nodes[n][TRUNK] == 0 + and g.nodes[n][YEAR] is not None ] potential_branch = _limit(potential_branch, self.max_branch_size) for node, _ in potential_branch: @@ -301,10 +315,8 @@ def _compute_branches(self, graph: nx.DiGraph) -> nx.DiGraph: return g @staticmethod - def _clear(graph: nx.DiGraph) -> nx.DiGraph: - """ - Returns a copy of the graph clear of untagged nodes. 
- """ + def clear(graph: nx.DiGraph) -> nx.DiGraph: + """Return a copy of the graph clear of untagged nodes.""" nodes = [ n for n in graph.nodes diff --git a/src/bibx/cli.py b/src/bibx/cli.py index ff62948..4ff177e 100644 --- a/src/bibx/cli.py +++ b/src/bibx/cli.py @@ -4,9 +4,16 @@ import networkx as nx import typer -from rich import print - -from bibx import Collection, read_any, read_scopus_bib, read_scopus_ris, read_wos +from rich import print as rprint + +from bibx import ( + Collection, + query_openalex, + read_any, + read_scopus_bib, + read_scopus_ris, + read_wos, +) from bibx.algorithms.preprocess import Preprocess from bibx.algorithms.sap import Sap @@ -14,35 +21,36 @@ class Format(Enum): + """Supported formats.""" + WOS = "wos" RIS = "ris" BIB = "bib" @app.command() -def describe(format: Format, filename: str): - """ - Parses a file and provides a short description. - """ +def describe(format: Format, filename: str) -> None: + """Parse a file and provides a short description.""" if format == Format.WOS: - c = read_wos(open(filename)) - print(":boom: the file satisfies the ISI WOS format") - print(f"There are {len(c.articles)} records parsed") + with open(filename) as f: + c = read_wos(f) + rprint(":boom: the file satisfies the ISI WOS format") + rprint(f"There are {len(c.articles)} records parsed") if format == Format.RIS: - c = read_scopus_ris(open(filename)) - print(":boom: the file satisfies the ISI WOS format") - print(f"There are {len(c.articles)} records parsed") + with open(filename) as f: + c = read_scopus_ris(f) + rprint(":boom: the file satisfies the ISI WOS format") + rprint(f"There are {len(c.articles)} records parsed") if format == Format.BIB: - c = read_scopus_bib(open(filename)) - print(":boom: the file satisfies the ISI WOS format") - print(f"There are {len(c.articles)} records parsed") + with open(filename) as f: + c = read_scopus_bib(f) + rprint(":boom: the file satisfies the ISI WOS format") + rprint(f"There are {len(c.articles)} records 
parsed") @app.command() -def toy_sap(): - """ - Runs the sap algorithm on a toy graph. - """ +def toy_sap() -> None: + """Run the sap algorithm on a toy graph.""" graph = nx.DiGraph() for node in "abcde": graph.add_node(node, year=2000) @@ -51,14 +59,12 @@ def toy_sap(): graph.add_edge("b", node) s = Sap() graph = s.tree(graph) - print(graph) + rprint(graph) @app.command() -def sap(filename: str): - """ - Runs the sap algorithm on a seed file of any supported format. - """ +def sap(filename: str) -> None: + """Run the sap algorithm on a seed file of any supported format.""" with open(filename) as f: collection = read_any(f) @@ -66,7 +72,18 @@ def sap(filename: str): graph = s.create_graph(collection) graph = s.clean_graph(graph) graph = s.tree(graph) - print(graph) + rprint(graph) + + +@app.command() +def openalex(query: list[str]) -> None: + """Run the sap algorithm on the results of an OpenAlex query.""" + c = query_openalex(" ".join(query)) + s = Sap() + graph = s.create_graph(c) + graph = s.clean_graph(graph) + graph = s.tree(graph) + rprint(graph) def _read_many( @@ -87,14 +104,13 @@ def _read_many( def preprocess( output: str, wos: list[str] = typer.Option(help="WoS files to pre process"), scopus: list[str] = typer.Option(help="scopus files to preprocess"), -): - """ - Preprocesses a collection.
- """ +) -> None: + """Preprocesses a collection.""" wos_collection = _read_many(read_wos, *wos) scopus_collection = _read_many(read_scopus_ris, *scopus) p = Preprocess(wos_collection, scopus_collection) p.create_workbook(output) + rprint(f":boom: workbook created at {output}") if __name__ == "__main__": diff --git a/src/bibx/clients/__init__.py b/src/bibx/clients/__init__.py new file mode 100644 index 0000000..d89e11b --- /dev/null +++ b/src/bibx/clients/__init__.py @@ -0,0 +1 @@ +"""Clients to talk to external services.""" diff --git a/src/bibx/clients/openalex.py b/src/bibx/clients/openalex.py new file mode 100644 index 0000000..8219e84 --- /dev/null +++ b/src/bibx/clients/openalex.py @@ -0,0 +1,189 @@ +import logging +from enum import Enum +from typing import Optional, Union + +import requests +from pydantic import BaseModel, ValidationError + +from bibx.exceptions import OpenAlexError +from bibx.utils import chunks + +logger = logging.getLogger(__name__) + +MAX_WORKS_PER_PAGE = 200 +MAX_IDS_PER_REQUEST = 80 + + +class AuthorPosition(Enum): + """Position of an author in a work.""" + + FIRST = "first" + MIDDLE = "middle" + LAST = "last" + + +class Author(BaseModel): + """An author from the openalex API.""" + + id: str + display_name: str + orcid: Optional[str] = None + + +class WorkAuthorship(BaseModel): + """An authorship from the openalex API.""" + + author_position: AuthorPosition + author: Author + is_corresponding: bool + + +class WorkKeyword(BaseModel): + """A keyword from the openalex API.""" + + id: str + display_name: str + score: float + + +class WorkBiblio(BaseModel): + """Work bibliographic information from the openalex API.""" + + volume: Optional[str] = None + issue: Optional[str] = None + first_page: Optional[str] = None + last_page: Optional[str] = None + + +class WorkLocationSource(BaseModel): + """Source of the work location from the openalex API.""" + + id: str + display_name: str + type: str + + +class WorkLoacation(BaseModel): + """Location 
of the work from the openalex API.""" + + is_oa: bool + landing_page_url: Optional[str] = None + pdf_url: Optional[str] = None + source: Optional[WorkLocationSource] + + +class Work(BaseModel): + """A work from the openalex API.""" + + id: str + ids: dict[str, str] + doi: Optional[str] = None + title: Optional[str] = None + publication_year: int + authorships: list[WorkAuthorship] + cited_by_count: int + keywords: list[WorkKeyword] + referenced_works: list[str] + biblio: WorkBiblio + primary_location: Optional[WorkLoacation] = None + + +class ResponseMeta(BaseModel): + """Metadata from the openalex API response.""" + + count: int + page: int + per_page: int + + +class WorkResponse(BaseModel): + """Response from the openalex API.""" + + results: list[Work] + meta: ResponseMeta + + +class OpenAlexClient: + """Client for the openalex API.""" + + def __init__( + self, + base_url: Optional[str] = None, + email: Optional[str] = None, + ) -> None: + self.base_url = base_url or "https://api.openalex.org" + self.session = requests.Session() + self.email = email or "technology@coreofscience.org" + self.session.headers.update( + { + "Accept": "application/json", + "Content-Type": "application/json", + "User-Agent": f"Python/requests/bibx mailto:{self.email}", + } + ) + + def list_recent_articles(self, query: str, limit: int = 600) -> list[Work]: + """List recent articles from the openalex API.""" + select = ",".join(Work.model_fields.keys()) + filter_ = ",".join( + [ + f"title_and_abstract.search:{query.replace(' ', '+')}", + "type:types/article", + "cited_by_count:>1", + ] + ) + pages = (limit // MAX_WORKS_PER_PAGE) + 1 + results: list[Work] = [] + for page in range(1, pages + 1): + logger.info("fetching page %d with filter %s", page, filter_) + params: dict[str, Union[str, int]] = { + "select": select, + "filter": filter_, + "sort": "publication_year:desc", + "per_page": MAX_WORKS_PER_PAGE, + "page": page, + } + response = self.session.get( + f"{self.base_url}/works", +
params=params, + ) + try: + response.raise_for_status() + data = response.json() + work_response = WorkResponse.model_validate(data) + logger.info( + "fetched %d works in page %d", len(work_response.results), page + ) + results.extend(work_response.results) + if page * MAX_WORKS_PER_PAGE >= work_response.meta.count: + break + except (requests.RequestException, ValidationError) as error: + raise OpenAlexError(str(error)) from error + return results[:limit] + + def list_articles_by_openalex_id(self, ids: list[str]) -> list[Work]: + """List articles by openalex id.""" + select = ",".join(Work.model_fields.keys()) + filter_ = ",".join([f"ids.openalex:{id_}" for id_ in ids]) + results: list[Work] = [] + for ids_ in chunks(ids, MAX_IDS_PER_REQUEST): + value = "|".join(ids_) + filter_ = f"ids.openalex:{value},type:types/article" + logger.info("fetching %d ids from openalex", len(ids_)) + params: dict[str, Union[str, int]] = { + "select": select, + "filter": filter_, + "per_page": MAX_IDS_PER_REQUEST, + } + response = self.session.get( + f"{self.base_url}/works", + params=params, + ) + try: + response.raise_for_status() + data = response.json() + work_response = WorkResponse.model_validate(data) + results.extend(work_response.results) + except (requests.RequestException, ValidationError) as error: + raise OpenAlexError(str(error)) from error + return results diff --git a/src/bibx/exceptions.py b/src/bibx/exceptions.py index e0a3c1b..37b9fba 100644 --- a/src/bibx/exceptions.py +++ b/src/bibx/exceptions.py @@ -1,40 +1,34 @@ class BibXError(Exception): - """ - Any exception known by wostools. - """ + """Any exception known by wostools.""" class InvalidIsiLineError(BibXError, ValueError): - """ - Raised when we encounter an invalid line when processing an ISI file. 
- """ + """Raised when we encounter an invalid line when processing an ISI file.""" - def __init__(self, line: str): + def __init__(self, line: str) -> None: super().__init__(f"'{line}' is not a valid ISI file line") -class InvalidIsiReference(BibXError, ValueError): - """ - Raised when we try to create an article out of an invalid ISI reference. - """ +class InvalidIsiReferenceError(BibXError, ValueError): + """Raised when we try to create an article out of an invalid ISI reference.""" - def __init__(self, reference: str): + def __init__(self, reference: str) -> None: super().__init__(f"{reference} does not look like an ISI citation") -class MissingCriticalInformation(BibXError, ValueError): - """ - Raised when we don't have the publication year of an article. - """ +class MissingCriticalInformationError(BibXError, ValueError): + """Raised when we don't have the publication year of an article.""" - def __init__(self): + def __init__(self) -> None: super().__init__("Article is missing some critical information") -class InvalidScopusFile(BibXError, ValueError): - """ - Raised when we find an invalid line on an scopus RIS file. 
- """ +class InvalidScopusFileError(BibXError, ValueError): + """Raised when we find an invalid line on an scopus RIS file.""" - def __init__(self): + def __init__(self) -> None: super().__init__("The file contains an invalid RIS line") + + +class OpenAlexError(BibXError): + """Raised when we encounter an error with the OpenAlex API.""" diff --git a/src/bibx/utils.py b/src/bibx/utils.py new file mode 100644 index 0000000..15fd7d6 --- /dev/null +++ b/src/bibx/utils.py @@ -0,0 +1,10 @@ +from collections.abc import Generator +from typing import TypeVar + +T = TypeVar("T") + + +def chunks(lst: list[T], n: int) -> Generator[list[T], None, None]: + """Yield successive n-sized chunks from lst.""" + for i in range(0, len(lst), n): + yield lst[i : i + n] diff --git a/stubs/bibtexparser/__init__.pyi b/stubs/bibtexparser/__init__.pyi index aeedcb6..e7519db 100644 --- a/stubs/bibtexparser/__init__.pyi +++ b/stubs/bibtexparser/__init__.pyi @@ -1,3 +1,6 @@ +from typing import TextIO + from _typeshed import Incomplete +from bibtexparser.bparser import BibDatabase -def load(bibtex_file, parser: Incomplete | None = None): ... +def load(bibtex_file: TextIO, parser: Incomplete | None = None) -> BibDatabase: ... diff --git a/stubs/networkx/__init__.pyi b/stubs/networkx/__init__.pyi index 8d36ae7..17e6bbd 100644 --- a/stubs/networkx/__init__.pyi +++ b/stubs/networkx/__init__.pyi @@ -1,8 +1,8 @@ from collections.abc import Iterable, Iterator -from typing import Any from _typeshed import Self from networkx.classes.reportviews import NodeView +from pyparsing.helpers import Union __version__: str @@ -10,8 +10,10 @@ class Graph: nodes: NodeView def subgraph(self: Self, nodes: Iterable) -> Self: ... def copy(self: Self) -> Self: ... - def add_node(self, node: str, **kwargs) -> None: ... - def add_edge(self, u: str, v: str, **kwargs) -> None: ... + def add_node( + self, node: str, **kwargs: Union[int, str, list[str], None] + ) -> None: ... 
+ def add_edge(self, u: str, v: str, **kwargs: int) -> None: ... def add_edges_from(self, edges: Iterable) -> None: ... def remove_nodes_from(self, edges: Iterable) -> None: ... def remove_edges_from(self, edges: Iterable) -> None: ... @@ -25,8 +27,9 @@ class DiGraph(Graph): def predecessors(self, node: str) -> Iterable[str]: ... # NOTE: The attr parameter of this function is quite more complicated than this -def set_node_attributes(g: DiGraph, attr: Any, name: str) -> None: ... +def set_node_attributes(g: DiGraph, attr: Union[int, float], name: str) -> None: ... def topological_sort(g: DiGraph) -> Iterable[str]: ... def selfloop_edges(g: DiGraph) -> Iterable[str]: ... -def weakly_connected_components(g: DiGraph) -> Iterable[list[str]]: ... -def strongly_connected_components(g: DiGraph) -> Iterable[list[str]]: ... +def weakly_connected_components(g: DiGraph) -> Iterable[set[str]]: ... +def strongly_connected_components(g: DiGraph) -> Iterable[set[str]]: ... +def connected_components(g: Graph) -> Iterable[set[str]]: ... diff --git a/stubs/networkx/algorithms/community/louvain.pyi b/stubs/networkx/algorithms/community/louvain.pyi index 06e7aa6..73506e2 100644 --- a/stubs/networkx/algorithms/community/louvain.pyi +++ b/stubs/networkx/algorithms/community/louvain.pyi @@ -1,3 +1,3 @@ from networkx import Graph -def louvain_communities(g: Graph): ... +def louvain_communities(g: Graph) -> list[set]: ... diff --git a/tests/algorithms/test_sap.py b/tests/algorithms/test_sap.py index 437c50d..ed5387c 100644 --- a/tests/algorithms/test_sap.py +++ b/tests/algorithms/test_sap.py @@ -4,8 +4,7 @@ def create_toy_graph() -> nx.DiGraph: - r""" - Creates a toy graph with known sap. + r"""Create a toy graph with known sap. 
The toy graph has the following shape: @@ -15,7 +14,6 @@ def create_toy_graph() -> nx.DiGraph: / | \ e f g """ - g = nx.DiGraph() for n in "abcdefg": # we need to have years for all the nodes @@ -27,7 +25,8 @@ def create_toy_graph() -> nx.DiGraph: return g -def test_sap(): +def test_sap() -> None: + """Test the sap algorithm with a toy graph.""" g = create_toy_graph() s = Sap() g = s.tree(g) diff --git a/tests/entities/collection_builders/test_scopus_bib.py b/tests/entities/collection_builders/test_scopus_bib.py index 54a9057..8c8b219 100644 --- a/tests/entities/collection_builders/test_scopus_bib.py +++ b/tests/entities/collection_builders/test_scopus_bib.py @@ -3,7 +3,8 @@ from bibx import read_scopus_bib -def test_scopus_works(): +def test_scopus_works() -> None: + """Test that we can read a scopus file.""" file = io.StringIO( """ @ARTICLE{Boerner19995318, @@ -42,11 +43,11 @@ def test_scopus_works(): art_number={169805}, note={cited By 0}, url={https://www.scopus.com/inward/record.uri?eid=2-s2.0-85135886322&doi=10.1016%2fj.jmmm.2022.169805&partnerID=40&md5=759122e06fc4fd8cc4a87011561de73b}, -affiliation={Koszalin University of Technology, Faculty of Electronics and Informatics, Koszalin, Poland; Silesian University of Technology, Institute of Physics – Center for Science and Education, Gliwice, Poland; Bielefeld University of Applied Sciences, Faculty of Engineering Sciences and Mathematics, Bielefeld, Germany}, +affiliation={Koszalin University of Technology, Faculty of Electronics and Informatics, Koszalin, Poland; Silesian University of Technology, Institute of Physics - Center for Science and Education, Gliwice, Poland; Bielefeld University of Applied Sciences, Faculty of Engineering Sciences and Mathematics, Bielefeld, Germany}, abstract={Improved magnetic memory systems belong to the main research topics in spintronics. Here we show micromagnetic simulations used to analyze the energy density of nano-scaled iron spheres. 
Layers of different thickness, partly coated with iron oxide, were tested in terms of spatial uniformity of the physical system energy. For a single non-coated or iron-oxide coated droplet, the spatial distribution of the total energy is not uniform and depends on the nano-droplet size. Additionally, for systems consisting of four objects, the relation between relative distance and the resultant magnetization distribution was analyzed. The mutual relation between droplet size and the underlying magnetization distribution as well as the character of local energy extrema was investigated. The size changes for the four-droplet system were compared with the single object behavior to obtain a criterion for the minimum distance between spheres to behave as a single object. The calculations revealed that the oxidized spheres could be placed closer to each other in comparison to the non-coated system. For the proposed oxide coated system, the increase of this maximum packing density is equal to about 12%, as compared to the non-coated system. © 2022 Elsevier B.V.}, author_keywords={Ferromagnetism; Magnetic anisotropy; Micromagnetism}, keywords={Drops; Ferromagnetic materials; Ferromagnetism; Magnetic anisotropy; Magnetic storage; Magnetization; Spheres, Bit-patterned media; Coated systems; Droplets sizes; Energy distributions; Magnetic memory; Magnetization distribution; Memory systems; Micromagnetisms; Nano-spheres; Single object, Iron oxides}, -references={Richter, H., Dobin, A., Heinonen, O., Gao, K., Veerdonk, R., Lynch, R., Xue, J., Brockie, R., Recording on Bit-Patterned Media at Densities of 1 Tb/in and Beyond (2006) IEEE Trans. Magn., 42, pp. 2255-2260; Koltsov, D.K., Adeyeye, A.O., Welland, M.E., Tricker, D.M., Single-domain circular nanomagnets (1999) Phys. Rev. Lett., 83, p. 1042; Zhang, W., Haas, S., Phase diagram of magnetization reversal processes in nanorings (2010) Phys. Rev. 
B, 81; He, K., Smith, D.J., McCartney, M.R., Effects of vortex chirality and shape anisotropy on magnetization reversal of Co nanorings (invited) (2010) J. Appl. Phys., 107, p. 09D307; Blachowicz, T., Ehrmann, A., Square nano-magnets as bit-patterned media with doubled possible data density (2017) Mater. Today:. Proc., 4, pp. S226-S231; Döpke, C., Grothe, T., Steblinski, P., Klöcker, M., Sabantina, L., Kosmalska, D., Blachowicz, T., Ehrmann, A., Magnetic Nanofiber Mats for Data Storage and Transfer (2019) Nanomaterials, 9, p. 92; Steblinski, P., Blachowicz, T., Conception of magnetic memory switched by time dependent current density and current electron spin polarization (2019) International Journal of Electronics and Telecommunications, 65, p. 309; Shutyi, A.M., Sementsov, D.I., Multistability of the Magnetization Reversal of a Nanoparticle with Cubic Anisotropy (2020) JETP Lett., 111, pp. 619-626; Russier, V., De-Montferrand, C., Lalatonne, Y., Motte, L., Magnetization of densely packed interacting magnetic nanoparticles with cubic and uniaxial anisotropies: a Monte Carlo study (2013) J. Appl. Phys., 114; Simeonidis, K., Martinez-Boubeta, C., Serantes, D., Ruta, S., Chubykalo-Fesenko, O., Chantrell, R., Oró-Solé, J., Angelakeris, M., Controlling Magnetization Reversal and Hyperthermia Efficiency in Core−Shell Iron−Iron Oxide Magnetic Nanoparticles by Tuning the Interphase Coupling (2020) ACS Appl. Nano Mater., 3, p. 4465; Nemati, Z., Alonso, J., Khurshid, H., Phan, M.H., Srikanth, H., Core/shell iron/iron oxide nanoparticles: are they promising for magnetic hyperthermia? (2016) RSC Adv., 6, p. 38697; Kim, Y.-W., Park, H.S., Hyun Soon Park Microstructural and Magnetic Characterization of Iron Oxide Nanoparticles Fabricated by Pulsed Wire Evaporation (2019) Electron. Mater. Lett., 15 (6), pp. 
665-672; Scholz, W., Fidler, J., Schrefl, T., Suess, D., Dittrich, R., Forster, H., Tsiantos, V., Scalable parallel micromagnetic solvers for magnetic nanostructures (2003) Comput. Mater. Sci., 28, p. 366; (2019), TAO Users Manual, ANL/MCS-TM-322 Rev. 3.11, Argonne National Laboratory; Kostopoulou, A., Brintakis, K., Vasilakaki, M., Trohidou, K.N., Douvalis, A.P., Lascialfari, A., Manna, L., Lappas, A., Assembly-mediated Interplay of Dipolar Interactions and Surface Spin Disorder in Colloidal Maghemite Nanoclusters (2014) Nanoscale, 6 (7), pp. 3764-3776; Coey, J.M.D., Magnetism and Magnetic Materials (2010), Cambridge University Press; Cullity, B.D., Graham, C.D., Introduction to magnetic materials (2008), 2nd ed. Wiley; Johansson, C., Hanson, M., Pedersen, M.S., Morup, S., Magnetic properties of magnetic liquids with iron-oxide particles—the influence of anisotropy and interactions (1997) J. Magn. Magn. Mater., 173, pp. 5-14; Aguilera-del-Toro, R.H., Aguilera-Granja, F., Torres, M.B., Vega, A., Relation between structural patterns and magnetism in small iron oxide clusters: reentrance of the magnetic moment at high oxidation ratios (2021) Phys. Chem. Chem. Phys., 23, pp. 246-272; Erlebach, A., Hühn, C., Jana, R., Sierka, M., Structure and magnetic properties of (Fe2O3)n clusters (n = 1–5) (2014) Phys. Chem. Chem. Phys., 16 (48), pp. 26421-26426}, +references={Richter, H., Dobin, A., Heinonen, O., Gao, K., Veerdonk, R., Lynch, R., Xue, J., Brockie, R., Recording on Bit-Patterned Media at Densities of 1 Tb/in and Beyond (2006) IEEE Trans. Magn., 42, pp. 2255-2260; Koltsov, D.K., Adeyeye, A.O., Welland, M.E., Tricker, D.M., Single-domain circular nanomagnets (1999) Phys. Rev. Lett., 83, p. 1042; Zhang, W., Haas, S., Phase diagram of magnetization reversal processes in nanorings (2010) Phys. Rev. B, 81; He, K., Smith, D.J., McCartney, M.R., Effects of vortex chirality and shape anisotropy on magnetization reversal of Co nanorings (invited) (2010) J. Appl. 
Phys., 107, p. 09D307; Blachowicz, T., Ehrmann, A., Square nano-magnets as bit-patterned media with doubled possible data density (2017) Mater. Today:. Proc., 4, pp. S226-S231; Döpke, C., Grothe, T., Steblinski, P., Klöcker, M., Sabantina, L., Kosmalska, D., Blachowicz, T., Ehrmann, A., Magnetic Nanofiber Mats for Data Storage and Transfer (2019) Nanomaterials, 9, p. 92; Steblinski, P., Blachowicz, T., Conception of magnetic memory switched by time dependent current density and current electron spin polarization (2019) International Journal of Electronics and Telecommunications, 65, p. 309; Shutyi, A.M., Sementsov, D.I., Multistability of the Magnetization Reversal of a Nanoparticle with Cubic Anisotropy (2020) JETP Lett., 111, pp. 619-626; Russier, V., De-Montferrand, C., Lalatonne, Y., Motte, L., Magnetization of densely packed interacting magnetic nanoparticles with cubic and uniaxial anisotropies: a Monte Carlo study (2013) J. Appl. Phys., 114; Simeonidis, K., Martinez-Boubeta, C., Serantes, D., Ruta, S., Chubykalo-Fesenko, O., Chantrell, R., Oró-Solé, J., Angelakeris, M., Controlling Magnetization Reversal and Hyperthermia Efficiency in Core-Shell Iron-Iron Oxide Magnetic Nanoparticles by Tuning the Interphase Coupling (2020) ACS Appl. Nano Mater., 3, p. 4465; Nemati, Z., Alonso, J., Khurshid, H., Phan, M.H., Srikanth, H., Core/shell iron/iron oxide nanoparticles: are they promising for magnetic hyperthermia? (2016) RSC Adv., 6, p. 38697; Kim, Y.-W., Park, H.S., Hyun Soon Park Microstructural and Magnetic Characterization of Iron Oxide Nanoparticles Fabricated by Pulsed Wire Evaporation (2019) Electron. Mater. Lett., 15 (6), pp. 665-672; Scholz, W., Fidler, J., Schrefl, T., Suess, D., Dittrich, R., Forster, H., Tsiantos, V., Scalable parallel micromagnetic solvers for magnetic nanostructures (2003) Comput. Mater. Sci., 28, p. 366; (2019), TAO Users Manual, ANL/MCS-TM-322 Rev. 
3.11, Argonne National Laboratory; Kostopoulou, A., Brintakis, K., Vasilakaki, M., Trohidou, K.N., Douvalis, A.P., Lascialfari, A., Manna, L., Lappas, A., Assembly-mediated Interplay of Dipolar Interactions and Surface Spin Disorder in Colloidal Maghemite Nanoclusters (2014) Nanoscale, 6 (7), pp. 3764-3776; Coey, J.M.D., Magnetism and Magnetic Materials (2010), Cambridge University Press; Cullity, B.D., Graham, C.D., Introduction to magnetic materials (2008), 2nd ed. Wiley; Johansson, C., Hanson, M., Pedersen, M.S., Morup, S., Magnetic properties of magnetic liquids with iron-oxide particles—the influence of anisotropy and interactions (1997) J. Magn. Magn. Mater., 173, pp. 5-14; Aguilera-del-Toro, R.H., Aguilera-Granja, F., Torres, M.B., Vega, A., Relation between structural patterns and magnetism in small iron oxide clusters: reentrance of the magnetic moment at high oxidation ratios (2021) Phys. Chem. Chem. Phys., 23, pp. 246-272; Erlebach, A., Hühn, C., Jana, R., Sierka, M., Structure and magnetic properties of (Fe2O3)n clusters (n = 1-5) (2014) Phys. Chem. Chem. Phys., 16 (48), pp. 26421-26426}, correspondence_address1={Blachowicz, T.; Silesian University of Technology, S. 
Konarskiego 22B str., 44-100 Gliwice, Poland; email: tomasz.blachowicz@polsl.pl}, publisher={Elsevier B.V.}, issn={03048853}, @@ -66,11 +67,11 @@ def test_scopus_works(): doi={10.1016/j.jmmm.2022.169805}, art_number={169805}, url={https://www.scopus.com/inward/record.uri?eid=2-s2.0-85135886322&doi=10.1016%2fj.jmmm.2022.169805&partnerID=40&md5=759122e06fc4fd8cc4a87011561de73b}, -affiliation={Koszalin University of Technology, Faculty of Electronics and Informatics, Koszalin, Poland; Silesian University of Technology, Institute of Physics – Center for Science and Education, Gliwice, Poland; Bielefeld University of Applied Sciences, Faculty of Engineering Sciences and Mathematics, Bielefeld, Germany}, +affiliation={Koszalin University of Technology, Faculty of Electronics and Informatics, Koszalin, Poland; Silesian University of Technology, Institute of Physics - Center for Science and Education, Gliwice, Poland; Bielefeld University of Applied Sciences, Faculty of Engineering Sciences and Mathematics, Bielefeld, Germany}, abstract={Improved magnetic memory systems belong to the main research topics in spintronics. Here we show micromagnetic simulations used to analyze the energy density of nano-scaled iron spheres. Layers of different thickness, partly coated with iron oxide, were tested in terms of spatial uniformity of the physical system energy. For a single non-coated or iron-oxide coated droplet, the spatial distribution of the total energy is not uniform and depends on the nano-droplet size. Additionally, for systems consisting of four objects, the relation between relative distance and the resultant magnetization distribution was analyzed. The mutual relation between droplet size and the underlying magnetization distribution as well as the character of local energy extrema was investigated. 
The size changes for the four-droplet system were compared with the single object behavior to obtain a criterion for the minimum distance between spheres to behave as a single object. The calculations revealed that the oxidized spheres could be placed closer to each other in comparison to the non-coated system. For the proposed oxide coated system, the increase of this maximum packing density is equal to about 12%, as compared to the non-coated system. © 2022 Elsevier B.V.}, author_keywords={Ferromagnetism; Magnetic anisotropy; Micromagnetism}, keywords={Drops; Ferromagnetic materials; Ferromagnetism; Magnetic anisotropy; Magnetic storage; Magnetization; Spheres, Bit-patterned media; Coated systems; Droplets sizes; Energy distributions; Magnetic memory; Magnetization distribution; Memory systems; Micromagnetisms; Nano-spheres; Single object, Iron oxides}, -references={Richter, H., Dobin, A., Heinonen, O., Gao, K., Veerdonk, R., Lynch, R., Xue, J., Brockie, R., Recording on Bit-Patterned Media at Densities of 1 Tb/in and Beyond (2006) IEEE Trans. Magn., 42, pp. 2255-2260; Koltsov, D.K., Adeyeye, A.O., Welland, M.E., Tricker, D.M., Single-domain circular nanomagnets (1999) Phys. Rev. Lett., 83, p. 1042; Zhang, W., Haas, S., Phase diagram of magnetization reversal processes in nanorings (2010) Phys. Rev. B, 81; He, K., Smith, D.J., McCartney, M.R., Effects of vortex chirality and shape anisotropy on magnetization reversal of Co nanorings (invited) (2010) J. Appl. Phys., 107, p. 09D307; Blachowicz, T., Ehrmann, A., Square nano-magnets as bit-patterned media with doubled possible data density (2017) Mater. Today:. Proc., 4, pp. S226-S231; Döpke, C., Grothe, T., Steblinski, P., Klöcker, M., Sabantina, L., Kosmalska, D., Blachowicz, T., Ehrmann, A., Magnetic Nanofiber Mats for Data Storage and Transfer (2019) Nanomaterials, 9, p. 
92; Steblinski, P., Blachowicz, T., Conception of magnetic memory switched by time dependent current density and current electron spin polarization (2019) International Journal of Electronics and Telecommunications, 65, p. 309; Shutyi, A.M., Sementsov, D.I., Multistability of the Magnetization Reversal of a Nanoparticle with Cubic Anisotropy (2020) JETP Lett., 111, pp. 619-626; Russier, V., De-Montferrand, C., Lalatonne, Y., Motte, L., Magnetization of densely packed interacting magnetic nanoparticles with cubic and uniaxial anisotropies: a Monte Carlo study (2013) J. Appl. Phys., 114; Simeonidis, K., Martinez-Boubeta, C., Serantes, D., Ruta, S., Chubykalo-Fesenko, O., Chantrell, R., Oró-Solé, J., Angelakeris, M., Controlling Magnetization Reversal and Hyperthermia Efficiency in Core−Shell Iron−Iron Oxide Magnetic Nanoparticles by Tuning the Interphase Coupling (2020) ACS Appl. Nano Mater., 3, p. 4465; Nemati, Z., Alonso, J., Khurshid, H., Phan, M.H., Srikanth, H., Core/shell iron/iron oxide nanoparticles: are they promising for magnetic hyperthermia? (2016) RSC Adv., 6, p. 38697; Kim, Y.-W., Park, H.S., Hyun Soon Park Microstructural and Magnetic Characterization of Iron Oxide Nanoparticles Fabricated by Pulsed Wire Evaporation (2019) Electron. Mater. Lett., 15 (6), pp. 665-672; Scholz, W., Fidler, J., Schrefl, T., Suess, D., Dittrich, R., Forster, H., Tsiantos, V., Scalable parallel micromagnetic solvers for magnetic nanostructures (2003) Comput. Mater. Sci., 28, p. 366; (2019), TAO Users Manual, ANL/MCS-TM-322 Rev. 3.11, Argonne National Laboratory; Kostopoulou, A., Brintakis, K., Vasilakaki, M., Trohidou, K.N., Douvalis, A.P., Lascialfari, A., Manna, L., Lappas, A., Assembly-mediated Interplay of Dipolar Interactions and Surface Spin Disorder in Colloidal Maghemite Nanoclusters (2014) Nanoscale, 6 (7), pp. 
3764-3776; Coey, J.M.D., Magnetism and Magnetic Materials (2010), Cambridge University Press; Cullity, B.D., Graham, C.D., Introduction to magnetic materials (2008), 2nd ed. Wiley; Johansson, C., Hanson, M., Pedersen, M.S., Morup, S., Magnetic properties of magnetic liquids with iron-oxide particles—the influence of anisotropy and interactions (1997) J. Magn. Magn. Mater., 173, pp. 5-14; Aguilera-del-Toro, R.H., Aguilera-Granja, F., Torres, M.B., Vega, A., Relation between structural patterns and magnetism in small iron oxide clusters: reentrance of the magnetic moment at high oxidation ratios (2021) Phys. Chem. Chem. Phys., 23, pp. 246-272; Erlebach, A., Hühn, C., Jana, R., Sierka, M., Structure and magnetic properties of (Fe2O3)n clusters (n = 1–5) (2014) Phys. Chem. Chem. Phys., 16 (48), pp. 26421-26426}, +references={Richter, H., Dobin, A., Heinonen, O., Gao, K., Veerdonk, R., Lynch, R., Xue, J., Brockie, R., Recording on Bit-Patterned Media at Densities of 1 Tb/in and Beyond (2006) IEEE Trans. Magn., 42, pp. 2255-2260; Koltsov, D.K., Adeyeye, A.O., Welland, M.E., Tricker, D.M., Single-domain circular nanomagnets (1999) Phys. Rev. Lett., 83, p. 1042; Zhang, W., Haas, S., Phase diagram of magnetization reversal processes in nanorings (2010) Phys. Rev. B, 81; He, K., Smith, D.J., McCartney, M.R., Effects of vortex chirality and shape anisotropy on magnetization reversal of Co nanorings (invited) (2010) J. Appl. Phys., 107, p. 09D307; Blachowicz, T., Ehrmann, A., Square nano-magnets as bit-patterned media with doubled possible data density (2017) Mater. Today:. Proc., 4, pp. S226-S231; Döpke, C., Grothe, T., Steblinski, P., Klöcker, M., Sabantina, L., Kosmalska, D., Blachowicz, T., Ehrmann, A., Magnetic Nanofiber Mats for Data Storage and Transfer (2019) Nanomaterials, 9, p. 
92; Steblinski, P., Blachowicz, T., Conception of magnetic memory switched by time dependent current density and current electron spin polarization (2019) International Journal of Electronics and Telecommunications, 65, p. 309; Shutyi, A.M., Sementsov, D.I., Multistability of the Magnetization Reversal of a Nanoparticle with Cubic Anisotropy (2020) JETP Lett., 111, pp. 619-626; Russier, V., De-Montferrand, C., Lalatonne, Y., Motte, L., Magnetization of densely packed interacting magnetic nanoparticles with cubic and uniaxial anisotropies: a Monte Carlo study (2013) J. Appl. Phys., 114; Simeonidis, K., Martinez-Boubeta, C., Serantes, D., Ruta, S., Chubykalo-Fesenko, O., Chantrell, R., Oró-Solé, J., Angelakeris, M., Controlling Magnetization Reversal and Hyperthermia Efficiency in Core-Shell Iron-Iron Oxide Magnetic Nanoparticles by Tuning the Interphase Coupling (2020) ACS Appl. Nano Mater., 3, p. 4465; Nemati, Z., Alonso, J., Khurshid, H., Phan, M.H., Srikanth, H., Core/shell iron/iron oxide nanoparticles: are they promising for magnetic hyperthermia? (2016) RSC Adv., 6, p. 38697; Kim, Y.-W., Park, H.S., Hyun Soon Park Microstructural and Magnetic Characterization of Iron Oxide Nanoparticles Fabricated by Pulsed Wire Evaporation (2019) Electron. Mater. Lett., 15 (6), pp. 665-672; Scholz, W., Fidler, J., Schrefl, T., Suess, D., Dittrich, R., Forster, H., Tsiantos, V., Scalable parallel micromagnetic solvers for magnetic nanostructures (2003) Comput. Mater. Sci., 28, p. 366; (2019), TAO Users Manual, ANL/MCS-TM-322 Rev. 3.11, Argonne National Laboratory; Kostopoulou, A., Brintakis, K., Vasilakaki, M., Trohidou, K.N., Douvalis, A.P., Lascialfari, A., Manna, L., Lappas, A., Assembly-mediated Interplay of Dipolar Interactions and Surface Spin Disorder in Colloidal Maghemite Nanoclusters (2014) Nanoscale, 6 (7), pp. 
3764-3776; Coey, J.M.D., Magnetism and Magnetic Materials (2010), Cambridge University Press; Cullity, B.D., Graham, C.D., Introduction to magnetic materials (2008), 2nd ed. Wiley; Johansson, C., Hanson, M., Pedersen, M.S., Morup, S., Magnetic properties of magnetic liquids with iron-oxide particles—the influence of anisotropy and interactions (1997) J. Magn. Magn. Mater., 173, pp. 5-14; Aguilera-del-Toro, R.H., Aguilera-Granja, F., Torres, M.B., Vega, A., Relation between structural patterns and magnetism in small iron oxide clusters: reentrance of the magnetic moment at high oxidation ratios (2021) Phys. Chem. Chem. Phys., 23, pp. 246-272; Erlebach, A., Hühn, C., Jana, R., Sierka, M., Structure and magnetic properties of (Fe2O3)n clusters (n = 1-5) (2014) Phys. Chem. Chem. Phys., 16 (48), pp. 26421-26426}, correspondence_address1={Blachowicz, T.; Silesian University of Technology, S. Konarskiego 22B str., 44-100 Gliwice, Poland; email: tomasz.blachowicz@polsl.pl}, publisher={Elsevier B.V.}, issn={03048853}, @@ -80,15 +81,14 @@ def test_scopus_works(): document_type={Article}, source={Scopus}, } -""" +""" # noqa: E501 ) data = read_scopus_bib(file) - assert len(data.articles) == 3 + assert len(data.articles) == 2 # noqa: PLR2004 assert ( data.articles[1].title - == "Analysis of the energy distribution of iron nano-spheres for bit-patterned media" + == "Analysis of the energy distribution of iron nano-spheres for bit-patterned media" # noqa: E501 ) - assert data.articles[0].times_cited == 12 + assert data.articles[0].times_cited == 12 # noqa: PLR2004 assert data.articles[1].times_cited == 0 - assert data.articles[2].times_cited is None - assert len(list(data.citation_pairs)) == 49 + assert len(list(data.citation_pairs)) == 29 # noqa: PLR2004 diff --git a/tests/entities/collection_builders/test_scopus_ris.py b/tests/entities/collection_builders/test_scopus_ris.py index adaddd0..ba2852c 100644 --- a/tests/entities/collection_builders/test_scopus_ris.py +++ 
b/tests/entities/collection_builders/test_scopus_ris.py @@ -3,7 +3,8 @@ from bibx import read_scopus_ris -def test_scopus_works(): +def test_scopus_works() -> None: + """Test that we can read a scopus RIS file.""" file = io.StringIO( """ TY - JOUR @@ -33,15 +34,15 @@ def test_scopus_works(): Papusoi, C., Desai, M., Acharya, R., (2015) J. Phys. D Appl. Phys., 48; UR - https://www.scopus.com/inward/record.uri?eid=2-s2.0-85091557653&doi=10.1063%2f5.0020407&partnerID=40&md5=1471f5e876fae65e040690b345036add ER - -""" +""" # noqa: E501 ) data = read_scopus_ris(file) assert len(data.articles) == 1 (article,) = data.articles assert ( article.title - == "FORC signatures and switching-field distributions of dipolar coupled nanowire-based hysterons" + == "FORC signatures and switching-field distributions of dipolar coupled nanowire-based hysterons" # noqa: E501 ) - assert article.year == 2020 - assert len(list(data.citation_pairs)) == 10 + assert article.year == 2020 # noqa: PLR2004 + assert len(list(data.citation_pairs)) == 10 # noqa: PLR2004 assert article.times_cited is None diff --git a/tests/entities/collection_builders/test_wos.py b/tests/entities/collection_builders/test_wos.py index bf9a053..1c8bb59 100644 --- a/tests/entities/collection_builders/test_wos.py +++ b/tests/entities/collection_builders/test_wos.py @@ -3,7 +3,8 @@ from bibx import read_wos -def test_scopus_works(): +def test_scopus_works() -> None: + """Test that we can read a Scopus file.""" file = io.StringIO( """ FN Thomson Reuters Web of Science™ @@ -105,14 +106,14 @@ def test_scopus_works(): GA EU7BQ UT WOS:000401190100002 ER -""" +""" # noqa: E501 ) data = read_wos(file) assert len(data.articles) == 1 (article,) = data.articles assert ( article.title - == "In situ grazing incidence small-angle X-ray scattering study of solvent vapor annealing in lamellae-forming block copolymer thin films: Trade-off of defects in deswelling" + == "In situ grazing incidence small-angle X-ray scattering study of 
solvent vapor annealing in lamellae-forming block copolymer thin films: Trade-off of defects in deswelling" # noqa: E501 ) - assert len(list(data.citation_pairs)) == 37 + assert len(list(data.citation_pairs)) == 37 # noqa: PLR2004 assert article.times_cited == 0 diff --git a/tests/entities/test_collection.py b/tests/entities/test_collection.py index c4d8801..c831821 100644 --- a/tests/entities/test_collection.py +++ b/tests/entities/test_collection.py @@ -3,6 +3,7 @@ articles = [ Article( + ids={"doi:1"}, authors=["A"], year=2010, title="Aa", @@ -18,6 +19,7 @@ _label=None, ), Article( + ids={"doi:12"}, authors=["B"], year=2000, title="Bb", @@ -33,6 +35,7 @@ _label=None, ), Article( + ids={"doi:13"}, authors=["C"], year=2021, title="Cc", @@ -48,6 +51,7 @@ _label=None, ), Article( + ids={"doi:14"}, authors=["D"], year=2022, title="Dd", @@ -63,6 +67,7 @@ _label=None, ), Article( + ids={"doi:15"}, authors=["E"], year=2005, title="Ee", @@ -78,6 +83,7 @@ _label=None, ), Article( + ids={"doi:16"}, authors=["F"], year=2005, title="Ff", @@ -93,6 +99,7 @@ _label=None, ), Article( + ids={"doi:17"}, authors=["J"], year=2010, title="Jj", @@ -108,6 +115,7 @@ _label=None, ), Article( + ids={"doi:18"}, authors=["H"], year=2000, title="Hh", @@ -123,6 +131,7 @@ _label=None, ), Article( + ids={"doi:19"}, authors=["I"], year=2021, title="Ii", @@ -137,6 +146,7 @@ _label=None, ), Article( + ids={"doi:19"}, authors=["I"], year=None, title="Ii", @@ -151,6 +161,7 @@ _label=None, ), Article( + ids={"doi:19"}, authors=["I"], title="Ii", journal="Aii", @@ -166,29 +177,31 @@ ] -def test_published_by_year(): +def test_published_by_year() -> None: + """Test that we can get the number of articles published by year.""" collection = Collection(articles=articles) res = collection.published_by_year() - assert res.get(2000) == 2 + assert res.get(2000) == 2 # noqa: PLR2004 assert res.get(2001) == 0 assert res.get(2002) == 0 - assert res.get(2005) == 2 - assert res.get(2010) == 2 - assert res.get(2021) 
== 2 + assert res.get(2005) == 2 # noqa: PLR2004 + assert res.get(2010) == 2 # noqa: PLR2004 + assert res.get(2021) == 2 # noqa: PLR2004 assert res.get(2022) == 1 assert res.get(2023) == 0 -def test_cited_by_year(): +def test_cited_by_year() -> None: + """Test that we can get the number of citations by year.""" collection = Collection(articles=articles) res = collection.cited_by_year() - assert res.get(2000) == 22 + assert res.get(2000) == 22 # noqa: PLR2004 assert res.get(2001) == 0 assert res.get(2002) == 0 assert res.get(2005) == 0 assert res.get(2010) == 1 - assert res.get(2021) == 12 - assert res.get(2022) == 2 + assert res.get(2021) == 12 # noqa: PLR2004 + assert res.get(2022) == 2 # noqa: PLR2004 assert res.get(2023) == 0 diff --git a/tests/test_works.py b/tests/test_works.py deleted file mode 100644 index 638023a..0000000 --- a/tests/test_works.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_works(): - assert True