From 1b0d532cb32280f15122e5778fcdc4b90f4fde9c Mon Sep 17 00:00:00 2001 From: Oscar Arbelaez Date: Mon, 3 Feb 2025 10:52:41 +0000 Subject: [PATCH 1/3] Add option to pull most references --- src/bibx/builders/openalex.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/bibx/builders/openalex.py b/src/bibx/builders/openalex.py index 902c72d..a999ae0 100644 --- a/src/bibx/builders/openalex.py +++ b/src/bibx/builders/openalex.py @@ -12,7 +12,8 @@ logger = logging.getLogger(__name__) -MAX_REFERENCES = 400 +_COMMON_REFERENCES = 400 +_MOST_REFERENCES = 2000 class HandleReferences(Enum): @@ -20,6 +21,7 @@ class HandleReferences(Enum): BASIC = "basic" COMMON = "common" + MOST = "most" FULL = "full" @@ -44,21 +46,23 @@ def build(self) -> Collection: works = self.client.list_recent_articles(self.query, self.limit) cache = {work.id: work for work in works} references: list[str] = [] + missing = set() for work in works: references.extend(work.referenced_works) - if self.references == HandleReferences.COMMON: + if self.references in (HandleReferences.COMMON, HandleReferences.MOST): counter = Counter(references) - most_common = {key for key, _ in counter.most_common(MAX_REFERENCES)} + count = ( + _MOST_REFERENCES + if self.references == HandleReferences.MOST + else _COMMON_REFERENCES + ) + most_common = {key for key, _ in counter.most_common(count)} missing = most_common - set(cache.keys()) - logger.info("fetching %d missing references", len(missing)) - missing_works = self.client.list_articles_by_openalex_id(list(missing)) - cache.update({work.id: work for work in missing_works}) if self.references == HandleReferences.FULL: missing = set(references) - set(cache.keys()) - logger.info("fetching %d missing references", len(missing)) - missing_works = self.client.list_articles_by_openalex_id(list(missing)) - cache.update({work.id: work for work in missing_works}) - + logger.info("fetching %d missing references", len(missing)) + missing_works = self.client.list_articles_by_openalex_id(list(missing)) + cache.update({work.id: work for work in missing_works}) article_cache = { openalexid: self._work_to_article(work) for openalexid, work in cache.items() From 581ba323e6afda6744165cd1b081d8063b49d4a2 Mon Sep 17 00:00:00 2001 From: Oscar Arbelaez Date: Mon, 3 Feb 2025 10:56:46 +0000 Subject: [PATCH 2/3] Change to enrich references --- src/bibx/__init__.py | 8 ++++---- src/bibx/builders/openalex.py | 12 ++++++------ src/bibx/cli.py | 8 ++++---- src/bibx/clients/openalex.py | 2 ++ 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/bibx/__init__.py b/src/bibx/__init__.py index cfb5d5f..7f771d5 100644 --- a/src/bibx/__init__.py +++ b/src/bibx/__init__.py @@ -5,7 +5,7 @@ from bibx.algorithms.sap import Sap from bibx.article import Article -from bibx.builders.openalex import HandleReferences, OpenAlexCollectionBuilder +from bibx.builders.openalex import EnrichReferences, OpenAlexCollectionBuilder from bibx.builders.scopus_bib import ScopusBibCollectionBuilder from bibx.builders.scopus_ris import ScopusRisCollectionBuilder from bibx.builders.wos import WosCollectionBuilder @@ -17,7 +17,7 @@ __all__ = [ "Article", "Collection", - "HandleReferences", + "EnrichReferences", "Sap", "query_openalex", "read_any", @@ -32,10 +32,10 @@ def query_openalex( query: str, limit: int = 600, - references: HandleReferences = HandleReferences.BASIC, + enrich: EnrichReferences = EnrichReferences.BASIC, ) -> Collection: """Query OpenAlex and return a collection.""" - return OpenAlexCollectionBuilder(query, limit, references=references).build() + return OpenAlexCollectionBuilder(query, limit, enrich=enrich).build() def read_scopus_bib(*files: TextIO) -> Collection: diff --git a/src/bibx/builders/openalex.py b/src/bibx/builders/openalex.py index a999ae0..2f31583 100644 --- a/src/bibx/builders/openalex.py +++ b/src/bibx/builders/openalex.py @@ -16,7 +16,7 @@ _MOST_REFERENCES = 2000 -class HandleReferences(Enum): +class EnrichReferences(Enum): """How to handle references when building an openalex collection.""" BASIC = "basic" @@ -32,12 +32,12 @@ def __init__( self, query: str, limit: int = 600, - references: HandleReferences = HandleReferences.BASIC, + enrich: EnrichReferences = EnrichReferences.BASIC, client: Optional[OpenAlexClient] = None, ) -> None: self.query = query self.limit = limit - self.references = references + self.enrich = enrich self.client = client or OpenAlexClient() def build(self) -> Collection: @@ -49,16 +49,16 @@ def build(self) -> Collection: missing = set() for work in works: references.extend(work.referenced_works) - if self.references in (HandleReferences.COMMON, HandleReferences.MOST): + if self.enrich in (EnrichReferences.COMMON, EnrichReferences.MOST): counter = Counter(references) count = ( _MOST_REFERENCES - if self.references == HandleReferences.MOST + if self.enrich == EnrichReferences.MOST else _COMMON_REFERENCES ) most_common = {key for key, _ in counter.most_common(count)} missing = most_common - set(cache.keys()) - if self.references == HandleReferences.FULL: + if self.enrich == EnrichReferences.FULL: missing = set(references) - set(cache.keys()) logger.info("fetching %d missing references", len(missing)) missing_works = self.client.list_articles_by_openalex_id(list(missing)) diff --git a/src/bibx/cli.py b/src/bibx/cli.py index 453b6e4..559c223 100644 --- a/src/bibx/cli.py +++ b/src/bibx/cli.py @@ -16,7 +16,7 @@ ) from bibx.algorithms.preprocess import Preprocess from bibx.algorithms.sap import Sap -from bibx.builders.openalex import HandleReferences +from bibx.builders.openalex import EnrichReferences from bibx.collection import Collection app = typer.Typer() @@ -80,9 +80,9 @@ def sap(filename: str) -> None: @app.command() def openalex( query: list[str], - references: HandleReferences = typer.Option( + enrich: EnrichReferences = typer.Option( help="how to handle references", - default=HandleReferences.BASIC, + default=EnrichReferences.BASIC, ), verbose: bool = typer.Option( help="be more verbose", @@ -92,7 +92,7 @@ def openalex( """Run the sap algorithm on a seed file of any supported format.""" if verbose: logging.basicConfig(level=logging.INFO) - c = query_openalex(" ".join(query), references=references) + c = query_openalex(" ".join(query), enrich=enrich) s = Sap() graph = s.create_graph(c) graph = s.clean_graph(graph) diff --git a/src/bibx/clients/openalex.py b/src/bibx/clients/openalex.py index 2d4981f..726ba58 100644 --- a/src/bibx/clients/openalex.py +++ b/src/bibx/clients/openalex.py @@ -171,6 +171,8 @@ def list_recent_articles(self, query: str, limit: int = 600) -> list[Work]: def list_articles_by_openalex_id(self, ids: list[str]) -> list[Work]: """List articles by openalex id.""" + if not ids: + return [] select = ",".join(Work.model_fields.keys()) results: list[Work] = [] with ThreadPoolExecutor(max_workers=5) as executor: From f02f396b7971d15ee51308909a13dc1b2bb874cd Mon Sep 17 00:00:00 2001 From: Oscar Arbelaez Date: Mon, 3 Feb 2025 10:56:52 +0000 Subject: [PATCH 3/3] Update version --- src/bibx/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bibx/__init__.py b/src/bibx/__init__.py index 7f771d5..96ddce5 100644 --- a/src/bibx/__init__.py +++ b/src/bibx/__init__.py @@ -26,7 +26,7 @@ "read_wos", ] -__version__ = "0.4.1" +__version__ = "0.5.0" def query_openalex(