diff --git a/src/bibx/__init__.py b/src/bibx/__init__.py
index cfb5d5f..96ddce5 100644
--- a/src/bibx/__init__.py
+++ b/src/bibx/__init__.py
@@ -5,7 +5,7 @@
 
 from bibx.algorithms.sap import Sap
 from bibx.article import Article
-from bibx.builders.openalex import HandleReferences, OpenAlexCollectionBuilder
+from bibx.builders.openalex import EnrichReferences, OpenAlexCollectionBuilder
 from bibx.builders.scopus_bib import ScopusBibCollectionBuilder
 from bibx.builders.scopus_ris import ScopusRisCollectionBuilder
 from bibx.builders.wos import WosCollectionBuilder
@@ -17,7 +17,7 @@
 __all__ = [
     "Article",
     "Collection",
-    "HandleReferences",
+    "EnrichReferences",
     "Sap",
     "query_openalex",
     "read_any",
@@ -26,16 +26,16 @@
     "read_wos",
 ]
 
-__version__ = "0.4.1"
+__version__ = "0.5.0"
 
 
 def query_openalex(
     query: str,
     limit: int = 600,
-    references: HandleReferences = HandleReferences.BASIC,
+    enrich: EnrichReferences = EnrichReferences.BASIC,
 ) -> Collection:
     """Query OpenAlex and return a collection."""
-    return OpenAlexCollectionBuilder(query, limit, references=references).build()
+    return OpenAlexCollectionBuilder(query, limit, enrich=enrich).build()
 
 
 def read_scopus_bib(*files: TextIO) -> Collection:
diff --git a/src/bibx/builders/openalex.py b/src/bibx/builders/openalex.py
index 902c72d..2f31583 100644
--- a/src/bibx/builders/openalex.py
+++ b/src/bibx/builders/openalex.py
@@ -12,14 +12,16 @@
 
 logger = logging.getLogger(__name__)
 
-MAX_REFERENCES = 400
+_COMMON_REFERENCES = 400
+_MOST_REFERENCES = 2000
 
 
-class HandleReferences(Enum):
+class EnrichReferences(Enum):
     """How to handle references when building an openalex collection."""
 
     BASIC = "basic"
     COMMON = "common"
+    MOST = "most"
     FULL = "full"
 
 
@@ -30,12 +32,12 @@ def __init__(
         self,
         query: str,
         limit: int = 600,
-        references: HandleReferences = HandleReferences.BASIC,
+        enrich: EnrichReferences = EnrichReferences.BASIC,
         client: Optional[OpenAlexClient] = None,
     ) -> None:
         self.query = query
         self.limit = limit
-        self.references = references
+        self.enrich = enrich
         self.client = client or OpenAlexClient()
 
     def build(self) -> Collection:
@@ -44,21 +46,23 @@ def build(self) -> Collection:
         works = self.client.list_recent_articles(self.query, self.limit)
         cache = {work.id: work for work in works}
         references: list[str] = []
+        missing = set()
         for work in works:
             references.extend(work.referenced_works)
-        if self.references == HandleReferences.COMMON:
+        if self.enrich in (EnrichReferences.COMMON, EnrichReferences.MOST):
             counter = Counter(references)
-            most_common = {key for key, _ in counter.most_common(MAX_REFERENCES)}
+            count = (
+                _MOST_REFERENCES
+                if self.enrich == EnrichReferences.MOST
+                else _COMMON_REFERENCES
+            )
+            most_common = {key for key, _ in counter.most_common(count)}
             missing = most_common - set(cache.keys())
-            logger.info("fetching %d missing references", len(missing))
-            missing_works = self.client.list_articles_by_openalex_id(list(missing))
-            cache.update({work.id: work for work in missing_works})
-        if self.references == HandleReferences.FULL:
+        if self.enrich == EnrichReferences.FULL:
             missing = set(references) - set(cache.keys())
-            logger.info("fetching %d missing references", len(missing))
-            missing_works = self.client.list_articles_by_openalex_id(list(missing))
-            cache.update({work.id: work for work in missing_works})
-
+        logger.info("fetching %d missing references", len(missing))
+        missing_works = self.client.list_articles_by_openalex_id(list(missing))
+        cache.update({work.id: work for work in missing_works})
         article_cache = {
             openalexid: self._work_to_article(work)
             for openalexid, work in cache.items()
diff --git a/src/bibx/cli.py b/src/bibx/cli.py
index 453b6e4..559c223 100644
--- a/src/bibx/cli.py
+++ b/src/bibx/cli.py
@@ -16,7 +16,7 @@
 )
 from bibx.algorithms.preprocess import Preprocess
 from bibx.algorithms.sap import Sap
-from bibx.builders.openalex import HandleReferences
+from bibx.builders.openalex import EnrichReferences
 from bibx.collection import Collection
 
 app = typer.Typer()
@@ -80,9 +80,9 @@ def sap(filename: str) -> None:
 @app.command()
 def openalex(
     query: list[str],
-    references: HandleReferences = typer.Option(
+    enrich: EnrichReferences = typer.Option(
         help="how to handle references",
-        default=HandleReferences.BASIC,
+        default=EnrichReferences.BASIC,
     ),
     verbose: bool = typer.Option(
         help="be more verbose",
@@ -92,7 +92,7 @@ def openalex(
     """Run the sap algorithm on a seed file of any supported format."""
     if verbose:
         logging.basicConfig(level=logging.INFO)
-    c = query_openalex(" ".join(query), references=references)
+    c = query_openalex(" ".join(query), enrich=enrich)
     s = Sap()
     graph = s.create_graph(c)
     graph = s.clean_graph(graph)
diff --git a/src/bibx/clients/openalex.py b/src/bibx/clients/openalex.py
index 2d4981f..726ba58 100644
--- a/src/bibx/clients/openalex.py
+++ b/src/bibx/clients/openalex.py
@@ -171,6 +171,8 @@ def list_recent_articles(self, query: str, limit: int = 600) -> list[Work]:
 
     def list_articles_by_openalex_id(self, ids: list[str]) -> list[Work]:
         """List articles by openalex id."""
+        if not ids:
+            return []
         select = ",".join(Work.model_fields.keys())
         results: list[Work] = []
         with ThreadPoolExecutor(max_workers=5) as executor: