Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions src/bibx/__init__.py
Original file line number Diff line number Diff line change
@@ -5,7 +5,7 @@

from bibx.algorithms.sap import Sap
from bibx.article import Article
from bibx.builders.openalex import HandleReferences, OpenAlexCollectionBuilder
from bibx.builders.openalex import EnrichReferences, OpenAlexCollectionBuilder
from bibx.builders.scopus_bib import ScopusBibCollectionBuilder
from bibx.builders.scopus_ris import ScopusRisCollectionBuilder
from bibx.builders.wos import WosCollectionBuilder
@@ -17,7 +17,7 @@
__all__ = [
"Article",
"Collection",
"HandleReferences",
"EnrichReferences",
"Sap",
"query_openalex",
"read_any",
@@ -26,16 +26,16 @@
"read_wos",
]

__version__ = "0.4.1"
__version__ = "0.5.0"


def query_openalex(
query: str,
limit: int = 600,
references: HandleReferences = HandleReferences.BASIC,
enrich: EnrichReferences = EnrichReferences.BASIC,
) -> Collection:
"""Query OpenAlex and return a collection."""
return OpenAlexCollectionBuilder(query, limit, references=references).build()
return OpenAlexCollectionBuilder(query, limit, enrich=enrich).build()


def read_scopus_bib(*files: TextIO) -> Collection:
Expand Down
32 changes: 18 additions & 14 deletions src/bibx/builders/openalex.py
Original file line number Diff line number Diff line change
@@ -12,14 +12,16 @@

logger = logging.getLogger(__name__)

MAX_REFERENCES = 400
_COMMON_REFERENCES = 400
_MOST_REFERENCES = 2000


class HandleReferences(Enum):
class EnrichReferences(Enum):
"""How to handle references when building an openalex collection."""

BASIC = "basic"
COMMON = "common"
MOST = "most"
FULL = "full"


@@ -30,12 +32,12 @@ def __init__(
self,
query: str,
limit: int = 600,
references: HandleReferences = HandleReferences.BASIC,
enrich: EnrichReferences = EnrichReferences.BASIC,
client: Optional[OpenAlexClient] = None,
) -> None:
self.query = query
self.limit = limit
self.references = references
self.enrich = enrich
self.client = client or OpenAlexClient()

def build(self) -> Collection:
@@ -44,21 +46,23 @@ def build(self) -> Collection:
works = self.client.list_recent_articles(self.query, self.limit)
cache = {work.id: work for work in works}
references: list[str] = []
missing = set()
for work in works:
references.extend(work.referenced_works)
if self.references == HandleReferences.COMMON:
if self.enrich in (EnrichReferences.COMMON, EnrichReferences.MOST):
counter = Counter(references)
most_common = {key for key, _ in counter.most_common(MAX_REFERENCES)}
count = (
_MOST_REFERENCES
if self.enrich == EnrichReferences.MOST
else _COMMON_REFERENCES
)
most_common = {key for key, _ in counter.most_common(count)}
missing = most_common - set(cache.keys())
logger.info("fetching %d missing references", len(missing))
missing_works = self.client.list_articles_by_openalex_id(list(missing))
cache.update({work.id: work for work in missing_works})
if self.references == HandleReferences.FULL:
if self.enrich == EnrichReferences.FULL:
missing = set(references) - set(cache.keys())
logger.info("fetching %d missing references", len(missing))
missing_works = self.client.list_articles_by_openalex_id(list(missing))
cache.update({work.id: work for work in missing_works})

logger.info("fetching %d missing references", len(missing))
missing_works = self.client.list_articles_by_openalex_id(list(missing))
cache.update({work.id: work for work in missing_works})
article_cache = {
openalexid: self._work_to_article(work)
for openalexid, work in cache.items()
Expand Down
8 changes: 4 additions & 4 deletions src/bibx/cli.py
Original file line number Diff line number Diff line change
@@ -16,7 +16,7 @@
)
from bibx.algorithms.preprocess import Preprocess
from bibx.algorithms.sap import Sap
from bibx.builders.openalex import HandleReferences
from bibx.builders.openalex import EnrichReferences
from bibx.collection import Collection

app = typer.Typer()
@@ -80,9 +80,9 @@ def sap(filename: str) -> None:
@app.command()
def openalex(
query: list[str],
references: HandleReferences = typer.Option(
enrich: EnrichReferences = typer.Option(
help="how to handle references",
default=HandleReferences.BASIC,
default=EnrichReferences.BASIC,
),
verbose: bool = typer.Option(
help="be more verbose",
@@ -92,7 +92,7 @@ def openalex(
"""Run the sap algorithm on a seed file of any supported format."""
if verbose:
logging.basicConfig(level=logging.INFO)
c = query_openalex(" ".join(query), references=references)
c = query_openalex(" ".join(query), enrich=enrich)
s = Sap()
graph = s.create_graph(c)
graph = s.clean_graph(graph)
Expand Down
2 changes: 2 additions & 0 deletions src/bibx/clients/openalex.py
Original file line number Diff line number Diff line change
@@ -171,6 +171,8 @@ def list_recent_articles(self, query: str, limit: int = 600) -> list[Work]:

def list_articles_by_openalex_id(self, ids: list[str]) -> list[Work]:
"""List articles by openalex id."""
if not ids:
return []
select = ",".join(Work.model_fields.keys())
results: list[Work] = []
with ThreadPoolExecutor(max_workers=5) as executor:
Expand Down