Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 45 additions & 4 deletions transmogrifier/sources/json/libguides.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
LIBGUIDES_GUIDES_URL,
LIBGUIDES_TOKEN_URL,
)
from transmogrifier.exceptions import SkippedRecordEvent
from transmogrifier.sources.jsontransformer import JSONTransformer
from transmogrifier.sources.transformer import JSON

Expand Down Expand Up @@ -112,10 +113,12 @@ def fetch_guides(self, token: str) -> pd.DataFrame:
def get_guide_by_url(self, url: str) -> pd.Series:
"""Get metadata for a single guide via a URL."""
# strip GET parameter preview=...; duplicate for base URL
url = re.sub(r"&preview=[^&]*", "", url)
url = re.sub(r"([&?])preview=.*", "", url)
url = url.removesuffix("/")

matches = self.api_guides_df[
(self.api_guides_df.url == url) | (self.api_guides_df.friendly_url == url)
(self.api_guides_df.url.str.lower() == url.lower())
| (self.api_guides_df.friendly_url.str.lower() == url.lower())
]
if len(matches) == 1:
return matches.iloc[0]
Expand Down Expand Up @@ -194,6 +197,7 @@ def record_is_excluded(self, source_record: dict) -> bool:
self._excluded_per_non_libguides_domain(source_record)
or self._excluded_per_allowed_rules(source_record)
or self._excluded_per_missing_html(source_record)
or self._exclude_sub_page_that_is_root_page(source_record)
)

@staticmethod
Expand All @@ -214,6 +218,31 @@ def _excluded_per_missing_html(self, source_record: dict) -> bool:
"""Exclude a record if the crawled HTML is empty (e.g. a redirect)."""
return source_record["html_base64"].strip() == ""

def _exclude_sub_page_that_is_root_page(self, source_record: dict) -> bool:
    """Report whether a crawled sub-page merely duplicates its guide's root page.

    A guide with sub-pages, e.g. `/foo`, also exposes one sub-page that is
    effectively the root page itself. That page is a top-level page in the
    first position, i.e. it carries "position = 1" and "parent_id = 0", and it
    can be skipped. Sub-sub-pages also have "position = 1", but their parent_id
    points at the owning sub-page; they are standalone, unique pages and must
    NOT be skipped.

    Args:
        source_record: Crawled record; its "url" value is looked up in the
            LibGuides API data via `self.api_client.get_guide_by_url`.

    Returns:
        True when the record should be excluded, False otherwise.
    """
    page_url = source_record["url"]

    try:
        page = self.api_client.get_guide_by_url(page_url)
    except ValueError as exc:
        # URL is unknown to the API data: treat it as crawl noise and skip it
        logger.warning(exc)
        return True

    # a page without a position is assumed to be a root page, so keep it
    if pd.isna(page.position):
        return False

    # skip only a first-position page that is also top-level (parent_id = 0)
    in_first_position = page.position == "1"
    return in_first_position and page.parent_id == "0"

@classmethod
@lru_cache(maxsize=8)
def parse_html(cls, html_base64: str) -> Tag:
Expand Down Expand Up @@ -260,14 +289,26 @@ def extract_dublin_core_metadata(cls, html_base64: str) -> dict:
def get_source_link(cls, source_record: dict) -> str:
    """Use the 'friendly' URL from LibGuides API data, else the crawled URL.

    The guide is looked up exactly once, inside the ``try``, so a URL that is
    missing from the API data is logged and answered with the crawled URL
    instead of letting the ``ValueError`` escape.

    Args:
        source_record: Crawled record; its "url" value is the lookup key.

    Returns:
        The guide's stripped "friendly_url" when it is a non-blank string,
        otherwise the record's own "url".
    """
    url = source_record["url"]
    try:
        guide = cls.api_client.get_guide_by_url(url)
    except ValueError:
        logger.warning("Could not find guide in API data for URL: %s", url)
        return url

    friendly_url = guide.get("friendly_url")
    # Only strings can be stripped. A missing value may surface as None or,
    # presumably for a pandas row, as NaN -- NaN is truthy and has no .strip().
    if not isinstance(friendly_url, str):
        return url
    return friendly_url.strip() or url

@classmethod
def get_source_record_id(cls, source_record: dict) -> str:
    """Use numeric 'id' field from Libguides metadata with 'guides-' prefix.

    The guide is looked up exactly once, inside the ``try``, so a failed
    lookup is always converted into a skip instead of escaping as a bare
    ``ValueError``.

    Args:
        source_record: Crawled record; its source link (see
            ``get_source_link``) is the key used for the API data lookup.

    Returns:
        The guide's "id" value prefixed with "guides-".

    Raises:
        SkippedRecordEvent: If the guide lookup raises ``ValueError``; the
            original error is chained as the cause.
    """
    try:
        guide = cls.api_client.get_guide_by_url(cls.get_source_link(source_record))
    except ValueError as exc:
        message = (
            "Could not determine source record ID, skipping record with URL: "
            f"{source_record['url']}"
        )
        logger.warning(message)
        raise SkippedRecordEvent(message) from exc
    return f"guides-{guide['id']}"

def get_timdex_record_id(self, source_record: dict) -> str:
Expand Down
2 changes: 1 addition & 1 deletion transmogrifier/sources/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ def transform(self, source_record: dict[str, JSON] | Tag) -> timdex.TimdexRecord
raise DeletedRecordEvent(timdex_record_id)
if self.record_is_excluded(source_record):
source_record_id = self.get_source_record_id(source_record)
logger.info(f"Record ID {source_record_id} is excluded, skipping.")
logger.debug(f"Record ID {source_record_id} is excluded, skipping.")
raise SkippedRecordEvent(source_record_id)

timdex_record = timdex.TimdexRecord(
Expand Down
Loading