From db5ce993db288109e95cc6282177cf27ab68efd0 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Tue, 17 Mar 2026 10:07:23 -0400 Subject: [PATCH 1/3] Skip libguides sub-pages that are root page Why these changes are being introduced: It was discovered that we had duplicates in LibGuides since adding sub-pages. This was because for guides that have sub-pages, the sub-page with "position = 1" is effectively the root guide, just by a non-friendly URL. How this addresses that need: We can quite easily skip these guides by detecting "position = 1" and "name = 'Home'" when analyzing the sub-page metadata from the LibGuides API. Side effects of this change: * Reduce duplication Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/USE-455 --- transmogrifier/sources/json/libguides.py | 28 ++++++++++++++++++++++-- transmogrifier/sources/transformer.py | 2 +- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/transmogrifier/sources/json/libguides.py b/transmogrifier/sources/json/libguides.py index b5f6f39..62d76d0 100644 --- a/transmogrifier/sources/json/libguides.py +++ b/transmogrifier/sources/json/libguides.py @@ -112,10 +112,11 @@ def fetch_guides(self, token: str) -> pd.DataFrame: def get_guide_by_url(self, url: str) -> pd.Series: """Get metadata for a single guide via a URL.""" # strip GET parameter preview=...; duplicate for base URL - url = re.sub(r"&preview=[^&]*", "", url) + url = re.sub(r"([&?])preview=.*", "", url) matches = self.api_guides_df[ - (self.api_guides_df.url == url) | (self.api_guides_df.friendly_url == url) + (self.api_guides_df.url.str.lower() == url.lower()) + | (self.api_guides_df.friendly_url.str.lower() == url.lower()) ] if len(matches) == 1: return matches.iloc[0] @@ -194,6 +195,7 @@ def record_is_excluded(self, source_record: dict) -> bool: self._excluded_per_non_libguides_domain(source_record) or self._excluded_per_allowed_rules(source_record) or self._excluded_per_missing_html(source_record) + or self._exclude_sub_page_that_is_root_page(source_record) ) @staticmethod @@ -214,6 +216,28 @@ def _excluded_per_missing_html(self, source_record: dict) -> bool: """Exclude a record if the crawled HTML is empty (e.g. a redirect).""" return source_record["html_base64"].strip() == "" + def _exclude_sub_page_that_is_root_page(self, source_record: dict) -> bool: + """Exclude sub-pages that are effectively the guide root page. + + For guides that have sub-pages, e.g. `/foo`, there will be a sug-page that is + effectively the root page itself. These sub-pages will have a "position = 1" + and "name = 'Home'" properties. These can be skipped. + """ + url = source_record["url"] + + try: + guide = self.api_client.get_guide_by_url(url) + except ValueError as exc: + logger.warning(exc) + return True # if we cannot find URL in API data, skip (likely crawl noise) + + # if no position, assume root page and do NOT skip + if pd.isna(guide.position): + return False + + # if position = 1 and name = 'Home', skip + return int(guide.position) == 1 and guide["name"].strip() == "Home" + @classmethod @lru_cache(maxsize=8) def parse_html(cls, html_base64: str) -> Tag: diff --git a/transmogrifier/sources/transformer.py b/transmogrifier/sources/transformer.py index 4860cf3..05a6853 100644 --- a/transmogrifier/sources/transformer.py +++ b/transmogrifier/sources/transformer.py @@ -355,7 +355,7 @@ def transform(self, source_record: dict[str, JSON] | Tag) -> timdex.TimdexRecord raise DeletedRecordEvent(timdex_record_id) if self.record_is_excluded(source_record): source_record_id = self.get_source_record_id(source_record) - logger.info(f"Record ID {source_record_id} is excluded, skipping.") + logger.debug(f"Record ID {source_record_id} is excluded, skipping.") raise SkippedRecordEvent(source_record_id) timdex_record = timdex.TimdexRecord( From edb16156e3da18d952947230df93c4e6f9f817f1 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Tue, 17 Mar 2026 11:46:04 -0400 Subject: [PATCH 2/3] Small tweak to sub-page exclusion logic --- transmogrifier/sources/json/libguides.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/transmogrifier/sources/json/libguides.py b/transmogrifier/sources/json/libguides.py index 62d76d0..1d43a71 100644 --- a/transmogrifier/sources/json/libguides.py +++ b/transmogrifier/sources/json/libguides.py @@ -113,6 +113,7 @@ def get_guide_by_url(self, url: str) -> pd.Series: """Get metadata for a single guide via a URL.""" # strip GET parameter preview=...; duplicate for base URL url = re.sub(r"([&?])preview=.*", "", url) + url = url.removesuffix("/") matches = self.api_guides_df[ (self.api_guides_df.url.str.lower() == url.lower()) @@ -219,9 +220,12 @@ def _excluded_per_missing_html(self, source_record: dict) -> bool: def _exclude_sub_page_that_is_root_page(self, source_record: dict) -> bool: """Exclude sub-pages that are effectively the guide root page. - For guides that have sub-pages, e.g. `/foo`, there will be a sug-page that is + For guides that have sub-pages, e.g. `/foo`, there will be a sub-page that is effectively the root page itself. These sub-pages will have a "position = 1" - and "name = 'Home'" properties. These can be skipped. + and "parent_id = 0" properties (i.e., a top-level page in the first position). + These can be skipped. Note that sub-pages can have sub-sub-pages as well. These + will have "position = 1" but a parent_id of the sub-page. These should NOT be + skipped as they are standalone, unique pages. """ url = source_record["url"] @@ -235,8 +239,8 @@ def _exclude_sub_page_that_is_root_page(self, source_record: dict) -> bool: if pd.isna(guide.position): return False - # if position = 1 and name = 'Home', skip - return int(guide.position) == 1 and guide["name"].strip() == "Home" + # if position = 1 and top-level page (parent_id = 0), skip + return guide.position == "1" and guide.parent_id == "0" @classmethod @lru_cache(maxsize=8) From 5b8945df2d6fef7f686d07c9acfcc36cf5b7a38f Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Tue, 17 Mar 2026 13:22:46 -0400 Subject: [PATCH 3/3] Warn on failed URL lookup and raise SkippedRecordEvent --- transmogrifier/sources/json/libguides.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/transmogrifier/sources/json/libguides.py b/transmogrifier/sources/json/libguides.py index 1d43a71..940e422 100644 --- a/transmogrifier/sources/json/libguides.py +++ b/transmogrifier/sources/json/libguides.py @@ -17,6 +17,7 @@ LIBGUIDES_GUIDES_URL, LIBGUIDES_TOKEN_URL, ) +from transmogrifier.exceptions import SkippedRecordEvent from transmogrifier.sources.jsontransformer import JSONTransformer from transmogrifier.sources.transformer import JSON @@ -288,14 +289,26 @@ def extract_dublin_core_metadata(cls, html_base64: str) -> dict: def get_source_link(cls, source_record: dict) -> str: """Use the 'friendly' URL from LibGuides API data.""" url = source_record["url"] - guide = cls.api_client.get_guide_by_url(url) + try: + guide = cls.api_client.get_guide_by_url(url) + except ValueError: + logger.warning("Could not find guide in API data for URL: %s", url) + return url friendly_url = guide.get("friendly_url") or "" return friendly_url.strip() or url @classmethod def get_source_record_id(cls, source_record: dict) -> str: """Use numeric 'id' field from Libguides metadata with 'guides-' prefix.""" - guide = cls.api_client.get_guide_by_url(cls.get_source_link(source_record)) + try: + guide = cls.api_client.get_guide_by_url(cls.get_source_link(source_record)) + except ValueError as exc: + message = ( + "Could not determine source record ID, skipping record with URL: " + f"{source_record['url']}" + ) + logger.warning(message) + raise SkippedRecordEvent(message) from exc return f"guides-{guide['id']}" def get_timdex_record_id(self, source_record: dict) -> str: