From db5ce993db288109e95cc6282177cf27ab68efd0 Mon Sep 17 00:00:00 2001
From: Graham Hukill <ghukill@gmail.com>
Date: Tue, 17 Mar 2026 10:07:23 -0400
Subject: [PATCH 1/3] Skip libguides sub-pages that are root page

Why these changes are being introduced:

We discovered duplicate LibGuides records after adding support for sub-pages.  For
guides that have sub-pages, the sub-page with "position = 1" is effectively the root
guide itself, just reachable via a non-friendly URL.

How this addresses that need:

We can skip these duplicate sub-pages by detecting "position = 1" and
"name = 'Home'" when analyzing the sub-page metadata from the LibGuides API.

Side effects of this change:
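* Reduce duplication
* Guide URL lookups against LibGuides API data are now case-insensitive
* Excluded records are now logged at DEBUG instead of INFO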

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/USE-455
---
 transmogrifier/sources/json/libguides.py | 28 ++++++++++++++++++++++--
 transmogrifier/sources/transformer.py    |  2 +-
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/transmogrifier/sources/json/libguides.py b/transmogrifier/sources/json/libguides.py
index b5f6f39..62d76d0 100644
--- a/transmogrifier/sources/json/libguides.py
+++ b/transmogrifier/sources/json/libguides.py
@@ -112,10 +112,11 @@ def fetch_guides(self, token: str) -> pd.DataFrame:
     def get_guide_by_url(self, url: str) -> pd.Series:
         """Get metadata for a single guide via a URL."""
         # strip GET parameter preview=...; duplicate for base URL
-        url = re.sub(r"&preview=[^&]*", "", url)
+        url = re.sub(r"([&?])preview=.*", "", url)
 
         matches = self.api_guides_df[
-            (self.api_guides_df.url == url) | (self.api_guides_df.friendly_url == url)
+            (self.api_guides_df.url.str.lower() == url.lower())
+            | (self.api_guides_df.friendly_url.str.lower() == url.lower())
         ]
         if len(matches) == 1:
             return matches.iloc[0]
@@ -194,6 +195,7 @@ def record_is_excluded(self, source_record: dict) -> bool:
             self._excluded_per_non_libguides_domain(source_record)
             or self._excluded_per_allowed_rules(source_record)
             or self._excluded_per_missing_html(source_record)
+            or self._exclude_sub_page_that_is_root_page(source_record)
         )
 
     @staticmethod
@@ -214,6 +216,28 @@ def _excluded_per_missing_html(self, source_record: dict) -> bool:
         """Exclude a record if the crawled HTML is empty (e.g. a redirect)."""
         return source_record["html_base64"].strip() == ""
 
+    def _exclude_sub_page_that_is_root_page(self, source_record: dict) -> bool:
+        """Exclude sub-pages that are effectively the guide root page.
+
+        For guides that have sub-pages, e.g. `/foo`, there will be a sug-page that is
+        effectively the root page itself.  These sub-pages will have a "position = 1"
+        and "name = 'Home'" properties.  These can be skipped.
+        """
+        url = source_record["url"]
+
+        try:
+            guide = self.api_client.get_guide_by_url(url)
+        except ValueError as exc:
+            logger.warning(exc)
+            return True  # if we cannot find URL in API data, skip (likely crawl noise)
+
+        # if no position, assume root page and do NOT skip
+        if pd.isna(guide.position):
+            return False
+
+        # if position = 1 and name = 'Home', skip
+        return int(guide.position) == 1 and guide["name"].strip() == "Home"
+
     @classmethod
     @lru_cache(maxsize=8)
     def parse_html(cls, html_base64: str) -> Tag:
diff --git a/transmogrifier/sources/transformer.py b/transmogrifier/sources/transformer.py
index 4860cf3..05a6853 100644
--- a/transmogrifier/sources/transformer.py
+++ b/transmogrifier/sources/transformer.py
@@ -355,7 +355,7 @@ def transform(self, source_record: dict[str, JSON] | Tag) -> timdex.TimdexRecord
             raise DeletedRecordEvent(timdex_record_id)
         if self.record_is_excluded(source_record):
             source_record_id = self.get_source_record_id(source_record)
-            logger.info(f"Record ID {source_record_id} is excluded, skipping.")
+            logger.debug(f"Record ID {source_record_id} is excluded, skipping.")
             raise SkippedRecordEvent(source_record_id)
 
         timdex_record = timdex.TimdexRecord(

From edb16156e3da18d952947230df93c4e6f9f817f1 Mon Sep 17 00:00:00 2001
From: Graham Hukill <ghukill@gmail.com>
Date: Tue, 17 Mar 2026 11:46:04 -0400
Subject: [PATCH 2/3] Small tweak to sub-page exclusion logic

---
 transmogrifier/sources/json/libguides.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/transmogrifier/sources/json/libguides.py b/transmogrifier/sources/json/libguides.py
index 62d76d0..1d43a71 100644
--- a/transmogrifier/sources/json/libguides.py
+++ b/transmogrifier/sources/json/libguides.py
@@ -113,6 +113,7 @@ def get_guide_by_url(self, url: str) -> pd.Series:
         """Get metadata for a single guide via a URL."""
         # strip GET parameter preview=...; duplicate for base URL
         url = re.sub(r"([&?])preview=.*", "", url)
+        url = url.removesuffix("/")
 
         matches = self.api_guides_df[
             (self.api_guides_df.url.str.lower() == url.lower())
@@ -219,9 +220,12 @@ def _excluded_per_missing_html(self, source_record: dict) -> bool:
     def _exclude_sub_page_that_is_root_page(self, source_record: dict) -> bool:
         """Exclude sub-pages that are effectively the guide root page.
 
-        For guides that have sub-pages, e.g. `/foo`, there will be a sug-page that is
+        For guides that have sub-pages, e.g. `/foo`, there will be a sub-page that is
         effectively the root page itself.  These sub-pages will have a "position = 1"
-        and "name = 'Home'" properties.  These can be skipped.
+        and "parent_id = 0" properties (i.e., a top-level page in the first position).
+        These can be skipped.  Note that sub-pages can have sub-sub-pages as well.  These
+        will have "position = 1" but a parent_id of the sub-page.  These should NOT be
+        skipped as they are standalone, unique pages.
         """
         url = source_record["url"]
 
@@ -235,8 +239,8 @@ def _exclude_sub_page_that_is_root_page(self, source_record: dict) -> bool:
         if pd.isna(guide.position):
             return False
 
-        # if position = 1 and name = 'Home', skip
-        return int(guide.position) == 1 and guide["name"].strip() == "Home"
+        # if position = 1 and top-level page (parent_id = 0), skip
+        return guide.position == "1" and guide.parent_id == "0"
 
     @classmethod
     @lru_cache(maxsize=8)

From 5b8945df2d6fef7f686d07c9acfcc36cf5b7a38f Mon Sep 17 00:00:00 2001
From: Graham Hukill <ghukill@gmail.com>
Date: Tue, 17 Mar 2026 13:22:46 -0400
Subject: [PATCH 3/3] Warn on failed URL lookup and raise SkippedRecordEvent

---
 transmogrifier/sources/json/libguides.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/transmogrifier/sources/json/libguides.py b/transmogrifier/sources/json/libguides.py
index 1d43a71..940e422 100644
--- a/transmogrifier/sources/json/libguides.py
+++ b/transmogrifier/sources/json/libguides.py
@@ -17,6 +17,7 @@
     LIBGUIDES_GUIDES_URL,
     LIBGUIDES_TOKEN_URL,
 )
+from transmogrifier.exceptions import SkippedRecordEvent
 from transmogrifier.sources.jsontransformer import JSONTransformer
 from transmogrifier.sources.transformer import JSON
 
@@ -288,14 +289,26 @@ def extract_dublin_core_metadata(cls, html_base64: str) -> dict:
     def get_source_link(cls, source_record: dict) -> str:
         """Use the 'friendly' URL from LibGuides API data."""
         url = source_record["url"]
-        guide = cls.api_client.get_guide_by_url(url)
+        try:
+            guide = cls.api_client.get_guide_by_url(url)
+        except ValueError:
+            logger.warning("Could not find guide in API data for URL: %s", url)
+            return url
         friendly_url = guide.get("friendly_url") or ""
         return friendly_url.strip() or url
 
     @classmethod
     def get_source_record_id(cls, source_record: dict) -> str:
         """Use numeric 'id' field from Libguides metadata with 'guides-' prefix."""
-        guide = cls.api_client.get_guide_by_url(cls.get_source_link(source_record))
+        try:
+            guide = cls.api_client.get_guide_by_url(cls.get_source_link(source_record))
+        except ValueError as exc:
+            message = (
+                "Could not determine source record ID, skipping record with URL: "
+                f"{source_record['url']}"
+            )
+            logger.warning(message)
+            raise SkippedRecordEvent(message) from exc
         return f"guides-{guide['id']}"
 
     def get_timdex_record_id(self, source_record: dict) -> str: