MITLibraries · ghukill · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026 · Mar 17, 2026
@@ -17,6 +17,7 @@
     LIBGUIDES_GUIDES_URL,
     LIBGUIDES_TOKEN_URL,
 )
+from transmogrifier.exceptions import SkippedRecordEvent
 from transmogrifier.sources.jsontransformer import JSONTransformer
 from transmogrifier.sources.transformer import JSON
 
@@ -112,10 +113,12 @@ def fetch_guides(self, token: str) -> pd.DataFrame:
     def get_guide_by_url(self, url: str) -> pd.Series:
         """Get metadata for a single guide via a URL."""
-        # strip GET parameter preview=...; duplicate for base URL
+        # strip "preview=..." to end of URL and a trailing "/"; duplicate for base URL
-        url = re.sub(r"&preview=[^&]*", "", url)
+        url = re.sub(r"([&?])preview=.*", "", url)
+        url = url.removesuffix("/")
 
         matches = self.api_guides_df[
-            (self.api_guides_df.url == url) | (self.api_guides_df.friendly_url == url)
+            (self.api_guides_df.url.str.lower() == url.lower())
+            | (self.api_guides_df.friendly_url.str.lower() == url.lower())
         ]
         if len(matches) == 1:
             return matches.iloc[0]
@@ -194,6 +197,7 @@ def record_is_excluded(self, source_record: dict) -> bool:
             self._excluded_per_non_libguides_domain(source_record)
             or self._excluded_per_allowed_rules(source_record)
             or self._excluded_per_missing_html(source_record)
+            or self._exclude_sub_page_that_is_root_page(source_record)
         )
 
     @staticmethod
@@ -214,6 +218,31 @@ def _excluded_per_missing_html(self, source_record: dict) -> bool:
         """Exclude a record if the crawled HTML is empty (e.g. a redirect)."""
         return source_record["html_base64"].strip() == ""
 
+    def _exclude_sub_page_that_is_root_page(self, source_record: dict) -> bool:
+        """Exclude sub-pages that are effectively the guide root page.
+
+        For guides that have sub-pages, e.g. `/foo`, there will be a sub-page that is
+        effectively the root page itself.  These sub-pages have the properties
+        "position = 1" and "parent_id = 0" (i.e., a top-level page in first position).
+        These can be skipped.  Note that sub-pages can have sub-sub-pages as well.  These
+        will have "position = 1" but a parent_id of the sub-page.  These should NOT be
+        skipped as they are standalone, unique pages.
+        """
+        url = source_record["url"]
+
+        try:
+            guide = self.api_client.get_guide_by_url(url)
+        except ValueError as exc:
+            logger.warning(exc)
+            return True  # if we cannot find URL in API data, skip (likely crawl noise)
+
+        # if no position, assume root page and do NOT skip
+        if pd.isna(guide.position):
+            return False
+
+        # if position = 1 and top-level page (parent_id = 0), skip
+        return guide.position == "1" and guide.parent_id == "0"
+
     @classmethod
     @lru_cache(maxsize=8)
     def parse_html(cls, html_base64: str) -> Tag:
@@ -260,14 +289,26 @@ def extract_dublin_core_metadata(cls, html_base64: str) -> dict:
     def get_source_link(cls, source_record: dict) -> str:
         """Use the 'friendly' URL from LibGuides API data."""
         url = source_record["url"]
-        guide = cls.api_client.get_guide_by_url(url)
+        try:
+            guide = cls.api_client.get_guide_by_url(url)
+        except ValueError:
+            logger.warning("Could not find guide in API data for URL: %s", url)
+            return url
         friendly_url = guide.get("friendly_url") or ""
         return friendly_url.strip() or url
 
     @classmethod
     def get_source_record_id(cls, source_record: dict) -> str:
         """Use numeric 'id' field from Libguides metadata with 'guides-' prefix."""
-        guide = cls.api_client.get_guide_by_url(cls.get_source_link(source_record))
+        try:
+            guide = cls.api_client.get_guide_by_url(cls.get_source_link(source_record))
+        except ValueError as exc:
+            message = (
+                "Could not determine source record ID, skipping record with URL: "
+                f"{source_record['url']}"
+            )
+            logger.warning(message)
+            raise SkippedRecordEvent(message) from exc
         return f"guides-{guide['id']}"
 
     def get_timdex_record_id(self, source_record: dict) -> str:

@@ -355,7 +355,7 @@ def transform(self, source_record: dict[str, JSON] | Tag) -> timdex.TimdexRecord
             raise DeletedRecordEvent(timdex_record_id)
         if self.record_is_excluded(source_record):
             source_record_id = self.get_source_record_id(source_record)
-            logger.info(f"Record ID {source_record_id} is excluded, skipping.")
+            logger.debug(f"Record ID {source_record_id} is excluded, skipping.")
             raise SkippedRecordEvent(source_record_id)
 
         timdex_record = timdex.TimdexRecord(