diff --git a/transmogrifier/sources/json/libguides.py b/transmogrifier/sources/json/libguides.py index b5f6f39..940e422 100644 --- a/transmogrifier/sources/json/libguides.py +++ b/transmogrifier/sources/json/libguides.py @@ -17,6 +17,7 @@ LIBGUIDES_GUIDES_URL, LIBGUIDES_TOKEN_URL, ) +from transmogrifier.exceptions import SkippedRecordEvent from transmogrifier.sources.jsontransformer import JSONTransformer from transmogrifier.sources.transformer import JSON @@ -112,10 +113,12 @@ def fetch_guides(self, token: str) -> pd.DataFrame: def get_guide_by_url(self, url: str) -> pd.Series: """Get metadata for a single guide via a URL.""" # strip GET parameter preview=...; duplicate for base URL - url = re.sub(r"&preview=[^&]*", "", url) + url = re.sub(r"([&?])preview=.*", "", url) + url = url.removesuffix("/") matches = self.api_guides_df[ - (self.api_guides_df.url == url) | (self.api_guides_df.friendly_url == url) + (self.api_guides_df.url.str.lower() == url.lower()) + | (self.api_guides_df.friendly_url.str.lower() == url.lower()) ] if len(matches) == 1: return matches.iloc[0] @@ -194,6 +197,7 @@ def record_is_excluded(self, source_record: dict) -> bool: self._excluded_per_non_libguides_domain(source_record) or self._excluded_per_allowed_rules(source_record) or self._excluded_per_missing_html(source_record) + or self._exclude_sub_page_that_is_root_page(source_record) ) @staticmethod @@ -214,6 +218,31 @@ def _excluded_per_missing_html(self, source_record: dict) -> bool: """Exclude a record if the crawled HTML is empty (e.g. a redirect).""" return source_record["html_base64"].strip() == "" + def _exclude_sub_page_that_is_root_page(self, source_record: dict) -> bool: + """Exclude sub-pages that are effectively the guide root page. + + For guides that have sub-pages, e.g. `/foo`, there will be a sub-page that is + effectively the root page itself. These sub-pages will have a "position = 1" + and "parent_id = 0" properties (i.e., a top-level page in the first position). + These can be skipped. Note that sub-pages can have sub-sub-pages as well. These + will have "position = 1" but a parent_id of the sub-page. These should NOT be + skipped as they are standalone, unique pages. + """ + url = source_record["url"] + + try: + guide = self.api_client.get_guide_by_url(url) + except ValueError as exc: + logger.warning(exc) + return True # if we cannot find URL in API data, skip (likely crawl noise) + + # if no position, assume root page and do NOT skip + if pd.isna(guide.position): + return False + + # if position = 1 and top-level page (parent_id = 0), skip + return guide.position == "1" and guide.parent_id == "0" + @classmethod @lru_cache(maxsize=8) def parse_html(cls, html_base64: str) -> Tag: @@ -260,14 +289,26 @@ def extract_dublin_core_metadata(cls, html_base64: str) -> dict: def get_source_link(cls, source_record: dict) -> str: """Use the 'friendly' URL from LibGuides API data.""" url = source_record["url"] - guide = cls.api_client.get_guide_by_url(url) + try: + guide = cls.api_client.get_guide_by_url(url) + except ValueError: + logger.warning("Could not find guide in API data for URL: %s", url) + return url friendly_url = guide.get("friendly_url") or "" return friendly_url.strip() or url @classmethod def get_source_record_id(cls, source_record: dict) -> str: """Use numeric 'id' field from Libguides metadata with 'guides-' prefix.""" - guide = cls.api_client.get_guide_by_url(cls.get_source_link(source_record)) + try: + guide = cls.api_client.get_guide_by_url(cls.get_source_link(source_record)) + except ValueError as exc: + message = ( + "Could not determine source record ID, skipping record with URL: " + f"{source_record['url']}" + ) + logger.warning(message) + raise SkippedRecordEvent(message) from exc return f"guides-{guide['id']}" def get_timdex_record_id(self, source_record: dict) -> str: diff --git a/transmogrifier/sources/transformer.py b/transmogrifier/sources/transformer.py index 4860cf3..05a6853 100644 --- a/transmogrifier/sources/transformer.py +++ b/transmogrifier/sources/transformer.py @@ -355,7 +355,7 @@ def transform(self, source_record: dict[str, JSON] | Tag) -> timdex.TimdexRecord raise DeletedRecordEvent(timdex_record_id) if self.record_is_excluded(source_record): source_record_id = self.get_source_record_id(source_record) - logger.info(f"Record ID {source_record_id} is excluded, skipping.") + logger.debug(f"Record ID {source_record_id} is excluded, skipping.") raise SkippedRecordEvent(source_record_id) timdex_record = timdex.TimdexRecord(