From bd3dfd9c66454073bc1bd25bd7eee6f797d5589c Mon Sep 17 00:00:00 2001 From: Hasan Haq Date: Sun, 8 Jun 2025 10:04:07 -0500 Subject: [PATCH] Improve wiki markdown formatting --- bot/scripts/wiki_to_markdown.py | 28 +++++++++++++++++++++-- tests/unit_tests/test_wiki_to_markdown.py | 10 ++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/bot/scripts/wiki_to_markdown.py b/bot/scripts/wiki_to_markdown.py index aaffa2b..b163809 100644 --- a/bot/scripts/wiki_to_markdown.py +++ b/bot/scripts/wiki_to_markdown.py @@ -2,6 +2,27 @@ import urllib.parse +def _remove_templates(text: str) -> str: + """Remove wiki templates enclosed in double curly braces, handling nested + templates gracefully.""" + result = [] + depth = 0 + i = 0 + while i < len(text): + if text.startswith("{{", i): + depth += 1 + i += 2 + continue + if text.startswith("}}", i) and depth: + depth -= 1 + i += 2 + continue + if depth == 0: + result.append(text[i]) + i += 1 + return "".join(result) + + def wiki_to_markdown(text, remove_urls, wiki_url_prefix='https://www.weezerpedia.com/wiki/'): # Remove everything from "See also" or "References" downward @@ -9,8 +30,9 @@ def wiki_to_markdown(text, remove_urls, wiki_url_prefix='https://www.weezerpedia # Remove Wikipedia-style image placeholders entirely text = re.sub(r"\[\[Image:.+?\]\]", "", text) - # Remove wiki formatting for templates (e.g., {{Infobox}}) - text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL) + # Remove wiki formatting for templates (e.g., {{Infobox}}), handling nested + # templates so citation and footnote markers don't leak into the text. 
def test_internal_wiki_link_with_label_and_path():
    """A piped internal link renders as a markdown link with its label."""
    rendered = wiki_to_markdown('[[Blue Album|the Blue Album]]', False)
    expected = '[the Blue Album](https://www.weezerpedia.com/wiki/Blue%20Album)'
    assert rendered == expected


def test_remove_numeric_footnote_markers():
    """A bracketed numeric footnote marker is stripped from the output."""
    source = "Weezer formed in 1992.[1] They released the Blue Album."
    expected = "Weezer formed in 1992. They released the Blue Album."
    assert wiki_to_markdown(source, False) == expected


def test_remove_nested_template():
    """A template containing another template is removed as a whole."""
    source = 'Info {{cite web|url={{URL|https://example.com}}|title=Example}} end'
    expected = 'Info end'
    assert wiki_to_markdown(source, False) == expected