Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions bot/scripts/wiki_to_markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,37 @@
import urllib.parse


def _remove_templates(text: str) -> str:
"""Remove wiki templates enclosed in double curly braces, handling nested
templates gracefully."""
result = []
depth = 0
i = 0
while i < len(text):
if text.startswith("{{", i):
depth += 1
i += 2
continue
if text.startswith("}}", i) and depth:
depth -= 1
i += 2
continue
if depth == 0:
result.append(text[i])
i += 1
return "".join(result)


def wiki_to_markdown(text, remove_urls, wiki_url_prefix='https://www.weezerpedia.com/wiki/'):

# Remove everything from "See also" or "References" downward
text = re.split(r"==See also==", text)[0]
# Remove Wikipedia-style image placeholders entirely
text = re.sub(r"\[\[Image:.+?\]\]", "", text)

# Remove wiki formatting for templates (e.g., {{Infobox}})
text = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL)
# Remove wiki formatting for templates (e.g., {{Infobox}}), handling nested
# templates so citation and footnote markers don't leak into the text.
text = _remove_templates(text)

# Convert '''bold''' to **bold**, ensuring no extra spaces between words
text = re.sub(r"'''(\S.*?\S)'''", r"**\1**", text)
Expand All @@ -36,6 +58,8 @@ def wiki_to_markdown(text, remove_urls, wiki_url_prefix='https://www.weezerpedia

# Remove <ref> tags, <references />, and any other unsupported HTML-like tags
text = re.sub(r"<.*?>", "", text)
# Strip out footnote markers like [1], [a], etc.
text = re.sub(r"\[\s*[0-9a-zA-Z]+\s*\]", "", text)

# Remove Category tags [[Category:Something]] entirely
text = re.sub(r"\[\[Category:.+?\]\]", "", text)
Expand Down
10 changes: 10 additions & 0 deletions tests/unit_tests/test_wiki_to_markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,13 @@ def test_internal_wiki_link_only_label():

def test_internal_wiki_link_with_label_and_path():
    """A piped internal link renders its label and points at the encoded page URL."""
    rendered = wiki_to_markdown('[[Blue Album|the Blue Album]]', False)
    expected = '[the Blue Album](https://www.weezerpedia.com/wiki/Blue%20Album)'
    assert rendered == expected


def test_remove_numeric_footnote_markers():
    """Numeric footnote markers such as ``[1]`` are dropped from the output."""
    source = "Weezer formed in 1992.[1] They released the Blue Album."
    expected = "Weezer formed in 1992. They released the Blue Album."
    rendered = wiki_to_markdown(source, False)
    assert rendered == expected


def test_remove_nested_template():
    """A template that contains another template is removed as a whole."""
    source = 'Info {{cite web|url={{URL|https://example.com}}|title=Example}} end'
    rendered = wiki_to_markdown(source, False)
    assert rendered == 'Info end'