From 3a3e8bcdef0887c21a4215a64cf4871d6bfee30d Mon Sep 17 00:00:00 2001
From: voidborne-d <voidborne-d@users.noreply.github.com>
Date: Sat, 4 Apr 2026 12:56:34 +0000
Subject: [PATCH] fix: align image filenames between parse_html and
 extract_images when Blank-Page divs are present

parse_html counts all divs (including Blank-Page) in its div_idx counter,
but extract_images iterates over chunks from parse_layout which filters
out Blank-Page divs. When a document has Blank-Page divs before Image/Figure
divs, the two functions generate different filenames for the same image:
- HTML/Markdown references e.g. 'hash_3_img.webp'
- extract_images saves as e.g. 'hash_1_img.webp'
This causes broken image references in the output.

Fix: store the original div position (counting all divs) in LayoutBlock.div_idx
during parse_layout, carry it through to chunks, and use it in extract_images
to generate filenames consistent with parse_html.

Includes 17 tests covering:
- 5 regression tests for the Blank-Page mismatch bug (all fail on old code)
- 3 baseline tests (no Blank-Page, normal operation)
- 4 parse_layout div_idx tracking tests
- 1 markdown output consistency test
- 4 edge cases (all blanks, missing img tag, empty HTML, no images)
---
 chandra/output.py                    |  11 +-
 tests/test_image_name_consistency.py | 316 +++++++++++++++++++++++++++
 2 files changed, 323 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_image_name_consistency.py
diff --git a/chandra/output.py b/chandra/output.py
index 030a4fd..d2469ac 100644
--- a/chandra/output.py
+++ b/chandra/output.py
@@ -23,9 +23,7 @@ def get_image_name(html: str, div_idx: int):
 
 def extract_images(html: str, chunks: dict, image: Image.Image):
     images = {}
-    div_idx = 0
     for idx, chunk in enumerate(chunks):
-        div_idx += 1
         if chunk["label"] in ["Image", "Figure"]:
             img = BeautifulSoup(chunk["content"], "html.parser").find("img")
             if not img:
@@ -36,6 +34,10 @@ def extract_images(html: str, chunks: dict, image: Image.Image):
             except ValueError:
                 # Happens when bbox coordinates are invalid
                 continue
+            # Use the original div index stored by parse_layout so the
+            # filename matches the one written by parse_html (which counts
+            # all divs, including Blank-Page).
+            div_idx = chunk.get("div_idx", idx + 1)
             img_name = get_image_name(html, div_idx)
             images[img_name] = block_image
     return images
@@ -194,6 +196,7 @@ class LayoutBlock:
     bbox: list[int]
     label: str
     content: str
+    div_idx: int = 0
 
 
 def parse_layout(html: str, image: Image.Image, bbox_scale=settings.BBOX_SCALE):
@@ -203,7 +206,7 @@ def parse_layout(html: str, image: Image.Image, bbox_scale=settings.BBOX_SCALE):
     width_scaler = width / bbox_scale
     height_scaler = height / bbox_scale
     layout_blocks = []
-    for div in top_level_divs:
+    for div_idx, div in enumerate(top_level_divs, start=1):
         label = div.get("data-label")
         if label == "Blank-Page":
             continue
@@ -235,7 +238,7 @@ def parse_layout(html: str, image: Image.Image, bbox_scale=settings.BBOX_SCALE):
             del tag["data-bbox"]
         content = str(content_soup)
 
-        layout_blocks.append(LayoutBlock(bbox=bbox, label=label, content=content))
+        layout_blocks.append(LayoutBlock(bbox=bbox, label=label, content=content, div_idx=div_idx))
     return layout_blocks
 
 
diff --git a/tests/test_image_name_consistency.py b/tests/test_image_name_consistency.py
new file mode 100644
index 0000000..e0b14a9
--- /dev/null
+++ b/tests/test_image_name_consistency.py
@@ -0,0 +1,316 @@
+"""Tests for image filename consistency between parse_html and extract_images.
+
+When a document contains Blank-Page divs before Image/Figure divs, the div
+index used to generate image filenames must be identical in both parse_html
+(which creates the <img src="..."> references in the output HTML) and
+extract_images (which saves the cropped image files).
+
+Before the fix, extract_images iterated over `chunks` (which has Blank-Page
+divs already filtered out by parse_layout), maintaining its own div_idx
+counter that started from 1 for the first *non-blank* div.  parse_html, on
+the other hand, counted *all* divs including Blank-Page, so its div_idx for
+the same Image div was higher.  This caused the HTML to reference an image
+filename that didn't match the filename under which extract_images saved it.
+"""
+
+import hashlib
+import re
+
+import pytest
+from PIL import Image
+
+from chandra.output import (
+    extract_images,
+    get_image_name,
+    parse_chunks,
+    parse_html,
+    parse_layout,
+    parse_markdown,
+    LayoutBlock,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_image(width=200, height=200):
+    """Create a simple test image."""
+    return Image.new("RGB", (width, height), "white")
+
+
+def _build_html(*divs):
+    """Build an HTML string from a list of div strings."""
+    return "".join(divs)
+
+
+def _div(label, content, bbox="0 0 500 500"):
+    """Convenience to create a single div block."""
+    return f'<div data-label="{label}" data-bbox="{bbox}">{content}</div>'
+
+
+def _image_div(bbox="0 0 500 500"):
+    return _div("Image", '<img alt="photo"/>', bbox)
+
+
+def _figure_div(bbox="0 0 500 500"):
+    return _div("Figure", '<img alt="diagram"/>', bbox)
+
+
+def _text_div(text="Hello", bbox="0 0 500 100"):
+    return _div("Text", text, bbox)
+
+
+def _blank_div():
+    return '<div data-label="Blank-Page" data-bbox="0 0 1000 1000"></div>'
+
+
+def _extract_img_srcs(html_out):
+    """Extract all img src values from rendered HTML."""
+    return re.findall(r'src=["\']([^"\']+)["\']', html_out)
+
+
+# ---------------------------------------------------------------------------
+# Core regression test: Blank-Page before Image
+# ---------------------------------------------------------------------------
+
+class TestBlankPageImageMismatch:
+    """Regression tests for the div_idx mismatch when Blank-Page precedes Image."""
+
+    def test_single_blank_before_image(self):
+        """Blank-Page (idx 1) + Image (idx 2): filenames must match."""
+        html = _build_html(_blank_div(), _image_div())
+        image = _make_image()
+
+        html_out = parse_html(html)
+        chunks = parse_chunks(html, image)
+        images = extract_images(html, chunks, image)
+
+        srcs = _extract_img_srcs(html_out)
+        assert len(srcs) == 1, f"Expected 1 img src, got {srcs}"
+        assert len(images) == 1, f"Expected 1 extracted image, got {len(images)}"
+        assert srcs[0] in images, (
+            f"HTML references '{srcs[0]}' but extract_images saved {list(images.keys())}"
+        )
+
+    def test_multiple_blanks_before_image(self):
+        """Three Blank-Page divs then an Image: idx must be 4 in both."""
+        html = _build_html(_blank_div(), _blank_div(), _blank_div(), _image_div())
+        image = _make_image()
+
+        html_out = parse_html(html)
+        chunks = parse_chunks(html, image)
+        images = extract_images(html, chunks, image)
+
+        srcs = _extract_img_srcs(html_out)
+        assert len(srcs) == 1
+        assert len(images) == 1
+        assert srcs[0] in images
+
+    def test_blank_between_text_and_image(self):
+        """Text + Blank-Page + Image: Image div_idx should be 3."""
+        html = _build_html(_text_div(), _blank_div(), _image_div())
+        image = _make_image()
+
+        html_out = parse_html(html)
+        chunks = parse_chunks(html, image)
+        images = extract_images(html, chunks, image)
+
+        srcs = _extract_img_srcs(html_out)
+        assert len(srcs) == 1
+        assert len(images) == 1
+        assert srcs[0] in images
+
+    def test_multiple_images_with_blanks(self):
+        """Two Image divs separated by Blank-Pages: all filenames must match."""
+        html = _build_html(
+            _image_div("0 0 500 500"),
+            _blank_div(),
+            _blank_div(),
+            _image_div("500 0 1000 500"),
+        )
+        image = _make_image(1000, 500)
+
+        html_out = parse_html(html)
+        chunks = parse_chunks(html, image)
+        images = extract_images(html, chunks, image)
+
+        srcs = _extract_img_srcs(html_out)
+        assert len(srcs) == 2, f"Expected 2 img srcs, got {srcs}"
+        assert len(images) == 2, f"Expected 2 extracted images, got {len(images)}"
+        for src in srcs:
+            assert src in images, (
+                f"HTML references '{src}' but extract_images saved {list(images.keys())}"
+            )
+
+    def test_figure_with_blanks(self):
+        """Figure label should behave identically to Image."""
+        html = _build_html(_blank_div(), _figure_div())
+        image = _make_image()
+
+        html_out = parse_html(html)
+        chunks = parse_chunks(html, image)
+        images = extract_images(html, chunks, image)
+
+        srcs = _extract_img_srcs(html_out)
+        assert len(srcs) == 1
+        assert len(images) == 1
+        assert srcs[0] in images
+
+
+# ---------------------------------------------------------------------------
+# No-blank baseline: ensure we didn't break the normal case
+# ---------------------------------------------------------------------------
+
+class TestNoBlankPages:
+    """Verify the fix doesn't regress the happy path (no Blank-Page divs)."""
+
+    def test_single_image_no_blanks(self):
+        html = _build_html(_image_div())
+        image = _make_image()
+
+        html_out = parse_html(html)
+        chunks = parse_chunks(html, image)
+        images = extract_images(html, chunks, image)
+
+        srcs = _extract_img_srcs(html_out)
+        assert len(srcs) == 1
+        assert srcs[0] in images
+
+    def test_text_then_image_no_blanks(self):
+        html = _build_html(_text_div(), _image_div())
+        image = _make_image()
+
+        html_out = parse_html(html)
+        chunks = parse_chunks(html, image)
+        images = extract_images(html, chunks, image)
+
+        srcs = _extract_img_srcs(html_out)
+        assert len(srcs) == 1
+        assert srcs[0] in images
+
+    def test_multiple_images_no_blanks(self):
+        html = _build_html(
+            _image_div("0 0 500 500"),
+            _text_div("middle"),
+            _figure_div("500 0 1000 500"),
+        )
+        image = _make_image(1000, 500)
+
+        html_out = parse_html(html)
+        chunks = parse_chunks(html, image)
+        images = extract_images(html, chunks, image)
+
+        srcs = _extract_img_srcs(html_out)
+        assert len(srcs) == 2
+        assert len(images) == 2
+        for src in srcs:
+            assert src in images
+
+
+# ---------------------------------------------------------------------------
+# parse_layout div_idx tracking
+# ---------------------------------------------------------------------------
+
+class TestLayoutDivIdx:
+    """Verify parse_layout stores the correct original div_idx."""
+
+    def test_div_idx_sequential_no_blanks(self):
+        html = _build_html(_text_div(), _image_div())
+        image = _make_image()
+        blocks = parse_layout(html, image)
+        assert len(blocks) == 2
+        assert blocks[0].div_idx == 1
+        assert blocks[1].div_idx == 2
+
+    def test_div_idx_skips_blank_page(self):
+        html = _build_html(_text_div(), _blank_div(), _image_div())
+        image = _make_image()
+        blocks = parse_layout(html, image)
+        # Blank-Page is filtered, but Image gets div_idx=3 (its real position)
+        assert len(blocks) == 2
+        assert blocks[0].div_idx == 1  # Text
+        assert blocks[1].div_idx == 3  # Image (Blank-Page was at 2)
+
+    def test_div_idx_multiple_blanks(self):
+        html = _build_html(_blank_div(), _blank_div(), _text_div(), _blank_div(), _image_div())
+        image = _make_image()
+        blocks = parse_layout(html, image)
+        assert len(blocks) == 2
+        assert blocks[0].div_idx == 3  # Text (after 2 blanks)
+        assert blocks[1].div_idx == 5  # Image (after 3 blanks total)
+
+    def test_div_idx_preserved_in_chunks(self):
+        """parse_chunks (which calls parse_layout → asdict) should carry div_idx."""
+        html = _build_html(_blank_div(), _image_div())
+        image = _make_image()
+        chunks = parse_chunks(html, image)
+        assert len(chunks) == 1
+        assert chunks[0]["div_idx"] == 2
+
+
+# ---------------------------------------------------------------------------
+# Markdown output also references correct image names
+# ---------------------------------------------------------------------------
+
+class TestMarkdownImageNames:
+    """parse_markdown uses parse_html internally, so image names should match too."""
+
+    def test_markdown_image_name_with_blanks(self):
+        html = _build_html(_blank_div(), _image_div())
+        image = _make_image()
+
+        md = parse_markdown(html)
+        chunks = parse_chunks(html, image)
+        images = extract_images(html, chunks, image)
+
+        # The markdown should contain the image filename
+        img_names = list(images.keys())
+        assert len(img_names) == 1
+        assert img_names[0] in md, (
+            f"Markdown does not reference '{img_names[0]}'. Markdown:\n{md}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Edge cases
+# ---------------------------------------------------------------------------
+
+class TestEdgeCases:
+    """Edge cases for robustness."""
+
+    def test_all_blank_pages(self):
+        """All divs are Blank-Page: no images, no chunks."""
+        html = _build_html(_blank_div(), _blank_div())
+        image = _make_image()
+
+        chunks = parse_chunks(html, image)
+        images = extract_images(html, chunks, image)
+        assert len(chunks) == 0
+        assert len(images) == 0
+
+    def test_image_without_img_tag(self):
+        """Image div without <img> tag: no image extracted, but no crash."""
+        html = _build_html(_blank_div(), _div("Image", "just text"))
+        image = _make_image()
+
+        chunks = parse_chunks(html, image)
+        images = extract_images(html, chunks, image)
+        # parse_html adds an img tag, but extract_images checks chunks
+        # which don't have one
+        assert len(images) == 0
+
+    def test_empty_html(self):
+        html = ""
+        image = _make_image()
+        chunks = parse_chunks(html, image)
+        images = extract_images(html, chunks, image)
+        assert len(chunks) == 0
+        assert len(images) == 0
+
+    def test_no_image_blocks(self):
+        html = _build_html(_text_div(), _text_div())
+        image = _make_image()
+        chunks = parse_chunks(html, image)
+        images = extract_images(html, chunks, image)
+        assert len(images) == 0