datalab-to · DeoJin · Mar 19, 2026
diff --git a/README.md b/README.md
@@ -151,6 +151,29 @@ Each processed file creates a subdirectory with:
 - `<filename>_metadata.json` - Metadata (page info, token count, etc.)
 - Extracted images are saved directly in the output directory
 
+### Python API
+
+Wrap each page image in a `BatchInputItem` and pass the list to `InferenceManager.generate`. Omit the `page_range` argument of `load_pdf_images` to process every page in a PDF:
+
+```python
+from chandra.input import load_pdf_images
+from chandra.model import BatchInputItem, InferenceManager
+
+manager = InferenceManager(method="hf")
+images = load_pdf_images("document.pdf")
+
+batch = [
+    BatchInputItem(
+        image=image,
+        prompt_type="ocr_layout",
+    )
+    for image in images
+]
+
+results = manager.generate(batch)
+print(results[0].markdown)
+```
+
 ### Streamlit Web App
 
 Launch the interactive demo for single-page processing:

diff --git a/chandra/input.py b/chandra/input.py
@@ -26,7 +26,7 @@ def load_image(
 
 def load_pdf_images(
     filepath: str,
-    page_range: List[int],
+    page_range: List[int] | None = None,
     image_dpi: int = settings.IMAGE_DPI,
     min_pdf_image_dim: int = settings.MIN_PDF_IMAGE_DIM,
 ) -> List[Image.Image]:

diff --git a/tests/unit/test_input.py b/tests/unit/test_input.py
@@ -0,0 +1,67 @@
+from PIL import Image
+
+from chandra.input import load_pdf_images
+
+
+class FakeRenderedPage:  # Test double for the object returned by FakePage.render().
+    def __init__(self, size):  # size is passed unchanged to Image.new() in to_pil()
+        self.size = size
+
+    def to_pil(self):  # Build a blank white RGB image of the stored size.
+        return Image.new("RGB", self.size, "white")
+
+
+class FakePage:  # Test double for a PDF page: fixed dimensions plus a render() hook.
+    def __init__(self, width=200, height=300):  # defaults give a 200x300 page
+        self.width = width
+        self.height = height
+
+    def get_width(self):  # Accessor returning the stored page width.
+        return self.width
+
+    def get_height(self):  # Accessor returning the stored page height.
+        return self.height
+
+    def render(self, scale):  # scale is only checked for positivity; it does not affect the result
+        assert scale > 0
+        return FakeRenderedPage((self.width, self.height))  # always the unscaled page size
+
+
+class FakePdfDocument:  # Test double that the tests in this file patch in for chandra.input.pdfium.PdfDocument.
+    def __init__(self, filepath):  # filepath is recorded but nothing is opened
+        self.filepath = filepath
+        self.pages = [FakePage(200, 300), FakePage(300, 400)]  # two pages with distinct sizes so tests can tell them apart
+        self.forms_initialized = False  # flipped to True by init_forms()
+        self.closed = False  # flipped to True by close()
+
+    def init_forms(self):  # Only records that the call happened.
+        self.forms_initialized = True
+
+    def __len__(self):  # Number of fake pages (2 for the fixture above).
+        return len(self.pages)
+
+    def __getitem__(self, index):  # Plain list indexing into the fake pages.
+        return self.pages[index]
+
+    def close(self):  # Only records that the call happened.
+        self.closed = True
+
+
+def test_load_pdf_images_processes_all_pages_when_page_range_omitted(monkeypatch):  # Omitting page_range must yield every page.
+    monkeypatch.setattr("chandra.input.pdfium.PdfDocument", FakePdfDocument)  # swap in the in-memory fake document
+    monkeypatch.setattr("chandra.input.flatten", lambda page: None)  # replace flatten with a no-op
+
+    images = load_pdf_images("dummy.pdf")  # called without a page_range argument
+
+    assert len(images) == 2  # FakePdfDocument holds two pages
+    assert [image.size for image in images] == [(200, 300), (300, 400)]  # sizes expected in page order
+
+
+def test_load_pdf_images_respects_page_range(monkeypatch):  # An explicit page_range must restrict the output to the listed pages.
+    monkeypatch.setattr("chandra.input.pdfium.PdfDocument", FakePdfDocument)  # swap in the in-memory fake document
+    monkeypatch.setattr("chandra.input.flatten", lambda page: None)  # replace flatten with a no-op
+
+    images = load_pdf_images("dummy.pdf", page_range=[1])  # request index 1 only
+
+    assert len(images) == 1  # exactly one page requested
+    assert images[0].size == (300, 400)  # size of the second FakePage, so the index is treated as zero-based