Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,29 @@ Each processed file creates a subdirectory with:
- `<filename>_metadata.json` - Metadata (page info, token count, etc.)
- Extracted images are saved directly in the output directory

### Python API

Wrap each page image in a `BatchInputItem` and pass the resulting list to `InferenceManager.generate`. To process every page in a PDF, omit the `page_range` argument of `load_pdf_images`:

```python
from chandra.input import load_pdf_images
from chandra.model import BatchInputItem, InferenceManager

# Create the inference manager with the "hf" method.
manager = InferenceManager(method="hf")
# page_range is omitted, so every page of the PDF is loaded.
images = load_pdf_images("document.pdf")

# Build one BatchInputItem per page image, all using the "ocr_layout" prompt type.
batch = [
BatchInputItem(
image=image,
prompt_type="ocr_layout",
)
for image in images
]

# Run generation over the whole batch, then print the markdown of the first result.
results = manager.generate(batch)
print(results[0].markdown)
```

### Streamlit Web App

Launch the interactive demo for single-page processing:
Expand Down
2 changes: 1 addition & 1 deletion chandra/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def load_image(

def load_pdf_images(
filepath: str,
page_range: List[int],
page_range: List[int] | None = None,
image_dpi: int = settings.IMAGE_DPI,
min_pdf_image_dim: int = settings.MIN_PDF_IMAGE_DIM,
) -> List[Image.Image]:
Expand Down
67 changes: 67 additions & 0 deletions tests/unit/test_input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from PIL import Image

from chandra.input import load_pdf_images


class FakeRenderedPage:
    """Test double for the object returned by ``FakePage.render``.

    Stores the requested size and turns it into a blank PIL image on demand.
    """

    def __init__(self, size):
        # ``size`` is the (width, height) tuple handed over by the caller.
        self.size = size

    def to_pil(self):
        """Return a new white RGB image with the stored size."""
        return Image.new(mode="RGB", size=self.size, color="white")


class FakePage:
    """Minimal page test double exposing ``get_width``, ``get_height`` and ``render``."""

    def __init__(self, width=200, height=300):
        self.width, self.height = width, height

    def get_width(self):
        """Return the page width given at construction."""
        return self.width

    def get_height(self):
        """Return the page height given at construction."""
        return self.height

    def render(self, scale):
        """Return a ``FakeRenderedPage`` with this page's own dimensions.

        The scale is only checked for being positive; it does not affect the
        size of the rendered page.
        """
        assert scale > 0
        page_size = (self.width, self.height)
        return FakeRenderedPage(page_size)


class FakePdfDocument:
    """Two-page document test double patched in place of ``pdfium.PdfDocument``.

    Records whether ``init_forms`` and ``close`` were called so tests can
    inspect those flags.
    """

    def __init__(self, filepath):
        self.closed = False
        self.forms_initialized = False
        self.filepath = filepath
        # Fixed fixture: one 200x300 page followed by one 300x400 page.
        self.pages = [FakePage(*dims) for dims in ((200, 300), (300, 400))]

    def init_forms(self):
        """Record that form initialisation was requested."""
        self.forms_initialized = True

    def __len__(self):
        """Return the number of fake pages."""
        return len(self.pages)

    def __getitem__(self, index):
        """Return the fake page stored at ``index``."""
        return self.pages[index]

    def close(self):
        """Record that the document was closed."""
        self.closed = True


def test_load_pdf_images_processes_all_pages_when_page_range_omitted(monkeypatch):
    """Calling ``load_pdf_images`` without ``page_range`` yields every fake page."""

    def _skip_flatten(page):
        # Flattening is stubbed out; the fake pages have nothing to flatten.
        return None

    monkeypatch.setattr("chandra.input.pdfium.PdfDocument", FakePdfDocument)
    monkeypatch.setattr("chandra.input.flatten", _skip_flatten)

    page_images = load_pdf_images("dummy.pdf")

    assert len(page_images) == 2
    observed_sizes = [img.size for img in page_images]
    assert observed_sizes == [(200, 300), (300, 400)]


def test_load_pdf_images_respects_page_range(monkeypatch):
    """An explicit ``page_range=[1]`` yields only the second fake page."""

    def _skip_flatten(page):
        # Flattening is stubbed out; the fake pages have nothing to flatten.
        return None

    monkeypatch.setattr("chandra.input.pdfium.PdfDocument", FakePdfDocument)
    monkeypatch.setattr("chandra.input.flatten", _skip_flatten)

    selected = load_pdf_images("dummy.pdf", page_range=[1])

    assert len(selected) == 1
    # Index 1 in FakePdfDocument.pages is the 300x400 page.
    assert selected[0].size == (300, 400)