6 changes: 6 additions & 0 deletions .beads/interactions.jsonl
Original file line number Diff line number Diff line change
@@ -8,3 +8,9 @@
{"id":"int-e2c83df6","kind":"field_change","created_at":"2026-04-03T04:07:29.73357Z","actor":"beardedeagle","issue_id":"ollm-cly","extra":{"field":"status","new_value":"closed","old_value":"in_progress","reason":"Completed"}}
{"id":"int-7c080521","kind":"field_change","created_at":"2026-04-03T04:58:45.426115Z","actor":"beardedeagle","issue_id":"ollm-7zk","extra":{"field":"status","new_value":"closed","old_value":"open","reason":"Completed"}}
{"id":"int-bf6e89f2","kind":"field_change","created_at":"2026-04-03T06:33:55.374489Z","actor":"beardedeagle","issue_id":"ollm-nnt","extra":{"field":"status","new_value":"closed","old_value":"open","reason":"Completed"}}
{"id":"int-a6481fd7","kind":"field_change","created_at":"2026-04-03T09:48:50.10993Z","actor":"beardedeagle","issue_id":"ollm-6b7","extra":{"field":"status","new_value":"closed","old_value":"in_progress","reason":"Completed"}}
{"id":"int-32d6d982","kind":"field_change","created_at":"2026-04-03T11:31:04.109777Z","actor":"beardedeagle","issue_id":"ollm-6b7","extra":{"field":"status","new_value":"open","old_value":"closed"}}
{"id":"int-9e450675","kind":"field_change","created_at":"2026-04-03T11:33:42.488488Z","actor":"beardedeagle","issue_id":"ollm-6b7","extra":{"field":"status","new_value":"in_progress","old_value":"open"}}
{"id":"int-5cd99253","kind":"field_change","created_at":"2026-04-03T11:52:07.85795Z","actor":"beardedeagle","issue_id":"ollm-6b7","extra":{"field":"status","new_value":"closed","old_value":"in_progress","reason":"Completed"}}
{"id":"int-25387d16","kind":"field_change","created_at":"2026-04-03T13:58:30.61251Z","actor":"beardedeagle","issue_id":"ollm-qm9","extra":{"field":"status","new_value":"closed","old_value":"open","reason":"Completed"}}
{"id":"int-f18a669e","kind":"field_change","created_at":"2026-04-03T13:58:30.652694Z","actor":"beardedeagle","issue_id":"ollm-dnl","extra":{"field":"status","new_value":"closed","old_value":"open","reason":"Completed"}}
24 changes: 18 additions & 6 deletions README.md
@@ -160,12 +160,24 @@ full-history KV in memory. When the bounded
the recent-context token budget and oldest tokens are evicted once the window
is exceeded.

On optimized-native decoder-only text runtimes, long prompts are ingested
through bounded prefill chunks before the final decode step. That keeps prompt
execution from growing one full prompt-wide activation step at a time on very
long inputs while preserving the external prompt/chat contract. Prompt-scaling
benchmarks remain the right place to evaluate the TTFT and memory tradeoff on
target hardware.
On the causal runtime lanes that support chunked prefill, long prompts are
ingested through bounded prefill chunks before the final decode step. The
current strategy lanes are:

- `optimized-native-text`
- `optimized-native-multimodal`
- `transformers-generic-text`
- `transformers-generic-multimodal`
- `transformers-generic-seq2seq-source`

That keeps prompt execution from growing one full prompt-wide activation step
at a time on very long inputs while preserving the external prompt/chat
contract. Prompt-scaling benchmarks remain the right place to evaluate the
TTFT and memory tradeoff on target hardware. Prompt tokenization now streams
from rendered prompt pieces inside the strategy path, prefix attention masks
are synthesized lazily per chunk, and seq2seq source prompts use the dedicated
`transformers-generic-seq2seq-source` lane instead of pretending they share the
causal-cache contract.
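As a rough illustration of the mechanism (not the project's actual API), bounded chunked prefill amounts to feeding the prompt through the model in fixed-size slices that all extend one growing KV cache, then decoding from the final chunk:

```python
def chunked_prefill(forward, prompt_tokens, chunk_size=512):
    """Ingest a long prompt in bounded chunks, reusing a growing KV cache.

    `forward` is a stand-in for one causal model step: it takes a token slice
    plus the cache built so far and returns (logits, updated_cache). Peak
    activation memory stays proportional to `chunk_size`, not prompt length.
    """
    cache = None
    logits = None
    for start in range(0, len(prompt_tokens), chunk_size):
        chunk = prompt_tokens[start:start + chunk_size]
        logits, cache = forward(chunk, cache)
    # Decode continues from the final chunk's logits and the full cache.
    return logits, cache
```

The tradeoff this sketch makes visible is exactly the one the benchmarks measure: more forward calls (worse TTFT) in exchange for chunk-bounded activations (better peak memory).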

Configuration layering uses an explicit precedence contract:

3 changes: 2 additions & 1 deletion docs/benchmarking.md
@@ -179,7 +179,8 @@ Interpretation notes:
- output throughput is generated output tokens divided by total generation latency
- peak RSS includes a source label; long-lived warm/scaling/session probes use stage-local sampled peaks instead of process-lifetime peaks
- allocator-gap metrics are reported as reserved-minus-allocated style slack when the backend exposes the required counters; unsupported backends serialize them as `null`
- optimized-native decoder-only prompt-scaling runs exercise bounded chunked prefill on long text prompts, so the prompt-length sweep is the intended place to inspect the memory versus TTFT tradeoff for this feature
- text prompt-scaling runs exercise bounded chunked prefill on the supported text strategy lanes, so the prompt-length sweep is the intended place to inspect the memory versus TTFT tradeoff for this feature
- request metrics also include a `chunked_prefill` section that states the selected strategy ID, whether the active runtime was eligible, whether the strategy actually ran, and the implemented execution boundary for streamed prompt tokenization plus lazy prefix-mask synthesis
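An illustrative shape for that `chunked_prefill` section, with made-up values (the field names follow the parser in this change, but every enum string below is a placeholder, and the real serializer may emit more):

```python
# Hypothetical example payload only; enum values are placeholders.
example_chunked_prefill = {
    "strategy_id": "optimized-native-text",  # or null when no lane matched
    "runtime_eligible": True,
    "applied": True,
    "activation_reason": "prompt exceeded chunk threshold",  # free-form string
    "execution_boundary": "strategy-handler",  # placeholder enum value
    "attention_mask_mode": "lazy-prefix",      # placeholder enum value
    "gap_inventory": [
        {
            "gap_id": "example-gap",
            "current_behavior": "full mask materialized",
            "recommendation": "keep",
            "rationale": "placeholder rationale text",
        }
    ],
}
```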

On loader-streamed families such as optimized Gemma3 on CPU, a long per-turn
session-growth response can become dominated by repeated safetensor layer reads
17 changes: 13 additions & 4 deletions docs/guides/optimization.md
@@ -15,10 +15,19 @@ Native families:
- `gpt-oss`
- `voxtral`

Optimized-native decoder-only text prompts use bounded chunked prefill for
long prompt ingestion before the final decode step. This is a memory-control
path, not a blanket latency optimization, so prompt-scaling benchmarks are the
truthful way to evaluate whether the chunking tradeoff helps on a given host.
Supported causal runtime lanes use bounded chunked prefill for long prompt
ingestion before the final decode step. The current lanes are
`optimized-native-text`, `optimized-native-multimodal`,
`transformers-generic-text`, and `transformers-generic-multimodal`.
Encoder-decoder source prompts use the dedicated
`transformers-generic-seq2seq-source` lane.
This is a memory-control path, not a blanket latency optimization, so
prompt-scaling benchmarks are the truthful way to evaluate whether the chunking
tradeoff helps on a given host.

Prompt tokenization now streams from rendered prompt pieces inside these
strategy handlers, and causal lanes synthesize prefix attention masks lazily
per chunk instead of materializing the full mask before ingestion starts.
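A minimal sketch of the lazy per-chunk mask idea (assumed shapes, not the project's code): each prefill chunk only needs a mask over its own `chunk_len` query rows against `past_len + chunk_len` key columns, so the full prompt-wide mask never has to exist at once:

```python
def prefix_chunk_mask(past_len, chunk_len):
    """Causal attention mask for one prefill chunk.

    Row i (a query token in the current chunk) may attend to every
    already-cached key plus keys up to its own position, i.e. the first
    past_len + i + 1 columns are True.
    """
    return [
        [col <= past_len + row for col in range(past_len + chunk_len)]
        for row in range(chunk_len)
    ]
```

Synthesizing this per chunk keeps mask memory at O(chunk_len × prompt_len) worst case instead of O(prompt_len²) up front.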

### Transformers-generic
Used for compatible local or materialized models that can run through the generic Transformers-backed path.
73 changes: 73 additions & 0 deletions src/ollm/runtime/benchmark/chunked_prefill_serialization.py
@@ -0,0 +1,73 @@
"""Chunked-prefill benchmark JSON parsing helpers."""

from collections.abc import Mapping
from typing import cast

from ollm.runtime.chunked_prefill import (
ChunkedPrefillAttentionMaskMode,
ChunkedPrefillExecutionBoundary,
ChunkedPrefillGapDecision,
ChunkedPrefillGapId,
ChunkedPrefillRecommendation,
ChunkedPrefillScopeSurface,
ChunkedPrefillStrategyId,
)


def parse_chunked_prefill(
value: object,
*,
require_bool,
require_object_mapping,
require_sequence,
require_string,
) -> ChunkedPrefillScopeSurface:
if not isinstance(value, Mapping):
raise ValueError("chunked_prefill must be an object")
payload = cast(Mapping[str, object], value)
gap_items = require_sequence(payload, "gap_inventory")
return ChunkedPrefillScopeSurface(
strategy_id=_optional_strategy_id(payload, require_string=require_string),
runtime_eligible=require_bool(payload, "runtime_eligible"),
applied=require_bool(payload, "applied"),
activation_reason=require_string(payload, "activation_reason"),
execution_boundary=ChunkedPrefillExecutionBoundary(
require_string(payload, "execution_boundary")
),
attention_mask_mode=ChunkedPrefillAttentionMaskMode(
require_string(payload, "attention_mask_mode")
),
gap_inventory=tuple(
parse_chunked_prefill_gap(
require_object_mapping(item, f"gap_inventory[{index}]"),
require_string=require_string,
)
for index, item in enumerate(gap_items)
),
)


def parse_chunked_prefill_gap(
payload: Mapping[str, object],
*,
require_string,
) -> ChunkedPrefillGapDecision:
return ChunkedPrefillGapDecision(
gap_id=ChunkedPrefillGapId(require_string(payload, "gap_id")),
current_behavior=require_string(payload, "current_behavior"),
recommendation=ChunkedPrefillRecommendation(
require_string(payload, "recommendation")
),
rationale=require_string(payload, "rationale"),
)


def _optional_strategy_id(
payload: Mapping[str, object],
*,
require_string,
) -> ChunkedPrefillStrategyId | None:
value = payload.get("strategy_id")
if value is None:
return None
return ChunkedPrefillStrategyId(require_string(payload, "strategy_id"))
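The parser above takes its `require_*` validators as parameters rather than importing them, which keeps this module decoupled from any one error-reporting convention. A self-contained sketch of that dependency-injection pattern, with stand-in names rather than the module's real helpers:

```python
from collections.abc import Mapping


def demo_require_string(payload: Mapping[str, object], key: str) -> str:
    """Stand-in validator: return payload[key] if it is a string, else raise."""
    value = payload.get(key)
    if isinstance(value, str):
        return value
    raise ValueError(f"field '{key}' must be a string")


def parse_record(value: object, *, require_string) -> dict[str, str]:
    """Parse a tiny record, delegating field validation to the injected helper."""
    if not isinstance(value, Mapping):
        raise ValueError("record must be an object")
    return {"name": require_string(value, "name")}
```

Callers pick the validator at the call site, e.g. `parse_record({"name": "x"}, require_string=demo_require_string)`, so a benchmark parser and a config parser can share parsing logic while raising domain-specific errors.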
2 changes: 1 addition & 1 deletion src/ollm/runtime/benchmark/details.py
@@ -36,7 +36,6 @@ def build_cold_probe_details(

def summarize_request_metrics(samples: list[RequestProbeMetrics]) -> dict[str, object]:
    """Summarize request-level runtime probe metrics."""

    from ollm.runtime.benchmark.offload_summary import summarize_request_offload

    return {
@@ -82,6 +81,7 @@ def summarize_request_metrics(samples: list[RequestProbeMetrics]) -> dict[str, object]:
                ]
            ),
        },
        "chunked_prefill": samples[-1].chunked_prefill.to_dict(),
        "memory": summarize_stage_resources([sample.resources for sample in samples]),
        "cache": {
            "cache_mode": single_optional_string(
1 change: 1 addition & 0 deletions src/ollm/runtime/benchmark/probe_execution.py
@@ -217,6 +217,7 @@ def execute_request_probe(
        kv_cache_adaptation=kv_cache_adaptation,
        cache_dir_size_mb=cache_dir_size,
        cache_state=cache_state,
        chunked_prefill=trace.chunked_prefill,
        allocator_gap_mb=allocator_gap_mb,
        allocator_gap_ratio=allocator_gap_ratio,
        native_runtime_profile=native_runtime_profile,
15 changes: 15 additions & 0 deletions src/ollm/runtime/benchmark/probe_serialization.py
@@ -6,6 +6,7 @@

from ollm.kv_cache.matrix import KVCacheAdaptationSurface
from ollm.kv_cache.state import KVCacheStateSnapshot
from ollm.runtime.benchmark.chunked_prefill_serialization import parse_chunked_prefill
from ollm.runtime.benchmark.probe_types import (
    EventTimingSummary,
    NativeRuntimeProfile,
@@ -244,6 +245,13 @@ def _parse_request_probe_metrics(payload: Mapping[str, object]) -> RequestProbeMetrics:
        ),
        cache_dir_size_mb=_optional_float(payload, "cache_dir_size_mb"),
        cache_state=_parse_cache_state(payload.get("cache_state")),
        chunked_prefill=parse_chunked_prefill(
            payload.get("chunked_prefill"),
            require_bool=_require_bool,
            require_object_mapping=_require_object_mapping,
            require_sequence=_require_sequence,
            require_string=_require_string,
        ),
        allocator_gap_mb=_optional_float(payload, "allocator_gap_mb"),
        allocator_gap_ratio=_optional_float(payload, "allocator_gap_ratio"),
        native_runtime_profile=_parse_native_runtime_profile(
@@ -447,6 +455,13 @@ def _optional_int(payload: Mapping[str, object], key: str) -> int | None:
    raise ValueError(f"probe field '{key}' must be an integer or null")


def _require_bool(payload: Mapping[str, object], key: str) -> bool:
    value = payload.get(key)
    if isinstance(value, bool):
        return value
    raise ValueError(f"probe field '{key}' must be a boolean")


def _require_string(payload: Mapping[str, object], key: str) -> str:
    value = payload.get(key)
    if isinstance(value, str):
3 changes: 3 additions & 0 deletions src/ollm/runtime/benchmark/probe_types.py
@@ -5,6 +5,7 @@
from ollm.kv_cache.matrix import KVCacheAdaptationSurface
from ollm.kv_cache.state import KVCacheStateSnapshot
from ollm.runtime.benchmark.resources import StageResourceSnapshot
from ollm.runtime.chunked_prefill import ChunkedPrefillScopeSurface


@dataclass(frozen=True, slots=True)
@@ -50,6 +51,7 @@ class RequestProbeMetrics:
    kv_cache_adaptation: KVCacheAdaptationSurface | None
    cache_dir_size_mb: float | None
    cache_state: KVCacheStateSnapshot | None
    chunked_prefill: ChunkedPrefillScopeSurface
    allocator_gap_mb: float | None
    allocator_gap_ratio: float | None
    native_runtime_profile: NativeRuntimeProfile | None
@@ -71,6 +73,7 @@ def to_dict(self) -> dict[str, object]:
        payload["cache_state"] = (
            None if self.cache_state is None else self.cache_state.to_dict()
        )
        payload["chunked_prefill"] = self.chunked_prefill.to_dict()
        payload["kv_cache_adaptation"] = (
            None
            if self.kv_cache_adaptation is None