From f38031ad7a61e52373a8ce643176a1b702ee7e35 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:46:35 +0100 Subject: [PATCH 01/26] feat(sandbox): add sandbox_agent package init Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/__init__.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/__init__.py b/a2a/sandbox_agent/src/sandbox_agent/__init__.py new file mode 100644 index 00000000..e69de29b From b4c2d653272c982b4ea66983ec09a3d3dc936ffa Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:47:29 +0100 Subject: [PATCH 02/26] feat(sandbox): A2A server with event streaming, session management, and graph card endpoint Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/agent.py | 1046 ++++++++++++++++++ 1 file changed, 1046 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/agent.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/agent.py b/a2a/sandbox_agent/src/sandbox_agent/agent.py new file mode 100644 index 00000000..70e67ba7 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/agent.py @@ -0,0 +1,1046 @@ +"""A2A agent server for the Sandbox Legion. + +Wires together the workspace manager, permission checker, sources config, +and LangGraph graph to serve the A2A protocol over HTTP. 
+""" + +from __future__ import annotations + +import asyncio +import hashlib +import json +import logging +import os +from pathlib import Path +from textwrap import dedent +from typing import Any + +import uvicorn +from a2a.server.agent_execution import AgentExecutor, RequestContext +from a2a.server.apps import A2AStarletteApplication +from a2a.server.events.event_queue import EventQueue +from a2a.server.request_handlers import DefaultRequestHandler +from a2a.server.tasks import InMemoryTaskStore, TaskUpdater + +try: + from a2a.server.tasks import DatabaseTaskStore + + _HAS_SQL_STORE = True +except ImportError: + _HAS_SQL_STORE = False +from a2a.types import ( + AgentCapabilities, + AgentCard, + AgentExtension, + AgentSkill, + TaskState, + TextPart, +) +from a2a.utils import new_agent_text_message, new_task +from langchain_core.messages import HumanMessage +from starlette.routing import Route + +from langgraph.checkpoint.memory import MemorySaver + +from sandbox_agent.budget import AgentBudget +from sandbox_agent.configuration import Configuration +from sandbox_agent.event_serializer import LangGraphSerializer +from sandbox_agent.graph import _load_skill, build_graph +from sandbox_agent.graph_card import build_graph_card +from sandbox_agent.observability import setup_observability +from sandbox_agent.permissions import PermissionChecker +from sandbox_agent.sources import SourcesConfig +from sandbox_agent.workspace import WorkspaceManager + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +# Package root is two levels up from __file__ +# (__file__ = src/sandbox_agent/agent.py -> package root = .) +_PACKAGE_ROOT = Path(__file__).resolve().parent.parent.parent + + +def _load_json(filename: str) -> dict: + """Load a JSON file from the package root directory. 
+ + Parameters + ---------- + filename: + Name of the JSON file (e.g. ``settings.json`` or ``sources.json``). + + Returns + ------- + dict + Parsed JSON content. + """ + path = _PACKAGE_ROOT / filename + with open(path, encoding="utf-8") as fh: + return json.load(fh) + + +# --------------------------------------------------------------------------- +# TOFU (Trust-On-First-Use) verification +# --------------------------------------------------------------------------- + +_TOFU_HASH_FILE = ".tofu-hashes.json" + +# Files in the workspace root to track for TOFU verification. +_TOFU_TRACKED_FILES = ("CLAUDE.md", "sources.json", "settings.json") + + +def _hash_file(path: Path) -> str | None: + """Return the SHA-256 hex digest of a file, or None if it doesn't exist.""" + if not path.is_file(): + return None + h = hashlib.sha256() + h.update(path.read_bytes()) + return h.hexdigest() + + +def _compute_tofu_hashes(root: Path) -> dict[str, str]: + """Compute SHA-256 hashes for tracked files under *root*. + + Returns a dict mapping filename -> hex digest (only for files that exist). + """ + hashes: dict[str, str] = {} + for name in _TOFU_TRACKED_FILES: + digest = _hash_file(root / name) + if digest is not None: + hashes[name] = digest + return hashes + + +def _tofu_verify(root: Path) -> None: + """Run TOFU verification on startup. + + On first run, computes and stores hashes of tracked files. On subsequent + runs, compares current hashes against the stored ones and logs a WARNING + if any file has changed (possible tampering). Does NOT block startup. 
+ """ + # Write to /tmp to avoid PermissionError when OCP assigns arbitrary UID + # (the /app directory is owned by UID 1001 but OCP may run as a different UID) + hash_file = Path("/tmp") / _TOFU_HASH_FILE + current_hashes = _compute_tofu_hashes(root) + + if not current_hashes: + logger.info("TOFU: no tracked files found in %s; skipping.", root) + return + + if hash_file.is_file(): + try: + with open(hash_file, encoding="utf-8") as fh: + stored_hashes = json.load(fh) + except (json.JSONDecodeError, OSError) as exc: + logger.warning("TOFU: could not read %s: %s", hash_file, exc) + stored_hashes = {} + + # Compare each tracked file. + changed: list[str] = [] + added: list[str] = [] + removed: list[str] = [] + for name, digest in current_hashes.items(): + stored = stored_hashes.get(name) + if stored is None: + added.append(name) + elif stored != digest: + changed.append(name) + for name in stored_hashes: + if name not in current_hashes: + removed.append(name) + + if changed or added or removed: + logger.warning( + "TOFU: workspace file integrity mismatch! " + "changed=%s, added=%s, removed=%s. " + "This may indicate tampering. Updating stored hashes.", + changed, added, removed, + ) + # Update stored hashes (trust the new state). + with open(hash_file, "w", encoding="utf-8") as fh: + json.dump(current_hashes, fh, indent=2) + else: + logger.info("TOFU: all tracked files match stored hashes.") + else: + # First run: store hashes. + logger.info("TOFU: first run -- storing hashes for %s", list(current_hashes.keys())) + with open(hash_file, "w", encoding="utf-8") as fh: + json.dump(current_hashes, fh, indent=2) + + +# --------------------------------------------------------------------------- +# Agent Card +# --------------------------------------------------------------------------- + + +def get_agent_card(host: str, port: int) -> AgentCard: + """Return an A2A AgentCard for the Sandbox Legion. 
+ + Parameters + ---------- + host: + Hostname or IP address the agent is listening on. + port: + Port number the agent is listening on. + """ + capabilities = AgentCapabilities( + streaming=True, + extensions=[ + AgentExtension( + uri="urn:kagenti:agent-graph-card:v1", + description="Processing graph topology and event schemas", + required=False, + params={"endpoint": "/.well-known/agent-graph-card.json"}, + ), + ], + ) + # Scan workspace for loaded skill files (.claude/skills/**/*.md) + # Skills found on disk are advertised in the agent card so the UI + # can show them in the / autocomplete (SkillWhisperer). + skills: list[AgentSkill] = [] + workspace = os.environ.get("WORKSPACE_DIR", "/workspace") + skills_dir = Path(workspace) / ".claude" / "skills" + if skills_dir.is_dir(): + seen_ids: set[str] = set() + for md_file in sorted(skills_dir.rglob("SKILL.md")): + # Directory-based skills: auth:keycloak-confidential-client/SKILL.md + # Skill ID = directory name relative to skills_dir + rel_dir = md_file.parent.relative_to(skills_dir) + skill_id = str(rel_dir).replace("/", ":") + if skill_id in seen_ids or skill_id == ".": + continue + seen_ids.add(skill_id) + # Read description from the skill file (skip frontmatter) + try: + content = md_file.read_text(errors="replace") + desc = "" + for line in content.split("\n"): + line = line.strip() + if line.startswith("description:"): + desc = line.split(":", 1)[1].strip().strip("'\"") + break + if line.startswith("# ") and not desc: + desc = line.lstrip("# ").strip() + if not desc: + desc = skill_id + except Exception: + desc = skill_id + skills.append( + AgentSkill( + id=skill_id, + name=skill_id, + description=desc[:200], + tags=["skill"], + ) + ) + logger.info("Found %d skills in %s", len(skills), skills_dir) + + # Always include the base sandbox skill + skills.append( + AgentSkill( + id="sandbox_legion", + name="Sandbox Legion", + description=( + "Sandboxed coding assistant with shell execution, file read/write, " + "web 
fetch, explore, and delegate capabilities." + ), + tags=["shell", "file", "workspace", "sandbox"], + examples=[ + "Run 'ls -la' in my workspace", + "Create a Python script that prints hello world", + "Read the contents of output/results.txt", + ], + ) + ) + return AgentCard( + name="Sandbox Legion", + description=dedent( + """\ + A sandboxed coding assistant that can execute shell commands, \ + read files, and write files inside isolated per-context workspaces. + + ## Key Features + - **Shell execution** with three-tier permission checks (allow/deny/HITL) + - **File read/write** with path-traversal prevention + - **Per-context workspaces** for multi-turn isolation + """, + ), + url=f"http://{host}:{port}/", + version="1.0.0", + default_input_modes=["text"], + default_output_modes=["text"], + capabilities=capabilities, + skills=skills, + ) + + +# --------------------------------------------------------------------------- +# Agent Executor +# --------------------------------------------------------------------------- + + +class SandboxAgentExecutor(AgentExecutor): + """A2A executor that delegates to the LangGraph sandbox graph.""" + + # Per-context_id locks to serialize concurrent graph executions for the + # same conversation. A simple dict + mutex approach with periodic cleanup + # of unused entries. + _context_locks: dict[str, asyncio.Lock] = {} + _context_locks_mutex: asyncio.Lock = asyncio.Lock() + + async def _get_context_lock(self, context_id: str) -> asyncio.Lock: + """Return (and lazily create) the asyncio.Lock for *context_id*. + + A class-level mutex guards the dict so that two concurrent requests + for the same new context_id don't each create their own Lock. 
+ """ + async with self._context_locks_mutex: + lock = self._context_locks.get(context_id) + if lock is None: + lock = asyncio.Lock() + self._context_locks[context_id] = lock + return lock + + def __init__(self) -> None: + settings = _load_json("settings.json") + sources = _load_json("sources.json") + + self._permission_checker = PermissionChecker(settings) + self._sources_config = SourcesConfig.from_dict(sources) + + config = Configuration() # type: ignore[call-arg] + + # Use PostgreSQL checkpointer if configured, else in-memory + self._checkpoint_db_url = config.checkpoint_db_url + self._checkpointer = None # Lazy-initialized in execute() + self._checkpointer_initialized = False + if not self._checkpoint_db_url or self._checkpoint_db_url == "memory": + self._checkpointer = MemorySaver() + self._checkpointer_initialized = True + logger.info("Using in-memory checkpointer (set CHECKPOINT_DB_URL for persistence)") + else: + logger.info("PostgreSQL checkpointer configured: %s", self._checkpoint_db_url.split("@")[-1]) + self._workspace_manager = WorkspaceManager( + workspace_root=config.workspace_root, + agent_name="sandbox-legion", + ttl_days=config.context_ttl_days, + ) + + # C19: Clean up expired workspaces on startup. + cleaned = self._workspace_manager.cleanup_expired() + if cleaned: + logger.info("Cleaned up %d expired workspaces: %s", len(cleaned), cleaned) + + # TOFU: verify workspace config file integrity on startup. + # Logs warnings on mismatch but does not block the agent from starting. + _tofu_verify(_PACKAGE_ROOT) + + async def _ensure_checkpointer(self) -> None: + """Initialize or re-initialize the PostgreSQL checkpointer. + + Creates a new connection pool if not initialized yet, or if the + existing connection is stale (e.g., after a PostgreSQL restart). 
+ """ + if not self._checkpoint_db_url: + return + + needs_init = not self._checkpointer_initialized + + # Check if existing connection is stale + if self._checkpointer_initialized and self._checkpointer: + try: + # Lightweight health check — attempt a simple query + pool = getattr(self._checkpointer, 'conn', None) or getattr(self._checkpointer, '_conn', None) + if pool and hasattr(pool, 'execute'): + await pool.execute("SELECT 1") + except Exception: + logger.warning("PostgreSQL checkpointer connection stale — re-initializing") + # Close old connection + if hasattr(self, '_checkpointer_cm') and self._checkpointer_cm: + try: + await self._checkpointer_cm.__aexit__(None, None, None) + except Exception: + pass + needs_init = True + self._checkpointer_initialized = False + + if needs_init: + from langgraph.checkpoint.postgres.aio import AsyncPostgresSaver + + cm = AsyncPostgresSaver.from_conn_string(self._checkpoint_db_url) + self._checkpointer = await cm.__aenter__() + self._checkpointer_cm = cm + await self._checkpointer.setup() + self._checkpointer_initialized = True + logger.info("PostgreSQL checkpointer initialized") + + # ------------------------------------------------------------------ + + async def execute( + self, context: RequestContext, event_queue: EventQueue + ) -> None: + """Execute a user request through the LangGraph sandbox graph. + + Steps: + 1. Get or create an A2A task. + 2. Resolve the workspace directory from context_id. + 3. Build and stream the LangGraph graph. + 4. Emit status updates and artifacts via TaskUpdater. + """ + # 1. Get or create task + task = context.current_task + if not task: + task = new_task(context.message) # type: ignore + await event_queue.enqueue_event(task) + + task_updater = TaskUpdater(event_queue, task.id, task.context_id) + + # 2. 
Resolve workspace from context_id + context_id = task.context_id + if context_id: + workspace_path = self._workspace_manager.ensure_workspace(context_id) + logger.info("Using workspace for context_id=%s: %s", context_id, workspace_path) + else: + workspace_path = "/tmp/sandbox-stateless" + Path(workspace_path).mkdir(parents=True, exist_ok=True) + logger.info("No context_id; using stateless workspace: %s", workspace_path) + + # Lazy-init PostgreSQL checkpointer on first execute() + await self._ensure_checkpointer() + + # 3. Build graph with shared checkpointer for multi-turn memory + namespace = os.environ.get("NAMESPACE", "team1") + graph = build_graph( + workspace_path=workspace_path, + permission_checker=self._permission_checker, + sources_config=self._sources_config, + checkpointer=self._checkpointer, + context_id=context_id or "stateless", + namespace=namespace, + ) + + # 4. Stream graph execution with thread_id for checkpointer routing. + # Acquire a per-context_id lock so that two concurrent requests for + # the same conversation are serialized (the LangGraph checkpointer + # is not safe for parallel writes to the same thread_id). + lock = await self._get_context_lock(context_id or "stateless") + logger.info( + "Acquiring context lock for context_id=%s (already locked: %s)", + context_id, + lock.locked(), + ) + + async with lock: + messages = [HumanMessage(content=context.get_user_input())] + input_state: dict[str, Any] = { + "messages": messages, + "workspace_path": workspace_path, + "context_id": context_id or "stateless", + } + + # Extract skill from A2A message metadata and load its content. + # TODO(Session N): Once base image moves to kagenti repo, use + # skill_pack_loader.py at startup to clone verified skill packs + # from skill-packs.yaml into /workspace/.claude/skills/ before + # the first message. Currently skills must be pre-populated. 
+ msg = context.message + skill_id = None + if msg and msg.metadata: + skill_id = msg.metadata.get("skill") + + if skill_id: + skill_content = _load_skill(workspace_path, skill_id) + if skill_content: + input_state["skill_instructions"] = ( + f'\n' + f"{skill_content}\n" + f"\n\n" + f"Follow the skill instructions above for this task." + ) + logger.info("Loaded skill '%s' for context_id=%s", skill_id, context_id) + else: + logger.warning("Skill '%s' requested but not found in workspace %s", skill_id, workspace_path) + + graph_config = { + "configurable": {"thread_id": context_id or "stateless"}, + "recursion_limit": AgentBudget().recursion_limit, + } + logger.info("Processing messages: %s (thread_id=%s)", input_state, context_id) + + try: + output = None + serializer = LangGraphSerializer(context_id=context_id) + llm_request_ids: list[str] = [] + + # Run graph in a shielded background task so client disconnect + # does NOT cancel the LangGraph execution. Events are fed + # through an asyncio.Queue; the consumer (below) forwards them + # to the A2A event stream. If the consumer is cancelled the + # graph keeps running and saves results to the task store. + _SENTINEL = object() + event_queue: asyncio.Queue = asyncio.Queue() + + async def _run_graph() -> None: + """Execute graph and push events to queue (shielded).""" + nonlocal graph + max_retries = 3 + for attempt in range(max_retries + 1): + try: + async for ev in graph.astream( + input_state, config=graph_config, stream_mode="updates" + ): + await event_queue.put(ev) + break # success + except Exception as retry_err: + err_str = str(retry_err).lower() + is_quota = "insufficient_quota" in err_str + is_rate = "rate_limit" in err_str or "429" in err_str + is_db_stale = "connection is closed" in err_str or "operationalerror" in err_str + if is_quota: + logger.error("LLM quota exceeded: %s", retry_err) + await event_queue.put( + {"_error": "LLM API quota exceeded. 
Check billing."} + ) + break + elif is_db_stale and attempt < max_retries: + logger.warning( + "DB connection stale (%d/%d), re-initializing checkpointer: %s", + attempt + 1, max_retries, retry_err, + ) + await self._ensure_checkpointer() + # Rebuild graph with fresh checkpointer + graph = build_graph( + workspace_path=workspace_path, + permission_checker=self._permission_checker, + sources_config=self._sources_config, + checkpointer=self._checkpointer, + context_id=context_id or "stateless", + namespace=namespace, + ) + continue + elif is_rate and attempt < max_retries: + delay = 2 ** (attempt + 1) + logger.warning( + "Rate limited (%d/%d), retrying in %ds: %s", + attempt + 1, max_retries, delay, retry_err, + ) + await asyncio.sleep(delay) + continue + else: + logger.error("Graph execution failed: %s", retry_err, exc_info=True) + await event_queue.put({"_error": str(retry_err)}) + break + await event_queue.put(_SENTINEL) + + # Shield the graph task from cancellation + graph_task = asyncio.ensure_future(asyncio.shield(_run_graph())) + + # Consume events from the queue — this side CAN be cancelled + event_count = 0 + client_disconnected = False + while True: + try: + event = await event_queue.get() + except asyncio.CancelledError: + logger.warning( + "Event consumer cancelled (context=%s) — graph continues in background", + context_id, + ) + client_disconnected = True + break + if event is _SENTINEL: + break + if "_error" in event: + error_msg = event["_error"] + await task_updater.update_status( + TaskState.working, + new_agent_text_message( + json.dumps({"type": "error", "message": error_msg}), + task_updater.context_id, + task_updater.task_id, + ), + ) + parts = [TextPart(text=f"Error: {error_msg}")] + await task_updater.add_artifact(parts) + await task_updater.failed() + return + + event_count += 1 + node_names = list(event.keys()) + logger.info( + "Graph event %d: nodes=%s (context=%s)", + event_count, node_names, context_id, + ) + + # Skip __interrupt__ events 
(HITL pause) — these contain + # tuples, not dicts, and shouldn't be serialized. + if "__interrupt__" in event: + logger.info( + "Graph interrupted (HITL) at event %d: %s", + event_count, event.get("__interrupt__"), + ) + # Emit a structured HITL event for the frontend + hitl_data = event.get("__interrupt__", ()) + hitl_msg = str(hitl_data[0]) if hitl_data else "Approval required" + hitl_json = json.dumps({ + "type": "hitl_request", + "loop_id": serializer._loop_id, + "message": hitl_msg[:500], + }) + await task_updater.update_status( + TaskState.working, + new_agent_text_message( + hitl_json + "\n", + task_updater.context_id, + task_updater.task_id, + ), + ) + continue + + # Send intermediate status updates as structured JSON + try: + serialized_lines = "\n".join( + serializer.serialize(key, value) + for key, value in event.items() + if isinstance(value, dict) + ) + "\n" + await task_updater.update_status( + TaskState.working, + new_agent_text_message( + serialized_lines, + task_updater.context_id, + task_updater.task_id, + ), + ) + line_types = [] + for line in serialized_lines.split("\n"): + line = line.strip() + if line: + try: + lt = json.loads(line).get("type", "?") + line_types.append(lt) + except json.JSONDecodeError: + line_types.append("parse_error") + logger.info("A2A_EMIT session=%s lines=%d types=%s", + context_id, len(line_types), line_types) + except asyncio.CancelledError: + logger.warning( + "SSE update cancelled at event %d (context=%s) — client disconnected", + event_count, context_id, + ) + client_disconnected = True + break + except Exception as update_err: + logger.error( + "Failed to send SSE update for event %d: %s", + event_count, update_err, + ) + output = event + + # Capture LLM request_ids from AIMessage responses + for _node_val in event.values(): + if isinstance(_node_val, dict): + for _msg in _node_val.get("messages", []): + _rid = getattr(_msg, "response_metadata", {}).get("id") + if _rid and _rid not in llm_request_ids: + 
llm_request_ids.append(_rid) + + # If client disconnected, wait for graph to finish in background + if client_disconnected: + logger.info("Waiting for graph to complete in background (context=%s)", context_id) + try: + await asyncio.wait_for(graph_task, timeout=300) + except (asyncio.TimeoutError, asyncio.CancelledError): + logger.warning("Graph background task timed out or cancelled (context=%s)", context_id) + # Drain remaining events — serialize and persist them + # since the SSE consumer was cancelled and missed these. + bg_event_count = 0 + bg_serialized_lines: list[str] = [] + while not event_queue.empty(): + ev = event_queue.get_nowait() + if ev is _SENTINEL or "_error" in ev: + continue + output = ev + bg_event_count += 1 + # Serialize each event so it can be persisted + try: + for key, value in ev.items(): + if isinstance(value, dict): + serialized = serializer.serialize(key, value) + bg_serialized_lines.append(serialized) + except Exception as ser_err: + logger.warning("Failed to serialize bg event %d: %s", bg_event_count, ser_err) + if bg_event_count > 0: + logger.info( + "Drained %d background events for context=%s, serialized %d lines", + bg_event_count, context_id, len(bg_serialized_lines), + ) + # Persist via task_updater so the events appear in history + for line_block in bg_serialized_lines: + try: + await task_updater.update_status( + TaskState.working, + new_agent_text_message( + line_block + "\n", + task_updater.context_id, + task_updater.task_id, + ), + ) + except Exception: + pass # best-effort + + # Extract final answer from the last event. + # The reporter node sets {"final_answer": "..."}. + # Fall back to checking messages from reporter or executor. + final_answer = None + if output: + # 1. Check reporter node output (plan-execute-reflect) + reporter_output = output.get("reporter", {}) + if isinstance(reporter_output, dict): + final_answer = reporter_output.get("final_answer") + + # 2. 
Fall back to executor/assistant message content + if not final_answer: + for node_name in ("reporter", "executor", "assistant"): + node_output = output.get(node_name, {}) + if isinstance(node_output, dict): + msgs = node_output.get("messages", []) + if msgs: + content = getattr(msgs[-1], "content", None) + if isinstance(content, list): + final_answer = "\n".join( + block.get("text", "") if isinstance(block, dict) else str(block) + for block in content + if isinstance(block, dict) and block.get("type") == "text" + ) or None + elif content: + final_answer = str(content) + if final_answer: + break + + if final_answer is None: + final_answer = "No response generated." + + # Store LLM request_ids in task metadata for token usage tracking + if llm_request_ids: + try: + existing_meta = {} + if task.metadata: + existing_meta = dict(task.metadata) if not isinstance(task.metadata, dict) else task.metadata + existing_meta["llm_request_ids"] = llm_request_ids + task.metadata = existing_meta + logger.info( + "Stored %d LLM request_ids in task metadata for context_id=%s", + len(llm_request_ids), context_id, + ) + except Exception as meta_err: + logger.warning("Failed to store llm_request_ids: %s", meta_err) + + # Add artifact with final answer and complete + parts = [TextPart(text=final_answer)] + await task_updater.add_artifact(parts) + await task_updater.complete() + + except asyncio.CancelledError: + logger.warning( + "Graph execution context cancelled for context=%s — client likely disconnected. " + "Agent will continue processing and save results to task store.", + context_id, + ) + # Don't return — fall through to save results to task store. + # The A2A SDK persists the task, so the client can poll later. 
+ except Exception as e: + logger.error("Graph execution error: %s", e, exc_info=True) + error_msg = json.dumps({"type": "error", "message": str(e)}) + await task_updater.update_status( + TaskState.working, + new_agent_text_message( + error_msg, + task_updater.context_id, + task_updater.task_id, + ), + ) + parts = [TextPart(text=f"Error: {e}")] + await task_updater.add_artifact(parts) + await task_updater.failed() + + # Periodic cleanup: remove locks that are no longer held and whose + # context_id has not been seen recently. We do this opportunistically + # after each execution to avoid unbounded growth. + async with self._context_locks_mutex: + stale = [cid for cid, lk in self._context_locks.items() if not lk.locked()] + # Keep the dict from growing without bound, but only drop entries + # when there are more than 1000 idle locks. + if len(stale) > 1000: + for cid in stale: + del self._context_locks[cid] + logger.debug("Cleaned up %d idle context locks", len(stale)) + + # ------------------------------------------------------------------ + + async def cancel( + self, context: RequestContext, event_queue: EventQueue + ) -> None: + """Cancel is not supported.""" + raise Exception("cancel not supported") + + +# --------------------------------------------------------------------------- +# Server entry point +# --------------------------------------------------------------------------- + + +class _MergingDatabaseTaskStore(DatabaseTaskStore): + """DatabaseTaskStore that preserves backend-managed metadata fields. + + The backend writes fields like ``owner``, ``agent_name``, ``loop_events`` + to the ``metadata`` column. The default ``save()`` uses SQLAlchemy + ``merge()`` which overwrites the entire row, losing those fields. + + This subclass reads existing metadata before writing and merges + backend-managed keys so they survive A2A SDK updates. 
+ """ + + _BACKEND_KEYS = frozenset({ + "owner", "visibility", "title", "agent_name", "loop_events", + }) + + async def save(self, task, context=None): + """Save task while preserving backend-managed metadata fields.""" + await self._ensure_initialized() + + # Read existing metadata before overwriting + existing_meta = {} + async with self.async_session_maker() as session: + from sqlalchemy import select + stmt = select(self.task_model).where(self.task_model.id == task.id) + result = await session.execute(stmt) + existing = result.scalar_one_or_none() + if existing and existing.task_metadata: + raw = existing.task_metadata + if isinstance(raw, dict): + existing_meta = raw + elif isinstance(raw, str): + import json + try: + existing_meta = json.loads(raw) + except (json.JSONDecodeError, TypeError): + pass + + # Merge: start with new task metadata, overlay backend fields from existing + merged = dict(task.metadata or {}) if task.metadata else {} + for key in self._BACKEND_KEYS: + if key in existing_meta and key not in merged: + merged[key] = existing_meta[key] + + # Update task metadata with merged result + task.metadata = merged if merged else task.metadata + + # Call parent save (which does session.merge) + db_task = self._to_orm(task) + async with self.async_session_maker.begin() as session: + await session.merge(db_task) + logger.debug("Task %s saved with merged metadata (keys=%s)", + task.id, list(merged.keys()) if merged else []) + + +def _create_task_store(): + """Create the appropriate TaskStore based on configuration. + + Uses _MergingDatabaseTaskStore (PostgreSQL) when TASK_STORE_DB_URL + is set. Falls back to InMemoryTaskStore for dev/test. + + The merging store preserves backend-managed metadata fields (owner, + agent_name, loop_events) that would otherwise be overwritten by + the A2A SDK's session.merge(). 
+ """ + import os + + db_url = os.environ.get("TASK_STORE_DB_URL", "") + if db_url and _HAS_SQL_STORE: + from sqlalchemy.ext.asyncio import create_async_engine + + engine = create_async_engine( + db_url, + pool_size=5, + max_overflow=3, + pool_recycle=300, # Recycle connections every 5 min + pool_pre_ping=True, # Verify connection before use + ) + store = _MergingDatabaseTaskStore(engine) + logger.info("Using MergingDatabaseTaskStore: %s", db_url.split("@")[-1]) + return store + + logger.info("Using InMemoryTaskStore (set TASK_STORE_DB_URL for persistence)") + return InMemoryTaskStore() + + +def _load_skill_packs_at_startup() -> None: + """Clone skill repos into /workspace/.claude/skills/ at startup. + + Reads SKILL_REPOS env var (comma-separated git URLs with optional + path suffix after #). Falls back to kagenti repo skills. + + TODO(Session N): Replace with skill_pack_loader.py once the base + image moves to the kagenti repo. + """ + import subprocess + + workspace = os.environ.get("WORKSPACE_DIR", "/workspace") + skills_dir = Path(workspace) / ".claude" / "skills" + + if skills_dir.exists() and any(skills_dir.rglob("*.md")): + logger.info("Skills already loaded at %s, skipping clone", skills_dir) + return + + # Default: clone kagenti skills from the upstream public repo + repos = os.environ.get( + "SKILL_REPOS", + "https://github.com/kagenti/kagenti.git#.claude/skills", + ) + + for entry in repos.split(","): + entry = entry.strip() + if not entry: + continue + + # Parse "url@branch#path" format + branch = None + if "#" in entry: + url_part, skill_path = entry.rsplit("#", 1) + else: + url_part, skill_path = entry, ".claude/skills" + if "@" in url_part and not url_part.startswith("git@"): + repo_url, branch = url_part.rsplit("@", 1) + else: + repo_url = url_part + + clone_dir = Path(workspace) / ".skill-repos" / repo_url.split("/")[-1].replace(".git", "") + + # Remove stale clone if exists (pod restart) + if clone_dir.exists(): + subprocess.run(["rm", "-rf", 
str(clone_dir)], capture_output=True, timeout=10) + + try: + cmd = ["git", "clone", "--depth", "1", "--single-branch"] + if branch: + cmd += ["--branch", branch] + cmd += [repo_url, str(clone_dir)] + logger.info("Cloning skills from %s branch=%s (path: %s)", repo_url, branch or "default", skill_path) + subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=120, + ) + + src = clone_dir / skill_path + if src.is_dir(): + skills_dir.mkdir(parents=True, exist_ok=True) + # Copy skill files (preserve directory structure) + subprocess.run( + ["cp", "-r"] + [str(p) for p in src.iterdir()] + [str(skills_dir)], + capture_output=True, + timeout=30, + ) + count = len(list(skills_dir.rglob("*.md"))) + logger.info("Loaded %d skill files from %s", count, repo_url) + else: + logger.warning("Skill path %s not found in %s", skill_path, repo_url) + except subprocess.TimeoutExpired: + logger.warning("Timeout cloning %s", repo_url) + except Exception as e: + logger.warning("Failed to clone skills from %s: %s", repo_url, e) + + +def run() -> None: + """Create the A2A server application and run it with uvicorn.""" + # Landlock probe: verify filesystem isolation works before accepting requests. + # Runs in a forked child (Landlock is irreversible). Exits the process if + # the kernel does not support Landlock or the probe fails. + if os.environ.get("SANDBOX_LANDLOCK") == "true": + from sandbox_agent.landlock_probe import probe_landlock + + abi = probe_landlock() # exits process if Landlock unavailable + logger.info("Landlock probe passed -- ABI version %d", abi) + + # Initialize OTel GenAI auto-instrumentation (if OTEL_EXPORTER_OTLP_ENDPOINT is set). + # NOTE: Only LangChain/OpenAI auto-instrumentation is enabled here. + # The HTTP middleware is disabled because it interferes with SSE streaming + # (BaseHTTPMiddleware captures response body, breaking streaming connections). + # TODO: Replace with per-node span emission from AgentGraphCard processing. 
+ setup_observability() + + # Load skills from git repos before building the agent card + _load_skill_packs_at_startup() + + agent_card = get_agent_card(host="0.0.0.0", port=8000) + + request_handler = DefaultRequestHandler( + agent_executor=SandboxAgentExecutor(), + task_store=_create_task_store(), + ) + + server = A2AStarletteApplication( + agent_card=agent_card, + http_handler=request_handler, + ) + + # Build the Starlette app + app = server.build() + + # NOTE: OTel HTTP middleware REMOVED — it breaks SSE streaming. + # BaseHTTPMiddleware wraps the response body iterator, which causes + # CancelledError propagation when SSE clients disconnect. This kills + # the event queue and prevents event delivery. + # Future: emit spans from AgentGraphCard event processing instead. + + # Add the /.well-known/agent-card.json route + app.routes.insert( + 0, + Route( + "/.well-known/agent-card.json", + server._handle_get_agent_card, + methods=["GET"], + name="agent_card_well_known", + ), + ) + + # Build the graph card from the compiled LangGraph. + # We compile a temporary graph just for introspection (no checkpointer needed). 
+ _graph_card_cache: dict[str, Any] = {} + + async def _handle_graph_card(request: Any) -> Any: # noqa: ARG001 + from starlette.responses import JSONResponse + + if not _graph_card_cache: + # Build a graph for introspection only (no checkpointer, dummy config) + from sandbox_agent.permissions import PermissionChecker + from sandbox_agent.sources import SourcesConfig + pc = PermissionChecker(settings={"workspace": "/workspace", "permissions": {}}) + sc = SourcesConfig() + compiled = build_graph( + workspace_path="/workspace", + permission_checker=pc, + sources_config=sc, + checkpointer=None, + ) + _graph_card_cache.update( + build_graph_card(compiled, agent_id="sandbox-legion-v1") + ) + return JSONResponse(_graph_card_cache) + + app.routes.insert( + 0, + Route( + "/.well-known/agent-graph-card.json", + _handle_graph_card, + methods=["GET"], + name="agent_graph_card", + ), + ) + + uvicorn.run(app, host="0.0.0.0", port=8000) From f29250db09bc931240284ef07df24d710766e822 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:47:34 +0100 Subject: [PATCH 03/26] feat(sandbox): budget tracking with iteration, token, tool-call, and wall-clock limits Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/budget.py | 177 ++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/budget.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/budget.py b/a2a/sandbox_agent/src/sandbox_agent/budget.py new file mode 100644 index 00000000..87816781 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/budget.py @@ -0,0 +1,177 @@ +"""Budget tracking for the plan-execute-reflect reasoning loop. + +Prevents runaway execution by capping iterations, tool calls per step, +total token usage, and wall clock time. When the budget is exceeded the +reflector forces the loop to terminate gracefully. 
+
+Token budget is enforced via the LLM Budget Proxy:
+- The proxy intercepts all LLM calls and checks per-session token usage
+- When budget is exceeded, the proxy returns HTTP 402
+- The agent catches 402 errors and terminates gracefully
+- The local ``tokens_used`` counter tracks in-process usage for budget
+  summary events (emitted to the UI) and for the local ``exceeded`` check
+
+Budget scopes:
+- **Per-message** (single graph run): max_iterations, max_wall_clock_s, recursion_limit
+- **Per-step** (within one plan step): max_tool_calls_per_step
+- **Per-session** (across A2A turns + restarts): enforced by LLM Budget Proxy
+
+Budget parameters are configurable via environment variables:
+
+- ``SANDBOX_MAX_ITERATIONS`` (default: 200)
+- ``SANDBOX_MAX_TOOL_CALLS_PER_STEP`` (default: 20)
+- ``SANDBOX_MAX_TOKENS`` (default: 1000000) — passed to proxy via metadata
+- ``SANDBOX_MAX_WALL_CLOCK_S`` (default: 3600) — max seconds per message (1 hour)
+- ``SANDBOX_HITL_INTERVAL`` (default: 50)
+- ``SANDBOX_RECURSION_LIMIT`` (default: 300)
+- ``SANDBOX_LLM_TIMEOUT`` (default: 300) — seconds per LLM call
+- ``SANDBOX_LLM_MAX_RETRIES`` (default: 3) — retry on transient LLM errors
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import time
+from dataclasses import dataclass, field
+
+logger = logging.getLogger(__name__)
+
+
+def _env_int(name: str, default: int) -> int:
+    """Read an integer from the environment, falling back to *default*."""
+    raw = os.environ.get(name)
+    if raw is None:
+        return default
+    try:
+        return int(raw)
+    except ValueError:
+        return default
+
+
+@dataclass
+class AgentBudget:
+    """Tracks resource usage across the reasoning loop.
+
+    Attributes
+    ----------
+    max_iterations:
+        Maximum outer-loop iterations (planner → executor → reflector).
+    max_tool_calls_per_step:
+        Maximum tool invocations the executor may make for a single plan step.
+    max_tokens:
+        Approximate upper bound on total tokens consumed (prompt + completion). 
+ Passed to the LLM Budget Proxy via request metadata. + max_wall_clock_s: + Maximum wall clock time in seconds for a single message run. + hitl_interval: + After this many iterations, the reflector suggests a human check-in. + recursion_limit: + LangGraph recursion limit passed to graph invocation config. + """ + + max_iterations: int = _env_int("SANDBOX_MAX_ITERATIONS", 200) + max_tool_calls_per_step: int = _env_int("SANDBOX_MAX_TOOL_CALLS_PER_STEP", 20) + max_tokens: int = _env_int("SANDBOX_MAX_TOKENS", 1_000_000) + max_wall_clock_s: int = _env_int("SANDBOX_MAX_WALL_CLOCK_S", 3600) # 1 hour + hitl_interval: int = _env_int("SANDBOX_HITL_INTERVAL", 50) + recursion_limit: int = _env_int("SANDBOX_RECURSION_LIMIT", 300) + llm_timeout: int = _env_int("SANDBOX_LLM_TIMEOUT", 300) + llm_max_retries: int = _env_int("SANDBOX_LLM_MAX_RETRIES", 3) + + # Mutable runtime counters — not constructor args. + iterations_used: int = field(default=0, init=False) + tokens_used: int = field(default=0, init=False) + tool_calls_this_step: int = field(default=0, init=False) + _start_time: float = field(default_factory=time.monotonic, init=False) + + # -- helpers ------------------------------------------------------------- + + def tick_iteration(self) -> None: + """Advance the iteration counter by one.""" + self.iterations_used += 1 + + def add_tokens(self, count: int) -> None: + """Accumulate *count* tokens (prompt + completion). + + Tracks in-process token usage for budget summary events and the + local ``exceeded`` check. The authoritative budget enforcement + is done by the LLM Budget Proxy (returns 402 when exceeded). 
+ """ + self.tokens_used += count + if self.tokens_exceeded: + logger.warning( + "Budget: tokens exceeded %d/%d", + self.tokens_used, + self.max_tokens, + ) + + def tick_tool_call(self) -> None: + """Record a tool invocation within the current step.""" + self.tool_calls_this_step += 1 + + def reset_step_tools(self) -> None: + """Reset the per-step tool-call counter (called between plan steps).""" + self.tool_calls_this_step = 0 + + # -- queries ------------------------------------------------------------- + + @property + def wall_clock_s(self) -> float: + """Seconds elapsed since this budget was created.""" + return time.monotonic() - self._start_time + + @property + def iterations_exceeded(self) -> bool: + return self.iterations_used >= self.max_iterations + + @property + def tokens_exceeded(self) -> bool: + return self.tokens_used >= self.max_tokens + + @property + def wall_clock_exceeded(self) -> bool: + return self.wall_clock_s >= self.max_wall_clock_s + + @property + def step_tools_exceeded(self) -> bool: + return self.tool_calls_this_step >= self.max_tool_calls_per_step + + @property + def exceeded(self) -> bool: + """Return True if *any* local budget limit has been reached. + + Token budget is NOT checked here — it is enforced by the LLM + Budget Proxy (returns HTTP 402). The agent catches 402 errors + in the executor/reflector/reporter nodes. 
+ """ + return self.iterations_exceeded or self.wall_clock_exceeded + + @property + def exceeded_reason(self) -> str | None: + """Human-readable reason for why the budget was exceeded, or None.""" + if self.iterations_exceeded: + return f"Iteration limit reached ({self.iterations_used}/{self.max_iterations})" + if self.wall_clock_exceeded: + return f"Time limit reached ({self.wall_clock_s:.0f}s/{self.max_wall_clock_s}s)" + return None + + @property + def needs_hitl_checkin(self) -> bool: + """Return True when it's time for a human-in-the-loop check-in.""" + return ( + self.hitl_interval > 0 + and self.iterations_used > 0 + and self.iterations_used % self.hitl_interval == 0 + ) + + def summary(self) -> dict: + """Return budget state as a dict for event serialization.""" + return { + "tokens_used": self.tokens_used, + "tokens_budget": self.max_tokens, + "iterations_used": self.iterations_used, + "iterations_budget": self.max_iterations, + "wall_clock_s": round(self.wall_clock_s, 1), + "max_wall_clock_s": self.max_wall_clock_s, + } From 03ccd9898e59991abdd625321613a8a4fbcd3677 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:47:40 +0100 Subject: [PATCH 04/26] feat(sandbox): pydantic configuration with per-node LLM model overrides Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/configuration.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/configuration.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/configuration.py b/a2a/sandbox_agent/src/sandbox_agent/configuration.py new file mode 100644 index 00000000..e712f1fd --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/configuration.py @@ -0,0 +1,30 @@ +from pydantic_settings import BaseSettings + + +class Configuration(BaseSettings): + llm_model: str = "llama3.1" + llm_api_base: str = "http://localhost:11434/v1" + llm_api_key: str = "dummy" + workspace_root: str = "/workspace" + checkpoint_db_url: str = "memory" + 
context_ttl_days: int = 7 + + # Per-node model overrides (empty = use llm_model default) + llm_model_planner: str = "" + llm_model_executor: str = "" + llm_model_reflector: str = "" + llm_model_reporter: str = "" + llm_model_thinking: str = "" # bare LLM for thinking iterations + llm_model_micro_reasoning: str = "" # LLM+tools for micro-reasoning + + def model_for_node(self, node: str) -> str: + """Return the model to use for a specific node type.""" + overrides = { + "planner": self.llm_model_planner, + "executor": self.llm_model_executor, + "reflector": self.llm_model_reflector, + "reporter": self.llm_model_reporter, + "thinking": self.llm_model_thinking, + "micro_reasoning": self.llm_model_micro_reasoning, + } + return overrides.get(node, "") or self.llm_model From df4beba6e5d5c1052cf8dfbd3836e3aba57a26b1 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:47:47 +0100 Subject: [PATCH 05/26] feat(sandbox): context builders for per-node message isolation in the reasoning loop Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/context_builders.py | 739 ++++++++++++++++++ 1 file changed, 739 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/context_builders.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/context_builders.py b/a2a/sandbox_agent/src/sandbox_agent/context_builders.py new file mode 100644 index 00000000..c3404711 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/context_builders.py @@ -0,0 +1,739 @@ +"""Pure functions that build the message list for each reasoning node, +and an ``invoke_llm`` wrapper that guarantees the debug output matches +exactly what was sent to the LLM. + +Each builder takes the graph state and returns a list of BaseMessage objects +that the node should pass to ``llm.ainvoke()``. The functions are +independently testable and enforce context isolation — no node sees +messages it shouldn't. 
+
+Context contracts:
+
+    Planner — SystemMessage(prompt + step status) + HumanMessage(user request only).
+        Does NOT include own previous AIMessages (prevents replan duplication).
+    Executor — SystemMessage(prompt) + HumanMessage(step brief) + this step's tool pairs.
+        Stops at [STEP_BOUNDARY] SystemMessage. Never sees planner output.
+    Reflector — SystemMessage(prompt) + last 10 tool-call AI→Tool pairs.
+        Filters out non-tool AIMessages (planner/reflector text).
+    Reporter — SystemMessage(prompt) + full history (intentional for summarization).
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Any
+
+from langchain_core.messages import (
+    AIMessage,
+    BaseMessage,
+    HumanMessage,
+    SystemMessage,
+    ToolMessage,
+)
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Planner context
+# ---------------------------------------------------------------------------
+
+_MAX_PLANNER_HISTORY_MSGS = 6  # user request + a few recent tool results
+
+
+def build_planner_context(
+    state: dict[str, Any],
+    system_content: str,
+) -> list[BaseMessage]:
+    """Build the message list for the planner node.
+
+    On fresh plan (iteration 0): SystemMessage + all user HumanMessages.
+    On replan (iteration > 0): SystemMessage + user request + last few
+    ToolMessages for context. **Excludes** previous planner AIMessages
+    to prevent the LLM from seeing and duplicating its own plan.
+
+    The step status and tool history are already in ``system_content``
+    (built by the caller), so they don't need to appear as messages. 
+ """ + messages = state.get("messages", []) + iteration = state.get("iteration", 0) + + if iteration == 0: + # Fresh plan: include only HumanMessages (user requests) + user_msgs = [m for m in messages if isinstance(m, HumanMessage)] + return [SystemMessage(content=system_content)] + user_msgs + + # Replan: user request + last few tool results for context. + # Explicitly EXCLUDE previous planner AIMessages to prevent duplication. + user_msgs = [m for m in messages if isinstance(m, HumanMessage)] + # Take the first user message (original request) + first_user = user_msgs[:1] if user_msgs else [] + + # Include last few ToolMessages so planner knows what was tried + recent_tools: list[BaseMessage] = [] + for m in reversed(messages): + if isinstance(m, ToolMessage): + recent_tools.insert(0, m) + if len(recent_tools) >= _MAX_PLANNER_HISTORY_MSGS: + break + + result = [SystemMessage(content=system_content)] + first_user + recent_tools + logger.info( + "Planner context: %d messages (iteration=%d, %d tool results)", + len(result), iteration, len(recent_tools), + extra={"session_id": state.get("context_id", ""), "node": "planner"}, + ) + return result + + +# --------------------------------------------------------------------------- +# Executor context +# --------------------------------------------------------------------------- + +_CHARS_PER_TOKEN = 4 +_MAX_CONTEXT_CHARS = 30_000 * _CHARS_PER_TOKEN # ~120k chars + + +def build_executor_context( + state: dict[str, Any], + system_content: str, +) -> list[BaseMessage]: + """Build the message list for the executor node. + + On new step (tool_call_count == 0): + SystemMessage(prompt) + HumanMessage(step brief). + The executor sees ONLY the step description — no plan, no history. + + On continuing step (tool_call_count > 0): + SystemMessage(prompt) + HumanMessage(step brief) + this step's + AI→Tool message pairs + HumanMessage(reflection prompt). 
+ The reflection prompt at the END forces the LLM to think about + the results before calling the next tool. + """ + all_msgs = state.get("messages", []) + current_step = state.get("current_step", 0) + tool_call_count = state.get("_tool_call_count", 0) + plan = state.get("plan", []) + step_text = plan[current_step] if current_step < len(plan) else "N/A" + step_brief = state.get( + "skill_instructions", + f"Execute step {current_step + 1}: {step_text}", + ) + + first_msg = [HumanMessage(content=step_brief)] + + if tool_call_count == 0: + # New step: only the step brief + windowed: list[BaseMessage] = [] + else: + # Continuing: walk back to [STEP_BOUNDARY N] SystemMessage, + # then inject a HumanMessage reflection after EACH ToolMessage. + raw_windowed: list[BaseMessage] = [] + used_chars = 0 + for m in reversed(all_msgs): + content = str(getattr(m, "content", "")) + if isinstance(m, SystemMessage) and content.startswith( + f"[STEP_BOUNDARY {current_step}]" + ): + break + msg_chars = len(content) + if used_chars + msg_chars > _MAX_CONTEXT_CHARS: + break + raw_windowed.insert(0, m) + used_chars += msg_chars + + # Inject reflection HumanMessage after each ToolMessage + windowed = [] + call_num = 0 + for m in raw_windowed: + windowed.append(m) + if isinstance(m, ToolMessage): + call_num += 1 + tool_name = getattr(m, "name", "unknown") + content = str(getattr(m, "content", "")) + # Determine status from exit code + if "EXIT_CODE:" in content: + import re as _re + ec_match = _re.search(r"EXIT_CODE:\s*(\d+)", content) + status = "FAILED" if ec_match and ec_match.group(1) != "0" else "OK" + error_hint = content[:150] if status == "FAILED" else "" + elif content.startswith("Error:") or "Permission denied" in content: + status = "FAILED" + error_hint = content[:150] + else: + status = "OK" + error_hint = "" + + reflection_parts = [ + f"Tool '{tool_name}' call {call_num} {status}.", + ] + if error_hint: + reflection_parts.append(f"Error: {error_hint}") + if "unknown flag" in 
content.lower() or "invalid option" in content.lower(): + reflection_parts.append( + "The flag is INVALID. Run the command with --help to see valid flags." + ) + reflection_parts.append( + f"Goal: \"{step_text[:100]}\"\n" + f"If goal ACHIEVED → stop, summarize result. " + f"If FAILED → try DIFFERENT approach. " + f"NEVER repeat same command." + ) + windowed.append(HumanMessage(content=" ".join(reflection_parts))) + + result = [SystemMessage(content=system_content)] + first_msg + windowed + logger.info( + "Executor context: %d messages, ~%dk chars (from %d total)", + len(result), sum(len(str(getattr(m, "content", ""))) for m in result) // 1000, + len(all_msgs), + extra={ + "session_id": state.get("context_id", ""), + "node": "executor", + "current_step": current_step, + "tool_call_count": tool_call_count, + }, + ) + return result + + +# --------------------------------------------------------------------------- +# Reflector context +# --------------------------------------------------------------------------- + +_MAX_REFLECTOR_PAIRS = 10 # last 10 AI→Tool pairs (20 messages max) + + +def build_reflector_context( + state: dict[str, Any], + system_content: str, +) -> list[BaseMessage]: + """Build the message list for the reflector node. + + Includes only the last ``_MAX_REFLECTOR_PAIRS`` AI→Tool pairs from + the message history. **Filters out** AIMessages that have no + ``tool_calls`` (planner plan text, reflector decisions, executor + summaries) to prevent plan leakage. + + The plan text and step results are already in ``system_content`` + (formatted from state fields), so they don't need to appear as + conversation messages. + """ + messages = state.get("messages", []) + + recent_msgs: list[BaseMessage] = [] + pair_count = 0 + for m in reversed(messages): + if isinstance(m, SystemMessage): + continue + # Skip AIMessages without tool_calls (planner/reflector text output). + # These would leak plan context into the reflector. 
+ if isinstance(m, AIMessage) and not getattr(m, "tool_calls", None): + continue + recent_msgs.insert(0, m) + if isinstance(m, AIMessage) and getattr(m, "tool_calls", None): + pair_count += 1 + if pair_count >= _MAX_REFLECTOR_PAIRS: + break + + result = [SystemMessage(content=system_content)] + recent_msgs + logger.info( + "Reflector context: %d messages (%d tool pairs from %d total)", + len(result), pair_count, len(messages), + extra={"session_id": state.get("context_id", ""), "node": "reflector"}, + ) + return result + + +# --------------------------------------------------------------------------- +# LLM invocation wrapper — captures exactly what the LLM sees +# --------------------------------------------------------------------------- + +_DEBUG_PROMPTS = os.environ.get("SANDBOX_DEBUG_PROMPTS", "1") == "1" + + +@dataclass +class LLMCallCapture: + """Captures the exact input/output of an LLM invocation. + + Always populated (not conditional on _DEBUG_PROMPTS) so that the + node result can decide what to include. This guarantees the debug + view shows exactly what the LLM received — no drift. + """ + + messages: list = field(default_factory=list) + response: Any = None + prompt_tokens: int = 0 + completion_tokens: int = 0 + model: str = "" + bound_tools: list = field(default_factory=list) # tool schemas sent to LLM + + # -- Convenience methods for node result dicts ------------------------- + + def debug_fields(self) -> dict[str, Any]: + """Return prompt debug fields for the node result dict. + + Only populated when ``SANDBOX_DEBUG_PROMPTS=1`` (default). + These are large payloads (system prompt, message list, full + response) — optional to reduce event size in production. + Token counts and budget are always included via ``token_fields()``. 
+ """ + if not _DEBUG_PROMPTS: + return {} + result: dict[str, Any] = { + "_system_prompt": self._system_prompt()[:10000], + "_prompt_messages": self._summarize_messages(), + "_llm_response": self._format_response(), + } + if self.bound_tools: + result["_bound_tools"] = self.bound_tools[:50] + return result + + def token_fields(self) -> dict[str, Any]: + """Return token usage fields for the node result dict.""" + return { + "model": self.model, + "prompt_tokens": self.prompt_tokens, + "completion_tokens": self.completion_tokens, + } + + # -- Internal helpers -------------------------------------------------- + + def _system_prompt(self) -> str: + """Extract the system prompt from the captured messages.""" + for m in self.messages: + if isinstance(m, SystemMessage): + return str(m.content) + return "" + + def _summarize_messages(self) -> list[dict[str, str]]: + """Summarize messages as {role, preview} dicts. + + Skips the first SystemMessage since it's already shown as _system_prompt. + """ + result = [] + skip_first_system = True + for msg in self.messages: + if skip_first_system and isinstance(msg, SystemMessage): + skip_first_system = False + continue + role = getattr(msg, "type", "unknown") + content = getattr(msg, "content", "") + if isinstance(content, list): + content = " ".join( + b.get("text", "") + for b in content + if isinstance(b, dict) and b.get("type") == "text" + ) + text = str(content) + tool_calls = getattr(msg, "tool_calls", None) + if tool_calls: + tc_parts = [] + for tc in tool_calls: + name = tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?") + args = tc.get("args", {}) if isinstance(tc, dict) else getattr(tc, "args", {}) + args_str = str(args)[:500] if args else "" + tc_parts.append(f"{name}({args_str})" if args_str else name) + text = f"[tool_calls: {'; '.join(tc_parts)}] {text[:2000]}" + tool_name = getattr(msg, "name", None) + if role == "tool" and tool_name: + text = f"[{tool_name}] {text[:3000]}" + else: + text = 
text[:5000] + result.append({"role": role, "preview": text}) + return result + + def _format_response(self) -> dict[str, Any]: + """Format the LLM response as OpenAI-style dict.""" + resp = self.response + if resp is None: + return {} + try: + meta = getattr(resp, "response_metadata", {}) or {} + content = resp.content + if isinstance(content, list): + content = " ".join( + b.get("text", "") + for b in content + if isinstance(b, dict) and b.get("type") == "text" + ) or None + tool_calls_out = None + if resp.tool_calls: + tool_calls_out = [ + { + "id": tc.get("id", "") if isinstance(tc, dict) else getattr(tc, "id", ""), + "type": "function", + "function": { + "name": tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?"), + "arguments": json.dumps( + tc.get("args", {}) if isinstance(tc, dict) else getattr(tc, "args", {}) + ), + }, + } + for tc in resp.tool_calls + ] + return { + "choices": [{ + "message": { + "role": "assistant", + "content": content if content else None, + "tool_calls": tool_calls_out, + }, + "finish_reason": meta.get("finish_reason", "unknown"), + }], + "model": meta.get("model", ""), + "usage": { + "prompt_tokens": self.prompt_tokens, + "completion_tokens": self.completion_tokens, + }, + "id": meta.get("id", ""), + } + except Exception: + return {"error": "Failed to format response"} + + +def _extract_bound_tools(llm: Any) -> list[dict[str, Any]]: + """Extract tool schemas from a LangChain RunnableBinding.""" + try: + tools = getattr(llm, "kwargs", {}).get("tools", []) + if not tools: + first = getattr(llm, "first", None) + if first: + tools = getattr(first, "kwargs", {}).get("tools", []) + result = [] + for t in tools[:50]: + if isinstance(t, dict): + fn = t.get("function", t) + result.append({"name": fn.get("name", "?"), "description": fn.get("description", "")[:100]}) + elif hasattr(t, "name"): + result.append({"name": t.name, "description": getattr(t, "description", "")[:100]}) + return result + except Exception: + return 
[] + + +async def invoke_llm( + llm: Any, + messages: list[BaseMessage], + *, + node: str = "", + session_id: str = "", + workspace_path: str = "", +) -> tuple[AIMessage, LLMCallCapture]: + """Invoke the LLM and capture the exact input/output. + + If ``workspace_path`` is provided, the workspace preamble is + automatically prepended to the first SystemMessage. This ensures + every LLM call sees the workspace path rule — nodes don't need + to inject it manually. + + Returns ``(response, capture)`` where capture contains: + - ``messages``: the exact messages sent to the LLM (with preamble) + - ``response``: the AIMessage returned + - ``prompt_tokens`` / ``completion_tokens``: token usage + - ``model``: model name from response metadata + + Usage in a node:: + + messages = build_executor_context(state, system_content) + response, capture = await invoke_llm( + llm, messages, node="executor", + workspace_path=state.get("workspace_path", "/workspace"), + ) + """ + # Inject workspace preamble into the first SystemMessage + if workspace_path and messages: + from sandbox_agent.prompts import WORKSPACE_PREAMBLE + + preamble = WORKSPACE_PREAMBLE.format(workspace_path=workspace_path) + if isinstance(messages[0], SystemMessage): + messages = [ + SystemMessage(content=preamble + "\n" + messages[0].content), + *messages[1:], + ] + else: + # No SystemMessage — prepend one + messages = [SystemMessage(content=preamble), *messages] + + response = await llm.ainvoke(messages) + + usage = getattr(response, "usage_metadata", None) or {} + prompt_tokens = usage.get("input_tokens", 0) or usage.get("prompt_tokens", 0) + completion_tokens = usage.get("output_tokens", 0) or usage.get("completion_tokens", 0) + model_name = (getattr(response, "response_metadata", None) or {}).get("model", "") + + # Extract bound tools from the LLM (RunnableBinding stores them in kwargs) + bound_tools = _extract_bound_tools(llm) + + capture = LLMCallCapture( + messages=list(messages), + response=response, + 
prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + model=model_name, + bound_tools=bound_tools, + ) + + logger.info( + "LLM call [%s]: %d messages, %d prompt tokens, %d completion tokens, model=%s", + node, len(messages), prompt_tokens, completion_tokens, model_name, + extra={"session_id": session_id, "node": node, + "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens}, + ) + + return response, capture + + +def _build_tool_descriptions(llm_with_tools: Any) -> str: + """Build a textual description of bound tools for the thinking prompt.""" + tools = _extract_bound_tools(llm_with_tools) + if not tools: + return "" + lines = ["Available tools:"] + for t in tools: + name = t.get("name", "?") + desc = t.get("description", "") + lines.append(f" - {name}: {desc}" if desc else f" - {name}") + return "\n".join(lines) + + +async def invoke_with_tool_loop( + llm_with_tools: Any, + llm_reason: Any | None, + messages: list[BaseMessage], + *, + node: str, + session_id: str, + workspace_path: str, + thinking_budget: int = 5, + max_parallel_tool_calls: int = 5, + max_cycles: int = 1, + tools: list | None = None, +) -> tuple[AIMessage, LLMCallCapture, list[dict[str, Any]]]: + """Invoke LLM with optional thinking iterations + micro-reasoning + tool execution. + + Returns ``(response, capture, sub_events)`` where sub_events is a list + of thinking event dicts — one per thinking iteration. + + When ``tools`` is provided AND ``max_cycles > 1``, runs a full + think → tool-call → execute → see-result → think loop internally. + Tools are executed via ``asyncio.gather`` for parallel calls. + + When ``llm_reason`` is provided (thinking mode): + 1. Thinking loop (up to ``thinking_budget`` iterations): + Bare LLM reasons about what to do. + 2. Micro-reasoning: LLM with tools makes tool calls. + 3. If ``tools`` provided: execute tools, feed results back, loop. 
+ + When ``llm_reason`` is None (single-phase mode): + One call to llm_with_tools with implicit auto. No sub_events. + """ + import asyncio + + sub_events: list[dict[str, Any]] = [] + total_thinking_tokens = 0 + all_captures: list[LLMCallCapture] = [] + + # Build tool lookup for direct execution + tool_map: dict[str, Any] = {} + if tools: + for t in tools: + name = getattr(t, "name", None) + if name: + tool_map[name] = t + + # Track conversation for multi-cycle loops + cycle_messages = list(messages) + + for cycle in range(max(max_cycles, 1)): + last_reasoning = "" + + if llm_reason is not None: + # --- Thinking phase --- + thinking_history: list[BaseMessage] = [] + + for i in range(thinking_budget): + thinking_messages = list(cycle_messages) + thinking_history + + if i == 0: + thinking_messages.append( + HumanMessage(content="Brief analysis (2-3 sentences max): " + "What is the best tool call for this step? " + "If step is already done, say READY: step complete.") + ) + else: + thinking_messages.append( + HumanMessage(content="Refine in 1-2 sentences. " + "When ready: READY: ") + ) + + reason_response, reason_capture = await invoke_llm( + llm_reason, thinking_messages, + node=f"{node}-think-{cycle+1}.{i+1}", session_id=session_id, + workspace_path=workspace_path, + ) + last_reasoning = str(reason_response.content or "").strip() + total_thinking_tokens += reason_capture.prompt_tokens + reason_capture.completion_tokens + + sub_events.append({ + "type": "thinking", + "node": node, + "cycle": cycle + 1, + "iteration": i + 1, + "total_iterations": 0, + "reasoning": last_reasoning, + **reason_capture.debug_fields(), + **reason_capture.token_fields(), + }) + + thinking_summary = last_reasoning[:200] + ("..." if len(last_reasoning) > 200 else "") + thinking_history.extend([ + AIMessage(content=thinking_summary), + HumanMessage(content=f"(Thinking {i+1} recorded. 
Continue or signal READY:)"), + ]) + + if last_reasoning.upper().startswith("READY:"): + break + + # --- Micro-reasoning: LLM with tools --- + tool_messages = cycle_messages + [ + AIMessage(content=last_reasoning or "I need to call a tool for this step."), + HumanMessage(content="Now execute your planned action. Rules:\n" + "- Call step_done(summary='...') if the step is ALREADY COMPLETE.\n" + "- Call ONE tool if there's a single action to take.\n" + "- Call multiple tools ONLY if they are independent (can run in parallel).\n" + "- NEVER call the same tool twice with similar args."), + ] + response, capture = await invoke_llm( + llm_with_tools, tool_messages, + node=f"{node}-tool-{cycle+1}", session_id=session_id, + workspace_path=workspace_path, + ) + capture.prompt_tokens += total_thinking_tokens + all_captures.append(capture) + + else: + # Single-phase: one LLM call with implicit auto + response, capture = await invoke_llm( + llm_with_tools, cycle_messages, + node=f"{node}-{cycle+1}" if max_cycles > 1 else node, + session_id=session_id, + workspace_path=workspace_path, + ) + all_captures.append(capture) + + # --- Intercept step_done --- + if response.tool_calls: + done_calls = [tc for tc in response.tool_calls if tc.get("name") == "step_done"] + if done_calls: + summary = done_calls[0].get("args", {}).get("summary", last_reasoning or "") + logger.info("step_done called in cycle %d: %s", cycle + 1, summary[:100], + extra={"session_id": session_id, "node": node}) + response = AIMessage(content=summary) + break + + # If micro-reasoning produced tool calls but no text, merge last thinking + if last_reasoning and response.tool_calls and not response.content: + response = AIMessage(content=last_reasoning, tool_calls=response.tool_calls) + + # Enforce max parallel tool calls + if len(response.tool_calls) > max_parallel_tool_calls: + response = AIMessage( + content=response.content, + tool_calls=response.tool_calls[:max_parallel_tool_calls], + ) + + # --- Execute tools 
if we have them and there are tool calls --- + if response.tool_calls and tool_map and max_cycles > 1: + # Emit tool_call sub_event BEFORE execution (so UI shows the call) + import uuid as _uuid + call_id = str(_uuid.uuid4())[:8] + sub_events.append({ + "type": "tool_call", + "node": node, + "cycle": cycle + 1, + "call_id": call_id, + "tools": [ + {"name": tc.get("name", "?"), "args": tc.get("args", {})} + for tc in response.tool_calls + ], + }) + + # Execute all tool calls in parallel via asyncio.gather + async def _run_tool(tc: dict) -> ToolMessage: + name = tc.get("name", "unknown") + args = tc.get("args", {}) + tc_id = tc.get("id", "unknown") + tool_fn = tool_map.get(name) + if tool_fn is None: + return ToolMessage(content=f"Error: tool '{name}' not found", tool_call_id=tc_id, name=name) + try: + result = await tool_fn.ainvoke(args) + return ToolMessage(content=str(result)[:10000], tool_call_id=tc_id, name=name) + except Exception as exc: + return ToolMessage(content=f"Error: {exc}", tool_call_id=tc_id, name=name) + + tool_results = await asyncio.gather(*[_run_tool(tc) for tc in response.tool_calls]) + + # Add tool call + results to conversation for next cycle + cycle_messages.append(response) + cycle_messages.extend(tool_results) + + # Emit tool_result sub_events AFTER execution (so UI shows results) + for tm in tool_results: + content_str = str(getattr(tm, "content", "")) + import re as _re + exit_match = _re.search(r"EXIT_CODE:\s*(\d+)", content_str) + is_error = ( + (exit_match is not None and exit_match.group(1) != "0") + or content_str.startswith("Error:") + ) + sub_events.append({ + "type": "tool_result", + "node": node, + "cycle": cycle + 1, + "call_id": call_id, + "name": getattr(tm, "name", "unknown"), + "output": content_str[:2000], + "status": "error" if is_error else "success", + }) + + logger.info( + "Cycle %d/%d [%s]: %d tool calls executed, continuing", + cycle + 1, max_cycles, node, len(response.tool_calls), + extra={"session_id": session_id, 
"node": node}, + ) + continue # Next cycle + else: + # No tools to execute or last cycle — return response + break + + # If we executed tools internally, strip tool_calls from final response + # so the graph doesn't try to re-execute them via ToolNode + if tool_map and max_cycles > 1 and response.tool_calls: + last_content = str(response.content or "") + if not last_content: + last_content = f"Completed {cycle + 1} think-act cycles." + response = AIMessage(content=last_content) + + # Update total_iterations on all thinking sub_events + thinking_events = [e for e in sub_events if e.get("type") == "thinking"] + total_iters = len(thinking_events) + for evt in thinking_events: + evt["total_iterations"] = total_iters + + # Merge all captures into the last one + final_capture = all_captures[-1] if all_captures else LLMCallCapture() + for c in all_captures[:-1]: + final_capture.prompt_tokens += c.prompt_tokens + final_capture.completion_tokens += c.completion_tokens + + logger.info( + "Tool loop %s: %d cycles, %d thinking iterations, %d total tokens", + node, cycle + 1, total_iters, + final_capture.prompt_tokens + final_capture.completion_tokens, + extra={"session_id": session_id, "node": node}, + ) + + return response, final_capture, sub_events From 720d0ecc8df3d4f6c902a3ea7d23bbf944089198 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:47:52 +0100 Subject: [PATCH 06/26] feat(sandbox): typed event schema for LangGraph node events streamed to UI Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/event_schema.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/event_schema.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/event_schema.py b/a2a/sandbox_agent/src/sandbox_agent/event_schema.py new file mode 100644 index 00000000..d99fb4c2 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/event_schema.py @@ -0,0 +1,121 @@ +# Copyright 2025 IBM Corp. 
# Licensed under the Apache License, Version 2.0

"""Typed event schema for LangGraph node events.

Each LangGraph node emits a distinct event type. The dataclasses here are
the single source of truth; the TypeScript frontend mirrors these types
in ``agentLoop.ts``.
"""

import json
from dataclasses import asdict, dataclass, field
from typing import List


class NodeEventType:
    """Constants for the ``type`` discriminator on every LoopEvent."""

    PLANNER_OUTPUT = "planner_output"
    EXECUTOR_STEP = "executor_step"
    TOOL_CALL = "tool_call"
    TOOL_RESULT = "tool_result"
    REFLECTOR_DECISION = "reflector_decision"
    REPORTER_OUTPUT = "reporter_output"
    BUDGET_UPDATE = "budget_update"
    HITL_REQUEST = "hitl_request"


# ---------------------------------------------------------------------------
# Base
# ---------------------------------------------------------------------------


@dataclass
class LoopEvent:
    """Base event emitted by a graph node during the reasoning loop.

    BUG FIX: ``loop_id`` previously had no default value.  Every subclass
    re-declares ``type`` WITH a default, and a re-declared dataclass field
    keeps its original position in the field order — so each subclass
    ended up with a defaulted ``type`` followed by a non-defaulted
    ``loop_id``, raising ``TypeError: non-default argument 'loop_id'
    follows default argument`` as soon as the module was imported.
    Giving ``loop_id`` a default makes every subclass constructible and
    is backward compatible (callers that passed it explicitly still can).
    """

    type: str                    # one of the NodeEventType constants
    loop_id: str = ""            # unique per reasoning-loop invocation
    model: str = ""
    prompt_tokens: int = 0
    completion_tokens: int = 0

    def to_json(self) -> str:
        """Serialize this event (including subclass fields) to a JSON string."""
        return json.dumps(asdict(self))


# ---------------------------------------------------------------------------
# Concrete event types
# ---------------------------------------------------------------------------


@dataclass
class PlannerOutput(LoopEvent):
    """Planner created or revised a plan."""

    type: str = NodeEventType.PLANNER_OUTPUT
    steps: List[str] = field(default_factory=list)
    iteration: int = 0


@dataclass
class ExecutorStep(LoopEvent):
    """Executor is working on a plan step."""

    type: str = NodeEventType.EXECUTOR_STEP
    step: int = 0
    total_steps: int = 0
    description: str = ""
    reasoning: str = ""  # Full LLM response text (up to 2000 chars)


@dataclass
class ToolCall(LoopEvent):
    """Executor invoked a tool."""

    type: str = NodeEventType.TOOL_CALL
    step: int = 0
    name: str = ""
    args: str = ""


@dataclass
class ToolResult(LoopEvent):
    """Tool returned a result."""

    type: str = NodeEventType.TOOL_RESULT
    step: int = 0
    name: str = ""
    output: str = ""


@dataclass
class ReflectorDecision(LoopEvent):
    """Reflector reviewed execution and decided next action."""

    type: str = NodeEventType.REFLECTOR_DECISION
    decision: str = ""    # "continue", "replan", "done"
    assessment: str = ""  # Full reflection text
    iteration: int = 0


@dataclass
class ReporterOutput(LoopEvent):
    """Reporter generated the final answer."""

    type: str = NodeEventType.REPORTER_OUTPUT
    content: str = ""


@dataclass
class BudgetUpdate(LoopEvent):
    """Budget tracking update."""

    type: str = NodeEventType.BUDGET_UPDATE
    tokens_used: int = 0
    tokens_budget: int = 0
    wall_clock_s: float = 0.0
    max_wall_clock_s: float = 0.0
+ +Event types (new — node-specific): + planner_output — Planner created/revised a plan + executor_step — Executor starts working on a plan step + tool_call — Tool invoked (unchanged) + tool_result — Tool returned output (unchanged) + reflector_decision — Reflector decides continue/replan/done + reporter_output — Reporter generates the final answer + budget_update — Budget tracking + error — An error occurred during execution + hitl_request — Human-in-the-loop approval is needed + +Legacy types (kept for backward compatibility): + plan — Alias for planner_output + plan_step — Alias for executor_step + reflection — Alias for reflector_decision + llm_response — Generic LLM text (used for unknown nodes only) +""" + +from __future__ import annotations + +import json +import logging +import uuid +from abc import ABC, abstractmethod +from typing import Any + +from sandbox_agent import plan_store as ps + +logger = logging.getLogger(__name__) + + +def _safe_tc(tc: Any) -> dict[str, Any]: + """Safely extract name/args from a tool call object. + + LangChain tool_calls can be dicts, ToolCall TypedDicts, or + InvalidToolCall objects (tuples). Handle all formats gracefully. + """ + try: + if isinstance(tc, dict): + return {"name": tc.get("name", "unknown"), "args": tc.get("args", {})} + if hasattr(tc, "name"): + return {"name": getattr(tc, "name", "unknown"), "args": getattr(tc, "args", {})} + if isinstance(tc, (list, tuple)) and len(tc) >= 2: + return {"name": str(tc[0]), "args": tc[1] if isinstance(tc[1], dict) else {}} + except Exception: + pass + return {"name": "unknown", "args": {}} + + +class FrameworkEventSerializer(ABC): + """Base class for framework-specific event serialization. + + Subclass this for each agent framework (LangGraph, CrewAI, AG2). + The ``serialize`` method must return a JSON string with at least + a ``type`` field. + """ + + @abstractmethod + def serialize(self, key: str, value: dict) -> str: + """Serialize a framework event into a JSON string. 
+ + Parameters + ---------- + key: + The graph node name (e.g. "assistant", "tools"). + value: + The event payload from the framework's streaming API. + + Returns + ------- + str + A JSON string with at least ``{"type": "..."}`` + """ + ... + + +class LangGraphSerializer(FrameworkEventSerializer): + """Serialize LangGraph ``stream_mode='updates'`` events. + + LangGraph emits events like:: + + {"assistant": {"messages": [AIMessage(...)]}} + {"tools": {"messages": [ToolMessage(...)]}} + + This serializer extracts tool calls, tool results, and LLM + responses into structured JSON. + + When the graph uses a plan-execute-reflect reasoning loop, all + events include a ``loop_id`` so the frontend can group them into + an expandable AgentLoopCard. + """ + + # Nodes whose events are sub-items of the preceding node visit + # (they don't get their own node_visit number). + _TOOL_NODES = frozenset({"tools", "planner_tools", "reflector_tools"}) + + def __init__(self, loop_id: str | None = None, context_id: str | None = None) -> None: + self._loop_id = loop_id or str(uuid.uuid4())[:8] + self._step_index = 0 + self._event_counter = 0 # global sequence number for ordering + self._node_visit = 0 # graph node visit counter (main sections) + self._sub_index = 0 # position within current node visit + self._last_node_key: str = "" # track previous node for visit grouping + self._micro_step: int = 0 + self._context_id = context_id or "unknown" + self._last_call_id: str = "" + self._prev_node: str | None = None # previous node for node_transition events + + def serialize(self, key: str, value: dict) -> str: + + # Emit node_transition meta-event when the node changes + transition_line: str | None = None + if self._prev_node is not None and key != self._prev_node: + self._event_counter += 1 + transition_event = { + "type": "node_transition", + "loop_id": self._loop_id, + "from_node": self._prev_node, + "to_node": key, + "event_index": self._event_counter, + "langgraph_node": key, + } + 
transition_line = json.dumps(transition_event) + self._prev_node = key + + # Node visit tracking: + # - Tool nodes (tools, planner_tools, reflector_tools) inherit parent visit + # - Same node type re-entering (executor→tools→executor) stays on same visit + # - Different node type (executor→reflector, reflector→planner) = new visit + if key not in self._TOOL_NODES: + if key != self._last_node_key: + self._node_visit += 1 + self._sub_index = 0 + self._last_node_key = key + # event_counter incremented per JSON line in post-processing. + + # Track actual plan step from state for step grouping + current_step = value.get("current_step") + if current_step is not None: + new_step = current_step + 1 # 1-based for display + if new_step != self._step_index: + self._step_index = new_step + self._micro_step = 0 # reset micro_step on plan step change + + # Reasoning-loop nodes may emit state fields instead of messages + if key == "router": + # Router is an internal node — emit minimal event for logging + route = value.get("_route", "new") + result = json.dumps({ + "type": "router", + "loop_id": self._loop_id, + "route": route, + "plan_status": value.get("plan_status", ""), + }) + elif key == "planner": + result = self._serialize_planner(value) + elif key == "reflector": + result = self._serialize_reflector(value) + elif key == "step_selector": + # Reset micro_step on every step transition + self._micro_step = 0 + current_step = value.get("current_step", 0) + plan_steps = value.get("plan_steps", []) + step_desc = "" + if current_step < len(plan_steps): + step_entry = plan_steps[current_step] + step_desc = step_entry.get("description", "") if isinstance(step_entry, dict) else str(step_entry) + brief = value.get("skill_instructions", "") + # Strip the "STEP BRIEF FROM COORDINATOR:" prefix + if "STEP BRIEF" in brief: + brief = brief.split("---")[0].replace("STEP BRIEF FROM COORDINATOR:", "").strip() + result = json.dumps({ + "type": "step_selector", + "loop_id": self._loop_id, + 
"current_step": current_step, + "description": f"Advancing to step {current_step + 1}: {step_desc[:80]}", + "brief": brief[:500], + "done": value.get("done", False), + }) + elif key == "reporter": + result = self._serialize_reporter(value) + else: + msgs = value.get("messages", []) + if not msgs: + result = json.dumps({"type": "llm_response", "content": f"[{key}]"}) + else: + msg = msgs[-1] + + if key == "executor": + result = self._serialize_executor(msg, value) + elif key == "tools": + result = self._serialize_tool_result(msg) + else: + # Unknown node — treat as informational + content = getattr(msg, "content", "") + if isinstance(content, list): + text = self._extract_text_blocks(content) + else: + text = str(content)[:2000] if content else f"[{key}]" + result = json.dumps({"type": "llm_response", "content": text}) + + # Append budget_update event if _budget_summary is in the value dict + budget_summary = value.get("_budget_summary") + if budget_summary and isinstance(budget_summary, dict): + budget_event = json.dumps({ + "type": "budget_update", + "loop_id": self._loop_id, + **budget_summary, + }) + result = result + "\n" + budget_event + + # Post-process: ensure ALL event lines have step + unique event_index. + # Each JSON line gets its own event_index (no duplicates). + # Legacy event types (plan, plan_step, reflection) are skipped from + # indexing to avoid inflating the counter. 
+ enriched_lines = [] + + # Prepend node_transition event if one was emitted + if transition_line is not None: + enriched_lines.append(transition_line) + + for line in result.split("\n"): + line = line.strip() + if not line: + continue + try: + evt = json.loads(line) + if "step" not in evt: + cs = evt.get("current_step") + evt["step"] = (cs + 1) if cs is not None else self._step_index + event_type = evt.get("type", "?") + self._event_counter += 1 + evt["event_index"] = self._event_counter + evt["node_visit"] = self._node_visit + evt["sub_index"] = self._sub_index + evt["langgraph_node"] = key + self._sub_index += 1 + enriched_lines.append(json.dumps(evt)) + except json.JSONDecodeError: + enriched_lines.append(line) + event_type = "parse_error" + logger.info("SERIALIZE session=%s loop=%s type=%s step=%s ei=%s", + self._context_id, self._loop_id, event_type, + self._step_index, self._event_counter, + extra={"session_id": self._context_id, "node": key, + "event_type": event_type, "step": self._step_index}) + + return "\n".join(enriched_lines) + + def _serialize_assistant(self, msg: Any) -> str: + """Serialize an assistant (LLM) node output. + + When the LLM calls tools, it often also produces reasoning text. 
+ We emit BOTH the thinking content and the tool call as separate + JSON lines so the UI shows the full chain: + {"type": "llm_response", "content": "Let me check..."} + {"type": "tool_call", "tools": [...]} + """ + tool_calls = getattr(msg, "tool_calls", []) + content = getattr(msg, "content", "") + + # Extract any text content from the LLM + if isinstance(content, list): + text = self._extract_text_blocks(content) + else: + text = str(content)[:2000] if content else "" + + if tool_calls: + parts = [] + # Emit thinking/reasoning text first (if present) + if text.strip(): + parts.append(json.dumps({"type": "llm_response", "content": text})) + # Then emit the tool call + parts.append(json.dumps({ + "type": "tool_call", + "tools": [ + _safe_tc(tc) + for tc in tool_calls + ], + })) + return "\n".join(parts) + + return json.dumps({"type": "llm_response", "content": text}) + + def _serialize_executor(self, msg: Any, value: dict | None = None) -> str: + """Serialize an executor node output with loop_id for AgentLoopCard.""" + tool_calls = getattr(msg, "tool_calls", []) + content = getattr(msg, "content", "") + + if isinstance(content, list): + text = self._extract_text_blocks(content) + else: + text = str(content)[:2000] if content else "" + + parts = [] + _v = value or {} + + # Emit sub_events: thinking iterations, tool calls, tool results + sub_events = _v.get("_sub_events", []) + for se in sub_events: + se_type = se.get("type", "") + if se_type == "thinking": + thinking_event = { + "type": "thinking", + "loop_id": self._loop_id, + "cycle": se.get("cycle", 1), + "iteration": se.get("iteration", 1), + "total_iterations": se.get("total_iterations", 1), + "reasoning": se.get("reasoning", "")[:50000], + "node": se.get("node", "executor"), + "model": se.get("model", ""), + "prompt_tokens": se.get("prompt_tokens", 0), + "completion_tokens": se.get("completion_tokens", 0), + } + for field in ("_system_prompt", "_prompt_messages", "_bound_tools", "_llm_response"): + if field 
in se: + thinking_event[field.lstrip("_")] = se[field] + parts.append(json.dumps(thinking_event)) + elif se_type == "tool_call": + parts.append(json.dumps({ + "type": "tool_call", + "loop_id": self._loop_id, + "call_id": se.get("call_id", ""), + "cycle": se.get("cycle", 1), + "tools": se.get("tools", []), + })) + elif se_type == "tool_result": + parts.append(json.dumps({ + "type": "tool_result", + "loop_id": self._loop_id, + "call_id": se.get("call_id", ""), + "cycle": se.get("cycle", 1), + "name": se.get("name", "unknown"), + "output": se.get("output", "")[:2000], + "status": se.get("status", "success"), + })) + + self._micro_step += 1 + + # Skip micro_reasoning for dedup responses (no LLM call happened) + if not _v.get("_dedup"): + # Annotate micro_reasoning with thinking count + if sub_events: + _v = {**_v, "_thinking_count": len(sub_events)} + parts.append(self._serialize_micro_reasoning(msg, _v)) + + plan = _v.get("plan", []) + model = _v.get("model", "") + prompt_tokens = _v.get("prompt_tokens", 0) + completion_tokens = _v.get("completion_tokens", 0) + prompt_data = self._extract_prompt_data(_v) + + # Emit executor_step event so UI shows which step is executing + current_plan_step = _v.get("current_step", 0) + step_payload = { + "type": "executor_step", + "loop_id": self._loop_id, + "plan_step": current_plan_step, + "iteration": _v.get("iteration", 0), + "total_steps": len(plan) if plan else 0, + "description": text[:200] if text else "", + "reasoning": text[:2000] if text else "", + "model": model, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + **prompt_data, + } + parts.append(json.dumps(step_payload)) + + if tool_calls: + # Use LangGraph's tool_call_id for proper pairing with tool_result + tc0 = tool_calls[0] if tool_calls else {} + call_id = ( + tc0.get("id") if isinstance(tc0, dict) + else getattr(tc0, "id", None) + ) or str(uuid.uuid4())[:8] + self._last_call_id = call_id + parts.append(json.dumps({ + "type": "tool_call", 
+ "loop_id": self._loop_id, + "call_id": call_id, + "tools": [ + _safe_tc(tc) + for tc in tool_calls + ], + })) + return "\n".join(parts) + + # Emit tool_call event for text-parsed tools (no structured tool_calls) + parsed_tools = _v.get("parsed_tools", []) + if parsed_tools: + call_id = str(uuid.uuid4())[:8] + self._last_call_id = call_id + parts.append(json.dumps({ + "type": "tool_call", + "loop_id": self._loop_id, + "call_id": call_id, + "tools": [ + {"name": t["name"], "args": t.get("args", {})} + for t in parsed_tools + ], + })) + + return "\n".join(parts) + + def _serialize_micro_reasoning(self, msg: Any, value: dict) -> str: + """Emit a micro_reasoning event capturing the LLM's intermediate reasoning.""" + content = getattr(msg, "content", "") + if isinstance(content, list): + text = self._extract_text_blocks(content) + else: + text = str(content)[:50000] if content else "" + + tool_calls = getattr(msg, "tool_calls", []) + next_action = "tool_call" if tool_calls else "done" + + # When the LLM responds with only tool calls and no text reasoning, + # generate a summary so the micro-reasoning block isn't empty. 
+ if not text and tool_calls: + summaries = [] + for tc in tool_calls[:5]: + name = tc.get("name", "?") + args = tc.get("args", {}) + args_str = json.dumps(args, default=str)[:200] + summaries.append(f"→ {name}({args_str})") + text = "Decided next action:\n" + "\n".join(summaries) + + event: dict = { + "type": "micro_reasoning", + "loop_id": self._loop_id, + "micro_step": self._micro_step, + "after_call_id": self._last_call_id, + "reasoning": text[:50000], + "next_action": next_action, + "model": value.get("model", ""), + "prompt_tokens": value.get("prompt_tokens", 0), + "completion_tokens": value.get("completion_tokens", 0), + **self._extract_prompt_data(value), + } + # Include previous tool result for UI context (shows WHY this decision) + prev = value.get("_last_tool_result") + if prev: + event["previous_tool"] = prev + # Annotate with thinking iteration count for UI badge + tc = value.get("_thinking_count", 0) + if tc: + event["thinking_count"] = tc + return json.dumps(event) + + def _serialize_tool_result(self, msg: Any) -> str: + """Serialize a tool node output with loop_id.""" + name = getattr(msg, "name", "unknown") + content = getattr(msg, "content", "") + content_str = str(content) + # Determine error status from exit code, not content keywords. + # The shell tool appends "EXIT_CODE: N" for non-zero exits. + # Keyword matching (e.g. "failure", "error") causes false positives + # when command output contains those words in normal data. 
+ import re as _re + exit_match = _re.search(r"EXIT_CODE:\s*(\d+)", content_str) + is_error = ( + (exit_match is not None and exit_match.group(1) != "0") + or content_str.startswith("\u274c") + or content_str.startswith("Error: ") + or "Permission denied" in content_str + or "command not found" in content_str + ) + status = "error" if is_error else "success" + # Use LangGraph's tool_call_id for proper pairing with tool_call + call_id = getattr(msg, "tool_call_id", None) or self._last_call_id + return json.dumps({ + "type": "tool_result", + "loop_id": self._loop_id, + "call_id": call_id, + "name": str(name), + "output": content_str[:2000], + "status": status, + }) + + @staticmethod + def _enrich_with_plan_store(payload: dict, value: dict) -> None: + """Add PlanStore flat steps to payload if available.""" + store = value.get("_plan_store", {}) + if store and store.get("steps"): + payload["plan_steps"] = ps.to_flat_plan_steps(store) + + @staticmethod + def _extract_prompt_data(value: dict) -> dict: + """Extract prompt visibility fields from node output.""" + data: dict = {} + sp = value.get("_system_prompt", "") + if sp: + data["system_prompt"] = sp[:50000] + pm = value.get("_prompt_messages") + if pm: + data["prompt_messages"] = pm[:100] # max 100 messages + bt = value.get("_bound_tools") + if bt: + data["bound_tools"] = bt[:50] # max 50 tools + lr = value.get("_llm_response") + if lr: + data["llm_response"] = lr + return data + + def _serialize_planner(self, value: dict) -> str: + """Serialize a planner node output — emits planner_output + legacy plan.""" + plan_steps = value.get("plan_steps", []) + plan = [s.get("description", "") for s in plan_steps] if plan_steps else value.get("plan", []) + iteration = value.get("iteration", 1) + + # Also include any LLM text from the planner's message + msgs = value.get("messages", []) + text = "" + if msgs: + content = getattr(msgs[-1], "content", "") + if isinstance(content, list): + text = self._extract_text_blocks(content) 
+ else: + text = str(content)[:2000] if content else "" + + model = value.get("model", "") + prompt_tokens = value.get("prompt_tokens", 0) + completion_tokens = value.get("completion_tokens", 0) + prompt_data = self._extract_prompt_data(value) + + # Distinguish initial plan from replan + is_replan = iteration > 1 + event_type = "replanner_output" if is_replan else "planner_output" + + payload = { + "type": event_type, + "loop_id": self._loop_id, + "steps": plan, + "iteration": iteration, + "content": text, + "model": model, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + **prompt_data, + } + + self._enrich_with_plan_store(payload, value) + + return json.dumps(payload) + + def _serialize_reflector(self, value: dict) -> str: + """Serialize a reflector node output — emits reflector_decision + legacy reflection.""" + done = value.get("done", False) + current_step = value.get("current_step", 0) + step_results = value.get("step_results", []) + + # Extract decision text from message if present + msgs = value.get("messages", []) + text = "" + if msgs: + content = getattr(msgs[-1], "content", "") + if isinstance(content, list): + text = self._extract_text_blocks(content) + else: + text = str(content)[:500] if content else "" + + # Derive the decision keyword from the text + decision = "done" if done else self._extract_decision(text) + + # Strip prompt echo from assessment — the LLM sometimes echoes the + # system prompt instructions. Extract only the actual decision word + # or a brief justification, never the echoed prompt. + assessment = text.strip() + + # If the response contains prompt markers, it's an echo — just use the decision. 
+ prompt_markers = ( + "Output the single word:", + "output ONLY the decision word", + "Decide ONE of the following", + "DECISION PROCESS:", + "STALL DETECTION:", + "REPLAN RULES:", + ) + is_prompt_echo = any(marker in assessment for marker in prompt_markers) + if is_prompt_echo or not assessment or len(assessment) > 200: + assessment = decision + + # Reset micro_step counter for next iteration + self._micro_step = 0 + + model = value.get("model", "") + prompt_tokens = value.get("prompt_tokens", 0) + completion_tokens = value.get("completion_tokens", 0) + iteration = value.get("iteration", 0) + prompt_data = self._extract_prompt_data(value) + + payload = { + "type": "reflector_decision", + "loop_id": self._loop_id, + "decision": decision, + "assessment": assessment, + "iteration": iteration, + "done": done, + "current_step": current_step, + "model": model, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + **prompt_data, + } + + self._enrich_with_plan_store(payload, value) + + return json.dumps(payload) + + def _serialize_reporter(self, value: dict) -> str: + """Serialize a reporter node output — emits reporter_output. + + When the reporter LLM calls the ``respond_to_user`` escape tool + instead of producing text content, we extract the ``response`` + argument and emit it as a clean ``reporter_output`` event rather + than a raw ``tool_call`` event. 
+ """ + final_answer = value.get("final_answer", "") + + # Check messages for respond_to_user tool call or text content + if not final_answer: + msgs = value.get("messages", []) + for msg in msgs: + # Check for respond_to_user tool call first + tool_calls = getattr(msg, "tool_calls", None) + if tool_calls: + for tc in tool_calls: + tc_info = _safe_tc(tc) + if tc_info["name"] == "respond_to_user": + args = tc_info["args"] + final_answer = ( + args.get("response", "") + if isinstance(args, dict) + else str(args) + ) + break + if final_answer: + break + + # Fall back to text content + content = getattr(msg, "content", "") + if content: + if isinstance(content, list): + final_answer = self._extract_text_blocks(content) + else: + final_answer = str(content)[:2000] + if final_answer: + break + + model = value.get("model", "") + prompt_tokens = value.get("prompt_tokens", 0) + completion_tokens = value.get("completion_tokens", 0) + prompt_data = self._extract_prompt_data(value) + + payload = { + "type": "reporter_output", + "loop_id": self._loop_id, + "content": final_answer[:2000], + "model": model, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + **prompt_data, + } + + files_touched = value.get("files_touched", []) + if files_touched: + payload["files_touched"] = files_touched[:30] + + return json.dumps(payload) + + @staticmethod + def _extract_decision(text: str) -> str: + """Extract a decision keyword from reflector text. + + Returns one of: ``continue``, ``replan``, ``done``, ``hitl``. + Defaults to ``continue`` if the text is ambiguous. 
+ """ + text_lower = text.strip().lower() + for decision in ("done", "replan", "hitl", "continue"): + if decision in text_lower: + return decision + return "continue" + + @staticmethod + def _extract_text_blocks(content: list) -> str: + """Extract text from a list of content blocks.""" + return " ".join( + b.get("text", "") + for b in content + if isinstance(b, dict) and b.get("type") == "text" + )[:2000] From d3bb92ee4c7458ede2ac61fdeb7a9aa4ad79df33 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:48:05 +0100 Subject: [PATCH 08/26] feat(sandbox): shell executor with permission-checked command execution in workspace Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/executor.py | 364 ++++++++++++++++++ 1 file changed, 364 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/executor.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/executor.py b/a2a/sandbox_agent/src/sandbox_agent/executor.py new file mode 100644 index 00000000..7d3777a6 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/executor.py @@ -0,0 +1,364 @@ +"""Sandbox executor -- runs shell commands inside a context workspace. + +Every command is checked against the :class:`PermissionChecker` before +execution. The three possible outcomes are: + + DENY -- an error :class:`ExecutionResult` is returned immediately + HITL -- :class:`HitlRequired` is raised so the LangGraph graph can + trigger an ``interrupt()`` for human approval + ALLOW -- the command is executed via ``asyncio.create_subprocess_shell`` + inside *workspace_path* with a timeout from :class:`SourcesConfig` +""" + +from __future__ import annotations + +import asyncio +import logging +import os +import shlex +from dataclasses import dataclass + +from sandbox_agent.permissions import PermissionChecker, PermissionResult +from sandbox_agent.sources import SourcesConfig + +logger = logging.getLogger(__name__) + +# Shell interpreters that can execute arbitrary code via -c / -e flags. 
# Shell interpreters that can execute arbitrary code via -c / -e flags.
_INTERPRETERS = frozenset({"bash", "sh", "python", "python3", "perl", "ruby", "node"})

# Flags that take an inline command string as the next argument.
_EXEC_FLAGS = frozenset({"-c", "-e", "--eval"})


# ---------------------------------------------------------------------------
# Exceptions
# ---------------------------------------------------------------------------


class HitlRequired(Exception):
    """Raised when an operation needs human approval.

    Attributes
    ----------
    command:
        The shell command that requires approval.
    """

    def __init__(self, command: str) -> None:
        self.command = command
        super().__init__(f"Human approval required for command: {command}")


# ---------------------------------------------------------------------------
# Result dataclass
# ---------------------------------------------------------------------------


@dataclass
class ExecutionResult:
    """Captures the outcome of a shell command execution."""

    stdout: str
    stderr: str
    exit_code: int


# ---------------------------------------------------------------------------
# Executor
# ---------------------------------------------------------------------------


class SandboxExecutor:
    """Runs shell commands in a workspace directory with permission checks.

    Parameters
    ----------
    workspace_path:
        Absolute path to the workspace directory where commands execute.
    permission_checker:
        A :class:`PermissionChecker` instance for evaluating operations.
    sources_config:
        A :class:`SourcesConfig` instance providing runtime limits.
    """

    def __init__(
        self,
        workspace_path: str,
        permission_checker: PermissionChecker,
        sources_config: SourcesConfig,
    ) -> None:
        self._workspace_path = workspace_path
        self._permission_checker = permission_checker
        self._sources_config = sources_config
        # Opt-in Landlock filesystem isolation (no fallback when enabled).
        self._use_landlock = os.environ.get("SANDBOX_LANDLOCK") == "true"
        if self._use_landlock:
            logger.info("Landlock isolation ENABLED for workspace %s", workspace_path)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def run_shell(self, command: str) -> ExecutionResult:
        """Run a shell command after checking permissions and sources.json.

        Parameters
        ----------
        command:
            The shell command string to execute.

        Returns
        -------
        ExecutionResult
            On success (ALLOW) or on DENY (with a non-zero exit code and
            an error message in stderr).

        Raises
        ------
        HitlRequired
            When the command matches neither allow nor deny rules and
            requires human approval.
        """
        # 1. Extract the command prefix for permission matching.
        #    Try "cmd subcmd" first (e.g. "pip install"), then fall back
        #    to just "cmd" (e.g. "grep").
        operation = command.strip()

        # 1a. Check for interpreter bypass (e.g. bash -c "curl evil.com").
        #     If the outer command is an interpreter with -c/-e, recursively
        #     check the inner command against the same permission + sources
        #     pipeline.  This prevents circumventing deny rules by wrapping
        #     a blocked command in `bash -c "..."`.
        bypass_denial = self._check_interpreter_bypass(operation)
        if bypass_denial is not None:
            return ExecutionResult(
                stdout="",
                stderr=bypass_denial,
                exit_code=1,
            )

        permission = self._check_permission(operation)

        # 2. Act on the permission result.
        if permission is PermissionResult.DENY:
            return ExecutionResult(
                stdout="",
                stderr=f"Permission denied: command '{command}' is denied by policy.",
                exit_code=1,
            )

        if permission is PermissionResult.HITL:
            raise HitlRequired(command)

        # 3. Check sources.json enforcement (package blocking, git remote
        #    allowlist) as a second layer of defense-in-depth.
        sources_denial = self._check_sources(operation)
        if sources_denial:
            return ExecutionResult(
                stdout="",
                stderr=sources_denial,
                exit_code=1,
            )

        # 4. ALLOW -- execute the command.
        return await self._execute(command)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _check_interpreter_bypass(self, command: str) -> str | None:
        """Check if a command uses an interpreter to bypass restrictions.

        Detects patterns like ``bash -c "curl evil.com"`` or
        ``python3 -c "import os; os.system('rm -rf /')"`` and recursively
        checks the inner command against permissions and sources policy.

        Returns
        -------
        str or None
            An error message if the inner command is denied, or *None* if
            no interpreter bypass was detected (or the inner command is OK).
        """
        try:
            parts = shlex.split(command)
        except ValueError:
            # Unparseable quoting — let the normal permission check decide.
            return None

        if len(parts) < 3:
            return None

        # Resolve the binary name (handle /usr/bin/bash -> bash).
        cmd = parts[0].rsplit("/", 1)[-1]
        if cmd not in _INTERPRETERS:
            return None

        if parts[1] not in _EXEC_FLAGS:
            return None

        # Everything after the exec flag is the inner command.
        inner_command = " ".join(parts[2:])
        logger.warning(
            "Interpreter bypass detected: '%s' wraps inner command '%s'",
            command,
            inner_command,
        )

        # Recursively check the inner command against permission rules.
        inner_permission = self._check_permission(inner_command)
        if inner_permission is PermissionResult.DENY:
            return (
                f"Permission denied: interpreter bypass detected. "
                f"Inner command '{inner_command}' is denied by policy."
            )

        # Also check the inner command against sources.json policy
        # (e.g. git clone to a disallowed remote inside bash -c).
        inner_sources_denial = self._check_sources(inner_command)
        if inner_sources_denial:
            return (
                f"Blocked: interpreter bypass detected. "
                f"Inner command violates sources policy: {inner_sources_denial}"
            )

        return None

    def _check_permission(self, operation: str) -> PermissionResult:
        """Check the permission for a shell operation.

        The permission checker expects the full command string as the
        operation. It internally handles prefix matching (e.g. matching
        "grep -r foo" against the rule ``shell(grep:*)``).
        """
        return self._permission_checker.check("shell", operation)

    def _check_sources(self, operation: str) -> str | None:
        """Check sources.json enforcement for package and git operations.

        Returns an error message string if the operation is blocked by
        sources.json, or None if it is allowed.
        """
        import re

        parts = operation.split()
        if not parts:
            return None

        # --- Package manager checks ---
        # pip install
        if len(parts) >= 3 and parts[0] == "pip" and parts[1] == "install":
            if not self._sources_config.is_package_manager_enabled("pip"):
                return "Blocked by sources.json: pip is not enabled."
            for pkg in parts[2:]:
                if pkg.startswith("-"):
                    continue  # skip flags
                # Strip version specifiers (e.g. "requests>=2.0")
                pkg_name = re.split(r"[><=!~]", pkg)[0]
                if pkg_name and self._sources_config.is_package_blocked("pip", pkg_name):
                    return f"Blocked by sources.json: package '{pkg_name}' is on the blocked list."

        # npm install
        if len(parts) >= 3 and parts[0] == "npm" and parts[1] == "install":
            if not self._sources_config.is_package_manager_enabled("npm"):
                return "Blocked by sources.json: npm is not enabled."
            for pkg in parts[2:]:
                if pkg.startswith("-"):
                    continue
                if pkg.startswith("@"):
                    # Scoped package (@scope/name[@version]): the leading
                    # '@' is part of the name, so the version separator is
                    # the SECOND '@'.  Splitting the raw string on '@'
                    # would yield an empty name and silently skip the
                    # blocklist check (bypass).
                    pkg_name = "@" + re.split(r"[@><=!~]", pkg[1:])[0]
                else:
                    pkg_name = re.split(r"[@><=!~]", pkg)[0]
                if pkg_name and self._sources_config.is_package_blocked("npm", pkg_name):
                    return f"Blocked by sources.json: package '{pkg_name}' is on the blocked list."

        # --- Git remote checks ---
        # git clone
        if len(parts) >= 3 and parts[0] == "git" and parts[1] == "clone":
            # Find the URL argument (skip flags like --depth, --branch)
            url = None
            i = 2
            while i < len(parts):
                if parts[i].startswith("-"):
                    # Skip flag and its value if it takes one
                    if parts[i] in ("--depth", "--branch", "-b"):
                        i += 2
                        continue
                    i += 1
                    continue
                url = parts[i]
                break
            if url and not self._sources_config.is_git_remote_allowed(url):
                return f"Blocked by sources.json: git remote '{url}' is not in allowed_remotes."

        return None

    async def _execute(self, command: str) -> ExecutionResult:
        """Execute *command* in the workspace directory with a timeout.

        The subprocess is started in its own session (process group) so
        that on timeout the WHOLE group -- the shell and any children it
        spawned -- can be killed, not just the shell itself.

        When ``SANDBOX_LANDLOCK=true``, each command is executed inside a
        Landlock-restricted subprocess that can only write to the workspace
        and a session-specific /tmp directory. There is no fallback --
        if Landlock fails, the command fails.
        """
        timeout = self._sources_config.max_execution_time_seconds

        if self._use_landlock:
            return await self._execute_landlock(command, timeout)

        try:
            process = await asyncio.create_subprocess_shell(
                command,
                cwd=self._workspace_path,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
                # New session => new process group; lets us kill children too.
                start_new_session=True,
            )

            try:
                stdout_bytes, stderr_bytes = await asyncio.wait_for(
                    process.communicate(),
                    timeout=timeout,
                )
            except asyncio.TimeoutError:
                # Kill the whole process group (the shell AND its children).
                # A bare process.kill() would leave grandchildren running.
                import signal

                try:
                    os.killpg(process.pid, signal.SIGKILL)
                except (ProcessLookupError, PermissionError, OSError):
                    try:
                        process.kill()
                    except ProcessLookupError:
                        pass  # already exited
                # Wait for the process to be reaped.
                await process.wait()
                return ExecutionResult(
                    stdout="",
                    stderr=(
                        f"Command timed out after {timeout} seconds "
                        f"and was killed: '{command}'"
                    ),
                    exit_code=-1,
                )

            return ExecutionResult(
                stdout=(stdout_bytes or b"").decode("utf-8", errors="replace"),
                stderr=(stderr_bytes or b"").decode("utf-8", errors="replace"),
                exit_code=process.returncode if process.returncode is not None else -1,
            )

        except OSError as exc:
            return ExecutionResult(
                stdout="",
                stderr=f"Failed to start command: {exc}",
                exit_code=-1,
            )

    async def _execute_landlock(self, command: str, timeout: float) -> ExecutionResult:
        """Execute *command* inside a Landlock-sandboxed subprocess.

        No fallback -- if Landlock application fails in the child, the
        error propagates as a non-zero exit code.
        """
        from sandbox_agent.sandbox_subprocess import sandboxed_subprocess

        returncode, stdout, stderr = await sandboxed_subprocess(
            command=command,
            workspace_path=self._workspace_path,
            timeout=timeout,
        )
        return ExecutionResult(
            stdout=stdout,
            stderr=stderr,
            exit_code=returncode,
        )
+ +The graph binds six tools to an LLM and uses a structured reasoning loop: + +- **shell**: runs commands via :class:`SandboxExecutor` (with permission checks) +- **file_read**: reads files relative to the workspace (prevents path traversal) +- **file_write**: writes files relative to the workspace (prevents path traversal) +- **web_fetch**: fetches web content from allowed domains +- **explore**: spawns a read-only sub-agent for codebase research +- **delegate**: spawns a child agent session for delegated tasks + +Graph architecture (router → plan → execute → reflect): + +```mermaid +graph TD + START((User Message)) --> router + router -->|new/replan| planner + router -->|resume| executor + + planner --> executor + executor -->|tool_calls| tools + tools --> executor + executor -->|no tool_calls| reflector + + reflector -->|execute| executor + reflector -->|replan| planner + reflector -->|done| reporter + reporter --> END((Final Answer)) + + style router fill:#4CAF50,color:white + style planner fill:#2196F3,color:white + style executor fill:#FF9800,color:white + style tools fill:#607D8B,color:white + style reflector fill:#9C27B0,color:white + style reporter fill:#F44336,color:white +``` + +Key flows: +- **execute**: Step succeeded → executor runs the next plan step +- **replan**: Step failed → planner creates a new plan → executor runs it +- **done**: Task complete → reporter summarizes results + +The executor uses micro-reflection: one tool call per LLM invocation, +see result, decide next action. Budget limits (iterations, tokens, +wall clock) are the only hard stops. 
+""" + +from __future__ import annotations + +import logging +import os +from pathlib import Path +from typing import Any, Optional + +from langchain_core.tools import tool +from langchain_openai import ChatOpenAI +from langgraph.graph import MessagesState, StateGraph +from langgraph.prebuilt import ToolNode, tools_condition +from langgraph.types import Send, interrupt + +try: + from langgraph.errors import GraphInterrupt +except ImportError: + # Fallback for older langgraph versions + GraphInterrupt = type("GraphInterrupt", (Exception,), {}) + +from sandbox_agent.budget import AgentBudget +from sandbox_agent.executor import HitlRequired, SandboxExecutor +from sandbox_agent.permissions import PermissionChecker +from sandbox_agent.reasoning import ( + PlanStep, + _DEBUG_PROMPTS, + executor_node, + planner_node, + reflector_node, + reporter_node, + route_entry, + route_reflector, + router_node, +) +from sandbox_agent import plan_store as ps +from sandbox_agent.sources import SourcesConfig +from sandbox_agent.subagents import make_delegate_tool, make_explore_tool + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# State +# --------------------------------------------------------------------------- + + +class SandboxState(MessagesState): + """Extended MessagesState carrying sandbox-specific fields. + + Attributes + ---------- + context_id: + A2A context identifier for multi-turn conversations. + workspace_path: + Absolute path to the per-context workspace directory. + final_answer: + The agent's final answer (set when the graph completes). + plan: + Flat list of step descriptions (backward compat with serializer). + plan_steps: + Structured per-step tracking with status, tool calls, results. + This is the source of truth; ``plan`` is derived from it. 
+ plan_status: + Lifecycle status of the plan across A2A turns: + ``"executing"`` | ``"completed"`` | ``"failed"`` | ``"awaiting_continue"`` + plan_version: + Incremented on each replan. + original_request: + The user's first message that created this plan. + current_step: + Index of the plan step currently being executed (0-based). + step_results: + Summary of each completed step's output. + iteration: + Outer-loop iteration counter (planner → executor → reflector). + replan_count: + Number of times the reflector has chosen "replan". Used to cap + the replan loop and force termination after MAX_REPLAN_COUNT. + done: + Flag set by reflector when the task is complete. + skill_instructions: + Optional skill content loaded from a ``.claude/skills/`` file. + recent_decisions: + Rolling window of the last 10 reflector decisions (continue/replan/done). + _route: + Internal routing signal from the router node (not persisted). + """ + + context_id: str + workspace_path: str + final_answer: str + plan: list[str] + plan_steps: list[PlanStep] + plan_status: str + plan_version: int + original_request: str + current_step: int + step_results: list[str] + iteration: int + replan_count: int + done: bool + skill_instructions: str + prompt_tokens: int + completion_tokens: int + recent_decisions: list[str] + _tool_call_count: int + _route: str + _system_prompt: str + _prompt_messages: list[dict] + _budget_summary: dict + _no_tool_count: int + _sub_events: list[dict] + _last_tool_result: dict + _bound_tools: list[dict] + _llm_response: dict + _plan_store: dict + files_touched: list[str] + model: str + + +# --------------------------------------------------------------------------- +# Skill loader +# --------------------------------------------------------------------------- + + +def _load_skill(workspace: str, skill_id: str) -> str | None: + """Load a skill file from the workspace's ``.claude/skills/`` directory. 
+ + Parameters + ---------- + workspace: + Absolute path to the workspace root (or repo root). + skill_id: + Skill identifier, e.g. ``"rca:ci"`` or ``"tdd:hypershift"``. + Colons are converted to directory separators so ``rca:ci`` + resolves to ``rca/ci.md``. + + Returns + ------- + str | None + The skill file content, or ``None`` if no matching file exists. + """ + # Search in multiple locations: + # 1. Per-session workspace: /workspace/{contextId}/.claude/skills/ + # 2. Shared workspace root: /workspace/.claude/skills/ (cloned at startup) + workspace_root = os.environ.get("WORKSPACE_DIR", "/workspace") + search_dirs = [ + Path(workspace) / ".claude" / "skills", + Path(workspace_root) / ".claude" / "skills", + ] + + for skills_dir in search_dirs: + if not skills_dir.is_dir(): + continue + + # Primary path: replace ':' with '/' → rca:ci → rca/ci.md + primary = skills_dir / f"{skill_id.replace(':', '/')}.md" + if primary.is_file(): + logger.info("Loaded skill '%s' from %s", skill_id, primary) + return primary.read_text(encoding="utf-8", errors="replace") + + # Try SKILL.md inside directory named with colons → rca:ci/SKILL.md + skill_dir = skills_dir / skill_id.replace(":", "/") + skill_md = skill_dir / "SKILL.md" + if skill_md.is_file(): + logger.info("Loaded skill '%s' from %s", skill_id, skill_md) + return skill_md.read_text(encoding="utf-8", errors="replace") + + # Directory named with literal colon → rca:ci/SKILL.md + colon_dir = skills_dir / skill_id + colon_skill = colon_dir / "SKILL.md" + if colon_skill.is_file(): + logger.info("Loaded skill '%s' from %s (colon dir)", skill_id, colon_skill) + return colon_skill.read_text(encoding="utf-8", errors="replace") + + logger.warning("Skill '%s' not found in any search path", skill_id) + return None + + +# --------------------------------------------------------------------------- +# Tool factories +# --------------------------------------------------------------------------- + + +def _make_shell_tool(executor: 
def _make_shell_tool(executor: SandboxExecutor) -> Any:
    """Return a LangChain tool that delegates to *executor.run_shell*.

    On :class:`HitlRequired`, the tool calls LangGraph ``interrupt()`` to
    pause the graph and require explicit human approval before resuming.
    The graph will not continue until the human responds.
    """

    @tool
    async def shell(command: str) -> str:
        """Execute a shell command in the session workspace.

        The working directory is the per-session workspace. Use relative
        paths for files in this session. Files created here are visible
        in the Files tab.

        Args:
            command: The shell command to run.

        Returns:
            Command output (stdout + stderr), or pauses for human approval.
        """
        # Warn on bare `cd` — it has no effect in isolated shell execution
        # (each run_shell call is its own subprocess; cwd does not persist).
        if command.strip().startswith("cd ") and "&&" not in command:
            logger.warning(
                "Bare 'cd' command detected — has no effect in isolated shell: %s",
                command,
            )

        try:
            result = await executor.run_shell(command)
        except HitlRequired as exc:
            # Pause graph execution — requires human approval to resume.
            # The interrupt() call suspends the graph state. The A2A task
            # transitions to input_required. Only an explicit human
            # approval (via the HITLManager channel) resumes execution.
            approval = interrupt({
                "type": "approval_required",
                "command": exc.command,
                "message": f"Command '{exc.command}' requires human approval.",
            })
            # If we reach here, the human approved — execute the command.
            # NOTE(review): _execute() skips the sources.json re-check that
            # run_shell performs — confirm this bypass is intended for
            # human-approved commands.
            if isinstance(approval, dict) and approval.get("approved"):
                result = await executor._execute(command)
            else:
                return f"DENIED: command '{exc.command}' was rejected by human review."

        # Retry on rate-limit errors (GitHub API, etc.) with exponential backoff
        output = _format_result(result)
        if result.exit_code != 0 and _is_rate_limited(output):
            import asyncio
            for attempt in range(1, 4):  # up to 3 retries
                delay = 2 ** attempt  # 2s, 4s, 8s
                logger.info("Rate limit detected, retry %d/3 after %ds", attempt, delay)
                await asyncio.sleep(delay)
                try:
                    result = await executor.run_shell(command)
                except HitlRequired:
                    break  # don't retry HITL
                output = _format_result(result)
                if result.exit_code == 0 or not _is_rate_limited(output):
                    break

        return output

    return shell


# Cap on tool output returned to the LLM — prevents context window blowout.
_MAX_TOOL_OUTPUT = 10_000  # chars — prevent context window blowout


def _format_result(result: Any) -> str:
    """Format an ExecutionResult into a string, truncating large output.

    On failure (non-zero exit code) stderr is labelled and the exit code
    appended; on success stderr is passed through unlabelled, since tools
    like git write informational progress there.
    """
    parts: list[str] = []
    if result.stdout:
        parts.append(result.stdout)
    if result.stderr:
        if result.exit_code != 0:
            parts.append(f"STDERR: {result.stderr}")
        else:
            # Informational stderr (e.g., git clone progress) — not an error
            parts.append(result.stderr)
    if result.exit_code != 0:
        parts.append(f"EXIT_CODE: {result.exit_code}")
    text = "\n".join(parts) if parts else "(no output)"
    if len(text) > _MAX_TOOL_OUTPUT:
        kept = text[:_MAX_TOOL_OUTPUT]
        dropped = len(text) - _MAX_TOOL_OUTPUT
        text = f"{kept}\n\n[OUTPUT TRUNCATED — {dropped:,} chars omitted. Redirect large output to a file: command > output/result.txt]"
    return text


def _is_rate_limited(output: str) -> bool:
    """Detect rate-limit errors in command output.

    Substring matching is deliberately loose; note that the bare "429"
    pattern can also match unrelated numbers in output.
    """
    lower = output.lower()
    return any(pattern in lower for pattern in (
        "rate limit exceeded",
        "rate limit",
        "too many requests",
        "429",
        "api rate limit",
        "secondary rate limit",
    ))
+ """ + ws_root = Path(workspace_path).resolve() + + @tool + async def file_read(path: str) -> str: + """Read a file from the workspace. + + Args: + path: Relative path within the workspace directory. + + Returns: + The file contents, or an error message. + """ + resolved = (ws_root / path).resolve() + + # Prevent path traversal. + if not resolved.is_relative_to(ws_root): + return f"Error: path '{path}' resolves outside the workspace." + + if not resolved.is_file(): + return f"Error: file not found at '{path}'." + + try: + return resolved.read_text(encoding="utf-8", errors="replace") + except OSError as exc: + return f"Error reading file: {exc}" + + return file_read + + +def _make_file_write_tool(workspace_path: str) -> Any: + """Return a LangChain tool that writes files relative to *workspace_path*. + + The tool prevents path traversal and creates parent directories as needed. + """ + ws_root = Path(workspace_path).resolve() + + @tool + async def file_write(path: str, content: str) -> str: + """Write content to a file in the workspace. + + Args: + path: Relative path within the workspace directory. + content: The text content to write. + + Returns: + A confirmation message, or an error message. + """ + resolved = (ws_root / path).resolve() + + # Prevent path traversal. + if not resolved.is_relative_to(ws_root): + return f"Error: path '{path}' resolves outside the workspace." + + try: + resolved.parent.mkdir(parents=True, exist_ok=True) + resolved.write_text(content, encoding="utf-8") + return f"Successfully wrote {len(content)} bytes to '{path}'." + except OSError as exc: + return f"Error writing file: {exc}" + + return file_write + + +def _make_grep_tool(workspace_path: str) -> Any: + """Return a LangChain tool that searches file contents with regex.""" + ws_root = Path(workspace_path).resolve() + + @tool + async def grep(pattern: str, path: str = ".", include: str = "") -> str: + """Search for a regex pattern in file contents under the workspace. 
def _make_grep_tool(workspace_path: str) -> Any:
    """Return a LangChain tool that searches file contents with regex.

    The search shells out to ``grep -rn`` under the workspace root and
    refuses paths that resolve outside it.
    """
    ws_root = Path(workspace_path).resolve()

    @tool
    async def grep(pattern: str, path: str = ".", include: str = "") -> str:
        """Search for a regex pattern in file contents under the workspace.

        Args:
            pattern: Regex pattern to search for (e.g. 'def main', 'ERROR|FAIL').
            path: Relative directory or file to search in (default: workspace root).
            include: Glob filter for filenames (e.g. '*.py', '*.ts'). Empty = all files.

        Returns:
            Matching lines with file paths and line numbers, or an error message.
        """
        import asyncio as _aio

        search_path = (ws_root / path).resolve()
        if not search_path.is_relative_to(ws_root):
            return f"Error: path '{path}' resolves outside the workspace."

        cmd = ["grep", "-rn", "--color=never"]
        if include:
            cmd.extend(["--include", include])
        # Pass the pattern via -e and terminate option parsing with -- so
        # an LLM-supplied pattern beginning with '-' (e.g. '-v') cannot be
        # interpreted as a grep option (argument-injection hardening).
        cmd.extend(["-e", pattern, "--", str(search_path)])

        try:
            proc = await _aio.create_subprocess_exec(
                *cmd, stdout=_aio.subprocess.PIPE, stderr=_aio.subprocess.PIPE,
            )
        except OSError as exc:
            return f"Error running grep: {exc}"

        try:
            stdout, stderr = await _aio.wait_for(proc.communicate(), timeout=30)
        except _aio.TimeoutError:
            # Kill and reap the still-running grep instead of leaking it.
            proc.kill()
            await proc.wait()
            return "Error running grep: timed out after 30 seconds."
        except Exception as exc:
            return f"Error running grep: {exc}"

        out = stdout.decode(errors="replace")[:10000]
        if proc.returncode == 1:
            # grep exit status 1 means "no lines matched", not an error.
            return "No matches found."
        if proc.returncode != 0:
            return f"Error: {stderr.decode(errors='replace')[:500]}"
        # Make paths relative to workspace
        return out.replace(str(ws_root) + "/", "")

    return grep
+ """ + import fnmatch + matches = [] + for p in sorted(ws_root.rglob("*")): + if p.is_file(): + # Resolve symlinks and verify the real path stays inside workspace + resolved = p.resolve() + if not resolved.is_relative_to(ws_root): + continue + rel = str(p.relative_to(ws_root)) + if fnmatch.fnmatch(rel, pattern) or fnmatch.fnmatch(p.name, pattern): + matches.append(rel) + if len(matches) >= 200: + matches.append(f"... truncated ({len(matches)}+ matches)") + break + return "\n".join(matches) if matches else "No files matched." + + return glob + + +def _make_web_fetch_tool(sources_config: SourcesConfig) -> Any: + """Return a LangChain tool that fetches web content from allowed domains. + + The tool checks the URL's domain against ``sources.json`` allowed_domains + before making the request. + """ + + @tool + async def web_fetch(url: str) -> str: + """Fetch content from a URL. + + Domain filtering is handled by the outbound Squid proxy at the + network level. This tool fetches any URL the proxy allows. + + Args: + url: The full URL to fetch (e.g. https://github.com/org/repo/issues/1). + + Returns: + The page content as text, or an error message. + """ + import httpx + from urllib.parse import urlparse + + parsed = urlparse(url) + domain = parsed.hostname or "" + + if not sources_config.is_web_access_enabled(): + return "Error: web access is disabled in sources.json." + + # Domain filtering is delegated to the Squid proxy. + # Log the domain for observability but don't block. 
def _make_web_fetch_tool(sources_config: SourcesConfig) -> Any:
    """Return a LangChain tool that fetches web content.

    Domain filtering is enforced by the outbound Squid proxy at the
    network level; this tool only honours the sources.json web-access
    master switch and logs the domain for observability.
    """

    @tool
    async def web_fetch(url: str) -> str:
        """Fetch content from a URL.

        Domain filtering is handled by the outbound Squid proxy at the
        network level. This tool fetches any URL the proxy allows.

        Args:
            url: The full URL to fetch (e.g. https://github.com/org/repo/issues/1).

        Returns:
            The page content as text, or an error message.
        """
        import httpx
        from urllib.parse import urlparse

        parsed = urlparse(url)
        domain = parsed.hostname or ""

        if not sources_config.is_web_access_enabled():
            return "Error: web access is disabled in sources.json."

        # Domain filtering is delegated to the Squid proxy.
        # Log the domain for observability but don't block.
        logger.info("web_fetch: domain=%s url=%s", domain, url[:200])

        try:
            async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
                resp = await client.get(url, headers={"User-Agent": "kagenti-sandbox-agent/1.0"})
                resp.raise_for_status()

                content_type = resp.headers.get("content-type", "")
                text = resp.text

                # For HTML, try to extract readable text
                if "text/html" in content_type:
                    # Simple HTML tag stripping for readability: drop
                    # <script>/<style> elements (with their bodies), then
                    # strip remaining tags and collapse whitespace.
                    import re
                    text = re.sub(r"<script[^>]*>.*?</script>", "", text,
                                  flags=re.DOTALL | re.IGNORECASE)
                    text = re.sub(r"<style[^>]*>.*?</style>", "", text,
                                  flags=re.DOTALL | re.IGNORECASE)
                    text = re.sub(r"<[^>]+>", " ", text)
                    text = re.sub(r"\s+", " ", text).strip()

                # Truncate very long responses
                if len(text) > 50000:
                    text = text[:50000] + "\n\n[Content truncated at 50000 characters]"

                return text

        except httpx.HTTPStatusError as exc:
            return f"Error: HTTP {exc.response.status_code} fetching {url}"
        except httpx.RequestError as exc:
            return f"Error: could not fetch {url}: {exc}"

    return web_fetch
@tool
def step_done(summary: str) -> str:
    """Signal that the current step is COMPLETE. Call this instead of
    other tools when the step goal has been achieved and no more
    tool calls are needed.

    Args:
        summary: Brief summary of what was accomplished in this step.

    Returns:
        The summary text.
    """
    # Sentinel tool: the executor loop treats a `step_done` call as the
    # step-completion signal; the summary is echoed back unchanged.
    return summary
+ """ + # -- Executor ----------------------------------------------------------- + executor = SandboxExecutor( + workspace_path=workspace_path, + permission_checker=permission_checker, + sources_config=sources_config, + ) + + # -- LLM ---------------------------------------------------------------- + from sandbox_agent.configuration import Configuration + + config = Configuration() # type: ignore[call-arg] + # -- Budget ------------------------------------------------------------- + budget = AgentBudget() + + llm = ChatOpenAI( + model=config.llm_model, + base_url=config.llm_api_base, + api_key=config.llm_api_key, + timeout=budget.llm_timeout, + max_retries=budget.llm_max_retries, + model_kwargs={ + "extra_body": { + "metadata": { + "session_id": context_id, + "agent_name": os.environ.get("AGENT_NAME", "sandbox-legion"), + "namespace": namespace, + "max_session_tokens": budget.max_tokens, + } + } + }, + ) + + # -- Per-node model overrides ------------------------------------------- + def _make_llm(node_type: str) -> ChatOpenAI: + """Create an LLM instance for a specific node type, using model override if set.""" + node_model = config.model_for_node(node_type) + return ChatOpenAI( + model=node_model, + base_url=config.llm_api_base, + api_key=config.llm_api_key, + timeout=budget.llm_timeout, + max_retries=budget.llm_max_retries, + model_kwargs={ + "extra_body": { + "metadata": { + "session_id": context_id, + "agent_name": os.environ.get("AGENT_NAME", "sandbox-legion"), + "namespace": namespace, + "max_session_tokens": budget.max_tokens, + } + } + }, + ) + + # Only create separate instances when overrides differ from default + llm_for_planner = _make_llm("planner") if config.llm_model_planner else llm + llm_for_executor = _make_llm("executor") if config.llm_model_executor else llm + llm_for_reflector = _make_llm("reflector") if config.llm_model_reflector else llm + llm_for_reporter = _make_llm("reporter") if config.llm_model_reporter else llm + llm_for_thinking = 
_make_llm("thinking") if config.llm_model_thinking else llm + llm_for_micro = _make_llm("micro_reasoning") if config.llm_model_micro_reasoning else llm + + # -- Tools -------------------------------------------------------------- + # Create tool instances once — shared across node subsets. + shell_tool = _make_shell_tool(executor) + file_read_tool = _make_file_read_tool(workspace_path) + file_write_tool = _make_file_write_tool(workspace_path) + grep_tool = _make_grep_tool(workspace_path) + glob_tool = _make_glob_tool(workspace_path) + web_fetch_tool = _make_web_fetch_tool(sources_config) + + core_tools = [shell_tool, file_read_tool, file_write_tool, grep_tool, glob_tool, web_fetch_tool] + tools = core_tools + [ + make_explore_tool(workspace_path, llm), + step_done, + # delegate disabled — causes crashes when agent can't resolve paths + # make_delegate_tool(workspace_path, llm, context_id, core_tools, namespace), + ] + + # -- Per-node tool subsets ------------------------------------------------ + # Each reasoning node gets its own tools and tool_choice mode: + # executor: ALL tools, tool_choice="any" (must call tools) + # planner: glob, grep, file_read, file_write + respond_to_user (escape) + # reflector: glob, grep, file_read + respond_to_user (escape) + # router/reporter/step_selector: no tools (text-only) + + read_only_tools = [file_read_tool, grep_tool, glob_tool, respond_to_user] + planner_tools = [file_read_tool, grep_tool, glob_tool, file_write_tool, respond_to_user] + + # SANDBOX_FORCE_TOOL_CHOICE=1 (wizard "Force Tool Calling" toggle): + # When forced: two-phase executor call: + # Phase 1: llm_executor_reason (implicit auto) — produces text reasoning + # Phase 2: llm_executor (tool_choice="any") — produces structured tool call + # When not forced: single-phase (implicit auto, model chooses text or tools) + force_tools = os.environ.get("SANDBOX_FORCE_TOOL_CHOICE", "0") == "1" + if force_tools: + llm_executor = llm_for_executor.bind_tools(tools, 
tool_choice="any") + llm_executor_reason = llm_for_thinking # bare LLM for thinking, NO tools + else: + llm_executor = llm_for_executor.bind_tools(tools) # implicit auto + llm_executor_reason = None # no two-phase needed + llm_planner = llm_for_planner.bind_tools(planner_tools) # always auto + + # All nodes with tools use tool_choice="auto" + llm_reflector = llm_for_reflector.bind_tools(read_only_tools) # read-only for verification + llm_reporter = llm_for_reporter.bind_tools(read_only_tools) # read-only for file verification + + # ToolNodes for each node's tool subset + _executor_tool_node = ToolNode(tools) + _planner_tool_node = ToolNode(planner_tools) + _reflector_tool_node = ToolNode(read_only_tools) + + # -- Graph nodes (router-plan-execute-reflect) --------------------------- + # Each node function from reasoning.py takes (state, llm) — we wrap them + # in closures that capture the appropriate LLM instance. + + async def _router(state: SandboxState) -> dict[str, Any]: + return await router_node(state) + + async def _planner(state: SandboxState) -> dict[str, Any]: + return await planner_node(state, llm_planner, budget=budget) + + async def _executor(state: SandboxState) -> dict[str, Any]: + return await executor_node(state, llm_executor, budget=budget, llm_reason=llm_executor_reason) + + async def _reflector(state: SandboxState) -> dict[str, Any]: + return await reflector_node(state, llm_reflector, budget=budget) + + async def _reporter(state: SandboxState) -> dict[str, Any]: + return await reporter_node( + state, llm_reporter, budget=budget, + llm_reason=llm_executor_reason, + tools=read_only_tools, + ) + + async def _step_selector(state: SandboxState) -> dict[str, Any]: + """Pick the next step and prepare focused context for the executor. + + Uses a lightweight LLM call to review plan progress and write + a targeted brief for the executor — what to do, what worked/failed + before, and what to avoid. 
+ """ + from langchain_core.messages import SystemMessage as SM, HumanMessage as HM + + plan = state.get("plan", []) + plan_steps = list(state.get("plan_steps", [])) + current = state.get("current_step", 0) + messages = state.get("messages", []) + + # --- PlanStore: parallel nested plan tracking --- + store = state.get("_plan_store", {}) + if store and store.get("steps"): + current_info = ps.get_current_step(store) + if current_info: + step_key, step_data = current_info + try: + store = ps.set_step_status(store, step_key, "running") + except ValueError: + logger.warning("PlanStore: step %s not found, skipping", step_key) + + # Find next non-done step + next_step = current + for i in range(current, len(plan_steps)): + _ps = plan_steps[i] + status = _ps.get("status", "pending") if isinstance(_ps, dict) else "pending" + if status != "done": + next_step = i + break + else: + next_step = len(plan) + + # Mark selected step as running + if next_step < len(plan_steps) and isinstance(plan_steps[next_step], dict): + plan_steps[next_step] = {**plan_steps[next_step], "status": "running"} + + # Build plan status summary + plan_summary = [] + for i, step in enumerate(plan): + _ps = plan_steps[i] if i < len(plan_steps) else {} + status = _ps.get("status", "pending") if isinstance(_ps, dict) else "pending" + marker = "✓" if status == "done" else "→" if i == next_step else " " + result_hint = "" + if isinstance(_ps, dict) and _ps.get("result_summary"): + result_hint = f" — {_ps['result_summary'][:100]}" + plan_summary.append(f" {marker} {i+1}. 
[{status}] {step[:80]}{result_hint}") + + # Gather recent tool results (last 3 ToolMessages) + recent_results = [] + for m in reversed(messages[-10:]): + if hasattr(m, 'name') and getattr(m, 'type', '') == 'tool': + content = str(getattr(m, 'content', ''))[:300] + recent_results.insert(0, f" [{m.name}] {content}") + if len(recent_results) >= 3: + break + + if next_step >= len(plan): + # All done + logger.info("StepSelector: all %d steps complete", len(plan)) + result_done: dict[str, Any] = { + "current_step": next_step, + "plan_steps": plan_steps, + "_tool_call_count": 0, + "done": True, + } + if store: + result_done["_plan_store"] = store + return result_done + + # Quick LLM call — write a focused brief for the executor + step_text = plan[next_step] if next_step < len(plan) else "N/A" + prompt = f"""You are a step coordinator. Write a 2-3 sentence brief for the executor. + +Plan progress: +{chr(10).join(plan_summary)} + +Next step to execute: {next_step + 1}. {step_text} + +Recent tool results: +{chr(10).join(recent_results) if recent_results else '(none yet)'} + +WORKSPACE RULE: Each shell command starts fresh in /workspace. Bare `cd` has no effect. +If the step involves a cloned repo, always write `cd repos/ && ` in the brief. +Example: "cd repos/kagenti && gh pr list" — never just "gh pr list". + +Write a brief: what EXACTLY to do for step {next_step + 1}, what context from previous steps is relevant, and what to watch out for. Be specific about commands/tools to use, and always include the full `cd && command` pattern when a cloned repo is involved.""" + + sys_msg = SM(content="You are a concise step coordinator. 
Output ONLY the brief, no preamble.") + user_msg = HM(content=prompt) + try: + response = await llm.ainvoke([sys_msg, user_msg]) + brief = response.content.strip() + usage = getattr(response, 'usage_metadata', None) or {} + budget.add_tokens( + usage.get('input_tokens', 0) + usage.get('output_tokens', 0) + ) + except Exception as e: + logger.warning("StepSelector LLM call failed: %s — using default brief", e) + brief = f"Execute step {next_step + 1}: {step_text}" + response = None + + logger.info("StepSelector: step %d/%d brief: %s", next_step + 1, len(plan), brief[:100]) + result: dict[str, Any] = { + "current_step": next_step, + "plan_steps": plan_steps, + "_tool_call_count": 0, + "skill_instructions": f"STEP BRIEF FROM COORDINATOR:\n{brief}\n\n---\n", + } + if store: + result["_plan_store"] = store + if _DEBUG_PROMPTS: + from sandbox_agent.context_builders import LLMCallCapture + result["_system_prompt"] = prompt[:10000] + result["_prompt_messages"] = [ + {"role": "system", "preview": "Step coordinator brief prompt"}, + {"role": "human", "preview": prompt[:500]}, + ] + if response: + capture = LLMCallCapture(response=response) + result["_llm_response"] = capture._format_response() + return result + + # -- Safe ToolNode wrappers — never crash the graph ---------------------- + + def _make_safe_tool_wrapper(tool_node: ToolNode, label: str): + """Create a safe tool execution wrapper for a ToolNode.""" + async def _safe(state: SandboxState) -> dict[str, Any]: + from langchain_core.messages import ToolMessage + try: + return await tool_node.ainvoke(state) + except (GraphInterrupt, KeyboardInterrupt, SystemExit): + raise + except Exception as exc: + logger.error("%s ToolNode error: %s", label, exc, exc_info=True) + messages = state.get("messages", []) + error_msgs = [] + if messages: + last = messages[-1] + for tc in getattr(last, "tool_calls", []): + tc_id = tc.get("id", "unknown") if isinstance(tc, dict) else getattr(tc, "id", "unknown") + tc_name = tc.get("name", 
"unknown") if isinstance(tc, dict) else getattr(tc, "name", "unknown") + error_msgs.append(ToolMessage( + content=f"Tool error: {exc}", + tool_call_id=tc_id, + name=tc_name, + )) + if not error_msgs: + error_msgs.append(ToolMessage( + content=f"Tool execution failed: {exc}", + tool_call_id="error", + name="unknown", + )) + return {"messages": error_msgs} + return _safe + + _reporter_tool_node = ToolNode(read_only_tools) + + _safe_executor_tools = _make_safe_tool_wrapper(_executor_tool_node, "executor") + _safe_planner_tools = _make_safe_tool_wrapper(_planner_tool_node, "planner") + _safe_reflector_tools = _make_safe_tool_wrapper(_reflector_tool_node, "reflector") + _safe_reporter_tools = _make_safe_tool_wrapper(_reporter_tool_node, "reporter") + + # -- Assemble graph ----------------------------------------------------- + # + # Topology (all nodes use tool_choice="auto"): + # + # router → [plan] → planner ⇄ planner_tools → step_selector + # [resume] → step_selector + # + # step_selector → executor ⇄ tools → reflector ⇄ reflector_tools + # + # reflector_route → [done] → reporter → END + # [continue] → step_selector + # [replan] → planner + # + # Tool subsets: + # planner: glob, grep, file_read, file_write (inspect workspace, save plans) + # executor: all tools (shell, files, grep, glob, web_fetch, explore, delegate) + # reflector: glob, grep, file_read (verify step outcomes before deciding) + # + graph = StateGraph(SandboxState) + graph.add_node("router", _router) + graph.add_node("planner", _planner) + graph.add_node("planner_tools", _safe_planner_tools) + graph.add_node("step_selector", _step_selector) + graph.add_node("executor", _executor) + graph.add_node("tools", _safe_executor_tools) + graph.add_node("reflector", _reflector) + graph.add_node("reflector_tools", _safe_reflector_tools) + graph.add_node("reporter", _reporter) + + # Entry: router decides resume vs plan + graph.set_entry_point("router") + graph.add_conditional_edges( + "router", + route_entry, + 
{"resume": "step_selector", "plan": "planner"}, + ) + + # Planner → planner_tools (if tool_calls) or → step_selector (if no tool_calls) + graph.add_conditional_edges( + "planner", + tools_condition, + {"tools": "planner_tools", "__end__": "step_selector"}, + ) + graph.add_edge("planner_tools", "planner") + + graph.add_edge("step_selector", "executor") + + # Executor → executor_tools (if tool_calls) or → reflector (if no tool_calls) + graph.add_conditional_edges( + "executor", + tools_condition, + {"tools": "tools", "__end__": "reflector"}, + ) + graph.add_edge("tools", "executor") + + # Reflector → reflector_tools (if tool_calls) or → route decision + graph.add_conditional_edges( + "reflector", + tools_condition, + {"tools": "reflector_tools", "__end__": "reflector_route"}, + ) + graph.add_edge("reflector_tools", "reflector") + + # Reflector route → reporter (done), step_selector (continue), or planner (replan) + graph.add_node("reflector_route", lambda state: state) # pass-through + graph.add_conditional_edges( + "reflector_route", + route_reflector, + {"done": "reporter", "execute": "step_selector", "replan": "planner"}, + ) + # Reporter executes tools internally via invoke_with_tool_loop + graph.add_edge("reporter", "__end__") + + return graph.compile(checkpointer=checkpointer) From e8768fdcba1312233712e9502585e85326fb72e6 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:48:17 +0100 Subject: [PATCH 10/26] feat(sandbox): graph card manifest with event catalog and topology introspection Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/graph_card.py | 580 ++++++++++++++++++ 1 file changed, 580 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/graph_card.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/graph_card.py b/a2a/sandbox_agent/src/sandbox_agent/graph_card.py new file mode 100644 index 00000000..896e7b9d --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/graph_card.py @@ -0,0 +1,580 @@ +# Copyright 
2025 IBM Corp.
# Licensed under the Apache License, Version 2.0

"""AgentGraphCard — self-describing manifest for the agent's processing graph.

This module defines the event catalog and generates a "graph card" from
LangGraph introspection. The graph card is a structured dict that tells
consumers (UI, backend, observability) everything they need to render the
agent's reasoning loop:

* **EVENT_CATALOG** — every event type the agent can stream, with category,
  field definitions, and debug-field metadata so the UI knows what to expect
  and how to render it.
* **COMMON_EVENT_FIELDS** — fields injected by the serializer into every
  event (type, loop_id, node_visit, event_index, etc.).
* **TOPOLOGY_NODE_DESCRIPTIONS** — human-readable descriptions for each
  LangGraph node.
* **build_graph_card()** — introspects a compiled LangGraph ``CompiledGraph``
  and returns the full card as a plain dict.
"""

from __future__ import annotations

from typing import Any, Dict, List

# ---------------------------------------------------------------------------
# Common fields injected into every serialized event
# ---------------------------------------------------------------------------

# Each entry maps field name -> {"type": ..., "description": ...}; consumers
# may rely on every one of these keys being present on every streamed event.
COMMON_EVENT_FIELDS: Dict[str, Dict[str, str]] = {
    "type": {
        "type": "str",
        "description": "Event type discriminator (one of EVENT_CATALOG keys).",
    },
    "loop_id": {
        "type": "str",
        "description": "Unique identifier for this reasoning-loop invocation.",
    },
    "langgraph_node": {
        "type": "str",
        "description": "Name of the LangGraph node that produced this event.",
    },
    "node_visit": {
        "type": "int",
        "description": "Monotonic counter incremented each time a new major node is visited.",
    },
    "event_index": {
        "type": "int",
        "description": "Global sequence number across all events in a loop (for ordering).",
    },
    "model": {
        "type": "str",
        "description": "LLM model identifier used for this event (empty if no LLM call).",
    },
    "prompt_tokens": {
        "type": "int",
        "description": "Number of prompt tokens consumed by this event's LLM call.",
    },
    "completion_tokens": {
        "type": "int",
        "description": "Number of completion tokens produced by this event's LLM call.",
    },
}

# ---------------------------------------------------------------------------
# Event catalog
# ---------------------------------------------------------------------------

#: Complete catalog of every event type the sandbox agent can stream.
#:
#: Each entry contains:
#:   category        – semantic grouping for the UI
#:   description     – what this event represents
#:   langgraph_nodes – LangGraph node names that can produce this event
#:   has_llm_call    – whether the event involves an LLM invocation
#:   terminal        – True only for the final-answer event
#:   fields          – data fields specific to this event type
#:   debug_fields    – fields available in debug / inspector mode
EVENT_CATALOG: Dict[str, Dict[str, Any]] = {
    # ── Reasoning ─────────────────────────────────────────────────────
    "planner_output": {
        "category": "reasoning",
        "description": "Planner created or revised a multi-step plan.",
        "langgraph_nodes": ["planner"],
        "has_llm_call": True,
        "fields": {
            "steps": {
                "type": "List[str]",
                "description": "Ordered list of plan step descriptions.",
            },
            "iteration": {
                "type": "int",
                "description": "Planning iteration (0 = initial, >0 = replan).",
            },
        },
        "debug_fields": {
            "system_prompt": {
                "type": "str",
                "description": "System prompt sent to the planner LLM.",
            },
            "bound_tools": {
                "type": "List[str]",
                "description": "Tool names bound to the planner LLM.",
            },
            "prompt_messages": {
                "type": "List[dict]",
                "description": "Full message history sent to the LLM.",
            },
            "llm_response": {
                "type": "str",
                "description": "Raw LLM response text.",
            },
        },
    },
    "executor_step": {
        "category": "reasoning",
        "description": "Executor selected and began working on a plan step.",
        "langgraph_nodes": ["step_selector"],
        "has_llm_call": False,
        "fields": {
            "step": {
                "type": "int",
                "description": "Current step index (1-based).",
            },
            "total_steps": {
                "type": "int",
                "description": "Total number of plan steps.",
            },
            "description": {
                "type": "str",
                "description": "Human-readable description of the current step.",
            },
            "reasoning": {
                "type": "str",
                "description": "LLM response text (up to 2000 chars).",
            },
        },
        "debug_fields": {
            "logic": {
                "type": "str",
                "description": "Step selection logic: picks current_step from plan_steps.",
            },
        },
    },
    "thinking": {
        "category": "reasoning",
        "description": (
            "Intermediate thinking iteration from a reasoning LLM "
            "(bare model, no tools)."
        ),
        "langgraph_nodes": ["planner", "executor", "reflector"],
        "has_llm_call": True,
        "fields": {
            "content": {
                "type": "str",
                "description": "Thinking text produced by the reasoning LLM.",
            },
            "iteration": {
                "type": "int",
                "description": "Thinking iteration number within this node visit.",
            },
            "total_iterations": {
                "type": "int",
                "description": "Total thinking iterations in this cycle.",
            },
        },
        "debug_fields": {
            "system_prompt": {
                "type": "str",
                "description": "System prompt for the thinking LLM.",
            },
            "bound_tools": {
                "type": "List[str]",
                "description": "Always empty — thinking LLM has no tools.",
            },
            "prompt_messages": {
                "type": "List[dict]",
                "description": "Messages sent to the thinking LLM.",
            },
            "llm_response": {
                "type": "str",
                "description": "Raw thinking response.",
            },
        },
    },
    "micro_reasoning": {
        "category": "reasoning",
        "description": (
            "Executor's intermediate LLM reasoning within a single plan step "
            "(tool-loop iteration)."
        ),
        "langgraph_nodes": ["executor"],
        "has_llm_call": True,
        "fields": {
            "content": {
                "type": "str",
                "description": "Reasoning text from the micro-reasoning LLM.",
            },
            "step": {
                "type": "int",
                "description": "Current plan step index.",
            },
            "micro_step": {
                "type": "int",
                "description": "Tool-loop iteration within the current plan step.",
            },
            "thinking_count": {
                "type": "int",
                "description": "Number of thinking iterations that preceded this reasoning.",
            },
        },
        "debug_fields": {
            "system_prompt": {
                "type": "str",
                "description": "System prompt for the micro-reasoning LLM.",
            },
            "bound_tools": {
                "type": "List[str]",
                "description": "Tool names available to the micro-reasoning LLM.",
            },
            "prompt_messages": {
                "type": "List[dict]",
                "description": "Messages sent to the micro-reasoning LLM.",
            },
            "llm_response": {
                "type": "str",
                "description": "Raw LLM response before tool extraction.",
            },
        },
    },
    # ── Execution ─────────────────────────────────────────────────────
    "tool_call": {
        "category": "execution",
        "description": "A tool was invoked by the executor or planner LLM.",
        "langgraph_nodes": ["executor", "planner"],
        "has_llm_call": False,
        "fields": {
            "step": {
                "type": "int",
                "description": "Plan step that triggered this tool call.",
            },
            "name": {
                "type": "str",
                "description": "Tool name.",
            },
            "args": {
                "type": "str",
                "description": "JSON-encoded tool arguments.",
            },
        },
        "debug_fields": {},
    },
    # ── Tool output ───────────────────────────────────────────────────
    "tool_result": {
        "category": "tool_output",
        "description": "A tool returned its result.",
        "langgraph_nodes": ["tools", "planner_tools", "reflector_tools"],
        "has_llm_call": False,
        "fields": {
            "step": {
                "type": "int",
                "description": "Plan step this result belongs to.",
            },
            "name": {
                "type": "str",
                "description": "Tool name that produced the result.",
            },
            "output": {
                "type": "str",
                "description": "Tool output (may be truncated).",
            },
        },
        "debug_fields": {},
    },
    # ── Decision ──────────────────────────────────────────────────────
    "reflector_decision": {
        "category": "decision",
        "description": (
            "Reflector reviewed execution and decided: continue, replan, or done."
        ),
        "langgraph_nodes": ["reflector"],
        "has_llm_call": True,
        "fields": {
            "decision": {
                "type": "str",
                "description": "Routing decision.",
                "enum": ["continue", "replan", "done"],
            },
            "assessment": {
                "type": "str",
                "description": "Full reflection assessment text.",
            },
            "iteration": {
                "type": "int",
                "description": "Reflect-execute loop iteration.",
            },
        },
        "debug_fields": {
            "system_prompt": {
                "type": "str",
                "description": "System prompt for the reflector LLM.",
            },
            "bound_tools": {
                "type": "List[str]",
                "description": "Read-only tools bound to the reflector.",
            },
            "prompt_messages": {
                "type": "List[dict]",
                "description": "Messages sent to the reflector LLM.",
            },
            "llm_response": {
                "type": "str",
                "description": "Raw reflector LLM output.",
            },
        },
    },
    "router_decision": {
        "category": "decision",
        "description": "Router decided whether to plan from scratch or resume execution.",
        "langgraph_nodes": ["router"],
        "has_llm_call": False,
        "fields": {
            "route": {
                "type": "str",
                "description": "Chosen route.",
                "enum": ["plan", "resume"],
            },
            "plan_status": {
                "type": "str",
                "description": "Current plan status at time of routing.",
            },
        },
        "debug_fields": {
            "logic": {
                "type": "str",
                "description": (
                    "Routing logic: checks plan_status to decide resume vs plan."
                ),
            },
        },
    },
    # ── Terminal ──────────────────────────────────────────────────────
    "reporter_output": {
        "category": "terminal",
        "description": "Reporter generated the final answer for the user.",
        "langgraph_nodes": ["reporter"],
        "has_llm_call": True,
        "terminal": True,
        "fields": {
            "content": {
                "type": "str",
                "description": "Final answer content (markdown).",
            },
        },
        "debug_fields": {
            "system_prompt": {
                "type": "str",
                "description": "System prompt for the reporter LLM.",
            },
            "bound_tools": {
                "type": "List[str]",
                "description": "Tools available to the reporter (for citations).",
            },
            "prompt_messages": {
                "type": "List[dict]",
                "description": "Messages sent to the reporter LLM.",
            },
            "llm_response": {
                "type": "str",
                "description": "Raw reporter LLM output.",
            },
        },
    },
    # ── Meta ──────────────────────────────────────────────────────────
    "budget_update": {
        "category": "meta",
        "description": "Budget tracking update (tokens consumed, wall-clock time).",
        "langgraph_nodes": [],
        "has_llm_call": False,
        "fields": {
            "tokens_used": {
                "type": "int",
                "description": "Total tokens consumed so far.",
            },
            "tokens_budget": {
                "type": "int",
                "description": "Maximum token budget.",
            },
            "wall_clock_s": {
                "type": "float",
                "description": "Elapsed wall-clock seconds.",
            },
            "max_wall_clock_s": {
                "type": "float",
                "description": "Maximum allowed wall-clock seconds.",
            },
        },
        "debug_fields": {},
    },
    "node_transition": {
        "category": "meta",
        "description": (
            "Internal marker indicating a graph-level transition between nodes."
        ),
        "langgraph_nodes": [],
        "has_llm_call": False,
        "fields": {
            "from_node": {
                "type": "str",
                "description": "Node the transition originates from.",
            },
            "to_node": {
                "type": "str",
                "description": "Node the transition goes to.",
            },
        },
        "debug_fields": {},
    },
    # ── Interaction ───────────────────────────────────────────────────
    "hitl_request": {
        "category": "interaction",
        "description": (
            "Human-in-the-loop approval request — the executor is pausing "
            "to ask the user before proceeding."
        ),
        "langgraph_nodes": ["executor"],
        "has_llm_call": False,
        "fields": {
            "tool_name": {
                "type": "str",
                "description": "Tool that requires approval.",
            },
            "args": {
                "type": "str",
                "description": "JSON-encoded tool arguments pending approval.",
            },
            "reason": {
                "type": "str",
                "description": "Why the agent is requesting approval.",
            },
        },
        "debug_fields": {},
    },
}

# Valid category values (mirrors the set used in EVENT_CATALOG).
# NOTE(review): not referenced elsewhere in this module — presumably used by
# consumers or tests to validate catalog entries; confirm before removing.
VALID_CATEGORIES = frozenset(
    {
        "reasoning",
        "execution",
        "tool_output",
        "decision",
        "terminal",
        "meta",
        "interaction",
    }
)

# ---------------------------------------------------------------------------
# LangGraph topology node descriptions
# ---------------------------------------------------------------------------

#: Human-readable description for each node in the compiled graph.
TOPOLOGY_NODE_DESCRIPTIONS: Dict[str, str] = {
    "router": (
        "Entry node — decides whether to create a new plan or resume execution "
        "of an existing plan."
    ),
    "planner": (
        "Creates or revises a multi-step plan using an LLM with planning tools "
        "(glob, grep, file_read, file_write)."
    ),
    "planner_tools": (
        "Executes tool calls issued by the planner (workspace inspection, "
        "plan persistence)."
    ),
    "step_selector": (
        "Picks the next plan step to execute and prepares the executor context."
+ ), + "executor": ( + "Executes the current plan step using an LLM with the full tool suite " + "(shell, files, grep, glob, web_fetch, explore, delegate)." + ), + "tools": ( + "Executes tool calls issued by the executor." + ), + "reflector": ( + "Reviews execution results and decides whether to continue, replan, " + "or declare done. Uses read-only tools (glob, grep, file_read)." + ), + "reflector_tools": ( + "Executes read-only tool calls issued by the reflector for verification." + ), + "reflector_route": ( + "Pass-through node that routes the reflector's decision to the next node " + "(reporter, step_selector, or planner)." + ), + "reporter": ( + "Generates the final user-facing answer by synthesizing all execution " + "results. May invoke tools internally for citation verification." + ), +} + + +# --------------------------------------------------------------------------- +# Graph card builder +# --------------------------------------------------------------------------- + + +def build_graph_card( + compiled: Any, + agent_id: str = "sandbox_agent", +) -> Dict[str, Any]: + """Build the AgentGraphCard from a compiled LangGraph. + + Parameters + ---------- + compiled: + A ``CompiledStateGraph`` (or any object whose ``.get_graph()`` returns + a ``Graph`` with ``.nodes`` and ``.edges``). + agent_id: + Identifier for the agent (used in the card's ``id`` field). 
+ + Returns + ------- + dict + A plain dict with keys: + - ``id`` — agent identifier + - ``framework`` — always ``"langgraph"`` + - ``version`` — card schema version + - ``event_catalog`` — the full ``EVENT_CATALOG`` + - ``common_event_fields`` — the ``COMMON_EVENT_FIELDS`` dict + - ``topology`` — ``{nodes, edges, entry_node}`` + """ + graph = compiled.get_graph() + + # ── Nodes ───────────────────────────────────────────────────────── + raw_nodes: List[str] = [ + node_id + for node_id in graph.nodes + if node_id not in ("__start__", "__end__") + ] + nodes: Dict[str, Dict[str, str]] = {} + for node_id in raw_nodes: + nodes[node_id] = { + "description": TOPOLOGY_NODE_DESCRIPTIONS.get(node_id, ""), + } + + # ── Edges ───────────────────────────────────────────────────────── + edges: List[Dict[str, str]] = [] + for edge in graph.edges: + source = edge.source if hasattr(edge, "source") else edge[0] + target = edge.target if hasattr(edge, "target") else edge[1] + # Skip __start__ / __end__ for cleaner topology + if source in ("__start__", "__end__") or target in ("__start__", "__end__"): + continue + edges.append({"source": source, "target": target}) + + # ── Entry node ──────────────────────────────────────────────────── + # The entry node is the first node reachable from __start__. 
+ entry_node: str = "" + for edge in graph.edges: + src = edge.source if hasattr(edge, "source") else edge[0] + tgt = edge.target if hasattr(edge, "target") else edge[1] + if src == "__start__": + entry_node = tgt + break + + return { + "id": agent_id, + "framework": "langgraph", + "version": "1.0", + "event_catalog": EVENT_CATALOG, + "common_event_fields": COMMON_EVENT_FIELDS, + "topology": { + "nodes": nodes, + "edges": edges, + "entry_node": entry_node, + }, + } From ec59f47d24adad2992090d42fad4088c751b88d3 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:48:24 +0100 Subject: [PATCH 11/26] feat(sandbox): raw ctypes wrapper for Linux Landlock LSM syscalls (x86_64/aarch64) Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/landlock_ctypes.py | 193 ++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py b/a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py new file mode 100644 index 00000000..ff9b35ca --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py @@ -0,0 +1,193 @@ +"""Raw ctypes wrapper for Linux Landlock LSM syscalls. + +Architecture-aware: supports x86_64 and aarch64 syscall numbers. +Zero external dependencies -- pure ctypes + stdlib. + +Landlock is IRREVERSIBLE once applied to a thread. There is no undo. +All functions in this module fail hard (raise OSError) on error. 
+""" + +from __future__ import annotations + +import ctypes +import os +import platform +import struct + +# --------------------------------------------------------------------------- +# Syscall numbers by architecture +# --------------------------------------------------------------------------- + +_ARCH = platform.machine() + +if _ARCH == "x86_64": + _SYS_LANDLOCK_CREATE_RULESET = 444 + _SYS_LANDLOCK_ADD_RULE = 445 + _SYS_LANDLOCK_RESTRICT_SELF = 446 +elif _ARCH == "aarch64": + _SYS_LANDLOCK_CREATE_RULESET = 441 + _SYS_LANDLOCK_ADD_RULE = 442 + _SYS_LANDLOCK_RESTRICT_SELF = 443 +else: + raise RuntimeError(f"Unsupported architecture for Landlock: {_ARCH}") + +# --------------------------------------------------------------------------- +# Landlock constants +# --------------------------------------------------------------------------- + +LANDLOCK_RULE_PATH_BENEATH = 1 + +# ABI v1 access flags (13 flags) +_ACCESS_FS_V1 = ( + (1 << 0) # EXECUTE + | (1 << 1) # WRITE_FILE + | (1 << 2) # READ_FILE + | (1 << 3) # READ_DIR + | (1 << 4) # REMOVE_DIR + | (1 << 5) # REMOVE_FILE + | (1 << 6) # MAKE_CHAR + | (1 << 7) # MAKE_DIR + | (1 << 8) # MAKE_REG + | (1 << 9) # MAKE_SOCK + | (1 << 10) # MAKE_FIFO + | (1 << 11) # MAKE_BLOCK + | (1 << 12) # MAKE_SYM +) + +# ABI v2 adds REFER +_ACCESS_FS_REFER = 1 << 13 + +# ABI v3 adds TRUNCATE +_ACCESS_FS_TRUNCATE = 1 << 14 + +# Read-only subset (for ro_paths) +ACCESS_FS_READ_ONLY = ( + (1 << 0) # EXECUTE + | (1 << 2) # READ_FILE + | (1 << 3) # READ_DIR +) + +_libc = ctypes.CDLL("libc.so.6", use_errno=True) + +# --------------------------------------------------------------------------- +# Syscall helpers +# --------------------------------------------------------------------------- + + +def _syscall(nr: int, *args: int) -> int: + """Invoke a raw syscall. 
Returns the result or raises OSError.""" + result = _libc.syscall(ctypes.c_long(nr), *[ctypes.c_long(a) for a in args]) + if result < 0: + errno = ctypes.get_errno() + raise OSError(errno, f"syscall {nr} failed: {os.strerror(errno)}") + return result + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def get_abi_version() -> int: + """Query the kernel's Landlock ABI version. + + Returns an integer >= 1 if Landlock is supported. + Raises OSError if Landlock is not available. + """ + # landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION=1<<0) + LANDLOCK_CREATE_RULESET_VERSION = 1 << 0 + return _syscall(_SYS_LANDLOCK_CREATE_RULESET, 0, 0, LANDLOCK_CREATE_RULESET_VERSION) + + +def _get_fs_access_flags(abi_version: int) -> int: + """Return the full set of handled_access_fs flags for the given ABI version.""" + flags = _ACCESS_FS_V1 + if abi_version >= 2: + flags |= _ACCESS_FS_REFER + if abi_version >= 3: + flags |= _ACCESS_FS_TRUNCATE + return flags + + +def _add_rule(ruleset_fd: int, path: str, access: int) -> None: + """Add a path-beneath rule to an existing Landlock ruleset. + + Parameters + ---------- + ruleset_fd: + File descriptor of the Landlock ruleset. + path: + Absolute filesystem path to allow. + access: + Bitmask of allowed access rights. 
+ """ + parent_fd = os.open(path, os.O_PATH | os.O_CLOEXEC) + try: + # struct landlock_path_beneath_attr { + # __u64 allowed_access; // 8 bytes + # __s32 parent_fd; // 4 bytes + # // 4 bytes padding + # } + attr = struct.pack("QiI", access, parent_fd, 0) + attr_ptr = ctypes.c_char_p(attr) + _syscall( + _SYS_LANDLOCK_ADD_RULE, + ruleset_fd, + LANDLOCK_RULE_PATH_BENEATH, + ctypes.cast(attr_ptr, ctypes.c_void_p).value, + 0, + ) + finally: + os.close(parent_fd) + + +def apply_landlock(rw_paths: list[str], ro_paths: list[str]) -> None: + """Create a Landlock ruleset, add path rules, and restrict the current thread. + + This is IRREVERSIBLE. After this call, the thread can only access + the specified paths with the specified permissions. + + Parameters + ---------- + rw_paths: + Paths to allow full read-write access. + ro_paths: + Paths to allow read-only access (execute + read_file + read_dir). + + Raises + ------ + OSError + If any Landlock syscall fails. No fallback, no degraded mode. + """ + abi = get_abi_version() + handled_access_fs = _get_fs_access_flags(abi) + + # struct landlock_ruleset_attr { __u64 handled_access_fs; } + ruleset_attr = struct.pack("Q", handled_access_fs) + ruleset_attr_ptr = ctypes.c_char_p(ruleset_attr) + ruleset_fd = _syscall( + _SYS_LANDLOCK_CREATE_RULESET, + ctypes.cast(ruleset_attr_ptr, ctypes.c_void_p).value, + len(ruleset_attr), + 0, + ) + + try: + # Add read-write path rules + for path in rw_paths: + if os.path.exists(path): + _add_rule(ruleset_fd, path, handled_access_fs) + + # Add read-only path rules + for path in ro_paths: + if os.path.exists(path): + _add_rule(ruleset_fd, path, ACCESS_FS_READ_ONLY) + + # prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) -- required before restrict_self + PR_SET_NO_NEW_PRIVS = 38 + _libc.prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) + + # landlock_restrict_self(ruleset_fd, 0) + _syscall(_SYS_LANDLOCK_RESTRICT_SELF, ruleset_fd, 0) + finally: + os.close(ruleset_fd) From 58876bd6a1c2d804b9b2ef54e9f109468bd42d16 Mon Sep 
"""Startup probe for Landlock filesystem isolation.

Forks a child process to verify that Landlock actually works on this
kernel. The child applies Landlock, writes to an allowed directory,
and verifies that reads outside the sandbox are blocked.

Because Landlock is irreversible, the probe MUST run in a fork.
If the probe fails, the process exits with sys.exit(1).
"""

import logging
import os
import subprocess
import sys
import textwrap
from pathlib import Path

logger = logging.getLogger(__name__)


def probe_landlock() -> int:
    """Run a child process that applies Landlock and verifies it blocks escapes.

    The child applies Landlock with only a fresh temp directory writable,
    then checks that an in-sandbox write/read round-trips and that a write
    outside the sandbox is rejected.

    Returns
    -------
    int
        The Landlock ABI version reported by the child on success.

    Notes
    -----
    Calls ``sys.exit(1)`` if Landlock is unavailable, the probe fails,
    or the child does not finish within 30 seconds.
    """
    # The child script imports landlock_ctypes from the same package.
    # We run it as a subprocess so Landlock restrictions are confined
    # to the child process and do not affect the parent.
    #
    # Importability is provided via PYTHONPATH below; the child must NOT
    # reference __file__, which is undefined under ``python -c``.
    child_script = textwrap.dedent("""\
        import os
        import sys
        import tempfile

        from sandbox_agent.landlock_ctypes import apply_landlock, get_abi_version

        abi = get_abi_version()

        # Create a temp directory for the sandbox
        tmp_dir = tempfile.mkdtemp(prefix="landlock_probe_")

        # Read-only paths for basic system functionality
        ro_paths = []
        for p in ["/usr", "/lib", "/lib64", "/etc"]:
            if os.path.exists(p):
                ro_paths.append(p)

        # Apply Landlock: only tmp_dir is writable
        apply_landlock(rw_paths=[tmp_dir], ro_paths=ro_paths)

        # Verify: writing inside the sandbox must succeed
        test_file = os.path.join(tmp_dir, "probe_test.txt")
        with open(test_file, "w") as f:
            f.write("landlock probe ok")

        # Verify: reading the file back must succeed
        with open(test_file, "r") as f:
            content = f.read()
        assert content == "landlock probe ok", f"Read-back mismatch: {content!r}"

        # Verify: writing OUTSIDE the sandbox must fail
        blocked = False
        try:
            with open("/tmp/landlock_escape_test.txt", "w") as f:
                f.write("should not work")
        except PermissionError:
            blocked = True
        except OSError as e:
            # EACCES (13) is also acceptable
            if e.errno == 13:
                blocked = True
            else:
                raise

        if not blocked:
            print("LANDLOCK_FAIL: write outside sandbox was NOT blocked", file=sys.stderr)
            sys.exit(2)

        print(f"LANDLOCK_OK abi={abi}")
        sys.exit(0)
    """)

    # Make the package importable in the child via PYTHONPATH.
    # __file__ = src/sandbox_agent/landlock_probe.py -> package src = src/
    package_src = str(Path(__file__).resolve().parent.parent)

    try:
        result = subprocess.run(
            [sys.executable, "-c", child_script],
            capture_output=True,
            text=True,
            timeout=30,
            env={**os.environ, "PYTHONPATH": package_src},
        )
    except subprocess.TimeoutExpired:
        # A hung child counts as a failed probe, not a crash of the parent.
        logger.error("Landlock probe timed out after 30s")
        print("FATAL: Landlock probe timed out after 30s.", file=sys.stderr)
        sys.exit(1)

    if result.returncode != 0:
        logger.error(
            "Landlock probe FAILED (exit=%d):\nstdout: %s\nstderr: %s",
            result.returncode,
            result.stdout.strip(),
            result.stderr.strip(),
        )
        print(
            f"FATAL: Landlock probe failed. "
            f"Kernel may not support Landlock or /proc/sys/kernel/unprivileged_landlock is 0.\n"
            f"stderr: {result.stderr.strip()}",
            file=sys.stderr,
        )
        sys.exit(1)

    # Parse ABI version from the child's "LANDLOCK_OK abi=N" line.
    stdout = result.stdout.strip()
    abi_version = 0
    for line in stdout.splitlines():
        if line.startswith("LANDLOCK_OK"):
            for part in line.split():
                if part.startswith("abi="):
                    abi_version = int(part.split("=", 1)[1])
            break

    if abi_version < 1:
        logger.error("Landlock probe returned invalid ABI version: %s", stdout)
        sys.exit(1)

    logger.info("Landlock probe passed -- ABI version %d", abi_version)
    return abi_version
+""" + +import json +import logging +import os +from contextvars import ContextVar +from typing import Any, Optional + +logger = logging.getLogger(__name__) + +# Agent metadata (static, used in Resource and spans) +AGENT_NAME = os.getenv("SANDBOX_AGENT_NAME", "sandbox-legion") +AGENT_VERSION = "1.0.0" +AGENT_FRAMEWORK = "langgraph" + +# ContextVar to pass root span from middleware to agent code. +# This allows execute() to access the middleware-created root span +# even though trace.get_current_span() would return a child span. +_root_span_var: ContextVar = ContextVar('root_span', default=None) + + +def get_root_span(): + """Get the root span created by tracing middleware. + + Use this instead of trace.get_current_span() when you need to set + attributes on the root span (e.g., mlflow.spanOutputs for streaming). + + Returns: + The root span, or None if not in a traced request context. + """ + return _root_span_var.get() + + +# OpenInference semantic conventions +try: + from openinference.semconv.trace import SpanAttributes, OpenInferenceSpanKindValues + OPENINFERENCE_AVAILABLE = True +except ImportError: + OPENINFERENCE_AVAILABLE = False + logger.warning("openinference-semantic-conventions not available") + + +def _get_otlp_exporter(endpoint: str): + """Get HTTP OTLP exporter.""" + from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter + if not endpoint.endswith("/v1/traces"): + endpoint = endpoint.rstrip("/") + "/v1/traces" + return OTLPSpanExporter(endpoint=endpoint) + + +def setup_observability() -> bool: + """ + Set up OpenTelemetry tracing with OpenInference instrumentation. + + Call this ONCE at agent startup, before importing agent code. + NEVER raises — all exceptions are caught and logged. OTel issues + must never break the agent's main processing loop. + + Returns: + True if tracing was set up successfully, False otherwise. 
+ """ + service_name = os.getenv("OTEL_SERVICE_NAME", "sandbox-agent") + namespace = os.getenv("K8S_NAMESPACE_NAME", "team1") + otlp_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "") + + if not otlp_endpoint: + logger.warning( + "OTEL_EXPORTER_OTLP_ENDPOINT not set — tracing disabled. " + "Set this env var to enable OpenTelemetry tracing." + ) + return False + + try: + return _setup_observability_inner(service_name, namespace, otlp_endpoint) + except Exception: + logger.exception("OTel setup failed — tracing disabled (agent continues without tracing)") + return False + + +def _setup_observability_inner(service_name: str, namespace: str, otlp_endpoint: str) -> bool: + """Internal setup — may raise. Called by setup_observability() which catches all errors.""" + from opentelemetry import trace + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION + from opentelemetry.propagate import set_global_textmap + from opentelemetry.propagators.composite import CompositePropagator + from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator + from opentelemetry.baggage.propagation import W3CBaggagePropagator + + logger.info("=" * 60) + logger.info("Setting up OpenTelemetry observability") + logger.info(" Service: %s", service_name) + logger.info(" Agent: %s", AGENT_NAME) + logger.info(" Framework: %s", AGENT_FRAMEWORK) + logger.info(" Namespace: %s", namespace) + logger.info(" OTLP Endpoint: %s", otlp_endpoint) + logger.info("=" * 60) + + # Create resource with service and MLflow attributes. + # Resource attributes are STATIC and apply to ALL spans/traces. 
+ # See: https://mlflow.org/docs/latest/genai/tracing/opentelemetry/ + resource = Resource(attributes={ + # Standard OTEL service attributes + SERVICE_NAME: service_name, + SERVICE_VERSION: AGENT_VERSION, + "service.namespace": namespace, + "k8s.namespace.name": namespace, + # MLflow static metadata (applies to all traces) + "mlflow.traceName": AGENT_NAME, + "mlflow.source": service_name, + # GenAI static attributes + "gen_ai.agent.name": AGENT_NAME, + "gen_ai.agent.version": AGENT_VERSION, + "gen_ai.system": AGENT_FRAMEWORK, + }) + + # Create and configure tracer provider + tracer_provider = TracerProvider(resource=resource) + tracer_provider.add_span_processor( + BatchSpanProcessor(_get_otlp_exporter(otlp_endpoint)) + ) + trace.set_tracer_provider(tracer_provider) + + # Auto-instrument LangChain with OpenInference + try: + from openinference.instrumentation.langchain import LangChainInstrumentor + LangChainInstrumentor().instrument() + logger.info("LangChain instrumented with OpenInference") + except ImportError: + logger.warning("openinference-instrumentation-langchain not available") + + # Configure W3C Trace Context propagation + set_global_textmap(CompositePropagator([ + TraceContextTextMapPropagator(), + W3CBaggagePropagator(), + ])) + + # Instrument OpenAI for GenAI semantic conventions + try: + from opentelemetry.instrumentation.openai import OpenAIInstrumentor + OpenAIInstrumentor().instrument() + logger.info("OpenAI instrumented with GenAI semantic conventions") + except ImportError: + logger.warning("opentelemetry-instrumentation-openai not available") + + return True + + +# Tracer for manual spans — use OpenInference-compatible name +_tracer = None +TRACER_NAME = "openinference.instrumentation.agent" + + +def get_tracer(): + """Get tracer for creating manual spans.""" + from opentelemetry import trace + + global _tracer + if _tracer is None: + _tracer = trace.get_tracer(TRACER_NAME) + return _tracer + + +def enrich_current_span(**kwargs: Any) -> None: 
+ """Add attributes to the currently active span. + + Convenience helper so agent code can annotate spans without importing + opentelemetry directly. + + Args: + **kwargs: Attribute key-value pairs to set on the current span. + """ + from opentelemetry import trace + + span = trace.get_current_span() + if span and span.is_recording(): + for key, value in kwargs.items(): + span.set_attribute(key, value) + + +def create_tracing_middleware(): + """ + Create Starlette middleware that wraps all requests in a root tracing span. + + This middleware: + 1. Creates a root span BEFORE A2A handlers run + 2. Sets MLflow/GenAI attributes on the root span + 3. Parses A2A JSON-RPC request to extract user input + 4. Captures response to set output attributes + 5. For streaming (SSE) responses, sets status without capturing body + + Usage in agent.py: + from sandbox_agent.observability import create_tracing_middleware + app = server.build() + app.add_middleware(BaseHTTPMiddleware, dispatch=create_tracing_middleware()) + """ + from starlette.requests import Request + from starlette.responses import Response, StreamingResponse + from opentelemetry import trace, context + from opentelemetry.trace import Status, StatusCode, SpanKind + + async def tracing_middleware(request: Request, call_next): + # Skip non-API paths (health checks, agent card, etc.) 
+ if request.url.path in [ + "/health", "/ready", + "/.well-known/agent-card.json", + "/.well-known/agent-graph-card.json", + ]: + return await call_next(request) + + tracer = get_tracer() + + # Parse request body to extract user input and context + user_input = None + context_id = None + message_id = None + + try: + body = await request.body() + if body: + data = json.loads(body) + # A2A JSON-RPC format: params.message.parts[0].text + params = data.get("params", {}) + message = params.get("message", {}) + parts = message.get("parts", []) + if parts and isinstance(parts, list): + user_input = parts[0].get("text", "") + context_id = params.get("contextId") or message.get("contextId") + message_id = message.get("messageId") + except Exception as e: + logger.debug("Could not parse request body: %s", e) + + # Break parent chain to make this a true root span. + # Without this, the span would inherit parent from W3C Trace Context headers. + empty_ctx = context.Context() + detach_token = context.attach(empty_ctx) + + try: + # Create root span with correct GenAI naming convention. + # Per https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/ + # Span name: "invoke_agent {gen_ai.agent.name}" + span_name = f"invoke_agent {AGENT_NAME}" + + with tracer.start_as_current_span( + span_name, + kind=SpanKind.INTERNAL, # In-process agent (not remote service) + ) as span: + # Store span in ContextVar so agent code can access it. + # trace.get_current_span() in execute() returns the innermost + # span (A2A span), not our root span. 
+ span_token = _root_span_var.set(span) + + # === GenAI Semantic Conventions (Required) === + span.set_attribute("gen_ai.operation.name", "invoke_agent") + span.set_attribute("gen_ai.provider.name", AGENT_FRAMEWORK) + span.set_attribute("gen_ai.agent.name", AGENT_NAME) + span.set_attribute("gen_ai.agent.version", AGENT_VERSION) + + # Set input attributes (Prompt column in MLflow) + if user_input: + span.set_attribute("gen_ai.prompt", user_input[:1000]) + span.set_attribute("input.value", user_input[:1000]) + span.set_attribute("mlflow.spanInputs", user_input[:1000]) + + # Session tracking — use context_id or message_id as fallback + session_id = context_id or message_id + + if session_id: + span.set_attribute("gen_ai.conversation.id", session_id) + span.set_attribute("mlflow.trace.session", session_id) + span.set_attribute("session.id", session_id) + + # MLflow trace metadata (appears in trace list columns) + span.set_attribute("mlflow.spanType", "AGENT") + span.set_attribute("mlflow.traceName", AGENT_NAME) + span.set_attribute("mlflow.runName", f"{AGENT_NAME}-invoke") + span.set_attribute("mlflow.source", os.getenv("OTEL_SERVICE_NAME", "sandbox-agent")) + span.set_attribute("mlflow.version", AGENT_VERSION) + + # User tracking — extract from auth header if available + auth_header = request.headers.get("authorization", "") + if auth_header: + span.set_attribute("mlflow.user", "authenticated") + span.set_attribute("enduser.id", "authenticated") + else: + span.set_attribute("mlflow.user", "anonymous") + span.set_attribute("enduser.id", "anonymous") + + # OpenInference span kind (for Phoenix) + if OPENINFERENCE_AVAILABLE: + span.set_attribute( + SpanAttributes.OPENINFERENCE_SPAN_KIND, + OpenInferenceSpanKindValues.AGENT.value, + ) + + try: + # Call the next handler (A2A) + response = await call_next(request) + + # Try to capture response for output attributes. + # This only works for non-streaming responses. 
+ if isinstance(response, Response) and not isinstance( + response, StreamingResponse + ): + # Read response body — we MUST recreate response after + _chunks: list[bytes] = [] + async for chunk in response.body_iterator: + _chunks.append(chunk) + response_body = b"".join(_chunks) + + # Try to parse and extract output for MLflow + try: + if response_body: + resp_data = json.loads(response_body) + result = resp_data.get("result", {}) + artifacts = result.get("artifacts", []) + if artifacts: + parts = artifacts[0].get("parts", []) + if parts: + output_text = parts[0].get("text", "") + if output_text: + span.set_attribute( + "gen_ai.completion", output_text[:1000] + ) + span.set_attribute( + "output.value", output_text[:1000] + ) + span.set_attribute( + "mlflow.spanOutputs", output_text[:1000] + ) + except Exception as e: + logger.debug("Could not parse response body: %s", e) + + # Always recreate response since we consumed the iterator + span.set_status(Status(StatusCode.OK)) + return Response( + content=response_body, + status_code=response.status_code, + headers=dict(response.headers), + media_type=response.media_type, + ) + + # For streaming responses (SSE), just set status and return. + # Don't try to capture the full stream body. 
+ span.set_status(Status(StatusCode.OK)) + return response + + except Exception as e: + span.set_status(Status(StatusCode.ERROR, str(e))) + span.record_exception(e) + raise + finally: + # Reset the ContextVar to avoid leaking span reference + _root_span_var.reset(span_token) + finally: + # Always detach the context to restore parent chain for other requests + context.detach(detach_token) + + return tracing_middleware From 5d93d5b1f43041aeabaa4f7d75009be7d159a4a7 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:48:41 +0100 Subject: [PATCH 14/26] feat(sandbox): three-tier permission checker with deny/allow/HITL rules from settings.json Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/permissions.py | 403 ++++++++++++++++++ 1 file changed, 403 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/permissions.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/permissions.py b/a2a/sandbox_agent/src/sandbox_agent/permissions.py new file mode 100644 index 00000000..9e3a8190 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/permissions.py @@ -0,0 +1,403 @@ +"""Three-tier permission checker modeled after Claude Code's settings.json. + +Every tool call from the LangGraph agent is checked against allow/deny rules +before execution: + + DENY -- operation matches a deny rule (rejected immediately) + ALLOW -- operation matches an allow rule (auto-executed) + HITL -- operation matches neither (triggers LangGraph interrupt() for + human approval) + +Rules use the format ``type(prefix:glob)`` where *type* is ``shell``, +``file``, ``network``, etc. Examples: + + shell(grep:*) -- any shell command starting with "grep" + file(read:/workspace/**) -- file reads anywhere under /workspace/ + network(outbound:*) -- any outbound network access + +Deny rules are checked **first** (deny takes precedence over allow). 
+""" + +from __future__ import annotations + +import enum +import fnmatch +import re +from typing import Any + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +# Pattern: ``type(value:glob)`` +_RULE_RE = re.compile(r"^(?P[a-z]+)\((?P.+)\)$") + + +class PermissionResult(enum.Enum): + """Outcome of a permission check.""" + + ALLOW = "allow" + DENY = "deny" + HITL = "hitl" + + +class PermissionChecker: + """Evaluate operations against a settings dict with allow/deny rules. + + Parameters + ---------- + settings: + Parsed *settings.json* dict. Expected shape:: + + { + "context_workspace": "/workspace/${CONTEXT_ID}", + "permissions": { + "allow": ["shell(grep:*)", ...], + "deny": ["shell(sudo:*)", ...] + } + } + """ + + def __init__(self, settings: dict[str, Any]) -> None: + workspace = self._resolve_workspace(settings) + perms = settings.get("permissions", {}) + self._deny_rules = self._parse_rules(perms.get("deny", []), workspace) + self._allow_rules = self._parse_rules(perms.get("allow", []), workspace) + + # ------------------------------------------------------------------ + # Core method + # ------------------------------------------------------------------ + + # Shell metacharacters that separate independent commands. + _COMPOUND_SEPARATORS = ("&&", "||", ";", "|") + + def check(self, operation_type: str, operation: str) -> PermissionResult: + """Return ALLOW, DENY, or HITL for a given *operation_type* + *operation*. + + Parameters + ---------- + operation_type: + High-level category, e.g. ``"shell"``, ``"file"``, ``"network"``. + operation: + The concrete operation string, e.g. ``"grep -r foo ."`` for a + shell command or ``"read:/workspace/ctx1/main.py"`` for a file + operation. + """ + # For shell commands with compound operators (&&, ||, ;, |), + # check each segment independently. 
+ if operation_type == "shell": + segments = self._split_compound(operation) + if len(segments) > 1: + return self._check_compound(segments) + + return self._check_single(operation_type, operation) + + def _check_single(self, operation_type: str, operation: str) -> PermissionResult: + """Check a single (non-compound) operation.""" + # Deny rules are checked first -- deny takes precedence. + if self._matches_any(operation_type, operation, self._deny_rules): + return PermissionResult.DENY + + # For shell operations, also check for interpreter bypass: + # e.g. bash -c "curl ..." should be denied if curl is denied. + # Additionally, if the outer command is an interpreter (bash/sh/python) + # and embeds unknown commands, route to HITL rather than auto-allowing. + if operation_type == "shell": + embedded_commands = self.check_interpreter_bypass(operation) + if embedded_commands: + for embedded in embedded_commands: + if self._matches_any("shell", embedded, self._deny_rules): + return PermissionResult.DENY + # Embedded commands exist but none are denied. Route to HITL + # so a human reviews what the interpreter will execute, rather + # than auto-allowing via the outer shell(bash:*) rule. + return PermissionResult.HITL + + if self._matches_any(operation_type, operation, self._allow_rules): + return PermissionResult.ALLOW + + return PermissionResult.HITL + + def _check_compound(self, segments: list[str]) -> PermissionResult: + """Check each segment of a compound shell command. + + All segments must be ALLOW for the compound to be ALLOW. + Any DENY makes the whole compound DENY. + Otherwise HITL. 
+ """ + has_hitl = False + for seg in segments: + result = self._check_single("shell", seg) + if result is PermissionResult.DENY: + return PermissionResult.DENY + if result is PermissionResult.HITL: + has_hitl = True + return PermissionResult.HITL if has_hitl else PermissionResult.ALLOW + + @classmethod + def _split_compound(cls, operation: str) -> list[str]: + """Split a shell command on compound operators (&&, ||, ;, |). + + Returns a list of stripped command segments. If no operators are + found, returns a single-element list with the original command. + """ + # Replace multi-char operators first to avoid confusion with single | + temp = operation + sentinel = "\x00" + for sep in ("&&", "||", ";"): + temp = temp.replace(sep, sentinel) + # Now split on single | (but not if it was part of || already replaced) + temp = temp.replace("|", sentinel) + segments = [s.strip() for s in temp.split(sentinel) if s.strip()] + return segments if segments else [operation] + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + @staticmethod + def _resolve_workspace(settings: dict[str, Any]) -> str: + """Derive the workspace root from ``context_workspace``. + + The value may contain ``${CONTEXT_ID}`` (or similar) placeholders. + We strip those so that glob rules like ``${WORKSPACE}/**`` can be + expanded to the bare workspace prefix (e.g. ``/workspace``). + """ + raw = settings.get("context_workspace", "/workspace") + # Remove a trailing ``/${SOME_VAR}`` placeholder (e.g. ``/${CONTEXT_ID}``) + # so we keep only the static prefix. + return re.sub(r"/\$\{[^}]+\}$", "", raw) + + @staticmethod + def _parse_rules( + raw_rules: list[str], workspace: str + ) -> list[tuple[str, str]]: + """Parse rule strings into ``(operation_type, glob_pattern)`` pairs. + + ``${WORKSPACE}`` inside a rule body is expanded to *workspace*. 
+ """ + parsed: list[tuple[str, str]] = [] + for rule in raw_rules: + m = _RULE_RE.match(rule) + if m is None: + continue # skip malformed rules + rule_type = m.group("type") + body = m.group("body") + # Expand ${WORKSPACE} variable + body = body.replace("${WORKSPACE}", workspace) + parsed.append((rule_type, body)) + return parsed + + @staticmethod + def _matches_any( + operation_type: str, + operation: str, + rules: list[tuple[str, str]], + ) -> bool: + """Return True if *operation* matches at least one rule.""" + for rule_type, pattern in rules: + if rule_type != operation_type: + continue + if PermissionChecker._match_rule(pattern, operation_type, operation): + return True + return False + + @staticmethod + def _match_rule(pattern: str, operation_type: str, operation: str) -> bool: + """Match a single rule body against the operation. + + Rule body format is ``prefix:glob`` (the part inside the parentheses). + + For **shell** operations the *prefix* may be multi-word (e.g. + ``pip install``, ``git clone``). The matcher checks whether the + operation starts with the prefix. If the glob part is ``*`` (the + most common case), any suffix is accepted. + + For **file** / **network** operations the operation string is + expected to be ``action:path`` (e.g. ``read:/workspace/foo.py``). + The rule body is ``action:path_glob`` so we split on the first + colon of both and compare action + fnmatch on the path. + """ + if operation_type == "shell": + return PermissionChecker._match_shell(pattern, operation) + return PermissionChecker._match_structured(pattern, operation) + + # -- shell matching --------------------------------------------------- + + # Interpreters that can execute arbitrary code via -c / -e flags. + _INTERPRETERS = frozenset({"bash", "sh", "python", "python3", "perl", "ruby", "node"}) + + # Flags that take an inline command string as the next argument. 
+ _EXEC_FLAGS = frozenset({"-c", "-e", "--eval"}) + + @staticmethod + def _match_shell(pattern: str, operation: str) -> bool: + """Match a shell rule pattern against a concrete command string. + + *pattern* has the form ``command_prefix:glob`` where the glob is + almost always ``*``. ``command_prefix`` may contain spaces (e.g. + ``pip install``, ``rm -rf /``). + """ + # Split only on the *last* colon so multi-word prefixes survive. + colon_idx = pattern.rfind(":") + if colon_idx == -1: + return False + prefix = pattern[:colon_idx] + glob_part = pattern[colon_idx + 1:] + + if not operation: + return False + + # Wildcard prefix (*) matches any command + if prefix == "*": + return fnmatch.fnmatch(operation, glob_part) + + # The operation must start with the prefix (case-sensitive). + if not operation.startswith(prefix): + return False + + # What comes after the prefix (may be empty). + remainder = operation[len(prefix):] + + # If there is a remainder, it must be separated by a space or be + # empty (exact match). This prevents "grep" matching "grepping". + if remainder and not remainder[0] == " ": + return False + + remainder = remainder.lstrip() + + # Match the remainder against the glob (``*`` matches everything). + return fnmatch.fnmatch(remainder, glob_part) + + @classmethod + def check_interpreter_bypass(cls, operation: str) -> list[str]: + """Extract embedded commands from interpreter invocations. + + If *operation* uses an interpreter (bash, sh, python, etc.) with + an inline execution flag (``-c``, ``-e``), extract the embedded + command string so it can be checked against deny rules separately. + + Returns a list of embedded command strings (empty if none found). + """ + if not operation: + return [] + + parts = operation.split() + if not parts: + return [] + + # Check if the command starts with a known interpreter. + cmd = parts[0].rsplit("/", 1)[-1] # handle /usr/bin/bash etc. 
+ if cmd not in cls._INTERPRETERS: + return [] + + embedded: list[str] = [] + i = 1 + while i < len(parts): + if parts[i] in cls._EXEC_FLAGS and i + 1 < len(parts): + # Everything after the flag is the inline command. + inline = " ".join(parts[i + 1:]) + # Strip surrounding quotes if present. + if len(inline) >= 2 and inline[0] in ('"', "'") and inline[-1] == inline[0]: + inline = inline[1:-1] + embedded.append(inline) + break + i += 1 + + # Split embedded commands on shell metacharacters: |, &&, ||, ; + # so that "curl evil.com && rm -rf /" checks each segment. + for emb in list(embedded): + for sep in ("&&", "||", ";", "|"): + if sep in emb: + for segment in emb.split(sep): + segment = segment.strip() + if segment and segment not in embedded: + embedded.append(segment) + + return embedded + + # -- structured (file / network) matching ---------------------------- + + @staticmethod + def _match_structured(pattern: str, operation: str) -> bool: + """Match ``action:path_glob`` against ``action:concrete_path``. + + Both *pattern* and *operation* are expected to contain at least one + colon separating the action from the path. + """ + p_colon = pattern.find(":") + o_colon = operation.find(":") + if p_colon == -1 or o_colon == -1: + return False + + p_action = pattern[:p_colon] + p_path_glob = pattern[p_colon + 1:] + + o_action = operation[:o_colon] + o_path = operation[o_colon + 1:] + + if p_action != o_action: + return False + + # The path glob may itself end with ``:*`` from the rule syntax + # (e.g. ``/etc/shadow:*``). Strip a trailing ``:*`` from the + # glob -- the colon-star is a "match any extra args" marker in the + # rule syntax, not part of the filesystem path. + if p_path_glob.endswith(":*"): + p_path_glob = p_path_glob[:-2] + + # If the glob is now empty, it means the rule was something like + # ``network(outbound:*)`` -- match everything. + if p_path_glob == "*": + return True + + # Use fnmatch for glob-style matching (supports ``**``). 
+ # fnmatch doesn't natively handle ``**`` the way gitignore does, + # so we convert ``**`` to a sentinel and back. + return _glob_match(p_path_glob, o_path) + + +# --------------------------------------------------------------------------- +# Glob helper +# --------------------------------------------------------------------------- + + +def _glob_match(pattern: str, text: str) -> bool: + """Glob-style match that treats ``**`` as "zero or more path segments". + + Python's :func:`fnmatch.fnmatch` treats ``*`` as "anything except + nothing" but does *not* cross ``/`` boundaries in the same way as + gitignore's ``**``. This helper converts ``**`` patterns into + regular expressions for correct matching. + """ + # Fast path: exact match or simple star. + if pattern == text: + return True + + # Convert the glob to a regex. + # ``**`` -> match anything including ``/`` + # ``*`` -> match anything except ``/`` + # ``?`` -> match a single char except ``/`` + parts: list[str] = [] + i = 0 + while i < len(pattern): + c = pattern[i] + if c == "*": + if i + 1 < len(pattern) and pattern[i + 1] == "*": + parts.append(".*") + i += 2 + # Skip a following ``/`` so ``**/`` works correctly. 
+ if i < len(pattern) and pattern[i] == "/": + i += 1 + continue + parts.append("[^/]*") + elif c == "?": + parts.append("[^/]") + elif c in r"\.[](){}+^$|": + parts.append("\\" + c) + else: + parts.append(c) + i += 1 + + regex = "^" + "".join(parts) + "$" + return re.match(regex, text) is not None From 205094ac3ceeb640adb9a6c7a33372711db3ffba Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:48:47 +0100 Subject: [PATCH 15/26] feat(sandbox): append-only nested plan store with main steps and alternative subplans Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/plan_store.py | 330 ++++++++++++++++++ 1 file changed, 330 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/plan_store.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/plan_store.py b/a2a/sandbox_agent/src/sandbox_agent/plan_store.py new file mode 100644 index 00000000..47501753 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/plan_store.py @@ -0,0 +1,330 @@ +"""Append-only nested plan container. + +Stores the agent's execution plan as a nested structure of main steps +and subplans. Only additions are allowed after initial creation — the +replanner can add new main steps (after all existing are terminal) or +create alternative subplans within a step. 
+ +Structure:: + + { + "version": 1, + "steps": { + "1": { + "description": "Clone the repo", + "status": "done", + "subplans": { + "a": { + "substeps": { + "1": {"description": "git clone ...", "status": "done"}, + }, + "status": "done", + "created_by": "planner", + } + }, + "active_subplan": "a", + }, + "2": { + "description": "Analyze CI logs", + "status": "running", + "subplans": { + "a": {"substeps": {...}, "status": "failed", "created_by": "planner"}, + "b": {"substeps": {...}, "status": "running", "created_by": "replanner"}, + }, + "active_subplan": "b", + }, + }, + } + +Status transitions (one-way): + pending → running → done | failed | cancelled +""" + +from __future__ import annotations + +import logging +from typing import Any + +logger = logging.getLogger(__name__) + +# Valid status values and their terminal flag +_TERMINAL = frozenset({"done", "failed", "cancelled"}) +_VALID_STATUS = frozenset({"pending", "running"}) | _TERMINAL + + +# --------------------------------------------------------------------------- +# Construction +# --------------------------------------------------------------------------- + + +def create_plan(steps: list[str], creator: str = "planner") -> dict[str, Any]: + """Create a new plan store from a list of step descriptions. + + Each step gets a single subplan "a" with one substep matching + the step description (for simple plans where steps = substeps). 
+ """ + plan: dict[str, Any] = {"version": 1, "steps": {}} + for i, desc in enumerate(steps): + step_key = str(i + 1) + plan["steps"][step_key] = { + "description": desc, + "status": "pending", + "subplans": { + "a": { + "substeps": { + "1": {"description": desc, "status": "pending"}, + }, + "status": "pending", + "created_by": creator, + }, + }, + "active_subplan": "a", + } + # Mark first step as running + if plan["steps"]: + plan["steps"]["1"]["status"] = "running" + plan["steps"]["1"]["subplans"]["a"]["status"] = "running" + return plan + + +# --------------------------------------------------------------------------- +# Mutations (append-only) +# --------------------------------------------------------------------------- + + +def add_steps( + plan: dict[str, Any], + new_steps: list[str], + creator: str = "replanner", +) -> dict[str, Any]: + """Add new main steps to the plan. + + Only allowed when ALL existing steps are terminal (done/failed/cancelled). + Returns a new plan dict (does not mutate in place). + + Raises ValueError if preconditions are not met. 
+ """ + if creator != "replanner": + raise ValueError(f"Only replanner can add steps, got creator={creator}") + + steps = plan.get("steps", {}) + non_terminal = [ + k for k, s in steps.items() + if s.get("status") not in _TERMINAL + ] + if non_terminal: + raise ValueError( + f"Cannot add steps: steps {non_terminal} are still active" + ) + + new_plan = _deep_copy(plan) + next_idx = max((int(k) for k in steps), default=0) + 1 + for i, desc in enumerate(new_steps): + step_key = str(next_idx + i) + new_plan["steps"][step_key] = { + "description": desc, + "status": "pending", + "subplans": { + "a": { + "substeps": { + "1": {"description": desc, "status": "pending"}, + }, + "status": "pending", + "created_by": creator, + }, + }, + "active_subplan": "a", + } + + # Mark first new step as running + first_new = str(next_idx) + if first_new in new_plan["steps"]: + new_plan["steps"][first_new]["status"] = "running" + new_plan["steps"][first_new]["subplans"]["a"]["status"] = "running" + + logger.info( + "Added %d steps (start=%s) by %s", len(new_steps), first_new, creator, + ) + return new_plan + + +def add_alternative_subplan( + plan: dict[str, Any], + step_key: str, + substeps: list[str], +) -> tuple[dict[str, Any], str]: + """Create an alternative subplan for a step (replanner only). + + Returns (new_plan, subplan_key) where subplan_key is the new key (b, c, ...). + The active_subplan is switched to the new one. 
def add_alternative_subplan(
    plan: dict[str, Any],
    step_key: str,
    substeps: list[str],
) -> tuple[dict[str, Any], str]:
    """Create an alternative subplan for a step (replanner only).

    Returns (new_plan, subplan_key) where subplan_key is the new key (b, c, ...).
    The active_subplan is switched to the new one.

    Raises:
        ValueError: If ``step_key`` does not exist.
    """
    new_plan = _deep_copy(plan)
    step = new_plan["steps"].get(step_key)
    if step is None:
        raise ValueError(f"Step {step_key} not found")

    # Subplan keys are letters: "a" for the original, then "b", "c", ...
    existing_keys = sorted(step["subplans"].keys())
    next_key = chr(ord("a") + len(existing_keys))

    step["subplans"][next_key] = {
        "substeps": {
            str(i + 1): {"description": desc, "status": "pending"}
            for i, desc in enumerate(substeps)
        },
        "status": "running",
        "created_by": "replanner",
    }
    step["active_subplan"] = next_key
    # Re-open the step: a fresh alternative means work on it resumes.
    step["status"] = "running"

    logger.info(
        "Created alternative subplan '%s' for step %s (%d substeps)",
        next_key, step_key, len(substeps),
    )
    return new_plan, next_key


# ---------------------------------------------------------------------------
# Status updates
# ---------------------------------------------------------------------------


def set_step_status(
    plan: dict[str, Any],
    step_key: str,
    status: str,
) -> dict[str, Any]:
    """Update a step's status. Validates one-way transitions.

    Terminal steps are frozen: a late update is logged and ignored rather
    than raised, since callers may race on completion.

    Raises:
        ValueError: If ``status`` is unknown or ``step_key`` is missing.
    """
    if status not in _VALID_STATUS:
        raise ValueError(f"Invalid status: {status}")
    new_plan = _deep_copy(plan)
    step = new_plan["steps"].get(step_key)
    if step is None:
        raise ValueError(f"Step {step_key} not found")
    old = step["status"]
    if old in _TERMINAL:
        logger.warning("Step %s already terminal (%s), ignoring → %s", step_key, old, status)
        return new_plan
    step["status"] = status
    # Also update the active subplan status so step and subplan agree.
    active = step.get("active_subplan", "a")
    if active in step.get("subplans", {}):
        sp = step["subplans"][active]
        if sp.get("status") not in _TERMINAL:
            sp["status"] = status
    return new_plan


def set_substep_status(
    plan: dict[str, Any],
    step_key: str,
    substep_key: str,
    status: str,
    result_summary: str = "",
    tool_calls: list[str] | None = None,
) -> dict[str, Any]:
    """Update a substep's status within the active subplan.

    Optionally records a result summary and the tool calls made.  Like
    :func:`set_step_status`, terminal substeps are frozen (the module
    contract is one-way transitions): a late update is logged and ignored.

    Raises:
        ValueError: If the status is unknown, or the step/subplan/substep
            does not exist.
    """
    if status not in _VALID_STATUS:
        raise ValueError(f"Invalid status: {status}")
    new_plan = _deep_copy(plan)
    step = new_plan["steps"].get(step_key)
    if step is None:
        raise ValueError(f"Step {step_key} not found")
    active = step.get("active_subplan", "a")
    subplan = step.get("subplans", {}).get(active)
    if subplan is None:
        raise ValueError(f"Subplan {active} not found in step {step_key}")
    substep = subplan.get("substeps", {}).get(substep_key)
    if substep is None:
        raise ValueError(f"Substep {substep_key} not found in subplan {active}")
    # Enforce the one-way transition contract, mirroring set_step_status.
    old = substep.get("status")
    if old in _TERMINAL:
        logger.warning(
            "Substep %s/%s already terminal (%s), ignoring → %s",
            step_key, substep_key, old, status,
        )
        return new_plan
    substep["status"] = status
    if result_summary:
        substep["result_summary"] = result_summary
    if tool_calls:
        substep["tool_calls"] = tool_calls
    return new_plan


# ---------------------------------------------------------------------------
# Queries
# ---------------------------------------------------------------------------


def get_current_step(plan: dict[str, Any]) -> tuple[str, dict[str, Any]] | None:
    """Return (step_key, step_dict) for the first non-terminal step, else None."""
    for key in sorted(plan.get("steps", {}), key=int):
        step = plan["steps"][key]
        if step.get("status") not in _TERMINAL:
            return key, step
    return None


def get_active_substep(plan: dict[str, Any], step_key: str) -> tuple[str, dict] | None:
    """Return (substep_key, substep_dict) for the first pending/running substep.

    Looks only inside the step's active subplan; returns None when the step
    or subplan is missing, or when every substep is terminal.
    """
    step = plan.get("steps", {}).get(step_key)
    if step is None:
        return None
    active = step.get("active_subplan", "a")
    subplan = step.get("subplans", {}).get(active)
    if subplan is None:
        return None
    for sk in sorted(subplan.get("substeps", {}), key=int):
        ss = subplan["substeps"][sk]
        if ss.get("status") not in _TERMINAL:
            return sk, ss
    return None


def step_count(plan: dict[str, Any]) -> int:
    """Total number of main steps."""
    return len(plan.get("steps", {}))


def done_count(plan: dict[str, Any]) -> int:
    """Number of completed main steps."""
    return sum(1 for s in plan.get("steps", {}).values() if s.get("status") == "done")
-> bool: + """True if ALL main steps are in a terminal status.""" + steps = plan.get("steps", {}) + return bool(steps) and all(s.get("status") in _TERMINAL for s in steps.values()) + + +def to_flat_plan(plan: dict[str, Any]) -> list[str]: + """Convert to flat list of step descriptions (backward compat).""" + return [ + plan["steps"][k]["description"] + for k in sorted(plan.get("steps", {}), key=int) + ] + + +def to_flat_plan_steps(plan: dict[str, Any]) -> list[dict[str, Any]]: + """Convert to flat PlanStep list (backward compat with serializer/UI).""" + result = [] + for key in sorted(plan.get("steps", {}), key=int): + step = plan["steps"][key] + active = step.get("active_subplan", "a") + subplan = step.get("subplans", {}).get(active, {}) + alt_count = len(step.get("subplans", {})) - 1 # alternatives (excl. original) + result.append({ + "index": int(key) - 1, # 0-based for compat + "description": step["description"], + "status": step["status"], + "active_subplan": active, + "alternative_count": alt_count, + "substeps": list(subplan.get("substeps", {}).values()), + "created_by": subplan.get("created_by", "planner"), + }) + return result + + +# --------------------------------------------------------------------------- +# Internal +# --------------------------------------------------------------------------- + + +def _deep_copy(d: dict) -> dict: + """Fast deep copy for JSON-compatible dicts.""" + import json + return json.loads(json.dumps(d)) From 2b060dc3909baf0b9db7407939568ef90219eb14 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:48:53 +0100 Subject: [PATCH 16/26] feat(sandbox): system prompt templates for planner, executor, reflector, and reporter nodes Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/prompts.py | 235 ++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/prompts.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/prompts.py 
"""System prompt templates for the plan-execute-reflect reasoning loop.

Each prompt corresponds to a reasoning node:
- PLANNER_SYSTEM: Decomposes user requests into numbered plans
- EXECUTOR_SYSTEM: Executes individual plan steps with tools
- REFLECTOR_SYSTEM: Reviews step output, decides continue/replan/done
- REPORTER_SYSTEM: Summarizes accumulated results into final answer

All prompts receive the workspace preamble via ``with_workspace()``.

Templates use :meth:`str.format` placeholders (``{workspace_path}``,
``{current_step}``, ...), so any literal brace in a template would need
to be doubled.
"""

# ---------------------------------------------------------------------------
# Universal workspace preamble — injected into ALL system prompts
# ---------------------------------------------------------------------------

WORKSPACE_PREAMBLE = """\
WORKSPACE (MOST IMPORTANT RULE):
Your workspace absolute path is: {workspace_path}
ALL file access MUST use this path prefix.

- shell commands: ALWAYS use absolute paths starting with {workspace_path}/
  Example: `ls {workspace_path}/repos/kagenti`
  Example: `cd {workspace_path}/repos/kagenti && gh run list`
  Example: `cd {workspace_path}/repos/kagenti && gh run view 123 --log-failed > {workspace_path}/output/ci.log`
- file_read, file_write, grep, glob: use RELATIVE paths (e.g. `output/report.md`, `repos/kagenti/README.md`).
  These tools resolve paths relative to the workspace automatically.
- NEVER use `../../` or guess paths. NEVER use bare `/workspace/` without the session ID.

Pre-created subdirs: repos/ (clone here), output/ (reports/logs), data/, scripts/
"""


def with_workspace(template: str, workspace_path: str, **kwargs: object) -> str:
    """Prepend the workspace preamble to a system prompt template and format.

    ``kwargs`` values are passed to :meth:`str.format`, so non-string values
    (e.g. step numbers) are fine.  If the combined string has placeholders
    not covered by the arguments, formatting degrades gracefully: first the
    template is formatted alone, and as a last resort it is appended raw.

    Usage::

        system_content = with_workspace(
            EXECUTOR_SYSTEM,
            workspace_path="/workspace/abc123",
            current_step=1,
            step_text="Clone repo",
        )
    """
    full = WORKSPACE_PREAMBLE + "\n" + template
    try:
        return full.format(workspace_path=workspace_path, **kwargs)
    except (KeyError, IndexError):
        # Fallback: try formatting without workspace if template has unknown keys
        try:
            return WORKSPACE_PREAMBLE.format(workspace_path=workspace_path) + "\n" + template.format(**kwargs)
        except (KeyError, IndexError):
            # Last resort: preamble formatted, template left verbatim.
            return WORKSPACE_PREAMBLE.format(workspace_path=workspace_path) + "\n" + template


# Used by the planner node to turn a user request into a numbered plan.
PLANNER_SYSTEM = """\
You are a planning module for a sandboxed coding assistant.

Given the user's request and any prior execution results, produce a concise
numbered plan. Each step should be a single actionable item that can be
executed with the available tools (shell, file_read, file_write, grep, glob,
web_fetch, explore).

IMPORTANT: Almost every request requires tools. The user is asking you to DO
things, not just talk. Create file = file_write. Run command = shell.
Clone repo = shell. Read file = file_read. Search code = grep/glob.

Rules:
- Every step should name the specific tool to use.
- Keep steps concrete and tool-oriented — no vague "analyze" or "think" steps.
- For multi-step analysis, debugging, or investigation tasks, add a final
  step: "Write findings summary to report.md" with sections: Problem,
  Investigation, Root Cause, Resolution.
- Number each step starting at 1.
- Output ONLY the numbered list, nothing else.

Example ("create a file hello.txt with 'hello world'"):
1. Use file_write to create hello.txt with content "hello world".

Example ("list files"):
1. Run `ls -la` in the workspace using shell.

Example ("create a Python project with tests"):
1. Create directory structure: shell(`mkdir -p src tests`).
2. Write src/main.py using file_write.
3. Write tests/test_main.py using file_write.
4. Run tests: shell(`python -m pytest tests/`).

Example ("analyze CI failures for owner/repo PR #758"):
1. Clone repo: shell(`git clone https://github.com/owner/repo.git {workspace_path}/repos/repo`).
2. List failures: shell(`cd {workspace_path}/repos/repo && gh run list --status failure --limit 5`).
3. Download logs: shell(`cd {workspace_path}/repos/repo && gh run view --log-failed > {workspace_path}/output/ci-run.log`).
4. Extract errors: grep(`FAILED|ERROR|AssertionError` in output/ci-run.log).
5. Write findings to report.md with sections: Root Cause, Impact, Fix.

IMPORTANT for gh CLI:
- GH_TOKEN and GITHUB_TOKEN are ALREADY set in the environment. Do NOT
  run `export GH_TOKEN=...` — it's unnecessary and will break auth.
- Always clone the target repo FIRST, then `cd` into it before gh commands.
- gh auto-detects the repo from git remote "origin" — it MUST run inside the cloned repo.
- Use `cd {workspace_path}/repos/ && gh ` in a single shell call.
"""

# Used by the executor node for each plan step; expects current_step,
# step_text, tool_call_count and max_tool_calls placeholders.
EXECUTOR_SYSTEM = """\
You are a sandboxed coding assistant executing step {current_step} of a plan.

Current step: {step_text}
Tool calls so far this step: {tool_call_count}/{max_tool_calls}

Available tools:
- **shell**: Execute a shell command. Returns stdout+stderr and exit code.
- **file_read**: Read a file from the workspace.
- **file_write**: Write content to a file in the workspace.
- **grep**: Search file contents with regex. Faster than shell grep, workspace-scoped.
- **glob**: Find files by pattern (e.g. '**/*.py'). Faster than shell find.
- **web_fetch**: Fetch content from a URL (allowed domains only).
- **explore**: Spawn a read-only sub-agent for codebase research.


EXECUTION MODEL — step-by-step with micro-reflection:
You operate in a loop: call ONE tool → see the result → decide what to do next.
After each tool result, THINK about what happened before calling the next tool.
- Did the command succeed? Check the exit code and output.
- If it failed, adapt your approach — don't blindly retry the same thing.
- If it succeeded, what's the logical next action for this step?

CRITICAL RULES:
- Call exactly ONE tool per response. You will see the result and can call another.
- You MUST use the function/tool calling API — not text descriptions of calls.
- DO NOT write or invent command output. Call the tool, wait for the result.
- If a tool call fails, report the ACTUAL error — do not invent output.
- Slash commands like /rca:ci are for humans, not for you. You use tools.
- If you cannot call a tool for any reason, respond with exactly:
  CANNOT_CALL_TOOL: 

STEP BOUNDARY — CRITICAL:
- You are ONLY executing step {current_step}: "{step_text}"
- When THIS step is done, STOP calling tools immediately.
- Do NOT start the next step. The reflector will advance you.
- Summarize what you accomplished and stop.

When the step is COMPLETE (goal achieved or cannot be achieved), stop calling
tools and summarize what you accomplished with the actual tool output.

## Handling Large Output
Tool output is truncated to 10KB. For commands that produce large output:
- Redirect to a file: `command > {workspace_path}/output/result.json`
- Then analyze with grep: grep(`pattern` in output/result.json)

## Debugging Guidelines
- If a command fails with "unknown flag" or "invalid option" → run `command --help`
  to see valid flags. Do NOT guess flag names.
- After each tool call, analyze the output carefully before deciding the next action.
- Check error output (stderr) and exit code before retrying.
- If you get the same result twice → the step is done, stop and summarize.
"""

# Used by the reflector node; its output must be a single decision word.
REFLECTOR_SYSTEM = """\
You are a reflection module reviewing the output of a plan step.

Plan:
{plan_text}

Current step ({current_step} of {total_steps}): {step_text}
Step result: {step_result}
Remaining steps: {remaining_steps}

Iteration: {iteration} of {max_iterations}
Replan count so far: {replan_count} (higher counts mean more rework — weigh this when deciding)
Tool calls this iteration: {tool_calls_this_iter}
Recent decisions: {recent_decisions}
{replan_history}

STALL DETECTION:
- If the executor made 0 tool calls, the step likely FAILED.
- If the step result is just text describing what WOULD be done (not actual
  tool output), that means the executor did not call any tools. Treat as failure.

RETRY vs REPLAN:
- **retry** = same step failed, try a DIFFERENT approach for THIS step only.
  Example: `gh run view --log-failed` failed → retry with `gh api` instead.
  The executor re-runs the current step with a modified brief. Completed steps
  are preserved. Use retry FIRST before replan.
- **replan** = the overall approach is fundamentally wrong. Creates a new plan
  but preserves already-completed steps (never restarts from step 1).
  Only use replan if retry won't help (e.g., wrong repo cloned, wrong PR).
- Do NOT replan with the same approach that already failed.
- A high replan count suggests diminishing returns — consider "done" with
  partial results.

DECISION PROCESS:
1. Did the current step succeed? Check tool output for real results (not just "no output").
2. If it failed, can you try a different approach for the SAME step? → retry.
3. If the whole approach is wrong → replan.
4. If step succeeded and remaining steps exist → continue.
5. If ALL plan steps are complete (remaining = NONE) → done.

Decide ONE of the following (output ONLY the decision word):
- **continue** — Current step done, remaining steps exist → move to next step.
- **retry** — Current step failed, re-execute with a different approach.
- **replan** — Overall approach is wrong, create new plan (keeps done steps).
- **done** — ALL plan steps complete (remaining = NONE), task is fully answered.
- **hitl** — Human input is needed to proceed.

Output the single word: continue, retry, replan, done, or hitl.
"""

# Used by the reporter node to produce the final user-facing answer.
REPORTER_SYSTEM = """\
You are a reporting module. Summarize the results of all executed steps
into a clear, concise final answer for the user.

Plan:
{plan_text}

Step status:
{step_status_text}

Step results:
{results_text}

{limit_note}

RULES:
- Only report facts from actual tool output — NEVER fabricate data.
- If a step FAILED, explain WHY it failed (include the error message).
- If steps are PARTIAL, summarize what was accomplished so far.
- If no real data was obtained, say "Unable to retrieve data" rather than
  making up results.
- Include relevant command output, file paths, or next steps.
- Do NOT include the plan itself — just the results.
- Do NOT say "The task has been completed" — present the actual findings.
- Do NOT echo or repeat these instructions in your response.
- Start your response directly with the summary content.
- List ALL workspace file paths in full form (e.g. repos/kagenti/report.md).
"""
+ +Five LangGraph node functions implement structured multi-step reasoning: + +1. **router** — Entry point. Checks plan_status to decide: resume existing + plan, replan with new context, or start fresh. +2. **planner** — Decomposes the user request into numbered steps. + Detects simple (single-step) requests and marks them done-after-execute. +3. **executor** — Runs the current plan step with bound tools (existing + react pattern). +4. **reflector** — Reviews execution output, decides: ``continue`` (next + step), ``replan``, ``done``, or ``hitl``. Updates per-step status. +5. **reporter** — Formats accumulated step results into a final answer. + Sets terminal ``plan_status`` based on how the loop ended. + +Plan state persists across A2A turns via the LangGraph checkpointer. +When the user or looper sends "continue", the router resumes execution +at the current step. Any other message triggers a replan that sees the +previous plan's progress. + +# TODO: Research explicit PlanStore approach as alternative to checkpointer. +# Pros of PlanStore: plan queryable outside graph (UI), full schema control, +# plan versioning independent of LangGraph internals. +# Cons: more code, risk of plan/checkpointer state divergence, need custom +# persistence layer. Current approach (A) uses checkpointer for atomic +# state which is simpler and less error-prone. +""" + +from __future__ import annotations + +import json +import logging +import re +import uuid +from typing import Any, TypedDict + +from langchain_core.messages import AIMessage, SystemMessage, ToolMessage + +from sandbox_agent.budget import AgentBudget +from sandbox_agent import plan_store as ps + +# openai raises APIStatusError for non-2xx responses (e.g. 
402 from the budget proxy) +try: + from openai import APIStatusError as _APIStatusError +except ImportError: + _APIStatusError = None # type: ignore[assignment,misc] + + +def _is_budget_exceeded_error(exc: Exception) -> bool: + """Check if an exception is a 402 budget-exceeded from the LLM proxy.""" + if _APIStatusError and isinstance(exc, _APIStatusError): + return exc.status_code == 402 + return "budget_exceeded" in str(exc).lower() or "402" in str(exc) + +logger = logging.getLogger(__name__) + +# Sentinel text returned by the executor when all tool calls in a step have +# already been executed (dedup logic). This is an internal coordination +# message and must never appear in user-visible output. +_DEDUP_SENTINEL = ( + "Step completed — all requested tool calls " + "have been executed and results are available." +) + +import os as _os + +# Debug prompts: include full system prompt + message history in events. +# Disabled by default to reduce event size and prevent OOM on large sessions. +_DEBUG_PROMPTS = _os.environ.get("SANDBOX_DEBUG_PROMPTS", "1") == "1" + +# Messages that trigger plan resumption rather than replanning. +_CONTINUE_PHRASES = frozenset({ + "continue", "continue on the plan", "go on", "proceed", + "keep going", "next", "carry on", +}) + + +# --------------------------------------------------------------------------- +# PlanStep — structured per-step tracking +# --------------------------------------------------------------------------- + + +class PlanStep(TypedDict, total=False): + """A single step in the plan with status tracking.""" + index: int + description: str + status: str # "pending" | "running" | "done" | "failed" | "skipped" + tool_calls: list[str] + result_summary: str + iteration_added: int + + + +def _summarize_bound_tools(llm_with_tools: Any) -> list[dict[str, Any]]: + """Extract bound tool schemas from a LangChain RunnableBinding for debug display. 
+ + Returns a list of tool definitions in OpenAI format so the UI can show + exactly what tools + schemas the LLM receives. + """ + try: + # LangChain bind_tools stores tools in kwargs['tools'] + tools = getattr(llm_with_tools, "kwargs", {}).get("tools", []) + if not tools: + # Try first.kwargs for nested bindings + first = getattr(llm_with_tools, "first", None) + if first: + tools = getattr(first, "kwargs", {}).get("tools", []) + result = [] + for t in tools: + if isinstance(t, dict): + # Already in OpenAI format + result.append({ + "name": t.get("function", {}).get("name", "?"), + "description": t.get("function", {}).get("description", "")[:200], + "parameters": t.get("function", {}).get("parameters", {}), + }) + else: + # LangChain tool object + result.append({ + "name": getattr(t, "name", "?"), + "description": (getattr(t, "description", "") or "")[:200], + "parameters": getattr(t, "args_schema", {}) if hasattr(t, "args_schema") else {}, + }) + return result + except Exception: + return [] + + +def _make_plan_steps( + descriptions: list[str], iteration: int = 0 +) -> list[PlanStep]: + """Convert a list of step descriptions into PlanStep dicts.""" + return [ + PlanStep( + index=i, + description=desc, + status="pending", + tool_calls=[], + result_summary="", + iteration_added=iteration, + ) + for i, desc in enumerate(descriptions) + ] + + +def _plan_descriptions(plan_steps: list[PlanStep]) -> list[str]: + """Extract flat description list from plan_steps (for backward compat).""" + return [s.get("description", "") for s in plan_steps] + + +def _safe_format(template: str, **kwargs: Any) -> str: + """Format a prompt template, falling back to raw template on errors.""" + try: + return template.format(**kwargs) + except (KeyError, IndexError) as exc: + logger.warning("Prompt format error (%s), using raw template", exc) + return template + + +# --------------------------------------------------------------------------- +# Text-based tool call parser +# 
--------------------------------------------------------------------------- +# Some model servers (e.g. vLLM without --enable-auto-tool-choice) return +# tool invocations as text like: +# [shell(command="ls -la"), file_read(path="foo.py")] +# instead of structured tool_calls in the OpenAI response format. +# This parser converts that text into proper AIMessage.tool_calls so +# LangGraph's tools_condition routes to the ToolNode. +# --------------------------------------------------------------------------- + +# Matches: tool_name(key="value", key2="value2") +# Handles: shell("ls") (positional), shell(command="ls") (keyword) +_TOOL_CALL_RE = re.compile( + r'(\w+)\(([^)]*)\)', +) + +# Matches Llama 4 Scout format: [label, tool_name]{"key": "value"} +# Examples: [clone_repo, shell]{"command": "git clone ..."} +# [rca:ci, delegate]{"task": "analyze CI logs"} +_LABEL_TOOL_JSON_RE = re.compile( + r'\[[^\]]*,\s*(\w+)\]\s*(\{[^}]+\})', +) + +# Known tool names — only parse calls for tools we actually have +_KNOWN_TOOLS = {"shell", "file_read", "file_write", "grep", "glob", "web_fetch", "explore", "delegate"} + +# First-param defaults for tools that accept a positional argument +_POSITIONAL_PARAM = { + "shell": "command", + "file_read": "path", + "grep": "pattern", + "glob": "pattern", + "web_fetch": "url", + "explore": "query", + "delegate": "task", +} + + +def _parse_kwargs(args_str: str, tool_name: str) -> dict[str, Any]: + """Parse 'key="value", key2="value2"' or '"positional"' into a dict.""" + args_str = args_str.strip() + if not args_str: + return {} + + result: dict[str, Any] = {} + + # Try keyword arguments first: key="value" or key='value' + kw_pattern = re.compile(r'(\w+)\s*=\s*(?:"((?:[^"\\]|\\.)*)"|\'((?:[^\'\\]|\\.)*)\')') + kw_matches = kw_pattern.findall(args_str) + if kw_matches: + for key, val_dq, val_sq in kw_matches: + val = val_dq if val_dq else val_sq + val = val.replace('\\"', '"').replace("\\'", "'") + result[key] = val + return result + + # 
Positional: just a quoted string like "ls -la" or 'ls -la' + pos_match = re.match(r'^["\'](.+?)["\']$', args_str, re.DOTALL) + if pos_match: + param_name = _POSITIONAL_PARAM.get(tool_name, "input") + result[param_name] = pos_match.group(1).replace('\\"', '"') + return result + + # Unquoted positional (rare but handle it) + param_name = _POSITIONAL_PARAM.get(tool_name, "input") + result[param_name] = args_str + return result + + +def parse_text_tool_calls(content: str) -> list[dict[str, Any]]: + """Extract tool calls from text content. + + Returns a list of dicts matching LangChain ToolCall format: + [{"name": "shell", "args": {"command": "ls"}, "id": "...", "type": "tool_call"}] + + Returns empty list if no recognizable tool calls found. + """ + if not content: + return [] + + # Look for the pattern: [tool(...), tool(...)] or just tool(...) + # Strip surrounding brackets if present + text = content.strip() + if text.startswith("[") and text.endswith("]"): + text = text[1:-1].strip() + # Remove trailing comma + if text.endswith(","): + text = text[:-1].strip() + + calls = [] + + # Try Llama 4 format first: [label, tool_name]{"key": "value"} + for match in _LABEL_TOOL_JSON_RE.finditer(content): + tool_name = match.group(1) + json_str = match.group(2) + if tool_name not in _KNOWN_TOOLS: + continue + try: + args = json.loads(json_str) + if isinstance(args, dict): + calls.append({ + "name": tool_name, + "args": args, + "id": f"text-{uuid.uuid4().hex[:12]}", + "type": "tool_call", + }) + except json.JSONDecodeError: + continue + + if calls: + return calls + + # Fall back to legacy format: tool_name(args) + for match in _TOOL_CALL_RE.finditer(text): + tool_name = match.group(1) + args_str = match.group(2) + + if tool_name not in _KNOWN_TOOLS: + continue + + args = _parse_kwargs(args_str, tool_name) + calls.append({ + "name": tool_name, + "args": args, + "id": f"text-{uuid.uuid4().hex[:12]}", + "type": "tool_call", + }) + + return calls + + +def 
maybe_patch_tool_calls(response: AIMessage) -> AIMessage: + """If the response has no tool_calls but contains text-based calls, patch them in. + + Controlled by SANDBOX_TEXT_TOOL_PARSING env var (default: "1" = enabled). + """ + if response.tool_calls: + # Model returned structured tool_calls — use as-is + return response + + if _os.environ.get("SANDBOX_TEXT_TOOL_PARSING", "1") != "1": + return response + + content = response.content + if isinstance(content, list): + # Multi-part content — extract text parts + content = " ".join( + b.get("text", "") for b in content + if isinstance(b, dict) and b.get("type") == "text" + ) + + parsed = parse_text_tool_calls(content) + if not parsed: + return response + + logger.info( + "Parsed %d text-based tool call(s): %s", + len(parsed), + [c["name"] for c in parsed], + ) + + # Create a new AIMessage with the parsed tool_calls + return AIMessage( + content="", # Clear text content — tools will produce output + tool_calls=parsed, + ) + +# Default budget — used when no explicit budget is passed. +DEFAULT_BUDGET = AgentBudget() + + +# --------------------------------------------------------------------------- +# Prompts +# --------------------------------------------------------------------------- + +from sandbox_agent.prompts import ( + PLANNER_SYSTEM as _PLANNER_SYSTEM, + EXECUTOR_SYSTEM as _EXECUTOR_SYSTEM, + REFLECTOR_SYSTEM as _REFLECTOR_SYSTEM, + REPORTER_SYSTEM as _REPORTER_SYSTEM, +) + + +def _intercept_respond_to_user(response: Any, node_name: str) -> AIMessage | None: + """Check for respond_to_user escape tool in an LLM response. + + Llama 4 Scout always calls a tool when tools are bound, so + ``respond_to_user`` is the escape hatch for nodes that need to + produce text output (planner, reflector). + + Returns a *new* AIMessage with the extracted text content and no + tool_calls (so ``tools_condition`` routes correctly), or ``None`` + if no escape tool was found. 
+ """ + if not getattr(response, "tool_calls", None): + return None + + tool_names = [ + tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?") + for tc in response.tool_calls + ] + logger.info("%s called tools: %s", node_name, tool_names, + extra={"node": node_name.lower()}) + + for tc in response.tool_calls: + name = tc.get("name", "") if isinstance(tc, dict) else getattr(tc, "name", "") + if name == "respond_to_user": + args = tc.get("args", {}) if isinstance(tc, dict) else getattr(tc, "args", {}) + response_text = args.get("response", "") + logger.info( + "%s escaped via respond_to_user (%d chars)", node_name, len(response_text), + extra={"node": node_name.lower()}, + ) + # Return a clean AIMessage — no tool_calls so the graph + # routes to the next node instead of the tool node. + return AIMessage( + content=response_text, + response_metadata=getattr(response, "response_metadata", {}), + usage_metadata=getattr(response, "usage_metadata", None), + ) + + return None + + +# --------------------------------------------------------------------------- +# Node functions +# --------------------------------------------------------------------------- + + +async def router_node(state: dict[str, Any]) -> dict[str, Any]: + """Entry-point node: decide whether to resume, replan, or start fresh. + + Returns state updates that downstream conditional edges read via + :func:`route_entry`. 
+ """ + plan_status = state.get("plan_status", "") + plan_steps = state.get("plan_steps", []) + messages = state.get("messages", []) + + # Extract the latest user message text + last_text = "" + if messages: + content = getattr(messages[-1], "content", "") + if isinstance(content, list): + last_text = " ".join( + b.get("text", "") for b in content + if isinstance(b, dict) and b.get("type") == "text" + ) + else: + last_text = str(content) + last_text_lower = last_text.strip().lower() + + has_active_plan = plan_status == "awaiting_continue" and len(plan_steps) > 0 + is_continue = last_text_lower in _CONTINUE_PHRASES + + if has_active_plan and is_continue: + # Resume: mark next pending step as running + current_step = state.get("current_step", 0) + if current_step < len(plan_steps): + plan_steps = list(plan_steps) # copy for mutation + plan_steps[current_step] = {**plan_steps[current_step], "status": "running"} + logger.info( + "Router: RESUME plan at step %d/%d (plan_status=%s)", + current_step + 1, len(plan_steps), plan_status, + extra={"session_id": state.get("context_id", ""), "node": "router", + "current_step": current_step, "plan_status": plan_status}, + ) + return { + "_route": "resume", + "plan_steps": plan_steps, + "plan_status": "executing", + } + elif has_active_plan: + # Replan: new instruction arrives while plan exists + # Reset replan_count — this is a user-driven replan, not an agent loop + logger.info( + "Router: REPLAN — new message while plan active (plan_status=%s, steps=%d)", + plan_status, len(plan_steps), + extra={"session_id": state.get("context_id", ""), "node": "router", + "plan_status": plan_status}, + ) + return { + "_route": "replan", + "plan_status": "executing", + "original_request": last_text, + "replan_count": 0, + "recent_decisions": [], + } + else: + # New: no active plan + logger.info("Router: NEW plan (plan_status=%s)", plan_status, + extra={"session_id": state.get("context_id", ""), "node": "router", + "plan_status": plan_status}) 
+ return { + "_route": "new", + "plan_status": "executing", + "original_request": last_text, + } + + +def route_entry(state: dict[str, Any]) -> str: + """Conditional edge from router: resume → executor, else → planner.""" + route = state.get("_route", "new") + if route == "resume": + return "resume" + return "plan" # both "replan" and "new" go to planner + + +def _is_trivial_text_request(messages: list) -> bool: + """Detect requests that need no tools — just a text response. + + Matches patterns like "Say exactly: ...", "What was the marker?", + simple greetings, or questions that can be answered from conversation + context alone. + """ + if not messages: + return False + last = messages[-1] + content = getattr(last, "content", "") + if isinstance(content, list): + content = " ".join( + b.get("text", "") for b in content + if isinstance(b, dict) and b.get("type") == "text" + ) + text = str(content).strip().lower() + if not text: + return False + + # Patterns that clearly need no tools + trivial_patterns = ( + "say exactly", + "repeat ", + "what was the marker", + "what did i say", + "what did i tell", + "hello", + "hi", + "who are you", + ) + return any(text.startswith(p) or p in text for p in trivial_patterns) + + +async def planner_node( + state: dict[str, Any], + llm: Any, + budget: AgentBudget | None = None, +) -> dict[str, Any]: + """Decompose the user request into a numbered plan. + + On re-entry (iteration > 0), the planner also sees prior step results so + it can adjust the remaining plan. 
+ """ + if budget is None: + budget = DEFAULT_BUDGET + messages = state["messages"] + iteration = state.get("iteration", 0) + step_results = state.get("step_results", []) + + prev_plan_steps = state.get("plan_steps", []) + + # Fast-path: trivial text-only requests skip the planner LLM call entirely + if iteration == 0 and not prev_plan_steps and _is_trivial_text_request(messages): + logger.info("Fast-path: trivial text request — single-step plan, no LLM call", + extra={"session_id": state.get("context_id", ""), "node": "planner", + "iteration": 0, "step_count": 1, "plan_version": 1}) + trivial_steps = _make_plan_steps(["Respond to the user."], iteration=0) + store = ps.create_plan(["Respond to the user."], creator="planner") + return { + "plan": ["Respond to the user."], + "plan_steps": trivial_steps, + "plan_version": 1, + "current_step": 0, + "iteration": 1, + "done": False, + "_plan_store": store, + } + + # Build context for the planner — include previous plan with per-step status + context_parts = [] + if prev_plan_steps: + # Show the structured plan with per-step status + context_parts.append("Previous plan (with status):") + for prev_ps in prev_plan_steps: + idx = prev_ps.get("index", 0) + desc = prev_ps.get("description", "") + status = prev_ps.get("status", "pending").upper() + result = prev_ps.get("result_summary", "") + line = f" {idx+1}. 
[{status}] {desc}" + if result: + line += f" — {result[:150]}" + context_parts.append(line) + done_count = sum(1 for s in prev_plan_steps if s.get("status") == "done") + context_parts.append(f"Progress: {done_count}/{len(prev_plan_steps)} steps completed.") + context_parts.append("") + elif iteration > 0: + # Fallback: use flat plan list for backward compat + original_plan = state.get("plan", []) + current_step = state.get("current_step", 0) + if original_plan: + context_parts.append("Original plan:") + for i, step in enumerate(original_plan): + status = "DONE" if i < current_step else "PENDING" + context_parts.append(f" {i+1}. [{status}] {step}") + context_parts.append(f"Progress: {current_step}/{len(original_plan)} steps completed.") + context_parts.append("") + + if iteration > 0 or prev_plan_steps: + # Extract tool call history from messages + tool_history = [] + for msg in messages: + tool_calls = getattr(msg, "tool_calls", None) + if tool_calls: + for tc in tool_calls: + name = tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?") + args = tc.get("args", {}) if isinstance(tc, dict) else getattr(tc, "args", {}) + args_str = str(args)[:100] + tool_history.append(f" CALLED: {name}({args_str})") + if hasattr(msg, "name") and hasattr(msg, "content") and getattr(msg, "type", "") == "tool": + output = str(getattr(msg, "content", ""))[:200] + tool_history.append(f" RESULT ({msg.name}): {output}") + + if tool_history: + context_parts.append("Tool calls already executed (DO NOT repeat these):") + context_parts.extend(tool_history[-20:]) + context_parts.append("") + + if step_results: + context_parts.append("Previous step results:") + for i, result in enumerate(step_results, 1): + context_parts.append(f" Step {i}: {result}") + context_parts.append("") + + context_parts.append( + "Adjust the plan for remaining work. Do NOT repeat steps that already succeeded." 
+ ) + + system_content = _PLANNER_SYSTEM + if context_parts: + system_content += "\n" + "\n".join(context_parts) + + # Prepend skill instructions when a skill was loaded from metadata. + skill_instructions = state.get("skill_instructions", "") + if skill_instructions: + system_content = skill_instructions + "\n\n" + system_content + + from sandbox_agent.context_builders import build_planner_context, invoke_llm + + plan_messages = build_planner_context(state, system_content) + + try: + response, planner_capture = await invoke_llm( + llm, plan_messages, + node="planner", session_id=state.get("context_id", ""), + workspace_path=state.get("workspace_path", "/workspace"), + ) + except Exception as exc: + if _is_budget_exceeded_error(exc): + logger.warning("Budget exceeded in planner (402 from proxy): %s", exc, + extra={"session_id": state.get("context_id", ""), "node": "planner", + "iteration": iteration}) + return { + "messages": [AIMessage(content=f"Budget exceeded: {exc}")], + "done": True, + "_budget_summary": budget.summary(), + } + raise + + prompt_tokens = planner_capture.prompt_tokens + completion_tokens = planner_capture.completion_tokens + model_name = planner_capture.model + budget.add_tokens(prompt_tokens + completion_tokens) + + # Check for respond_to_user escape tool (needed for Llama 4 Scout). 
+ escaped = _intercept_respond_to_user(response, "Planner") + if escaped is not None: + response = escaped + elif getattr(response, 'tool_calls', None): + # Non-escape tools — pass through for graph tool execution + return { + "messages": [response], + **planner_capture.token_fields(), + "_budget_summary": budget.summary(), + **planner_capture.debug_fields(), + } + + plan = _parse_plan(response.content) + plan_version = state.get("plan_version", 0) + 1 + new_plan_steps = _make_plan_steps(plan, iteration=iteration) + store = ps.create_plan(plan, creator="planner" if iteration == 0 else "replanner") + + logger.info("Planner produced %d steps (iteration %d, version %d): %s", + len(plan), iteration, plan_version, plan, + extra={"session_id": state.get("context_id", ""), "node": "planner", + "iteration": iteration, "step_count": len(plan), + "plan_version": plan_version}) + + # On replan, preserve completed steps — don't restart from step 0. + # Find the first non-done step in the NEW plan to continue from. + # On first plan (no prev steps), start at 0. 
+ prev_steps = state.get("plan_steps", []) + if prev_steps: + # Replan: carry forward "done" status from previous steps that match + done_count = sum(1 for s in prev_steps if s.get("status") == "done") + start_step = min(done_count, len(new_plan_steps) - 1) if new_plan_steps else 0 + # Mark steps before start_step as done in new plan (they were done before) + for i in range(start_step): + if i < len(new_plan_steps): + new_plan_steps[i] = {**new_plan_steps[i], "status": "done"} + logger.info("Replan: preserving %d done steps, starting at step %d", + start_step, start_step + 1, + extra={"session_id": state.get("context_id", ""), "node": "planner"}) + else: + start_step = 0 + + return { + "messages": [response], + "plan": plan, + "plan_steps": new_plan_steps, + "plan_version": plan_version, + "current_step": start_step, + "iteration": iteration + 1, + "done": False, + "_plan_store": store, + **planner_capture.token_fields(), + "_budget_summary": budget.summary(), + **planner_capture.debug_fields(), + } + + +MAX_THINK_ACT_CYCLES = int(_os.environ.get("SANDBOX_MAX_THINK_ACT_CYCLES", + _os.environ.get("SANDBOX_MAX_TOOL_CALLS_PER_STEP", "20"))) +THINKING_ITERATION_BUDGET = int(_os.environ.get("SANDBOX_THINKING_ITERATION_BUDGET", "2")) +MAX_PARALLEL_TOOL_CALLS = int(_os.environ.get("SANDBOX_MAX_PARALLEL_TOOL_CALLS", "5")) + + +async def executor_node( + state: dict[str, Any], + llm_with_tools: Any, + budget: AgentBudget | None = None, + llm_reason: Any | None = None, +) -> dict[str, Any]: + """Execute the current plan step using the LLM with bound tools. + + When ``llm_reason`` is provided (thinking mode): + 1. Thinking loop: up to THINKING_ITERATION_BUDGET bare LLM iterations + 2. Micro-reasoning: LLM with tools (tool_choice=any) makes up to + MAX_PARALLEL_TOOL_CALLS parallel tool calls. 
+ """ + if budget is None: + budget = DEFAULT_BUDGET + plan = state.get("plan", []) + current_step = state.get("current_step", 0) + tool_call_count = state.get("_tool_call_count", 0) + + if current_step >= len(plan): + # No more steps — signal completion to reflector + return { + "messages": [AIMessage(content="All plan steps completed.")], + "current_step": current_step, + "done": True, + } + + # Guard: too many think-act cycles for this step — force completion + if tool_call_count >= MAX_THINK_ACT_CYCLES: + logger.warning( + "Step %d hit think-act cycle limit (%d/%d) — forcing step completion", + current_step, tool_call_count, MAX_THINK_ACT_CYCLES, + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step, "tool_call_count": tool_call_count}, + ) + result: dict[str, Any] = { + "messages": [AIMessage(content=f"Step {current_step + 1} reached think-act cycle limit ({MAX_THINK_ACT_CYCLES}). Moving to reflection.")], + "current_step": current_step, + "_tool_call_count": 0, + "_budget_summary": budget.summary(), + } + if _DEBUG_PROMPTS: + result["_system_prompt"] = f"[Think-act cycle limit reached — no LLM call]\nStep {current_step + 1}: {tool_call_count}/{MAX_THINK_ACT_CYCLES} cycles" + result["_prompt_messages"] = [{"role": "system", "preview": f"Step {current_step + 1} cycle limit ({tool_call_count}/{MAX_THINK_ACT_CYCLES})"}] + result["_llm_response"] = "[no LLM call — cycle limit]" + return result + + step_text = plan[current_step] + system_content = _safe_format( + _EXECUTOR_SYSTEM, + current_step=current_step + 1, + step_text=step_text, + tool_call_count=tool_call_count, + max_tool_calls=MAX_THINK_ACT_CYCLES, + workspace_path=state.get("workspace_path", "/workspace"), + ) + + # Prepend skill instructions when a skill was loaded from metadata. 
+ skill_instructions = state.get("skill_instructions", "") + if skill_instructions: + system_content = skill_instructions + "\n\n" + system_content + + # Check budget before making the LLM call (refresh from LiteLLM first) + + if budget.exceeded: + logger.warning("Budget exceeded in executor: %s", budget.exceeded_reason, + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step}) + result: dict[str, Any] = { + "messages": [AIMessage(content=f"Budget exceeded: {budget.exceeded_reason}")], + "current_step": current_step, + "done": True, + } + if _DEBUG_PROMPTS: + result["_system_prompt"] = f"[Budget exceeded — no LLM call]\n{budget.exceeded_reason}" + result["_prompt_messages"] = [{"role": "system", "preview": f"Budget exceeded: {budget.exceeded_reason}"}] + result["_llm_response"] = "[no LLM call — budget exceeded]" + return result + + # Step-scoped message context for the executor. + # + # On NEW step (tool_call_count == 0): + # Only the step brief as a HumanMessage — executor treats this as a + # fresh task. Does NOT see the plan, previous steps, or reflector msgs. + # + # On CONTINUING step (tool_call_count > 0): + # The step brief + this step's tool calls/results only. Walk backwards + # from current messages, stopping when we hit a non-tool/non-AI message + # (which marks the boundary of this step's context). 
+ + from sandbox_agent.context_builders import build_executor_context, invoke_with_tool_loop + + messages = build_executor_context(state, system_content) + + try: + response, capture, sub_events = await invoke_with_tool_loop( + llm_with_tools, llm_reason, messages, + node="executor", session_id=state.get("context_id", ""), + workspace_path=state.get("workspace_path", "/workspace"), + thinking_budget=THINKING_ITERATION_BUDGET, + max_parallel_tool_calls=MAX_PARALLEL_TOOL_CALLS, + ) + except Exception as exc: + if _is_budget_exceeded_error(exc): + logger.warning("Budget exceeded in executor (402 from proxy): %s", exc, + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step}) + return { + "messages": [AIMessage(content=f"Budget exceeded: {exc}")], + "current_step": current_step, + "done": True, + "_budget_summary": budget.summary(), + } + raise + + # Track no-tool executions — if the LLM produces text instead of + # tool calls, increment counter. After 2 consecutive no-tool runs + # for the same step, mark the step as failed and advance. + no_tool_count = state.get("_no_tool_count", 0) + + # Token usage and model from the capture (guaranteed to match what was sent) + prompt_tokens = capture.prompt_tokens + completion_tokens = capture.completion_tokens + model_name = capture.model + budget.add_tokens(prompt_tokens + completion_tokens) + + # If the model returned text-based tool calls instead of structured + # tool_calls (common with vLLM without --enable-auto-tool-choice), + # parse them so tools_condition routes to the ToolNode. + # Capture the pre-patch content for event serialization. + pre_patch_content = response.content + had_structured_tools = bool(response.tool_calls) + response = maybe_patch_tool_calls(response) + + # -- Enforce parallel tool call limit ----------------------------------------- + # Allow up to MAX_PARALLEL_TOOL_CALLS per think-act cycle. 
+ # invoke_with_tool_loop already enforces this in thinking mode, + # but single-phase mode needs the safety check here. + if len(response.tool_calls) > MAX_PARALLEL_TOOL_CALLS: + logger.info( + "Executor returned %d tool calls — keeping first %d (parallel limit)", + len(response.tool_calls), MAX_PARALLEL_TOOL_CALLS, + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step, "tool_call_count": tool_call_count}, + ) + response = AIMessage( + content=response.content, + tool_calls=response.tool_calls[:MAX_PARALLEL_TOOL_CALLS], + ) + + # -- Detect unparsed text tool call attempts (stall signal) ---------------- + # If the model wrote text that looks like a tool call but wasn't parsed, + # log a warning. The reflector will catch the zero-tool-call pattern. + if not response.tool_calls and pre_patch_content: + text_hint = str(pre_patch_content).lower() + if any(kw in text_hint for kw in ("shell(", "file_read(", "file_write(", + "```bash", "```shell", "i would run", + "i will execute", "let me run")): + logger.warning( + "Executor produced text resembling a tool call but no actual " + "tool_calls were generated — likely a stalled iteration", + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step, "tool_call_count": tool_call_count}, + ) + + # -- Loop detection: stop if the executor repeats the same tool call ---- + # With dedup removed (each call has unique LangGraph ID), we need to + # detect when the executor is stuck calling the same tool with the same + # args repeatedly. Check against the last 3 tool calls in this step. 
+ if response.tool_calls and tool_call_count > 0: + all_msgs = state.get("messages", []) + # Collect recent tool calls from this step (after boundary) + recent_calls: list[tuple[str, str]] = [] + for m in reversed(all_msgs): + content = str(getattr(m, "content", "")) + if isinstance(m, SystemMessage) and content.startswith(f"[STEP_BOUNDARY {current_step}]"): + break + if isinstance(m, AIMessage) and getattr(m, "tool_calls", None): + for tc in m.tool_calls: + recent_calls.append((tc["name"], repr(sorted(tc["args"].items())))) + if len(recent_calls) >= 3: + break + if len(recent_calls) >= 3: + break + + # Check if the current call matches any of the last 3 + for tc in response.tool_calls: + current_key = (tc["name"], repr(sorted(tc["args"].items()))) + repeat_count = sum(1 for rc in recent_calls if rc == current_key) + if repeat_count >= 2: + logger.warning( + "Loop detected: %s(%s) called %d times in last 3 — forcing step completion", + tc["name"], str(tc["args"])[:80], repeat_count + 1, + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step}, + ) + return { + "messages": [AIMessage( + content=f"Step {current_step + 1} stuck in loop: " + f"{tc['name']}() called {repeat_count + 1} times with same args. " + f"Moving to reflection." + )], + "current_step": current_step, + "_tool_call_count": 0, + "_budget_summary": budget.summary(), + } + + # Build parsed_tools list for event serialization when tools came + # from text parsing (not structured tool_calls). 
+ parsed_tools: list[dict[str, Any]] = [] + if not had_structured_tools and response.tool_calls: + parsed_tools = [ + {"name": tc["name"], "args": tc.get("args", {})} + for tc in response.tool_calls + ] + + # If no tool calls after patching, the executor is either: + # (a) Legitimately done with the step (summarizing results) — NORMAL + # (b) Stalled and unable to call tools — only if it never called ANY tool + # + # With micro-reflection, the executor may produce text after a failed + # tool call to summarize/report — that's valid step completion, not a stall. + if not response.tool_calls: + if tool_call_count > 0: + # Executor already called tools this step — text response means + # it's done summarizing. This is normal completion, not a stall. + logger.info( + "Executor produced text response after %d tool calls for step %d — step complete", + tool_call_count, current_step, + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step, "tool_call_count": tool_call_count}, + ) + else: + no_tool_count += 1 + logger.warning( + "Executor produced no tool calls for step %d (attempt %d/2)", + current_step, no_tool_count, + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step, "tool_call_count": 0}, + ) + if no_tool_count >= 2: + logger.warning("Executor failed to call tools after 2 attempts — marking step failed", + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step, "tool_call_count": 0}) + # Keep the actual LLM response (with text reasoning) for the UI. + # Append failure note but preserve the model's output for micro_reasoning. 
+ actual_content = str(response.content or "") + failure_note = f"\n\n[Step {current_step + 1} failed: executor could not call tools after 2 attempts.]" + return { + "messages": [AIMessage(content=actual_content + failure_note)], + "current_step": current_step, + "done": True if current_step + 1 >= len(plan) else False, + "_no_tool_count": 0, + **capture.debug_fields(), + } + else: + no_tool_count = 0 # reset on successful tool call + + # Increment think-act cycle count (each cycle = 1, regardless of parallel tool count) + new_tool_call_count = tool_call_count + 1 if response.tool_calls else tool_call_count + + # Extract last tool result for micro_reasoning context (shows WHY the + # agent made this decision in the UI event stream). + _last_tool_result = None + for m in reversed(state.get("messages", [])): + if isinstance(m, ToolMessage): + content_str = str(getattr(m, "content", "")) + _last_tool_result = { + "name": getattr(m, "name", "unknown"), + "output": content_str[:500], + "status": "error" if "EXIT_CODE:" in content_str else "success", + } + break + + # On first call (tool_call_count == 0), inject a SystemMessage boundary + # marker into state. SystemMessage is NOT sent to the LLM (the executor + # builds its own message list), but stays in state["messages"] so the + # windowing logic on subsequent calls can find where this step started. 
+ step_brief = state.get("skill_instructions", f"Execute step {current_step + 1}: {step_text}") + step_msgs: list = [] + if tool_call_count == 0: + step_msgs.append(SystemMessage(content=f"[STEP_BOUNDARY {current_step}] {step_brief[:500]}")) + + result: dict[str, Any] = { + "messages": step_msgs + [response], + "current_step": current_step, + **capture.token_fields(), + "_budget_summary": budget.summary(), + **capture.debug_fields(), + "_no_tool_count": no_tool_count, + "_tool_call_count": new_tool_call_count, + **({"_last_tool_result": _last_tool_result} if _last_tool_result else {}), + } + if sub_events: + result["_sub_events"] = sub_events + if parsed_tools: + result["parsed_tools"] = parsed_tools + return result + + +async def reflector_node( + state: dict[str, Any], + llm: Any, + budget: AgentBudget | None = None, +) -> dict[str, Any]: + """Review step output and decide whether to continue, replan, or finish. + + Parameters + ---------- + budget: + Optional :class:`AgentBudget` for enforcing iteration limits. + When the budget is exceeded the reflector forces ``done``. 
+ """ + if budget is None: + budget = DEFAULT_BUDGET + + plan = state.get("plan", []) + current_step = state.get("current_step", 0) + step_results = list(state.get("step_results", [])) + iteration = state.get("iteration", 0) + replan_count = state.get("replan_count", 0) + done = state.get("done", False) + recent_decisions = list(state.get("recent_decisions", [])) + store = state.get("_plan_store", {}) + + # If executor signaled done (ran out of steps), go straight to done + if done: + result: dict[str, Any] = {"done": True, "decision": "done", "assessment": "Executor signaled completion."} + if _DEBUG_PROMPTS: + result["_system_prompt"] = "[Executor signaled done — no LLM call]" + return result + + def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: + """Helper for early termination — marks current step partial/failed, rest skipped.""" + fd_ps = list(state.get("plan_steps", [])) + step_status = "failed" if mark_failed else "partial" + if current_step < len(fd_ps): + fd_ps[current_step] = {**fd_ps[current_step], "status": step_status} + for i in range(current_step + 1, len(fd_ps)): + if fd_ps[i].get("status") == "pending": + fd_ps[i] = {**fd_ps[i], "status": "skipped"} + logger.warning("%s — forcing done", reason, + extra={"session_id": state.get("context_id", ""), "node": "reflector", + "current_step": current_step, "replan_count": replan_count}) + result: dict[str, Any] = { + "step_results": step_results, + "plan_steps": fd_ps, + "current_step": current_step + 1, + "done": True, + "replan_count": replan_count, + "assessment": reason, + "decision": "done", + } + # Include prompt context so the UI can show why the reflector + # terminated early (budget, stall, duplicate output). 
+ if _DEBUG_PROMPTS: + result["_system_prompt"] = f"[Early termination — no LLM call]\n{reason}" + if store: + result["_plan_store"] = store + return result + + # Budget guard — force termination if ANY budget limit exceeded + + if budget.exceeded: + return _force_done(f"Budget exceeded: {budget.exceeded_reason}", mark_failed=True) + + # Count tool calls in this iteration (from executor's last message) + messages = state["messages"] + tool_calls_this_iter = 0 + last_content = "" + if messages: + last_msg = messages[-1] + tool_calls_this_iter = len(getattr(last_msg, "tool_calls", []) or []) + content = getattr(last_msg, "content", "") + if isinstance(content, list): + last_content = " ".join( + b.get("text", "") for b in content + if isinstance(b, dict) and b.get("type") == "text" + ) + else: + last_content = str(content) + + # Stall detection removed — the reflector's LLM call decides whether to + # continue, replan, or stop. Hardcoded stall guards were overriding the + # reflector's judgment and force-terminating sessions prematurely. + # The iteration limit and wall-clock limit are sufficient safeguards. + + # If last_content is empty (dedup path) or the old sentinel, recover the + # actual last tool result from the message history so the reflector sees real output. + if not last_content.strip() or _DEDUP_SENTINEL in last_content: + for msg in reversed(messages): + if isinstance(msg, ToolMessage): + last_content = str(getattr(msg, "content", "")) + logger.info("Reflector: substituted dedup sentinel with last tool result (%d chars)", + len(last_content), + extra={"session_id": state.get("context_id", ""), "node": "reflector", + "current_step": current_step}) + break + + step_results.append(last_content[:500]) + + step_text = plan[current_step] if current_step < len(plan) else "N/A" + plan_text = "\n".join(f"{i+1}. 
{s}" for i, s in enumerate(plan)) + results_text = last_content[:1000] + + # Hint: if the step result contains error signals, prepend a note + error_signals = ("error", "fatal", "failed", "exit_code", "stderr", "denied", "cannot") + if any(sig in results_text.lower() for sig in error_signals): + results_text = ( + "[NOTE: The step result below contains error indicators. " + "Consider 'replan' to try a different approach.]\n\n" + results_text + ) + + # Build replan history context — show the LLM what prior replans tried + replan_history_text = "" + if replan_count > 0: + replan_history_lines = [ + f"REPLAN HISTORY ({replan_count} prior replan(s)):" + ] + # Collect failed step summaries from plan_steps + for hist_ps in state.get("plan_steps", []): + if hist_ps.get("status") == "failed": + summary = hist_ps.get("result_summary", "no details") + replan_history_lines.append( + f" - Step {hist_ps.get('index', '?')+1} FAILED: {hist_ps.get('description', '?')[:80]}" + f" — {summary[:150]}" + ) + replan_history_lines.append( + "Do NOT repeat approaches that already failed. Try something fundamentally different," + " or choose 'done' to report partial results." + ) + replan_history_text = "\n".join(replan_history_lines) + + # Ask LLM to reflect + recent_str = ", ".join(recent_decisions[-5:]) if recent_decisions else "none" + # Build remaining steps text so reflector knows what's left + remaining = [f"{i+1}. 
{plan[i]}" for i in range(current_step + 1, len(plan))] + remaining_text = ", ".join(remaining[:5]) if remaining else "NONE — all steps complete" + + # Build step execution summary for reflector context + step_tool_calls = 0 + step_tools_used: set[str] = set() + step_errors = 0 + for msg in messages: + content = str(getattr(msg, "content", "")) + if isinstance(msg, SystemMessage) and content.startswith(f"[STEP_BOUNDARY {current_step}]"): + step_tool_calls = 0 + step_tools_used = set() + step_errors = 0 + continue + if isinstance(msg, AIMessage) and getattr(msg, "tool_calls", None): + for tc in msg.tool_calls: + step_tool_calls += 1 + name = tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?") + step_tools_used.add(name) + if isinstance(msg, ToolMessage): + if "EXIT_CODE:" in content and "EXIT_CODE: 0" not in content: + step_errors += 1 + + step_summary = ( + f"Step execution summary: {step_tool_calls} tool calls using {', '.join(sorted(step_tools_used)) or 'none'}, " + f"{step_errors} errors" + ) + + system_content = _safe_format( + _REFLECTOR_SYSTEM, + plan_text=plan_text, + current_step=current_step + 1, + total_steps=len(plan), + step_text=step_text, + step_result=results_text, + remaining_steps=remaining_text, + iteration=iteration, + max_iterations=budget.max_iterations, + replan_count=replan_count, + tool_calls_this_iter=tool_calls_this_iter, + recent_decisions=recent_str, + replan_history=replan_history_text, + ) + system_content = step_summary + "\n\n" + system_content + from sandbox_agent.context_builders import build_reflector_context, invoke_llm + + reflect_messages = build_reflector_context(state, system_content) + try: + response, capture = await invoke_llm( + llm, reflect_messages, + node="reflector", session_id=state.get("context_id", ""), + workspace_path=state.get("workspace_path", "/workspace"), + ) + except Exception as exc: + if _is_budget_exceeded_error(exc): + logger.warning("Budget exceeded in reflector (402 from proxy): 
%s", exc, + extra={"session_id": state.get("context_id", ""), "node": "reflector", + "current_step": current_step, "replan_count": replan_count}) + return _force_done(f"Budget exceeded: {exc}") + raise + + prompt_tokens = capture.prompt_tokens + completion_tokens = capture.completion_tokens + model_name = capture.model + budget.add_tokens(prompt_tokens + completion_tokens) + + # Check for respond_to_user escape tool (needed for Llama 4 Scout). + escaped = _intercept_respond_to_user(response, "Reflector") + if escaped is not None: + response = escaped + elif getattr(response, 'tool_calls', None): + # Non-escape tools — pass through for graph tool execution + return { + "messages": [response], + **capture.token_fields(), + "_budget_summary": budget.summary(), + **capture.debug_fields(), + } + + decision = _parse_decision(response.content) + + # Guard: if the LLM says "done" but there are remaining plan steps, + # override to "continue". The LLM (esp. Llama 4 Scout) often confuses + # "step completed" with "task completed". 
+ steps_remaining = len(plan) - (current_step + 1) + if decision == "done" and steps_remaining > 0: + logger.warning( + "Reflector said 'done' but %d plan steps remain — overriding to 'continue'", + steps_remaining, + extra={"session_id": state.get("context_id", ""), "node": "reflector", + "decision": "done->continue", "current_step": current_step, + "replan_count": replan_count}, + ) + decision = "continue" + + recent_decisions.append(decision) + recent_decisions = recent_decisions[-10:] + + # Update plan_steps with per-step status + plan_steps = list(state.get("plan_steps", [])) + # Extract tool names used in this step from messages + step_tools: list[str] = [] + for msg in messages: + for tc in getattr(msg, "tool_calls", []) or []: + name = tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?") + if name not in step_tools: + step_tools.append(name) + + if current_step < len(plan_steps): + cur_ps = {**plan_steps[current_step]} + cur_ps["tool_calls"] = step_tools + cur_ps["result_summary"] = last_content[:200] + plan_steps[current_step] = cur_ps + + logger.info( + "Reflector decision: %s (step %d/%d, iter %d, replans=%d, tools=%d, recent=%s)", + decision, current_step + 1, len(plan), iteration, + replan_count, tool_calls_this_iter, + recent_decisions[-3:], + extra={"session_id": state.get("context_id", ""), "node": "reflector", + "decision": decision, "current_step": current_step, + "replan_count": replan_count, "iteration": iteration}, + ) + + base_result: dict[str, Any] = { + "messages": [response], + "step_results": step_results, + "recent_decisions": recent_decisions, + "plan_steps": plan_steps, + **capture.token_fields(), + "_budget_summary": budget.summary(), + **capture.debug_fields(), + } + + # Update PlanStore status (parallel to plan_steps updates below) + step_key = str(current_step + 1) + if store: + try: + if decision in ("done", "continue"): + store = ps.set_step_status(store, step_key, "done") + elif decision == "replan": + store 
= ps.set_step_status(store, step_key, "failed") + elif decision == "retry": + store = ps.set_step_status(store, step_key, "running") + except ValueError: + logger.warning("PlanStore: step %s not found (replan?), skipping status update", + step_key, extra={"session_id": state.get("context_id", ""), "node": "reflector"}) + base_result["_plan_store"] = store + + if decision == "done": + # Mark current step done, remaining as skipped + if current_step < len(plan_steps): + plan_steps[current_step] = {**plan_steps[current_step], "status": "done"} + for i in range(current_step + 1, len(plan_steps)): + if plan_steps[i].get("status") == "pending": + plan_steps[i] = {**plan_steps[i], "status": "skipped"} + return { + **base_result, + "plan_steps": plan_steps, + "current_step": current_step + 1, + "done": True, + "replan_count": replan_count, + } + elif decision == "retry": + # Retry: re-execute current step with fresh context. + # Mark step as "retrying" (not failed) — executor gets another chance. 
+ if current_step < len(plan_steps): + cur_ps = plan_steps[current_step] + retry_count = cur_ps.get("retry_count", 0) + 1 + plan_steps[current_step] = { + **cur_ps, + "status": "retrying", + "retry_count": retry_count, + } + logger.info("Retry step %d (attempt %d) — re-executing with different approach", + current_step + 1, plan_steps[current_step].get("retry_count", 1), + extra={"session_id": state.get("context_id", ""), "node": "reflector", + "decision": "retry", "current_step": current_step}) + return { + **base_result, + "plan_steps": plan_steps, + "done": False, + "replan_count": replan_count, + "_tool_call_count": 0, # reset tool calls for retry + } + elif decision == "replan": + new_replan_count = replan_count + 1 + # Mark current step failed + if current_step < len(plan_steps): + plan_steps[current_step] = {**plan_steps[current_step], "status": "failed"} + logger.info("Replan %d — routing back to planner", new_replan_count, + extra={"session_id": state.get("context_id", ""), "node": "reflector", + "decision": "replan", "current_step": current_step, + "replan_count": new_replan_count}) + return { + **base_result, + "plan_steps": plan_steps, + "done": False, + "replan_count": new_replan_count, + } + else: + # Continue: mark current step done, advance + if current_step < len(plan_steps): + plan_steps[current_step] = {**plan_steps[current_step], "status": "done"} + next_step = current_step + 1 + if next_step < len(plan_steps): + plan_steps[next_step] = {**plan_steps[next_step], "status": "running"} + if next_step >= len(plan): + # All steps done — route to done (reporter will summarize). + # Mark all steps done. 
async def reporter_node(
    state: dict[str, Any],
    llm: Any,
    budget: AgentBudget | None = None,
    llm_reason: Any | None = None,
    tools: list | None = None,
) -> dict[str, Any]:
    """Format accumulated step results into a final answer.

    Sets ``plan_status`` from the tracked step outcomes:

    - every tracked step finished as ``done`` (or no steps were tracked)
      → ``"completed"``
    - any step failed / partial / still pending → ``"awaiting_continue"``
      so the user or looper can resume the task

    When ``llm_reason`` is provided, uses ``invoke_with_tool_loop`` for
    thinking iterations and read-only tool calls (file verification).
    Falls back to a single ``invoke_llm`` call when ``llm_reason`` is None.

    Parameters
    ----------
    state:
        LangGraph state dict (messages, plan, step_results, plan_steps, ...).
    llm:
        Chat model used to produce the user-facing summary.
    budget:
        Token/iteration budget tracker; defaults to ``DEFAULT_BUDGET``.
    llm_reason:
        Optional reasoning model enabling the tool-loop path.
    tools:
        Optional read-only tools available during the tool loop.

    Returns
    -------
    dict[str, Any]
        State update containing ``final_answer``, ``plan_status``,
        ``files_touched``, token accounting, and optional ``_sub_events``
        and ``_plan_store`` entries.
    """
    if budget is None:
        budget = DEFAULT_BUDGET
    store = state.get("_plan_store", {})
    plan = ps.to_flat_plan(store) if store else state.get("plan", [])
    step_results = state.get("step_results", [])
    plan_steps = state.get("plan_steps", [])

    # Determine terminal plan_status based on step outcomes.
    # "completed" only when every tracked step finished as "done"; any
    # failed/partial/pending step leaves the task resumable. (The previous
    # chain had an unreachable final "completed" branch — the elif condition
    # already covered every remaining case.)
    if plan_steps:
        done_count = sum(1 for s in plan_steps if s.get("status") == "done")
        terminal_status = (
            "completed" if done_count == len(plan_steps) else "awaiting_continue"
        )
    else:
        terminal_status = "completed"

    def _budget_exhausted_result() -> dict[str, Any]:
        # Shared fallback when the LLM proxy rejects the call with a 402.
        # Previously duplicated verbatim in both invocation branches.
        note = "Task completed (budget exhausted before final summary)."
        return {
            "messages": [AIMessage(content=note)],
            "final_answer": note,
            "plan_status": terminal_status,
            "done": True,
            "_budget_summary": budget.summary(),
        }

    # Filter out internal dedup sentinel from step_results so it never
    # reaches the reporter prompt or the final answer.
    step_results = [r for r in step_results if _DEDUP_SENTINEL not in r]

    # Always run LLM to produce a user-facing summary.
    # Previous code had a shortcut for single-step plans that passed through
    # the last message directly, but this leaked reflector reasoning text.
    if not step_results and not state.get("messages"):
        return {"final_answer": "No response generated.", "plan_status": terminal_status}

    plan_text = "\n".join(f"{i+1}. {s}" for i, s in enumerate(plan))
    results_text = "\n".join(
        f"Step {i+1}: {r}" for i, r in enumerate(step_results)
    )

    # Build a per-step status summary from plan_steps for the prompt.
    step_status_lines = []
    has_partial = False
    for rpt_ps in plan_steps:
        idx = rpt_ps.get("index", 0)
        status = rpt_ps.get("status", "unknown").upper()
        if status == "PARTIAL":
            has_partial = True
        desc = rpt_ps.get("description", "")[:80]
        result = rpt_ps.get("result_summary", "")[:100]
        line = f"{idx+1}. [{status}] {desc}"
        if result and status in ("FAILED", "PARTIAL"):
            line += f" — {result}"
        step_status_lines.append(line)
    step_status_text = "\n".join(step_status_lines) if step_status_lines else "No step status available."

    # Add context when the agent hit its step limit so the LLM does not
    # dismiss partial progress.
    done_count = sum(1 for s in plan_steps if s.get("status") == "done")
    limit_note = ""
    if has_partial:
        limit_note = (
            f"NOTE: The agent reached its step limit after {done_count} completed steps. "
            "Summarize ALL results obtained so far — do not dismiss the work done."
        )

    system_content = _safe_format(
        _REPORTER_SYSTEM,
        plan_text=plan_text,
        step_status_text=step_status_text,
        results_text=results_text,
        limit_note=limit_note,
    )
    # Filter dedup sentinel messages from conversation history passed to the
    # reporter LLM so it cannot echo them in the final answer.
    filtered_msgs = [
        m for m in state["messages"]
        if _DEDUP_SENTINEL not in str(getattr(m, "content", ""))
    ]
    reporter_messages = [SystemMessage(content=system_content)] + filtered_msgs

    # Use invoke_with_tool_loop when llm_reason is available (thinking mode),
    # otherwise fall back to a single invoke_llm call.
    sub_events: list[dict[str, Any]] = []
    if llm_reason is not None:
        from sandbox_agent.context_builders import invoke_with_tool_loop

        try:
            response, capture, sub_events = await invoke_with_tool_loop(
                llm, llm_reason, reporter_messages,
                node="reporter", session_id=state.get("context_id", ""),
                workspace_path=state.get("workspace_path", "/workspace"),
                thinking_budget=2,
                max_parallel_tool_calls=3,
                max_cycles=3,
                tools=tools,
            )
        except Exception as exc:
            if _is_budget_exceeded_error(exc):
                logger.warning("Budget exceeded in reporter (402 from proxy): %s", exc,
                               extra={"session_id": state.get("context_id", ""), "node": "reporter"})
                return _budget_exhausted_result()
            raise
    else:
        from sandbox_agent.context_builders import invoke_llm

        try:
            response, capture = await invoke_llm(
                llm, reporter_messages,
                node="reporter", session_id=state.get("context_id", ""),
                workspace_path=state.get("workspace_path", "/workspace"),
            )
        except Exception as exc:
            if _is_budget_exceeded_error(exc):
                logger.warning("Budget exceeded in reporter (402 from proxy): %s", exc,
                               extra={"session_id": state.get("context_id", ""), "node": "reporter"})
                return _budget_exhausted_result()
            raise

    budget.add_tokens(capture.prompt_tokens + capture.completion_tokens)

    # Handle respond_to_user escape tool (Llama 4 Scout always calls tools)
    escaped = _intercept_respond_to_user(response, "Reporter")
    if escaped is not None:
        response = escaped
    elif getattr(response, 'tool_calls', None):
        # Response has real tool calls — return to graph for tool execution
        return {
            "messages": [response],
            **capture.token_fields(),
            "_budget_summary": budget.summary(),
            **capture.debug_fields(),
        }

    # Flatten content-block lists (tool-calling models) into plain text.
    content = response.content
    if isinstance(content, list):
        text = " ".join(
            b.get("text", "") for b in content
            if isinstance(b, dict) and b.get("type") == "text"
        )
    else:
        text = str(content)

    # Extract files touched from tool call history
    files_touched: list[str] = []
    for msg in state.get("messages", []):
        for tc in getattr(msg, "tool_calls", []) or []:
            name = tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?")
            args = tc.get("args", {}) if isinstance(tc, dict) else getattr(tc, "args", {})
            if name in ("file_write", "file_read"):
                path = args.get("path", "")
                if path and path not in files_touched:
                    files_touched.append(path)
            elif name == "shell":
                cmd = args.get("command", "")
                # Extract file paths from common shell redirection patterns
                # (uses the module-level ``re`` import; previously re-imported
                # inside this loop on every matching message).
                for match in re.findall(r'(?:>|>>|tee)\s+(\S+)', cmd):
                    if match not in files_touched:
                        files_touched.append(match)

    logger.info("Reporter: plan_status=%s (done=%d, failed=%d, total=%d)",
                terminal_status,
                sum(1 for s in plan_steps if s.get("status") == "done"),
                sum(1 for s in plan_steps if s.get("status") == "failed"),
                len(plan_steps),
                extra={"session_id": state.get("context_id", ""), "node": "reporter"})

    result: dict[str, Any] = {
        "messages": [response],
        "final_answer": text,
        "plan_status": terminal_status,
        "files_touched": files_touched[:30],  # cap at 30 files
        **capture.token_fields(),
        "_budget_summary": budget.summary(),
        **capture.debug_fields(),
    }
    if sub_events:
        result["_sub_events"] = sub_events
    if store:
        result["_plan_store"] = store
    return result
# ---------------------------------------------------------------------------
# Routing function for reflector conditional edges
# ---------------------------------------------------------------------------


def route_reflector(state: dict[str, Any]) -> str:
    """Route from the reflector to the next graph node.

    Mapping of reflector outcomes to edges:

    ``done``     → reporter (final answer)
    ``replan``   → planner (create new plan, preserving done steps)
    ``retry``    → step_selector (re-run current step, different approach)
    ``continue`` → step_selector (advance to the next step)
    """
    if state.get("done", False):
        return "done"
    # Look at the reflector's most recent decision to tell continue vs
    # replan vs retry apart.
    latest = (state.get("recent_decisions") or ["continue"])[-1]
    # "retry" and "continue" both go to the step selector — retry keeps
    # current_step the same, continue advances it.
    return "replan" if latest == "replan" else "execute"


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _coerce_text(content: str | list) -> str:
    """Flatten LLM output into plain text.

    Tool-calling models return lists of content blocks; their ``text``
    blocks are joined with spaces. Anything else is stringified as-is.
    """
    if isinstance(content, list):
        return " ".join(
            block.get("text", "")
            for block in content
            if isinstance(block, dict) and block.get("type") == "text"
        )
    return str(content)


def _parse_plan(content: str | list) -> list[str]:
    """Extract numbered steps from LLM output.

    Accepts both plain strings and content-block lists (tool-calling models).
    Returns a list of step descriptions; if no numbered lines parse, the
    whole response (truncated to 500 chars) becomes a single step.
    """
    text = _coerce_text(content)

    steps: list[str] = []
    for raw_line in text.strip().splitlines():
        line = raw_line.strip()
        # Only lines shaped like "1. Do X" / "2) Do Y": a digit first and a
        # '.' or ')' delimiter within the first four characters.
        if not line or len(line) <= 2 or not line[0].isdigit():
            continue
        for pos in range(min(4, len(line))):
            if line[pos] in ".)":
                step = line[pos + 1:].strip()
                if step:
                    steps.append(step)
                break

    return steps or [text.strip()[:500]]


def _parse_decision(content: str | list) -> str:
    """Extract the reflector decision from LLM output.

    Returns one of: ``continue``, ``retry``, ``replan``, ``done``, ``hitl``.
    Defaults to ``continue`` if the output is ambiguous.
    """
    lowered = _coerce_text(content).strip().lower()
    # First keyword found wins, in this fixed priority order.
    return next(
        (word for word in ("done", "retry", "replan", "hitl", "continue")
         if word in lowered),
        "continue",
    )


_BARE_DECISION_RE = re.compile(r'^(continue|retry|replan|done|hitl)\s*$', re.IGNORECASE)
+ +The Landlock restrictions are: +- rw_paths: workspace directory + session-specific /tmp +- ro_paths: system directories needed for basic command execution + +There is NO fallback. If Landlock fails, the subprocess fails. +""" + +from __future__ import annotations + +import asyncio +import hashlib +import logging +import os +import sys +import textwrap +from pathlib import Path + +logger = logging.getLogger(__name__) + +# Maximum output size to capture (prevent OOM on runaway commands) +_MAX_OUTPUT_BYTES = 10 * 1024 * 1024 # 10 MB + + +async def sandboxed_subprocess( + command: str, + workspace_path: str, + timeout: float = 120.0, + env: dict[str, str] | None = None, +) -> tuple[int, str, str]: + """Execute a command inside a Landlock-restricted subprocess. + + Forks a child process that: + 1. Applies Landlock restricting filesystem access to workspace + system dirs + 2. Executes the command via shell + + Parameters + ---------- + command: + Shell command string to execute. + workspace_path: + Absolute path to the session workspace (read-write). + timeout: + Maximum execution time in seconds. + env: + Optional extra environment variables for the child. + + Returns + ------- + tuple[int, str, str] + (returncode, stdout, stderr) + + Raises + ------ + OSError + If Landlock application fails in the child (propagated via non-zero exit). 
+ """ + # Create session-specific tmp directory + # Use a hash of workspace_path to create a unique tmp dir + ws_hash = hashlib.sha256(workspace_path.encode()).hexdigest()[:12] + session_tmp = f"/tmp/sandbox_{ws_hash}" + Path(session_tmp).mkdir(parents=True, exist_ok=True) + + # Build the child script that applies Landlock then execs the command + # The child script is passed via -c to the Python interpreter + child_script = textwrap.dedent("""\ + import os + import subprocess + import sys + + # Import the landlock module from the package + sys.path.insert(0, os.environ["_LANDLOCK_PYTHONPATH"]) + from sandbox_agent.landlock_ctypes import apply_landlock + + workspace = os.environ["SANDBOX_WORKSPACE"] + session_tmp = os.environ["SANDBOX_TMP"] + + # Collect read-only system paths that exist + ro_paths = [] + for p in ["/usr", "/bin", "/lib", "/lib64", "/opt", "/etc", + "/proc", "/dev/null", "/dev/urandom", "/app"]: + if os.path.exists(p): + ro_paths.append(p) + + # Add Python prefix for stdlib access + prefix = sys.prefix + if os.path.exists(prefix) and prefix not in ro_paths: + ro_paths.append(prefix) + + # Apply Landlock -- NO try/except, hard fail if this fails + apply_landlock( + rw_paths=[workspace, session_tmp], + ro_paths=ro_paths, + ) + + # Execute the user command + result = subprocess.run( + os.environ["_LANDLOCK_COMMAND"], + shell=True, + cwd=workspace, + capture_output=True, + timeout=float(os.environ.get("_LANDLOCK_TIMEOUT", "120")), + ) + + # Write stdout and stderr to fds 1 and 2 + sys.stdout.buffer.write(result.stdout) + sys.stderr.buffer.write(result.stderr) + sys.exit(result.returncode) + """) + + # Build environment for the child process + child_env = dict(os.environ) + if env: + child_env.update(env) + + # Find package source directory for PYTHONPATH + package_src = str(Path(__file__).resolve().parent.parent) + + child_env["SANDBOX_WORKSPACE"] = workspace_path + child_env["SANDBOX_TMP"] = session_tmp + child_env["_LANDLOCK_PYTHONPATH"] = 
package_src + child_env["_LANDLOCK_COMMAND"] = command + child_env["_LANDLOCK_TIMEOUT"] = str(timeout) + + try: + process = await asyncio.create_subprocess_exec( + sys.executable, "-c", child_script, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + env=child_env, + cwd=workspace_path, + ) + + try: + stdout_bytes, stderr_bytes = await asyncio.wait_for( + process.communicate(), + timeout=timeout + 5, # extra margin for Landlock setup + ) + except asyncio.TimeoutError: + try: + process.kill() + except ProcessLookupError: + pass + await process.wait() + return ( + -1, + "", + f"Sandboxed command timed out after {timeout} seconds: '{command}'", + ) + + stdout = (stdout_bytes or b"")[:_MAX_OUTPUT_BYTES].decode("utf-8", errors="replace") + stderr = (stderr_bytes or b"")[:_MAX_OUTPUT_BYTES].decode("utf-8", errors="replace") + returncode = process.returncode if process.returncode is not None else -1 + + return (returncode, stdout, stderr) + + except OSError as exc: + return (-1, "", f"Failed to start sandboxed subprocess: {exc}") From 5c0ff33ef87b1d175f669eade2c2862fc2d9ba1f Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:49:10 +0100 Subject: [PATCH 19/26] feat(sandbox): sources.json capability loader for package managers, registries, and runtime limits Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/sources.py | 129 ++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/sources.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/sources.py b/a2a/sandbox_agent/src/sandbox_agent/sources.py new file mode 100644 index 00000000..bd2bf68f --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/sources.py @@ -0,0 +1,129 @@ +"""Capability loader for sources.json. + +sources.json is baked into the agent container image and declares what +resources exist on the image: package managers, registries, git remotes, +web domains, and runtime limits. 
"""Capability loader for sources.json.

sources.json is baked into the agent container image and declares what
resources exist on the image: package managers, registries, git remotes,
web domains, and runtime limits. The sandbox executor uses it alongside
settings.json -- settings.json controls what operations are *allowed*,
sources.json controls what resources are *available*.
"""

import json
from dataclasses import dataclass, field
from fnmatch import fnmatch
from pathlib import Path
from typing import Any


_DEFAULT_MAX_EXECUTION_TIME_SECONDS = 300
_DEFAULT_MAX_MEMORY_MB = 2048


@dataclass(frozen=True)
class SourcesConfig:
    """Structured, read-only view over a parsed ``sources.json``."""

    # Raw parsed JSON; queried lazily by the accessor methods below.
    _data: dict[str, Any] = field(default_factory=dict, repr=False)

    # ------------------------------------------------------------------
    # Construction helpers
    # ------------------------------------------------------------------

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SourcesConfig":
        """Create a *SourcesConfig* from a parsed JSON dictionary."""
        return cls(_data=data)

    @classmethod
    def from_file(cls, path: Path) -> "SourcesConfig":
        """Load a *SourcesConfig* from a ``sources.json`` file on disk."""
        with open(path, encoding="utf-8") as handle:
            return cls.from_dict(json.load(handle))

    # ------------------------------------------------------------------
    # Package-manager queries
    # ------------------------------------------------------------------

    def _manager_entry(self, name: str) -> "dict[str, Any] | None":
        # Lookup helper shared by the package-manager predicates.
        return self._data.get("package_managers", {}).get(name)

    def is_package_manager_enabled(self, name: str) -> bool:
        """Return *True* if the named package manager is enabled."""
        entry = self._manager_entry(name)
        return entry is not None and bool(entry.get("enabled", False))

    def is_package_blocked(self, manager: str, package: str) -> bool:
        """Return *True* if *package* is on the block-list for *manager*."""
        entry = self._manager_entry(manager)
        if entry is None:
            return False
        return package in entry.get("blocked_packages", [])

    # ------------------------------------------------------------------
    # Git-remote queries
    # ------------------------------------------------------------------

    def is_git_remote_allowed(self, url: str) -> bool:
        """Return *True* if *url* matches one of the ``allowed_remotes`` patterns.

        Pattern matching uses :func:`fnmatch.fnmatch`. If git access is
        disabled in the config the method always returns *False*.
        """
        git_cfg: dict[str, Any] = self._data.get("git", {})
        if not git_cfg.get("enabled", False):
            return False
        for pattern in git_cfg.get("allowed_remotes", []):
            if fnmatch(url, pattern):
                return True
        return False

    # ------------------------------------------------------------------
    # Web-access queries
    # ------------------------------------------------------------------

    def is_web_access_enabled(self) -> bool:
        """Return *True* if web access is enabled."""
        web_cfg = self._data.get("web_access", {})
        return bool(web_cfg.get("enabled", False))

    def is_domain_allowed(self, domain: str) -> bool:
        """Return *True* if *domain* matches the allowed_domains list.

        Uses :func:`fnmatch.fnmatch` for pattern matching (e.g. ``*.github.com``).
        Returns *False* if web access is disabled; a blocked-domain match
        always wins over an allowed-domain match.
        """
        web_cfg: dict[str, Any] = self._data.get("web_access", {})
        if not web_cfg.get("enabled", False):
            return False
        # Deny-list takes precedence over the allow-list.
        if any(fnmatch(domain, pat) for pat in web_cfg.get("blocked_domains", [])):
            return False
        return any(fnmatch(domain, pat) for pat in web_cfg.get("allowed_domains", []))

    # ------------------------------------------------------------------
    # Runtime-limit properties
    # ------------------------------------------------------------------

    @property
    def max_execution_time_seconds(self) -> int:
        """Maximum execution time for a single run, in seconds."""
        limits: dict[str, Any] = self._data.get("runtime", {})
        return int(limits.get("max_execution_time_seconds",
                              _DEFAULT_MAX_EXECUTION_TIME_SECONDS))

    @property
    def max_memory_mb(self) -> int:
        """Maximum memory for a single run, in megabytes."""
        limits: dict[str, Any] = self._data.get("runtime", {})
        return int(limits.get("max_memory_mb", _DEFAULT_MAX_MEMORY_MB))
**delegate**: Multi-mode delegation with 4 strategies: + - in-process: LangGraph subgraph, shared filesystem (fast) + - shared-pvc: Separate pod with parent's PVC mounted + - isolated: Separate pod via SandboxClaim (full isolation) + - sidecar: New container in parent pod + + The LLM auto-selects the best mode, or the caller can specify. +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +import subprocess +import uuid +from pathlib import Path +from typing import Any, Optional + +import asyncpg +from langchain_core.messages import HumanMessage, SystemMessage +from langchain_core.tools import tool +from langgraph.graph import MessagesState, StateGraph +from langgraph.prebuilt import ToolNode, tools_condition + +logger = logging.getLogger(__name__) + +# Maximum iterations for in-process sub-agents +_MAX_SUB_AGENT_ITERATIONS = 15 + +# Delegation mode configuration +_DELEGATION_MODES = os.environ.get( + "DELEGATION_MODES", "in-process,shared-pvc,isolated,sidecar" +).split(",") +_DEFAULT_MODE = os.environ.get("DEFAULT_DELEGATION_MODE", "in-process") + +# Maximum iterations for in-process sub-agents to prevent runaway loops. +_MAX_SUB_AGENT_ITERATIONS = 15 + + +# --------------------------------------------------------------------------- +# In-process sub-agent: explore (C20, mode 1) +# --------------------------------------------------------------------------- + + +def _make_explore_tools(workspace: str) -> list[Any]: + """Build a read-only tool set for the explore sub-agent.""" + ws_root = Path(workspace).resolve() + + @tool + async def grep(pattern: str, path: str = ".") -> str: + """Search for a regex pattern in files under the workspace. + + Args: + pattern: Regex pattern to search for. + path: Relative path to search in (default: workspace root). + + Returns: + Matching lines with file paths and line numbers. 
+ """ + target = (ws_root / path).resolve() + if not target.is_relative_to(ws_root): + return "Error: path resolves outside the workspace." + + try: + result = subprocess.run( + ["grep", "-rn", "--include=*.py", "--include=*.md", + "--include=*.yaml", "--include=*.yml", "--include=*.json", + "--include=*.txt", "--include=*.sh", "--include=*.go", + pattern, str(target)], + capture_output=True, text=True, timeout=30, + cwd=str(ws_root), + ) + output = result.stdout[:10000] + if not output: + return f"No matches found for pattern '{pattern}'" + return output + except subprocess.TimeoutExpired: + return "Search timed out after 30 seconds." + except FileNotFoundError: + return "grep command not available." + + @tool + async def read_file(path: str) -> str: + """Read a file from the workspace (read-only). + + Args: + path: Relative path within the workspace. + + Returns: + File contents (truncated to 20000 chars). + """ + resolved = (ws_root / path).resolve() + if not str(resolved).startswith(str(ws_root)): + return "Error: path resolves outside the workspace." + if not resolved.is_file(): + return f"Error: file not found at '{path}'." + try: + content = resolved.read_text(encoding="utf-8", errors="replace") + if len(content) > 20000: + content = content[:20000] + "\n\n[Truncated at 20000 chars]" + return content + except OSError as exc: + return f"Error reading file: {exc}" + + @tool + async def list_files(path: str = ".", pattern: str = "*") -> str: + """List files matching a glob pattern in the workspace. + + Args: + path: Relative directory to search in (default: workspace root). + pattern: Glob pattern (default: all files). + + Returns: + Newline-separated list of matching file paths. + """ + target = (ws_root / path).resolve() + if not target.is_relative_to(ws_root): + return "Error: path resolves outside the workspace." + if not target.is_dir(): + return f"Error: directory not found at '{path}'." 
def create_explore_graph(workspace: str, llm: Any) -> Any:
    """Build the read-only explore sub-graph.

    The sub-graph can only call grep, read_file, and list_files, and is
    bounded to ``_MAX_SUB_AGENT_ITERATIONS`` steps.
    """
    explore_tools = _make_explore_tools(workspace)
    bound_llm = llm.bind_tools(explore_tools)

    async def assistant(state: MessagesState) -> dict[str, Any]:
        # Imported lazily (presumably to avoid an import cycle at module
        # load time — confirm against sandbox_agent.reasoning).
        from sandbox_agent.reasoning import maybe_patch_tool_calls

        prompt = SystemMessage(
            content=(
                "You are a codebase research assistant. Your job is to find "
                "specific information in the workspace using grep, read_file, "
                "and list_files. Be concise. Return a focused summary of what "
                "you found. Do NOT modify any files."
            )
        )
        reply = await bound_llm.ainvoke([prompt] + state["messages"])
        return {"messages": [maybe_patch_tool_calls(reply)]}

    builder = StateGraph(MessagesState)
    builder.add_node("assistant", assistant)
    builder.add_node("tools", ToolNode(explore_tools))
    builder.set_entry_point("assistant")
    builder.add_conditional_edges("assistant", tools_condition)
    builder.add_edge("tools", "assistant")
    return builder.compile()
+ + Returns: + A summary of findings from the explore sub-agent. + """ + sub_graph = create_explore_graph(workspace, llm) + try: + result = await asyncio.wait_for( + sub_graph.ainvoke( + {"messages": [HumanMessage(content=query)]}, + config={"recursion_limit": _MAX_SUB_AGENT_ITERATIONS}, + ), + timeout=120, + ) + messages = result.get("messages", []) + if messages: + last = messages[-1] + return last.content if hasattr(last, "content") else str(last) + return "No results from explore sub-agent." + except asyncio.TimeoutError: + return "Explore sub-agent timed out after 120 seconds." + except Exception as exc: + return f"Explore sub-agent error: {exc}" + + return explore + + +# --------------------------------------------------------------------------- +# Child session database helpers +# --------------------------------------------------------------------------- + + +async def _register_child_session( + child_context_id: str, + parent_context_id: str, + agent_name: str, + task: str, +) -> None: + """Register a child session in the tasks database so it appears in the sidebar.""" + db_url = os.environ.get("TASK_STORE_DB_URL", "") + if not db_url: + return + # Convert async SQLAlchemy URL to asyncpg format + pg_url = db_url.replace("postgresql+asyncpg://", "postgresql://") + try: + conn = await asyncpg.connect(pg_url) + # Check if context already exists + existing = await conn.fetchval( + "SELECT COUNT(*) FROM tasks WHERE context_id = $1", child_context_id + ) + if existing == 0: + metadata = json.dumps({ + "agent_name": agent_name, + "parent_context_id": parent_context_id, + "title": task[:80], + }) + status = json.dumps({"state": "working"}) + await conn.execute( + "INSERT INTO tasks (id, context_id, status, metadata, history, artifacts) " + "VALUES ($1, $2, $3::jsonb, $4::jsonb, '[]'::jsonb, '[]'::jsonb)", + str(uuid.uuid4()), + child_context_id, + status, + metadata, + ) + logger.info( + "Registered child session %s (parent=%s) in tasks DB", + child_context_id, + 
parent_context_id, + ) + await conn.close() + except Exception as e: + logger.warning("Failed to register child session %s: %s", child_context_id, e) + + +async def _complete_child_session(child_context_id: str, result: str) -> None: + """Mark a child session as completed in the database.""" + db_url = os.environ.get("TASK_STORE_DB_URL", "") + if not db_url: + return + pg_url = db_url.replace("postgresql+asyncpg://", "postgresql://") + try: + conn = await asyncpg.connect(pg_url) + status = json.dumps({"state": "completed"}) + # Store result as an artifact + artifacts = json.dumps([{"parts": [{"kind": "text", "text": result[:5000]}]}]) + await conn.execute( + "UPDATE tasks SET status = $1::jsonb, artifacts = $2::jsonb WHERE context_id = $3", + status, + artifacts, + child_context_id, + ) + logger.info("Marked child session %s as completed", child_context_id) + await conn.close() + except Exception as e: + logger.warning("Failed to complete child session %s: %s", child_context_id, e) + + +# --------------------------------------------------------------------------- +# Multi-mode delegation (Session E) +# --------------------------------------------------------------------------- + + +_SUBAGENT_EXCLUDED_TOOLS = {"delegate", "explore"} + + +async def _run_in_process( + task: str, + workspace: str, + llm: Any, + child_context_id: str, + tools_list: list[Any] | None = None, + timeout: int = 120, +) -> str: + """Execute a task as an in-process LangGraph subgraph.""" + if tools_list is None: + tools_list = _make_explore_tools(workspace) + else: + # Exclude delegate/explore tools to prevent recursive sub-agent spawning. + tools_list = [t for t in tools_list if getattr(t, "name", "") not in _SUBAGENT_EXCLUDED_TOOLS] + + llm_with_tools = llm.bind_tools(tools_list) + + async def assistant(state: MessagesState) -> dict[str, Any]: + from sandbox_agent.reasoning import maybe_patch_tool_calls + system = SystemMessage( + content=( + "You are a sub-agent working on a delegated task. 
Complete the task " + "efficiently using the available tools. Return a clear summary of " + "what you did and the results." + ) + ) + messages = [system] + state["messages"] + response = await llm_with_tools.ainvoke(messages) + return {"messages": [maybe_patch_tool_calls(response)]} + + graph = StateGraph(MessagesState) + graph.add_node("assistant", assistant) + graph.add_node("tools", ToolNode(tools_list)) + graph.set_entry_point("assistant") + graph.add_conditional_edges("assistant", tools_condition) + graph.add_edge("tools", "assistant") + sub_graph = graph.compile() + + try: + result = await asyncio.wait_for( + sub_graph.ainvoke( + {"messages": [HumanMessage(content=task)]}, + config={ + "recursion_limit": _MAX_SUB_AGENT_ITERATIONS, + "configurable": {"thread_id": child_context_id}, + }, + ), + timeout=timeout, + ) + messages = result.get("messages", []) + if messages: + last = messages[-1] + return last.content if hasattr(last, "content") else str(last) + return "No results from in-process sub-agent." + except asyncio.TimeoutError: + return f"In-process sub-agent timed out after {timeout} seconds." + except Exception as exc: + logger.exception("In-process delegation failed for %s", child_context_id) + return f"In-process sub-agent error: {exc}" + + +async def _run_shared_pvc( + task: str, child_context_id: str, namespace: str = "team1", + variant: str = "sandbox-legion", timeout_minutes: int = 30, +) -> str: + """Spawn a pod that mounts the parent's PVC (placeholder).""" + logger.info("shared-pvc delegation: child=%s task=%s", child_context_id, task) + return ( + f"Shared-PVC delegation requested for '{task}' " + f"(child={child_context_id}, namespace={namespace}). " + "Requires RWX StorageClass. Not yet implemented." 
+ ) + + +async def _run_isolated( + task: str, child_context_id: str, namespace: str = "team1", + variant: str = "sandbox-legion", timeout_minutes: int = 30, +) -> str: + """Spawn an isolated pod via SandboxClaim CRD (placeholder).""" + logger.info("isolated delegation: child=%s task=%s", child_context_id, task) + return ( + f"Isolated delegation requested for '{task}' " + f"(child={child_context_id}, namespace={namespace}). " + "Requires SandboxClaim CRD + controller. Not yet implemented." + ) + + +async def _run_sidecar( + task: str, child_context_id: str, variant: str = "sandbox-legion", +) -> str: + """Inject a sidecar container (placeholder).""" + logger.info("sidecar delegation: child=%s task=%s", child_context_id, task) + return ( + f"Sidecar delegation requested for '{task}' " + f"(child={child_context_id}). Not yet implemented." + ) + + +def make_delegate_tool( + workspace: str, + llm: Any, + parent_context_id: str = "", + tools_list: list[Any] | None = None, + namespace: str = "team1", +) -> Any: + """Return a LangChain tool for multi-mode delegation. + + Args: + workspace: Path to the parent's workspace. + llm: The LLM instance for in-process subgraphs. + parent_context_id: The parent session's context_id. + tools_list: Optional tools for in-process subgraphs. + namespace: Kubernetes namespace for out-of-process modes. + """ + + @tool + async def delegate( + task: str, + mode: str = "auto", + variant: str = "sandbox-legion", + timeout_minutes: int = 30, + ) -> str: + """Delegate a task to a child session. + + Spawns a child agent to work on the task independently. + + Args: + task: Description of the task for the child session. + mode: Delegation mode — "auto" (LLM picks), "in-process", + "shared-pvc", "isolated", or "sidecar". + variant: Agent variant for out-of-process modes. + timeout_minutes: Timeout for the child session. + + Returns: + The child session's result or status message. 
+ """ + child_context_id = f"child-{uuid.uuid4().hex[:12]}" + + selected_mode = mode + if mode == "auto": + # Default all auto-mode to in-process until shared-pvc/isolated + # are implemented. This prevents placeholder responses. + selected_mode = "in-process" + + if selected_mode not in _DELEGATION_MODES: + return f"Mode '{selected_mode}' not enabled. Available: {', '.join(_DELEGATION_MODES)}" + + logger.info("Delegating: child=%s mode=%s parent=%s", child_context_id, selected_mode, parent_context_id) + + # Register the child session in the tasks DB so it appears in the sidebar + await _register_child_session(child_context_id, parent_context_id, variant, task) + + try: + if selected_mode == "in-process": + result = await _run_in_process(task, workspace, llm, child_context_id, tools_list, timeout_minutes * 60) + elif selected_mode == "shared-pvc": + result = await _run_shared_pvc(task, child_context_id, namespace, variant, timeout_minutes) + elif selected_mode == "isolated": + result = await _run_isolated(task, child_context_id, namespace, variant, timeout_minutes) + elif selected_mode == "sidecar": + result = await _run_sidecar(task, child_context_id, variant) + else: + result = f"Unknown mode: {selected_mode}" + except Exception as e: + result = f"Delegation failed: {e}" + + # Mark the child session as completed in the tasks DB + await _complete_child_session(child_context_id, result) + + return result + + return delegate From c7882d0dcc87acbdc43a2ee4a8c8f931b47b0332 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:49:22 +0100 Subject: [PATCH 21/26] feat(sandbox): workspace manager for per-context_id directory isolation on shared PVC Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/workspace.py | 189 ++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/workspace.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/workspace.py b/a2a/sandbox_agent/src/sandbox_agent/workspace.py new 
file mode 100644 index 00000000..e047d7d7 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/workspace.py @@ -0,0 +1,189 @@ +"""Workspace manager for per-context_id directory isolation. + +Each A2A context_id gets its own subdirectory under workspace_root +(typically mounted from a shared RWX PVC at /workspace). The manager +creates standardised subdirectories and tracks metadata in .context.json. +""" + +import json +import os +from datetime import datetime, timezone +from pathlib import Path + +WORKSPACE_SUBDIRS = ["scripts", "data", "repos", "output"] + + +class WorkspaceManager: + """Manages per-context workspace directories on shared storage. + + Parameters + ---------- + workspace_root: + Absolute path to the shared workspace mount (e.g. ``/workspace``). + agent_name: + Name of the agent that owns the workspaces. + namespace: + Kubernetes namespace the agent is running in. + ttl_days: + Default time-to-live for workspace directories. + """ + + def __init__( + self, + workspace_root: str, + agent_name: str, + namespace: str = "", + ttl_days: int = 7, + ) -> None: + self.workspace_root = workspace_root + self.agent_name = agent_name + self.namespace = namespace + self.ttl_days = ttl_days + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def get_workspace_path(self, context_id: str) -> str: + """Return the workspace path for *context_id* without creating it.""" + return os.path.join(self.workspace_root, context_id) + + def ensure_workspace(self, context_id: str) -> str: + """Create (or re-use) the workspace for *context_id*. + + On first call the directory tree and ``.context.json`` are created. + On subsequent calls ``last_accessed_at`` in the metadata file is + updated. + + Returns the absolute path to the workspace directory. + + Raises + ------ + ValueError + If *context_id* is empty. 
+ """ + if not context_id: + raise ValueError("context_id must not be empty") + + workspace_path = self.get_workspace_path(context_id) + context_file = Path(workspace_path) / ".context.json" + + # Create the workspace root and subdirs (idempotent via exist_ok). + for subdir in WORKSPACE_SUBDIRS: + os.makedirs(os.path.join(workspace_path, subdir), exist_ok=True) + + now = datetime.now(timezone.utc).isoformat() + + if context_file.exists(): + # Update last_accessed_at, preserve everything else. + data = json.loads(context_file.read_text()) + data["last_accessed_at"] = now + data["disk_usage_bytes"] = self._disk_usage(workspace_path) + context_file.write_text(json.dumps(data, indent=2) + "\n") + else: + # First time -- write fresh metadata. + data = { + "context_id": context_id, + "agent": self.agent_name, + "namespace": self.namespace, + "created_at": now, + "last_accessed_at": now, + "ttl_days": self.ttl_days, + "disk_usage_bytes": 0, + } + context_file.write_text(json.dumps(data, indent=2) + "\n") + + return workspace_path + + def list_contexts(self) -> list[str]: + """Return a list of context_ids that have workspace directories. + + Only directories that contain a ``.context.json`` file are + considered valid contexts. + """ + root = Path(self.workspace_root) + if not root.is_dir(): + return [] + + contexts: list[str] = [] + for entry in root.iterdir(): + if entry.is_dir() and (entry / ".context.json").exists(): + contexts.append(entry.name) + return contexts + + def cleanup_expired(self) -> list[str]: + """Remove workspace directories whose TTL has expired. + + Reads ``created_at`` and ``ttl_days`` from each context's + ``.context.json``. If ``created_at + ttl_days`` is in the past, + the workspace directory is deleted. + + Returns a list of context_ids that were cleaned up. 
+ """ + import shutil + + root = Path(self.workspace_root) + if not root.is_dir(): + return [] + + now = datetime.now(timezone.utc) + cleaned: list[str] = [] + + for entry in root.iterdir(): + # Skip filesystem metadata dirs (ext4 lost+found, etc.) + if entry.name in ("lost+found",): + continue + context_file = entry / ".context.json" + if not entry.is_dir() or not context_file.exists(): + continue + + try: + data = json.loads(context_file.read_text()) + except (json.JSONDecodeError, OSError): + continue + + created_str = data.get("created_at") + ttl = data.get("ttl_days", self.ttl_days) + + if not created_str: + continue + + try: + created_at = datetime.fromisoformat(created_str) + except ValueError: + continue + + from datetime import timedelta + + if now > created_at + timedelta(days=ttl): + try: + shutil.rmtree(entry) + cleaned.append(entry.name) + except OSError: + pass # best-effort cleanup + + return cleaned + + def get_total_disk_usage(self) -> int: + """Return total disk usage in bytes across all workspaces.""" + root = Path(self.workspace_root) + if not root.is_dir(): + return 0 + return self._disk_usage(str(root)) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + @staticmethod + def _disk_usage(path: str) -> int: + """Return total size in bytes of all files under *path*.""" + total = 0 + for dirpath, _dirnames, filenames in os.walk(path): + for fname in filenames: + fpath = os.path.join(dirpath, fname) + try: + total += os.path.getsize(fpath) + except OSError: + pass + return total From b1de4faf4a334495785f63769425319deebaece1 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 22:00:48 +0100 Subject: [PATCH 22/26] =?UTF-8?q?fix:=20resolve=20ruff=20lint=20violations?= =?UTF-8?q?=20=E2=80=94=20import=20ordering,=20unused=20vars,=20formatting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Auto-fixed: 16 import ordering (I001), unnecessary f-strings (F541) Manual: prefix 6 unused variables with underscore (F401) Formatted: 16 files with ruff format Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/agent.py | 125 ++-- a2a/sandbox_agent/src/sandbox_agent/budget.py | 6 +- .../src/sandbox_agent/context_builders.py | 200 +++--- .../src/sandbox_agent/event_serializer.py | 193 +++--- .../src/sandbox_agent/executor.py | 11 +- a2a/sandbox_agent/src/sandbox_agent/graph.py | 110 ++-- .../src/sandbox_agent/graph_card.py | 59 +- .../src/sandbox_agent/landlock_ctypes.py | 26 +- .../src/sandbox_agent/landlock_probe.py | 4 +- .../src/sandbox_agent/observability.py | 90 +-- .../src/sandbox_agent/permissions.py | 14 +- .../src/sandbox_agent/plan_store.py | 49 +- .../src/sandbox_agent/reasoning.py | 582 +++++++++++------- .../src/sandbox_agent/sandbox_subprocess.py | 4 +- .../src/sandbox_agent/sources.py | 7 +- .../src/sandbox_agent/subagents.py | 71 ++- 16 files changed, 876 insertions(+), 675 deletions(-) diff --git a/a2a/sandbox_agent/src/sandbox_agent/agent.py b/a2a/sandbox_agent/src/sandbox_agent/agent.py index 70e67ba7..d75b29f6 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/agent.py +++ b/a2a/sandbox_agent/src/sandbox_agent/agent.py @@ -38,9 +38,8 @@ ) from a2a.utils import new_agent_text_message, new_task from langchain_core.messages import HumanMessage -from starlette.routing import Route - from langgraph.checkpoint.memory import MemorySaver +from starlette.routing import Route from sandbox_agent.budget import AgentBudget from sandbox_agent.configuration import Configuration @@ -157,7 +156,9 @@ def _tofu_verify(root: Path) -> None: "TOFU: workspace file integrity mismatch! " "changed=%s, added=%s, removed=%s. " "This may indicate tampering. Updating stored hashes.", - changed, added, removed, + changed, + added, + removed, ) # Update stored hashes (trust the new state). 
with open(hash_file, "w", encoding="utf-8") as fh: @@ -353,13 +354,13 @@ async def _ensure_checkpointer(self) -> None: if self._checkpointer_initialized and self._checkpointer: try: # Lightweight health check — attempt a simple query - pool = getattr(self._checkpointer, 'conn', None) or getattr(self._checkpointer, '_conn', None) - if pool and hasattr(pool, 'execute'): + pool = getattr(self._checkpointer, "conn", None) or getattr(self._checkpointer, "_conn", None) + if pool and hasattr(pool, "execute"): await pool.execute("SELECT 1") except Exception: logger.warning("PostgreSQL checkpointer connection stale — re-initializing") # Close old connection - if hasattr(self, '_checkpointer_cm') and self._checkpointer_cm: + if hasattr(self, "_checkpointer_cm") and self._checkpointer_cm: try: await self._checkpointer_cm.__aexit__(None, None, None) except Exception: @@ -379,9 +380,7 @@ async def _ensure_checkpointer(self) -> None: # ------------------------------------------------------------------ - async def execute( - self, context: RequestContext, event_queue: EventQueue - ) -> None: + async def execute(self, context: RequestContext, event_queue: EventQueue) -> None: """Execute a user request through the LangGraph sandbox graph. Steps: @@ -489,9 +488,7 @@ async def _run_graph() -> None: max_retries = 3 for attempt in range(max_retries + 1): try: - async for ev in graph.astream( - input_state, config=graph_config, stream_mode="updates" - ): + async for ev in graph.astream(input_state, config=graph_config, stream_mode="updates"): await event_queue.put(ev) break # success except Exception as retry_err: @@ -501,14 +498,14 @@ async def _run_graph() -> None: is_db_stale = "connection is closed" in err_str or "operationalerror" in err_str if is_quota: logger.error("LLM quota exceeded: %s", retry_err) - await event_queue.put( - {"_error": "LLM API quota exceeded. Check billing."} - ) + await event_queue.put({"_error": "LLM API quota exceeded. 
Check billing."}) break elif is_db_stale and attempt < max_retries: logger.warning( "DB connection stale (%d/%d), re-initializing checkpointer: %s", - attempt + 1, max_retries, retry_err, + attempt + 1, + max_retries, + retry_err, ) await self._ensure_checkpointer() # Rebuild graph with fresh checkpointer @@ -525,7 +522,10 @@ async def _run_graph() -> None: delay = 2 ** (attempt + 1) logger.warning( "Rate limited (%d/%d), retrying in %ds: %s", - attempt + 1, max_retries, delay, retry_err, + attempt + 1, + max_retries, + delay, + retry_err, ) await asyncio.sleep(delay) continue @@ -572,7 +572,9 @@ async def _run_graph() -> None: node_names = list(event.keys()) logger.info( "Graph event %d: nodes=%s (context=%s)", - event_count, node_names, context_id, + event_count, + node_names, + context_id, ) # Skip __interrupt__ events (HITL pause) — these contain @@ -580,16 +582,19 @@ async def _run_graph() -> None: if "__interrupt__" in event: logger.info( "Graph interrupted (HITL) at event %d: %s", - event_count, event.get("__interrupt__"), + event_count, + event.get("__interrupt__"), ) # Emit a structured HITL event for the frontend hitl_data = event.get("__interrupt__", ()) hitl_msg = str(hitl_data[0]) if hitl_data else "Approval required" - hitl_json = json.dumps({ - "type": "hitl_request", - "loop_id": serializer._loop_id, - "message": hitl_msg[:500], - }) + hitl_json = json.dumps( + { + "type": "hitl_request", + "loop_id": serializer._loop_id, + "message": hitl_msg[:500], + } + ) await task_updater.update_status( TaskState.working, new_agent_text_message( @@ -602,11 +607,14 @@ async def _run_graph() -> None: # Send intermediate status updates as structured JSON try: - serialized_lines = "\n".join( - serializer.serialize(key, value) - for key, value in event.items() - if isinstance(value, dict) - ) + "\n" + serialized_lines = ( + "\n".join( + serializer.serialize(key, value) + for key, value in event.items() + if isinstance(value, dict) + ) + + "\n" + ) await 
task_updater.update_status( TaskState.working, new_agent_text_message( @@ -624,19 +632,20 @@ async def _run_graph() -> None: line_types.append(lt) except json.JSONDecodeError: line_types.append("parse_error") - logger.info("A2A_EMIT session=%s lines=%d types=%s", - context_id, len(line_types), line_types) + logger.info("A2A_EMIT session=%s lines=%d types=%s", context_id, len(line_types), line_types) except asyncio.CancelledError: logger.warning( "SSE update cancelled at event %d (context=%s) — client disconnected", - event_count, context_id, + event_count, + context_id, ) client_disconnected = True break except Exception as update_err: logger.error( "Failed to send SSE update for event %d: %s", - event_count, update_err, + event_count, + update_err, ) output = event @@ -676,7 +685,9 @@ async def _run_graph() -> None: if bg_event_count > 0: logger.info( "Drained %d background events for context=%s, serialized %d lines", - bg_event_count, context_id, len(bg_serialized_lines), + bg_event_count, + context_id, + len(bg_serialized_lines), ) # Persist via task_updater so the events appear in history for line_block in bg_serialized_lines: @@ -711,11 +722,14 @@ async def _run_graph() -> None: if msgs: content = getattr(msgs[-1], "content", None) if isinstance(content, list): - final_answer = "\n".join( - block.get("text", "") if isinstance(block, dict) else str(block) - for block in content - if isinstance(block, dict) and block.get("type") == "text" - ) or None + final_answer = ( + "\n".join( + block.get("text", "") if isinstance(block, dict) else str(block) + for block in content + if isinstance(block, dict) and block.get("type") == "text" + ) + or None + ) elif content: final_answer = str(content) if final_answer: @@ -729,12 +743,15 @@ async def _run_graph() -> None: try: existing_meta = {} if task.metadata: - existing_meta = dict(task.metadata) if not isinstance(task.metadata, dict) else task.metadata + existing_meta = ( + dict(task.metadata) if not 
isinstance(task.metadata, dict) else task.metadata + ) existing_meta["llm_request_ids"] = llm_request_ids task.metadata = existing_meta logger.info( "Stored %d LLM request_ids in task metadata for context_id=%s", - len(llm_request_ids), context_id, + len(llm_request_ids), + context_id, ) except Exception as meta_err: logger.warning("Failed to store llm_request_ids: %s", meta_err) @@ -781,9 +798,7 @@ async def _run_graph() -> None: # ------------------------------------------------------------------ - async def cancel( - self, context: RequestContext, event_queue: EventQueue - ) -> None: + async def cancel(self, context: RequestContext, event_queue: EventQueue) -> None: """Cancel is not supported.""" raise Exception("cancel not supported") @@ -804,9 +819,15 @@ class _MergingDatabaseTaskStore(DatabaseTaskStore): backend-managed keys so they survive A2A SDK updates. """ - _BACKEND_KEYS = frozenset({ - "owner", "visibility", "title", "agent_name", "loop_events", - }) + _BACKEND_KEYS = frozenset( + { + "owner", + "visibility", + "title", + "agent_name", + "loop_events", + } + ) async def save(self, task, context=None): """Save task while preserving backend-managed metadata fields.""" @@ -816,6 +837,7 @@ async def save(self, task, context=None): existing_meta = {} async with self.async_session_maker() as session: from sqlalchemy import select + stmt = select(self.task_model).where(self.task_model.id == task.id) result = await session.execute(stmt) existing = result.scalar_one_or_none() @@ -825,6 +847,7 @@ async def save(self, task, context=None): existing_meta = raw elif isinstance(raw, str): import json + try: existing_meta = json.loads(raw) except (json.JSONDecodeError, TypeError): @@ -843,8 +866,7 @@ async def save(self, task, context=None): db_task = self._to_orm(task) async with self.async_session_maker.begin() as session: await session.merge(db_task) - logger.debug("Task %s saved with merged metadata (keys=%s)", - task.id, list(merged.keys()) if merged else []) + 
logger.debug("Task %s saved with merged metadata (keys=%s)", task.id, list(merged.keys()) if merged else []) def _create_task_store(): @@ -1020,6 +1042,7 @@ async def _handle_graph_card(request: Any) -> Any: # noqa: ARG001 # Build a graph for introspection only (no checkpointer, dummy config) from sandbox_agent.permissions import PermissionChecker from sandbox_agent.sources import SourcesConfig + pc = PermissionChecker(settings={"workspace": "/workspace", "permissions": {}}) sc = SourcesConfig() compiled = build_graph( @@ -1028,9 +1051,7 @@ async def _handle_graph_card(request: Any) -> Any: # noqa: ARG001 sources_config=sc, checkpointer=None, ) - _graph_card_cache.update( - build_graph_card(compiled, agent_id="sandbox-legion-v1") - ) + _graph_card_cache.update(build_graph_card(compiled, agent_id="sandbox-legion-v1")) return JSONResponse(_graph_card_cache) app.routes.insert( diff --git a/a2a/sandbox_agent/src/sandbox_agent/budget.py b/a2a/sandbox_agent/src/sandbox_agent/budget.py index 87816781..1add9fe0 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/budget.py +++ b/a2a/sandbox_agent/src/sandbox_agent/budget.py @@ -159,11 +159,7 @@ def exceeded_reason(self) -> str | None: @property def needs_hitl_checkin(self) -> bool: """Return True when it's time for a human-in-the-loop check-in.""" - return ( - self.hitl_interval > 0 - and self.iterations_used > 0 - and self.iterations_used % self.hitl_interval == 0 - ) + return self.hitl_interval > 0 and self.iterations_used > 0 and self.iterations_used % self.hitl_interval == 0 def summary(self) -> dict: """Return budget state as a dict for event serialization.""" diff --git a/a2a/sandbox_agent/src/sandbox_agent/context_builders.py b/a2a/sandbox_agent/src/sandbox_agent/context_builders.py index c3404711..f9de62c1 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/context_builders.py +++ b/a2a/sandbox_agent/src/sandbox_agent/context_builders.py @@ -82,7 +82,9 @@ def build_planner_context( result = 
[SystemMessage(content=system_content)] + first_user + recent_tools logger.info( "Planner context: %d messages (iteration=%d, %d tool results)", - len(result), iteration, len(recent_tools), + len(result), + iteration, + len(recent_tools), extra={"session_id": state.get("context_id", ""), "node": "planner"}, ) return result @@ -134,9 +136,7 @@ def build_executor_context( used_chars = 0 for m in reversed(all_msgs): content = str(getattr(m, "content", "")) - if isinstance(m, SystemMessage) and content.startswith( - f"[STEP_BOUNDARY {current_step}]" - ): + if isinstance(m, SystemMessage) and content.startswith(f"[STEP_BOUNDARY {current_step}]"): break msg_chars = len(content) if used_chars + msg_chars > _MAX_CONTEXT_CHARS: @@ -156,6 +156,7 @@ def build_executor_context( # Determine status from exit code if "EXIT_CODE:" in content: import re as _re + ec_match = _re.search(r"EXIT_CODE:\s*(\d+)", content) status = "FAILED" if ec_match and ec_match.group(1) != "0" else "OK" error_hint = content[:150] if status == "FAILED" else "" @@ -172,11 +173,9 @@ def build_executor_context( if error_hint: reflection_parts.append(f"Error: {error_hint}") if "unknown flag" in content.lower() or "invalid option" in content.lower(): - reflection_parts.append( - "The flag is INVALID. Run the command with --help to see valid flags." - ) + reflection_parts.append("The flag is INVALID. Run the command with --help to see valid flags.") reflection_parts.append( - f"Goal: \"{step_text[:100]}\"\n" + f'Goal: "{step_text[:100]}"\n' f"If goal ACHIEVED → stop, summarize result. " f"If FAILED → try DIFFERENT approach. " f"NEVER repeat same command." 
@@ -186,7 +185,8 @@ def build_executor_context( result = [SystemMessage(content=system_content)] + first_msg + windowed logger.info( "Executor context: %d messages, ~%dk chars (from %d total)", - len(result), sum(len(str(getattr(m, "content", ""))) for m in result) // 1000, + len(result), + sum(len(str(getattr(m, "content", ""))) for m in result) // 1000, len(all_msgs), extra={ "session_id": state.get("context_id", ""), @@ -240,7 +240,9 @@ def build_reflector_context( result = [SystemMessage(content=system_content)] + recent_msgs logger.info( "Reflector context: %d messages (%d tool pairs from %d total)", - len(result), pair_count, len(messages), + len(result), + pair_count, + len(messages), extra={"session_id": state.get("context_id", ""), "node": "reflector"}, ) return result @@ -322,9 +324,7 @@ def _summarize_messages(self) -> list[dict[str, str]]: content = getattr(msg, "content", "") if isinstance(content, list): content = " ".join( - b.get("text", "") - for b in content - if isinstance(b, dict) and b.get("type") == "text" + b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text" ) text = str(content) tool_calls = getattr(msg, "tool_calls", None) @@ -353,11 +353,10 @@ def _format_response(self) -> dict[str, Any]: meta = getattr(resp, "response_metadata", {}) or {} content = resp.content if isinstance(content, list): - content = " ".join( - b.get("text", "") - for b in content - if isinstance(b, dict) and b.get("type") == "text" - ) or None + content = ( + " ".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text") + or None + ) tool_calls_out = None if resp.tool_calls: tool_calls_out = [ @@ -374,14 +373,16 @@ def _format_response(self) -> dict[str, Any]: for tc in resp.tool_calls ] return { - "choices": [{ - "message": { - "role": "assistant", - "content": content if content else None, - "tool_calls": tool_calls_out, - }, - "finish_reason": meta.get("finish_reason", "unknown"), - }], + 
"choices": [ + { + "message": { + "role": "assistant", + "content": content if content else None, + "tool_calls": tool_calls_out, + }, + "finish_reason": meta.get("finish_reason", "unknown"), + } + ], "model": meta.get("model", ""), "usage": { "prompt_tokens": self.prompt_tokens, @@ -477,9 +478,17 @@ async def invoke_llm( logger.info( "LLM call [%s]: %d messages, %d prompt tokens, %d completion tokens, model=%s", - node, len(messages), prompt_tokens, completion_tokens, model_name, - extra={"session_id": session_id, "node": node, - "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens}, + node, + len(messages), + prompt_tokens, + completion_tokens, + model_name, + extra={ + "session_id": session_id, + "node": node, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + }, ) return response, capture @@ -558,40 +567,47 @@ async def invoke_with_tool_loop( if i == 0: thinking_messages.append( - HumanMessage(content="Brief analysis (2-3 sentences max): " - "What is the best tool call for this step? " - "If step is already done, say READY: step complete.") + HumanMessage( + content="Brief analysis (2-3 sentences max): " + "What is the best tool call for this step? " + "If step is already done, say READY: step complete." + ) ) else: thinking_messages.append( - HumanMessage(content="Refine in 1-2 sentences. " - "When ready: READY: ") + HumanMessage(content="Refine in 1-2 sentences. 
When ready: READY: ") ) reason_response, reason_capture = await invoke_llm( - llm_reason, thinking_messages, - node=f"{node}-think-{cycle+1}.{i+1}", session_id=session_id, + llm_reason, + thinking_messages, + node=f"{node}-think-{cycle + 1}.{i + 1}", + session_id=session_id, workspace_path=workspace_path, ) last_reasoning = str(reason_response.content or "").strip() total_thinking_tokens += reason_capture.prompt_tokens + reason_capture.completion_tokens - sub_events.append({ - "type": "thinking", - "node": node, - "cycle": cycle + 1, - "iteration": i + 1, - "total_iterations": 0, - "reasoning": last_reasoning, - **reason_capture.debug_fields(), - **reason_capture.token_fields(), - }) + sub_events.append( + { + "type": "thinking", + "node": node, + "cycle": cycle + 1, + "iteration": i + 1, + "total_iterations": 0, + "reasoning": last_reasoning, + **reason_capture.debug_fields(), + **reason_capture.token_fields(), + } + ) thinking_summary = last_reasoning[:200] + ("..." if len(last_reasoning) > 200 else "") - thinking_history.extend([ - AIMessage(content=thinking_summary), - HumanMessage(content=f"(Thinking {i+1} recorded. Continue or signal READY:)"), - ]) + thinking_history.extend( + [ + AIMessage(content=thinking_summary), + HumanMessage(content=f"(Thinking {i + 1} recorded. Continue or signal READY:)"), + ] + ) if last_reasoning.upper().startswith("READY:"): break @@ -599,15 +615,19 @@ async def invoke_with_tool_loop( # --- Micro-reasoning: LLM with tools --- tool_messages = cycle_messages + [ AIMessage(content=last_reasoning or "I need to call a tool for this step."), - HumanMessage(content="Now execute your planned action. Rules:\n" - "- Call step_done(summary='...') if the step is ALREADY COMPLETE.\n" - "- Call ONE tool if there's a single action to take.\n" - "- Call multiple tools ONLY if they are independent (can run in parallel).\n" - "- NEVER call the same tool twice with similar args."), + HumanMessage( + content="Now execute your planned action. 
Rules:\n" + "- Call step_done(summary='...') if the step is ALREADY COMPLETE.\n" + "- Call ONE tool if there's a single action to take.\n" + "- Call multiple tools ONLY if they are independent (can run in parallel).\n" + "- NEVER call the same tool twice with similar args." + ), ] response, capture = await invoke_llm( - llm_with_tools, tool_messages, - node=f"{node}-tool-{cycle+1}", session_id=session_id, + llm_with_tools, + tool_messages, + node=f"{node}-tool-{cycle + 1}", + session_id=session_id, workspace_path=workspace_path, ) capture.prompt_tokens += total_thinking_tokens @@ -616,8 +636,9 @@ async def invoke_with_tool_loop( else: # Single-phase: one LLM call with implicit auto response, capture = await invoke_llm( - llm_with_tools, cycle_messages, - node=f"{node}-{cycle+1}" if max_cycles > 1 else node, + llm_with_tools, + cycle_messages, + node=f"{node}-{cycle + 1}" if max_cycles > 1 else node, session_id=session_id, workspace_path=workspace_path, ) @@ -628,8 +649,12 @@ async def invoke_with_tool_loop( done_calls = [tc for tc in response.tool_calls if tc.get("name") == "step_done"] if done_calls: summary = done_calls[0].get("args", {}).get("summary", last_reasoning or "") - logger.info("step_done called in cycle %d: %s", cycle + 1, summary[:100], - extra={"session_id": session_id, "node": node}) + logger.info( + "step_done called in cycle %d: %s", + cycle + 1, + summary[:100], + extra={"session_id": session_id, "node": node}, + ) response = AIMessage(content=summary) break @@ -648,17 +673,17 @@ async def invoke_with_tool_loop( if response.tool_calls and tool_map and max_cycles > 1: # Emit tool_call sub_event BEFORE execution (so UI shows the call) import uuid as _uuid + call_id = str(_uuid.uuid4())[:8] - sub_events.append({ - "type": "tool_call", - "node": node, - "cycle": cycle + 1, - "call_id": call_id, - "tools": [ - {"name": tc.get("name", "?"), "args": tc.get("args", {})} - for tc in response.tool_calls - ], - }) + sub_events.append( + { + "type": 
"tool_call", + "node": node, + "cycle": cycle + 1, + "call_id": call_id, + "tools": [{"name": tc.get("name", "?"), "args": tc.get("args", {})} for tc in response.tool_calls], + } + ) # Execute all tool calls in parallel via asyncio.gather async def _run_tool(tc: dict) -> ToolMessage: @@ -684,24 +709,27 @@ async def _run_tool(tc: dict) -> ToolMessage: for tm in tool_results: content_str = str(getattr(tm, "content", "")) import re as _re + exit_match = _re.search(r"EXIT_CODE:\s*(\d+)", content_str) - is_error = ( - (exit_match is not None and exit_match.group(1) != "0") - or content_str.startswith("Error:") + is_error = (exit_match is not None and exit_match.group(1) != "0") or content_str.startswith("Error:") + sub_events.append( + { + "type": "tool_result", + "node": node, + "cycle": cycle + 1, + "call_id": call_id, + "name": getattr(tm, "name", "unknown"), + "output": content_str[:2000], + "status": "error" if is_error else "success", + } ) - sub_events.append({ - "type": "tool_result", - "node": node, - "cycle": cycle + 1, - "call_id": call_id, - "name": getattr(tm, "name", "unknown"), - "output": content_str[:2000], - "status": "error" if is_error else "success", - }) logger.info( "Cycle %d/%d [%s]: %d tool calls executed, continuing", - cycle + 1, max_cycles, node, len(response.tool_calls), + cycle + 1, + max_cycles, + node, + len(response.tool_calls), extra={"session_id": session_id, "node": node}, ) continue # Next cycle @@ -731,7 +759,9 @@ async def _run_tool(tc: dict) -> ToolMessage: logger.info( "Tool loop %s: %d cycles, %d thinking iterations, %d total tokens", - node, cycle + 1, total_iters, + node, + cycle + 1, + total_iters, final_capture.prompt_tokens + final_capture.completion_tokens, extra={"session_id": session_id, "node": node}, ) diff --git a/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py b/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py index 4191a67b..8e039ef7 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py 
+++ b/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py @@ -104,8 +104,8 @@ def __init__(self, loop_id: str | None = None, context_id: str | None = None) -> self._loop_id = loop_id or str(uuid.uuid4())[:8] self._step_index = 0 self._event_counter = 0 # global sequence number for ordering - self._node_visit = 0 # graph node visit counter (main sections) - self._sub_index = 0 # position within current node visit + self._node_visit = 0 # graph node visit counter (main sections) + self._sub_index = 0 # position within current node visit self._last_node_key: str = "" # track previous node for visit grouping self._micro_step: int = 0 self._context_id = context_id or "unknown" @@ -152,12 +152,14 @@ def serialize(self, key: str, value: dict) -> str: if key == "router": # Router is an internal node — emit minimal event for logging route = value.get("_route", "new") - result = json.dumps({ - "type": "router", - "loop_id": self._loop_id, - "route": route, - "plan_status": value.get("plan_status", ""), - }) + result = json.dumps( + { + "type": "router", + "loop_id": self._loop_id, + "route": route, + "plan_status": value.get("plan_status", ""), + } + ) elif key == "planner": result = self._serialize_planner(value) elif key == "reflector": @@ -175,14 +177,16 @@ def serialize(self, key: str, value: dict) -> str: # Strip the "STEP BRIEF FROM COORDINATOR:" prefix if "STEP BRIEF" in brief: brief = brief.split("---")[0].replace("STEP BRIEF FROM COORDINATOR:", "").strip() - result = json.dumps({ - "type": "step_selector", - "loop_id": self._loop_id, - "current_step": current_step, - "description": f"Advancing to step {current_step + 1}: {step_desc[:80]}", - "brief": brief[:500], - "done": value.get("done", False), - }) + result = json.dumps( + { + "type": "step_selector", + "loop_id": self._loop_id, + "current_step": current_step, + "description": f"Advancing to step {current_step + 1}: {step_desc[:80]}", + "brief": brief[:500], + "done": value.get("done", False), + } + ) elif 
key == "reporter": result = self._serialize_reporter(value) else: @@ -208,11 +212,13 @@ def serialize(self, key: str, value: dict) -> str: # Append budget_update event if _budget_summary is in the value dict budget_summary = value.get("_budget_summary") if budget_summary and isinstance(budget_summary, dict): - budget_event = json.dumps({ - "type": "budget_update", - "loop_id": self._loop_id, - **budget_summary, - }) + budget_event = json.dumps( + { + "type": "budget_update", + "loop_id": self._loop_id, + **budget_summary, + } + ) result = result + "\n" + budget_event # Post-process: ensure ALL event lines have step + unique event_index. @@ -245,11 +251,15 @@ def serialize(self, key: str, value: dict) -> str: except json.JSONDecodeError: enriched_lines.append(line) event_type = "parse_error" - logger.info("SERIALIZE session=%s loop=%s type=%s step=%s ei=%s", - self._context_id, self._loop_id, event_type, - self._step_index, self._event_counter, - extra={"session_id": self._context_id, "node": key, - "event_type": event_type, "step": self._step_index}) + logger.info( + "SERIALIZE session=%s loop=%s type=%s step=%s ei=%s", + self._context_id, + self._loop_id, + event_type, + self._step_index, + self._event_counter, + extra={"session_id": self._context_id, "node": key, "event_type": event_type, "step": self._step_index}, + ) return "\n".join(enriched_lines) @@ -277,13 +287,14 @@ def _serialize_assistant(self, msg: Any) -> str: if text.strip(): parts.append(json.dumps({"type": "llm_response", "content": text})) # Then emit the tool call - parts.append(json.dumps({ - "type": "tool_call", - "tools": [ - _safe_tc(tc) - for tc in tool_calls - ], - })) + parts.append( + json.dumps( + { + "type": "tool_call", + "tools": [_safe_tc(tc) for tc in tool_calls], + } + ) + ) return "\n".join(parts) return json.dumps({"type": "llm_response", "content": text}) @@ -323,23 +334,31 @@ def _serialize_executor(self, msg: Any, value: dict | None = None) -> str: 
thinking_event[field.lstrip("_")] = se[field] parts.append(json.dumps(thinking_event)) elif se_type == "tool_call": - parts.append(json.dumps({ - "type": "tool_call", - "loop_id": self._loop_id, - "call_id": se.get("call_id", ""), - "cycle": se.get("cycle", 1), - "tools": se.get("tools", []), - })) + parts.append( + json.dumps( + { + "type": "tool_call", + "loop_id": self._loop_id, + "call_id": se.get("call_id", ""), + "cycle": se.get("cycle", 1), + "tools": se.get("tools", []), + } + ) + ) elif se_type == "tool_result": - parts.append(json.dumps({ - "type": "tool_result", - "loop_id": self._loop_id, - "call_id": se.get("call_id", ""), - "cycle": se.get("cycle", 1), - "name": se.get("name", "unknown"), - "output": se.get("output", "")[:2000], - "status": se.get("status", "success"), - })) + parts.append( + json.dumps( + { + "type": "tool_result", + "loop_id": self._loop_id, + "call_id": se.get("call_id", ""), + "cycle": se.get("cycle", 1), + "name": se.get("name", "unknown"), + "output": se.get("output", "")[:2000], + "status": se.get("status", "success"), + } + ) + ) self._micro_step += 1 @@ -376,20 +395,18 @@ def _serialize_executor(self, msg: Any, value: dict | None = None) -> str: if tool_calls: # Use LangGraph's tool_call_id for proper pairing with tool_result tc0 = tool_calls[0] if tool_calls else {} - call_id = ( - tc0.get("id") if isinstance(tc0, dict) - else getattr(tc0, "id", None) - ) or str(uuid.uuid4())[:8] + call_id = (tc0.get("id") if isinstance(tc0, dict) else getattr(tc0, "id", None)) or str(uuid.uuid4())[:8] self._last_call_id = call_id - parts.append(json.dumps({ - "type": "tool_call", - "loop_id": self._loop_id, - "call_id": call_id, - "tools": [ - _safe_tc(tc) - for tc in tool_calls - ], - })) + parts.append( + json.dumps( + { + "type": "tool_call", + "loop_id": self._loop_id, + "call_id": call_id, + "tools": [_safe_tc(tc) for tc in tool_calls], + } + ) + ) return "\n".join(parts) # Emit tool_call event for text-parsed tools (no structured 
tool_calls) @@ -397,15 +414,16 @@ def _serialize_executor(self, msg: Any, value: dict | None = None) -> str: if parsed_tools: call_id = str(uuid.uuid4())[:8] self._last_call_id = call_id - parts.append(json.dumps({ - "type": "tool_call", - "loop_id": self._loop_id, - "call_id": call_id, - "tools": [ - {"name": t["name"], "args": t.get("args", {})} - for t in parsed_tools - ], - })) + parts.append( + json.dumps( + { + "type": "tool_call", + "loop_id": self._loop_id, + "call_id": call_id, + "tools": [{"name": t["name"], "args": t.get("args", {})} for t in parsed_tools], + } + ) + ) return "\n".join(parts) @@ -463,6 +481,7 @@ def _serialize_tool_result(self, msg: Any) -> str: # Keyword matching (e.g. "failure", "error") causes false positives # when command output contains those words in normal data. import re as _re + exit_match = _re.search(r"EXIT_CODE:\s*(\d+)", content_str) is_error = ( (exit_match is not None and exit_match.group(1) != "0") @@ -474,14 +493,16 @@ def _serialize_tool_result(self, msg: Any) -> str: status = "error" if is_error else "success" # Use LangGraph's tool_call_id for proper pairing with tool_call call_id = getattr(msg, "tool_call_id", None) or self._last_call_id - return json.dumps({ - "type": "tool_result", - "loop_id": self._loop_id, - "call_id": call_id, - "name": str(name), - "output": content_str[:2000], - "status": status, - }) + return json.dumps( + { + "type": "tool_result", + "loop_id": self._loop_id, + "call_id": call_id, + "name": str(name), + "output": content_str[:2000], + "status": status, + } + ) @staticmethod def _enrich_with_plan_store(payload: dict, value: dict) -> None: @@ -553,7 +574,7 @@ def _serialize_reflector(self, value: dict) -> str: """Serialize a reflector node output — emits reflector_decision + legacy reflection.""" done = value.get("done", False) current_step = value.get("current_step", 0) - step_results = value.get("step_results", []) + _step_results = value.get("step_results", []) # Extract decision text 
from message if present msgs = value.get("messages", []) @@ -634,11 +655,7 @@ def _serialize_reporter(self, value: dict) -> str: tc_info = _safe_tc(tc) if tc_info["name"] == "respond_to_user": args = tc_info["args"] - final_answer = ( - args.get("response", "") - if isinstance(args, dict) - else str(args) - ) + final_answer = args.get("response", "") if isinstance(args, dict) else str(args) break if final_answer: break @@ -690,8 +707,4 @@ def _extract_decision(text: str) -> str: @staticmethod def _extract_text_blocks(content: list) -> str: """Extract text from a list of content blocks.""" - return " ".join( - b.get("text", "") - for b in content - if isinstance(b, dict) and b.get("type") == "text" - )[:2000] + return " ".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text")[:2000] diff --git a/a2a/sandbox_agent/src/sandbox_agent/executor.py b/a2a/sandbox_agent/src/sandbox_agent/executor.py index 7d3777a6..6dc5f7eb 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/executor.py +++ b/a2a/sandbox_agent/src/sandbox_agent/executor.py @@ -207,8 +207,7 @@ def _check_interpreter_bypass(self, command: str) -> str | None: inner_permission = self._check_permission(inner_command) if inner_permission is PermissionResult.DENY: return ( - f"Permission denied: interpreter bypass detected. " - f"Inner command '{inner_command}' is denied by policy." + f"Permission denied: interpreter bypass detected. Inner command '{inner_command}' is denied by policy." ) # Also check the inner command against sources.json policy @@ -216,8 +215,7 @@ def _check_interpreter_bypass(self, command: str) -> str | None: inner_sources_denial = self._check_sources(inner_command) if inner_sources_denial: return ( - f"Blocked: interpreter bypass detected. " - f"Inner command violates sources policy: {inner_sources_denial}" + f"Blocked: interpreter bypass detected. 
Inner command violates sources policy: {inner_sources_denial}" ) return None @@ -324,10 +322,7 @@ async def _execute(self, command: str) -> ExecutionResult: await process.wait() return ExecutionResult( stdout="", - stderr=( - f"Command timed out after {timeout} seconds " - f"and was killed: '{command}'" - ), + stderr=(f"Command timed out after {timeout} seconds and was killed: '{command}'"), exit_code=-1, ) diff --git a/a2a/sandbox_agent/src/sandbox_agent/graph.py b/a2a/sandbox_agent/src/sandbox_agent/graph.py index 5cbe603e..6f89c33f 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/graph.py +++ b/a2a/sandbox_agent/src/sandbox_agent/graph.py @@ -56,7 +56,7 @@ from langchain_openai import ChatOpenAI from langgraph.graph import MessagesState, StateGraph from langgraph.prebuilt import ToolNode, tools_condition -from langgraph.types import Send, interrupt +from langgraph.types import interrupt try: from langgraph.errors import GraphInterrupt @@ -64,12 +64,13 @@ # Fallback for older langgraph versions GraphInterrupt = type("GraphInterrupt", (Exception,), {}) +from sandbox_agent import plan_store as ps from sandbox_agent.budget import AgentBudget from sandbox_agent.executor import HitlRequired, SandboxExecutor from sandbox_agent.permissions import PermissionChecker from sandbox_agent.reasoning import ( - PlanStep, _DEBUG_PROMPTS, + PlanStep, executor_node, planner_node, reflector_node, @@ -78,9 +79,8 @@ route_reflector, router_node, ) -from sandbox_agent import plan_store as ps from sandbox_agent.sources import SourcesConfig -from sandbox_agent.subagents import make_delegate_tool, make_explore_tool +from sandbox_agent.subagents import make_explore_tool logger = logging.getLogger(__name__) @@ -263,11 +263,13 @@ async def shell(command: str) -> str: # The interrupt() call suspends the graph state. The A2A task # transitions to input_required. Only an explicit human # approval (via the HITLManager channel) resumes execution. 
- approval = interrupt({ - "type": "approval_required", - "command": exc.command, - "message": f"Command '{exc.command}' requires human approval.", - }) + approval = interrupt( + { + "type": "approval_required", + "command": exc.command, + "message": f"Command '{exc.command}' requires human approval.", + } + ) # If we reach here, the human approved — execute the command. if isinstance(approval, dict) and approval.get("approved"): result = await executor._execute(command) @@ -278,8 +280,9 @@ async def shell(command: str) -> str: output = _format_result(result) if result.exit_code != 0 and _is_rate_limited(output): import asyncio + for attempt in range(1, 4): # up to 3 retries - delay = 2 ** attempt # 2s, 4s, 8s + delay = 2**attempt # 2s, 4s, 8s logger.info("Rate limit detected, retry %d/3 after %ds", attempt, delay) await asyncio.sleep(delay) try: @@ -322,14 +325,17 @@ def _format_result(result: Any) -> str: def _is_rate_limited(output: str) -> bool: """Detect rate-limit errors in command output.""" lower = output.lower() - return any(pattern in lower for pattern in ( - "rate limit exceeded", - "rate limit", - "too many requests", - "429", - "api rate limit", - "secondary rate limit", - )) + return any( + pattern in lower + for pattern in ( + "rate limit exceeded", + "rate limit", + "too many requests", + "429", + "api rate limit", + "secondary rate limit", + ) + ) def _make_file_read_tool(workspace_path: str) -> Any: @@ -430,7 +436,9 @@ async def grep(pattern: str, path: str = ".", include: str = "") -> str: try: proc = await _aio.create_subprocess_exec( - *cmd, stdout=_aio.subprocess.PIPE, stderr=_aio.subprocess.PIPE, + *cmd, + stdout=_aio.subprocess.PIPE, + stderr=_aio.subprocess.PIPE, ) stdout, stderr = await _aio.wait_for(proc.communicate(), timeout=30) out = stdout.decode(errors="replace")[:10000] @@ -461,6 +469,7 @@ async def glob(pattern: str) -> str: Newline-separated list of matching file paths relative to workspace. 
""" import fnmatch + matches = [] for p in sorted(ws_root.rglob("*")): if p.is_file(): @@ -499,9 +508,10 @@ async def web_fetch(url: str) -> str: Returns: The page content as text, or an error message. """ - import httpx from urllib.parse import urlparse + import httpx + parsed = urlparse(url) domain = parsed.hostname or "" @@ -524,10 +534,11 @@ async def web_fetch(url: str) -> str: if "text/html" in content_type: # Simple HTML tag stripping for readability import re - text = re.sub(r']*>.*?', '', text, flags=re.DOTALL) - text = re.sub(r']*>.*?', '', text, flags=re.DOTALL) - text = re.sub(r'<[^>]+>', ' ', text) - text = re.sub(r'\s+', ' ', text).strip() + + text = re.sub(r"]*>.*?", "", text, flags=re.DOTALL) + text = re.sub(r"]*>.*?", "", text, flags=re.DOTALL) + text = re.sub(r"<[^>]+>", " ", text) + text = re.sub(r"\s+", " ", text).strip() # Truncate very long responses if len(text) > 50000: @@ -672,7 +683,7 @@ def _make_llm(node_type: str) -> ChatOpenAI: llm_for_reflector = _make_llm("reflector") if config.llm_model_reflector else llm llm_for_reporter = _make_llm("reporter") if config.llm_model_reporter else llm llm_for_thinking = _make_llm("thinking") if config.llm_model_thinking else llm - llm_for_micro = _make_llm("micro_reasoning") if config.llm_model_micro_reasoning else llm + _llm_for_micro = _make_llm("micro_reasoning") if config.llm_model_micro_reasoning else llm # -- Tools -------------------------------------------------------------- # Create tool instances once — shared across node subsets. 
@@ -742,7 +753,9 @@ async def _reflector(state: SandboxState) -> dict[str, Any]: async def _reporter(state: SandboxState) -> dict[str, Any]: return await reporter_node( - state, llm_reporter, budget=budget, + state, + llm_reporter, + budget=budget, llm_reason=llm_executor_reason, tools=read_only_tools, ) @@ -754,7 +767,8 @@ async def _step_selector(state: SandboxState) -> dict[str, Any]: a targeted brief for the executor — what to do, what worked/failed before, and what to avoid. """ - from langchain_core.messages import SystemMessage as SM, HumanMessage as HM + from langchain_core.messages import HumanMessage as HM + from langchain_core.messages import SystemMessage as SM plan = state.get("plan", []) plan_steps = list(state.get("plan_steps", [])) @@ -796,13 +810,13 @@ async def _step_selector(state: SandboxState) -> dict[str, Any]: result_hint = "" if isinstance(_ps, dict) and _ps.get("result_summary"): result_hint = f" — {_ps['result_summary'][:100]}" - plan_summary.append(f" {marker} {i+1}. [{status}] {step[:80]}{result_hint}") + plan_summary.append(f" {marker} {i + 1}. [{status}] {step[:80]}{result_hint}") # Gather recent tool results (last 3 ToolMessages) recent_results = [] for m in reversed(messages[-10:]): - if hasattr(m, 'name') and getattr(m, 'type', '') == 'tool': - content = str(getattr(m, 'content', ''))[:300] + if hasattr(m, "name") and getattr(m, "type", "") == "tool": + content = str(getattr(m, "content", ""))[:300] recent_results.insert(0, f" [{m.name}] {content}") if len(recent_results) >= 3: break @@ -830,7 +844,7 @@ async def _step_selector(state: SandboxState) -> dict[str, Any]: Next step to execute: {next_step + 1}. {step_text} Recent tool results: -{chr(10).join(recent_results) if recent_results else '(none yet)'} +{chr(10).join(recent_results) if recent_results else "(none yet)"} WORKSPACE RULE: Each shell command starts fresh in /workspace. Bare `cd` has no effect. 
If the step involves a cloned repo, always write `cd repos/ && ` in the brief. @@ -843,10 +857,8 @@ async def _step_selector(state: SandboxState) -> dict[str, Any]: try: response = await llm.ainvoke([sys_msg, user_msg]) brief = response.content.strip() - usage = getattr(response, 'usage_metadata', None) or {} - budget.add_tokens( - usage.get('input_tokens', 0) + usage.get('output_tokens', 0) - ) + usage = getattr(response, "usage_metadata", None) or {} + budget.add_tokens(usage.get("input_tokens", 0) + usage.get("output_tokens", 0)) except Exception as e: logger.warning("StepSelector LLM call failed: %s — using default brief", e) brief = f"Execute step {next_step + 1}: {step_text}" @@ -863,6 +875,7 @@ async def _step_selector(state: SandboxState) -> dict[str, Any]: result["_plan_store"] = store if _DEBUG_PROMPTS: from sandbox_agent.context_builders import LLMCallCapture + result["_system_prompt"] = prompt[:10000] result["_prompt_messages"] = [ {"role": "system", "preview": "Step coordinator brief prompt"}, @@ -877,8 +890,10 @@ async def _step_selector(state: SandboxState) -> dict[str, Any]: def _make_safe_tool_wrapper(tool_node: ToolNode, label: str): """Create a safe tool execution wrapper for a ToolNode.""" + async def _safe(state: SandboxState) -> dict[str, Any]: from langchain_core.messages import ToolMessage + try: return await tool_node.ainvoke(state) except (GraphInterrupt, KeyboardInterrupt, SystemExit): @@ -892,18 +907,23 @@ async def _safe(state: SandboxState) -> dict[str, Any]: for tc in getattr(last, "tool_calls", []): tc_id = tc.get("id", "unknown") if isinstance(tc, dict) else getattr(tc, "id", "unknown") tc_name = tc.get("name", "unknown") if isinstance(tc, dict) else getattr(tc, "name", "unknown") - error_msgs.append(ToolMessage( - content=f"Tool error: {exc}", - tool_call_id=tc_id, - name=tc_name, - )) + error_msgs.append( + ToolMessage( + content=f"Tool error: {exc}", + tool_call_id=tc_id, + name=tc_name, + ) + ) if not error_msgs: - 
error_msgs.append(ToolMessage( - content=f"Tool execution failed: {exc}", - tool_call_id="error", - name="unknown", - )) + error_msgs.append( + ToolMessage( + content=f"Tool execution failed: {exc}", + tool_call_id="error", + name="unknown", + ) + ) return {"messages": error_msgs} + return _safe _reporter_tool_node = ToolNode(read_only_tools) diff --git a/a2a/sandbox_agent/src/sandbox_agent/graph_card.py b/a2a/sandbox_agent/src/sandbox_agent/graph_card.py index 896e7b9d..98d2bacb 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/graph_card.py +++ b/a2a/sandbox_agent/src/sandbox_agent/graph_card.py @@ -144,10 +144,7 @@ }, "thinking": { "category": "reasoning", - "description": ( - "Intermediate thinking iteration from a reasoning LLM " - "(bare model, no tools)." - ), + "description": ("Intermediate thinking iteration from a reasoning LLM (bare model, no tools)."), "langgraph_nodes": ["planner", "executor", "reflector"], "has_llm_call": True, "fields": { @@ -185,10 +182,7 @@ }, "micro_reasoning": { "category": "reasoning", - "description": ( - "Executor's intermediate LLM reasoning within a single plan step " - "(tool-loop iteration)." - ), + "description": ("Executor's intermediate LLM reasoning within a single plan step (tool-loop iteration)."), "langgraph_nodes": ["executor"], "has_llm_call": True, "fields": { @@ -275,9 +269,7 @@ # ── Decision ────────────────────────────────────────────────────── "reflector_decision": { "category": "decision", - "description": ( - "Reflector reviewed execution and decided: continue, replan, or done." - ), + "description": ("Reflector reviewed execution and decided: continue, replan, or done."), "langgraph_nodes": ["reflector"], "has_llm_call": True, "fields": { @@ -333,9 +325,7 @@ "debug_fields": { "logic": { "type": "str", - "description": ( - "Routing logic: checks plan_status to decide resume vs plan." 
- ), + "description": ("Routing logic: checks plan_status to decide resume vs plan."), }, }, }, @@ -399,9 +389,7 @@ }, "node_transition": { "category": "meta", - "description": ( - "Internal marker indicating a graph-level transition between nodes." - ), + "description": ("Internal marker indicating a graph-level transition between nodes."), "langgraph_nodes": [], "has_llm_call": False, "fields": { @@ -420,8 +408,7 @@ "hitl_request": { "category": "interaction", "description": ( - "Human-in-the-loop approval request — the executor is pausing " - "to ask the user before proceeding." + "Human-in-the-loop approval request — the executor is pausing to ask the user before proceeding." ), "langgraph_nodes": ["executor"], "has_llm_call": False, @@ -462,38 +449,24 @@ #: Human-readable description for each node in the compiled graph. TOPOLOGY_NODE_DESCRIPTIONS: Dict[str, str] = { - "router": ( - "Entry node — decides whether to create a new plan or resume execution " - "of an existing plan." - ), + "router": ("Entry node — decides whether to create a new plan or resume execution of an existing plan."), "planner": ( - "Creates or revises a multi-step plan using an LLM with planning tools " - "(glob, grep, file_read, file_write)." - ), - "planner_tools": ( - "Executes tool calls issued by the planner (workspace inspection, " - "plan persistence)." - ), - "step_selector": ( - "Picks the next plan step to execute and prepares the executor context." + "Creates or revises a multi-step plan using an LLM with planning tools (glob, grep, file_read, file_write)." ), + "planner_tools": ("Executes tool calls issued by the planner (workspace inspection, plan persistence)."), + "step_selector": ("Picks the next plan step to execute and prepares the executor context."), "executor": ( "Executes the current plan step using an LLM with the full tool suite " "(shell, files, grep, glob, web_fetch, explore, delegate)." ), - "tools": ( - "Executes tool calls issued by the executor." 
- ), + "tools": ("Executes tool calls issued by the executor."), "reflector": ( "Reviews execution results and decides whether to continue, replan, " "or declare done. Uses read-only tools (glob, grep, file_read)." ), - "reflector_tools": ( - "Executes read-only tool calls issued by the reflector for verification." - ), + "reflector_tools": ("Executes read-only tool calls issued by the reflector for verification."), "reflector_route": ( - "Pass-through node that routes the reflector's decision to the next node " - "(reporter, step_selector, or planner)." + "Pass-through node that routes the reflector's decision to the next node (reporter, step_selector, or planner)." ), "reporter": ( "Generates the final user-facing answer by synthesizing all execution " @@ -535,11 +508,7 @@ def build_graph_card( graph = compiled.get_graph() # ── Nodes ───────────────────────────────────────────────────────── - raw_nodes: List[str] = [ - node_id - for node_id in graph.nodes - if node_id not in ("__start__", "__end__") - ] + raw_nodes: List[str] = [node_id for node_id in graph.nodes if node_id not in ("__start__", "__end__")] nodes: Dict[str, Dict[str, str]] = {} for node_id in raw_nodes: nodes[node_id] = { diff --git a/a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py b/a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py index ff9b35ca..2228d924 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py +++ b/a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py @@ -39,16 +39,16 @@ # ABI v1 access flags (13 flags) _ACCESS_FS_V1 = ( - (1 << 0) # EXECUTE - | (1 << 1) # WRITE_FILE - | (1 << 2) # READ_FILE - | (1 << 3) # READ_DIR - | (1 << 4) # REMOVE_DIR - | (1 << 5) # REMOVE_FILE - | (1 << 6) # MAKE_CHAR - | (1 << 7) # MAKE_DIR - | (1 << 8) # MAKE_REG - | (1 << 9) # MAKE_SOCK + (1 << 0) # EXECUTE + | (1 << 1) # WRITE_FILE + | (1 << 2) # READ_FILE + | (1 << 3) # READ_DIR + | (1 << 4) # REMOVE_DIR + | (1 << 5) # REMOVE_FILE + | (1 << 6) # MAKE_CHAR + | (1 << 7) # 
MAKE_DIR + | (1 << 8) # MAKE_REG + | (1 << 9) # MAKE_SOCK | (1 << 10) # MAKE_FIFO | (1 << 11) # MAKE_BLOCK | (1 << 12) # MAKE_SYM @@ -62,9 +62,9 @@ # Read-only subset (for ro_paths) ACCESS_FS_READ_ONLY = ( - (1 << 0) # EXECUTE - | (1 << 2) # READ_FILE - | (1 << 3) # READ_DIR + (1 << 0) # EXECUTE + | (1 << 2) # READ_FILE + | (1 << 3) # READ_DIR ) _libc = ctypes.CDLL("libc.so.6", use_errno=True) diff --git a/a2a/sandbox_agent/src/sandbox_agent/landlock_probe.py b/a2a/sandbox_agent/src/sandbox_agent/landlock_probe.py index 74f46888..8b455dcc 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/landlock_probe.py +++ b/a2a/sandbox_agent/src/sandbox_agent/landlock_probe.py @@ -84,9 +84,7 @@ def probe_landlock() -> int: """) # Find the package root so the child can import sandbox_agent - package_src = str( - __import__("pathlib").Path(__file__).resolve().parent.parent - ) + package_src = str(__import__("pathlib").Path(__file__).resolve().parent.parent) result = subprocess.run( [sys.executable, "-c", child_script], diff --git a/a2a/sandbox_agent/src/sandbox_agent/observability.py b/a2a/sandbox_agent/src/sandbox_agent/observability.py index 259be8d2..351f0623 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/observability.py +++ b/a2a/sandbox_agent/src/sandbox_agent/observability.py @@ -15,7 +15,7 @@ import logging import os from contextvars import ContextVar -from typing import Any, Optional +from typing import Any logger = logging.getLogger(__name__) @@ -27,7 +27,7 @@ # ContextVar to pass root span from middleware to agent code. # This allows execute() to access the middleware-created root span # even though trace.get_current_span() would return a child span. 
-_root_span_var: ContextVar = ContextVar('root_span', default=None) +_root_span_var: ContextVar = ContextVar("root_span", default=None) def get_root_span(): @@ -44,7 +44,8 @@ def get_root_span(): # OpenInference semantic conventions try: - from openinference.semconv.trace import SpanAttributes, OpenInferenceSpanKindValues + from openinference.semconv.trace import OpenInferenceSpanKindValues, SpanAttributes + OPENINFERENCE_AVAILABLE = True except ImportError: OPENINFERENCE_AVAILABLE = False @@ -54,6 +55,7 @@ def get_root_span(): def _get_otlp_exporter(endpoint: str): """Get HTTP OTLP exporter.""" from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter + if not endpoint.endswith("/v1/traces"): endpoint = endpoint.rstrip("/") + "/v1/traces" return OTLPSpanExporter(endpoint=endpoint) @@ -76,8 +78,7 @@ def setup_observability() -> bool: if not otlp_endpoint: logger.warning( - "OTEL_EXPORTER_OTLP_ENDPOINT not set — tracing disabled. " - "Set this env var to enable OpenTelemetry tracing." + "OTEL_EXPORTER_OTLP_ENDPOINT not set — tracing disabled. Set this env var to enable OpenTelemetry tracing." ) return False @@ -91,13 +92,13 @@ def setup_observability() -> bool: def _setup_observability_inner(service_name: str, namespace: str, otlp_endpoint: str) -> bool: """Internal setup — may raise. 
Called by setup_observability() which catches all errors.""" from opentelemetry import trace - from opentelemetry.sdk.trace import TracerProvider - from opentelemetry.sdk.trace.export import BatchSpanProcessor - from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION + from opentelemetry.baggage.propagation import W3CBaggagePropagator from opentelemetry.propagate import set_global_textmap from opentelemetry.propagators.composite import CompositePropagator + from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION, Resource + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator - from opentelemetry.baggage.propagation import W3CBaggagePropagator logger.info("=" * 60) logger.info("Setting up OpenTelemetry observability") @@ -111,45 +112,51 @@ def _setup_observability_inner(service_name: str, namespace: str, otlp_endpoint: # Create resource with service and MLflow attributes. # Resource attributes are STATIC and apply to ALL spans/traces. 
# See: https://mlflow.org/docs/latest/genai/tracing/opentelemetry/ - resource = Resource(attributes={ - # Standard OTEL service attributes - SERVICE_NAME: service_name, - SERVICE_VERSION: AGENT_VERSION, - "service.namespace": namespace, - "k8s.namespace.name": namespace, - # MLflow static metadata (applies to all traces) - "mlflow.traceName": AGENT_NAME, - "mlflow.source": service_name, - # GenAI static attributes - "gen_ai.agent.name": AGENT_NAME, - "gen_ai.agent.version": AGENT_VERSION, - "gen_ai.system": AGENT_FRAMEWORK, - }) + resource = Resource( + attributes={ + # Standard OTEL service attributes + SERVICE_NAME: service_name, + SERVICE_VERSION: AGENT_VERSION, + "service.namespace": namespace, + "k8s.namespace.name": namespace, + # MLflow static metadata (applies to all traces) + "mlflow.traceName": AGENT_NAME, + "mlflow.source": service_name, + # GenAI static attributes + "gen_ai.agent.name": AGENT_NAME, + "gen_ai.agent.version": AGENT_VERSION, + "gen_ai.system": AGENT_FRAMEWORK, + } + ) # Create and configure tracer provider tracer_provider = TracerProvider(resource=resource) - tracer_provider.add_span_processor( - BatchSpanProcessor(_get_otlp_exporter(otlp_endpoint)) - ) + tracer_provider.add_span_processor(BatchSpanProcessor(_get_otlp_exporter(otlp_endpoint))) trace.set_tracer_provider(tracer_provider) # Auto-instrument LangChain with OpenInference try: from openinference.instrumentation.langchain import LangChainInstrumentor + LangChainInstrumentor().instrument() logger.info("LangChain instrumented with OpenInference") except ImportError: logger.warning("openinference-instrumentation-langchain not available") # Configure W3C Trace Context propagation - set_global_textmap(CompositePropagator([ - TraceContextTextMapPropagator(), - W3CBaggagePropagator(), - ])) + set_global_textmap( + CompositePropagator( + [ + TraceContextTextMapPropagator(), + W3CBaggagePropagator(), + ] + ) + ) # Instrument OpenAI for GenAI semantic conventions try: from 
opentelemetry.instrumentation.openai import OpenAIInstrumentor + OpenAIInstrumentor().instrument() logger.info("OpenAI instrumented with GenAI semantic conventions") except ImportError: @@ -206,15 +213,16 @@ def create_tracing_middleware(): app = server.build() app.add_middleware(BaseHTTPMiddleware, dispatch=create_tracing_middleware()) """ + from opentelemetry import context + from opentelemetry.trace import SpanKind, Status, StatusCode from starlette.requests import Request from starlette.responses import Response, StreamingResponse - from opentelemetry import trace, context - from opentelemetry.trace import Status, StatusCode, SpanKind async def tracing_middleware(request: Request, call_next): # Skip non-API paths (health checks, agent card, etc.) if request.url.path in [ - "/health", "/ready", + "/health", + "/ready", "/.well-known/agent-card.json", "/.well-known/agent-graph-card.json", ]: @@ -311,9 +319,7 @@ async def tracing_middleware(request: Request, call_next): # Try to capture response for output attributes. # This only works for non-streaming responses. 
- if isinstance(response, Response) and not isinstance( - response, StreamingResponse - ): + if isinstance(response, Response) and not isinstance(response, StreamingResponse): # Read response body — we MUST recreate response after _chunks: list[bytes] = [] async for chunk in response.body_iterator: @@ -331,15 +337,9 @@ async def tracing_middleware(request: Request, call_next): if parts: output_text = parts[0].get("text", "") if output_text: - span.set_attribute( - "gen_ai.completion", output_text[:1000] - ) - span.set_attribute( - "output.value", output_text[:1000] - ) - span.set_attribute( - "mlflow.spanOutputs", output_text[:1000] - ) + span.set_attribute("gen_ai.completion", output_text[:1000]) + span.set_attribute("output.value", output_text[:1000]) + span.set_attribute("mlflow.spanOutputs", output_text[:1000]) except Exception as e: logger.debug("Could not parse response body: %s", e) diff --git a/a2a/sandbox_agent/src/sandbox_agent/permissions.py b/a2a/sandbox_agent/src/sandbox_agent/permissions.py index 9e3a8190..7810c5ac 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/permissions.py +++ b/a2a/sandbox_agent/src/sandbox_agent/permissions.py @@ -169,9 +169,7 @@ def _resolve_workspace(settings: dict[str, Any]) -> str: return re.sub(r"/\$\{[^}]+\}$", "", raw) @staticmethod - def _parse_rules( - raw_rules: list[str], workspace: str - ) -> list[tuple[str, str]]: + def _parse_rules(raw_rules: list[str], workspace: str) -> list[tuple[str, str]]: """Parse rule strings into ``(operation_type, glob_pattern)`` pairs. ``${WORKSPACE}`` inside a rule body is expanded to *workspace*. @@ -243,7 +241,7 @@ def _match_shell(pattern: str, operation: str) -> bool: if colon_idx == -1: return False prefix = pattern[:colon_idx] - glob_part = pattern[colon_idx + 1:] + glob_part = pattern[colon_idx + 1 :] if not operation: return False @@ -257,7 +255,7 @@ def _match_shell(pattern: str, operation: str) -> bool: return False # What comes after the prefix (may be empty). 
- remainder = operation[len(prefix):] + remainder = operation[len(prefix) :] # If there is a remainder, it must be separated by a space or be # empty (exact match). This prevents "grep" matching "grepping". @@ -296,7 +294,7 @@ def check_interpreter_bypass(cls, operation: str) -> list[str]: while i < len(parts): if parts[i] in cls._EXEC_FLAGS and i + 1 < len(parts): # Everything after the flag is the inline command. - inline = " ".join(parts[i + 1:]) + inline = " ".join(parts[i + 1 :]) # Strip surrounding quotes if present. if len(inline) >= 2 and inline[0] in ('"', "'") and inline[-1] == inline[0]: inline = inline[1:-1] @@ -331,10 +329,10 @@ def _match_structured(pattern: str, operation: str) -> bool: return False p_action = pattern[:p_colon] - p_path_glob = pattern[p_colon + 1:] + p_path_glob = pattern[p_colon + 1 :] o_action = operation[:o_colon] - o_path = operation[o_colon + 1:] + o_path = operation[o_colon + 1 :] if p_action != o_action: return False diff --git a/a2a/sandbox_agent/src/sandbox_agent/plan_store.py b/a2a/sandbox_agent/src/sandbox_agent/plan_store.py index 47501753..456a768c 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/plan_store.py +++ b/a2a/sandbox_agent/src/sandbox_agent/plan_store.py @@ -108,14 +108,9 @@ def add_steps( raise ValueError(f"Only replanner can add steps, got creator={creator}") steps = plan.get("steps", {}) - non_terminal = [ - k for k, s in steps.items() - if s.get("status") not in _TERMINAL - ] + non_terminal = [k for k, s in steps.items() if s.get("status") not in _TERMINAL] if non_terminal: - raise ValueError( - f"Cannot add steps: steps {non_terminal} are still active" - ) + raise ValueError(f"Cannot add steps: steps {non_terminal} are still active") new_plan = _deep_copy(plan) next_idx = max((int(k) for k in steps), default=0) + 1 @@ -143,7 +138,10 @@ def add_steps( new_plan["steps"][first_new]["subplans"]["a"]["status"] = "running" logger.info( - "Added %d steps (start=%s) by %s", len(new_steps), first_new, creator, + 
"Added %d steps (start=%s) by %s", + len(new_steps), + first_new, + creator, ) return new_plan @@ -167,10 +165,7 @@ def add_alternative_subplan( next_key = chr(ord("a") + len(existing_keys)) step["subplans"][next_key] = { - "substeps": { - str(i + 1): {"description": desc, "status": "pending"} - for i, desc in enumerate(substeps) - }, + "substeps": {str(i + 1): {"description": desc, "status": "pending"} for i, desc in enumerate(substeps)}, "status": "running", "created_by": "replanner", } @@ -179,7 +174,9 @@ def add_alternative_subplan( logger.info( "Created alternative subplan '%s' for step %s (%d substeps)", - next_key, step_key, len(substeps), + next_key, + step_key, + len(substeps), ) return new_plan, next_key @@ -293,10 +290,7 @@ def all_terminal(plan: dict[str, Any]) -> bool: def to_flat_plan(plan: dict[str, Any]) -> list[str]: """Convert to flat list of step descriptions (backward compat).""" - return [ - plan["steps"][k]["description"] - for k in sorted(plan.get("steps", {}), key=int) - ] + return [plan["steps"][k]["description"] for k in sorted(plan.get("steps", {}), key=int)] def to_flat_plan_steps(plan: dict[str, Any]) -> list[dict[str, Any]]: @@ -307,15 +301,17 @@ def to_flat_plan_steps(plan: dict[str, Any]) -> list[dict[str, Any]]: active = step.get("active_subplan", "a") subplan = step.get("subplans", {}).get(active, {}) alt_count = len(step.get("subplans", {})) - 1 # alternatives (excl. 
original) - result.append({ - "index": int(key) - 1, # 0-based for compat - "description": step["description"], - "status": step["status"], - "active_subplan": active, - "alternative_count": alt_count, - "substeps": list(subplan.get("substeps", {}).values()), - "created_by": subplan.get("created_by", "planner"), - }) + result.append( + { + "index": int(key) - 1, # 0-based for compat + "description": step["description"], + "status": step["status"], + "active_subplan": active, + "alternative_count": alt_count, + "substeps": list(subplan.get("substeps", {}).values()), + "created_by": subplan.get("created_by", "planner"), + } + ) return result @@ -327,4 +323,5 @@ def to_flat_plan_steps(plan: dict[str, Any]) -> list[dict[str, Any]]: def _deep_copy(d: dict) -> dict: """Fast deep copy for JSON-compatible dicts.""" import json + return json.loads(json.dumps(d)) diff --git a/a2a/sandbox_agent/src/sandbox_agent/reasoning.py b/a2a/sandbox_agent/src/sandbox_agent/reasoning.py index dcd471fe..b75d6903 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/reasoning.py +++ b/a2a/sandbox_agent/src/sandbox_agent/reasoning.py @@ -36,8 +36,8 @@ from langchain_core.messages import AIMessage, SystemMessage, ToolMessage -from sandbox_agent.budget import AgentBudget from sandbox_agent import plan_store as ps +from sandbox_agent.budget import AgentBudget # openai raises APIStatusError for non-2xx responses (e.g. 402 from the budget proxy) try: @@ -52,15 +52,13 @@ def _is_budget_exceeded_error(exc: Exception) -> bool: return exc.status_code == 402 return "budget_exceeded" in str(exc).lower() or "402" in str(exc) + logger = logging.getLogger(__name__) # Sentinel text returned by the executor when all tool calls in a step have # already been executed (dedup logic). This is an internal coordination # message and must never appear in user-visible output. -_DEDUP_SENTINEL = ( - "Step completed — all requested tool calls " - "have been executed and results are available." 
-) +_DEDUP_SENTINEL = "Step completed — all requested tool calls have been executed and results are available." import os as _os @@ -69,10 +67,17 @@ def _is_budget_exceeded_error(exc: Exception) -> bool: _DEBUG_PROMPTS = _os.environ.get("SANDBOX_DEBUG_PROMPTS", "1") == "1" # Messages that trigger plan resumption rather than replanning. -_CONTINUE_PHRASES = frozenset({ - "continue", "continue on the plan", "go on", "proceed", - "keep going", "next", "carry on", -}) +_CONTINUE_PHRASES = frozenset( + { + "continue", + "continue on the plan", + "go on", + "proceed", + "keep going", + "next", + "carry on", + } +) # --------------------------------------------------------------------------- @@ -82,15 +87,15 @@ def _is_budget_exceeded_error(exc: Exception) -> bool: class PlanStep(TypedDict, total=False): """A single step in the plan with status tracking.""" + index: int description: str - status: str # "pending" | "running" | "done" | "failed" | "skipped" + status: str # "pending" | "running" | "done" | "failed" | "skipped" tool_calls: list[str] result_summary: str iteration_added: int - def _summarize_bound_tools(llm_with_tools: Any) -> list[dict[str, Any]]: """Extract bound tool schemas from a LangChain RunnableBinding for debug display. 
@@ -109,26 +114,28 @@ def _summarize_bound_tools(llm_with_tools: Any) -> list[dict[str, Any]]: for t in tools: if isinstance(t, dict): # Already in OpenAI format - result.append({ - "name": t.get("function", {}).get("name", "?"), - "description": t.get("function", {}).get("description", "")[:200], - "parameters": t.get("function", {}).get("parameters", {}), - }) + result.append( + { + "name": t.get("function", {}).get("name", "?"), + "description": t.get("function", {}).get("description", "")[:200], + "parameters": t.get("function", {}).get("parameters", {}), + } + ) else: # LangChain tool object - result.append({ - "name": getattr(t, "name", "?"), - "description": (getattr(t, "description", "") or "")[:200], - "parameters": getattr(t, "args_schema", {}) if hasattr(t, "args_schema") else {}, - }) + result.append( + { + "name": getattr(t, "name", "?"), + "description": (getattr(t, "description", "") or "")[:200], + "parameters": getattr(t, "args_schema", {}) if hasattr(t, "args_schema") else {}, + } + ) return result except Exception: return [] -def _make_plan_steps( - descriptions: list[str], iteration: int = 0 -) -> list[PlanStep]: +def _make_plan_steps(descriptions: list[str], iteration: int = 0) -> list[PlanStep]: """Convert a list of step descriptions into PlanStep dicts.""" return [ PlanStep( @@ -171,14 +178,14 @@ def _safe_format(template: str, **kwargs: Any) -> str: # Matches: tool_name(key="value", key2="value2") # Handles: shell("ls") (positional), shell(command="ls") (keyword) _TOOL_CALL_RE = re.compile( - r'(\w+)\(([^)]*)\)', + r"(\w+)\(([^)]*)\)", ) # Matches Llama 4 Scout format: [label, tool_name]{"key": "value"} # Examples: [clone_repo, shell]{"command": "git clone ..."} # [rca:ci, delegate]{"task": "analyze CI logs"} _LABEL_TOOL_JSON_RE = re.compile( - r'\[[^\]]*,\s*(\w+)\]\s*(\{[^}]+\})', + r"\[[^\]]*,\s*(\w+)\]\s*(\{[^}]+\})", ) # Known tool names — only parse calls for tools we actually have @@ -258,12 +265,14 @@ def 
parse_text_tool_calls(content: str) -> list[dict[str, Any]]: try: args = json.loads(json_str) if isinstance(args, dict): - calls.append({ - "name": tool_name, - "args": args, - "id": f"text-{uuid.uuid4().hex[:12]}", - "type": "tool_call", - }) + calls.append( + { + "name": tool_name, + "args": args, + "id": f"text-{uuid.uuid4().hex[:12]}", + "type": "tool_call", + } + ) except json.JSONDecodeError: continue @@ -279,12 +288,14 @@ def parse_text_tool_calls(content: str) -> list[dict[str, Any]]: continue args = _parse_kwargs(args_str, tool_name) - calls.append({ - "name": tool_name, - "args": args, - "id": f"text-{uuid.uuid4().hex[:12]}", - "type": "tool_call", - }) + calls.append( + { + "name": tool_name, + "args": args, + "id": f"text-{uuid.uuid4().hex[:12]}", + "type": "tool_call", + } + ) return calls @@ -304,10 +315,7 @@ def maybe_patch_tool_calls(response: AIMessage) -> AIMessage: content = response.content if isinstance(content, list): # Multi-part content — extract text parts - content = " ".join( - b.get("text", "") for b in content - if isinstance(b, dict) and b.get("type") == "text" - ) + content = " ".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text") parsed = parse_text_tool_calls(content) if not parsed: @@ -325,6 +333,7 @@ def maybe_patch_tool_calls(response: AIMessage) -> AIMessage: tool_calls=parsed, ) + # Default budget — used when no explicit budget is passed. 
DEFAULT_BUDGET = AgentBudget() @@ -334,9 +343,15 @@ def maybe_patch_tool_calls(response: AIMessage) -> AIMessage: # --------------------------------------------------------------------------- from sandbox_agent.prompts import ( - PLANNER_SYSTEM as _PLANNER_SYSTEM, EXECUTOR_SYSTEM as _EXECUTOR_SYSTEM, +) +from sandbox_agent.prompts import ( + PLANNER_SYSTEM as _PLANNER_SYSTEM, +) +from sandbox_agent.prompts import ( REFLECTOR_SYSTEM as _REFLECTOR_SYSTEM, +) +from sandbox_agent.prompts import ( REPORTER_SYSTEM as _REPORTER_SYSTEM, ) @@ -356,11 +371,9 @@ def _intercept_respond_to_user(response: Any, node_name: str) -> AIMessage | Non return None tool_names = [ - tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?") - for tc in response.tool_calls + tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?") for tc in response.tool_calls ] - logger.info("%s called tools: %s", node_name, tool_names, - extra={"node": node_name.lower()}) + logger.info("%s called tools: %s", node_name, tool_names, extra={"node": node_name.lower()}) for tc in response.tool_calls: name = tc.get("name", "") if isinstance(tc, dict) else getattr(tc, "name", "") @@ -368,7 +381,9 @@ def _intercept_respond_to_user(response: Any, node_name: str) -> AIMessage | Non args = tc.get("args", {}) if isinstance(tc, dict) else getattr(tc, "args", {}) response_text = args.get("response", "") logger.info( - "%s escaped via respond_to_user (%d chars)", node_name, len(response_text), + "%s escaped via respond_to_user (%d chars)", + node_name, + len(response_text), extra={"node": node_name.lower()}, ) # Return a clean AIMessage — no tool_calls so the graph @@ -402,10 +417,7 @@ async def router_node(state: dict[str, Any]) -> dict[str, Any]: if messages: content = getattr(messages[-1], "content", "") if isinstance(content, list): - last_text = " ".join( - b.get("text", "") for b in content - if isinstance(b, dict) and b.get("type") == "text" - ) + last_text = " 
".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text") else: last_text = str(content) last_text_lower = last_text.strip().lower() @@ -421,9 +433,15 @@ async def router_node(state: dict[str, Any]) -> dict[str, Any]: plan_steps[current_step] = {**plan_steps[current_step], "status": "running"} logger.info( "Router: RESUME plan at step %d/%d (plan_status=%s)", - current_step + 1, len(plan_steps), plan_status, - extra={"session_id": state.get("context_id", ""), "node": "router", - "current_step": current_step, "plan_status": plan_status}, + current_step + 1, + len(plan_steps), + plan_status, + extra={ + "session_id": state.get("context_id", ""), + "node": "router", + "current_step": current_step, + "plan_status": plan_status, + }, ) return { "_route": "resume", @@ -435,9 +453,9 @@ async def router_node(state: dict[str, Any]) -> dict[str, Any]: # Reset replan_count — this is a user-driven replan, not an agent loop logger.info( "Router: REPLAN — new message while plan active (plan_status=%s, steps=%d)", - plan_status, len(plan_steps), - extra={"session_id": state.get("context_id", ""), "node": "router", - "plan_status": plan_status}, + plan_status, + len(plan_steps), + extra={"session_id": state.get("context_id", ""), "node": "router", "plan_status": plan_status}, ) return { "_route": "replan", @@ -448,9 +466,11 @@ async def router_node(state: dict[str, Any]) -> dict[str, Any]: } else: # New: no active plan - logger.info("Router: NEW plan (plan_status=%s)", plan_status, - extra={"session_id": state.get("context_id", ""), "node": "router", - "plan_status": plan_status}) + logger.info( + "Router: NEW plan (plan_status=%s)", + plan_status, + extra={"session_id": state.get("context_id", ""), "node": "router", "plan_status": plan_status}, + ) return { "_route": "new", "plan_status": "executing", @@ -478,10 +498,7 @@ def _is_trivial_text_request(messages: list) -> bool: last = messages[-1] content = getattr(last, "content", "") if 
isinstance(content, list): - content = " ".join( - b.get("text", "") for b in content - if isinstance(b, dict) and b.get("type") == "text" - ) + content = " ".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text") text = str(content).strip().lower() if not text: return False @@ -520,9 +537,16 @@ async def planner_node( # Fast-path: trivial text-only requests skip the planner LLM call entirely if iteration == 0 and not prev_plan_steps and _is_trivial_text_request(messages): - logger.info("Fast-path: trivial text request — single-step plan, no LLM call", - extra={"session_id": state.get("context_id", ""), "node": "planner", - "iteration": 0, "step_count": 1, "plan_version": 1}) + logger.info( + "Fast-path: trivial text request — single-step plan, no LLM call", + extra={ + "session_id": state.get("context_id", ""), + "node": "planner", + "iteration": 0, + "step_count": 1, + "plan_version": 1, + }, + ) trivial_steps = _make_plan_steps(["Respond to the user."], iteration=0) store = ps.create_plan(["Respond to the user."], creator="planner") return { @@ -545,7 +569,7 @@ async def planner_node( desc = prev_ps.get("description", "") status = prev_ps.get("status", "pending").upper() result = prev_ps.get("result_summary", "") - line = f" {idx+1}. [{status}] {desc}" + line = f" {idx + 1}. [{status}] {desc}" if result: line += f" — {result[:150]}" context_parts.append(line) @@ -560,7 +584,7 @@ async def planner_node( context_parts.append("Original plan:") for i, step in enumerate(original_plan): status = "DONE" if i < current_step else "PENDING" - context_parts.append(f" {i+1}. [{status}] {step}") + context_parts.append(f" {i + 1}. [{status}] {step}") context_parts.append(f"Progress: {current_step}/{len(original_plan)} steps completed.") context_parts.append("") @@ -590,9 +614,7 @@ async def planner_node( context_parts.append(f" Step {i}: {result}") context_parts.append("") - context_parts.append( - "Adjust the plan for remaining work. 
Do NOT repeat steps that already succeeded." - ) + context_parts.append("Adjust the plan for remaining work. Do NOT repeat steps that already succeeded.") system_content = _PLANNER_SYSTEM if context_parts: @@ -609,15 +631,19 @@ async def planner_node( try: response, planner_capture = await invoke_llm( - llm, plan_messages, - node="planner", session_id=state.get("context_id", ""), + llm, + plan_messages, + node="planner", + session_id=state.get("context_id", ""), workspace_path=state.get("workspace_path", "/workspace"), ) except Exception as exc: if _is_budget_exceeded_error(exc): - logger.warning("Budget exceeded in planner (402 from proxy): %s", exc, - extra={"session_id": state.get("context_id", ""), "node": "planner", - "iteration": iteration}) + logger.warning( + "Budget exceeded in planner (402 from proxy): %s", + exc, + extra={"session_id": state.get("context_id", ""), "node": "planner", "iteration": iteration}, + ) return { "messages": [AIMessage(content=f"Budget exceeded: {exc}")], "done": True, @@ -627,14 +653,14 @@ async def planner_node( prompt_tokens = planner_capture.prompt_tokens completion_tokens = planner_capture.completion_tokens - model_name = planner_capture.model + _model_name = planner_capture.model budget.add_tokens(prompt_tokens + completion_tokens) # Check for respond_to_user escape tool (needed for Llama 4 Scout). 
escaped = _intercept_respond_to_user(response, "Planner") if escaped is not None: response = escaped - elif getattr(response, 'tool_calls', None): + elif getattr(response, "tool_calls", None): # Non-escape tools — pass through for graph tool execution return { "messages": [response], @@ -648,11 +674,20 @@ async def planner_node( new_plan_steps = _make_plan_steps(plan, iteration=iteration) store = ps.create_plan(plan, creator="planner" if iteration == 0 else "replanner") - logger.info("Planner produced %d steps (iteration %d, version %d): %s", - len(plan), iteration, plan_version, plan, - extra={"session_id": state.get("context_id", ""), "node": "planner", - "iteration": iteration, "step_count": len(plan), - "plan_version": plan_version}) + logger.info( + "Planner produced %d steps (iteration %d, version %d): %s", + len(plan), + iteration, + plan_version, + plan, + extra={ + "session_id": state.get("context_id", ""), + "node": "planner", + "iteration": iteration, + "step_count": len(plan), + "plan_version": plan_version, + }, + ) # On replan, preserve completed steps — don't restart from step 0. # Find the first non-done step in the NEW plan to continue from. 
@@ -666,9 +701,12 @@ async def planner_node( for i in range(start_step): if i < len(new_plan_steps): new_plan_steps[i] = {**new_plan_steps[i], "status": "done"} - logger.info("Replan: preserving %d done steps, starting at step %d", - start_step, start_step + 1, - extra={"session_id": state.get("context_id", ""), "node": "planner"}) + logger.info( + "Replan: preserving %d done steps, starting at step %d", + start_step, + start_step + 1, + extra={"session_id": state.get("context_id", ""), "node": "planner"}, + ) else: start_step = 0 @@ -687,8 +725,9 @@ async def planner_node( } -MAX_THINK_ACT_CYCLES = int(_os.environ.get("SANDBOX_MAX_THINK_ACT_CYCLES", - _os.environ.get("SANDBOX_MAX_TOOL_CALLS_PER_STEP", "20"))) +MAX_THINK_ACT_CYCLES = int( + _os.environ.get("SANDBOX_MAX_THINK_ACT_CYCLES", _os.environ.get("SANDBOX_MAX_TOOL_CALLS_PER_STEP", "20")) +) THINKING_ITERATION_BUDGET = int(_os.environ.get("SANDBOX_THINKING_ITERATION_BUDGET", "2")) MAX_PARALLEL_TOOL_CALLS = int(_os.environ.get("SANDBOX_MAX_PARALLEL_TOOL_CALLS", "5")) @@ -724,19 +763,36 @@ async def executor_node( if tool_call_count >= MAX_THINK_ACT_CYCLES: logger.warning( "Step %d hit think-act cycle limit (%d/%d) — forcing step completion", - current_step, tool_call_count, MAX_THINK_ACT_CYCLES, - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step, "tool_call_count": tool_call_count}, + current_step, + tool_call_count, + MAX_THINK_ACT_CYCLES, + extra={ + "session_id": state.get("context_id", ""), + "node": "executor", + "current_step": current_step, + "tool_call_count": tool_call_count, + }, ) result: dict[str, Any] = { - "messages": [AIMessage(content=f"Step {current_step + 1} reached think-act cycle limit ({MAX_THINK_ACT_CYCLES}). Moving to reflection.")], + "messages": [ + AIMessage( + content=f"Step {current_step + 1} reached think-act cycle limit ({MAX_THINK_ACT_CYCLES}). Moving to reflection." 
+ ) + ], "current_step": current_step, "_tool_call_count": 0, "_budget_summary": budget.summary(), } if _DEBUG_PROMPTS: - result["_system_prompt"] = f"[Think-act cycle limit reached — no LLM call]\nStep {current_step + 1}: {tool_call_count}/{MAX_THINK_ACT_CYCLES} cycles" - result["_prompt_messages"] = [{"role": "system", "preview": f"Step {current_step + 1} cycle limit ({tool_call_count}/{MAX_THINK_ACT_CYCLES})"}] + result["_system_prompt"] = ( + f"[Think-act cycle limit reached — no LLM call]\nStep {current_step + 1}: {tool_call_count}/{MAX_THINK_ACT_CYCLES} cycles" + ) + result["_prompt_messages"] = [ + { + "role": "system", + "preview": f"Step {current_step + 1} cycle limit ({tool_call_count}/{MAX_THINK_ACT_CYCLES})", + } + ] result["_llm_response"] = "[no LLM call — cycle limit]" return result @@ -758,9 +814,11 @@ async def executor_node( # Check budget before making the LLM call (refresh from LiteLLM first) if budget.exceeded: - logger.warning("Budget exceeded in executor: %s", budget.exceeded_reason, - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step}) + logger.warning( + "Budget exceeded in executor: %s", + budget.exceeded_reason, + extra={"session_id": state.get("context_id", ""), "node": "executor", "current_step": current_step}, + ) result: dict[str, Any] = { "messages": [AIMessage(content=f"Budget exceeded: {budget.exceeded_reason}")], "current_step": current_step, @@ -789,17 +847,22 @@ async def executor_node( try: response, capture, sub_events = await invoke_with_tool_loop( - llm_with_tools, llm_reason, messages, - node="executor", session_id=state.get("context_id", ""), + llm_with_tools, + llm_reason, + messages, + node="executor", + session_id=state.get("context_id", ""), workspace_path=state.get("workspace_path", "/workspace"), thinking_budget=THINKING_ITERATION_BUDGET, max_parallel_tool_calls=MAX_PARALLEL_TOOL_CALLS, ) except Exception as exc: if _is_budget_exceeded_error(exc): - 
logger.warning("Budget exceeded in executor (402 from proxy): %s", exc, - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step}) + logger.warning( + "Budget exceeded in executor (402 from proxy): %s", + exc, + extra={"session_id": state.get("context_id", ""), "node": "executor", "current_step": current_step}, + ) return { "messages": [AIMessage(content=f"Budget exceeded: {exc}")], "current_step": current_step, @@ -816,7 +879,7 @@ async def executor_node( # Token usage and model from the capture (guaranteed to match what was sent) prompt_tokens = capture.prompt_tokens completion_tokens = capture.completion_tokens - model_name = capture.model + _model_name = capture.model budget.add_tokens(prompt_tokens + completion_tokens) # If the model returned text-based tool calls instead of structured @@ -834,9 +897,14 @@ async def executor_node( if len(response.tool_calls) > MAX_PARALLEL_TOOL_CALLS: logger.info( "Executor returned %d tool calls — keeping first %d (parallel limit)", - len(response.tool_calls), MAX_PARALLEL_TOOL_CALLS, - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step, "tool_call_count": tool_call_count}, + len(response.tool_calls), + MAX_PARALLEL_TOOL_CALLS, + extra={ + "session_id": state.get("context_id", ""), + "node": "executor", + "current_step": current_step, + "tool_call_count": tool_call_count, + }, ) response = AIMessage( content=response.content, @@ -848,14 +916,28 @@ async def executor_node( # log a warning. The reflector will catch the zero-tool-call pattern. 
if not response.tool_calls and pre_patch_content: text_hint = str(pre_patch_content).lower() - if any(kw in text_hint for kw in ("shell(", "file_read(", "file_write(", - "```bash", "```shell", "i would run", - "i will execute", "let me run")): + if any( + kw in text_hint + for kw in ( + "shell(", + "file_read(", + "file_write(", + "```bash", + "```shell", + "i would run", + "i will execute", + "let me run", + ) + ): logger.warning( "Executor produced text resembling a tool call but no actual " "tool_calls were generated — likely a stalled iteration", - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step, "tool_call_count": tool_call_count}, + extra={ + "session_id": state.get("context_id", ""), + "node": "executor", + "current_step": current_step, + "tool_call_count": tool_call_count, + }, ) # -- Loop detection: stop if the executor repeats the same tool call ---- @@ -885,16 +967,19 @@ async def executor_node( if repeat_count >= 2: logger.warning( "Loop detected: %s(%s) called %d times in last 3 — forcing step completion", - tc["name"], str(tc["args"])[:80], repeat_count + 1, - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step}, + tc["name"], + str(tc["args"])[:80], + repeat_count + 1, + extra={"session_id": state.get("context_id", ""), "node": "executor", "current_step": current_step}, ) return { - "messages": [AIMessage( - content=f"Step {current_step + 1} stuck in loop: " - f"{tc['name']}() called {repeat_count + 1} times with same args. " - f"Moving to reflection." - )], + "messages": [ + AIMessage( + content=f"Step {current_step + 1} stuck in loop: " + f"{tc['name']}() called {repeat_count + 1} times with same args. " + f"Moving to reflection." + ) + ], "current_step": current_step, "_tool_call_count": 0, "_budget_summary": budget.summary(), @@ -904,10 +989,7 @@ async def executor_node( # from text parsing (not structured tool_calls). 
parsed_tools: list[dict[str, Any]] = [] if not had_structured_tools and response.tool_calls: - parsed_tools = [ - {"name": tc["name"], "args": tc.get("args", {})} - for tc in response.tool_calls - ] + parsed_tools = [{"name": tc["name"], "args": tc.get("args", {})} for tc in response.tool_calls] # If no tool calls after patching, the executor is either: # (a) Legitimately done with the step (summarizing results) — NORMAL @@ -921,22 +1003,38 @@ async def executor_node( # it's done summarizing. This is normal completion, not a stall. logger.info( "Executor produced text response after %d tool calls for step %d — step complete", - tool_call_count, current_step, - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step, "tool_call_count": tool_call_count}, + tool_call_count, + current_step, + extra={ + "session_id": state.get("context_id", ""), + "node": "executor", + "current_step": current_step, + "tool_call_count": tool_call_count, + }, ) else: no_tool_count += 1 logger.warning( "Executor produced no tool calls for step %d (attempt %d/2)", - current_step, no_tool_count, - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step, "tool_call_count": 0}, + current_step, + no_tool_count, + extra={ + "session_id": state.get("context_id", ""), + "node": "executor", + "current_step": current_step, + "tool_call_count": 0, + }, ) if no_tool_count >= 2: - logger.warning("Executor failed to call tools after 2 attempts — marking step failed", - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step, "tool_call_count": 0}) + logger.warning( + "Executor failed to call tools after 2 attempts — marking step failed", + extra={ + "session_id": state.get("context_id", ""), + "node": "executor", + "current_step": current_step, + "tool_call_count": 0, + }, + ) # Keep the actual LLM response (with text reasoning) for the UI. 
# Append failure note but preserve the model's output for micro_reasoning. actual_content = str(response.content or "") @@ -1034,9 +1132,16 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: for i in range(current_step + 1, len(fd_ps)): if fd_ps[i].get("status") == "pending": fd_ps[i] = {**fd_ps[i], "status": "skipped"} - logger.warning("%s — forcing done", reason, - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "current_step": current_step, "replan_count": replan_count}) + logger.warning( + "%s — forcing done", + reason, + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "current_step": current_step, + "replan_count": replan_count, + }, + ) result: dict[str, Any] = { "step_results": step_results, "plan_steps": fd_ps, @@ -1069,8 +1174,7 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: content = getattr(last_msg, "content", "") if isinstance(content, list): last_content = " ".join( - b.get("text", "") for b in content - if isinstance(b, dict) and b.get("type") == "text" + b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text" ) else: last_content = str(content) @@ -1086,16 +1190,21 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: for msg in reversed(messages): if isinstance(msg, ToolMessage): last_content = str(getattr(msg, "content", "")) - logger.info("Reflector: substituted dedup sentinel with last tool result (%d chars)", - len(last_content), - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "current_step": current_step}) + logger.info( + "Reflector: substituted dedup sentinel with last tool result (%d chars)", + len(last_content), + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "current_step": current_step, + }, + ) break step_results.append(last_content[:500]) step_text = plan[current_step] if current_step < len(plan) else 
"N/A" - plan_text = "\n".join(f"{i+1}. {s}" for i, s in enumerate(plan)) + plan_text = "\n".join(f"{i + 1}. {s}" for i, s in enumerate(plan)) results_text = last_content[:1000] # Hint: if the step result contains error signals, prepend a note @@ -1109,15 +1218,13 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: # Build replan history context — show the LLM what prior replans tried replan_history_text = "" if replan_count > 0: - replan_history_lines = [ - f"REPLAN HISTORY ({replan_count} prior replan(s)):" - ] + replan_history_lines = [f"REPLAN HISTORY ({replan_count} prior replan(s)):"] # Collect failed step summaries from plan_steps for hist_ps in state.get("plan_steps", []): if hist_ps.get("status") == "failed": summary = hist_ps.get("result_summary", "no details") replan_history_lines.append( - f" - Step {hist_ps.get('index', '?')+1} FAILED: {hist_ps.get('description', '?')[:80]}" + f" - Step {hist_ps.get('index', '?') + 1} FAILED: {hist_ps.get('description', '?')[:80]}" f" — {summary[:150]}" ) replan_history_lines.append( @@ -1129,7 +1236,7 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: # Ask LLM to reflect recent_str = ", ".join(recent_decisions[-5:]) if recent_decisions else "none" # Build remaining steps text so reflector knows what's left - remaining = [f"{i+1}. {plan[i]}" for i in range(current_step + 1, len(plan))] + remaining = [f"{i + 1}. 
{plan[i]}" for i in range(current_step + 1, len(plan))] remaining_text = ", ".join(remaining[:5]) if remaining else "NONE — all steps complete" # Build step execution summary for reflector context @@ -1178,28 +1285,37 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: reflect_messages = build_reflector_context(state, system_content) try: response, capture = await invoke_llm( - llm, reflect_messages, - node="reflector", session_id=state.get("context_id", ""), + llm, + reflect_messages, + node="reflector", + session_id=state.get("context_id", ""), workspace_path=state.get("workspace_path", "/workspace"), ) except Exception as exc: if _is_budget_exceeded_error(exc): - logger.warning("Budget exceeded in reflector (402 from proxy): %s", exc, - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "current_step": current_step, "replan_count": replan_count}) + logger.warning( + "Budget exceeded in reflector (402 from proxy): %s", + exc, + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "current_step": current_step, + "replan_count": replan_count, + }, + ) return _force_done(f"Budget exceeded: {exc}") raise prompt_tokens = capture.prompt_tokens completion_tokens = capture.completion_tokens - model_name = capture.model + _model_name = capture.model budget.add_tokens(prompt_tokens + completion_tokens) # Check for respond_to_user escape tool (needed for Llama 4 Scout). 
escaped = _intercept_respond_to_user(response, "Reflector") if escaped is not None: response = escaped - elif getattr(response, 'tool_calls', None): + elif getattr(response, "tool_calls", None): # Non-escape tools — pass through for graph tool execution return { "messages": [response], @@ -1218,9 +1334,13 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: logger.warning( "Reflector said 'done' but %d plan steps remain — overriding to 'continue'", steps_remaining, - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "decision": "done->continue", "current_step": current_step, - "replan_count": replan_count}, + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "decision": "done->continue", + "current_step": current_step, + "replan_count": replan_count, + }, ) decision = "continue" @@ -1245,12 +1365,21 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: logger.info( "Reflector decision: %s (step %d/%d, iter %d, replans=%d, tools=%d, recent=%s)", - decision, current_step + 1, len(plan), iteration, - replan_count, tool_calls_this_iter, + decision, + current_step + 1, + len(plan), + iteration, + replan_count, + tool_calls_this_iter, recent_decisions[-3:], - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "decision": decision, "current_step": current_step, - "replan_count": replan_count, "iteration": iteration}, + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "decision": decision, + "current_step": current_step, + "replan_count": replan_count, + "iteration": iteration, + }, ) base_result: dict[str, Any] = { @@ -1274,8 +1403,11 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: elif decision == "retry": store = ps.set_step_status(store, step_key, "running") except ValueError: - logger.warning("PlanStore: step %s not found (replan?), skipping status update", - step_key, 
extra={"session_id": state.get("context_id", ""), "node": "reflector"}) + logger.warning( + "PlanStore: step %s not found (replan?), skipping status update", + step_key, + extra={"session_id": state.get("context_id", ""), "node": "reflector"}, + ) base_result["_plan_store"] = store if decision == "done": @@ -1303,10 +1435,17 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: "status": "retrying", "retry_count": retry_count, } - logger.info("Retry step %d (attempt %d) — re-executing with different approach", - current_step + 1, plan_steps[current_step].get("retry_count", 1), - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "decision": "retry", "current_step": current_step}) + logger.info( + "Retry step %d (attempt %d) — re-executing with different approach", + current_step + 1, + plan_steps[current_step].get("retry_count", 1), + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "decision": "retry", + "current_step": current_step, + }, + ) return { **base_result, "plan_steps": plan_steps, @@ -1319,10 +1458,17 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: # Mark current step failed if current_step < len(plan_steps): plan_steps[current_step] = {**plan_steps[current_step], "status": "failed"} - logger.info("Replan %d — routing back to planner", new_replan_count, - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "decision": "replan", "current_step": current_step, - "replan_count": new_replan_count}) + logger.info( + "Replan %d — routing back to planner", + new_replan_count, + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "decision": "replan", + "current_step": current_step, + "replan_count": new_replan_count, + }, + ) return { **base_result, "plan_steps": plan_steps, @@ -1345,8 +1491,12 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: logger.info( "All %d planned 
steps completed — routing to reporter", len(plan), - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "decision": "done", "current_step": current_step}, + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "decision": "done", + "current_step": current_step, + }, ) return { **base_result, @@ -1416,10 +1566,8 @@ async def reporter_node( if not step_results and not state.get("messages"): return {"final_answer": "No response generated.", "plan_status": terminal_status} - plan_text = "\n".join(f"{i+1}. {s}" for i, s in enumerate(plan)) - results_text = "\n".join( - f"Step {i+1}: {r}" for i, r in enumerate(step_results) - ) + plan_text = "\n".join(f"{i + 1}. {s}" for i, s in enumerate(plan)) + results_text = "\n".join(f"Step {i + 1}: {r}" for i, r in enumerate(step_results)) # Build step status summary from plan_steps step_status_lines = [] @@ -1431,7 +1579,7 @@ async def reporter_node( has_partial = True desc = rpt_ps.get("description", "")[:80] result = rpt_ps.get("result_summary", "")[:100] - line = f"{idx+1}. [{status}] {desc}" + line = f"{idx + 1}. [{status}] {desc}" if result and status in ("FAILED", "PARTIAL"): line += f" — {result}" step_status_lines.append(line) @@ -1455,10 +1603,7 @@ async def reporter_node( ) # Filter dedup sentinel messages from conversation history passed to the # reporter LLM so it cannot echo them in the final answer. 
- filtered_msgs = [ - m for m in state["messages"] - if _DEDUP_SENTINEL not in str(getattr(m, "content", "")) - ] + filtered_msgs = [m for m in state["messages"] if _DEDUP_SENTINEL not in str(getattr(m, "content", ""))] reporter_messages = [SystemMessage(content=system_content)] + filtered_msgs # Use invoke_with_tool_loop when llm_reason is available (thinking mode), @@ -1469,8 +1614,11 @@ async def reporter_node( try: response, capture, sub_events = await invoke_with_tool_loop( - llm, llm_reason, reporter_messages, - node="reporter", session_id=state.get("context_id", ""), + llm, + llm_reason, + reporter_messages, + node="reporter", + session_id=state.get("context_id", ""), workspace_path=state.get("workspace_path", "/workspace"), thinking_budget=2, max_parallel_tool_calls=3, @@ -1479,8 +1627,11 @@ async def reporter_node( ) except Exception as exc: if _is_budget_exceeded_error(exc): - logger.warning("Budget exceeded in reporter (402 from proxy): %s", exc, - extra={"session_id": state.get("context_id", ""), "node": "reporter"}) + logger.warning( + "Budget exceeded in reporter (402 from proxy): %s", + exc, + extra={"session_id": state.get("context_id", ""), "node": "reporter"}, + ) return { "messages": [AIMessage(content="Task completed (budget exhausted before final summary).")], "final_answer": "Task completed (budget exhausted before final summary).", @@ -1494,14 +1645,19 @@ async def reporter_node( try: response, capture = await invoke_llm( - llm, reporter_messages, - node="reporter", session_id=state.get("context_id", ""), + llm, + reporter_messages, + node="reporter", + session_id=state.get("context_id", ""), workspace_path=state.get("workspace_path", "/workspace"), ) except Exception as exc: if _is_budget_exceeded_error(exc): - logger.warning("Budget exceeded in reporter (402 from proxy): %s", exc, - extra={"session_id": state.get("context_id", ""), "node": "reporter"}) + logger.warning( + "Budget exceeded in reporter (402 from proxy): %s", + exc, + 
extra={"session_id": state.get("context_id", ""), "node": "reporter"}, + ) return { "messages": [AIMessage(content="Task completed (budget exhausted before final summary).")], "final_answer": "Task completed (budget exhausted before final summary).", @@ -1513,14 +1669,14 @@ async def reporter_node( prompt_tokens = capture.prompt_tokens completion_tokens = capture.completion_tokens - model_name = capture.model + _model_name = capture.model budget.add_tokens(prompt_tokens + completion_tokens) # Handle respond_to_user escape tool (Llama 4 Scout always calls tools) escaped = _intercept_respond_to_user(response, "Reporter") if escaped is not None: response = escaped - elif getattr(response, 'tool_calls', None): + elif getattr(response, "tool_calls", None): # Response has real tool calls — return to graph for tool execution return { "messages": [response], @@ -1531,10 +1687,7 @@ async def reporter_node( content = response.content if isinstance(content, list): - text = " ".join( - b.get("text", "") for b in content - if isinstance(b, dict) and b.get("type") == "text" - ) + text = " ".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text") else: text = str(content) @@ -1552,16 +1705,19 @@ async def reporter_node( cmd = args.get("command", "") # Extract file paths from common shell patterns import re as _re - for match in _re.findall(r'(?:>|>>|tee)\s+(\S+)', cmd): + + for match in _re.findall(r"(?:>|>>|tee)\s+(\S+)", cmd): if match not in files_touched: files_touched.append(match) - logger.info("Reporter: plan_status=%s (done=%d, failed=%d, total=%d)", - terminal_status, - sum(1 for s in plan_steps if s.get("status") == "done"), - sum(1 for s in plan_steps if s.get("status") == "failed"), - len(plan_steps), - extra={"session_id": state.get("context_id", ""), "node": "reporter"}) + logger.info( + "Reporter: plan_status=%s (done=%d, failed=%d, total=%d)", + terminal_status, + sum(1 for s in plan_steps if s.get("status") == "done"), + sum(1 
for s in plan_steps if s.get("status") == "failed"), + len(plan_steps), + extra={"session_id": state.get("context_id", ""), "node": "reporter"}, + ) result: dict[str, Any] = { "messages": [response], @@ -1615,10 +1771,7 @@ def _parse_plan(content: str | list) -> list[str]: Returns a list of step descriptions. """ if isinstance(content, list): - text = " ".join( - b.get("text", "") for b in content - if isinstance(b, dict) and b.get("type") == "text" - ) + text = " ".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text") else: text = str(content) @@ -1630,7 +1783,7 @@ def _parse_plan(content: str | list) -> list[str]: # Strip the number prefix: "1. Do X" -> "Do X" for i, ch in enumerate(line): if ch in ".)" and i < 4: - step = line[i + 1:].strip() + step = line[i + 1 :].strip() if step: steps.append(step) break @@ -1649,10 +1802,7 @@ def _parse_decision(content: str | list) -> str: Defaults to ``continue`` if the output is ambiguous. """ if isinstance(content, list): - text = " ".join( - b.get("text", "") for b in content - if isinstance(b, dict) and b.get("type") == "text" - ) + text = " ".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text") else: text = str(content) @@ -1665,4 +1815,4 @@ def _parse_decision(content: str | list) -> str: return "continue" -_BARE_DECISION_RE = re.compile(r'^(continue|retry|replan|done|hitl)\s*$', re.IGNORECASE) +_BARE_DECISION_RE = re.compile(r"^(continue|retry|replan|done|hitl)\s*$", re.IGNORECASE) diff --git a/a2a/sandbox_agent/src/sandbox_agent/sandbox_subprocess.py b/a2a/sandbox_agent/src/sandbox_agent/sandbox_subprocess.py index cea9063e..cb26b69b 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/sandbox_subprocess.py +++ b/a2a/sandbox_agent/src/sandbox_agent/sandbox_subprocess.py @@ -129,7 +129,9 @@ async def sandboxed_subprocess( try: process = await asyncio.create_subprocess_exec( - sys.executable, "-c", child_script, + sys.executable, + "-c", + 
child_script, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, env=child_env, diff --git a/a2a/sandbox_agent/src/sandbox_agent/sources.py b/a2a/sandbox_agent/src/sandbox_agent/sources.py index bd2bf68f..016fb887 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/sources.py +++ b/a2a/sandbox_agent/src/sandbox_agent/sources.py @@ -15,7 +15,6 @@ from pathlib import Path from typing import Any - _DEFAULT_MAX_EXECUTION_TIME_SECONDS = 300 _DEFAULT_MAX_MEMORY_MB = 2048 @@ -116,11 +115,7 @@ def is_domain_allowed(self, domain: str) -> bool: def max_execution_time_seconds(self) -> int: """Maximum execution time for a single run, in seconds.""" runtime: dict[str, Any] = self._data.get("runtime", {}) - return int( - runtime.get( - "max_execution_time_seconds", _DEFAULT_MAX_EXECUTION_TIME_SECONDS - ) - ) + return int(runtime.get("max_execution_time_seconds", _DEFAULT_MAX_EXECUTION_TIME_SECONDS)) @property def max_memory_mb(self) -> int: diff --git a/a2a/sandbox_agent/src/sandbox_agent/subagents.py b/a2a/sandbox_agent/src/sandbox_agent/subagents.py index c1b7fcb3..d2ba5f70 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/subagents.py +++ b/a2a/sandbox_agent/src/sandbox_agent/subagents.py @@ -23,7 +23,7 @@ import subprocess import uuid from pathlib import Path -from typing import Any, Optional +from typing import Any import asyncpg from langchain_core.messages import HumanMessage, SystemMessage @@ -37,9 +37,7 @@ _MAX_SUB_AGENT_ITERATIONS = 15 # Delegation mode configuration -_DELEGATION_MODES = os.environ.get( - "DELEGATION_MODES", "in-process,shared-pvc,isolated,sidecar" -).split(",") +_DELEGATION_MODES = os.environ.get("DELEGATION_MODES", "in-process,shared-pvc,isolated,sidecar").split(",") _DEFAULT_MODE = os.environ.get("DEFAULT_DELEGATION_MODE", "in-process") # Maximum iterations for in-process sub-agents to prevent runaway loops. 
@@ -72,11 +70,23 @@ async def grep(pattern: str, path: str = ".") -> str: try: result = subprocess.run( - ["grep", "-rn", "--include=*.py", "--include=*.md", - "--include=*.yaml", "--include=*.yml", "--include=*.json", - "--include=*.txt", "--include=*.sh", "--include=*.go", - pattern, str(target)], - capture_output=True, text=True, timeout=30, + [ + "grep", + "-rn", + "--include=*.py", + "--include=*.md", + "--include=*.yaml", + "--include=*.yml", + "--include=*.json", + "--include=*.txt", + "--include=*.sh", + "--include=*.go", + pattern, + str(target), + ], + capture_output=True, + text=True, + timeout=30, cwd=str(ws_root), ) output = result.stdout[:10000] @@ -131,7 +141,7 @@ async def list_files(path: str = ".", pattern: str = "*") -> str: matches = sorted(str(p.relative_to(ws_root)) for p in target.rglob(pattern) if p.is_file()) if len(matches) > 200: matches = matches[:200] - matches.append(f"... and more (truncated at 200)") + matches.append("... and more (truncated at 200)") return "\n".join(matches) if matches else "No files found." return [grep, read_file, list_files] @@ -148,6 +158,7 @@ def create_explore_graph(workspace: str, llm: Any) -> Any: async def assistant(state: MessagesState) -> dict[str, Any]: from sandbox_agent.reasoning import maybe_patch_tool_calls + system = SystemMessage( content=( "You are a codebase research assistant. 
Your job is to find " @@ -229,15 +240,15 @@ async def _register_child_session( try: conn = await asyncpg.connect(pg_url) # Check if context already exists - existing = await conn.fetchval( - "SELECT COUNT(*) FROM tasks WHERE context_id = $1", child_context_id - ) + existing = await conn.fetchval("SELECT COUNT(*) FROM tasks WHERE context_id = $1", child_context_id) if existing == 0: - metadata = json.dumps({ - "agent_name": agent_name, - "parent_context_id": parent_context_id, - "title": task[:80], - }) + metadata = json.dumps( + { + "agent_name": agent_name, + "parent_context_id": parent_context_id, + "title": task[:80], + } + ) status = json.dumps({"state": "working"}) await conn.execute( "INSERT INTO tasks (id, context_id, status, metadata, history, artifacts) " @@ -307,6 +318,7 @@ async def _run_in_process( async def assistant(state: MessagesState) -> dict[str, Any]: from sandbox_agent.reasoning import maybe_patch_tool_calls + system = SystemMessage( content=( "You are a sub-agent working on a delegated task. 
Complete the task " @@ -350,8 +362,11 @@ async def assistant(state: MessagesState) -> dict[str, Any]: async def _run_shared_pvc( - task: str, child_context_id: str, namespace: str = "team1", - variant: str = "sandbox-legion", timeout_minutes: int = 30, + task: str, + child_context_id: str, + namespace: str = "team1", + variant: str = "sandbox-legion", + timeout_minutes: int = 30, ) -> str: """Spawn a pod that mounts the parent's PVC (placeholder).""" logger.info("shared-pvc delegation: child=%s task=%s", child_context_id, task) @@ -363,8 +378,11 @@ async def _run_shared_pvc( async def _run_isolated( - task: str, child_context_id: str, namespace: str = "team1", - variant: str = "sandbox-legion", timeout_minutes: int = 30, + task: str, + child_context_id: str, + namespace: str = "team1", + variant: str = "sandbox-legion", + timeout_minutes: int = 30, ) -> str: """Spawn an isolated pod via SandboxClaim CRD (placeholder).""" logger.info("isolated delegation: child=%s task=%s", child_context_id, task) @@ -376,14 +394,13 @@ async def _run_isolated( async def _run_sidecar( - task: str, child_context_id: str, variant: str = "sandbox-legion", + task: str, + child_context_id: str, + variant: str = "sandbox-legion", ) -> str: """Inject a sidecar container (placeholder).""" logger.info("sidecar delegation: child=%s task=%s", child_context_id, task) - return ( - f"Sidecar delegation requested for '{task}' " - f"(child={child_context_id}). Not yet implemented." - ) + return f"Sidecar delegation requested for '{task}' (child={child_context_id}). Not yet implemented." def make_delegate_tool( From 4f804c6bba2e2523922b744993a3926efa54e39e Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 22:40:53 +0100 Subject: [PATCH 23/26] fix: move try/except import after clean import block (I001) Ruff I001 requires contiguous import blocks. The try/except for DatabaseTaskStore was breaking the a2a imports block. Moved it after all clean imports. 
Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/agent.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/a2a/sandbox_agent/src/sandbox_agent/agent.py b/a2a/sandbox_agent/src/sandbox_agent/agent.py index d75b29f6..be8346a9 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/agent.py +++ b/a2a/sandbox_agent/src/sandbox_agent/agent.py @@ -21,13 +21,6 @@ from a2a.server.events.event_queue import EventQueue from a2a.server.request_handlers import DefaultRequestHandler from a2a.server.tasks import InMemoryTaskStore, TaskUpdater - -try: - from a2a.server.tasks import DatabaseTaskStore - - _HAS_SQL_STORE = True -except ImportError: - _HAS_SQL_STORE = False from a2a.types import ( AgentCapabilities, AgentCard, @@ -41,6 +34,13 @@ from langgraph.checkpoint.memory import MemorySaver from starlette.routing import Route +try: + from a2a.server.tasks import DatabaseTaskStore + + _HAS_SQL_STORE = True +except ImportError: + _HAS_SQL_STORE = False + from sandbox_agent.budget import AgentBudget from sandbox_agent.configuration import Configuration from sandbox_agent.event_serializer import LangGraphSerializer From 1212d242839406ac3980896db508bf2e9d547e13 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 22:44:14 +0100 Subject: [PATCH 24/26] fix: remove from __future__ import annotations (not needed for Python 3.11+, fixes I001) Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/agent.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/a2a/sandbox_agent/src/sandbox_agent/agent.py b/a2a/sandbox_agent/src/sandbox_agent/agent.py index be8346a9..6eb960b6 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/agent.py +++ b/a2a/sandbox_agent/src/sandbox_agent/agent.py @@ -4,8 +4,6 @@ and LangGraph graph to serve the A2A protocol over HTTP. 
""" -from __future__ import annotations - import asyncio import hashlib import json From 1cdf692fb3f55834ea83859383994fffa9a8f893 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 22:48:50 +0100 Subject: [PATCH 25/26] fix: sort imports per ruff 0.11.4 ordering (a2a as first-party) Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/agent.py | 7 ++++--- a2a/sandbox_agent/src/sandbox_agent/event_serializer.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/a2a/sandbox_agent/src/sandbox_agent/agent.py b/a2a/sandbox_agent/src/sandbox_agent/agent.py index 6eb960b6..0718d764 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/agent.py +++ b/a2a/sandbox_agent/src/sandbox_agent/agent.py @@ -14,6 +14,10 @@ from typing import Any import uvicorn +from langchain_core.messages import HumanMessage +from langgraph.checkpoint.memory import MemorySaver +from starlette.routing import Route + from a2a.server.agent_execution import AgentExecutor, RequestContext from a2a.server.apps import A2AStarletteApplication from a2a.server.events.event_queue import EventQueue @@ -28,9 +32,6 @@ TextPart, ) from a2a.utils import new_agent_text_message, new_task -from langchain_core.messages import HumanMessage -from langgraph.checkpoint.memory import MemorySaver -from starlette.routing import Route try: from a2a.server.tasks import DatabaseTaskStore diff --git a/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py b/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py index 8e039ef7..c67f7c7c 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py +++ b/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py @@ -113,7 +113,6 @@ def __init__(self, loop_id: str | None = None, context_id: str | None = None) -> self._prev_node: str | None = None # previous node for node_transition events def serialize(self, key: str, value: dict) -> str: - # Emit node_transition meta-event when the node changes transition_line: str | None = None 
if self._prev_node is not None and key != self._prev_node: From bed64f562ac88a45b7ee53bcda0bfdabc2576752 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Tue, 17 Mar 2026 06:52:28 +0100 Subject: [PATCH 26/26] fix(security): path traversal, shlex parsing, dead code cleanup - Validate context_id against traversal (workspace.py) - Use is_relative_to instead of startswith (subagents.py) - Use shlex.split for interpreter/sources checks (permissions.py, executor.py) - Remove duplicate _MAX_SUB_AGENT_ITERATIONS (subagents.py) - Remove dead _BARE_DECISION_RE (reasoning.py) Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/executor.py | 5 ++++- a2a/sandbox_agent/src/sandbox_agent/permissions.py | 6 +++++- a2a/sandbox_agent/src/sandbox_agent/reasoning.py | 3 --- a2a/sandbox_agent/src/sandbox_agent/subagents.py | 5 +---- a2a/sandbox_agent/src/sandbox_agent/workspace.py | 12 +++++++++--- 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/a2a/sandbox_agent/src/sandbox_agent/executor.py b/a2a/sandbox_agent/src/sandbox_agent/executor.py index 6dc5f7eb..672e85c6 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/executor.py +++ b/a2a/sandbox_agent/src/sandbox_agent/executor.py @@ -237,7 +237,10 @@ def _check_sources(self, operation: str) -> str | None: """ import re - parts = operation.split() + try: + parts = shlex.split(operation) + except ValueError: + parts = operation.split() if not parts: return None diff --git a/a2a/sandbox_agent/src/sandbox_agent/permissions.py b/a2a/sandbox_agent/src/sandbox_agent/permissions.py index 7810c5ac..b634dbe4 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/permissions.py +++ b/a2a/sandbox_agent/src/sandbox_agent/permissions.py @@ -23,6 +23,7 @@ import enum import fnmatch import re +import shlex from typing import Any # --------------------------------------------------------------------------- @@ -280,7 +281,10 @@ def check_interpreter_bypass(cls, operation: str) -> list[str]: if not operation: return [] - 
parts = operation.split() + try: + parts = shlex.split(operation) + except ValueError: + parts = operation.split() if not parts: return [] diff --git a/a2a/sandbox_agent/src/sandbox_agent/reasoning.py b/a2a/sandbox_agent/src/sandbox_agent/reasoning.py index b75d6903..ec5a6c71 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/reasoning.py +++ b/a2a/sandbox_agent/src/sandbox_agent/reasoning.py @@ -1813,6 +1813,3 @@ def _parse_decision(content: str | list) -> str: return decision return "continue" - - -_BARE_DECISION_RE = re.compile(r"^(continue|retry|replan|done|hitl)\s*$", re.IGNORECASE) diff --git a/a2a/sandbox_agent/src/sandbox_agent/subagents.py b/a2a/sandbox_agent/src/sandbox_agent/subagents.py index d2ba5f70..02f2cbfa 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/subagents.py +++ b/a2a/sandbox_agent/src/sandbox_agent/subagents.py @@ -40,9 +40,6 @@ _DELEGATION_MODES = os.environ.get("DELEGATION_MODES", "in-process,shared-pvc,isolated,sidecar").split(",") _DEFAULT_MODE = os.environ.get("DEFAULT_DELEGATION_MODE", "in-process") -# Maximum iterations for in-process sub-agents to prevent runaway loops. -_MAX_SUB_AGENT_ITERATIONS = 15 - # --------------------------------------------------------------------------- # In-process sub-agent: explore (C20, mode 1) @@ -109,7 +106,7 @@ async def read_file(path: str) -> str: File contents (truncated to 20000 chars). """ resolved = (ws_root / path).resolve() - if not str(resolved).startswith(str(ws_root)): + if not resolved.is_relative_to(ws_root): return "Error: path resolves outside the workspace." if not resolved.is_file(): return f"Error: file not found at '{path}'." 
diff --git a/a2a/sandbox_agent/src/sandbox_agent/workspace.py b/a2a/sandbox_agent/src/sandbox_agent/workspace.py index e047d7d7..5858eb62 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/workspace.py +++ b/a2a/sandbox_agent/src/sandbox_agent/workspace.py @@ -44,8 +44,15 @@ def __init__( # Public API # ------------------------------------------------------------------ + @staticmethod + def _validate_context_id(context_id: str) -> None: + """Reject context IDs that could escape the workspace root.""" + if not context_id or "/" in context_id or ".." in context_id or "\x00" in context_id: + raise ValueError(f"Invalid context_id: {context_id!r}") + def get_workspace_path(self, context_id: str) -> str: """Return the workspace path for *context_id* without creating it.""" + self._validate_context_id(context_id) return os.path.join(self.workspace_root, context_id) def ensure_workspace(self, context_id: str) -> str: @@ -60,10 +67,9 @@ def ensure_workspace(self, context_id: str) -> str: Raises ------ ValueError - If *context_id* is empty. + If *context_id* is empty or contains path-traversal characters. """ - if not context_id: - raise ValueError("context_id must not be empty") + self._validate_context_id(context_id) workspace_path = self.get_workspace_path(context_id) context_file = Path(workspace_path) / ".context.json"