From f38031ad7a61e52373a8ce643176a1b702ee7e35 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:46:35 +0100 Subject: [PATCH 01/26] feat(sandbox): add sandbox_agent package init Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/__init__.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/__init__.py b/a2a/sandbox_agent/src/sandbox_agent/__init__.py new file mode 100644 index 00000000..e69de29b From b4c2d653272c982b4ea66983ec09a3d3dc936ffa Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:47:29 +0100 Subject: [PATCH 02/26] feat(sandbox): A2A server with event streaming, session management, and graph card endpoint Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/agent.py | 1046 ++++++++++++++++++ 1 file changed, 1046 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/agent.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/agent.py b/a2a/sandbox_agent/src/sandbox_agent/agent.py new file mode 100644 index 00000000..70e67ba7 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/agent.py @@ -0,0 +1,1046 @@ +"""A2A agent server for the Sandbox Legion. + +Wires together the workspace manager, permission checker, sources config, +and LangGraph graph to serve the A2A protocol over HTTP. 
+""" + +from __future__ import annotations + +import asyncio +import hashlib +import json +import logging +import os +from pathlib import Path +from textwrap import dedent +from typing import Any + +import uvicorn +from a2a.server.agent_execution import AgentExecutor, RequestContext +from a2a.server.apps import A2AStarletteApplication +from a2a.server.events.event_queue import EventQueue +from a2a.server.request_handlers import DefaultRequestHandler +from a2a.server.tasks import InMemoryTaskStore, TaskUpdater + +try: + from a2a.server.tasks import DatabaseTaskStore + + _HAS_SQL_STORE = True +except ImportError: + _HAS_SQL_STORE = False +from a2a.types import ( + AgentCapabilities, + AgentCard, + AgentExtension, + AgentSkill, + TaskState, + TextPart, +) +from a2a.utils import new_agent_text_message, new_task +from langchain_core.messages import HumanMessage +from starlette.routing import Route + +from langgraph.checkpoint.memory import MemorySaver + +from sandbox_agent.budget import AgentBudget +from sandbox_agent.configuration import Configuration +from sandbox_agent.event_serializer import LangGraphSerializer +from sandbox_agent.graph import _load_skill, build_graph +from sandbox_agent.graph_card import build_graph_card +from sandbox_agent.observability import setup_observability +from sandbox_agent.permissions import PermissionChecker +from sandbox_agent.sources import SourcesConfig +from sandbox_agent.workspace import WorkspaceManager + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +# Package root is two levels up from __file__ +# (__file__ = src/sandbox_agent/agent.py -> package root = .) +_PACKAGE_ROOT = Path(__file__).resolve().parent.parent.parent + + +def _load_json(filename: str) -> dict: + """Load a JSON file from the package root directory. 
+ + Parameters + ---------- + filename: + Name of the JSON file (e.g. ``settings.json`` or ``sources.json``). + + Returns + ------- + dict + Parsed JSON content. + """ + path = _PACKAGE_ROOT / filename + with open(path, encoding="utf-8") as fh: + return json.load(fh) + + +# --------------------------------------------------------------------------- +# TOFU (Trust-On-First-Use) verification +# --------------------------------------------------------------------------- + +_TOFU_HASH_FILE = ".tofu-hashes.json" + +# Files in the workspace root to track for TOFU verification. +_TOFU_TRACKED_FILES = ("CLAUDE.md", "sources.json", "settings.json") + + +def _hash_file(path: Path) -> str | None: + """Return the SHA-256 hex digest of a file, or None if it doesn't exist.""" + if not path.is_file(): + return None + h = hashlib.sha256() + h.update(path.read_bytes()) + return h.hexdigest() + + +def _compute_tofu_hashes(root: Path) -> dict[str, str]: + """Compute SHA-256 hashes for tracked files under *root*. + + Returns a dict mapping filename -> hex digest (only for files that exist). + """ + hashes: dict[str, str] = {} + for name in _TOFU_TRACKED_FILES: + digest = _hash_file(root / name) + if digest is not None: + hashes[name] = digest + return hashes + + +def _tofu_verify(root: Path) -> None: + """Run TOFU verification on startup. + + On first run, computes and stores hashes of tracked files. On subsequent + runs, compares current hashes against the stored ones and logs a WARNING + if any file has changed (possible tampering). Does NOT block startup. 
+ """ + # Write to /tmp to avoid PermissionError when OCP assigns arbitrary UID + # (the /app directory is owned by UID 1001 but OCP may run as a different UID) + hash_file = Path("/tmp") / _TOFU_HASH_FILE + current_hashes = _compute_tofu_hashes(root) + + if not current_hashes: + logger.info("TOFU: no tracked files found in %s; skipping.", root) + return + + if hash_file.is_file(): + try: + with open(hash_file, encoding="utf-8") as fh: + stored_hashes = json.load(fh) + except (json.JSONDecodeError, OSError) as exc: + logger.warning("TOFU: could not read %s: %s", hash_file, exc) + stored_hashes = {} + + # Compare each tracked file. + changed: list[str] = [] + added: list[str] = [] + removed: list[str] = [] + for name, digest in current_hashes.items(): + stored = stored_hashes.get(name) + if stored is None: + added.append(name) + elif stored != digest: + changed.append(name) + for name in stored_hashes: + if name not in current_hashes: + removed.append(name) + + if changed or added or removed: + logger.warning( + "TOFU: workspace file integrity mismatch! " + "changed=%s, added=%s, removed=%s. " + "This may indicate tampering. Updating stored hashes.", + changed, added, removed, + ) + # Update stored hashes (trust the new state). + with open(hash_file, "w", encoding="utf-8") as fh: + json.dump(current_hashes, fh, indent=2) + else: + logger.info("TOFU: all tracked files match stored hashes.") + else: + # First run: store hashes. + logger.info("TOFU: first run -- storing hashes for %s", list(current_hashes.keys())) + with open(hash_file, "w", encoding="utf-8") as fh: + json.dump(current_hashes, fh, indent=2) + + +# --------------------------------------------------------------------------- +# Agent Card +# --------------------------------------------------------------------------- + + +def get_agent_card(host: str, port: int) -> AgentCard: + """Return an A2A AgentCard for the Sandbox Legion. 
+ + Parameters + ---------- + host: + Hostname or IP address the agent is listening on. + port: + Port number the agent is listening on. + """ + capabilities = AgentCapabilities( + streaming=True, + extensions=[ + AgentExtension( + uri="urn:kagenti:agent-graph-card:v1", + description="Processing graph topology and event schemas", + required=False, + params={"endpoint": "/.well-known/agent-graph-card.json"}, + ), + ], + ) + # Scan workspace for loaded skill files (.claude/skills/**/*.md) + # Skills found on disk are advertised in the agent card so the UI + # can show them in the / autocomplete (SkillWhisperer). + skills: list[AgentSkill] = [] + workspace = os.environ.get("WORKSPACE_DIR", "/workspace") + skills_dir = Path(workspace) / ".claude" / "skills" + if skills_dir.is_dir(): + seen_ids: set[str] = set() + for md_file in sorted(skills_dir.rglob("SKILL.md")): + # Directory-based skills: auth:keycloak-confidential-client/SKILL.md + # Skill ID = directory name relative to skills_dir + rel_dir = md_file.parent.relative_to(skills_dir) + skill_id = str(rel_dir).replace("/", ":") + if skill_id in seen_ids or skill_id == ".": + continue + seen_ids.add(skill_id) + # Read description from the skill file (skip frontmatter) + try: + content = md_file.read_text(errors="replace") + desc = "" + for line in content.split("\n"): + line = line.strip() + if line.startswith("description:"): + desc = line.split(":", 1)[1].strip().strip("'\"") + break + if line.startswith("# ") and not desc: + desc = line.lstrip("# ").strip() + if not desc: + desc = skill_id + except Exception: + desc = skill_id + skills.append( + AgentSkill( + id=skill_id, + name=skill_id, + description=desc[:200], + tags=["skill"], + ) + ) + logger.info("Found %d skills in %s", len(skills), skills_dir) + + # Always include the base sandbox skill + skills.append( + AgentSkill( + id="sandbox_legion", + name="Sandbox Legion", + description=( + "Sandboxed coding assistant with shell execution, file read/write, " + "web 
fetch, explore, and delegate capabilities." + ), + tags=["shell", "file", "workspace", "sandbox"], + examples=[ + "Run 'ls -la' in my workspace", + "Create a Python script that prints hello world", + "Read the contents of output/results.txt", + ], + ) + ) + return AgentCard( + name="Sandbox Legion", + description=dedent( + """\ + A sandboxed coding assistant that can execute shell commands, \ + read files, and write files inside isolated per-context workspaces. + + ## Key Features + - **Shell execution** with three-tier permission checks (allow/deny/HITL) + - **File read/write** with path-traversal prevention + - **Per-context workspaces** for multi-turn isolation + """, + ), + url=f"http://{host}:{port}/", + version="1.0.0", + default_input_modes=["text"], + default_output_modes=["text"], + capabilities=capabilities, + skills=skills, + ) + + +# --------------------------------------------------------------------------- +# Agent Executor +# --------------------------------------------------------------------------- + + +class SandboxAgentExecutor(AgentExecutor): + """A2A executor that delegates to the LangGraph sandbox graph.""" + + # Per-context_id locks to serialize concurrent graph executions for the + # same conversation. A simple dict + mutex approach with periodic cleanup + # of unused entries. + _context_locks: dict[str, asyncio.Lock] = {} + _context_locks_mutex: asyncio.Lock = asyncio.Lock() + + async def _get_context_lock(self, context_id: str) -> asyncio.Lock: + """Return (and lazily create) the asyncio.Lock for *context_id*. + + A class-level mutex guards the dict so that two concurrent requests + for the same new context_id don't each create their own Lock. 
+ """ + async with self._context_locks_mutex: + lock = self._context_locks.get(context_id) + if lock is None: + lock = asyncio.Lock() + self._context_locks[context_id] = lock + return lock + + def __init__(self) -> None: + settings = _load_json("settings.json") + sources = _load_json("sources.json") + + self._permission_checker = PermissionChecker(settings) + self._sources_config = SourcesConfig.from_dict(sources) + + config = Configuration() # type: ignore[call-arg] + + # Use PostgreSQL checkpointer if configured, else in-memory + self._checkpoint_db_url = config.checkpoint_db_url + self._checkpointer = None # Lazy-initialized in execute() + self._checkpointer_initialized = False + if not self._checkpoint_db_url or self._checkpoint_db_url == "memory": + self._checkpointer = MemorySaver() + self._checkpointer_initialized = True + logger.info("Using in-memory checkpointer (set CHECKPOINT_DB_URL for persistence)") + else: + logger.info("PostgreSQL checkpointer configured: %s", self._checkpoint_db_url.split("@")[-1]) + self._workspace_manager = WorkspaceManager( + workspace_root=config.workspace_root, + agent_name="sandbox-legion", + ttl_days=config.context_ttl_days, + ) + + # C19: Clean up expired workspaces on startup. + cleaned = self._workspace_manager.cleanup_expired() + if cleaned: + logger.info("Cleaned up %d expired workspaces: %s", len(cleaned), cleaned) + + # TOFU: verify workspace config file integrity on startup. + # Logs warnings on mismatch but does not block the agent from starting. + _tofu_verify(_PACKAGE_ROOT) + + async def _ensure_checkpointer(self) -> None: + """Initialize or re-initialize the PostgreSQL checkpointer. + + Creates a new connection pool if not initialized yet, or if the + existing connection is stale (e.g., after a PostgreSQL restart). 
+ """ + if not self._checkpoint_db_url: + return + + needs_init = not self._checkpointer_initialized + + # Check if existing connection is stale + if self._checkpointer_initialized and self._checkpointer: + try: + # Lightweight health check — attempt a simple query + pool = getattr(self._checkpointer, 'conn', None) or getattr(self._checkpointer, '_conn', None) + if pool and hasattr(pool, 'execute'): + await pool.execute("SELECT 1") + except Exception: + logger.warning("PostgreSQL checkpointer connection stale — re-initializing") + # Close old connection + if hasattr(self, '_checkpointer_cm') and self._checkpointer_cm: + try: + await self._checkpointer_cm.__aexit__(None, None, None) + except Exception: + pass + needs_init = True + self._checkpointer_initialized = False + + if needs_init: + from langgraph.checkpoint.postgres.aio import AsyncPostgresSaver + + cm = AsyncPostgresSaver.from_conn_string(self._checkpoint_db_url) + self._checkpointer = await cm.__aenter__() + self._checkpointer_cm = cm + await self._checkpointer.setup() + self._checkpointer_initialized = True + logger.info("PostgreSQL checkpointer initialized") + + # ------------------------------------------------------------------ + + async def execute( + self, context: RequestContext, event_queue: EventQueue + ) -> None: + """Execute a user request through the LangGraph sandbox graph. + + Steps: + 1. Get or create an A2A task. + 2. Resolve the workspace directory from context_id. + 3. Build and stream the LangGraph graph. + 4. Emit status updates and artifacts via TaskUpdater. + """ + # 1. Get or create task + task = context.current_task + if not task: + task = new_task(context.message) # type: ignore + await event_queue.enqueue_event(task) + + task_updater = TaskUpdater(event_queue, task.id, task.context_id) + + # 2. 
Resolve workspace from context_id + context_id = task.context_id + if context_id: + workspace_path = self._workspace_manager.ensure_workspace(context_id) + logger.info("Using workspace for context_id=%s: %s", context_id, workspace_path) + else: + workspace_path = "/tmp/sandbox-stateless" + Path(workspace_path).mkdir(parents=True, exist_ok=True) + logger.info("No context_id; using stateless workspace: %s", workspace_path) + + # Lazy-init PostgreSQL checkpointer on first execute() + await self._ensure_checkpointer() + + # 3. Build graph with shared checkpointer for multi-turn memory + namespace = os.environ.get("NAMESPACE", "team1") + graph = build_graph( + workspace_path=workspace_path, + permission_checker=self._permission_checker, + sources_config=self._sources_config, + checkpointer=self._checkpointer, + context_id=context_id or "stateless", + namespace=namespace, + ) + + # 4. Stream graph execution with thread_id for checkpointer routing. + # Acquire a per-context_id lock so that two concurrent requests for + # the same conversation are serialized (the LangGraph checkpointer + # is not safe for parallel writes to the same thread_id). + lock = await self._get_context_lock(context_id or "stateless") + logger.info( + "Acquiring context lock for context_id=%s (already locked: %s)", + context_id, + lock.locked(), + ) + + async with lock: + messages = [HumanMessage(content=context.get_user_input())] + input_state: dict[str, Any] = { + "messages": messages, + "workspace_path": workspace_path, + "context_id": context_id or "stateless", + } + + # Extract skill from A2A message metadata and load its content. + # TODO(Session N): Once base image moves to kagenti repo, use + # skill_pack_loader.py at startup to clone verified skill packs + # from skill-packs.yaml into /workspace/.claude/skills/ before + # the first message. Currently skills must be pre-populated. 
+ msg = context.message + skill_id = None + if msg and msg.metadata: + skill_id = msg.metadata.get("skill") + + if skill_id: + skill_content = _load_skill(workspace_path, skill_id) + if skill_content: + input_state["skill_instructions"] = ( + f'\n' + f"{skill_content}\n" + f"\n\n" + f"Follow the skill instructions above for this task." + ) + logger.info("Loaded skill '%s' for context_id=%s", skill_id, context_id) + else: + logger.warning("Skill '%s' requested but not found in workspace %s", skill_id, workspace_path) + + graph_config = { + "configurable": {"thread_id": context_id or "stateless"}, + "recursion_limit": AgentBudget().recursion_limit, + } + logger.info("Processing messages: %s (thread_id=%s)", input_state, context_id) + + try: + output = None + serializer = LangGraphSerializer(context_id=context_id) + llm_request_ids: list[str] = [] + + # Run graph in a shielded background task so client disconnect + # does NOT cancel the LangGraph execution. Events are fed + # through an asyncio.Queue; the consumer (below) forwards them + # to the A2A event stream. If the consumer is cancelled the + # graph keeps running and saves results to the task store. + _SENTINEL = object() + event_queue: asyncio.Queue = asyncio.Queue() + + async def _run_graph() -> None: + """Execute graph and push events to queue (shielded).""" + nonlocal graph + max_retries = 3 + for attempt in range(max_retries + 1): + try: + async for ev in graph.astream( + input_state, config=graph_config, stream_mode="updates" + ): + await event_queue.put(ev) + break # success + except Exception as retry_err: + err_str = str(retry_err).lower() + is_quota = "insufficient_quota" in err_str + is_rate = "rate_limit" in err_str or "429" in err_str + is_db_stale = "connection is closed" in err_str or "operationalerror" in err_str + if is_quota: + logger.error("LLM quota exceeded: %s", retry_err) + await event_queue.put( + {"_error": "LLM API quota exceeded. 
Check billing."} + ) + break + elif is_db_stale and attempt < max_retries: + logger.warning( + "DB connection stale (%d/%d), re-initializing checkpointer: %s", + attempt + 1, max_retries, retry_err, + ) + await self._ensure_checkpointer() + # Rebuild graph with fresh checkpointer + graph = build_graph( + workspace_path=workspace_path, + permission_checker=self._permission_checker, + sources_config=self._sources_config, + checkpointer=self._checkpointer, + context_id=context_id or "stateless", + namespace=namespace, + ) + continue + elif is_rate and attempt < max_retries: + delay = 2 ** (attempt + 1) + logger.warning( + "Rate limited (%d/%d), retrying in %ds: %s", + attempt + 1, max_retries, delay, retry_err, + ) + await asyncio.sleep(delay) + continue + else: + logger.error("Graph execution failed: %s", retry_err, exc_info=True) + await event_queue.put({"_error": str(retry_err)}) + break + await event_queue.put(_SENTINEL) + + # Shield the graph task from cancellation + graph_task = asyncio.ensure_future(asyncio.shield(_run_graph())) + + # Consume events from the queue — this side CAN be cancelled + event_count = 0 + client_disconnected = False + while True: + try: + event = await event_queue.get() + except asyncio.CancelledError: + logger.warning( + "Event consumer cancelled (context=%s) — graph continues in background", + context_id, + ) + client_disconnected = True + break + if event is _SENTINEL: + break + if "_error" in event: + error_msg = event["_error"] + await task_updater.update_status( + TaskState.working, + new_agent_text_message( + json.dumps({"type": "error", "message": error_msg}), + task_updater.context_id, + task_updater.task_id, + ), + ) + parts = [TextPart(text=f"Error: {error_msg}")] + await task_updater.add_artifact(parts) + await task_updater.failed() + return + + event_count += 1 + node_names = list(event.keys()) + logger.info( + "Graph event %d: nodes=%s (context=%s)", + event_count, node_names, context_id, + ) + + # Skip __interrupt__ events 
(HITL pause) — these contain + # tuples, not dicts, and shouldn't be serialized. + if "__interrupt__" in event: + logger.info( + "Graph interrupted (HITL) at event %d: %s", + event_count, event.get("__interrupt__"), + ) + # Emit a structured HITL event for the frontend + hitl_data = event.get("__interrupt__", ()) + hitl_msg = str(hitl_data[0]) if hitl_data else "Approval required" + hitl_json = json.dumps({ + "type": "hitl_request", + "loop_id": serializer._loop_id, + "message": hitl_msg[:500], + }) + await task_updater.update_status( + TaskState.working, + new_agent_text_message( + hitl_json + "\n", + task_updater.context_id, + task_updater.task_id, + ), + ) + continue + + # Send intermediate status updates as structured JSON + try: + serialized_lines = "\n".join( + serializer.serialize(key, value) + for key, value in event.items() + if isinstance(value, dict) + ) + "\n" + await task_updater.update_status( + TaskState.working, + new_agent_text_message( + serialized_lines, + task_updater.context_id, + task_updater.task_id, + ), + ) + line_types = [] + for line in serialized_lines.split("\n"): + line = line.strip() + if line: + try: + lt = json.loads(line).get("type", "?") + line_types.append(lt) + except json.JSONDecodeError: + line_types.append("parse_error") + logger.info("A2A_EMIT session=%s lines=%d types=%s", + context_id, len(line_types), line_types) + except asyncio.CancelledError: + logger.warning( + "SSE update cancelled at event %d (context=%s) — client disconnected", + event_count, context_id, + ) + client_disconnected = True + break + except Exception as update_err: + logger.error( + "Failed to send SSE update for event %d: %s", + event_count, update_err, + ) + output = event + + # Capture LLM request_ids from AIMessage responses + for _node_val in event.values(): + if isinstance(_node_val, dict): + for _msg in _node_val.get("messages", []): + _rid = getattr(_msg, "response_metadata", {}).get("id") + if _rid and _rid not in llm_request_ids: + 
llm_request_ids.append(_rid) + + # If client disconnected, wait for graph to finish in background + if client_disconnected: + logger.info("Waiting for graph to complete in background (context=%s)", context_id) + try: + await asyncio.wait_for(graph_task, timeout=300) + except (asyncio.TimeoutError, asyncio.CancelledError): + logger.warning("Graph background task timed out or cancelled (context=%s)", context_id) + # Drain remaining events — serialize and persist them + # since the SSE consumer was cancelled and missed these. + bg_event_count = 0 + bg_serialized_lines: list[str] = [] + while not event_queue.empty(): + ev = event_queue.get_nowait() + if ev is _SENTINEL or "_error" in ev: + continue + output = ev + bg_event_count += 1 + # Serialize each event so it can be persisted + try: + for key, value in ev.items(): + if isinstance(value, dict): + serialized = serializer.serialize(key, value) + bg_serialized_lines.append(serialized) + except Exception as ser_err: + logger.warning("Failed to serialize bg event %d: %s", bg_event_count, ser_err) + if bg_event_count > 0: + logger.info( + "Drained %d background events for context=%s, serialized %d lines", + bg_event_count, context_id, len(bg_serialized_lines), + ) + # Persist via task_updater so the events appear in history + for line_block in bg_serialized_lines: + try: + await task_updater.update_status( + TaskState.working, + new_agent_text_message( + line_block + "\n", + task_updater.context_id, + task_updater.task_id, + ), + ) + except Exception: + pass # best-effort + + # Extract final answer from the last event. + # The reporter node sets {"final_answer": "..."}. + # Fall back to checking messages from reporter or executor. + final_answer = None + if output: + # 1. Check reporter node output (plan-execute-reflect) + reporter_output = output.get("reporter", {}) + if isinstance(reporter_output, dict): + final_answer = reporter_output.get("final_answer") + + # 2. 
Fall back to executor/assistant message content + if not final_answer: + for node_name in ("reporter", "executor", "assistant"): + node_output = output.get(node_name, {}) + if isinstance(node_output, dict): + msgs = node_output.get("messages", []) + if msgs: + content = getattr(msgs[-1], "content", None) + if isinstance(content, list): + final_answer = "\n".join( + block.get("text", "") if isinstance(block, dict) else str(block) + for block in content + if isinstance(block, dict) and block.get("type") == "text" + ) or None + elif content: + final_answer = str(content) + if final_answer: + break + + if final_answer is None: + final_answer = "No response generated." + + # Store LLM request_ids in task metadata for token usage tracking + if llm_request_ids: + try: + existing_meta = {} + if task.metadata: + existing_meta = dict(task.metadata) if not isinstance(task.metadata, dict) else task.metadata + existing_meta["llm_request_ids"] = llm_request_ids + task.metadata = existing_meta + logger.info( + "Stored %d LLM request_ids in task metadata for context_id=%s", + len(llm_request_ids), context_id, + ) + except Exception as meta_err: + logger.warning("Failed to store llm_request_ids: %s", meta_err) + + # Add artifact with final answer and complete + parts = [TextPart(text=final_answer)] + await task_updater.add_artifact(parts) + await task_updater.complete() + + except asyncio.CancelledError: + logger.warning( + "Graph execution context cancelled for context=%s — client likely disconnected. " + "Agent will continue processing and save results to task store.", + context_id, + ) + # Don't return — fall through to save results to task store. + # The A2A SDK persists the task, so the client can poll later. 
+ except Exception as e: + logger.error("Graph execution error: %s", e, exc_info=True) + error_msg = json.dumps({"type": "error", "message": str(e)}) + await task_updater.update_status( + TaskState.working, + new_agent_text_message( + error_msg, + task_updater.context_id, + task_updater.task_id, + ), + ) + parts = [TextPart(text=f"Error: {e}")] + await task_updater.add_artifact(parts) + await task_updater.failed() + + # Periodic cleanup: remove locks that are no longer held and whose + # context_id has not been seen recently. We do this opportunistically + # after each execution to avoid unbounded growth. + async with self._context_locks_mutex: + stale = [cid for cid, lk in self._context_locks.items() if not lk.locked()] + # Keep the dict from growing without bound, but only drop entries + # when there are more than 1000 idle locks. + if len(stale) > 1000: + for cid in stale: + del self._context_locks[cid] + logger.debug("Cleaned up %d idle context locks", len(stale)) + + # ------------------------------------------------------------------ + + async def cancel( + self, context: RequestContext, event_queue: EventQueue + ) -> None: + """Cancel is not supported.""" + raise Exception("cancel not supported") + + +# --------------------------------------------------------------------------- +# Server entry point +# --------------------------------------------------------------------------- + + +class _MergingDatabaseTaskStore(DatabaseTaskStore): + """DatabaseTaskStore that preserves backend-managed metadata fields. + + The backend writes fields like ``owner``, ``agent_name``, ``loop_events`` + to the ``metadata`` column. The default ``save()`` uses SQLAlchemy + ``merge()`` which overwrites the entire row, losing those fields. + + This subclass reads existing metadata before writing and merges + backend-managed keys so they survive A2A SDK updates. 
+ """ + + _BACKEND_KEYS = frozenset({ + "owner", "visibility", "title", "agent_name", "loop_events", + }) + + async def save(self, task, context=None): + """Save task while preserving backend-managed metadata fields.""" + await self._ensure_initialized() + + # Read existing metadata before overwriting + existing_meta = {} + async with self.async_session_maker() as session: + from sqlalchemy import select + stmt = select(self.task_model).where(self.task_model.id == task.id) + result = await session.execute(stmt) + existing = result.scalar_one_or_none() + if existing and existing.task_metadata: + raw = existing.task_metadata + if isinstance(raw, dict): + existing_meta = raw + elif isinstance(raw, str): + import json + try: + existing_meta = json.loads(raw) + except (json.JSONDecodeError, TypeError): + pass + + # Merge: start with new task metadata, overlay backend fields from existing + merged = dict(task.metadata or {}) if task.metadata else {} + for key in self._BACKEND_KEYS: + if key in existing_meta and key not in merged: + merged[key] = existing_meta[key] + + # Update task metadata with merged result + task.metadata = merged if merged else task.metadata + + # Call parent save (which does session.merge) + db_task = self._to_orm(task) + async with self.async_session_maker.begin() as session: + await session.merge(db_task) + logger.debug("Task %s saved with merged metadata (keys=%s)", + task.id, list(merged.keys()) if merged else []) + + +def _create_task_store(): + """Create the appropriate TaskStore based on configuration. + + Uses _MergingDatabaseTaskStore (PostgreSQL) when TASK_STORE_DB_URL + is set. Falls back to InMemoryTaskStore for dev/test. + + The merging store preserves backend-managed metadata fields (owner, + agent_name, loop_events) that would otherwise be overwritten by + the A2A SDK's session.merge(). 
+ """ + import os + + db_url = os.environ.get("TASK_STORE_DB_URL", "") + if db_url and _HAS_SQL_STORE: + from sqlalchemy.ext.asyncio import create_async_engine + + engine = create_async_engine( + db_url, + pool_size=5, + max_overflow=3, + pool_recycle=300, # Recycle connections every 5 min + pool_pre_ping=True, # Verify connection before use + ) + store = _MergingDatabaseTaskStore(engine) + logger.info("Using MergingDatabaseTaskStore: %s", db_url.split("@")[-1]) + return store + + logger.info("Using InMemoryTaskStore (set TASK_STORE_DB_URL for persistence)") + return InMemoryTaskStore() + + +def _load_skill_packs_at_startup() -> None: + """Clone skill repos into /workspace/.claude/skills/ at startup. + + Reads SKILL_REPOS env var (comma-separated git URLs with optional + path suffix after #). Falls back to kagenti repo skills. + + TODO(Session N): Replace with skill_pack_loader.py once the base + image moves to the kagenti repo. + """ + import subprocess + + workspace = os.environ.get("WORKSPACE_DIR", "/workspace") + skills_dir = Path(workspace) / ".claude" / "skills" + + if skills_dir.exists() and any(skills_dir.rglob("*.md")): + logger.info("Skills already loaded at %s, skipping clone", skills_dir) + return + + # Default: clone kagenti skills from the upstream public repo + repos = os.environ.get( + "SKILL_REPOS", + "https://github.com/kagenti/kagenti.git#.claude/skills", + ) + + for entry in repos.split(","): + entry = entry.strip() + if not entry: + continue + + # Parse "url@branch#path" format + branch = None + if "#" in entry: + url_part, skill_path = entry.rsplit("#", 1) + else: + url_part, skill_path = entry, ".claude/skills" + if "@" in url_part and not url_part.startswith("git@"): + repo_url, branch = url_part.rsplit("@", 1) + else: + repo_url = url_part + + clone_dir = Path(workspace) / ".skill-repos" / repo_url.split("/")[-1].replace(".git", "") + + # Remove stale clone if exists (pod restart) + if clone_dir.exists(): + subprocess.run(["rm", "-rf", 
str(clone_dir)], capture_output=True, timeout=10) + + try: + cmd = ["git", "clone", "--depth", "1", "--single-branch"] + if branch: + cmd += ["--branch", branch] + cmd += [repo_url, str(clone_dir)] + logger.info("Cloning skills from %s branch=%s (path: %s)", repo_url, branch or "default", skill_path) + subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=120, + ) + + src = clone_dir / skill_path + if src.is_dir(): + skills_dir.mkdir(parents=True, exist_ok=True) + # Copy skill files (preserve directory structure) + subprocess.run( + ["cp", "-r"] + [str(p) for p in src.iterdir()] + [str(skills_dir)], + capture_output=True, + timeout=30, + ) + count = len(list(skills_dir.rglob("*.md"))) + logger.info("Loaded %d skill files from %s", count, repo_url) + else: + logger.warning("Skill path %s not found in %s", skill_path, repo_url) + except subprocess.TimeoutExpired: + logger.warning("Timeout cloning %s", repo_url) + except Exception as e: + logger.warning("Failed to clone skills from %s: %s", repo_url, e) + + +def run() -> None: + """Create the A2A server application and run it with uvicorn.""" + # Landlock probe: verify filesystem isolation works before accepting requests. + # Runs in a forked child (Landlock is irreversible). Exits the process if + # the kernel does not support Landlock or the probe fails. + if os.environ.get("SANDBOX_LANDLOCK") == "true": + from sandbox_agent.landlock_probe import probe_landlock + + abi = probe_landlock() # exits process if Landlock unavailable + logger.info("Landlock probe passed -- ABI version %d", abi) + + # Initialize OTel GenAI auto-instrumentation (if OTEL_EXPORTER_OTLP_ENDPOINT is set). + # NOTE: Only LangChain/OpenAI auto-instrumentation is enabled here. + # The HTTP middleware is disabled because it interferes with SSE streaming + # (BaseHTTPMiddleware captures response body, breaking streaming connections). + # TODO: Replace with per-node span emission from AgentGraphCard processing. 
+ setup_observability() + + # Load skills from git repos before building the agent card + _load_skill_packs_at_startup() + + agent_card = get_agent_card(host="0.0.0.0", port=8000) + + request_handler = DefaultRequestHandler( + agent_executor=SandboxAgentExecutor(), + task_store=_create_task_store(), + ) + + server = A2AStarletteApplication( + agent_card=agent_card, + http_handler=request_handler, + ) + + # Build the Starlette app + app = server.build() + + # NOTE: OTel HTTP middleware REMOVED — it breaks SSE streaming. + # BaseHTTPMiddleware wraps the response body iterator, which causes + # CancelledError propagation when SSE clients disconnect. This kills + # the event queue and prevents event delivery. + # Future: emit spans from AgentGraphCard event processing instead. + + # Add the /.well-known/agent-card.json route + app.routes.insert( + 0, + Route( + "/.well-known/agent-card.json", + server._handle_get_agent_card, + methods=["GET"], + name="agent_card_well_known", + ), + ) + + # Build the graph card from the compiled LangGraph. + # We compile a temporary graph just for introspection (no checkpointer needed). 
+ _graph_card_cache: dict[str, Any] = {} + + async def _handle_graph_card(request: Any) -> Any: # noqa: ARG001 + from starlette.responses import JSONResponse + + if not _graph_card_cache: + # Build a graph for introspection only (no checkpointer, dummy config) + from sandbox_agent.permissions import PermissionChecker + from sandbox_agent.sources import SourcesConfig + pc = PermissionChecker(settings={"workspace": "/workspace", "permissions": {}}) + sc = SourcesConfig() + compiled = build_graph( + workspace_path="/workspace", + permission_checker=pc, + sources_config=sc, + checkpointer=None, + ) + _graph_card_cache.update( + build_graph_card(compiled, agent_id="sandbox-legion-v1") + ) + return JSONResponse(_graph_card_cache) + + app.routes.insert( + 0, + Route( + "/.well-known/agent-graph-card.json", + _handle_graph_card, + methods=["GET"], + name="agent_graph_card", + ), + ) + + uvicorn.run(app, host="0.0.0.0", port=8000) From f29250db09bc931240284ef07df24d710766e822 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:47:34 +0100 Subject: [PATCH 03/26] feat(sandbox): budget tracking with iteration, token, tool-call, and wall-clock limits Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/budget.py | 177 ++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/budget.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/budget.py b/a2a/sandbox_agent/src/sandbox_agent/budget.py new file mode 100644 index 00000000..87816781 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/budget.py @@ -0,0 +1,177 @@ +"""Budget tracking for the plan-execute-reflect reasoning loop. + +Prevents runaway execution by capping iterations, tool calls per step, +total token usage, and wall clock time. When the budget is exceeded the +reflector forces the loop to terminate gracefully. 
+
+Token budget is enforced via the LLM Budget Proxy:
+- The proxy intercepts all LLM calls and checks per-session token usage
+- When budget is exceeded, the proxy returns HTTP 402
+- The agent catches 402 errors and terminates gracefully
+- The local ``tokens_used`` counter tracks in-process usage for budget
+  summary events (emitted to the UI) and for the local ``exceeded`` check
+
+Budget scopes:
+- **Per-message** (single graph run): max_iterations, max_wall_clock_s, recursion_limit
+- **Per-step** (within one plan step): max_tool_calls_per_step
+- **Per-session** (across A2A turns + restarts): enforced by LLM Budget Proxy
+
+Budget parameters are configurable via environment variables:
+
+- ``SANDBOX_MAX_ITERATIONS`` (default: 200)
+- ``SANDBOX_MAX_TOOL_CALLS_PER_STEP`` (default: 20)
+- ``SANDBOX_MAX_TOKENS`` (default: 1000000) — passed to proxy via metadata
+- ``SANDBOX_MAX_WALL_CLOCK_S`` (default: 3600) — max seconds per message (1 hour)
+- ``SANDBOX_HITL_INTERVAL`` (default: 50)
+- ``SANDBOX_RECURSION_LIMIT`` (default: 300)
+- ``SANDBOX_LLM_TIMEOUT`` (default: 300) — seconds per LLM call
+- ``SANDBOX_LLM_MAX_RETRIES`` (default: 3) — retry on transient LLM errors
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import time
+from dataclasses import dataclass, field
+
+logger = logging.getLogger(__name__)
+
+
+def _env_int(name: str, default: int) -> int:
+    """Read an integer from the environment, falling back to *default*."""
+    raw = os.environ.get(name)
+    if raw is None:
+        return default
+    try:
+        return int(raw)
+    except ValueError:
+        return default
+
+
+@dataclass
+class AgentBudget:
+    """Tracks resource usage across the reasoning loop.
+
+    Attributes
+    ----------
+    max_iterations:
+        Maximum outer-loop iterations (planner → executor → reflector).
+    max_tool_calls_per_step:
+        Maximum tool invocations the executor may make for a single plan step.
+    max_tokens:
+        Approximate upper bound on total tokens consumed (prompt + completion). 
+ Passed to the LLM Budget Proxy via request metadata. + max_wall_clock_s: + Maximum wall clock time in seconds for a single message run. + hitl_interval: + After this many iterations, the reflector suggests a human check-in. + recursion_limit: + LangGraph recursion limit passed to graph invocation config. + """ + + max_iterations: int = _env_int("SANDBOX_MAX_ITERATIONS", 200) + max_tool_calls_per_step: int = _env_int("SANDBOX_MAX_TOOL_CALLS_PER_STEP", 20) + max_tokens: int = _env_int("SANDBOX_MAX_TOKENS", 1_000_000) + max_wall_clock_s: int = _env_int("SANDBOX_MAX_WALL_CLOCK_S", 3600) # 1 hour + hitl_interval: int = _env_int("SANDBOX_HITL_INTERVAL", 50) + recursion_limit: int = _env_int("SANDBOX_RECURSION_LIMIT", 300) + llm_timeout: int = _env_int("SANDBOX_LLM_TIMEOUT", 300) + llm_max_retries: int = _env_int("SANDBOX_LLM_MAX_RETRIES", 3) + + # Mutable runtime counters — not constructor args. + iterations_used: int = field(default=0, init=False) + tokens_used: int = field(default=0, init=False) + tool_calls_this_step: int = field(default=0, init=False) + _start_time: float = field(default_factory=time.monotonic, init=False) + + # -- helpers ------------------------------------------------------------- + + def tick_iteration(self) -> None: + """Advance the iteration counter by one.""" + self.iterations_used += 1 + + def add_tokens(self, count: int) -> None: + """Accumulate *count* tokens (prompt + completion). + + Tracks in-process token usage for budget summary events and the + local ``exceeded`` check. The authoritative budget enforcement + is done by the LLM Budget Proxy (returns 402 when exceeded). 
+ """ + self.tokens_used += count + if self.tokens_exceeded: + logger.warning( + "Budget: tokens exceeded %d/%d", + self.tokens_used, + self.max_tokens, + ) + + def tick_tool_call(self) -> None: + """Record a tool invocation within the current step.""" + self.tool_calls_this_step += 1 + + def reset_step_tools(self) -> None: + """Reset the per-step tool-call counter (called between plan steps).""" + self.tool_calls_this_step = 0 + + # -- queries ------------------------------------------------------------- + + @property + def wall_clock_s(self) -> float: + """Seconds elapsed since this budget was created.""" + return time.monotonic() - self._start_time + + @property + def iterations_exceeded(self) -> bool: + return self.iterations_used >= self.max_iterations + + @property + def tokens_exceeded(self) -> bool: + return self.tokens_used >= self.max_tokens + + @property + def wall_clock_exceeded(self) -> bool: + return self.wall_clock_s >= self.max_wall_clock_s + + @property + def step_tools_exceeded(self) -> bool: + return self.tool_calls_this_step >= self.max_tool_calls_per_step + + @property + def exceeded(self) -> bool: + """Return True if *any* local budget limit has been reached. + + Token budget is NOT checked here — it is enforced by the LLM + Budget Proxy (returns HTTP 402). The agent catches 402 errors + in the executor/reflector/reporter nodes. 
+ """ + return self.iterations_exceeded or self.wall_clock_exceeded + + @property + def exceeded_reason(self) -> str | None: + """Human-readable reason for why the budget was exceeded, or None.""" + if self.iterations_exceeded: + return f"Iteration limit reached ({self.iterations_used}/{self.max_iterations})" + if self.wall_clock_exceeded: + return f"Time limit reached ({self.wall_clock_s:.0f}s/{self.max_wall_clock_s}s)" + return None + + @property + def needs_hitl_checkin(self) -> bool: + """Return True when it's time for a human-in-the-loop check-in.""" + return ( + self.hitl_interval > 0 + and self.iterations_used > 0 + and self.iterations_used % self.hitl_interval == 0 + ) + + def summary(self) -> dict: + """Return budget state as a dict for event serialization.""" + return { + "tokens_used": self.tokens_used, + "tokens_budget": self.max_tokens, + "iterations_used": self.iterations_used, + "iterations_budget": self.max_iterations, + "wall_clock_s": round(self.wall_clock_s, 1), + "max_wall_clock_s": self.max_wall_clock_s, + } From 03ccd9898e59991abdd625321613a8a4fbcd3677 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:47:40 +0100 Subject: [PATCH 04/26] feat(sandbox): pydantic configuration with per-node LLM model overrides Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/configuration.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/configuration.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/configuration.py b/a2a/sandbox_agent/src/sandbox_agent/configuration.py new file mode 100644 index 00000000..e712f1fd --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/configuration.py @@ -0,0 +1,30 @@ +from pydantic_settings import BaseSettings + + +class Configuration(BaseSettings): + llm_model: str = "llama3.1" + llm_api_base: str = "http://localhost:11434/v1" + llm_api_key: str = "dummy" + workspace_root: str = "/workspace" + checkpoint_db_url: str = "memory" + 
context_ttl_days: int = 7 + + # Per-node model overrides (empty = use llm_model default) + llm_model_planner: str = "" + llm_model_executor: str = "" + llm_model_reflector: str = "" + llm_model_reporter: str = "" + llm_model_thinking: str = "" # bare LLM for thinking iterations + llm_model_micro_reasoning: str = "" # LLM+tools for micro-reasoning + + def model_for_node(self, node: str) -> str: + """Return the model to use for a specific node type.""" + overrides = { + "planner": self.llm_model_planner, + "executor": self.llm_model_executor, + "reflector": self.llm_model_reflector, + "reporter": self.llm_model_reporter, + "thinking": self.llm_model_thinking, + "micro_reasoning": self.llm_model_micro_reasoning, + } + return overrides.get(node, "") or self.llm_model From df4beba6e5d5c1052cf8dfbd3836e3aba57a26b1 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:47:47 +0100 Subject: [PATCH 05/26] feat(sandbox): context builders for per-node message isolation in the reasoning loop Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/context_builders.py | 739 ++++++++++++++++++ 1 file changed, 739 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/context_builders.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/context_builders.py b/a2a/sandbox_agent/src/sandbox_agent/context_builders.py new file mode 100644 index 00000000..c3404711 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/context_builders.py @@ -0,0 +1,739 @@ +"""Pure functions that build the message list for each reasoning node, +and an ``invoke_llm`` wrapper that guarantees the debug output matches +exactly what was sent to the LLM. + +Each builder takes the graph state and returns a list of BaseMessage objects +that the node should pass to ``llm.ainvoke()``. The functions are +independently testable and enforce context isolation — no node sees +messages it shouldn't. 
+
+Context contracts:
+
+    Planner — SystemMessage(prompt + step status) + HumanMessage(user request only).
+        Does NOT include own previous AIMessages (prevents replan duplication).
+    Executor — SystemMessage(prompt) + HumanMessage(step brief) + this step's tool pairs.
+        Stops at [STEP_BOUNDARY] SystemMessage. Never sees planner output.
+    Reflector — SystemMessage(prompt) + last 10 tool-call AI→Tool pairs.
+        Filters out non-tool AIMessages (planner/reflector text).
+    Reporter — SystemMessage(prompt) + full history (intentional for summarization).
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Any
+
+from langchain_core.messages import (
+    AIMessage,
+    BaseMessage,
+    HumanMessage,
+    SystemMessage,
+    ToolMessage,
+)
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Planner context
+# ---------------------------------------------------------------------------
+
+_MAX_PLANNER_HISTORY_MSGS = 6  # user request + a few recent tool results
+
+
+def build_planner_context(
+    state: dict[str, Any],
+    system_content: str,
+) -> list[BaseMessage]:
+    """Build the message list for the planner node.
+
+    On fresh plan (iteration 0): SystemMessage + all user HumanMessages.
+    On replan (iteration > 0): SystemMessage + user request + last few
+    ToolMessages for context. **Excludes** previous planner AIMessages
+    to prevent the LLM from seeing and duplicating its own plan.
+
+    The step status and tool history are already in ``system_content``
+    (built by the caller), so they don't need to appear as messages. 
+ """ + messages = state.get("messages", []) + iteration = state.get("iteration", 0) + + if iteration == 0: + # Fresh plan: include only HumanMessages (user requests) + user_msgs = [m for m in messages if isinstance(m, HumanMessage)] + return [SystemMessage(content=system_content)] + user_msgs + + # Replan: user request + last few tool results for context. + # Explicitly EXCLUDE previous planner AIMessages to prevent duplication. + user_msgs = [m for m in messages if isinstance(m, HumanMessage)] + # Take the first user message (original request) + first_user = user_msgs[:1] if user_msgs else [] + + # Include last few ToolMessages so planner knows what was tried + recent_tools: list[BaseMessage] = [] + for m in reversed(messages): + if isinstance(m, ToolMessage): + recent_tools.insert(0, m) + if len(recent_tools) >= _MAX_PLANNER_HISTORY_MSGS: + break + + result = [SystemMessage(content=system_content)] + first_user + recent_tools + logger.info( + "Planner context: %d messages (iteration=%d, %d tool results)", + len(result), iteration, len(recent_tools), + extra={"session_id": state.get("context_id", ""), "node": "planner"}, + ) + return result + + +# --------------------------------------------------------------------------- +# Executor context +# --------------------------------------------------------------------------- + +_CHARS_PER_TOKEN = 4 +_MAX_CONTEXT_CHARS = 30_000 * _CHARS_PER_TOKEN # ~120k chars + + +def build_executor_context( + state: dict[str, Any], + system_content: str, +) -> list[BaseMessage]: + """Build the message list for the executor node. + + On new step (tool_call_count == 0): + SystemMessage(prompt) + HumanMessage(step brief). + The executor sees ONLY the step description — no plan, no history. + + On continuing step (tool_call_count > 0): + SystemMessage(prompt) + HumanMessage(step brief) + this step's + AI→Tool message pairs + HumanMessage(reflection prompt). 
+ The reflection prompt at the END forces the LLM to think about + the results before calling the next tool. + """ + all_msgs = state.get("messages", []) + current_step = state.get("current_step", 0) + tool_call_count = state.get("_tool_call_count", 0) + plan = state.get("plan", []) + step_text = plan[current_step] if current_step < len(plan) else "N/A" + step_brief = state.get( + "skill_instructions", + f"Execute step {current_step + 1}: {step_text}", + ) + + first_msg = [HumanMessage(content=step_brief)] + + if tool_call_count == 0: + # New step: only the step brief + windowed: list[BaseMessage] = [] + else: + # Continuing: walk back to [STEP_BOUNDARY N] SystemMessage, + # then inject a HumanMessage reflection after EACH ToolMessage. + raw_windowed: list[BaseMessage] = [] + used_chars = 0 + for m in reversed(all_msgs): + content = str(getattr(m, "content", "")) + if isinstance(m, SystemMessage) and content.startswith( + f"[STEP_BOUNDARY {current_step}]" + ): + break + msg_chars = len(content) + if used_chars + msg_chars > _MAX_CONTEXT_CHARS: + break + raw_windowed.insert(0, m) + used_chars += msg_chars + + # Inject reflection HumanMessage after each ToolMessage + windowed = [] + call_num = 0 + for m in raw_windowed: + windowed.append(m) + if isinstance(m, ToolMessage): + call_num += 1 + tool_name = getattr(m, "name", "unknown") + content = str(getattr(m, "content", "")) + # Determine status from exit code + if "EXIT_CODE:" in content: + import re as _re + ec_match = _re.search(r"EXIT_CODE:\s*(\d+)", content) + status = "FAILED" if ec_match and ec_match.group(1) != "0" else "OK" + error_hint = content[:150] if status == "FAILED" else "" + elif content.startswith("Error:") or "Permission denied" in content: + status = "FAILED" + error_hint = content[:150] + else: + status = "OK" + error_hint = "" + + reflection_parts = [ + f"Tool '{tool_name}' call {call_num} {status}.", + ] + if error_hint: + reflection_parts.append(f"Error: {error_hint}") + if "unknown flag" in 
content.lower() or "invalid option" in content.lower(): + reflection_parts.append( + "The flag is INVALID. Run the command with --help to see valid flags." + ) + reflection_parts.append( + f"Goal: \"{step_text[:100]}\"\n" + f"If goal ACHIEVED → stop, summarize result. " + f"If FAILED → try DIFFERENT approach. " + f"NEVER repeat same command." + ) + windowed.append(HumanMessage(content=" ".join(reflection_parts))) + + result = [SystemMessage(content=system_content)] + first_msg + windowed + logger.info( + "Executor context: %d messages, ~%dk chars (from %d total)", + len(result), sum(len(str(getattr(m, "content", ""))) for m in result) // 1000, + len(all_msgs), + extra={ + "session_id": state.get("context_id", ""), + "node": "executor", + "current_step": current_step, + "tool_call_count": tool_call_count, + }, + ) + return result + + +# --------------------------------------------------------------------------- +# Reflector context +# --------------------------------------------------------------------------- + +_MAX_REFLECTOR_PAIRS = 10 # last 10 AI→Tool pairs (20 messages max) + + +def build_reflector_context( + state: dict[str, Any], + system_content: str, +) -> list[BaseMessage]: + """Build the message list for the reflector node. + + Includes only the last ``_MAX_REFLECTOR_PAIRS`` AI→Tool pairs from + the message history. **Filters out** AIMessages that have no + ``tool_calls`` (planner plan text, reflector decisions, executor + summaries) to prevent plan leakage. + + The plan text and step results are already in ``system_content`` + (formatted from state fields), so they don't need to appear as + conversation messages. + """ + messages = state.get("messages", []) + + recent_msgs: list[BaseMessage] = [] + pair_count = 0 + for m in reversed(messages): + if isinstance(m, SystemMessage): + continue + # Skip AIMessages without tool_calls (planner/reflector text output). + # These would leak plan context into the reflector. 
+ if isinstance(m, AIMessage) and not getattr(m, "tool_calls", None): + continue + recent_msgs.insert(0, m) + if isinstance(m, AIMessage) and getattr(m, "tool_calls", None): + pair_count += 1 + if pair_count >= _MAX_REFLECTOR_PAIRS: + break + + result = [SystemMessage(content=system_content)] + recent_msgs + logger.info( + "Reflector context: %d messages (%d tool pairs from %d total)", + len(result), pair_count, len(messages), + extra={"session_id": state.get("context_id", ""), "node": "reflector"}, + ) + return result + + +# --------------------------------------------------------------------------- +# LLM invocation wrapper — captures exactly what the LLM sees +# --------------------------------------------------------------------------- + +_DEBUG_PROMPTS = os.environ.get("SANDBOX_DEBUG_PROMPTS", "1") == "1" + + +@dataclass +class LLMCallCapture: + """Captures the exact input/output of an LLM invocation. + + Always populated (not conditional on _DEBUG_PROMPTS) so that the + node result can decide what to include. This guarantees the debug + view shows exactly what the LLM received — no drift. + """ + + messages: list = field(default_factory=list) + response: Any = None + prompt_tokens: int = 0 + completion_tokens: int = 0 + model: str = "" + bound_tools: list = field(default_factory=list) # tool schemas sent to LLM + + # -- Convenience methods for node result dicts ------------------------- + + def debug_fields(self) -> dict[str, Any]: + """Return prompt debug fields for the node result dict. + + Only populated when ``SANDBOX_DEBUG_PROMPTS=1`` (default). + These are large payloads (system prompt, message list, full + response) — optional to reduce event size in production. + Token counts and budget are always included via ``token_fields()``. 
+ """ + if not _DEBUG_PROMPTS: + return {} + result: dict[str, Any] = { + "_system_prompt": self._system_prompt()[:10000], + "_prompt_messages": self._summarize_messages(), + "_llm_response": self._format_response(), + } + if self.bound_tools: + result["_bound_tools"] = self.bound_tools[:50] + return result + + def token_fields(self) -> dict[str, Any]: + """Return token usage fields for the node result dict.""" + return { + "model": self.model, + "prompt_tokens": self.prompt_tokens, + "completion_tokens": self.completion_tokens, + } + + # -- Internal helpers -------------------------------------------------- + + def _system_prompt(self) -> str: + """Extract the system prompt from the captured messages.""" + for m in self.messages: + if isinstance(m, SystemMessage): + return str(m.content) + return "" + + def _summarize_messages(self) -> list[dict[str, str]]: + """Summarize messages as {role, preview} dicts. + + Skips the first SystemMessage since it's already shown as _system_prompt. + """ + result = [] + skip_first_system = True + for msg in self.messages: + if skip_first_system and isinstance(msg, SystemMessage): + skip_first_system = False + continue + role = getattr(msg, "type", "unknown") + content = getattr(msg, "content", "") + if isinstance(content, list): + content = " ".join( + b.get("text", "") + for b in content + if isinstance(b, dict) and b.get("type") == "text" + ) + text = str(content) + tool_calls = getattr(msg, "tool_calls", None) + if tool_calls: + tc_parts = [] + for tc in tool_calls: + name = tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?") + args = tc.get("args", {}) if isinstance(tc, dict) else getattr(tc, "args", {}) + args_str = str(args)[:500] if args else "" + tc_parts.append(f"{name}({args_str})" if args_str else name) + text = f"[tool_calls: {'; '.join(tc_parts)}] {text[:2000]}" + tool_name = getattr(msg, "name", None) + if role == "tool" and tool_name: + text = f"[{tool_name}] {text[:3000]}" + else: + text = 
text[:5000] + result.append({"role": role, "preview": text}) + return result + + def _format_response(self) -> dict[str, Any]: + """Format the LLM response as OpenAI-style dict.""" + resp = self.response + if resp is None: + return {} + try: + meta = getattr(resp, "response_metadata", {}) or {} + content = resp.content + if isinstance(content, list): + content = " ".join( + b.get("text", "") + for b in content + if isinstance(b, dict) and b.get("type") == "text" + ) or None + tool_calls_out = None + if resp.tool_calls: + tool_calls_out = [ + { + "id": tc.get("id", "") if isinstance(tc, dict) else getattr(tc, "id", ""), + "type": "function", + "function": { + "name": tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?"), + "arguments": json.dumps( + tc.get("args", {}) if isinstance(tc, dict) else getattr(tc, "args", {}) + ), + }, + } + for tc in resp.tool_calls + ] + return { + "choices": [{ + "message": { + "role": "assistant", + "content": content if content else None, + "tool_calls": tool_calls_out, + }, + "finish_reason": meta.get("finish_reason", "unknown"), + }], + "model": meta.get("model", ""), + "usage": { + "prompt_tokens": self.prompt_tokens, + "completion_tokens": self.completion_tokens, + }, + "id": meta.get("id", ""), + } + except Exception: + return {"error": "Failed to format response"} + + +def _extract_bound_tools(llm: Any) -> list[dict[str, Any]]: + """Extract tool schemas from a LangChain RunnableBinding.""" + try: + tools = getattr(llm, "kwargs", {}).get("tools", []) + if not tools: + first = getattr(llm, "first", None) + if first: + tools = getattr(first, "kwargs", {}).get("tools", []) + result = [] + for t in tools[:50]: + if isinstance(t, dict): + fn = t.get("function", t) + result.append({"name": fn.get("name", "?"), "description": fn.get("description", "")[:100]}) + elif hasattr(t, "name"): + result.append({"name": t.name, "description": getattr(t, "description", "")[:100]}) + return result + except Exception: + return 
[] + + +async def invoke_llm( + llm: Any, + messages: list[BaseMessage], + *, + node: str = "", + session_id: str = "", + workspace_path: str = "", +) -> tuple[AIMessage, LLMCallCapture]: + """Invoke the LLM and capture the exact input/output. + + If ``workspace_path`` is provided, the workspace preamble is + automatically prepended to the first SystemMessage. This ensures + every LLM call sees the workspace path rule — nodes don't need + to inject it manually. + + Returns ``(response, capture)`` where capture contains: + - ``messages``: the exact messages sent to the LLM (with preamble) + - ``response``: the AIMessage returned + - ``prompt_tokens`` / ``completion_tokens``: token usage + - ``model``: model name from response metadata + + Usage in a node:: + + messages = build_executor_context(state, system_content) + response, capture = await invoke_llm( + llm, messages, node="executor", + workspace_path=state.get("workspace_path", "/workspace"), + ) + """ + # Inject workspace preamble into the first SystemMessage + if workspace_path and messages: + from sandbox_agent.prompts import WORKSPACE_PREAMBLE + + preamble = WORKSPACE_PREAMBLE.format(workspace_path=workspace_path) + if isinstance(messages[0], SystemMessage): + messages = [ + SystemMessage(content=preamble + "\n" + messages[0].content), + *messages[1:], + ] + else: + # No SystemMessage — prepend one + messages = [SystemMessage(content=preamble), *messages] + + response = await llm.ainvoke(messages) + + usage = getattr(response, "usage_metadata", None) or {} + prompt_tokens = usage.get("input_tokens", 0) or usage.get("prompt_tokens", 0) + completion_tokens = usage.get("output_tokens", 0) or usage.get("completion_tokens", 0) + model_name = (getattr(response, "response_metadata", None) or {}).get("model", "") + + # Extract bound tools from the LLM (RunnableBinding stores them in kwargs) + bound_tools = _extract_bound_tools(llm) + + capture = LLMCallCapture( + messages=list(messages), + response=response, + 
prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + model=model_name, + bound_tools=bound_tools, + ) + + logger.info( + "LLM call [%s]: %d messages, %d prompt tokens, %d completion tokens, model=%s", + node, len(messages), prompt_tokens, completion_tokens, model_name, + extra={"session_id": session_id, "node": node, + "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens}, + ) + + return response, capture + + +def _build_tool_descriptions(llm_with_tools: Any) -> str: + """Build a textual description of bound tools for the thinking prompt.""" + tools = _extract_bound_tools(llm_with_tools) + if not tools: + return "" + lines = ["Available tools:"] + for t in tools: + name = t.get("name", "?") + desc = t.get("description", "") + lines.append(f" - {name}: {desc}" if desc else f" - {name}") + return "\n".join(lines) + + +async def invoke_with_tool_loop( + llm_with_tools: Any, + llm_reason: Any | None, + messages: list[BaseMessage], + *, + node: str, + session_id: str, + workspace_path: str, + thinking_budget: int = 5, + max_parallel_tool_calls: int = 5, + max_cycles: int = 1, + tools: list | None = None, +) -> tuple[AIMessage, LLMCallCapture, list[dict[str, Any]]]: + """Invoke LLM with optional thinking iterations + micro-reasoning + tool execution. + + Returns ``(response, capture, sub_events)`` where sub_events is a list + of thinking event dicts — one per thinking iteration. + + When ``tools`` is provided AND ``max_cycles > 1``, runs a full + think → tool-call → execute → see-result → think loop internally. + Tools are executed via ``asyncio.gather`` for parallel calls. + + When ``llm_reason`` is provided (thinking mode): + 1. Thinking loop (up to ``thinking_budget`` iterations): + Bare LLM reasons about what to do. + 2. Micro-reasoning: LLM with tools makes tool calls. + 3. If ``tools`` provided: execute tools, feed results back, loop. 
+ + When ``llm_reason`` is None (single-phase mode): + One call to llm_with_tools with implicit auto. No sub_events. + """ + import asyncio + + sub_events: list[dict[str, Any]] = [] + total_thinking_tokens = 0 + all_captures: list[LLMCallCapture] = [] + + # Build tool lookup for direct execution + tool_map: dict[str, Any] = {} + if tools: + for t in tools: + name = getattr(t, "name", None) + if name: + tool_map[name] = t + + # Track conversation for multi-cycle loops + cycle_messages = list(messages) + + for cycle in range(max(max_cycles, 1)): + last_reasoning = "" + + if llm_reason is not None: + # --- Thinking phase --- + thinking_history: list[BaseMessage] = [] + + for i in range(thinking_budget): + thinking_messages = list(cycle_messages) + thinking_history + + if i == 0: + thinking_messages.append( + HumanMessage(content="Brief analysis (2-3 sentences max): " + "What is the best tool call for this step? " + "If step is already done, say READY: step complete.") + ) + else: + thinking_messages.append( + HumanMessage(content="Refine in 1-2 sentences. " + "When ready: READY: ") + ) + + reason_response, reason_capture = await invoke_llm( + llm_reason, thinking_messages, + node=f"{node}-think-{cycle+1}.{i+1}", session_id=session_id, + workspace_path=workspace_path, + ) + last_reasoning = str(reason_response.content or "").strip() + total_thinking_tokens += reason_capture.prompt_tokens + reason_capture.completion_tokens + + sub_events.append({ + "type": "thinking", + "node": node, + "cycle": cycle + 1, + "iteration": i + 1, + "total_iterations": 0, + "reasoning": last_reasoning, + **reason_capture.debug_fields(), + **reason_capture.token_fields(), + }) + + thinking_summary = last_reasoning[:200] + ("..." if len(last_reasoning) > 200 else "") + thinking_history.extend([ + AIMessage(content=thinking_summary), + HumanMessage(content=f"(Thinking {i+1} recorded. 
Continue or signal READY:)"), + ]) + + if last_reasoning.upper().startswith("READY:"): + break + + # --- Micro-reasoning: LLM with tools --- + tool_messages = cycle_messages + [ + AIMessage(content=last_reasoning or "I need to call a tool for this step."), + HumanMessage(content="Now execute your planned action. Rules:\n" + "- Call step_done(summary='...') if the step is ALREADY COMPLETE.\n" + "- Call ONE tool if there's a single action to take.\n" + "- Call multiple tools ONLY if they are independent (can run in parallel).\n" + "- NEVER call the same tool twice with similar args."), + ] + response, capture = await invoke_llm( + llm_with_tools, tool_messages, + node=f"{node}-tool-{cycle+1}", session_id=session_id, + workspace_path=workspace_path, + ) + capture.prompt_tokens += total_thinking_tokens + all_captures.append(capture) + + else: + # Single-phase: one LLM call with implicit auto + response, capture = await invoke_llm( + llm_with_tools, cycle_messages, + node=f"{node}-{cycle+1}" if max_cycles > 1 else node, + session_id=session_id, + workspace_path=workspace_path, + ) + all_captures.append(capture) + + # --- Intercept step_done --- + if response.tool_calls: + done_calls = [tc for tc in response.tool_calls if tc.get("name") == "step_done"] + if done_calls: + summary = done_calls[0].get("args", {}).get("summary", last_reasoning or "") + logger.info("step_done called in cycle %d: %s", cycle + 1, summary[:100], + extra={"session_id": session_id, "node": node}) + response = AIMessage(content=summary) + break + + # If micro-reasoning produced tool calls but no text, merge last thinking + if last_reasoning and response.tool_calls and not response.content: + response = AIMessage(content=last_reasoning, tool_calls=response.tool_calls) + + # Enforce max parallel tool calls + if len(response.tool_calls) > max_parallel_tool_calls: + response = AIMessage( + content=response.content, + tool_calls=response.tool_calls[:max_parallel_tool_calls], + ) + + # --- Execute tools 
if we have them and there are tool calls --- + if response.tool_calls and tool_map and max_cycles > 1: + # Emit tool_call sub_event BEFORE execution (so UI shows the call) + import uuid as _uuid + call_id = str(_uuid.uuid4())[:8] + sub_events.append({ + "type": "tool_call", + "node": node, + "cycle": cycle + 1, + "call_id": call_id, + "tools": [ + {"name": tc.get("name", "?"), "args": tc.get("args", {})} + for tc in response.tool_calls + ], + }) + + # Execute all tool calls in parallel via asyncio.gather + async def _run_tool(tc: dict) -> ToolMessage: + name = tc.get("name", "unknown") + args = tc.get("args", {}) + tc_id = tc.get("id", "unknown") + tool_fn = tool_map.get(name) + if tool_fn is None: + return ToolMessage(content=f"Error: tool '{name}' not found", tool_call_id=tc_id, name=name) + try: + result = await tool_fn.ainvoke(args) + return ToolMessage(content=str(result)[:10000], tool_call_id=tc_id, name=name) + except Exception as exc: + return ToolMessage(content=f"Error: {exc}", tool_call_id=tc_id, name=name) + + tool_results = await asyncio.gather(*[_run_tool(tc) for tc in response.tool_calls]) + + # Add tool call + results to conversation for next cycle + cycle_messages.append(response) + cycle_messages.extend(tool_results) + + # Emit tool_result sub_events AFTER execution (so UI shows results) + for tm in tool_results: + content_str = str(getattr(tm, "content", "")) + import re as _re + exit_match = _re.search(r"EXIT_CODE:\s*(\d+)", content_str) + is_error = ( + (exit_match is not None and exit_match.group(1) != "0") + or content_str.startswith("Error:") + ) + sub_events.append({ + "type": "tool_result", + "node": node, + "cycle": cycle + 1, + "call_id": call_id, + "name": getattr(tm, "name", "unknown"), + "output": content_str[:2000], + "status": "error" if is_error else "success", + }) + + logger.info( + "Cycle %d/%d [%s]: %d tool calls executed, continuing", + cycle + 1, max_cycles, node, len(response.tool_calls), + extra={"session_id": session_id, 
"node": node}, + ) + continue # Next cycle + else: + # No tools to execute or last cycle — return response + break + + # If we executed tools internally, strip tool_calls from final response + # so the graph doesn't try to re-execute them via ToolNode + if tool_map and max_cycles > 1 and response.tool_calls: + last_content = str(response.content or "") + if not last_content: + last_content = f"Completed {cycle + 1} think-act cycles." + response = AIMessage(content=last_content) + + # Update total_iterations on all thinking sub_events + thinking_events = [e for e in sub_events if e.get("type") == "thinking"] + total_iters = len(thinking_events) + for evt in thinking_events: + evt["total_iterations"] = total_iters + + # Merge all captures into the last one + final_capture = all_captures[-1] if all_captures else LLMCallCapture() + for c in all_captures[:-1]: + final_capture.prompt_tokens += c.prompt_tokens + final_capture.completion_tokens += c.completion_tokens + + logger.info( + "Tool loop %s: %d cycles, %d thinking iterations, %d total tokens", + node, cycle + 1, total_iters, + final_capture.prompt_tokens + final_capture.completion_tokens, + extra={"session_id": session_id, "node": node}, + ) + + return response, final_capture, sub_events From 720d0ecc8df3d4f6c902a3ea7d23bbf944089198 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:47:52 +0100 Subject: [PATCH 06/26] feat(sandbox): typed event schema for LangGraph node events streamed to UI Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/event_schema.py | 121 ++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/event_schema.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/event_schema.py b/a2a/sandbox_agent/src/sandbox_agent/event_schema.py new file mode 100644 index 00000000..d99fb4c2 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/event_schema.py @@ -0,0 +1,121 @@ +# Copyright 2025 IBM Corp. 
# Licensed under the Apache License, Version 2.0

"""Typed event schema for LangGraph node events.

Each LangGraph node emits a distinct event type. The dataclasses here are
the single source of truth; the TypeScript frontend mirrors these types
in ``agentLoop.ts``.
"""

import json
from dataclasses import asdict, dataclass, field
from typing import List


class NodeEventType:
    """Constants for the ``type`` discriminator on every LoopEvent."""

    PLANNER_OUTPUT = "planner_output"
    EXECUTOR_STEP = "executor_step"
    TOOL_CALL = "tool_call"
    TOOL_RESULT = "tool_result"
    REFLECTOR_DECISION = "reflector_decision"
    REPORTER_OUTPUT = "reporter_output"
    BUDGET_UPDATE = "budget_update"
    HITL_REQUEST = "hitl_request"


# ---------------------------------------------------------------------------
# Base
# ---------------------------------------------------------------------------


@dataclass
class LoopEvent:
    """Base event emitted by a graph node during the reasoning loop.

    BUG FIX: ``loop_id`` previously had no default value.  Every subclass
    re-declares ``type`` WITH a default, and a re-declared dataclass field
    keeps its original position in the field order — so each subclass
    ended up with a defaulted ``type`` followed by a non-defaulted
    ``loop_id``, raising ``TypeError: non-default argument 'loop_id'
    follows default argument`` as soon as the module was imported.
    Giving ``loop_id`` a default makes every subclass constructible and
    is backward compatible (callers that passed it explicitly still can).
    """

    type: str                    # one of the NodeEventType constants
    loop_id: str = ""            # unique per reasoning-loop invocation
    model: str = ""
    prompt_tokens: int = 0
    completion_tokens: int = 0

    def to_json(self) -> str:
        """Serialize this event (including subclass fields) to a JSON string."""
        return json.dumps(asdict(self))


# ---------------------------------------------------------------------------
# Concrete event types
# ---------------------------------------------------------------------------


@dataclass
class PlannerOutput(LoopEvent):
    """Planner created or revised a plan."""

    type: str = NodeEventType.PLANNER_OUTPUT
    steps: List[str] = field(default_factory=list)
    iteration: int = 0


@dataclass
class ExecutorStep(LoopEvent):
    """Executor is working on a plan step."""

    type: str = NodeEventType.EXECUTOR_STEP
    step: int = 0
    total_steps: int = 0
    description: str = ""
    reasoning: str = ""  # Full LLM response text (up to 2000 chars)


@dataclass
class ToolCall(LoopEvent):
    """Executor invoked a tool."""

    type: str = NodeEventType.TOOL_CALL
    step: int = 0
    name: str = ""
    args: str = ""


@dataclass
class ToolResult(LoopEvent):
    """Tool returned a result."""

    type: str = NodeEventType.TOOL_RESULT
    step: int = 0
    name: str = ""
    output: str = ""


@dataclass
class ReflectorDecision(LoopEvent):
    """Reflector reviewed execution and decided next action."""

    type: str = NodeEventType.REFLECTOR_DECISION
    decision: str = ""    # "continue", "replan", "done"
    assessment: str = ""  # Full reflection text
    iteration: int = 0


@dataclass
class ReporterOutput(LoopEvent):
    """Reporter generated the final answer."""

    type: str = NodeEventType.REPORTER_OUTPUT
    content: str = ""


@dataclass
class BudgetUpdate(LoopEvent):
    """Budget tracking update."""

    type: str = NodeEventType.BUDGET_UPDATE
    tokens_used: int = 0
    tokens_budget: int = 0
    wall_clock_s: float = 0.0
    max_wall_clock_s: float = 0.0
+ +Event types (new — node-specific): + planner_output — Planner created/revised a plan + executor_step — Executor starts working on a plan step + tool_call — Tool invoked (unchanged) + tool_result — Tool returned output (unchanged) + reflector_decision — Reflector decides continue/replan/done + reporter_output — Reporter generates the final answer + budget_update — Budget tracking + error — An error occurred during execution + hitl_request — Human-in-the-loop approval is needed + +Legacy types (kept for backward compatibility): + plan — Alias for planner_output + plan_step — Alias for executor_step + reflection — Alias for reflector_decision + llm_response — Generic LLM text (used for unknown nodes only) +""" + +from __future__ import annotations + +import json +import logging +import uuid +from abc import ABC, abstractmethod +from typing import Any + +from sandbox_agent import plan_store as ps + +logger = logging.getLogger(__name__) + + +def _safe_tc(tc: Any) -> dict[str, Any]: + """Safely extract name/args from a tool call object. + + LangChain tool_calls can be dicts, ToolCall TypedDicts, or + InvalidToolCall objects (tuples). Handle all formats gracefully. + """ + try: + if isinstance(tc, dict): + return {"name": tc.get("name", "unknown"), "args": tc.get("args", {})} + if hasattr(tc, "name"): + return {"name": getattr(tc, "name", "unknown"), "args": getattr(tc, "args", {})} + if isinstance(tc, (list, tuple)) and len(tc) >= 2: + return {"name": str(tc[0]), "args": tc[1] if isinstance(tc[1], dict) else {}} + except Exception: + pass + return {"name": "unknown", "args": {}} + + +class FrameworkEventSerializer(ABC): + """Base class for framework-specific event serialization. + + Subclass this for each agent framework (LangGraph, CrewAI, AG2). + The ``serialize`` method must return a JSON string with at least + a ``type`` field. + """ + + @abstractmethod + def serialize(self, key: str, value: dict) -> str: + """Serialize a framework event into a JSON string. 
+ + Parameters + ---------- + key: + The graph node name (e.g. "assistant", "tools"). + value: + The event payload from the framework's streaming API. + + Returns + ------- + str + A JSON string with at least ``{"type": "..."}`` + """ + ... + + +class LangGraphSerializer(FrameworkEventSerializer): + """Serialize LangGraph ``stream_mode='updates'`` events. + + LangGraph emits events like:: + + {"assistant": {"messages": [AIMessage(...)]}} + {"tools": {"messages": [ToolMessage(...)]}} + + This serializer extracts tool calls, tool results, and LLM + responses into structured JSON. + + When the graph uses a plan-execute-reflect reasoning loop, all + events include a ``loop_id`` so the frontend can group them into + an expandable AgentLoopCard. + """ + + # Nodes whose events are sub-items of the preceding node visit + # (they don't get their own node_visit number). + _TOOL_NODES = frozenset({"tools", "planner_tools", "reflector_tools"}) + + def __init__(self, loop_id: str | None = None, context_id: str | None = None) -> None: + self._loop_id = loop_id or str(uuid.uuid4())[:8] + self._step_index = 0 + self._event_counter = 0 # global sequence number for ordering + self._node_visit = 0 # graph node visit counter (main sections) + self._sub_index = 0 # position within current node visit + self._last_node_key: str = "" # track previous node for visit grouping + self._micro_step: int = 0 + self._context_id = context_id or "unknown" + self._last_call_id: str = "" + self._prev_node: str | None = None # previous node for node_transition events + + def serialize(self, key: str, value: dict) -> str: + + # Emit node_transition meta-event when the node changes + transition_line: str | None = None + if self._prev_node is not None and key != self._prev_node: + self._event_counter += 1 + transition_event = { + "type": "node_transition", + "loop_id": self._loop_id, + "from_node": self._prev_node, + "to_node": key, + "event_index": self._event_counter, + "langgraph_node": key, + } + 
transition_line = json.dumps(transition_event) + self._prev_node = key + + # Node visit tracking: + # - Tool nodes (tools, planner_tools, reflector_tools) inherit parent visit + # - Same node type re-entering (executor→tools→executor) stays on same visit + # - Different node type (executor→reflector, reflector→planner) = new visit + if key not in self._TOOL_NODES: + if key != self._last_node_key: + self._node_visit += 1 + self._sub_index = 0 + self._last_node_key = key + # event_counter incremented per JSON line in post-processing. + + # Track actual plan step from state for step grouping + current_step = value.get("current_step") + if current_step is not None: + new_step = current_step + 1 # 1-based for display + if new_step != self._step_index: + self._step_index = new_step + self._micro_step = 0 # reset micro_step on plan step change + + # Reasoning-loop nodes may emit state fields instead of messages + if key == "router": + # Router is an internal node — emit minimal event for logging + route = value.get("_route", "new") + result = json.dumps({ + "type": "router", + "loop_id": self._loop_id, + "route": route, + "plan_status": value.get("plan_status", ""), + }) + elif key == "planner": + result = self._serialize_planner(value) + elif key == "reflector": + result = self._serialize_reflector(value) + elif key == "step_selector": + # Reset micro_step on every step transition + self._micro_step = 0 + current_step = value.get("current_step", 0) + plan_steps = value.get("plan_steps", []) + step_desc = "" + if current_step < len(plan_steps): + step_entry = plan_steps[current_step] + step_desc = step_entry.get("description", "") if isinstance(step_entry, dict) else str(step_entry) + brief = value.get("skill_instructions", "") + # Strip the "STEP BRIEF FROM COORDINATOR:" prefix + if "STEP BRIEF" in brief: + brief = brief.split("---")[0].replace("STEP BRIEF FROM COORDINATOR:", "").strip() + result = json.dumps({ + "type": "step_selector", + "loop_id": self._loop_id, + 
"current_step": current_step, + "description": f"Advancing to step {current_step + 1}: {step_desc[:80]}", + "brief": brief[:500], + "done": value.get("done", False), + }) + elif key == "reporter": + result = self._serialize_reporter(value) + else: + msgs = value.get("messages", []) + if not msgs: + result = json.dumps({"type": "llm_response", "content": f"[{key}]"}) + else: + msg = msgs[-1] + + if key == "executor": + result = self._serialize_executor(msg, value) + elif key == "tools": + result = self._serialize_tool_result(msg) + else: + # Unknown node — treat as informational + content = getattr(msg, "content", "") + if isinstance(content, list): + text = self._extract_text_blocks(content) + else: + text = str(content)[:2000] if content else f"[{key}]" + result = json.dumps({"type": "llm_response", "content": text}) + + # Append budget_update event if _budget_summary is in the value dict + budget_summary = value.get("_budget_summary") + if budget_summary and isinstance(budget_summary, dict): + budget_event = json.dumps({ + "type": "budget_update", + "loop_id": self._loop_id, + **budget_summary, + }) + result = result + "\n" + budget_event + + # Post-process: ensure ALL event lines have step + unique event_index. + # Each JSON line gets its own event_index (no duplicates). + # Legacy event types (plan, plan_step, reflection) are skipped from + # indexing to avoid inflating the counter. 
+ enriched_lines = [] + + # Prepend node_transition event if one was emitted + if transition_line is not None: + enriched_lines.append(transition_line) + + for line in result.split("\n"): + line = line.strip() + if not line: + continue + try: + evt = json.loads(line) + if "step" not in evt: + cs = evt.get("current_step") + evt["step"] = (cs + 1) if cs is not None else self._step_index + event_type = evt.get("type", "?") + self._event_counter += 1 + evt["event_index"] = self._event_counter + evt["node_visit"] = self._node_visit + evt["sub_index"] = self._sub_index + evt["langgraph_node"] = key + self._sub_index += 1 + enriched_lines.append(json.dumps(evt)) + except json.JSONDecodeError: + enriched_lines.append(line) + event_type = "parse_error" + logger.info("SERIALIZE session=%s loop=%s type=%s step=%s ei=%s", + self._context_id, self._loop_id, event_type, + self._step_index, self._event_counter, + extra={"session_id": self._context_id, "node": key, + "event_type": event_type, "step": self._step_index}) + + return "\n".join(enriched_lines) + + def _serialize_assistant(self, msg: Any) -> str: + """Serialize an assistant (LLM) node output. + + When the LLM calls tools, it often also produces reasoning text. 
+ We emit BOTH the thinking content and the tool call as separate + JSON lines so the UI shows the full chain: + {"type": "llm_response", "content": "Let me check..."} + {"type": "tool_call", "tools": [...]} + """ + tool_calls = getattr(msg, "tool_calls", []) + content = getattr(msg, "content", "") + + # Extract any text content from the LLM + if isinstance(content, list): + text = self._extract_text_blocks(content) + else: + text = str(content)[:2000] if content else "" + + if tool_calls: + parts = [] + # Emit thinking/reasoning text first (if present) + if text.strip(): + parts.append(json.dumps({"type": "llm_response", "content": text})) + # Then emit the tool call + parts.append(json.dumps({ + "type": "tool_call", + "tools": [ + _safe_tc(tc) + for tc in tool_calls + ], + })) + return "\n".join(parts) + + return json.dumps({"type": "llm_response", "content": text}) + + def _serialize_executor(self, msg: Any, value: dict | None = None) -> str: + """Serialize an executor node output with loop_id for AgentLoopCard.""" + tool_calls = getattr(msg, "tool_calls", []) + content = getattr(msg, "content", "") + + if isinstance(content, list): + text = self._extract_text_blocks(content) + else: + text = str(content)[:2000] if content else "" + + parts = [] + _v = value or {} + + # Emit sub_events: thinking iterations, tool calls, tool results + sub_events = _v.get("_sub_events", []) + for se in sub_events: + se_type = se.get("type", "") + if se_type == "thinking": + thinking_event = { + "type": "thinking", + "loop_id": self._loop_id, + "cycle": se.get("cycle", 1), + "iteration": se.get("iteration", 1), + "total_iterations": se.get("total_iterations", 1), + "reasoning": se.get("reasoning", "")[:50000], + "node": se.get("node", "executor"), + "model": se.get("model", ""), + "prompt_tokens": se.get("prompt_tokens", 0), + "completion_tokens": se.get("completion_tokens", 0), + } + for field in ("_system_prompt", "_prompt_messages", "_bound_tools", "_llm_response"): + if field 
in se: + thinking_event[field.lstrip("_")] = se[field] + parts.append(json.dumps(thinking_event)) + elif se_type == "tool_call": + parts.append(json.dumps({ + "type": "tool_call", + "loop_id": self._loop_id, + "call_id": se.get("call_id", ""), + "cycle": se.get("cycle", 1), + "tools": se.get("tools", []), + })) + elif se_type == "tool_result": + parts.append(json.dumps({ + "type": "tool_result", + "loop_id": self._loop_id, + "call_id": se.get("call_id", ""), + "cycle": se.get("cycle", 1), + "name": se.get("name", "unknown"), + "output": se.get("output", "")[:2000], + "status": se.get("status", "success"), + })) + + self._micro_step += 1 + + # Skip micro_reasoning for dedup responses (no LLM call happened) + if not _v.get("_dedup"): + # Annotate micro_reasoning with thinking count + if sub_events: + _v = {**_v, "_thinking_count": len(sub_events)} + parts.append(self._serialize_micro_reasoning(msg, _v)) + + plan = _v.get("plan", []) + model = _v.get("model", "") + prompt_tokens = _v.get("prompt_tokens", 0) + completion_tokens = _v.get("completion_tokens", 0) + prompt_data = self._extract_prompt_data(_v) + + # Emit executor_step event so UI shows which step is executing + current_plan_step = _v.get("current_step", 0) + step_payload = { + "type": "executor_step", + "loop_id": self._loop_id, + "plan_step": current_plan_step, + "iteration": _v.get("iteration", 0), + "total_steps": len(plan) if plan else 0, + "description": text[:200] if text else "", + "reasoning": text[:2000] if text else "", + "model": model, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + **prompt_data, + } + parts.append(json.dumps(step_payload)) + + if tool_calls: + # Use LangGraph's tool_call_id for proper pairing with tool_result + tc0 = tool_calls[0] if tool_calls else {} + call_id = ( + tc0.get("id") if isinstance(tc0, dict) + else getattr(tc0, "id", None) + ) or str(uuid.uuid4())[:8] + self._last_call_id = call_id + parts.append(json.dumps({ + "type": "tool_call", 
+ "loop_id": self._loop_id, + "call_id": call_id, + "tools": [ + _safe_tc(tc) + for tc in tool_calls + ], + })) + return "\n".join(parts) + + # Emit tool_call event for text-parsed tools (no structured tool_calls) + parsed_tools = _v.get("parsed_tools", []) + if parsed_tools: + call_id = str(uuid.uuid4())[:8] + self._last_call_id = call_id + parts.append(json.dumps({ + "type": "tool_call", + "loop_id": self._loop_id, + "call_id": call_id, + "tools": [ + {"name": t["name"], "args": t.get("args", {})} + for t in parsed_tools + ], + })) + + return "\n".join(parts) + + def _serialize_micro_reasoning(self, msg: Any, value: dict) -> str: + """Emit a micro_reasoning event capturing the LLM's intermediate reasoning.""" + content = getattr(msg, "content", "") + if isinstance(content, list): + text = self._extract_text_blocks(content) + else: + text = str(content)[:50000] if content else "" + + tool_calls = getattr(msg, "tool_calls", []) + next_action = "tool_call" if tool_calls else "done" + + # When the LLM responds with only tool calls and no text reasoning, + # generate a summary so the micro-reasoning block isn't empty. 
+ if not text and tool_calls: + summaries = [] + for tc in tool_calls[:5]: + name = tc.get("name", "?") + args = tc.get("args", {}) + args_str = json.dumps(args, default=str)[:200] + summaries.append(f"→ {name}({args_str})") + text = "Decided next action:\n" + "\n".join(summaries) + + event: dict = { + "type": "micro_reasoning", + "loop_id": self._loop_id, + "micro_step": self._micro_step, + "after_call_id": self._last_call_id, + "reasoning": text[:50000], + "next_action": next_action, + "model": value.get("model", ""), + "prompt_tokens": value.get("prompt_tokens", 0), + "completion_tokens": value.get("completion_tokens", 0), + **self._extract_prompt_data(value), + } + # Include previous tool result for UI context (shows WHY this decision) + prev = value.get("_last_tool_result") + if prev: + event["previous_tool"] = prev + # Annotate with thinking iteration count for UI badge + tc = value.get("_thinking_count", 0) + if tc: + event["thinking_count"] = tc + return json.dumps(event) + + def _serialize_tool_result(self, msg: Any) -> str: + """Serialize a tool node output with loop_id.""" + name = getattr(msg, "name", "unknown") + content = getattr(msg, "content", "") + content_str = str(content) + # Determine error status from exit code, not content keywords. + # The shell tool appends "EXIT_CODE: N" for non-zero exits. + # Keyword matching (e.g. "failure", "error") causes false positives + # when command output contains those words in normal data. 
+ import re as _re + exit_match = _re.search(r"EXIT_CODE:\s*(\d+)", content_str) + is_error = ( + (exit_match is not None and exit_match.group(1) != "0") + or content_str.startswith("\u274c") + or content_str.startswith("Error: ") + or "Permission denied" in content_str + or "command not found" in content_str + ) + status = "error" if is_error else "success" + # Use LangGraph's tool_call_id for proper pairing with tool_call + call_id = getattr(msg, "tool_call_id", None) or self._last_call_id + return json.dumps({ + "type": "tool_result", + "loop_id": self._loop_id, + "call_id": call_id, + "name": str(name), + "output": content_str[:2000], + "status": status, + }) + + @staticmethod + def _enrich_with_plan_store(payload: dict, value: dict) -> None: + """Add PlanStore flat steps to payload if available.""" + store = value.get("_plan_store", {}) + if store and store.get("steps"): + payload["plan_steps"] = ps.to_flat_plan_steps(store) + + @staticmethod + def _extract_prompt_data(value: dict) -> dict: + """Extract prompt visibility fields from node output.""" + data: dict = {} + sp = value.get("_system_prompt", "") + if sp: + data["system_prompt"] = sp[:50000] + pm = value.get("_prompt_messages") + if pm: + data["prompt_messages"] = pm[:100] # max 100 messages + bt = value.get("_bound_tools") + if bt: + data["bound_tools"] = bt[:50] # max 50 tools + lr = value.get("_llm_response") + if lr: + data["llm_response"] = lr + return data + + def _serialize_planner(self, value: dict) -> str: + """Serialize a planner node output — emits planner_output + legacy plan.""" + plan_steps = value.get("plan_steps", []) + plan = [s.get("description", "") for s in plan_steps] if plan_steps else value.get("plan", []) + iteration = value.get("iteration", 1) + + # Also include any LLM text from the planner's message + msgs = value.get("messages", []) + text = "" + if msgs: + content = getattr(msgs[-1], "content", "") + if isinstance(content, list): + text = self._extract_text_blocks(content) 
+ else: + text = str(content)[:2000] if content else "" + + model = value.get("model", "") + prompt_tokens = value.get("prompt_tokens", 0) + completion_tokens = value.get("completion_tokens", 0) + prompt_data = self._extract_prompt_data(value) + + # Distinguish initial plan from replan + is_replan = iteration > 1 + event_type = "replanner_output" if is_replan else "planner_output" + + payload = { + "type": event_type, + "loop_id": self._loop_id, + "steps": plan, + "iteration": iteration, + "content": text, + "model": model, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + **prompt_data, + } + + self._enrich_with_plan_store(payload, value) + + return json.dumps(payload) + + def _serialize_reflector(self, value: dict) -> str: + """Serialize a reflector node output — emits reflector_decision + legacy reflection.""" + done = value.get("done", False) + current_step = value.get("current_step", 0) + step_results = value.get("step_results", []) + + # Extract decision text from message if present + msgs = value.get("messages", []) + text = "" + if msgs: + content = getattr(msgs[-1], "content", "") + if isinstance(content, list): + text = self._extract_text_blocks(content) + else: + text = str(content)[:500] if content else "" + + # Derive the decision keyword from the text + decision = "done" if done else self._extract_decision(text) + + # Strip prompt echo from assessment — the LLM sometimes echoes the + # system prompt instructions. Extract only the actual decision word + # or a brief justification, never the echoed prompt. + assessment = text.strip() + + # If the response contains prompt markers, it's an echo — just use the decision. 
+ prompt_markers = ( + "Output the single word:", + "output ONLY the decision word", + "Decide ONE of the following", + "DECISION PROCESS:", + "STALL DETECTION:", + "REPLAN RULES:", + ) + is_prompt_echo = any(marker in assessment for marker in prompt_markers) + if is_prompt_echo or not assessment or len(assessment) > 200: + assessment = decision + + # Reset micro_step counter for next iteration + self._micro_step = 0 + + model = value.get("model", "") + prompt_tokens = value.get("prompt_tokens", 0) + completion_tokens = value.get("completion_tokens", 0) + iteration = value.get("iteration", 0) + prompt_data = self._extract_prompt_data(value) + + payload = { + "type": "reflector_decision", + "loop_id": self._loop_id, + "decision": decision, + "assessment": assessment, + "iteration": iteration, + "done": done, + "current_step": current_step, + "model": model, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + **prompt_data, + } + + self._enrich_with_plan_store(payload, value) + + return json.dumps(payload) + + def _serialize_reporter(self, value: dict) -> str: + """Serialize a reporter node output — emits reporter_output. + + When the reporter LLM calls the ``respond_to_user`` escape tool + instead of producing text content, we extract the ``response`` + argument and emit it as a clean ``reporter_output`` event rather + than a raw ``tool_call`` event. 
+ """ + final_answer = value.get("final_answer", "") + + # Check messages for respond_to_user tool call or text content + if not final_answer: + msgs = value.get("messages", []) + for msg in msgs: + # Check for respond_to_user tool call first + tool_calls = getattr(msg, "tool_calls", None) + if tool_calls: + for tc in tool_calls: + tc_info = _safe_tc(tc) + if tc_info["name"] == "respond_to_user": + args = tc_info["args"] + final_answer = ( + args.get("response", "") + if isinstance(args, dict) + else str(args) + ) + break + if final_answer: + break + + # Fall back to text content + content = getattr(msg, "content", "") + if content: + if isinstance(content, list): + final_answer = self._extract_text_blocks(content) + else: + final_answer = str(content)[:2000] + if final_answer: + break + + model = value.get("model", "") + prompt_tokens = value.get("prompt_tokens", 0) + completion_tokens = value.get("completion_tokens", 0) + prompt_data = self._extract_prompt_data(value) + + payload = { + "type": "reporter_output", + "loop_id": self._loop_id, + "content": final_answer[:2000], + "model": model, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + **prompt_data, + } + + files_touched = value.get("files_touched", []) + if files_touched: + payload["files_touched"] = files_touched[:30] + + return json.dumps(payload) + + @staticmethod + def _extract_decision(text: str) -> str: + """Extract a decision keyword from reflector text. + + Returns one of: ``continue``, ``replan``, ``done``, ``hitl``. + Defaults to ``continue`` if the text is ambiguous. 
+ """ + text_lower = text.strip().lower() + for decision in ("done", "replan", "hitl", "continue"): + if decision in text_lower: + return decision + return "continue" + + @staticmethod + def _extract_text_blocks(content: list) -> str: + """Extract text from a list of content blocks.""" + return " ".join( + b.get("text", "") + for b in content + if isinstance(b, dict) and b.get("type") == "text" + )[:2000] From d3bb92ee4c7458ede2ac61fdeb7a9aa4ad79df33 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:48:05 +0100 Subject: [PATCH 08/26] feat(sandbox): shell executor with permission-checked command execution in workspace Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/executor.py | 364 ++++++++++++++++++ 1 file changed, 364 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/executor.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/executor.py b/a2a/sandbox_agent/src/sandbox_agent/executor.py new file mode 100644 index 00000000..7d3777a6 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/executor.py @@ -0,0 +1,364 @@ +"""Sandbox executor -- runs shell commands inside a context workspace. + +Every command is checked against the :class:`PermissionChecker` before +execution. The three possible outcomes are: + + DENY -- an error :class:`ExecutionResult` is returned immediately + HITL -- :class:`HitlRequired` is raised so the LangGraph graph can + trigger an ``interrupt()`` for human approval + ALLOW -- the command is executed via ``asyncio.create_subprocess_shell`` + inside *workspace_path* with a timeout from :class:`SourcesConfig` +""" + +from __future__ import annotations + +import asyncio +import logging +import os +import shlex +from dataclasses import dataclass + +from sandbox_agent.permissions import PermissionChecker, PermissionResult +from sandbox_agent.sources import SourcesConfig + +logger = logging.getLogger(__name__) + +# Shell interpreters that can execute arbitrary code via -c / -e flags. 
# Shell interpreters that can execute arbitrary code via -c / -e flags.
_INTERPRETERS = frozenset({"bash", "sh", "python", "python3", "perl", "ruby", "node"})

# Flags that take an inline command string as the next argument.
_EXEC_FLAGS = frozenset({"-c", "-e", "--eval"})


# ---------------------------------------------------------------------------
# Exceptions
# ---------------------------------------------------------------------------


class HitlRequired(Exception):
    """Raised when an operation needs human approval.

    Attributes
    ----------
    command:
        The shell command that requires approval.
    """

    def __init__(self, command: str) -> None:
        self.command = command
        super().__init__(f"Human approval required for command: {command}")


# ---------------------------------------------------------------------------
# Result dataclass
# ---------------------------------------------------------------------------


@dataclass
class ExecutionResult:
    """Captures the outcome of a shell command execution."""

    stdout: str
    stderr: str
    exit_code: int


# ---------------------------------------------------------------------------
# Executor
# ---------------------------------------------------------------------------


class SandboxExecutor:
    """Runs shell commands in a workspace directory with permission checks.

    Parameters
    ----------
    workspace_path:
        Absolute path to the workspace directory where commands execute.
    permission_checker:
        A :class:`PermissionChecker` instance for evaluating operations.
    sources_config:
        A :class:`SourcesConfig` instance providing runtime limits.
    """

    def __init__(
        self,
        workspace_path: str,
        permission_checker: PermissionChecker,
        sources_config: SourcesConfig,
    ) -> None:
        self._workspace_path = workspace_path
        self._permission_checker = permission_checker
        self._sources_config = sources_config
        # Opt-in Landlock filesystem isolation (no fallback when enabled).
        self._use_landlock = os.environ.get("SANDBOX_LANDLOCK") == "true"
        if self._use_landlock:
            logger.info("Landlock isolation ENABLED for workspace %s", workspace_path)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def run_shell(self, command: str) -> ExecutionResult:
        """Run a shell command after checking permissions and sources.json.

        Parameters
        ----------
        command:
            The shell command string to execute.

        Returns
        -------
        ExecutionResult
            On success (ALLOW) or on DENY (with a non-zero exit code and
            an error message in stderr).

        Raises
        ------
        HitlRequired
            When the command matches neither allow nor deny rules and
            requires human approval.
        """
        # 1. Extract the command prefix for permission matching.
        #    Try "cmd subcmd" first (e.g. "pip install"), then fall back
        #    to just "cmd" (e.g. "grep").
        operation = command.strip()

        # 1a. Check for interpreter bypass (e.g. bash -c "curl evil.com").
        #     If the outer command is an interpreter with -c/-e, recursively
        #     check the inner command against the same permission + sources
        #     pipeline.  This prevents circumventing deny rules by wrapping
        #     a blocked command in `bash -c "..."`.
        bypass_denial = self._check_interpreter_bypass(operation)
        if bypass_denial is not None:
            return ExecutionResult(
                stdout="",
                stderr=bypass_denial,
                exit_code=1,
            )

        permission = self._check_permission(operation)

        # 2. Act on the permission result.
        if permission is PermissionResult.DENY:
            return ExecutionResult(
                stdout="",
                stderr=f"Permission denied: command '{command}' is denied by policy.",
                exit_code=1,
            )

        if permission is PermissionResult.HITL:
            raise HitlRequired(command)

        # 3. Check sources.json enforcement (package blocking, git remote
        #    allowlist) as a second layer of defense-in-depth.
        sources_denial = self._check_sources(operation)
        if sources_denial:
            return ExecutionResult(
                stdout="",
                stderr=sources_denial,
                exit_code=1,
            )

        # 4. ALLOW -- execute the command.
        return await self._execute(command)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _check_interpreter_bypass(self, command: str) -> str | None:
        """Check if a command uses an interpreter to bypass restrictions.

        Detects patterns like ``bash -c "curl evil.com"`` or
        ``python3 -c "import os; os.system('rm -rf /')"`` and recursively
        checks the inner command against permissions and sources policy.

        Returns
        -------
        str or None
            An error message if the inner command is denied, or *None* if
            no interpreter bypass was detected (or the inner command is OK).
        """
        try:
            parts = shlex.split(command)
        except ValueError:
            # Unparseable quoting — let the normal permission check decide.
            return None

        if len(parts) < 3:
            return None

        # Resolve the binary name (handle /usr/bin/bash -> bash).
        cmd = parts[0].rsplit("/", 1)[-1]
        if cmd not in _INTERPRETERS:
            return None

        if parts[1] not in _EXEC_FLAGS:
            return None

        # Everything after the exec flag is the inner command.
        inner_command = " ".join(parts[2:])
        logger.warning(
            "Interpreter bypass detected: '%s' wraps inner command '%s'",
            command,
            inner_command,
        )

        # Recursively check the inner command against permission rules.
        inner_permission = self._check_permission(inner_command)
        if inner_permission is PermissionResult.DENY:
            return (
                f"Permission denied: interpreter bypass detected. "
                f"Inner command '{inner_command}' is denied by policy."
            )

        # Also check the inner command against sources.json policy
        # (e.g. git clone to a disallowed remote inside bash -c).
        inner_sources_denial = self._check_sources(inner_command)
        if inner_sources_denial:
            return (
                f"Blocked: interpreter bypass detected. "
                f"Inner command violates sources policy: {inner_sources_denial}"
            )

        return None

    def _check_permission(self, operation: str) -> PermissionResult:
        """Check the permission for a shell operation.

        The permission checker expects the full command string as the
        operation. It internally handles prefix matching (e.g. matching
        "grep -r foo" against the rule ``shell(grep:*)``).
        """
        return self._permission_checker.check("shell", operation)

    def _check_sources(self, operation: str) -> str | None:
        """Check sources.json enforcement for package and git operations.

        Returns an error message string if the operation is blocked by
        sources.json, or None if it is allowed.
        """
        import re

        parts = operation.split()
        if not parts:
            return None

        # --- Package manager checks ---
        # pip install
        if len(parts) >= 3 and parts[0] == "pip" and parts[1] == "install":
            if not self._sources_config.is_package_manager_enabled("pip"):
                return "Blocked by sources.json: pip is not enabled."
            for pkg in parts[2:]:
                if pkg.startswith("-"):
                    continue  # skip flags
                # Strip version specifiers (e.g. "requests>=2.0")
                pkg_name = re.split(r"[><=!~]", pkg)[0]
                if pkg_name and self._sources_config.is_package_blocked("pip", pkg_name):
                    return f"Blocked by sources.json: package '{pkg_name}' is on the blocked list."

        # npm install
        if len(parts) >= 3 and parts[0] == "npm" and parts[1] == "install":
            if not self._sources_config.is_package_manager_enabled("npm"):
                return "Blocked by sources.json: npm is not enabled."
            for pkg in parts[2:]:
                if pkg.startswith("-"):
                    continue
                if pkg.startswith("@"):
                    # Scoped package (@scope/name[@version]): the leading
                    # '@' is part of the name, so the version separator is
                    # the SECOND '@'.  Splitting the raw string on '@'
                    # would yield an empty name and silently skip the
                    # blocklist check (bypass).
                    pkg_name = "@" + re.split(r"[@><=!~]", pkg[1:])[0]
                else:
                    pkg_name = re.split(r"[@><=!~]", pkg)[0]
                if pkg_name and self._sources_config.is_package_blocked("npm", pkg_name):
                    return f"Blocked by sources.json: package '{pkg_name}' is on the blocked list."

        # --- Git remote checks ---
        # git clone
        if len(parts) >= 3 and parts[0] == "git" and parts[1] == "clone":
            # Find the URL argument (skip flags like --depth, --branch)
            url = None
            i = 2
            while i < len(parts):
                if parts[i].startswith("-"):
                    # Skip flag and its value if it takes one
                    if parts[i] in ("--depth", "--branch", "-b"):
                        i += 2
                        continue
                    i += 1
                    continue
                url = parts[i]
                break
            if url and not self._sources_config.is_git_remote_allowed(url):
                return f"Blocked by sources.json: git remote '{url}' is not in allowed_remotes."

        return None

    async def _execute(self, command: str) -> ExecutionResult:
        """Execute *command* in the workspace directory with a timeout.

        The subprocess is started in its own session (process group) so
        that on timeout the WHOLE group -- the shell and any children it
        spawned -- can be killed, not just the shell itself.

        When ``SANDBOX_LANDLOCK=true``, each command is executed inside a
        Landlock-restricted subprocess that can only write to the workspace
        and a session-specific /tmp directory. There is no fallback --
        if Landlock fails, the command fails.
        """
        timeout = self._sources_config.max_execution_time_seconds

        if self._use_landlock:
            return await self._execute_landlock(command, timeout)

        try:
            process = await asyncio.create_subprocess_shell(
                command,
                cwd=self._workspace_path,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
                # New session => new process group; lets us kill children too.
                start_new_session=True,
            )

            try:
                stdout_bytes, stderr_bytes = await asyncio.wait_for(
                    process.communicate(),
                    timeout=timeout,
                )
            except asyncio.TimeoutError:
                # Kill the whole process group (the shell AND its children).
                # A bare process.kill() would leave grandchildren running.
                import signal

                try:
                    os.killpg(process.pid, signal.SIGKILL)
                except (ProcessLookupError, PermissionError, OSError):
                    try:
                        process.kill()
                    except ProcessLookupError:
                        pass  # already exited
                # Wait for the process to be reaped.
                await process.wait()
                return ExecutionResult(
                    stdout="",
                    stderr=(
                        f"Command timed out after {timeout} seconds "
                        f"and was killed: '{command}'"
                    ),
                    exit_code=-1,
                )

            return ExecutionResult(
                stdout=(stdout_bytes or b"").decode("utf-8", errors="replace"),
                stderr=(stderr_bytes or b"").decode("utf-8", errors="replace"),
                exit_code=process.returncode if process.returncode is not None else -1,
            )

        except OSError as exc:
            return ExecutionResult(
                stdout="",
                stderr=f"Failed to start command: {exc}",
                exit_code=-1,
            )

    async def _execute_landlock(self, command: str, timeout: float) -> ExecutionResult:
        """Execute *command* inside a Landlock-sandboxed subprocess.

        No fallback -- if Landlock application fails in the child, the
        error propagates as a non-zero exit code.
        """
        from sandbox_agent.sandbox_subprocess import sandboxed_subprocess

        returncode, stdout, stderr = await sandboxed_subprocess(
            command=command,
            workspace_path=self._workspace_path,
            timeout=timeout,
        )
        return ExecutionResult(
            stdout=stdout,
            stderr=stderr,
            exit_code=returncode,
        )
+ +The graph binds six tools to an LLM and uses a structured reasoning loop: + +- **shell**: runs commands via :class:`SandboxExecutor` (with permission checks) +- **file_read**: reads files relative to the workspace (prevents path traversal) +- **file_write**: writes files relative to the workspace (prevents path traversal) +- **web_fetch**: fetches web content from allowed domains +- **explore**: spawns a read-only sub-agent for codebase research +- **delegate**: spawns a child agent session for delegated tasks + +Graph architecture (router → plan → execute → reflect): + +```mermaid +graph TD + START((User Message)) --> router + router -->|new/replan| planner + router -->|resume| executor + + planner --> executor + executor -->|tool_calls| tools + tools --> executor + executor -->|no tool_calls| reflector + + reflector -->|execute| executor + reflector -->|replan| planner + reflector -->|done| reporter + reporter --> END((Final Answer)) + + style router fill:#4CAF50,color:white + style planner fill:#2196F3,color:white + style executor fill:#FF9800,color:white + style tools fill:#607D8B,color:white + style reflector fill:#9C27B0,color:white + style reporter fill:#F44336,color:white +``` + +Key flows: +- **execute**: Step succeeded → executor runs the next plan step +- **replan**: Step failed → planner creates a new plan → executor runs it +- **done**: Task complete → reporter summarizes results + +The executor uses micro-reflection: one tool call per LLM invocation, +see result, decide next action. Budget limits (iterations, tokens, +wall clock) are the only hard stops. 
+""" + +from __future__ import annotations + +import logging +import os +from pathlib import Path +from typing import Any, Optional + +from langchain_core.tools import tool +from langchain_openai import ChatOpenAI +from langgraph.graph import MessagesState, StateGraph +from langgraph.prebuilt import ToolNode, tools_condition +from langgraph.types import Send, interrupt + +try: + from langgraph.errors import GraphInterrupt +except ImportError: + # Fallback for older langgraph versions + GraphInterrupt = type("GraphInterrupt", (Exception,), {}) + +from sandbox_agent.budget import AgentBudget +from sandbox_agent.executor import HitlRequired, SandboxExecutor +from sandbox_agent.permissions import PermissionChecker +from sandbox_agent.reasoning import ( + PlanStep, + _DEBUG_PROMPTS, + executor_node, + planner_node, + reflector_node, + reporter_node, + route_entry, + route_reflector, + router_node, +) +from sandbox_agent import plan_store as ps +from sandbox_agent.sources import SourcesConfig +from sandbox_agent.subagents import make_delegate_tool, make_explore_tool + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# State +# --------------------------------------------------------------------------- + + +class SandboxState(MessagesState): + """Extended MessagesState carrying sandbox-specific fields. + + Attributes + ---------- + context_id: + A2A context identifier for multi-turn conversations. + workspace_path: + Absolute path to the per-context workspace directory. + final_answer: + The agent's final answer (set when the graph completes). + plan: + Flat list of step descriptions (backward compat with serializer). + plan_steps: + Structured per-step tracking with status, tool calls, results. + This is the source of truth; ``plan`` is derived from it. 
+ plan_status: + Lifecycle status of the plan across A2A turns: + ``"executing"`` | ``"completed"`` | ``"failed"`` | ``"awaiting_continue"`` + plan_version: + Incremented on each replan. + original_request: + The user's first message that created this plan. + current_step: + Index of the plan step currently being executed (0-based). + step_results: + Summary of each completed step's output. + iteration: + Outer-loop iteration counter (planner → executor → reflector). + replan_count: + Number of times the reflector has chosen "replan". Used to cap + the replan loop and force termination after MAX_REPLAN_COUNT. + done: + Flag set by reflector when the task is complete. + skill_instructions: + Optional skill content loaded from a ``.claude/skills/`` file. + recent_decisions: + Rolling window of the last 10 reflector decisions (continue/replan/done). + _route: + Internal routing signal from the router node (not persisted). + """ + + context_id: str + workspace_path: str + final_answer: str + plan: list[str] + plan_steps: list[PlanStep] + plan_status: str + plan_version: int + original_request: str + current_step: int + step_results: list[str] + iteration: int + replan_count: int + done: bool + skill_instructions: str + prompt_tokens: int + completion_tokens: int + recent_decisions: list[str] + _tool_call_count: int + _route: str + _system_prompt: str + _prompt_messages: list[dict] + _budget_summary: dict + _no_tool_count: int + _sub_events: list[dict] + _last_tool_result: dict + _bound_tools: list[dict] + _llm_response: dict + _plan_store: dict + files_touched: list[str] + model: str + + +# --------------------------------------------------------------------------- +# Skill loader +# --------------------------------------------------------------------------- + + +def _load_skill(workspace: str, skill_id: str) -> str | None: + """Load a skill file from the workspace's ``.claude/skills/`` directory. 
+ + Parameters + ---------- + workspace: + Absolute path to the workspace root (or repo root). + skill_id: + Skill identifier, e.g. ``"rca:ci"`` or ``"tdd:hypershift"``. + Colons are converted to directory separators so ``rca:ci`` + resolves to ``rca/ci.md``. + + Returns + ------- + str | None + The skill file content, or ``None`` if no matching file exists. + """ + # Search in multiple locations: + # 1. Per-session workspace: /workspace/{contextId}/.claude/skills/ + # 2. Shared workspace root: /workspace/.claude/skills/ (cloned at startup) + workspace_root = os.environ.get("WORKSPACE_DIR", "/workspace") + search_dirs = [ + Path(workspace) / ".claude" / "skills", + Path(workspace_root) / ".claude" / "skills", + ] + + for skills_dir in search_dirs: + if not skills_dir.is_dir(): + continue + + # Primary path: replace ':' with '/' → rca:ci → rca/ci.md + primary = skills_dir / f"{skill_id.replace(':', '/')}.md" + if primary.is_file(): + logger.info("Loaded skill '%s' from %s", skill_id, primary) + return primary.read_text(encoding="utf-8", errors="replace") + + # Try SKILL.md inside directory named with colons → rca:ci/SKILL.md + skill_dir = skills_dir / skill_id.replace(":", "/") + skill_md = skill_dir / "SKILL.md" + if skill_md.is_file(): + logger.info("Loaded skill '%s' from %s", skill_id, skill_md) + return skill_md.read_text(encoding="utf-8", errors="replace") + + # Directory named with literal colon → rca:ci/SKILL.md + colon_dir = skills_dir / skill_id + colon_skill = colon_dir / "SKILL.md" + if colon_skill.is_file(): + logger.info("Loaded skill '%s' from %s (colon dir)", skill_id, colon_skill) + return colon_skill.read_text(encoding="utf-8", errors="replace") + + logger.warning("Skill '%s' not found in any search path", skill_id) + return None + + +# --------------------------------------------------------------------------- +# Tool factories +# --------------------------------------------------------------------------- + + +def _make_shell_tool(executor: 
def _make_shell_tool(executor: SandboxExecutor) -> Any:
    """Return a LangChain tool that delegates to *executor.run_shell*.

    On :class:`HitlRequired`, the tool calls LangGraph ``interrupt()`` to
    pause the graph and require explicit human approval before resuming.
    The graph will not continue until the human responds.
    """

    @tool
    async def shell(command: str) -> str:
        """Execute a shell command in the session workspace.

        The working directory is the per-session workspace. Use relative
        paths for files in this session. Files created here are visible
        in the Files tab.

        Args:
            command: The shell command to run.

        Returns:
            Command output (stdout + stderr), or pauses for human approval.
        """
        # Warn on bare `cd` — it has no effect in isolated shell execution
        # (each run_shell call is its own subprocess; cwd does not persist).
        if command.strip().startswith("cd ") and "&&" not in command:
            logger.warning(
                "Bare 'cd' command detected — has no effect in isolated shell: %s",
                command,
            )

        try:
            result = await executor.run_shell(command)
        except HitlRequired as exc:
            # Pause graph execution — requires human approval to resume.
            # The interrupt() call suspends the graph state. The A2A task
            # transitions to input_required. Only an explicit human
            # approval (via the HITLManager channel) resumes execution.
            approval = interrupt({
                "type": "approval_required",
                "command": exc.command,
                "message": f"Command '{exc.command}' requires human approval.",
            })
            # If we reach here, the human approved — execute the command.
            # NOTE(review): _execute() skips the sources.json re-check that
            # run_shell performs — confirm this bypass is intended for
            # human-approved commands.
            if isinstance(approval, dict) and approval.get("approved"):
                result = await executor._execute(command)
            else:
                return f"DENIED: command '{exc.command}' was rejected by human review."

        # Retry on rate-limit errors (GitHub API, etc.) with exponential backoff
        output = _format_result(result)
        if result.exit_code != 0 and _is_rate_limited(output):
            import asyncio
            for attempt in range(1, 4):  # up to 3 retries
                delay = 2 ** attempt  # 2s, 4s, 8s
                logger.info("Rate limit detected, retry %d/3 after %ds", attempt, delay)
                await asyncio.sleep(delay)
                try:
                    result = await executor.run_shell(command)
                except HitlRequired:
                    break  # don't retry HITL
                output = _format_result(result)
                if result.exit_code == 0 or not _is_rate_limited(output):
                    break

        return output

    return shell


# Cap on tool output returned to the LLM — prevents context window blowout.
_MAX_TOOL_OUTPUT = 10_000  # chars — prevent context window blowout


def _format_result(result: Any) -> str:
    """Format an ExecutionResult into a string, truncating large output.

    On failure (non-zero exit code) stderr is labelled and the exit code
    appended; on success stderr is passed through unlabelled, since tools
    like git write informational progress there.
    """
    parts: list[str] = []
    if result.stdout:
        parts.append(result.stdout)
    if result.stderr:
        if result.exit_code != 0:
            parts.append(f"STDERR: {result.stderr}")
        else:
            # Informational stderr (e.g., git clone progress) — not an error
            parts.append(result.stderr)
    if result.exit_code != 0:
        parts.append(f"EXIT_CODE: {result.exit_code}")
    text = "\n".join(parts) if parts else "(no output)"
    if len(text) > _MAX_TOOL_OUTPUT:
        kept = text[:_MAX_TOOL_OUTPUT]
        dropped = len(text) - _MAX_TOOL_OUTPUT
        text = f"{kept}\n\n[OUTPUT TRUNCATED — {dropped:,} chars omitted. Redirect large output to a file: command > output/result.txt]"
    return text


def _is_rate_limited(output: str) -> bool:
    """Detect rate-limit errors in command output.

    Substring matching is deliberately loose; note that the bare "429"
    pattern can also match unrelated numbers in output.
    """
    lower = output.lower()
    return any(pattern in lower for pattern in (
        "rate limit exceeded",
        "rate limit",
        "too many requests",
        "429",
        "api rate limit",
        "secondary rate limit",
    ))
+ """ + ws_root = Path(workspace_path).resolve() + + @tool + async def file_read(path: str) -> str: + """Read a file from the workspace. + + Args: + path: Relative path within the workspace directory. + + Returns: + The file contents, or an error message. + """ + resolved = (ws_root / path).resolve() + + # Prevent path traversal. + if not resolved.is_relative_to(ws_root): + return f"Error: path '{path}' resolves outside the workspace." + + if not resolved.is_file(): + return f"Error: file not found at '{path}'." + + try: + return resolved.read_text(encoding="utf-8", errors="replace") + except OSError as exc: + return f"Error reading file: {exc}" + + return file_read + + +def _make_file_write_tool(workspace_path: str) -> Any: + """Return a LangChain tool that writes files relative to *workspace_path*. + + The tool prevents path traversal and creates parent directories as needed. + """ + ws_root = Path(workspace_path).resolve() + + @tool + async def file_write(path: str, content: str) -> str: + """Write content to a file in the workspace. + + Args: + path: Relative path within the workspace directory. + content: The text content to write. + + Returns: + A confirmation message, or an error message. + """ + resolved = (ws_root / path).resolve() + + # Prevent path traversal. + if not resolved.is_relative_to(ws_root): + return f"Error: path '{path}' resolves outside the workspace." + + try: + resolved.parent.mkdir(parents=True, exist_ok=True) + resolved.write_text(content, encoding="utf-8") + return f"Successfully wrote {len(content)} bytes to '{path}'." + except OSError as exc: + return f"Error writing file: {exc}" + + return file_write + + +def _make_grep_tool(workspace_path: str) -> Any: + """Return a LangChain tool that searches file contents with regex.""" + ws_root = Path(workspace_path).resolve() + + @tool + async def grep(pattern: str, path: str = ".", include: str = "") -> str: + """Search for a regex pattern in file contents under the workspace. 
def _make_grep_tool(workspace_path: str) -> Any:
    """Return a LangChain tool that searches file contents with regex.

    The search shells out to ``grep -rn`` under the workspace root and
    refuses paths that resolve outside it.
    """
    ws_root = Path(workspace_path).resolve()

    @tool
    async def grep(pattern: str, path: str = ".", include: str = "") -> str:
        """Search for a regex pattern in file contents under the workspace.

        Args:
            pattern: Regex pattern to search for (e.g. 'def main', 'ERROR|FAIL').
            path: Relative directory or file to search in (default: workspace root).
            include: Glob filter for filenames (e.g. '*.py', '*.ts'). Empty = all files.

        Returns:
            Matching lines with file paths and line numbers, or an error message.
        """
        import asyncio as _aio

        search_path = (ws_root / path).resolve()
        if not search_path.is_relative_to(ws_root):
            return f"Error: path '{path}' resolves outside the workspace."

        cmd = ["grep", "-rn", "--color=never"]
        if include:
            cmd.extend(["--include", include])
        # Pass the pattern via -e and terminate option parsing with -- so
        # an LLM-supplied pattern beginning with '-' (e.g. '-v') cannot be
        # interpreted as a grep option (argument-injection hardening).
        cmd.extend(["-e", pattern, "--", str(search_path)])

        try:
            proc = await _aio.create_subprocess_exec(
                *cmd, stdout=_aio.subprocess.PIPE, stderr=_aio.subprocess.PIPE,
            )
        except OSError as exc:
            return f"Error running grep: {exc}"

        try:
            stdout, stderr = await _aio.wait_for(proc.communicate(), timeout=30)
        except _aio.TimeoutError:
            # Kill and reap the still-running grep instead of leaking it.
            proc.kill()
            await proc.wait()
            return "Error running grep: timed out after 30 seconds."
        except Exception as exc:
            return f"Error running grep: {exc}"

        out = stdout.decode(errors="replace")[:10000]
        if proc.returncode == 1:
            # grep exit status 1 means "no lines matched", not an error.
            return "No matches found."
        if proc.returncode != 0:
            return f"Error: {stderr.decode(errors='replace')[:500]}"
        # Make paths relative to workspace
        return out.replace(str(ws_root) + "/", "")

    return grep
+ """ + import fnmatch + matches = [] + for p in sorted(ws_root.rglob("*")): + if p.is_file(): + # Resolve symlinks and verify the real path stays inside workspace + resolved = p.resolve() + if not resolved.is_relative_to(ws_root): + continue + rel = str(p.relative_to(ws_root)) + if fnmatch.fnmatch(rel, pattern) or fnmatch.fnmatch(p.name, pattern): + matches.append(rel) + if len(matches) >= 200: + matches.append(f"... truncated ({len(matches)}+ matches)") + break + return "\n".join(matches) if matches else "No files matched." + + return glob + + +def _make_web_fetch_tool(sources_config: SourcesConfig) -> Any: + """Return a LangChain tool that fetches web content from allowed domains. + + The tool checks the URL's domain against ``sources.json`` allowed_domains + before making the request. + """ + + @tool + async def web_fetch(url: str) -> str: + """Fetch content from a URL. + + Domain filtering is handled by the outbound Squid proxy at the + network level. This tool fetches any URL the proxy allows. + + Args: + url: The full URL to fetch (e.g. https://github.com/org/repo/issues/1). + + Returns: + The page content as text, or an error message. + """ + import httpx + from urllib.parse import urlparse + + parsed = urlparse(url) + domain = parsed.hostname or "" + + if not sources_config.is_web_access_enabled(): + return "Error: web access is disabled in sources.json." + + # Domain filtering is delegated to the Squid proxy. + # Log the domain for observability but don't block. 
def _make_web_fetch_tool(sources_config: SourcesConfig) -> Any:
    """Return a LangChain tool that fetches web content.

    Domain filtering is enforced by the outbound Squid proxy at the
    network level; this tool only honours the sources.json web-access
    master switch and logs the domain for observability.
    """

    @tool
    async def web_fetch(url: str) -> str:
        """Fetch content from a URL.

        Domain filtering is handled by the outbound Squid proxy at the
        network level. This tool fetches any URL the proxy allows.

        Args:
            url: The full URL to fetch (e.g. https://github.com/org/repo/issues/1).

        Returns:
            The page content as text, or an error message.
        """
        import httpx
        from urllib.parse import urlparse

        parsed = urlparse(url)
        domain = parsed.hostname or ""

        if not sources_config.is_web_access_enabled():
            return "Error: web access is disabled in sources.json."

        # Domain filtering is delegated to the Squid proxy.
        # Log the domain for observability but don't block.
        logger.info("web_fetch: domain=%s url=%s", domain, url[:200])

        try:
            async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
                resp = await client.get(url, headers={"User-Agent": "kagenti-sandbox-agent/1.0"})
                resp.raise_for_status()

                content_type = resp.headers.get("content-type", "")
                text = resp.text

                # For HTML, try to extract readable text
                if "text/html" in content_type:
                    # Simple HTML tag stripping for readability: drop
                    # <script>/<style> elements (with their bodies), then
                    # strip remaining tags and collapse whitespace.
                    import re
                    text = re.sub(r"<script[^>]*>.*?</script>", "", text,
                                  flags=re.DOTALL | re.IGNORECASE)
                    text = re.sub(r"<style[^>]*>.*?</style>", "", text,
                                  flags=re.DOTALL | re.IGNORECASE)
                    text = re.sub(r"<[^>]+>", " ", text)
                    text = re.sub(r"\s+", " ", text).strip()

                # Truncate very long responses
                if len(text) > 50000:
                    text = text[:50000] + "\n\n[Content truncated at 50000 characters]"

                return text

        except httpx.HTTPStatusError as exc:
            return f"Error: HTTP {exc.response.status_code} fetching {url}"
        except httpx.RequestError as exc:
            return f"Error: could not fetch {url}: {exc}"

    return web_fetch
@tool
def step_done(summary: str) -> str:
    """Signal that the current step is COMPLETE. Call this instead of
    other tools when the step goal has been achieved and no more
    tool calls are needed.

    Args:
        summary: Brief summary of what was accomplished in this step.

    Returns:
        The summary text.
    """
    # Sentinel tool: the executor loop treats a `step_done` call as the
    # step-completion signal; the summary is echoed back unchanged.
    return summary
+ """ + # -- Executor ----------------------------------------------------------- + executor = SandboxExecutor( + workspace_path=workspace_path, + permission_checker=permission_checker, + sources_config=sources_config, + ) + + # -- LLM ---------------------------------------------------------------- + from sandbox_agent.configuration import Configuration + + config = Configuration() # type: ignore[call-arg] + # -- Budget ------------------------------------------------------------- + budget = AgentBudget() + + llm = ChatOpenAI( + model=config.llm_model, + base_url=config.llm_api_base, + api_key=config.llm_api_key, + timeout=budget.llm_timeout, + max_retries=budget.llm_max_retries, + model_kwargs={ + "extra_body": { + "metadata": { + "session_id": context_id, + "agent_name": os.environ.get("AGENT_NAME", "sandbox-legion"), + "namespace": namespace, + "max_session_tokens": budget.max_tokens, + } + } + }, + ) + + # -- Per-node model overrides ------------------------------------------- + def _make_llm(node_type: str) -> ChatOpenAI: + """Create an LLM instance for a specific node type, using model override if set.""" + node_model = config.model_for_node(node_type) + return ChatOpenAI( + model=node_model, + base_url=config.llm_api_base, + api_key=config.llm_api_key, + timeout=budget.llm_timeout, + max_retries=budget.llm_max_retries, + model_kwargs={ + "extra_body": { + "metadata": { + "session_id": context_id, + "agent_name": os.environ.get("AGENT_NAME", "sandbox-legion"), + "namespace": namespace, + "max_session_tokens": budget.max_tokens, + } + } + }, + ) + + # Only create separate instances when overrides differ from default + llm_for_planner = _make_llm("planner") if config.llm_model_planner else llm + llm_for_executor = _make_llm("executor") if config.llm_model_executor else llm + llm_for_reflector = _make_llm("reflector") if config.llm_model_reflector else llm + llm_for_reporter = _make_llm("reporter") if config.llm_model_reporter else llm + llm_for_thinking = 
_make_llm("thinking") if config.llm_model_thinking else llm + llm_for_micro = _make_llm("micro_reasoning") if config.llm_model_micro_reasoning else llm + + # -- Tools -------------------------------------------------------------- + # Create tool instances once — shared across node subsets. + shell_tool = _make_shell_tool(executor) + file_read_tool = _make_file_read_tool(workspace_path) + file_write_tool = _make_file_write_tool(workspace_path) + grep_tool = _make_grep_tool(workspace_path) + glob_tool = _make_glob_tool(workspace_path) + web_fetch_tool = _make_web_fetch_tool(sources_config) + + core_tools = [shell_tool, file_read_tool, file_write_tool, grep_tool, glob_tool, web_fetch_tool] + tools = core_tools + [ + make_explore_tool(workspace_path, llm), + step_done, + # delegate disabled — causes crashes when agent can't resolve paths + # make_delegate_tool(workspace_path, llm, context_id, core_tools, namespace), + ] + + # -- Per-node tool subsets ------------------------------------------------ + # Each reasoning node gets its own tools and tool_choice mode: + # executor: ALL tools, tool_choice="any" (must call tools) + # planner: glob, grep, file_read, file_write + respond_to_user (escape) + # reflector: glob, grep, file_read + respond_to_user (escape) + # router/reporter/step_selector: no tools (text-only) + + read_only_tools = [file_read_tool, grep_tool, glob_tool, respond_to_user] + planner_tools = [file_read_tool, grep_tool, glob_tool, file_write_tool, respond_to_user] + + # SANDBOX_FORCE_TOOL_CHOICE=1 (wizard "Force Tool Calling" toggle): + # When forced: two-phase executor call: + # Phase 1: llm_executor_reason (implicit auto) — produces text reasoning + # Phase 2: llm_executor (tool_choice="any") — produces structured tool call + # When not forced: single-phase (implicit auto, model chooses text or tools) + force_tools = os.environ.get("SANDBOX_FORCE_TOOL_CHOICE", "0") == "1" + if force_tools: + llm_executor = llm_for_executor.bind_tools(tools, 
tool_choice="any") + llm_executor_reason = llm_for_thinking # bare LLM for thinking, NO tools + else: + llm_executor = llm_for_executor.bind_tools(tools) # implicit auto + llm_executor_reason = None # no two-phase needed + llm_planner = llm_for_planner.bind_tools(planner_tools) # always auto + + # All nodes with tools use tool_choice="auto" + llm_reflector = llm_for_reflector.bind_tools(read_only_tools) # read-only for verification + llm_reporter = llm_for_reporter.bind_tools(read_only_tools) # read-only for file verification + + # ToolNodes for each node's tool subset + _executor_tool_node = ToolNode(tools) + _planner_tool_node = ToolNode(planner_tools) + _reflector_tool_node = ToolNode(read_only_tools) + + # -- Graph nodes (router-plan-execute-reflect) --------------------------- + # Each node function from reasoning.py takes (state, llm) — we wrap them + # in closures that capture the appropriate LLM instance. + + async def _router(state: SandboxState) -> dict[str, Any]: + return await router_node(state) + + async def _planner(state: SandboxState) -> dict[str, Any]: + return await planner_node(state, llm_planner, budget=budget) + + async def _executor(state: SandboxState) -> dict[str, Any]: + return await executor_node(state, llm_executor, budget=budget, llm_reason=llm_executor_reason) + + async def _reflector(state: SandboxState) -> dict[str, Any]: + return await reflector_node(state, llm_reflector, budget=budget) + + async def _reporter(state: SandboxState) -> dict[str, Any]: + return await reporter_node( + state, llm_reporter, budget=budget, + llm_reason=llm_executor_reason, + tools=read_only_tools, + ) + + async def _step_selector(state: SandboxState) -> dict[str, Any]: + """Pick the next step and prepare focused context for the executor. + + Uses a lightweight LLM call to review plan progress and write + a targeted brief for the executor — what to do, what worked/failed + before, and what to avoid. 
+ """ + from langchain_core.messages import SystemMessage as SM, HumanMessage as HM + + plan = state.get("plan", []) + plan_steps = list(state.get("plan_steps", [])) + current = state.get("current_step", 0) + messages = state.get("messages", []) + + # --- PlanStore: parallel nested plan tracking --- + store = state.get("_plan_store", {}) + if store and store.get("steps"): + current_info = ps.get_current_step(store) + if current_info: + step_key, step_data = current_info + try: + store = ps.set_step_status(store, step_key, "running") + except ValueError: + logger.warning("PlanStore: step %s not found, skipping", step_key) + + # Find next non-done step + next_step = current + for i in range(current, len(plan_steps)): + _ps = plan_steps[i] + status = _ps.get("status", "pending") if isinstance(_ps, dict) else "pending" + if status != "done": + next_step = i + break + else: + next_step = len(plan) + + # Mark selected step as running + if next_step < len(plan_steps) and isinstance(plan_steps[next_step], dict): + plan_steps[next_step] = {**plan_steps[next_step], "status": "running"} + + # Build plan status summary + plan_summary = [] + for i, step in enumerate(plan): + _ps = plan_steps[i] if i < len(plan_steps) else {} + status = _ps.get("status", "pending") if isinstance(_ps, dict) else "pending" + marker = "✓" if status == "done" else "→" if i == next_step else " " + result_hint = "" + if isinstance(_ps, dict) and _ps.get("result_summary"): + result_hint = f" — {_ps['result_summary'][:100]}" + plan_summary.append(f" {marker} {i+1}. 
[{status}] {step[:80]}{result_hint}") + + # Gather recent tool results (last 3 ToolMessages) + recent_results = [] + for m in reversed(messages[-10:]): + if hasattr(m, 'name') and getattr(m, 'type', '') == 'tool': + content = str(getattr(m, 'content', ''))[:300] + recent_results.insert(0, f" [{m.name}] {content}") + if len(recent_results) >= 3: + break + + if next_step >= len(plan): + # All done + logger.info("StepSelector: all %d steps complete", len(plan)) + result_done: dict[str, Any] = { + "current_step": next_step, + "plan_steps": plan_steps, + "_tool_call_count": 0, + "done": True, + } + if store: + result_done["_plan_store"] = store + return result_done + + # Quick LLM call — write a focused brief for the executor + step_text = plan[next_step] if next_step < len(plan) else "N/A" + prompt = f"""You are a step coordinator. Write a 2-3 sentence brief for the executor. + +Plan progress: +{chr(10).join(plan_summary)} + +Next step to execute: {next_step + 1}. {step_text} + +Recent tool results: +{chr(10).join(recent_results) if recent_results else '(none yet)'} + +WORKSPACE RULE: Each shell command starts fresh in /workspace. Bare `cd` has no effect. +If the step involves a cloned repo, always write `cd repos/ && ` in the brief. +Example: "cd repos/kagenti && gh pr list" — never just "gh pr list". + +Write a brief: what EXACTLY to do for step {next_step + 1}, what context from previous steps is relevant, and what to watch out for. Be specific about commands/tools to use, and always include the full `cd && command` pattern when a cloned repo is involved.""" + + sys_msg = SM(content="You are a concise step coordinator. 
Output ONLY the brief, no preamble.") + user_msg = HM(content=prompt) + try: + response = await llm.ainvoke([sys_msg, user_msg]) + brief = response.content.strip() + usage = getattr(response, 'usage_metadata', None) or {} + budget.add_tokens( + usage.get('input_tokens', 0) + usage.get('output_tokens', 0) + ) + except Exception as e: + logger.warning("StepSelector LLM call failed: %s — using default brief", e) + brief = f"Execute step {next_step + 1}: {step_text}" + response = None + + logger.info("StepSelector: step %d/%d brief: %s", next_step + 1, len(plan), brief[:100]) + result: dict[str, Any] = { + "current_step": next_step, + "plan_steps": plan_steps, + "_tool_call_count": 0, + "skill_instructions": f"STEP BRIEF FROM COORDINATOR:\n{brief}\n\n---\n", + } + if store: + result["_plan_store"] = store + if _DEBUG_PROMPTS: + from sandbox_agent.context_builders import LLMCallCapture + result["_system_prompt"] = prompt[:10000] + result["_prompt_messages"] = [ + {"role": "system", "preview": "Step coordinator brief prompt"}, + {"role": "human", "preview": prompt[:500]}, + ] + if response: + capture = LLMCallCapture(response=response) + result["_llm_response"] = capture._format_response() + return result + + # -- Safe ToolNode wrappers — never crash the graph ---------------------- + + def _make_safe_tool_wrapper(tool_node: ToolNode, label: str): + """Create a safe tool execution wrapper for a ToolNode.""" + async def _safe(state: SandboxState) -> dict[str, Any]: + from langchain_core.messages import ToolMessage + try: + return await tool_node.ainvoke(state) + except (GraphInterrupt, KeyboardInterrupt, SystemExit): + raise + except Exception as exc: + logger.error("%s ToolNode error: %s", label, exc, exc_info=True) + messages = state.get("messages", []) + error_msgs = [] + if messages: + last = messages[-1] + for tc in getattr(last, "tool_calls", []): + tc_id = tc.get("id", "unknown") if isinstance(tc, dict) else getattr(tc, "id", "unknown") + tc_name = tc.get("name", 
"unknown") if isinstance(tc, dict) else getattr(tc, "name", "unknown") + error_msgs.append(ToolMessage( + content=f"Tool error: {exc}", + tool_call_id=tc_id, + name=tc_name, + )) + if not error_msgs: + error_msgs.append(ToolMessage( + content=f"Tool execution failed: {exc}", + tool_call_id="error", + name="unknown", + )) + return {"messages": error_msgs} + return _safe + + _reporter_tool_node = ToolNode(read_only_tools) + + _safe_executor_tools = _make_safe_tool_wrapper(_executor_tool_node, "executor") + _safe_planner_tools = _make_safe_tool_wrapper(_planner_tool_node, "planner") + _safe_reflector_tools = _make_safe_tool_wrapper(_reflector_tool_node, "reflector") + _safe_reporter_tools = _make_safe_tool_wrapper(_reporter_tool_node, "reporter") + + # -- Assemble graph ----------------------------------------------------- + # + # Topology (all nodes use tool_choice="auto"): + # + # router → [plan] → planner ⇄ planner_tools → step_selector + # [resume] → step_selector + # + # step_selector → executor ⇄ tools → reflector ⇄ reflector_tools + # + # reflector_route → [done] → reporter → END + # [continue] → step_selector + # [replan] → planner + # + # Tool subsets: + # planner: glob, grep, file_read, file_write (inspect workspace, save plans) + # executor: all tools (shell, files, grep, glob, web_fetch, explore, delegate) + # reflector: glob, grep, file_read (verify step outcomes before deciding) + # + graph = StateGraph(SandboxState) + graph.add_node("router", _router) + graph.add_node("planner", _planner) + graph.add_node("planner_tools", _safe_planner_tools) + graph.add_node("step_selector", _step_selector) + graph.add_node("executor", _executor) + graph.add_node("tools", _safe_executor_tools) + graph.add_node("reflector", _reflector) + graph.add_node("reflector_tools", _safe_reflector_tools) + graph.add_node("reporter", _reporter) + + # Entry: router decides resume vs plan + graph.set_entry_point("router") + graph.add_conditional_edges( + "router", + route_entry, + 
{"resume": "step_selector", "plan": "planner"}, + ) + + # Planner → planner_tools (if tool_calls) or → step_selector (if no tool_calls) + graph.add_conditional_edges( + "planner", + tools_condition, + {"tools": "planner_tools", "__end__": "step_selector"}, + ) + graph.add_edge("planner_tools", "planner") + + graph.add_edge("step_selector", "executor") + + # Executor → executor_tools (if tool_calls) or → reflector (if no tool_calls) + graph.add_conditional_edges( + "executor", + tools_condition, + {"tools": "tools", "__end__": "reflector"}, + ) + graph.add_edge("tools", "executor") + + # Reflector → reflector_tools (if tool_calls) or → route decision + graph.add_conditional_edges( + "reflector", + tools_condition, + {"tools": "reflector_tools", "__end__": "reflector_route"}, + ) + graph.add_edge("reflector_tools", "reflector") + + # Reflector route → reporter (done), step_selector (continue), or planner (replan) + graph.add_node("reflector_route", lambda state: state) # pass-through + graph.add_conditional_edges( + "reflector_route", + route_reflector, + {"done": "reporter", "execute": "step_selector", "replan": "planner"}, + ) + # Reporter executes tools internally via invoke_with_tool_loop + graph.add_edge("reporter", "__end__") + + return graph.compile(checkpointer=checkpointer) From e8768fdcba1312233712e9502585e85326fb72e6 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:48:17 +0100 Subject: [PATCH 10/26] feat(sandbox): graph card manifest with event catalog and topology introspection Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/graph_card.py | 580 ++++++++++++++++++ 1 file changed, 580 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/graph_card.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/graph_card.py b/a2a/sandbox_agent/src/sandbox_agent/graph_card.py new file mode 100644 index 00000000..896e7b9d --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/graph_card.py @@ -0,0 +1,580 @@ +# Copyright 
2025 IBM Corp.
# Licensed under the Apache License, Version 2.0

"""AgentGraphCard — self-describing manifest for the agent's processing graph.

This module defines the event catalog and generates a "graph card" from
LangGraph introspection. The graph card is a structured dict that tells
consumers (UI, backend, observability) everything they need to render the
agent's reasoning loop:

* **EVENT_CATALOG** — every event type the agent can stream, with category,
  field definitions, and debug-field metadata so the UI knows what to expect
  and how to render it.
* **COMMON_EVENT_FIELDS** — fields injected by the serializer into every
  event (type, loop_id, node_visit, event_index, etc.).
* **TOPOLOGY_NODE_DESCRIPTIONS** — human-readable descriptions for each
  LangGraph node.
* **build_graph_card()** — introspects a compiled LangGraph ``CompiledGraph``
  and returns the full card as a plain dict.
"""

from __future__ import annotations

from typing import Any, Dict, List

# ---------------------------------------------------------------------------
# Common fields injected into every serialized event
# ---------------------------------------------------------------------------

# Each entry maps field name -> {"type": ..., "description": ...}; consumers
# may rely on every one of these keys being present on every streamed event.
COMMON_EVENT_FIELDS: Dict[str, Dict[str, str]] = {
    "type": {
        "type": "str",
        "description": "Event type discriminator (one of EVENT_CATALOG keys).",
    },
    "loop_id": {
        "type": "str",
        "description": "Unique identifier for this reasoning-loop invocation.",
    },
    "langgraph_node": {
        "type": "str",
        "description": "Name of the LangGraph node that produced this event.",
    },
    "node_visit": {
        "type": "int",
        "description": "Monotonic counter incremented each time a new major node is visited.",
    },
    "event_index": {
        "type": "int",
        "description": "Global sequence number across all events in a loop (for ordering).",
    },
    "model": {
        "type": "str",
        "description": "LLM model identifier used for this event (empty if no LLM call).",
    },
    "prompt_tokens": {
        "type": "int",
        "description": "Number of prompt tokens consumed by this event's LLM call.",
    },
    "completion_tokens": {
        "type": "int",
        "description": "Number of completion tokens produced by this event's LLM call.",
    },
}

# ---------------------------------------------------------------------------
# Event catalog
# ---------------------------------------------------------------------------

#: Complete catalog of every event type the sandbox agent can stream.
#:
#: Each entry contains:
#:   category        – semantic grouping for the UI
#:   description     – what this event represents
#:   langgraph_nodes – LangGraph node names that can produce this event
#:   has_llm_call    – whether the event involves an LLM invocation
#:   terminal        – True only for the final-answer event
#:   fields          – data fields specific to this event type
#:   debug_fields    – fields available in debug / inspector mode
EVENT_CATALOG: Dict[str, Dict[str, Any]] = {
    # ── Reasoning ─────────────────────────────────────────────────────
    "planner_output": {
        "category": "reasoning",
        "description": "Planner created or revised a multi-step plan.",
        "langgraph_nodes": ["planner"],
        "has_llm_call": True,
        "fields": {
            "steps": {
                "type": "List[str]",
                "description": "Ordered list of plan step descriptions.",
            },
            "iteration": {
                "type": "int",
                "description": "Planning iteration (0 = initial, >0 = replan).",
            },
        },
        "debug_fields": {
            "system_prompt": {
                "type": "str",
                "description": "System prompt sent to the planner LLM.",
            },
            "bound_tools": {
                "type": "List[str]",
                "description": "Tool names bound to the planner LLM.",
            },
            "prompt_messages": {
                "type": "List[dict]",
                "description": "Full message history sent to the LLM.",
            },
            "llm_response": {
                "type": "str",
                "description": "Raw LLM response text.",
            },
        },
    },
    "executor_step": {
        "category": "reasoning",
        "description": "Executor selected and began working on a plan step.",
        "langgraph_nodes": ["step_selector"],
        "has_llm_call": False,
        "fields": {
            "step": {
                "type": "int",
                "description": "Current step index (1-based).",
            },
            "total_steps": {
                "type": "int",
                "description": "Total number of plan steps.",
            },
            "description": {
                "type": "str",
                "description": "Human-readable description of the current step.",
            },
            "reasoning": {
                "type": "str",
                "description": "LLM response text (up to 2000 chars).",
            },
        },
        "debug_fields": {
            "logic": {
                "type": "str",
                "description": "Step selection logic: picks current_step from plan_steps.",
            },
        },
    },
    "thinking": {
        "category": "reasoning",
        "description": (
            "Intermediate thinking iteration from a reasoning LLM "
            "(bare model, no tools)."
        ),
        "langgraph_nodes": ["planner", "executor", "reflector"],
        "has_llm_call": True,
        "fields": {
            "content": {
                "type": "str",
                "description": "Thinking text produced by the reasoning LLM.",
            },
            "iteration": {
                "type": "int",
                "description": "Thinking iteration number within this node visit.",
            },
            "total_iterations": {
                "type": "int",
                "description": "Total thinking iterations in this cycle.",
            },
        },
        "debug_fields": {
            "system_prompt": {
                "type": "str",
                "description": "System prompt for the thinking LLM.",
            },
            "bound_tools": {
                "type": "List[str]",
                "description": "Always empty — thinking LLM has no tools.",
            },
            "prompt_messages": {
                "type": "List[dict]",
                "description": "Messages sent to the thinking LLM.",
            },
            "llm_response": {
                "type": "str",
                "description": "Raw thinking response.",
            },
        },
    },
    "micro_reasoning": {
        "category": "reasoning",
        "description": (
            "Executor's intermediate LLM reasoning within a single plan step "
            "(tool-loop iteration)."
        ),
        "langgraph_nodes": ["executor"],
        "has_llm_call": True,
        "fields": {
            "content": {
                "type": "str",
                "description": "Reasoning text from the micro-reasoning LLM.",
            },
            "step": {
                "type": "int",
                "description": "Current plan step index.",
            },
            "micro_step": {
                "type": "int",
                "description": "Tool-loop iteration within the current plan step.",
            },
            "thinking_count": {
                "type": "int",
                "description": "Number of thinking iterations that preceded this reasoning.",
            },
        },
        "debug_fields": {
            "system_prompt": {
                "type": "str",
                "description": "System prompt for the micro-reasoning LLM.",
            },
            "bound_tools": {
                "type": "List[str]",
                "description": "Tool names available to the micro-reasoning LLM.",
            },
            "prompt_messages": {
                "type": "List[dict]",
                "description": "Messages sent to the micro-reasoning LLM.",
            },
            "llm_response": {
                "type": "str",
                "description": "Raw LLM response before tool extraction.",
            },
        },
    },
    # ── Execution ─────────────────────────────────────────────────────
    "tool_call": {
        "category": "execution",
        "description": "A tool was invoked by the executor or planner LLM.",
        "langgraph_nodes": ["executor", "planner"],
        "has_llm_call": False,
        "fields": {
            "step": {
                "type": "int",
                "description": "Plan step that triggered this tool call.",
            },
            "name": {
                "type": "str",
                "description": "Tool name.",
            },
            "args": {
                "type": "str",
                "description": "JSON-encoded tool arguments.",
            },
        },
        "debug_fields": {},
    },
    # ── Tool output ───────────────────────────────────────────────────
    "tool_result": {
        "category": "tool_output",
        "description": "A tool returned its result.",
        "langgraph_nodes": ["tools", "planner_tools", "reflector_tools"],
        "has_llm_call": False,
        "fields": {
            "step": {
                "type": "int",
                "description": "Plan step this result belongs to.",
            },
            "name": {
                "type": "str",
                "description": "Tool name that produced the result.",
            },
            "output": {
                "type": "str",
                "description": "Tool output (may be truncated).",
            },
        },
        "debug_fields": {},
    },
    # ── Decision ──────────────────────────────────────────────────────
    "reflector_decision": {
        "category": "decision",
        "description": (
            "Reflector reviewed execution and decided: continue, replan, or done."
        ),
        "langgraph_nodes": ["reflector"],
        "has_llm_call": True,
        "fields": {
            "decision": {
                "type": "str",
                "description": "Routing decision.",
                "enum": ["continue", "replan", "done"],
            },
            "assessment": {
                "type": "str",
                "description": "Full reflection assessment text.",
            },
            "iteration": {
                "type": "int",
                "description": "Reflect-execute loop iteration.",
            },
        },
        "debug_fields": {
            "system_prompt": {
                "type": "str",
                "description": "System prompt for the reflector LLM.",
            },
            "bound_tools": {
                "type": "List[str]",
                "description": "Read-only tools bound to the reflector.",
            },
            "prompt_messages": {
                "type": "List[dict]",
                "description": "Messages sent to the reflector LLM.",
            },
            "llm_response": {
                "type": "str",
                "description": "Raw reflector LLM output.",
            },
        },
    },
    "router_decision": {
        "category": "decision",
        "description": "Router decided whether to plan from scratch or resume execution.",
        "langgraph_nodes": ["router"],
        "has_llm_call": False,
        "fields": {
            "route": {
                "type": "str",
                "description": "Chosen route.",
                "enum": ["plan", "resume"],
            },
            "plan_status": {
                "type": "str",
                "description": "Current plan status at time of routing.",
            },
        },
        "debug_fields": {
            "logic": {
                "type": "str",
                "description": (
                    "Routing logic: checks plan_status to decide resume vs plan."
                ),
            },
        },
    },
    # ── Terminal ──────────────────────────────────────────────────────
    "reporter_output": {
        "category": "terminal",
        "description": "Reporter generated the final answer for the user.",
        "langgraph_nodes": ["reporter"],
        "has_llm_call": True,
        "terminal": True,
        "fields": {
            "content": {
                "type": "str",
                "description": "Final answer content (markdown).",
            },
        },
        "debug_fields": {
            "system_prompt": {
                "type": "str",
                "description": "System prompt for the reporter LLM.",
            },
            "bound_tools": {
                "type": "List[str]",
                "description": "Tools available to the reporter (for citations).",
            },
            "prompt_messages": {
                "type": "List[dict]",
                "description": "Messages sent to the reporter LLM.",
            },
            "llm_response": {
                "type": "str",
                "description": "Raw reporter LLM output.",
            },
        },
    },
    # ── Meta ──────────────────────────────────────────────────────────
    "budget_update": {
        "category": "meta",
        "description": "Budget tracking update (tokens consumed, wall-clock time).",
        "langgraph_nodes": [],
        "has_llm_call": False,
        "fields": {
            "tokens_used": {
                "type": "int",
                "description": "Total tokens consumed so far.",
            },
            "tokens_budget": {
                "type": "int",
                "description": "Maximum token budget.",
            },
            "wall_clock_s": {
                "type": "float",
                "description": "Elapsed wall-clock seconds.",
            },
            "max_wall_clock_s": {
                "type": "float",
                "description": "Maximum allowed wall-clock seconds.",
            },
        },
        "debug_fields": {},
    },
    "node_transition": {
        "category": "meta",
        "description": (
            "Internal marker indicating a graph-level transition between nodes."
        ),
        "langgraph_nodes": [],
        "has_llm_call": False,
        "fields": {
            "from_node": {
                "type": "str",
                "description": "Node the transition originates from.",
            },
            "to_node": {
                "type": "str",
                "description": "Node the transition goes to.",
            },
        },
        "debug_fields": {},
    },
    # ── Interaction ───────────────────────────────────────────────────
    "hitl_request": {
        "category": "interaction",
        "description": (
            "Human-in-the-loop approval request — the executor is pausing "
            "to ask the user before proceeding."
        ),
        "langgraph_nodes": ["executor"],
        "has_llm_call": False,
        "fields": {
            "tool_name": {
                "type": "str",
                "description": "Tool that requires approval.",
            },
            "args": {
                "type": "str",
                "description": "JSON-encoded tool arguments pending approval.",
            },
            "reason": {
                "type": "str",
                "description": "Why the agent is requesting approval.",
            },
        },
        "debug_fields": {},
    },
}

# Valid category values (mirrors the set used in EVENT_CATALOG).
# NOTE(review): not referenced elsewhere in this module — presumably used by
# consumers or tests to validate catalog entries; confirm before removing.
VALID_CATEGORIES = frozenset(
    {
        "reasoning",
        "execution",
        "tool_output",
        "decision",
        "terminal",
        "meta",
        "interaction",
    }
)

# ---------------------------------------------------------------------------
# LangGraph topology node descriptions
# ---------------------------------------------------------------------------

#: Human-readable description for each node in the compiled graph.
TOPOLOGY_NODE_DESCRIPTIONS: Dict[str, str] = {
    "router": (
        "Entry node — decides whether to create a new plan or resume execution "
        "of an existing plan."
    ),
    "planner": (
        "Creates or revises a multi-step plan using an LLM with planning tools "
        "(glob, grep, file_read, file_write)."
    ),
    "planner_tools": (
        "Executes tool calls issued by the planner (workspace inspection, "
        "plan persistence)."
    ),
    "step_selector": (
        "Picks the next plan step to execute and prepares the executor context."
+ ), + "executor": ( + "Executes the current plan step using an LLM with the full tool suite " + "(shell, files, grep, glob, web_fetch, explore, delegate)." + ), + "tools": ( + "Executes tool calls issued by the executor." + ), + "reflector": ( + "Reviews execution results and decides whether to continue, replan, " + "or declare done. Uses read-only tools (glob, grep, file_read)." + ), + "reflector_tools": ( + "Executes read-only tool calls issued by the reflector for verification." + ), + "reflector_route": ( + "Pass-through node that routes the reflector's decision to the next node " + "(reporter, step_selector, or planner)." + ), + "reporter": ( + "Generates the final user-facing answer by synthesizing all execution " + "results. May invoke tools internally for citation verification." + ), +} + + +# --------------------------------------------------------------------------- +# Graph card builder +# --------------------------------------------------------------------------- + + +def build_graph_card( + compiled: Any, + agent_id: str = "sandbox_agent", +) -> Dict[str, Any]: + """Build the AgentGraphCard from a compiled LangGraph. + + Parameters + ---------- + compiled: + A ``CompiledStateGraph`` (or any object whose ``.get_graph()`` returns + a ``Graph`` with ``.nodes`` and ``.edges``). + agent_id: + Identifier for the agent (used in the card's ``id`` field). 
+ + Returns + ------- + dict + A plain dict with keys: + - ``id`` — agent identifier + - ``framework`` — always ``"langgraph"`` + - ``version`` — card schema version + - ``event_catalog`` — the full ``EVENT_CATALOG`` + - ``common_event_fields`` — the ``COMMON_EVENT_FIELDS`` dict + - ``topology`` — ``{nodes, edges, entry_node}`` + """ + graph = compiled.get_graph() + + # ── Nodes ───────────────────────────────────────────────────────── + raw_nodes: List[str] = [ + node_id + for node_id in graph.nodes + if node_id not in ("__start__", "__end__") + ] + nodes: Dict[str, Dict[str, str]] = {} + for node_id in raw_nodes: + nodes[node_id] = { + "description": TOPOLOGY_NODE_DESCRIPTIONS.get(node_id, ""), + } + + # ── Edges ───────────────────────────────────────────────────────── + edges: List[Dict[str, str]] = [] + for edge in graph.edges: + source = edge.source if hasattr(edge, "source") else edge[0] + target = edge.target if hasattr(edge, "target") else edge[1] + # Skip __start__ / __end__ for cleaner topology + if source in ("__start__", "__end__") or target in ("__start__", "__end__"): + continue + edges.append({"source": source, "target": target}) + + # ── Entry node ──────────────────────────────────────────────────── + # The entry node is the first node reachable from __start__. 
+ entry_node: str = "" + for edge in graph.edges: + src = edge.source if hasattr(edge, "source") else edge[0] + tgt = edge.target if hasattr(edge, "target") else edge[1] + if src == "__start__": + entry_node = tgt + break + + return { + "id": agent_id, + "framework": "langgraph", + "version": "1.0", + "event_catalog": EVENT_CATALOG, + "common_event_fields": COMMON_EVENT_FIELDS, + "topology": { + "nodes": nodes, + "edges": edges, + "entry_node": entry_node, + }, + } From ec59f47d24adad2992090d42fad4088c751b88d3 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:48:24 +0100 Subject: [PATCH 11/26] feat(sandbox): raw ctypes wrapper for Linux Landlock LSM syscalls (x86_64/aarch64) Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/landlock_ctypes.py | 193 ++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py b/a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py new file mode 100644 index 00000000..ff9b35ca --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py @@ -0,0 +1,193 @@ +"""Raw ctypes wrapper for Linux Landlock LSM syscalls. + +Architecture-aware: supports x86_64 and aarch64 syscall numbers. +Zero external dependencies -- pure ctypes + stdlib. + +Landlock is IRREVERSIBLE once applied to a thread. There is no undo. +All functions in this module fail hard (raise OSError) on error. 
+""" + +from __future__ import annotations + +import ctypes +import os +import platform +import struct + +# --------------------------------------------------------------------------- +# Syscall numbers by architecture +# --------------------------------------------------------------------------- + +_ARCH = platform.machine() + +if _ARCH == "x86_64": + _SYS_LANDLOCK_CREATE_RULESET = 444 + _SYS_LANDLOCK_ADD_RULE = 445 + _SYS_LANDLOCK_RESTRICT_SELF = 446 +elif _ARCH == "aarch64": + _SYS_LANDLOCK_CREATE_RULESET = 441 + _SYS_LANDLOCK_ADD_RULE = 442 + _SYS_LANDLOCK_RESTRICT_SELF = 443 +else: + raise RuntimeError(f"Unsupported architecture for Landlock: {_ARCH}") + +# --------------------------------------------------------------------------- +# Landlock constants +# --------------------------------------------------------------------------- + +LANDLOCK_RULE_PATH_BENEATH = 1 + +# ABI v1 access flags (13 flags) +_ACCESS_FS_V1 = ( + (1 << 0) # EXECUTE + | (1 << 1) # WRITE_FILE + | (1 << 2) # READ_FILE + | (1 << 3) # READ_DIR + | (1 << 4) # REMOVE_DIR + | (1 << 5) # REMOVE_FILE + | (1 << 6) # MAKE_CHAR + | (1 << 7) # MAKE_DIR + | (1 << 8) # MAKE_REG + | (1 << 9) # MAKE_SOCK + | (1 << 10) # MAKE_FIFO + | (1 << 11) # MAKE_BLOCK + | (1 << 12) # MAKE_SYM +) + +# ABI v2 adds REFER +_ACCESS_FS_REFER = 1 << 13 + +# ABI v3 adds TRUNCATE +_ACCESS_FS_TRUNCATE = 1 << 14 + +# Read-only subset (for ro_paths) +ACCESS_FS_READ_ONLY = ( + (1 << 0) # EXECUTE + | (1 << 2) # READ_FILE + | (1 << 3) # READ_DIR +) + +_libc = ctypes.CDLL("libc.so.6", use_errno=True) + +# --------------------------------------------------------------------------- +# Syscall helpers +# --------------------------------------------------------------------------- + + +def _syscall(nr: int, *args: int) -> int: + """Invoke a raw syscall. 
Returns the result or raises OSError.""" + result = _libc.syscall(ctypes.c_long(nr), *[ctypes.c_long(a) for a in args]) + if result < 0: + errno = ctypes.get_errno() + raise OSError(errno, f"syscall {nr} failed: {os.strerror(errno)}") + return result + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + + +def get_abi_version() -> int: + """Query the kernel's Landlock ABI version. + + Returns an integer >= 1 if Landlock is supported. + Raises OSError if Landlock is not available. + """ + # landlock_create_ruleset(NULL, 0, LANDLOCK_CREATE_RULESET_VERSION=1<<0) + LANDLOCK_CREATE_RULESET_VERSION = 1 << 0 + return _syscall(_SYS_LANDLOCK_CREATE_RULESET, 0, 0, LANDLOCK_CREATE_RULESET_VERSION) + + +def _get_fs_access_flags(abi_version: int) -> int: + """Return the full set of handled_access_fs flags for the given ABI version.""" + flags = _ACCESS_FS_V1 + if abi_version >= 2: + flags |= _ACCESS_FS_REFER + if abi_version >= 3: + flags |= _ACCESS_FS_TRUNCATE + return flags + + +def _add_rule(ruleset_fd: int, path: str, access: int) -> None: + """Add a path-beneath rule to an existing Landlock ruleset. + + Parameters + ---------- + ruleset_fd: + File descriptor of the Landlock ruleset. + path: + Absolute filesystem path to allow. + access: + Bitmask of allowed access rights. 
+ """ + parent_fd = os.open(path, os.O_PATH | os.O_CLOEXEC) + try: + # struct landlock_path_beneath_attr { + # __u64 allowed_access; // 8 bytes + # __s32 parent_fd; // 4 bytes + # // 4 bytes padding + # } + attr = struct.pack("QiI", access, parent_fd, 0) + attr_ptr = ctypes.c_char_p(attr) + _syscall( + _SYS_LANDLOCK_ADD_RULE, + ruleset_fd, + LANDLOCK_RULE_PATH_BENEATH, + ctypes.cast(attr_ptr, ctypes.c_void_p).value, + 0, + ) + finally: + os.close(parent_fd) + + +def apply_landlock(rw_paths: list[str], ro_paths: list[str]) -> None: + """Create a Landlock ruleset, add path rules, and restrict the current thread. + + This is IRREVERSIBLE. After this call, the thread can only access + the specified paths with the specified permissions. + + Parameters + ---------- + rw_paths: + Paths to allow full read-write access. + ro_paths: + Paths to allow read-only access (execute + read_file + read_dir). + + Raises + ------ + OSError + If any Landlock syscall fails. No fallback, no degraded mode. + """ + abi = get_abi_version() + handled_access_fs = _get_fs_access_flags(abi) + + # struct landlock_ruleset_attr { __u64 handled_access_fs; } + ruleset_attr = struct.pack("Q", handled_access_fs) + ruleset_attr_ptr = ctypes.c_char_p(ruleset_attr) + ruleset_fd = _syscall( + _SYS_LANDLOCK_CREATE_RULESET, + ctypes.cast(ruleset_attr_ptr, ctypes.c_void_p).value, + len(ruleset_attr), + 0, + ) + + try: + # Add read-write path rules + for path in rw_paths: + if os.path.exists(path): + _add_rule(ruleset_fd, path, handled_access_fs) + + # Add read-only path rules + for path in ro_paths: + if os.path.exists(path): + _add_rule(ruleset_fd, path, ACCESS_FS_READ_ONLY) + + # prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) -- required before restrict_self + PR_SET_NO_NEW_PRIVS = 38 + _libc.prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) + + # landlock_restrict_self(ruleset_fd, 0) + _syscall(_SYS_LANDLOCK_RESTRICT_SELF, ruleset_fd, 0) + finally: + os.close(ruleset_fd) From 58876bd6a1c2d804b9b2ef54e9f109468bd42d16 Mon Sep 
"""Startup probe for Landlock filesystem isolation.

Forks a child process to verify that Landlock actually works on this
kernel. The child applies Landlock, writes to an allowed directory,
and verifies that reads outside the sandbox are blocked.

Because Landlock is irreversible, the probe MUST run in a fork.
If the probe fails, the process exits with sys.exit(1).
"""

import logging
import os
import subprocess
import sys
import textwrap
from pathlib import Path

logger = logging.getLogger(__name__)


def probe_landlock() -> int:
    """Run a child process that applies Landlock and verifies it blocks escapes.

    The child applies Landlock with only a fresh temp directory writable,
    then checks that an in-sandbox write/read round-trips and that a write
    outside the sandbox is rejected.

    Returns
    -------
    int
        The Landlock ABI version reported by the child on success.

    Notes
    -----
    Calls ``sys.exit(1)`` if Landlock is unavailable, the probe fails,
    or the child does not finish within 30 seconds.
    """
    # The child script imports landlock_ctypes from the same package.
    # We run it as a subprocess so Landlock restrictions are confined
    # to the child process and do not affect the parent.
    #
    # Importability is provided via PYTHONPATH below; the child must NOT
    # reference __file__, which is undefined under ``python -c``.
    child_script = textwrap.dedent("""\
        import os
        import sys
        import tempfile

        from sandbox_agent.landlock_ctypes import apply_landlock, get_abi_version

        abi = get_abi_version()

        # Create a temp directory for the sandbox
        tmp_dir = tempfile.mkdtemp(prefix="landlock_probe_")

        # Read-only paths for basic system functionality
        ro_paths = []
        for p in ["/usr", "/lib", "/lib64", "/etc"]:
            if os.path.exists(p):
                ro_paths.append(p)

        # Apply Landlock: only tmp_dir is writable
        apply_landlock(rw_paths=[tmp_dir], ro_paths=ro_paths)

        # Verify: writing inside the sandbox must succeed
        test_file = os.path.join(tmp_dir, "probe_test.txt")
        with open(test_file, "w") as f:
            f.write("landlock probe ok")

        # Verify: reading the file back must succeed
        with open(test_file, "r") as f:
            content = f.read()
        assert content == "landlock probe ok", f"Read-back mismatch: {content!r}"

        # Verify: writing OUTSIDE the sandbox must fail
        blocked = False
        try:
            with open("/tmp/landlock_escape_test.txt", "w") as f:
                f.write("should not work")
        except PermissionError:
            blocked = True
        except OSError as e:
            # EACCES (13) is also acceptable
            if e.errno == 13:
                blocked = True
            else:
                raise

        if not blocked:
            print("LANDLOCK_FAIL: write outside sandbox was NOT blocked", file=sys.stderr)
            sys.exit(2)

        print(f"LANDLOCK_OK abi={abi}")
        sys.exit(0)
    """)

    # Make the package importable in the child via PYTHONPATH.
    # __file__ = src/sandbox_agent/landlock_probe.py -> package src = src/
    package_src = str(Path(__file__).resolve().parent.parent)

    try:
        result = subprocess.run(
            [sys.executable, "-c", child_script],
            capture_output=True,
            text=True,
            timeout=30,
            env={**os.environ, "PYTHONPATH": package_src},
        )
    except subprocess.TimeoutExpired:
        # A hung child counts as a failed probe, not a crash of the parent.
        logger.error("Landlock probe timed out after 30s")
        print("FATAL: Landlock probe timed out after 30s.", file=sys.stderr)
        sys.exit(1)

    if result.returncode != 0:
        logger.error(
            "Landlock probe FAILED (exit=%d):\nstdout: %s\nstderr: %s",
            result.returncode,
            result.stdout.strip(),
            result.stderr.strip(),
        )
        print(
            f"FATAL: Landlock probe failed. "
            f"Kernel may not support Landlock or /proc/sys/kernel/unprivileged_landlock is 0.\n"
            f"stderr: {result.stderr.strip()}",
            file=sys.stderr,
        )
        sys.exit(1)

    # Parse ABI version from the child's "LANDLOCK_OK abi=N" line.
    stdout = result.stdout.strip()
    abi_version = 0
    for line in stdout.splitlines():
        if line.startswith("LANDLOCK_OK"):
            for part in line.split():
                if part.startswith("abi="):
                    abi_version = int(part.split("=", 1)[1])
            break

    if abi_version < 1:
        logger.error("Landlock probe returned invalid ABI version: %s", stdout)
        sys.exit(1)

    logger.info("Landlock probe passed -- ABI version %d", abi_version)
    return abi_version
+""" + +import json +import logging +import os +from contextvars import ContextVar +from typing import Any, Optional + +logger = logging.getLogger(__name__) + +# Agent metadata (static, used in Resource and spans) +AGENT_NAME = os.getenv("SANDBOX_AGENT_NAME", "sandbox-legion") +AGENT_VERSION = "1.0.0" +AGENT_FRAMEWORK = "langgraph" + +# ContextVar to pass root span from middleware to agent code. +# This allows execute() to access the middleware-created root span +# even though trace.get_current_span() would return a child span. +_root_span_var: ContextVar = ContextVar('root_span', default=None) + + +def get_root_span(): + """Get the root span created by tracing middleware. + + Use this instead of trace.get_current_span() when you need to set + attributes on the root span (e.g., mlflow.spanOutputs for streaming). + + Returns: + The root span, or None if not in a traced request context. + """ + return _root_span_var.get() + + +# OpenInference semantic conventions +try: + from openinference.semconv.trace import SpanAttributes, OpenInferenceSpanKindValues + OPENINFERENCE_AVAILABLE = True +except ImportError: + OPENINFERENCE_AVAILABLE = False + logger.warning("openinference-semantic-conventions not available") + + +def _get_otlp_exporter(endpoint: str): + """Get HTTP OTLP exporter.""" + from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter + if not endpoint.endswith("/v1/traces"): + endpoint = endpoint.rstrip("/") + "/v1/traces" + return OTLPSpanExporter(endpoint=endpoint) + + +def setup_observability() -> bool: + """ + Set up OpenTelemetry tracing with OpenInference instrumentation. + + Call this ONCE at agent startup, before importing agent code. + NEVER raises — all exceptions are caught and logged. OTel issues + must never break the agent's main processing loop. + + Returns: + True if tracing was set up successfully, False otherwise. 
+ """ + service_name = os.getenv("OTEL_SERVICE_NAME", "sandbox-agent") + namespace = os.getenv("K8S_NAMESPACE_NAME", "team1") + otlp_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "") + + if not otlp_endpoint: + logger.warning( + "OTEL_EXPORTER_OTLP_ENDPOINT not set — tracing disabled. " + "Set this env var to enable OpenTelemetry tracing." + ) + return False + + try: + return _setup_observability_inner(service_name, namespace, otlp_endpoint) + except Exception: + logger.exception("OTel setup failed — tracing disabled (agent continues without tracing)") + return False + + +def _setup_observability_inner(service_name: str, namespace: str, otlp_endpoint: str) -> bool: + """Internal setup — may raise. Called by setup_observability() which catches all errors.""" + from opentelemetry import trace + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION + from opentelemetry.propagate import set_global_textmap + from opentelemetry.propagators.composite import CompositePropagator + from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator + from opentelemetry.baggage.propagation import W3CBaggagePropagator + + logger.info("=" * 60) + logger.info("Setting up OpenTelemetry observability") + logger.info(" Service: %s", service_name) + logger.info(" Agent: %s", AGENT_NAME) + logger.info(" Framework: %s", AGENT_FRAMEWORK) + logger.info(" Namespace: %s", namespace) + logger.info(" OTLP Endpoint: %s", otlp_endpoint) + logger.info("=" * 60) + + # Create resource with service and MLflow attributes. + # Resource attributes are STATIC and apply to ALL spans/traces. 
+ # See: https://mlflow.org/docs/latest/genai/tracing/opentelemetry/ + resource = Resource(attributes={ + # Standard OTEL service attributes + SERVICE_NAME: service_name, + SERVICE_VERSION: AGENT_VERSION, + "service.namespace": namespace, + "k8s.namespace.name": namespace, + # MLflow static metadata (applies to all traces) + "mlflow.traceName": AGENT_NAME, + "mlflow.source": service_name, + # GenAI static attributes + "gen_ai.agent.name": AGENT_NAME, + "gen_ai.agent.version": AGENT_VERSION, + "gen_ai.system": AGENT_FRAMEWORK, + }) + + # Create and configure tracer provider + tracer_provider = TracerProvider(resource=resource) + tracer_provider.add_span_processor( + BatchSpanProcessor(_get_otlp_exporter(otlp_endpoint)) + ) + trace.set_tracer_provider(tracer_provider) + + # Auto-instrument LangChain with OpenInference + try: + from openinference.instrumentation.langchain import LangChainInstrumentor + LangChainInstrumentor().instrument() + logger.info("LangChain instrumented with OpenInference") + except ImportError: + logger.warning("openinference-instrumentation-langchain not available") + + # Configure W3C Trace Context propagation + set_global_textmap(CompositePropagator([ + TraceContextTextMapPropagator(), + W3CBaggagePropagator(), + ])) + + # Instrument OpenAI for GenAI semantic conventions + try: + from opentelemetry.instrumentation.openai import OpenAIInstrumentor + OpenAIInstrumentor().instrument() + logger.info("OpenAI instrumented with GenAI semantic conventions") + except ImportError: + logger.warning("opentelemetry-instrumentation-openai not available") + + return True + + +# Tracer for manual spans — use OpenInference-compatible name +_tracer = None +TRACER_NAME = "openinference.instrumentation.agent" + + +def get_tracer(): + """Get tracer for creating manual spans.""" + from opentelemetry import trace + + global _tracer + if _tracer is None: + _tracer = trace.get_tracer(TRACER_NAME) + return _tracer + + +def enrich_current_span(**kwargs: Any) -> None: 
+ """Add attributes to the currently active span. + + Convenience helper so agent code can annotate spans without importing + opentelemetry directly. + + Args: + **kwargs: Attribute key-value pairs to set on the current span. + """ + from opentelemetry import trace + + span = trace.get_current_span() + if span and span.is_recording(): + for key, value in kwargs.items(): + span.set_attribute(key, value) + + +def create_tracing_middleware(): + """ + Create Starlette middleware that wraps all requests in a root tracing span. + + This middleware: + 1. Creates a root span BEFORE A2A handlers run + 2. Sets MLflow/GenAI attributes on the root span + 3. Parses A2A JSON-RPC request to extract user input + 4. Captures response to set output attributes + 5. For streaming (SSE) responses, sets status without capturing body + + Usage in agent.py: + from sandbox_agent.observability import create_tracing_middleware + app = server.build() + app.add_middleware(BaseHTTPMiddleware, dispatch=create_tracing_middleware()) + """ + from starlette.requests import Request + from starlette.responses import Response, StreamingResponse + from opentelemetry import trace, context + from opentelemetry.trace import Status, StatusCode, SpanKind + + async def tracing_middleware(request: Request, call_next): + # Skip non-API paths (health checks, agent card, etc.) 
+ if request.url.path in [ + "/health", "/ready", + "/.well-known/agent-card.json", + "/.well-known/agent-graph-card.json", + ]: + return await call_next(request) + + tracer = get_tracer() + + # Parse request body to extract user input and context + user_input = None + context_id = None + message_id = None + + try: + body = await request.body() + if body: + data = json.loads(body) + # A2A JSON-RPC format: params.message.parts[0].text + params = data.get("params", {}) + message = params.get("message", {}) + parts = message.get("parts", []) + if parts and isinstance(parts, list): + user_input = parts[0].get("text", "") + context_id = params.get("contextId") or message.get("contextId") + message_id = message.get("messageId") + except Exception as e: + logger.debug("Could not parse request body: %s", e) + + # Break parent chain to make this a true root span. + # Without this, the span would inherit parent from W3C Trace Context headers. + empty_ctx = context.Context() + detach_token = context.attach(empty_ctx) + + try: + # Create root span with correct GenAI naming convention. + # Per https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-agent-spans/ + # Span name: "invoke_agent {gen_ai.agent.name}" + span_name = f"invoke_agent {AGENT_NAME}" + + with tracer.start_as_current_span( + span_name, + kind=SpanKind.INTERNAL, # In-process agent (not remote service) + ) as span: + # Store span in ContextVar so agent code can access it. + # trace.get_current_span() in execute() returns the innermost + # span (A2A span), not our root span. 
+ span_token = _root_span_var.set(span) + + # === GenAI Semantic Conventions (Required) === + span.set_attribute("gen_ai.operation.name", "invoke_agent") + span.set_attribute("gen_ai.provider.name", AGENT_FRAMEWORK) + span.set_attribute("gen_ai.agent.name", AGENT_NAME) + span.set_attribute("gen_ai.agent.version", AGENT_VERSION) + + # Set input attributes (Prompt column in MLflow) + if user_input: + span.set_attribute("gen_ai.prompt", user_input[:1000]) + span.set_attribute("input.value", user_input[:1000]) + span.set_attribute("mlflow.spanInputs", user_input[:1000]) + + # Session tracking — use context_id or message_id as fallback + session_id = context_id or message_id + + if session_id: + span.set_attribute("gen_ai.conversation.id", session_id) + span.set_attribute("mlflow.trace.session", session_id) + span.set_attribute("session.id", session_id) + + # MLflow trace metadata (appears in trace list columns) + span.set_attribute("mlflow.spanType", "AGENT") + span.set_attribute("mlflow.traceName", AGENT_NAME) + span.set_attribute("mlflow.runName", f"{AGENT_NAME}-invoke") + span.set_attribute("mlflow.source", os.getenv("OTEL_SERVICE_NAME", "sandbox-agent")) + span.set_attribute("mlflow.version", AGENT_VERSION) + + # User tracking — extract from auth header if available + auth_header = request.headers.get("authorization", "") + if auth_header: + span.set_attribute("mlflow.user", "authenticated") + span.set_attribute("enduser.id", "authenticated") + else: + span.set_attribute("mlflow.user", "anonymous") + span.set_attribute("enduser.id", "anonymous") + + # OpenInference span kind (for Phoenix) + if OPENINFERENCE_AVAILABLE: + span.set_attribute( + SpanAttributes.OPENINFERENCE_SPAN_KIND, + OpenInferenceSpanKindValues.AGENT.value, + ) + + try: + # Call the next handler (A2A) + response = await call_next(request) + + # Try to capture response for output attributes. + # This only works for non-streaming responses. 
+ if isinstance(response, Response) and not isinstance( + response, StreamingResponse + ): + # Read response body — we MUST recreate response after + _chunks: list[bytes] = [] + async for chunk in response.body_iterator: + _chunks.append(chunk) + response_body = b"".join(_chunks) + + # Try to parse and extract output for MLflow + try: + if response_body: + resp_data = json.loads(response_body) + result = resp_data.get("result", {}) + artifacts = result.get("artifacts", []) + if artifacts: + parts = artifacts[0].get("parts", []) + if parts: + output_text = parts[0].get("text", "") + if output_text: + span.set_attribute( + "gen_ai.completion", output_text[:1000] + ) + span.set_attribute( + "output.value", output_text[:1000] + ) + span.set_attribute( + "mlflow.spanOutputs", output_text[:1000] + ) + except Exception as e: + logger.debug("Could not parse response body: %s", e) + + # Always recreate response since we consumed the iterator + span.set_status(Status(StatusCode.OK)) + return Response( + content=response_body, + status_code=response.status_code, + headers=dict(response.headers), + media_type=response.media_type, + ) + + # For streaming responses (SSE), just set status and return. + # Don't try to capture the full stream body. 
+ span.set_status(Status(StatusCode.OK)) + return response + + except Exception as e: + span.set_status(Status(StatusCode.ERROR, str(e))) + span.record_exception(e) + raise + finally: + # Reset the ContextVar to avoid leaking span reference + _root_span_var.reset(span_token) + finally: + # Always detach the context to restore parent chain for other requests + context.detach(detach_token) + + return tracing_middleware From 5d93d5b1f43041aeabaa4f7d75009be7d159a4a7 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:48:41 +0100 Subject: [PATCH 14/26] feat(sandbox): three-tier permission checker with deny/allow/HITL rules from settings.json Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/permissions.py | 403 ++++++++++++++++++ 1 file changed, 403 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/permissions.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/permissions.py b/a2a/sandbox_agent/src/sandbox_agent/permissions.py new file mode 100644 index 00000000..9e3a8190 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/permissions.py @@ -0,0 +1,403 @@ +"""Three-tier permission checker modeled after Claude Code's settings.json. + +Every tool call from the LangGraph agent is checked against allow/deny rules +before execution: + + DENY -- operation matches a deny rule (rejected immediately) + ALLOW -- operation matches an allow rule (auto-executed) + HITL -- operation matches neither (triggers LangGraph interrupt() for + human approval) + +Rules use the format ``type(prefix:glob)`` where *type* is ``shell``, +``file``, ``network``, etc. Examples: + + shell(grep:*) -- any shell command starting with "grep" + file(read:/workspace/**) -- file reads anywhere under /workspace/ + network(outbound:*) -- any outbound network access + +Deny rules are checked **first** (deny takes precedence over allow). 
+""" + +from __future__ import annotations + +import enum +import fnmatch +import re +from typing import Any + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +# Pattern: ``type(value:glob)`` +_RULE_RE = re.compile(r"^(?P[a-z]+)\((?P.+)\)$") + + +class PermissionResult(enum.Enum): + """Outcome of a permission check.""" + + ALLOW = "allow" + DENY = "deny" + HITL = "hitl" + + +class PermissionChecker: + """Evaluate operations against a settings dict with allow/deny rules. + + Parameters + ---------- + settings: + Parsed *settings.json* dict. Expected shape:: + + { + "context_workspace": "/workspace/${CONTEXT_ID}", + "permissions": { + "allow": ["shell(grep:*)", ...], + "deny": ["shell(sudo:*)", ...] + } + } + """ + + def __init__(self, settings: dict[str, Any]) -> None: + workspace = self._resolve_workspace(settings) + perms = settings.get("permissions", {}) + self._deny_rules = self._parse_rules(perms.get("deny", []), workspace) + self._allow_rules = self._parse_rules(perms.get("allow", []), workspace) + + # ------------------------------------------------------------------ + # Core method + # ------------------------------------------------------------------ + + # Shell metacharacters that separate independent commands. + _COMPOUND_SEPARATORS = ("&&", "||", ";", "|") + + def check(self, operation_type: str, operation: str) -> PermissionResult: + """Return ALLOW, DENY, or HITL for a given *operation_type* + *operation*. + + Parameters + ---------- + operation_type: + High-level category, e.g. ``"shell"``, ``"file"``, ``"network"``. + operation: + The concrete operation string, e.g. ``"grep -r foo ."`` for a + shell command or ``"read:/workspace/ctx1/main.py"`` for a file + operation. + """ + # For shell commands with compound operators (&&, ||, ;, |), + # check each segment independently. 
+ if operation_type == "shell": + segments = self._split_compound(operation) + if len(segments) > 1: + return self._check_compound(segments) + + return self._check_single(operation_type, operation) + + def _check_single(self, operation_type: str, operation: str) -> PermissionResult: + """Check a single (non-compound) operation.""" + # Deny rules are checked first -- deny takes precedence. + if self._matches_any(operation_type, operation, self._deny_rules): + return PermissionResult.DENY + + # For shell operations, also check for interpreter bypass: + # e.g. bash -c "curl ..." should be denied if curl is denied. + # Additionally, if the outer command is an interpreter (bash/sh/python) + # and embeds unknown commands, route to HITL rather than auto-allowing. + if operation_type == "shell": + embedded_commands = self.check_interpreter_bypass(operation) + if embedded_commands: + for embedded in embedded_commands: + if self._matches_any("shell", embedded, self._deny_rules): + return PermissionResult.DENY + # Embedded commands exist but none are denied. Route to HITL + # so a human reviews what the interpreter will execute, rather + # than auto-allowing via the outer shell(bash:*) rule. + return PermissionResult.HITL + + if self._matches_any(operation_type, operation, self._allow_rules): + return PermissionResult.ALLOW + + return PermissionResult.HITL + + def _check_compound(self, segments: list[str]) -> PermissionResult: + """Check each segment of a compound shell command. + + All segments must be ALLOW for the compound to be ALLOW. + Any DENY makes the whole compound DENY. + Otherwise HITL. 
+ """ + has_hitl = False + for seg in segments: + result = self._check_single("shell", seg) + if result is PermissionResult.DENY: + return PermissionResult.DENY + if result is PermissionResult.HITL: + has_hitl = True + return PermissionResult.HITL if has_hitl else PermissionResult.ALLOW + + @classmethod + def _split_compound(cls, operation: str) -> list[str]: + """Split a shell command on compound operators (&&, ||, ;, |). + + Returns a list of stripped command segments. If no operators are + found, returns a single-element list with the original command. + """ + # Replace multi-char operators first to avoid confusion with single | + temp = operation + sentinel = "\x00" + for sep in ("&&", "||", ";"): + temp = temp.replace(sep, sentinel) + # Now split on single | (but not if it was part of || already replaced) + temp = temp.replace("|", sentinel) + segments = [s.strip() for s in temp.split(sentinel) if s.strip()] + return segments if segments else [operation] + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + @staticmethod + def _resolve_workspace(settings: dict[str, Any]) -> str: + """Derive the workspace root from ``context_workspace``. + + The value may contain ``${CONTEXT_ID}`` (or similar) placeholders. + We strip those so that glob rules like ``${WORKSPACE}/**`` can be + expanded to the bare workspace prefix (e.g. ``/workspace``). + """ + raw = settings.get("context_workspace", "/workspace") + # Remove a trailing ``/${SOME_VAR}`` placeholder (e.g. ``/${CONTEXT_ID}``) + # so we keep only the static prefix. + return re.sub(r"/\$\{[^}]+\}$", "", raw) + + @staticmethod + def _parse_rules( + raw_rules: list[str], workspace: str + ) -> list[tuple[str, str]]: + """Parse rule strings into ``(operation_type, glob_pattern)`` pairs. + + ``${WORKSPACE}`` inside a rule body is expanded to *workspace*. 
+ """ + parsed: list[tuple[str, str]] = [] + for rule in raw_rules: + m = _RULE_RE.match(rule) + if m is None: + continue # skip malformed rules + rule_type = m.group("type") + body = m.group("body") + # Expand ${WORKSPACE} variable + body = body.replace("${WORKSPACE}", workspace) + parsed.append((rule_type, body)) + return parsed + + @staticmethod + def _matches_any( + operation_type: str, + operation: str, + rules: list[tuple[str, str]], + ) -> bool: + """Return True if *operation* matches at least one rule.""" + for rule_type, pattern in rules: + if rule_type != operation_type: + continue + if PermissionChecker._match_rule(pattern, operation_type, operation): + return True + return False + + @staticmethod + def _match_rule(pattern: str, operation_type: str, operation: str) -> bool: + """Match a single rule body against the operation. + + Rule body format is ``prefix:glob`` (the part inside the parentheses). + + For **shell** operations the *prefix* may be multi-word (e.g. + ``pip install``, ``git clone``). The matcher checks whether the + operation starts with the prefix. If the glob part is ``*`` (the + most common case), any suffix is accepted. + + For **file** / **network** operations the operation string is + expected to be ``action:path`` (e.g. ``read:/workspace/foo.py``). + The rule body is ``action:path_glob`` so we split on the first + colon of both and compare action + fnmatch on the path. + """ + if operation_type == "shell": + return PermissionChecker._match_shell(pattern, operation) + return PermissionChecker._match_structured(pattern, operation) + + # -- shell matching --------------------------------------------------- + + # Interpreters that can execute arbitrary code via -c / -e flags. + _INTERPRETERS = frozenset({"bash", "sh", "python", "python3", "perl", "ruby", "node"}) + + # Flags that take an inline command string as the next argument. 
+ _EXEC_FLAGS = frozenset({"-c", "-e", "--eval"}) + + @staticmethod + def _match_shell(pattern: str, operation: str) -> bool: + """Match a shell rule pattern against a concrete command string. + + *pattern* has the form ``command_prefix:glob`` where the glob is + almost always ``*``. ``command_prefix`` may contain spaces (e.g. + ``pip install``, ``rm -rf /``). + """ + # Split only on the *last* colon so multi-word prefixes survive. + colon_idx = pattern.rfind(":") + if colon_idx == -1: + return False + prefix = pattern[:colon_idx] + glob_part = pattern[colon_idx + 1:] + + if not operation: + return False + + # Wildcard prefix (*) matches any command + if prefix == "*": + return fnmatch.fnmatch(operation, glob_part) + + # The operation must start with the prefix (case-sensitive). + if not operation.startswith(prefix): + return False + + # What comes after the prefix (may be empty). + remainder = operation[len(prefix):] + + # If there is a remainder, it must be separated by a space or be + # empty (exact match). This prevents "grep" matching "grepping". + if remainder and not remainder[0] == " ": + return False + + remainder = remainder.lstrip() + + # Match the remainder against the glob (``*`` matches everything). + return fnmatch.fnmatch(remainder, glob_part) + + @classmethod + def check_interpreter_bypass(cls, operation: str) -> list[str]: + """Extract embedded commands from interpreter invocations. + + If *operation* uses an interpreter (bash, sh, python, etc.) with + an inline execution flag (``-c``, ``-e``), extract the embedded + command string so it can be checked against deny rules separately. + + Returns a list of embedded command strings (empty if none found). + """ + if not operation: + return [] + + parts = operation.split() + if not parts: + return [] + + # Check if the command starts with a known interpreter. + cmd = parts[0].rsplit("/", 1)[-1] # handle /usr/bin/bash etc. 
+ if cmd not in cls._INTERPRETERS: + return [] + + embedded: list[str] = [] + i = 1 + while i < len(parts): + if parts[i] in cls._EXEC_FLAGS and i + 1 < len(parts): + # Everything after the flag is the inline command. + inline = " ".join(parts[i + 1:]) + # Strip surrounding quotes if present. + if len(inline) >= 2 and inline[0] in ('"', "'") and inline[-1] == inline[0]: + inline = inline[1:-1] + embedded.append(inline) + break + i += 1 + + # Split embedded commands on shell metacharacters: |, &&, ||, ; + # so that "curl evil.com && rm -rf /" checks each segment. + for emb in list(embedded): + for sep in ("&&", "||", ";", "|"): + if sep in emb: + for segment in emb.split(sep): + segment = segment.strip() + if segment and segment not in embedded: + embedded.append(segment) + + return embedded + + # -- structured (file / network) matching ---------------------------- + + @staticmethod + def _match_structured(pattern: str, operation: str) -> bool: + """Match ``action:path_glob`` against ``action:concrete_path``. + + Both *pattern* and *operation* are expected to contain at least one + colon separating the action from the path. + """ + p_colon = pattern.find(":") + o_colon = operation.find(":") + if p_colon == -1 or o_colon == -1: + return False + + p_action = pattern[:p_colon] + p_path_glob = pattern[p_colon + 1:] + + o_action = operation[:o_colon] + o_path = operation[o_colon + 1:] + + if p_action != o_action: + return False + + # The path glob may itself end with ``:*`` from the rule syntax + # (e.g. ``/etc/shadow:*``). Strip a trailing ``:*`` from the + # glob -- the colon-star is a "match any extra args" marker in the + # rule syntax, not part of the filesystem path. + if p_path_glob.endswith(":*"): + p_path_glob = p_path_glob[:-2] + + # If the glob is now empty, it means the rule was something like + # ``network(outbound:*)`` -- match everything. + if p_path_glob == "*": + return True + + # Use fnmatch for glob-style matching (supports ``**``). 
+ # fnmatch doesn't natively handle ``**`` the way gitignore does, + # so we convert ``**`` to a sentinel and back. + return _glob_match(p_path_glob, o_path) + + +# --------------------------------------------------------------------------- +# Glob helper +# --------------------------------------------------------------------------- + + +def _glob_match(pattern: str, text: str) -> bool: + """Glob-style match that treats ``**`` as "zero or more path segments". + + Python's :func:`fnmatch.fnmatch` treats ``*`` as "anything except + nothing" but does *not* cross ``/`` boundaries in the same way as + gitignore's ``**``. This helper converts ``**`` patterns into + regular expressions for correct matching. + """ + # Fast path: exact match or simple star. + if pattern == text: + return True + + # Convert the glob to a regex. + # ``**`` -> match anything including ``/`` + # ``*`` -> match anything except ``/`` + # ``?`` -> match a single char except ``/`` + parts: list[str] = [] + i = 0 + while i < len(pattern): + c = pattern[i] + if c == "*": + if i + 1 < len(pattern) and pattern[i + 1] == "*": + parts.append(".*") + i += 2 + # Skip a following ``/`` so ``**/`` works correctly. 
+ if i < len(pattern) and pattern[i] == "/": + i += 1 + continue + parts.append("[^/]*") + elif c == "?": + parts.append("[^/]") + elif c in r"\.[](){}+^$|": + parts.append("\\" + c) + else: + parts.append(c) + i += 1 + + regex = "^" + "".join(parts) + "$" + return re.match(regex, text) is not None From 205094ac3ceeb640adb9a6c7a33372711db3ffba Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:48:47 +0100 Subject: [PATCH 15/26] feat(sandbox): append-only nested plan store with main steps and alternative subplans Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/plan_store.py | 330 ++++++++++++++++++ 1 file changed, 330 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/plan_store.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/plan_store.py b/a2a/sandbox_agent/src/sandbox_agent/plan_store.py new file mode 100644 index 00000000..47501753 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/plan_store.py @@ -0,0 +1,330 @@ +"""Append-only nested plan container. + +Stores the agent's execution plan as a nested structure of main steps +and subplans. Only additions are allowed after initial creation — the +replanner can add new main steps (after all existing are terminal) or +create alternative subplans within a step. 
+ +Structure:: + + { + "version": 1, + "steps": { + "1": { + "description": "Clone the repo", + "status": "done", + "subplans": { + "a": { + "substeps": { + "1": {"description": "git clone ...", "status": "done"}, + }, + "status": "done", + "created_by": "planner", + } + }, + "active_subplan": "a", + }, + "2": { + "description": "Analyze CI logs", + "status": "running", + "subplans": { + "a": {"substeps": {...}, "status": "failed", "created_by": "planner"}, + "b": {"substeps": {...}, "status": "running", "created_by": "replanner"}, + }, + "active_subplan": "b", + }, + }, + } + +Status transitions (one-way): + pending → running → done | failed | cancelled +""" + +from __future__ import annotations + +import logging +from typing import Any + +logger = logging.getLogger(__name__) + +# Valid status values and their terminal flag +_TERMINAL = frozenset({"done", "failed", "cancelled"}) +_VALID_STATUS = frozenset({"pending", "running"}) | _TERMINAL + + +# --------------------------------------------------------------------------- +# Construction +# --------------------------------------------------------------------------- + + +def create_plan(steps: list[str], creator: str = "planner") -> dict[str, Any]: + """Create a new plan store from a list of step descriptions. + + Each step gets a single subplan "a" with one substep matching + the step description (for simple plans where steps = substeps). 
+ """ + plan: dict[str, Any] = {"version": 1, "steps": {}} + for i, desc in enumerate(steps): + step_key = str(i + 1) + plan["steps"][step_key] = { + "description": desc, + "status": "pending", + "subplans": { + "a": { + "substeps": { + "1": {"description": desc, "status": "pending"}, + }, + "status": "pending", + "created_by": creator, + }, + }, + "active_subplan": "a", + } + # Mark first step as running + if plan["steps"]: + plan["steps"]["1"]["status"] = "running" + plan["steps"]["1"]["subplans"]["a"]["status"] = "running" + return plan + + +# --------------------------------------------------------------------------- +# Mutations (append-only) +# --------------------------------------------------------------------------- + + +def add_steps( + plan: dict[str, Any], + new_steps: list[str], + creator: str = "replanner", +) -> dict[str, Any]: + """Add new main steps to the plan. + + Only allowed when ALL existing steps are terminal (done/failed/cancelled). + Returns a new plan dict (does not mutate in place). + + Raises ValueError if preconditions are not met. 
+ """ + if creator != "replanner": + raise ValueError(f"Only replanner can add steps, got creator={creator}") + + steps = plan.get("steps", {}) + non_terminal = [ + k for k, s in steps.items() + if s.get("status") not in _TERMINAL + ] + if non_terminal: + raise ValueError( + f"Cannot add steps: steps {non_terminal} are still active" + ) + + new_plan = _deep_copy(plan) + next_idx = max((int(k) for k in steps), default=0) + 1 + for i, desc in enumerate(new_steps): + step_key = str(next_idx + i) + new_plan["steps"][step_key] = { + "description": desc, + "status": "pending", + "subplans": { + "a": { + "substeps": { + "1": {"description": desc, "status": "pending"}, + }, + "status": "pending", + "created_by": creator, + }, + }, + "active_subplan": "a", + } + + # Mark first new step as running + first_new = str(next_idx) + if first_new in new_plan["steps"]: + new_plan["steps"][first_new]["status"] = "running" + new_plan["steps"][first_new]["subplans"]["a"]["status"] = "running" + + logger.info( + "Added %d steps (start=%s) by %s", len(new_steps), first_new, creator, + ) + return new_plan + + +def add_alternative_subplan( + plan: dict[str, Any], + step_key: str, + substeps: list[str], +) -> tuple[dict[str, Any], str]: + """Create an alternative subplan for a step (replanner only). + + Returns (new_plan, subplan_key) where subplan_key is the new key (b, c, ...). + The active_subplan is switched to the new one. 
def add_alternative_subplan(
    plan: dict[str, Any],
    step_key: str,
    substeps: list[str],
) -> tuple[dict[str, Any], str]:
    """Create an alternative subplan for a step (replanner only).

    Returns (new_plan, subplan_key) where subplan_key is the new key (b, c, ...).
    The active_subplan is switched to the new one.

    Raises:
        ValueError: If ``step_key`` does not exist.
    """
    new_plan = _deep_copy(plan)
    step = new_plan["steps"].get(step_key)
    if step is None:
        raise ValueError(f"Step {step_key} not found")

    # Subplan keys are letters: "a" for the original, then "b", "c", ...
    existing_keys = sorted(step["subplans"].keys())
    next_key = chr(ord("a") + len(existing_keys))

    step["subplans"][next_key] = {
        "substeps": {
            str(i + 1): {"description": desc, "status": "pending"}
            for i, desc in enumerate(substeps)
        },
        "status": "running",
        "created_by": "replanner",
    }
    step["active_subplan"] = next_key
    # Re-open the step: a fresh alternative means work on it resumes.
    step["status"] = "running"

    logger.info(
        "Created alternative subplan '%s' for step %s (%d substeps)",
        next_key, step_key, len(substeps),
    )
    return new_plan, next_key


# ---------------------------------------------------------------------------
# Status updates
# ---------------------------------------------------------------------------


def set_step_status(
    plan: dict[str, Any],
    step_key: str,
    status: str,
) -> dict[str, Any]:
    """Update a step's status. Validates one-way transitions.

    Terminal steps are frozen: a late update is logged and ignored rather
    than raised, since callers may race on completion.

    Raises:
        ValueError: If ``status`` is unknown or ``step_key`` is missing.
    """
    if status not in _VALID_STATUS:
        raise ValueError(f"Invalid status: {status}")
    new_plan = _deep_copy(plan)
    step = new_plan["steps"].get(step_key)
    if step is None:
        raise ValueError(f"Step {step_key} not found")
    old = step["status"]
    if old in _TERMINAL:
        logger.warning("Step %s already terminal (%s), ignoring → %s", step_key, old, status)
        return new_plan
    step["status"] = status
    # Also update the active subplan status so step and subplan agree.
    active = step.get("active_subplan", "a")
    if active in step.get("subplans", {}):
        sp = step["subplans"][active]
        if sp.get("status") not in _TERMINAL:
            sp["status"] = status
    return new_plan


def set_substep_status(
    plan: dict[str, Any],
    step_key: str,
    substep_key: str,
    status: str,
    result_summary: str = "",
    tool_calls: list[str] | None = None,
) -> dict[str, Any]:
    """Update a substep's status within the active subplan.

    Optionally records a result summary and the tool calls made.  Like
    :func:`set_step_status`, terminal substeps are frozen (the module
    contract is one-way transitions): a late update is logged and ignored.

    Raises:
        ValueError: If the status is unknown, or the step/subplan/substep
            does not exist.
    """
    if status not in _VALID_STATUS:
        raise ValueError(f"Invalid status: {status}")
    new_plan = _deep_copy(plan)
    step = new_plan["steps"].get(step_key)
    if step is None:
        raise ValueError(f"Step {step_key} not found")
    active = step.get("active_subplan", "a")
    subplan = step.get("subplans", {}).get(active)
    if subplan is None:
        raise ValueError(f"Subplan {active} not found in step {step_key}")
    substep = subplan.get("substeps", {}).get(substep_key)
    if substep is None:
        raise ValueError(f"Substep {substep_key} not found in subplan {active}")
    # Enforce the one-way transition contract, mirroring set_step_status.
    old = substep.get("status")
    if old in _TERMINAL:
        logger.warning(
            "Substep %s/%s already terminal (%s), ignoring → %s",
            step_key, substep_key, old, status,
        )
        return new_plan
    substep["status"] = status
    if result_summary:
        substep["result_summary"] = result_summary
    if tool_calls:
        substep["tool_calls"] = tool_calls
    return new_plan


# ---------------------------------------------------------------------------
# Queries
# ---------------------------------------------------------------------------


def get_current_step(plan: dict[str, Any]) -> tuple[str, dict[str, Any]] | None:
    """Return (step_key, step_dict) for the first non-terminal step, else None."""
    for key in sorted(plan.get("steps", {}), key=int):
        step = plan["steps"][key]
        if step.get("status") not in _TERMINAL:
            return key, step
    return None


def get_active_substep(plan: dict[str, Any], step_key: str) -> tuple[str, dict] | None:
    """Return (substep_key, substep_dict) for the first pending/running substep.

    Looks only inside the step's active subplan; returns None when the step
    or subplan is missing, or when every substep is terminal.
    """
    step = plan.get("steps", {}).get(step_key)
    if step is None:
        return None
    active = step.get("active_subplan", "a")
    subplan = step.get("subplans", {}).get(active)
    if subplan is None:
        return None
    for sk in sorted(subplan.get("substeps", {}), key=int):
        ss = subplan["substeps"][sk]
        if ss.get("status") not in _TERMINAL:
            return sk, ss
    return None


def step_count(plan: dict[str, Any]) -> int:
    """Total number of main steps."""
    return len(plan.get("steps", {}))


def done_count(plan: dict[str, Any]) -> int:
    """Number of completed main steps."""
    return sum(1 for s in plan.get("steps", {}).values() if s.get("status") == "done")
-> bool: + """True if ALL main steps are in a terminal status.""" + steps = plan.get("steps", {}) + return bool(steps) and all(s.get("status") in _TERMINAL for s in steps.values()) + + +def to_flat_plan(plan: dict[str, Any]) -> list[str]: + """Convert to flat list of step descriptions (backward compat).""" + return [ + plan["steps"][k]["description"] + for k in sorted(plan.get("steps", {}), key=int) + ] + + +def to_flat_plan_steps(plan: dict[str, Any]) -> list[dict[str, Any]]: + """Convert to flat PlanStep list (backward compat with serializer/UI).""" + result = [] + for key in sorted(plan.get("steps", {}), key=int): + step = plan["steps"][key] + active = step.get("active_subplan", "a") + subplan = step.get("subplans", {}).get(active, {}) + alt_count = len(step.get("subplans", {})) - 1 # alternatives (excl. original) + result.append({ + "index": int(key) - 1, # 0-based for compat + "description": step["description"], + "status": step["status"], + "active_subplan": active, + "alternative_count": alt_count, + "substeps": list(subplan.get("substeps", {}).values()), + "created_by": subplan.get("created_by", "planner"), + }) + return result + + +# --------------------------------------------------------------------------- +# Internal +# --------------------------------------------------------------------------- + + +def _deep_copy(d: dict) -> dict: + """Fast deep copy for JSON-compatible dicts.""" + import json + return json.loads(json.dumps(d)) From 2b060dc3909baf0b9db7407939568ef90219eb14 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:48:53 +0100 Subject: [PATCH 16/26] feat(sandbox): system prompt templates for planner, executor, reflector, and reporter nodes Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/prompts.py | 235 ++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/prompts.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/prompts.py 
"""System prompt templates for the plan-execute-reflect reasoning loop.

Each prompt corresponds to a reasoning node:
- PLANNER_SYSTEM: Decomposes user requests into numbered plans
- EXECUTOR_SYSTEM: Executes individual plan steps with tools
- REFLECTOR_SYSTEM: Reviews step output, decides continue/replan/done
- REPORTER_SYSTEM: Summarizes accumulated results into final answer

All prompts receive the workspace preamble via ``with_workspace()``.

Templates use :meth:`str.format` placeholders (``{workspace_path}``,
``{current_step}``, ...), so any literal brace in a template would need
to be doubled.
"""

# ---------------------------------------------------------------------------
# Universal workspace preamble — injected into ALL system prompts
# ---------------------------------------------------------------------------

WORKSPACE_PREAMBLE = """\
WORKSPACE (MOST IMPORTANT RULE):
Your workspace absolute path is: {workspace_path}
ALL file access MUST use this path prefix.

- shell commands: ALWAYS use absolute paths starting with {workspace_path}/
  Example: `ls {workspace_path}/repos/kagenti`
  Example: `cd {workspace_path}/repos/kagenti && gh run list`
  Example: `cd {workspace_path}/repos/kagenti && gh run view 123 --log-failed > {workspace_path}/output/ci.log`
- file_read, file_write, grep, glob: use RELATIVE paths (e.g. `output/report.md`, `repos/kagenti/README.md`).
  These tools resolve paths relative to the workspace automatically.
- NEVER use `../../` or guess paths. NEVER use bare `/workspace/` without the session ID.

Pre-created subdirs: repos/ (clone here), output/ (reports/logs), data/, scripts/
"""


def with_workspace(template: str, workspace_path: str, **kwargs: object) -> str:
    """Prepend the workspace preamble to a system prompt template and format.

    ``kwargs`` values are passed to :meth:`str.format`, so non-string values
    (e.g. step numbers) are fine.  If the combined string has placeholders
    not covered by the arguments, formatting degrades gracefully: first the
    template is formatted alone, and as a last resort it is appended raw.

    Usage::

        system_content = with_workspace(
            EXECUTOR_SYSTEM,
            workspace_path="/workspace/abc123",
            current_step=1,
            step_text="Clone repo",
        )
    """
    full = WORKSPACE_PREAMBLE + "\n" + template
    try:
        return full.format(workspace_path=workspace_path, **kwargs)
    except (KeyError, IndexError):
        # Fallback: try formatting without workspace if template has unknown keys
        try:
            return WORKSPACE_PREAMBLE.format(workspace_path=workspace_path) + "\n" + template.format(**kwargs)
        except (KeyError, IndexError):
            # Last resort: preamble formatted, template left verbatim.
            return WORKSPACE_PREAMBLE.format(workspace_path=workspace_path) + "\n" + template


# Used by the planner node to turn a user request into a numbered plan.
PLANNER_SYSTEM = """\
You are a planning module for a sandboxed coding assistant.

Given the user's request and any prior execution results, produce a concise
numbered plan. Each step should be a single actionable item that can be
executed with the available tools (shell, file_read, file_write, grep, glob,
web_fetch, explore).

IMPORTANT: Almost every request requires tools. The user is asking you to DO
things, not just talk. Create file = file_write. Run command = shell.
Clone repo = shell. Read file = file_read. Search code = grep/glob.

Rules:
- Every step should name the specific tool to use.
- Keep steps concrete and tool-oriented — no vague "analyze" or "think" steps.
- For multi-step analysis, debugging, or investigation tasks, add a final
  step: "Write findings summary to report.md" with sections: Problem,
  Investigation, Root Cause, Resolution.
- Number each step starting at 1.
- Output ONLY the numbered list, nothing else.

Example ("create a file hello.txt with 'hello world'"):
1. Use file_write to create hello.txt with content "hello world".

Example ("list files"):
1. Run `ls -la` in the workspace using shell.

Example ("create a Python project with tests"):
1. Create directory structure: shell(`mkdir -p src tests`).
2. Write src/main.py using file_write.
3. Write tests/test_main.py using file_write.
4. Run tests: shell(`python -m pytest tests/`).

Example ("analyze CI failures for owner/repo PR #758"):
1. Clone repo: shell(`git clone https://github.com/owner/repo.git {workspace_path}/repos/repo`).
2. List failures: shell(`cd {workspace_path}/repos/repo && gh run list --status failure --limit 5`).
3. Download logs: shell(`cd {workspace_path}/repos/repo && gh run view --log-failed > {workspace_path}/output/ci-run.log`).
4. Extract errors: grep(`FAILED|ERROR|AssertionError` in output/ci-run.log).
5. Write findings to report.md with sections: Root Cause, Impact, Fix.

IMPORTANT for gh CLI:
- GH_TOKEN and GITHUB_TOKEN are ALREADY set in the environment. Do NOT
  run `export GH_TOKEN=...` — it's unnecessary and will break auth.
- Always clone the target repo FIRST, then `cd` into it before gh commands.
- gh auto-detects the repo from git remote "origin" — it MUST run inside the cloned repo.
- Use `cd {workspace_path}/repos/ && gh ` in a single shell call.
"""

# Used by the executor node for each plan step; expects current_step,
# step_text, tool_call_count and max_tool_calls placeholders.
EXECUTOR_SYSTEM = """\
You are a sandboxed coding assistant executing step {current_step} of a plan.

Current step: {step_text}
Tool calls so far this step: {tool_call_count}/{max_tool_calls}

Available tools:
- **shell**: Execute a shell command. Returns stdout+stderr and exit code.
- **file_read**: Read a file from the workspace.
- **file_write**: Write content to a file in the workspace.
- **grep**: Search file contents with regex. Faster than shell grep, workspace-scoped.
- **glob**: Find files by pattern (e.g. '**/*.py'). Faster than shell find.
- **web_fetch**: Fetch content from a URL (allowed domains only).
- **explore**: Spawn a read-only sub-agent for codebase research.


EXECUTION MODEL — step-by-step with micro-reflection:
You operate in a loop: call ONE tool → see the result → decide what to do next.
After each tool result, THINK about what happened before calling the next tool.
- Did the command succeed? Check the exit code and output.
- If it failed, adapt your approach — don't blindly retry the same thing.
- If it succeeded, what's the logical next action for this step?

CRITICAL RULES:
- Call exactly ONE tool per response. You will see the result and can call another.
- You MUST use the function/tool calling API — not text descriptions of calls.
- DO NOT write or invent command output. Call the tool, wait for the result.
- If a tool call fails, report the ACTUAL error — do not invent output.
- Slash commands like /rca:ci are for humans, not for you. You use tools.
- If you cannot call a tool for any reason, respond with exactly:
  CANNOT_CALL_TOOL: 

STEP BOUNDARY — CRITICAL:
- You are ONLY executing step {current_step}: "{step_text}"
- When THIS step is done, STOP calling tools immediately.
- Do NOT start the next step. The reflector will advance you.
- Summarize what you accomplished and stop.

When the step is COMPLETE (goal achieved or cannot be achieved), stop calling
tools and summarize what you accomplished with the actual tool output.

## Handling Large Output
Tool output is truncated to 10KB. For commands that produce large output:
- Redirect to a file: `command > {workspace_path}/output/result.json`
- Then analyze with grep: grep(`pattern` in output/result.json)

## Debugging Guidelines
- If a command fails with "unknown flag" or "invalid option" → run `command --help`
  to see valid flags. Do NOT guess flag names.
- After each tool call, analyze the output carefully before deciding the next action.
- Check error output (stderr) and exit code before retrying.
- If you get the same result twice → the step is done, stop and summarize.
"""

# Used by the reflector node; its output must be a single decision word.
REFLECTOR_SYSTEM = """\
You are a reflection module reviewing the output of a plan step.

Plan:
{plan_text}

Current step ({current_step} of {total_steps}): {step_text}
Step result: {step_result}
Remaining steps: {remaining_steps}

Iteration: {iteration} of {max_iterations}
Replan count so far: {replan_count} (higher counts mean more rework — weigh this when deciding)
Tool calls this iteration: {tool_calls_this_iter}
Recent decisions: {recent_decisions}
{replan_history}

STALL DETECTION:
- If the executor made 0 tool calls, the step likely FAILED.
- If the step result is just text describing what WOULD be done (not actual
  tool output), that means the executor did not call any tools. Treat as failure.

RETRY vs REPLAN:
- **retry** = same step failed, try a DIFFERENT approach for THIS step only.
  Example: `gh run view --log-failed` failed → retry with `gh api` instead.
  The executor re-runs the current step with a modified brief. Completed steps
  are preserved. Use retry FIRST before replan.
- **replan** = the overall approach is fundamentally wrong. Creates a new plan
  but preserves already-completed steps (never restarts from step 1).
  Only use replan if retry won't help (e.g., wrong repo cloned, wrong PR).
- Do NOT replan with the same approach that already failed.
- A high replan count suggests diminishing returns — consider "done" with
  partial results.

DECISION PROCESS:
1. Did the current step succeed? Check tool output for real results (not just "no output").
2. If it failed, can you try a different approach for the SAME step? → retry.
3. If the whole approach is wrong → replan.
4. If step succeeded and remaining steps exist → continue.
5. If ALL plan steps are complete (remaining = NONE) → done.

Decide ONE of the following (output ONLY the decision word):
- **continue** — Current step done, remaining steps exist → move to next step.
- **retry** — Current step failed, re-execute with a different approach.
- **replan** — Overall approach is wrong, create new plan (keeps done steps).
- **done** — ALL plan steps complete (remaining = NONE), task is fully answered.
- **hitl** — Human input is needed to proceed.

Output the single word: continue, retry, replan, done, or hitl.
"""

# Used by the reporter node to produce the final user-facing answer.
REPORTER_SYSTEM = """\
You are a reporting module. Summarize the results of all executed steps
into a clear, concise final answer for the user.

Plan:
{plan_text}

Step status:
{step_status_text}

Step results:
{results_text}

{limit_note}

RULES:
- Only report facts from actual tool output — NEVER fabricate data.
- If a step FAILED, explain WHY it failed (include the error message).
- If steps are PARTIAL, summarize what was accomplished so far.
- If no real data was obtained, say "Unable to retrieve data" rather than
  making up results.
- Include relevant command output, file paths, or next steps.
- Do NOT include the plan itself — just the results.
- Do NOT say "The task has been completed" — present the actual findings.
- Do NOT echo or repeat these instructions in your response.
- Start your response directly with the summary content.
- List ALL workspace file paths in full form (e.g. repos/kagenti/report.md).
"""
+ +Five LangGraph node functions implement structured multi-step reasoning: + +1. **router** — Entry point. Checks plan_status to decide: resume existing + plan, replan with new context, or start fresh. +2. **planner** — Decomposes the user request into numbered steps. + Detects simple (single-step) requests and marks them done-after-execute. +3. **executor** — Runs the current plan step with bound tools (existing + react pattern). +4. **reflector** — Reviews execution output, decides: ``continue`` (next + step), ``replan``, ``done``, or ``hitl``. Updates per-step status. +5. **reporter** — Formats accumulated step results into a final answer. + Sets terminal ``plan_status`` based on how the loop ended. + +Plan state persists across A2A turns via the LangGraph checkpointer. +When the user or looper sends "continue", the router resumes execution +at the current step. Any other message triggers a replan that sees the +previous plan's progress. + +# TODO: Research explicit PlanStore approach as alternative to checkpointer. +# Pros of PlanStore: plan queryable outside graph (UI), full schema control, +# plan versioning independent of LangGraph internals. +# Cons: more code, risk of plan/checkpointer state divergence, need custom +# persistence layer. Current approach (A) uses checkpointer for atomic +# state which is simpler and less error-prone. +""" + +from __future__ import annotations + +import json +import logging +import re +import uuid +from typing import Any, TypedDict + +from langchain_core.messages import AIMessage, SystemMessage, ToolMessage + +from sandbox_agent.budget import AgentBudget +from sandbox_agent import plan_store as ps + +# openai raises APIStatusError for non-2xx responses (e.g. 
402 from the budget proxy) +try: + from openai import APIStatusError as _APIStatusError +except ImportError: + _APIStatusError = None # type: ignore[assignment,misc] + + +def _is_budget_exceeded_error(exc: Exception) -> bool: + """Check if an exception is a 402 budget-exceeded from the LLM proxy.""" + if _APIStatusError and isinstance(exc, _APIStatusError): + return exc.status_code == 402 + return "budget_exceeded" in str(exc).lower() or "402" in str(exc) + +logger = logging.getLogger(__name__) + +# Sentinel text returned by the executor when all tool calls in a step have +# already been executed (dedup logic). This is an internal coordination +# message and must never appear in user-visible output. +_DEDUP_SENTINEL = ( + "Step completed — all requested tool calls " + "have been executed and results are available." +) + +import os as _os + +# Debug prompts: include full system prompt + message history in events. +# Disabled by default to reduce event size and prevent OOM on large sessions. +_DEBUG_PROMPTS = _os.environ.get("SANDBOX_DEBUG_PROMPTS", "1") == "1" + +# Messages that trigger plan resumption rather than replanning. +_CONTINUE_PHRASES = frozenset({ + "continue", "continue on the plan", "go on", "proceed", + "keep going", "next", "carry on", +}) + + +# --------------------------------------------------------------------------- +# PlanStep — structured per-step tracking +# --------------------------------------------------------------------------- + + +class PlanStep(TypedDict, total=False): + """A single step in the plan with status tracking.""" + index: int + description: str + status: str # "pending" | "running" | "done" | "failed" | "skipped" + tool_calls: list[str] + result_summary: str + iteration_added: int + + + +def _summarize_bound_tools(llm_with_tools: Any) -> list[dict[str, Any]]: + """Extract bound tool schemas from a LangChain RunnableBinding for debug display. 
+ + Returns a list of tool definitions in OpenAI format so the UI can show + exactly what tools + schemas the LLM receives. + """ + try: + # LangChain bind_tools stores tools in kwargs['tools'] + tools = getattr(llm_with_tools, "kwargs", {}).get("tools", []) + if not tools: + # Try first.kwargs for nested bindings + first = getattr(llm_with_tools, "first", None) + if first: + tools = getattr(first, "kwargs", {}).get("tools", []) + result = [] + for t in tools: + if isinstance(t, dict): + # Already in OpenAI format + result.append({ + "name": t.get("function", {}).get("name", "?"), + "description": t.get("function", {}).get("description", "")[:200], + "parameters": t.get("function", {}).get("parameters", {}), + }) + else: + # LangChain tool object + result.append({ + "name": getattr(t, "name", "?"), + "description": (getattr(t, "description", "") or "")[:200], + "parameters": getattr(t, "args_schema", {}) if hasattr(t, "args_schema") else {}, + }) + return result + except Exception: + return [] + + +def _make_plan_steps( + descriptions: list[str], iteration: int = 0 +) -> list[PlanStep]: + """Convert a list of step descriptions into PlanStep dicts.""" + return [ + PlanStep( + index=i, + description=desc, + status="pending", + tool_calls=[], + result_summary="", + iteration_added=iteration, + ) + for i, desc in enumerate(descriptions) + ] + + +def _plan_descriptions(plan_steps: list[PlanStep]) -> list[str]: + """Extract flat description list from plan_steps (for backward compat).""" + return [s.get("description", "") for s in plan_steps] + + +def _safe_format(template: str, **kwargs: Any) -> str: + """Format a prompt template, falling back to raw template on errors.""" + try: + return template.format(**kwargs) + except (KeyError, IndexError) as exc: + logger.warning("Prompt format error (%s), using raw template", exc) + return template + + +# --------------------------------------------------------------------------- +# Text-based tool call parser +# 
--------------------------------------------------------------------------- +# Some model servers (e.g. vLLM without --enable-auto-tool-choice) return +# tool invocations as text like: +# [shell(command="ls -la"), file_read(path="foo.py")] +# instead of structured tool_calls in the OpenAI response format. +# This parser converts that text into proper AIMessage.tool_calls so +# LangGraph's tools_condition routes to the ToolNode. +# --------------------------------------------------------------------------- + +# Matches: tool_name(key="value", key2="value2") +# Handles: shell("ls") (positional), shell(command="ls") (keyword) +_TOOL_CALL_RE = re.compile( + r'(\w+)\(([^)]*)\)', +) + +# Matches Llama 4 Scout format: [label, tool_name]{"key": "value"} +# Examples: [clone_repo, shell]{"command": "git clone ..."} +# [rca:ci, delegate]{"task": "analyze CI logs"} +_LABEL_TOOL_JSON_RE = re.compile( + r'\[[^\]]*,\s*(\w+)\]\s*(\{[^}]+\})', +) + +# Known tool names — only parse calls for tools we actually have +_KNOWN_TOOLS = {"shell", "file_read", "file_write", "grep", "glob", "web_fetch", "explore", "delegate"} + +# First-param defaults for tools that accept a positional argument +_POSITIONAL_PARAM = { + "shell": "command", + "file_read": "path", + "grep": "pattern", + "glob": "pattern", + "web_fetch": "url", + "explore": "query", + "delegate": "task", +} + + +def _parse_kwargs(args_str: str, tool_name: str) -> dict[str, Any]: + """Parse 'key="value", key2="value2"' or '"positional"' into a dict.""" + args_str = args_str.strip() + if not args_str: + return {} + + result: dict[str, Any] = {} + + # Try keyword arguments first: key="value" or key='value' + kw_pattern = re.compile(r'(\w+)\s*=\s*(?:"((?:[^"\\]|\\.)*)"|\'((?:[^\'\\]|\\.)*)\')') + kw_matches = kw_pattern.findall(args_str) + if kw_matches: + for key, val_dq, val_sq in kw_matches: + val = val_dq if val_dq else val_sq + val = val.replace('\\"', '"').replace("\\'", "'") + result[key] = val + return result + + # 
Positional: just a quoted string like "ls -la" or 'ls -la' + pos_match = re.match(r'^["\'](.+?)["\']$', args_str, re.DOTALL) + if pos_match: + param_name = _POSITIONAL_PARAM.get(tool_name, "input") + result[param_name] = pos_match.group(1).replace('\\"', '"') + return result + + # Unquoted positional (rare but handle it) + param_name = _POSITIONAL_PARAM.get(tool_name, "input") + result[param_name] = args_str + return result + + +def parse_text_tool_calls(content: str) -> list[dict[str, Any]]: + """Extract tool calls from text content. + + Returns a list of dicts matching LangChain ToolCall format: + [{"name": "shell", "args": {"command": "ls"}, "id": "...", "type": "tool_call"}] + + Returns empty list if no recognizable tool calls found. + """ + if not content: + return [] + + # Look for the pattern: [tool(...), tool(...)] or just tool(...) + # Strip surrounding brackets if present + text = content.strip() + if text.startswith("[") and text.endswith("]"): + text = text[1:-1].strip() + # Remove trailing comma + if text.endswith(","): + text = text[:-1].strip() + + calls = [] + + # Try Llama 4 format first: [label, tool_name]{"key": "value"} + for match in _LABEL_TOOL_JSON_RE.finditer(content): + tool_name = match.group(1) + json_str = match.group(2) + if tool_name not in _KNOWN_TOOLS: + continue + try: + args = json.loads(json_str) + if isinstance(args, dict): + calls.append({ + "name": tool_name, + "args": args, + "id": f"text-{uuid.uuid4().hex[:12]}", + "type": "tool_call", + }) + except json.JSONDecodeError: + continue + + if calls: + return calls + + # Fall back to legacy format: tool_name(args) + for match in _TOOL_CALL_RE.finditer(text): + tool_name = match.group(1) + args_str = match.group(2) + + if tool_name not in _KNOWN_TOOLS: + continue + + args = _parse_kwargs(args_str, tool_name) + calls.append({ + "name": tool_name, + "args": args, + "id": f"text-{uuid.uuid4().hex[:12]}", + "type": "tool_call", + }) + + return calls + + +def 
maybe_patch_tool_calls(response: AIMessage) -> AIMessage: + """If the response has no tool_calls but contains text-based calls, patch them in. + + Controlled by SANDBOX_TEXT_TOOL_PARSING env var (default: "1" = enabled). + """ + if response.tool_calls: + # Model returned structured tool_calls — use as-is + return response + + if _os.environ.get("SANDBOX_TEXT_TOOL_PARSING", "1") != "1": + return response + + content = response.content + if isinstance(content, list): + # Multi-part content — extract text parts + content = " ".join( + b.get("text", "") for b in content + if isinstance(b, dict) and b.get("type") == "text" + ) + + parsed = parse_text_tool_calls(content) + if not parsed: + return response + + logger.info( + "Parsed %d text-based tool call(s): %s", + len(parsed), + [c["name"] for c in parsed], + ) + + # Create a new AIMessage with the parsed tool_calls + return AIMessage( + content="", # Clear text content — tools will produce output + tool_calls=parsed, + ) + +# Default budget — used when no explicit budget is passed. +DEFAULT_BUDGET = AgentBudget() + + +# --------------------------------------------------------------------------- +# Prompts +# --------------------------------------------------------------------------- + +from sandbox_agent.prompts import ( + PLANNER_SYSTEM as _PLANNER_SYSTEM, + EXECUTOR_SYSTEM as _EXECUTOR_SYSTEM, + REFLECTOR_SYSTEM as _REFLECTOR_SYSTEM, + REPORTER_SYSTEM as _REPORTER_SYSTEM, +) + + +def _intercept_respond_to_user(response: Any, node_name: str) -> AIMessage | None: + """Check for respond_to_user escape tool in an LLM response. + + Llama 4 Scout always calls a tool when tools are bound, so + ``respond_to_user`` is the escape hatch for nodes that need to + produce text output (planner, reflector). + + Returns a *new* AIMessage with the extracted text content and no + tool_calls (so ``tools_condition`` routes correctly), or ``None`` + if no escape tool was found. 
+ """ + if not getattr(response, "tool_calls", None): + return None + + tool_names = [ + tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?") + for tc in response.tool_calls + ] + logger.info("%s called tools: %s", node_name, tool_names, + extra={"node": node_name.lower()}) + + for tc in response.tool_calls: + name = tc.get("name", "") if isinstance(tc, dict) else getattr(tc, "name", "") + if name == "respond_to_user": + args = tc.get("args", {}) if isinstance(tc, dict) else getattr(tc, "args", {}) + response_text = args.get("response", "") + logger.info( + "%s escaped via respond_to_user (%d chars)", node_name, len(response_text), + extra={"node": node_name.lower()}, + ) + # Return a clean AIMessage — no tool_calls so the graph + # routes to the next node instead of the tool node. + return AIMessage( + content=response_text, + response_metadata=getattr(response, "response_metadata", {}), + usage_metadata=getattr(response, "usage_metadata", None), + ) + + return None + + +# --------------------------------------------------------------------------- +# Node functions +# --------------------------------------------------------------------------- + + +async def router_node(state: dict[str, Any]) -> dict[str, Any]: + """Entry-point node: decide whether to resume, replan, or start fresh. + + Returns state updates that downstream conditional edges read via + :func:`route_entry`. 
+ """ + plan_status = state.get("plan_status", "") + plan_steps = state.get("plan_steps", []) + messages = state.get("messages", []) + + # Extract the latest user message text + last_text = "" + if messages: + content = getattr(messages[-1], "content", "") + if isinstance(content, list): + last_text = " ".join( + b.get("text", "") for b in content + if isinstance(b, dict) and b.get("type") == "text" + ) + else: + last_text = str(content) + last_text_lower = last_text.strip().lower() + + has_active_plan = plan_status == "awaiting_continue" and len(plan_steps) > 0 + is_continue = last_text_lower in _CONTINUE_PHRASES + + if has_active_plan and is_continue: + # Resume: mark next pending step as running + current_step = state.get("current_step", 0) + if current_step < len(plan_steps): + plan_steps = list(plan_steps) # copy for mutation + plan_steps[current_step] = {**plan_steps[current_step], "status": "running"} + logger.info( + "Router: RESUME plan at step %d/%d (plan_status=%s)", + current_step + 1, len(plan_steps), plan_status, + extra={"session_id": state.get("context_id", ""), "node": "router", + "current_step": current_step, "plan_status": plan_status}, + ) + return { + "_route": "resume", + "plan_steps": plan_steps, + "plan_status": "executing", + } + elif has_active_plan: + # Replan: new instruction arrives while plan exists + # Reset replan_count — this is a user-driven replan, not an agent loop + logger.info( + "Router: REPLAN — new message while plan active (plan_status=%s, steps=%d)", + plan_status, len(plan_steps), + extra={"session_id": state.get("context_id", ""), "node": "router", + "plan_status": plan_status}, + ) + return { + "_route": "replan", + "plan_status": "executing", + "original_request": last_text, + "replan_count": 0, + "recent_decisions": [], + } + else: + # New: no active plan + logger.info("Router: NEW plan (plan_status=%s)", plan_status, + extra={"session_id": state.get("context_id", ""), "node": "router", + "plan_status": plan_status}) 
+ return { + "_route": "new", + "plan_status": "executing", + "original_request": last_text, + } + + +def route_entry(state: dict[str, Any]) -> str: + """Conditional edge from router: resume → executor, else → planner.""" + route = state.get("_route", "new") + if route == "resume": + return "resume" + return "plan" # both "replan" and "new" go to planner + + +def _is_trivial_text_request(messages: list) -> bool: + """Detect requests that need no tools — just a text response. + + Matches patterns like "Say exactly: ...", "What was the marker?", + simple greetings, or questions that can be answered from conversation + context alone. + """ + if not messages: + return False + last = messages[-1] + content = getattr(last, "content", "") + if isinstance(content, list): + content = " ".join( + b.get("text", "") for b in content + if isinstance(b, dict) and b.get("type") == "text" + ) + text = str(content).strip().lower() + if not text: + return False + + # Patterns that clearly need no tools + trivial_patterns = ( + "say exactly", + "repeat ", + "what was the marker", + "what did i say", + "what did i tell", + "hello", + "hi", + "who are you", + ) + return any(text.startswith(p) or p in text for p in trivial_patterns) + + +async def planner_node( + state: dict[str, Any], + llm: Any, + budget: AgentBudget | None = None, +) -> dict[str, Any]: + """Decompose the user request into a numbered plan. + + On re-entry (iteration > 0), the planner also sees prior step results so + it can adjust the remaining plan. 
+ """ + if budget is None: + budget = DEFAULT_BUDGET + messages = state["messages"] + iteration = state.get("iteration", 0) + step_results = state.get("step_results", []) + + prev_plan_steps = state.get("plan_steps", []) + + # Fast-path: trivial text-only requests skip the planner LLM call entirely + if iteration == 0 and not prev_plan_steps and _is_trivial_text_request(messages): + logger.info("Fast-path: trivial text request — single-step plan, no LLM call", + extra={"session_id": state.get("context_id", ""), "node": "planner", + "iteration": 0, "step_count": 1, "plan_version": 1}) + trivial_steps = _make_plan_steps(["Respond to the user."], iteration=0) + store = ps.create_plan(["Respond to the user."], creator="planner") + return { + "plan": ["Respond to the user."], + "plan_steps": trivial_steps, + "plan_version": 1, + "current_step": 0, + "iteration": 1, + "done": False, + "_plan_store": store, + } + + # Build context for the planner — include previous plan with per-step status + context_parts = [] + if prev_plan_steps: + # Show the structured plan with per-step status + context_parts.append("Previous plan (with status):") + for prev_ps in prev_plan_steps: + idx = prev_ps.get("index", 0) + desc = prev_ps.get("description", "") + status = prev_ps.get("status", "pending").upper() + result = prev_ps.get("result_summary", "") + line = f" {idx+1}. 
[{status}] {desc}" + if result: + line += f" — {result[:150]}" + context_parts.append(line) + done_count = sum(1 for s in prev_plan_steps if s.get("status") == "done") + context_parts.append(f"Progress: {done_count}/{len(prev_plan_steps)} steps completed.") + context_parts.append("") + elif iteration > 0: + # Fallback: use flat plan list for backward compat + original_plan = state.get("plan", []) + current_step = state.get("current_step", 0) + if original_plan: + context_parts.append("Original plan:") + for i, step in enumerate(original_plan): + status = "DONE" if i < current_step else "PENDING" + context_parts.append(f" {i+1}. [{status}] {step}") + context_parts.append(f"Progress: {current_step}/{len(original_plan)} steps completed.") + context_parts.append("") + + if iteration > 0 or prev_plan_steps: + # Extract tool call history from messages + tool_history = [] + for msg in messages: + tool_calls = getattr(msg, "tool_calls", None) + if tool_calls: + for tc in tool_calls: + name = tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?") + args = tc.get("args", {}) if isinstance(tc, dict) else getattr(tc, "args", {}) + args_str = str(args)[:100] + tool_history.append(f" CALLED: {name}({args_str})") + if hasattr(msg, "name") and hasattr(msg, "content") and getattr(msg, "type", "") == "tool": + output = str(getattr(msg, "content", ""))[:200] + tool_history.append(f" RESULT ({msg.name}): {output}") + + if tool_history: + context_parts.append("Tool calls already executed (DO NOT repeat these):") + context_parts.extend(tool_history[-20:]) + context_parts.append("") + + if step_results: + context_parts.append("Previous step results:") + for i, result in enumerate(step_results, 1): + context_parts.append(f" Step {i}: {result}") + context_parts.append("") + + context_parts.append( + "Adjust the plan for remaining work. Do NOT repeat steps that already succeeded." 
+ ) + + system_content = _PLANNER_SYSTEM + if context_parts: + system_content += "\n" + "\n".join(context_parts) + + # Prepend skill instructions when a skill was loaded from metadata. + skill_instructions = state.get("skill_instructions", "") + if skill_instructions: + system_content = skill_instructions + "\n\n" + system_content + + from sandbox_agent.context_builders import build_planner_context, invoke_llm + + plan_messages = build_planner_context(state, system_content) + + try: + response, planner_capture = await invoke_llm( + llm, plan_messages, + node="planner", session_id=state.get("context_id", ""), + workspace_path=state.get("workspace_path", "/workspace"), + ) + except Exception as exc: + if _is_budget_exceeded_error(exc): + logger.warning("Budget exceeded in planner (402 from proxy): %s", exc, + extra={"session_id": state.get("context_id", ""), "node": "planner", + "iteration": iteration}) + return { + "messages": [AIMessage(content=f"Budget exceeded: {exc}")], + "done": True, + "_budget_summary": budget.summary(), + } + raise + + prompt_tokens = planner_capture.prompt_tokens + completion_tokens = planner_capture.completion_tokens + model_name = planner_capture.model + budget.add_tokens(prompt_tokens + completion_tokens) + + # Check for respond_to_user escape tool (needed for Llama 4 Scout). 
+ escaped = _intercept_respond_to_user(response, "Planner") + if escaped is not None: + response = escaped + elif getattr(response, 'tool_calls', None): + # Non-escape tools — pass through for graph tool execution + return { + "messages": [response], + **planner_capture.token_fields(), + "_budget_summary": budget.summary(), + **planner_capture.debug_fields(), + } + + plan = _parse_plan(response.content) + plan_version = state.get("plan_version", 0) + 1 + new_plan_steps = _make_plan_steps(plan, iteration=iteration) + store = ps.create_plan(plan, creator="planner" if iteration == 0 else "replanner") + + logger.info("Planner produced %d steps (iteration %d, version %d): %s", + len(plan), iteration, plan_version, plan, + extra={"session_id": state.get("context_id", ""), "node": "planner", + "iteration": iteration, "step_count": len(plan), + "plan_version": plan_version}) + + # On replan, preserve completed steps — don't restart from step 0. + # Find the first non-done step in the NEW plan to continue from. + # On first plan (no prev steps), start at 0. 
+ prev_steps = state.get("plan_steps", []) + if prev_steps: + # Replan: carry forward "done" status from previous steps that match + done_count = sum(1 for s in prev_steps if s.get("status") == "done") + start_step = min(done_count, len(new_plan_steps) - 1) if new_plan_steps else 0 + # Mark steps before start_step as done in new plan (they were done before) + for i in range(start_step): + if i < len(new_plan_steps): + new_plan_steps[i] = {**new_plan_steps[i], "status": "done"} + logger.info("Replan: preserving %d done steps, starting at step %d", + start_step, start_step + 1, + extra={"session_id": state.get("context_id", ""), "node": "planner"}) + else: + start_step = 0 + + return { + "messages": [response], + "plan": plan, + "plan_steps": new_plan_steps, + "plan_version": plan_version, + "current_step": start_step, + "iteration": iteration + 1, + "done": False, + "_plan_store": store, + **planner_capture.token_fields(), + "_budget_summary": budget.summary(), + **planner_capture.debug_fields(), + } + + +MAX_THINK_ACT_CYCLES = int(_os.environ.get("SANDBOX_MAX_THINK_ACT_CYCLES", + _os.environ.get("SANDBOX_MAX_TOOL_CALLS_PER_STEP", "20"))) +THINKING_ITERATION_BUDGET = int(_os.environ.get("SANDBOX_THINKING_ITERATION_BUDGET", "2")) +MAX_PARALLEL_TOOL_CALLS = int(_os.environ.get("SANDBOX_MAX_PARALLEL_TOOL_CALLS", "5")) + + +async def executor_node( + state: dict[str, Any], + llm_with_tools: Any, + budget: AgentBudget | None = None, + llm_reason: Any | None = None, +) -> dict[str, Any]: + """Execute the current plan step using the LLM with bound tools. + + When ``llm_reason`` is provided (thinking mode): + 1. Thinking loop: up to THINKING_ITERATION_BUDGET bare LLM iterations + 2. Micro-reasoning: LLM with tools (tool_choice=any) makes up to + MAX_PARALLEL_TOOL_CALLS parallel tool calls. 
+ """ + if budget is None: + budget = DEFAULT_BUDGET + plan = state.get("plan", []) + current_step = state.get("current_step", 0) + tool_call_count = state.get("_tool_call_count", 0) + + if current_step >= len(plan): + # No more steps — signal completion to reflector + return { + "messages": [AIMessage(content="All plan steps completed.")], + "current_step": current_step, + "done": True, + } + + # Guard: too many think-act cycles for this step — force completion + if tool_call_count >= MAX_THINK_ACT_CYCLES: + logger.warning( + "Step %d hit think-act cycle limit (%d/%d) — forcing step completion", + current_step, tool_call_count, MAX_THINK_ACT_CYCLES, + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step, "tool_call_count": tool_call_count}, + ) + result: dict[str, Any] = { + "messages": [AIMessage(content=f"Step {current_step + 1} reached think-act cycle limit ({MAX_THINK_ACT_CYCLES}). Moving to reflection.")], + "current_step": current_step, + "_tool_call_count": 0, + "_budget_summary": budget.summary(), + } + if _DEBUG_PROMPTS: + result["_system_prompt"] = f"[Think-act cycle limit reached — no LLM call]\nStep {current_step + 1}: {tool_call_count}/{MAX_THINK_ACT_CYCLES} cycles" + result["_prompt_messages"] = [{"role": "system", "preview": f"Step {current_step + 1} cycle limit ({tool_call_count}/{MAX_THINK_ACT_CYCLES})"}] + result["_llm_response"] = "[no LLM call — cycle limit]" + return result + + step_text = plan[current_step] + system_content = _safe_format( + _EXECUTOR_SYSTEM, + current_step=current_step + 1, + step_text=step_text, + tool_call_count=tool_call_count, + max_tool_calls=MAX_THINK_ACT_CYCLES, + workspace_path=state.get("workspace_path", "/workspace"), + ) + + # Prepend skill instructions when a skill was loaded from metadata. 
+ skill_instructions = state.get("skill_instructions", "") + if skill_instructions: + system_content = skill_instructions + "\n\n" + system_content + + # Check budget before making the LLM call (refresh from LiteLLM first) + + if budget.exceeded: + logger.warning("Budget exceeded in executor: %s", budget.exceeded_reason, + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step}) + result: dict[str, Any] = { + "messages": [AIMessage(content=f"Budget exceeded: {budget.exceeded_reason}")], + "current_step": current_step, + "done": True, + } + if _DEBUG_PROMPTS: + result["_system_prompt"] = f"[Budget exceeded — no LLM call]\n{budget.exceeded_reason}" + result["_prompt_messages"] = [{"role": "system", "preview": f"Budget exceeded: {budget.exceeded_reason}"}] + result["_llm_response"] = "[no LLM call — budget exceeded]" + return result + + # Step-scoped message context for the executor. + # + # On NEW step (tool_call_count == 0): + # Only the step brief as a HumanMessage — executor treats this as a + # fresh task. Does NOT see the plan, previous steps, or reflector msgs. + # + # On CONTINUING step (tool_call_count > 0): + # The step brief + this step's tool calls/results only. Walk backwards + # from current messages, stopping when we hit a non-tool/non-AI message + # (which marks the boundary of this step's context). 
+ + from sandbox_agent.context_builders import build_executor_context, invoke_with_tool_loop + + messages = build_executor_context(state, system_content) + + try: + response, capture, sub_events = await invoke_with_tool_loop( + llm_with_tools, llm_reason, messages, + node="executor", session_id=state.get("context_id", ""), + workspace_path=state.get("workspace_path", "/workspace"), + thinking_budget=THINKING_ITERATION_BUDGET, + max_parallel_tool_calls=MAX_PARALLEL_TOOL_CALLS, + ) + except Exception as exc: + if _is_budget_exceeded_error(exc): + logger.warning("Budget exceeded in executor (402 from proxy): %s", exc, + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step}) + return { + "messages": [AIMessage(content=f"Budget exceeded: {exc}")], + "current_step": current_step, + "done": True, + "_budget_summary": budget.summary(), + } + raise + + # Track no-tool executions — if the LLM produces text instead of + # tool calls, increment counter. After 2 consecutive no-tool runs + # for the same step, mark the step as failed and advance. + no_tool_count = state.get("_no_tool_count", 0) + + # Token usage and model from the capture (guaranteed to match what was sent) + prompt_tokens = capture.prompt_tokens + completion_tokens = capture.completion_tokens + model_name = capture.model + budget.add_tokens(prompt_tokens + completion_tokens) + + # If the model returned text-based tool calls instead of structured + # tool_calls (common with vLLM without --enable-auto-tool-choice), + # parse them so tools_condition routes to the ToolNode. + # Capture the pre-patch content for event serialization. + pre_patch_content = response.content + had_structured_tools = bool(response.tool_calls) + response = maybe_patch_tool_calls(response) + + # -- Enforce parallel tool call limit ----------------------------------------- + # Allow up to MAX_PARALLEL_TOOL_CALLS per think-act cycle. 
+ # invoke_with_tool_loop already enforces this in thinking mode, + # but single-phase mode needs the safety check here. + if len(response.tool_calls) > MAX_PARALLEL_TOOL_CALLS: + logger.info( + "Executor returned %d tool calls — keeping first %d (parallel limit)", + len(response.tool_calls), MAX_PARALLEL_TOOL_CALLS, + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step, "tool_call_count": tool_call_count}, + ) + response = AIMessage( + content=response.content, + tool_calls=response.tool_calls[:MAX_PARALLEL_TOOL_CALLS], + ) + + # -- Detect unparsed text tool call attempts (stall signal) ---------------- + # If the model wrote text that looks like a tool call but wasn't parsed, + # log a warning. The reflector will catch the zero-tool-call pattern. + if not response.tool_calls and pre_patch_content: + text_hint = str(pre_patch_content).lower() + if any(kw in text_hint for kw in ("shell(", "file_read(", "file_write(", + "```bash", "```shell", "i would run", + "i will execute", "let me run")): + logger.warning( + "Executor produced text resembling a tool call but no actual " + "tool_calls were generated — likely a stalled iteration", + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step, "tool_call_count": tool_call_count}, + ) + + # -- Loop detection: stop if the executor repeats the same tool call ---- + # With dedup removed (each call has unique LangGraph ID), we need to + # detect when the executor is stuck calling the same tool with the same + # args repeatedly. Check against the last 3 tool calls in this step. 
+ if response.tool_calls and tool_call_count > 0: + all_msgs = state.get("messages", []) + # Collect recent tool calls from this step (after boundary) + recent_calls: list[tuple[str, str]] = [] + for m in reversed(all_msgs): + content = str(getattr(m, "content", "")) + if isinstance(m, SystemMessage) and content.startswith(f"[STEP_BOUNDARY {current_step}]"): + break + if isinstance(m, AIMessage) and getattr(m, "tool_calls", None): + for tc in m.tool_calls: + recent_calls.append((tc["name"], repr(sorted(tc["args"].items())))) + if len(recent_calls) >= 3: + break + if len(recent_calls) >= 3: + break + + # Check if the current call matches any of the last 3 + for tc in response.tool_calls: + current_key = (tc["name"], repr(sorted(tc["args"].items()))) + repeat_count = sum(1 for rc in recent_calls if rc == current_key) + if repeat_count >= 2: + logger.warning( + "Loop detected: %s(%s) called %d times in last 3 — forcing step completion", + tc["name"], str(tc["args"])[:80], repeat_count + 1, + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step}, + ) + return { + "messages": [AIMessage( + content=f"Step {current_step + 1} stuck in loop: " + f"{tc['name']}() called {repeat_count + 1} times with same args. " + f"Moving to reflection." + )], + "current_step": current_step, + "_tool_call_count": 0, + "_budget_summary": budget.summary(), + } + + # Build parsed_tools list for event serialization when tools came + # from text parsing (not structured tool_calls). 
+ parsed_tools: list[dict[str, Any]] = [] + if not had_structured_tools and response.tool_calls: + parsed_tools = [ + {"name": tc["name"], "args": tc.get("args", {})} + for tc in response.tool_calls + ] + + # If no tool calls after patching, the executor is either: + # (a) Legitimately done with the step (summarizing results) — NORMAL + # (b) Stalled and unable to call tools — only if it never called ANY tool + # + # With micro-reflection, the executor may produce text after a failed + # tool call to summarize/report — that's valid step completion, not a stall. + if not response.tool_calls: + if tool_call_count > 0: + # Executor already called tools this step — text response means + # it's done summarizing. This is normal completion, not a stall. + logger.info( + "Executor produced text response after %d tool calls for step %d — step complete", + tool_call_count, current_step, + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step, "tool_call_count": tool_call_count}, + ) + else: + no_tool_count += 1 + logger.warning( + "Executor produced no tool calls for step %d (attempt %d/2)", + current_step, no_tool_count, + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step, "tool_call_count": 0}, + ) + if no_tool_count >= 2: + logger.warning("Executor failed to call tools after 2 attempts — marking step failed", + extra={"session_id": state.get("context_id", ""), "node": "executor", + "current_step": current_step, "tool_call_count": 0}) + # Keep the actual LLM response (with text reasoning) for the UI. + # Append failure note but preserve the model's output for micro_reasoning. 
+ actual_content = str(response.content or "") + failure_note = f"\n\n[Step {current_step + 1} failed: executor could not call tools after 2 attempts.]" + return { + "messages": [AIMessage(content=actual_content + failure_note)], + "current_step": current_step, + "done": True if current_step + 1 >= len(plan) else False, + "_no_tool_count": 0, + **capture.debug_fields(), + } + else: + no_tool_count = 0 # reset on successful tool call + + # Increment think-act cycle count (each cycle = 1, regardless of parallel tool count) + new_tool_call_count = tool_call_count + 1 if response.tool_calls else tool_call_count + + # Extract last tool result for micro_reasoning context (shows WHY the + # agent made this decision in the UI event stream). + _last_tool_result = None + for m in reversed(state.get("messages", [])): + if isinstance(m, ToolMessage): + content_str = str(getattr(m, "content", "")) + _last_tool_result = { + "name": getattr(m, "name", "unknown"), + "output": content_str[:500], + "status": "error" if "EXIT_CODE:" in content_str else "success", + } + break + + # On first call (tool_call_count == 0), inject a SystemMessage boundary + # marker into state. SystemMessage is NOT sent to the LLM (the executor + # builds its own message list), but stays in state["messages"] so the + # windowing logic on subsequent calls can find where this step started. 
+ step_brief = state.get("skill_instructions", f"Execute step {current_step + 1}: {step_text}") + step_msgs: list = [] + if tool_call_count == 0: + step_msgs.append(SystemMessage(content=f"[STEP_BOUNDARY {current_step}] {step_brief[:500]}")) + + result: dict[str, Any] = { + "messages": step_msgs + [response], + "current_step": current_step, + **capture.token_fields(), + "_budget_summary": budget.summary(), + **capture.debug_fields(), + "_no_tool_count": no_tool_count, + "_tool_call_count": new_tool_call_count, + **({"_last_tool_result": _last_tool_result} if _last_tool_result else {}), + } + if sub_events: + result["_sub_events"] = sub_events + if parsed_tools: + result["parsed_tools"] = parsed_tools + return result + + +async def reflector_node( + state: dict[str, Any], + llm: Any, + budget: AgentBudget | None = None, +) -> dict[str, Any]: + """Review step output and decide whether to continue, replan, or finish. + + Parameters + ---------- + budget: + Optional :class:`AgentBudget` for enforcing iteration limits. + When the budget is exceeded the reflector forces ``done``. 
+ """ + if budget is None: + budget = DEFAULT_BUDGET + + plan = state.get("plan", []) + current_step = state.get("current_step", 0) + step_results = list(state.get("step_results", [])) + iteration = state.get("iteration", 0) + replan_count = state.get("replan_count", 0) + done = state.get("done", False) + recent_decisions = list(state.get("recent_decisions", [])) + store = state.get("_plan_store", {}) + + # If executor signaled done (ran out of steps), go straight to done + if done: + result: dict[str, Any] = {"done": True, "decision": "done", "assessment": "Executor signaled completion."} + if _DEBUG_PROMPTS: + result["_system_prompt"] = "[Executor signaled done — no LLM call]" + return result + + def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: + """Helper for early termination — marks current step partial/failed, rest skipped.""" + fd_ps = list(state.get("plan_steps", [])) + step_status = "failed" if mark_failed else "partial" + if current_step < len(fd_ps): + fd_ps[current_step] = {**fd_ps[current_step], "status": step_status} + for i in range(current_step + 1, len(fd_ps)): + if fd_ps[i].get("status") == "pending": + fd_ps[i] = {**fd_ps[i], "status": "skipped"} + logger.warning("%s — forcing done", reason, + extra={"session_id": state.get("context_id", ""), "node": "reflector", + "current_step": current_step, "replan_count": replan_count}) + result: dict[str, Any] = { + "step_results": step_results, + "plan_steps": fd_ps, + "current_step": current_step + 1, + "done": True, + "replan_count": replan_count, + "assessment": reason, + "decision": "done", + } + # Include prompt context so the UI can show why the reflector + # terminated early (budget, stall, duplicate output). 
+ if _DEBUG_PROMPTS: + result["_system_prompt"] = f"[Early termination — no LLM call]\n{reason}" + if store: + result["_plan_store"] = store + return result + + # Budget guard — force termination if ANY budget limit exceeded + + if budget.exceeded: + return _force_done(f"Budget exceeded: {budget.exceeded_reason}", mark_failed=True) + + # Count tool calls in this iteration (from executor's last message) + messages = state["messages"] + tool_calls_this_iter = 0 + last_content = "" + if messages: + last_msg = messages[-1] + tool_calls_this_iter = len(getattr(last_msg, "tool_calls", []) or []) + content = getattr(last_msg, "content", "") + if isinstance(content, list): + last_content = " ".join( + b.get("text", "") for b in content + if isinstance(b, dict) and b.get("type") == "text" + ) + else: + last_content = str(content) + + # Stall detection removed — the reflector's LLM call decides whether to + # continue, replan, or stop. Hardcoded stall guards were overriding the + # reflector's judgment and force-terminating sessions prematurely. + # The iteration limit and wall-clock limit are sufficient safeguards. + + # If last_content is empty (dedup path) or the old sentinel, recover the + # actual last tool result from the message history so the reflector sees real output. + if not last_content.strip() or _DEDUP_SENTINEL in last_content: + for msg in reversed(messages): + if isinstance(msg, ToolMessage): + last_content = str(getattr(msg, "content", "")) + logger.info("Reflector: substituted dedup sentinel with last tool result (%d chars)", + len(last_content), + extra={"session_id": state.get("context_id", ""), "node": "reflector", + "current_step": current_step}) + break + + step_results.append(last_content[:500]) + + step_text = plan[current_step] if current_step < len(plan) else "N/A" + plan_text = "\n".join(f"{i+1}. 
{s}" for i, s in enumerate(plan)) + results_text = last_content[:1000] + + # Hint: if the step result contains error signals, prepend a note + error_signals = ("error", "fatal", "failed", "exit_code", "stderr", "denied", "cannot") + if any(sig in results_text.lower() for sig in error_signals): + results_text = ( + "[NOTE: The step result below contains error indicators. " + "Consider 'replan' to try a different approach.]\n\n" + results_text + ) + + # Build replan history context — show the LLM what prior replans tried + replan_history_text = "" + if replan_count > 0: + replan_history_lines = [ + f"REPLAN HISTORY ({replan_count} prior replan(s)):" + ] + # Collect failed step summaries from plan_steps + for hist_ps in state.get("plan_steps", []): + if hist_ps.get("status") == "failed": + summary = hist_ps.get("result_summary", "no details") + replan_history_lines.append( + f" - Step {hist_ps.get('index', '?')+1} FAILED: {hist_ps.get('description', '?')[:80]}" + f" — {summary[:150]}" + ) + replan_history_lines.append( + "Do NOT repeat approaches that already failed. Try something fundamentally different," + " or choose 'done' to report partial results." + ) + replan_history_text = "\n".join(replan_history_lines) + + # Ask LLM to reflect + recent_str = ", ".join(recent_decisions[-5:]) if recent_decisions else "none" + # Build remaining steps text so reflector knows what's left + remaining = [f"{i+1}. 
{plan[i]}" for i in range(current_step + 1, len(plan))] + remaining_text = ", ".join(remaining[:5]) if remaining else "NONE — all steps complete" + + # Build step execution summary for reflector context + step_tool_calls = 0 + step_tools_used: set[str] = set() + step_errors = 0 + for msg in messages: + content = str(getattr(msg, "content", "")) + if isinstance(msg, SystemMessage) and content.startswith(f"[STEP_BOUNDARY {current_step}]"): + step_tool_calls = 0 + step_tools_used = set() + step_errors = 0 + continue + if isinstance(msg, AIMessage) and getattr(msg, "tool_calls", None): + for tc in msg.tool_calls: + step_tool_calls += 1 + name = tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?") + step_tools_used.add(name) + if isinstance(msg, ToolMessage): + if "EXIT_CODE:" in content and "EXIT_CODE: 0" not in content: + step_errors += 1 + + step_summary = ( + f"Step execution summary: {step_tool_calls} tool calls using {', '.join(sorted(step_tools_used)) or 'none'}, " + f"{step_errors} errors" + ) + + system_content = _safe_format( + _REFLECTOR_SYSTEM, + plan_text=plan_text, + current_step=current_step + 1, + total_steps=len(plan), + step_text=step_text, + step_result=results_text, + remaining_steps=remaining_text, + iteration=iteration, + max_iterations=budget.max_iterations, + replan_count=replan_count, + tool_calls_this_iter=tool_calls_this_iter, + recent_decisions=recent_str, + replan_history=replan_history_text, + ) + system_content = step_summary + "\n\n" + system_content + from sandbox_agent.context_builders import build_reflector_context, invoke_llm + + reflect_messages = build_reflector_context(state, system_content) + try: + response, capture = await invoke_llm( + llm, reflect_messages, + node="reflector", session_id=state.get("context_id", ""), + workspace_path=state.get("workspace_path", "/workspace"), + ) + except Exception as exc: + if _is_budget_exceeded_error(exc): + logger.warning("Budget exceeded in reflector (402 from proxy): 
%s", exc, + extra={"session_id": state.get("context_id", ""), "node": "reflector", + "current_step": current_step, "replan_count": replan_count}) + return _force_done(f"Budget exceeded: {exc}") + raise + + prompt_tokens = capture.prompt_tokens + completion_tokens = capture.completion_tokens + model_name = capture.model + budget.add_tokens(prompt_tokens + completion_tokens) + + # Check for respond_to_user escape tool (needed for Llama 4 Scout). + escaped = _intercept_respond_to_user(response, "Reflector") + if escaped is not None: + response = escaped + elif getattr(response, 'tool_calls', None): + # Non-escape tools — pass through for graph tool execution + return { + "messages": [response], + **capture.token_fields(), + "_budget_summary": budget.summary(), + **capture.debug_fields(), + } + + decision = _parse_decision(response.content) + + # Guard: if the LLM says "done" but there are remaining plan steps, + # override to "continue". The LLM (esp. Llama 4 Scout) often confuses + # "step completed" with "task completed". 
+ steps_remaining = len(plan) - (current_step + 1) + if decision == "done" and steps_remaining > 0: + logger.warning( + "Reflector said 'done' but %d plan steps remain — overriding to 'continue'", + steps_remaining, + extra={"session_id": state.get("context_id", ""), "node": "reflector", + "decision": "done->continue", "current_step": current_step, + "replan_count": replan_count}, + ) + decision = "continue" + + recent_decisions.append(decision) + recent_decisions = recent_decisions[-10:] + + # Update plan_steps with per-step status + plan_steps = list(state.get("plan_steps", [])) + # Extract tool names used in this step from messages + step_tools: list[str] = [] + for msg in messages: + for tc in getattr(msg, "tool_calls", []) or []: + name = tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?") + if name not in step_tools: + step_tools.append(name) + + if current_step < len(plan_steps): + cur_ps = {**plan_steps[current_step]} + cur_ps["tool_calls"] = step_tools + cur_ps["result_summary"] = last_content[:200] + plan_steps[current_step] = cur_ps + + logger.info( + "Reflector decision: %s (step %d/%d, iter %d, replans=%d, tools=%d, recent=%s)", + decision, current_step + 1, len(plan), iteration, + replan_count, tool_calls_this_iter, + recent_decisions[-3:], + extra={"session_id": state.get("context_id", ""), "node": "reflector", + "decision": decision, "current_step": current_step, + "replan_count": replan_count, "iteration": iteration}, + ) + + base_result: dict[str, Any] = { + "messages": [response], + "step_results": step_results, + "recent_decisions": recent_decisions, + "plan_steps": plan_steps, + **capture.token_fields(), + "_budget_summary": budget.summary(), + **capture.debug_fields(), + } + + # Update PlanStore status (parallel to plan_steps updates below) + step_key = str(current_step + 1) + if store: + try: + if decision in ("done", "continue"): + store = ps.set_step_status(store, step_key, "done") + elif decision == "replan": + store 
= ps.set_step_status(store, step_key, "failed") + elif decision == "retry": + store = ps.set_step_status(store, step_key, "running") + except ValueError: + logger.warning("PlanStore: step %s not found (replan?), skipping status update", + step_key, extra={"session_id": state.get("context_id", ""), "node": "reflector"}) + base_result["_plan_store"] = store + + if decision == "done": + # Mark current step done, remaining as skipped + if current_step < len(plan_steps): + plan_steps[current_step] = {**plan_steps[current_step], "status": "done"} + for i in range(current_step + 1, len(plan_steps)): + if plan_steps[i].get("status") == "pending": + plan_steps[i] = {**plan_steps[i], "status": "skipped"} + return { + **base_result, + "plan_steps": plan_steps, + "current_step": current_step + 1, + "done": True, + "replan_count": replan_count, + } + elif decision == "retry": + # Retry: re-execute current step with fresh context. + # Mark step as "retrying" (not failed) — executor gets another chance. 
+ if current_step < len(plan_steps): + cur_ps = plan_steps[current_step] + retry_count = cur_ps.get("retry_count", 0) + 1 + plan_steps[current_step] = { + **cur_ps, + "status": "retrying", + "retry_count": retry_count, + } + logger.info("Retry step %d (attempt %d) — re-executing with different approach", + current_step + 1, plan_steps[current_step].get("retry_count", 1), + extra={"session_id": state.get("context_id", ""), "node": "reflector", + "decision": "retry", "current_step": current_step}) + return { + **base_result, + "plan_steps": plan_steps, + "done": False, + "replan_count": replan_count, + "_tool_call_count": 0, # reset tool calls for retry + } + elif decision == "replan": + new_replan_count = replan_count + 1 + # Mark current step failed + if current_step < len(plan_steps): + plan_steps[current_step] = {**plan_steps[current_step], "status": "failed"} + logger.info("Replan %d — routing back to planner", new_replan_count, + extra={"session_id": state.get("context_id", ""), "node": "reflector", + "decision": "replan", "current_step": current_step, + "replan_count": new_replan_count}) + return { + **base_result, + "plan_steps": plan_steps, + "done": False, + "replan_count": new_replan_count, + } + else: + # Continue: mark current step done, advance + if current_step < len(plan_steps): + plan_steps[current_step] = {**plan_steps[current_step], "status": "done"} + next_step = current_step + 1 + if next_step < len(plan_steps): + plan_steps[next_step] = {**plan_steps[next_step], "status": "running"} + if next_step >= len(plan): + # All steps done — route to done (reporter will summarize). + # Mark all steps done. 
async def reporter_node(
    state: dict[str, Any],
    llm: Any,
    budget: AgentBudget | None = None,
    llm_reason: Any | None = None,
    tools: list | None = None,
) -> dict[str, Any]:
    """Format accumulated step results into a final answer.

    Sets ``plan_status`` from the tracked step outcomes:

    - every tracked step finished as ``done`` (or no steps were tracked)
      → ``"completed"``
    - any step failed / partial / still pending → ``"awaiting_continue"``
      so the user or looper can resume the task

    When ``llm_reason`` is provided, uses ``invoke_with_tool_loop`` for
    thinking iterations and read-only tool calls (file verification).
    Falls back to a single ``invoke_llm`` call when ``llm_reason`` is None.

    Parameters
    ----------
    state:
        LangGraph state dict (messages, plan, step_results, plan_steps, ...).
    llm:
        Chat model used to produce the user-facing summary.
    budget:
        Token/iteration budget tracker; defaults to ``DEFAULT_BUDGET``.
    llm_reason:
        Optional reasoning model enabling the tool-loop path.
    tools:
        Optional read-only tools available during the tool loop.

    Returns
    -------
    dict[str, Any]
        State update containing ``final_answer``, ``plan_status``,
        ``files_touched``, token accounting, and optional ``_sub_events``
        and ``_plan_store`` entries.
    """
    if budget is None:
        budget = DEFAULT_BUDGET
    store = state.get("_plan_store", {})
    plan = ps.to_flat_plan(store) if store else state.get("plan", [])
    step_results = state.get("step_results", [])
    plan_steps = state.get("plan_steps", [])

    # Determine terminal plan_status based on step outcomes.
    # "completed" only when every tracked step finished as "done"; any
    # failed/partial/pending step leaves the task resumable. (The previous
    # chain had an unreachable final "completed" branch — the elif condition
    # already covered every remaining case.)
    if plan_steps:
        done_count = sum(1 for s in plan_steps if s.get("status") == "done")
        terminal_status = (
            "completed" if done_count == len(plan_steps) else "awaiting_continue"
        )
    else:
        terminal_status = "completed"

    def _budget_exhausted_result() -> dict[str, Any]:
        # Shared fallback when the LLM proxy rejects the call with a 402.
        # Previously duplicated verbatim in both invocation branches.
        note = "Task completed (budget exhausted before final summary)."
        return {
            "messages": [AIMessage(content=note)],
            "final_answer": note,
            "plan_status": terminal_status,
            "done": True,
            "_budget_summary": budget.summary(),
        }

    # Filter out internal dedup sentinel from step_results so it never
    # reaches the reporter prompt or the final answer.
    step_results = [r for r in step_results if _DEDUP_SENTINEL not in r]

    # Always run LLM to produce a user-facing summary.
    # Previous code had a shortcut for single-step plans that passed through
    # the last message directly, but this leaked reflector reasoning text.
    if not step_results and not state.get("messages"):
        return {"final_answer": "No response generated.", "plan_status": terminal_status}

    plan_text = "\n".join(f"{i+1}. {s}" for i, s in enumerate(plan))
    results_text = "\n".join(
        f"Step {i+1}: {r}" for i, r in enumerate(step_results)
    )

    # Build a per-step status summary from plan_steps for the prompt.
    step_status_lines = []
    has_partial = False
    for rpt_ps in plan_steps:
        idx = rpt_ps.get("index", 0)
        status = rpt_ps.get("status", "unknown").upper()
        if status == "PARTIAL":
            has_partial = True
        desc = rpt_ps.get("description", "")[:80]
        result = rpt_ps.get("result_summary", "")[:100]
        line = f"{idx+1}. [{status}] {desc}"
        if result and status in ("FAILED", "PARTIAL"):
            line += f" — {result}"
        step_status_lines.append(line)
    step_status_text = "\n".join(step_status_lines) if step_status_lines else "No step status available."

    # Add context when the agent hit its step limit so the LLM does not
    # dismiss partial progress.
    done_count = sum(1 for s in plan_steps if s.get("status") == "done")
    limit_note = ""
    if has_partial:
        limit_note = (
            f"NOTE: The agent reached its step limit after {done_count} completed steps. "
            "Summarize ALL results obtained so far — do not dismiss the work done."
        )

    system_content = _safe_format(
        _REPORTER_SYSTEM,
        plan_text=plan_text,
        step_status_text=step_status_text,
        results_text=results_text,
        limit_note=limit_note,
    )
    # Filter dedup sentinel messages from conversation history passed to the
    # reporter LLM so it cannot echo them in the final answer.
    filtered_msgs = [
        m for m in state["messages"]
        if _DEDUP_SENTINEL not in str(getattr(m, "content", ""))
    ]
    reporter_messages = [SystemMessage(content=system_content)] + filtered_msgs

    # Use invoke_with_tool_loop when llm_reason is available (thinking mode),
    # otherwise fall back to a single invoke_llm call.
    sub_events: list[dict[str, Any]] = []
    if llm_reason is not None:
        from sandbox_agent.context_builders import invoke_with_tool_loop

        try:
            response, capture, sub_events = await invoke_with_tool_loop(
                llm, llm_reason, reporter_messages,
                node="reporter", session_id=state.get("context_id", ""),
                workspace_path=state.get("workspace_path", "/workspace"),
                thinking_budget=2,
                max_parallel_tool_calls=3,
                max_cycles=3,
                tools=tools,
            )
        except Exception as exc:
            if _is_budget_exceeded_error(exc):
                logger.warning("Budget exceeded in reporter (402 from proxy): %s", exc,
                               extra={"session_id": state.get("context_id", ""), "node": "reporter"})
                return _budget_exhausted_result()
            raise
    else:
        from sandbox_agent.context_builders import invoke_llm

        try:
            response, capture = await invoke_llm(
                llm, reporter_messages,
                node="reporter", session_id=state.get("context_id", ""),
                workspace_path=state.get("workspace_path", "/workspace"),
            )
        except Exception as exc:
            if _is_budget_exceeded_error(exc):
                logger.warning("Budget exceeded in reporter (402 from proxy): %s", exc,
                               extra={"session_id": state.get("context_id", ""), "node": "reporter"})
                return _budget_exhausted_result()
            raise

    budget.add_tokens(capture.prompt_tokens + capture.completion_tokens)

    # Handle respond_to_user escape tool (Llama 4 Scout always calls tools)
    escaped = _intercept_respond_to_user(response, "Reporter")
    if escaped is not None:
        response = escaped
    elif getattr(response, 'tool_calls', None):
        # Response has real tool calls — return to graph for tool execution
        return {
            "messages": [response],
            **capture.token_fields(),
            "_budget_summary": budget.summary(),
            **capture.debug_fields(),
        }

    # Flatten content-block lists (tool-calling models) into plain text.
    content = response.content
    if isinstance(content, list):
        text = " ".join(
            b.get("text", "") for b in content
            if isinstance(b, dict) and b.get("type") == "text"
        )
    else:
        text = str(content)

    # Extract files touched from tool call history
    files_touched: list[str] = []
    for msg in state.get("messages", []):
        for tc in getattr(msg, "tool_calls", []) or []:
            name = tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?")
            args = tc.get("args", {}) if isinstance(tc, dict) else getattr(tc, "args", {})
            if name in ("file_write", "file_read"):
                path = args.get("path", "")
                if path and path not in files_touched:
                    files_touched.append(path)
            elif name == "shell":
                cmd = args.get("command", "")
                # Extract file paths from common shell redirection patterns
                # (uses the module-level ``re`` import; previously re-imported
                # inside this loop on every matching message).
                for match in re.findall(r'(?:>|>>|tee)\s+(\S+)', cmd):
                    if match not in files_touched:
                        files_touched.append(match)

    logger.info("Reporter: plan_status=%s (done=%d, failed=%d, total=%d)",
                terminal_status,
                sum(1 for s in plan_steps if s.get("status") == "done"),
                sum(1 for s in plan_steps if s.get("status") == "failed"),
                len(plan_steps),
                extra={"session_id": state.get("context_id", ""), "node": "reporter"})

    result: dict[str, Any] = {
        "messages": [response],
        "final_answer": text,
        "plan_status": terminal_status,
        "files_touched": files_touched[:30],  # cap at 30 files
        **capture.token_fields(),
        "_budget_summary": budget.summary(),
        **capture.debug_fields(),
    }
    if sub_events:
        result["_sub_events"] = sub_events
    if store:
        result["_plan_store"] = store
    return result
# ---------------------------------------------------------------------------
# Routing function for reflector conditional edges
# ---------------------------------------------------------------------------


def route_reflector(state: dict[str, Any]) -> str:
    """Route from the reflector to the next graph node.

    Mapping of reflector outcomes to edges:

    ``done``     → reporter (final answer)
    ``replan``   → planner (create new plan, preserving done steps)
    ``retry``    → step_selector (re-run current step, different approach)
    ``continue`` → step_selector (advance to the next step)
    """
    if state.get("done", False):
        return "done"
    # Look at the reflector's most recent decision to tell continue vs
    # replan vs retry apart.
    latest = (state.get("recent_decisions") or ["continue"])[-1]
    # "retry" and "continue" both go to the step selector — retry keeps
    # current_step the same, continue advances it.
    return "replan" if latest == "replan" else "execute"


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _coerce_text(content: str | list) -> str:
    """Flatten LLM output into plain text.

    Tool-calling models return lists of content blocks; their ``text``
    blocks are joined with spaces. Anything else is stringified as-is.
    """
    if isinstance(content, list):
        return " ".join(
            block.get("text", "")
            for block in content
            if isinstance(block, dict) and block.get("type") == "text"
        )
    return str(content)


def _parse_plan(content: str | list) -> list[str]:
    """Extract numbered steps from LLM output.

    Accepts both plain strings and content-block lists (tool-calling models).
    Returns a list of step descriptions; if no numbered lines parse, the
    whole response (truncated to 500 chars) becomes a single step.
    """
    text = _coerce_text(content)

    steps: list[str] = []
    for raw_line in text.strip().splitlines():
        line = raw_line.strip()
        # Only lines shaped like "1. Do X" / "2) Do Y": a digit first and a
        # '.' or ')' delimiter within the first four characters.
        if not line or len(line) <= 2 or not line[0].isdigit():
            continue
        for pos in range(min(4, len(line))):
            if line[pos] in ".)":
                step = line[pos + 1:].strip()
                if step:
                    steps.append(step)
                break

    return steps or [text.strip()[:500]]


def _parse_decision(content: str | list) -> str:
    """Extract the reflector decision from LLM output.

    Returns one of: ``continue``, ``retry``, ``replan``, ``done``, ``hitl``.
    Defaults to ``continue`` if the output is ambiguous.
    """
    lowered = _coerce_text(content).strip().lower()
    # First keyword found wins, in this fixed priority order.
    return next(
        (word for word in ("done", "retry", "replan", "hitl", "continue")
         if word in lowered),
        "continue",
    )


_BARE_DECISION_RE = re.compile(r'^(continue|retry|replan|done|hitl)\s*$', re.IGNORECASE)
+ +The Landlock restrictions are: +- rw_paths: workspace directory + session-specific /tmp +- ro_paths: system directories needed for basic command execution + +There is NO fallback. If Landlock fails, the subprocess fails. +""" + +from __future__ import annotations + +import asyncio +import hashlib +import logging +import os +import sys +import textwrap +from pathlib import Path + +logger = logging.getLogger(__name__) + +# Maximum output size to capture (prevent OOM on runaway commands) +_MAX_OUTPUT_BYTES = 10 * 1024 * 1024 # 10 MB + + +async def sandboxed_subprocess( + command: str, + workspace_path: str, + timeout: float = 120.0, + env: dict[str, str] | None = None, +) -> tuple[int, str, str]: + """Execute a command inside a Landlock-restricted subprocess. + + Forks a child process that: + 1. Applies Landlock restricting filesystem access to workspace + system dirs + 2. Executes the command via shell + + Parameters + ---------- + command: + Shell command string to execute. + workspace_path: + Absolute path to the session workspace (read-write). + timeout: + Maximum execution time in seconds. + env: + Optional extra environment variables for the child. + + Returns + ------- + tuple[int, str, str] + (returncode, stdout, stderr) + + Raises + ------ + OSError + If Landlock application fails in the child (propagated via non-zero exit). 
+ """ + # Create session-specific tmp directory + # Use a hash of workspace_path to create a unique tmp dir + ws_hash = hashlib.sha256(workspace_path.encode()).hexdigest()[:12] + session_tmp = f"/tmp/sandbox_{ws_hash}" + Path(session_tmp).mkdir(parents=True, exist_ok=True) + + # Build the child script that applies Landlock then execs the command + # The child script is passed via -c to the Python interpreter + child_script = textwrap.dedent("""\ + import os + import subprocess + import sys + + # Import the landlock module from the package + sys.path.insert(0, os.environ["_LANDLOCK_PYTHONPATH"]) + from sandbox_agent.landlock_ctypes import apply_landlock + + workspace = os.environ["SANDBOX_WORKSPACE"] + session_tmp = os.environ["SANDBOX_TMP"] + + # Collect read-only system paths that exist + ro_paths = [] + for p in ["/usr", "/bin", "/lib", "/lib64", "/opt", "/etc", + "/proc", "/dev/null", "/dev/urandom", "/app"]: + if os.path.exists(p): + ro_paths.append(p) + + # Add Python prefix for stdlib access + prefix = sys.prefix + if os.path.exists(prefix) and prefix not in ro_paths: + ro_paths.append(prefix) + + # Apply Landlock -- NO try/except, hard fail if this fails + apply_landlock( + rw_paths=[workspace, session_tmp], + ro_paths=ro_paths, + ) + + # Execute the user command + result = subprocess.run( + os.environ["_LANDLOCK_COMMAND"], + shell=True, + cwd=workspace, + capture_output=True, + timeout=float(os.environ.get("_LANDLOCK_TIMEOUT", "120")), + ) + + # Write stdout and stderr to fds 1 and 2 + sys.stdout.buffer.write(result.stdout) + sys.stderr.buffer.write(result.stderr) + sys.exit(result.returncode) + """) + + # Build environment for the child process + child_env = dict(os.environ) + if env: + child_env.update(env) + + # Find package source directory for PYTHONPATH + package_src = str(Path(__file__).resolve().parent.parent) + + child_env["SANDBOX_WORKSPACE"] = workspace_path + child_env["SANDBOX_TMP"] = session_tmp + child_env["_LANDLOCK_PYTHONPATH"] = 
package_src + child_env["_LANDLOCK_COMMAND"] = command + child_env["_LANDLOCK_TIMEOUT"] = str(timeout) + + try: + process = await asyncio.create_subprocess_exec( + sys.executable, "-c", child_script, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + env=child_env, + cwd=workspace_path, + ) + + try: + stdout_bytes, stderr_bytes = await asyncio.wait_for( + process.communicate(), + timeout=timeout + 5, # extra margin for Landlock setup + ) + except asyncio.TimeoutError: + try: + process.kill() + except ProcessLookupError: + pass + await process.wait() + return ( + -1, + "", + f"Sandboxed command timed out after {timeout} seconds: '{command}'", + ) + + stdout = (stdout_bytes or b"")[:_MAX_OUTPUT_BYTES].decode("utf-8", errors="replace") + stderr = (stderr_bytes or b"")[:_MAX_OUTPUT_BYTES].decode("utf-8", errors="replace") + returncode = process.returncode if process.returncode is not None else -1 + + return (returncode, stdout, stderr) + + except OSError as exc: + return (-1, "", f"Failed to start sandboxed subprocess: {exc}") From 5c0ff33ef87b1d175f669eade2c2862fc2d9ba1f Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:49:10 +0100 Subject: [PATCH 19/26] feat(sandbox): sources.json capability loader for package managers, registries, and runtime limits Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/sources.py | 129 ++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/sources.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/sources.py b/a2a/sandbox_agent/src/sandbox_agent/sources.py new file mode 100644 index 00000000..bd2bf68f --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/sources.py @@ -0,0 +1,129 @@ +"""Capability loader for sources.json. + +sources.json is baked into the agent container image and declares what +resources exist on the image: package managers, registries, git remotes, +web domains, and runtime limits. 
"""Capability loader for sources.json.

sources.json is baked into the agent container image and declares what
resources exist on the image: package managers, registries, git remotes,
web domains, and runtime limits. The sandbox executor uses it alongside
settings.json -- settings.json controls what operations are *allowed*,
sources.json controls what resources are *available*.
"""

import json
from dataclasses import dataclass, field
from fnmatch import fnmatch
from pathlib import Path
from typing import Any


_DEFAULT_MAX_EXECUTION_TIME_SECONDS = 300
_DEFAULT_MAX_MEMORY_MB = 2048


@dataclass(frozen=True)
class SourcesConfig:
    """Structured, read-only view over a parsed ``sources.json``."""

    # Raw parsed JSON; queried lazily by the accessor methods below.
    _data: dict[str, Any] = field(default_factory=dict, repr=False)

    # ------------------------------------------------------------------
    # Construction helpers
    # ------------------------------------------------------------------

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SourcesConfig":
        """Create a *SourcesConfig* from a parsed JSON dictionary."""
        return cls(_data=data)

    @classmethod
    def from_file(cls, path: Path) -> "SourcesConfig":
        """Load a *SourcesConfig* from a ``sources.json`` file on disk."""
        with open(path, encoding="utf-8") as handle:
            return cls.from_dict(json.load(handle))

    # ------------------------------------------------------------------
    # Package-manager queries
    # ------------------------------------------------------------------

    def _manager_entry(self, name: str) -> "dict[str, Any] | None":
        # Lookup helper shared by the package-manager predicates.
        return self._data.get("package_managers", {}).get(name)

    def is_package_manager_enabled(self, name: str) -> bool:
        """Return *True* if the named package manager is enabled."""
        entry = self._manager_entry(name)
        return entry is not None and bool(entry.get("enabled", False))

    def is_package_blocked(self, manager: str, package: str) -> bool:
        """Return *True* if *package* is on the block-list for *manager*."""
        entry = self._manager_entry(manager)
        if entry is None:
            return False
        return package in entry.get("blocked_packages", [])

    # ------------------------------------------------------------------
    # Git-remote queries
    # ------------------------------------------------------------------

    def is_git_remote_allowed(self, url: str) -> bool:
        """Return *True* if *url* matches one of the ``allowed_remotes`` patterns.

        Pattern matching uses :func:`fnmatch.fnmatch`. If git access is
        disabled in the config the method always returns *False*.
        """
        git_cfg: dict[str, Any] = self._data.get("git", {})
        if not git_cfg.get("enabled", False):
            return False
        for pattern in git_cfg.get("allowed_remotes", []):
            if fnmatch(url, pattern):
                return True
        return False

    # ------------------------------------------------------------------
    # Web-access queries
    # ------------------------------------------------------------------

    def is_web_access_enabled(self) -> bool:
        """Return *True* if web access is enabled."""
        web_cfg = self._data.get("web_access", {})
        return bool(web_cfg.get("enabled", False))

    def is_domain_allowed(self, domain: str) -> bool:
        """Return *True* if *domain* matches the allowed_domains list.

        Uses :func:`fnmatch.fnmatch` for pattern matching (e.g. ``*.github.com``).
        Returns *False* if web access is disabled; a blocked-domain match
        always wins over an allowed-domain match.
        """
        web_cfg: dict[str, Any] = self._data.get("web_access", {})
        if not web_cfg.get("enabled", False):
            return False
        # Deny-list takes precedence over the allow-list.
        if any(fnmatch(domain, pat) for pat in web_cfg.get("blocked_domains", [])):
            return False
        return any(fnmatch(domain, pat) for pat in web_cfg.get("allowed_domains", []))

    # ------------------------------------------------------------------
    # Runtime-limit properties
    # ------------------------------------------------------------------

    @property
    def max_execution_time_seconds(self) -> int:
        """Maximum execution time for a single run, in seconds."""
        limits: dict[str, Any] = self._data.get("runtime", {})
        return int(limits.get("max_execution_time_seconds",
                              _DEFAULT_MAX_EXECUTION_TIME_SECONDS))

    @property
    def max_memory_mb(self) -> int:
        """Maximum memory for a single run, in megabytes."""
        limits: dict[str, Any] = self._data.get("runtime", {})
        return int(limits.get("max_memory_mb", _DEFAULT_MAX_MEMORY_MB))
**delegate**: Multi-mode delegation with 4 strategies: + - in-process: LangGraph subgraph, shared filesystem (fast) + - shared-pvc: Separate pod with parent's PVC mounted + - isolated: Separate pod via SandboxClaim (full isolation) + - sidecar: New container in parent pod + + The LLM auto-selects the best mode, or the caller can specify. +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +import subprocess +import uuid +from pathlib import Path +from typing import Any, Optional + +import asyncpg +from langchain_core.messages import HumanMessage, SystemMessage +from langchain_core.tools import tool +from langgraph.graph import MessagesState, StateGraph +from langgraph.prebuilt import ToolNode, tools_condition + +logger = logging.getLogger(__name__) + +# Maximum iterations for in-process sub-agents +_MAX_SUB_AGENT_ITERATIONS = 15 + +# Delegation mode configuration +_DELEGATION_MODES = os.environ.get( + "DELEGATION_MODES", "in-process,shared-pvc,isolated,sidecar" +).split(",") +_DEFAULT_MODE = os.environ.get("DEFAULT_DELEGATION_MODE", "in-process") + +# Maximum iterations for in-process sub-agents to prevent runaway loops. +_MAX_SUB_AGENT_ITERATIONS = 15 + + +# --------------------------------------------------------------------------- +# In-process sub-agent: explore (C20, mode 1) +# --------------------------------------------------------------------------- + + +def _make_explore_tools(workspace: str) -> list[Any]: + """Build a read-only tool set for the explore sub-agent.""" + ws_root = Path(workspace).resolve() + + @tool + async def grep(pattern: str, path: str = ".") -> str: + """Search for a regex pattern in files under the workspace. + + Args: + pattern: Regex pattern to search for. + path: Relative path to search in (default: workspace root). + + Returns: + Matching lines with file paths and line numbers. 
+ """ + target = (ws_root / path).resolve() + if not target.is_relative_to(ws_root): + return "Error: path resolves outside the workspace." + + try: + result = subprocess.run( + ["grep", "-rn", "--include=*.py", "--include=*.md", + "--include=*.yaml", "--include=*.yml", "--include=*.json", + "--include=*.txt", "--include=*.sh", "--include=*.go", + pattern, str(target)], + capture_output=True, text=True, timeout=30, + cwd=str(ws_root), + ) + output = result.stdout[:10000] + if not output: + return f"No matches found for pattern '{pattern}'" + return output + except subprocess.TimeoutExpired: + return "Search timed out after 30 seconds." + except FileNotFoundError: + return "grep command not available." + + @tool + async def read_file(path: str) -> str: + """Read a file from the workspace (read-only). + + Args: + path: Relative path within the workspace. + + Returns: + File contents (truncated to 20000 chars). + """ + resolved = (ws_root / path).resolve() + if not str(resolved).startswith(str(ws_root)): + return "Error: path resolves outside the workspace." + if not resolved.is_file(): + return f"Error: file not found at '{path}'." + try: + content = resolved.read_text(encoding="utf-8", errors="replace") + if len(content) > 20000: + content = content[:20000] + "\n\n[Truncated at 20000 chars]" + return content + except OSError as exc: + return f"Error reading file: {exc}" + + @tool + async def list_files(path: str = ".", pattern: str = "*") -> str: + """List files matching a glob pattern in the workspace. + + Args: + path: Relative directory to search in (default: workspace root). + pattern: Glob pattern (default: all files). + + Returns: + Newline-separated list of matching file paths. + """ + target = (ws_root / path).resolve() + if not target.is_relative_to(ws_root): + return "Error: path resolves outside the workspace." + if not target.is_dir(): + return f"Error: directory not found at '{path}'." 
def create_explore_graph(workspace: str, llm: Any) -> Any:
    """Build the read-only explore sub-graph.

    The sub-graph can only call grep, read_file, and list_files, and is
    bounded to ``_MAX_SUB_AGENT_ITERATIONS`` steps.
    """
    explore_tools = _make_explore_tools(workspace)
    bound_llm = llm.bind_tools(explore_tools)

    async def assistant(state: MessagesState) -> dict[str, Any]:
        # Imported lazily (presumably to avoid an import cycle at module
        # load time — confirm against sandbox_agent.reasoning).
        from sandbox_agent.reasoning import maybe_patch_tool_calls

        prompt = SystemMessage(
            content=(
                "You are a codebase research assistant. Your job is to find "
                "specific information in the workspace using grep, read_file, "
                "and list_files. Be concise. Return a focused summary of what "
                "you found. Do NOT modify any files."
            )
        )
        reply = await bound_llm.ainvoke([prompt] + state["messages"])
        return {"messages": [maybe_patch_tool_calls(reply)]}

    builder = StateGraph(MessagesState)
    builder.add_node("assistant", assistant)
    builder.add_node("tools", ToolNode(explore_tools))
    builder.set_entry_point("assistant")
    builder.add_conditional_edges("assistant", tools_condition)
    builder.add_edge("tools", "assistant")
    return builder.compile()
+ + Returns: + A summary of findings from the explore sub-agent. + """ + sub_graph = create_explore_graph(workspace, llm) + try: + result = await asyncio.wait_for( + sub_graph.ainvoke( + {"messages": [HumanMessage(content=query)]}, + config={"recursion_limit": _MAX_SUB_AGENT_ITERATIONS}, + ), + timeout=120, + ) + messages = result.get("messages", []) + if messages: + last = messages[-1] + return last.content if hasattr(last, "content") else str(last) + return "No results from explore sub-agent." + except asyncio.TimeoutError: + return "Explore sub-agent timed out after 120 seconds." + except Exception as exc: + return f"Explore sub-agent error: {exc}" + + return explore + + +# --------------------------------------------------------------------------- +# Child session database helpers +# --------------------------------------------------------------------------- + + +async def _register_child_session( + child_context_id: str, + parent_context_id: str, + agent_name: str, + task: str, +) -> None: + """Register a child session in the tasks database so it appears in the sidebar.""" + db_url = os.environ.get("TASK_STORE_DB_URL", "") + if not db_url: + return + # Convert async SQLAlchemy URL to asyncpg format + pg_url = db_url.replace("postgresql+asyncpg://", "postgresql://") + try: + conn = await asyncpg.connect(pg_url) + # Check if context already exists + existing = await conn.fetchval( + "SELECT COUNT(*) FROM tasks WHERE context_id = $1", child_context_id + ) + if existing == 0: + metadata = json.dumps({ + "agent_name": agent_name, + "parent_context_id": parent_context_id, + "title": task[:80], + }) + status = json.dumps({"state": "working"}) + await conn.execute( + "INSERT INTO tasks (id, context_id, status, metadata, history, artifacts) " + "VALUES ($1, $2, $3::jsonb, $4::jsonb, '[]'::jsonb, '[]'::jsonb)", + str(uuid.uuid4()), + child_context_id, + status, + metadata, + ) + logger.info( + "Registered child session %s (parent=%s) in tasks DB", + child_context_id, + 
parent_context_id, + ) + await conn.close() + except Exception as e: + logger.warning("Failed to register child session %s: %s", child_context_id, e) + + +async def _complete_child_session(child_context_id: str, result: str) -> None: + """Mark a child session as completed in the database.""" + db_url = os.environ.get("TASK_STORE_DB_URL", "") + if not db_url: + return + pg_url = db_url.replace("postgresql+asyncpg://", "postgresql://") + try: + conn = await asyncpg.connect(pg_url) + status = json.dumps({"state": "completed"}) + # Store result as an artifact + artifacts = json.dumps([{"parts": [{"kind": "text", "text": result[:5000]}]}]) + await conn.execute( + "UPDATE tasks SET status = $1::jsonb, artifacts = $2::jsonb WHERE context_id = $3", + status, + artifacts, + child_context_id, + ) + logger.info("Marked child session %s as completed", child_context_id) + await conn.close() + except Exception as e: + logger.warning("Failed to complete child session %s: %s", child_context_id, e) + + +# --------------------------------------------------------------------------- +# Multi-mode delegation (Session E) +# --------------------------------------------------------------------------- + + +_SUBAGENT_EXCLUDED_TOOLS = {"delegate", "explore"} + + +async def _run_in_process( + task: str, + workspace: str, + llm: Any, + child_context_id: str, + tools_list: list[Any] | None = None, + timeout: int = 120, +) -> str: + """Execute a task as an in-process LangGraph subgraph.""" + if tools_list is None: + tools_list = _make_explore_tools(workspace) + else: + # Exclude delegate/explore tools to prevent recursive sub-agent spawning. + tools_list = [t for t in tools_list if getattr(t, "name", "") not in _SUBAGENT_EXCLUDED_TOOLS] + + llm_with_tools = llm.bind_tools(tools_list) + + async def assistant(state: MessagesState) -> dict[str, Any]: + from sandbox_agent.reasoning import maybe_patch_tool_calls + system = SystemMessage( + content=( + "You are a sub-agent working on a delegated task. 
Complete the task " + "efficiently using the available tools. Return a clear summary of " + "what you did and the results." + ) + ) + messages = [system] + state["messages"] + response = await llm_with_tools.ainvoke(messages) + return {"messages": [maybe_patch_tool_calls(response)]} + + graph = StateGraph(MessagesState) + graph.add_node("assistant", assistant) + graph.add_node("tools", ToolNode(tools_list)) + graph.set_entry_point("assistant") + graph.add_conditional_edges("assistant", tools_condition) + graph.add_edge("tools", "assistant") + sub_graph = graph.compile() + + try: + result = await asyncio.wait_for( + sub_graph.ainvoke( + {"messages": [HumanMessage(content=task)]}, + config={ + "recursion_limit": _MAX_SUB_AGENT_ITERATIONS, + "configurable": {"thread_id": child_context_id}, + }, + ), + timeout=timeout, + ) + messages = result.get("messages", []) + if messages: + last = messages[-1] + return last.content if hasattr(last, "content") else str(last) + return "No results from in-process sub-agent." + except asyncio.TimeoutError: + return f"In-process sub-agent timed out after {timeout} seconds." + except Exception as exc: + logger.exception("In-process delegation failed for %s", child_context_id) + return f"In-process sub-agent error: {exc}" + + +async def _run_shared_pvc( + task: str, child_context_id: str, namespace: str = "team1", + variant: str = "sandbox-legion", timeout_minutes: int = 30, +) -> str: + """Spawn a pod that mounts the parent's PVC (placeholder).""" + logger.info("shared-pvc delegation: child=%s task=%s", child_context_id, task) + return ( + f"Shared-PVC delegation requested for '{task}' " + f"(child={child_context_id}, namespace={namespace}). " + "Requires RWX StorageClass. Not yet implemented." 
+ ) + + +async def _run_isolated( + task: str, child_context_id: str, namespace: str = "team1", + variant: str = "sandbox-legion", timeout_minutes: int = 30, +) -> str: + """Spawn an isolated pod via SandboxClaim CRD (placeholder).""" + logger.info("isolated delegation: child=%s task=%s", child_context_id, task) + return ( + f"Isolated delegation requested for '{task}' " + f"(child={child_context_id}, namespace={namespace}). " + "Requires SandboxClaim CRD + controller. Not yet implemented." + ) + + +async def _run_sidecar( + task: str, child_context_id: str, variant: str = "sandbox-legion", +) -> str: + """Inject a sidecar container (placeholder).""" + logger.info("sidecar delegation: child=%s task=%s", child_context_id, task) + return ( + f"Sidecar delegation requested for '{task}' " + f"(child={child_context_id}). Not yet implemented." + ) + + +def make_delegate_tool( + workspace: str, + llm: Any, + parent_context_id: str = "", + tools_list: list[Any] | None = None, + namespace: str = "team1", +) -> Any: + """Return a LangChain tool for multi-mode delegation. + + Args: + workspace: Path to the parent's workspace. + llm: The LLM instance for in-process subgraphs. + parent_context_id: The parent session's context_id. + tools_list: Optional tools for in-process subgraphs. + namespace: Kubernetes namespace for out-of-process modes. + """ + + @tool + async def delegate( + task: str, + mode: str = "auto", + variant: str = "sandbox-legion", + timeout_minutes: int = 30, + ) -> str: + """Delegate a task to a child session. + + Spawns a child agent to work on the task independently. + + Args: + task: Description of the task for the child session. + mode: Delegation mode — "auto" (LLM picks), "in-process", + "shared-pvc", "isolated", or "sidecar". + variant: Agent variant for out-of-process modes. + timeout_minutes: Timeout for the child session. + + Returns: + The child session's result or status message. 
+ """ + child_context_id = f"child-{uuid.uuid4().hex[:12]}" + + selected_mode = mode + if mode == "auto": + # Default all auto-mode to in-process until shared-pvc/isolated + # are implemented. This prevents placeholder responses. + selected_mode = "in-process" + + if selected_mode not in _DELEGATION_MODES: + return f"Mode '{selected_mode}' not enabled. Available: {', '.join(_DELEGATION_MODES)}" + + logger.info("Delegating: child=%s mode=%s parent=%s", child_context_id, selected_mode, parent_context_id) + + # Register the child session in the tasks DB so it appears in the sidebar + await _register_child_session(child_context_id, parent_context_id, variant, task) + + try: + if selected_mode == "in-process": + result = await _run_in_process(task, workspace, llm, child_context_id, tools_list, timeout_minutes * 60) + elif selected_mode == "shared-pvc": + result = await _run_shared_pvc(task, child_context_id, namespace, variant, timeout_minutes) + elif selected_mode == "isolated": + result = await _run_isolated(task, child_context_id, namespace, variant, timeout_minutes) + elif selected_mode == "sidecar": + result = await _run_sidecar(task, child_context_id, variant) + else: + result = f"Unknown mode: {selected_mode}" + except Exception as e: + result = f"Delegation failed: {e}" + + # Mark the child session as completed in the tasks DB + await _complete_child_session(child_context_id, result) + + return result + + return delegate From c7882d0dcc87acbdc43a2ee4a8c8f931b47b0332 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 20:49:22 +0100 Subject: [PATCH 21/26] feat(sandbox): workspace manager for per-context_id directory isolation on shared PVC Signed-off-by: Ladislav Smola --- .../src/sandbox_agent/workspace.py | 189 ++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 a2a/sandbox_agent/src/sandbox_agent/workspace.py diff --git a/a2a/sandbox_agent/src/sandbox_agent/workspace.py b/a2a/sandbox_agent/src/sandbox_agent/workspace.py new 
file mode 100644 index 00000000..e047d7d7 --- /dev/null +++ b/a2a/sandbox_agent/src/sandbox_agent/workspace.py @@ -0,0 +1,189 @@ +"""Workspace manager for per-context_id directory isolation. + +Each A2A context_id gets its own subdirectory under workspace_root +(typically mounted from a shared RWX PVC at /workspace). The manager +creates standardised subdirectories and tracks metadata in .context.json. +""" + +import json +import os +from datetime import datetime, timezone +from pathlib import Path + +WORKSPACE_SUBDIRS = ["scripts", "data", "repos", "output"] + + +class WorkspaceManager: + """Manages per-context workspace directories on shared storage. + + Parameters + ---------- + workspace_root: + Absolute path to the shared workspace mount (e.g. ``/workspace``). + agent_name: + Name of the agent that owns the workspaces. + namespace: + Kubernetes namespace the agent is running in. + ttl_days: + Default time-to-live for workspace directories. + """ + + def __init__( + self, + workspace_root: str, + agent_name: str, + namespace: str = "", + ttl_days: int = 7, + ) -> None: + self.workspace_root = workspace_root + self.agent_name = agent_name + self.namespace = namespace + self.ttl_days = ttl_days + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def get_workspace_path(self, context_id: str) -> str: + """Return the workspace path for *context_id* without creating it.""" + return os.path.join(self.workspace_root, context_id) + + def ensure_workspace(self, context_id: str) -> str: + """Create (or re-use) the workspace for *context_id*. + + On first call the directory tree and ``.context.json`` are created. + On subsequent calls ``last_accessed_at`` in the metadata file is + updated. + + Returns the absolute path to the workspace directory. + + Raises + ------ + ValueError + If *context_id* is empty. 
+ """ + if not context_id: + raise ValueError("context_id must not be empty") + + workspace_path = self.get_workspace_path(context_id) + context_file = Path(workspace_path) / ".context.json" + + # Create the workspace root and subdirs (idempotent via exist_ok). + for subdir in WORKSPACE_SUBDIRS: + os.makedirs(os.path.join(workspace_path, subdir), exist_ok=True) + + now = datetime.now(timezone.utc).isoformat() + + if context_file.exists(): + # Update last_accessed_at, preserve everything else. + data = json.loads(context_file.read_text()) + data["last_accessed_at"] = now + data["disk_usage_bytes"] = self._disk_usage(workspace_path) + context_file.write_text(json.dumps(data, indent=2) + "\n") + else: + # First time -- write fresh metadata. + data = { + "context_id": context_id, + "agent": self.agent_name, + "namespace": self.namespace, + "created_at": now, + "last_accessed_at": now, + "ttl_days": self.ttl_days, + "disk_usage_bytes": 0, + } + context_file.write_text(json.dumps(data, indent=2) + "\n") + + return workspace_path + + def list_contexts(self) -> list[str]: + """Return a list of context_ids that have workspace directories. + + Only directories that contain a ``.context.json`` file are + considered valid contexts. + """ + root = Path(self.workspace_root) + if not root.is_dir(): + return [] + + contexts: list[str] = [] + for entry in root.iterdir(): + if entry.is_dir() and (entry / ".context.json").exists(): + contexts.append(entry.name) + return contexts + + def cleanup_expired(self) -> list[str]: + """Remove workspace directories whose TTL has expired. + + Reads ``created_at`` and ``ttl_days`` from each context's + ``.context.json``. If ``created_at + ttl_days`` is in the past, + the workspace directory is deleted. + + Returns a list of context_ids that were cleaned up. 
+ """ + import shutil + + root = Path(self.workspace_root) + if not root.is_dir(): + return [] + + now = datetime.now(timezone.utc) + cleaned: list[str] = [] + + for entry in root.iterdir(): + # Skip filesystem metadata dirs (ext4 lost+found, etc.) + if entry.name in ("lost+found",): + continue + context_file = entry / ".context.json" + if not entry.is_dir() or not context_file.exists(): + continue + + try: + data = json.loads(context_file.read_text()) + except (json.JSONDecodeError, OSError): + continue + + created_str = data.get("created_at") + ttl = data.get("ttl_days", self.ttl_days) + + if not created_str: + continue + + try: + created_at = datetime.fromisoformat(created_str) + except ValueError: + continue + + from datetime import timedelta + + if now > created_at + timedelta(days=ttl): + try: + shutil.rmtree(entry) + cleaned.append(entry.name) + except OSError: + pass # best-effort cleanup + + return cleaned + + def get_total_disk_usage(self) -> int: + """Return total disk usage in bytes across all workspaces.""" + root = Path(self.workspace_root) + if not root.is_dir(): + return 0 + return self._disk_usage(str(root)) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + @staticmethod + def _disk_usage(path: str) -> int: + """Return total size in bytes of all files under *path*.""" + total = 0 + for dirpath, _dirnames, filenames in os.walk(path): + for fname in filenames: + fpath = os.path.join(dirpath, fname) + try: + total += os.path.getsize(fpath) + except OSError: + pass + return total From b1de4faf4a334495785f63769425319deebaece1 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 22:00:48 +0100 Subject: [PATCH 22/26] =?UTF-8?q?fix:=20resolve=20ruff=20lint=20violations?= =?UTF-8?q?=20=E2=80=94=20import=20ordering,=20unused=20vars,=20formatting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Auto-fixed: 16 import ordering (I001), unnecessary f-strings (F541) Manual: prefix 6 unused variables with underscore (F401) Formatted: 16 files with ruff format Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/agent.py | 125 ++-- a2a/sandbox_agent/src/sandbox_agent/budget.py | 6 +- .../src/sandbox_agent/context_builders.py | 200 +++--- .../src/sandbox_agent/event_serializer.py | 193 +++--- .../src/sandbox_agent/executor.py | 11 +- a2a/sandbox_agent/src/sandbox_agent/graph.py | 110 ++-- .../src/sandbox_agent/graph_card.py | 59 +- .../src/sandbox_agent/landlock_ctypes.py | 26 +- .../src/sandbox_agent/landlock_probe.py | 4 +- .../src/sandbox_agent/observability.py | 90 +-- .../src/sandbox_agent/permissions.py | 14 +- .../src/sandbox_agent/plan_store.py | 49 +- .../src/sandbox_agent/reasoning.py | 582 +++++++++++------- .../src/sandbox_agent/sandbox_subprocess.py | 4 +- .../src/sandbox_agent/sources.py | 7 +- .../src/sandbox_agent/subagents.py | 71 ++- 16 files changed, 876 insertions(+), 675 deletions(-) diff --git a/a2a/sandbox_agent/src/sandbox_agent/agent.py b/a2a/sandbox_agent/src/sandbox_agent/agent.py index 70e67ba7..d75b29f6 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/agent.py +++ b/a2a/sandbox_agent/src/sandbox_agent/agent.py @@ -38,9 +38,8 @@ ) from a2a.utils import new_agent_text_message, new_task from langchain_core.messages import HumanMessage -from starlette.routing import Route - from langgraph.checkpoint.memory import MemorySaver +from starlette.routing import Route from sandbox_agent.budget import AgentBudget from sandbox_agent.configuration import Configuration @@ -157,7 +156,9 @@ def _tofu_verify(root: Path) -> None: "TOFU: workspace file integrity mismatch! " "changed=%s, added=%s, removed=%s. " "This may indicate tampering. Updating stored hashes.", - changed, added, removed, + changed, + added, + removed, ) # Update stored hashes (trust the new state). 
with open(hash_file, "w", encoding="utf-8") as fh: @@ -353,13 +354,13 @@ async def _ensure_checkpointer(self) -> None: if self._checkpointer_initialized and self._checkpointer: try: # Lightweight health check — attempt a simple query - pool = getattr(self._checkpointer, 'conn', None) or getattr(self._checkpointer, '_conn', None) - if pool and hasattr(pool, 'execute'): + pool = getattr(self._checkpointer, "conn", None) or getattr(self._checkpointer, "_conn", None) + if pool and hasattr(pool, "execute"): await pool.execute("SELECT 1") except Exception: logger.warning("PostgreSQL checkpointer connection stale — re-initializing") # Close old connection - if hasattr(self, '_checkpointer_cm') and self._checkpointer_cm: + if hasattr(self, "_checkpointer_cm") and self._checkpointer_cm: try: await self._checkpointer_cm.__aexit__(None, None, None) except Exception: @@ -379,9 +380,7 @@ async def _ensure_checkpointer(self) -> None: # ------------------------------------------------------------------ - async def execute( - self, context: RequestContext, event_queue: EventQueue - ) -> None: + async def execute(self, context: RequestContext, event_queue: EventQueue) -> None: """Execute a user request through the LangGraph sandbox graph. Steps: @@ -489,9 +488,7 @@ async def _run_graph() -> None: max_retries = 3 for attempt in range(max_retries + 1): try: - async for ev in graph.astream( - input_state, config=graph_config, stream_mode="updates" - ): + async for ev in graph.astream(input_state, config=graph_config, stream_mode="updates"): await event_queue.put(ev) break # success except Exception as retry_err: @@ -501,14 +498,14 @@ async def _run_graph() -> None: is_db_stale = "connection is closed" in err_str or "operationalerror" in err_str if is_quota: logger.error("LLM quota exceeded: %s", retry_err) - await event_queue.put( - {"_error": "LLM API quota exceeded. Check billing."} - ) + await event_queue.put({"_error": "LLM API quota exceeded. 
Check billing."}) break elif is_db_stale and attempt < max_retries: logger.warning( "DB connection stale (%d/%d), re-initializing checkpointer: %s", - attempt + 1, max_retries, retry_err, + attempt + 1, + max_retries, + retry_err, ) await self._ensure_checkpointer() # Rebuild graph with fresh checkpointer @@ -525,7 +522,10 @@ async def _run_graph() -> None: delay = 2 ** (attempt + 1) logger.warning( "Rate limited (%d/%d), retrying in %ds: %s", - attempt + 1, max_retries, delay, retry_err, + attempt + 1, + max_retries, + delay, + retry_err, ) await asyncio.sleep(delay) continue @@ -572,7 +572,9 @@ async def _run_graph() -> None: node_names = list(event.keys()) logger.info( "Graph event %d: nodes=%s (context=%s)", - event_count, node_names, context_id, + event_count, + node_names, + context_id, ) # Skip __interrupt__ events (HITL pause) — these contain @@ -580,16 +582,19 @@ async def _run_graph() -> None: if "__interrupt__" in event: logger.info( "Graph interrupted (HITL) at event %d: %s", - event_count, event.get("__interrupt__"), + event_count, + event.get("__interrupt__"), ) # Emit a structured HITL event for the frontend hitl_data = event.get("__interrupt__", ()) hitl_msg = str(hitl_data[0]) if hitl_data else "Approval required" - hitl_json = json.dumps({ - "type": "hitl_request", - "loop_id": serializer._loop_id, - "message": hitl_msg[:500], - }) + hitl_json = json.dumps( + { + "type": "hitl_request", + "loop_id": serializer._loop_id, + "message": hitl_msg[:500], + } + ) await task_updater.update_status( TaskState.working, new_agent_text_message( @@ -602,11 +607,14 @@ async def _run_graph() -> None: # Send intermediate status updates as structured JSON try: - serialized_lines = "\n".join( - serializer.serialize(key, value) - for key, value in event.items() - if isinstance(value, dict) - ) + "\n" + serialized_lines = ( + "\n".join( + serializer.serialize(key, value) + for key, value in event.items() + if isinstance(value, dict) + ) + + "\n" + ) await 
task_updater.update_status( TaskState.working, new_agent_text_message( @@ -624,19 +632,20 @@ async def _run_graph() -> None: line_types.append(lt) except json.JSONDecodeError: line_types.append("parse_error") - logger.info("A2A_EMIT session=%s lines=%d types=%s", - context_id, len(line_types), line_types) + logger.info("A2A_EMIT session=%s lines=%d types=%s", context_id, len(line_types), line_types) except asyncio.CancelledError: logger.warning( "SSE update cancelled at event %d (context=%s) — client disconnected", - event_count, context_id, + event_count, + context_id, ) client_disconnected = True break except Exception as update_err: logger.error( "Failed to send SSE update for event %d: %s", - event_count, update_err, + event_count, + update_err, ) output = event @@ -676,7 +685,9 @@ async def _run_graph() -> None: if bg_event_count > 0: logger.info( "Drained %d background events for context=%s, serialized %d lines", - bg_event_count, context_id, len(bg_serialized_lines), + bg_event_count, + context_id, + len(bg_serialized_lines), ) # Persist via task_updater so the events appear in history for line_block in bg_serialized_lines: @@ -711,11 +722,14 @@ async def _run_graph() -> None: if msgs: content = getattr(msgs[-1], "content", None) if isinstance(content, list): - final_answer = "\n".join( - block.get("text", "") if isinstance(block, dict) else str(block) - for block in content - if isinstance(block, dict) and block.get("type") == "text" - ) or None + final_answer = ( + "\n".join( + block.get("text", "") if isinstance(block, dict) else str(block) + for block in content + if isinstance(block, dict) and block.get("type") == "text" + ) + or None + ) elif content: final_answer = str(content) if final_answer: @@ -729,12 +743,15 @@ async def _run_graph() -> None: try: existing_meta = {} if task.metadata: - existing_meta = dict(task.metadata) if not isinstance(task.metadata, dict) else task.metadata + existing_meta = ( + dict(task.metadata) if not 
isinstance(task.metadata, dict) else task.metadata + ) existing_meta["llm_request_ids"] = llm_request_ids task.metadata = existing_meta logger.info( "Stored %d LLM request_ids in task metadata for context_id=%s", - len(llm_request_ids), context_id, + len(llm_request_ids), + context_id, ) except Exception as meta_err: logger.warning("Failed to store llm_request_ids: %s", meta_err) @@ -781,9 +798,7 @@ async def _run_graph() -> None: # ------------------------------------------------------------------ - async def cancel( - self, context: RequestContext, event_queue: EventQueue - ) -> None: + async def cancel(self, context: RequestContext, event_queue: EventQueue) -> None: """Cancel is not supported.""" raise Exception("cancel not supported") @@ -804,9 +819,15 @@ class _MergingDatabaseTaskStore(DatabaseTaskStore): backend-managed keys so they survive A2A SDK updates. """ - _BACKEND_KEYS = frozenset({ - "owner", "visibility", "title", "agent_name", "loop_events", - }) + _BACKEND_KEYS = frozenset( + { + "owner", + "visibility", + "title", + "agent_name", + "loop_events", + } + ) async def save(self, task, context=None): """Save task while preserving backend-managed metadata fields.""" @@ -816,6 +837,7 @@ async def save(self, task, context=None): existing_meta = {} async with self.async_session_maker() as session: from sqlalchemy import select + stmt = select(self.task_model).where(self.task_model.id == task.id) result = await session.execute(stmt) existing = result.scalar_one_or_none() @@ -825,6 +847,7 @@ async def save(self, task, context=None): existing_meta = raw elif isinstance(raw, str): import json + try: existing_meta = json.loads(raw) except (json.JSONDecodeError, TypeError): @@ -843,8 +866,7 @@ async def save(self, task, context=None): db_task = self._to_orm(task) async with self.async_session_maker.begin() as session: await session.merge(db_task) - logger.debug("Task %s saved with merged metadata (keys=%s)", - task.id, list(merged.keys()) if merged else []) + 
logger.debug("Task %s saved with merged metadata (keys=%s)", task.id, list(merged.keys()) if merged else []) def _create_task_store(): @@ -1020,6 +1042,7 @@ async def _handle_graph_card(request: Any) -> Any: # noqa: ARG001 # Build a graph for introspection only (no checkpointer, dummy config) from sandbox_agent.permissions import PermissionChecker from sandbox_agent.sources import SourcesConfig + pc = PermissionChecker(settings={"workspace": "/workspace", "permissions": {}}) sc = SourcesConfig() compiled = build_graph( @@ -1028,9 +1051,7 @@ async def _handle_graph_card(request: Any) -> Any: # noqa: ARG001 sources_config=sc, checkpointer=None, ) - _graph_card_cache.update( - build_graph_card(compiled, agent_id="sandbox-legion-v1") - ) + _graph_card_cache.update(build_graph_card(compiled, agent_id="sandbox-legion-v1")) return JSONResponse(_graph_card_cache) app.routes.insert( diff --git a/a2a/sandbox_agent/src/sandbox_agent/budget.py b/a2a/sandbox_agent/src/sandbox_agent/budget.py index 87816781..1add9fe0 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/budget.py +++ b/a2a/sandbox_agent/src/sandbox_agent/budget.py @@ -159,11 +159,7 @@ def exceeded_reason(self) -> str | None: @property def needs_hitl_checkin(self) -> bool: """Return True when it's time for a human-in-the-loop check-in.""" - return ( - self.hitl_interval > 0 - and self.iterations_used > 0 - and self.iterations_used % self.hitl_interval == 0 - ) + return self.hitl_interval > 0 and self.iterations_used > 0 and self.iterations_used % self.hitl_interval == 0 def summary(self) -> dict: """Return budget state as a dict for event serialization.""" diff --git a/a2a/sandbox_agent/src/sandbox_agent/context_builders.py b/a2a/sandbox_agent/src/sandbox_agent/context_builders.py index c3404711..f9de62c1 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/context_builders.py +++ b/a2a/sandbox_agent/src/sandbox_agent/context_builders.py @@ -82,7 +82,9 @@ def build_planner_context( result = 
[SystemMessage(content=system_content)] + first_user + recent_tools logger.info( "Planner context: %d messages (iteration=%d, %d tool results)", - len(result), iteration, len(recent_tools), + len(result), + iteration, + len(recent_tools), extra={"session_id": state.get("context_id", ""), "node": "planner"}, ) return result @@ -134,9 +136,7 @@ def build_executor_context( used_chars = 0 for m in reversed(all_msgs): content = str(getattr(m, "content", "")) - if isinstance(m, SystemMessage) and content.startswith( - f"[STEP_BOUNDARY {current_step}]" - ): + if isinstance(m, SystemMessage) and content.startswith(f"[STEP_BOUNDARY {current_step}]"): break msg_chars = len(content) if used_chars + msg_chars > _MAX_CONTEXT_CHARS: @@ -156,6 +156,7 @@ def build_executor_context( # Determine status from exit code if "EXIT_CODE:" in content: import re as _re + ec_match = _re.search(r"EXIT_CODE:\s*(\d+)", content) status = "FAILED" if ec_match and ec_match.group(1) != "0" else "OK" error_hint = content[:150] if status == "FAILED" else "" @@ -172,11 +173,9 @@ def build_executor_context( if error_hint: reflection_parts.append(f"Error: {error_hint}") if "unknown flag" in content.lower() or "invalid option" in content.lower(): - reflection_parts.append( - "The flag is INVALID. Run the command with --help to see valid flags." - ) + reflection_parts.append("The flag is INVALID. Run the command with --help to see valid flags.") reflection_parts.append( - f"Goal: \"{step_text[:100]}\"\n" + f'Goal: "{step_text[:100]}"\n' f"If goal ACHIEVED → stop, summarize result. " f"If FAILED → try DIFFERENT approach. " f"NEVER repeat same command." 
@@ -186,7 +185,8 @@ def build_executor_context( result = [SystemMessage(content=system_content)] + first_msg + windowed logger.info( "Executor context: %d messages, ~%dk chars (from %d total)", - len(result), sum(len(str(getattr(m, "content", ""))) for m in result) // 1000, + len(result), + sum(len(str(getattr(m, "content", ""))) for m in result) // 1000, len(all_msgs), extra={ "session_id": state.get("context_id", ""), @@ -240,7 +240,9 @@ def build_reflector_context( result = [SystemMessage(content=system_content)] + recent_msgs logger.info( "Reflector context: %d messages (%d tool pairs from %d total)", - len(result), pair_count, len(messages), + len(result), + pair_count, + len(messages), extra={"session_id": state.get("context_id", ""), "node": "reflector"}, ) return result @@ -322,9 +324,7 @@ def _summarize_messages(self) -> list[dict[str, str]]: content = getattr(msg, "content", "") if isinstance(content, list): content = " ".join( - b.get("text", "") - for b in content - if isinstance(b, dict) and b.get("type") == "text" + b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text" ) text = str(content) tool_calls = getattr(msg, "tool_calls", None) @@ -353,11 +353,10 @@ def _format_response(self) -> dict[str, Any]: meta = getattr(resp, "response_metadata", {}) or {} content = resp.content if isinstance(content, list): - content = " ".join( - b.get("text", "") - for b in content - if isinstance(b, dict) and b.get("type") == "text" - ) or None + content = ( + " ".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text") + or None + ) tool_calls_out = None if resp.tool_calls: tool_calls_out = [ @@ -374,14 +373,16 @@ def _format_response(self) -> dict[str, Any]: for tc in resp.tool_calls ] return { - "choices": [{ - "message": { - "role": "assistant", - "content": content if content else None, - "tool_calls": tool_calls_out, - }, - "finish_reason": meta.get("finish_reason", "unknown"), - }], + 
"choices": [ + { + "message": { + "role": "assistant", + "content": content if content else None, + "tool_calls": tool_calls_out, + }, + "finish_reason": meta.get("finish_reason", "unknown"), + } + ], "model": meta.get("model", ""), "usage": { "prompt_tokens": self.prompt_tokens, @@ -477,9 +478,17 @@ async def invoke_llm( logger.info( "LLM call [%s]: %d messages, %d prompt tokens, %d completion tokens, model=%s", - node, len(messages), prompt_tokens, completion_tokens, model_name, - extra={"session_id": session_id, "node": node, - "prompt_tokens": prompt_tokens, "completion_tokens": completion_tokens}, + node, + len(messages), + prompt_tokens, + completion_tokens, + model_name, + extra={ + "session_id": session_id, + "node": node, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + }, ) return response, capture @@ -558,40 +567,47 @@ async def invoke_with_tool_loop( if i == 0: thinking_messages.append( - HumanMessage(content="Brief analysis (2-3 sentences max): " - "What is the best tool call for this step? " - "If step is already done, say READY: step complete.") + HumanMessage( + content="Brief analysis (2-3 sentences max): " + "What is the best tool call for this step? " + "If step is already done, say READY: step complete." + ) ) else: thinking_messages.append( - HumanMessage(content="Refine in 1-2 sentences. " - "When ready: READY: ") + HumanMessage(content="Refine in 1-2 sentences. 
When ready: READY: ") ) reason_response, reason_capture = await invoke_llm( - llm_reason, thinking_messages, - node=f"{node}-think-{cycle+1}.{i+1}", session_id=session_id, + llm_reason, + thinking_messages, + node=f"{node}-think-{cycle + 1}.{i + 1}", + session_id=session_id, workspace_path=workspace_path, ) last_reasoning = str(reason_response.content or "").strip() total_thinking_tokens += reason_capture.prompt_tokens + reason_capture.completion_tokens - sub_events.append({ - "type": "thinking", - "node": node, - "cycle": cycle + 1, - "iteration": i + 1, - "total_iterations": 0, - "reasoning": last_reasoning, - **reason_capture.debug_fields(), - **reason_capture.token_fields(), - }) + sub_events.append( + { + "type": "thinking", + "node": node, + "cycle": cycle + 1, + "iteration": i + 1, + "total_iterations": 0, + "reasoning": last_reasoning, + **reason_capture.debug_fields(), + **reason_capture.token_fields(), + } + ) thinking_summary = last_reasoning[:200] + ("..." if len(last_reasoning) > 200 else "") - thinking_history.extend([ - AIMessage(content=thinking_summary), - HumanMessage(content=f"(Thinking {i+1} recorded. Continue or signal READY:)"), - ]) + thinking_history.extend( + [ + AIMessage(content=thinking_summary), + HumanMessage(content=f"(Thinking {i + 1} recorded. Continue or signal READY:)"), + ] + ) if last_reasoning.upper().startswith("READY:"): break @@ -599,15 +615,19 @@ async def invoke_with_tool_loop( # --- Micro-reasoning: LLM with tools --- tool_messages = cycle_messages + [ AIMessage(content=last_reasoning or "I need to call a tool for this step."), - HumanMessage(content="Now execute your planned action. Rules:\n" - "- Call step_done(summary='...') if the step is ALREADY COMPLETE.\n" - "- Call ONE tool if there's a single action to take.\n" - "- Call multiple tools ONLY if they are independent (can run in parallel).\n" - "- NEVER call the same tool twice with similar args."), + HumanMessage( + content="Now execute your planned action. 
Rules:\n" + "- Call step_done(summary='...') if the step is ALREADY COMPLETE.\n" + "- Call ONE tool if there's a single action to take.\n" + "- Call multiple tools ONLY if they are independent (can run in parallel).\n" + "- NEVER call the same tool twice with similar args." + ), ] response, capture = await invoke_llm( - llm_with_tools, tool_messages, - node=f"{node}-tool-{cycle+1}", session_id=session_id, + llm_with_tools, + tool_messages, + node=f"{node}-tool-{cycle + 1}", + session_id=session_id, workspace_path=workspace_path, ) capture.prompt_tokens += total_thinking_tokens @@ -616,8 +636,9 @@ async def invoke_with_tool_loop( else: # Single-phase: one LLM call with implicit auto response, capture = await invoke_llm( - llm_with_tools, cycle_messages, - node=f"{node}-{cycle+1}" if max_cycles > 1 else node, + llm_with_tools, + cycle_messages, + node=f"{node}-{cycle + 1}" if max_cycles > 1 else node, session_id=session_id, workspace_path=workspace_path, ) @@ -628,8 +649,12 @@ async def invoke_with_tool_loop( done_calls = [tc for tc in response.tool_calls if tc.get("name") == "step_done"] if done_calls: summary = done_calls[0].get("args", {}).get("summary", last_reasoning or "") - logger.info("step_done called in cycle %d: %s", cycle + 1, summary[:100], - extra={"session_id": session_id, "node": node}) + logger.info( + "step_done called in cycle %d: %s", + cycle + 1, + summary[:100], + extra={"session_id": session_id, "node": node}, + ) response = AIMessage(content=summary) break @@ -648,17 +673,17 @@ async def invoke_with_tool_loop( if response.tool_calls and tool_map and max_cycles > 1: # Emit tool_call sub_event BEFORE execution (so UI shows the call) import uuid as _uuid + call_id = str(_uuid.uuid4())[:8] - sub_events.append({ - "type": "tool_call", - "node": node, - "cycle": cycle + 1, - "call_id": call_id, - "tools": [ - {"name": tc.get("name", "?"), "args": tc.get("args", {})} - for tc in response.tool_calls - ], - }) + sub_events.append( + { + "type": 
"tool_call", + "node": node, + "cycle": cycle + 1, + "call_id": call_id, + "tools": [{"name": tc.get("name", "?"), "args": tc.get("args", {})} for tc in response.tool_calls], + } + ) # Execute all tool calls in parallel via asyncio.gather async def _run_tool(tc: dict) -> ToolMessage: @@ -684,24 +709,27 @@ async def _run_tool(tc: dict) -> ToolMessage: for tm in tool_results: content_str = str(getattr(tm, "content", "")) import re as _re + exit_match = _re.search(r"EXIT_CODE:\s*(\d+)", content_str) - is_error = ( - (exit_match is not None and exit_match.group(1) != "0") - or content_str.startswith("Error:") + is_error = (exit_match is not None and exit_match.group(1) != "0") or content_str.startswith("Error:") + sub_events.append( + { + "type": "tool_result", + "node": node, + "cycle": cycle + 1, + "call_id": call_id, + "name": getattr(tm, "name", "unknown"), + "output": content_str[:2000], + "status": "error" if is_error else "success", + } ) - sub_events.append({ - "type": "tool_result", - "node": node, - "cycle": cycle + 1, - "call_id": call_id, - "name": getattr(tm, "name", "unknown"), - "output": content_str[:2000], - "status": "error" if is_error else "success", - }) logger.info( "Cycle %d/%d [%s]: %d tool calls executed, continuing", - cycle + 1, max_cycles, node, len(response.tool_calls), + cycle + 1, + max_cycles, + node, + len(response.tool_calls), extra={"session_id": session_id, "node": node}, ) continue # Next cycle @@ -731,7 +759,9 @@ async def _run_tool(tc: dict) -> ToolMessage: logger.info( "Tool loop %s: %d cycles, %d thinking iterations, %d total tokens", - node, cycle + 1, total_iters, + node, + cycle + 1, + total_iters, final_capture.prompt_tokens + final_capture.completion_tokens, extra={"session_id": session_id, "node": node}, ) diff --git a/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py b/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py index 4191a67b..8e039ef7 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py 
+++ b/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py @@ -104,8 +104,8 @@ def __init__(self, loop_id: str | None = None, context_id: str | None = None) -> self._loop_id = loop_id or str(uuid.uuid4())[:8] self._step_index = 0 self._event_counter = 0 # global sequence number for ordering - self._node_visit = 0 # graph node visit counter (main sections) - self._sub_index = 0 # position within current node visit + self._node_visit = 0 # graph node visit counter (main sections) + self._sub_index = 0 # position within current node visit self._last_node_key: str = "" # track previous node for visit grouping self._micro_step: int = 0 self._context_id = context_id or "unknown" @@ -152,12 +152,14 @@ def serialize(self, key: str, value: dict) -> str: if key == "router": # Router is an internal node — emit minimal event for logging route = value.get("_route", "new") - result = json.dumps({ - "type": "router", - "loop_id": self._loop_id, - "route": route, - "plan_status": value.get("plan_status", ""), - }) + result = json.dumps( + { + "type": "router", + "loop_id": self._loop_id, + "route": route, + "plan_status": value.get("plan_status", ""), + } + ) elif key == "planner": result = self._serialize_planner(value) elif key == "reflector": @@ -175,14 +177,16 @@ def serialize(self, key: str, value: dict) -> str: # Strip the "STEP BRIEF FROM COORDINATOR:" prefix if "STEP BRIEF" in brief: brief = brief.split("---")[0].replace("STEP BRIEF FROM COORDINATOR:", "").strip() - result = json.dumps({ - "type": "step_selector", - "loop_id": self._loop_id, - "current_step": current_step, - "description": f"Advancing to step {current_step + 1}: {step_desc[:80]}", - "brief": brief[:500], - "done": value.get("done", False), - }) + result = json.dumps( + { + "type": "step_selector", + "loop_id": self._loop_id, + "current_step": current_step, + "description": f"Advancing to step {current_step + 1}: {step_desc[:80]}", + "brief": brief[:500], + "done": value.get("done", False), + } + ) elif 
key == "reporter": result = self._serialize_reporter(value) else: @@ -208,11 +212,13 @@ def serialize(self, key: str, value: dict) -> str: # Append budget_update event if _budget_summary is in the value dict budget_summary = value.get("_budget_summary") if budget_summary and isinstance(budget_summary, dict): - budget_event = json.dumps({ - "type": "budget_update", - "loop_id": self._loop_id, - **budget_summary, - }) + budget_event = json.dumps( + { + "type": "budget_update", + "loop_id": self._loop_id, + **budget_summary, + } + ) result = result + "\n" + budget_event # Post-process: ensure ALL event lines have step + unique event_index. @@ -245,11 +251,15 @@ def serialize(self, key: str, value: dict) -> str: except json.JSONDecodeError: enriched_lines.append(line) event_type = "parse_error" - logger.info("SERIALIZE session=%s loop=%s type=%s step=%s ei=%s", - self._context_id, self._loop_id, event_type, - self._step_index, self._event_counter, - extra={"session_id": self._context_id, "node": key, - "event_type": event_type, "step": self._step_index}) + logger.info( + "SERIALIZE session=%s loop=%s type=%s step=%s ei=%s", + self._context_id, + self._loop_id, + event_type, + self._step_index, + self._event_counter, + extra={"session_id": self._context_id, "node": key, "event_type": event_type, "step": self._step_index}, + ) return "\n".join(enriched_lines) @@ -277,13 +287,14 @@ def _serialize_assistant(self, msg: Any) -> str: if text.strip(): parts.append(json.dumps({"type": "llm_response", "content": text})) # Then emit the tool call - parts.append(json.dumps({ - "type": "tool_call", - "tools": [ - _safe_tc(tc) - for tc in tool_calls - ], - })) + parts.append( + json.dumps( + { + "type": "tool_call", + "tools": [_safe_tc(tc) for tc in tool_calls], + } + ) + ) return "\n".join(parts) return json.dumps({"type": "llm_response", "content": text}) @@ -323,23 +334,31 @@ def _serialize_executor(self, msg: Any, value: dict | None = None) -> str: 
thinking_event[field.lstrip("_")] = se[field] parts.append(json.dumps(thinking_event)) elif se_type == "tool_call": - parts.append(json.dumps({ - "type": "tool_call", - "loop_id": self._loop_id, - "call_id": se.get("call_id", ""), - "cycle": se.get("cycle", 1), - "tools": se.get("tools", []), - })) + parts.append( + json.dumps( + { + "type": "tool_call", + "loop_id": self._loop_id, + "call_id": se.get("call_id", ""), + "cycle": se.get("cycle", 1), + "tools": se.get("tools", []), + } + ) + ) elif se_type == "tool_result": - parts.append(json.dumps({ - "type": "tool_result", - "loop_id": self._loop_id, - "call_id": se.get("call_id", ""), - "cycle": se.get("cycle", 1), - "name": se.get("name", "unknown"), - "output": se.get("output", "")[:2000], - "status": se.get("status", "success"), - })) + parts.append( + json.dumps( + { + "type": "tool_result", + "loop_id": self._loop_id, + "call_id": se.get("call_id", ""), + "cycle": se.get("cycle", 1), + "name": se.get("name", "unknown"), + "output": se.get("output", "")[:2000], + "status": se.get("status", "success"), + } + ) + ) self._micro_step += 1 @@ -376,20 +395,18 @@ def _serialize_executor(self, msg: Any, value: dict | None = None) -> str: if tool_calls: # Use LangGraph's tool_call_id for proper pairing with tool_result tc0 = tool_calls[0] if tool_calls else {} - call_id = ( - tc0.get("id") if isinstance(tc0, dict) - else getattr(tc0, "id", None) - ) or str(uuid.uuid4())[:8] + call_id = (tc0.get("id") if isinstance(tc0, dict) else getattr(tc0, "id", None)) or str(uuid.uuid4())[:8] self._last_call_id = call_id - parts.append(json.dumps({ - "type": "tool_call", - "loop_id": self._loop_id, - "call_id": call_id, - "tools": [ - _safe_tc(tc) - for tc in tool_calls - ], - })) + parts.append( + json.dumps( + { + "type": "tool_call", + "loop_id": self._loop_id, + "call_id": call_id, + "tools": [_safe_tc(tc) for tc in tool_calls], + } + ) + ) return "\n".join(parts) # Emit tool_call event for text-parsed tools (no structured 
tool_calls) @@ -397,15 +414,16 @@ def _serialize_executor(self, msg: Any, value: dict | None = None) -> str: if parsed_tools: call_id = str(uuid.uuid4())[:8] self._last_call_id = call_id - parts.append(json.dumps({ - "type": "tool_call", - "loop_id": self._loop_id, - "call_id": call_id, - "tools": [ - {"name": t["name"], "args": t.get("args", {})} - for t in parsed_tools - ], - })) + parts.append( + json.dumps( + { + "type": "tool_call", + "loop_id": self._loop_id, + "call_id": call_id, + "tools": [{"name": t["name"], "args": t.get("args", {})} for t in parsed_tools], + } + ) + ) return "\n".join(parts) @@ -463,6 +481,7 @@ def _serialize_tool_result(self, msg: Any) -> str: # Keyword matching (e.g. "failure", "error") causes false positives # when command output contains those words in normal data. import re as _re + exit_match = _re.search(r"EXIT_CODE:\s*(\d+)", content_str) is_error = ( (exit_match is not None and exit_match.group(1) != "0") @@ -474,14 +493,16 @@ def _serialize_tool_result(self, msg: Any) -> str: status = "error" if is_error else "success" # Use LangGraph's tool_call_id for proper pairing with tool_call call_id = getattr(msg, "tool_call_id", None) or self._last_call_id - return json.dumps({ - "type": "tool_result", - "loop_id": self._loop_id, - "call_id": call_id, - "name": str(name), - "output": content_str[:2000], - "status": status, - }) + return json.dumps( + { + "type": "tool_result", + "loop_id": self._loop_id, + "call_id": call_id, + "name": str(name), + "output": content_str[:2000], + "status": status, + } + ) @staticmethod def _enrich_with_plan_store(payload: dict, value: dict) -> None: @@ -553,7 +574,7 @@ def _serialize_reflector(self, value: dict) -> str: """Serialize a reflector node output — emits reflector_decision + legacy reflection.""" done = value.get("done", False) current_step = value.get("current_step", 0) - step_results = value.get("step_results", []) + _step_results = value.get("step_results", []) # Extract decision text 
from message if present msgs = value.get("messages", []) @@ -634,11 +655,7 @@ def _serialize_reporter(self, value: dict) -> str: tc_info = _safe_tc(tc) if tc_info["name"] == "respond_to_user": args = tc_info["args"] - final_answer = ( - args.get("response", "") - if isinstance(args, dict) - else str(args) - ) + final_answer = args.get("response", "") if isinstance(args, dict) else str(args) break if final_answer: break @@ -690,8 +707,4 @@ def _extract_decision(text: str) -> str: @staticmethod def _extract_text_blocks(content: list) -> str: """Extract text from a list of content blocks.""" - return " ".join( - b.get("text", "") - for b in content - if isinstance(b, dict) and b.get("type") == "text" - )[:2000] + return " ".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text")[:2000] diff --git a/a2a/sandbox_agent/src/sandbox_agent/executor.py b/a2a/sandbox_agent/src/sandbox_agent/executor.py index 7d3777a6..6dc5f7eb 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/executor.py +++ b/a2a/sandbox_agent/src/sandbox_agent/executor.py @@ -207,8 +207,7 @@ def _check_interpreter_bypass(self, command: str) -> str | None: inner_permission = self._check_permission(inner_command) if inner_permission is PermissionResult.DENY: return ( - f"Permission denied: interpreter bypass detected. " - f"Inner command '{inner_command}' is denied by policy." + f"Permission denied: interpreter bypass detected. Inner command '{inner_command}' is denied by policy." ) # Also check the inner command against sources.json policy @@ -216,8 +215,7 @@ def _check_interpreter_bypass(self, command: str) -> str | None: inner_sources_denial = self._check_sources(inner_command) if inner_sources_denial: return ( - f"Blocked: interpreter bypass detected. " - f"Inner command violates sources policy: {inner_sources_denial}" + f"Blocked: interpreter bypass detected. 
Inner command violates sources policy: {inner_sources_denial}" ) return None @@ -324,10 +322,7 @@ async def _execute(self, command: str) -> ExecutionResult: await process.wait() return ExecutionResult( stdout="", - stderr=( - f"Command timed out after {timeout} seconds " - f"and was killed: '{command}'" - ), + stderr=(f"Command timed out after {timeout} seconds and was killed: '{command}'"), exit_code=-1, ) diff --git a/a2a/sandbox_agent/src/sandbox_agent/graph.py b/a2a/sandbox_agent/src/sandbox_agent/graph.py index 5cbe603e..6f89c33f 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/graph.py +++ b/a2a/sandbox_agent/src/sandbox_agent/graph.py @@ -56,7 +56,7 @@ from langchain_openai import ChatOpenAI from langgraph.graph import MessagesState, StateGraph from langgraph.prebuilt import ToolNode, tools_condition -from langgraph.types import Send, interrupt +from langgraph.types import interrupt try: from langgraph.errors import GraphInterrupt @@ -64,12 +64,13 @@ # Fallback for older langgraph versions GraphInterrupt = type("GraphInterrupt", (Exception,), {}) +from sandbox_agent import plan_store as ps from sandbox_agent.budget import AgentBudget from sandbox_agent.executor import HitlRequired, SandboxExecutor from sandbox_agent.permissions import PermissionChecker from sandbox_agent.reasoning import ( - PlanStep, _DEBUG_PROMPTS, + PlanStep, executor_node, planner_node, reflector_node, @@ -78,9 +79,8 @@ route_reflector, router_node, ) -from sandbox_agent import plan_store as ps from sandbox_agent.sources import SourcesConfig -from sandbox_agent.subagents import make_delegate_tool, make_explore_tool +from sandbox_agent.subagents import make_explore_tool logger = logging.getLogger(__name__) @@ -263,11 +263,13 @@ async def shell(command: str) -> str: # The interrupt() call suspends the graph state. The A2A task # transitions to input_required. Only an explicit human # approval (via the HITLManager channel) resumes execution. 
- approval = interrupt({ - "type": "approval_required", - "command": exc.command, - "message": f"Command '{exc.command}' requires human approval.", - }) + approval = interrupt( + { + "type": "approval_required", + "command": exc.command, + "message": f"Command '{exc.command}' requires human approval.", + } + ) # If we reach here, the human approved — execute the command. if isinstance(approval, dict) and approval.get("approved"): result = await executor._execute(command) @@ -278,8 +280,9 @@ async def shell(command: str) -> str: output = _format_result(result) if result.exit_code != 0 and _is_rate_limited(output): import asyncio + for attempt in range(1, 4): # up to 3 retries - delay = 2 ** attempt # 2s, 4s, 8s + delay = 2**attempt # 2s, 4s, 8s logger.info("Rate limit detected, retry %d/3 after %ds", attempt, delay) await asyncio.sleep(delay) try: @@ -322,14 +325,17 @@ def _format_result(result: Any) -> str: def _is_rate_limited(output: str) -> bool: """Detect rate-limit errors in command output.""" lower = output.lower() - return any(pattern in lower for pattern in ( - "rate limit exceeded", - "rate limit", - "too many requests", - "429", - "api rate limit", - "secondary rate limit", - )) + return any( + pattern in lower + for pattern in ( + "rate limit exceeded", + "rate limit", + "too many requests", + "429", + "api rate limit", + "secondary rate limit", + ) + ) def _make_file_read_tool(workspace_path: str) -> Any: @@ -430,7 +436,9 @@ async def grep(pattern: str, path: str = ".", include: str = "") -> str: try: proc = await _aio.create_subprocess_exec( - *cmd, stdout=_aio.subprocess.PIPE, stderr=_aio.subprocess.PIPE, + *cmd, + stdout=_aio.subprocess.PIPE, + stderr=_aio.subprocess.PIPE, ) stdout, stderr = await _aio.wait_for(proc.communicate(), timeout=30) out = stdout.decode(errors="replace")[:10000] @@ -461,6 +469,7 @@ async def glob(pattern: str) -> str: Newline-separated list of matching file paths relative to workspace. 
""" import fnmatch + matches = [] for p in sorted(ws_root.rglob("*")): if p.is_file(): @@ -499,9 +508,10 @@ async def web_fetch(url: str) -> str: Returns: The page content as text, or an error message. """ - import httpx from urllib.parse import urlparse + import httpx + parsed = urlparse(url) domain = parsed.hostname or "" @@ -524,10 +534,11 @@ async def web_fetch(url: str) -> str: if "text/html" in content_type: # Simple HTML tag stripping for readability import re - text = re.sub(r']*>.*?', '', text, flags=re.DOTALL) - text = re.sub(r']*>.*?', '', text, flags=re.DOTALL) - text = re.sub(r'<[^>]+>', ' ', text) - text = re.sub(r'\s+', ' ', text).strip() + + text = re.sub(r"]*>.*?", "", text, flags=re.DOTALL) + text = re.sub(r"]*>.*?", "", text, flags=re.DOTALL) + text = re.sub(r"<[^>]+>", " ", text) + text = re.sub(r"\s+", " ", text).strip() # Truncate very long responses if len(text) > 50000: @@ -672,7 +683,7 @@ def _make_llm(node_type: str) -> ChatOpenAI: llm_for_reflector = _make_llm("reflector") if config.llm_model_reflector else llm llm_for_reporter = _make_llm("reporter") if config.llm_model_reporter else llm llm_for_thinking = _make_llm("thinking") if config.llm_model_thinking else llm - llm_for_micro = _make_llm("micro_reasoning") if config.llm_model_micro_reasoning else llm + _llm_for_micro = _make_llm("micro_reasoning") if config.llm_model_micro_reasoning else llm # -- Tools -------------------------------------------------------------- # Create tool instances once — shared across node subsets. 
@@ -742,7 +753,9 @@ async def _reflector(state: SandboxState) -> dict[str, Any]: async def _reporter(state: SandboxState) -> dict[str, Any]: return await reporter_node( - state, llm_reporter, budget=budget, + state, + llm_reporter, + budget=budget, llm_reason=llm_executor_reason, tools=read_only_tools, ) @@ -754,7 +767,8 @@ async def _step_selector(state: SandboxState) -> dict[str, Any]: a targeted brief for the executor — what to do, what worked/failed before, and what to avoid. """ - from langchain_core.messages import SystemMessage as SM, HumanMessage as HM + from langchain_core.messages import HumanMessage as HM + from langchain_core.messages import SystemMessage as SM plan = state.get("plan", []) plan_steps = list(state.get("plan_steps", [])) @@ -796,13 +810,13 @@ async def _step_selector(state: SandboxState) -> dict[str, Any]: result_hint = "" if isinstance(_ps, dict) and _ps.get("result_summary"): result_hint = f" — {_ps['result_summary'][:100]}" - plan_summary.append(f" {marker} {i+1}. [{status}] {step[:80]}{result_hint}") + plan_summary.append(f" {marker} {i + 1}. [{status}] {step[:80]}{result_hint}") # Gather recent tool results (last 3 ToolMessages) recent_results = [] for m in reversed(messages[-10:]): - if hasattr(m, 'name') and getattr(m, 'type', '') == 'tool': - content = str(getattr(m, 'content', ''))[:300] + if hasattr(m, "name") and getattr(m, "type", "") == "tool": + content = str(getattr(m, "content", ""))[:300] recent_results.insert(0, f" [{m.name}] {content}") if len(recent_results) >= 3: break @@ -830,7 +844,7 @@ async def _step_selector(state: SandboxState) -> dict[str, Any]: Next step to execute: {next_step + 1}. {step_text} Recent tool results: -{chr(10).join(recent_results) if recent_results else '(none yet)'} +{chr(10).join(recent_results) if recent_results else "(none yet)"} WORKSPACE RULE: Each shell command starts fresh in /workspace. Bare `cd` has no effect. 
If the step involves a cloned repo, always write `cd repos/ && ` in the brief. @@ -843,10 +857,8 @@ async def _step_selector(state: SandboxState) -> dict[str, Any]: try: response = await llm.ainvoke([sys_msg, user_msg]) brief = response.content.strip() - usage = getattr(response, 'usage_metadata', None) or {} - budget.add_tokens( - usage.get('input_tokens', 0) + usage.get('output_tokens', 0) - ) + usage = getattr(response, "usage_metadata", None) or {} + budget.add_tokens(usage.get("input_tokens", 0) + usage.get("output_tokens", 0)) except Exception as e: logger.warning("StepSelector LLM call failed: %s — using default brief", e) brief = f"Execute step {next_step + 1}: {step_text}" @@ -863,6 +875,7 @@ async def _step_selector(state: SandboxState) -> dict[str, Any]: result["_plan_store"] = store if _DEBUG_PROMPTS: from sandbox_agent.context_builders import LLMCallCapture + result["_system_prompt"] = prompt[:10000] result["_prompt_messages"] = [ {"role": "system", "preview": "Step coordinator brief prompt"}, @@ -877,8 +890,10 @@ async def _step_selector(state: SandboxState) -> dict[str, Any]: def _make_safe_tool_wrapper(tool_node: ToolNode, label: str): """Create a safe tool execution wrapper for a ToolNode.""" + async def _safe(state: SandboxState) -> dict[str, Any]: from langchain_core.messages import ToolMessage + try: return await tool_node.ainvoke(state) except (GraphInterrupt, KeyboardInterrupt, SystemExit): @@ -892,18 +907,23 @@ async def _safe(state: SandboxState) -> dict[str, Any]: for tc in getattr(last, "tool_calls", []): tc_id = tc.get("id", "unknown") if isinstance(tc, dict) else getattr(tc, "id", "unknown") tc_name = tc.get("name", "unknown") if isinstance(tc, dict) else getattr(tc, "name", "unknown") - error_msgs.append(ToolMessage( - content=f"Tool error: {exc}", - tool_call_id=tc_id, - name=tc_name, - )) + error_msgs.append( + ToolMessage( + content=f"Tool error: {exc}", + tool_call_id=tc_id, + name=tc_name, + ) + ) if not error_msgs: - 
error_msgs.append(ToolMessage( - content=f"Tool execution failed: {exc}", - tool_call_id="error", - name="unknown", - )) + error_msgs.append( + ToolMessage( + content=f"Tool execution failed: {exc}", + tool_call_id="error", + name="unknown", + ) + ) return {"messages": error_msgs} + return _safe _reporter_tool_node = ToolNode(read_only_tools) diff --git a/a2a/sandbox_agent/src/sandbox_agent/graph_card.py b/a2a/sandbox_agent/src/sandbox_agent/graph_card.py index 896e7b9d..98d2bacb 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/graph_card.py +++ b/a2a/sandbox_agent/src/sandbox_agent/graph_card.py @@ -144,10 +144,7 @@ }, "thinking": { "category": "reasoning", - "description": ( - "Intermediate thinking iteration from a reasoning LLM " - "(bare model, no tools)." - ), + "description": ("Intermediate thinking iteration from a reasoning LLM (bare model, no tools)."), "langgraph_nodes": ["planner", "executor", "reflector"], "has_llm_call": True, "fields": { @@ -185,10 +182,7 @@ }, "micro_reasoning": { "category": "reasoning", - "description": ( - "Executor's intermediate LLM reasoning within a single plan step " - "(tool-loop iteration)." - ), + "description": ("Executor's intermediate LLM reasoning within a single plan step (tool-loop iteration)."), "langgraph_nodes": ["executor"], "has_llm_call": True, "fields": { @@ -275,9 +269,7 @@ # ── Decision ────────────────────────────────────────────────────── "reflector_decision": { "category": "decision", - "description": ( - "Reflector reviewed execution and decided: continue, replan, or done." - ), + "description": ("Reflector reviewed execution and decided: continue, replan, or done."), "langgraph_nodes": ["reflector"], "has_llm_call": True, "fields": { @@ -333,9 +325,7 @@ "debug_fields": { "logic": { "type": "str", - "description": ( - "Routing logic: checks plan_status to decide resume vs plan." 
- ), + "description": ("Routing logic: checks plan_status to decide resume vs plan."), }, }, }, @@ -399,9 +389,7 @@ }, "node_transition": { "category": "meta", - "description": ( - "Internal marker indicating a graph-level transition between nodes." - ), + "description": ("Internal marker indicating a graph-level transition between nodes."), "langgraph_nodes": [], "has_llm_call": False, "fields": { @@ -420,8 +408,7 @@ "hitl_request": { "category": "interaction", "description": ( - "Human-in-the-loop approval request — the executor is pausing " - "to ask the user before proceeding." + "Human-in-the-loop approval request — the executor is pausing to ask the user before proceeding." ), "langgraph_nodes": ["executor"], "has_llm_call": False, @@ -462,38 +449,24 @@ #: Human-readable description for each node in the compiled graph. TOPOLOGY_NODE_DESCRIPTIONS: Dict[str, str] = { - "router": ( - "Entry node — decides whether to create a new plan or resume execution " - "of an existing plan." - ), + "router": ("Entry node — decides whether to create a new plan or resume execution of an existing plan."), "planner": ( - "Creates or revises a multi-step plan using an LLM with planning tools " - "(glob, grep, file_read, file_write)." - ), - "planner_tools": ( - "Executes tool calls issued by the planner (workspace inspection, " - "plan persistence)." - ), - "step_selector": ( - "Picks the next plan step to execute and prepares the executor context." + "Creates or revises a multi-step plan using an LLM with planning tools (glob, grep, file_read, file_write)." ), + "planner_tools": ("Executes tool calls issued by the planner (workspace inspection, plan persistence)."), + "step_selector": ("Picks the next plan step to execute and prepares the executor context."), "executor": ( "Executes the current plan step using an LLM with the full tool suite " "(shell, files, grep, glob, web_fetch, explore, delegate)." ), - "tools": ( - "Executes tool calls issued by the executor." 
- ), + "tools": ("Executes tool calls issued by the executor."), "reflector": ( "Reviews execution results and decides whether to continue, replan, " "or declare done. Uses read-only tools (glob, grep, file_read)." ), - "reflector_tools": ( - "Executes read-only tool calls issued by the reflector for verification." - ), + "reflector_tools": ("Executes read-only tool calls issued by the reflector for verification."), "reflector_route": ( - "Pass-through node that routes the reflector's decision to the next node " - "(reporter, step_selector, or planner)." + "Pass-through node that routes the reflector's decision to the next node (reporter, step_selector, or planner)." ), "reporter": ( "Generates the final user-facing answer by synthesizing all execution " @@ -535,11 +508,7 @@ def build_graph_card( graph = compiled.get_graph() # ── Nodes ───────────────────────────────────────────────────────── - raw_nodes: List[str] = [ - node_id - for node_id in graph.nodes - if node_id not in ("__start__", "__end__") - ] + raw_nodes: List[str] = [node_id for node_id in graph.nodes if node_id not in ("__start__", "__end__")] nodes: Dict[str, Dict[str, str]] = {} for node_id in raw_nodes: nodes[node_id] = { diff --git a/a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py b/a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py index ff9b35ca..2228d924 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py +++ b/a2a/sandbox_agent/src/sandbox_agent/landlock_ctypes.py @@ -39,16 +39,16 @@ # ABI v1 access flags (13 flags) _ACCESS_FS_V1 = ( - (1 << 0) # EXECUTE - | (1 << 1) # WRITE_FILE - | (1 << 2) # READ_FILE - | (1 << 3) # READ_DIR - | (1 << 4) # REMOVE_DIR - | (1 << 5) # REMOVE_FILE - | (1 << 6) # MAKE_CHAR - | (1 << 7) # MAKE_DIR - | (1 << 8) # MAKE_REG - | (1 << 9) # MAKE_SOCK + (1 << 0) # EXECUTE + | (1 << 1) # WRITE_FILE + | (1 << 2) # READ_FILE + | (1 << 3) # READ_DIR + | (1 << 4) # REMOVE_DIR + | (1 << 5) # REMOVE_FILE + | (1 << 6) # MAKE_CHAR + | (1 << 7) # 
MAKE_DIR + | (1 << 8) # MAKE_REG + | (1 << 9) # MAKE_SOCK | (1 << 10) # MAKE_FIFO | (1 << 11) # MAKE_BLOCK | (1 << 12) # MAKE_SYM @@ -62,9 +62,9 @@ # Read-only subset (for ro_paths) ACCESS_FS_READ_ONLY = ( - (1 << 0) # EXECUTE - | (1 << 2) # READ_FILE - | (1 << 3) # READ_DIR + (1 << 0) # EXECUTE + | (1 << 2) # READ_FILE + | (1 << 3) # READ_DIR ) _libc = ctypes.CDLL("libc.so.6", use_errno=True) diff --git a/a2a/sandbox_agent/src/sandbox_agent/landlock_probe.py b/a2a/sandbox_agent/src/sandbox_agent/landlock_probe.py index 74f46888..8b455dcc 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/landlock_probe.py +++ b/a2a/sandbox_agent/src/sandbox_agent/landlock_probe.py @@ -84,9 +84,7 @@ def probe_landlock() -> int: """) # Find the package root so the child can import sandbox_agent - package_src = str( - __import__("pathlib").Path(__file__).resolve().parent.parent - ) + package_src = str(__import__("pathlib").Path(__file__).resolve().parent.parent) result = subprocess.run( [sys.executable, "-c", child_script], diff --git a/a2a/sandbox_agent/src/sandbox_agent/observability.py b/a2a/sandbox_agent/src/sandbox_agent/observability.py index 259be8d2..351f0623 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/observability.py +++ b/a2a/sandbox_agent/src/sandbox_agent/observability.py @@ -15,7 +15,7 @@ import logging import os from contextvars import ContextVar -from typing import Any, Optional +from typing import Any logger = logging.getLogger(__name__) @@ -27,7 +27,7 @@ # ContextVar to pass root span from middleware to agent code. # This allows execute() to access the middleware-created root span # even though trace.get_current_span() would return a child span. 
-_root_span_var: ContextVar = ContextVar('root_span', default=None) +_root_span_var: ContextVar = ContextVar("root_span", default=None) def get_root_span(): @@ -44,7 +44,8 @@ def get_root_span(): # OpenInference semantic conventions try: - from openinference.semconv.trace import SpanAttributes, OpenInferenceSpanKindValues + from openinference.semconv.trace import OpenInferenceSpanKindValues, SpanAttributes + OPENINFERENCE_AVAILABLE = True except ImportError: OPENINFERENCE_AVAILABLE = False @@ -54,6 +55,7 @@ def get_root_span(): def _get_otlp_exporter(endpoint: str): """Get HTTP OTLP exporter.""" from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter + if not endpoint.endswith("/v1/traces"): endpoint = endpoint.rstrip("/") + "/v1/traces" return OTLPSpanExporter(endpoint=endpoint) @@ -76,8 +78,7 @@ def setup_observability() -> bool: if not otlp_endpoint: logger.warning( - "OTEL_EXPORTER_OTLP_ENDPOINT not set — tracing disabled. " - "Set this env var to enable OpenTelemetry tracing." + "OTEL_EXPORTER_OTLP_ENDPOINT not set — tracing disabled. Set this env var to enable OpenTelemetry tracing." ) return False @@ -91,13 +92,13 @@ def setup_observability() -> bool: def _setup_observability_inner(service_name: str, namespace: str, otlp_endpoint: str) -> bool: """Internal setup — may raise. 
Called by setup_observability() which catches all errors.""" from opentelemetry import trace - from opentelemetry.sdk.trace import TracerProvider - from opentelemetry.sdk.trace.export import BatchSpanProcessor - from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION + from opentelemetry.baggage.propagation import W3CBaggagePropagator from opentelemetry.propagate import set_global_textmap from opentelemetry.propagators.composite import CompositePropagator + from opentelemetry.sdk.resources import SERVICE_NAME, SERVICE_VERSION, Resource + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator - from opentelemetry.baggage.propagation import W3CBaggagePropagator logger.info("=" * 60) logger.info("Setting up OpenTelemetry observability") @@ -111,45 +112,51 @@ def _setup_observability_inner(service_name: str, namespace: str, otlp_endpoint: # Create resource with service and MLflow attributes. # Resource attributes are STATIC and apply to ALL spans/traces. 
# See: https://mlflow.org/docs/latest/genai/tracing/opentelemetry/ - resource = Resource(attributes={ - # Standard OTEL service attributes - SERVICE_NAME: service_name, - SERVICE_VERSION: AGENT_VERSION, - "service.namespace": namespace, - "k8s.namespace.name": namespace, - # MLflow static metadata (applies to all traces) - "mlflow.traceName": AGENT_NAME, - "mlflow.source": service_name, - # GenAI static attributes - "gen_ai.agent.name": AGENT_NAME, - "gen_ai.agent.version": AGENT_VERSION, - "gen_ai.system": AGENT_FRAMEWORK, - }) + resource = Resource( + attributes={ + # Standard OTEL service attributes + SERVICE_NAME: service_name, + SERVICE_VERSION: AGENT_VERSION, + "service.namespace": namespace, + "k8s.namespace.name": namespace, + # MLflow static metadata (applies to all traces) + "mlflow.traceName": AGENT_NAME, + "mlflow.source": service_name, + # GenAI static attributes + "gen_ai.agent.name": AGENT_NAME, + "gen_ai.agent.version": AGENT_VERSION, + "gen_ai.system": AGENT_FRAMEWORK, + } + ) # Create and configure tracer provider tracer_provider = TracerProvider(resource=resource) - tracer_provider.add_span_processor( - BatchSpanProcessor(_get_otlp_exporter(otlp_endpoint)) - ) + tracer_provider.add_span_processor(BatchSpanProcessor(_get_otlp_exporter(otlp_endpoint))) trace.set_tracer_provider(tracer_provider) # Auto-instrument LangChain with OpenInference try: from openinference.instrumentation.langchain import LangChainInstrumentor + LangChainInstrumentor().instrument() logger.info("LangChain instrumented with OpenInference") except ImportError: logger.warning("openinference-instrumentation-langchain not available") # Configure W3C Trace Context propagation - set_global_textmap(CompositePropagator([ - TraceContextTextMapPropagator(), - W3CBaggagePropagator(), - ])) + set_global_textmap( + CompositePropagator( + [ + TraceContextTextMapPropagator(), + W3CBaggagePropagator(), + ] + ) + ) # Instrument OpenAI for GenAI semantic conventions try: from 
opentelemetry.instrumentation.openai import OpenAIInstrumentor + OpenAIInstrumentor().instrument() logger.info("OpenAI instrumented with GenAI semantic conventions") except ImportError: @@ -206,15 +213,16 @@ def create_tracing_middleware(): app = server.build() app.add_middleware(BaseHTTPMiddleware, dispatch=create_tracing_middleware()) """ + from opentelemetry import context + from opentelemetry.trace import SpanKind, Status, StatusCode from starlette.requests import Request from starlette.responses import Response, StreamingResponse - from opentelemetry import trace, context - from opentelemetry.trace import Status, StatusCode, SpanKind async def tracing_middleware(request: Request, call_next): # Skip non-API paths (health checks, agent card, etc.) if request.url.path in [ - "/health", "/ready", + "/health", + "/ready", "/.well-known/agent-card.json", "/.well-known/agent-graph-card.json", ]: @@ -311,9 +319,7 @@ async def tracing_middleware(request: Request, call_next): # Try to capture response for output attributes. # This only works for non-streaming responses. 
- if isinstance(response, Response) and not isinstance( - response, StreamingResponse - ): + if isinstance(response, Response) and not isinstance(response, StreamingResponse): # Read response body — we MUST recreate response after _chunks: list[bytes] = [] async for chunk in response.body_iterator: @@ -331,15 +337,9 @@ async def tracing_middleware(request: Request, call_next): if parts: output_text = parts[0].get("text", "") if output_text: - span.set_attribute( - "gen_ai.completion", output_text[:1000] - ) - span.set_attribute( - "output.value", output_text[:1000] - ) - span.set_attribute( - "mlflow.spanOutputs", output_text[:1000] - ) + span.set_attribute("gen_ai.completion", output_text[:1000]) + span.set_attribute("output.value", output_text[:1000]) + span.set_attribute("mlflow.spanOutputs", output_text[:1000]) except Exception as e: logger.debug("Could not parse response body: %s", e) diff --git a/a2a/sandbox_agent/src/sandbox_agent/permissions.py b/a2a/sandbox_agent/src/sandbox_agent/permissions.py index 9e3a8190..7810c5ac 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/permissions.py +++ b/a2a/sandbox_agent/src/sandbox_agent/permissions.py @@ -169,9 +169,7 @@ def _resolve_workspace(settings: dict[str, Any]) -> str: return re.sub(r"/\$\{[^}]+\}$", "", raw) @staticmethod - def _parse_rules( - raw_rules: list[str], workspace: str - ) -> list[tuple[str, str]]: + def _parse_rules(raw_rules: list[str], workspace: str) -> list[tuple[str, str]]: """Parse rule strings into ``(operation_type, glob_pattern)`` pairs. ``${WORKSPACE}`` inside a rule body is expanded to *workspace*. @@ -243,7 +241,7 @@ def _match_shell(pattern: str, operation: str) -> bool: if colon_idx == -1: return False prefix = pattern[:colon_idx] - glob_part = pattern[colon_idx + 1:] + glob_part = pattern[colon_idx + 1 :] if not operation: return False @@ -257,7 +255,7 @@ def _match_shell(pattern: str, operation: str) -> bool: return False # What comes after the prefix (may be empty). 
- remainder = operation[len(prefix):] + remainder = operation[len(prefix) :] # If there is a remainder, it must be separated by a space or be # empty (exact match). This prevents "grep" matching "grepping". @@ -296,7 +294,7 @@ def check_interpreter_bypass(cls, operation: str) -> list[str]: while i < len(parts): if parts[i] in cls._EXEC_FLAGS and i + 1 < len(parts): # Everything after the flag is the inline command. - inline = " ".join(parts[i + 1:]) + inline = " ".join(parts[i + 1 :]) # Strip surrounding quotes if present. if len(inline) >= 2 and inline[0] in ('"', "'") and inline[-1] == inline[0]: inline = inline[1:-1] @@ -331,10 +329,10 @@ def _match_structured(pattern: str, operation: str) -> bool: return False p_action = pattern[:p_colon] - p_path_glob = pattern[p_colon + 1:] + p_path_glob = pattern[p_colon + 1 :] o_action = operation[:o_colon] - o_path = operation[o_colon + 1:] + o_path = operation[o_colon + 1 :] if p_action != o_action: return False diff --git a/a2a/sandbox_agent/src/sandbox_agent/plan_store.py b/a2a/sandbox_agent/src/sandbox_agent/plan_store.py index 47501753..456a768c 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/plan_store.py +++ b/a2a/sandbox_agent/src/sandbox_agent/plan_store.py @@ -108,14 +108,9 @@ def add_steps( raise ValueError(f"Only replanner can add steps, got creator={creator}") steps = plan.get("steps", {}) - non_terminal = [ - k for k, s in steps.items() - if s.get("status") not in _TERMINAL - ] + non_terminal = [k for k, s in steps.items() if s.get("status") not in _TERMINAL] if non_terminal: - raise ValueError( - f"Cannot add steps: steps {non_terminal} are still active" - ) + raise ValueError(f"Cannot add steps: steps {non_terminal} are still active") new_plan = _deep_copy(plan) next_idx = max((int(k) for k in steps), default=0) + 1 @@ -143,7 +138,10 @@ def add_steps( new_plan["steps"][first_new]["subplans"]["a"]["status"] = "running" logger.info( - "Added %d steps (start=%s) by %s", len(new_steps), first_new, creator, + 
"Added %d steps (start=%s) by %s", + len(new_steps), + first_new, + creator, ) return new_plan @@ -167,10 +165,7 @@ def add_alternative_subplan( next_key = chr(ord("a") + len(existing_keys)) step["subplans"][next_key] = { - "substeps": { - str(i + 1): {"description": desc, "status": "pending"} - for i, desc in enumerate(substeps) - }, + "substeps": {str(i + 1): {"description": desc, "status": "pending"} for i, desc in enumerate(substeps)}, "status": "running", "created_by": "replanner", } @@ -179,7 +174,9 @@ def add_alternative_subplan( logger.info( "Created alternative subplan '%s' for step %s (%d substeps)", - next_key, step_key, len(substeps), + next_key, + step_key, + len(substeps), ) return new_plan, next_key @@ -293,10 +290,7 @@ def all_terminal(plan: dict[str, Any]) -> bool: def to_flat_plan(plan: dict[str, Any]) -> list[str]: """Convert to flat list of step descriptions (backward compat).""" - return [ - plan["steps"][k]["description"] - for k in sorted(plan.get("steps", {}), key=int) - ] + return [plan["steps"][k]["description"] for k in sorted(plan.get("steps", {}), key=int)] def to_flat_plan_steps(plan: dict[str, Any]) -> list[dict[str, Any]]: @@ -307,15 +301,17 @@ def to_flat_plan_steps(plan: dict[str, Any]) -> list[dict[str, Any]]: active = step.get("active_subplan", "a") subplan = step.get("subplans", {}).get(active, {}) alt_count = len(step.get("subplans", {})) - 1 # alternatives (excl. 
original) - result.append({ - "index": int(key) - 1, # 0-based for compat - "description": step["description"], - "status": step["status"], - "active_subplan": active, - "alternative_count": alt_count, - "substeps": list(subplan.get("substeps", {}).values()), - "created_by": subplan.get("created_by", "planner"), - }) + result.append( + { + "index": int(key) - 1, # 0-based for compat + "description": step["description"], + "status": step["status"], + "active_subplan": active, + "alternative_count": alt_count, + "substeps": list(subplan.get("substeps", {}).values()), + "created_by": subplan.get("created_by", "planner"), + } + ) return result @@ -327,4 +323,5 @@ def to_flat_plan_steps(plan: dict[str, Any]) -> list[dict[str, Any]]: def _deep_copy(d: dict) -> dict: """Fast deep copy for JSON-compatible dicts.""" import json + return json.loads(json.dumps(d)) diff --git a/a2a/sandbox_agent/src/sandbox_agent/reasoning.py b/a2a/sandbox_agent/src/sandbox_agent/reasoning.py index dcd471fe..b75d6903 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/reasoning.py +++ b/a2a/sandbox_agent/src/sandbox_agent/reasoning.py @@ -36,8 +36,8 @@ from langchain_core.messages import AIMessage, SystemMessage, ToolMessage -from sandbox_agent.budget import AgentBudget from sandbox_agent import plan_store as ps +from sandbox_agent.budget import AgentBudget # openai raises APIStatusError for non-2xx responses (e.g. 402 from the budget proxy) try: @@ -52,15 +52,13 @@ def _is_budget_exceeded_error(exc: Exception) -> bool: return exc.status_code == 402 return "budget_exceeded" in str(exc).lower() or "402" in str(exc) + logger = logging.getLogger(__name__) # Sentinel text returned by the executor when all tool calls in a step have # already been executed (dedup logic). This is an internal coordination # message and must never appear in user-visible output. -_DEDUP_SENTINEL = ( - "Step completed — all requested tool calls " - "have been executed and results are available." 
-) +_DEDUP_SENTINEL = "Step completed — all requested tool calls have been executed and results are available." import os as _os @@ -69,10 +67,17 @@ def _is_budget_exceeded_error(exc: Exception) -> bool: _DEBUG_PROMPTS = _os.environ.get("SANDBOX_DEBUG_PROMPTS", "1") == "1" # Messages that trigger plan resumption rather than replanning. -_CONTINUE_PHRASES = frozenset({ - "continue", "continue on the plan", "go on", "proceed", - "keep going", "next", "carry on", -}) +_CONTINUE_PHRASES = frozenset( + { + "continue", + "continue on the plan", + "go on", + "proceed", + "keep going", + "next", + "carry on", + } +) # --------------------------------------------------------------------------- @@ -82,15 +87,15 @@ def _is_budget_exceeded_error(exc: Exception) -> bool: class PlanStep(TypedDict, total=False): """A single step in the plan with status tracking.""" + index: int description: str - status: str # "pending" | "running" | "done" | "failed" | "skipped" + status: str # "pending" | "running" | "done" | "failed" | "skipped" tool_calls: list[str] result_summary: str iteration_added: int - def _summarize_bound_tools(llm_with_tools: Any) -> list[dict[str, Any]]: """Extract bound tool schemas from a LangChain RunnableBinding for debug display. 
@@ -109,26 +114,28 @@ def _summarize_bound_tools(llm_with_tools: Any) -> list[dict[str, Any]]: for t in tools: if isinstance(t, dict): # Already in OpenAI format - result.append({ - "name": t.get("function", {}).get("name", "?"), - "description": t.get("function", {}).get("description", "")[:200], - "parameters": t.get("function", {}).get("parameters", {}), - }) + result.append( + { + "name": t.get("function", {}).get("name", "?"), + "description": t.get("function", {}).get("description", "")[:200], + "parameters": t.get("function", {}).get("parameters", {}), + } + ) else: # LangChain tool object - result.append({ - "name": getattr(t, "name", "?"), - "description": (getattr(t, "description", "") or "")[:200], - "parameters": getattr(t, "args_schema", {}) if hasattr(t, "args_schema") else {}, - }) + result.append( + { + "name": getattr(t, "name", "?"), + "description": (getattr(t, "description", "") or "")[:200], + "parameters": getattr(t, "args_schema", {}) if hasattr(t, "args_schema") else {}, + } + ) return result except Exception: return [] -def _make_plan_steps( - descriptions: list[str], iteration: int = 0 -) -> list[PlanStep]: +def _make_plan_steps(descriptions: list[str], iteration: int = 0) -> list[PlanStep]: """Convert a list of step descriptions into PlanStep dicts.""" return [ PlanStep( @@ -171,14 +178,14 @@ def _safe_format(template: str, **kwargs: Any) -> str: # Matches: tool_name(key="value", key2="value2") # Handles: shell("ls") (positional), shell(command="ls") (keyword) _TOOL_CALL_RE = re.compile( - r'(\w+)\(([^)]*)\)', + r"(\w+)\(([^)]*)\)", ) # Matches Llama 4 Scout format: [label, tool_name]{"key": "value"} # Examples: [clone_repo, shell]{"command": "git clone ..."} # [rca:ci, delegate]{"task": "analyze CI logs"} _LABEL_TOOL_JSON_RE = re.compile( - r'\[[^\]]*,\s*(\w+)\]\s*(\{[^}]+\})', + r"\[[^\]]*,\s*(\w+)\]\s*(\{[^}]+\})", ) # Known tool names — only parse calls for tools we actually have @@ -258,12 +265,14 @@ def 
parse_text_tool_calls(content: str) -> list[dict[str, Any]]: try: args = json.loads(json_str) if isinstance(args, dict): - calls.append({ - "name": tool_name, - "args": args, - "id": f"text-{uuid.uuid4().hex[:12]}", - "type": "tool_call", - }) + calls.append( + { + "name": tool_name, + "args": args, + "id": f"text-{uuid.uuid4().hex[:12]}", + "type": "tool_call", + } + ) except json.JSONDecodeError: continue @@ -279,12 +288,14 @@ def parse_text_tool_calls(content: str) -> list[dict[str, Any]]: continue args = _parse_kwargs(args_str, tool_name) - calls.append({ - "name": tool_name, - "args": args, - "id": f"text-{uuid.uuid4().hex[:12]}", - "type": "tool_call", - }) + calls.append( + { + "name": tool_name, + "args": args, + "id": f"text-{uuid.uuid4().hex[:12]}", + "type": "tool_call", + } + ) return calls @@ -304,10 +315,7 @@ def maybe_patch_tool_calls(response: AIMessage) -> AIMessage: content = response.content if isinstance(content, list): # Multi-part content — extract text parts - content = " ".join( - b.get("text", "") for b in content - if isinstance(b, dict) and b.get("type") == "text" - ) + content = " ".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text") parsed = parse_text_tool_calls(content) if not parsed: @@ -325,6 +333,7 @@ def maybe_patch_tool_calls(response: AIMessage) -> AIMessage: tool_calls=parsed, ) + # Default budget — used when no explicit budget is passed. 
DEFAULT_BUDGET = AgentBudget() @@ -334,9 +343,15 @@ def maybe_patch_tool_calls(response: AIMessage) -> AIMessage: # --------------------------------------------------------------------------- from sandbox_agent.prompts import ( - PLANNER_SYSTEM as _PLANNER_SYSTEM, EXECUTOR_SYSTEM as _EXECUTOR_SYSTEM, +) +from sandbox_agent.prompts import ( + PLANNER_SYSTEM as _PLANNER_SYSTEM, +) +from sandbox_agent.prompts import ( REFLECTOR_SYSTEM as _REFLECTOR_SYSTEM, +) +from sandbox_agent.prompts import ( REPORTER_SYSTEM as _REPORTER_SYSTEM, ) @@ -356,11 +371,9 @@ def _intercept_respond_to_user(response: Any, node_name: str) -> AIMessage | Non return None tool_names = [ - tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?") - for tc in response.tool_calls + tc.get("name", "?") if isinstance(tc, dict) else getattr(tc, "name", "?") for tc in response.tool_calls ] - logger.info("%s called tools: %s", node_name, tool_names, - extra={"node": node_name.lower()}) + logger.info("%s called tools: %s", node_name, tool_names, extra={"node": node_name.lower()}) for tc in response.tool_calls: name = tc.get("name", "") if isinstance(tc, dict) else getattr(tc, "name", "") @@ -368,7 +381,9 @@ def _intercept_respond_to_user(response: Any, node_name: str) -> AIMessage | Non args = tc.get("args", {}) if isinstance(tc, dict) else getattr(tc, "args", {}) response_text = args.get("response", "") logger.info( - "%s escaped via respond_to_user (%d chars)", node_name, len(response_text), + "%s escaped via respond_to_user (%d chars)", + node_name, + len(response_text), extra={"node": node_name.lower()}, ) # Return a clean AIMessage — no tool_calls so the graph @@ -402,10 +417,7 @@ async def router_node(state: dict[str, Any]) -> dict[str, Any]: if messages: content = getattr(messages[-1], "content", "") if isinstance(content, list): - last_text = " ".join( - b.get("text", "") for b in content - if isinstance(b, dict) and b.get("type") == "text" - ) + last_text = " 
".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text") else: last_text = str(content) last_text_lower = last_text.strip().lower() @@ -421,9 +433,15 @@ async def router_node(state: dict[str, Any]) -> dict[str, Any]: plan_steps[current_step] = {**plan_steps[current_step], "status": "running"} logger.info( "Router: RESUME plan at step %d/%d (plan_status=%s)", - current_step + 1, len(plan_steps), plan_status, - extra={"session_id": state.get("context_id", ""), "node": "router", - "current_step": current_step, "plan_status": plan_status}, + current_step + 1, + len(plan_steps), + plan_status, + extra={ + "session_id": state.get("context_id", ""), + "node": "router", + "current_step": current_step, + "plan_status": plan_status, + }, ) return { "_route": "resume", @@ -435,9 +453,9 @@ async def router_node(state: dict[str, Any]) -> dict[str, Any]: # Reset replan_count — this is a user-driven replan, not an agent loop logger.info( "Router: REPLAN — new message while plan active (plan_status=%s, steps=%d)", - plan_status, len(plan_steps), - extra={"session_id": state.get("context_id", ""), "node": "router", - "plan_status": plan_status}, + plan_status, + len(plan_steps), + extra={"session_id": state.get("context_id", ""), "node": "router", "plan_status": plan_status}, ) return { "_route": "replan", @@ -448,9 +466,11 @@ async def router_node(state: dict[str, Any]) -> dict[str, Any]: } else: # New: no active plan - logger.info("Router: NEW plan (plan_status=%s)", plan_status, - extra={"session_id": state.get("context_id", ""), "node": "router", - "plan_status": plan_status}) + logger.info( + "Router: NEW plan (plan_status=%s)", + plan_status, + extra={"session_id": state.get("context_id", ""), "node": "router", "plan_status": plan_status}, + ) return { "_route": "new", "plan_status": "executing", @@ -478,10 +498,7 @@ def _is_trivial_text_request(messages: list) -> bool: last = messages[-1] content = getattr(last, "content", "") if 
isinstance(content, list): - content = " ".join( - b.get("text", "") for b in content - if isinstance(b, dict) and b.get("type") == "text" - ) + content = " ".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text") text = str(content).strip().lower() if not text: return False @@ -520,9 +537,16 @@ async def planner_node( # Fast-path: trivial text-only requests skip the planner LLM call entirely if iteration == 0 and not prev_plan_steps and _is_trivial_text_request(messages): - logger.info("Fast-path: trivial text request — single-step plan, no LLM call", - extra={"session_id": state.get("context_id", ""), "node": "planner", - "iteration": 0, "step_count": 1, "plan_version": 1}) + logger.info( + "Fast-path: trivial text request — single-step plan, no LLM call", + extra={ + "session_id": state.get("context_id", ""), + "node": "planner", + "iteration": 0, + "step_count": 1, + "plan_version": 1, + }, + ) trivial_steps = _make_plan_steps(["Respond to the user."], iteration=0) store = ps.create_plan(["Respond to the user."], creator="planner") return { @@ -545,7 +569,7 @@ async def planner_node( desc = prev_ps.get("description", "") status = prev_ps.get("status", "pending").upper() result = prev_ps.get("result_summary", "") - line = f" {idx+1}. [{status}] {desc}" + line = f" {idx + 1}. [{status}] {desc}" if result: line += f" — {result[:150]}" context_parts.append(line) @@ -560,7 +584,7 @@ async def planner_node( context_parts.append("Original plan:") for i, step in enumerate(original_plan): status = "DONE" if i < current_step else "PENDING" - context_parts.append(f" {i+1}. [{status}] {step}") + context_parts.append(f" {i + 1}. [{status}] {step}") context_parts.append(f"Progress: {current_step}/{len(original_plan)} steps completed.") context_parts.append("") @@ -590,9 +614,7 @@ async def planner_node( context_parts.append(f" Step {i}: {result}") context_parts.append("") - context_parts.append( - "Adjust the plan for remaining work. 
Do NOT repeat steps that already succeeded." - ) + context_parts.append("Adjust the plan for remaining work. Do NOT repeat steps that already succeeded.") system_content = _PLANNER_SYSTEM if context_parts: @@ -609,15 +631,19 @@ async def planner_node( try: response, planner_capture = await invoke_llm( - llm, plan_messages, - node="planner", session_id=state.get("context_id", ""), + llm, + plan_messages, + node="planner", + session_id=state.get("context_id", ""), workspace_path=state.get("workspace_path", "/workspace"), ) except Exception as exc: if _is_budget_exceeded_error(exc): - logger.warning("Budget exceeded in planner (402 from proxy): %s", exc, - extra={"session_id": state.get("context_id", ""), "node": "planner", - "iteration": iteration}) + logger.warning( + "Budget exceeded in planner (402 from proxy): %s", + exc, + extra={"session_id": state.get("context_id", ""), "node": "planner", "iteration": iteration}, + ) return { "messages": [AIMessage(content=f"Budget exceeded: {exc}")], "done": True, @@ -627,14 +653,14 @@ async def planner_node( prompt_tokens = planner_capture.prompt_tokens completion_tokens = planner_capture.completion_tokens - model_name = planner_capture.model + _model_name = planner_capture.model budget.add_tokens(prompt_tokens + completion_tokens) # Check for respond_to_user escape tool (needed for Llama 4 Scout). 
escaped = _intercept_respond_to_user(response, "Planner") if escaped is not None: response = escaped - elif getattr(response, 'tool_calls', None): + elif getattr(response, "tool_calls", None): # Non-escape tools — pass through for graph tool execution return { "messages": [response], @@ -648,11 +674,20 @@ async def planner_node( new_plan_steps = _make_plan_steps(plan, iteration=iteration) store = ps.create_plan(plan, creator="planner" if iteration == 0 else "replanner") - logger.info("Planner produced %d steps (iteration %d, version %d): %s", - len(plan), iteration, plan_version, plan, - extra={"session_id": state.get("context_id", ""), "node": "planner", - "iteration": iteration, "step_count": len(plan), - "plan_version": plan_version}) + logger.info( + "Planner produced %d steps (iteration %d, version %d): %s", + len(plan), + iteration, + plan_version, + plan, + extra={ + "session_id": state.get("context_id", ""), + "node": "planner", + "iteration": iteration, + "step_count": len(plan), + "plan_version": plan_version, + }, + ) # On replan, preserve completed steps — don't restart from step 0. # Find the first non-done step in the NEW plan to continue from. 
@@ -666,9 +701,12 @@ async def planner_node( for i in range(start_step): if i < len(new_plan_steps): new_plan_steps[i] = {**new_plan_steps[i], "status": "done"} - logger.info("Replan: preserving %d done steps, starting at step %d", - start_step, start_step + 1, - extra={"session_id": state.get("context_id", ""), "node": "planner"}) + logger.info( + "Replan: preserving %d done steps, starting at step %d", + start_step, + start_step + 1, + extra={"session_id": state.get("context_id", ""), "node": "planner"}, + ) else: start_step = 0 @@ -687,8 +725,9 @@ async def planner_node( } -MAX_THINK_ACT_CYCLES = int(_os.environ.get("SANDBOX_MAX_THINK_ACT_CYCLES", - _os.environ.get("SANDBOX_MAX_TOOL_CALLS_PER_STEP", "20"))) +MAX_THINK_ACT_CYCLES = int( + _os.environ.get("SANDBOX_MAX_THINK_ACT_CYCLES", _os.environ.get("SANDBOX_MAX_TOOL_CALLS_PER_STEP", "20")) +) THINKING_ITERATION_BUDGET = int(_os.environ.get("SANDBOX_THINKING_ITERATION_BUDGET", "2")) MAX_PARALLEL_TOOL_CALLS = int(_os.environ.get("SANDBOX_MAX_PARALLEL_TOOL_CALLS", "5")) @@ -724,19 +763,36 @@ async def executor_node( if tool_call_count >= MAX_THINK_ACT_CYCLES: logger.warning( "Step %d hit think-act cycle limit (%d/%d) — forcing step completion", - current_step, tool_call_count, MAX_THINK_ACT_CYCLES, - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step, "tool_call_count": tool_call_count}, + current_step, + tool_call_count, + MAX_THINK_ACT_CYCLES, + extra={ + "session_id": state.get("context_id", ""), + "node": "executor", + "current_step": current_step, + "tool_call_count": tool_call_count, + }, ) result: dict[str, Any] = { - "messages": [AIMessage(content=f"Step {current_step + 1} reached think-act cycle limit ({MAX_THINK_ACT_CYCLES}). Moving to reflection.")], + "messages": [ + AIMessage( + content=f"Step {current_step + 1} reached think-act cycle limit ({MAX_THINK_ACT_CYCLES}). Moving to reflection." 
+ ) + ], "current_step": current_step, "_tool_call_count": 0, "_budget_summary": budget.summary(), } if _DEBUG_PROMPTS: - result["_system_prompt"] = f"[Think-act cycle limit reached — no LLM call]\nStep {current_step + 1}: {tool_call_count}/{MAX_THINK_ACT_CYCLES} cycles" - result["_prompt_messages"] = [{"role": "system", "preview": f"Step {current_step + 1} cycle limit ({tool_call_count}/{MAX_THINK_ACT_CYCLES})"}] + result["_system_prompt"] = ( + f"[Think-act cycle limit reached — no LLM call]\nStep {current_step + 1}: {tool_call_count}/{MAX_THINK_ACT_CYCLES} cycles" + ) + result["_prompt_messages"] = [ + { + "role": "system", + "preview": f"Step {current_step + 1} cycle limit ({tool_call_count}/{MAX_THINK_ACT_CYCLES})", + } + ] result["_llm_response"] = "[no LLM call — cycle limit]" return result @@ -758,9 +814,11 @@ async def executor_node( # Check budget before making the LLM call (refresh from LiteLLM first) if budget.exceeded: - logger.warning("Budget exceeded in executor: %s", budget.exceeded_reason, - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step}) + logger.warning( + "Budget exceeded in executor: %s", + budget.exceeded_reason, + extra={"session_id": state.get("context_id", ""), "node": "executor", "current_step": current_step}, + ) result: dict[str, Any] = { "messages": [AIMessage(content=f"Budget exceeded: {budget.exceeded_reason}")], "current_step": current_step, @@ -789,17 +847,22 @@ async def executor_node( try: response, capture, sub_events = await invoke_with_tool_loop( - llm_with_tools, llm_reason, messages, - node="executor", session_id=state.get("context_id", ""), + llm_with_tools, + llm_reason, + messages, + node="executor", + session_id=state.get("context_id", ""), workspace_path=state.get("workspace_path", "/workspace"), thinking_budget=THINKING_ITERATION_BUDGET, max_parallel_tool_calls=MAX_PARALLEL_TOOL_CALLS, ) except Exception as exc: if _is_budget_exceeded_error(exc): - 
logger.warning("Budget exceeded in executor (402 from proxy): %s", exc, - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step}) + logger.warning( + "Budget exceeded in executor (402 from proxy): %s", + exc, + extra={"session_id": state.get("context_id", ""), "node": "executor", "current_step": current_step}, + ) return { "messages": [AIMessage(content=f"Budget exceeded: {exc}")], "current_step": current_step, @@ -816,7 +879,7 @@ async def executor_node( # Token usage and model from the capture (guaranteed to match what was sent) prompt_tokens = capture.prompt_tokens completion_tokens = capture.completion_tokens - model_name = capture.model + _model_name = capture.model budget.add_tokens(prompt_tokens + completion_tokens) # If the model returned text-based tool calls instead of structured @@ -834,9 +897,14 @@ async def executor_node( if len(response.tool_calls) > MAX_PARALLEL_TOOL_CALLS: logger.info( "Executor returned %d tool calls — keeping first %d (parallel limit)", - len(response.tool_calls), MAX_PARALLEL_TOOL_CALLS, - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step, "tool_call_count": tool_call_count}, + len(response.tool_calls), + MAX_PARALLEL_TOOL_CALLS, + extra={ + "session_id": state.get("context_id", ""), + "node": "executor", + "current_step": current_step, + "tool_call_count": tool_call_count, + }, ) response = AIMessage( content=response.content, @@ -848,14 +916,28 @@ async def executor_node( # log a warning. The reflector will catch the zero-tool-call pattern. 
if not response.tool_calls and pre_patch_content: text_hint = str(pre_patch_content).lower() - if any(kw in text_hint for kw in ("shell(", "file_read(", "file_write(", - "```bash", "```shell", "i would run", - "i will execute", "let me run")): + if any( + kw in text_hint + for kw in ( + "shell(", + "file_read(", + "file_write(", + "```bash", + "```shell", + "i would run", + "i will execute", + "let me run", + ) + ): logger.warning( "Executor produced text resembling a tool call but no actual " "tool_calls were generated — likely a stalled iteration", - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step, "tool_call_count": tool_call_count}, + extra={ + "session_id": state.get("context_id", ""), + "node": "executor", + "current_step": current_step, + "tool_call_count": tool_call_count, + }, ) # -- Loop detection: stop if the executor repeats the same tool call ---- @@ -885,16 +967,19 @@ async def executor_node( if repeat_count >= 2: logger.warning( "Loop detected: %s(%s) called %d times in last 3 — forcing step completion", - tc["name"], str(tc["args"])[:80], repeat_count + 1, - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step}, + tc["name"], + str(tc["args"])[:80], + repeat_count + 1, + extra={"session_id": state.get("context_id", ""), "node": "executor", "current_step": current_step}, ) return { - "messages": [AIMessage( - content=f"Step {current_step + 1} stuck in loop: " - f"{tc['name']}() called {repeat_count + 1} times with same args. " - f"Moving to reflection." - )], + "messages": [ + AIMessage( + content=f"Step {current_step + 1} stuck in loop: " + f"{tc['name']}() called {repeat_count + 1} times with same args. " + f"Moving to reflection." + ) + ], "current_step": current_step, "_tool_call_count": 0, "_budget_summary": budget.summary(), @@ -904,10 +989,7 @@ async def executor_node( # from text parsing (not structured tool_calls). 
parsed_tools: list[dict[str, Any]] = [] if not had_structured_tools and response.tool_calls: - parsed_tools = [ - {"name": tc["name"], "args": tc.get("args", {})} - for tc in response.tool_calls - ] + parsed_tools = [{"name": tc["name"], "args": tc.get("args", {})} for tc in response.tool_calls] # If no tool calls after patching, the executor is either: # (a) Legitimately done with the step (summarizing results) — NORMAL @@ -921,22 +1003,38 @@ async def executor_node( # it's done summarizing. This is normal completion, not a stall. logger.info( "Executor produced text response after %d tool calls for step %d — step complete", - tool_call_count, current_step, - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step, "tool_call_count": tool_call_count}, + tool_call_count, + current_step, + extra={ + "session_id": state.get("context_id", ""), + "node": "executor", + "current_step": current_step, + "tool_call_count": tool_call_count, + }, ) else: no_tool_count += 1 logger.warning( "Executor produced no tool calls for step %d (attempt %d/2)", - current_step, no_tool_count, - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step, "tool_call_count": 0}, + current_step, + no_tool_count, + extra={ + "session_id": state.get("context_id", ""), + "node": "executor", + "current_step": current_step, + "tool_call_count": 0, + }, ) if no_tool_count >= 2: - logger.warning("Executor failed to call tools after 2 attempts — marking step failed", - extra={"session_id": state.get("context_id", ""), "node": "executor", - "current_step": current_step, "tool_call_count": 0}) + logger.warning( + "Executor failed to call tools after 2 attempts — marking step failed", + extra={ + "session_id": state.get("context_id", ""), + "node": "executor", + "current_step": current_step, + "tool_call_count": 0, + }, + ) # Keep the actual LLM response (with text reasoning) for the UI. 
# Append failure note but preserve the model's output for micro_reasoning. actual_content = str(response.content or "") @@ -1034,9 +1132,16 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: for i in range(current_step + 1, len(fd_ps)): if fd_ps[i].get("status") == "pending": fd_ps[i] = {**fd_ps[i], "status": "skipped"} - logger.warning("%s — forcing done", reason, - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "current_step": current_step, "replan_count": replan_count}) + logger.warning( + "%s — forcing done", + reason, + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "current_step": current_step, + "replan_count": replan_count, + }, + ) result: dict[str, Any] = { "step_results": step_results, "plan_steps": fd_ps, @@ -1069,8 +1174,7 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: content = getattr(last_msg, "content", "") if isinstance(content, list): last_content = " ".join( - b.get("text", "") for b in content - if isinstance(b, dict) and b.get("type") == "text" + b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text" ) else: last_content = str(content) @@ -1086,16 +1190,21 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: for msg in reversed(messages): if isinstance(msg, ToolMessage): last_content = str(getattr(msg, "content", "")) - logger.info("Reflector: substituted dedup sentinel with last tool result (%d chars)", - len(last_content), - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "current_step": current_step}) + logger.info( + "Reflector: substituted dedup sentinel with last tool result (%d chars)", + len(last_content), + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "current_step": current_step, + }, + ) break step_results.append(last_content[:500]) step_text = plan[current_step] if current_step < len(plan) else 
"N/A" - plan_text = "\n".join(f"{i+1}. {s}" for i, s in enumerate(plan)) + plan_text = "\n".join(f"{i + 1}. {s}" for i, s in enumerate(plan)) results_text = last_content[:1000] # Hint: if the step result contains error signals, prepend a note @@ -1109,15 +1218,13 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: # Build replan history context — show the LLM what prior replans tried replan_history_text = "" if replan_count > 0: - replan_history_lines = [ - f"REPLAN HISTORY ({replan_count} prior replan(s)):" - ] + replan_history_lines = [f"REPLAN HISTORY ({replan_count} prior replan(s)):"] # Collect failed step summaries from plan_steps for hist_ps in state.get("plan_steps", []): if hist_ps.get("status") == "failed": summary = hist_ps.get("result_summary", "no details") replan_history_lines.append( - f" - Step {hist_ps.get('index', '?')+1} FAILED: {hist_ps.get('description', '?')[:80]}" + f" - Step {hist_ps.get('index', '?') + 1} FAILED: {hist_ps.get('description', '?')[:80]}" f" — {summary[:150]}" ) replan_history_lines.append( @@ -1129,7 +1236,7 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: # Ask LLM to reflect recent_str = ", ".join(recent_decisions[-5:]) if recent_decisions else "none" # Build remaining steps text so reflector knows what's left - remaining = [f"{i+1}. {plan[i]}" for i in range(current_step + 1, len(plan))] + remaining = [f"{i + 1}. 
{plan[i]}" for i in range(current_step + 1, len(plan))] remaining_text = ", ".join(remaining[:5]) if remaining else "NONE — all steps complete" # Build step execution summary for reflector context @@ -1178,28 +1285,37 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: reflect_messages = build_reflector_context(state, system_content) try: response, capture = await invoke_llm( - llm, reflect_messages, - node="reflector", session_id=state.get("context_id", ""), + llm, + reflect_messages, + node="reflector", + session_id=state.get("context_id", ""), workspace_path=state.get("workspace_path", "/workspace"), ) except Exception as exc: if _is_budget_exceeded_error(exc): - logger.warning("Budget exceeded in reflector (402 from proxy): %s", exc, - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "current_step": current_step, "replan_count": replan_count}) + logger.warning( + "Budget exceeded in reflector (402 from proxy): %s", + exc, + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "current_step": current_step, + "replan_count": replan_count, + }, + ) return _force_done(f"Budget exceeded: {exc}") raise prompt_tokens = capture.prompt_tokens completion_tokens = capture.completion_tokens - model_name = capture.model + _model_name = capture.model budget.add_tokens(prompt_tokens + completion_tokens) # Check for respond_to_user escape tool (needed for Llama 4 Scout). 
escaped = _intercept_respond_to_user(response, "Reflector") if escaped is not None: response = escaped - elif getattr(response, 'tool_calls', None): + elif getattr(response, "tool_calls", None): # Non-escape tools — pass through for graph tool execution return { "messages": [response], @@ -1218,9 +1334,13 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: logger.warning( "Reflector said 'done' but %d plan steps remain — overriding to 'continue'", steps_remaining, - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "decision": "done->continue", "current_step": current_step, - "replan_count": replan_count}, + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "decision": "done->continue", + "current_step": current_step, + "replan_count": replan_count, + }, ) decision = "continue" @@ -1245,12 +1365,21 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: logger.info( "Reflector decision: %s (step %d/%d, iter %d, replans=%d, tools=%d, recent=%s)", - decision, current_step + 1, len(plan), iteration, - replan_count, tool_calls_this_iter, + decision, + current_step + 1, + len(plan), + iteration, + replan_count, + tool_calls_this_iter, recent_decisions[-3:], - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "decision": decision, "current_step": current_step, - "replan_count": replan_count, "iteration": iteration}, + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "decision": decision, + "current_step": current_step, + "replan_count": replan_count, + "iteration": iteration, + }, ) base_result: dict[str, Any] = { @@ -1274,8 +1403,11 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: elif decision == "retry": store = ps.set_step_status(store, step_key, "running") except ValueError: - logger.warning("PlanStore: step %s not found (replan?), skipping status update", - step_key, 
extra={"session_id": state.get("context_id", ""), "node": "reflector"}) + logger.warning( + "PlanStore: step %s not found (replan?), skipping status update", + step_key, + extra={"session_id": state.get("context_id", ""), "node": "reflector"}, + ) base_result["_plan_store"] = store if decision == "done": @@ -1303,10 +1435,17 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: "status": "retrying", "retry_count": retry_count, } - logger.info("Retry step %d (attempt %d) — re-executing with different approach", - current_step + 1, plan_steps[current_step].get("retry_count", 1), - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "decision": "retry", "current_step": current_step}) + logger.info( + "Retry step %d (attempt %d) — re-executing with different approach", + current_step + 1, + plan_steps[current_step].get("retry_count", 1), + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "decision": "retry", + "current_step": current_step, + }, + ) return { **base_result, "plan_steps": plan_steps, @@ -1319,10 +1458,17 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: # Mark current step failed if current_step < len(plan_steps): plan_steps[current_step] = {**plan_steps[current_step], "status": "failed"} - logger.info("Replan %d — routing back to planner", new_replan_count, - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "decision": "replan", "current_step": current_step, - "replan_count": new_replan_count}) + logger.info( + "Replan %d — routing back to planner", + new_replan_count, + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "decision": "replan", + "current_step": current_step, + "replan_count": new_replan_count, + }, + ) return { **base_result, "plan_steps": plan_steps, @@ -1345,8 +1491,12 @@ def _force_done(reason: str, *, mark_failed: bool = False) -> dict[str, Any]: logger.info( "All %d planned 
steps completed — routing to reporter", len(plan), - extra={"session_id": state.get("context_id", ""), "node": "reflector", - "decision": "done", "current_step": current_step}, + extra={ + "session_id": state.get("context_id", ""), + "node": "reflector", + "decision": "done", + "current_step": current_step, + }, ) return { **base_result, @@ -1416,10 +1566,8 @@ async def reporter_node( if not step_results and not state.get("messages"): return {"final_answer": "No response generated.", "plan_status": terminal_status} - plan_text = "\n".join(f"{i+1}. {s}" for i, s in enumerate(plan)) - results_text = "\n".join( - f"Step {i+1}: {r}" for i, r in enumerate(step_results) - ) + plan_text = "\n".join(f"{i + 1}. {s}" for i, s in enumerate(plan)) + results_text = "\n".join(f"Step {i + 1}: {r}" for i, r in enumerate(step_results)) # Build step status summary from plan_steps step_status_lines = [] @@ -1431,7 +1579,7 @@ async def reporter_node( has_partial = True desc = rpt_ps.get("description", "")[:80] result = rpt_ps.get("result_summary", "")[:100] - line = f"{idx+1}. [{status}] {desc}" + line = f"{idx + 1}. [{status}] {desc}" if result and status in ("FAILED", "PARTIAL"): line += f" — {result}" step_status_lines.append(line) @@ -1455,10 +1603,7 @@ async def reporter_node( ) # Filter dedup sentinel messages from conversation history passed to the # reporter LLM so it cannot echo them in the final answer. 
- filtered_msgs = [ - m for m in state["messages"] - if _DEDUP_SENTINEL not in str(getattr(m, "content", "")) - ] + filtered_msgs = [m for m in state["messages"] if _DEDUP_SENTINEL not in str(getattr(m, "content", ""))] reporter_messages = [SystemMessage(content=system_content)] + filtered_msgs # Use invoke_with_tool_loop when llm_reason is available (thinking mode), @@ -1469,8 +1614,11 @@ async def reporter_node( try: response, capture, sub_events = await invoke_with_tool_loop( - llm, llm_reason, reporter_messages, - node="reporter", session_id=state.get("context_id", ""), + llm, + llm_reason, + reporter_messages, + node="reporter", + session_id=state.get("context_id", ""), workspace_path=state.get("workspace_path", "/workspace"), thinking_budget=2, max_parallel_tool_calls=3, @@ -1479,8 +1627,11 @@ async def reporter_node( ) except Exception as exc: if _is_budget_exceeded_error(exc): - logger.warning("Budget exceeded in reporter (402 from proxy): %s", exc, - extra={"session_id": state.get("context_id", ""), "node": "reporter"}) + logger.warning( + "Budget exceeded in reporter (402 from proxy): %s", + exc, + extra={"session_id": state.get("context_id", ""), "node": "reporter"}, + ) return { "messages": [AIMessage(content="Task completed (budget exhausted before final summary).")], "final_answer": "Task completed (budget exhausted before final summary).", @@ -1494,14 +1645,19 @@ async def reporter_node( try: response, capture = await invoke_llm( - llm, reporter_messages, - node="reporter", session_id=state.get("context_id", ""), + llm, + reporter_messages, + node="reporter", + session_id=state.get("context_id", ""), workspace_path=state.get("workspace_path", "/workspace"), ) except Exception as exc: if _is_budget_exceeded_error(exc): - logger.warning("Budget exceeded in reporter (402 from proxy): %s", exc, - extra={"session_id": state.get("context_id", ""), "node": "reporter"}) + logger.warning( + "Budget exceeded in reporter (402 from proxy): %s", + exc, + 
extra={"session_id": state.get("context_id", ""), "node": "reporter"}, + ) return { "messages": [AIMessage(content="Task completed (budget exhausted before final summary).")], "final_answer": "Task completed (budget exhausted before final summary).", @@ -1513,14 +1669,14 @@ async def reporter_node( prompt_tokens = capture.prompt_tokens completion_tokens = capture.completion_tokens - model_name = capture.model + _model_name = capture.model budget.add_tokens(prompt_tokens + completion_tokens) # Handle respond_to_user escape tool (Llama 4 Scout always calls tools) escaped = _intercept_respond_to_user(response, "Reporter") if escaped is not None: response = escaped - elif getattr(response, 'tool_calls', None): + elif getattr(response, "tool_calls", None): # Response has real tool calls — return to graph for tool execution return { "messages": [response], @@ -1531,10 +1687,7 @@ async def reporter_node( content = response.content if isinstance(content, list): - text = " ".join( - b.get("text", "") for b in content - if isinstance(b, dict) and b.get("type") == "text" - ) + text = " ".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text") else: text = str(content) @@ -1552,16 +1705,19 @@ async def reporter_node( cmd = args.get("command", "") # Extract file paths from common shell patterns import re as _re - for match in _re.findall(r'(?:>|>>|tee)\s+(\S+)', cmd): + + for match in _re.findall(r"(?:>|>>|tee)\s+(\S+)", cmd): if match not in files_touched: files_touched.append(match) - logger.info("Reporter: plan_status=%s (done=%d, failed=%d, total=%d)", - terminal_status, - sum(1 for s in plan_steps if s.get("status") == "done"), - sum(1 for s in plan_steps if s.get("status") == "failed"), - len(plan_steps), - extra={"session_id": state.get("context_id", ""), "node": "reporter"}) + logger.info( + "Reporter: plan_status=%s (done=%d, failed=%d, total=%d)", + terminal_status, + sum(1 for s in plan_steps if s.get("status") == "done"), + sum(1 
for s in plan_steps if s.get("status") == "failed"), + len(plan_steps), + extra={"session_id": state.get("context_id", ""), "node": "reporter"}, + ) result: dict[str, Any] = { "messages": [response], @@ -1615,10 +1771,7 @@ def _parse_plan(content: str | list) -> list[str]: Returns a list of step descriptions. """ if isinstance(content, list): - text = " ".join( - b.get("text", "") for b in content - if isinstance(b, dict) and b.get("type") == "text" - ) + text = " ".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text") else: text = str(content) @@ -1630,7 +1783,7 @@ def _parse_plan(content: str | list) -> list[str]: # Strip the number prefix: "1. Do X" -> "Do X" for i, ch in enumerate(line): if ch in ".)" and i < 4: - step = line[i + 1:].strip() + step = line[i + 1 :].strip() if step: steps.append(step) break @@ -1649,10 +1802,7 @@ def _parse_decision(content: str | list) -> str: Defaults to ``continue`` if the output is ambiguous. """ if isinstance(content, list): - text = " ".join( - b.get("text", "") for b in content - if isinstance(b, dict) and b.get("type") == "text" - ) + text = " ".join(b.get("text", "") for b in content if isinstance(b, dict) and b.get("type") == "text") else: text = str(content) @@ -1665,4 +1815,4 @@ def _parse_decision(content: str | list) -> str: return "continue" -_BARE_DECISION_RE = re.compile(r'^(continue|retry|replan|done|hitl)\s*$', re.IGNORECASE) +_BARE_DECISION_RE = re.compile(r"^(continue|retry|replan|done|hitl)\s*$", re.IGNORECASE) diff --git a/a2a/sandbox_agent/src/sandbox_agent/sandbox_subprocess.py b/a2a/sandbox_agent/src/sandbox_agent/sandbox_subprocess.py index cea9063e..cb26b69b 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/sandbox_subprocess.py +++ b/a2a/sandbox_agent/src/sandbox_agent/sandbox_subprocess.py @@ -129,7 +129,9 @@ async def sandboxed_subprocess( try: process = await asyncio.create_subprocess_exec( - sys.executable, "-c", child_script, + sys.executable, + "-c", + 
child_script, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, env=child_env, diff --git a/a2a/sandbox_agent/src/sandbox_agent/sources.py b/a2a/sandbox_agent/src/sandbox_agent/sources.py index bd2bf68f..016fb887 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/sources.py +++ b/a2a/sandbox_agent/src/sandbox_agent/sources.py @@ -15,7 +15,6 @@ from pathlib import Path from typing import Any - _DEFAULT_MAX_EXECUTION_TIME_SECONDS = 300 _DEFAULT_MAX_MEMORY_MB = 2048 @@ -116,11 +115,7 @@ def is_domain_allowed(self, domain: str) -> bool: def max_execution_time_seconds(self) -> int: """Maximum execution time for a single run, in seconds.""" runtime: dict[str, Any] = self._data.get("runtime", {}) - return int( - runtime.get( - "max_execution_time_seconds", _DEFAULT_MAX_EXECUTION_TIME_SECONDS - ) - ) + return int(runtime.get("max_execution_time_seconds", _DEFAULT_MAX_EXECUTION_TIME_SECONDS)) @property def max_memory_mb(self) -> int: diff --git a/a2a/sandbox_agent/src/sandbox_agent/subagents.py b/a2a/sandbox_agent/src/sandbox_agent/subagents.py index c1b7fcb3..d2ba5f70 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/subagents.py +++ b/a2a/sandbox_agent/src/sandbox_agent/subagents.py @@ -23,7 +23,7 @@ import subprocess import uuid from pathlib import Path -from typing import Any, Optional +from typing import Any import asyncpg from langchain_core.messages import HumanMessage, SystemMessage @@ -37,9 +37,7 @@ _MAX_SUB_AGENT_ITERATIONS = 15 # Delegation mode configuration -_DELEGATION_MODES = os.environ.get( - "DELEGATION_MODES", "in-process,shared-pvc,isolated,sidecar" -).split(",") +_DELEGATION_MODES = os.environ.get("DELEGATION_MODES", "in-process,shared-pvc,isolated,sidecar").split(",") _DEFAULT_MODE = os.environ.get("DEFAULT_DELEGATION_MODE", "in-process") # Maximum iterations for in-process sub-agents to prevent runaway loops. 
@@ -72,11 +70,23 @@ async def grep(pattern: str, path: str = ".") -> str: try: result = subprocess.run( - ["grep", "-rn", "--include=*.py", "--include=*.md", - "--include=*.yaml", "--include=*.yml", "--include=*.json", - "--include=*.txt", "--include=*.sh", "--include=*.go", - pattern, str(target)], - capture_output=True, text=True, timeout=30, + [ + "grep", + "-rn", + "--include=*.py", + "--include=*.md", + "--include=*.yaml", + "--include=*.yml", + "--include=*.json", + "--include=*.txt", + "--include=*.sh", + "--include=*.go", + pattern, + str(target), + ], + capture_output=True, + text=True, + timeout=30, cwd=str(ws_root), ) output = result.stdout[:10000] @@ -131,7 +141,7 @@ async def list_files(path: str = ".", pattern: str = "*") -> str: matches = sorted(str(p.relative_to(ws_root)) for p in target.rglob(pattern) if p.is_file()) if len(matches) > 200: matches = matches[:200] - matches.append(f"... and more (truncated at 200)") + matches.append("... and more (truncated at 200)") return "\n".join(matches) if matches else "No files found." return [grep, read_file, list_files] @@ -148,6 +158,7 @@ def create_explore_graph(workspace: str, llm: Any) -> Any: async def assistant(state: MessagesState) -> dict[str, Any]: from sandbox_agent.reasoning import maybe_patch_tool_calls + system = SystemMessage( content=( "You are a codebase research assistant. 
Your job is to find " @@ -229,15 +240,15 @@ async def _register_child_session( try: conn = await asyncpg.connect(pg_url) # Check if context already exists - existing = await conn.fetchval( - "SELECT COUNT(*) FROM tasks WHERE context_id = $1", child_context_id - ) + existing = await conn.fetchval("SELECT COUNT(*) FROM tasks WHERE context_id = $1", child_context_id) if existing == 0: - metadata = json.dumps({ - "agent_name": agent_name, - "parent_context_id": parent_context_id, - "title": task[:80], - }) + metadata = json.dumps( + { + "agent_name": agent_name, + "parent_context_id": parent_context_id, + "title": task[:80], + } + ) status = json.dumps({"state": "working"}) await conn.execute( "INSERT INTO tasks (id, context_id, status, metadata, history, artifacts) " @@ -307,6 +318,7 @@ async def _run_in_process( async def assistant(state: MessagesState) -> dict[str, Any]: from sandbox_agent.reasoning import maybe_patch_tool_calls + system = SystemMessage( content=( "You are a sub-agent working on a delegated task. 
Complete the task " @@ -350,8 +362,11 @@ async def assistant(state: MessagesState) -> dict[str, Any]: async def _run_shared_pvc( - task: str, child_context_id: str, namespace: str = "team1", - variant: str = "sandbox-legion", timeout_minutes: int = 30, + task: str, + child_context_id: str, + namespace: str = "team1", + variant: str = "sandbox-legion", + timeout_minutes: int = 30, ) -> str: """Spawn a pod that mounts the parent's PVC (placeholder).""" logger.info("shared-pvc delegation: child=%s task=%s", child_context_id, task) @@ -363,8 +378,11 @@ async def _run_shared_pvc( async def _run_isolated( - task: str, child_context_id: str, namespace: str = "team1", - variant: str = "sandbox-legion", timeout_minutes: int = 30, + task: str, + child_context_id: str, + namespace: str = "team1", + variant: str = "sandbox-legion", + timeout_minutes: int = 30, ) -> str: """Spawn an isolated pod via SandboxClaim CRD (placeholder).""" logger.info("isolated delegation: child=%s task=%s", child_context_id, task) @@ -376,14 +394,13 @@ async def _run_isolated( async def _run_sidecar( - task: str, child_context_id: str, variant: str = "sandbox-legion", + task: str, + child_context_id: str, + variant: str = "sandbox-legion", ) -> str: """Inject a sidecar container (placeholder).""" logger.info("sidecar delegation: child=%s task=%s", child_context_id, task) - return ( - f"Sidecar delegation requested for '{task}' " - f"(child={child_context_id}). Not yet implemented." - ) + return f"Sidecar delegation requested for '{task}' (child={child_context_id}). Not yet implemented." def make_delegate_tool( From 4f804c6bba2e2523922b744993a3926efa54e39e Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 22:40:53 +0100 Subject: [PATCH 23/26] fix: move try/except import after clean import block (I001) Ruff I001 requires contiguous import blocks. The try/except for DatabaseTaskStore was breaking the a2a imports block. Moved it after all clean imports. 
Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/agent.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/a2a/sandbox_agent/src/sandbox_agent/agent.py b/a2a/sandbox_agent/src/sandbox_agent/agent.py index d75b29f6..be8346a9 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/agent.py +++ b/a2a/sandbox_agent/src/sandbox_agent/agent.py @@ -21,13 +21,6 @@ from a2a.server.events.event_queue import EventQueue from a2a.server.request_handlers import DefaultRequestHandler from a2a.server.tasks import InMemoryTaskStore, TaskUpdater - -try: - from a2a.server.tasks import DatabaseTaskStore - - _HAS_SQL_STORE = True -except ImportError: - _HAS_SQL_STORE = False from a2a.types import ( AgentCapabilities, AgentCard, @@ -41,6 +34,13 @@ from langgraph.checkpoint.memory import MemorySaver from starlette.routing import Route +try: + from a2a.server.tasks import DatabaseTaskStore + + _HAS_SQL_STORE = True +except ImportError: + _HAS_SQL_STORE = False + from sandbox_agent.budget import AgentBudget from sandbox_agent.configuration import Configuration from sandbox_agent.event_serializer import LangGraphSerializer From 1212d242839406ac3980896db508bf2e9d547e13 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 22:44:14 +0100 Subject: [PATCH 24/26] fix: remove from __future__ import annotations (not needed for Python 3.11+, fixes I001) Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/agent.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/a2a/sandbox_agent/src/sandbox_agent/agent.py b/a2a/sandbox_agent/src/sandbox_agent/agent.py index be8346a9..6eb960b6 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/agent.py +++ b/a2a/sandbox_agent/src/sandbox_agent/agent.py @@ -4,8 +4,6 @@ and LangGraph graph to serve the A2A protocol over HTTP. 
""" -from __future__ import annotations - import asyncio import hashlib import json From 1cdf692fb3f55834ea83859383994fffa9a8f893 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Mon, 16 Mar 2026 22:48:50 +0100 Subject: [PATCH 25/26] fix: sort imports per ruff 0.11.4 ordering (a2a as first-party) Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/agent.py | 7 ++++--- a2a/sandbox_agent/src/sandbox_agent/event_serializer.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/a2a/sandbox_agent/src/sandbox_agent/agent.py b/a2a/sandbox_agent/src/sandbox_agent/agent.py index 6eb960b6..0718d764 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/agent.py +++ b/a2a/sandbox_agent/src/sandbox_agent/agent.py @@ -14,6 +14,10 @@ from typing import Any import uvicorn +from langchain_core.messages import HumanMessage +from langgraph.checkpoint.memory import MemorySaver +from starlette.routing import Route + from a2a.server.agent_execution import AgentExecutor, RequestContext from a2a.server.apps import A2AStarletteApplication from a2a.server.events.event_queue import EventQueue @@ -28,9 +32,6 @@ TextPart, ) from a2a.utils import new_agent_text_message, new_task -from langchain_core.messages import HumanMessage -from langgraph.checkpoint.memory import MemorySaver -from starlette.routing import Route try: from a2a.server.tasks import DatabaseTaskStore diff --git a/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py b/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py index 8e039ef7..c67f7c7c 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py +++ b/a2a/sandbox_agent/src/sandbox_agent/event_serializer.py @@ -113,7 +113,6 @@ def __init__(self, loop_id: str | None = None, context_id: str | None = None) -> self._prev_node: str | None = None # previous node for node_transition events def serialize(self, key: str, value: dict) -> str: - # Emit node_transition meta-event when the node changes transition_line: str | None = None 
if self._prev_node is not None and key != self._prev_node: From bed64f562ac88a45b7ee53bcda0bfdabc2576752 Mon Sep 17 00:00:00 2001 From: Ladislav Smola Date: Tue, 17 Mar 2026 06:52:28 +0100 Subject: [PATCH 26/26] fix(security): path traversal, shlex parsing, dead code cleanup - Validate context_id against traversal (workspace.py) - Use is_relative_to instead of startswith (subagents.py) - Use shlex.split for interpreter/sources checks (permissions.py, executor.py) - Remove duplicate _MAX_SUB_AGENT_ITERATIONS (subagents.py) - Remove dead _BARE_DECISION_RE (reasoning.py) Signed-off-by: Ladislav Smola --- a2a/sandbox_agent/src/sandbox_agent/executor.py | 5 ++++- a2a/sandbox_agent/src/sandbox_agent/permissions.py | 6 +++++- a2a/sandbox_agent/src/sandbox_agent/reasoning.py | 3 --- a2a/sandbox_agent/src/sandbox_agent/subagents.py | 5 +---- a2a/sandbox_agent/src/sandbox_agent/workspace.py | 12 +++++++++--- 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/a2a/sandbox_agent/src/sandbox_agent/executor.py b/a2a/sandbox_agent/src/sandbox_agent/executor.py index 6dc5f7eb..672e85c6 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/executor.py +++ b/a2a/sandbox_agent/src/sandbox_agent/executor.py @@ -237,7 +237,10 @@ def _check_sources(self, operation: str) -> str | None: """ import re - parts = operation.split() + try: + parts = shlex.split(operation) + except ValueError: + parts = operation.split() if not parts: return None diff --git a/a2a/sandbox_agent/src/sandbox_agent/permissions.py b/a2a/sandbox_agent/src/sandbox_agent/permissions.py index 7810c5ac..b634dbe4 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/permissions.py +++ b/a2a/sandbox_agent/src/sandbox_agent/permissions.py @@ -23,6 +23,7 @@ import enum import fnmatch import re +import shlex from typing import Any # --------------------------------------------------------------------------- @@ -280,7 +281,10 @@ def check_interpreter_bypass(cls, operation: str) -> list[str]: if not operation: return [] - 
parts = operation.split() + try: + parts = shlex.split(operation) + except ValueError: + parts = operation.split() if not parts: return [] diff --git a/a2a/sandbox_agent/src/sandbox_agent/reasoning.py b/a2a/sandbox_agent/src/sandbox_agent/reasoning.py index b75d6903..ec5a6c71 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/reasoning.py +++ b/a2a/sandbox_agent/src/sandbox_agent/reasoning.py @@ -1813,6 +1813,3 @@ def _parse_decision(content: str | list) -> str: return decision return "continue" - - -_BARE_DECISION_RE = re.compile(r"^(continue|retry|replan|done|hitl)\s*$", re.IGNORECASE) diff --git a/a2a/sandbox_agent/src/sandbox_agent/subagents.py b/a2a/sandbox_agent/src/sandbox_agent/subagents.py index d2ba5f70..02f2cbfa 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/subagents.py +++ b/a2a/sandbox_agent/src/sandbox_agent/subagents.py @@ -40,9 +40,6 @@ _DELEGATION_MODES = os.environ.get("DELEGATION_MODES", "in-process,shared-pvc,isolated,sidecar").split(",") _DEFAULT_MODE = os.environ.get("DEFAULT_DELEGATION_MODE", "in-process") -# Maximum iterations for in-process sub-agents to prevent runaway loops. -_MAX_SUB_AGENT_ITERATIONS = 15 - # --------------------------------------------------------------------------- # In-process sub-agent: explore (C20, mode 1) @@ -109,7 +106,7 @@ async def read_file(path: str) -> str: File contents (truncated to 20000 chars). """ resolved = (ws_root / path).resolve() - if not str(resolved).startswith(str(ws_root)): + if not resolved.is_relative_to(ws_root): return "Error: path resolves outside the workspace." if not resolved.is_file(): return f"Error: file not found at '{path}'." 
diff --git a/a2a/sandbox_agent/src/sandbox_agent/workspace.py b/a2a/sandbox_agent/src/sandbox_agent/workspace.py index e047d7d7..5858eb62 100644 --- a/a2a/sandbox_agent/src/sandbox_agent/workspace.py +++ b/a2a/sandbox_agent/src/sandbox_agent/workspace.py @@ -44,8 +44,15 @@ def __init__( # Public API # ------------------------------------------------------------------ + @staticmethod + def _validate_context_id(context_id: str) -> None: + """Reject context IDs that could escape the workspace root.""" + if not context_id or "/" in context_id or ".." in context_id or "\x00" in context_id: + raise ValueError(f"Invalid context_id: {context_id!r}") + def get_workspace_path(self, context_id: str) -> str: """Return the workspace path for *context_id* without creating it.""" + self._validate_context_id(context_id) return os.path.join(self.workspace_root, context_id) def ensure_workspace(self, context_id: str) -> str: @@ -60,10 +67,9 @@ def ensure_workspace(self, context_id: str) -> str: Raises ------ ValueError - If *context_id* is empty. + If *context_id* is empty or contains path-traversal characters. """ - if not context_id: - raise ValueError("context_id must not be empty") + self._validate_context_id(context_id) workspace_path = self.get_workspace_path(context_id) context_file = Path(workspace_path) / ".context.json"