Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
f38031a
feat(sandbox): add sandbox_agent package init
Ladas Mar 16, 2026
b4c2d65
feat(sandbox): A2A server with event streaming, session management, a…
Ladas Mar 16, 2026
f29250d
feat(sandbox): budget tracking with iteration, token, tool-call, and …
Ladas Mar 16, 2026
03ccd98
feat(sandbox): pydantic configuration with per-node LLM model overrides
Ladas Mar 16, 2026
df4beba
feat(sandbox): context builders for per-node message isolation in the…
Ladas Mar 16, 2026
720d0ec
feat(sandbox): typed event schema for LangGraph node events streamed …
Ladas Mar 16, 2026
67b5971
feat(sandbox): event serializer converting LangGraph events to common…
Ladas Mar 16, 2026
d3bb92e
feat(sandbox): shell executor with permission-checked command executi…
Ladas Mar 16, 2026
8fd4b9a
feat(sandbox): LangGraph agent graph with plan-execute-reflect loop a…
Ladas Mar 16, 2026
e8768fd
feat(sandbox): graph card manifest with event catalog and topology in…
Ladas Mar 16, 2026
ec59f47
feat(sandbox): raw ctypes wrapper for Linux Landlock LSM syscalls (x8…
Ladas Mar 16, 2026
58876bd
feat(sandbox): startup probe that verifies Landlock isolation in a fo…
Ladas Mar 16, 2026
40437f2
feat(sandbox): OpenTelemetry observability with tracing middleware an…
Ladas Mar 16, 2026
5d93d5b
feat(sandbox): three-tier permission checker with deny/allow/HITL rul…
Ladas Mar 16, 2026
205094a
feat(sandbox): append-only nested plan store with main steps and alte…
Ladas Mar 16, 2026
2b060dc
feat(sandbox): system prompt templates for planner, executor, reflect…
Ladas Mar 16, 2026
8b5c3e2
feat(sandbox): plan-execute-reflect reasoning loop with router, plann…
Ladas Mar 16, 2026
133ba5c
feat(sandbox): per-tool-call Landlock isolation via subprocess fork w…
Ladas Mar 16, 2026
5c0ff33
feat(sandbox): sources.json capability loader for package managers, r…
Ladas Mar 16, 2026
3058627
feat(sandbox): sub-agent tools with explore (read-only) and delegate …
Ladas Mar 16, 2026
c7882d0
feat(sandbox): workspace manager for per-context_id directory isolati…
Ladas Mar 16, 2026
b1de4fa
fix: resolve ruff lint violations — import ordering, unused vars, for…
Ladas Mar 16, 2026
4f804c6
fix: move try/except import after clean import block (I001)
Ladas Mar 16, 2026
1212d24
fix: remove from __future__ import annotations (not needed for Python…
Ladas Mar 16, 2026
1cdf692
fix: sort imports per ruff 0.11.4 ordering (a2a as first-party)
Ladas Mar 16, 2026
bed64f5
fix(security): path traversal, shlex parsing, dead code cleanup
Ladas Mar 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
1,066 changes: 1,066 additions & 0 deletions a2a/sandbox_agent/src/sandbox_agent/agent.py

Large diffs are not rendered by default.

173 changes: 173 additions & 0 deletions a2a/sandbox_agent/src/sandbox_agent/budget.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
"""Budget tracking for the plan-execute-reflect reasoning loop.

Prevents runaway execution by capping iterations, tool calls per step,
total token usage, and wall clock time. When the budget is exceeded the
reflector forces the loop to terminate gracefully.

Token budget is enforced via the LLM Budget Proxy:
- The proxy intercepts all LLM calls and checks per-session token usage
- When budget is exceeded, the proxy returns HTTP 402
- The agent catches 402 errors and terminates gracefully
- The local ``tokens_used`` counter tracks in-process usage for budget
summary events (emitted to the UI) and for the local ``exceeded`` check

Budget scopes:
- **Per-message** (single graph run): max_iterations, max_wall_clock_s, recursion_limit
- **Per-step** (within one plan step): max_tool_calls_per_step
- **Per-session** (across A2A turns + restarts): enforced by LLM Budget Proxy

Budget parameters are configurable via environment variables:

- ``SANDBOX_MAX_ITERATIONS`` (default: 200)
- ``SANDBOX_MAX_TOOL_CALLS_PER_STEP`` (default: 20)
- ``SANDBOX_MAX_TOKENS`` (default: 1000000) — passed to proxy via metadata
- ``SANDBOX_MAX_WALL_CLOCK_S`` (default: 3600) — max seconds per message (1 hour)
- ``SANDBOX_HITL_INTERVAL`` (default: 50)
- ``SANDBOX_RECURSION_LIMIT`` (default: 300)
- ``SANDBOX_LLM_TIMEOUT`` (default: 300) — seconds per LLM call
- ``SANDBOX_LLM_MAX_RETRIES`` (default: 3) — retry on transient LLM errors
"""

from __future__ import annotations

import logging
import os
import time
from dataclasses import dataclass, field

logger = logging.getLogger(__name__)


def _env_int(name: str, default: int) -> int:
    """Read an integer from the environment, falling back to *default*.

    Parameters
    ----------
    name:
        Environment variable to read.
    default:
        Value returned when the variable is unset or not a valid integer.

    A malformed value must not crash startup, but it should be loud:
    we log a warning instead of silently ignoring the misconfiguration.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        # Visible fallback — a typo'd env var previously vanished silently.
        logging.getLogger(__name__).warning(
            "Invalid integer %r for %s; falling back to %d", raw, name, default
        )
        return default


@dataclass
class AgentBudget:
    """Resource-usage ledger for a single plan-execute-reflect run.

    Limit fields (constructor arguments with env-derived defaults)
    --------------------------------------------------------------
    max_iterations:
        Cap on outer-loop passes (planner → executor → reflector).
    max_tool_calls_per_step:
        Cap on tool invocations the executor may issue for one plan step.
    max_tokens:
        Soft ceiling on total tokens (prompt + completion). Advisory only
        here — the LLM Budget Proxy is the authoritative enforcer.
    max_wall_clock_s:
        Cap on wall clock seconds for one message run.
    hitl_interval:
        Every this-many iterations the reflector suggests a human check-in.
    recursion_limit:
        Forwarded to LangGraph's graph invocation config.
    """

    max_iterations: int = _env_int("SANDBOX_MAX_ITERATIONS", 200)
    max_tool_calls_per_step: int = _env_int("SANDBOX_MAX_TOOL_CALLS_PER_STEP", 20)
    max_tokens: int = _env_int("SANDBOX_MAX_TOKENS", 1_000_000)
    max_wall_clock_s: int = _env_int("SANDBOX_MAX_WALL_CLOCK_S", 3600)  # 1 hour
    hitl_interval: int = _env_int("SANDBOX_HITL_INTERVAL", 50)
    recursion_limit: int = _env_int("SANDBOX_RECURSION_LIMIT", 300)
    llm_timeout: int = _env_int("SANDBOX_LLM_TIMEOUT", 300)
    llm_max_retries: int = _env_int("SANDBOX_LLM_MAX_RETRIES", 3)

    # Runtime counters; excluded from __init__ so callers only pass limits.
    iterations_used: int = field(default=0, init=False)
    tokens_used: int = field(default=0, init=False)
    tool_calls_this_step: int = field(default=0, init=False)
    _start_time: float = field(default_factory=time.monotonic, init=False)

    # -- helpers -------------------------------------------------------------

    def tick_iteration(self) -> None:
        """Bump the outer-loop iteration count."""
        self.iterations_used += 1

    def add_tokens(self, count: int) -> None:
        """Add *count* tokens (prompt + completion) to the running total.

        This local counter feeds budget summary events and the local
        ``exceeded`` check; the LLM Budget Proxy does the real blocking
        (it answers HTTP 402 once the session budget is spent).
        """
        self.tokens_used += count
        if not self.tokens_exceeded:
            return
        logger.warning(
            "Budget: tokens exceeded %d/%d",
            self.tokens_used,
            self.max_tokens,
        )

    def tick_tool_call(self) -> None:
        """Bump the tool-call count for the current plan step."""
        self.tool_calls_this_step += 1

    def reset_step_tools(self) -> None:
        """Zero the per-step tool-call counter when a new plan step begins."""
        self.tool_calls_this_step = 0

    # -- queries -------------------------------------------------------------

    @property
    def wall_clock_s(self) -> float:
        """Seconds elapsed since this budget was instantiated."""
        now = time.monotonic()
        return now - self._start_time

    @property
    def iterations_exceeded(self) -> bool:
        """True once the iteration cap has been reached."""
        return self.max_iterations <= self.iterations_used

    @property
    def tokens_exceeded(self) -> bool:
        """True once the local token ceiling has been reached."""
        return self.max_tokens <= self.tokens_used

    @property
    def wall_clock_exceeded(self) -> bool:
        """True once the run has used up its wall clock allowance."""
        return self.max_wall_clock_s <= self.wall_clock_s

    @property
    def step_tools_exceeded(self) -> bool:
        """True once the current step has hit its tool-call cap."""
        return self.max_tool_calls_per_step <= self.tool_calls_this_step

    @property
    def exceeded(self) -> bool:
        """True if any locally-enforced limit has been hit.

        Tokens are deliberately excluded: the LLM Budget Proxy enforces
        the token budget via HTTP 402, which the executor, reflector,
        and reporter nodes catch.
        """
        return any((self.iterations_exceeded, self.wall_clock_exceeded))

    @property
    def exceeded_reason(self) -> str | None:
        """Explain which local limit was hit, or None when within budget."""
        reason = None
        if self.iterations_exceeded:
            reason = f"Iteration limit reached ({self.iterations_used}/{self.max_iterations})"
        elif self.wall_clock_exceeded:
            reason = f"Time limit reached ({self.wall_clock_s:.0f}s/{self.max_wall_clock_s}s)"
        return reason

    @property
    def needs_hitl_checkin(self) -> bool:
        """True exactly on iterations that are positive multiples of ``hitl_interval``."""
        if self.hitl_interval <= 0 or self.iterations_used <= 0:
            return False
        return self.iterations_used % self.hitl_interval == 0

    def summary(self) -> dict:
        """Serialize the budget state for emission in UI events."""
        return dict(
            tokens_used=self.tokens_used,
            tokens_budget=self.max_tokens,
            iterations_used=self.iterations_used,
            iterations_budget=self.max_iterations,
            wall_clock_s=round(self.wall_clock_s, 1),
            max_wall_clock_s=self.max_wall_clock_s,
        )
30 changes: 30 additions & 0 deletions a2a/sandbox_agent/src/sandbox_agent/configuration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from pydantic_settings import BaseSettings


class Configuration(BaseSettings):
    """Runtime settings for the sandbox agent.

    Populated by pydantic-settings from the environment, with the
    defaults below as fallbacks.
    """

    # LLM connection defaults.
    llm_model: str = "llama3.1"
    llm_api_base: str = "http://localhost:11434/v1"
    llm_api_key: str = "dummy"

    # Agent workspace and persistence settings.
    workspace_root: str = "/workspace"
    checkpoint_db_url: str = "memory"
    context_ttl_days: int = 7

    # Per-node model overrides (empty string = fall back to llm_model).
    llm_model_planner: str = ""
    llm_model_executor: str = ""
    llm_model_reflector: str = ""
    llm_model_reporter: str = ""
    llm_model_thinking: str = ""  # bare LLM for thinking iterations
    llm_model_micro_reasoning: str = ""  # LLM+tools for micro-reasoning

    def model_for_node(self, node: str) -> str:
        """Return the model name to use for *node*, or the default model.

        Unknown node names and empty overrides both resolve to ``llm_model``.
        """
        per_node = {
            "planner": self.llm_model_planner,
            "executor": self.llm_model_executor,
            "reflector": self.llm_model_reflector,
            "reporter": self.llm_model_reporter,
            "thinking": self.llm_model_thinking,
            "micro_reasoning": self.llm_model_micro_reasoning,
        }
        override = per_node.get(node)
        if override:
            return override
        return self.llm_model
Loading
Loading