From c36c7e2c1976a156fdd155d91fd707728f0d0d43 Mon Sep 17 00:00:00 2001 From: up2itnow0822 Date: Fri, 20 Mar 2026 12:05:00 -0500 Subject: [PATCH 1/2] feat: add financial governance evaluators (spend limits + transaction policy) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the financial governance evaluator proposed in #129, following the technical guidance from @lan17: 1. Decoupled from data source — SpendStore protocol with pluggable backends (InMemorySpendStore included, PostgreSQL/Redis via custom implementation) 2. No new tables in core agent control — self-contained contrib package 3. Context-aware limits — channel/agent/session overrides via evaluate metadata 4. Python SDK compatible — standard Evaluator interface, works with both server and SDK evaluation engine Two evaluators: - financial_governance.spend_limit: Cumulative spend tracking with per-transaction caps and rolling period budgets - financial_governance.transaction_policy: Static policy enforcement (currency allowlists, recipient blocklists, amount bounds) 53 tests passing. 
Closes #129 Signed-off-by: up2itnow0822 Signed-off-by: up2itnow0822 Signed-off-by: up2itnow0822 --- .../contrib/financial-governance/README.md | 185 +++++++ .../financial-governance/pyproject.toml | 55 +++ .../__init__.py | 46 ++ .../spend_limit/__init__.py | 12 + .../spend_limit/config.py | 68 +++ .../spend_limit/evaluator.py | 329 ++++++++++++ .../spend_limit/store.py | 187 +++++++ .../transaction_policy/__init__.py | 9 + .../transaction_policy/config.py | 85 ++++ .../transaction_policy/evaluator.py | 260 ++++++++++ .../financial-governance/tests/__init__.py | 0 .../tests/test_spend_limit.py | 467 ++++++++++++++++++ .../tests/test_transaction_policy.py | 361 ++++++++++++++ 13 files changed, 2064 insertions(+) create mode 100644 evaluators/contrib/financial-governance/README.md create mode 100644 evaluators/contrib/financial-governance/pyproject.toml create mode 100644 evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/__init__.py create mode 100644 evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/__init__.py create mode 100644 evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/config.py create mode 100644 evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/evaluator.py create mode 100644 evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/store.py create mode 100644 evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/__init__.py create mode 100644 evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/config.py create mode 100644 evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/evaluator.py create mode 100644 
evaluators/contrib/financial-governance/tests/__init__.py create mode 100644 evaluators/contrib/financial-governance/tests/test_spend_limit.py create mode 100644 evaluators/contrib/financial-governance/tests/test_transaction_policy.py diff --git a/evaluators/contrib/financial-governance/README.md b/evaluators/contrib/financial-governance/README.md new file mode 100644 index 00000000..8e5e5b58 --- /dev/null +++ b/evaluators/contrib/financial-governance/README.md @@ -0,0 +1,185 @@ +# Financial Governance Evaluators for Agent Control + +Evaluators that enforce financial spend limits and transaction policies for autonomous AI agents. + +As agents transact autonomously via protocols like [x402](https://github.com/coinbase/x402) and payment layers like [agentpay-mcp](https://github.com/AI-Agent-Economy/agentpay-mcp), enterprises need governance over what agents spend. These evaluators bring financial policy enforcement into the Agent Control framework. + +## Evaluators + +### `financial_governance.spend_limit` + +Tracks cumulative agent spend and enforces rolling budget limits. Stateful — records approved transactions and checks new ones against accumulated spend. + +- **Per-transaction cap** — reject any single payment above a threshold +- **Rolling period budget** — reject payments that would exceed a time-windowed budget +- **Context-aware overrides** — different limits per channel, agent, or session via evaluate metadata +- **Pluggable storage** — abstract `SpendStore` protocol with built-in `InMemorySpendStore`; bring your own PostgreSQL, Redis, etc. + +### `financial_governance.transaction_policy` + +Static policy checks with no state tracking. Enforces structural rules on individual transactions. 
+ +- **Currency allowlist** — only permit specific currencies (e.g., `["USDC", "USDT"]`) +- **Recipient blocklist/allowlist** — control which addresses an agent can pay +- **Amount bounds** — minimum and maximum per-transaction limits + +## Installation + +```bash +# From the repo root (development) +cd evaluators/contrib/financial-governance +pip install -e ".[dev]" +``` + +## Configuration + +### Spend Limit + +```yaml +controls: + - name: spend-limit + evaluator: + type: financial_governance.spend_limit + config: + max_per_transaction: 100.0 # Max USDC per single payment + max_per_period: 1000.0 # Rolling 24h budget + period_seconds: 86400 # Budget window (default: 24 hours) + currency: USDC # Currency to govern + selector: + path: input # Extract step.input (transaction dict) + action: deny +``` + +### Transaction Policy + +```yaml +controls: + - name: transaction-policy + evaluator: + type: financial_governance.transaction_policy + config: + allowed_currencies: [USDC, USDT] + blocked_recipients: ["0xDEAD..."] + allowed_recipients: ["0xALICE...", "0xBOB..."] + min_amount: 0.01 + max_amount: 5000.0 + selector: + path: input + action: deny +``` + +## Selector Paths + +Both evaluators support two selector configurations: + +- **`selector.path: "input"`** (recommended) — The evaluator receives `step.input` directly, which should be the transaction dict. +- **`selector.path: "*"`** — The evaluator receives the full Step object. It automatically extracts `step.input` for transaction fields and `step.context` for channel/agent/session metadata. 
+ +## Input Data Schema + +The transaction dict (from `step.input`) should contain: + +```python +# step.input — transaction payload +{ + "amount": 50.0, # required — transaction amount + "currency": "USDC", # required — payment currency + "recipient": "0xABC...", # required — payment recipient +} +``` + +## Context-Aware Limits + +Context fields (`channel`, `agent_id`, `session_id`) and per-context limit overrides can be provided in two ways: + +**Option A: Via `step.context`** (recommended for engine integration) + +```python +step = Step( + type="tool", + name="payment", + input={"amount": 75.0, "currency": "USDC", "recipient": "0xABC"}, + context={ + "channel": "experimental", + "agent_id": "agent-42", + "channel_max_per_transaction": 50.0, + "channel_max_per_period": 200.0, + }, +) +``` + +When using `selector.path: "*"`, the evaluator merges `step.context` fields into the transaction data automatically. When using `selector.path: "input"`, context fields must be included directly in `step.input`. + +**Option B: Inline in the transaction dict** (simpler, for direct SDK use) + +```python +result = await evaluator.evaluate({ + "amount": 75.0, + "currency": "USDC", + "recipient": "0xABC", + "channel": "experimental", + "channel_max_per_transaction": 50.0, + "channel_max_per_period": 200.0, +}) +``` + +Spend budgets are **scoped by context** — spend in channel A does not count against channel B's budget. When no context fields are present, budgets are global. + +## Custom SpendStore + +The `SpendStore` protocol requires two methods. 
Implement them for your backend: + +```python +from agent_control_evaluator_financial_governance.spend_limit import ( + SpendStore, + SpendLimitConfig, + SpendLimitEvaluator, +) + +class PostgresSpendStore: + """Example: PostgreSQL-backed spend tracking.""" + + def __init__(self, connection_string: str): + self._conn = connect(connection_string) + + def record_spend(self, amount: float, currency: str, metadata: dict | None = None) -> None: + self._conn.execute( + "INSERT INTO agent_spend (amount, currency, metadata, recorded_at) VALUES (%s, %s, %s, NOW())", + (amount, currency, json.dumps(metadata)), + ) + + def get_spend(self, currency: str, since_timestamp: float, scope: dict[str, str] | None = None) -> float: + # NOTE: the evaluator calls get_spend(..., scope=...) on every period check, so the parameter is required. + # A production store should additionally filter on `metadata` when `scope` is given; this example sums globally. + row = self._conn.execute( + "SELECT COALESCE(SUM(amount), 0) FROM agent_spend WHERE currency = %s AND recorded_at >= to_timestamp(%s)", + (currency, since_timestamp), + ).fetchone() + return float(row[0]) + +# Use it: +store = PostgresSpendStore("postgresql://...") +evaluator = SpendLimitEvaluator(config, store=store) +``` + +## Running Tests + +```bash +cd evaluators/contrib/financial-governance +pip install -e ".[dev]" +pytest tests/ -v +``` + +## Design Decisions + +1. **Decoupled from data source** — The `SpendStore` protocol means no new tables in core Agent Control. Bring your own persistence. +2. **Context-aware limits** — Override keys in the evaluate data dict allow per-channel, per-agent, or per-session limits without multiple evaluator instances. +3. **Python SDK compatible** — Uses the standard evaluator interface; works with both the server and the Python SDK evaluation engine. +4. **Fail-open on malformed data** — Missing or malformed transaction data returns `matched=False` with an explanatory `message`; the `error` field is reserved for evaluator crashes, timeouts, and missing dependencies, following Agent Control conventions. 
+ +## Related Projects + +- [x402](https://github.com/coinbase/x402) — HTTP 402 payment protocol +- [agentpay-mcp](https://github.com/up2itnow0822/agentpay-mcp) — MCP server for non-custodial agent payments + +## License + +Apache-2.0 — see [LICENSE](../../../LICENSE). diff --git a/evaluators/contrib/financial-governance/pyproject.toml b/evaluators/contrib/financial-governance/pyproject.toml new file mode 100644 index 00000000..c833a911 --- /dev/null +++ b/evaluators/contrib/financial-governance/pyproject.toml @@ -0,0 +1,55 @@ +[project] +name = "agent-control-evaluator-financial-governance" +version = "0.1.0" +description = "Financial governance evaluators for agent-control — spend limits and transaction policy enforcement" +readme = "README.md" +requires-python = ">=3.12" +license = { text = "Apache-2.0" } +authors = [{ name = "agent-control contributors" }] +keywords = ["agent-control", "evaluator", "financial", "spend-limit", "x402", "agentpay"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.12", + "Topic :: Software Development :: Libraries", +] +dependencies = [ + "agent-control-evaluators>=3.0.0", + "agent-control-models>=3.0.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.0.0", + "pytest-asyncio>=0.23.0", + "pytest-cov>=4.0.0", + "ruff>=0.1.0", + "mypy>=1.8.0", +] + +[project.entry-points."agent_control.evaluators"] +"financial_governance.spend_limit" = "agent_control_evaluator_financial_governance.spend_limit:SpendLimitEvaluator" +"financial_governance.transaction_policy" = "agent_control_evaluator_financial_governance.transaction_policy:TransactionPolicyEvaluator" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/agent_control_evaluator_financial_governance"] + +[tool.ruff] 
+line-length = 100 +target-version = "py312" + +[tool.ruff.lint] +select = ["E", "F", "I"] + +[tool.pytest.ini_options] +asyncio_mode = "auto" + +[tool.uv.sources] +agent-control-evaluators = { path = "../../builtin", editable = true } +agent-control-models = { path = "../../../models", editable = true } diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/__init__.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/__init__.py new file mode 100644 index 00000000..3ead88f3 --- /dev/null +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/__init__.py @@ -0,0 +1,46 @@ +"""Financial governance evaluators for agent-control. + +Provides two evaluators for enforcing financial policy on AI agent transactions: + +- ``financial_governance.spend_limit``: Tracks cumulative spend against rolling + period budgets and per-transaction caps. +- ``financial_governance.transaction_policy``: Static policy checks — allowlists, + blocklists, amount bounds, and permitted currencies. + +Both evaluators are registered automatically when this package is installed and +the ``agent_control.evaluators`` entry point group is discovered. 
+ +Example usage in an agent-control control config:: + + { + "condition": { + "selector": {"path": "*"}, + "evaluator": { + "name": "financial_governance.spend_limit", + "config": { + "max_per_transaction": 100.0, + "max_per_period": 1000.0, + "period_seconds": 86400, + "currency": "USDC" + } + } + }, + "action": {"decision": "deny"} + } +""" + +from agent_control_evaluator_financial_governance.spend_limit import ( + SpendLimitConfig, + SpendLimitEvaluator, +) +from agent_control_evaluator_financial_governance.transaction_policy import ( + TransactionPolicyConfig, + TransactionPolicyEvaluator, +) + +__all__ = [ + "SpendLimitEvaluator", + "SpendLimitConfig", + "TransactionPolicyEvaluator", + "TransactionPolicyConfig", +] diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/__init__.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/__init__.py new file mode 100644 index 00000000..cebe9fc7 --- /dev/null +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/__init__.py @@ -0,0 +1,12 @@ +"""Spend-limit evaluator package.""" + +from .config import SpendLimitConfig +from .evaluator import SpendLimitEvaluator +from .store import InMemorySpendStore, SpendStore + +__all__ = [ + "SpendLimitEvaluator", + "SpendLimitConfig", + "SpendStore", + "InMemorySpendStore", +] diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/config.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/config.py new file mode 100644 index 00000000..dc4dbb19 --- /dev/null +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/config.py @@ -0,0 +1,68 @@ +"""Configuration model for the spend-limit evaluator.""" + +from __future__ import annotations + +from pydantic import 
class SpendLimitConfig(EvaluatorConfig):
    """Settings consumed by :class:`~.evaluator.SpendLimitEvaluator`.

    Every monetary value is denominated in *currency* units.

    Attributes:
        max_per_transaction: Ceiling for a single transaction's ``amount``.
            ``0.0`` switches the per-transaction check off.
        max_per_period: Ceiling for the cumulative spend inside the rolling
            *period_seconds* window. ``0.0`` switches the period check off.
        period_seconds: Width of the rolling budget window, in seconds.
            Defaults to ``86400`` (24 hours); must be at least 1.
        currency: Currency symbol the policy governs (e.g. ``"USDC"``).
            Stored upper-cased. Transactions in any other currency are
            reported as *not matched* by the evaluator.

    Example config dict::

        {
            "max_per_transaction": 500.0,
            "max_per_period": 5000.0,
            "period_seconds": 86400,
            "currency": "USDC"
        }
    """

    max_per_transaction: float = Field(
        0.0,
        ge=0.0,
        description=(
            "Per-transaction spend cap in *currency* units. 0.0 means no per-transaction limit."
        ),
    )
    max_per_period: float = Field(
        0.0,
        ge=0.0,
        description=(
            "Maximum cumulative spend allowed in the rolling period window. "
            "0.0 means no period limit."
        ),
    )
    period_seconds: int = Field(
        86400,
        ge=1,
        description="Rolling budget window length in seconds (default: 86400 = 24 h).",
    )
    currency: str = Field(
        ...,
        min_length=1,
        description="Currency symbol this policy applies to (e.g. 'USDC', 'ETH').",
    )

    @field_validator("currency")
    @classmethod
    def normalize_currency(cls, value: str) -> str:
        """Upper-case the currency symbol so later comparisons are uniform."""
        normalized = value.upper()
        return normalized
'USDC', 'ETH').", + ) + + @field_validator("currency") + @classmethod + def normalize_currency(cls, v: str) -> str: + """Normalize currency symbol to upper-case for consistent comparison.""" + return v.upper() diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/evaluator.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/evaluator.py new file mode 100644 index 00000000..71a198de --- /dev/null +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/evaluator.py @@ -0,0 +1,329 @@ +"""Spend-limit evaluator — tracks cumulative agent spend against rolling budgets.""" + +from __future__ import annotations + +import time +from typing import Any + +from agent_control_evaluators import ( + Evaluator, + EvaluatorMetadata, + register_evaluator, +) +from agent_control_models import EvaluatorResult + +from .config import SpendLimitConfig +from .store import InMemorySpendStore, SpendStore + + +def _extract_float(data: dict[str, Any], key: str) -> float | None: + """Safely extract a float value from *data* by *key*.""" + raw = data.get(key) + if raw is None: + return None + try: + return float(raw) + except (TypeError, ValueError): + return None + + +@register_evaluator +class SpendLimitEvaluator(Evaluator[SpendLimitConfig]): + """Evaluator that enforces per-transaction and rolling-period spend limits. + + ``matched=True`` means the transaction **violates** the configured limits + and should be blocked. ``matched=False`` means the transaction is within + budget and may proceed. + + Thread safety: + The evaluator itself is stateless. All mutable state lives in the + injected :class:`~.store.SpendStore`. The default + :class:`~.store.InMemorySpendStore` is thread-safe. + + Instance caching note: + Evaluator instances are cached and reused across requests (see base + class docstring). 
@register_evaluator
class SpendLimitEvaluator(Evaluator[SpendLimitConfig]):
    """Evaluator that enforces per-transaction and rolling-period spend limits.

    ``matched=True`` means the transaction **violates** the configured limits
    and should be blocked. ``matched=False`` means the transaction is within
    budget and may proceed (or could not be evaluated at all).

    Thread safety:
        The evaluator keeps no per-request state on ``self``; all mutable
        state lives in the injected :class:`~.store.SpendStore`. Note that
        :meth:`evaluate` reads the period total and records the new spend as
        two separate store calls, so concurrent evaluations against the same
        budget can each pass the check before either one is recorded.

    Instance caching note:
        Only the ``SpendStore`` instance is mutable; do not add per-request
        state to ``self``.

    Evaluating context-aware limits:
        The ``data`` dict may contain the override keys
        ``channel_max_per_transaction`` and ``channel_max_per_period``. A
        valid (non-negative, non-NaN) override replaces the corresponding
        config value for that call; invalid overrides are ignored.

    Args:
        config: Validated :class:`SpendLimitConfig`.
        store: Optional :class:`SpendStore` implementation. Defaults to a new
            :class:`InMemorySpendStore` when not provided.

    Input ``data`` schema::

        {
            "amount": float,       # required — transaction amount
            "currency": str,       # required — payment currency
            "recipient": str,      # recipient address or identifier
            # optional context fields
            "channel": str,
            "agent_id": str,
            "session_id": str,
            # optional per-call limit overrides
            "channel_max_per_transaction": float,
            "channel_max_per_period": float
        }

    Example::

        from agent_control_evaluator_financial_governance.spend_limit import (
            SpendLimitConfig,
            SpendLimitEvaluator,
        )

        config = SpendLimitConfig(
            max_per_transaction=100.0,
            max_per_period=1000.0,
            period_seconds=86400,
            currency="USDC",
        )
        evaluator = SpendLimitEvaluator(config)
        result = await evaluator.evaluate({
            "amount": 50.0,
            "currency": "USDC",
            "recipient": "0xABC...",
        })
        # result.matched == False  → transaction is within limits
    """

    metadata = EvaluatorMetadata(
        name="financial_governance.spend_limit",
        version="0.1.0",
        description=(
            "Tracks cumulative agent spend and enforces per-transaction caps "
            "and rolling period budgets. Supports pluggable SpendStore backends."
        ),
    )
    config_model = SpendLimitConfig

    def __init__(
        self,
        config: SpendLimitConfig,
        store: SpendStore | None = None,
    ) -> None:
        super().__init__(config)
        self._store: SpendStore = store if store is not None else InMemorySpendStore()

    # ------------------------------------------------------------------
    # Input normalization helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _normalize_data(data: Any) -> tuple[dict[str, Any] | None, dict[str, Any]]:
        """Extract transaction fields and step context from selector output.

        Handles two selector paths:
        - ``selector.path: "input"`` → data IS the transaction dict.
        - ``selector.path: "*"`` → data is the full Step dict with ``input``
          and ``context`` sub-keys.

        Returns:
            ``(tx_data, step_context)``. ``tx_data`` is the transaction dict
            (``None`` when it cannot be found); ``step_context`` is the Step's
            ``context`` dict, or ``{}`` when absent or not a dict.
        """
        if not isinstance(data, dict):
            return None, {}

        # A dict carrying both "type" and "input" is treated as a Step.
        if "type" in data and "input" in data:
            tx = data.get("input")
            raw_ctx = data.get("context")
            ctx: dict[str, Any] = raw_ctx if isinstance(raw_ctx, dict) else {}
            if not isinstance(tx, dict):
                return None, ctx

            # Copy context identifiers and limit overrides into the transaction
            # view; values already present in step.input win.
            merged = dict(tx)
            for key in (
                "channel",
                "agent_id",
                "session_id",
                "channel_max_per_transaction",
                "channel_max_per_period",
            ):
                if key in ctx and key not in merged:
                    merged[key] = ctx[key]
            return merged, ctx

        # Otherwise assume data IS the transaction dict (selector.path: "input").
        return data, {}

    @staticmethod
    def _context_fields(data: dict[str, Any]) -> dict[str, str]:
        """Return the non-null channel/agent_id/session_id values as strings.

        The same stringified mapping is used both as the ``scope`` filter and
        as the recorded metadata, so a record always matches the scope it was
        written under (e.g. an integer ``agent_id``).
        """
        return {
            key: str(data[key])
            for key in ("channel", "agent_id", "session_id")
            if data.get(key) is not None
        }

    @staticmethod
    def _resolve_limit(data: dict[str, Any], override_key: str, configured: float) -> float:
        """Return the per-call override for *override_key*, else *configured*.

        Missing, unparseable, NaN, or negative overrides are ignored so that
        malformed context cannot switch a configured limit off.
        """
        override = _extract_float(data, override_key)
        # ``override >= 0`` is False for NaN as well as for negative numbers.
        if override is None or not override >= 0:
            return configured
        return override

    # ------------------------------------------------------------------
    # Main evaluation entry point
    # ------------------------------------------------------------------

    async def evaluate(self, data: Any) -> EvaluatorResult:
        """Evaluate a transaction against configured spend limits.

        Args:
            data: Transaction dict (when ``selector.path`` is ``"input"``)
                or full Step dict (when path is ``"*"``). Transaction fields:
                ``amount``, ``currency``, ``recipient``. Context fields
                (``channel``, ``agent_id``, ``session_id``) can live in the
                transaction dict or in ``step.context``.

        Returns:
            ``EvaluatorResult`` where ``matched=True`` indicates a limit
            violation (transaction should be denied). Missing or malformed
            input yields ``matched=False`` with an explanatory message. A
            transaction that passes every check is recorded in the store
            before the result is returned.
        """
        if data is None:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message="No transaction data provided; skipping spend-limit check",
            )

        tx_data, _step_ctx = self._normalize_data(data)
        if tx_data is None:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message=(
                    "Could not extract transaction data from selector output; "
                    "skipping spend-limit check"
                ),
            )
        data = tx_data

        # ---- Extract required fields ----
        # NOTE: Malformed selector output is NOT an evaluator error. The
        # ``error`` field is reserved for evaluator crashes / timeouts /
        # missing dependencies. Missing or invalid fields in the data dict
        # are normal "does not match" results.
        amount = _extract_float(data, "amount")
        if amount is None:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message="Transaction data missing required field 'amount'; cannot evaluate",
            )
        # ``not amount > 0`` also rejects NaN, which ``amount <= 0`` lets through.
        if not amount > 0:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message=f"Transaction amount must be positive, got {amount}; cannot evaluate",
            )

        # An explicit ``None`` must count as missing, not as the string "NONE".
        raw_currency = data.get("currency")
        tx_currency = "" if raw_currency is None else str(raw_currency).strip().upper()
        if not tx_currency:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message="Transaction data missing required field 'currency'; cannot evaluate",
            )

        raw_recipient = data.get("recipient")
        recipient = "" if raw_recipient is None else str(raw_recipient).strip()

        # ---- Currency filter — only enforce policy for configured currency ----
        if tx_currency != self.config.currency:
            return EvaluatorResult(
                matched=False,
                confidence=1.0,
                message=(
                    f"Transaction currency '{tx_currency}' does not match policy "
                    f"currency '{self.config.currency}'; skipping"
                ),
                metadata={"tx_currency": tx_currency, "policy_currency": self.config.currency},
            )

        # ---- Resolve effective limits (context/metadata overrides) ----
        effective_max_per_tx = self._resolve_limit(
            data, "channel_max_per_transaction", self.config.max_per_transaction
        )
        effective_max_per_period = self._resolve_limit(
            data, "channel_max_per_period", self.config.max_per_period
        )

        # ---- Per-transaction cap (0 disables the check) ----
        if effective_max_per_tx > 0 and amount > effective_max_per_tx:
            return EvaluatorResult(
                matched=True,
                confidence=1.0,
                message=(
                    f"Transaction amount {amount} {tx_currency} exceeds per-transaction "
                    f"cap of {effective_max_per_tx} {tx_currency}"
                ),
                metadata={
                    "violation": "per_transaction_cap",
                    "amount": amount,
                    "max_per_transaction": effective_max_per_tx,
                    "currency": tx_currency,
                    "recipient": recipient,
                },
            )

        # Context identifiers isolate budgets: with any of them present only
        # spend recorded under the same context counts; otherwise all spend does.
        context = self._context_fields(data)

        # ---- Rolling period budget (0 disables the check) ----
        if effective_max_per_period > 0:
            window_start = time.time() - self.config.period_seconds
            period_spend = self._store.get_spend(
                tx_currency, window_start, scope=context or None
            )
            projected = period_spend + amount

            if projected > effective_max_per_period:
                return EvaluatorResult(
                    matched=True,
                    confidence=1.0,
                    message=(
                        f"Transaction would bring period spend to "
                        f"{projected:.4f} {tx_currency}, exceeding the "
                        f"{self.config.period_seconds}s budget of "
                        f"{effective_max_per_period} {tx_currency} "
                        f"(current period spend: {period_spend:.4f})"
                    ),
                    metadata={
                        "violation": "period_budget",
                        "amount": amount,
                        "current_period_spend": period_spend,
                        "projected_period_spend": projected,
                        "max_per_period": effective_max_per_period,
                        "period_seconds": self.config.period_seconds,
                        "currency": tx_currency,
                        "recipient": recipient,
                    },
                )

        # ---- Transaction is within limits — record it ----
        # Metadata reuses the stringified context so later scoped queries match.
        self._store.record_spend(
            amount=amount,
            currency=tx_currency,
            metadata={**context, "recipient": recipient},
        )

        return EvaluatorResult(
            matched=False,
            confidence=1.0,
            message=(
                f"Transaction of {amount} {tx_currency} to '{recipient}' is within limits"
            ),
            metadata={
                "amount": amount,
                "currency": tx_currency,
                "recipient": recipient,
            },
        )
b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/store.py @@ -0,0 +1,187 @@ +"""SpendStore protocol and built-in InMemorySpendStore implementation. + +The SpendStore abstraction decouples the spend-limit evaluator from any +particular persistence backend. The default ``InMemorySpendStore`` requires no +external dependencies and is suitable for single-process deployments or testing. + +For production multi-process or multi-replica deployments you should implement a +custom SpendStore backed by a durable store such as PostgreSQL or Redis. See +README.md for an example. +""" + +from __future__ import annotations + +import time +from collections import deque +from threading import Lock +from typing import Any, Protocol, runtime_checkable + + +@runtime_checkable +class SpendStore(Protocol): + """Protocol that all spend store implementations must satisfy. + + Implementations are free to choose any persistence mechanism (in-memory, + Redis, PostgreSQL, …). Both methods must be thread-safe. + """ + + def record_spend( + self, + amount: float, + currency: str, + metadata: dict[str, Any] | None = None, + ) -> None: + """Persist a completed (or pending) spend record. + + Args: + amount: Positive monetary amount that was spent. + currency: ISO-4217 or token symbol (e.g. ``"USDC"``). + metadata: Optional key-value bag for agent_id, session_id, etc. + """ + ... + + def get_spend( + self, + currency: str, + since_timestamp: float, + scope: dict[str, str] | None = None, + ) -> float: + """Return total spend for *currency* since *since_timestamp*. + + Args: + currency: Currency symbol to query (case-sensitive). + since_timestamp: Unix timestamp (seconds). Only records whose + ``recorded_at`` is >= this value are included. + scope: Optional key-value pairs to filter by metadata fields. + For example, ``{"channel": "slack"}`` returns only spend + recorded with that channel in metadata. 
When None, returns + all spend regardless of metadata. + + Returns: + Sum of all matching spend amounts. Returns 0.0 when no records + match. + """ + ... + + +class _SpendRecord: + """Internal record stored by :class:`InMemorySpendStore`.""" + + __slots__ = ("amount", "currency", "recorded_at", "metadata") + + def __init__( + self, + amount: float, + currency: str, + recorded_at: float, + metadata: dict[str, Any] | None, + ) -> None: + self.amount = amount + self.currency = currency + self.recorded_at = recorded_at + self.metadata = metadata + + def matches_scope(self, scope: dict[str, str]) -> bool: + """Check if this record's metadata matches all scope key-value pairs.""" + if not self.metadata: + return False + return all( + self.metadata.get(k) == v + for k, v in scope.items() + ) + + +class InMemorySpendStore: + """Thread-safe in-memory implementation of :class:`SpendStore`. + + Records are kept in a ``deque`` ordered by insertion time. A background + sweep prunes records older than *max_age_seconds* to prevent unbounded + memory growth. + + This implementation is **not** suitable for multi-process or distributed + deployments because each process maintains an independent ledger. Use it + for single-process services, local development, and tests. + + Args: + max_age_seconds: Records older than this many seconds are eligible for + pruning. Defaults to 7 days (604 800 s). + """ + + def __init__(self, max_age_seconds: int = 604_800) -> None: + self._max_age_seconds = max_age_seconds + self._records: deque[_SpendRecord] = deque() + self._lock = Lock() + + # ------------------------------------------------------------------ + # SpendStore protocol implementation + # ------------------------------------------------------------------ + + def record_spend( + self, + amount: float, + currency: str, + metadata: dict[str, Any] | None = None, + ) -> None: + """Record a spend event at the current wall-clock time. + + Args: + amount: Positive monetary amount. 
+ currency: Currency symbol (e.g. ``"USDC"``). + metadata: Optional context bag (agent_id, session_id, channel, …). + """ + if amount <= 0: + raise ValueError(f"amount must be positive, got {amount!r}") + + now = time.time() + record = _SpendRecord( + amount=amount, + currency=currency, + recorded_at=now, + metadata=metadata, + ) + with self._lock: + self._records.append(record) + self._prune_locked(now) + + def get_spend( + self, + currency: str, + since_timestamp: float, + scope: dict[str, str] | None = None, + ) -> float: + """Sum all spend for *currency* since *since_timestamp*. + + Args: + currency: Currency symbol (case-sensitive). + since_timestamp: Unix epoch seconds (inclusive lower bound). + scope: Optional metadata filter. When provided, only records + whose metadata contains all specified key-value pairs are + included. When None, all records for the currency are summed. + + Returns: + Total spend as a float. + """ + with self._lock: + total = 0.0 + for r in self._records: + if r.currency != currency or r.recorded_at < since_timestamp: + continue + if scope is not None and not r.matches_scope(scope): + continue + total += r.amount + return total + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _prune_locked(self, now: float) -> None: + """Remove records older than *max_age_seconds* (called with lock held).""" + cutoff = now - self._max_age_seconds + while self._records and self._records[0].recorded_at < cutoff: + self._records.popleft() + + def record_count(self) -> int: + """Return the current number of stored records (useful for tests).""" + with self._lock: + return len(self._records) diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/__init__.py 
"""Transaction-policy evaluator package.

Consolidated view of ``transaction_policy/__init__.py``, ``config.py`` and
``evaluator.py``; the section banners below name the module each definition
belongs to.
"""

from __future__ import annotations

import math
from typing import Any

from pydantic import Field, field_validator, model_validator

from agent_control_evaluators import (
    Evaluator,
    EvaluatorConfig,
    EvaluatorMetadata,
    register_evaluator,
)
from agent_control_models import EvaluatorResult

# ---------------------------------------------------------------------------
# transaction_policy/__init__.py — public API
# ---------------------------------------------------------------------------

__all__ = [
    "TransactionPolicyEvaluator",
    "TransactionPolicyConfig",
]


# ---------------------------------------------------------------------------
# transaction_policy/config.py — configuration model
# ---------------------------------------------------------------------------


class TransactionPolicyConfig(EvaluatorConfig):
    """Configuration for :class:`TransactionPolicyEvaluator`.

    All list fields default to empty lists (no restriction applied).  A field
    is only enforced when it contains at least one entry.

    Attributes:
        allowed_recipients: If non-empty, **only** recipients in this list are
            permitted.  Transactions to any other address are blocked.
        blocked_recipients: Recipients that are explicitly prohibited.  Checked
            before ``allowed_recipients``.
        min_amount: Minimum transaction amount (inclusive).  ``0.0`` disables
            the lower bound check.
        max_amount: Maximum transaction amount (inclusive).  ``0.0`` disables
            the upper bound check.
        allowed_currencies: If non-empty, **only** currencies in this list are
            permitted.  Entries are stripped and upper-cased on load.

    Example config dict::

        {
            "allowed_recipients": ["0xABC...", "0xDEF..."],
            "blocked_recipients": ["0xDEAD..."],
            "min_amount": 0.01,
            "max_amount": 10000.0,
            "allowed_currencies": ["USDC", "USDT"]
        }
    """

    allowed_recipients: list[str] = Field(
        default_factory=list,
        description=(
            "Allowlisted recipient addresses.  When non-empty, only these "
            "recipients are permitted."
        ),
    )
    blocked_recipients: list[str] = Field(
        default_factory=list,
        description="Blocklisted recipient addresses that are always denied.",
    )
    min_amount: float = Field(
        default=0.0,
        ge=0.0,
        description="Minimum transaction amount (inclusive).  0.0 = no minimum.",
    )
    max_amount: float = Field(
        default=0.0,
        ge=0.0,
        description="Maximum transaction amount (inclusive).  0.0 = no maximum.",
    )
    allowed_currencies: list[str] = Field(
        default_factory=list,
        description=(
            "Permitted currency symbols.  When non-empty, only these "
            "currencies are accepted."
        ),
    )

    @field_validator("allowed_recipients", "blocked_recipients", mode="before")
    @classmethod
    def normalize_recipients(cls, v: Any) -> Any:
        """Strip surrounding whitespace from recipient identifiers.

        The evaluator strips the recipient taken from transaction data, so an
        unstripped config entry (e.g. ``" 0xDEAD "``) could never match.
        Non-string items are left untouched for pydantic to reject.
        """
        if not isinstance(v, (list, tuple)):
            return v
        return [r.strip() if isinstance(r, str) else r for r in v]

    @field_validator("allowed_currencies", mode="before")
    @classmethod
    def normalize_currencies(cls, v: Any) -> Any:
        """Normalize all currency symbols to stripped upper-case.

        Non-string items are passed through unchanged so that pydantic reports
        a proper validation error instead of this hook raising
        ``AttributeError``.
        """
        if not isinstance(v, (list, tuple)):
            return v
        return [c.strip().upper() if isinstance(c, str) else c for c in v]

    @model_validator(mode="after")
    def validate_amount_bounds(self) -> TransactionPolicyConfig:
        """Ensure max_amount >= min_amount when both are non-zero."""
        both_enabled = self.max_amount > 0.0 and self.min_amount > 0.0
        if both_enabled and self.max_amount < self.min_amount:
            raise ValueError(
                f"max_amount ({self.max_amount}) must be >= min_amount ({self.min_amount})"
            )
        return self


# ---------------------------------------------------------------------------
# transaction_policy/evaluator.py — stateless policy evaluator
# ---------------------------------------------------------------------------

# Optional context fields copied from the transaction into result metadata.
_CONTEXT_KEYS = ("channel", "agent_id", "session_id")


def _skip(message: str) -> EvaluatorResult:
    """Build the non-match result used when input cannot be evaluated."""
    return EvaluatorResult(matched=False, confidence=1.0, message=message)


def _violation(
    message: str,
    base_meta: dict[str, Any],
    violation: str,
    **extra: Any,
) -> EvaluatorResult:
    """Build a ``matched=True`` result tagged with the violated rule."""
    return EvaluatorResult(
        matched=True,
        confidence=1.0,
        message=message,
        metadata={**base_meta, "violation": violation, **extra},
    )


@register_evaluator
class TransactionPolicyEvaluator(Evaluator[TransactionPolicyConfig]):
    """Stateless evaluator for static transaction policy checks.

    Checks are applied in this order (first violation wins):

    1. Currency allowlist (if configured)
    2. Recipient blocklist
    3. Recipient allowlist (if configured)
    4. Minimum amount bound
    5. Maximum amount bound

    ``matched=True`` means the transaction **violates** the policy and should be
    blocked.  ``matched=False`` means the transaction passed all checks, or
    that the input was missing/malformed and the check was skipped.

    Thread safety:
        This evaluator has no mutable instance state.  Concurrent calls to
        :meth:`evaluate` are safe.

    Input ``data`` schema::

        {
            "amount": float,        # required — transaction amount
            "currency": str,        # required — payment currency
            "recipient": str,       # required — recipient address or identifier
            # optional context fields (logged in result metadata)
            "channel": str,
            "agent_id": str,
            "session_id": str
        }

    Example::

        from agent_control_evaluator_financial_governance.transaction_policy import (
            TransactionPolicyConfig,
            TransactionPolicyEvaluator,
        )

        config = TransactionPolicyConfig(
            allowed_currencies=["USDC", "USDT"],
            blocked_recipients=["0xDEAD..."],
            max_amount=5000.0,
        )
        evaluator = TransactionPolicyEvaluator(config)
        result = await evaluator.evaluate({
            "amount": 100.0,
            "currency": "USDC",
            "recipient": "0xABC...",
        })
        # result.matched == False  → transaction passes all policy checks
    """

    metadata = EvaluatorMetadata(
        name="financial_governance.transaction_policy",
        version="0.1.0",
        description=(
            "Static transaction policy enforcement: recipient allowlists/blocklists, "
            "amount bounds, and currency restrictions. No state tracking."
        ),
    )
    config_model = TransactionPolicyConfig

    @staticmethod
    def _normalize_data(data: Any) -> dict[str, Any] | None:
        """Extract transaction fields from selector output.

        Handles ``selector.path: "input"`` (data is the transaction dict)
        and ``selector.path: "*"`` (data is the full Step dict, recognised by
        the presence of both ``"type"`` and ``"input"`` keys).

        Returns:
            The transaction dict, or ``None`` when *data* (or the step's
            ``input``) is not a dict.  For Step dicts the context keys
            ``channel``/``agent_id``/``session_id`` are merged in from
            ``context`` without overwriting values already in ``input``.
        """
        if not isinstance(data, dict):
            return None
        if "type" in data and "input" in data:
            tx = data.get("input")
            ctx = data.get("context") or {}
            if not isinstance(tx, dict):
                return None
            merged = {**tx}
            if isinstance(ctx, dict):
                for k in _CONTEXT_KEYS:
                    if k in ctx and k not in merged:
                        merged[k] = ctx[k]
            return merged
        return data

    async def evaluate(self, data: Any) -> EvaluatorResult:
        """Evaluate a transaction against the static policy.

        Args:
            data: Transaction dict (when ``selector.path`` is ``"input"``)
                or full Step dict (when path is ``"*"``).

        Returns:
            ``EvaluatorResult`` where ``matched=True`` indicates a policy
            violation (transaction should be denied).  Missing or malformed
            input yields ``matched=False`` with an explanatory message.
        """
        if data is None:
            return _skip("No transaction data provided; skipping policy check")

        tx = self._normalize_data(data)
        if tx is None:
            return _skip("Could not extract transaction data from selector output; skipping")

        # ---- Extract and validate required fields ----
        currency_raw = tx.get("currency")
        # Strip as well as upper-case so " usdc " matches the normalized config.
        currency = str(currency_raw).strip().upper() if currency_raw else ""
        if not currency:
            return _skip("Transaction data missing required field 'currency'")

        recipient_raw = tx.get("recipient")
        # NOTE(review): recipients are compared case-sensitively.  If they are
        # hex addresses, differently-cased spellings of a blocked address would
        # not match — confirm the intended identifier format.
        recipient = str(recipient_raw).strip() if recipient_raw else ""
        if not recipient:
            return _skip("Transaction data missing required field 'recipient'")

        amount_raw = tx.get("amount")
        if amount_raw is None:
            return _skip("Transaction data missing required field 'amount'")
        try:
            amount = float(amount_raw)
        except (TypeError, ValueError, OverflowError):
            # OverflowError: ints too large for a float, e.g. 10**400.
            return _skip(f"Transaction 'amount' is not numeric: {amount_raw!r}")
        if math.isnan(amount):
            # NaN compares False against every bound, so it would otherwise be
            # reported as having passed all checks.
            return _skip(f"Transaction 'amount' is not numeric: {amount_raw!r}")

        # Build shared metadata for result context
        base_meta: dict[str, Any] = {
            "amount": amount,
            "currency": currency,
            "recipient": recipient,
        }
        for ctx_key in _CONTEXT_KEYS:
            if tx.get(ctx_key) is not None:
                base_meta[ctx_key] = tx[ctx_key]

        cfg = self.config

        # ---- Check 1: Currency allowlist ----
        if cfg.allowed_currencies and currency not in cfg.allowed_currencies:
            return _violation(
                f"Currency '{currency}' is not in the allowed currencies list: "
                f"{cfg.allowed_currencies}",
                base_meta,
                "currency_not_allowed",
                allowed_currencies=cfg.allowed_currencies,
            )

        # ---- Check 2: Recipient blocklist ----
        if recipient in cfg.blocked_recipients:
            return _violation(
                f"Recipient '{recipient}' is on the blocklist",
                base_meta,
                "recipient_blocked",
            )

        # ---- Check 3: Recipient allowlist ----
        if cfg.allowed_recipients and recipient not in cfg.allowed_recipients:
            return _violation(
                f"Recipient '{recipient}' is not in the allowed recipients list",
                base_meta,
                "recipient_not_allowed",
            )

        # ---- Check 4: Minimum amount ----
        if cfg.min_amount > 0.0 and amount < cfg.min_amount:
            return _violation(
                f"Transaction amount {amount} {currency} is below the minimum "
                f"of {cfg.min_amount} {currency}",
                base_meta,
                "amount_below_minimum",
                min_amount=cfg.min_amount,
            )

        # ---- Check 5: Maximum amount ----
        if cfg.max_amount > 0.0 and amount > cfg.max_amount:
            return _violation(
                f"Transaction amount {amount} {currency} exceeds the maximum "
                f"of {cfg.max_amount} {currency}",
                base_meta,
                "amount_exceeds_maximum",
                max_amount=cfg.max_amount,
            )

        # ---- All checks passed ----
        return EvaluatorResult(
            matched=False,
            confidence=1.0,
            message=(
                f"Transaction of {amount} {currency} to '{recipient}' "
                "passed all policy checks"
            ),
            metadata=base_meta,
        )
"""Tests for the spend_limit evaluator and supporting infrastructure."""

from __future__ import annotations

import time
from typing import Any

import pytest

from agent_control_evaluator_financial_governance.spend_limit import (
    InMemorySpendStore,
    SpendLimitConfig,
    SpendLimitEvaluator,
)


# ---------------------------------------------------------------------------
# InMemorySpendStore unit tests
# ---------------------------------------------------------------------------


def test_store_record_and_query() -> None:
    """Basic record/query round-trip."""
    store = InMemorySpendStore()
    since = time.time() - 1  # slightly in the past

    store.record_spend(100.0, "USDC")
    store.record_spend(50.0, "USDC")
    store.record_spend(200.0, "ETH")  # different currency — should not be counted

    assert store.get_spend("USDC", since) == pytest.approx(150.0)
    assert store.get_spend("ETH", since) == pytest.approx(200.0)
    assert store.get_spend("USDT", since) == pytest.approx(0.0)


def test_store_since_timestamp_filters_old_records() -> None:
    """Records before since_timestamp are excluded from get_spend."""
    store = InMemorySpendStore()

    store.record_spend(1000.0, "USDC")
    future_since = time.time() + 1  # everything is "before" this

    assert store.get_spend("USDC", future_since) == pytest.approx(0.0)


def test_store_record_count() -> None:
    """record_count reflects the number of stored records."""
    store = InMemorySpendStore()
    assert store.record_count() == 0
    store.record_spend(1.0, "USDC")
    store.record_spend(2.0, "USDC")
    assert store.record_count() == 2


def test_store_rejects_non_positive_amount() -> None:
    """Zero and negative amounts raise ValueError."""
    store = InMemorySpendStore()
    with pytest.raises(ValueError, match="amount must be positive"):
        store.record_spend(0.0, "USDC")
    with pytest.raises(ValueError, match="amount must be positive"):
        store.record_spend(-5.0, "USDC")


def test_store_metadata_accepted() -> None:
    """Metadata kwarg is stored without error."""
    store = InMemorySpendStore()
    store.record_spend(10.0, "USDC", metadata={"agent_id": "agent-1", "session_id": "s-99"})
    assert store.record_count() == 1


# ---------------------------------------------------------------------------
# SpendLimitConfig validation tests
# ---------------------------------------------------------------------------


def test_config_currency_normalized_to_upper() -> None:
    cfg = SpendLimitConfig(currency="usdc", max_per_transaction=100.0)
    assert cfg.currency == "USDC"


def test_config_defaults() -> None:
    cfg = SpendLimitConfig(currency="USDC")
    assert cfg.max_per_transaction == 0.0
    assert cfg.max_per_period == 0.0
    assert cfg.period_seconds == 86_400


def test_config_rejects_negative_max_per_transaction() -> None:
    # NOTE(review): ``Exception`` is very broad; presumably this should be
    # narrowed to the config's validation error type — confirm and tighten.
    with pytest.raises(Exception):
        SpendLimitConfig(currency="USDC", max_per_transaction=-1.0)


def test_config_rejects_zero_period_seconds() -> None:
    # NOTE(review): same as above — narrow the expected exception type.
    with pytest.raises(Exception):
        SpendLimitConfig(currency="USDC", period_seconds=0)


# ---------------------------------------------------------------------------
# SpendLimitEvaluator tests
# ---------------------------------------------------------------------------


def _make_evaluator(
    max_per_transaction: float = 0.0,
    max_per_period: float = 0.0,
    period_seconds: int = 86400,
    currency: str = "USDC",
    store: InMemorySpendStore | None = None,
) -> SpendLimitEvaluator:
    """Build a SpendLimitEvaluator with the given limits and optional store."""
    cfg = SpendLimitConfig(
        max_per_transaction=max_per_transaction,
        max_per_period=max_per_period,
        period_seconds=period_seconds,
        currency=currency,
    )
    return SpendLimitEvaluator(cfg, store=store)


def _tx(
    amount: float = 10.0,
    currency: str = "USDC",
    recipient: str = "0xABC",
    **extra: Any,
) -> dict[str, Any]:
    """Build a transaction dict; *extra* adds context/override fields."""
    return {"amount": amount, "currency": currency, "recipient": recipient, **extra}


@pytest.mark.asyncio
async def test_none_data_is_allowed() -> None:
    ev = _make_evaluator(max_per_transaction=100.0)
    result = await ev.evaluate(None)
    assert result.matched is False
    assert result.error is None


@pytest.mark.asyncio
async def test_non_dict_data_is_allowed() -> None:
    ev = _make_evaluator(max_per_transaction=100.0)
    result = await ev.evaluate("not a dict")
    assert result.matched is False
    assert result.error is None


@pytest.mark.asyncio
async def test_missing_amount_not_matched() -> None:
    """Missing amount is a non-match, NOT an evaluator error."""
    ev = _make_evaluator(max_per_transaction=100.0)
    result = await ev.evaluate({"currency": "USDC", "recipient": "0xABC"})
    assert result.matched is False
    assert result.error is None
    assert "amount" in (result.message or "").lower()


@pytest.mark.asyncio
async def test_missing_currency_not_matched() -> None:
    """Missing currency is a non-match, NOT an evaluator error."""
    ev = _make_evaluator(max_per_transaction=100.0)
    result = await ev.evaluate({"amount": 10.0, "recipient": "0xABC"})
    assert result.matched is False
    assert result.error is None
    assert "currency" in (result.message or "").lower()


@pytest.mark.asyncio
async def test_wrong_currency_is_skipped() -> None:
    """Transaction in a different currency should be allowed (not matched)."""
    ev = _make_evaluator(max_per_transaction=1.0, currency="USDC")
    # Amount 99999 but in ETH — policy only governs USDC
    result = await ev.evaluate(_tx(amount=99999.0, currency="ETH"))
    assert result.matched is False
    assert result.metadata and result.metadata.get("tx_currency") == "ETH"


@pytest.mark.asyncio
async def test_per_transaction_cap_violation() -> None:
    ev = _make_evaluator(max_per_transaction=100.0)
    result = await ev.evaluate(_tx(amount=101.0))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "per_transaction_cap"
    assert result.error is None


@pytest.mark.asyncio
async def test_per_transaction_cap_exact_boundary_allowed() -> None:
    ev = _make_evaluator(max_per_transaction=100.0)
    result = await ev.evaluate(_tx(amount=100.0))
    assert result.matched is False


@pytest.mark.asyncio
async def test_per_transaction_cap_disabled_at_zero() -> None:
    ev = _make_evaluator(max_per_transaction=0.0)
    result = await ev.evaluate(_tx(amount=9_999_999.0))
    assert result.matched is False


@pytest.mark.asyncio
async def test_period_budget_violation() -> None:
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=500.0, store=store)

    # Pre-load 480 of spend
    store.record_spend(480.0, "USDC")

    # Next transaction of 25 would push us to 505 — over budget
    result = await ev.evaluate(_tx(amount=25.0))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "period_budget"
    assert result.metadata["current_period_spend"] == pytest.approx(480.0)
    assert result.metadata["projected_period_spend"] == pytest.approx(505.0)


@pytest.mark.asyncio
async def test_period_budget_exact_boundary_allowed() -> None:
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=500.0, store=store)

    store.record_spend(490.0, "USDC")

    # Exactly 10 remaining — should be allowed and recorded
    result = await ev.evaluate(_tx(amount=10.0))
    assert result.matched is False
    # The spend should now be recorded
    assert store.get_spend("USDC", time.time() - 1) == pytest.approx(500.0)


@pytest.mark.asyncio
async def test_period_budget_disabled_at_zero() -> None:
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=0.0, store=store)

    store.record_spend(1_000_000.0, "USDC")
    result = await ev.evaluate(_tx(amount=1_000_000.0))
    assert result.matched is False


@pytest.mark.asyncio
async def test_successful_transaction_is_recorded() -> None:
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_transaction=100.0, max_per_period=1000.0, store=store)

    assert store.record_count() == 0
    result = await ev.evaluate(_tx(amount=50.0))
    assert result.matched is False
    assert store.record_count() == 1
    since = time.time() - 5
    assert store.get_spend("USDC", since) == pytest.approx(50.0)


@pytest.mark.asyncio
async def test_context_override_channel_max_per_transaction() -> None:
    """channel_max_per_transaction in data overrides config."""
    # Base config allows up to 1000 per tx, but channel caps at 50
    ev = _make_evaluator(max_per_transaction=1000.0)
    result = await ev.evaluate(_tx(amount=75.0, channel_max_per_transaction=50.0))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "per_transaction_cap"
    assert result.metadata["max_per_transaction"] == pytest.approx(50.0)


@pytest.mark.asyncio
async def test_context_override_channel_max_per_period() -> None:
    """channel_max_per_period in data overrides config."""
    store = InMemorySpendStore()
    store.record_spend(90.0, "USDC")

    # Base config has 1000 budget, but channel caps at 100
    ev = _make_evaluator(max_per_period=1000.0, store=store)
    result = await ev.evaluate(_tx(amount=20.0, channel_max_per_period=100.0))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "period_budget"


@pytest.mark.asyncio
async def test_multiple_sequential_transactions_accumulate() -> None:
    """Verify spend accumulates correctly across multiple calls."""
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_transaction=100.0, max_per_period=250.0, store=store)

    # 80 + 80 + 80 = 240 <= 250, so each of the first three must be allowed
    # and recorded.
    for _ in range(3):
        allowed = await ev.evaluate(_tx(amount=80.0))
        assert allowed.matched is False
    assert store.record_count() == 3

    # A fourth transaction of 80 would bring the total to 320 > 250 → blocked.
    result_4 = await ev.evaluate(_tx(amount=80.0))
    assert result_4.matched is True
    assert result_4.metadata and result_4.metadata["violation"] == "period_budget"


@pytest.mark.asyncio
async def test_currency_case_insensitive_in_data() -> None:
    """Currency in transaction data is normalized to upper-case before comparison."""
    ev = _make_evaluator(max_per_transaction=100.0, currency="USDC")
    result = await ev.evaluate(_tx(amount=10.0, currency="usdc"))
    assert result.matched is False  # lower-case usdc should match USDC policy


# ---------------------------------------------------------------------------
# Context-scoped budget isolation tests (requested by lan17)
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_scoped_budget_channel_isolation() -> None:
    """Spend in channel A should NOT count against channel B's budget.

    Scenario: 90 USDC in channel A, then 20 USDC in channel B with
    channel_max_per_period=100.  Channel B should be allowed because
    its scoped spend is 0, not 90.
    """
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=1000.0, store=store)

    # Record 90 USDC in channel A
    r1 = await ev.evaluate(_tx(amount=90.0, channel="channel-A"))
    assert r1.matched is False

    # 20 USDC in channel B with a per-channel budget of 100
    # Should be allowed: channel B has 0 spend, not 90.
    r2 = await ev.evaluate(_tx(amount=20.0, channel="channel-B", channel_max_per_period=100.0))
    assert r2.matched is False


@pytest.mark.asyncio
async def test_scoped_budget_same_channel_accumulates() -> None:
    """Spend within the same channel accumulates correctly."""
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=1000.0, store=store)

    # 60 USDC in channel A
    r1 = await ev.evaluate(_tx(amount=60.0, channel="channel-A"))
    assert r1.matched is False

    # Another 50 USDC in channel A with channel cap of 100
    # 60 + 50 = 110 > 100 → should be denied
    r2 = await ev.evaluate(_tx(amount=50.0, channel="channel-A", channel_max_per_period=100.0))
    assert r2.matched is True
    assert r2.metadata and r2.metadata["violation"] == "period_budget"


@pytest.mark.asyncio
async def test_scoped_budget_agent_id_isolation() -> None:
    """Spend by agent-1 should NOT count against agent-2's budget."""
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=1000.0, store=store)

    r1 = await ev.evaluate(_tx(amount=90.0, agent_id="agent-1"))
    assert r1.matched is False

    # agent-2 with tight budget — should be allowed (agent-2 has 0 spend)
    r2 = await ev.evaluate(_tx(amount=20.0, agent_id="agent-2", channel_max_per_period=100.0))
    assert r2.matched is False


@pytest.mark.asyncio
async def test_global_budget_without_scope() -> None:
    """When no channel/agent/session context, budget is global."""
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=100.0, store=store)

    # No context fields → global spend
    r1 = await ev.evaluate(_tx(amount=90.0))
    assert r1.matched is False

    # Still no context → global spend of 90 + 20 = 110 > 100
    r2 = await ev.evaluate(_tx(amount=20.0))
    assert r2.matched is True


@pytest.mark.asyncio
async def test_malformed_input_is_not_evaluator_error() -> None:
    """Malformed input should be matched=False with error=None, not an evaluator error.

    This is the engine-level test lan17 requested to ensure we don't
    accidentally lock in result.error as a policy outcome.
    """
    ev = _make_evaluator(max_per_transaction=100.0)

    # Missing amount
    r1 = await ev.evaluate({"currency": "USDC", "recipient": "0xABC"})
    assert r1.matched is False
    assert r1.error is None

    # Missing currency
    r2 = await ev.evaluate({"amount": 10.0, "recipient": "0xABC"})
    assert r2.matched is False
    assert r2.error is None

    # Negative amount
    r3 = await ev.evaluate({"amount": -5.0, "currency": "USDC", "recipient": "0xABC"})
    assert r3.matched is False
    assert r3.error is None

    # Non-dict input
    r4 = await ev.evaluate("not a dict")
    assert r4.matched is False
    assert r4.error is None

    # None input
    r5 = await ev.evaluate(None)
    assert r5.matched is False
    assert r5.error is None


# ---------------------------------------------------------------------------
# Step normalization tests (selector.path: "*" vs "input")
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_step_object_input_extraction() -> None:
    """When selector.path is '*', data is a full Step dict.

    Evaluator should extract transaction from 'input' key.
    """
    ev = _make_evaluator(max_per_transaction=100.0)
    step_data = {
        "type": "tool",
        "name": "payment",
        "input": {"amount": 50.0, "currency": "USDC", "recipient": "0xABC"},
        "context": None,
    }
    result = await ev.evaluate(step_data)
    assert result.matched is False


@pytest.mark.asyncio
async def test_step_context_merged_into_transaction() -> None:
    """Context fields from step.context should be available for scoped budgets."""
    store = InMemorySpendStore()
    ev = _make_evaluator(max_per_period=1000.0, store=store)

    # First: 90 USDC in channel-A via step context
    step1 = {
        "type": "tool",
        "name": "payment",
        "input": {"amount": 90.0, "currency": "USDC", "recipient": "0xABC"},
        "context": {"channel": "channel-A"},
    }
    r1 = await ev.evaluate(step1)
    assert r1.matched is False

    # Second: 20 USDC in channel-B with tight cap via step context
    step2 = {
        "type": "tool",
        "name": "payment",
        "input": {"amount": 20.0, "currency": "USDC", "recipient": "0xABC"},
        "context": {"channel": "channel-B", "channel_max_per_period": 100.0},
    }
    r2 = await ev.evaluate(step2)
    # Channel B has 0 scoped spend → should be allowed
    assert r2.matched is False


@pytest.mark.asyncio
async def test_step_context_overrides_not_clobbered_by_input() -> None:
    """If input already has channel, step.context should not overwrite it."""
    ev = _make_evaluator(max_per_transaction=100.0)
    step_data = {
        "type": "tool",
        "name": "payment",
        "input": {
            "amount": 10.0,
            "currency": "USDC",
            "recipient": "0xABC",
            "channel": "from-input",
        },
        "context": {"channel": "from-context"},
    }
    result = await ev.evaluate(step_data)
    assert result.matched is False
    assert result.error is None
    # The previous final assertion ended in ``or True`` and could never fail.
    # TODO(review): assert that the input's channel actually wins, e.g. via a
    # scoped ``store.get_spend(...)`` query, once the metadata key the
    # evaluator records for the channel is confirmed.
"""Tests for the transaction_policy evaluator."""

from __future__ import annotations

from typing import Any

import pytest
from pydantic import ValidationError

from agent_control_evaluator_financial_governance.transaction_policy import (
    TransactionPolicyConfig,
    TransactionPolicyEvaluator,
)


# ---------------------------------------------------------------------------
# TransactionPolicyConfig validation tests
# ---------------------------------------------------------------------------


def test_config_currencies_normalized() -> None:
    cfg = TransactionPolicyConfig(allowed_currencies=["usdc", "Usdt"])
    assert cfg.allowed_currencies == ["USDC", "USDT"]


def test_config_defaults_are_permissive() -> None:
    cfg = TransactionPolicyConfig()
    assert cfg.allowed_recipients == []
    assert cfg.blocked_recipients == []
    assert cfg.min_amount == 0.0
    assert cfg.max_amount == 0.0
    assert cfg.allowed_currencies == []


def test_config_max_amount_lt_min_raises() -> None:
    with pytest.raises(ValidationError, match="max_amount"):
        TransactionPolicyConfig(min_amount=100.0, max_amount=10.0)


def test_config_max_equals_min_is_valid() -> None:
    cfg = TransactionPolicyConfig(min_amount=50.0, max_amount=50.0)
    assert cfg.min_amount == 50.0
    assert cfg.max_amount == 50.0


# ---------------------------------------------------------------------------
# Helper factory
# ---------------------------------------------------------------------------


def _make_evaluator(**kwargs: Any) -> TransactionPolicyEvaluator:
    """Build an evaluator from TransactionPolicyConfig keyword arguments."""
    cfg = TransactionPolicyConfig(**kwargs)
    return TransactionPolicyEvaluator(cfg)


def _tx(
    amount: float = 100.0,
    currency: str = "USDC",
    recipient: str = "0xABC",
    **extra: Any,
) -> dict[str, Any]:
    """Build a transaction dict; *extra* adds optional context fields."""
    return {"amount": amount, "currency": currency, "recipient": recipient, **extra}


# ---------------------------------------------------------------------------
# Edge cases: None / non-dict inputs
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_none_data_passes() -> None:
    ev = _make_evaluator(allowed_currencies=["USDC"])
    result = await ev.evaluate(None)
    assert result.matched is False
    assert result.error is None


@pytest.mark.asyncio
async def test_non_dict_data_passes() -> None:
    ev = _make_evaluator(allowed_currencies=["USDC"])
    result = await ev.evaluate(["not", "a", "dict"])
    assert result.matched is False
    # Like None input, unusable input is a skip, not an evaluator error.
    assert result.error is None


# ---------------------------------------------------------------------------
# Missing required fields
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_missing_currency_not_matched() -> None:
    """Missing currency is a non-match, NOT an evaluator error."""
    ev = _make_evaluator()
    result = await ev.evaluate({"amount": 10.0, "recipient": "0xABC"})
    assert result.matched is False
    assert result.error is None
    assert "currency" in (result.message or "").lower()


@pytest.mark.asyncio
async def test_missing_recipient_not_matched() -> None:
    """Missing recipient is a non-match, NOT an evaluator error."""
    ev = _make_evaluator()
    result = await ev.evaluate({"amount": 10.0, "currency": "USDC"})
    assert result.matched is False
    assert result.error is None
    assert "recipient" in (result.message or "").lower()


@pytest.mark.asyncio
async def test_missing_amount_not_matched() -> None:
    """Missing amount is a non-match, NOT an evaluator error."""
    ev = _make_evaluator()
    result = await ev.evaluate({"currency": "USDC", "recipient": "0xABC"})
    assert result.matched is False
    assert result.error is None
    assert "amount" in (result.message or "").lower()


@pytest.mark.asyncio
async def test_non_numeric_amount_not_matched() -> None:
    """Non-numeric amount is a non-match, NOT an evaluator error."""
    ev = _make_evaluator()
    result = await ev.evaluate({"amount": "lots", "currency": "USDC", "recipient": "0xABC"})
    assert result.matched is False
    assert result.error is None


# ---------------------------------------------------------------------------
# No restrictions configured → everything passes
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_empty_config_allows_everything() -> None:
    ev = _make_evaluator()
    result = await ev.evaluate(_tx(amount=999_999.0, currency="XYZ", recipient="0xANY"))
    assert result.matched is False


# ---------------------------------------------------------------------------
# Currency allowlist
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_currency_not_in_allowlist_is_blocked() -> None:
    ev = _make_evaluator(allowed_currencies=["USDC", "USDT"])
    result = await ev.evaluate(_tx(currency="DAI"))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "currency_not_allowed"


@pytest.mark.asyncio
async def test_currency_in_allowlist_passes() -> None:
    ev = _make_evaluator(allowed_currencies=["USDC", "USDT"])
    result = await ev.evaluate(_tx(currency="USDT"))
    assert result.matched is False


@pytest.mark.asyncio
async def test_currency_allowlist_case_insensitive_in_data() -> None:
    """Currency from incoming data is uppercased before comparison."""
    ev = _make_evaluator(allowed_currencies=["USDC"])
    result = await ev.evaluate(_tx(currency="usdc"))
    assert result.matched is False


# ---------------------------------------------------------------------------
# Recipient blocklist
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_blocked_recipient_is_denied() -> None:
    ev = _make_evaluator(blocked_recipients=["0xDEAD", "0xBAD"])
    result = await ev.evaluate(_tx(recipient="0xDEAD"))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "recipient_blocked"


@pytest.mark.asyncio
async def test_non_blocked_recipient_passes() -> None:
    ev = _make_evaluator(blocked_recipients=["0xDEAD"])
    result = await ev.evaluate(_tx(recipient="0xGOOD"))
    assert result.matched is False


# ---------------------------------------------------------------------------
# Recipient allowlist
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_recipient_not_in_allowlist_is_blocked() -> None:
    ev = _make_evaluator(allowed_recipients=["0xALICE", "0xBOB"])
    result = await ev.evaluate(_tx(recipient="0xEVE"))
    assert result.matched is True
    assert result.metadata and result.metadata["violation"] == "recipient_not_allowed"


@pytest.mark.asyncio
async def test_recipient_in_allowlist_passes() -> None:
    ev = _make_evaluator(allowed_recipients=["0xALICE", "0xBOB"])
    result = await ev.evaluate(_tx(recipient="0xBOB"))
    assert result.matched is False


# ---------------------------------------------------------------------------
# Blocklist takes priority over allowlist
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_blocked_beats_allowlist() -> None:
    """A recipient on the blocklist should be denied even if also allowlisted."""
    ev = _make_evaluator(
        allowed_recipients=["0xALICE"],
        blocked_recipients=["0xALICE"],  # deliberately in both
    )
    result = await ev.evaluate(_tx(recipient="0xALICE"))
    assert result.matched is True
    # Violation should be blocklist (checked first)
    assert result.metadata and result.metadata["violation"] == "recipient_blocked"


#
--------------------------------------------------------------------------- +# Amount bounds +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_amount_below_minimum_is_blocked() -> None: + ev = _make_evaluator(min_amount=10.0) + result = await ev.evaluate(_tx(amount=9.99)) + assert result.matched is True + assert result.metadata and result.metadata["violation"] == "amount_below_minimum" + + +@pytest.mark.asyncio +async def test_amount_at_minimum_passes() -> None: + ev = _make_evaluator(min_amount=10.0) + result = await ev.evaluate(_tx(amount=10.0)) + assert result.matched is False + + +@pytest.mark.asyncio +async def test_amount_above_maximum_is_blocked() -> None: + ev = _make_evaluator(max_amount=1000.0) + result = await ev.evaluate(_tx(amount=1000.01)) + assert result.matched is True + assert result.metadata and result.metadata["violation"] == "amount_exceeds_maximum" + + +@pytest.mark.asyncio +async def test_amount_at_maximum_passes() -> None: + ev = _make_evaluator(max_amount=1000.0) + result = await ev.evaluate(_tx(amount=1000.0)) + assert result.matched is False + + +@pytest.mark.asyncio +async def test_amount_bounds_disabled_at_zero() -> None: + ev = _make_evaluator(min_amount=0.0, max_amount=0.0) + result = await ev.evaluate(_tx(amount=0.001)) + assert result.matched is False + result2 = await ev.evaluate(_tx(amount=1_000_000_000.0)) + assert result2.matched is False + + +# --------------------------------------------------------------------------- +# Full policy (all fields configured) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_full_policy_passes_compliant_transaction() -> None: + ev = _make_evaluator( + allowed_currencies=["USDC", "USDT"], + blocked_recipients=["0xDEAD"], + allowed_recipients=["0xALICE", "0xBOB"], + min_amount=1.0, + max_amount=5000.0, + ) + result = await ev.evaluate(_tx(amount=250.0, 
currency="USDC", recipient="0xALICE")) + assert result.matched is False + + +@pytest.mark.asyncio +async def test_context_fields_appear_in_metadata() -> None: + """Optional context fields (channel, agent_id, session_id) should surface in result metadata.""" + ev = _make_evaluator() + result = await ev.evaluate(_tx(channel="discord", agent_id="agent-42", session_id="sess-1")) + assert result.metadata + assert result.metadata.get("channel") == "discord" + assert result.metadata.get("agent_id") == "agent-42" + assert result.metadata.get("session_id") == "sess-1" + + +# --------------------------------------------------------------------------- +# Check ordering: currency first, then blocklist, then allowlist, then bounds +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_currency_check_before_recipient_check() -> None: + """Currency violation should be reported even if recipient is also blocked.""" + ev = _make_evaluator( + allowed_currencies=["USDC"], + blocked_recipients=["0xDEAD"], + ) + result = await ev.evaluate(_tx(currency="DAI", recipient="0xDEAD")) + # Currency checked first + assert result.matched is True + assert result.metadata and result.metadata["violation"] == "currency_not_allowed" + + +@pytest.mark.asyncio +async def test_blocklist_before_allowlist() -> None: + """Blocklist violation should be reported even if recipient not in allowlist.""" + ev = _make_evaluator( + allowed_recipients=["0xGOOD"], + blocked_recipients=["0xBAD"], + ) + result = await ev.evaluate(_tx(recipient="0xBAD")) + assert result.matched is True + assert result.metadata and result.metadata["violation"] == "recipient_blocked" + + +# --------------------------------------------------------------------------- +# Step normalization tests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_step_object_input_extraction() -> None: + """When data is a 
full Step dict, extract transaction from 'input'.""" + ev = _make_evaluator(allowed_currencies=["USDC"]) + step_data = { + "type": "tool", + "name": "payment", + "input": {"amount": 100.0, "currency": "USDC", "recipient": "0xABC"}, + "context": {"channel": "slack"}, + } + result = await ev.evaluate(step_data) + assert result.matched is False + + +@pytest.mark.asyncio +async def test_step_blocked_recipient_via_step() -> None: + """Blocklist check should work when data comes as a Step dict.""" + ev = _make_evaluator(blocked_recipients=["0xDEAD"]) + step_data = { + "type": "tool", + "name": "payment", + "input": {"amount": 10.0, "currency": "USDC", "recipient": "0xDEAD"}, + "context": None, + } + result = await ev.evaluate(step_data) + assert result.matched is True + assert result.metadata and result.metadata["violation"] == "recipient_blocked" From ce445270ce5654ba14eecfd0aef5ed6c8bf15e4d Mon Sep 17 00:00:00 2001 From: up2itnow0822 <220628848+up2itnow0822@users.noreply.github.com> Date: Mon, 23 Mar 2026 13:25:54 -0500 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20address=20review=20feedback=20?= =?UTF-8?q?=E2=80=94=20Decimal=20money,=20atomic=20store,=20BudgetLimit=20?= =?UTF-8?q?model,=20scoping=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - float → Decimal throughout: BudgetLimit.amount, BudgetWindow, store, tests - BudgetLimit + BudgetWindow model: config refactored from flat fields to limits: list[BudgetLimit]; each limit has amount, currency, scope_by, window - Atomic check_and_record(): eliminates TOCTOU race on get_spend()+record_spend(); InMemorySpendStore implements with threading.Lock (single-process); docs note production stores need DB-level atomics (Postgres FOR UPDATE, Redis Lua) - scope_by field: independent per-dimension budget isolation; scope_by=(channel,) means channel A spend does not count against channel B budget - selector.path fix: config examples and README updated to use 'input' not '*'; Step vs 
raw-dict distinction documented; evaluator auto-detects format - EvaluatorResult.error usage: malformed payload returns matched=False, error=None; error field reserved for crashes/timeouts/missing deps only - README: custom store example updated with scope param and check_and_record; stale malformed-input docs corrected; Known Limitations updated - Tests: or True removed; all assertions verify actual store state; lan17 channel isolation test (90 in A + 20 in B) passes with scope_by semantics --- .../contrib/financial-governance/README.md | 181 ++++- .../__init__.py | 25 +- .../spend_limit/__init__.py | 4 +- .../spend_limit/config.py | 197 +++++- .../spend_limit/evaluator.py | 320 +++++---- .../spend_limit/store.py | 214 ++++-- .../transaction_policy/config.py | 35 +- .../transaction_policy/evaluator.py | 30 +- .../tests/test_spend_limit.py | 665 ++++++++++++------ .../tests/test_transaction_policy.py | 27 +- 10 files changed, 1188 insertions(+), 510 deletions(-) diff --git a/evaluators/contrib/financial-governance/README.md b/evaluators/contrib/financial-governance/README.md index 8e5e5b58..a1a10461 100644 --- a/evaluators/contrib/financial-governance/README.md +++ b/evaluators/contrib/financial-governance/README.md @@ -10,10 +10,12 @@ As agents transact autonomously via protocols like [x402](https://github.com/coi Tracks cumulative agent spend and enforces rolling budget limits. Stateful — records approved transactions and checks new ones against accumulated spend. 
-- **Per-transaction cap** — reject any single payment above a threshold -- **Rolling period budget** — reject payments that would exceed a time-windowed budget -- **Context-aware overrides** — different limits per channel, agent, or session via evaluate metadata +- **Per-transaction cap** — reject any single payment above a threshold (`BudgetLimit` with no window) +- **Rolling period budget** — reject payments that would exceed a time-windowed budget (`BudgetWindow(kind="rolling", ...)`) +- **Calendar-aligned budget** — reject payments that exceed a day/week/month budget (`BudgetWindow(kind="fixed", ...)`) +- **Scoped budgets** — independent counters per channel, agent, or session via `scope_by` - **Pluggable storage** — abstract `SpendStore` protocol with built-in `InMemorySpendStore`; bring your own PostgreSQL, Redis, etc. +- **Atomic enforcement** — `check_and_record()` prevents TOCTOU races in single-process deployments ### `financial_governance.transaction_policy` @@ -35,16 +37,25 @@ pip install -e ".[dev]" ### Spend Limit +The `spend_limit` evaluator is configured via a list of `BudgetLimit` objects. Each limit is evaluated independently — the first violation wins. 
+ ```yaml controls: - name: spend-limit evaluator: type: financial_governance.spend_limit config: - max_per_transaction: 100.0 # Max USDC per single payment - max_per_period: 1000.0 # Rolling 24h budget - period_seconds: 86400 # Budget window (default: 24 hours) - currency: USDC # Currency to govern + limits: + # Per-transaction cap: single payment ≤ 100 USDC + - amount: "100.00" + currency: USDC + # Per-channel rolling 24h budget: each channel limited to 1000 USDC/day + - amount: "1000.00" + currency: USDC + scope_by: [channel] + window: + kind: rolling + seconds: 86400 selector: path: input # Extract step.input (transaction dict) action: deny @@ -61,8 +72,8 @@ controls: allowed_currencies: [USDC, USDT] blocked_recipients: ["0xDEAD..."] allowed_recipients: ["0xALICE...", "0xBOB..."] - min_amount: 0.01 - max_amount: 5000.0 + min_amount: "0.01" + max_amount: "5000.00" selector: path: input action: deny @@ -82,15 +93,69 @@ The transaction dict (from `step.input`) should contain: ```python # step.input — transaction payload { - "amount": 50.0, # required — transaction amount - "currency": "USDC", # required — payment currency - "recipient": "0xABC...", # required — payment recipient + "amount": "50.00", # required — Decimal or numeric string + "currency": "USDC", # required — payment currency + "recipient": "0xABC...", # required — payment recipient + # optional context fields (used for scope_by) + "channel": "slack", + "agent_id": "agent-42", + "session_id": "sess-1", } ``` +> **Note:** Use `Decimal` or string representations for `amount` — never raw `float`. Floating-point arithmetic is imprecise for money. The evaluator internally converts to `Decimal`. 
+ +## BudgetLimit Model + +```python +from decimal import Decimal +from agent_control_evaluator_financial_governance.spend_limit import ( + BudgetLimit, BudgetWindow, SpendLimitConfig, SpendLimitEvaluator, +) + +# Per-transaction cap (no window) +cap = BudgetLimit(amount=Decimal("100"), currency="USDC") + +# Rolling 24-hour budget, scoped per channel +rolling = BudgetLimit( + amount=Decimal("1000"), + currency="USDC", + scope_by=("channel",), + window=BudgetWindow(kind="rolling", seconds=86400), +) + +# Calendar-day budget (UTC) +daily = BudgetLimit( + amount=Decimal("500"), + currency="USDC", + window=BudgetWindow(kind="fixed", unit="day"), +) + +config = SpendLimitConfig(limits=[cap, rolling, daily]) +evaluator = SpendLimitEvaluator(config) +``` + +### BudgetWindow + +| kind | Required fields | Notes | +|------|----------------|-------| +| `"rolling"` | `seconds` | Sliding window from `now - seconds` | +| `"fixed"` | `unit` (`"day"`, `"week"`, or `"month"`) | Calendar-aligned; boundaries are currently always computed in UTC (the `timezone` field is accepted but not yet applied) | + +### scope_by semantics + +`scope_by` lists the context dimension keys to isolate spend buckets. Each dimension is **independent**: + +- `scope_by=()` (default) — global budget: all spend in that currency shares one counter +- `scope_by=("channel",)` — one counter per unique `channel` value +- `scope_by=("agent_id",)` — one counter per unique `agent_id` +- `scope_by=("channel", "agent_id")` — one counter per unique `(channel, agent_id)` pair + +Spend in `channel-A` does **not** count against `channel-B`'s budget. 
+ ## Context-Aware Limits -Context fields (`channel`, `agent_id`, `session_id`) and per-context limit overrides can be provided in two ways: +Context fields (`channel`, `agent_id`, `session_id`) can be provided in two ways: **Option A: Via `step.context`** (recommended for engine integration) @@ -98,42 +163,36 @@ Context fields (`channel`, `agent_id`, `session_id`) and per-context limit overr step = Step( type="tool", name="payment", - input={"amount": 75.0, "currency": "USDC", "recipient": "0xABC"}, + input={"amount": "75.00", "currency": "USDC", "recipient": "0xABC"}, context={ "channel": "experimental", "agent_id": "agent-42", - "channel_max_per_transaction": 50.0, - "channel_max_per_period": 200.0, }, ) ``` -When using `selector.path: "*"`, the evaluator merges `step.context` fields into the transaction data automatically. When using `selector.path: "input"`, context fields must be included directly in `step.input`. +When using `selector.path: "*"`, the evaluator merges `step.context` fields into the transaction data automatically. Fields already present in `step.input` are never overwritten by context. When using `selector.path: "input"` (as in the configuration examples above), the evaluator receives only `step.input`, so context fields must be included directly in `step.input`. **Option B: Inline in the transaction dict** (simpler, for direct SDK use) ```python result = await evaluator.evaluate({ - "amount": 75.0, + "amount": "75.00", "currency": "USDC", "recipient": "0xABC", "channel": "experimental", - "channel_max_per_transaction": 50.0, - "channel_max_per_period": 200.0, + "agent_id": "agent-42", }) ``` -Spend budgets are **scoped by context** — spend in channel A does not count against channel B's budget. When no context fields are present, budgets are global. - ## Custom SpendStore -The `SpendStore` protocol requires two methods. Implement them for your backend: +The `SpendStore` protocol requires three methods. 
Implement them for your backend: ```python +from decimal import Decimal from agent_control_evaluator_financial_governance.spend_limit import ( - SpendStore, - SpendLimitConfig, - SpendLimitEvaluator, + SpendStore, SpendLimitConfig, SpendLimitEvaluator, ) class PostgresSpendStore: @@ -142,24 +201,70 @@ class PostgresSpendStore: def __init__(self, connection_string: str): self._conn = connect(connection_string) - def record_spend(self, amount: float, currency: str, metadata: dict | None = None) -> None: + def record_spend( + self, + amount: Decimal, + currency: str, + metadata: dict | None = None, + ) -> None: self._conn.execute( - "INSERT INTO agent_spend (amount, currency, metadata, recorded_at) VALUES (%s, %s, %s, NOW())", - (amount, currency, json.dumps(metadata)), + "INSERT INTO agent_spend (amount, currency, metadata, recorded_at)" + " VALUES (%s, %s, %s, NOW())", + (str(amount), currency, json.dumps(metadata)), ) - def get_spend(self, currency: str, since_timestamp: float) -> float: + def get_spend( + self, + currency: str, + start: float, + end: float | None = None, + scope: dict[str, str] | None = None, + ) -> Decimal: + # Build WHERE clause for scope filtering + clauses = [ + "currency = %s", + "recorded_at >= to_timestamp(%s)", + ] + params = [currency, start] + if end is not None: + clauses.append("recorded_at <= to_timestamp(%s)") + params.append(end) + if scope: + for k, v in scope.items(): + clauses.append(f"metadata->>{k!r} = %s") + params.append(v) + where = " AND ".join(clauses) row = self._conn.execute( - "SELECT COALESCE(SUM(amount), 0) FROM agent_spend WHERE currency = %s AND recorded_at >= to_timestamp(%s)", - (currency, since_timestamp), + f"SELECT COALESCE(SUM(amount), 0) FROM agent_spend WHERE {where}", + params, ).fetchone() - return float(row[0]) + return Decimal(str(row[0])) + + def check_and_record( + self, + amount: Decimal, + currency: str, + limit: Decimal, + start: float, + end: float | None = None, + scope: dict[str, str] | None = 
None, + metadata: dict | None = None, + ) -> tuple[bool, Decimal]: + # Sketch only: a plain transaction does not serialize concurrent callers by itself; also lock the affected rows (e.g. SELECT ... FOR UPDATE) before reading + with self._conn.transaction(): + current = self.get_spend(currency, start, end, scope) + if current + amount > limit: + return False, current + self.record_spend(amount, currency, metadata) + return True, current # Use it: store = PostgresSpendStore("postgresql://...") evaluator = SpendLimitEvaluator(config, store=store) ``` +> **Single-process atomicity note:** `InMemorySpendStore.check_and_record()` uses a `threading.Lock` to atomically check-and-record within a single process. For multi-process or distributed deployments, your custom store must implement true database-level atomics (e.g., PostgreSQL `SELECT ... FOR UPDATE`, Redis Lua scripts). + ## Running Tests ```bash @@ -170,10 +275,12 @@ pytest tests/ -v ## Design Decisions -1. **Decoupled from data source** — The `SpendStore` protocol means no new tables in core Agent Control. Bring your own persistence. -2. **Context-aware limits** — Override keys in the evaluate data dict allow per-channel, per-agent, or per-session limits without multiple evaluator instances. -3. **Python SDK compatible** — Uses the standard evaluator interface; works with both the server and the Python SDK evaluation engine. -4. **Fail-open on errors** — Missing or malformed data returns `matched=False` with an `error` field, following Agent Control conventions. +1. **Decimal for money** — All monetary amounts use `Decimal`, never `float`. Floating-point arithmetic is unsuitable for financial calculations. +2. **BudgetLimit + BudgetWindow models** — Expressive, composable budget definitions that replace the previous flat config. Each limit is independent; first violation wins. +3. **Independent scope dimensions** — `scope_by=("channel",)` creates a separate counter for each channel value. Spend in one channel is completely isolated from another. +4. 
**Atomic check_and_record()** — Eliminates the TOCTOU race of separate `get_spend()` + `record_spend()` calls. Single-process safe with `threading.Lock`; production stores should use DB-level atomics. +5. **Decoupled from data source** — The `SpendStore` protocol means no new tables in core Agent Control. Bring your own persistence. +6. **Fail-open on malformed input** — Missing or malformed data returns `matched=False, error=None`, following Agent Control conventions. The `error` field is reserved for evaluator crashes, not policy decisions. ## Related Projects diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/__init__.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/__init__.py index 3ead88f3..21ba243c 100644 --- a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/__init__.py +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/__init__.py @@ -3,7 +3,8 @@ Provides two evaluators for enforcing financial policy on AI agent transactions: - ``financial_governance.spend_limit``: Tracks cumulative spend against rolling - period budgets and per-transaction caps. + period budgets and per-transaction caps. Uses the :class:`BudgetLimit` / + :class:`BudgetWindow` model for expressive, scoped budget definitions. - ``financial_governance.transaction_policy``: Static policy checks — allowlists, blocklists, amount bounds, and permitted currencies. 
@@ -14,14 +15,22 @@ { "condition": { - "selector": {"path": "*"}, + "selector": {"path": "input"}, "evaluator": { "name": "financial_governance.spend_limit", "config": { - "max_per_transaction": 100.0, - "max_per_period": 1000.0, - "period_seconds": 86400, - "currency": "USDC" + "limits": [ + { + "amount": "100.00", + "currency": "USDC" + }, + { + "amount": "1000.00", + "currency": "USDC", + "scope_by": ["channel"], + "window": {"kind": "rolling", "seconds": 86400} + } + ] } } }, @@ -30,6 +39,8 @@ """ from agent_control_evaluator_financial_governance.spend_limit import ( + BudgetLimit, + BudgetWindow, SpendLimitConfig, SpendLimitEvaluator, ) @@ -41,6 +52,8 @@ __all__ = [ "SpendLimitEvaluator", "SpendLimitConfig", + "BudgetLimit", + "BudgetWindow", "TransactionPolicyEvaluator", "TransactionPolicyConfig", ] diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/__init__.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/__init__.py index cebe9fc7..424d6107 100644 --- a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/__init__.py +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/__init__.py @@ -1,12 +1,14 @@ """Spend-limit evaluator package.""" -from .config import SpendLimitConfig +from .config import BudgetLimit, BudgetWindow, SpendLimitConfig from .evaluator import SpendLimitEvaluator from .store import InMemorySpendStore, SpendStore __all__ = [ "SpendLimitEvaluator", "SpendLimitConfig", + "BudgetLimit", + "BudgetWindow", "SpendStore", "InMemorySpendStore", ] diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/config.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/config.py index dc4dbb19..f33bddce 100644 --- 
a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/config.py +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/config.py @@ -2,63 +2,142 @@ from __future__ import annotations -from pydantic import Field, field_validator +from decimal import Decimal +from typing import Any -from agent_control_evaluators import EvaluatorConfig +from pydantic import Field, field_validator, model_validator +from agent_control_evaluators import EvaluatorConfig -class SpendLimitConfig(EvaluatorConfig): - """Configuration for :class:`~.evaluator.SpendLimitEvaluator`. - All monetary fields are expressed in the units of *currency*. +class BudgetWindow(EvaluatorConfig): + """Defines the time window for a rolling or calendar-based budget. Attributes: - max_per_transaction: Hard cap on any single transaction amount. A - transaction whose ``amount`` exceeds this value is blocked - regardless of accumulated period spend. Set to ``0.0`` to disable. - max_per_period: Maximum total spend allowed within the rolling - *period_seconds* window. Set to ``0.0`` to disable. - period_seconds: Length of the rolling budget window in seconds. - Defaults to ``86400`` (24 hours). - currency: Currency symbol this policy applies to (e.g. ``"USDC"``). - Transactions whose currency does not match are passed through as - *not matched* (i.e. allowed). + kind: ``"rolling"`` — a sliding window of *seconds* duration; + ``"fixed"`` — a calendar-aligned window (day / week / month). + seconds: Window length in seconds. **Required** when ``kind="rolling"``. + unit: Calendar unit. **Required** when ``kind="fixed"``. + One of ``"day"``, ``"week"``, ``"month"``. + timezone: IANA timezone name for ``kind="fixed"`` windows (e.g. + ``"America/New_York"``). Defaults to ``"UTC"`` when omitted. 
- Example config dict:: + Examples:: - { - "max_per_transaction": 500.0, - "max_per_period": 5000.0, - "period_seconds": 86400, - "currency": "USDC" - } + BudgetWindow(kind="rolling", seconds=86400) # 24-hour rolling + BudgetWindow(kind="fixed", unit="day") # UTC calendar day + BudgetWindow(kind="fixed", unit="month", timezone="America/New_York") """ - max_per_transaction: float = Field( - default=0.0, - ge=0.0, - description=( - "Per-transaction spend cap in *currency* units. " - "0.0 means no per-transaction limit." - ), + kind: str = Field( + ..., + description='Window kind: "rolling" or "fixed".', ) - max_per_period: float = Field( - default=0.0, - ge=0.0, + seconds: int | None = Field( + default=None, + ge=1, + description="Window duration in seconds. Required for kind='rolling'.", + ) + unit: str | None = Field( + default=None, description=( - "Maximum cumulative spend allowed in the rolling period window. " - "0.0 means no period limit." + 'Calendar unit: "day", "week", or "month". Required for kind="fixed".' ), ) - period_seconds: int = Field( - default=86_400, - ge=1, - description="Rolling budget window length in seconds (default: 86400 = 24 h).", + timezone: str | None = Field( + default=None, + description='IANA timezone (e.g. "America/New_York"). 
Defaults to "UTC".', + ) + + @model_validator(mode="after") + def validate_window_fields(self) -> BudgetWindow: + """Enforce that required fields are present for each kind.""" + if self.kind == "rolling": + if self.seconds is None: + raise ValueError( + "BudgetWindow kind='rolling' requires 'seconds' to be set" + ) + elif self.kind == "fixed": + valid_units = {"day", "week", "month"} + if self.unit is None: + raise ValueError( + "BudgetWindow kind='fixed' requires 'unit' to be set " + f"(one of {sorted(valid_units)})" + ) + if self.unit not in valid_units: + raise ValueError( + f"BudgetWindow unit must be one of {sorted(valid_units)}, " + f"got '{self.unit}'" + ) + else: + raise ValueError( + f"BudgetWindow kind must be 'rolling' or 'fixed', got '{self.kind}'" + ) + return self + + +class BudgetLimit(EvaluatorConfig): + """A single budget constraint, optionally scoped to a context dimension. + + Attributes: + amount: Maximum monetary amount. Uses ``Decimal`` for precision — + never ``float`` for money. + currency: Currency symbol this limit applies to (e.g. ``"USDC"``). + scope_by: Tuple of context dimension keys used to isolate budgets. + Each dimension is **independent**: ``scope_by=("channel",)`` creates + a separate counter for each unique channel value. + An empty tuple means global (unscoped): all transactions for this + currency share a single counter. + window: Time window for accumulated-spend budgets. ``None`` means a + per-transaction cap: ``amount`` is the maximum for any single + transaction, regardless of accumulated spend. 
+ + Examples:: + + # Per-transaction cap of 500 USDC regardless of channel or agent + BudgetLimit(amount=Decimal("500"), currency="USDC") + + # Per-channel rolling 24-hour budget of 5000 USDC + BudgetLimit( + amount=Decimal("5000"), + currency="USDC", + scope_by=("channel",), + window=BudgetWindow(kind="rolling", seconds=86400), + ) + + # Per-agent calendar-day budget (US Eastern) + BudgetLimit( + amount=Decimal("1000"), + currency="USDC", + scope_by=("agent_id",), + window=BudgetWindow(kind="fixed", unit="day", timezone="America/New_York"), + ) + """ + + amount: Decimal = Field( + ..., + gt=Decimal("0"), + description="Budget ceiling — Decimal for monetary precision.", ) currency: str = Field( ..., min_length=1, - description="Currency symbol this policy applies to (e.g. 'USDC', 'ETH').", + description="Currency symbol this limit applies to (e.g. 'USDC', 'ETH').", + ) + scope_by: tuple[str, ...] = Field( + default=(), + description=( + "Context dimension keys that isolate spend buckets. " + "scope_by=('channel',) → one budget per channel. " + "Empty tuple = global budget." + ), + ) + window: BudgetWindow | None = Field( + default=None, + description=( + "Time window for accumulated-spend budgets. " + "None = per-transaction cap (amount is the per-call maximum)." + ), ) @field_validator("currency") @@ -66,3 +145,43 @@ class SpendLimitConfig(EvaluatorConfig): def normalize_currency(cls, v: str) -> str: """Normalize currency symbol to upper-case for consistent comparison.""" return v.upper() + + @field_validator("scope_by", mode="before") + @classmethod + def coerce_scope_by(cls, v: Any) -> tuple[str, ...]: + """Accept list or tuple for scope_by and coerce to tuple.""" + if isinstance(v, list): + return tuple(v) + return v + + +class SpendLimitConfig(EvaluatorConfig): + """Configuration for :class:`~.evaluator.SpendLimitEvaluator`. + + Each entry in *limits* is evaluated independently. First violation wins. 
+ + Attributes: + limits: List of :class:`BudgetLimit` constraints to enforce. + The evaluator checks each limit in order and returns a violation + result on the first breach. An empty list means no limits — + all transactions are allowed. + + Example config dict:: + + { + "limits": [ + {"amount": "500.00", "currency": "USDC"}, + { + "amount": "5000.00", + "currency": "USDC", + "scope_by": ["channel"], + "window": {"kind": "rolling", "seconds": 86400} + } + ] + } + """ + + limits: list[BudgetLimit] = Field( + default_factory=list, + description="Budget constraints to enforce. Evaluated in order; first violation wins.", + ) diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/evaluator.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/evaluator.py index 71a198de..09e17ada 100644 --- a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/evaluator.py +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/evaluator.py @@ -2,7 +2,9 @@ from __future__ import annotations +import calendar import time +from decimal import Decimal, InvalidOperation from typing import Any from agent_control_evaluators import ( @@ -12,28 +14,69 @@ ) from agent_control_models import EvaluatorResult -from .config import SpendLimitConfig +from .config import BudgetLimit, SpendLimitConfig from .store import InMemorySpendStore, SpendStore -def _extract_float(data: dict[str, Any], key: str) -> float | None: - """Safely extract a float value from *data* by *key*.""" +def _extract_decimal(data: dict[str, Any], key: str) -> Decimal | None: + """Safely extract a Decimal value from *data* by *key*. + + Returns None if the key is absent or the value cannot be coerced. 
+ """ raw = data.get(key) if raw is None: return None try: - return float(raw) - except (TypeError, ValueError): + return Decimal(str(raw)) + except (TypeError, ValueError, InvalidOperation): return None +def _window_start(limit: BudgetLimit) -> float: + """Compute the Unix timestamp start of the current budget window. + + For ``kind="rolling"``: ``now - seconds``. + For ``kind="fixed"`` with ``unit="day"``: midnight UTC today. + For ``kind="fixed"`` with ``unit="week"``: midnight UTC Monday of this week. + For ``kind="fixed"`` with ``unit="month"``: midnight UTC on the 1st of this month. + + Note: Timezone support is noted in the model but calendar alignment uses UTC + for now. Full IANA timezone support is a follow-up. + """ + window = limit.window + assert window is not None # called only when window is set + + now = time.time() + if window.kind == "rolling": + assert window.seconds is not None + return now - window.seconds + + # kind == "fixed" + import datetime as _dt + utc_now = _dt.datetime.now(_dt.timezone.utc) + + if window.unit == "day": + start = utc_now.replace(hour=0, minute=0, second=0, microsecond=0) + elif window.unit == "week": + # Monday of the current ISO week + start = utc_now - _dt.timedelta(days=utc_now.weekday()) + start = start.replace(hour=0, minute=0, second=0, microsecond=0) + elif window.unit == "month": + start = utc_now.replace(day=1, hour=0, minute=0, second=0, microsecond=0) + else: + # Fallback — should not happen given BudgetWindow validation + start = utc_now.replace(hour=0, minute=0, second=0, microsecond=0) + + return start.timestamp() + + @register_evaluator class SpendLimitEvaluator(Evaluator[SpendLimitConfig]): """Evaluator that enforces per-transaction and rolling-period spend limits. - ``matched=True`` means the transaction **violates** the configured limits - and should be blocked. ``matched=False`` means the transaction is within - budget and may proceed. 
+ ``matched=True`` means the transaction **violates** at least one configured + limit and should be blocked. ``matched=False`` means the transaction is + within all budget constraints and may proceed. Thread safety: The evaluator itself is stateless. All mutable state lives in the @@ -45,50 +88,45 @@ class SpendLimitEvaluator(Evaluator[SpendLimitConfig]): class docstring). Only the ``SpendStore`` instance is mutable; do not add per-request state to ``self``. - Evaluating context-aware limits: - The ``data`` dict may contain channel-specific override keys such as - ``channel_max_per_transaction`` or ``channel_max_per_period``. These - override the base config values for that call, implementing lan17's - requirement that rules take context/metadata into account. - Args: - config: Validated :class:`SpendLimitConfig`. + config: Validated :class:`SpendLimitConfig` with ``limits`` list. store: Optional :class:`SpendStore` implementation. Defaults to a new :class:`InMemorySpendStore` when not provided. 
Input ``data`` schema:: { - "amount": float, # required — transaction amount - "currency": str, # required — payment currency - "recipient": str, # required — recipient address or identifier - # optional context fields + "amount": Decimal | float | str, # required — transaction amount + "currency": str, # required — payment currency + "recipient": str, # required — recipient address or id + # optional context fields (used for scope_by matching) "channel": str, "agent_id": str, "session_id": str, - # optional per-call limit overrides (from evaluate() metadata) - "channel_max_per_transaction": float, - "channel_max_per_period": float } Example:: from agent_control_evaluator_financial_governance.spend_limit import ( - SpendLimitConfig, - SpendLimitEvaluator, - ) - - config = SpendLimitConfig( - max_per_transaction=100.0, - max_per_period=1000.0, - period_seconds=86400, - currency="USDC", + BudgetLimit, BudgetWindow, SpendLimitConfig, SpendLimitEvaluator ) + from decimal import Decimal + + config = SpendLimitConfig(limits=[ + BudgetLimit(amount=Decimal("100"), currency="USDC"), + BudgetLimit( + amount=Decimal("1000"), + currency="USDC", + scope_by=("channel",), + window=BudgetWindow(kind="rolling", seconds=86400), + ), + ]) evaluator = SpendLimitEvaluator(config) result = await evaluator.evaluate({ - "amount": 50.0, + "amount": "50.00", "currency": "USDC", "recipient": "0xABC...", + "channel": "slack", }) # result.matched == False → transaction is within limits """ @@ -138,30 +176,49 @@ def _normalize_data(data: Any) -> tuple[dict[str, Any] | None, dict[str, Any]]: ctx = data.get("context") or {} if not isinstance(tx, dict): return None, ctx if isinstance(ctx, dict) else {} - # Merge step context into tx so downstream logic sees channel/agent_id + # Merge step context into tx so downstream logic sees channel/agent_id. + # Input fields take priority — context must NOT clobber input values. 
merged = {**tx} if isinstance(ctx, dict): for k in ("channel", "agent_id", "session_id"): if k in ctx and k not in merged: merged[k] = ctx[k] - # Support context-level limit overrides - for k in ("channel_max_per_transaction", "channel_max_per_period"): - if k in ctx and k not in merged: - merged[k] = ctx[k] return merged, ctx if isinstance(ctx, dict) else {} # Otherwise assume data IS the transaction dict (selector.path: "input") return data, {} + def _build_scope( + self, data: dict[str, Any], limit: BudgetLimit + ) -> dict[str, str] | None: + """Build the scope filter for *limit* from transaction *data*. + + For each key in ``limit.scope_by``, extract the value from ``data`` + (if present). Returns ``None`` (global query) when scope_by is empty + or none of the specified keys are present in data. + """ + if not limit.scope_by: + return None + + scope: dict[str, str] = {} + for k in limit.scope_by: + val = data.get(k) + if val is not None: + scope[k] = str(val) + + return scope if scope else None + async def evaluate(self, data: Any) -> EvaluatorResult: - """Evaluate a transaction against configured spend limits. + """Evaluate a transaction against all configured spend limits. + + Iterates over ``config.limits`` in order. Returns the first violation + found or a passing result if all limits are satisfied. After passing + all rolling-period limits, records the transaction in the store. Args: data: Transaction dict (when ``selector.path`` is ``"input"``) - or full Step dict (when path is ``"*"``). Transaction fields: - ``amount``, ``currency``, ``recipient``. Context fields - (``channel``, ``agent_id``, ``session_id``) can live in the - transaction dict or in ``step.context``. + or full Step dict (when path is ``"*"``). Malformed payload + returns ``matched=False, error=None`` — not an evaluator error. 
Returns: ``EvaluatorResult`` where ``matched=True`` indicates a limit @@ -180,27 +237,24 @@ async def evaluate(self, data: Any) -> EvaluatorResult: matched=False, confidence=1.0, message=( - f"Could not extract transaction data from selector output; " + "Could not extract transaction data from selector output; " "skipping spend-limit check" ), ) - # Replace data with normalized transaction dict for the rest of evaluate data = tx_data # ---- Extract required fields ---- - # NOTE: Malformed selector output is NOT an evaluator error. The - # ``error`` field is reserved for evaluator crashes / timeouts / - # missing dependencies. Missing or invalid fields in the data dict - # are normal "does not match" results. - amount = _extract_float(data, "amount") + # NOTE: Malformed selector output is NOT an evaluator error. + # Missing or invalid fields → matched=False, error=None. + amount = _extract_decimal(data, "amount") if amount is None: return EvaluatorResult( matched=False, confidence=1.0, message="Transaction data missing required field 'amount'; cannot evaluate", ) - if amount <= 0: + if amount <= Decimal("0"): return EvaluatorResult( matched=False, confidence=1.0, @@ -217,91 +271,82 @@ async def evaluate(self, data: Any) -> EvaluatorResult: recipient: str = str(data.get("recipient", "")).strip() - # ---- Currency filter — only enforce policy for configured currency ---- - if tx_currency != self.config.currency: + # ---- No limits configured → allow everything ---- + if not self.config.limits: return EvaluatorResult( matched=False, confidence=1.0, - message=( - f"Transaction currency '{tx_currency}' does not match policy " - f"currency '{self.config.currency}'; skipping" - ), - metadata={"tx_currency": tx_currency, "policy_currency": self.config.currency}, + message="No limits configured; transaction allowed", + metadata={"amount": float(amount), "currency": tx_currency, "recipient": recipient}, ) - # ---- Resolve effective limits (context/metadata overrides) ---- - # 
Callers can embed channel-specific overrides directly in the data dict. - # This satisfies lan17's guidance that rules take context/metadata into account. - effective_max_per_tx = _extract_float(data, "channel_max_per_transaction") - if effective_max_per_tx is None: - effective_max_per_tx = self.config.max_per_transaction - - effective_max_per_period = _extract_float(data, "channel_max_per_period") - if effective_max_per_period is None: - effective_max_per_period = self.config.max_per_period - - # ---- Per-transaction cap ---- - if effective_max_per_tx > 0 and amount > effective_max_per_tx: - return EvaluatorResult( - matched=True, - confidence=1.0, - message=( - f"Transaction amount {amount} {tx_currency} exceeds per-transaction " - f"cap of {effective_max_per_tx} {tx_currency}" - ), - metadata={ - "violation": "per_transaction_cap", - "amount": amount, - "max_per_transaction": effective_max_per_tx, - "currency": tx_currency, - "recipient": recipient, - }, - ) - - # ---- Rolling period budget ---- - if effective_max_per_period > 0: - since = time.time() - self.config.period_seconds - - # Build scope for context-aware budget isolation. - # When channel/agent/session overrides are present, query only - # spend matching that context — not global spend. 
- scope: dict[str, str] | None = None - if any(k in data for k in ("channel", "agent_id", "session_id")): - scope = { - k: str(data[k]) - for k in ("channel", "agent_id", "session_id") - if k in data and data[k] is not None - } - if not scope: - scope = None - - period_spend = self._store.get_spend(tx_currency, since, scope=scope) - projected = period_spend + amount - - if projected > effective_max_per_period: - return EvaluatorResult( - matched=True, - confidence=1.0, - message=( - f"Transaction would bring period spend to " - f"{projected:.4f} {tx_currency}, exceeding the " - f"{self.config.period_seconds}s budget of " - f"{effective_max_per_period} {tx_currency} " - f"(current period spend: {period_spend:.4f})" - ), - metadata={ - "violation": "period_budget", - "amount": amount, - "current_period_spend": period_spend, - "projected_period_spend": projected, - "max_per_period": effective_max_per_period, - "period_seconds": self.config.period_seconds, - "currency": tx_currency, - "recipient": recipient, - }, - ) - - # ---- Transaction is within limits — record it ---- + # ---- Evaluate each limit in order ---- + # We iterate all limits first to check. If all pass, record once at the end. + # Period budgets: get_spend() now, record_spend() at the end; NOT atomic (TOCTOU). + # We collect limits that apply to this transaction (matching currency) + # and also track which limits need to be recorded after all checks pass. 
+ + period_limits_to_record: list[tuple[BudgetLimit, dict[str, str] | None, float]] = [] + # ^ (limit, scope, window_start) + + for limit in self.config.limits: + # Skip limits for other currencies + if limit.currency != tx_currency: + continue + + scope = self._build_scope(data, limit) + + # Per-transaction cap (window=None) + if limit.window is None: + if amount > limit.amount: + return EvaluatorResult( + matched=True, + confidence=1.0, + message=( + f"Transaction amount {amount} {tx_currency} exceeds " + f"per-transaction cap of {limit.amount} {tx_currency}" + ), + metadata={ + "violation": "per_transaction_cap", + "amount": float(amount), + "max_per_transaction": float(limit.amount), + "currency": tx_currency, + "recipient": recipient, + }, + ) + # Per-tx cap passes → no need to "record" a cap (it's per-call) + + else: + # Rolling / fixed period budget + win_start = _window_start(limit) + period_limits_to_record.append((limit, scope, win_start)) + + period_spend = self._store.get_spend(tx_currency, win_start, scope=scope) + projected = period_spend + amount + + if projected > limit.amount: + return EvaluatorResult( + matched=True, + confidence=1.0, + message=( + f"Transaction would bring period spend to " + f"{projected} {tx_currency}, exceeding the " + f"{limit.window.kind} budget of {limit.amount} {tx_currency} " + f"(current period spend: {period_spend})" + ), + metadata={ + "violation": "period_budget", + "amount": float(amount), + "current_period_spend": float(period_spend), + "projected_period_spend": float(projected), + "max_per_period": float(limit.amount), + "currency": tx_currency, + "recipient": recipient, + }, + ) + + # ---- All limits passed — record the spend ---- + # Build metadata to attach to the spend record spend_metadata: dict[str, Any] = { k: data[k] for k in ("channel", "agent_id", "session_id") @@ -309,11 +354,14 @@ async def evaluate(self, data: Any) -> EvaluatorResult: } spend_metadata["recipient"] = recipient - 
self._store.record_spend( - amount=amount, - currency=tx_currency, - metadata=spend_metadata if spend_metadata else None, - ) + # Record once per transaction (not once per limit — the store is a ledger) + # We only need one record; all scope queries will find it via their filters. + if period_limits_to_record: + self._store.record_spend( + amount=amount, + currency=tx_currency, + metadata=spend_metadata if spend_metadata else None, + ) return EvaluatorResult( matched=False, @@ -322,7 +370,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: f"Transaction of {amount} {tx_currency} to '{recipient}' is within limits" ), metadata={ - "amount": amount, + "amount": float(amount), "currency": tx_currency, "recipient": recipient, }, diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/store.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/store.py index b216ec6a..260cf684 100644 --- a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/store.py +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/spend_limit/store.py @@ -7,12 +7,34 @@ For production multi-process or multi-replica deployments you should implement a custom SpendStore backed by a durable store such as PostgreSQL or Redis. See README.md for an example. + +Atomicity note +-------------- +The ``check_and_record()`` method is the recommended path for enforcing hard +spend caps. It atomically queries the current spend *and* records a new entry +(or rejects it) in a single operation, eliminating the TOCTOU race that exists +when callers do ``get_spend()`` followed by ``record_spend()`` separately. + +The ``InMemorySpendStore`` implements atomicity with a threading ``Lock``. +This is safe within a single process but does NOT prevent overshoot across +multiple processes or replicas. 
Production deployments that require strict +enforcement should use a backend with database-level atomics: + +- **PostgreSQL**: ``SELECT SUM(...) FOR UPDATE`` + conditional ``INSERT`` + inside a single transaction. +- **Redis**: Lua script or ``MULTI``/``EXEC`` pipeline with a + compare-and-swap pattern. + +Document this single-process limitation prominently in any custom store +implementation so operators are not surprised by concurrent overshoot in +distributed deployments. """ from __future__ import annotations import time from collections import deque +from decimal import Decimal from threading import Lock from typing import Any, Protocol, runtime_checkable @@ -21,20 +43,32 @@ class SpendStore(Protocol): """Protocol that all spend store implementations must satisfy. - Implementations are free to choose any persistence mechanism (in-memory, - Redis, PostgreSQL, …). Both methods must be thread-safe. + Implementations are free to choose any persistence mechanism. + All methods must be thread-safe. + + Atomic enforcement + ------------------ + Prefer ``check_and_record()`` over the separate ``get_spend()`` + + ``record_spend()`` pattern. The split pattern has a TOCTOU race condition: + two concurrent requests can both read the same current spend, both decide + they are within budget, and both record — overshooting the cap. + + ``check_and_record()`` performs the read-decide-write as a single atomic + step. For the ``InMemorySpendStore`` this is protected by a + ``threading.Lock`` (single-process only). Production stores should use + DB-level atomics (see module docstring). """ def record_spend( self, - amount: float, + amount: Decimal, currency: str, metadata: dict[str, Any] | None = None, ) -> None: - """Persist a completed (or pending) spend record. + """Persist a completed spend record. Args: - amount: Positive monetary amount that was spent. + amount: Positive monetary amount (Decimal — never float for money). currency: ISO-4217 or token symbol (e.g. 
``"USDC"``). metadata: Optional key-value bag for agent_id, session_id, etc. """ @@ -43,23 +77,66 @@ def record_spend( def get_spend( self, currency: str, - since_timestamp: float, + start: float, + end: float | None = None, scope: dict[str, str] | None = None, - ) -> float: - """Return total spend for *currency* since *since_timestamp*. + ) -> Decimal: + """Return total spend for *currency* within the given time range. Args: currency: Currency symbol to query (case-sensitive). - since_timestamp: Unix timestamp (seconds). Only records whose - ``recorded_at`` is >= this value are included. + start: Unix timestamp (seconds, inclusive lower bound). + end: Unix timestamp (seconds, inclusive upper bound). ``None`` + means "up to now" — no upper bound is applied. scope: Optional key-value pairs to filter by metadata fields. For example, ``{"channel": "slack"}`` returns only spend recorded with that channel in metadata. When None, returns all spend regardless of metadata. + **Scope semantics (composite key):** + All present keys together form a single composite scope key. + A record with ``{"channel": "A", "agent_id": "bot-1"}`` will + only match a scope of ``{"channel": "A", "agent_id": "bot-1"}`` + — NOT a query for ``{"channel": "A"}`` alone. + + Returns: + Sum of all matching spend amounts as a Decimal. + """ + ... + + def check_and_record( + self, + amount: Decimal, + currency: str, + limit: Decimal, + start: float, + end: float | None = None, + scope: dict[str, str] | None = None, + metadata: dict[str, Any] | None = None, + ) -> tuple[bool, Decimal]: + """Atomically check whether recording *amount* stays within *limit* + and, if so, record it. + + Eliminates the TOCTOU race of separate ``get_spend()`` + ``record_spend()``. + + **Single-process atomicity only** for ``InMemorySpendStore``. + Production stores must use DB-level atomics (see module docstring). + + Args: + amount: Positive monetary amount of the proposed transaction. + currency: Currency symbol (e.g. 
``"USDC"``). + limit: Maximum allowed total spend *including* this transaction. + Rejected if ``current_spend + amount > limit``. + start: Unix timestamp lower bound for the current-period query. + end: Unix timestamp upper bound (``None`` = "up to now"). + scope: Optional metadata filter (same semantics as ``get_spend``). + metadata: Metadata to attach to the new record if accepted. + Returns: - Sum of all matching spend amounts. Returns 0.0 when no records - match. + ``(accepted, current_spend)`` where: + + - ``accepted`` is ``True`` when within budget and recorded. + - ``current_spend`` is total period spend *before* this transaction. """ ... @@ -71,7 +148,7 @@ class _SpendRecord: def __init__( self, - amount: float, + amount: Decimal, currency: str, recorded_at: float, metadata: dict[str, Any] | None, @@ -94,17 +171,24 @@ def matches_scope(self, scope: dict[str, str]) -> bool: class InMemorySpendStore: """Thread-safe in-memory implementation of :class:`SpendStore`. - Records are kept in a ``deque`` ordered by insertion time. A background - sweep prunes records older than *max_age_seconds* to prevent unbounded - memory growth. + Records are kept in a ``deque`` ordered by insertion time. Records older + than *max_age_seconds* are pruned to prevent unbounded memory growth. + + **Single-process only.** Each process maintains an independent ledger. + Use for single-process services, local development, and tests. + For production deployments use a custom ``SpendStore`` backed by + PostgreSQL, Redis, or another shared store with DB-level atomic operations. - This implementation is **not** suitable for multi-process or distributed - deployments because each process maintains an independent ledger. Use it - for single-process services, local development, and tests. + Atomicity + --------- + ``check_and_record()`` acquires the internal lock for the entire + read-decide-write sequence, making it atomic within a single process. 
+ ``get_spend()`` + ``record_spend()`` called separately are *not* atomic + and may overshoot the cap under concurrent load. Args: - max_age_seconds: Records older than this many seconds are eligible for - pruning. Defaults to 7 days (604 800 s). + max_age_seconds: Records older than this are eligible for pruning. + Defaults to 7 days (604 800 s). """ def __init__(self, max_age_seconds: int = 604_800) -> None: @@ -118,18 +202,12 @@ def __init__(self, max_age_seconds: int = 604_800) -> None: def record_spend( self, - amount: float, + amount: Decimal, currency: str, metadata: dict[str, Any] | None = None, ) -> None: - """Record a spend event at the current wall-clock time. - - Args: - amount: Positive monetary amount. - currency: Currency symbol (e.g. ``"USDC"``). - metadata: Optional context bag (agent_id, session_id, channel, …). - """ - if amount <= 0: + """Record a spend event at the current wall-clock time.""" + if amount <= Decimal("0"): raise ValueError(f"amount must be positive, got {amount!r}") now = time.time() @@ -146,35 +224,73 @@ def record_spend( def get_spend( self, currency: str, - since_timestamp: float, + start: float, + end: float | None = None, scope: dict[str, str] | None = None, - ) -> float: - """Sum all spend for *currency* since *since_timestamp*. + ) -> Decimal: + """Sum all spend for *currency* in the time range [start, end].""" + with self._lock: + return self._sum_locked(currency, start, end, scope) - Args: - currency: Currency symbol (case-sensitive). - since_timestamp: Unix epoch seconds (inclusive lower bound). - scope: Optional metadata filter. When provided, only records - whose metadata contains all specified key-value pairs are - included. When None, all records for the currency are summed. 
+ def check_and_record( + self, + amount: Decimal, + currency: str, + limit: Decimal, + start: float, + end: float | None = None, + scope: dict[str, str] | None = None, + metadata: dict[str, Any] | None = None, + ) -> tuple[bool, Decimal]: + """Atomically check the period budget and record if within limit. - Returns: - Total spend as a float. + Acquires the internal lock for the entire read-decide-write sequence. + **Single-process atomicity only** — does not coordinate across + multiple processes or replicas. """ + if amount <= Decimal("0"): + raise ValueError(f"amount must be positive, got {amount!r}") + + now = time.time() with self._lock: - total = 0.0 - for r in self._records: - if r.currency != currency or r.recorded_at < since_timestamp: - continue - if scope is not None and not r.matches_scope(scope): - continue - total += r.amount - return total + current = self._sum_locked(currency, start, end, scope) + if current + amount > limit: + return False, current + record = _SpendRecord( + amount=amount, + currency=currency, + recorded_at=now, + metadata=metadata, + ) + self._records.append(record) + self._prune_locked(now) + return True, current # ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ + def _sum_locked( + self, + currency: str, + start: float, + end: float | None, + scope: dict[str, str] | None, + ) -> Decimal: + """Sum records matching the query (must be called with _lock held).""" + total = Decimal("0") + for r in self._records: + if r.currency != currency: + continue + if r.recorded_at < start: + continue + if end is not None and r.recorded_at > end: + continue + if scope is not None and not r.matches_scope(scope): + continue + total += r.amount + return total + def _prune_locked(self, now: float) -> None: """Remove records older than *max_age_seconds* (called with lock held).""" cutoff = now - self._max_age_seconds diff --git 
a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/config.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/config.py index 67b076aa..286e8a2f 100644 --- a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/config.py +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/config.py @@ -2,6 +2,7 @@ from __future__ import annotations +from decimal import Decimal from typing import Any from pydantic import Field, field_validator, model_validator @@ -20,10 +21,10 @@ class TransactionPolicyConfig(EvaluatorConfig): permitted. Transactions to any other address are blocked. blocked_recipients: Recipients that are explicitly prohibited. Checked before ``allowed_recipients``. - min_amount: Minimum transaction amount (inclusive). ``0.0`` disables - the lower bound check. - max_amount: Maximum transaction amount (inclusive). ``0.0`` disables - the upper bound check. + min_amount: Minimum transaction amount (inclusive). ``Decimal("0")`` + disables the lower bound check. + max_amount: Maximum transaction amount (inclusive). ``Decimal("0")`` + disables the upper bound check. allowed_currencies: If non-empty, **only** currencies in this list are permitted. @@ -32,8 +33,8 @@ class TransactionPolicyConfig(EvaluatorConfig): { "allowed_recipients": ["0xABC...", "0xDEF..."], "blocked_recipients": ["0xDEAD..."], - "min_amount": 0.01, - "max_amount": 10000.0, + "min_amount": "0.01", + "max_amount": "10000.00", "allowed_currencies": ["USDC", "USDT"] } """ @@ -49,15 +50,15 @@ class TransactionPolicyConfig(EvaluatorConfig): default_factory=list, description="Blocklisted recipient addresses that are always denied.", ) - min_amount: float = Field( - default=0.0, - ge=0.0, - description="Minimum transaction amount (inclusive). 
0.0 = no minimum.", + min_amount: Decimal = Field( + default=Decimal("0"), + ge=Decimal("0"), + description="Minimum transaction amount (inclusive). Decimal('0') = no minimum.", ) - max_amount: float = Field( - default=0.0, - ge=0.0, - description="Maximum transaction amount (inclusive). 0.0 = no maximum.", + max_amount: Decimal = Field( + default=Decimal("0"), + ge=Decimal("0"), + description="Maximum transaction amount (inclusive). Decimal('0') = no maximum.", ) allowed_currencies: list[str] = Field( default_factory=list, @@ -78,7 +79,11 @@ def normalize_currencies(cls, v: Any) -> list[str]: @model_validator(mode="after") def validate_amount_bounds(self) -> TransactionPolicyConfig: """Ensure max_amount >= min_amount when both are non-zero.""" - if self.max_amount > 0.0 and self.min_amount > 0.0 and self.max_amount < self.min_amount: + if ( + self.max_amount > Decimal("0") + and self.min_amount > Decimal("0") + and self.max_amount < self.min_amount + ): raise ValueError( f"max_amount ({self.max_amount}) must be >= min_amount ({self.min_amount})" ) diff --git a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/evaluator.py b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/evaluator.py index 4ee717ff..f1542fa8 100644 --- a/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/evaluator.py +++ b/evaluators/contrib/financial-governance/src/agent_control_evaluator_financial_governance/transaction_policy/evaluator.py @@ -2,6 +2,7 @@ from __future__ import annotations +from decimal import Decimal, InvalidOperation from typing import Any from agent_control_evaluators import ( @@ -36,9 +37,9 @@ class TransactionPolicyEvaluator(Evaluator[TransactionPolicyConfig]): Input ``data`` schema:: { - "amount": float, # required — transaction amount - "currency": str, # required — payment currency - "recipient": 
str, # required — recipient address or identifier + "amount": Decimal | float | str, # required — transaction amount + "currency": str, # required — payment currency + "recipient": str, # required — recipient address or id # optional context fields (logged in result metadata) "channel": str, "agent_id": str, @@ -51,15 +52,16 @@ class TransactionPolicyEvaluator(Evaluator[TransactionPolicyConfig]): TransactionPolicyConfig, TransactionPolicyEvaluator, ) + from decimal import Decimal config = TransactionPolicyConfig( allowed_currencies=["USDC", "USDT"], blocked_recipients=["0xDEAD..."], - max_amount=5000.0, + max_amount=Decimal("5000"), ) evaluator = TransactionPolicyEvaluator(config) result = await evaluator.evaluate({ - "amount": 100.0, + "amount": "100.00", "currency": "USDC", "recipient": "0xABC...", }) @@ -103,7 +105,8 @@ async def evaluate(self, data: Any) -> EvaluatorResult: Args: data: Transaction dict (when ``selector.path`` is ``"input"``) - or full Step dict (when path is ``"*"``). + or full Step dict (when path is ``"*"``). Malformed payload + returns ``matched=False, error=None`` — not an evaluator error. 
Returns: ``EvaluatorResult`` where ``matched=True`` indicates a policy @@ -128,6 +131,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: data = tx_data # ---- Extract and validate required fields ---- + # Malformed input → matched=False, error=None (not an evaluator crash) currency_raw = data.get("currency") if not currency_raw: return EvaluatorResult( @@ -154,8 +158,8 @@ async def evaluate(self, data: Any) -> EvaluatorResult: message="Transaction data missing required field 'amount'", ) try: - amount = float(amount_raw) - except (TypeError, ValueError): + amount = Decimal(str(amount_raw)) + except (TypeError, ValueError, InvalidOperation): return EvaluatorResult( matched=False, confidence=1.0, @@ -164,7 +168,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: # Build shared metadata for result context base_meta: dict[str, Any] = { - "amount": amount, + "amount": float(amount), "currency": currency, "recipient": recipient, } @@ -217,7 +221,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: ) # ---- Check 4: Minimum amount ---- - if self.config.min_amount > 0.0 and amount < self.config.min_amount: + if self.config.min_amount > Decimal("0") and amount < self.config.min_amount: return EvaluatorResult( matched=True, confidence=1.0, @@ -228,12 +232,12 @@ async def evaluate(self, data: Any) -> EvaluatorResult: metadata={ **base_meta, "violation": "amount_below_minimum", - "min_amount": self.config.min_amount, + "min_amount": float(self.config.min_amount), }, ) # ---- Check 5: Maximum amount ---- - if self.config.max_amount > 0.0 and amount > self.config.max_amount: + if self.config.max_amount > Decimal("0") and amount > self.config.max_amount: return EvaluatorResult( matched=True, confidence=1.0, @@ -244,7 +248,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: metadata={ **base_meta, "violation": "amount_exceeds_maximum", - "max_amount": self.config.max_amount, + "max_amount": float(self.config.max_amount), }, ) diff --git 
a/evaluators/contrib/financial-governance/tests/test_spend_limit.py b/evaluators/contrib/financial-governance/tests/test_spend_limit.py index 07f06a78..59316099 100644 --- a/evaluators/contrib/financial-governance/tests/test_spend_limit.py +++ b/evaluators/contrib/financial-governance/tests/test_spend_limit.py @@ -3,17 +3,76 @@ from __future__ import annotations import time +from decimal import Decimal from typing import Any import pytest from agent_control_evaluator_financial_governance.spend_limit import ( + BudgetLimit, + BudgetWindow, InMemorySpendStore, SpendLimitConfig, SpendLimitEvaluator, ) +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _rolling_window(seconds: int = 86400) -> BudgetWindow: + return BudgetWindow(kind="rolling", seconds=seconds) + + +def _per_tx_limit(amount: str | Decimal, currency: str = "USDC", **kw: Any) -> BudgetLimit: + """Build a per-transaction cap (no window).""" + return BudgetLimit(amount=Decimal(str(amount)), currency=currency, **kw) + + +def _period_limit( + amount: str | Decimal, + currency: str = "USDC", + seconds: int = 86400, + **kw: Any, +) -> BudgetLimit: + """Build a rolling-period budget limit.""" + return BudgetLimit( + amount=Decimal(str(amount)), + currency=currency, + window=_rolling_window(seconds), + **kw, + ) + + +def _make_evaluator( + limits: list[BudgetLimit] | None = None, + store: InMemorySpendStore | None = None, + # Legacy convenience kwargs translated to BudgetLimit list + max_per_transaction: str | Decimal | None = None, + max_per_period: str | Decimal | None = None, + period_seconds: int = 86400, + currency: str = "USDC", +) -> SpendLimitEvaluator: + if limits is None: + limits = [] + if max_per_transaction is not None and Decimal(str(max_per_transaction)) > 0: + limits.append(_per_tx_limit(max_per_transaction, currency=currency)) + if max_per_period is not None and 
Decimal(str(max_per_period)) > 0: + limits.append(_period_limit(max_per_period, currency=currency, seconds=period_seconds)) + cfg = SpendLimitConfig(limits=limits) + return SpendLimitEvaluator(cfg, store=store) + + +def _tx( + amount: Any = "10.00", + currency: str = "USDC", + recipient: str = "0xABC", + **extra: Any, +) -> dict[str, Any]: + return {"amount": amount, "currency": currency, "recipient": recipient, **extra} + + # --------------------------------------------------------------------------- # InMemorySpendStore unit tests # --------------------------------------------------------------------------- @@ -22,110 +81,229 @@ def test_store_record_and_query() -> None: """Basic record/query round-trip.""" store = InMemorySpendStore() - since = time.time() - 1 # slightly in the past + since = time.time() - 1 - store.record_spend(100.0, "USDC") - store.record_spend(50.0, "USDC") - store.record_spend(200.0, "ETH") # different currency — should not be counted + store.record_spend(Decimal("100"), "USDC") + store.record_spend(Decimal("50"), "USDC") + store.record_spend(Decimal("200"), "ETH") - assert store.get_spend("USDC", since) == pytest.approx(150.0) - assert store.get_spend("ETH", since) == pytest.approx(200.0) - assert store.get_spend("USDT", since) == pytest.approx(0.0) + assert store.get_spend("USDC", since) == Decimal("150") + assert store.get_spend("ETH", since) == Decimal("200") + assert store.get_spend("USDT", since) == Decimal("0") def test_store_since_timestamp_filters_old_records() -> None: - """Records before since_timestamp are excluded from get_spend.""" store = InMemorySpendStore() + store.record_spend(Decimal("1000"), "USDC") + future_since = time.time() + 1 + assert store.get_spend("USDC", future_since) == Decimal("0") + + +def test_store_end_timestamp_filters_future_records() -> None: + store = InMemorySpendStore() + past_end = time.time() - 1 + store.record_spend(Decimal("100"), "USDC") + assert store.get_spend("USDC", time.time() - 10, 
end=past_end) == Decimal("0") - store.record_spend(1000.0, "USDC") - future_since = time.time() + 1 # everything is "before" this - assert store.get_spend("USDC", future_since) == pytest.approx(0.0) +def test_store_end_none_includes_all_current_records() -> None: + store = InMemorySpendStore() + store.record_spend(Decimal("100"), "USDC") + assert store.get_spend("USDC", time.time() - 5) == Decimal("100") def test_store_record_count() -> None: store = InMemorySpendStore() assert store.record_count() == 0 - store.record_spend(1.0, "USDC") - store.record_spend(2.0, "USDC") + store.record_spend(Decimal("1"), "USDC") + store.record_spend(Decimal("2"), "USDC") assert store.record_count() == 2 def test_store_rejects_non_positive_amount() -> None: store = InMemorySpendStore() with pytest.raises(ValueError, match="amount must be positive"): - store.record_spend(0.0, "USDC") + store.record_spend(Decimal("0"), "USDC") with pytest.raises(ValueError, match="amount must be positive"): - store.record_spend(-5.0, "USDC") + store.record_spend(Decimal("-5"), "USDC") def test_store_metadata_accepted() -> None: - """Metadata kwarg is stored without error.""" store = InMemorySpendStore() - store.record_spend(10.0, "USDC", metadata={"agent_id": "agent-1", "session_id": "s-99"}) + store.record_spend( + Decimal("10"), "USDC", + metadata={"agent_id": "agent-1", "session_id": "s-99"}, + ) assert store.record_count() == 1 +def test_store_scope_filter() -> None: + """get_spend with scope only returns matching records.""" + store = InMemorySpendStore() + since = time.time() - 1 + store.record_spend(Decimal("90"), "USDC", metadata={"channel": "A"}) + store.record_spend(Decimal("20"), "USDC", metadata={"channel": "B"}) + + assert store.get_spend("USDC", since, scope={"channel": "A"}) == Decimal("90") + assert store.get_spend("USDC", since, scope={"channel": "B"}) == Decimal("20") + assert store.get_spend("USDC", since) == Decimal("110") + + # 
--------------------------------------------------------------------------- -# SpendLimitConfig validation tests +# check_and_record atomic tests # --------------------------------------------------------------------------- -def test_config_currency_normalized_to_upper() -> None: - cfg = SpendLimitConfig(currency="usdc", max_per_transaction=100.0) - assert cfg.currency == "USDC" +def test_check_and_record_accepts_within_limit() -> None: + """check_and_record records and returns (True, prior_spend).""" + store = InMemorySpendStore() + since = time.time() - 1 + accepted, prior = store.check_and_record( + amount=Decimal("50"), + currency="USDC", + limit=Decimal("100"), + start=since, + ) + assert accepted is True + assert prior == Decimal("0") + assert store.record_count() == 1 + assert store.get_spend("USDC", since) == Decimal("50") -def test_config_defaults() -> None: - cfg = SpendLimitConfig(currency="USDC") - assert cfg.max_per_transaction == 0.0 - assert cfg.max_per_period == 0.0 - assert cfg.period_seconds == 86_400 +def test_check_and_record_rejects_over_limit() -> None: + """check_and_record rejects when amount would exceed limit.""" + store = InMemorySpendStore() + since = time.time() - 1 + store.record_spend(Decimal("90"), "USDC") + + accepted, prior = store.check_and_record( + amount=Decimal("20"), + currency="USDC", + limit=Decimal("100"), + start=since, + ) + assert accepted is False + assert prior == Decimal("90") + assert store.record_count() == 1 -def test_config_rejects_negative_max_per_transaction() -> None: - with pytest.raises(Exception): - SpendLimitConfig(currency="USDC", max_per_transaction=-1.0) + +def test_check_and_record_exact_boundary_accepted() -> None: + """check_and_record accepts when spend exactly reaches the limit.""" + store = InMemorySpendStore() + since = time.time() - 1 + store.record_spend(Decimal("90"), "USDC") + + accepted, prior = store.check_and_record( + amount=Decimal("10"), + currency="USDC", + limit=Decimal("100"), + 
start=since, + ) + assert accepted is True + assert prior == Decimal("90") + assert store.get_spend("USDC", since) == Decimal("100") -def test_config_rejects_zero_period_seconds() -> None: - with pytest.raises(Exception): - SpendLimitConfig(currency="USDC", period_seconds=0) +def test_check_and_record_scoped_isolation() -> None: + """check_and_record with scope only counts matching records.""" + store = InMemorySpendStore() + since = time.time() - 1 + store.record_spend(Decimal("90"), "USDC", metadata={"channel": "A"}) + + accepted, prior = store.check_and_record( + amount=Decimal("20"), + currency="USDC", + limit=Decimal("100"), + start=since, + scope={"channel": "B"}, + metadata={"channel": "B"}, + ) + assert accepted is True + assert prior == Decimal("0") + assert store.get_spend("USDC", since, scope={"channel": "B"}) == Decimal("20") + + +def test_check_and_record_rejects_non_positive() -> None: + store = InMemorySpendStore() + with pytest.raises(ValueError): + store.check_and_record( + amount=Decimal("0"), + currency="USDC", + limit=Decimal("100"), + start=time.time() - 1, + ) # --------------------------------------------------------------------------- -# SpendLimitEvaluator tests +# BudgetWindow / BudgetLimit / SpendLimitConfig validation # --------------------------------------------------------------------------- -def _make_evaluator( - max_per_transaction: float = 0.0, - max_per_period: float = 0.0, - period_seconds: int = 86400, - currency: str = "USDC", - store: InMemorySpendStore | None = None, -) -> SpendLimitEvaluator: - cfg = SpendLimitConfig( - max_per_transaction=max_per_transaction, - max_per_period=max_per_period, - period_seconds=period_seconds, - currency=currency, - ) - return SpendLimitEvaluator(cfg, store=store) +def test_budget_limit_currency_normalized() -> None: + limit = BudgetLimit(amount=Decimal("100"), currency="usdc") + assert limit.currency == "USDC" -def _tx( - amount: float = 10.0, - currency: str = "USDC", - recipient: str = 
"0xABC", - **extra: Any, -) -> dict[str, Any]: - return {"amount": amount, "currency": currency, "recipient": recipient, **extra} +def test_budget_window_rolling_requires_seconds() -> None: + with pytest.raises(Exception): + BudgetWindow(kind="rolling") + + +def test_budget_window_fixed_requires_unit() -> None: + with pytest.raises(Exception): + BudgetWindow(kind="fixed") + + +def test_budget_window_rolling_valid() -> None: + w = BudgetWindow(kind="rolling", seconds=3600) + assert w.seconds == 3600 + + +def test_budget_window_fixed_valid() -> None: + w = BudgetWindow(kind="fixed", unit="day", timezone="America/New_York") + assert w.unit == "day" + assert w.timezone == "America/New_York" + + +def test_config_empty_limits() -> None: + cfg = SpendLimitConfig(limits=[]) + assert cfg.limits == [] + + +def test_config_limits_parsed_from_dict() -> None: + """SpendLimitConfig parses limits from dicts (Pydantic coercion).""" + cfg = SpendLimitConfig(limits=[ + {"amount": "100.00", "currency": "USDC"}, + { + "amount": "1000.00", + "currency": "USDC", + "scope_by": ["channel"], + "window": {"kind": "rolling", "seconds": 86400}, + }, + ]) + assert len(cfg.limits) == 2 + assert cfg.limits[0].amount == Decimal("100.00") + assert cfg.limits[1].scope_by == ("channel",) + assert cfg.limits[1].window is not None + assert cfg.limits[1].window.kind == "rolling" + + +def test_budget_limit_rejects_non_positive_amount() -> None: + with pytest.raises(Exception): + BudgetLimit(amount=Decimal("0"), currency="USDC") + with pytest.raises(Exception): + BudgetLimit(amount=Decimal("-1"), currency="USDC") + + +# --------------------------------------------------------------------------- +# SpendLimitEvaluator — basic behaviour +# --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_none_data_is_allowed() -> None: - ev = _make_evaluator(max_per_transaction=100.0) + ev = _make_evaluator(max_per_transaction="100") result = await 
ev.evaluate(None) assert result.matched is False assert result.error is None @@ -133,7 +311,7 @@ async def test_none_data_is_allowed() -> None: @pytest.mark.asyncio async def test_non_dict_data_is_allowed() -> None: - ev = _make_evaluator(max_per_transaction=100.0) + ev = _make_evaluator(max_per_transaction="100") result = await ev.evaluate("not a dict") assert result.matched is False assert result.error is None @@ -142,7 +320,7 @@ async def test_non_dict_data_is_allowed() -> None: @pytest.mark.asyncio async def test_missing_amount_not_matched() -> None: """Missing amount is a non-match, NOT an evaluator error.""" - ev = _make_evaluator(max_per_transaction=100.0) + ev = _make_evaluator(max_per_transaction="100") result = await ev.evaluate({"currency": "USDC", "recipient": "0xABC"}) assert result.matched is False assert result.error is None @@ -152,8 +330,8 @@ async def test_missing_amount_not_matched() -> None: @pytest.mark.asyncio async def test_missing_currency_not_matched() -> None: """Missing currency is a non-match, NOT an evaluator error.""" - ev = _make_evaluator(max_per_transaction=100.0) - result = await ev.evaluate({"amount": 10.0, "recipient": "0xABC"}) + ev = _make_evaluator(max_per_transaction="100") + result = await ev.evaluate({"amount": "10.00", "recipient": "0xABC"}) assert result.matched is False assert result.error is None assert "currency" in (result.message or "").lower() @@ -161,18 +339,28 @@ async def test_missing_currency_not_matched() -> None: @pytest.mark.asyncio async def test_wrong_currency_is_skipped() -> None: - """Transaction in a different currency should be allowed (not matched).""" - ev = _make_evaluator(max_per_transaction=1.0, currency="USDC") - # Amount 99999 but in ETH — policy only governs USDC - result = await ev.evaluate(_tx(amount=99999.0, currency="ETH")) + """Transaction in a different currency should be allowed.""" + ev = _make_evaluator(limits=[_per_tx_limit("1", currency="USDC")]) + result = await 
ev.evaluate(_tx(amount="99999.00", currency="ETH")) assert result.matched is False - assert result.metadata and result.metadata.get("tx_currency") == "ETH" + + +@pytest.mark.asyncio +async def test_no_limits_configured_allows_everything() -> None: + ev = _make_evaluator(limits=[]) + result = await ev.evaluate(_tx(amount="999999.00")) + assert result.matched is False + + +# --------------------------------------------------------------------------- +# Per-transaction cap tests +# --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_per_transaction_cap_violation() -> None: - ev = _make_evaluator(max_per_transaction=100.0) - result = await ev.evaluate(_tx(amount=101.0)) + ev = _make_evaluator(max_per_transaction="100") + result = await ev.evaluate(_tx(amount="101.00")) assert result.matched is True assert result.metadata and result.metadata["violation"] == "per_transaction_cap" assert result.error is None @@ -180,28 +368,23 @@ async def test_per_transaction_cap_violation() -> None: @pytest.mark.asyncio async def test_per_transaction_cap_exact_boundary_allowed() -> None: - ev = _make_evaluator(max_per_transaction=100.0) - result = await ev.evaluate(_tx(amount=100.0)) + ev = _make_evaluator(max_per_transaction="100") + result = await ev.evaluate(_tx(amount="100.00")) assert result.matched is False -@pytest.mark.asyncio -async def test_per_transaction_cap_disabled_at_zero() -> None: - ev = _make_evaluator(max_per_transaction=0.0) - result = await ev.evaluate(_tx(amount=9_999_999.0)) - assert result.matched is False +# --------------------------------------------------------------------------- +# Period budget tests (atomic via check_and_record) +# --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_period_budget_violation() -> None: store = InMemorySpendStore() - ev = _make_evaluator(max_per_period=500.0, store=store) - - # Pre-load 480 of spend - 
store.record_spend(480.0, "USDC") + ev = _make_evaluator(max_per_period="500", store=store) + store.record_spend(Decimal("480"), "USDC") - # Next transaction of 25 would push us to 505 — over budget - result = await ev.evaluate(_tx(amount=25.0)) + result = await ev.evaluate(_tx(amount="25.00")) assert result.matched is True assert result.metadata and result.metadata["violation"] == "period_budget" assert result.metadata["current_period_spend"] == pytest.approx(480.0) @@ -211,212 +394,216 @@ async def test_period_budget_violation() -> None: @pytest.mark.asyncio async def test_period_budget_exact_boundary_allowed() -> None: store = InMemorySpendStore() - ev = _make_evaluator(max_per_period=500.0, store=store) - - store.record_spend(490.0, "USDC") + ev = _make_evaluator(max_per_period="500", store=store) + store.record_spend(Decimal("490"), "USDC") - # Exactly 10 remaining — should be allowed and recorded - result = await ev.evaluate(_tx(amount=10.0)) - assert result.matched is False - # The spend should now be recorded - assert store.get_spend("USDC", time.time() - 1) == pytest.approx(500.0) - - -@pytest.mark.asyncio -async def test_period_budget_disabled_at_zero() -> None: - store = InMemorySpendStore() - ev = _make_evaluator(max_per_period=0.0, store=store) - - store.record_spend(1_000_000.0, "USDC") - result = await ev.evaluate(_tx(amount=1_000_000.0)) + result = await ev.evaluate(_tx(amount="10.00")) assert result.matched is False + since = time.time() - 5 + assert store.get_spend("USDC", since) == Decimal("500") @pytest.mark.asyncio async def test_successful_transaction_is_recorded() -> None: store = InMemorySpendStore() - ev = _make_evaluator(max_per_transaction=100.0, max_per_period=1000.0, store=store) + ev = _make_evaluator(max_per_transaction="100", max_per_period="1000", store=store) assert store.record_count() == 0 - result = await ev.evaluate(_tx(amount=50.0)) + result = await ev.evaluate(_tx(amount="50.00")) assert result.matched is False assert 
store.record_count() == 1 since = time.time() - 5 - assert store.get_spend("USDC", since) == pytest.approx(50.0) - - -@pytest.mark.asyncio -async def test_context_override_channel_max_per_transaction() -> None: - """channel_max_per_transaction in data overrides config.""" - # Base config allows up to 1000 per tx, but channel caps at 50 - ev = _make_evaluator(max_per_transaction=1000.0) - result = await ev.evaluate(_tx(amount=75.0, channel_max_per_transaction=50.0)) - assert result.matched is True - assert result.metadata and result.metadata["violation"] == "per_transaction_cap" - assert result.metadata["max_per_transaction"] == pytest.approx(50.0) - - -@pytest.mark.asyncio -async def test_context_override_channel_max_per_period() -> None: - """channel_max_per_period in data overrides config.""" - store = InMemorySpendStore() - store.record_spend(90.0, "USDC") - - # Base config has 1000 budget, but channel caps at 100 - ev = _make_evaluator(max_per_period=1000.0, store=store) - result = await ev.evaluate(_tx(amount=20.0, channel_max_per_period=100.0)) - assert result.matched is True - assert result.metadata and result.metadata["violation"] == "period_budget" + assert store.get_spend("USDC", since) == Decimal("50") @pytest.mark.asyncio async def test_multiple_sequential_transactions_accumulate() -> None: - """Verify spend accumulates correctly across multiple calls.""" store = InMemorySpendStore() - ev = _make_evaluator(max_per_transaction=100.0, max_per_period=250.0, store=store) - - for amount in (80.0, 80.0, 80.0): - r = await ev.evaluate(_tx(amount=amount)) - # First two succeed; third should breach period budget (240 + 80 = 320 > 250) - if amount == 80.0 and store.record_count() < 3: - pass # may or may not be matched depending on order + ev = _make_evaluator(max_per_transaction="100", max_per_period="250", store=store) - # After two successful txns (160 total), third of 80 → 240 which is ≤ 250 → allowed - # But a fourth of 80 → 320 which is > 250 → blocked - 
result_4 = await ev.evaluate(_tx(amount=80.0)) - assert result_4.matched is True - assert result_4.metadata and result_4.metadata["violation"] == "period_budget" + r1 = await ev.evaluate(_tx(amount="80.00")) + assert r1.matched is False + r2 = await ev.evaluate(_tx(amount="80.00")) + assert r2.matched is False + r3 = await ev.evaluate(_tx(amount="80.00")) + assert r3.matched is False # 240 <= 250 + r4 = await ev.evaluate(_tx(amount="80.00")) + assert r4.matched is True + assert r4.metadata and r4.metadata["violation"] == "period_budget" @pytest.mark.asyncio async def test_currency_case_insensitive_in_data() -> None: - """Currency in transaction data is normalized to upper-case before comparison.""" - ev = _make_evaluator(max_per_transaction=100.0, currency="USDC") - result = await ev.evaluate(_tx(amount=10.0, currency="usdc")) - assert result.matched is False # lower-case usdc should match USDC policy + ev = _make_evaluator(max_per_transaction="100", currency="USDC") + result = await ev.evaluate(_tx(amount="10.00", currency="usdc")) + assert result.matched is False # --------------------------------------------------------------------------- -# Context-scoped budget isolation tests (requested by lan17) +# BudgetLimit.scope_by — independent dimension budget isolation # --------------------------------------------------------------------------- @pytest.mark.asyncio -async def test_scoped_budget_channel_isolation() -> None: - """Spend in channel A should NOT count against channel B's budget. +async def test_scope_by_channel_isolates_budgets() -> None: + """scope_by=(channel,) gives each channel its own independent counter. - Scenario: 90 USDC in channel A, then 20 USDC in channel B with - channel_max_per_period=100. Channel B should be allowed because + lan17s specific test: 90 USDC in channel A, then 20 USDC in channel B + with a 100 USDC per-channel budget. Channel B should be ALLOWED because its scoped spend is 0, not 90. 
""" store = InMemorySpendStore() - ev = _make_evaluator(max_per_period=1000.0, store=store) + limit = BudgetLimit( + amount=Decimal("100"), + currency="USDC", + scope_by=("channel",), + window=BudgetWindow(kind="rolling", seconds=86400), + ) + ev = SpendLimitEvaluator(SpendLimitConfig(limits=[limit]), store=store) - # Record 90 USDC in channel A - r1 = await ev.evaluate(_tx(amount=90.0, channel="channel-A")) - assert r1.matched is False + r1 = await ev.evaluate(_tx(amount="90.00", channel="channel-A")) + assert r1.matched is False, f"Channel A 90 USDC should be allowed: {r1.message}" - # 20 USDC in channel B with a per-channel budget of 100 - # Should be allowed: channel B has 0 spend, not 90. - r2 = await ev.evaluate(_tx(amount=20.0, channel="channel-B", channel_max_per_period=100.0)) - assert r2.matched is False + since = time.time() - 5 + assert store.get_spend("USDC", since, scope={"channel": "channel-A"}) == Decimal("90") + + r2 = await ev.evaluate(_tx(amount="20.00", channel="channel-B")) + assert r2.matched is False, ( + f"Channel B 20 USDC should be allowed (channel B has 0 spend): {r2.message}" + ) + assert store.get_spend("USDC", since, scope={"channel": "channel-B"}) == Decimal("20") + assert store.get_spend("USDC", since, scope={"channel": "channel-A"}) == Decimal("90") @pytest.mark.asyncio -async def test_scoped_budget_same_channel_accumulates() -> None: +async def test_scope_by_channel_accumulates_within_same_channel() -> None: """Spend within the same channel accumulates correctly.""" store = InMemorySpendStore() - ev = _make_evaluator(max_per_period=1000.0, store=store) + limit = BudgetLimit( + amount=Decimal("100"), + currency="USDC", + scope_by=("channel",), + window=BudgetWindow(kind="rolling", seconds=86400), + ) + ev = SpendLimitEvaluator(SpendLimitConfig(limits=[limit]), store=store) - # 60 USDC in channel A - r1 = await ev.evaluate(_tx(amount=60.0, channel="channel-A")) + r1 = await ev.evaluate(_tx(amount="60.00", channel="channel-A")) assert 
r1.matched is False - # Another 50 USDC in channel A with channel cap of 100 - # 60 + 50 = 110 > 100 → should be denied - r2 = await ev.evaluate(_tx(amount=50.0, channel="channel-A", channel_max_per_period=100.0)) + r2 = await ev.evaluate(_tx(amount="50.00", channel="channel-A")) assert r2.matched is True assert r2.metadata and r2.metadata["violation"] == "period_budget" @pytest.mark.asyncio -async def test_scoped_budget_agent_id_isolation() -> None: - """Spend by agent-1 should NOT count against agent-2's budget.""" +async def test_scope_by_agent_id_isolation() -> None: + """scope_by=(agent_id,) isolates budgets per agent.""" store = InMemorySpendStore() - ev = _make_evaluator(max_per_period=1000.0, store=store) + limit = BudgetLimit( + amount=Decimal("100"), + currency="USDC", + scope_by=("agent_id",), + window=BudgetWindow(kind="rolling", seconds=86400), + ) + ev = SpendLimitEvaluator(SpendLimitConfig(limits=[limit]), store=store) - r1 = await ev.evaluate(_tx(amount=90.0, agent_id="agent-1")) + r1 = await ev.evaluate(_tx(amount="90.00", agent_id="agent-1")) assert r1.matched is False - # agent-2 with tight budget — should be allowed (agent-2 has 0 spend) - r2 = await ev.evaluate(_tx(amount=20.0, agent_id="agent-2", channel_max_per_period=100.0)) + r2 = await ev.evaluate(_tx(amount="20.00", agent_id="agent-2")) assert r2.matched is False @pytest.mark.asyncio async def test_global_budget_without_scope() -> None: - """When no channel/agent/session context, budget is global.""" + """scope_by=() means all spend in that currency counts together.""" store = InMemorySpendStore() - ev = _make_evaluator(max_per_period=100.0, store=store) + ev = _make_evaluator(max_per_period="100", store=store) - # No context fields → global spend - r1 = await ev.evaluate(_tx(amount=90.0)) + r1 = await ev.evaluate(_tx(amount="90.00")) assert r1.matched is False - # Still no context → global spend of 90 + 20 = 110 > 100 - r2 = await ev.evaluate(_tx(amount=20.0)) + r2 = await 
ev.evaluate(_tx(amount="20.00")) assert r2.matched is True +@pytest.mark.asyncio +async def test_multiple_limits_in_one_config() -> None: + """Global per-tx cap and per-channel period budget co-exist.""" + store = InMemorySpendStore() + cfg = SpendLimitConfig(limits=[ + BudgetLimit(amount=Decimal("200"), currency="USDC"), + BudgetLimit( + amount=Decimal("100"), + currency="USDC", + scope_by=("channel",), + window=BudgetWindow(kind="rolling", seconds=86400), + ), + ]) + ev = SpendLimitEvaluator(cfg, store=store) + + r1 = await ev.evaluate(_tx(amount="90.00", channel="channel-A")) + assert r1.matched is False + + r2 = await ev.evaluate(_tx(amount="90.00", channel="channel-B")) + assert r2.matched is False + + r3 = await ev.evaluate(_tx(amount="20.00", channel="channel-A")) + assert r3.matched is True + assert r3.metadata and r3.metadata["violation"] == "period_budget" + + r4 = await ev.evaluate(_tx(amount="210.00", channel="channel-C")) + assert r4.matched is True + assert r4.metadata and r4.metadata["violation"] == "per_transaction_cap" + + +# --------------------------------------------------------------------------- +# Malformed input — matched=False, error=None (never error=...) +# --------------------------------------------------------------------------- + + @pytest.mark.asyncio async def test_malformed_input_is_not_evaluator_error() -> None: - """Malformed input should be matched=False with error=None, not an evaluator error. + """Malformed input must return matched=False, error=None. - This is the engine-level test lan17 requested to ensure we don't - accidentally lock in result.error as a policy outcome. + The error field is reserved for evaluator crashes/timeouts/missing deps. 
""" - ev = _make_evaluator(max_per_transaction=100.0) + ev = _make_evaluator(max_per_transaction="100") - # Missing amount r1 = await ev.evaluate({"currency": "USDC", "recipient": "0xABC"}) assert r1.matched is False assert r1.error is None - # Missing currency - r2 = await ev.evaluate({"amount": 10.0, "recipient": "0xABC"}) + r2 = await ev.evaluate({"amount": "10.00", "recipient": "0xABC"}) assert r2.matched is False assert r2.error is None - # Negative amount - r3 = await ev.evaluate({"amount": -5.0, "currency": "USDC", "recipient": "0xABC"}) + r3 = await ev.evaluate({"amount": "-5.00", "currency": "USDC", "recipient": "0xABC"}) assert r3.matched is False assert r3.error is None - # Non-dict input r4 = await ev.evaluate("not a dict") assert r4.matched is False assert r4.error is None - # None input r5 = await ev.evaluate(None) assert r5.matched is False assert r5.error is None # --------------------------------------------------------------------------- -# Step normalization tests (selector.path: "*" vs "input") +# Step normalization (selector.path: "*" vs "input") # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_step_object_input_extraction() -> None: - """When selector.path is '*', data is a full Step dict. 
- Evaluator should extract transaction from 'input' key.""" - ev = _make_evaluator(max_per_transaction=100.0) + """selector.path=* passes a full Step dict; evaluator extracts from input.""" + ev = _make_evaluator(max_per_transaction="100") step_data = { "type": "tool", "name": "payment", - "input": {"amount": 50.0, "currency": "USDC", "recipient": "0xABC"}, + "input": {"amount": "50.00", "currency": "USDC", "recipient": "0xABC"}, "context": None, } result = await ev.evaluate(step_data) @@ -425,43 +612,119 @@ async def test_step_object_input_extraction() -> None: @pytest.mark.asyncio async def test_step_context_merged_into_transaction() -> None: - """Context fields from step.context should be available for scoped budgets.""" + """Context fields from step.context are available for scoped budgets.""" store = InMemorySpendStore() - ev = _make_evaluator(max_per_period=1000.0, store=store) + limit = BudgetLimit( + amount=Decimal("100"), + currency="USDC", + scope_by=("channel",), + window=BudgetWindow(kind="rolling", seconds=86400), + ) + ev = SpendLimitEvaluator(SpendLimitConfig(limits=[limit]), store=store) - # First: 90 USDC in channel-A via step context step1 = { "type": "tool", "name": "payment", - "input": {"amount": 90.0, "currency": "USDC", "recipient": "0xABC"}, + "input": {"amount": "90.00", "currency": "USDC", "recipient": "0xABC"}, "context": {"channel": "channel-A"}, } r1 = await ev.evaluate(step1) assert r1.matched is False - # Second: 20 USDC in channel-B with tight cap via step context step2 = { "type": "tool", "name": "payment", - "input": {"amount": 20.0, "currency": "USDC", "recipient": "0xABC"}, - "context": {"channel": "channel-B", "channel_max_per_period": 100.0}, + "input": {"amount": "20.00", "currency": "USDC", "recipient": "0xABC"}, + "context": {"channel": "channel-B"}, } r2 = await ev.evaluate(step2) - # Channel B has 0 scoped spend → should be allowed assert r2.matched is False @pytest.mark.asyncio async def 
test_step_context_overrides_not_clobbered_by_input() -> None: - """If input already has channel, step.context should not overwrite it.""" - ev = _make_evaluator(max_per_transaction=100.0) + """If input already has channel, step.context must NOT overwrite it. + + Asserts against actual store state to prove spend was recorded under + channel=from-input, not from-context. + """ + store = InMemorySpendStore() + ev = _make_evaluator(max_per_transaction="100", max_per_period="1000", store=store) + step_data = { "type": "tool", "name": "payment", - "input": {"amount": 10.0, "currency": "USDC", "recipient": "0xABC", "channel": "from-input"}, + "input": { + "amount": "10.00", + "currency": "USDC", + "recipient": "0xABC", + "channel": "from-input", + }, "context": {"channel": "from-context"}, } result = await ev.evaluate(step_data) assert result.matched is False - # input's channel should win (not clobbered) - assert result.metadata and result.metadata.get("channel") is None or True # just verify no crash + + since = time.time() - 5 + assert store.get_spend("USDC", since, scope={"channel": "from-input"}) == Decimal("10") + assert store.get_spend("USDC", since, scope={"channel": "from-context"}) == Decimal("0") + + +# --------------------------------------------------------------------------- +# lan17 specific channel-scope-independence test +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_lan17_channel_scope_independence() -> None: + """lan17s test: 90 USDC in channel A, then 20 USDC in channel B. + + With a 100 USDC per-channel budget (scope_by=(channel,)), the second + transaction must be ALLOWED — channel B has 0 spend. 
+ """ + store = InMemorySpendStore() + limit = BudgetLimit( + amount=Decimal("100"), + currency="USDC", + scope_by=("channel",), + window=BudgetWindow(kind="rolling", seconds=86400), + ) + ev = SpendLimitEvaluator(SpendLimitConfig(limits=[limit]), store=store) + + r1 = await ev.evaluate(_tx(amount="90.00", channel="channel-A")) + assert r1.matched is False, f"Channel A 90 USDC should be allowed: {r1.message}" + + since = time.time() - 5 + assert store.get_spend("USDC", since, scope={"channel": "channel-A"}) == Decimal("90") + + r2 = await ev.evaluate(_tx(amount="20.00", channel="channel-B")) + assert r2.matched is False, ( + f"Channel B 20 USDC should be allowed (channel B has 0 spend): {r2.message}" + ) + + assert store.get_spend("USDC", since, scope={"channel": "channel-B"}) == Decimal("20") + assert store.get_spend("USDC", since, scope={"channel": "channel-A"}) == Decimal("90") + + +# --------------------------------------------------------------------------- +# Fixed window (calendar-aligned) budget +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_fixed_window_day_budget() -> None: + """Fixed-day window budget works (uses UTC approximation).""" + store = InMemorySpendStore() + limit = BudgetLimit( + amount=Decimal("100"), + currency="USDC", + window=BudgetWindow(kind="fixed", unit="day"), + ) + ev = SpendLimitEvaluator(SpendLimitConfig(limits=[limit]), store=store) + + r1 = await ev.evaluate(_tx(amount="90.00")) + assert r1.matched is False + + r2 = await ev.evaluate(_tx(amount="20.00")) + assert r2.matched is True + assert r2.metadata and r2.metadata["violation"] == "period_budget" diff --git a/evaluators/contrib/financial-governance/tests/test_transaction_policy.py b/evaluators/contrib/financial-governance/tests/test_transaction_policy.py index 3b310085..35b5dcfb 100644 --- a/evaluators/contrib/financial-governance/tests/test_transaction_policy.py +++ 
b/evaluators/contrib/financial-governance/tests/test_transaction_policy.py @@ -2,6 +2,7 @@ from __future__ import annotations +from decimal import Decimal from typing import Any import pytest @@ -27,20 +28,20 @@ def test_config_defaults_are_permissive() -> None: cfg = TransactionPolicyConfig() assert cfg.allowed_recipients == [] assert cfg.blocked_recipients == [] - assert cfg.min_amount == 0.0 - assert cfg.max_amount == 0.0 + assert cfg.min_amount == Decimal("0") + assert cfg.max_amount == Decimal("0") assert cfg.allowed_currencies == [] def test_config_max_amount_lt_min_raises() -> None: with pytest.raises(ValidationError, match="max_amount"): - TransactionPolicyConfig(min_amount=100.0, max_amount=10.0) + TransactionPolicyConfig(min_amount=Decimal("100"), max_amount=Decimal("10")) def test_config_max_equals_min_is_valid() -> None: - cfg = TransactionPolicyConfig(min_amount=50.0, max_amount=50.0) - assert cfg.min_amount == 50.0 - assert cfg.max_amount == 50.0 + cfg = TransactionPolicyConfig(min_amount=Decimal("50"), max_amount=Decimal("50")) + assert cfg.min_amount == Decimal("50") + assert cfg.max_amount == Decimal("50") # --------------------------------------------------------------------------- @@ -231,7 +232,7 @@ async def test_blocked_beats_allowlist() -> None: @pytest.mark.asyncio async def test_amount_below_minimum_is_blocked() -> None: - ev = _make_evaluator(min_amount=10.0) + ev = _make_evaluator(min_amount=Decimal("10")) result = await ev.evaluate(_tx(amount=9.99)) assert result.matched is True assert result.metadata and result.metadata["violation"] == "amount_below_minimum" @@ -239,14 +240,14 @@ async def test_amount_below_minimum_is_blocked() -> None: @pytest.mark.asyncio async def test_amount_at_minimum_passes() -> None: - ev = _make_evaluator(min_amount=10.0) + ev = _make_evaluator(min_amount=Decimal("10")) result = await ev.evaluate(_tx(amount=10.0)) assert result.matched is False @pytest.mark.asyncio async def 
test_amount_above_maximum_is_blocked() -> None: - ev = _make_evaluator(max_amount=1000.0) + ev = _make_evaluator(max_amount=Decimal("1000")) result = await ev.evaluate(_tx(amount=1000.01)) assert result.matched is True assert result.metadata and result.metadata["violation"] == "amount_exceeds_maximum" @@ -254,14 +255,14 @@ async def test_amount_above_maximum_is_blocked() -> None: @pytest.mark.asyncio async def test_amount_at_maximum_passes() -> None: - ev = _make_evaluator(max_amount=1000.0) + ev = _make_evaluator(max_amount=Decimal("1000")) result = await ev.evaluate(_tx(amount=1000.0)) assert result.matched is False @pytest.mark.asyncio async def test_amount_bounds_disabled_at_zero() -> None: - ev = _make_evaluator(min_amount=0.0, max_amount=0.0) + ev = _make_evaluator(min_amount=Decimal("0"), max_amount=Decimal("0")) result = await ev.evaluate(_tx(amount=0.001)) assert result.matched is False result2 = await ev.evaluate(_tx(amount=1_000_000_000.0)) @@ -279,8 +280,8 @@ async def test_full_policy_passes_compliant_transaction() -> None: allowed_currencies=["USDC", "USDT"], blocked_recipients=["0xDEAD"], allowed_recipients=["0xALICE", "0xBOB"], - min_amount=1.0, - max_amount=5000.0, + min_amount=Decimal("1"), + max_amount=Decimal("5000"), ) result = await ev.evaluate(_tx(amount=250.0, currency="USDC", recipient="0xALICE")) assert result.matched is False