diff --git a/.gitignore b/.gitignore
index 41ff351..6f98452 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,8 +4,6 @@ hadrian.toml
 /coverage
 .kreuzberg
 todo/
-AGENTS.md
-CLAUDE.md
 
 # Model catalog (fetched via scripts/fetch-model-catalog.sh)
 data/models-dev-catalog.json
diff --git a/AGENTS.md b/AGENTS.md
new file mode 120000
index 0000000..681311e
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1 @@
+CLAUDE.md
\ No newline at end of file
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..56e02ae
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,681 @@
+# CLAUDE.md
+
+Hadrian is an AI Gateway that provides a unified OpenAI-compatible API for routing requests to multiple LLM providers.
+
+Its purpose is to provide a high-quality, high-performance, production-ready AI Gateway for LLMs that's COMPLETELY open source and free to use with no restrictions.
+
+All 'enterprise' features are fully supported and free to use. It should run on anything from a Raspberry Pi to globally distributed multi-node multi-region cloud infrastructure.
+
+Code and documentation quality should be very high, and the project should be well-maintained.
+
+It should also provide the best-in-class interface for interacting with multiple models in a single conversation. It should support complex modes of interaction and push forward the state of the art.
+
+Features:
+
+- Web UI: multimodel chat, chat modes, reasoning, frontend tools (Python/JS/SQL/Charts), MCP, admin panel
+- Studio: image generation, TTS, transcription/translation with multimodel execution, and cost tracking
+- Single binary, single config file deployment
+- OpenAI-compatible API with OpenAPI docs and Scalar UI
+- Multi-tenancy (organizations, teams, projects, users)
+- Auth: API keys, OIDC, OAuth, Identity-Aware Proxy (IAP), CEL-based RBAC
+- Budget enforcement, usage tracking, cost tracking with microcents precision and forecasting (MSTL/ETS)
+- Guardrails (blocklist, PII, content limits), response validation
+- Dynamic model routing, provider health checks, fallbacks
+- Dynamic providers: user/org/team/project-scoped custom provider management
+- Model catalog: models.dev integration with background sync
+- Image generation, audio (TTS, transcription, translation)
+- Knowledge Bases / RAG: file upload, text extraction, chunking, vector search, re-ranking
+- Integrations: SQLite/Postgres, Redis, OpenTelemetry, Vault, S3
+
+The backend is written in Rust and uses Axum for routing and middleware.
+The frontend is written in React and TypeScript, with TailwindCSS for styling.
+
+## General guidelines
+
+- Write high-quality, readable, and maintainable code worthy of a senior software engineer or architect
+- Write idiomatic, terse code using modern language features; avoid overly complicated or verbose constructs
+- Rely on linting, formatting, and type checking to catch issues and write clean code
+- Aim for high test coverage. Write tests for all new code
+- Architect for data-intensive workloads but support single-user use cases as well
+- Consider the performance implications of all existing and new code
+- There has not been a release yet, so don't worry about breaking changes or maintaining backwards compatibility in the backend and frontend
+  - Modify the existing database migration file and schema as needed
+  - Make sure SQLite and Postgres are kept in sync
+  - Update the API, modules, classes, functions, types, etc. as needed
+  - Update the config file as needed
+- Don't leave behind unused imports, `todo!`s, or dead code.
+  - Implement the functionality or explain why it can't be done yet
+  - For functionality that will be useful in the future, ask the user whether to keep it or implement that functionality now
+
+## Specific guidelines
+
+Read the files in the `agent_instructions` directory for details on the following, as needed:
+
+- `adding_admin_endpoint.md` — Adding admin endpoints (includes pagination patterns)
+- `adding_frontend_tool.md` — Adding frontend tools
+- `adding_provider.md` — Adding LLM providers
+- `database_changes.md` — Database migrations and schema changes
+- `modifying_chat_ui.md` — Chat UI performance patterns (stores, selectors, memoization). Read before making changes to the chat UI.
+
+## Backend
+
+### Build & Development Commands
+
+```bash
+cargo build                     # Build project
+cargo build --release           # Release build
+cargo test                      # Run unit tests
+cargo test -- --ignored         # Run integration tests
+cargo clippy                    # Lint code
+cargo +nightly fmt              # Format code (requires nightly)
+cargo fix                       # Fix lints
+cargo run                       # Run with default config (hadrian.toml)
+cargo run -- --config path.toml # Run with custom config
+cd deploy/tests && pnpm test    # Run end-to-end tests with testcontainers
+./scripts/coverage.sh           # Generate code coverage report
+```
+
+### Cargo Features
+
+Hierarchical feature profiles (default: `full`):
+
+- **`tiny`** — OpenAI + Test providers, no DB, no embedded assets (stateless proxy)
+- **`minimal`** — tiny + all providers (Anthropic, Azure, Bedrock, Vertex), SQLite, embedded UI, embedded catalog, wizard (dev/Windows/embedded)
+- **`standard`** — minimal + Postgres, Redis, OTLP, Prometheus, SSO, CEL, doc extraction, OpenAPI docs, S3, secrets managers (AWS/Azure/GCP/Vault)
+- **`full`** — standard + SAML, Kreuzberg, ClamAV
+- **`headless`** — all `full` features except embedded assets (UI, docs, catalog). Used by `cargo install` and for deployments that serve the frontend separately.
+
+```bash
+cargo build --no-default-features --features tiny       # Smallest binary
+cargo build --no-default-features --features minimal    # Fast compile
+cargo build --no-default-features --features standard   # Typical deployment
+cargo build                                             # Full (default)
+cargo build --no-default-features --features headless   # Full features, no embedded assets
+```
+
+Run `hadrian features` to list enabled/disabled features at runtime. CI tests `minimal`, `standard`, and `headless` profiles; Windows uses `minimal` to avoid OpenSSL.
+
+To list files, use `/usr/bin/ls` instead of `ls`, because plain `ls` will use exa.
+When using the sleep command, don't pass `-s 5`; use `sleep 5s` instead.
+
+Server runs on `http://0.0.0.0:8080` by default.
+
+After making changes to the backend, run the following:
+- `cargo check` to check for compile errors
+- `cargo clippy` to lint code
+- `cargo +nightly fmt` to format code (requires nightly)
+- `cargo test` to run tests
+
+### CI Pipeline
+
+GitHub Actions workflow (`.github/workflows/ci.yml`) runs:
+- Backend: format, clippy, build, test, security audit (`cargo audit`, `cargo-deny`)
+- Frontend: lint, format, type check, build, security audit (`pnpm audit`)
+- Cross-platform builds (Linux, macOS Intel/ARM, Windows)
+- Docker build (shared image used by E2E tests)
+- E2E tests (TypeScript/Playwright with testcontainers, needs Docker build)
+- OpenAPI conformance check
+- Documentation build
+
+### Release Pipeline
+
+GitHub Actions workflow (`.github/workflows/release.yml`) triggers on version tags (`v*`) or manual dispatch (with dry-run option):
+- Builds frontend assets (UI, Storybook, docs) in a shared job
+- Builds release binaries for each target/feature combination:
+  - `x86_64-unknown-linux-gnu` (full, standard, minimal, tiny)
+  - `x86_64-unknown-linux-musl` (standard, minimal, tiny)
+  - `aarch64-unknown-linux-gnu` (standard, minimal, tiny)
+  - `aarch64-apple-darwin` (full, standard, minimal, tiny)
+  - `x86_64-pc-windows-msvc` (standard, minimal, tiny)
+- Creates GitHub Release with archives and SHA256 checksums (tag push only)
+- Dry-run mode builds artifacts and prints a summary without creating a release
+
+Helm chart workflow (`.github/workflows/helm.yml`) runs:
+- `helm lint` (standard and strict mode)
+- `helm template` with matrix of configurations (PostgreSQL, Redis, Ingress, etc.)
+- Schema validation with `ajv-cli`
+- Integration tests in ephemeral kind cluster
+
+## Architecture Overview
+
+### Multi-tenancy Hierarchy
+
+- **Organization** → top-level container; can have many **Users**, **Teams**, and **Projects**
+- **Team** → belongs to an Organization; can have many **Users** and **Projects**
+- **User** → belongs to an Organization (and optionally Teams); can own **Projects**
+- **Project** → owned by Organization, Team, or User; serves as workspace boundary
+
+**Resources** (owned by Teams, Users, or Projects):
+
+- Conversations
+- Providers
+- API Keys
+- Vector Stores
+- Files
+
+### Principal Model
+
+The Principal abstraction represents "who is making the request" regardless of credential type:
+
+- **User**: Human identity from OIDC/SAML/proxy or user-owned API key
+- **ServiceAccount**: Machine identity with explicit roles (service account-owned API key)
+- **Machine**: Shared credential (org/team/project-owned API key, no roles)
+
+Service accounts enable RBAC for API key authentication by providing roles that flow into CEL policy evaluation. When an API key owned by a service account is used, the service account's roles are mapped through `role_mapping` and included in the RBAC Subject.
+
+All admin endpoints use `authz.require()` for role-based access control. See `src/routes/admin/teams.rs` as a reference implementation.
+
+### Authorization (RBAC)
+
+Hadrian uses a two-tier CEL-based RBAC system:
+
+1. **System Policies** (global) — Defined in `hadrian.toml`, controlled by platform operators
+2. **Organization Policies** (per-org) — Stored in database, managed by org admins at runtime via `/admin/v1/orgs/{org_slug}/rbac-policies`
+
+**Evaluation order:**
+1. Check if RBAC disabled → allow all
+2. Evaluate system policies (config) in priority order → if match, return decision
+3. If `org_id` provided, evaluate org policies (database) in priority order → if match, return decision
+4. No match → apply `default_effect` (typically "deny" for admin, "allow" for API)
+
+**Middleware usage:**
+- `authz.require(resource, action)` — System policies only (admin endpoints)
+- `authz.require_api(resource, action)` — System + org policies (API endpoints)
+
+### Membership Model
+
+**Membership Source Tracking:**
+Organization and project memberships track their source for auditability:
+- `manual` — Added by an admin via API/UI
+- `jit` — Just-In-Time provisioned during SSO authentication
+- `scim` — Provisioned via SCIM protocol from an IdP
+
+**Single-Org Membership Constraint:**
+Each user can only belong to one organization at a time. This is enforced by a database unique index (`idx_org_memberships_single_org`), which is race-condition safe and returns a conflict error when violated.
+
+### Per-Organization SSO
+
+Per-org SSO allows each organization to configure its own identity provider (OIDC or SAML), replacing the global OIDC configuration. This enables multi-tenant deployments where different organizations use different IdPs.
+
+**Key concepts:**
+- SSO configs are stored in the database per organization (`org_sso_configs` table)
+- Client secrets are stored in an external secrets manager (Vault, AWS, etc.)
+- OIDC authenticators are lazily loaded when first needed
+- SSO enforcement modes: `optional`, `test` (shadow mode), `required`
+- Bearer token validation extracts org from JWT claim and validates against that org's IdP
+- Gateway JWT flow: decode `iss` → per-org registry lookup → lazy-load from DB → fall back to global JWT validator
+- `GatewayJwtRegistry` is pre-loaded at startup and kept in sync by SSO config CRUD
+- `AppState.global_jwt_validator` caches the global JWT validator so JWKS isn't re-fetched per request
+
+### Request Flow
+
+1. **Client** sends request to gateway
+2. **Middleware Pipeline** processes in order: init usage tracker → authenticate → check budget
+3. **Route Handler** parses model string, resolves provider (static config or dynamic from DB)
+4. **LLM Provider** forwards request, streams response
+5. **Usage Tracking** records tokens/cost asynchronously with full principal attribution (user, org, project, team, service account)
+
+### Document Processing Flow (RAG)
+
+1. **File Upload** (`POST /v1/files`) — Store raw file in database
+2. **Add to Vector Store** (`POST /v1/vector_stores/{id}/files`) — Trigger processing
+   - Note: 'Vector Stores' are called 'Knowledge Bases' in the UI. Do not refer to them as 'Vector Stores' there.
+3. **Document Processor** (inline or queue mode):
+   - Extract text via Kreuzberg (PDF, DOCX, HTML, etc.)
+   - OCR for scanned documents (optional)
+   - Chunk text (auto or fixed-size strategy)
+   - Generate embeddings per chunk
+   - Store in vector database with `processing_version`
+4. **Shadow-copy cleanup** — Delete old chunks only after successful processing
+5. **File status** updated to "completed" or "failed"
+
+Key patterns:
+- **Shadow-copy**: New chunks stored with `processing_version`, old deleted only on success
+- **Idempotent re-processing**: Failed files can be re-added to trigger reprocessing
+- **Stale detection**: In-progress files auto-reset after timeout (default 30 min)
+
+### Chat Modes Architecture
+
+The chat UI supports multiple interaction modes via pluggable handlers. The Mode Runner dispatches to mode-specific handlers that orchestrate LLM streams and aggregate responses.
+
+**Available modes:** synthesized, chained, debated, council, hierarchical, refined, routed, critiqued, elected, tournament, consensus, scattershot, explainer, confidence
+
+Modes use **instance IDs** (not model IDs) for role assignment to support multiple instances of the same model with different settings.
+
+### Frontend Tools Architecture
+
+Client-side tool execution runs in the browser via WASM. When the LLM returns `tool_calls`, the Tool Executor Registry dispatches to the appropriate executor:
+
+- **Pyodide** — Python execution (numpy, pandas, matplotlib available)
+- **QuickJS** — JavaScript execution (sandboxed)
+- **DuckDB** — SQL queries against uploaded CSV/Parquet files
+- **Vega** — Chart generation from Vega-Lite specs
+- **HTML** — Sandboxed iframe preview
+
+Tool results are sent back to the LLM to continue the conversation. Artifacts (charts, tables, images) are displayed inline in the chat.
+
+### Provider Features
+
+- **Thinking/Reasoning**: Anthropic extended thinking, OpenAI o1/o3 reasoning, Bedrock/Vertex native conversion. Configurable budget tokens and effort levels.
+- **Prompt Caching**: Anthropic `cache_control` messages, tracks cache creation/read tokens in usage.
+- **Image Support**: Base64 input (all providers), URL-based input for Anthropic (HTTPS only), image generation via `/v1/images/generations`.
+- **Audio Support**: TTS (`/v1/audio/speech`), transcription (`/v1/audio/transcriptions`), translation (`/v1/audio/translations`).
+
+### Studio
+
+Multi-model tool execution UI for image generation, TTS, transcription, and translation. Supports simultaneous execution across providers with cost tracking. Uses OPFS for client-side audio storage.
+
+### Dynamic Providers
+
+Users, orgs, teams, and projects can configure their own LLM providers at runtime. Credentials stored via secrets manager integration. Resolved during request routing with caching.
+
+### Model Catalog
+
+Embedded model metadata from models.dev with background sync worker. Provides capabilities, pricing, context limits, and modalities per model. Configurable via `[features.model_catalog]`.
+
+### Cost Tracking & Forecasting
+
+Usage tracked in microcents precision (1/1,000,000 of a dollar). `X-Cost-Microcents` response header. Forecasting via MSTL (14+ days data) with AutoETS fallback. 95% prediction intervals and budget exhaustion projection.
+
+### Performance Considerations
+
+- Database queries in API hot path should use caching
+- Avoid allocations in frequently called code
+- Use `Cow<str>` instead of `String::from()` where possible
+
+## Testing
+
+- Unit tests go in the same file as the code (`#[cfg(test)]`)
+- E2E tests use the TypeScript test suite in `deploy/tests/` with testcontainers
+- Test both SQLite and PostgreSQL paths for database code
+
+### Provider Testing (Wiremock)
+
+Provider e2e tests use recorded fixtures instead of live API calls:
+- Fixtures in `tests/fixtures/providers/{provider}/` (JSON request/response pairs)
+- Tests in `src/tests/provider_e2e.rs` using `rstest` for parameterization
+- Adding a provider = add `ProviderTestSpec` + fixture files
+- Record new fixtures: `cargo run --bin record_fixtures -- --help`
+- Set `HADRIAN_TEST_DEBUG=1` to save test responses to `tests/fixtures/providers/_debug/`
+
+### University E2E Tests
+
+Comprehensive deployment tests with Keycloak OIDC and CEL-based RBAC policies:
+
+```bash
+cd deploy/tests && pnpm test university    # Run university tests
+cd deploy/tests && pnpm test -- --grep "CEL"  # Run tests matching pattern
+```
+
+Tests cover:
+- OIDC authentication flow (token acquisition, claim verification)
+- CEL policy enforcement (cross-org isolation, role boundaries)
+- Budget enforcement and usage tracking
+- RAG/vector stores with cross-org permission isolation
+- Streaming API (SSE format, chunked responses)
+
+## API Conventions
+
+- All admin endpoints under `/admin/v1/`
+- OpenAI-compatible endpoints under `/v1/`
+  - All endpoints should conform to the OpenAI OpenAPI spec, with clearly-marked Hadrian-specific extensions
+  - Mark extension fields with `**Hadrian Extension:**` at the start of their doc comment; run `./scripts/openapi-conformance.py` to verify
+  - Reference specs in `openapi/` directory (OpenAI, Anthropic, OpenRouter) — use local copies, fetch with `./scripts/fetch-openapi-specs.sh`
+- Use plural nouns for resources (`/admin/v1/users`, not `/user`)
+- Return JSON with consistent error shapes
+
+### Cursor-Based Pagination
+
+All list endpoints use cursor-based (keyset) pagination for stable, performant navigation. Do not use offset-based pagination.
+
+**Query parameters:**
+- `limit` (optional): Max records per page (default: 100, max: 1000)
+- `cursor` (optional): Opaque base64 cursor from previous response
+- `direction` (optional): `forward` (default) or `backward`
+
+**Response format:**
+```json
+{
+  "data": [...],
+  "pagination": {
+    "limit": 100,
+    "has_more": true,
+    "next_cursor": "MTczMzU4MDgwMDAwMDphYmMxMjM0...",
+    "prev_cursor": null
+  }
+}
+```
+
+**Important:** Truncate timestamps to milliseconds when creating entities, since cursors use millisecond precision. This prevents comparison issues in SQLite (which stores DateTime as TEXT).
+
+See `agent_instructions/adding_admin_endpoint.md` for implementation patterns (route handler, repository SQL, cursor encoding).
+
+## Configuration
+
+- Config file: `hadrian.toml` (TOML format)
+- Environment variables: use `${VAR_NAME}` syntax for interpolation
+- Secrets are automatically redacted in logs and API responses
+- See `src/config/` for all configuration options
+
+### Top-Level Config Sections
+
+| Section | Description |
+|---------|-------------|
+| `[server]` | HTTP server (host, port, TLS, CORS, trusted proxies, security headers) |
+| `[database]` | SQLite or PostgreSQL connection, pool settings, read replicas |
+| `[cache]` | In-memory or Redis cache for sessions, rate limits, API key lookups |
+| `[auth]` | Authentication mode (`none`, `api_key`, `idp`, `iap`), API key settings, per-org SSO, RBAC (CEL policies), session config |
+| `[providers]` | LLM providers (OpenAI, Anthropic, Bedrock, Vertex, Azure), retries, fallbacks, health checks |
+| `[limits]` | Rate limits, budget enforcement, request size limits |
+| `[features]` | Feature flags (see below) |
+| `[observability]` | Logging, tracing (OTLP), metrics (Prometheus), usage tracking, response validation |
+| `[ui]` | Web UI settings, branding, file upload limits, admin panel |
+| `[pricing]` | Model pricing for cost calculation and budget enforcement |
+| `[secrets]` | External secrets managers (Vault, AWS Secrets Manager, Azure Key Vault, GCP) |
+| `[retention]` | Data retention policies for automatic purging |
+| `[storage]` | File storage backend (local filesystem, S3-compatible) |
+
+### Key Provider Options
+
+- `[providers.<name>]` — Define providers (openai, anthropic, bedrock, vertex, azure_openai, test)
+- `fallback_providers` — List of providers to try on 5xx errors
+- `retries` — Per-provider retry settings (max_attempts, delays, backoff)
+- `health_check` — Background health monitoring
+- `circuit_breaker` — Automatic provider disabling on repeated failures
+- `streaming_buffer` — Buffer size for SSE streaming
+
+### Feature Flags
+
+- `[features.file_search]` — Knowledge Bases / RAG / vector search (embedding model, vector backend, chunking, reranking)
+- `[features.file_processing]` — RAG document ingestion (text extraction, OCR, chunking)
+- `[features.guardrails]` — Input/output guardrails (blocklist, PII detection, moderation APIs)
+- `[features.response_caching]` — Response caching with optional semantic similarity matching
+- `[features.prompt_caching]` — Anthropic prompt caching support
+- `[features.image_fetching]` — Fetch images from URLs for vision models
+- `[features.web_search]` — Web search tool integration
+- `[features.code_execution]` — Server-side code execution
+- `[features.model_catalog]` — Model metadata enrichment from models.dev
+- `[features.websocket]` — WebSocket for real-time events
+- `[features.vector_store_cleanup]` — Background cleanup for soft-deleted vector stores
+- `[features.fallback]` — Fallback and retry configuration
+- `[features.load_balancing]` — Load balancing configuration
+
+## Caching
+
+- In-memory cache for single-node deployments (`src/cache/`)
+- Redis required for multi-node deployments (for cache invalidation sync)
+- Cache API keys, user data, and provider configs
+- Invalidate cache on write operations
+
+## Key Files
+
+### Backend — Core
+
+- `src/main.rs` — Entry point only (module declarations, `main()`)
+- `src/app.rs` — `AppState` struct/construction, `build_app()` router setup, embedded assets
+- `src/init.rs` — Service initialization helpers (providers, secrets, embeddings)
+- `src/cli/` — CLI commands (`mod.rs` dispatch, `server.rs`, `worker.rs`, `bootstrap.rs`, `migrate.rs`, `init.rs`, `features.rs`, `openapi.rs`)
+- `src/config/mod.rs` — Configuration structures
+- `src/routes/api/` — API handlers split by domain (`chat.rs`, `embeddings.rs`, `models.rs`, `images.rs`, `audio.rs`, `files.rs`, `vector_stores.rs`)
+- `src/routes/admin/` — Admin handlers
+- `src/middleware/` — Axum middleware layers (auth, authz, rate limiting, security headers)
+- `src/db/repos/` — Repository traits for data access
+- `src/db/repos/cursor.rs` — Cursor-based pagination types (`Cursor`, `ListParams`, `ListResult`)
+- `openapi/` — Reference OpenAPI specs for providers
+- `src/openapi.rs` — OpenAPI schema and `PaginationMeta` type
+
+### Backend — Providers & Routing
+
+- `src/providers/` — LLM providers (openai, anthropic, bedrock, vertex, azure_openai)
+- `src/routing/resolver.rs` — Dynamic provider resolution
+- `src/models/dynamic_provider.rs` — Dynamic provider model
+- `src/routes/admin/dynamic_providers.rs` — Dynamic provider admin endpoints
+- `src/routes/admin/me_providers.rs` — Self-service provider endpoints
+- `src/jobs/provider_health_check.rs` — Background provider health monitoring
+
+### Backend — Auth & RBAC
+
+- `src/auth/principal.rs` — Principal derivation and Subject conversion
+- `src/models/service_account.rs` — Service account model and validation
+- `src/routes/admin/service_accounts.rs` — Service account admin endpoints
+- `src/middleware/layers/authz.rs` — Request authorization middleware, service account role injection
+- `src/authz/engine.rs` — CEL evaluation engine
+- `src/authz/registry.rs` — `PolicyRegistry` with per-org caching
+- `src/models/org_rbac_policy.rs` — Org policy models
+- `src/services/org_rbac_policies.rs` — Policy service with CEL validation
+- `src/routes/admin/org_rbac_policies.rs` — Org RBAC policy admin endpoints
+- `src/routes/admin/org_sso_configs.rs` — SSO config CRUD endpoints
+- `src/services/org_sso_configs.rs` — SSO config service layer
+- `src/middleware/layers/admin.rs` — Admin middleware, per-org JWT validation
+- `src/routes/auth.rs` — Auth routes, lazy OIDC authenticator loading
+- `src/auth/gateway_jwt.rs` — Per-org gateway JWT validator registry (issuer → org routing)
+- `src/auth/discovery.rs` — Shared OIDC discovery with SSRF validation
+
+### Backend — Knowledge Bases / RAG
+
+- `src/services/document_processor.rs` — File processing, text extraction, chunking
+- `src/services/file_search.rs` — Vector search, re-ranking, result formatting
+- `src/services/file_search_tool.rs` — file_search tool interception for Responses API
+- `src/cache/vector_store/` — Vector store backends (pgvector, Qdrant, etc.)
+- `src/db/repos/vector_stores.rs` — Vector store and file metadata repository
+- `src/jobs/vector_store_cleanup.rs` — Background cleanup for soft-deleted stores
+- `src/models/vector_store.rs` — VectorStore and VectorStoreFile models
+
+### Backend — Usage, Cost & Observability
+
+- `src/models/usage.rs` — `UsageLogEntry` with principal attribution fields
+- `src/services/usage.rs` — Usage analytics service (scoped queries by org, team, project, user, API key)
+- `src/routes/admin/usage.rs` — Usage admin endpoints including self-service `/admin/v1/me/usage/*`
+- `src/usage_buffer.rs` — Async usage buffering
+- `src/usage_sink.rs` — OTLP usage export with attribution attributes
+- `src/services/forecasting.rs` — Cost forecasting (MSTL/ETS)
+- `src/pricing/` — Model pricing calculations
+- `src/guardrails/` — Input/output guardrails (blocklist, PII, moderation APIs)
+- `src/validation/` — Response validation against OpenAI schema
+- `src/observability/siem/` — SIEM formatters
+
+### Backend — Other
+
+- `src/catalog/` — Model catalog registry
+- `src/jobs/model_catalog_sync.rs` — Background model catalog sync worker
+- `src/dlq/` — Dead letter queue
+- `src/events/mod.rs` — Event system
+- `src/retention/` — Data retention enforcement
+- `src/config/auth.rs` — `RbacConfig` for system policies
+- `src/db/postgres/users.rs` — Postgres user repo (including `add_to_org` constraint handling)
+- `src/db/sqlite/users.rs` — SQLite user repo
+
+### Frontend — Chat
+
+- `ui/src/stores/streamingStore.ts` — Token streaming state (ephemeral)
+- `ui/src/stores/conversationStore.ts` — Persistent messages (IndexedDB)
+- `ui/src/stores/chatUIStore.ts` — UI preferences (session-only)
+- `ui/src/stores/mcpStore.ts` — MCP server connections (localStorage)
+- `ui/src/stores/websocketStore.ts` — WebSocket events
+- `ui/src/stores/debugStore.ts` — Debug capture
+- `ui/src/pages/chat/modes/` — Mode handlers (14 modes)
+- `ui/src/pages/chat/modes/runner.ts` — Mode execution orchestration
+- `ui/src/pages/chat/modes/types.ts` — ModeHandler interface and context types
+- `ui/src/pages/chat/utils/toolExecutors.ts` — Tool executor registry and implementations
+- `ui/src/components/ChatMessageList/ChatMessageList.tsx` — Virtualized message list
+- `ui/src/components/MultiModelResponse/MultiModelResponse.tsx` — Model response cards
+- `ui/src/components/ModeProgress/` — Mode-specific progress UI components
+- `ui/src/hooks/useAutoScroll.ts` — Smart auto-scroll behavior
+- `ui/src/hooks/useIndexedDB.ts` — IndexedDB persistence for conversations
+
+### Frontend — Tools & Services
+
+- `ui/src/services/pyodide/` — Python execution via Pyodide WASM
+- `ui/src/services/quickjs/` — JavaScript execution via QuickJS WASM
+- `ui/src/services/duckdb/` — SQL queries via DuckDB WASM
+- `ui/src/services/mcp/` — MCP client and protocol types
+- `ui/src/services/opfs/` — OPFS audio storage
+- `ui/src/components/ToolExecution/` — Tool execution timeline UI
+- `ui/src/components/Artifact/` — Artifact rendering (charts, tables, images, code)
+
+### Frontend — Pages & Layout
+
+- `ui/src/pages/studio/` — Studio feature (image gen, TTS, transcription)
+- `ui/src/components/Studio/` — Studio UI components
+- `ui/src/components/UsageDashboard/` — Reusable usage dashboard with `UsageScope` discriminated union
+- `ui/src/pages/MyUsagePage.tsx` — Self-service usage page at `/usage`
+- `ui/src/components/AdminLayout/` — Dedicated admin area with its own sidebar
+- `ui/src/components/AppLayout/` — Main app layout with chat sidebar
+- `ui/src/components/VectorStores/` — Vector store UI components
+- `ui/src/api/` — Generated API client
+
+### Helm Chart
+
+- `helm/hadrian/` — Chart directory (Chart.yaml, values.yaml, values.schema.json)
+- `helm/hadrian/templates/` — Kubernetes manifests (deployment, configmap, secret, service, ingress, HPA, PDB, etc.)
+- `helm/hadrian/README.md` — Chart documentation with examples
+
+### Documentation
+
+- `docs/content/docs/` — MDX documentation pages
+- `docs/content/docs/api/` — Auto-generated OpenAPI documentation
+- `docs/lib/source.ts` — Content source configuration
+- `docs/lib/openapi.ts` — OpenAPI integration
+- `docs/components/story-embed.tsx` — Storybook iframe wrapper
+- `docs/scripts/generate-openapi-docs.ts` — OpenAPI page generator
+
+## Debugging Tips
+
+- Set `RUST_LOG=debug` for verbose logging
+- Use `observability.logging.format = "pretty"` for human-readable logs
+- Check `/health` endpoint for database connectivity
+- Documentation at `/docs`, API reference at `/api/docs` (Scalar)
+
+## Frontend
+
+The UI is in the `ui/` directory and uses:
+- React 19 with TypeScript
+- TailwindCSS for styling
+- Storybook for component development
+- @tanstack/react-query for data fetching
+- hey-api for OpenAPI client generation
+
+```bash
+pnpm install           # Install dependencies
+pnpm dev               # Start dev server
+pnpm build             # Production build
+pnpm lint              # Lint code
+pnpm format            # Format code
+pnpm storybook         # Component development
+pnpm test-storybook    # Run Storybook tests with vitest
+pnpm openapi-ts        # Regenerate from /api/openapi.json
+```
+
+### Frontend Conventions
+
+- Run the `./scripts/generate-openapi.sh` script to generate the OpenAPI client
+- Use React Query for all API calls (via generated hey-api client)
+- Components are in `ui/src/components/` with PascalCase directories.
+- Pages and large components should be broken down into multiple components.
+- Each component must have a `.stories.tsx` file for Storybook
+- Prefer Tailwind utility classes over custom CSS
+
+#### Accessibility (WCAG 2.1 AA)
+
+All UI components must meet WCAG 2.1 AA standards. Two tools enforce this automatically:
+
+- **`eslint-plugin-jsx-a11y`** — Static linting (runs with `pnpm lint`). Catches missing labels, invalid ARIA attributes, etc.
+- **`@storybook/addon-a11y`** — Runtime axe-core testing (runs with `pnpm test-storybook`). Set to `error` mode in `ui/.storybook/preview.ts` — all story files must pass.
+
+When writing new components:
+- Add `aria-label` to icon-only buttons (e.g., `aria-label="Copy code"`)
+- Associate form controls with labels (`useId()` + `htmlFor`, or `aria-label` for switches/toggles)
+- Use theme CSS variables for text colors — don't hard-code Tailwind colors below `-700` (light) or above `-400` (dark) on white/dark backgrounds
+- Don't reduce text opacity (no `/60`, `/70`, `/80` suffixes on `text-muted-foreground`)
+- Add `sr-only` text for empty table headers (action columns) and visually hidden labels
+- Add `tabIndex={0}` to scrollable containers that aren't natively focusable
+- For Storybook false positives (landmark nesting, heading order in isolation), suppress per-story via `parameters.a11y.config.rules` — never disable globally
+
+After making changes to the frontend, run the following:
+- `pnpm lint:fix` to fix lint errors
+- `pnpm format` to format code
+- `pnpm test-storybook` to run Storybook tests
+- `pnpm build` to build the production bundle
+
+Lint, formatting, and a11y errors must be resolved before finishing a change. If an error needs to be ignored instead, always ask the user first and explain why.
+
+### Chat UI Performance
+
+The chat UI is designed for high-performance multi-model streaming. When modifying chat components, preserve these patterns:
+
+- **6 Zustand stores**: streamingStore (ephemeral tokens), conversationStore (IndexedDB), chatUIStore (session), mcpStore (localStorage), websocketStore (real-time events), debugStore (debug capture)
+- **Surgical selectors**: Always use provided selector hooks (e.g., `useStreamContent(model)`), never subscribe to entire stores
+- **Memoization**: Components use custom `arePropsEqual` comparators; parent callbacks must use `useCallback`
+- **Virtualization**: `ChatMessageList` uses `@tanstack/react-virtual`; streaming responses render outside virtualization
+- **Model instances**: Streams and messages are keyed by instance ID (not model ID) to support multiple copies of the same model with different settings
+
+See `agent_instructions/modifying_chat_ui.md` for full details on stores, selectors, memoization patterns, and component responsibilities.
+
+## Documentation
+
+The documentation site is in `docs/` and uses Fumadocs (Next.js-based). It builds to static HTML that can be embedded in the gateway binary or served from a CDN.
+
+The docs pages need to be kept up to date with the code. If a code change affects any docs pages, update those pages with the information users (not developers) need to know. Run `find docs/content -name '*.mdx' | sort` to see current docs pages and check if any need updating after a code change.
+
+Read the Fumadocs documentation index at https://www.fumadocs.dev/llms.txt before updating docs pages. Always use it as a reference before starting any task.
+
+Quick start: https://www.fumadocs.dev/docs/index.mdx
+
+Note that a path such as `/docs/navigation` means you should fetch `https://www.fumadocs.dev/docs/navigation.mdx`.
+
+Fetching from the fumadocs domain requires using curl in bash.
+
+### Build & Development Commands
+
+```bash
+cd docs
+pnpm install           # Install dependencies
+pnpm dev               # Development server at http://localhost:3000
+pnpm build             # Build static site to docs/out/
+pnpm lint:fix          # Fix lint errors
+pnpm format            # Format code
+pnpm generate:openapi  # Regenerate API docs from OpenAPI spec
+```
+
+### Architecture
+
+- **Static export**: Builds to `docs/out/` for embedding or serving
+- **OpenAPI integration**: API reference pages auto-generated from `openapi/hadrian.openapi.json`
+- **Storybook embeds**: UI components are embedded via iframe from Storybook for complete style isolation
+  - Symlink `docs/public/storybook` → `../../ui/storybook-static`
+  - Use `<StoryEmbed storyId="component-name--story" />` in MDX
+  - Requires building Storybook before docs: `cd ui && pnpm storybook:build`
+
+### Writing Guidelines
+
+When writing documentation:
+
+- Start every page with a one-sentence summary of what it covers
+- Use active voice, second person, present tense, imperative mood ("Run the command" not "You should run the command")
+- Front-load keywords in headings ("Redis Configuration" not "How to Configure Redis")
+- Use realistic data in examples ("acme-corp", "production-api-key") not "foo/bar"
+- Use the storybook embeds to show component examples
+- Code blocks: always specify language, show complete working examples, include expected output
+- Keep pages focused — if past 1500 words, consider splitting
+- End pages with "Next Steps" linking to related topics
+- Run the linter and formatter after making changes
+
+## Security Rules
+
+### Authorization enforcement rule
+Every admin endpoint handler **must** extract `Extension(authz): Extension<AuthzContext>` and call `authz.require(resource, action)` before performing any operation. No exceptions. Reference `routes/admin/teams.rs` for the pattern.
+
+### Database scoping rule
+All `get_by_id()` repository calls from admin handlers with org context **must** use org-scoped variants (e.g., `get_by_id_and_org()`). Unscoped `get_by_id()` is only for internal/system code paths.
+
+### URL validation rule
+Any user-supplied URL the server will make HTTP requests to **must** go through `validate_base_url()` to block SSRF.
+
+### Error message rule
+Error messages returned to clients **must not** include internal paths, UUIDs, infrastructure details, or secret manager references.
+
+### Credential handling rule
+Never return provider credentials in API responses. Never fall back to treating a secret reference as a literal value.
+
+### Security defaults rule
+Security-relevant defaults must be fail-closed: invalid credentials = 401, `fail_on_evaluation_error` = true, IAP auth requires explicit `trusted_proxies`.
diff --git a/docs/app/(home)/page.tsx b/docs/app/(home)/page.tsx
index 49a50fd..2f5a5bd 100644
--- a/docs/app/(home)/page.tsx
+++ b/docs/app/(home)/page.tsx
@@ -74,9 +74,10 @@ function DemoGallery() {
   return (
     <div className="mx-auto max-w-screen-2xl px-4">
       <div
-        className="mx-auto mb-6 flex max-w-6xl flex-wrap justify-center gap-2"
+        className="scrollbar-none mx-auto mb-6 flex max-w-6xl gap-2 overflow-x-auto px-4 pb-2 sm:flex-wrap sm:justify-center sm:overflow-visible sm:px-0 sm:pb-0"
         role="tablist"
         aria-label="Demo gallery"
+        tabIndex={0}
       >
         {demos.map((demo) => (
           <button
@@ -84,9 +85,8 @@ function DemoGallery() {
             role="tab"
             aria-selected={active === demo.id}
             aria-controls={`demo-panel-${demo.id}`}
-            onMouseEnter={() => setActive(demo.id)}
             onClick={() => setActive(demo.id)}
-            className={`shrink-0 rounded-lg border px-4 py-3 text-left transition-colors ${
+            className={`shrink-0 cursor-pointer rounded-lg border px-4 py-3 text-left transition-colors ${
               active === demo.id
                 ? "border-fd-primary bg-fd-primary/10 text-fd-foreground"
                 : "border-fd-border bg-fd-card text-fd-muted-foreground hover:border-fd-primary/50 hover:text-fd-foreground"
@@ -97,16 +97,16 @@ function DemoGallery() {
           </button>
         ))}
       </div>
-      <div className="relative overflow-hidden rounded-xl border border-fd-border shadow-lg">
+      <div className="relative h-[500px] overflow-hidden rounded-xl border border-fd-border shadow-lg sm:h-[700px] lg:h-[950px]">
         {demos.map((demo) => (
           <div
             key={demo.id}
             id={`demo-panel-${demo.id}`}
             role="tabpanel"
             aria-label={demo.title}
-            className={active === demo.id ? "" : "invisible absolute inset-0"}
+            className={active === demo.id ? "h-full" : "invisible absolute inset-0"}
           >
-            <StoryEmbed storyId={demo.storyId} height={950} />
+            <StoryEmbed storyId={demo.storyId} height="100%" />
           </div>
         ))}
       </div>
diff --git a/docs/components/quick-start-selector.tsx b/docs/components/quick-start-selector.tsx
index 3102acb..9d69a71 100644
--- a/docs/components/quick-start-selector.tsx
+++ b/docs/components/quick-start-selector.tsx
@@ -5,7 +5,7 @@ import { Check, Copy, Download, X } from "lucide-react";
 
 type Method = "binary" | "docker" | "cargo";
 type OS = "linux-x86_64" | "linux-arm64" | "macos-arm64" | "windows";
-type Profile = "full" | "standard" | "minimal" | "tiny";
+type Profile = "full" | "headless" | "standard" | "minimal" | "tiny";
 type Libc = "gnu" | "musl";
 
 const osLabels: Record<OS, string> = {
@@ -20,6 +20,14 @@ const libcLabels: Record<Libc, string> = {
   musl: "musl",
 };
 
+const profileLabels: Record<Profile, string> = {
+  full: "Full",
+  headless: "Headless",
+  standard: "Standard",
+  minimal: "Minimal",
+  tiny: "Tiny",
+};
+
 function getTarget(os: OS, libc: Libc): string {
   switch (os) {
     case "linux-x86_64":
@@ -35,38 +43,42 @@ function getTarget(os: OS, libc: Libc): string {
 
 const profileSummaries: Record<Profile, string> = {
   full: "Everything",
+  headless: "Full features, no embedded assets (serve frontend separately)",
   standard: "Production deployment",
   minimal: "Development and embedded use",
   tiny: "Stateless proxy",
 };
 
+const allProfiles: Profile[] = ["full", "headless", "standard", "minimal", "tiny"];
+const embeddedAssetProfiles: Profile[] = ["minimal", "standard", "full"];
+
 const featureMatrix: { name: string; profiles: Profile[] }[] = [
-  { name: "OpenAI", profiles: ["tiny", "minimal", "standard", "full"] },
-  { name: "Anthropic", profiles: ["minimal", "standard", "full"] },
-  { name: "AWS Bedrock", profiles: ["minimal", "standard", "full"] },
-  { name: "Google Vertex AI", profiles: ["minimal", "standard", "full"] },
-  { name: "Azure OpenAI", profiles: ["minimal", "standard", "full"] },
-  { name: "SQLite", profiles: ["minimal", "standard", "full"] },
-  { name: "Embedded UI", profiles: ["minimal", "standard", "full"] },
-  { name: "Model catalog", profiles: ["minimal", "standard", "full"] },
-  { name: "Setup wizard", profiles: ["minimal", "standard", "full"] },
-  { name: "PostgreSQL", profiles: ["standard", "full"] },
-  { name: "Redis caching", profiles: ["standard", "full"] },
-  { name: "SSO (OIDC / OAuth)", profiles: ["standard", "full"] },
-  { name: "CEL RBAC", profiles: ["standard", "full"] },
-  { name: "S3 storage", profiles: ["standard", "full"] },
-  { name: "Secrets managers", profiles: ["standard", "full"] },
-  { name: "OTLP & Prometheus", profiles: ["standard", "full"] },
-  { name: "OpenAPI docs", profiles: ["standard", "full"] },
-  { name: "Embedded docs", profiles: ["standard", "full"] },
-  { name: "Doc extraction", profiles: ["standard", "full"] },
-  { name: "Cost forecasting", profiles: ["standard", "full"] },
-  { name: "CSV export", profiles: ["standard", "full"] },
-  { name: "Response validation", profiles: ["standard", "full"] },
-  { name: "JSON schema", profiles: ["standard", "full"] },
-  { name: "SAML SSO", profiles: ["full"] },
-  { name: "Kreuzberg OCR", profiles: ["full"] },
-  { name: "ClamAV scanning", profiles: ["full"] },
+  { name: "OpenAI", profiles: allProfiles },
+  { name: "Anthropic", profiles: ["minimal", "standard", "headless", "full"] },
+  { name: "AWS Bedrock", profiles: ["minimal", "standard", "headless", "full"] },
+  { name: "Google Vertex AI", profiles: ["minimal", "standard", "headless", "full"] },
+  { name: "Azure OpenAI", profiles: ["minimal", "standard", "headless", "full"] },
+  { name: "SQLite", profiles: ["minimal", "standard", "headless", "full"] },
+  { name: "Embedded UI", profiles: embeddedAssetProfiles },
+  { name: "Model catalog", profiles: embeddedAssetProfiles },
+  { name: "Setup wizard", profiles: embeddedAssetProfiles },
+  { name: "PostgreSQL", profiles: ["standard", "headless", "full"] },
+  { name: "Redis caching", profiles: ["standard", "headless", "full"] },
+  { name: "SSO (OIDC / OAuth)", profiles: ["standard", "headless", "full"] },
+  { name: "CEL RBAC", profiles: ["standard", "headless", "full"] },
+  { name: "S3 storage", profiles: ["standard", "headless", "full"] },
+  { name: "Secrets managers", profiles: ["standard", "headless", "full"] },
+  { name: "OTLP & Prometheus", profiles: ["standard", "headless", "full"] },
+  { name: "OpenAPI docs", profiles: ["standard", "headless", "full"] },
+  { name: "Embedded docs", profiles: embeddedAssetProfiles },
+  { name: "Doc extraction", profiles: ["standard", "headless", "full"] },
+  { name: "Cost forecasting", profiles: ["standard", "headless", "full"] },
+  { name: "CSV export", profiles: ["standard", "headless", "full"] },
+  { name: "Response validation", profiles: ["standard", "headless", "full"] },
+  { name: "JSON schema", profiles: ["standard", "headless", "full"] },
+  { name: "SAML SSO", profiles: ["headless", "full"] },
+  { name: "Kreuzberg OCR", profiles: ["headless", "full"] },
+  { name: "ClamAV scanning", profiles: ["headless", "full"] },
 ];
 
 function getInstallCommand(method: Method, os: OS, profile: Profile, libc: Libc): string {
@@ -135,7 +147,7 @@ function ToggleGroup<T extends string>({
             key={opt}
             onClick={() => onChange(opt)}
             disabled={isDisabled}
-            className={`rounded-md px-3 py-1.5 text-sm font-medium transition-colors ${
+            className={`rounded-md px-2.5 py-1 text-xs font-medium transition-colors sm:px-3 sm:py-1.5 sm:text-sm ${
               isDisabled
                 ? "cursor-not-allowed bg-fd-muted text-fd-muted-foreground/40"
                 : value === opt
@@ -152,9 +164,10 @@ function ToggleGroup<T extends string>({
 }
 
 function getDisabledProfiles(os: OS, libc: Libc): Set<Profile> | undefined {
-  if (os === "windows") return new Set(["full", "standard"]);
-  if (os === "linux-arm64") return new Set(["full"]);
-  if (os.startsWith("linux-") && libc === "musl") return new Set(["full"]);
+  // headless and full only built for linux-x86_64-gnu and macos-arm64
+  if (os === "windows") return new Set(["full", "headless"]);
+  if (os === "linux-arm64") return new Set(["full", "headless"]);
+  if (os.startsWith("linux-") && libc === "musl") return new Set(["full", "headless"]);
   return undefined;
 }
 
@@ -184,7 +197,7 @@ export function QuickStartSelector() {
 
   const handleLibcChange = (newLibc: Libc) => {
     setLibc(newLibc);
-    if (newLibc === "musl" && profile === "full") {
+    if (newLibc === "musl" && (profile === "full" || profile === "headless")) {
       setProfile("standard");
     }
   };
@@ -193,18 +206,7 @@ export function QuickStartSelector() {
   const downloadUrl = method === "binary" ? getDownloadUrl(os, profile, libc) : null;
 
   const handleCopy = async () => {
-    if (navigator.clipboard) {
-      await navigator.clipboard.writeText(command);
-    } else {
-      const textarea = document.createElement("textarea");
-      textarea.value = command;
-      textarea.style.position = "fixed";
-      textarea.style.opacity = "0";
-      document.body.appendChild(textarea);
-      textarea.select();
-      document.execCommand("copy");
-      document.body.removeChild(textarea);
-    }
+    await navigator.clipboard.writeText(command);
     setCopied(true);
     setTimeout(() => setCopied(false), 2000);
   };
@@ -212,8 +214,10 @@ export function QuickStartSelector() {
   return (
     <div className="not-prose overflow-hidden rounded-lg border border-fd-border bg-fd-card">
       <div className="space-y-3 border-b border-fd-border bg-fd-muted/50 p-4">
-        <div className="flex flex-wrap items-center gap-3">
-          <span className="w-16 shrink-0 text-sm font-medium text-fd-muted-foreground">Method</span>
+        <div className="flex flex-col gap-1.5 sm:flex-row sm:items-center sm:gap-3">
+          <span className="text-sm font-medium text-fd-muted-foreground sm:w-16 sm:shrink-0">
+            Method
+          </span>
           <ToggleGroup
             options={["binary", "docker", "cargo"] as Method[]}
             value={method}
@@ -223,8 +227,10 @@ export function QuickStartSelector() {
         </div>
         {method === "binary" && (
           <>
-            <div className="flex flex-wrap items-center gap-3">
-              <span className="w-16 shrink-0 text-sm font-medium text-fd-muted-foreground">OS</span>
+            <div className="flex flex-col gap-1.5 sm:flex-row sm:items-center sm:gap-3">
+              <span className="text-sm font-medium text-fd-muted-foreground sm:w-16 sm:shrink-0">
+                OS
+              </span>
               <ToggleGroup
                 options={["linux-x86_64", "linux-arm64", "macos-arm64", "windows"] as OS[]}
                 value={os}
@@ -233,8 +239,8 @@ export function QuickStartSelector() {
               />
             </div>
             {isLinux && (
-              <div className="flex flex-wrap items-center gap-3">
-                <span className="w-16 shrink-0 text-sm font-medium text-fd-muted-foreground">
+              <div className="flex flex-col gap-1.5 sm:flex-row sm:items-center sm:gap-3">
+                <span className="text-sm font-medium text-fd-muted-foreground sm:w-16 sm:shrink-0">
                   Libc
                 </span>
                 <ToggleGroup
@@ -246,14 +252,15 @@ export function QuickStartSelector() {
                 />
               </div>
             )}
-            <div className="flex flex-wrap items-center gap-3">
-              <span className="w-16 shrink-0 text-sm font-medium text-fd-muted-foreground">
+            <div className="flex flex-col gap-1.5 sm:flex-row sm:items-center sm:gap-3">
+              <span className="text-sm font-medium text-fd-muted-foreground sm:w-16 sm:shrink-0">
                 Features
               </span>
               <ToggleGroup
-                options={["full", "standard", "minimal", "tiny"] as Profile[]}
+                options={allProfiles}
                 value={profile}
                 onChange={setProfile}
+                labels={profileLabels}
                 disabled={disabledProfiles}
               />
             </div>
diff --git a/ui/src/components/Markdown/Markdown.tsx b/ui/src/components/Markdown/Markdown.tsx
index ab2686f..7a85192 100644
--- a/ui/src/components/Markdown/Markdown.tsx
+++ b/ui/src/components/Markdown/Markdown.tsx
@@ -45,7 +45,7 @@ export function Markdown({ content, className }: MarkdownProps) {
       ref={containerRef}
       className={cn(
         "markdown-content prose prose-sm dark:prose-invert",
-        "max-w-[calc(100vw-8rem)] sm:max-w-[500px] md:max-w-[600px] lg:max-w-[700px]",
+        "max-w-none",
         "[&_pre]:overflow-x-auto",
         className
       )}
diff --git a/ui/src/components/ModelSelector/ModelSelector.tsx b/ui/src/components/ModelSelector/ModelSelector.tsx
index ddc03dd..660cb1d 100644
--- a/ui/src/components/ModelSelector/ModelSelector.tsx
+++ b/ui/src/components/ModelSelector/ModelSelector.tsx
@@ -309,8 +309,15 @@ export function ModelSelector({
 
   return (
     <div className="flex items-center gap-2 min-w-0">
-      {/* Horizontally scrollable chip container */}
-      <div className="flex flex-wrap items-center gap-2 min-w-0">
+      {/* Model count badge - mobile only, shown when multiple models selected */}
+      {selectedInstances.length > 1 && (
+        <span className="sm:hidden shrink-0 rounded-full bg-muted px-1.5 py-0.5 text-[10px] font-medium text-muted-foreground tabular-nums">
+          {selectedInstances.length}
+        </span>
+      )}
+
+      {/* Chip container - horizontal scroll on mobile, wraps on desktop */}
+      <div className="flex items-center gap-2 min-w-0 overflow-x-auto sm:overflow-x-visible sm:flex-wrap scrollbar-none">
         <TooltipProvider>
           <DndContext
             sensors={sensors}
diff --git a/ui/src/components/MultiModelResponse/MultiModelResponse.stories.tsx b/ui/src/components/MultiModelResponse/MultiModelResponse.stories.tsx
index 6763dcb..fb1ca82 100644
--- a/ui/src/components/MultiModelResponse/MultiModelResponse.stories.tsx
+++ b/ui/src/components/MultiModelResponse/MultiModelResponse.stories.tsx
@@ -288,15 +288,15 @@ export const ViewModeToggle: Story = {
     // Should have 2 toggle buttons
     await expect(toggleButtons.length).toBe(2);
 
-    // In grid mode, cards should have min-w-[500px] class (horizontal layout)
-    let gridCards = canvasElement.querySelectorAll('[class*="min-w-[500px]"]');
+    // In grid mode, cards should have basis-[min(500px,85vw)] class (horizontal layout)
+    let gridCards = canvasElement.querySelectorAll('[class*="basis-"]');
     await expect(gridCards.length).toBe(2);
 
     // Click the stacked button (second toggle button)
     await userEvent.click(toggleButtons[1]);
 
-    // After clicking stacked, cards should NOT have min-w-[500px] (vertical layout)
-    gridCards = canvasElement.querySelectorAll('[class*="min-w-[500px]"]');
+    // After clicking stacked, cards should NOT have basis-[min(500px,85vw)] (vertical layout)
+    gridCards = canvasElement.querySelectorAll('[class*="basis-"]');
     await expect(gridCards.length).toBe(0);
 
     // Cards should now be full width (w-full)
diff --git a/ui/src/components/MultiModelResponse/MultiModelResponse.tsx b/ui/src/components/MultiModelResponse/MultiModelResponse.tsx
index 9afd61b..f134a7b 100644
--- a/ui/src/components/MultiModelResponse/MultiModelResponse.tsx
+++ b/ui/src/components/MultiModelResponse/MultiModelResponse.tsx
@@ -626,7 +626,7 @@ const ModelResponseCard = memo(function ModelResponseCard({
         "hover:shadow-md",
         "animate-slide-up-bounce",
         isSelectedBest && "ring-2 ring-success ring-offset-2 ring-offset-background",
-        useHorizontalLayout ? "min-w-[500px] w-[500px] shrink-0" : "w-full"
+        useHorizontalLayout ? "grow shrink-0 basis-[min(500px,85vw)]" : "w-full"
       )}
       style={{ animationDelay: `${index * 100}ms` }}
     >
diff --git a/ui/src/components/StreamingMarkdown/StreamingMarkdown.tsx b/ui/src/components/StreamingMarkdown/StreamingMarkdown.tsx
index a7885ad..ed2dbc6 100644
--- a/ui/src/components/StreamingMarkdown/StreamingMarkdown.tsx
+++ b/ui/src/components/StreamingMarkdown/StreamingMarkdown.tsx
@@ -75,7 +75,7 @@ function StreamingMarkdownComponent({ content, isStreaming, className }: Streami
     <div
       className={cn(
         "markdown-content prose prose-sm dark:prose-invert",
-        "max-w-[calc(100vw-8rem)] sm:max-w-[500px] md:max-w-[600px] lg:max-w-[700px]",
+        "max-w-none",
         "[&_pre]:overflow-x-auto",
         className
       )}