diff --git a/README.md b/README.md index 4aeb09a..387c665 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Define a team of AI agents in a simple JSON config, give them a task, and watch ## Features -- **Fully configurable agents** — names, instructions, models (including Grok-4.1-fast-reasoning), temperature, and plugins defined in JSON +- **Fully configurable agents** — names, instructions, models, and plugins defined in JSON; mix reasoning and non-reasoning models per agent - **Pluggable selection strategies** — sequential, round-robin, or LLM-driven selection - **Flexible termination strategies** — regex (e.g. `APPROVED`), max iterations, or composite - **9 built-in plugins** — FileSystem, Shell, Git, Http, Json, Search, Probe, Plan, and **CodeExecution** (sandboxed REPL) @@ -33,7 +33,7 @@ Define a team of AI agents in a simple JSON config, give them a task, and watch export XAI_API_KEY=your_key_here ``` -**Tip**: Grok-4.1-fast-reasoning (default in examples) offers excellent agent performance at **$0.20 / $0.50 per million input/output tokens**. +**Tip**: Use `grok-4-1-fast-non-reasoning` for action-heavy agents (those calling tools) and `grok-4-1-fast-reasoning` for judgment/approval roles. Reasoning models reject the `Temperature` parameter — omit it from the config when using them. --- @@ -111,21 +111,32 @@ Configs live under `config/`. The default is `config/orchestration.json`. 
{ "Orchestration": { "Name": "SoftwareDevelopmentTeam", - "Description": "Developer writes code, Tester writes tests, Reviewer approves.", + "Description": "Developer writes code, Tester verifies, Reviewer approves.", "Agents": [ { "Name": "Developer", "Description": "Senior software engineer.", "Instructions": "You are an expert software developer...", + "Model": { + "ModelId": "grok-4-1-fast-non-reasoning", + "Endpoint": "https://api.x.ai/v1", + "ApiKeyEnvVar": "XAI_API_KEY", + "MaxTokens": 16384 + }, + "Plugins": ["FileSystem", "Shell", "Git", "Search"] + }, + { + "Name": "Reviewer", + "Description": "Tech lead who approves changes.", + "Instructions": "You are a senior tech lead...", "Model": { "ModelId": "grok-4-1-fast-reasoning", "Endpoint": "https://api.x.ai/v1", "ApiKeyEnvVar": "XAI_API_KEY", - "MaxTokens": 8192, - "Temperature": 0.2 + "MaxTokens": 8192 }, - "Plugins": ["FileSystem", "Git", "Shell", "Search"] + "Plugins": ["FileSystem", "Search"] } ], @@ -135,7 +146,7 @@ Configs live under `config/`. The default is `config/orchestration.json`. "Termination": { "Type": "composite", - "MaxIterations": 15, + "MaxIterations": 30, "Strategies": [ { "Type": "regex", @@ -144,7 +155,13 @@ Configs live under `config/`. The default is `config/orchestration.json`. } ] }, - "MaxCostUsd": 2.00 + + "Compaction": { + "TriggerTurnCount": 30, + "KeepRecentTurns": 8 + }, + + "MaxCostUsd": 3.00 } } ``` @@ -153,6 +170,30 @@ Configs live under `config/`. The default is `config/orchestration.json`. Set `MaxCostUsd` to cap the total estimated spend for a session. After the turn that pushes cumulative cost over the limit, the session stops and the checkpoint is saved so it can be resumed later. Omit the field (or set it to `null`) for no limit. +### Conversation compaction + +Long sessions accumulate history that consumes tokens on every subsequent turn. 
Set `Compaction` to automatically summarise old turns with an LLM call when the history grows past a threshold, replacing the oldest turns with a single compact summary while retaining the most recent turns verbatim. + +```json +"Compaction": { + "TriggerTurnCount": 30, + "KeepRecentTurns": 8 +} +``` + +`TriggerTurnCount` is the history length that triggers compaction. `KeepRecentTurns` is how many of the most recent turns to preserve unchanged. The cost of compacted turns is carried forward in the session total. Omit the section entirely to disable compaction. + +### Model selection: reasoning vs non-reasoning + +xAI (and similar providers) offer two model variants per tier: + +| Variant | Best for | +|---------|----------| +| `grok-4-1-fast-non-reasoning` | Action agents that call tools repeatedly — Developer, Engineer, Tester, Researcher | +| `grok-4-1-fast-reasoning` | Judgment agents that analyse and decide — Reviewer, Architect, Analyst | + +Reasoning models reject the `Temperature` parameter. Omit `Temperature` from the config (it defaults to null, which sends no value to the API) unless you are using a standard non-reasoning model that accepts it. + ### Security constraints ```json diff --git a/config/examples/devops-team.json b/config/examples/devops-team.json index 916a540..cbb99f4 100644 --- a/config/examples/devops-team.json +++ b/config/examples/devops-team.json @@ -1,47 +1,44 @@ { "Orchestration": { "Name": "DevOpsTeam", - "Description": "A three-agent DevOps pipeline: Architect designs, Engineer implements using Shell/Git/FileSystem, Reviewer approves and documents.", + "Description": "A three-agent DevOps pipeline: Architect designs, Engineer implements with tools, Reviewer verifies and approves.", "Agents": [ { "Name": "Architect", - "Description": "Senior software architect who designs solutions and writes implementation plans.", - "Instructions": "You are a senior software architect. When given a task:\n1. Analyze the requirements carefully.\n2. 
Produce a detailed implementation plan with clear steps.\n3. Specify which files to create/modify, what commands to run, and any dependencies needed.\n4. Hand off to the Engineer by saying 'HANDOFF TO ENGINEER' followed by your plan.", + "Description": "Senior architect who analyses requirements and produces a concrete implementation plan.", + "Instructions": "You are a senior software architect.\n\nWhen given a task:\n1. Use search_files and read_file to understand the existing codebase structure before planning.\n2. Produce a detailed, step-by-step implementation plan: which files to create or modify, what commands to run, what dependencies are needed.\n3. Be specific — name exact file paths and commands.\n4. Hand off by writing HANDOFF TO ENGINEER on its own line followed by your plan.", "Model": { "ModelId": "grok-4-1-fast-reasoning", "Endpoint": "https://api.x.ai/v1", "ApiKeyEnvVar": "XAI_API_KEY", - "MaxTokens": 4096, - "Temperature": 0.3 + "MaxTokens": 8192 }, - "Plugins": [] + "Plugins": ["FileSystem", "Search"] }, { "Name": "Engineer", - "Description": "Full-stack engineer who executes the implementation plan using tools.", - "Instructions": "You are a full-stack engineer. Execute the Architect's plan step by step using your available tools:\n- Use 'shell_run' to run commands (build, test, install packages, etc.)\n- Use 'write_file' and 'read_file' for file operations\n- Use 'git_status', 'git_add', 'git_commit' for version control\n- Use 'http_get' to fetch documentation or APIs if needed\n\nReport each step you take and its result. After completing the implementation and verifying it works, say 'HANDOFF TO REVIEWER'.", + "Description": "Full-stack engineer who executes the plan using tools — never describes changes without making them.", + "Instructions": "You are a full-stack engineer executing the Architect's plan.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ: Use read_file on any files you need to modify so you have their current content.\n\n2. 
IMPLEMENT: Use write_file with complete file content. Never output a diff or describe what you would write — write the full file.\n\n3. VERIFY WRITES: Use read_file immediately after writing to confirm content is correct.\n\n4. RUN: Use shell_exec to install dependencies, build, run tests, or whatever the plan requires. Include the exact stdout/stderr output.\n\n5. VERSION CONTROL: Use git_add and git_commit to commit your changes with a clear message.\n\n6. HAND OFF: Write HANDOFF TO REVIEWER on its own line with a list of changed files and the actual command output.\n\nRULES:\n- Never describe a change without making it with write_file or shell_exec.\n- Never claim a command succeeded without showing its real output.\n- If any step fails, fix it before proceeding.", "Model": { - "ModelId": "grok-4-1-fast-reasoning", + "ModelId": "grok-4-1-fast-non-reasoning", "Endpoint": "https://api.x.ai/v1", "ApiKeyEnvVar": "XAI_API_KEY", - "MaxTokens": 8192, - "Temperature": 0.1 + "MaxTokens": 16384 }, - "Plugins": [ "Shell", "FileSystem", "Git", "Http" ] + "Plugins": ["Shell", "FileSystem", "Git", "Http", "Search"] }, { "Name": "Reviewer", - "Description": "Tech lead who verifies implementation quality and writes documentation.", - "Instructions": "You are a tech lead. Review the implementation:\n1. Use 'shell_run' to run the tests and check they pass.\n2. Use 'read_file' to inspect the key source files.\n3. Use 'git_diff' to see all changes made.\n4. Verify code quality, correctness, and that the original requirements are met.\n\nIf everything is satisfactory:\n- Write a brief CHANGELOG entry summarizing what was built.\n- Respond with APPROVED followed by the changelog.\n\nIf changes are needed, describe them clearly for the Engineer to address.", + "Description": "Tech lead who independently verifies the implementation before approving.", + "Instructions": "You are a tech lead performing a final review.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. 
READ THE CODE: Use read_file on the changed files. Do not rely on the Engineer's summary.\n\n2. RUN THE TESTS: Use shell_exec to run the test suite and confirm it passes. Paste the real output.\n\n3. CHECK GIT HISTORY: Use git_diff or git_log to confirm the changes are committed cleanly.\n\n4. REVIEW for: correctness, security, performance, and whether the original requirements are met.\n\n5. DECIDE: If everything is solid, write APPROVED on its own line followed by a brief changelog entry. If changes are needed, give specific actionable feedback naming the file and line.", "Model": { "ModelId": "grok-4-1-fast-reasoning", "Endpoint": "https://api.x.ai/v1", "ApiKeyEnvVar": "XAI_API_KEY", - "MaxTokens": 4096, - "Temperature": 0.1 + "MaxTokens": 8192 }, - "Plugins": [ "Shell", "FileSystem", "Git" ] + "Plugins": ["Shell", "FileSystem", "Git"] } ], @@ -51,15 +48,22 @@ "Termination": { "Type": "composite", - "MaxIterations": 20, + "MaxIterations": 30, "Strategies": [ { "Type": "regex", "Pattern": "\\bAPPROVED\\b", - "MaxIterations": 20, - "AgentNames": [ "Reviewer" ] + "MaxIterations": 30, + "AgentNames": ["Reviewer"] } ] - } + }, + + "Compaction": { + "TriggerTurnCount": 30, + "KeepRecentTurns": 8 + }, + + "MaxCostUsd": 3.00 } } diff --git a/config/examples/research-team.json b/config/examples/research-team.json index 5b98f21..1c2f3e9 100644 --- a/config/examples/research-team.json +++ b/config/examples/research-team.json @@ -1,34 +1,32 @@ { "Orchestration": { "Name": "ResearchTeam", - "Description": "A two-agent research pipeline: Researcher gathers data via HTTP/JSON, Analyst synthesizes findings.", + "Description": "A two-agent research pipeline: Researcher fetches data with tools, Analyst synthesizes and writes the report.", "Agents": [ { "Name": "Researcher", - "Description": "Data researcher who fetches and extracts information from APIs and web sources.", - "Instructions": "You are a research specialist. For the given topic:\n1. 
Use 'http_get' to fetch relevant data from public APIs or web pages.\n2. Use 'json_get' and 'json_to_text' to extract the key facts from API responses.\n3. Use 'write_file' to save your raw findings to 'research/raw_data.txt'.\n4. Summarize what you found and say 'HANDOFF TO ANALYST'.", + "Description": "Data researcher who fetches real information using HTTP and filesystem tools.", + "Instructions": "You are a research specialist.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. FETCH: Use http_get to retrieve data from relevant public APIs or URLs. Include the exact response content.\n\n2. EXTRACT: Use json_get to pull specific fields from JSON responses. Don't paraphrase — capture the real data.\n\n3. SAVE: Use write_file to save your raw findings to 'research/raw_data.txt'. The file must exist on disk before you hand off.\n\n4. VERIFY: Use read_file on 'research/raw_data.txt' to confirm it was written correctly.\n\n5. HAND OFF: Write HANDOFF TO ANALYST on its own line with a summary of what sources you consulted and what data was captured.\n\nRULES:\n- Never summarize or paraphrase API responses — write the actual data to the file.\n- Never claim a file was written without verifying it with read_file.", "Model": { - "ModelId": "grok-4-1-fast-reasoning", + "ModelId": "grok-4-1-fast-non-reasoning", "Endpoint": "https://api.x.ai/v1", "ApiKeyEnvVar": "XAI_API_KEY", - "MaxTokens": 4096, - "Temperature": 0.3 + "MaxTokens": 8192 }, - "Plugins": [ "Http", "Json", "FileSystem" ] + "Plugins": ["Http", "Json", "FileSystem"] }, { "Name": "Analyst", - "Description": "Data analyst who synthesizes research into structured reports.", - "Instructions": "You are a data analyst. Review the Researcher's findings:\n1. Use 'read_file' to load 'research/raw_data.txt'.\n2. Analyze the data: identify key insights, patterns, and gaps.\n3. Use 'write_file' to save a structured report to 'research/report.md' in Markdown format.\n4. 
If the research is complete and the report is written, respond with APPROVED and a one-paragraph summary.", + "Description": "Data analyst who synthesizes research into a structured report.", + "Instructions": "You are a data analyst.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ: Use read_file to load 'research/raw_data.txt'. Do not proceed if the file is missing or empty — report BLOCKED: no research data found.\n\n2. ANALYSE: Identify key insights, patterns, trends, and gaps in the data.\n\n3. WRITE REPORT: Use write_file to save a structured Markdown report to 'research/report.md'. The report must have clear sections: Summary, Key Findings, and Recommendations.\n\n4. VERIFY: Use read_file to confirm 'research/report.md' was written correctly.\n\n5. COMPLETE: Write APPROVED on its own line followed by a one-paragraph summary of the findings.", "Model": { "ModelId": "grok-4-1-fast-reasoning", "Endpoint": "https://api.x.ai/v1", "ApiKeyEnvVar": "XAI_API_KEY", - "MaxTokens": 4096, - "Temperature": 0.2 + "MaxTokens": 8192 }, - "Plugins": [ "FileSystem", "Json" ] + "Plugins": ["FileSystem", "Json"] } ], @@ -38,15 +36,22 @@ "Termination": { "Type": "composite", - "MaxIterations": 10, + "MaxIterations": 15, "Strategies": [ { "Type": "regex", "Pattern": "\\bAPPROVED\\b", - "MaxIterations": 10, - "AgentNames": [ "Analyst" ] + "MaxIterations": 15, + "AgentNames": ["Analyst"] } ] - } + }, + + "Compaction": { + "TriggerTurnCount": 20, + "KeepRecentTurns": 6 + }, + + "MaxCostUsd": 1.00 } } diff --git a/config/orchestration.json b/config/orchestration.json index 86dd780..266e124 100644 --- a/config/orchestration.json +++ b/config/orchestration.json @@ -6,42 +6,39 @@ "Agents": [ { "Name": "Developer", - "Description": "Senior software engineer who writes clean, production-ready code.", - "Instructions": "You are an expert software developer. Your job is to implement the requested feature or fix. Write clean, well-structured, production-ready code. 
Follow language-specific best practices. Add error handling and document complex logic with inline comments. After completing your implementation, say 'HANDOFF TO TESTER' on its own line so the Tester knows to review your work.", + "Description": "Senior software engineer who implements features using tools.", + "Instructions": "You are an expert software developer with access to filesystem, shell, and git tools.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. EXPLORE: Use list_files and read_file to understand the existing code structure before writing anything. Read at least 2-3 related files to understand patterns and conventions.\n\n2. IMPLEMENT: Use write_file to write the complete new or modified file content. Do not describe what you would write — write it. Never output a diff.\n\n3. VERIFY: Use read_file to confirm the file was written correctly.\n\n4. BUILD/RUN: Use shell_exec to build or run the code and confirm it works. Include the exact output.\n\n5. HAND OFF: When done, write HANDOFF TO TESTER on its own line with a summary of which files changed and what was added.\n\nRULES:\n- Never describe a change without making it with write_file.\n- Never claim success without showing real tool output.\n- If a tool call fails, handle the error and try again.", "Model": { - "ModelId": "grok-4-1-fast-reasoning", + "ModelId": "grok-4-1-fast-non-reasoning", "Endpoint": "https://api.x.ai/v1", "ApiKeyEnvVar": "XAI_API_KEY", - "MaxTokens": 8192, - "Temperature": 0.2 + "MaxTokens": 16384 }, - "Plugins": [] + "Plugins": ["FileSystem", "Shell", "Git", "Search"] }, { "Name": "Tester", - "Description": "QA engineer who writes comprehensive tests and identifies edge cases.", - "Instructions": "You are an expert QA engineer. Review the Developer's code and write comprehensive unit and integration tests. Cover happy paths, edge cases, and error conditions. Point out any bugs, missing validation, or security issues you spot. 
After completing your test suite, say 'HANDOFF TO REVIEWER' on its own line.", + "Description": "QA engineer who independently verifies changes with real tool calls.", + "Instructions": "You are an expert QA engineer. DO NOT trust the Developer's account — verify everything independently.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ THE FILES: Use read_file on every file the Developer claimed to change. Confirm the changes are present. If not, report BLOCKED: changes not written.\n\n2. RUN EXISTING TESTS: Use shell_exec to run the existing test suite. Paste the exact stdout/stderr output.\n\n3. TEST THE NEW BEHAVIOUR: Exercise the new feature end-to-end. If it requires a project context, configuration, or data fixture — create it using write_file and shell_exec first. NEVER accept a guard-clause error (like 'file not found' or 'missing config') as proof the feature works. Set up the environment and run the real code path.\n\n4. HAND OFF: Write HANDOFF TO REVIEWER on its own line with the actual test output pasted verbatim.\n\nRULES:\n- Never claim a test passed without pasting real shell_exec output.\n- Never hand off after only triggering an early-exit guard — that means you did not test the feature.", "Model": { - "ModelId": "grok-4-1-fast-reasoning", + "ModelId": "grok-4-1-fast-non-reasoning", "Endpoint": "https://api.x.ai/v1", "ApiKeyEnvVar": "XAI_API_KEY", - "MaxTokens": 8192, - "Temperature": 0.2 + "MaxTokens": 16384 }, - "Plugins": [] + "Plugins": ["FileSystem", "Shell", "Search"] }, { "Name": "Reviewer", - "Description": "Tech lead who performs final code review and approves or requests changes.", - "Instructions": "You are a senior tech lead performing a final code review. Evaluate both the implementation and the tests for: code quality, correctness, security vulnerabilities, performance concerns, and architectural decisions. 
If everything meets production standards, respond with the word APPROVED on its own line followed by a brief summary of what was built. If changes are needed, provide specific, actionable feedback addressed to the Developer.", + "Description": "Tech lead who approves only after reading the code and verifying the test evidence.", + "Instructions": "You are a senior tech lead performing a final review.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ THE CODE: Use read_file on the changed files. Do not rely on summaries.\n\n2. REVIEW for: correctness, consistency with existing patterns, error handling, edge cases, and security.\n\n3. VERIFY THE TESTER'S EVIDENCE: The Tester must have run the feature end-to-end — not just triggered a guard clause. If the Tester's only evidence is an early-exit error or a description rather than real shell output, reject it and tell the Tester to run a proper test.\n\n4. DECIDE: If the code is correct and the testing is real, write APPROVED on its own line followed by a 2-3 sentence summary. 
If anything needs fixing, give specific actionable feedback naming the file and line.", "Model": { "ModelId": "grok-4-1-fast-reasoning", "Endpoint": "https://api.x.ai/v1", "ApiKeyEnvVar": "XAI_API_KEY", - "MaxTokens": 8192, - "Temperature": 0.1 + "MaxTokens": 8192 }, - "Plugins": [] + "Plugins": ["FileSystem", "Search"] } ], @@ -51,15 +48,22 @@ "Termination": { "Type": "composite", - "MaxIterations": 15, + "MaxIterations": 30, "Strategies": [ { "Type": "regex", "Pattern": "\\bAPPROVED\\b", - "MaxIterations": 15, - "AgentNames": [ "Reviewer" ] + "MaxIterations": 30, + "AgentNames": ["Reviewer"] } ] - } + }, + + "Compaction": { + "TriggerTurnCount": 30, + "KeepRecentTurns": 8 + }, + + "MaxCostUsd": 3.00 } } diff --git a/src/Infrastructure/Plugins/FileSystemPlugin.cs b/src/Infrastructure/Plugins/FileSystemPlugin.cs index affa027..b1142be 100644 --- a/src/Infrastructure/Plugins/FileSystemPlugin.cs +++ b/src/Infrastructure/Plugins/FileSystemPlugin.cs @@ -87,7 +87,7 @@ public string DeleteFile([Description("Path of the file to delete.")] string pat // Returns a [DENIED] error string when the path escapes the sandbox, null when safe. private string? ResolveSafe(string path, out string resolved) { - resolved = Path.GetFullPath(path); + resolved = Path.GetFullPath(ProcessHelper.ExpandHome(path)); if (_sandboxRoot is null) return null; diff --git a/src/Infrastructure/Plugins/ProcessHelper.cs b/src/Infrastructure/Plugins/ProcessHelper.cs index b522d0d..235a799 100644 --- a/src/Infrastructure/Plugins/ProcessHelper.cs +++ b/src/Infrastructure/Plugins/ProcessHelper.cs @@ -63,8 +63,25 @@ internal static async Task RunAsync( } } - private static string ResolveWorkDir(string? path) => - string.IsNullOrWhiteSpace(path) ? Directory.GetCurrentDirectory() : path; + /// + /// Expands a leading ~ to the current user's home directory. + /// Process.Start and Path.GetFullPath do not do this — only shells do. 
+    ///
+    /// <remarks>Only "~" and "~/..." are expanded; "~user/..." and every other path are returned unchanged.</remarks>
+    internal static string ExpandHome(string path)
+    {
+        if (path != "~" && !path.StartsWith("~/", StringComparison.Ordinal))
+            return path;
+        // Path.Combine returns a rooted second argument as-is, so "~//x" would otherwise resolve to "/x".
+        // Trimming the separators after the tilde keeps the result under the home directory.
+        return Path.Combine(
+            Environment.GetFolderPath(Environment.SpecialFolder.UserProfile),
+            path[1..].TrimStart('/'));
+    }
+
+    /// <summary>Uses the current directory when no path is given; otherwise expands a leading ~.</summary>
+    private static string ResolveWorkDir(string? path) =>
+        string.IsNullOrWhiteSpace(path) ? Directory.GetCurrentDirectory() : ExpandHome(path);
 }

 ///