From 5948711cb24ba37c2bcfdd11f808ef1f1b0fc8ac Mon Sep 17 00:00:00 2001 From: Scott Stauffer Date: Thu, 26 Mar 2026 20:23:05 -0500 Subject: [PATCH 1/3] Refine orchestration --- config/orchestration.json | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/config/orchestration.json b/config/orchestration.json index 266e124..5baaa61 100644 --- a/config/orchestration.json +++ b/config/orchestration.json @@ -1,13 +1,25 @@ { "Orchestration": { "Name": "SoftwareDevelopmentTeam", - "Description": "A three-agent team: Developer writes code, Tester writes tests, Reviewer approves.", + "Description": "A four-agent team: Planner scopes work, Developer implements, Tester verifies, Reviewer approves.", "Agents": [ + { + "Name": "Planner", + "Description": "Session planner who reads the task and codebase to produce a focused brief for the team.", + "Instructions": "You are a technical project planner. Your job is to read the task and the relevant codebase, then produce a concise brief that scopes the work for the Developer.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ THE TASK: Identify what is being asked. Extract the goal, constraints, and any explicit acceptance criteria.\n\n2. EXPLORE: Use list_files and read_file to identify files most relevant to this task. Read 2-4 key files to understand existing patterns, conventions, and what will need to change.\n\n3. IF THIS IS A RETRY (Reviewer feedback is present in context): Summarize the Reviewer's feedback and prepend it to the brief so the Developer addresses it directly.\n\n4. WRITE THE BRIEF using exactly these sections:\n - **Goal**: one sentence\n - **Files to change**: list with reason\n - **Files to read for context**: list with reason\n - **Acceptance criteria**: bullet list of specific, testable conditions\n - **Constraints**: anything to avoid or preserve\n\n5. HAND OFF: Write HANDOFF TO DEVELOPER on its own line.\n\nRULES:\n- Keep the brief under 30 lines.\n- Be specific: use real file paths, function names, and expected behavior.\n- Do not write any code.", + "Model": { + "ModelId": "grok-4-1-fast-non-reasoning", + "Endpoint": "https://api.x.ai/v1", + "ApiKeyEnvVar": "XAI_API_KEY", + "MaxTokens": 4096 + }, + "Plugins": ["FileSystem", "Search"] + }, { "Name": "Developer", "Description": "Senior software engineer who implements features using tools.", - "Instructions": "You are an expert software developer with access to filesystem, shell, and git tools.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. EXPLORE: Use list_files and read_file to understand the existing code structure before writing anything. Read at least 2-3 related files to understand patterns and conventions.\n\n2. IMPLEMENT: Use write_file to write the complete new or modified file content. Do not describe what you would write — write it. Never output a diff.\n\n3. VERIFY: Use read_file to confirm the file was written correctly.\n\n4. BUILD/RUN: Use shell_exec to build or run the code and confirm it works. Include the exact output.\n\n5. HAND OFF: When done, write HANDOFF TO TESTER on its own line with a summary of which files changed and what was added.\n\nRULES:\n- Never describe a change without making it with write_file.\n- Never claim success without showing real tool output.\n- If a tool call fails, handle the error and try again.", + "Instructions": "You are an expert software developer with access to filesystem, shell, and git tools.\n\nThe Planner has already explored the codebase and produced a brief. Read it before doing anything else.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ THE BRIEF: The Planner's brief is in the conversation context. Note the goal, files to change, files to read for context, acceptance criteria, and constraints. If the Tester has reported BUGS FOUND, read that report carefully — those are your specific fix targets.\n\n2. READ CONTEXT FILES: Use read_file on the files the Planner listed under 'Files to read for context'. Do not explore beyond what is listed unless a read reveals an unexpected dependency.\n\n3. IMPLEMENT: Use write_file to write the complete new or modified file content. Do not describe what you would write — write it. Never output a diff.\n\n4. VERIFY: Use read_file to confirm the file was written correctly.\n\n5. BUILD/RUN: Use shell_exec to build or run the code and confirm it works. Include the exact output.\n\n6. HAND OFF: Write HANDOFF TO TESTER on its own line with a summary of which files changed and what was added or fixed.\n\nRULES:\n- Never describe a change without making it with write_file.\n- Never claim success without showing real tool output.\n- If a tool call fails, handle the error and try again.\n- If returning from a Tester bug report, address every FAIL item explicitly.", "Model": { "ModelId": "grok-4-1-fast-non-reasoning", "Endpoint": "https://api.x.ai/v1", @@ -18,8 +30,8 @@ }, { "Name": "Tester", - "Description": "QA engineer who independently verifies changes with real tool calls.", - "Instructions": "You are an expert QA engineer. DO NOT trust the Developer's account — verify everything independently.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ THE FILES: Use read_file on every file the Developer claimed to change. Confirm the changes are present. If not, report BLOCKED: changes not written.\n\n2. RUN EXISTING TESTS: Use shell_exec to run the existing test suite. Paste the exact stdout/stderr output.\n\n3. TEST THE NEW BEHAVIOUR: Exercise the new feature end-to-end. If it requires a project context, configuration, or data fixture — create it using write_file and shell_exec first. NEVER accept a guard-clause error (like 'file not found' or 'missing config') as proof the feature works. Set up the environment and run the real code path.\n\n4. HAND OFF: Write HANDOFF TO REVIEWER on its own line with the actual test output pasted verbatim.\n\nRULES:\n- Never claim a test passed without pasting real shell_exec output.\n- Never hand off after only triggering an early-exit guard — that means you did not test the feature.", + "Description": "QA engineer who independently verifies changes with real tool calls and blocks promotion on any failure.", + "Instructions": "You are an expert QA engineer. DO NOT trust the Developer's account — verify everything independently.\n\nThe Planner produced an acceptance criteria list. Every criterion must be verified with real tool output. A single FAIL blocks promotion to Reviewer.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ THE BRIEF: Find the Planner's acceptance criteria in the conversation context. These are your test checklist — every item must pass.\n\n2. READ THE FILES: Use read_file on every file the Developer claimed to change. Confirm the changes are present. If any change is missing, immediately write BUGS FOUND on its own line, list what is missing, and stop.\n\n3. RUN EXISTING TESTS: Use shell_exec to run the existing test suite. Paste the exact stdout/stderr output. If any existing test fails, that is a FAIL.\n\n4. TEST EACH ACCEPTANCE CRITERION: Exercise each criterion end-to-end. If it requires a project context, configuration, or data fixture — create it using write_file and shell_exec first. NEVER accept a guard-clause error (like 'file not found' or 'missing config') as proof the feature works. Set up the environment and run the real code path. Record PASS or FAIL for each criterion with the exact shell output.\n\n5a. IF ALL CRITERIA PASS: Write HANDOFF TO REVIEWER on its own line. Include the full acceptance criteria checklist with PASS next to each item and the shell output pasted verbatim.\n\n5b. IF ANY CRITERION FAILS: Write BUGS FOUND on its own line. List every FAIL item with the exact error output and the file/line where the failure originates. Do not write HANDOFF TO REVIEWER.\n\nRULES:\n- Never claim a test passed without pasting real shell_exec output.\n- Never write HANDOFF TO REVIEWER if any acceptance criterion is FAIL.\n- Never write HANDOFF TO REVIEWER after only triggering an early-exit guard — that is not a real test.", "Model": { "ModelId": "grok-4-1-fast-non-reasoning", "Endpoint": "https://api.x.ai/v1", @@ -30,8 +42,8 @@ }, { "Name": "Reviewer", - "Description": "Tech lead who approves only after reading the code and verifying the test evidence.", - "Instructions": "You are a senior tech lead performing a final review.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ THE CODE: Use read_file on the changed files. Do not rely on summaries.\n\n2. REVIEW for: correctness, consistency with existing patterns, error handling, edge cases, and security.\n\n3. VERIFY THE TESTER'S EVIDENCE: The Tester must have run the feature end-to-end — not just triggered a guard clause. If the Tester's only evidence is an early-exit error or a description rather than real shell output, reject it and tell the Tester to run a proper test.\n\n4. DECIDE: If the code is correct and the testing is real, write APPROVED on its own line followed by a 2-3 sentence summary. If anything needs fixing, give specific actionable feedback naming the file and line.", + "Description": "Tech lead who approves only after reading the code and confirming all acceptance criteria are verified passing.", + "Instructions": "You are a senior tech lead performing a final review.\n\nYou may only write APPROVED if every acceptance criterion from the Planner's brief is marked PASS in the Tester's report AND the Tester provided real shell output for each. No exceptions.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ THE CODE: Use read_file on every changed file. Do not rely on summaries.\n\n2. REVIEW for: correctness, consistency with existing patterns, error handling, edge cases, and security.\n\n3. AUDIT THE TESTER'S EVIDENCE:\n a. Locate the Planner's acceptance criteria list.\n b. Confirm the Tester has a PASS with real shell output for every criterion.\n c. If any criterion is missing, marked FAIL, or backed only by a guard-clause error — reject immediately. Name the missing criterion and instruct the Tester to re-run it properly.\n\n4. DECIDE:\n - If the code is correct AND every acceptance criterion is verified PASS: write APPROVED on its own line followed by a 2-3 sentence summary.\n - If any criterion is unverified or failing: do NOT write APPROVED. Give specific, actionable feedback naming the file and line for code issues, or the criterion name for test gaps. The Developer or Tester must address these before you review again.", "Model": { "ModelId": "grok-4-1-fast-reasoning", "Endpoint": "https://api.x.ai/v1", @@ -43,17 +55,24 @@ ], "Selection": { - "Type": "sequential" + "Type": "llm", + "Model": { + "ModelId": "grok-4-1-fast-non-reasoning", + "Endpoint": "https://api.x.ai/v1", + "ApiKeyEnvVar": "XAI_API_KEY", + "MaxTokens": 512 + }, + "Prompt": "You are a workflow moderator for a software development team. Your only job is to choose which agent acts next. You must enforce a strict quality gate: the Reviewer may only see work after the Tester has verified every acceptance criterion passes.\n\nAgents: {{$agents}}\n\nRouting rules (apply the first rule that matches):\n1. No Planner brief exists yet → choose Planner.\n2. Planner wrote HANDOFF TO DEVELOPER → choose Developer.\n3. Developer wrote HANDOFF TO TESTER → choose Tester.\n4. Tester wrote BUGS FOUND → choose Developer. (Do NOT choose Planner unless the bug report reveals a fundamental scope misunderstanding.)\n5. Tester wrote HANDOFF TO REVIEWER → choose Reviewer.\n6. Reviewer did not write APPROVED and flagged a code issue → choose Developer.\n7. Reviewer did not write APPROVED and flagged a scope or requirements issue → choose Planner.\n8. Reviewer wrote APPROVED → the workflow is complete; choose Reviewer (it will not act again).\n\nHistory:\n{{$history}}\n\nReply with ONLY the agent name. No explanation." }, "Termination": { "Type": "composite", - "MaxIterations": 30, + "MaxIterations": 40, "Strategies": [ { "Type": "regex", "Pattern": "\\bAPPROVED\\b", - "MaxIterations": 30, + "MaxIterations": 40, "AgentNames": ["Reviewer"] } ] From 3c332e2665e2e1aed11d5488f6373745aacec2b4 Mon Sep 17 00:00:00 2001 From: Scott Stauffer Date: Thu, 26 Mar 2026 21:04:57 -0500 Subject: [PATCH 2/3] Remove strict --- config/orchestration.json | 8 ++-- src/Infrastructure/KernelFactory.cs | 57 +++++++++++++++++++++++++- src/Orchestration/AgentOrchestrator.cs | 5 +++ 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/config/orchestration.json b/config/orchestration.json index 5baaa61..f0d2a3c 100644 --- a/config/orchestration.json +++ b/config/orchestration.json @@ -43,7 +43,7 @@ { "Name": "Reviewer", "Description": "Tech lead who approves only after reading the code and confirming all acceptance criteria are verified passing.", - "Instructions": "You are a senior tech lead performing a final review.\n\nYou may only write APPROVED if every acceptance criterion from the Planner's brief is marked PASS in the Tester's report AND the Tester provided real shell output for each. No exceptions.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ THE CODE: Use read_file on every changed file. Do not rely on summaries.\n\n2. REVIEW for: correctness, consistency with existing patterns, error handling, edge cases, and security.\n\n3. AUDIT THE TESTER'S EVIDENCE:\n a. Locate the Planner's acceptance criteria list.\n b. Confirm the Tester has a PASS with real shell output for every criterion.\n c. If any criterion is missing, marked FAIL, or backed only by a guard-clause error — reject immediately. Name the missing criterion and instruct the Tester to re-run it properly.\n\n4. DECIDE:\n - If the code is correct AND every acceptance criterion is verified PASS: write APPROVED on its own line followed by a 2-3 sentence summary.\n - If any criterion is unverified or failing: do NOT write APPROVED. Give specific, actionable feedback naming the file and line for code issues, or the criterion name for test gaps. The Developer or Tester must address these before you review again.", + "Instructions": "You are a senior tech lead performing a final review.\n\nYou may only write APPROVED if every acceptance criterion from the Planner's brief is marked PASS in the Tester's report AND the Tester provided real shell output for each. No exceptions.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ THE CODE: Use read_file on every changed file. Do not rely on summaries.\n\n2. REVIEW for: correctness, consistency with existing patterns, error handling, edge cases, and security.\n\n3. AUDIT THE TESTER'S EVIDENCE:\n a. Locate the Planner's acceptance criteria list.\n b. Confirm the Tester has a PASS with real shell output for every criterion.\n c. If any criterion is missing, marked FAIL, or backed only by a guard-clause error — reject immediately. Name the missing criterion and instruct the Tester to re-run it properly.\n\n4. DECIDE — end your response with exactly one of these keywords on its own line:\n - Code correct AND all acceptance criteria verified PASS: write APPROVED followed by a 2-3 sentence summary.\n - Code or tests need fixing (scope unchanged): write REVISION REQUIRED, then give specific actionable feedback naming file and line.\n - Scope or requirements need rethinking: write REPLAN REQUIRED, then describe what the Planner needs to reconsider.", "Model": { "ModelId": "grok-4-1-fast-reasoning", "Endpoint": "https://api.x.ai/v1", @@ -57,12 +57,12 @@ "Selection": { "Type": "llm", "Model": { - "ModelId": "grok-4-1-fast-non-reasoning", + "ModelId": "grok-4-1-fast-reasoning", "Endpoint": "https://api.x.ai/v1", "ApiKeyEnvVar": "XAI_API_KEY", - "MaxTokens": 512 + "MaxTokens": 128 }, - "Prompt": "You are a workflow moderator for a software development team. Your only job is to choose which agent acts next. You must enforce a strict quality gate: the Reviewer may only see work after the Tester has verified every acceptance criterion passes.\n\nAgents: {{$agents}}\n\nRouting rules (apply the first rule that matches):\n1. No Planner brief exists yet → choose Planner.\n2. Planner wrote HANDOFF TO DEVELOPER → choose Developer.\n3. Developer wrote HANDOFF TO TESTER → choose Tester.\n4. Tester wrote BUGS FOUND → choose Developer. (Do NOT choose Planner unless the bug report reveals a fundamental scope misunderstanding.)\n5. Tester wrote HANDOFF TO REVIEWER → choose Reviewer.\n6. Reviewer did not write APPROVED and flagged a code issue → choose Developer.\n7. Reviewer did not write APPROVED and flagged a scope or requirements issue → choose Planner.\n8. Reviewer wrote APPROVED → the workflow is complete; choose Reviewer (it will not act again).\n\nHistory:\n{{$history}}\n\nReply with ONLY the agent name. No explanation." + "Prompt": "You are a routing agent. Scan the history for the MOST RECENT routing keyword and return the corresponding agent name.\n\nKeyword → Agent:\n HANDOFF TO DEVELOPER → Developer\n HANDOFF TO TESTER → Tester\n HANDOFF TO REVIEWER → Reviewer\n BUGS FOUND → Developer\n REVISION REQUIRED → Developer\n REPLAN REQUIRED → Planner\n APPROVED → Reviewer\n (no keyword found) → Planner\n\nAgents: {{$agents}}\n\nHistory:\n{{$history}}\n\nReply with ONLY the agent name. One word." }, "Termination": { diff --git a/src/Infrastructure/KernelFactory.cs b/src/Infrastructure/KernelFactory.cs index 606174c..ffec08d 100644 --- a/src/Infrastructure/KernelFactory.cs +++ b/src/Infrastructure/KernelFactory.cs @@ -44,7 +44,10 @@ private static HttpClient BuildResilientClient() { var handler = new MessageNameStripHandler { - InnerHandler = new TransientRetryHandler { InnerHandler = new SocketsHttpHandler() } + InnerHandler = new FunctionStrictStripHandler + { + InnerHandler = new TransientRetryHandler { InnerHandler = new SocketsHttpHandler() } + } }; return new HttpClient(handler) { Timeout = TimeSpan.FromMinutes(5) }; } @@ -115,6 +118,58 @@ private static Task Backoff(int attempt, CancellationToken ct) => Task.Delay(TimeSpan.FromSeconds(Math.Pow(2, attempt + 1)), ct); } +/// +/// Strips the strict field from tool function definitions before sending to APIs +/// that don't support it (e.g. xAI). OpenAI SDK 2.x serialises "strict": false +/// on every function definition; providers that don't recognise the field return 400. +/// +internal sealed class FunctionStrictStripHandler : DelegatingHandler +{ + protected override async Task SendAsync( + HttpRequestMessage request, CancellationToken cancellationToken) + { + if (request.Content is not null) + { + var body = await request.Content.ReadAsStringAsync(cancellationToken); + var stripped = StripFunctionStrict(body); + if (!ReferenceEquals(stripped, body)) + { + request.Content = new StringContent(stripped, Encoding.UTF8, + request.Content.Headers.ContentType?.MediaType ?? "application/json"); + } + } + + return await base.SendAsync(request, cancellationToken); + } + + private static string StripFunctionStrict(string json) + { + try + { + var node = JsonNode.Parse(json); + var tools = node?["tools"]?.AsArray(); + if (tools is null) return json; + + bool changed = false; + foreach (var tool in tools) + { + var fn = tool?["function"]?.AsObject(); + if (fn is not null && fn.ContainsKey("strict")) + { + fn.Remove("strict"); + changed = true; + } + } + + return changed ? node!.ToJsonString() : json; + } + catch + { + return json; // pass through unchanged on any parse error + } + } +} + /// /// Strips the name field from non-user messages before sending to APIs /// that only allow name on user role messages (e.g. xAI). diff --git a/src/Orchestration/AgentOrchestrator.cs b/src/Orchestration/AgentOrchestrator.cs index cc35a38..eeba396 100644 --- a/src/Orchestration/AgentOrchestrator.cs +++ b/src/Orchestration/AgentOrchestrator.cs @@ -177,6 +177,11 @@ public async IAsyncEnumerable StreamAsync( throw new BudgetExceededException(cumulativeCost, limit); yield return agentMessage; + + // Guard against SK's MaximumIterations being defeated by conversation compaction. + // Compaction resets history.Count (SK's termination check), but TurnIndex is monotonic. + if (config.Termination.MaxIterations > 0 && agentMessage.TurnIndex >= config.Termination.MaxIterations - 1) + break; } } From af5e301d249184065270616743153df69b5c374d Mon Sep 17 00:00:00 2001 From: Scott Stauffer Date: Thu, 26 Mar 2026 23:37:36 -0500 Subject: [PATCH 3/3] Refined orchestration, added routing --- README.md | 26 ++++++- config/orchestration.json | 31 ++++---- src/Cli/Commands/ValidateConfigCommand.cs | 8 +- src/Core/Models/AgentConfig.cs | 13 ++++ src/Core/Models/StrategyConfig.cs | 22 ++++++ src/Infrastructure/AgentFactory.cs | 25 +++--- .../Plugins/FileSystemPlugin.cs | 72 +++++++++++++++-- src/Infrastructure/Plugins/ProcessHelper.cs | 20 +++-- .../Strategies/KeywordSelectionStrategy.cs | 78 +++++++++++++++++++ .../Strategies/StrategyFactory.cs | 20 ++++- 10 files changed, 276 insertions(+), 39 deletions(-) create mode 100644 src/Orchestration/Strategies/KeywordSelectionStrategy.cs diff --git a/README.md b/README.md index 387c665..4a68036 100644 --- a/README.md +++ b/README.md @@ -141,7 +141,12 @@ Configs live under `config/`. The default is `config/orchestration.json`. ], "Selection": { - "Type": "sequential" + "Type": "keyword", + "DefaultAgent": "Developer", + "Routes": [ + { "Keyword": "HANDOFF TO REVIEWER", "Agent": "Reviewer" }, + { "Keyword": "REVISION REQUIRED", "Agent": "Developer" } + ] }, "Termination": { @@ -214,6 +219,7 @@ Both the `FileSystem` and `Shell` plugins enforce `FileSystemSandboxPath` — an | Type | Description | |------|-------------| | `sequential` | Agents take turns in the order they are defined | +| `keyword` | Deterministic routing: scans the last message for keyword substrings and routes to the corresponding agent. Requires `Routes` array. Falls back to `DefaultAgent` (or the first agent) when no keyword matches. Handles conversation compaction by scanning up to 3 recent messages. | | `llm` | An LLM call picks the next agent each turn. Requires `Prompt` and `Model` | ### Termination strategies @@ -240,6 +246,24 @@ Both the `FileSystem` and `Shell` plugins enforce `FileSystemSandboxPath` — an Add any plugin to an agent by listing its name in the `Plugins` array in the config. +### Agent tool-use enforcement (`FunctionChoice`) + +Each agent has a `FunctionChoice` field (default `"auto"`) that maps directly to `tool_choice` in the OpenAI API: + +| Value | Behaviour | +|-------|-----------| +| `"auto"` | Model may call tools or respond with text (default) | +| `"required"` | Model **must** call at least one tool per message — use this for action agents (Tester, Developer) to prevent the model from fabricating tool output as plain text instead of actually invoking tools | +| `"none"` | Tools are registered in the kernel but the model is blocked from calling them | + +```json +{ + "Name": "Tester", + "FunctionChoice": "required", + "Plugins": ["FileSystem", "Shell", "Search"] +} +``` + --- ## MCP server support diff --git a/config/orchestration.json b/config/orchestration.json index f0d2a3c..681575b 100644 --- a/config/orchestration.json +++ b/config/orchestration.json @@ -31,38 +31,41 @@ { "Name": "Tester", "Description": "QA engineer who independently verifies changes with real tool calls and blocks promotion on any failure.", - "Instructions": "You are an expert QA engineer. DO NOT trust the Developer's account — verify everything independently.\n\nThe Planner produced an acceptance criteria list. Every criterion must be verified with real tool output. A single FAIL blocks promotion to Reviewer.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ THE BRIEF: Find the Planner's acceptance criteria in the conversation context. These are your test checklist — every item must pass.\n\n2. READ THE FILES: Use read_file on every file the Developer claimed to change. Confirm the changes are present. If any change is missing, immediately write BUGS FOUND on its own line, list what is missing, and stop.\n\n3. RUN EXISTING TESTS: Use shell_exec to run the existing test suite. Paste the exact stdout/stderr output. If any existing test fails, that is a FAIL.\n\n4. TEST EACH ACCEPTANCE CRITERION: Exercise each criterion end-to-end. If it requires a project context, configuration, or data fixture — create it using write_file and shell_exec first. NEVER accept a guard-clause error (like 'file not found' or 'missing config') as proof the feature works. Set up the environment and run the real code path. Record PASS or FAIL for each criterion with the exact shell output.\n\n5a. IF ALL CRITERIA PASS: Write HANDOFF TO REVIEWER on its own line. Include the full acceptance criteria checklist with PASS next to each item and the shell output pasted verbatim.\n\n5b. IF ANY CRITERION FAILS: Write BUGS FOUND on its own line. List every FAIL item with the exact error output and the file/line where the failure originates. Do not write HANDOFF TO REVIEWER.\n\nRULES:\n- Never claim a test passed without pasting real shell_exec output.\n- Never write HANDOFF TO REVIEWER if any acceptance criterion is FAIL.\n- Never write HANDOFF TO REVIEWER after only triggering an early-exit guard — that is not a real test.", + "Instructions": "You are an expert QA engineer. DO NOT trust the Developer's account — verify everything independently.\n\n⚠️ ANTI-HALLUCINATION RULE — READ THIS FIRST:\nEvery shell output block you write MUST come from a shell_exec call you actually made in this turn.\nBefore you write any line that looks like `$ command` or command output, ask yourself: 'Did I call shell_exec for this?' If the answer is no — STOP. Call shell_exec first. Never write what you think the output should be. The Reviewer will independently re-run your tests; fabricated output will be caught and you will be routed back here.\n\nThe Planner produced an acceptance criteria list. Every criterion must be verified with real tool output. A single FAIL blocks promotion to Reviewer.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ THE BRIEF: Find the Planner's acceptance criteria in the conversation context. These are your test checklist — every item must pass.\n\n2. READ THE FILES: Use read_file on every file the Developer claimed to change. Confirm the changes are present. If any change is missing, immediately write BUGS FOUND on its own line, list what is missing, and stop.\n\n3. RUN EXISTING TESTS: Use shell_exec to run the existing test suite. Paste the exact stdout/stderr output. If any existing test fails, that is a FAIL.\n\n4. TEST EACH ACCEPTANCE CRITERION:\n - For each criterion, call shell_exec (or write_file + shell_exec) to exercise it end-to-end.\n - If the feature requires a project context or data fixture, create it with write_file and shell_exec before running the test.\n - NEVER accept a guard-clause error as proof the feature works. Set up the environment and run the real code path.\n - Record PASS or FAIL for each criterion with the exact shell_exec output — not a summary, the raw output.\n\n5a. IF ALL CRITERIA PASS: Write HANDOFF TO REVIEWER on its own line. Include the full checklist with PASS per item and verbatim shell output for each.\n\n5b. IF ANY CRITERION FAILS: Write BUGS FOUND on its own line. List every FAIL item with the exact error and the file/line where it originates. Do not write HANDOFF TO REVIEWER.\n\nRULES:\n- NEVER write output for a shell command you did not call with shell_exec.\n- NEVER write HANDOFF TO REVIEWER unless every criterion has a real shell_exec output showing PASS.\n- NEVER write HANDOFF TO REVIEWER after only a guard-clause error.", "Model": { - "ModelId": "grok-4-1-fast-non-reasoning", + "ModelId": "grok-4-1-fast-reasoning", "Endpoint": "https://api.x.ai/v1", "ApiKeyEnvVar": "XAI_API_KEY", - "MaxTokens": 16384 + "MaxTokens": 8192 }, + "FunctionChoice": "required", "Plugins": ["FileSystem", "Shell", "Search"] }, { "Name": "Reviewer", - "Description": "Tech lead who approves only after reading the code and confirming all acceptance criteria are verified passing.", - "Instructions": "You are a senior tech lead performing a final review.\n\nYou may only write APPROVED if every acceptance criterion from the Planner's brief is marked PASS in the Tester's report AND the Tester provided real shell output for each. No exceptions.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ THE CODE: Use read_file on every changed file. Do not rely on summaries.\n\n2. REVIEW for: correctness, consistency with existing patterns, error handling, edge cases, and security.\n\n3. AUDIT THE TESTER'S EVIDENCE:\n a. Locate the Planner's acceptance criteria list.\n b. Confirm the Tester has a PASS with real shell output for every criterion.\n c. If any criterion is missing, marked FAIL, or backed only by a guard-clause error — reject immediately. Name the missing criterion and instruct the Tester to re-run it properly.\n\n4. DECIDE — end your response with exactly one of these keywords on its own line:\n - Code correct AND all acceptance criteria verified PASS: write APPROVED followed by a 2-3 sentence summary.\n - Code or tests need fixing (scope unchanged): write REVISION REQUIRED, then give specific actionable feedback naming file and line.\n - Scope or requirements need rethinking: write REPLAN REQUIRED, then describe what the Planner needs to reconsider.", + "Description": "Tech lead who approves only after reading the code, spot-checking with shell, and confirming all acceptance criteria are verified passing.", + "Instructions": "You are a senior tech lead performing a final review.\n\nYou may only write APPROVED if every acceptance criterion from the Planner's brief is marked PASS in the Tester's report AND the Tester provided real shell output for each. No exceptions.\n\nFOLLOW THESE STEPS IN ORDER:\n\n1. READ THE CODE: Use read_file on every changed file. Do not rely on summaries.\n\n2. REVIEW for: correctness, consistency with existing patterns, error handling, edge cases, and security.\n\n3. SPOT-CHECK WITH SHELL: Pick the most critical acceptance criterion (usually 'does the code execute without errors'). Re-run it yourself using shell_exec. If it fails, write REVISION REQUIRED immediately — do not proceed to step 4.\n\n4. AUDIT THE TESTER'S EVIDENCE:\n a. Locate the Planner's acceptance criteria list.\n b. Confirm the Tester has a PASS with real shell output for every criterion.\n c. Check that the Tester's shell output is consistent with the code you read and your own spot-check in step 3. Inconsistencies (output looks invented, or contradicts actual file contents) are grounds for immediate rejection.\n d. If any criterion is missing, marked FAIL, backed only by a guard-clause error, or looks fabricated — write REVISION REQUIRED immediately, name the criterion, and explain what the Tester must re-run.\n\n5. DECIDE — end your response with exactly one of these keywords on its own line:\n - Code correct AND all acceptance criteria verified PASS: write APPROVED followed by a 2-3 sentence summary.\n - Code or tests need fixing (scope unchanged): write REVISION REQUIRED, then give specific actionable feedback naming file and line.\n - Scope or requirements need rethinking: write REPLAN REQUIRED, then describe what the Planner needs to reconsider.", "Model": { "ModelId": "grok-4-1-fast-reasoning", "Endpoint": "https://api.x.ai/v1", "ApiKeyEnvVar": "XAI_API_KEY", "MaxTokens": 8192 }, - "Plugins": ["FileSystem", "Search"] + "Plugins": ["FileSystem", "Shell", "Search"] } ], "Selection": { - "Type": "llm", - "Model": { - "ModelId": "grok-4-1-fast-reasoning", - "Endpoint": "https://api.x.ai/v1", - "ApiKeyEnvVar": "XAI_API_KEY", - "MaxTokens": 128 - }, - "Prompt": "You are a routing agent. Scan the history for the MOST RECENT routing keyword and return the corresponding agent name.\n\nKeyword → Agent:\n HANDOFF TO DEVELOPER → Developer\n HANDOFF TO TESTER → Tester\n HANDOFF TO REVIEWER → Reviewer\n BUGS FOUND → Developer\n REVISION REQUIRED → Developer\n REPLAN REQUIRED → Planner\n APPROVED → Reviewer\n (no keyword found) → Planner\n\nAgents: {{$agents}}\n\nHistory:\n{{$history}}\n\nReply with ONLY the agent name. One word." + "Type": "keyword", + "DefaultAgent": "Planner", + "Routes": [ + { "Keyword": "HANDOFF TO DEVELOPER", "Agent": "Developer" }, + { "Keyword": "HANDOFF TO TESTER", "Agent": "Tester" }, + { "Keyword": "HANDOFF TO REVIEWER", "Agent": "Reviewer" }, + { "Keyword": "BUGS FOUND", "Agent": "Developer" }, + { "Keyword": "REVISION REQUIRED", "Agent": "Developer" }, + { "Keyword": "REPLAN REQUIRED", "Agent": "Planner" } + ] }, "Termination": { diff --git a/src/Cli/Commands/ValidateConfigCommand.cs b/src/Cli/Commands/ValidateConfigCommand.cs index 6098eac..74e5f1c 100644 --- a/src/Cli/Commands/ValidateConfigCommand.cs +++ b/src/Cli/Commands/ValidateConfigCommand.cs @@ -98,6 +98,9 @@ public override int Execute(CommandContext context, ValidateConfigSettings setti else if (string.IsNullOrEmpty(Environment.GetEnvironmentVariable(agent.Model.ApiKeyEnvVar))) issues.Add(("warning", $"Agent '{agent.Name}': Env var '{agent.Model.ApiKeyEnvVar}' is not set in this shell.")); + if (agent.FunctionChoice.ToLowerInvariant() is not ("auto" or "required" or "none")) + issues.Add(("error", $"Agent '{agent.Name}': FunctionChoice '{agent.FunctionChoice}' is invalid. Valid values: auto, required, none.")); + if (settings.Strict) { var registered = pluginRegistry.RegisteredPlugins @@ -111,12 +114,15 @@ public override int Execute(CommandContext context, ValidateConfigSettings setti // Selection strategy var selType = config.Selection.Type.ToLowerInvariant(); - if (selType is not ("sequential" or "roundrobin" or "llm")) + if (selType is not ("sequential" or "roundrobin" or "llm" or "keyword")) issues.Add(("error", $"Unknown selection type: '{config.Selection.Type}'.")); if (selType == "llm" && config.Selection.Model is null) issues.Add(("error", "LLM selection requires Selection.Model to be set.")); + if (selType == "keyword" && (config.Selection.Routes is null || config.Selection.Routes.Count == 0)) + issues.Add(("error", "Keyword selection requires at least one entry in Routes.")); + // Termination strategy ValidateTermination(config.Termination, config.Agents, issues); diff --git a/src/Core/Models/AgentConfig.cs b/src/Core/Models/AgentConfig.cs index 9392e4e..8e1dabc 100644 --- a/src/Core/Models/AgentConfig.cs +++ b/src/Core/Models/AgentConfig.cs @@ -31,4 +31,17 @@ public record AgentConfig /// public List Plugins { get; init; } = []; + /// + /// Controls how the model uses tools each turn. + /// + /// auto (default): the model may call tools or respond with text. + /// required: the model MUST call at least one tool per message. Use this + /// for action agents (Tester, Developer) to prevent the model from fabricating tool + /// output as plain text instead of actually invoking the tools. + /// none: tools are registered but the model is not allowed to call them. + /// + /// Maps to tool_choice in the OpenAI API. + /// + public string FunctionChoice { get; init; } = "auto"; + } diff --git a/src/Core/Models/StrategyConfig.cs b/src/Core/Models/StrategyConfig.cs index bfa06c5..0442c71 100644 --- a/src/Core/Models/StrategyConfig.cs +++ b/src/Core/Models/StrategyConfig.cs @@ -10,6 +10,7 @@ public record SelectionStrategyConfig /// /// sequential: round-robin through agents in order (default). /// llm: an LLM call picks the next agent each turn. + /// keyword: deterministic routing based on keywords in the last message. /// /// public string Type { get; init; } = "sequential"; @@ -24,6 +25,27 @@ public record SelectionStrategyConfig /// Model config for the LLM-based selection strategy. /// public ModelConfig? Model { get; init; } + + /// + /// Routing rules for the keyword strategy. Evaluated in order; first match wins. + /// + public List? Routes { get; init; } + + /// + /// Default agent name for the keyword strategy when no keyword matches. + /// Defaults to the first agent in the config. + /// + public string? DefaultAgent { get; init; } +} + +/// A single keyword → agent routing rule. +public record KeywordRoute +{ + /// Substring to search for in the last message (case-insensitive). + public string Keyword { get; init; } = string.Empty; + + /// Agent name to route to when is found. + public string Agent { get; init; } = string.Empty; } /// diff --git a/src/Infrastructure/AgentFactory.cs b/src/Infrastructure/AgentFactory.cs index 255b5f9..a9a45b2 100644 --- a/src/Infrastructure/AgentFactory.cs +++ b/src/Infrastructure/AgentFactory.cs @@ -31,7 +31,7 @@ public ChatCompletionAgent Create(AgentConfig config) $"Registered plugins: {string.Join(", ", pluginRegistry.RegisteredPlugins)}"); } - var executionSettings = BuildExecutionSettings(config.Model); + var executionSettings = BuildExecutionSettings(config); return new ChatCompletionAgent { @@ -43,20 +43,27 @@ public ChatCompletionAgent Create(AgentConfig config) }; } - private static OpenAIPromptExecutionSettings BuildExecutionSettings(ModelConfig model) + private static OpenAIPromptExecutionSettings BuildExecutionSettings(AgentConfig agent) { var settings = new OpenAIPromptExecutionSettings { - // Required for SK to include the tools array in the API request and - // auto-invoke kernel functions when the model returns tool calls. - FunctionChoiceBehavior = FunctionChoiceBehavior.Auto() + // Controls whether the model must call tools, may call tools, or is blocked from + // calling tools. "required" is the most important value: it sends tool_choice=required + // to the API, which forces the model into tool-calling mode every turn and prevents it + // from fabricating tool output as plain text. + FunctionChoiceBehavior = agent.FunctionChoice.ToLowerInvariant() switch + { + "required" => FunctionChoiceBehavior.Required(), + "none" => FunctionChoiceBehavior.None(), + _ => FunctionChoiceBehavior.Auto() // "auto" or anything unrecognised + } }; - if (model.Temperature.HasValue) - settings.Temperature = model.Temperature.Value; + if (agent.Model.Temperature.HasValue) + settings.Temperature = agent.Model.Temperature.Value; - if (model.MaxTokens > 0) - settings.MaxTokens = model.MaxTokens; + if (agent.Model.MaxTokens > 0) + settings.MaxTokens = agent.Model.MaxTokens; return settings; } diff --git a/src/Infrastructure/Plugins/FileSystemPlugin.cs b/src/Infrastructure/Plugins/FileSystemPlugin.cs index b1142be..6b06249 100644 --- a/src/Infrastructure/Plugins/FileSystemPlugin.cs +++ b/src/Infrastructure/Plugins/FileSystemPlugin.cs @@ -21,8 +21,12 @@ public FileSystemPlugin(string? sandboxRoot = null) _sandboxRoot = sandboxRoot is not null ? Path.GetFullPath(sandboxRoot) : null; } + // Max characters returned by read_file. Large enough for any real source file; prevents + // binary files and generated assets from flooding the agent context with megabytes of data. + private const int ReadFileSizeLimit = 80_000; + [KernelFunction("read_file")] - [Description("Reads the full text content of a file.")] + [Description("Reads the text content of a file. Returns up to 80,000 characters; larger files are truncated with a notice. Binary files are rejected.")] public async Task ReadFileAsync([Description("Absolute or relative path to the file.")] string path) { var denial = ResolveSafe(path, out var resolved); @@ -31,7 +35,47 @@ public async Task ReadFileAsync([Description("Absolute or relative path if (!File.Exists(resolved)) return PluginResult.Error($"File not found: {resolved}"); - return await File.ReadAllTextAsync(resolved); + // Reject binary files early by sniffing the first 8 KB for null bytes. + // Reading a binary (executable, .so, compiled artifact) as text would dump + // megabytes of garbage into the agent context. + using (var probe = File.OpenRead(resolved)) + { + var buf = new byte[Math.Min(8192, probe.Length)]; + int read = await probe.ReadAsync(buf); + if (Array.IndexOf(buf, (byte)0, 0, read) >= 0) + return PluginResult.Error( + $"File '{resolved}' appears to be binary and cannot be read as text. " + + $"If you need to inspect it, use shell_run with 'file', 'strings', or 'xxd'."); + } + + // Read the file with a size cap so enormous files (generated JSON, lock files, etc.) + // don't flood the context. Use a StreamReader so we avoid loading the full file into + // memory before we can truncate it. + var sb = new System.Text.StringBuilder(Math.Min(ReadFileSizeLimit, (int)new FileInfo(resolved).Length)); + using var reader = new StreamReader(resolved); + + var charBuf = new char[4096]; + int totalRead = 0; + int charsRead; + + while ((charsRead = await reader.ReadAsync(charBuf, 0, charBuf.Length)) > 0) + { + var available = Math.Min(charsRead, ReadFileSizeLimit - totalRead); + sb.Append(charBuf, 0, available); + totalRead += available; + if (totalRead >= ReadFileSizeLimit) break; + } + + var content = sb.ToString(); + + if (!reader.EndOfStream) + { + var totalBytes = new FileInfo(resolved).Length; + content += $"\n\n[TRUNCATED — file is {totalBytes:N0} bytes; only the first {ReadFileSizeLimit:N0} characters are shown. " + + $"Use shell_run with 'tail -n +N {resolved}' to read further into the file.]"; + } + + return content; } [KernelFunction("write_file")] @@ -52,10 +96,10 @@ public async Task WriteFileAsync( } [KernelFunction("list_files")] - [Description("Lists files matching an optional pattern in a directory (recursive).")] + [Description("Lists files matching an optional pattern in a directory (recursive). Returns at most 500 paths; use a more specific pattern to narrow results.")] public string ListFiles( [Description("Directory to list.")] string directory, - [Description("Glob pattern, e.g. '*.cs'. Defaults to '*'.")] string pattern = "*") + [Description("Glob pattern, e.g. '*.cs', '*.kiwi'. Defaults to '*'.")] string pattern = "*") { var denial = ResolveSafe(directory, out var resolved); if (denial is not null) return denial; @@ -63,10 +107,22 @@ public string ListFiles( if (!Directory.Exists(resolved)) return PluginResult.Error($"Directory not found: {resolved}"); - var files = Directory.GetFiles(resolved, pattern, SearchOption.AllDirectories); - return files.Length == 0 - ? PluginResult.Info("No files matched.") - : string.Join("\n", files); + const int maxFiles = 500; + var files = Directory.EnumerateFiles(resolved, pattern, SearchOption.AllDirectories) + .Take(maxFiles + 1) + .ToList(); + + if (files.Count == 0) + return PluginResult.Info("No files matched."); + + var truncated = files.Count > maxFiles; + if (truncated) files.RemoveAt(files.Count - 1); + + var result = string.Join("\n", files); + if (truncated) + result += $"\n\n[TRUNCATED — only first {maxFiles} files shown. Use a more specific pattern to narrow results.]"; + + return result; } [KernelFunction("delete_file")] diff --git a/src/Infrastructure/Plugins/ProcessHelper.cs b/src/Infrastructure/Plugins/ProcessHelper.cs index 235a799..df48154 100644 --- a/src/Infrastructure/Plugins/ProcessHelper.cs +++ b/src/Infrastructure/Plugins/ProcessHelper.cs @@ -100,12 +100,22 @@ public string ToPluginOutput() var stderr = Stderr.TrimEnd(); if (Succeeded) - return string.IsNullOrEmpty(stdout) ? PluginResult.Ok("Command completed with no output.") : stdout; + { + // Always surface stderr even on success — some interpreters (e.g. kiwi, node) + // print errors to stderr and exit 0, which would otherwise look like a clean run. + if (string.IsNullOrEmpty(stdout) && string.IsNullOrEmpty(stderr)) + return PluginResult.Ok("Command completed with no output."); + + var parts = new List(); + if (!string.IsNullOrEmpty(stdout)) parts.Add(stdout); + if (!string.IsNullOrEmpty(stderr)) parts.Add($"[stderr] {stderr}"); + return string.Join("\n", parts); + } - var parts = new List { $"[EXIT {ExitCode}]" }; - if (!string.IsNullOrEmpty(stdout)) parts.Add(stdout); - if (!string.IsNullOrEmpty(stderr)) parts.Add($"[stderr] {stderr}"); - return string.Join("\n", parts); + var failParts = new List { $"[EXIT {ExitCode}]" }; + if (!string.IsNullOrEmpty(stdout)) failParts.Add(stdout); + if (!string.IsNullOrEmpty(stderr)) failParts.Add($"[stderr] {stderr}"); + return string.Join("\n", failParts); } } diff --git a/src/Orchestration/Strategies/KeywordSelectionStrategy.cs b/src/Orchestration/Strategies/KeywordSelectionStrategy.cs new file mode 100644 index 0000000..0f122e5 --- /dev/null +++ b/src/Orchestration/Strategies/KeywordSelectionStrategy.cs @@ -0,0 +1,78 @@ +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.Agents; +using Microsoft.SemanticKernel.Agents.Chat; +using Microsoft.SemanticKernel.ChatCompletion; + +namespace fuseraft.Orchestration.Strategies; + +/// +/// Deterministic keyword-based agent selection strategy. +/// Scans agent text responses (newest-first) for keyword substrings and routes +/// to the corresponding agent. First match wins. Falls back to +/// when no keyword matches. +/// +/// This is more reliable than LLM-based selection for large histories because it +/// does not depend on an LLM call and is immune to context-length issues. +/// +/// Only agent text messages are considered — tool call/result messages are skipped +/// so they cannot consume the lookback window and hide routing keywords. +/// +public sealed class KeywordSelectionStrategy : SelectionStrategy +{ + private readonly IReadOnlyList<(string Keyword, string AgentName)> _routes; + private readonly string _defaultAgentName; + + // How many agent text messages to look back through when scanning for routing keywords. + // Counts only real agent responses (not tool calls or tool results), so this window is + // not inflated by interleaved tool call/result messages in the SK history. + private const int AgentMessageLookback = 3; + + public KeywordSelectionStrategy( + IReadOnlyList<(string Keyword, string AgentName)> routes, + string defaultAgentName) + { + _routes = routes; + _defaultAgentName = defaultAgentName; + } + + protected override Task SelectAgentAsync( + IReadOnlyList agents, + IReadOnlyList history, + CancellationToken cancellationToken = default) + { + // Scan agent text messages newest-first, skipping tool call/result messages. + // Tool call messages have Role=Assistant but null/empty Content; tool result + // messages have Role=Tool. Both are excluded so they don't consume the lookback + // window and prevent routing keywords from being found. + int scanned = 0; + for (int i = history.Count - 1; i >= 0 && scanned < AgentMessageLookback; i--) + { + var msg = history[i]; + + // Skip tool call/result messages: tool results have Role=Tool; tool call + // frames are assistant messages with no text content. + if (msg.Role == AuthorRole.Tool) continue; + var content = msg.Content; + if (string.IsNullOrEmpty(content)) continue; + + scanned++; + + foreach (var (keyword, agentName) in _routes) + { + if (content.Contains(keyword, StringComparison.OrdinalIgnoreCase)) + { + var matched = agents.FirstOrDefault( + a => string.Equals(a.Name, agentName, StringComparison.OrdinalIgnoreCase)); + if (matched is not null) + return Task.FromResult(matched); + } + } + } + + var defaultAgent = agents.FirstOrDefault( + a => string.Equals(a.Name, _defaultAgentName, StringComparison.OrdinalIgnoreCase)) + ?? agents[0]; + + return Task.FromResult(defaultAgent); + } +} diff --git a/src/Orchestration/Strategies/StrategyFactory.cs b/src/Orchestration/Strategies/StrategyFactory.cs index f07e0da..e7cb487 100644 --- a/src/Orchestration/Strategies/StrategyFactory.cs +++ b/src/Orchestration/Strategies/StrategyFactory.cs @@ -19,8 +19,9 @@ public SelectionStrategy CreateSelection(SelectionStrategyConfig config, IReadOn { "sequential" or "roundrobin" => new SequentialSelectionStrategy(), "llm" => CreateLLMSelection(config, agents), + "keyword" => CreateKeywordSelection(config, agents), _ => throw new NotSupportedException( - $"Unknown selection strategy type: '{config.Type}'. Valid: sequential, llm.") + $"Unknown selection strategy type: '{config.Type}'. Valid: sequential, llm, keyword.") }; } @@ -54,6 +55,23 @@ private KernelFunctionSelectionStrategy CreateLLMSelection(SelectionStrategyConf }; } + private static KeywordSelectionStrategy CreateKeywordSelection( + SelectionStrategyConfig config, + IReadOnlyList agents) + { + if (config.Routes is not { Count: > 0 }) + throw new InvalidOperationException("Keyword selection strategy requires at least one entry in 'Routes'."); + + var routes = config.Routes + .Select(r => (r.Keyword, r.Agent)) + .ToList(); + + var defaultAgent = config.DefaultAgent + ?? (agents.Count > 0 ? agents[0].Name! : throw new InvalidOperationException("No agents defined.")); + + return new KeywordSelectionStrategy(routes, defaultAgent); + } + // Termination public TerminationStrategy CreateTermination(TerminationStrategyConfig config, IReadOnlyList agents)