amd · kovtcharov · Mar 18, 2026 · Mar 18, 2026 · Mar 18, 2026 · Mar 18, 2026
@@ -13,6 +13,9 @@ on:
     branches: ["main"]
     paths:
       - 'src/gaia/eval/**'
+      - 'eval/scenarios/**'
+      - 'eval/corpus/**'
+      - 'eval/prompts/**'
       - 'tests/test_eval.py'
       - 'setup.py'
       - '.github/workflows/test_eval.yml'
@@ -21,6 +24,9 @@ on:
     types: [opened, synchronize, reopened, ready_for_review]
     paths:
       - 'src/gaia/eval/**'
+      - 'eval/scenarios/**'
+      - 'eval/corpus/**'
+      - 'eval/prompts/**'
       - 'tests/test_eval.py'
       - 'setup.py'
       - '.github/workflows/test_eval.yml'
@@ -79,6 +85,7 @@ jobs:
           node-version: '18'
 
       - name: Test webapp functionality
+        shell: pwsh
         run: |
           cd src/gaia/eval/webapp
           # Install dependencies
@@ -88,19 +95,21 @@ jobs:
           # Test that server can start (Windows-compatible version)
           $env:PORT = 3456  # Use non-default port to avoid conflicts
           $process = Start-Process node -ArgumentList "server.js" -PassThru -ErrorAction Stop
-          Start-Sleep -Seconds 3
-          if ($process.HasExited) {
-            Write-Error "Server failed to start or crashed immediately"
-            exit 1
-          }
-          # Try to connect to the server
           try {
+            Start-Sleep -Seconds 3
+            if ($process.HasExited) {
+              Write-Error "Server failed to start or crashed immediately"
+              exit 1
+            }
+            # Probe the server over HTTP; any error raised in this try block (early exit or failed request) is reported by catch
             $response = Invoke-WebRequest -Uri "http://localhost:3456" -TimeoutSec 5 -UseBasicParsing
             Write-Output "Server responded with status: $($response.StatusCode)"
+            Write-Output "Webapp server test passed"
           } catch {
-            Write-Error "Server did not respond to HTTP request"
-            Stop-Process -Id $process.Id -Force -ErrorAction SilentlyContinue
+            Write-Error "Webapp server test failed: $_"
             exit 1
+          } finally {
+            if (-not $process.HasExited) {
+              Stop-Process -Id $process.Id -Force -ErrorAction SilentlyContinue
+            }
           }
-          Stop-Process -Id $process.Id -Force
-          Write-Output "Webapp server test passed"
@@ -227,4 +227,7 @@ docs/playbooks/sd-agent/index-backup.mdx
 .claude/settings.local.json
 
 # Custom util scripts
-util/custom/*
+util/custom/*
+
+# Real-world eval corpus documents (sourced from public web, not committed)
+eval/corpus/real_world/
@@ -254,18 +254,18 @@ gaia/
 
 | Agent | Location | Description | Default Model |
 |-------|----------|-------------|---------------|
-| **ChatAgent** | `agents/chat/agent.py` | Document Q&A with RAG | Qwen3-Coder-30B |
-| **CodeAgent** | `agents/code/agent.py` | Code generation with orchestration | Qwen3-Coder-30B |
-| **JiraAgent** | `agents/jira/agent.py` | Jira issue management | Qwen3-Coder-30B |
-| **BlenderAgent** | `agents/blender/agent.py` | 3D scene automation | Qwen3-Coder-30B |
-| **DockerAgent** | `agents/docker/agent.py` | Container management | Qwen3-Coder-30B |
+| **ChatAgent** | `agents/chat/agent.py` | Document Q&A with RAG | Qwen3.5-35B |
+| **CodeAgent** | `agents/code/agent.py` | Code generation with orchestration | Qwen3.5-35B |
+| **JiraAgent** | `agents/jira/agent.py` | Jira issue management | Qwen3.5-35B |
+| **BlenderAgent** | `agents/blender/agent.py` | 3D scene automation | Qwen3.5-35B |
+| **DockerAgent** | `agents/docker/agent.py` | Container management | Qwen3.5-35B |
 | **MedicalIntakeAgent** | `agents/emr/agent.py` | Medical form processing | Qwen3-VL-4B (VLM) |
-| **RoutingAgent** | `agents/routing/agent.py` | Intelligent agent selection | Qwen3-Coder-30B |
+| **RoutingAgent** | `agents/routing/agent.py` | Intelligent agent selection | Qwen3.5-35B |
 | **SDAgent** | `agents/sd/agent.py` | Stable Diffusion image generation | SDXL-Turbo |
 
 ### Default Models
 - General tasks: `Qwen3-0.6B-GGUF`
-- Code/Agents: `Qwen3-Coder-30B-A3B-Instruct-GGUF`
+- Code/Agents: `Qwen3.5-35B-A3B-GGUF`
 - Vision tasks: `Qwen3-VL-4B-Instruct-GGUF`
 
 ## CLI Commands
@@ -530,3 +530,9 @@ Specialized agents are available in `.claude/agents/` for specific tasks (23 age
 - **ui-ux-designer** (opus) - User-centered design, accessibility
 
 When invoking a proactive agent from `.claude/agents/`, indicate which agent you are using in your response.
+
+## Learned Skills
+
+**Read these before starting related tasks:**
+
+- `.claude/skills/gaia-eval-benchmark.md` - How to run and audit the GAIA Agent UI eval benchmark, and how to judge when its results can be trusted; covers RAG cache integrity, response rendering bugs, eval judge leniency, and MCP session inspection (tags: eval, rag, mcp, gaia-agent-ui, debugging, hallucination, ci-cd, testing)
@@ -300,7 +300,8 @@
                 "group": "Evaluation Framework",
                 "pages": [
                   "reference/eval",
-                  "reference/eval/fix-code-testbench"
+                  "reference/eval/fix-code-testbench",
+                  "eval"
                 ]
               },
               "reference/dependency-management",