From 1b4abfa8cb8fac8ccd83092f1353d63a11366e09 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Mon, 16 Mar 2026 12:15:14 -0400 Subject: [PATCH 1/4] feat(cpp): add performance benchmarks and binary size tracking (#358) - Add benchmark_cpp.yml workflow: binary size, startup time, loop latency, memory footprint - Cache baseline on main pushes; compare on every PR with per-metric thresholds (10% binary, 15% other) - Fix cache path mismatch: restore and rename before fresh run overwrites results - Fix build_cpp.yml push/PR paths to include both workflow files - Add timeout-minutes: 15 to benchmark job - Add initCalled_ guard in BenchAgent to prevent duplicate tool registration - Add MISSING metric detection in compareAndReport (catches crashed benchmarks) - Add Timer::elapsedUs() guard against unmatched stop() - Move GAIA_BUILD_BENCHMARKS option to top-level options block in CMakeLists.txt - Ignore cpp/benchmark-*.json in .gitignore (ephemeral CI artifacts) --- .github/workflows/benchmark_cpp.yml | 153 +++++++++++++ .github/workflows/build_cpp.yml | 17 +- .gitignore | 3 + cpp/CMakeLists.txt | 34 +++ cpp/benchmarks/bench_main.cpp | 335 ++++++++++++++++++++++++++++ cpp/benchmarks/bench_utils.h | 274 +++++++++++++++++++++++ cpp/benchmarks/mock_llm_server.h | 154 +++++++++++++ 7 files changed, 969 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/benchmark_cpp.yml create mode 100644 cpp/benchmarks/bench_main.cpp create mode 100644 cpp/benchmarks/bench_utils.h create mode 100644 cpp/benchmarks/mock_llm_server.h diff --git a/.github/workflows/benchmark_cpp.yml b/.github/workflows/benchmark_cpp.yml new file mode 100644 index 00000000..d6a6af33 --- /dev/null +++ b/.github/workflows/benchmark_cpp.yml @@ -0,0 +1,153 @@ +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT + +# C++ performance benchmark workflow. +# Measures binary size, startup time, loop latency, and memory footprint. 
+# Called from build_cpp.yml after the build job passes. + +name: C++ Benchmarks + +on: + workflow_call: + +permissions: + contents: read + +jobs: + benchmark: + name: C++ Benchmarks (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + timeout-minutes: 15 + + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest] + + steps: + - uses: actions/checkout@v6 + + - name: Install OpenSSL (Linux) + if: runner.os == 'Linux' + run: sudo apt-get install -y libssl-dev + + - name: Install OpenSSL (Windows) + if: runner.os == 'Windows' + run: choco install openssl --no-progress -y + + - name: Restore FetchContent cache + uses: actions/cache@v4 + with: + path: cpp/build-bench/_deps + key: fetchcontent-bench-${{ matrix.os }}-${{ hashFiles('cpp/CMakeLists.txt') }} + + # Restore baseline from last main-branch run (prefix match gets latest) + - name: Restore benchmark baseline + id: restore-baseline + uses: actions/cache/restore@v4 + with: + path: cpp/benchmark-results.json + key: benchmark-baseline-${{ matrix.os }}-dummy + restore-keys: benchmark-baseline-${{ matrix.os }}- + + # Rename restored results to baseline path so the compare step can find it + - name: Rename restored cache to baseline + shell: bash + run: | + if [ -f cpp/benchmark-results.json ]; then + mv cpp/benchmark-results.json cpp/benchmark-baseline.json + fi + + # Build static library + examples + benchmarks + - name: Configure CMake (static + benchmarks) + run: > + cmake -B cpp/build-bench -S cpp + -DCMAKE_BUILD_TYPE=Release + -DGAIA_BUILD_BENCHMARKS=ON + -DGAIA_BUILD_EXAMPLES=ON + -DGAIA_BUILD_TESTS=OFF + -DGAIA_BUILD_INTEGRATION_TESTS=OFF + + - name: Build (static + benchmarks) + run: cmake --build cpp/build-bench --config Release --parallel + + # Build shared library separately for DLL/SO size measurement + - name: Configure CMake (shared library) + run: > + cmake -B cpp/build-bench-shared -S cpp + -DCMAKE_BUILD_TYPE=Release + -DBUILD_SHARED_LIBS=ON + -DGAIA_BUILD_TESTS=OFF + 
-DGAIA_BUILD_EXAMPLES=OFF + -DGAIA_BUILD_BENCHMARKS=OFF + + - name: Build (shared library) + run: cmake --build cpp/build-bench-shared --config Release --parallel + + # Measure sizes and run all benchmarks on Linux + - name: Run benchmarks (Linux) + if: runner.os == 'Linux' + run: | + STATIC=$(stat -c%s cpp/build-bench/libgaia_core.a) + SHARED=$(stat -c%s cpp/build-bench-shared/libgaia_core.so) + EXE=$(stat -c%s cpp/build-bench/security_demo) + echo "Static lib: $STATIC bytes" + echo "Shared lib: $SHARED bytes" + echo "Example exe: $EXE bytes" + cpp/build-bench/gaia_benchmarks \ + --output cpp/benchmark-results.json \ + --static-lib-size "$STATIC" \ + --shared-lib-size "$SHARED" \ + --exe-size "$EXE" + + # Measure sizes and run all benchmarks on Windows + - name: Run benchmarks (Windows) + if: runner.os == 'Windows' + shell: powershell + run: | + $static = (Get-Item "cpp/build-bench/Release/gaia_core.lib").Length + $shared = (Get-Item "cpp/build-bench-shared/Release/gaia_core.dll").Length + $exe = (Get-Item "cpp/build-bench/Release/security_demo.exe").Length + Write-Host "Static lib: $static bytes" + Write-Host "Shared lib: $shared bytes" + Write-Host "Example exe: $exe bytes" + & "cpp/build-bench/Release/gaia_benchmarks.exe" ` + --output "cpp/benchmark-results.json" ` + --static-lib-size $static ` + --shared-lib-size $shared ` + --exe-size $exe + + # Compare current results against baseline (binary: 10%, others: 15%) + - name: Compare against baseline + shell: bash + run: | + if [ -f cpp/benchmark-baseline.json ]; then + echo "Baseline found — running regression check" + if [ "$RUNNER_OS" = "Windows" ]; then + BENCH_EXE="cpp/build-bench/Release/gaia_benchmarks.exe" + else + BENCH_EXE="cpp/build-bench/gaia_benchmarks" + fi + "$BENCH_EXE" \ + --compare \ + --baseline cpp/benchmark-baseline.json \ + --current cpp/benchmark-results.json + else + echo "No baseline found — first run establishes the baseline" + fi + + # Save new baseline only on pushes to main (immutable 
cache: unique key per run) + - name: Save benchmark baseline + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + uses: actions/cache/save@v4 + with: + path: cpp/benchmark-results.json + key: benchmark-baseline-${{ matrix.os }}-${{ github.run_id }} + + # Always upload results as an artifact for inspection + - name: Upload benchmark results + uses: actions/upload-artifact@v6 + if: always() + with: + name: cpp-benchmark-${{ matrix.os }} + path: cpp/benchmark-results.json diff --git a/.github/workflows/build_cpp.yml b/.github/workflows/build_cpp.yml index df59be6f..a02d0751 100644 --- a/.github/workflows/build_cpp.yml +++ b/.github/workflows/build_cpp.yml @@ -14,12 +14,15 @@ on: branches: [ main ] paths: - 'cpp/**' + - '.github/workflows/build_cpp.yml' + - '.github/workflows/benchmark_cpp.yml' pull_request: branches: [ main ] types: [opened, synchronize, reopened, ready_for_review] paths: - 'cpp/**' - '.github/workflows/build_cpp.yml' + - '.github/workflows/benchmark_cpp.yml' merge_group: workflow_dispatch: @@ -330,11 +333,18 @@ jobs: lemonade-server-stderr.log lemonade-server.log + # Performance benchmarks (runs after build passes) + benchmark: + name: C++ Benchmarks + needs: [build-and-test] + if: needs.build-and-test.result == 'success' + uses: ./.github/workflows/benchmark_cpp.yml + # Summary job cpp-build-summary: name: C++ Build Summary runs-on: ubuntu-latest - needs: [build-and-test, install-test, shared-lib-test, integration-test] + needs: [build-and-test, install-test, shared-lib-test, integration-test, benchmark] if: >- ${{ always() && !cancelled() && needs.build-and-test.result != 'cancelled' }} @@ -346,6 +356,7 @@ jobs: echo "Install Test: ${{ needs.install-test.result }}" echo "Shared Lib Test: ${{ needs.shared-lib-test.result }}" echo "Integration Tests: ${{ needs.integration-test.result }}" + echo "Benchmarks: ${{ needs.benchmark.result }}" echo "" if [[ "${{ needs.build-and-test.result }}" == "skipped" ]]; then @@ -363,6 +374,10 @@ 
jobs: if [[ "${{ needs.integration-test.result }}" == "failure" ]]; then echo "::warning::Integration tests failed (STX runner infrastructure issue)" fi + # Benchmarks are non-blocking (regression alerts are warnings only) + if [[ "${{ needs.benchmark.result }}" == "failure" ]]; then + echo "::warning::Benchmark regression detected — review cpp-benchmark-* artifacts" + fi if [[ "$FAILED" == "0" ]]; then echo "All required C++ jobs passed (unit tests, install round-trip, shared library)!" diff --git a/.gitignore b/.gitignore index 54b3c5b1..2c50f5fd 100644 --- a/.gitignore +++ b/.gitignore @@ -215,6 +215,9 @@ cpp/vcpkg_installed/ CMakeFiles/ cpp/build_ssl/ +# C++ benchmark output (ephemeral — stored in CI cache/artifacts, not source control) +cpp/benchmark-*.json + # SD test results and artifacts sd_model_sweep_results/ quick_test_results/ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 254e56c1..649d5deb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -29,6 +29,9 @@ endif() option(GAIA_BUILD_INTEGRATION_TESTS "Build LLM integration tests (requires lemonade-server with Qwen3-4B-GGUF loaded)" OFF) +# Performance benchmarks -- always OFF by default. 
+option(GAIA_BUILD_BENCHMARKS "Build performance benchmarks" OFF) + # --------------------------------------------------------------------------- # Dependencies -- prefer system packages, fall back to FetchContent # --------------------------------------------------------------------------- @@ -238,6 +241,37 @@ if(GAIA_BUILD_INTEGRATION_TESTS) ) endif() +# --------------------------------------------------------------------------- +# Benchmarks +# --------------------------------------------------------------------------- +if(GAIA_BUILD_BENCHMARKS) + add_executable(gaia_benchmarks benchmarks/bench_main.cpp) + target_link_libraries(gaia_benchmarks PRIVATE gaia::gaia_core) + + target_include_directories(gaia_benchmarks PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks + ) + + # httplib is PRIVATE to gaia_core; benchmarks need it for the mock server + if(httplib_FOUND) + target_link_libraries(gaia_benchmarks PRIVATE httplib::httplib) + else() + target_include_directories(gaia_benchmarks SYSTEM PRIVATE + $) + endif() + + # OpenSSL defines must match gaia_core for httplib header consistency + if(OpenSSL_FOUND) + target_compile_definitions(gaia_benchmarks PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT) + target_link_libraries(gaia_benchmarks PRIVATE OpenSSL::SSL OpenSSL::Crypto) + endif() + + # Windows: psapi for GetProcessMemoryInfo + if(WIN32) + target_link_libraries(gaia_benchmarks PRIVATE psapi) + endif() +endif() + # --------------------------------------------------------------------------- # Install + package config # --------------------------------------------------------------------------- diff --git a/cpp/benchmarks/bench_main.cpp b/cpp/benchmarks/bench_main.cpp new file mode 100644 index 00000000..68aaa061 --- /dev/null +++ b/cpp/benchmarks/bench_main.cpp @@ -0,0 +1,335 @@ +// Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +// C++ framework performance benchmarks. 
+// Measures binary size, startup time, loop latency, and memory footprint. +// +// Usage: +// # Run all benchmarks and write results +// gaia_benchmarks --output results.json \ +// --static-lib-size --shared-lib-size --exe-size +// +// # Compare current vs baseline +// gaia_benchmarks --compare --baseline baseline.json --current results.json + +#include "bench_utils.h" +#include "mock_llm_server.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// BenchAgent — minimal agent subclass for benchmarking +// --------------------------------------------------------------------------- + +class BenchAgent : public gaia::Agent { +public: + explicit BenchAgent(const gaia::AgentConfig& config) : gaia::Agent(config) { + // Do NOT call init() here — startup benchmark calls benchInit() explicitly + } + + /// Expose init() for explicit invocation in startup benchmark. + void benchInit() { + if (initCalled_) return; + initCalled_ = true; + init(); + // Silence final-answer output so benchmark iterations don't flood stdout + setOutputHandler(std::make_unique(true)); + } + +protected: + void registerTools() override { + gaia::ToolParameter msgParam; + msgParam.name = "message"; + msgParam.type = gaia::ToolParamType::STRING; + msgParam.description = "Message to echo"; + msgParam.required = true; + + toolRegistry().registerTool( + "echo", + "Echo a message back", + [](const nlohmann::json& args) -> nlohmann::json { + return nlohmann::json{{"echoed", args.value("message", "")}}; + }, + {msgParam}); + } + + std::string getSystemPrompt() const override { return "You are a benchmark agent."; } + +private: + bool initCalled_ = false; +}; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +static gaia::AgentConfig makeConfig(const 
std::string& url) { + gaia::AgentConfig cfg; + cfg.baseUrl = url; + cfg.modelId = ""; // empty → skip ensureModelLoaded() + cfg.maxSteps = 5; + cfg.silentMode = true; // suppress all console output + cfg.debug = false; + return cfg; +} + +// --------------------------------------------------------------------------- +// Benchmark 1: Binary Size +// Records sizes passed via CLI args — actual measurement happens in CI shell. +// --------------------------------------------------------------------------- + +static bench::BenchmarkResult benchStaticLibSize(long bytes) { + return {"binary_size_static_lib_bytes", static_cast(bytes), "bytes"}; +} + +static bench::BenchmarkResult benchSharedLibSize(long bytes) { + return {"binary_size_shared_lib_bytes", static_cast(bytes), "bytes"}; +} + +static bench::BenchmarkResult benchExeSize(long bytes) { + return {"binary_size_example_exe_bytes", static_cast(bytes), "bytes"}; +} + +// --------------------------------------------------------------------------- +// Benchmark 2: Startup Time +// N iterations of: construct BenchAgent → benchInit() → systemPrompt() +// No HTTP calls — modelId is empty, so no ensureModelLoaded(). 
+// --------------------------------------------------------------------------- + +static bench::BenchmarkResult benchStartupTime(int iterations = 100) { + std::cout << " Running startup benchmark (" << iterations << " iterations)...\n"; + + // Use a dummy URL — no HTTP calls will be made (modelId is empty) + const std::string dummyUrl = "http://127.0.0.1:1"; // won't be contacted + + std::vector times; + times.reserve(iterations); + + bench::Timer timer; + for (int i = 0; i < iterations; ++i) { + timer.start(); + { + BenchAgent agent(makeConfig(dummyUrl)); + agent.benchInit(); + (void)agent.systemPrompt(); + } + timer.stop(); + times.push_back(timer.elapsedUs()); + } + + double med = bench::median(times); + std::cout << " Startup median: " << std::fixed << std::setprecision(1) << med << " us\n"; + return {"startup_time_median_us", med, "us"}; +} + +// --------------------------------------------------------------------------- +// Benchmark 3: Loop Latency +// N iterations of processQuery() with a mock server. +// Each call uses a 2-step sequence: tool call → answer. +// History is cleared between iterations so each call is independent. 
+// --------------------------------------------------------------------------- + +static bench::BenchmarkResult benchLoopLatency(int iterations = 50) { + std::cout << " Running loop latency benchmark (" << iterations << " iterations)...\n"; + + bench::MockLlmServer server; + BenchAgent agent(makeConfig(server.baseUrl())); + agent.benchInit(); + agent.setDefaultPolicy(gaia::ToolPolicy::ALLOW); + + std::vector times; + times.reserve(iterations); + + bench::Timer timer; + for (int i = 0; i < iterations; ++i) { + // Queue: tool call first, then answer + server.pushResponse(bench::kToolCall); + server.pushResponse(bench::kDefaultAnswer); + + agent.clearHistory(); + + timer.start(); + agent.processQuery("benchmark"); + timer.stop(); + times.push_back(timer.elapsedUs()); + } + + double med = bench::median(times); + std::cout << " Loop latency median: " << std::fixed << std::setprecision(1) << med + << " us\n"; + return {"loop_latency_median_us", med, "us"}; +} + +// --------------------------------------------------------------------------- +// Benchmark 4: Memory Footprint +// 20 processQuery() calls WITHOUT clearing history (conversation accumulates). +// Measures baseline RSS, peak RSS, and per-step growth. 
+// --------------------------------------------------------------------------- + +static std::vector benchMemoryFootprint(int steps = 20) { + std::cout << " Running memory benchmark (" << steps << " steps)...\n"; + + bench::MockLlmServer server; + BenchAgent agent(makeConfig(server.baseUrl())); + agent.benchInit(); + agent.setDefaultPolicy(gaia::ToolPolicy::ALLOW); + + // Force system prompt computation before measuring baseline + (void)agent.systemPrompt(); + + long baselineKb = bench::MemoryTracker::getCurrentRssKb(); + std::cout << " Baseline RSS: " << baselineKb << " KB\n"; + + long peakKb = baselineKb; + for (int i = 0; i < steps; ++i) { + // Each call returns an answer directly (no tool calls) so history grows + // by one user message + one assistant message per step. + server.pushResponse(bench::kDefaultAnswer); + agent.processQuery("benchmark step " + std::to_string(i)); + + long rss = bench::MemoryTracker::getCurrentRssKb(); + if (rss > peakKb) peakKb = rss; + } + + long finalKb = bench::MemoryTracker::getCurrentRssKb(); + double perStepGrowth = (steps > 0) ? 
static_cast(finalKb - baselineKb) / steps : 0.0; + + std::cout << " Peak RSS: " << peakKb << " KB\n"; + std::cout << " Per-step growth: " << std::fixed << std::setprecision(1) << perStepGrowth + << " KB\n"; + + return { + {"memory_baseline_kb", static_cast(baselineKb), "KB"}, + {"memory_peak_kb", static_cast(peakKb), "KB"}, + {"memory_per_step_growth_kb", perStepGrowth, "KB"}, + }; +} + +// --------------------------------------------------------------------------- +// CLI parsing helpers +// --------------------------------------------------------------------------- + +static std::string getArg(const std::vector& args, const std::string& flag, + const std::string& defaultVal = "") { + for (size_t i = 0; i + 1 < args.size(); ++i) { + if (args[i] == flag) return args[i + 1]; + } + return defaultVal; +} + +static bool hasFlag(const std::vector& args, const std::string& flag) { + for (const auto& a : args) { + if (a == flag) return true; + } + return false; +} + +// --------------------------------------------------------------------------- +// main +// --------------------------------------------------------------------------- + +int main(int argc, char* argv[]) { + // Unset GAIA_CPP_BASE_URL so it does not interfere with benchmarks + // that explicitly set their own baseUrl via AgentConfig. 
+#if defined(_WIN32) + _putenv_s("GAIA_CPP_BASE_URL", ""); +#else + unsetenv("GAIA_CPP_BASE_URL"); +#endif + + std::vector args(argv + 1, argv + argc); + + // ---- Compare mode ---- + if (hasFlag(args, "--compare")) { + std::string baseline = getArg(args, "--baseline"); + std::string current = getArg(args, "--current"); + if (baseline.empty() || current.empty()) { + std::cerr << "Usage: gaia_benchmarks --compare --baseline --current \n"; + return 1; + } + try { + return bench::compareAndReport(baseline, current); + } catch (const std::exception& e) { + std::cerr << "Comparison failed: " << e.what() << "\n"; + return 1; + } + } + + // ---- Benchmark mode ---- + std::string outputPath = getArg(args, "--output", "benchmark-results.json"); + + long staticLibBytes = 0; + long sharedLibBytes = 0; + long exeBytes = 0; + try { + staticLibBytes = std::stol(getArg(args, "--static-lib-size", "0")); + sharedLibBytes = std::stol(getArg(args, "--shared-lib-size", "0")); + exeBytes = std::stol(getArg(args, "--exe-size", "0")); + } catch (const std::exception& e) { + std::cerr << "Error: invalid size argument: " << e.what() << "\n"; + std::cerr << "Usage: gaia_benchmarks --output " + " --static-lib-size --shared-lib-size --exe-size \n"; + return 1; + } + + std::cout << "=== GAIA C++ Performance Benchmarks ===\n\n"; + + std::vector results; + + // Benchmark 1: Binary sizes (from CLI args) + std::cout << "Benchmark 1: Binary Sizes\n"; + results.push_back(benchStaticLibSize(staticLibBytes)); + results.push_back(benchSharedLibSize(sharedLibBytes)); + results.push_back(benchExeSize(exeBytes)); + std::cout << " Static lib: " << staticLibBytes << " bytes\n"; + std::cout << " Shared lib: " << sharedLibBytes << " bytes\n"; + std::cout << " Example exe: " << exeBytes << " bytes\n\n"; + + // Benchmark 2: Startup time + std::cout << "Benchmark 2: Startup Time\n"; + try { + results.push_back(benchStartupTime(100)); + } catch (const std::exception& e) { + std::cerr << " WARNING: Startup 
benchmark failed: " << e.what() << "\n"; + } + std::cout << "\n"; + + // Benchmark 3: Loop latency + std::cout << "Benchmark 3: Loop Latency\n"; + try { + results.push_back(benchLoopLatency(50)); + } catch (const std::exception& e) { + std::cerr << " WARNING: Loop latency benchmark failed: " << e.what() << "\n"; + } + std::cout << "\n"; + + // Benchmark 4: Memory footprint + std::cout << "Benchmark 4: Memory Footprint\n"; + try { + auto memResults = benchMemoryFootprint(20); + results.insert(results.end(), memResults.begin(), memResults.end()); + } catch (const std::exception& e) { + std::cerr << " WARNING: Memory benchmark failed: " << e.what() << "\n"; + } + std::cout << "\n"; + + // Write results + try { + bench::writeBenchmarkResults(outputPath, results); + std::cout << "Results written to: " << outputPath << "\n"; + } catch (const std::exception& e) { + std::cerr << "Failed to write results: " << e.what() << "\n"; + return 1; + } + + return 0; +} diff --git a/cpp/benchmarks/bench_utils.h b/cpp/benchmarks/bench_utils.h new file mode 100644 index 00000000..2716175c --- /dev/null +++ b/cpp/benchmarks/bench_utils.h @@ -0,0 +1,274 @@ +// Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +// Benchmark utilities: timer, memory tracker, result I/O, and comparison. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Platform-specific memory headers +#if defined(_WIN32) +# include +# include +#elif defined(__APPLE__) +# include +#else +# include +#endif + +#include + +using json = nlohmann::json; + +namespace bench { + +// --------------------------------------------------------------------------- +// Timer +// --------------------------------------------------------------------------- + +class Timer { +public: + void start() { start_ = std::chrono::high_resolution_clock::now(); } + + void stop() { end_ = std::chrono::high_resolution_clock::now(); } + + double elapsedUs() const { + if (end_ < start_) return 0.0; + return static_cast( + std::chrono::duration_cast(end_ - start_).count()); + } + + double elapsedMs() const { return elapsedUs() / 1000.0; } + +private: + std::chrono::high_resolution_clock::time_point start_; + std::chrono::high_resolution_clock::time_point end_; +}; + +// --------------------------------------------------------------------------- +// MemoryTracker — returns current process RSS in KB +// --------------------------------------------------------------------------- + +class MemoryTracker { +public: + static long getCurrentRssKb() { +#if defined(_WIN32) + PROCESS_MEMORY_COUNTERS pmc; + if (GetProcessMemoryInfo(GetCurrentProcess(), &pmc, sizeof(pmc))) { + return static_cast(pmc.WorkingSetSize / 1024); + } + return 0; +#elif defined(__APPLE__) + mach_task_basic_info info; + mach_msg_type_number_t count = MACH_TASK_BASIC_INFO_COUNT; + if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, + reinterpret_cast(&info), &count) == KERN_SUCCESS) { + return static_cast(info.resident_size / 1024); + } + return 0; +#else + // Linux: parse /proc/self/status for VmRSS + std::ifstream f("/proc/self/status"); + std::string line; + while (std::getline(f, line)) { + if (line.rfind("VmRSS:", 0) == 0) { + std::istringstream iss(line); + 
std::string key; + long val = 0; + iss >> key >> val; + return val; // already in KB + } + } + return 0; +#endif + } +}; + +// --------------------------------------------------------------------------- +// BenchmarkResult +// --------------------------------------------------------------------------- + +struct BenchmarkResult { + std::string name; + double value; + std::string unit; +}; + +// --------------------------------------------------------------------------- +// JSON I/O +// --------------------------------------------------------------------------- + +inline void writeBenchmarkResults(const std::string& path, + const std::vector& results) { + // Timestamp (thread-safe via gmtime_r / gmtime_s) + auto now = std::chrono::system_clock::now(); + std::time_t t = std::chrono::system_clock::to_time_t(now); + std::tm tm_buf{}; +#if defined(_WIN32) + gmtime_s(&tm_buf, &t); +#else + gmtime_r(&t, &tm_buf); +#endif + std::ostringstream ts; + ts << std::put_time(&tm_buf, "%Y-%m-%dT%H:%M:%SZ"); + + // Platform string + std::string platform; +#if defined(_WIN32) + platform = "windows"; +#elif defined(__APPLE__) + platform = "macos"; +#else + platform = "linux"; +#endif + + json root; + root["timestamp"] = ts.str(); + root["platform"] = platform; + json arr = json::array(); + for (const auto& r : results) { + arr.push_back({{"name", r.name}, {"value", r.value}, {"unit", r.unit}}); + } + root["results"] = arr; + + std::ofstream f(path); + if (!f.is_open()) { + throw std::runtime_error("Cannot write benchmark results to: " + path); + } + f << root.dump(2) << "\n"; +} + +inline std::vector readBenchmarkResults(const std::string& path) { + std::ifstream f(path); + if (!f.is_open()) { + throw std::runtime_error("Cannot read benchmark results from: " + path); + } + json root = json::parse(f); + std::vector out; + for (const auto& r : root.at("results")) { + out.push_back({r.at("name").get(), r.at("value").get(), + r.at("unit").get()}); + } + return out; +} + +// 
--------------------------------------------------------------------------- +// Per-metric thresholds +// --------------------------------------------------------------------------- + +inline double thresholdForMetric(const std::string& name) { + // Binary size metrics: 10% threshold (issue: "Fail if size regresses >10%") + if (name.find("binary_size") != std::string::npos) { + return 10.0; + } + // All other metrics: 15% threshold + return 15.0; +} + +// --------------------------------------------------------------------------- +// compareAndReport: compare current vs baseline, return 0 if OK, 1 if regression +// --------------------------------------------------------------------------- + +inline int compareAndReport(const std::string& baselinePath, const std::string& currentPath) { + std::vector baseline = readBenchmarkResults(baselinePath); + std::vector current = readBenchmarkResults(currentPath); + + // Index baseline by name + std::map baseMap; + for (const auto& r : baseline) { + baseMap[r.name] = r.value; + } + + std::cout << "\n=== Benchmark Regression Report ===\n"; + std::cout << std::left << std::setw(45) << "Metric" + << std::right << std::setw(12) << "Baseline" + << std::setw(12) << "Current" + << std::setw(10) << "Change" + << std::setw(12) << "Threshold" + << std::setw(10) << "Status" << "\n"; + std::cout << std::string(101, '-') << "\n"; + + bool anyRegression = false; + for (const auto& r : current) { + auto it = baseMap.find(r.name); + if (it == baseMap.end()) { + std::cout << std::left << std::setw(45) << r.name + << std::right << std::setw(12) << "N/A" + << std::setw(12) << r.value + << std::setw(10) << "N/A" + << std::setw(12) << "N/A" + << std::setw(10) << "NEW" << "\n"; + continue; + } + + double base = it->second; + double threshold = thresholdForMetric(r.name); + double pct = (base == 0.0) ? 
0.0 : (r.value - base) / base * 100.0; + + std::string status; + if (pct > threshold) { + status = "FAIL"; + anyRegression = true; + } else if (pct < -1.0) { + status = "IMPROVED"; + } else { + status = "OK"; + } + + std::cout << std::left << std::setw(45) << r.name << std::right + << std::setw(12) << std::fixed << std::setprecision(1) << base + << std::setw(12) << r.value + << std::setw(9) << std::showpos << pct << "%" << std::noshowpos + << std::setw(12) << (std::to_string(static_cast(threshold)) + "%") + << std::setw(10) << status << "\n"; + } + // Report baseline metrics absent from the current run (benchmark may have crashed) + for (const auto& b : baseline) { + bool found = false; + for (const auto& r : current) { + if (r.name == b.name) { found = true; break; } + } + if (!found) { + std::cout << std::left << std::setw(45) << b.name + << std::right << std::setw(12) << std::fixed << std::setprecision(1) + << b.value + << std::setw(12) << "N/A" + << std::setw(10) << "N/A" + << std::setw(12) << "N/A" + << std::setw(10) << "MISSING" << "\n"; + anyRegression = true; + } + } + std::cout << std::string(101, '-') << "\n"; + + if (anyRegression) { + std::cout << "\nRESULT: REGRESSION DETECTED — one or more metrics exceed threshold or are missing\n"; + return 1; + } + std::cout << "\nRESULT: PASS — no regressions detected\n"; + return 0; +} + +// --------------------------------------------------------------------------- +// Median helper +// --------------------------------------------------------------------------- + +inline double median(std::vector v) { + if (v.empty()) return 0.0; + std::sort(v.begin(), v.end()); + size_t n = v.size(); + return (n % 2 == 0) ? 
(v[n / 2 - 1] + v[n / 2]) / 2.0 : v[n / 2]; +} + +} // namespace bench diff --git a/cpp/benchmarks/mock_llm_server.h b/cpp/benchmarks/mock_llm_server.h new file mode 100644 index 00000000..fd420ff3 --- /dev/null +++ b/cpp/benchmarks/mock_llm_server.h @@ -0,0 +1,154 @@ +// Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +// In-process mock HTTP server mimicking the Lemonade Server API. +// Used by benchmarks to avoid requiring a real LLM backend. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace bench { + +// Default chat completion response — agent returns a final answer immediately. +static const std::string kDefaultAnswer = R"({"choices":[{"message":{"content":"{\"thought\":\"done\",\"goal\":\"complete\",\"answer\":\"benchmark result\"}"}}]})"; + +// Tool-call response — agent calls the echo tool first. +static const std::string kToolCall = R"({"choices":[{"message":{"content":"{\"thought\":\"calling tool\",\"goal\":\"test\",\"tool\":\"echo\",\"tool_args\":{\"message\":\"bench\"}}"}}]})"; + +// Health response — reports mock-model as already loaded so ensureModelLoaded() skips /load. +static const std::string kHealthOk = R"({"status":"ok","all_models_loaded":[{"model_name":"mock-model","recipe_options":{"ctx_size":16384}}]})"; + +// Models list response +static const std::string kModelsList = R"({"data":[{"id":"mock-model"}]})"; + +// Load response +static const std::string kLoadOk = R"({"status":"ok"})"; + +class MockLlmServer { +public: + /// Start server on an OS-assigned port. + /// Constructor blocks until the server is accepting connections. 
+ MockLlmServer() : server_(std::make_unique()) { + registerHandlers(); + + // bind_to_any_port returns the OS-assigned port (avoids CI port conflicts) + port_ = server_->bind_to_any_port("127.0.0.1"); + if (port_ <= 0) { + throw std::runtime_error("MockLlmServer: failed to bind to any port"); + } + + thread_ = std::thread([this]() { server_->listen_after_bind(); }); + + waitUntilReady(); + } + + ~MockLlmServer() { + server_->stop(); + if (thread_.joinable()) { + thread_.join(); + } + } + + // Non-copyable, non-movable + MockLlmServer(const MockLlmServer&) = delete; + MockLlmServer& operator=(const MockLlmServer&) = delete; + + /// The port the server is listening on. + int port() const { return port_; } + + /// Base URL suitable for AgentConfig::baseUrl (without /api/v1 — LemonadeClient adds it). + std::string baseUrl() const { return "http://127.0.0.1:" + std::to_string(port_); } + + /// Push a response to return for the next POST /chat/completions call. + /// When the queue is empty the default answer response is returned. + void pushResponse(const std::string& body) { + std::lock_guard lk(mu_); + responseQueue_.push_back(body); + } + + /// Push N copies of a response. + void pushResponses(const std::string& body, int n) { + std::lock_guard lk(mu_); + for (int i = 0; i < n; ++i) { + responseQueue_.push_back(body); + } + } + + /// Clear pending queued responses. + void clearQueue() { + std::lock_guard lk(mu_); + responseQueue_.clear(); + } + + /// Number of chat completion requests handled so far. 
+ int requestCount() const { return requestCount_.load(); } + +private: + void registerHandlers() { + // Health check — always reports mock-model loaded + server_->Get("/api/v1/health", [](const httplib::Request&, httplib::Response& res) { + res.set_content(kHealthOk, "application/json"); + }); + + // Load model — no-op safety fallback + server_->Post("/api/v1/load", [](const httplib::Request&, httplib::Response& res) { + res.set_content(kLoadOk, "application/json"); + }); + + // Models list + server_->Get("/api/v1/models", [](const httplib::Request&, httplib::Response& res) { + res.set_content(kModelsList, "application/json"); + }); + + // Chat completions — dequeue a pre-loaded response or return default answer + server_->Post("/api/v1/chat/completions", + [this](const httplib::Request&, httplib::Response& res) { + ++requestCount_; + std::string body; + { + std::lock_guard lk(mu_); + if (!responseQueue_.empty()) { + body = responseQueue_.front(); + responseQueue_.pop_front(); + } else { + body = kDefaultAnswer; + } + } + res.set_content(body, "application/json"); + }); + } + + void waitUntilReady() { + // Poll health endpoint until the server responds + httplib::Client cli("127.0.0.1", port_); + cli.set_connection_timeout(1); + cli.set_read_timeout(1); + + for (int attempt = 0; attempt < 50; ++attempt) { + auto res = cli.Get("/api/v1/health"); + if (res && res->status == 200) { + return; + } + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + throw std::runtime_error("MockLlmServer: server did not become ready"); + } + + std::unique_ptr<httplib::Server> server_; + std::thread thread_; + int port_ = 0; + std::mutex mu_; + std::deque<std::string> responseQueue_; + std::atomic<int> requestCount_{0}; +}; + +} // namespace bench From c526047a11dcd568061f285c1f096b3670064ade Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Mon, 16 Mar 2026 12:50:47 -0400 Subject: [PATCH 2/4] feat(claude): add finalize-implementation slash command skill Adds a reusable /finalize-implementation skill
that runs tests, lint, CI review simulation, and sub-agent code/architecture reviews in a loop (max 5 iterations), then commits and creates a draft PR. Co-Authored-By: Tomasz Waszczyk --- .../skills/finalize-implementation/SKILL.md | 169 ++++++++++++++++++ CLAUDE.md | 6 + 2 files changed, 175 insertions(+) create mode 100644 .claude/skills/finalize-implementation/SKILL.md diff --git a/.claude/skills/finalize-implementation/SKILL.md b/.claude/skills/finalize-implementation/SKILL.md new file mode 100644 index 00000000..3aca9de3 --- /dev/null +++ b/.claude/skills/finalize-implementation/SKILL.md @@ -0,0 +1,169 @@ +--- +name: finalize-implementation +description: > + Run tests, lint, CI review simulation, fix issues in a loop, then commit and + create a draft PR. Invoke when you believe the implementation is complete and + ready for review. Runs inline to preserve full conversation context. +model: sonnet +disable-model-invocation: true +--- + +# Finalize Implementation + +Validate the implementation against tests, lint, and the CI review checklist, +fix any issues found, then commit and open a draft PR. + +## Prerequisites + +Before starting, confirm: +- Working directory is clean or changes are ready to finalize +- You are on the correct feature branch (not `main`) + +If on `main`, stop and ask the user which branch to use. + +## Phase 1 — Baseline Verification + +Run in order: + +```bash +python -m pytest tests/unit/ -v --tb=short --cache-clear +``` + +```bash +python util/lint.py --all --fix +``` + +```bash +python util/lint.py --all +``` + +Record: +- Number of test failures (and which tests) +- Any lint violations remaining after `--fix` + +If lint still fails after `--fix`, fix manually before proceeding. + +## Phase 2 — CI Simulation + +**Every iteration: read `.github/workflows/claude.yml` fresh — never use a +cached version.** + +Steps: +1. Read `.github/workflows/claude.yml` and extract the `custom_instructions` + from the `pr-review` job. +2. 
Run: + ```bash + git diff origin/main...HEAD + git diff --name-status origin/main...HEAD + ``` +3. Review all changed files against the extracted checklist. +4. Produce a structured report: + +``` +## CI Review Report — Iteration N + +### 🔴 Critical +- [issue] (file:line) + +### 🟡 Important +- [issue] (file:line) + +### 🟢 Minor +- [issue] (file:line) +``` + +Severity definitions (from the CI checklist): +- 🔴 Critical — security vulnerabilities, breaking changes, data loss risks +- 🟡 Important — bugs, architectural concerns, missing tests, missing docs +- 🟢 Minor — style, optimizations, non-blocking suggestions + +## Phase 3 — Remediation Loop + +**Hard cap: 5 iterations total** (Phase 1 + Phase 2 = one iteration). + +Each iteration: +1. Fix 🔴 issues first, then 🟡 issues. Skip 🟢 unless trivial. +2. Re-run Phase 1 (tests + lint with `--cache-clear`). +3. Re-run Phase 2 (fresh `.github/workflows/claude.yml` read every time). +4. Evaluate exit conditions. + +### Exit Conditions + +**Exit normally (proceed to Phase 4) when:** +- Zero 🔴 and zero 🟡 issues remain AND all tests pass AND lint is clean + +**Exit with escalation (stop and report to user) when:** +- 5 iterations reached and issues remain — report what's left and ask for guidance +- The same 🔴 or 🟡 issue appears unchanged in 2 consecutive iterations — you + are stuck; report it immediately rather than continuing + +**Never silently skip a 🔴 issue to reach the exit condition.** + +## Phase 4 — Final Validation + +### 4a. Intent Check + +Using the full conversation context (this skill runs inline), verify: +- The implementation matches what the user originally asked for +- No scope creep was introduced during remediation +- Nothing from the original request was accidentally dropped + +If there is a mismatch, fix it and re-run Phase 1 before continuing. + +### 4b. 
Sub-agent Reviews + +Launch both agents in parallel: + +``` +Agent: code-reviewer +Prompt: Review all files changed in this branch (git diff origin/main...HEAD) +for bugs, logic errors, security issues, and GAIA/AMD compliance. +Report 🔴 Critical, 🟡 Important, 🟢 Minor issues only. +``` + +``` +Agent: architecture-reviewer +Prompt: Review all files changed in this branch (git diff origin/main...HEAD) +for SOLID principles, proper layering, dependency hygiene, and architectural +consistency with the existing GAIA codebase. +Report 🔴 Critical, 🟡 Important, 🟢 Minor issues only. +``` + +If either reviewer finds 🔴 or 🟡 issues, return to Phase 3 (counts against +the 5-iteration cap). + +### 4c. Commit and PR + +Once clean, invoke: + +``` +Skill: commit-commands:commit-push-pr +``` + +The PR must: +- Be created as a **draft** +- Title derived from the branch name or original issue title +- Body includes a link to the GitHub issue (if one was mentioned in the + conversation) using `Closes #NNN` or `Relates to #NNN` + +## Output + +After completion, print a summary table: + +``` +## Finalize Implementation — Complete + +| Step | Result | +|-------------------|-------------------------------| +| Iterations used | N / 5 | +| Tests | ✅ Passing / ❌ N failures | +| Lint | ✅ Clean / ❌ Violations | +| 🔴 Issues | 0 resolved, 0 remaining | +| 🟡 Issues | N resolved, 0 remaining | +| 🟢 Issues | N noted (not blocking) | +| PR | | +``` + +If the loop exited early due to the iteration cap or a stuck issue, replace the +PR row with a clear description of what blocked completion and what the user +should do next. diff --git a/CLAUDE.md b/CLAUDE.md index 6571a341..993342e7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -496,3 +496,9 @@ Specialized agents are available in `.claude/agents/` for specific tasks (24 age - **ui-ux-designer** (opus) - User-centered design, accessibility When invoking a proactive agent from `.claude/agents/`, indicate which agent you are using in your response. 
+ +## Skills (Slash Commands) + +Reusable workflows available in `.claude/skills/`. Invoke with `/skill-name`. + +- **`/finalize-implementation`** — Run tests, lint, CI review simulation, fix issues in a loop (max 5 iterations), then commit and create a draft PR. Invoke when you believe the implementation is complete. Runs inline to preserve full conversation context (original issue, user intent). From 5cd47e7a81e7716d0bcb8b57b7440bd04b346b8c Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Wed, 18 Mar 2026 19:29:05 -0400 Subject: [PATCH 3/4] feat(finalize): add detailed PR merge preparation and review process --- .claude/commands/finalize.md | 123 +++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 .claude/commands/finalize.md diff --git a/.claude/commands/finalize.md b/.claude/commands/finalize.md new file mode 100644 index 00000000..41fd5f8d --- /dev/null +++ b/.claude/commands/finalize.md @@ -0,0 +1,123 @@ +Get the current feature branch ready for PR merge. Follow these phases exactly. + +## Phase 1: Merge Latest Main + +1. Capture current branch: `git branch --show-current` +2. Check for dirty working tree: `git status --porcelain` + - If dirty: `git stash` +3. `git checkout main && git pull origin main` +4. `git checkout <feature-branch> && git merge main` +5. If merge conflicts exist (git status shows conflict markers): + - Report the conflicting files + - **STOP** — do not proceed. The user must resolve conflicts manually. +6. If stashed: `git stash pop` + +## Phase 2: Local PR Review (Replicating claude.yml) + +### Generate diff artifacts +```bash +git diff origin/main...HEAD > pr-diff.txt +git diff --name-status origin/main...HEAD > pr-files.txt +``` + +### Self-review using the claude.yml checklist + +Read `.github/workflows/claude.yml` lines 78–246 for the full review criteria. Apply that same checklist to the current branch changes.
+ +**File reading strategy:** +- Read `pr-diff.txt` first — it shows ALL changes +- Read `pr-files.txt` to see which files changed +- For large files (>1000 lines), use Grep with context or Read with offset/limit — do NOT read the entire file +- Focus on reviewing CHANGED code, not entire files + +**Apply all 7 review sections:** + +1. **Code Quality & Patterns** — architecture consistency, pattern reuse, error handling, code style, CLAUDE.md compliance +2. **Security** — SQL injection, command injection, XSS, secrets exposure, path traversal, unsafe deserialization, resource cleanup +3. **Testing** — tests exist for new functionality, edge cases covered, test quality +4. **Documentation** — docs updated for new features/CLI commands/SDK changes +5. **Breaking Changes & Compatibility** — public API changes, backward compatibility +6. **Performance & Architecture** — N+1 queries, inefficient algorithms, unnecessary dependencies +7. **Commit Quality** — commit messages are clear and logical + +**Classify all findings:** +- 🔴 **Critical** — Security issues, breaking changes, data loss risks +- 🟡 **Important** — Bugs, architectural concerns, missing tests +- 🟢 **Minor** — Style issues, optimizations, suggestions + +**DO NOT review or flag:** +- Copyright headers (presence, absence, or year inconsistencies) +- SPDX license identifiers +- License-related boilerplate + +### Fix findings +- Fix all 🔴 Critical and 🟡 Important issues immediately using Edit/Write tools +- Skip 🟢 Minor issues unless the fix is trivial (one-liner) +- For security issues: fix them directly + +### Clean up diff artifacts +```bash +rm -f pr-diff.txt pr-files.txt +``` + +## Phase 3: Ralph Wiggum Loop + +Loop until everything is green, **maximum 5 iterations**. + +On each iteration: + +### Step 1: Re-run local PR review +Repeat Phase 2 (generate diffs, review, fix, clean up). If no new 🔴/🟡 issues, proceed to Step 2. + +### Step 2: Lint +```bash +python util/lint.py --all --fix +``` +Check exit code. 
If lint still reports failures after `--fix`: +- Read the lint output carefully +- Manually fix remaining issues using Edit (common: import ordering, line length, f-string issues black can't auto-fix) +- Re-run `python util/lint.py --all` to verify clean + +### Step 3: Run tests +```bash +python -m pytest tests/ -x --tb=short +``` +- `-x` stops on first failure — analyze and fix before continuing +- Tests requiring external services (Lemonade server) skip automatically via pytest markers +- If tests fail: read the traceback, identify root cause, fix with Edit, then re-run + +### Step 4: Evaluate +- If lint is clean AND tests pass AND no 🔴/🟡 issues in review → **exit loop, report success** +- If max iterations (5) reached → report remaining issues and stop + +## Exit Report + +Always end with: + +``` +## Finalize Implementation Report + +**Branch:** <branch-name> +**Iterations:** <N>/5 + +### Lint +✅ Clean / ❌ <remaining violations> + +### Tests +✅ All passed (<N> tests) / ❌ <failing tests> + +### PR Review Verdict +✅ Approve / ✅ Approve with suggestions (minor only) / ❌ Request changes — <reasons> + +### Ready for PR +✅ Yes — branch is ready to open/update PR +❌ No — <blocking issues> +``` + +## Key Behaviors + +- **Never commit** — only fix files; the user decides when to commit +- **Never skip the lint step** — lint failures will be caught by CI +- **Prefer Edit over Write** — surgical fixes only +- **Preserve existing tests** — if your fixes break tests, undo and rethink +- **If uncertain about a fix** — describe the issue and ask rather than guessing From e5c3acc4b512fbf815b915214baa85693f39a06a Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Thu, 19 Mar 2026 12:07:36 -0400 Subject: [PATCH 4/4] fix(settings): add missing comma for JSON formatting --- .vscode/settings.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 67a47a7a..38c691f0 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -7,5 +7,6 @@ "python.testing.pytestArgs": [ "tests/unit" ], -
"notebook.output.wordWrap": true + "notebook.output.wordWrap": true, + "cmake.sourceDirectory": "${workspaceFolder}/cpp" } \ No newline at end of file