From 1b4abfa8cb8fac8ccd83092f1353d63a11366e09 Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Mon, 16 Mar 2026 12:15:14 -0400 Subject: [PATCH 1/4] feat(cpp): add performance benchmarks and binary size tracking (#358) - Add benchmark_cpp.yml workflow: binary size, startup time, loop latency, memory footprint - Cache baseline on main pushes; compare on every PR with per-metric thresholds (10% binary, 15% other) - Fix cache path mismatch: restore and rename before fresh run overwrites results - Fix build_cpp.yml push/PR paths to include both workflow files - Add timeout-minutes: 15 to benchmark job - Add initCalled_ guard in BenchAgent to prevent duplicate tool registration - Add MISSING metric detection in compareAndReport (catches crashed benchmarks) - Add Timer::elapsedUs() guard against unmatched stop() - Move GAIA_BUILD_BENCHMARKS option to top-level options block in CMakeLists.txt - Ignore cpp/benchmark-*.json in .gitignore (ephemeral CI artifacts) --- .github/workflows/benchmark_cpp.yml | 153 +++++++++++++ .github/workflows/build_cpp.yml | 17 +- .gitignore | 3 + cpp/CMakeLists.txt | 34 +++ cpp/benchmarks/bench_main.cpp | 335 ++++++++++++++++++++++++++++ cpp/benchmarks/bench_utils.h | 274 +++++++++++++++++++++++ cpp/benchmarks/mock_llm_server.h | 154 +++++++++++++ 7 files changed, 969 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/benchmark_cpp.yml create mode 100644 cpp/benchmarks/bench_main.cpp create mode 100644 cpp/benchmarks/bench_utils.h create mode 100644 cpp/benchmarks/mock_llm_server.h diff --git a/.github/workflows/benchmark_cpp.yml b/.github/workflows/benchmark_cpp.yml new file mode 100644 index 00000000..d6a6af33 --- /dev/null +++ b/.github/workflows/benchmark_cpp.yml @@ -0,0 +1,153 @@ +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT + +# C++ performance benchmark workflow. +# Measures binary size, startup time, loop latency, and memory footprint. 
+# Called from build_cpp.yml after the build job passes. + +name: C++ Benchmarks + +on: + workflow_call: + +permissions: + contents: read + +jobs: + benchmark: + name: C++ Benchmarks (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + timeout-minutes: 15 + + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest] + + steps: + - uses: actions/checkout@v6 + + - name: Install OpenSSL (Linux) + if: runner.os == 'Linux' + run: sudo apt-get install -y libssl-dev + + - name: Install OpenSSL (Windows) + if: runner.os == 'Windows' + run: choco install openssl --no-progress -y + + - name: Restore FetchContent cache + uses: actions/cache@v4 + with: + path: cpp/build-bench/_deps + key: fetchcontent-bench-${{ matrix.os }}-${{ hashFiles('cpp/CMakeLists.txt') }} + + # Restore baseline from last main-branch run (prefix match gets latest) + - name: Restore benchmark baseline + id: restore-baseline + uses: actions/cache/restore@v4 + with: + path: cpp/benchmark-results.json + key: benchmark-baseline-${{ matrix.os }}-dummy + restore-keys: benchmark-baseline-${{ matrix.os }}- + + # Rename restored results to baseline path so the compare step can find it + - name: Rename restored cache to baseline + shell: bash + run: | + if [ -f cpp/benchmark-results.json ]; then + mv cpp/benchmark-results.json cpp/benchmark-baseline.json + fi + + # Build static library + examples + benchmarks + - name: Configure CMake (static + benchmarks) + run: > + cmake -B cpp/build-bench -S cpp + -DCMAKE_BUILD_TYPE=Release + -DGAIA_BUILD_BENCHMARKS=ON + -DGAIA_BUILD_EXAMPLES=ON + -DGAIA_BUILD_TESTS=OFF + -DGAIA_BUILD_INTEGRATION_TESTS=OFF + + - name: Build (static + benchmarks) + run: cmake --build cpp/build-bench --config Release --parallel + + # Build shared library separately for DLL/SO size measurement + - name: Configure CMake (shared library) + run: > + cmake -B cpp/build-bench-shared -S cpp + -DCMAKE_BUILD_TYPE=Release + -DBUILD_SHARED_LIBS=ON + -DGAIA_BUILD_TESTS=OFF + 
-DGAIA_BUILD_EXAMPLES=OFF + -DGAIA_BUILD_BENCHMARKS=OFF + + - name: Build (shared library) + run: cmake --build cpp/build-bench-shared --config Release --parallel + + # Measure sizes and run all benchmarks on Linux + - name: Run benchmarks (Linux) + if: runner.os == 'Linux' + run: | + STATIC=$(stat -c%s cpp/build-bench/libgaia_core.a) + SHARED=$(stat -c%s cpp/build-bench-shared/libgaia_core.so) + EXE=$(stat -c%s cpp/build-bench/security_demo) + echo "Static lib: $STATIC bytes" + echo "Shared lib: $SHARED bytes" + echo "Example exe: $EXE bytes" + cpp/build-bench/gaia_benchmarks \ + --output cpp/benchmark-results.json \ + --static-lib-size "$STATIC" \ + --shared-lib-size "$SHARED" \ + --exe-size "$EXE" + + # Measure sizes and run all benchmarks on Windows + - name: Run benchmarks (Windows) + if: runner.os == 'Windows' + shell: powershell + run: | + $static = (Get-Item "cpp/build-bench/Release/gaia_core.lib").Length + $shared = (Get-Item "cpp/build-bench-shared/Release/gaia_core.dll").Length + $exe = (Get-Item "cpp/build-bench/Release/security_demo.exe").Length + Write-Host "Static lib: $static bytes" + Write-Host "Shared lib: $shared bytes" + Write-Host "Example exe: $exe bytes" + & "cpp/build-bench/Release/gaia_benchmarks.exe" ` + --output "cpp/benchmark-results.json" ` + --static-lib-size $static ` + --shared-lib-size $shared ` + --exe-size $exe + + # Compare current results against baseline (binary: 10%, others: 15%) + - name: Compare against baseline + shell: bash + run: | + if [ -f cpp/benchmark-baseline.json ]; then + echo "Baseline found — running regression check" + if [ "$RUNNER_OS" = "Windows" ]; then + BENCH_EXE="cpp/build-bench/Release/gaia_benchmarks.exe" + else + BENCH_EXE="cpp/build-bench/gaia_benchmarks" + fi + "$BENCH_EXE" \ + --compare \ + --baseline cpp/benchmark-baseline.json \ + --current cpp/benchmark-results.json + else + echo "No baseline found — first run establishes the baseline" + fi + + # Save new baseline only on pushes to main (immutable 
cache: unique key per run) + - name: Save benchmark baseline + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + uses: actions/cache/save@v4 + with: + path: cpp/benchmark-results.json + key: benchmark-baseline-${{ matrix.os }}-${{ github.run_id }} + + # Always upload results as an artifact for inspection + - name: Upload benchmark results + uses: actions/upload-artifact@v6 + if: always() + with: + name: cpp-benchmark-${{ matrix.os }} + path: cpp/benchmark-results.json diff --git a/.github/workflows/build_cpp.yml b/.github/workflows/build_cpp.yml index df59be6f..a02d0751 100644 --- a/.github/workflows/build_cpp.yml +++ b/.github/workflows/build_cpp.yml @@ -14,12 +14,15 @@ on: branches: [ main ] paths: - 'cpp/**' + - '.github/workflows/build_cpp.yml' + - '.github/workflows/benchmark_cpp.yml' pull_request: branches: [ main ] types: [opened, synchronize, reopened, ready_for_review] paths: - 'cpp/**' - '.github/workflows/build_cpp.yml' + - '.github/workflows/benchmark_cpp.yml' merge_group: workflow_dispatch: @@ -330,11 +333,18 @@ jobs: lemonade-server-stderr.log lemonade-server.log + # Performance benchmarks (runs after build passes) + benchmark: + name: C++ Benchmarks + needs: [build-and-test] + if: needs.build-and-test.result == 'success' + uses: ./.github/workflows/benchmark_cpp.yml + # Summary job cpp-build-summary: name: C++ Build Summary runs-on: ubuntu-latest - needs: [build-and-test, install-test, shared-lib-test, integration-test] + needs: [build-and-test, install-test, shared-lib-test, integration-test, benchmark] if: >- ${{ always() && !cancelled() && needs.build-and-test.result != 'cancelled' }} @@ -346,6 +356,7 @@ jobs: echo "Install Test: ${{ needs.install-test.result }}" echo "Shared Lib Test: ${{ needs.shared-lib-test.result }}" echo "Integration Tests: ${{ needs.integration-test.result }}" + echo "Benchmarks: ${{ needs.benchmark.result }}" echo "" if [[ "${{ needs.build-and-test.result }}" == "skipped" ]]; then @@ -363,6 +374,10 @@ 
jobs: if [[ "${{ needs.integration-test.result }}" == "failure" ]]; then echo "::warning::Integration tests failed (STX runner infrastructure issue)" fi + # Benchmarks are non-blocking (regression alerts are warnings only) + if [[ "${{ needs.benchmark.result }}" == "failure" ]]; then + echo "::warning::Benchmark regression detected — review cpp-benchmark-* artifacts" + fi if [[ "$FAILED" == "0" ]]; then echo "All required C++ jobs passed (unit tests, install round-trip, shared library)!" diff --git a/.gitignore b/.gitignore index 54b3c5b1..2c50f5fd 100644 --- a/.gitignore +++ b/.gitignore @@ -215,6 +215,9 @@ cpp/vcpkg_installed/ CMakeFiles/ cpp/build_ssl/ +# C++ benchmark output (ephemeral — stored in CI cache/artifacts, not source control) +cpp/benchmark-*.json + # SD test results and artifacts sd_model_sweep_results/ quick_test_results/ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 254e56c1..649d5deb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -29,6 +29,9 @@ endif() option(GAIA_BUILD_INTEGRATION_TESTS "Build LLM integration tests (requires lemonade-server with Qwen3-4B-GGUF loaded)" OFF) +# Performance benchmarks -- always OFF by default. 
+option(GAIA_BUILD_BENCHMARKS "Build performance benchmarks" OFF) + # --------------------------------------------------------------------------- # Dependencies -- prefer system packages, fall back to FetchContent # --------------------------------------------------------------------------- @@ -238,6 +241,37 @@ if(GAIA_BUILD_INTEGRATION_TESTS) ) endif() +# --------------------------------------------------------------------------- +# Benchmarks +# --------------------------------------------------------------------------- +if(GAIA_BUILD_BENCHMARKS) + add_executable(gaia_benchmarks benchmarks/bench_main.cpp) + target_link_libraries(gaia_benchmarks PRIVATE gaia::gaia_core) + + target_include_directories(gaia_benchmarks PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks + ) + + # httplib is PRIVATE to gaia_core; benchmarks need it for the mock server + if(httplib_FOUND) + target_link_libraries(gaia_benchmarks PRIVATE httplib::httplib) + else() + target_include_directories(gaia_benchmarks SYSTEM PRIVATE + $) + endif() + + # OpenSSL defines must match gaia_core for httplib header consistency + if(OpenSSL_FOUND) + target_compile_definitions(gaia_benchmarks PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT) + target_link_libraries(gaia_benchmarks PRIVATE OpenSSL::SSL OpenSSL::Crypto) + endif() + + # Windows: psapi for GetProcessMemoryInfo + if(WIN32) + target_link_libraries(gaia_benchmarks PRIVATE psapi) + endif() +endif() + # --------------------------------------------------------------------------- # Install + package config # --------------------------------------------------------------------------- diff --git a/cpp/benchmarks/bench_main.cpp b/cpp/benchmarks/bench_main.cpp new file mode 100644 index 00000000..68aaa061 --- /dev/null +++ b/cpp/benchmarks/bench_main.cpp @@ -0,0 +1,335 @@ +// Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +// C++ framework performance benchmarks. 
+// Measures binary size, startup time, loop latency, and memory footprint. +// +// Usage: +// # Run all benchmarks and write results +// gaia_benchmarks --output results.json \ +// --static-lib-size --shared-lib-size --exe-size +// +// # Compare current vs baseline +// gaia_benchmarks --compare --baseline baseline.json --current results.json + +#include "bench_utils.h" +#include "mock_llm_server.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// BenchAgent — minimal agent subclass for benchmarking +// --------------------------------------------------------------------------- + +class BenchAgent : public gaia::Agent { +public: + explicit BenchAgent(const gaia::AgentConfig& config) : gaia::Agent(config) { + // Do NOT call init() here — startup benchmark calls benchInit() explicitly + } + + /// Expose init() for explicit invocation in startup benchmark. + void benchInit() { + if (initCalled_) return; + initCalled_ = true; + init(); + // Silence final-answer output so benchmark iterations don't flood stdout + setOutputHandler(std::make_unique(true)); + } + +protected: + void registerTools() override { + gaia::ToolParameter msgParam; + msgParam.name = "message"; + msgParam.type = gaia::ToolParamType::STRING; + msgParam.description = "Message to echo"; + msgParam.required = true; + + toolRegistry().registerTool( + "echo", + "Echo a message back", + [](const nlohmann::json& args) -> nlohmann::json { + return nlohmann::json{{"echoed", args.value("message", "")}}; + }, + {msgParam}); + } + + std::string getSystemPrompt() const override { return "You are a benchmark agent."; } + +private: + bool initCalled_ = false; +}; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +static gaia::AgentConfig makeConfig(const 
std::string& url) { + gaia::AgentConfig cfg; + cfg.baseUrl = url; + cfg.modelId = ""; // empty → skip ensureModelLoaded() + cfg.maxSteps = 5; + cfg.silentMode = true; // suppress all console output + cfg.debug = false; + return cfg; +} + +// --------------------------------------------------------------------------- +// Benchmark 1: Binary Size +// Records sizes passed via CLI args — actual measurement happens in CI shell. +// --------------------------------------------------------------------------- + +static bench::BenchmarkResult benchStaticLibSize(long bytes) { + return {"binary_size_static_lib_bytes", static_cast(bytes), "bytes"}; +} + +static bench::BenchmarkResult benchSharedLibSize(long bytes) { + return {"binary_size_shared_lib_bytes", static_cast(bytes), "bytes"}; +} + +static bench::BenchmarkResult benchExeSize(long bytes) { + return {"binary_size_example_exe_bytes", static_cast(bytes), "bytes"}; +} + +// --------------------------------------------------------------------------- +// Benchmark 2: Startup Time +// N iterations of: construct BenchAgent → benchInit() → systemPrompt() +// No HTTP calls — modelId is empty, so no ensureModelLoaded(). 
+// --------------------------------------------------------------------------- + +static bench::BenchmarkResult benchStartupTime(int iterations = 100) { + std::cout << " Running startup benchmark (" << iterations << " iterations)...\n"; + + // Use a dummy URL — no HTTP calls will be made (modelId is empty) + const std::string dummyUrl = "http://127.0.0.1:1"; // won't be contacted + + std::vector times; + times.reserve(iterations); + + bench::Timer timer; + for (int i = 0; i < iterations; ++i) { + timer.start(); + { + BenchAgent agent(makeConfig(dummyUrl)); + agent.benchInit(); + (void)agent.systemPrompt(); + } + timer.stop(); + times.push_back(timer.elapsedUs()); + } + + double med = bench::median(times); + std::cout << " Startup median: " << std::fixed << std::setprecision(1) << med << " us\n"; + return {"startup_time_median_us", med, "us"}; +} + +// --------------------------------------------------------------------------- +// Benchmark 3: Loop Latency +// N iterations of processQuery() with a mock server. +// Each call uses a 2-step sequence: tool call → answer. +// History is cleared between iterations so each call is independent. 
+// --------------------------------------------------------------------------- + +static bench::BenchmarkResult benchLoopLatency(int iterations = 50) { + std::cout << " Running loop latency benchmark (" << iterations << " iterations)...\n"; + + bench::MockLlmServer server; + BenchAgent agent(makeConfig(server.baseUrl())); + agent.benchInit(); + agent.setDefaultPolicy(gaia::ToolPolicy::ALLOW); + + std::vector times; + times.reserve(iterations); + + bench::Timer timer; + for (int i = 0; i < iterations; ++i) { + // Queue: tool call first, then answer + server.pushResponse(bench::kToolCall); + server.pushResponse(bench::kDefaultAnswer); + + agent.clearHistory(); + + timer.start(); + agent.processQuery("benchmark"); + timer.stop(); + times.push_back(timer.elapsedUs()); + } + + double med = bench::median(times); + std::cout << " Loop latency median: " << std::fixed << std::setprecision(1) << med + << " us\n"; + return {"loop_latency_median_us", med, "us"}; +} + +// --------------------------------------------------------------------------- +// Benchmark 4: Memory Footprint +// 20 processQuery() calls WITHOUT clearing history (conversation accumulates). +// Measures baseline RSS, peak RSS, and per-step growth. 
+// --------------------------------------------------------------------------- + +static std::vector benchMemoryFootprint(int steps = 20) { + std::cout << " Running memory benchmark (" << steps << " steps)...\n"; + + bench::MockLlmServer server; + BenchAgent agent(makeConfig(server.baseUrl())); + agent.benchInit(); + agent.setDefaultPolicy(gaia::ToolPolicy::ALLOW); + + // Force system prompt computation before measuring baseline + (void)agent.systemPrompt(); + + long baselineKb = bench::MemoryTracker::getCurrentRssKb(); + std::cout << " Baseline RSS: " << baselineKb << " KB\n"; + + long peakKb = baselineKb; + for (int i = 0; i < steps; ++i) { + // Each call returns an answer directly (no tool calls) so history grows + // by one user message + one assistant message per step. + server.pushResponse(bench::kDefaultAnswer); + agent.processQuery("benchmark step " + std::to_string(i)); + + long rss = bench::MemoryTracker::getCurrentRssKb(); + if (rss > peakKb) peakKb = rss; + } + + long finalKb = bench::MemoryTracker::getCurrentRssKb(); + double perStepGrowth = (steps > 0) ? 
static_cast(finalKb - baselineKb) / steps : 0.0; + + std::cout << " Peak RSS: " << peakKb << " KB\n"; + std::cout << " Per-step growth: " << std::fixed << std::setprecision(1) << perStepGrowth + << " KB\n"; + + return { + {"memory_baseline_kb", static_cast(baselineKb), "KB"}, + {"memory_peak_kb", static_cast(peakKb), "KB"}, + {"memory_per_step_growth_kb", perStepGrowth, "KB"}, + }; +} + +// --------------------------------------------------------------------------- +// CLI parsing helpers +// --------------------------------------------------------------------------- + +static std::string getArg(const std::vector& args, const std::string& flag, + const std::string& defaultVal = "") { + for (size_t i = 0; i + 1 < args.size(); ++i) { + if (args[i] == flag) return args[i + 1]; + } + return defaultVal; +} + +static bool hasFlag(const std::vector& args, const std::string& flag) { + for (const auto& a : args) { + if (a == flag) return true; + } + return false; +} + +// --------------------------------------------------------------------------- +// main +// --------------------------------------------------------------------------- + +int main(int argc, char* argv[]) { + // Unset GAIA_CPP_BASE_URL so it does not interfere with benchmarks + // that explicitly set their own baseUrl via AgentConfig. 
+#if defined(_WIN32) + _putenv_s("GAIA_CPP_BASE_URL", ""); +#else + unsetenv("GAIA_CPP_BASE_URL"); +#endif + + std::vector args(argv + 1, argv + argc); + + // ---- Compare mode ---- + if (hasFlag(args, "--compare")) { + std::string baseline = getArg(args, "--baseline"); + std::string current = getArg(args, "--current"); + if (baseline.empty() || current.empty()) { + std::cerr << "Usage: gaia_benchmarks --compare --baseline --current \n"; + return 1; + } + try { + return bench::compareAndReport(baseline, current); + } catch (const std::exception& e) { + std::cerr << "Comparison failed: " << e.what() << "\n"; + return 1; + } + } + + // ---- Benchmark mode ---- + std::string outputPath = getArg(args, "--output", "benchmark-results.json"); + + long staticLibBytes = 0; + long sharedLibBytes = 0; + long exeBytes = 0; + try { + staticLibBytes = std::stol(getArg(args, "--static-lib-size", "0")); + sharedLibBytes = std::stol(getArg(args, "--shared-lib-size", "0")); + exeBytes = std::stol(getArg(args, "--exe-size", "0")); + } catch (const std::exception& e) { + std::cerr << "Error: invalid size argument: " << e.what() << "\n"; + std::cerr << "Usage: gaia_benchmarks --output " + " --static-lib-size --shared-lib-size --exe-size \n"; + return 1; + } + + std::cout << "=== GAIA C++ Performance Benchmarks ===\n\n"; + + std::vector results; + + // Benchmark 1: Binary sizes (from CLI args) + std::cout << "Benchmark 1: Binary Sizes\n"; + results.push_back(benchStaticLibSize(staticLibBytes)); + results.push_back(benchSharedLibSize(sharedLibBytes)); + results.push_back(benchExeSize(exeBytes)); + std::cout << " Static lib: " << staticLibBytes << " bytes\n"; + std::cout << " Shared lib: " << sharedLibBytes << " bytes\n"; + std::cout << " Example exe: " << exeBytes << " bytes\n\n"; + + // Benchmark 2: Startup time + std::cout << "Benchmark 2: Startup Time\n"; + try { + results.push_back(benchStartupTime(100)); + } catch (const std::exception& e) { + std::cerr << " WARNING: Startup 
benchmark failed: " << e.what() << "\n"; + } + std::cout << "\n"; + + // Benchmark 3: Loop latency + std::cout << "Benchmark 3: Loop Latency\n"; + try { + results.push_back(benchLoopLatency(50)); + } catch (const std::exception& e) { + std::cerr << " WARNING: Loop latency benchmark failed: " << e.what() << "\n"; + } + std::cout << "\n"; + + // Benchmark 4: Memory footprint + std::cout << "Benchmark 4: Memory Footprint\n"; + try { + auto memResults = benchMemoryFootprint(20); + results.insert(results.end(), memResults.begin(), memResults.end()); + } catch (const std::exception& e) { + std::cerr << " WARNING: Memory benchmark failed: " << e.what() << "\n"; + } + std::cout << "\n"; + + // Write results + try { + bench::writeBenchmarkResults(outputPath, results); + std::cout << "Results written to: " << outputPath << "\n"; + } catch (const std::exception& e) { + std::cerr << "Failed to write results: " << e.what() << "\n"; + return 1; + } + + return 0; +} diff --git a/cpp/benchmarks/bench_utils.h b/cpp/benchmarks/bench_utils.h new file mode 100644 index 00000000..2716175c --- /dev/null +++ b/cpp/benchmarks/bench_utils.h @@ -0,0 +1,274 @@ +// Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +// Benchmark utilities: timer, memory tracker, result I/O, and comparison. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Platform-specific memory headers +#if defined(_WIN32) +# include +# include +#elif defined(__APPLE__) +# include +#else +# include +#endif + +#include + +using json = nlohmann::json; + +namespace bench { + +// --------------------------------------------------------------------------- +// Timer +// --------------------------------------------------------------------------- + +class Timer { +public: + void start() { start_ = std::chrono::high_resolution_clock::now(); } + + void stop() { end_ = std::chrono::high_resolution_clock::now(); } + + double elapsedUs() const { + if (end_ < start_) return 0.0; + return static_cast( + std::chrono::duration_cast(end_ - start_).count()); + } + + double elapsedMs() const { return elapsedUs() / 1000.0; } + +private: + std::chrono::high_resolution_clock::time_point start_; + std::chrono::high_resolution_clock::time_point end_; +}; + +// --------------------------------------------------------------------------- +// MemoryTracker — returns current process RSS in KB +// --------------------------------------------------------------------------- + +class MemoryTracker { +public: + static long getCurrentRssKb() { +#if defined(_WIN32) + PROCESS_MEMORY_COUNTERS pmc; + if (GetProcessMemoryInfo(GetCurrentProcess(), &pmc, sizeof(pmc))) { + return static_cast(pmc.WorkingSetSize / 1024); + } + return 0; +#elif defined(__APPLE__) + mach_task_basic_info info; + mach_msg_type_number_t count = MACH_TASK_BASIC_INFO_COUNT; + if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, + reinterpret_cast(&info), &count) == KERN_SUCCESS) { + return static_cast(info.resident_size / 1024); + } + return 0; +#else + // Linux: parse /proc/self/status for VmRSS + std::ifstream f("/proc/self/status"); + std::string line; + while (std::getline(f, line)) { + if (line.rfind("VmRSS:", 0) == 0) { + std::istringstream iss(line); + 
std::string key; + long val = 0; + iss >> key >> val; + return val; // already in KB + } + } + return 0; +#endif + } +}; + +// --------------------------------------------------------------------------- +// BenchmarkResult +// --------------------------------------------------------------------------- + +struct BenchmarkResult { + std::string name; + double value; + std::string unit; +}; + +// --------------------------------------------------------------------------- +// JSON I/O +// --------------------------------------------------------------------------- + +inline void writeBenchmarkResults(const std::string& path, + const std::vector& results) { + // Timestamp (thread-safe via gmtime_r / gmtime_s) + auto now = std::chrono::system_clock::now(); + std::time_t t = std::chrono::system_clock::to_time_t(now); + std::tm tm_buf{}; +#if defined(_WIN32) + gmtime_s(&tm_buf, &t); +#else + gmtime_r(&t, &tm_buf); +#endif + std::ostringstream ts; + ts << std::put_time(&tm_buf, "%Y-%m-%dT%H:%M:%SZ"); + + // Platform string + std::string platform; +#if defined(_WIN32) + platform = "windows"; +#elif defined(__APPLE__) + platform = "macos"; +#else + platform = "linux"; +#endif + + json root; + root["timestamp"] = ts.str(); + root["platform"] = platform; + json arr = json::array(); + for (const auto& r : results) { + arr.push_back({{"name", r.name}, {"value", r.value}, {"unit", r.unit}}); + } + root["results"] = arr; + + std::ofstream f(path); + if (!f.is_open()) { + throw std::runtime_error("Cannot write benchmark results to: " + path); + } + f << root.dump(2) << "\n"; +} + +inline std::vector readBenchmarkResults(const std::string& path) { + std::ifstream f(path); + if (!f.is_open()) { + throw std::runtime_error("Cannot read benchmark results from: " + path); + } + json root = json::parse(f); + std::vector out; + for (const auto& r : root.at("results")) { + out.push_back({r.at("name").get(), r.at("value").get(), + r.at("unit").get()}); + } + return out; +} + +// 
--------------------------------------------------------------------------- +// Per-metric thresholds +// --------------------------------------------------------------------------- + +inline double thresholdForMetric(const std::string& name) { + // Binary size metrics: 10% threshold (issue: "Fail if size regresses >10%") + if (name.find("binary_size") != std::string::npos) { + return 10.0; + } + // All other metrics: 15% threshold + return 15.0; +} + +// --------------------------------------------------------------------------- +// compareAndReport: compare current vs baseline, return 0 if OK, 1 if regression +// --------------------------------------------------------------------------- + +inline int compareAndReport(const std::string& baselinePath, const std::string& currentPath) { + std::vector baseline = readBenchmarkResults(baselinePath); + std::vector current = readBenchmarkResults(currentPath); + + // Index baseline by name + std::map baseMap; + for (const auto& r : baseline) { + baseMap[r.name] = r.value; + } + + std::cout << "\n=== Benchmark Regression Report ===\n"; + std::cout << std::left << std::setw(45) << "Metric" + << std::right << std::setw(12) << "Baseline" + << std::setw(12) << "Current" + << std::setw(10) << "Change" + << std::setw(12) << "Threshold" + << std::setw(10) << "Status" << "\n"; + std::cout << std::string(101, '-') << "\n"; + + bool anyRegression = false; + for (const auto& r : current) { + auto it = baseMap.find(r.name); + if (it == baseMap.end()) { + std::cout << std::left << std::setw(45) << r.name + << std::right << std::setw(12) << "N/A" + << std::setw(12) << r.value + << std::setw(10) << "N/A" + << std::setw(12) << "N/A" + << std::setw(10) << "NEW" << "\n"; + continue; + } + + double base = it->second; + double threshold = thresholdForMetric(r.name); + double pct = (base == 0.0) ? 
0.0 : (r.value - base) / base * 100.0; + + std::string status; + if (pct > threshold) { + status = "FAIL"; + anyRegression = true; + } else if (pct < -1.0) { + status = "IMPROVED"; + } else { + status = "OK"; + } + + std::cout << std::left << std::setw(45) << r.name << std::right + << std::setw(12) << std::fixed << std::setprecision(1) << base + << std::setw(12) << r.value + << std::setw(9) << std::showpos << pct << "%" << std::noshowpos + << std::setw(12) << (std::to_string(static_cast(threshold)) + "%") + << std::setw(10) << status << "\n"; + } + // Report baseline metrics absent from the current run (benchmark may have crashed) + for (const auto& b : baseline) { + bool found = false; + for (const auto& r : current) { + if (r.name == b.name) { found = true; break; } + } + if (!found) { + std::cout << std::left << std::setw(45) << b.name + << std::right << std::setw(12) << std::fixed << std::setprecision(1) + << b.value + << std::setw(12) << "N/A" + << std::setw(10) << "N/A" + << std::setw(12) << "N/A" + << std::setw(10) << "MISSING" << "\n"; + anyRegression = true; + } + } + std::cout << std::string(101, '-') << "\n"; + + if (anyRegression) { + std::cout << "\nRESULT: REGRESSION DETECTED — one or more metrics exceed threshold or are missing\n"; + return 1; + } + std::cout << "\nRESULT: PASS — no regressions detected\n"; + return 0; +} + +// --------------------------------------------------------------------------- +// Median helper +// --------------------------------------------------------------------------- + +inline double median(std::vector v) { + if (v.empty()) return 0.0; + std::sort(v.begin(), v.end()); + size_t n = v.size(); + return (n % 2 == 0) ? 
(v[n / 2 - 1] + v[n / 2]) / 2.0 : v[n / 2]; +} + +} // namespace bench diff --git a/cpp/benchmarks/mock_llm_server.h b/cpp/benchmarks/mock_llm_server.h new file mode 100644 index 00000000..fd420ff3 --- /dev/null +++ b/cpp/benchmarks/mock_llm_server.h @@ -0,0 +1,154 @@ +// Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT +// +// In-process mock HTTP server mimicking the Lemonade Server API. +// Used by benchmarks to avoid requiring a real LLM backend. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace bench { + +// Default chat completion response — agent returns a final answer immediately. +static const std::string kDefaultAnswer = R"({"choices":[{"message":{"content":"{\"thought\":\"done\",\"goal\":\"complete\",\"answer\":\"benchmark result\"}"}}]})"; + +// Tool-call response — agent calls the echo tool first. +static const std::string kToolCall = R"({"choices":[{"message":{"content":"{\"thought\":\"calling tool\",\"goal\":\"test\",\"tool\":\"echo\",\"tool_args\":{\"message\":\"bench\"}}"}}]})"; + +// Health response — reports mock-model as already loaded so ensureModelLoaded() skips /load. +static const std::string kHealthOk = R"({"status":"ok","all_models_loaded":[{"model_name":"mock-model","recipe_options":{"ctx_size":16384}}]})"; + +// Models list response +static const std::string kModelsList = R"({"data":[{"id":"mock-model"}]})"; + +// Load response +static const std::string kLoadOk = R"({"status":"ok"})"; + +class MockLlmServer { +public: + /// Start server on an OS-assigned port. + /// Constructor blocks until the server is accepting connections. 
+ MockLlmServer() : server_(std::make_unique()) { + registerHandlers(); + + // bind_to_any_port returns the OS-assigned port (avoids CI port conflicts) + port_ = server_->bind_to_any_port("127.0.0.1"); + if (port_ <= 0) { + throw std::runtime_error("MockLlmServer: failed to bind to any port"); + } + + thread_ = std::thread([this]() { server_->listen_after_bind(); }); + + waitUntilReady(); + } + + ~MockLlmServer() { + server_->stop(); + if (thread_.joinable()) { + thread_.join(); + } + } + + // Non-copyable, non-movable + MockLlmServer(const MockLlmServer&) = delete; + MockLlmServer& operator=(const MockLlmServer&) = delete; + + /// The port the server is listening on. + int port() const { return port_; } + + /// Base URL suitable for AgentConfig::baseUrl (without /api/v1 — LemonadeClient adds it). + std::string baseUrl() const { return "http://127.0.0.1:" + std::to_string(port_); } + + /// Push a response to return for the next POST /chat/completions call. + /// When the queue is empty the default answer response is returned. + void pushResponse(const std::string& body) { + std::lock_guard lk(mu_); + responseQueue_.push_back(body); + } + + /// Push N copies of a response. + void pushResponses(const std::string& body, int n) { + std::lock_guard lk(mu_); + for (int i = 0; i < n; ++i) { + responseQueue_.push_back(body); + } + } + + /// Clear pending queued responses. + void clearQueue() { + std::lock_guard lk(mu_); + responseQueue_.clear(); + } + + /// Number of chat completion requests handled so far. 
+ int requestCount() const { return requestCount_.load(); } + +private: + void registerHandlers() { + // Health check — always reports mock-model loaded + server_->Get("/api/v1/health", [](const httplib::Request&, httplib::Response& res) { + res.set_content(kHealthOk, "application/json"); + }); + + // Load model — no-op safety fallback + server_->Post("/api/v1/load", [](const httplib::Request&, httplib::Response& res) { + res.set_content(kLoadOk, "application/json"); + }); + + // Models list + server_->Get("/api/v1/models", [](const httplib::Request&, httplib::Response& res) { + res.set_content(kModelsList, "application/json"); + }); + + // Chat completions — dequeue a pre-loaded response or return default answer + server_->Post("/api/v1/chat/completions", + [this](const httplib::Request&, httplib::Response& res) { + ++requestCount_; + std::string body; + { + std::lock_guard lk(mu_); + if (!responseQueue_.empty()) { + body = responseQueue_.front(); + responseQueue_.pop_front(); + } else { + body = kDefaultAnswer; + } + } + res.set_content(body, "application/json"); + }); + } + + void waitUntilReady() { + // Poll health endpoint until the server responds + httplib::Client cli("127.0.0.1", port_); + cli.set_connection_timeout(1); + cli.set_read_timeout(1); + + for (int attempt = 0; attempt < 50; ++attempt) { + auto res = cli.Get("/api/v1/health"); + if (res && res->status == 200) { + return; + } + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + throw std::runtime_error("MockLlmServer: server did not become ready"); + } + + std::unique_ptr<httplib::Server> server_; + std::thread thread_; + int port_ = 0; + std::mutex mu_; + std::deque<std::string> responseQueue_; + std::atomic<int> requestCount_{0}; +}; + +} // namespace bench From c526047a11dcd568061f285c1f096b3670064ade Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Mon, 16 Mar 2026 12:50:47 -0400 Subject: [PATCH 2/4] feat(claude): add finalize-implementation slash command skill Adds a reusable /finalize-implementation skill
that runs tests, lint, CI review simulation, and sub-agent code/architecture reviews in a loop (max 5 iterations), then commits and creates a draft PR. Co-Authored-By: Tomasz Waszczyk --- .../skills/finalize-implementation/SKILL.md | 169 ++++++++++++++++++ CLAUDE.md | 6 + 2 files changed, 175 insertions(+) create mode 100644 .claude/skills/finalize-implementation/SKILL.md diff --git a/.claude/skills/finalize-implementation/SKILL.md b/.claude/skills/finalize-implementation/SKILL.md new file mode 100644 index 00000000..3aca9de3 --- /dev/null +++ b/.claude/skills/finalize-implementation/SKILL.md @@ -0,0 +1,169 @@ +--- +name: finalize-implementation +description: > + Run tests, lint, CI review simulation, fix issues in a loop, then commit and + create a draft PR. Invoke when you believe the implementation is complete and + ready for review. Runs inline to preserve full conversation context. +model: sonnet +disable-model-invocation: true +--- + +# Finalize Implementation + +Validate the implementation against tests, lint, and the CI review checklist, +fix any issues found, then commit and open a draft PR. + +## Prerequisites + +Before starting, confirm: +- Working directory is clean or changes are ready to finalize +- You are on the correct feature branch (not `main`) + +If on `main`, stop and ask the user which branch to use. + +## Phase 1 — Baseline Verification + +Run in order: + +```bash +python -m pytest tests/unit/ -v --tb=short --cache-clear +``` + +```bash +python util/lint.py --all --fix +``` + +```bash +python util/lint.py --all +``` + +Record: +- Number of test failures (and which tests) +- Any lint violations remaining after `--fix` + +If lint still fails after `--fix`, fix manually before proceeding. + +## Phase 2 — CI Simulation + +**Every iteration: read `.github/workflows/claude.yml` fresh — never use a +cached version.** + +Steps: +1. Read `.github/workflows/claude.yml` and extract the `custom_instructions` + from the `pr-review` job. +2. 
Run: + ```bash + git diff origin/main...HEAD + git diff --name-status origin/main...HEAD + ``` +3. Review all changed files against the extracted checklist. +4. Produce a structured report: + +``` +## CI Review Report — Iteration N + +### 🔴 Critical +- [issue] (file:line) + +### 🟡 Important +- [issue] (file:line) + +### 🟢 Minor +- [issue] (file:line) +``` + +Severity definitions (from the CI checklist): +- 🔴 Critical — security vulnerabilities, breaking changes, data loss risks +- 🟡 Important — bugs, architectural concerns, missing tests, missing docs +- 🟢 Minor — style, optimizations, non-blocking suggestions + +## Phase 3 — Remediation Loop + +**Hard cap: 5 iterations total** (Phase 1 + Phase 2 = one iteration). + +Each iteration: +1. Fix 🔴 issues first, then 🟡 issues. Skip 🟢 unless trivial. +2. Re-run Phase 1 (tests + lint with `--cache-clear`). +3. Re-run Phase 2 (fresh `.github/workflows/claude.yml` read every time). +4. Evaluate exit conditions. + +### Exit Conditions + +**Exit normally (proceed to Phase 4) when:** +- Zero 🔴 and zero 🟡 issues remain AND all tests pass AND lint is clean + +**Exit with escalation (stop and report to user) when:** +- 5 iterations reached and issues remain — report what's left and ask for guidance +- The same 🔴 or 🟡 issue appears unchanged in 2 consecutive iterations — you + are stuck; report it immediately rather than continuing + +**Never silently skip a 🔴 issue to reach the exit condition.** + +## Phase 4 — Final Validation + +### 4a. Intent Check + +Using the full conversation context (this skill runs inline), verify: +- The implementation matches what the user originally asked for +- No scope creep was introduced during remediation +- Nothing from the original request was accidentally dropped + +If there is a mismatch, fix it and re-run Phase 1 before continuing. + +### 4b. 
Sub-agent Reviews + +Launch both agents in parallel: + +``` +Agent: code-reviewer +Prompt: Review all files changed in this branch (git diff origin/main...HEAD) +for bugs, logic errors, security issues, and GAIA/AMD compliance. +Report 🔴 Critical, 🟡 Important, 🟢 Minor issues only. +``` + +``` +Agent: architecture-reviewer +Prompt: Review all files changed in this branch (git diff origin/main...HEAD) +for SOLID principles, proper layering, dependency hygiene, and architectural +consistency with the existing GAIA codebase. +Report 🔴 Critical, 🟡 Important, 🟢 Minor issues only. +``` + +If either reviewer finds 🔴 or 🟡 issues, return to Phase 3 (counts against +the 5-iteration cap). + +### 4c. Commit and PR + +Once clean, invoke: + +``` +Skill: commit-commands:commit-push-pr +``` + +The PR must: +- Be created as a **draft** +- Title derived from the branch name or original issue title +- Body includes a link to the GitHub issue (if one was mentioned in the + conversation) using `Closes #NNN` or `Relates to #NNN` + +## Output + +After completion, print a summary table: + +``` +## Finalize Implementation — Complete + +| Step | Result | +|-------------------|-------------------------------| +| Iterations used | N / 5 | +| Tests | ✅ Passing / ❌ N failures | +| Lint | ✅ Clean / ❌ Violations | +| 🔴 Issues | 0 resolved, 0 remaining | +| 🟡 Issues | N resolved, 0 remaining | +| 🟢 Issues | N noted (not blocking) | +| PR | | +``` + +If the loop exited early due to the iteration cap or a stuck issue, replace the +PR row with a clear description of what blocked completion and what the user +should do next. diff --git a/CLAUDE.md b/CLAUDE.md index 6571a341..993342e7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -496,3 +496,9 @@ Specialized agents are available in `.claude/agents/` for specific tasks (24 age - **ui-ux-designer** (opus) - User-centered design, accessibility When invoking a proactive agent from `.claude/agents/`, indicate which agent you are using in your response. 
+ +## Skills (Slash Commands) + +Reusable workflows available in `.claude/skills/`. Invoke with `/skill-name`. + +- **`/finalize-implementation`** — Run tests, lint, CI review simulation, fix issues in a loop (max 5 iterations), then commit and create a draft PR. Invoke when you believe the implementation is complete. Runs inline to preserve full conversation context (original issue, user intent). From 5cd47e7a81e7716d0bcb8b57b7440bd04b346b8c Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Wed, 18 Mar 2026 19:29:05 -0400 Subject: [PATCH 3/4] feat(finalize): add detailed PR merge preparation and review process --- .claude/commands/finalize.md | 123 +++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 .claude/commands/finalize.md diff --git a/.claude/commands/finalize.md b/.claude/commands/finalize.md new file mode 100644 index 00000000..41fd5f8d --- /dev/null +++ b/.claude/commands/finalize.md @@ -0,0 +1,123 @@ +Get the current feature branch ready for PR merge. Follow these phases exactly. + +## Phase 1: Merge Latest Main + +1. Capture current branch: `git branch --show-current` +2. Check for dirty working tree: `git status --porcelain` + - If dirty: `git stash` +3. `git checkout main && git pull origin main` +4. `git checkout <feature-branch> && git merge main` +5. If merge conflicts exist (git status shows conflict markers): + - Report the conflicting files + - **STOP** — do not proceed. The user must resolve conflicts manually. +6. If stashed: `git stash pop` + +## Phase 2: Local PR Review (Replicating claude.yml) + +### Generate diff artifacts +```bash +git diff origin/main...HEAD > pr-diff.txt +git diff --name-status origin/main...HEAD > pr-files.txt +``` + +### Self-review using the claude.yml checklist + +Read `.github/workflows/claude.yml` lines 78–246 for the full review criteria. Apply that same checklist to the current branch changes.
+ +**File reading strategy:** +- Read `pr-diff.txt` first — it shows ALL changes +- Read `pr-files.txt` to see which files changed +- For large files (>1000 lines), use Grep with context or Read with offset/limit — do NOT read the entire file +- Focus on reviewing CHANGED code, not entire files + +**Apply all 7 review sections:** + +1. **Code Quality & Patterns** — architecture consistency, pattern reuse, error handling, code style, CLAUDE.md compliance +2. **Security** — SQL injection, command injection, XSS, secrets exposure, path traversal, unsafe deserialization, resource cleanup +3. **Testing** — tests exist for new functionality, edge cases covered, test quality +4. **Documentation** — docs updated for new features/CLI commands/SDK changes +5. **Breaking Changes & Compatibility** — public API changes, backward compatibility +6. **Performance & Architecture** — N+1 queries, inefficient algorithms, unnecessary dependencies +7. **Commit Quality** — commit messages are clear and logical + +**Classify all findings:** +- 🔴 **Critical** — Security issues, breaking changes, data loss risks +- 🟡 **Important** — Bugs, architectural concerns, missing tests +- 🟢 **Minor** — Style issues, optimizations, suggestions + +**DO NOT review or flag:** +- Copyright headers (presence, absence, or year inconsistencies) +- SPDX license identifiers +- License-related boilerplate + +### Fix findings +- Fix all 🔴 Critical and 🟡 Important issues immediately using Edit/Write tools +- Skip 🟢 Minor issues unless the fix is trivial (one-liner) +- For security issues: fix them directly + +### Clean up diff artifacts +```bash +rm -f pr-diff.txt pr-files.txt +``` + +## Phase 3: Ralph Wiggum Loop + +Loop until everything is green, **maximum 5 iterations**. + +On each iteration: + +### Step 1: Re-run local PR review +Repeat Phase 2 (generate diffs, review, fix, clean up). If no new 🔴/🟡 issues, proceed to Step 2. + +### Step 2: Lint +```bash +python util/lint.py --all --fix +``` +Check exit code. 
If lint still reports failures after `--fix`: +- Read the lint output carefully +- Manually fix remaining issues using Edit (common: import ordering, line length, f-string issues black can't auto-fix) +- Re-run `python util/lint.py --all` to verify clean + +### Step 3: Run tests +```bash +python -m pytest tests/ -x --tb=short +``` +- `-x` stops on first failure — analyze and fix before continuing +- Tests requiring external services (Lemonade server) skip automatically via pytest markers +- If tests fail: read the traceback, identify root cause, fix with Edit, then re-run + +### Step 4: Evaluate +- If lint is clean AND tests pass AND no 🔴/🟡 issues in review → **exit loop, report success** +- If max iterations (5) reached → report remaining issues and stop + +## Exit Report + +Always end with: + +``` +## Finalize Implementation Report + +**Branch:** <branch-name> +**Iterations:** <N>/5 + +### Lint +✅ Clean / ❌ <remaining violations> + +### Tests +✅ All passed (<N> tests) / ❌ <failing tests> + +### PR Review Verdict +✅ Approve / ✅ Approve with suggestions (minor only) / ❌ Request changes — <reasons> + +### Ready for PR +✅ Yes — branch is ready to open/update PR +❌ No — <blocking issues> +``` + +## Key Behaviors + +- **Never commit** — only fix files; the user decides when to commit +- **Never skip the lint step** — lint failures will be caught by CI +- **Prefer Edit over Write** — surgical fixes only +- **Preserve existing tests** — if your fixes break tests, undo and rethink +- **If uncertain about a fix** — describe the issue and ask rather than guessing From e5c3acc4b512fbf815b915214baa85693f39a06a Mon Sep 17 00:00:00 2001 From: Tomasz Iniewicz Date: Thu, 19 Mar 2026 12:07:36 -0400 Subject: [PATCH 4/4] fix(settings): add missing comma for JSON formatting --- .vscode/settings.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 67a47a7a..38c691f0 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -7,5 +7,6 @@ "python.testing.pytestArgs": [ "tests/unit" ], -
"notebook.output.wordWrap": true + "notebook.output.wordWrap": true, + "cmake.sourceDirectory": "${workspaceFolder}/cpp" } \ No newline at end of file