From 99230ee5cccf5bd0059728a586e533d5e118bb3e Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Thu, 19 Mar 2026 20:06:20 +0100 Subject: [PATCH 1/7] tools: benchmarks: add support for Git worktrees in bfbencher bfbencher copies the source directory into a temporary directory, to prevent modifying the existing Git tree. For worktrees, there is no full .git directory, only a file pointing to it, so the main Git repository (from which the worktree has been created) will be modified nonetheless (modifying the actual worktree instead of the copy). Detect if the repository is a worktree, and detach it if so. --- tools/benchmarks/bfbencher | 46 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tools/benchmarks/bfbencher b/tools/benchmarks/bfbencher index 08f62939a..564197068 100755 --- a/tools/benchmarks/bfbencher +++ b/tools/benchmarks/bfbencher @@ -548,10 +548,56 @@ class FilesystemSource: self._local = local_src_dir shutil.copytree(self._path, self._local, dirs_exist_ok=True) + self._detach_if_worktree() self._repo: git.Repo = git.Repo(self._local) self._retry_all: bool = False self._retry_failed: bool = False + def _detach_if_worktree(self) -> None: + """Convert a copied git worktree into a standalone repository. + + In a git worktree the .git entry is a file containing a gitdir pointer + to the original repo's worktree-specific state. shutil.copytree copies + that file verbatim, so all git operations on the copy would modify the + original worktree's HEAD and index. This method detects that case and + replaces the .git file with a self-contained .git directory built from + the worktree-specific state (HEAD, index) and the shared commondir + (objects, refs, config, …). 
+ """ + git_entry = self._local / ".git" + if not git_entry.is_file(): + return + + content = git_entry.read_text().strip() + if not content.startswith("gitdir:"): + return + + wt_gitdir = pathlib.Path(content.split(":", 1)[1].strip()) + if not wt_gitdir.is_absolute(): + wt_gitdir = (self._local / wt_gitdir).resolve() + + commondir_file = wt_gitdir / "commondir" + if commondir_file.exists(): + commondir = (wt_gitdir / commondir_file.read_text().strip()).resolve() + else: + commondir = wt_gitdir + + # Replace the .git pointer file with a full standalone git directory. + git_entry.unlink() + shutil.copytree(commondir, self._local / ".git") + + # Drop worktrees/ — those entries are specific to the original repo. + worktrees_dir = self._local / ".git" / "worktrees" + if worktrees_dir.exists(): + shutil.rmtree(worktrees_dir) + + # Apply the worktree-specific HEAD and index, which track the state of + # this worktree and differ from the main worktree's equivalents. + for fname in ("HEAD", "index"): + src = wt_gitdir / fname + if src.exists(): + shutil.copy2(src, self._local / ".git" / fname) + @property def local(self) -> pathlib.Path: """Local path to the source repository copy.""" From 7a70d6353fed61913131ea24e72ac5257ce0486d Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Thu, 19 Mar 2026 20:11:49 +0100 Subject: [PATCH 2/7] tools: benchmarks: add history subcommand to bfbencher Restructure the CLI around subcommands to leave room for a future compare mode. All existing options and behaviour are unchanged; they now live under the history subcommand. Options that will be shared with future subcommands (host, cache, report paths, hardware isolation) are defined on a shared parent parser inherited by each subcommand. 
--- .github/workflows/ci.yaml | 1 + tools/benchmarks/bfbencher | 129 +++++++++++++++++++++---------------- 2 files changed, 73 insertions(+), 57 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 04049a330..4c41d5a5f 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -178,6 +178,7 @@ jobs: BENCH_FAIL_ON="" fi tools/benchmarks/bfbencher \ + history \ --since 30bd49f \ --until $BENCH_UNTIL \ $BENCH_INCLUDE \ diff --git a/tools/benchmarks/bfbencher b/tools/benchmarks/bfbencher index 564197068..9f1c93db1 100755 --- a/tools/benchmarks/bfbencher +++ b/tools/benchmarks/bfbencher @@ -1128,71 +1128,27 @@ def run_benchmarks(args: argparse.Namespace): def main(): - parser = argparse.ArgumentParser( - prog="bfbencher", - description="Benchmark bpfilter performance across git commits.", - ) - - parser.add_argument( - "--since", - type=str, - help=f'oldest commit to benchmark. Use "wip" to start from the uncommitted changes (committed as "bfbencher: WIP"). Must be older than --until, or the same. Defaults to "{DEFAULT_FIRST_COMMIT_REF}"', - default=DEFAULT_FIRST_COMMIT_REF, - ) - parser.add_argument( - "--include", - type=str, - action="append", - default=[], - help='include an extra commit outside the range. Can be specified multiple times. Use "wip" to include uncommitted changes. Commits are sorted in git order with the range commits.', - ) - parser.add_argument( - "--until", - type=str, - help=f'newest commit to benchmark. Use "wip" to include uncommitted changes (committed as "bfbencher: WIP"). Must be newer than --since, or the same. Defaults to "{DEFAULT_LAST_COMMIT_REF}"', - default=DEFAULT_LAST_COMMIT_REF, - ) - parser.add_argument( + # Options shared across all subcommands. + shared = argparse.ArgumentParser(add_help=False) + shared.add_argument( "--sources", type=pathlib.Path, help=f'path to the bpfilter sources directory. 
Defaults to "{DEFAULT_SOURCE_PATH}".', default=DEFAULT_SOURCE_PATH, ) - parser.add_argument( + shared.add_argument( "--host", type=str, help=f'host to run the benchmark on. bfbencher will connect to the host using SSH, copy the project sources on it, and run the benchmarks. Defaults to "{DEFAULT_HOST[0]}" (current host).', default=DEFAULT_HOST[0], ) - parser.add_argument( + shared.add_argument( "--cache-dir", type=pathlib.Path, help=f"path to the directory containing the cached results. The cache is used to store benchmark results based on the hostname and the commit SHA, it is stored on the host running bfbencher. Defaults to {DEFAULT_CACHE_PATH}.", default=DEFAULT_CACHE_PATH, ) - parser.add_argument( - "--report-template-path", - type=pathlib.Path, - help=f'path to the Jinja2 template use to generate the HTML report. Defaults to "{DEFAULT_REPORT_TEMPLATE_PATH}"', - default=DEFAULT_REPORT_TEMPLATE_PATH, - ) - parser.add_argument( - "--report-path", - type=pathlib.Path, - help="path of the final HTML report.", - ) - parser.add_argument( - "--pr-report-template-path", - type=pathlib.Path, - help=f'path to the Jinja2 template use to generate the HTML pull-request report. Defaults to "{DEFAULT_PR_REPORT_TEMPLATE_PATH}"', - default=DEFAULT_PR_REPORT_TEMPLATE_PATH, - ) - parser.add_argument( - "--pr-report-path", - type=pathlib.Path, - help="path of the HTML summary report for pull requests (shows only significant changes).", - ) - parser.add_argument( + shared.add_argument( "--retry", "-r", type=str, @@ -1200,41 +1156,100 @@ def main(): default=[], help='retry benchmarks for specific commits, ignoring cached results. Use "failed" to retry all failed commits, "all" to retry everything, or a commit ref to retry a specific commit. 
Can be specified multiple times.', ) - parser.add_argument( + shared.add_argument( "--fail-on-significant-change", choices=["better", "worse", "any"], help="exit with non-zero status if any benchmark has a statistically significant change (better=improvement, worse=regression, any=either)", default=None, ) - parser.add_argument( + shared.add_argument( "--bind-node", type=int, help="CPU and memory node to bind the benchmark to.", default=None, ) - parser.add_argument( + shared.add_argument( "--no-preempt", action="store_true", - help="if set, use chrt to run the bechmark with real-time scheduling policy at the highest priority. This option should reduce jitter as only kernel threads could preempt it.", + help="if set, use chrt to run the benchmark with real-time scheduling policy at the highest priority. This option should reduce jitter as only kernel threads could preempt it.", default=False, ) - parser.add_argument( + shared.add_argument( "--cpu-pin", type=int, help="if set, defines the CPU to pin the benchmark to. If the CPU is isolated, it will reduce variability between runs.", default=None, ) - parser.add_argument( + shared.add_argument( "--slice", type=str, help="systemd slice to run the benchmark into. Required if --cpu-pin is isolated at the systemd level.", default=None, ) + parser = argparse.ArgumentParser( + prog="bfbencher", + description="Benchmark bpfilter performance across git commits.", + ) + subparsers = parser.add_subparsers(dest="command") + + history_parser = subparsers.add_parser( + "history", + parents=[shared], + help="benchmark performance across a range of commits", + description="Benchmark bpfilter performance across a range of commits and report changes over time.", + ) + history_parser.add_argument( + "--report-template-path", + type=pathlib.Path, + help=f'path to the Jinja2 template use to generate the HTML report. 
Defaults to "{DEFAULT_REPORT_TEMPLATE_PATH}"', + default=DEFAULT_REPORT_TEMPLATE_PATH, + ) + history_parser.add_argument( + "--report-path", + type=pathlib.Path, + help="path of the final HTML report.", + ) + history_parser.add_argument( + "--pr-report-template-path", + type=pathlib.Path, + help=f'path to the Jinja2 template use to generate the HTML pull-request report. Defaults to "{DEFAULT_PR_REPORT_TEMPLATE_PATH}"', + default=DEFAULT_PR_REPORT_TEMPLATE_PATH, + ) + history_parser.add_argument( + "--pr-report-path", + type=pathlib.Path, + help="path of the HTML summary report for pull requests (shows only significant changes).", + ) + history_parser.add_argument( + "--since", + type=str, + help=f'oldest commit to benchmark. Use "wip" to start from the uncommitted changes (committed as "bfbencher: WIP"). Must be older than --until, or the same. Defaults to "{DEFAULT_FIRST_COMMIT_REF}"', + default=DEFAULT_FIRST_COMMIT_REF, + ) + history_parser.add_argument( + "--until", + type=str, + help=f'newest commit to benchmark. Use "wip" to include uncommitted changes (committed as "bfbencher: WIP"). Must be newer than --since, or the same. Defaults to "{DEFAULT_LAST_COMMIT_REF}"', + default=DEFAULT_LAST_COMMIT_REF, + ) + history_parser.add_argument( + "--include", + type=str, + action="append", + default=[], + help='include an extra commit outside the range. Can be specified multiple times. Use "wip" to include uncommitted changes. 
Commits are sorted in git order with the range commits.', + ) + args = parser.parse_args() + if args.command is None: + parser.print_help() + raise SystemExit(1) + try: - run_benchmarks(args) + if args.command == "history": + run_benchmarks(args) except KeyboardInterrupt: renderer.log("Command interrupted by user") raise SystemExit(1) From 42c1f4a91be01068e02b0bea06afe7bcc124c517 Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Wed, 25 Mar 2026 18:16:24 +0100 Subject: [PATCH 3/7] tools: benchmarks: add compare subcommand to bfbencher Add a compare subcommand that benchmarks exactly two commits and reports the direct performance difference between them. Unlike the history subcommand which tracks trends over a sliding window, compare produces a side-by-side table showing base/ref absolute values and deltas for both runtime and instruction count. The benchmark loop and executor setup are factored into shared helpers (_create_executor, _benchmark_commits) used by both subcommands. An optional --json-output flag writes the results to a structured JSON file for consumption by external tools. 
--- tools/benchmarks/bfbencher | 218 +++++++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) diff --git a/tools/benchmarks/bfbencher b/tools/benchmarks/bfbencher index 9f1c93db1..4ad853641 100755 --- a/tools/benchmarks/bfbencher +++ b/tools/benchmarks/bfbencher @@ -111,6 +111,47 @@ class Renderer: self.console.print(table) + def print_compare_report( + self, + rows: list[Report.CompareRow], + base_sha: str, + ref_sha: str, + ) -> None: + def format_pct(pct: float) -> str: + color = "green" if pct < 0 else ("red" if pct > 0 else "white") + return f"[{color}]{pct:+.1f}%[/{color}]" + + table = rich.table.Table( + title=f"{base_sha[:SHORT_SHA_LEN]} → {ref_sha[:SHORT_SHA_LEN]}", + show_header=True, + ) + table.add_column("Benchmark", style="cyan") + table.add_column("Base", justify="right") + table.add_column("Ref", justify="right") + table.add_column("ΔTime", justify="right") + table.add_column("ΔTime%", justify="right") + table.add_column("Base Insn", justify="right") + table.add_column("Ref Insn", justify="right") + table.add_column("ΔInsn", justify="right") + table.add_column("ΔInsn%", justify="right") + + for row in rows: + table.add_row( + row.name, + row.base_time_str, + row.ref_time_str, + row.delta_time_str, + format_pct(row.delta_time_pct), + str(row.base_insn) if row.base_insn is not None else "-", + str(row.ref_insn) if row.ref_insn is not None else "-", + f"{row.delta_insn:+d}" if row.delta_insn is not None else "-", + format_pct(row.delta_insn_pct) + if row.delta_insn_pct is not None + else "-", + ) + + self.console.print(table) + renderer: Renderer = Renderer() @@ -221,6 +262,10 @@ class Benchmark: "nInsn": Analyzer(nInsns[-1], nInsns[-n - 1 : -1]) if nInsns else None, } + @property + def results(self) -> list[Result]: + return list(self._results) + @property def last(self) -> Result | None: return self._results[-1] if self._results else None @@ -930,6 +975,23 @@ class Report: runtime_ns: float = 0 # Runtime in nanoseconds for sorting 
insn_count: int = 0 # Instruction count for sorting + @dataclasses.dataclass + class CompareRow: + """Prepared data for a single benchmark row in compare mode.""" + + name: str + label: str + base_time_str: str + ref_time_str: str + delta_time_str: str + delta_time_pct: float + base_insn: int | None + ref_insn: int | None + delta_insn: int | None + delta_insn_pct: float | None + base_time_ns: float + ref_time_ns: float + def __init__(self, history: History): self._history = history @@ -1069,6 +1131,113 @@ class Report: rows = self._get_benchmark_rows(terms) renderer.print_report(rows, terms) + def _get_compare_rows(self, base_sha: str, ref_sha: str) -> list[CompareRow]: + rows = [] + for benchmark in self._history.sorted_benchmarks(): + base_result = next( + (r for r in benchmark.results if r.commit_sha == base_sha), None + ) + ref_result = next( + (r for r in benchmark.results if r.commit_sha == ref_sha), None + ) + if not base_result or not ref_result: + continue + + base_ns: float = float(base_result.time.to("ns").magnitude) # type: ignore[union-attr] + ref_ns: float = float(ref_result.time.to("ns").magnitude) # type: ignore[union-attr] + delta_ns: float = ref_ns - base_ns + delta_pct: float = (delta_ns / base_ns * 100) if base_ns else 0.0 + + base_insn = int(base_result.nInsn) if base_result.nInsn else None + ref_insn = int(ref_result.nInsn) if ref_result.nInsn else None + if base_insn is not None and ref_insn is not None: + delta_insn: int | None = int(ref_insn) - int(base_insn) + delta_insn_pct: float | None = ( + (delta_insn / base_insn * 100) if base_insn else 0 + ) + else: + delta_insn = None + delta_insn_pct = None + + rows.append( + Report.CompareRow( + name=benchmark.name, + label=benchmark.label, + base_time_str=f"{base_result.time:~.2f}", # type: ignore[union-attr] + ref_time_str=f"{ref_result.time:~.2f}", # type: ignore[union-attr] + delta_time_str=f"{delta_ns:+.2f} ns", + delta_time_pct=delta_pct, + base_insn=base_insn, + ref_insn=ref_insn, + 
delta_insn=delta_insn, + delta_insn_pct=delta_insn_pct, + base_time_ns=base_ns, + ref_time_ns=ref_ns, + ) + ) + + return rows + + def print_compare_report(self, base_sha: str, ref_sha: str) -> None: + rows = self._get_compare_rows(base_sha, ref_sha) + renderer.print_compare_report(rows, base_sha, ref_sha) + + def write_compare_json( + self, + path: pathlib.Path, + base_sha: str, + ref_sha: str, + host: str, + ) -> None: + rows = self._get_compare_rows(base_sha, ref_sha) + data = { + "base": base_sha, + "ref": ref_sha, + "host": host, + "benchmarks": [ + { + "name": r.name, + "base_time_ns": r.base_time_ns, + "ref_time_ns": r.ref_time_ns, + "delta_time_ns": r.ref_time_ns - r.base_time_ns, + "delta_time_pct": r.delta_time_pct, + "base_insn": r.base_insn, + "ref_insn": r.ref_insn, + "delta_insn": r.delta_insn, + "delta_insn_pct": r.delta_insn_pct, + } + for r in rows + ], + } + + with open(path, "w") as f: + json.dump(data, f, indent=2) + + +def _benchmark_commits(executor: Executor, args: argparse.Namespace) -> None: + for ctx in BenchmarkContext.commits(executor): + if not ctx.configure(): + continue + + if not ctx.make("bfcli"): + continue + + if not ctx.make("benchmark_bin"): + continue + + if not ctx.run_benchmark( + args.bind_node, args.no_preempt, args.cpu_pin, args.slice + ): + continue + + results = ctx.results + if not results: + executor.log(f"could not find {ctx.results_path}") + continue + + executor.add_results(ctx.commit, results) + executor.log("Done!") + def run_benchmarks(args: argparse.Namespace): executor = ( @@ -1127,6 +1296,31 @@ def run_benchmarks(args: argparse.Namespace): raise SystemExit(1) +def run_compare(args: argparse.Namespace): + source_repo = git.Repo(args.sources) + base_sha: str = source_repo.git.rev_parse(args.base) + ref_sha: str = source_repo.git.rev_parse(args.ref) + + args.since = args.base + args.until = args.base + args.include = [args.ref] + + executor = ( + LocalExecutor(args) if args.host in DEFAULT_HOST else 
RemoteExecutor(args) + ) + + with executor: + _benchmark_commits(executor, args) + + report = Report(executor.results) + report.print_compare_report(base_sha, ref_sha) + + if args.json_output: + report.write_compare_json( + args.json_output, base_sha, ref_sha, executor.host + ) + + def main(): # Options shared across all subcommands. shared = argparse.ArgumentParser(add_help=False) @@ -1241,6 +1435,28 @@ def main(): help='include an extra commit outside the range. Can be specified multiple times. Use "wip" to include uncommitted changes. Commits are sorted in git order with the range commits.', ) + compare_parser = subparsers.add_parser( + "compare", + parents=[shared], + help="compare performance between two specific commits", + description="Benchmark two specific commits and report the performance difference.", + ) + compare_parser.add_argument( + "base", + type=str, + help='baseline commit ref. Use "wip" for uncommitted changes.', + ) + compare_parser.add_argument( + "ref", + type=str, + help='commit ref to compare against the baseline. Use "wip" for uncommitted changes.', + ) + compare_parser.add_argument( + "--json-output", + type=pathlib.Path, + help="write comparison results to a JSON file.", + ) + args = parser.parse_args() if args.command is None: @@ -1250,6 +1466,8 @@ def main(): try: if args.command == "history": run_benchmarks(args) + elif args.command == "compare": + run_compare(args) except KeyboardInterrupt: renderer.log("Command interrupted by user") raise SystemExit(1) From 21a9ef36f9aada5104c4ab5a8d81b77fe23d465a Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Fri, 20 Mar 2026 09:02:18 +0100 Subject: [PATCH 4/7] tools: benchmarks: add option to fail bfbencher when a benchmark fails Add --fail-on-benchmark-error option to return non-zero from bfbencher if one of the benchmark runs has failed. 
--- tools/benchmarks/bfbencher | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tools/benchmarks/bfbencher b/tools/benchmarks/bfbencher index 4ad853641..d95ab5d98 100755 --- a/tools/benchmarks/bfbencher +++ b/tools/benchmarks/bfbencher @@ -53,6 +53,7 @@ class Stats: self.n_failures = 0 self.n_cache_hits = 0 self.n_cache_misses = 0 + self.n_benchmark_errors = 0 def success(self, from_cache: bool = False) -> None: if from_cache: @@ -1235,6 +1236,13 @@ def _benchmark_commits(executor: Executor, args: argparse.Namespace) -> None: executor.log(f"could not find {ctx.results_path}") continue + for r in results: + if r.get("error_occurred"): + executor.stats.n_benchmark_errors += 1 + executor.log( + f"[red bold]benchmark error: {r['name']}: {r.get('error_message', '')}[/]" + ) + executor.add_results(ctx.commit, results) executor.log("Done!") @@ -1287,6 +1295,9 @@ def run_benchmarks(args: argparse.Namespace): ) report.print_report([20]) + if args.fail_on_benchmark_error and executor.stats.n_benchmark_errors > 0: + raise SystemExit(1) + if args.fail_on_significant_change: terms = [20] for benchmark in executor.results.sorted_benchmarks(): @@ -1320,6 +1331,9 @@ def run_compare(args: argparse.Namespace): args.json_output, base_sha, ref_sha, executor.host ) + if args.fail_on_benchmark_error and executor.stats.n_benchmark_errors > 0: + raise SystemExit(1) + def main(): # Options shared across all subcommands. @@ -1350,6 +1364,12 @@ def main(): default=[], help='retry benchmarks for specific commits, ignoring cached results. Use "failed" to retry all failed commits, "all" to retry everything, or a commit ref to retry a specific commit. 
Can be specified multiple times.', ) + shared.add_argument( + "--fail-on-benchmark-error", + action="store_true", + help="exit with non-zero status if any benchmark reports an error during execution", + default=False, + ) shared.add_argument( "--fail-on-significant-change", choices=["better", "worse", "any"], From 1b13b30b74c362b18abe10c7f09d54e8dbfccfa2 Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Thu, 19 Mar 2026 21:13:41 +0100 Subject: [PATCH 5/7] tools: add bfoptimize LLM-driven optimization loop Iteratively improves bpfilter's BPF bytecode generation using Claude. Each iteration, an Opus 4.6 planning call (with adaptive thinking) proposes one concrete optimization to src/libbpfilter/cgen/. A Claude sub-agent then implements it directly in the repo, builds, runs the full test suite, and commits if tests pass. bfbencher compare validates the change; if the noise-weighted mean runtime delta across all benchmarks is negative the commit is kept and becomes the new baseline, otherwise it is reverted. Attempt history is persisted in .cache/bfoptimize/history.json across sessions so the LLM never retries a previously attempted optimization. Thinking blocks from each planning call are saved to .cache/bfoptimize/{attempt_id}-thinking.txt for post-hoc inspection. 
---
 tools/bfoptimize     |  679 ++++++++++++++++++++++++
 tools/bfoptimize-web | 1174 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 1853 insertions(+)
 create mode 100755 tools/bfoptimize
 create mode 100755 tools/bfoptimize-web

diff --git a/tools/bfoptimize b/tools/bfoptimize
new file mode 100755
index 000000000..337e318a1
--- /dev/null
+++ b/tools/bfoptimize
@@ -0,0 +1,679 @@
+#!/usr/bin/env python3
+"""bfoptimize — LLM-driven BPF bytecode optimization loop for bpfilter."""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import datetime
+import json
+import multiprocessing
+import os
+import pathlib
+import shutil
+import subprocess
+from typing import Any
+
+import anthropic
+import diskcache  # type: ignore[import-untyped]
+import git
+import numpy
+import rich.console
+import rich.table
+from claude_agent_sdk import (
+    AssistantMessage,
+    ClaudeAgentOptions,
+    ResultMessage,
+    TextBlock,
+    query,
+)
+
+DEFAULT_SOURCES = pathlib.Path(".")
+DEFAULT_BUILD_DIR = pathlib.Path("build")
+DEFAULT_CACHE_DIR = pathlib.Path(".cache/bfoptimize")
+DEFAULT_ITERATIONS = 10
+DEFAULT_MODEL = "claude-opus-4-6"
+DEFAULT_EFFORT = "high"
+CGEN_DIR = "src/libbpfilter/cgen"
+SHORT_SHA_LEN = 7
+
+console = rich.console.Console(log_path=False)
+
+
+# ---------------------------------------------------------------------------
+# History
+# ---------------------------------------------------------------------------
+
+
+class History:
+    """Persists attempt records and the current baseline SHA."""
+
+    def __init__(self, cache_dir: pathlib.Path) -> None:
+        self._path = cache_dir / "history.json"
+        self._data: dict[str, Any] = self._load()
+
+    def _load(self) -> dict[str, Any]:
+        """Return the stored history, or a fresh empty one if no file exists."""
+        if self._path.exists():
+            return json.loads(self._path.read_text())
+        return {"baseline_sha": None, "attempts": []}
+
+    def save(self) -> None:
+        """Write the history to disk, replacing history.json atomically."""
+        self._path.parent.mkdir(parents=True, exist_ok=True)
+        # Write to a sibling temp file and rename it into place, so an
+        # interrupted run cannot leave a truncated, unparsable history.json.
+        tmp_path = self._path.with_name(self._path.name + ".tmp")
+        tmp_path.write_text(json.dumps(self._data, indent=2))
+        os.replace(tmp_path, self._path)
+
+    @property
+    def baseline_sha(self) -> str | None:
+        return self._data.get("baseline_sha")
+
+    @baseline_sha.setter
+    def baseline_sha(self, sha: str) -> None:
+        """Update the baseline in memory only; call save() to persist it."""
+        self._data["baseline_sha"] = sha
+
+    @property
+    def attempts(self) -> list[dict[str, Any]]:
+        return self._data.get("attempts", [])
+
+    def next_id(self) -> int:
+        """Return the next attempt ID: last recorded ID + 1, or 1 if none."""
+        attempts = self.attempts
+        return (attempts[-1]["id"] + 1) if attempts else 1
+
+    def add_attempt(self, attempt: dict[str, Any]) -> None:
+        """Append an attempt record and immediately persist the history."""
+        self._data.setdefault("attempts", []).append(attempt)
+        self.save()
+
+    def summary(self) -> str:
+        """Render all attempts as the text block embedded in the planning prompt."""
+        attempts = self.attempts
+        if not attempts:
+            return "No previous attempts."
+        lines = ["Previous optimization attempts (do not repeat these):"]
+        for a in attempts:
+            delta = (
+                f"{a['delta_time_pct']:+.1f}%"
+                if a.get("delta_time_pct") is not None
+                else "N/A"
+            )
+            lines.append(
+                f" #{a['id']} [{a['status']}] {a['description']} (weighted runtime delta: {delta})"
+            )
+        return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Source loading
+# ---------------------------------------------------------------------------
+
+
+def load_cgen_sources(sources_dir: pathlib.Path) -> str:
+    cgen_path = sources_dir / CGEN_DIR
+    parts: list[str] = []
+    for f in sorted(cgen_path.rglob("*.[ch]")):
+        rel = f.relative_to(sources_dir)
+        try:
+            content = f.read_text(encoding="utf-8")
+        except UnicodeDecodeError:
+            continue
+        parts.append(f"=== {rel} ===\n{content}")
+    return "\n\n".join(parts)
+
+
+def load_last_benchmark(cache_dir: pathlib.Path) -> str:
+    bench_path = cache_dir / "last_bench.json"
+    if not bench_path.exists():
+        return "No benchmark data available yet."
+ try: + data = json.loads(bench_path.read_text()) + base = data.get("base", "?")[:SHORT_SHA_LEN] + ref = data.get("ref", "?")[:SHORT_SHA_LEN] + lines = [f"Last benchmark results ({base} → {ref}):"] + for b in data.get("benchmarks", []): + pct = b.get("delta_time_pct", 0) + lines.append( + f" {b['name']}: base={b['base_time_ns']:.1f}ns" + f" ref={b['ref_time_ns']:.1f}ns delta={pct:+.1f}%" + ) + return "\n".join(lines) + except Exception: + return "Benchmark data unavailable." + + +# --------------------------------------------------------------------------- +# Planning phase (extended thinking) +# --------------------------------------------------------------------------- + + +def plan_optimization( + client: anthropic.Anthropic, + cgen_sources: str, + history_summary: str, + benchmark_results: str, + cache_dir: pathlib.Path, + attempt_id: int, + model: str = DEFAULT_MODEL, + thinking: bool = True, + effort: str = DEFAULT_EFFORT, + context_1m: bool = False, + hint: str | None = None, +) -> str: + prompt = f"""You are optimizing the BPF bytecode generation logic in the bpfilter project. + +The cgen directory generates BPF programs that run in the Linux kernel for packet filtering. +Every nanosecond saved matters — these programs execute for every packet received by the host. + +## Source files in {CGEN_DIR}/ + +{cgen_sources} + +## Current benchmark results + +{benchmark_results} + +## Optimization history + +{history_summary} + +## Task + +Propose exactly ONE concrete optimization to the cgen code. Describe: +1. Which file(s) you will change and what specifically you will change +2. Why this will reduce the runtime of the generated BPF programs +3. Any risks or tricky edge cases to handle + +Be specific and actionable. Do not repeat any previously attempted optimization. 
+Output only the optimization proposal — no code yet.""" + + if hint: + prompt += ( + f"\n\n## Hint\n\n{hint}\n\n" + "This is a direction to consider, not a constraint — " + "you may propose a different optimization if you judge it more impactful." + ) + + stream_kwargs: dict[str, Any] = { + "model": model, + "max_tokens": 128000, + "messages": [{"role": "user", "content": prompt}], + "output_config": {"effort": effort}, + } + if thinking: + stream_kwargs["thinking"] = {"type": "adaptive"} + if context_1m: + stream_kwargs["extra_headers"] = {"anthropic-beta": "context-1m-2025-08-07"} + + with client.messages.stream(**stream_kwargs) as stream: + for event in stream: + if event.type == "content_block_delta": + if event.delta.type == "thinking_delta": + console.print(event.delta.thinking, end="") + elif event.delta.type == "text_delta": + console.print(event.delta.text, end="") + console.print("") + response = stream.get_final_message() + + # Persist thinking blocks for post-hoc inspection + thinking_texts = [b.thinking for b in response.content if b.type == "thinking"] + if thinking_texts: + thinking_path = cache_dir / f"{attempt_id}-thinking.txt" + thinking_path.write_text("\n\n---\n\n".join(thinking_texts)) # type: ignore[arg-type] + + return next(b.text for b in response.content if b.type == "text") # type: ignore[union-attr] + + +# --------------------------------------------------------------------------- +# Execution phase (Agent SDK) +# --------------------------------------------------------------------------- + + +async def execute_optimization( + sources_dir: pathlib.Path, + build_dir: pathlib.Path, + optimization_plan: str, + baseline_sha: str, +) -> str | None: + """Run the agent to implement the optimization. 
Returns new HEAD sha if committed.""" + ncpus = multiprocessing.cpu_count() + abs_sources = sources_dir.resolve() + abs_build = build_dir.resolve() + + prompt = f"""You are implementing a performance optimization to the bpfilter BPF bytecode generator. + +## Optimization to implement + +{optimization_plan} + +## Rules + +- Modify ONLY files under `{abs_sources}/{CGEN_DIR}/`. Do not touch any other files. +- Build: cmake -S {abs_sources} -B {abs_build} -DNO_DOCS=1 -DNO_TESTS=1 -DNO_CHECKS=1 \ +-DCMAKE_BUILD_TYPE=release && make -C {abs_build} -j{ncpus} bpfilter +- Test: make -C {abs_build} -j{ncpus} unit e2e integration +- If tests pass: commit with `git -C {abs_sources} commit -am "daemon: cgen: "` +- If build or tests fail: diagnose and fix. If you cannot make tests pass, revert ALL your \ +changes with `git -C {abs_sources} checkout -- {abs_sources}/{CGEN_DIR}/` and exit without \ +committing. +- The current baseline is {baseline_sha[:SHORT_SHA_LEN]}. Only commit when tests are green.""" + + async for message in query( + prompt=prompt, + options=ClaudeAgentOptions( + cwd=str(abs_sources), + permission_mode="bypassPermissions", + allowed_tools=["Read", "Edit", "Write", "Bash", "Glob", "Grep"], + ), + ): + if isinstance(message, AssistantMessage): + for block in message.content: + if isinstance(block, TextBlock) and block.text.strip(): + console.log(f"[dim]{block.text.strip()[:200]}[/dim]") + elif isinstance(message, ResultMessage) and message.result: + console.log(f"Agent done: {message.result[:200]}") + + repo = git.Repo(sources_dir) + new_sha = repo.head.commit.hexsha + return new_sha if new_sha != baseline_sha else None + + +# --------------------------------------------------------------------------- +# Benchmark step +# --------------------------------------------------------------------------- + + +def _benchmark_noise(bfbencher_cache_dir: pathlib.Path) -> dict[str, float]: + """Estimate per-benchmark noise (CV) from bfbencher's accumulated cache. 
+ + Iterates all cached commit results and computes the coefficient of variation + (MAD/median, normalised via the 1.4826 consistency factor) for each + benchmark. Benchmarks with fewer than 3 data points are excluded so that + noise estimates are not based on a single outlier. + + Returns a mapping from benchmark name to CV. An empty dict is returned + when the cache is absent or unreadable. + """ + series: dict[str, list[float]] = {} + try: + cache = diskcache.Cache(bfbencher_cache_dir) + for key in cache: + val = cache.get(key) + if not isinstance(val, dict) or not val.get("success"): + continue + for r in val.get("results", []): + name = r.get("name", "") + t = float(r.get("cpu_time", 0)) + if t > 0: + series.setdefault(name, []).append(t) + cache.close() + except Exception: + return {} + + noise: dict[str, float] = {} + for name, times in series.items(): + if len(times) < 3: + continue + arr = numpy.array(times) + median = float(numpy.median(arr)) + if median == 0: + continue + mad = float(numpy.median(numpy.abs(arr - median))) + noise[name] = (mad * 1.4826) / median + return noise + + +def run_benchmark( + sources_dir: pathlib.Path, + cache_dir: pathlib.Path, + bfbencher: pathlib.Path, + baseline_sha: str, + result_sha: str, + extra_args: list[str], + attempt_id: int, +) -> float | None: + json_path = cache_dir / "last_bench.json" + cmd = [ + str(bfbencher), + "compare", + baseline_sha, + result_sha, + "--sources", + str(sources_dir), + "--cache-dir", + str(cache_dir / "bfbencher"), + "--json-output", + str(json_path), + "--fail-on-benchmark-error", + ] + extra_args + + console.log(f"Running bfbencher compare {baseline_sha[:SHORT_SHA_LEN]} → {result_sha[:SHORT_SHA_LEN]}") + result = subprocess.run(cmd, text=True, env={**os.environ, "PYTHONUNBUFFERED": "1"}) + if result.returncode != 0: + console.log("[red]bfbencher compare failed[/red]") + return None + + shutil.copy(json_path, cache_dir / f"bench-{attempt_id}.json") + + try: + data = 
json.loads(json_path.read_text()) + noise = _benchmark_noise(cache_dir / "bfbencher") + + pairs: list[tuple[float, float]] = [] + for b in data.get("benchmarks", []): + if b.get("delta_time_pct") is None: + continue + cv = noise.get(b["name"], 0.0) + # Weight = 1 / (1 + CV): high-noise benchmarks contribute less. + pairs.append((b["delta_time_pct"], 1.0 / (1.0 + cv))) + + if not pairs: + return None + + noisy = [(name, cv) for name, cv in noise.items() if cv > 0.01] + if noisy: + noisy_str = ", ".join( + f"{name} (CV={cv:.1%})" for name, cv in sorted(noisy, key=lambda x: -x[1]) + ) + console.log(f"[dim]Volatile benchmarks (down-weighted): {noisy_str}[/dim]") + + total_weight = sum(w for _, w in pairs) + return sum(d * w for d, w in pairs) / total_weight + except Exception as e: + console.log(f"[red]Failed to parse benchmark output: {e}[/red]") + return None + + +# --------------------------------------------------------------------------- +# Summary table +# --------------------------------------------------------------------------- + + +def print_summary(history: History) -> None: + table = rich.table.Table(title="bfoptimize summary", show_header=True) + table.add_column("#", justify="right") + table.add_column("Status", justify="center") + table.add_column("Δ Runtime", justify="right") + table.add_column("Description", style="cyan") + + for a in history.attempts: + status = a["status"] + color = ( + "green" + if status == "accepted" + else "red" + if status == "rejected_bench" + else "yellow" + ) + delta = ( + f"{a['delta_time_pct']:+.1f}%" + if a.get("delta_time_pct") is not None + else "-" + ) + table.add_row( + str(a["id"]), + f"[{color}]{status}[/{color}]", + delta, + a["description"][:80], + ) + + console.print(table) + + +# --------------------------------------------------------------------------- +# Main loop +# --------------------------------------------------------------------------- + + +async def run_optimize(args: argparse.Namespace) -> None: + 
args.cache_dir.mkdir(parents=True, exist_ok=True) + history = History(args.cache_dir) + client = anthropic.Anthropic() + repo = git.Repo(args.sources) + + if history.baseline_sha is None: + history.baseline_sha = repo.head.commit.hexsha + history.save() + + console.log(f"Baseline: {history.baseline_sha[:SHORT_SHA_LEN]}") + + bfbencher = (args.sources / "tools/benchmarks/bfbencher").resolve() + + bench_extra: list[str] = [] + if args.host: + bench_extra += ["--host", args.host] + if args.bind_node is not None: + bench_extra += ["--bind-node", str(args.bind_node)] + if args.no_preempt: + bench_extra += ["--no-preempt"] + if args.cpu_pin is not None: + bench_extra += ["--cpu-pin", str(args.cpu_pin)] + if args.slice: + bench_extra += ["--slice", args.slice] + + for i in range(args.iterations): + attempt_id = history.next_id() + console.log( + f"\n[bold cyan]─── Iteration {i + 1}/{args.iterations}" + f" (attempt #{attempt_id}) ───[/bold cyan]" + ) + + baseline_sha = history.baseline_sha + assert baseline_sha is not None + + # ── Plan ────────────────────────────────────────────────────────── + console.log("[bold]Planning...[/bold]") + cgen_sources = load_cgen_sources(args.sources) + benchmark_results = load_last_benchmark(args.cache_dir) + try: + plan = plan_optimization( + client, + cgen_sources, + history.summary(), + benchmark_results, + args.cache_dir, + attempt_id, + model=args.model, + thinking=args.thinking, + effort=args.effort, + context_1m=args.context_1m, + hint=args.hint, + ) + except Exception as e: + console.log(f"[red]Planning failed: {e}[/red]") + continue + + description = plan.strip().splitlines()[0][:120] + console.log(f"Proposal: {description}") + + # ── Execute ─────────────────────────────────────────────────────── + console.log("[bold]Executing...[/bold]") + try: + result_sha = await execute_optimization( + args.sources, + args.build_dir, + plan, + baseline_sha, + ) + except Exception as e: + console.log(f"[red]Agent execution failed: {e}[/red]") 
+ history.add_attempt( + { + "id": attempt_id, + "description": description, + "status": "rejected_tests", + "baseline_sha": baseline_sha, + "result_sha": None, + "delta_time_pct": None, + "timestamp": datetime.datetime.now().isoformat(), + } + ) + continue + + if result_sha is None: + console.log("[yellow]Agent did not commit — rejected_tests[/yellow]") + history.add_attempt( + { + "id": attempt_id, + "description": description, + "status": "rejected_tests", + "baseline_sha": baseline_sha, + "result_sha": None, + "delta_time_pct": None, + "timestamp": datetime.datetime.now().isoformat(), + } + ) + continue + + console.log(f"Agent committed: {result_sha[:SHORT_SHA_LEN]}") + + # ── Benchmark ───────────────────────────────────────────────────── + console.log("[bold]Benchmarking...[/bold]") + delta = run_benchmark( + args.sources, + args.cache_dir, + bfbencher, + baseline_sha, + result_sha, + bench_extra, + attempt_id, + ) + + if delta is None or delta >= 0: + delta_str = f"{delta:+.1f}%" if delta is not None else "N/A" + console.log(f"[red]Rejected (bench): mean delta {delta_str}[/red]") + repo.git.reset("--hard", baseline_sha) + status = "rejected_bench" + else: + console.log(f"[green]Accepted: mean delta {delta:+.1f}%[/green]") + history.baseline_sha = result_sha + status = "accepted" + + history.add_attempt( + { + "id": attempt_id, + "description": description, + "status": status, + "baseline_sha": baseline_sha, + "result_sha": result_sha, + "delta_time_pct": delta, + "timestamp": datetime.datetime.now().isoformat(), + } + ) + + print_summary(history) + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def main() -> None: + parser = argparse.ArgumentParser( + prog="bfoptimize", + description="LLM-driven BPF bytecode optimization loop for bpfilter.", + ) + parser.add_argument( + "--iterations", + "-n", + type=int, + default=DEFAULT_ITERATIONS, 
+ help=f"number of optimization iterations to run (default: {DEFAULT_ITERATIONS})", + ) + parser.add_argument( + "--sources", + type=pathlib.Path, + default=DEFAULT_SOURCES, + help=f'path to the bpfilter source directory (default: "{DEFAULT_SOURCES}")', + ) + parser.add_argument( + "--build-dir", + type=pathlib.Path, + default=DEFAULT_BUILD_DIR, + help=f'cmake build directory (default: "{DEFAULT_BUILD_DIR}")', + ) + parser.add_argument( + "--cache-dir", + type=pathlib.Path, + default=DEFAULT_CACHE_DIR, + help=f'directory for history and benchmark cache (default: "{DEFAULT_CACHE_DIR}")', + ) + parser.add_argument( + "--host", + type=str, + default=None, + help="remote host for benchmarking (passed through to bfbencher)", + ) + parser.add_argument( + "--bind-node", + type=int, + default=None, + help="CPU/memory NUMA node to bind benchmarks to", + ) + parser.add_argument( + "--no-preempt", + action="store_true", + default=False, + help="run benchmarks with real-time scheduling (chrt -f 99)", + ) + parser.add_argument( + "--cpu-pin", + type=int, + default=None, + help="CPU to pin benchmark to", + ) + parser.add_argument( + "--slice", + type=str, + default=None, + help="systemd slice for benchmark execution", + ) + parser.add_argument( + "--model", + type=str, + default=DEFAULT_MODEL, + choices=["claude-opus-4-6", "claude-sonnet-4-6"], + help=f"Claude model to use for planning (default: {DEFAULT_MODEL})", + ) + parser.add_argument( + "--thinking", + action=argparse.BooleanOptionalAction, + default=True, + help="enable adaptive thinking during planning (default: enabled)", + ) + parser.add_argument( + "--effort", + type=str, + default=DEFAULT_EFFORT, + choices=["low", "medium", "high", "max"], + help=f"effort level for the planning call (default: {DEFAULT_EFFORT}; max is Opus only)", + ) + parser.add_argument( + "--context-1m", + action="store_true", + default=False, + help="enable 1M context window beta (claude-opus-4-6 and claude-sonnet-4-6 only)", + ) + 
parser.add_argument( + "--hint", + type=str, + default=None, + help="optional direction for the model (e.g. 'look into XXX'); it is provided as context but not enforced", + ) + + args = parser.parse_args() + + try: + asyncio.run(run_optimize(args)) + except KeyboardInterrupt: + console.log("Interrupted by user") + raise SystemExit(1) + + +if __name__ == "__main__": + main() diff --git a/tools/bfoptimize-web b/tools/bfoptimize-web new file mode 100755 index 000000000..fefdb952a --- /dev/null +++ b/tools/bfoptimize-web @@ -0,0 +1,1174 @@ +#!/usr/bin/env python3 +"""bfoptimize-web — Local web UI for bfoptimize.""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import pathlib +import signal +from typing import Any, AsyncGenerator + +import uvicorn +from fastapi import FastAPI, HTTPException +from fastapi.responses import HTMLResponse, PlainTextResponse, StreamingResponse +from pydantic import BaseModel + +# --------------------------------------------------------------------------- +# Global state +# --------------------------------------------------------------------------- + +_process: asyncio.subprocess.Process | None = None +_log_lines: list[str] = [] +_subscribers: list[asyncio.Queue[str | None]] = [] + +# Updated at startup from CLI args; overridden per-run from POST /run body +_cache_dir: pathlib.Path = pathlib.Path(".cache/bfoptimize") +_sources_dir: pathlib.Path = pathlib.Path(".") +_bfoptimize: pathlib.Path = pathlib.Path(__file__).parent / "bfoptimize" + +app = FastAPI() + +# --------------------------------------------------------------------------- +# Request models +# --------------------------------------------------------------------------- + + +class RunRequest(BaseModel): + iterations: int = 10 + sources: str = "." 
+ build_dir: str = "build" + cache_dir: str = ".cache/bfoptimize" + host: str | None = None + bind_node: int | None = None + no_preempt: bool = False + cpu_pin: int | None = None + slice: str | None = None + model: str = "claude-opus-4-6" + thinking: bool = True + effort: str = "high" + context_1m: bool = False + hint: str | None = None + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _build_cmd(body: RunRequest) -> list[str]: + cmd = [ + str(_bfoptimize), + "--iterations", + str(body.iterations), + "--sources", + body.sources, + "--build-dir", + body.build_dir, + "--cache-dir", + body.cache_dir, + ] + if body.host: + cmd += ["--host", body.host] + if body.bind_node is not None: + cmd += ["--bind-node", str(body.bind_node)] + if body.no_preempt: + cmd += ["--no-preempt"] + if body.cpu_pin is not None: + cmd += ["--cpu-pin", str(body.cpu_pin)] + if body.slice: + cmd += ["--slice", body.slice] + cmd += ["--model", body.model] + cmd += ["--thinking" if body.thinking else "--no-thinking"] + cmd += ["--effort", body.effort] + if body.context_1m: + cmd += ["--context-1m"] + if body.hint: + cmd += ["--hint", body.hint] + return cmd + + +async def _broadcast_output(proc: asyncio.subprocess.Process) -> None: + assert proc.stdout is not None + async for raw in proc.stdout: + text = raw.decode(errors="replace").rstrip() + _log_lines.append(text) + for q in list(_subscribers): + await q.put(text) + await proc.wait() + for q in list(_subscribers): + await q.put(None) + + +# --------------------------------------------------------------------------- +# Endpoints +# --------------------------------------------------------------------------- + + +@app.get("/", response_class=HTMLResponse) +async def index() -> str: + return HTML + + +@app.post("/run", status_code=202) +async def start_run(body: RunRequest) -> dict[str, str]: + global _process, 
_log_lines, _cache_dir, _sources_dir + if _process is not None and _process.returncode is None: + raise HTTPException(409, "Already running") + _cache_dir = pathlib.Path(body.cache_dir) + _sources_dir = pathlib.Path(body.sources) + _log_lines = [] + cmd = _build_cmd(body) + _process = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + ) + asyncio.create_task(_broadcast_output(_process)) + return {"status": "started"} + + +@app.delete("/run") +async def stop_run() -> dict[str, str]: + if _process is None or _process.returncode is not None: + raise HTTPException(404, "No running process") + _process.send_signal(signal.SIGTERM) + return {"status": "stopping"} + + +@app.get("/stream") +async def stream() -> StreamingResponse: + async def generator() -> AsyncGenerator[str, None]: + q: asyncio.Queue[str | None] = asyncio.Queue() + _subscribers.append(q) + try: + for line in list(_log_lines): + yield f"data: {line}\n\n" + if _process is None or _process.returncode is not None: + return + while True: + try: + item = await asyncio.wait_for(q.get(), timeout=15.0) + except asyncio.TimeoutError: + yield ": keepalive\n\n" + continue + if item is None: + break + yield f"data: {item}\n\n" + finally: + try: + _subscribers.remove(q) + except ValueError: + pass + + return StreamingResponse(generator(), media_type="text/event-stream") + + +@app.get("/history") +async def get_history() -> Any: + path = _cache_dir / "history.json" + if not path.exists(): + return {"baseline_sha": None, "attempts": []} + return json.loads(path.read_text()) + + +@app.delete("/history") +async def reset_history() -> dict[str, str]: + import shutil + + for p in _cache_dir.iterdir(): + if p.is_dir(): + shutil.rmtree(p) + else: + p.unlink() + return {"status": "reset"} + + +@app.get("/status") +async def get_status() -> dict[str, Any]: + if _process is None: + return {"state": "idle", "returncode": None} + if _process.returncode is None: + 
return {"state": "running", "returncode": None} + return {"state": "stopped", "returncode": _process.returncode} + + +@app.get("/bench/{bench_id}") +async def get_bench(bench_id: int) -> Any: + path = _cache_dir / f"bench-{bench_id}.json" + if not path.exists(): + raise HTTPException(404, "Bench data not found") + return json.loads(path.read_text()) + + +@app.get("/diff/{bench_id}") +async def get_diff(bench_id: int) -> PlainTextResponse: + history_path = _cache_dir / "history.json" + if not history_path.exists(): + raise HTTPException(404, "No history") + history = json.loads(history_path.read_text()) + attempt = next( + (a for a in history.get("attempts", []) if a["id"] == bench_id), None + ) + if attempt is None or not attempt.get("result_sha"): + raise HTTPException(404, "Attempt not found or has no result") + proc = await asyncio.create_subprocess_exec( + "git", + "-C", + str(_sources_dir.resolve()), + "diff", + attempt["baseline_sha"], + attempt["result_sha"], + "--", + "src/libbpfilter/cgen/", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, _ = await proc.communicate() + return PlainTextResponse(stdout.decode(errors="replace")) + + +# --------------------------------------------------------------------------- +# Embedded frontend +# --------------------------------------------------------------------------- + +HTML = """ + + + + +bfoptimize + + + + + + + + +
+
+ + +
+
+ Run configuration + +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ + +
+
+
+
+
+
Hints
+
+
+ Optional directions for the model — it may ignore them if a better opportunity exists + +
+
+
+
Model
+
+ + +
+
+ + +
+
+
+ + +
+
+
+
+ + +
+
+ +
+
+
+ +
+ bfoptimize +
+
+
+
+ +
+
+ + + +
+ + Idle +
+ +
+
+ +
+
+
Attempts
+
Accepted
+
Rejected
+
Best cumul. Δ
+
+ +
+
+
+ Runtime delta per attempt + negative = faster than baseline +
+
+ +
+
+
+
+ Output + +
+

+    
+
+ +
+
Benchmark history
+
+
+
+ + No results yet — start a run to populate this section +
+
+ + +
+
+
+ + + +""" + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def main() -> None: + global _cache_dir, _sources_dir + + parser = argparse.ArgumentParser( + prog="bfoptimize-web", + description="Local web UI for bfoptimize.", + ) + parser.add_argument( + "--port", + type=int, + default=8080, + help="TCP port to listen on (default: 8080)", + ) + parser.add_argument( + "--sources", + type=pathlib.Path, + default=pathlib.Path("."), + help='bpfilter source directory for git diff (default: ".")', + ) + parser.add_argument( + "--cache-dir", + type=pathlib.Path, + default=pathlib.Path(".cache/bfoptimize"), + help='bfoptimize cache directory (default: ".cache/bfoptimize")', + ) + args = parser.parse_args() + + _cache_dir = args.cache_dir + _sources_dir = args.sources + + uvicorn.run(app, host="127.0.0.1", port=args.port) + + +if __name__ == "__main__": + main() From 88182237e9274c5ccddfdbcd53bd770841fffb27 Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Fri, 20 Mar 2026 13:57:19 +0100 Subject: [PATCH 6/7] tools: benchmarks: move benchmarks into tests/ --- .github/workflows/ci.yaml | 4 +- CLAUDE.md | 2 +- CMakeLists.txt | 5 - README.md | 2 +- derivation.nix | 1 - doc/developers/build.rst | 3 +- tests/CMakeLists.txt | 1 + {tools => tests}/benchmarks/CMakeLists.txt | 2 - {tools => tests}/benchmarks/benchmark.cpp | 0 {tools => tests}/benchmarks/benchmark.hpp | 0 {tools => tests}/benchmarks/bfbencher | 8 +- {tools => tests}/benchmarks/main.cpp | 0 {tools => tests}/benchmarks/summary.html.j2 | 0 tests/check/CMakeLists.txt | 2 +- tools/benchmarks/results.html.j2 | 819 -------------------- tools/bfoptimize | 9 +- 16 files changed, 16 insertions(+), 842 deletions(-) rename {tools => tests}/benchmarks/CMakeLists.txt (98%) rename {tools => tests}/benchmarks/benchmark.cpp (100%) rename {tools => tests}/benchmarks/benchmark.hpp (100%) rename {tools => 
tests}/benchmarks/bfbencher (99%) rename {tools => tests}/benchmarks/main.cpp (100%) rename {tools => tests}/benchmarks/summary.html.j2 (100%) delete mode 100644 tools/benchmarks/results.html.j2 diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4c41d5a5f..29e872fae 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -82,7 +82,7 @@ jobs: - name: Checkout bpfilter uses: actions/checkout@v2 - name: Configure the build - run: cmake -S $GITHUB_WORKSPACE -B $GITHUB_WORKSPACE/build -DNO_BENCHMARKS=1 + run: cmake -S $GITHUB_WORKSPACE -B $GITHUB_WORKSPACE/build - name: Build all run: make -C $GITHUB_WORKSPACE/build -j `nproc` @@ -177,7 +177,7 @@ jobs: BENCH_INCLUDE="" BENCH_FAIL_ON="" fi - tools/benchmarks/bfbencher \ + tests/benchmarks/bfbencher \ history \ --since 30bd49f \ --until $BENCH_UNTIL \ diff --git a/CLAUDE.md b/CLAUDE.md index fe4f209b1..ecf74b6c2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -68,7 +68,7 @@ make -C build doc ``` **Build options:** -- `-DNO_DOCS=1`, `-DNO_TESTS=1`, `-DNO_CHECKS=1`, `-DNO_BENCHMARKS=1` +- `-DNO_DOCS=1`, `-DNO_TESTS=1`, `-DNO_CHECKS=1` ## Code style diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e787ed8a..d3b5f48c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,6 @@ include(GNUInstallDirs) option(NO_DOCS "Disable documentation generation" 0) option(NO_TESTS "Disable unit, end-to-end, and integration tests" 0) option(NO_CHECKS "Disable the check target (clang-tidy and clang-format" 0) -option(NO_BENCHMARKS "Disable the benchmark" 0) option(WITH_COVERAGE "Build with code coverage support. 
Disabled by default" 0) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -100,7 +99,3 @@ if (NOT ${NO_TESTS}) enable_testing() add_subdirectory(tests) endif () - -if (NOT ${NO_BENCHMARKS}) - add_subdirectory(tools/benchmarks) -endif () diff --git a/README.md b/README.md index 4705845a1..feab00460 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ sudo dnf install -y bpfilter bpfilter-devel sudo dnf install -y clang cmake gcc libbpf-devel bison flex sed xxd # Configure the project and build bpfilter -cmake -S $SOURCES_DIR -B $BUILD_DIR -DNO_DOCS=ON -DNO_TESTS=ON -DNO_CHECKS=ON -DNO_BENCHMARKS=ON +cmake -S $SOURCES_DIR -B $BUILD_DIR -DNO_DOCS=ON -DNO_TESTS=ON -DNO_CHECKS=ON make -C $BUILD_DIR ``` diff --git a/derivation.nix b/derivation.nix index 02da8cdb6..82b736e2b 100644 --- a/derivation.nix +++ b/derivation.nix @@ -66,7 +66,6 @@ in "-DNO_DOCS=1" "-DNO_TESTS=1" "-DNO_CHECKS=1" - "-DNO_BENCHMARKS=1" ]; # We do not run the unit tests because the nix build sandbox doesn't diff --git a/doc/developers/build.rst b/doc/developers/build.rst index f979cac32..25e4fa1d6 100644 --- a/doc/developers/build.rst +++ b/doc/developers/build.rst @@ -95,9 +95,8 @@ You can then use CMake to generate the build system: The usual CMake options are allowed (e.g. ``CMAKE_BUILD_TYPE``, ``CMAKE_INSTALL_PREFIX``...). The build configuration is modular, so you're free to enable/disable some parts of the projects according to your needs: - ``-DNO_DOCS``: disable the documentation, including the coverage and benchmarks report. -- ``-DNO_TESTS``: disable unit tests, end-to-end tests, and integration tests. +- ``-DNO_TESTS``: disable all tests. - ``-DNO_CHECKS``: disable style check and static analyzer. -- ``-DNO_BENCHMARKS``: disable benchmarks. 
A full configuration (without any part disabled) will provide the following targets: diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b3efbb8f3..4c8d0d6f9 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -7,6 +7,7 @@ add_subdirectory(e2e) add_subdirectory(fuzz) add_subdirectory(integration) add_subdirectory(check) +add_subdirectory(benchmarks) add_custom_target(test_bin DEPENDS unit_bin e2e_bin fuzz_parser diff --git a/tools/benchmarks/CMakeLists.txt b/tests/benchmarks/CMakeLists.txt similarity index 98% rename from tools/benchmarks/CMakeLists.txt rename to tests/benchmarks/CMakeLists.txt index a9fad4f2a..e5d3b54a3 100644 --- a/tools/benchmarks/CMakeLists.txt +++ b/tests/benchmarks/CMakeLists.txt @@ -1,8 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only # Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -enable_language(CXX) - find_package(benchmark REQUIRED) find_package(PkgConfig REQUIRED) pkg_check_modules(bpf REQUIRED IMPORTED_TARGET libbpf) diff --git a/tools/benchmarks/benchmark.cpp b/tests/benchmarks/benchmark.cpp similarity index 100% rename from tools/benchmarks/benchmark.cpp rename to tests/benchmarks/benchmark.cpp diff --git a/tools/benchmarks/benchmark.hpp b/tests/benchmarks/benchmark.hpp similarity index 100% rename from tools/benchmarks/benchmark.hpp rename to tests/benchmarks/benchmark.hpp diff --git a/tools/benchmarks/bfbencher b/tests/benchmarks/bfbencher similarity index 99% rename from tools/benchmarks/bfbencher rename to tests/benchmarks/bfbencher index d95ab5d98..9fe089403 100755 --- a/tools/benchmarks/bfbencher +++ b/tests/benchmarks/bfbencher @@ -36,8 +36,8 @@ DEFAULT_LAST_COMMIT_REF = "wip" DEFAULT_SOURCE_PATH = pathlib.Path(".") DEFAULT_CACHE_PATH = pathlib.Path(".cache/bfbencher") DEFAULT_USERNAME = getpass.getuser() -DEFAULT_REPORT_TEMPLATE_PATH = pathlib.Path("tools/benchmarks/results.html.j2") -DEFAULT_PR_REPORT_TEMPLATE_PATH = pathlib.Path("tools/benchmarks/summary.html.j2") 
+DEFAULT_REPORT_TEMPLATE_PATH = pathlib.Path("tests/benchmarks/results.html.j2") +DEFAULT_PR_REPORT_TEMPLATE_PATH = pathlib.Path("tests/benchmarks/summary.html.j2") DEFAULT_HOST = [socket.gethostname(), "localhost"] SHORT_SHA_LEN = 7 @@ -841,7 +841,7 @@ class BenchmarkContext: return self.build_dir / "output/sbin/bfcli" def configure( - self, doc: bool = False, tests: bool = False, checks: bool = False + self, doc: bool = False, checks: bool = False ) -> bool: cmd: list[str] = [ "cmake", @@ -853,8 +853,6 @@ class BenchmarkContext: if not doc: cmd += ["-DNO_DOCS=1"] - if not tests: - cmd += ["-DNO_TESTS=1"] if not checks: cmd += ["-DNO_CHECKS=1"] diff --git a/tools/benchmarks/main.cpp b/tests/benchmarks/main.cpp similarity index 100% rename from tools/benchmarks/main.cpp rename to tests/benchmarks/main.cpp diff --git a/tools/benchmarks/summary.html.j2 b/tests/benchmarks/summary.html.j2 similarity index 100% rename from tools/benchmarks/summary.html.j2 rename to tests/benchmarks/summary.html.j2 diff --git a/tests/check/CMakeLists.txt b/tests/check/CMakeLists.txt index e26f9baf1..d62632af0 100644 --- a/tests/check/CMakeLists.txt +++ b/tests/check/CMakeLists.txt @@ -13,7 +13,7 @@ file(GLOB_RECURSE bf_srcs file(GLOB_RECURSE bf_test_srcs ${CMAKE_SOURCE_DIR}/tests/*.h ${CMAKE_SOURCE_DIR}/tests/*.c ${CMAKE_SOURCE_DIR}/tests/*.hpp ${CMAKE_SOURCE_DIR}/tests/*.cpp - ${CMAKE_SOURCE_DIR}/tools/benchmarks/*.hpp ${CMAKE_SOURCE_DIR}/tools/benchmarks/*.cpp + ${CMAKE_SOURCE_DIR}/tests/benchmarks/*.hpp ${CMAKE_SOURCE_DIR}/tests/benchmarks/*.cpp ) set(bf_all_srcs ${bf_srcs} ${bf_test_srcs}) diff --git a/tools/benchmarks/results.html.j2 b/tools/benchmarks/results.html.j2 deleted file mode 100644 index 60d54a0c1..000000000 --- a/tools/benchmarks/results.html.j2 +++ /dev/null @@ -1,819 +0,0 @@ - - - - - - - Benchmark Results - - - - - - - - - - - -
- - - - -
-
-
-
-
Commit range
-
{{ first_commit_sha[:7] }}..{{ last_commit_sha[:7] }}
-
-
-
Host
-
{{ hostname }}
-
-
-
Commits
-
{{ n_commits }}
-
-
-
Results
- {% if stats.n_failures %} -
{{ stats.n_successes }} ({{ stats.n_failures }} failures)
- {% else %} -
{{ stats.n_successes }}
- {% endif %} -
-
-
-
- - -
-
-
-
-

Benchmark metrics

- {{ history.sorted_benchmarks() | length }} benchmarks -
- -
- Each row shows performance data for a single benchmark. Runtime is the CPU time - measured for the most recent commit, and Instructions is the BPF instruction count - of the generated program. The Δ (delta) columns show the percentage change compared - to the mean of the previous N commits (e.g., "5 commits" compares against the average of - commits 2-6). Only statistically significant changes (z-score > 2.5) are colored: - green for improvements (faster or fewer instructions), - red for regressions. Uncolored values indicate changes within - normal variance. Click a benchmark name to jump to its historical chart below. -
- - {%- from "summary.html.j2" import render_table -%} -
- {{ render_table(rows, terms, none, "bootstrap", true, ureg, none, get_class) }} -
-
-
-
- - -
-
-

Performance trends

- Historical data -
-
-
-
- - - - - diff --git a/tools/bfoptimize b/tools/bfoptimize index 337e318a1..77d716b36 100755 --- a/tools/bfoptimize +++ b/tools/bfoptimize @@ -341,7 +341,9 @@ def run_benchmark( "--fail-on-benchmark-error", ] + extra_args - console.log(f"Running bfbencher compare {baseline_sha[:SHORT_SHA_LEN]} → {result_sha[:SHORT_SHA_LEN]}") + console.log( + f"Running bfbencher compare {baseline_sha[:SHORT_SHA_LEN]} → {result_sha[:SHORT_SHA_LEN]}" + ) result = subprocess.run(cmd, text=True, env={**os.environ, "PYTHONUNBUFFERED": "1"}) if result.returncode != 0: console.log("[red]bfbencher compare failed[/red]") @@ -367,7 +369,8 @@ def run_benchmark( noisy = [(name, cv) for name, cv in noise.items() if cv > 0.01] if noisy: noisy_str = ", ".join( - f"{name} (CV={cv:.1%})" for name, cv in sorted(noisy, key=lambda x: -x[1]) + f"{name} (CV={cv:.1%})" + for name, cv in sorted(noisy, key=lambda x: -x[1]) ) console.log(f"[dim]Volatile benchmarks (down-weighted): {noisy_str}[/dim]") @@ -431,7 +434,7 @@ async def run_optimize(args: argparse.Namespace) -> None: console.log(f"Baseline: {history.baseline_sha[:SHORT_SHA_LEN]}") - bfbencher = (args.sources / "tools/benchmarks/bfbencher").resolve() + bfbencher = (args.sources / "tests/benchmarks/bfbencher").resolve() bench_extra: list[str] = [] if args.host: From b58470e4cb5c1c7d493b8c793b6cbaac2435f065 Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Fri, 20 Mar 2026 18:24:34 +0100 Subject: [PATCH 7/7] tests: benchmarks: add per-hook prologue benchmarks --- tests/benchmarks/main.cpp | 102 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/tests/benchmarks/main.cpp b/tests/benchmarks/main.cpp index c8f634aba..7517ef68b 100644 --- a/tests/benchmarks/main.cpp +++ b/tests/benchmarks/main.cpp @@ -94,6 +94,108 @@ void chain_policy_c(::benchmark::State &state) BENCHMARK(chain_policy_c); +void xdp_prologue_c(::benchmark::State &state) +{ + Chain chain("bf_benchmark", BF_HOOK_XDP, BF_VERDICT_ACCEPT); + + auto 
chainp = chain.get(); + int ret = bf_chain_set(chainp.get(), nullptr); + if (ret < 0) + throw std::runtime_error("failed to load chain"); + + auto prog = bft::Program(chain.name()); + + while (state.KeepRunningBatch(::bft::progRunRepeat)) { + auto stats = prog.run(::bft::pkt_local_ip6_tcp); + if (stats.retval != XDP_PASS) + state.SkipWithError("benchmark run failed"); + + state.SetIterationTime((double)stats.duration * stats.repeat); + } + + state.counters["nInsn"] = prog.nInsn(); + state.SetLabel("XDP prologue, accept policy"); +} + +BENCHMARK(xdp_prologue_c); + +void tc_ingress_prologue_c(::benchmark::State &state) +{ + Chain chain("bf_benchmark", BF_HOOK_TC_INGRESS, BF_VERDICT_ACCEPT); + + auto chainp = chain.get(); + int ret = bf_chain_set(chainp.get(), nullptr); + if (ret < 0) + throw std::runtime_error("failed to load chain"); + + auto prog = bft::Program(chain.name()); + + // TC_ACT_OK = 0 + while (state.KeepRunningBatch(::bft::progRunRepeat)) { + auto stats = prog.run(::bft::pkt_local_ip6_tcp); + if (stats.retval != 0) + state.SkipWithError("benchmark run failed"); + + state.SetIterationTime((double)stats.duration * stats.repeat); + } + + state.counters["nInsn"] = prog.nInsn(); + state.SetLabel("TC_INGRESS prologue, accept policy"); +} + +BENCHMARK(tc_ingress_prologue_c); + +void cgroup_skb_ingress_prologue_c(::benchmark::State &state) +{ + Chain chain("bf_benchmark", BF_HOOK_CGROUP_SKB_INGRESS, BF_VERDICT_ACCEPT); + + auto chainp = chain.get(); + int ret = bf_chain_set(chainp.get(), nullptr); + if (ret < 0) + throw std::runtime_error("failed to load chain"); + + auto prog = bft::Program(chain.name()); + + while (state.KeepRunningBatch(::bft::progRunRepeat)) { + auto stats = prog.run(::bft::pkt_local_ip6_tcp); + if (stats.retval != ::bft::CGROUP_SKB_ACCEPT) + state.SkipWithError("benchmark run failed"); + + state.SetIterationTime((double)stats.duration * stats.repeat); + } + + state.counters["nInsn"] = prog.nInsn(); + state.SetLabel("CGROUP_SKB_INGRESS 
prologue, accept policy"); +} + +BENCHMARK(cgroup_skb_ingress_prologue_c); + +void nf_local_in_prologue_c(::benchmark::State &state) +{ + Chain chain("bf_benchmark", BF_HOOK_NF_LOCAL_IN, BF_VERDICT_ACCEPT); + + auto chainp = chain.get(); + int ret = bf_chain_set(chainp.get(), nullptr); + if (ret < 0) + throw std::runtime_error("failed to load chain"); + + auto prog = bft::Program(chain.name()); + + // NF_ACCEPT = 1 + while (state.KeepRunningBatch(::bft::progRunRepeat)) { + auto stats = prog.run(::bft::pkt_local_ip6_tcp); + if (stats.retval != 1) + state.SkipWithError("benchmark run failed"); + + state.SetIterationTime((double)stats.duration * stats.repeat); + } + + state.counters["nInsn"] = prog.nInsn(); + state.SetLabel("NF_LOCAL_IN prologue, accept policy"); +} + +BENCHMARK(nf_local_in_prologue_c); + void single_rule__ip4_saddr(::benchmark::State &state) { Chain chain("bf_benchmark", BF_HOOK_XDP, BF_VERDICT_ACCEPT);