From 0cb835c9583b9bb9963f9788fbd9f02344538bee Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Tue, 10 Mar 2026 11:40:43 +0000
Subject: [PATCH 01/19] AI setup

---
 .Rbuildignore          |   3 +
 .positai/settings.json |  25 +++++
 AGENTS.md              | 205 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 233 insertions(+)
 create mode 100644 .positai/settings.json
 create mode 100644 AGENTS.md

diff --git a/.Rbuildignore b/.Rbuildignore
index f2ea6e43d..1aa03d51f 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -7,6 +7,7 @@
 ^src\-
 ^src\.
 CONTRIBUTING
+AGENTS.md
 README.md
 ^\.github
 ^\.lintr
@@ -34,3 +35,5 @@ revdep
 ^\.vs.*$
 ^\.vscode.*$
 ^CRAN-SUBMISSION$
+^\.positai$
+^\.claude$
diff --git a/.positai/settings.json b/.positai/settings.json
new file mode 100644
index 000000000..f3f178518
--- /dev/null
+++ b/.positai/settings.json
@@ -0,0 +1,25 @@
+{
+  "permission": {
+    "read": {
+      "*.r": "allow",
+      "*.h": "allow",
+      "*.cpp": "allow",
+      "*.R": "allow",
+      "*": "allow"
+    },
+    "edit": {
+      "*.cpp": "allow",
+      "*.h": "allow",
+      "*.R": "allow",
+      "*.md": "allow"
+    },
+    "bash": {
+      "echo *": "allow",
+      "grep *": "allow",
+      "head *": "allow"
+      "ls *": "allow",
+      "rm -f src/*.o src/*.dll": "allow",
+      "tail *": "allow",
+    }
+  }
+}
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 000000000..dd295d396
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,205 @@
+# TreeDist — Agent Memory
+
+## Project Overview
+
+**TreeDist** is an R package (GPL ≥ 3) providing a suite of topological distance metrics
+between phylogenetic trees. The mathematical core is implemented in C++17 and exposed to R
+via Rcpp. The primary optimization goal is speed: many real analyses compute pairwise
+distances over hundreds or thousands of trees, so inner loops must be tight.
+
+Current version: `2.12.0.9000` (development).  
+CRAN package page: <https://cran.r-project.org/package=TreeDist>
+
+---
+
+## Repository Layout
+
+```
+TreeDist/
+├── src/                  # C++17 source (the main optimization target)
+├── R/                    # R wrapper layer and pure-R helpers
+├── benchmark/            # Microbenchmark scripts (bench package)
+├── tests/testthat/       # Unit tests
+├── data-raw/             # Scripts that regenerate lookup tables / data
+├── vignettes/            # User-facing tutorials
+└── inst/                 # Installed extras
+```
+
+**Sibling repository** — `../TreeTools` is a companion package that TreeDist links against
+at the C++ level (`LinkingTo: TreeTools`). Edits to TreeTools headers (especially
+`SplitList.h`) can affect TreeDist performance and can be pushed to CRAN independently
+when ready.
+
+---
+
+## C++ Source Files
+
+| File | Size | Purpose |
+|------|------|---------|
+| `tree_distances.cpp` | 22 KB | Main distance calculations; calls into CostMatrix / LAP |
+| `tree_distances.h` | 15 KB | **CostMatrix** class; cache-aligned storage; `findRowSubmin` hot path |
+| `lap.cpp` | 10 KB | Jonker-Volgenant LAPJV linear assignment; extensively hand-optimized |
+| `spr.cpp` | 7 KB | SPR distance approximation |
+| `spr_lookup.cpp` | — | SPR lookup-table implementation |
+| `nni_distance.cpp` | 16 KB | NNI distance approximations; HybridBuffer allocation |
+| `li_diameters.h` | 30 KB | Precomputed NNI diameter lookup tables |
+| `information.h` | 6 KB | log₂ / factorial lookup tables (max 8192); cached at startup |
+| `binary_entropy_counts.cpp` | — | Binary entropy calculations |
+| `day_1985.cpp` | 10 KB | Consensus tree information |
+| `hmi.cpp` | 6 KB | Hierarchical Mutual Information |
+| `hpart.cpp` | 7 KB | Hierarchical partition structures |
+| `reduce_tree.cpp` | 11 KB | Prune trees to common tip set before distance calculation |
+| `path_vector.cpp` | 3 KB | Path distance vector |
+| `mast.cpp` | 5 KB | Maximum Agreement Subtree |
+| `RcppExports.cpp` | 20 KB | Auto-generated Rcpp glue (do not edit by hand) |
+| `ints.h` | — | Fixed-width integer typedefs (`splitbit`, `int16`, `int32`, …) |
+
+---
+
+## Key Optimizations Already in Place
+
+Understanding what has already been done avoids duplicating effort.
+
+### Memory / cache
+- **64-byte alignment** (`alignas(64)`) on `CostMatrix::data_` and `t_data_`.
+- `CostMatrix` pads row width to a multiple of `BLOCK_SIZE = 16` so SIMD loads never
+  straddle cache lines.
+- A **transposed copy** of the cost matrix is maintained to allow column-wise access with
+  sequential memory reads.
+
+### Arithmetic
+- **Lookup tables** in `information.h` for `log2` (values 0–(SL_MAX_TIPS-1)²) and
+  `log2_factorial` (up to 8192); initialized via `__attribute__((constructor))`.
+  Using these avoids runtime `std::log2()` calls on hot paths.
+- **Loop unrolling** — `CostMatrix::findRowSubmin()` manually unrolls 4× (comment:
+  "gives ~20% speedup").
+- **`__restrict__`** pointer annotations in `lap.cpp` and `tree_distances.h` to enable
+  compiler alias analysis.
+- **`__builtin_assume_aligned(ptr, 64)`** hints around inner loops.
+- **`__builtin_popcount()`** for split-bit counting.
+
+### Algorithm
+- **Column reduction in reverse order** in LAPJV (faster convergence).
+- **`findRowSubmin` two-pass strategy** avoids redundant comparisons.
+- **LAPJV v2.10.0** achieved a 2× speedup for large matrices via reorganisation.
+- **`HybridBuffer<T, StackSize>`** in `nni_distance.cpp`: small allocations go on the
+  stack (thresholds: 512 splits, 16 bins) to avoid heap overhead.
+- **Tree reduction** (`reduce_tree.cpp`): trees are pruned to their common tip set before
+  any distance is computed; this is a major algorithmic win for partially-overlapping
+  taxon sets.
+
+### Types
+- `cost = int_fast64_t` for LAP to avoid overflow and exploit 64-bit registers.
+- `BIG = numeric_limits<cost>::max() / SL_MAX_SPLITS` — the divide avoids integer
+  overflow inside the LAP inner loop.
+- `ROUND_PRECISION = 2048²` for safe rounding in cost scaling.
+
+---
+
+## Benchmark Infrastructure
+
+All benchmarks live in `benchmark/` and use the `bench` package.
+
+```r
+source("benchmark/_init.R")   # loads TreeTools, TreeDist; defines Benchmark()
+```
+
+`Benchmark(expr)` wraps `bench::mark()` with `min_iterations = 3` and `time_unit = "us"`.
+When run non-interactively (e.g. CI) results are serialised to `.bench.Rds` files.
+`_compare_results.R` compares PR results against `main` and fails the build on a
+regression > 4 %.
+
+### Existing benchmark scripts
+
+| Script | What it tests |
+|--------|---------------|
+| `bench-tree-distances.R` | CID, PID, RF, MCI on 100×50-tip and 40×200-tip tree sets |
+| `bench-LAPJV.R` | LAPJV on 40×40, 400×400, 1999×1999 uniform random matrices |
+| `bench-PathDist.R` | PathDist on 6×182-tip trees |
+
+To add a new benchmark, create `benchmark/bench-<topic>.R` following the existing pattern
+and add it to `_run_benchmarks.R`.
+
+---
+
+## Profiling with VTune
+
+VTune 2025.9 is installed at:
+
+```
+C:\Program Files (x86)\Intel\oneAPI\vtune\2025.9\bin64\vtune.exe
+```
+
+Typical workflow:
+1. Build TreeDist with debug symbols but optimisations enabled:
+   `PKG_CXXFLAGS="-O2 -g"` in `~/.R/Makevars`.
+2. Write a driver `.R` script that exercises the hot path in a loop (use the benchmark
+   scripts as a starting point).
+3. Run a hotspot collection:
+   ```
+   vtune -collect hotspots -result-dir vtune-out -- Rscript path/to/driver.R
+   ```
+4. View the report:
+   ```
+   vtune -report hotspots -result-dir vtune-out
+   ```
+   Or open the `.vtune` project in the VTune GUI.
+5. Pay attention to the **Memory Access** and **Threading** analyses for cache-miss and
+   false-sharing diagnostics.
+
+---
+
+## TreeTools Dependency
+
+`../TreeTools` is available locally and editable. It is linked at the C++ level via
+`LinkingTo`; the key header consumed by TreeDist is `<TreeTools/SplitList.h>`.
+
+Important constants defined there that affect TreeDist:
+- `SL_MAX_TIPS` — maximum number of leaf taxa per tree.
+- `SL_MAX_SPLITS` — maximum number of splits.
+- `splitbit` — the unsigned integer type used for bitset representation of splits.
+
+If a bottleneck traces back to a TreeTools header (e.g. SplitList layout, bit-width of
+`splitbit`), changes can be made in `../TreeTools`, tested locally by re-installing, and
+pushed to CRAN when stable.
+
+---
+
+## Development Workflow
+
+```r
+# Build and reload (from R)
+devtools::load_all()          # fast incremental rebuild
+devtools::test()              # run testthat suite
+
+# Or from the shell
+R CMD build .
+R CMD check TreeDist_*.tar.gz
+
+# Run a single benchmark interactively
+source("benchmark/_init.R")
+source("benchmark/bench-tree-distances.R")
+```
+
+C++ compilation flags are controlled by `src/Makevars.win` (Windows) / `src/Makevars`.
+The package requires **C++17** (`CXX_STD = CXX17`).
+
+---
+
+## Known Optimization Opportunities / TODOs
+
+- `information.h` line 19: comment suggests considering increasing `MAX_FACTORIAL_LOOKUP`
+  beyond 8192 or falling back to runtime calculation for larger values.
+- `information.h` lines 120–122: code duplication flagged for consolidation.
+- LAP inner loop: the 4× manual unroll works well; investigate whether AVX2 SIMD
+  intrinsics (`_mm256_*`) could replace it on modern hardware.
+- `CostMatrix` transpose: currently maintained as a full second copy; a cache-oblivious
+  blocking scheme (already partially implemented via `BLOCK_SIZE`) could reduce memory
+  bandwidth further.
+- SPR distance (`spr.cpp`, `spr_lookup.cpp`): the algorithm is relatively recent
+  (v2.8.0); profiling under VTune may reveal further hot spots.
+- Parallelism: `parallel.R` exposes a parallel interface at the R level; the C++ layer
+  does not yet use OpenMP. Pairwise distance matrices are an embarrassingly parallel
+  workload.
+- Large-tree path (`int32` migration, v2.12.0 dev): ensure new code paths are as
+  optimized as the original `int16` paths.

From fff9a510c5618436e838b0db101600485733669f Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Tue, 10 Mar 2026 13:08:59 +0000
Subject: [PATCH 02/19] OpenMP integration

---
 .positai/settings.json      |  11 +-
 AGENTS.md                   |  37 ++++++-
 R/RcppExports.R             |   4 +
 R/tree_distance_utilities.R |  16 +++
 src/RcppExports.cpp         |  13 +++
 src/lap.cpp                 |   5 +-
 src/pairwise_distances.cpp  | 199 ++++++++++++++++++++++++++++++++++++
 src/tree_distances.cpp      |  19 ----
 src/tree_distances.h        |  24 ++++-
 9 files changed, 297 insertions(+), 31 deletions(-)
 create mode 100644 src/pairwise_distances.cpp

diff --git a/.positai/settings.json b/.positai/settings.json
index f3f178518..9d0741541 100644
--- a/.positai/settings.json
+++ b/.positai/settings.json
@@ -3,6 +3,7 @@
     "read": {
       "*.r": "allow",
       "*.h": "allow",
+      "*.log": "allow",
       "*.cpp": "allow",
       "*.R": "allow",
       "*": "allow"
@@ -10,16 +11,22 @@
     "edit": {
       "*.cpp": "allow",
       "*.h": "allow",
+      "DESCRIPTION": "allow",
+      "src/Makevars": "allow",
+      "src/Makevars.win": "allow",
       "*.R": "allow",
       "*.md": "allow"
     },
     "bash": {
+      "cd C:/Users/pjjg18/GitHub/TreeDist": "allow",
+      "cat *": "allow",
       "echo *": "allow",
+      "find *": "allow",
       "grep *": "allow",
-      "head *": "allow"
+      "head *": "allow",
       "ls *": "allow",
       "rm -f src/*.o src/*.dll": "allow",
-      "tail *": "allow",
+      "tail *": "allow"
     }
   }
 }
diff --git a/AGENTS.md b/AGENTS.md
index dd295d396..280a133c7 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -116,6 +116,7 @@ regression > 4 %.
 | `bench-tree-distances.R` | CID, PID, RF, MCI on 100×50-tip and 40×200-tip tree sets |
 | `bench-LAPJV.R` | LAPJV on 40×40, 400×400, 1999×1999 uniform random matrices |
 | `bench-PathDist.R` | PathDist on 6×182-tip trees |
+| `bench-MCI-openmp.R` | MCI/CID OpenMP scaling on 100×50, 40×200, and 200×200-tip tree sets |
 
 To add a new benchmark, create `benchmark/bench-<topic>.R` following the existing pattern
 and add it to `_run_benchmarks.R`.
@@ -186,11 +187,38 @@ The package requires **C++17** (`CXX_STD = CXX17`).
 
 ---
 
+## Completed Optimizations (this dev cycle)
+
+### OpenMP parallelism for pairwise distances (`src/pairwise_distances.cpp`)
+Added `#pragma omp parallel for schedule(dynamic)` over the pairwise loop for
+`MutualClusteringInfo` / `ClusteringInfoDistance`.  Build infrastructure:
+
+- `src/Makevars` and `src/Makevars.win` created (these did not exist before);
+  both set `CXX_STD = CXX17` and `PKG_CXXFLAGS/PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS)`.
+  On platforms without OpenMP the flags are empty and the package builds
+  single-threaded.
+- `lap()` in `lap.cpp` gained an `allow_interrupt = true` parameter; the batch
+  path passes `false` to avoid calling `Rcpp::checkUserInterrupt()` from a
+  worker thread.
+- `add_ic_element` moved from `tree_distances.cpp` (inside `namespace TreeDist`,
+  inaccessible to other TUs) into `tree_distances.h` as a proper `inline`,
+  fixing a latent ODR issue and making it visible to `pairwise_distances.cpp`.
+- R fast path: `.SplitDistanceAllPairs()` in `tree_distance_utilities.R` detects
+  `Func == MutualClusteringInfoSplits` with a same-tip-set, no-cluster call and
+  routes it to `cpp_mutual_clustering_all_pairs()`.
+
+**Benchmark script**: `benchmark/bench-MCI-openmp.R`
+**To measure speedup**: run `bench-MCI-openmp.R` in a **fresh R session** after
+`devtools::load_all()` (Windows locks the DLL in the session that compiles it).
+
+---
+
 ## Known Optimization Opportunities / TODOs
 
 - `information.h` line 19: comment suggests considering increasing `MAX_FACTORIAL_LOOKUP`
   beyond 8192 or falling back to runtime calculation for larger values.
-- `information.h` lines 120–122: code duplication flagged for consolidation.
+- `information.h` lines 120–122: `log2_factorial_table` is a verbatim copy from
+  `TreeSearch/src/expected_mi.cpp`; should be defined once (TreeTools?) and shared.
 - LAP inner loop: the 4× manual unroll works well; investigate whether AVX2 SIMD
   intrinsics (`_mm256_*`) could replace it on modern hardware.
 - `CostMatrix` transpose: currently maintained as a full second copy; a cache-oblivious
@@ -198,8 +226,9 @@ The package requires **C++17** (`CXX_STD = CXX17`).
   bandwidth further.
 - SPR distance (`spr.cpp`, `spr_lookup.cpp`): the algorithm is relatively recent
   (v2.8.0); profiling under VTune may reveal further hot spots.
-- Parallelism: `parallel.R` exposes a parallel interface at the R level; the C++ layer
-  does not yet use OpenMP. Pairwise distance matrices are an embarrassingly parallel
-  workload.
+- OpenMP for other metrics: `pairwise_distances.cpp` currently covers only MCI/CID.
+  Adding equivalent batch functions for `SharedPhylogeneticInfo`, `MatchingSplitInfo`,
+  `RobinsonFoulds`, and `MatchingSplitDistance` would extend the speedup; each needs
+  a score-only variant of its computation.
 - Large-tree path (`int32` migration, v2.12.0 dev): ensure new code paths are as
   optimized as the original `int16` paths.
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 81e688d39..2ca5d5977 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -78,6 +78,10 @@ cpp_nni_distance <- function(edge1, edge2, nTip) {
     .Call(`_TreeDist_cpp_nni_distance`, edge1, edge2, nTip)
 }
 
+cpp_mutual_clustering_all_pairs <- function(splits_list, n_tip) {
+    .Call(`_TreeDist_cpp_mutual_clustering_all_pairs`, splits_list, n_tip)
+}
+
 path_vector <- function(edge) {
     .Call(`_TreeDist_path_vector`, edge)
 }
diff --git a/R/tree_distance_utilities.R b/R/tree_distance_utilities.R
index 5bbef3ac3..e7de4f2fa 100644
--- a/R/tree_distance_utilities.R
+++ b/R/tree_distance_utilities.R
@@ -129,6 +129,22 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL,
                                    nTip = length(tipLabels), ...) {
   cluster <- getOption("TreeDist-cluster")
 
+  # Fast path: use the OpenMP batch function for mutual clustering when all
+  # trees share the same tip set and no R-level cluster has been configured.
+  # Matches the behaviour of the generic path but avoids per-pair R overhead.
+  if (!is.na(nTip) && is.null(cluster) &&
+      identical(Func, MutualClusteringInfoSplits)) {
+    splits <- as.Splits(splits1, tipLabels = tipLabels, asSplits = FALSE)
+    return(structure(
+      cpp_mutual_clustering_all_pairs(splits, as.integer(nTip)),
+      class  = "dist",
+      Size   = length(splits1),
+      Labels = names(splits1),
+      Diag   = FALSE,
+      Upper  = FALSE
+    ))
+  }
+
   if (is.na(nTip)) {
     splits <- lapply(splits1, as.Splits)
     
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 5c338bf32..3aa1a162c 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -211,6 +211,18 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// cpp_mutual_clustering_all_pairs
+NumericVector cpp_mutual_clustering_all_pairs(const List& splits_list, const int n_tip);
+RcppExport SEXP _TreeDist_cpp_mutual_clustering_all_pairs(SEXP splits_listSEXP, SEXP n_tipSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const List& >::type splits_list(splits_listSEXP);
+    Rcpp::traits::input_parameter< const int >::type n_tip(n_tipSEXP);
+    rcpp_result_gen = Rcpp::wrap(cpp_mutual_clustering_all_pairs(splits_list, n_tip));
+    return rcpp_result_gen;
+END_RCPP
+}
 // path_vector
 IntegerVector path_vector(IntegerMatrix edge);
 RcppExport SEXP _TreeDist_path_vector(SEXP edgeSEXP) {
@@ -432,6 +444,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_TreeDist_lapjv", (DL_FUNC) &_TreeDist_lapjv, 2},
     {"_TreeDist_cpp_mast", (DL_FUNC) &_TreeDist_cpp_mast, 3},
     {"_TreeDist_cpp_nni_distance", (DL_FUNC) &_TreeDist_cpp_nni_distance, 3},
+    {"_TreeDist_cpp_mutual_clustering_all_pairs", (DL_FUNC) &_TreeDist_cpp_mutual_clustering_all_pairs, 2},
     {"_TreeDist_path_vector", (DL_FUNC) &_TreeDist_path_vector, 1},
     {"_TreeDist_vec_diff_euclidean", (DL_FUNC) &_TreeDist_vec_diff_euclidean, 2},
     {"_TreeDist_pair_diff_euclidean", (DL_FUNC) &_TreeDist_pair_diff_euclidean, 1},
diff --git a/src/lap.cpp b/src/lap.cpp
index 978bd9a47..73f6c5989 100644
--- a/src/lap.cpp
+++ b/src/lap.cpp
@@ -69,7 +69,8 @@ inline bool nontrivially_less_than(cost a, cost b) noexcept {
 cost lap(const lap_row dim,
          cost_matrix &input_cost,
          std::vector<lap_col> &rowsol,
-         std::vector<lap_row> &colsol)
+         std::vector<lap_row> &colsol,
+         const bool allow_interrupt)
   
   // input:
   // dim        - problem size
@@ -186,7 +187,7 @@ cost lap(const lap_row dim,
           // Put in current k, and go back to that k.
           // Continue augmenting path i - j1 with i0.
           freeunassigned[--k] = i0;
-          Rcpp::checkUserInterrupt();
+          if (allow_interrupt) Rcpp::checkUserInterrupt();
         } else {
           // No further augmenting reduction possible.
           // Store i0 in list of free rows for next phase.
diff --git a/src/pairwise_distances.cpp b/src/pairwise_distances.cpp
new file mode 100644
index 000000000..b0cd93672
--- /dev/null
+++ b/src/pairwise_distances.cpp
@@ -0,0 +1,199 @@
+/* pairwise_distances.cpp
+ *
+ * Batch pairwise distance functions using OpenMP where available.
+ * Each exported function takes a list of split matrices (one per tree) and
+ * returns the lower-triangle distance vector in combn(n, 2) order, suitable
+ * for direct use as the data payload of an R dist object.
+ */
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include <TreeTools/SplitList.h>
+#include <memory>
+#include <vector>
+#include <Rcpp/Lightest>
+
+#include "tree_distances.h"
+
+using Rcpp::List;
+using Rcpp::NumericVector;
+using Rcpp::RawMatrix;
+using TreeTools::SplitList;
+using TreeTools::count_bits;
+
+
+// Score-only version of mutual_clustering(). Thread-safe: uses only local
+// storage and read-only globals (lg2 table, lookup tables).
+// Passes allow_interrupt = false to lap() so it is safe to call from an
+// OpenMP parallel region.
+static double mutual_clustering_score(
+    const SplitList& a, const SplitList& b, const int32 n_tips
+) {
+  if (a.n_splits == 0 || b.n_splits == 0 || n_tips == 0) return 0.0;
+
+  const bool a_has_more = (a.n_splits > b.n_splits);
+  const int16 most_splits = a_has_more ? a.n_splits : b.n_splits;
+  const int16 a_extra     = a_has_more ? most_splits - b.n_splits : 0;
+  const int16 b_extra     = a_has_more ? 0 : most_splits - a.n_splits;
+  const double n_tips_rcp = 1.0 / static_cast<double>(n_tips);
+
+  constexpr cost max_score  = BIG;
+  constexpr double over_max = 1.0 / static_cast<double>(BIG);
+  const double max_over_tips = static_cast<double>(BIG) * n_tips_rcp;
+
+  cost_matrix score(most_splits);
+  double exact_score  = 0.0;
+  int16  exact_n      = 0;
+
+  std::vector<int>  a_match(a.n_splits, 0);
+  auto b_match = std::make_unique<int16[]>(b.n_splits);
+  std::fill(b_match.get(), b_match.get() + b.n_splits, int16(0));
+
+  for (int16 ai = 0; ai < a.n_splits; ++ai) {
+    if (a_match[ai]) continue;
+
+    const int16  na    = a.in_split[ai];
+    const int16  nA    = n_tips - na;
+    const auto*  a_row = a.state[ai];
+
+    for (int16 bi = 0; bi < b.n_splits; ++bi) {
+      int16       a_and_b = 0;
+      const auto* b_row   = b.state[bi];
+      for (int16 bin = 0; bin < a.n_bins; ++bin) {
+        a_and_b += count_bits(a_row[bin] & b_row[bin]);
+      }
+
+      const int16 nb    = b.in_split[bi];
+      const int16 nB    = n_tips - nb;
+      const int16 a_and_B = na - a_and_b;
+      const int16 A_and_b = nb - a_and_b;
+      const int16 A_and_B = nA - A_and_b;
+
+      if ((!a_and_B && !A_and_b) || (!a_and_b && !A_and_B)) {
+        // Exact match: identical or complementary splits (same bipartition)
+        exact_score += TreeDist::ic_matching(na, nA, n_tips);
+        ++exact_n;
+        a_match[ai] = bi + 1;
+        b_match[bi] = ai + 1;
+        break;
+      } else if (a_and_b == A_and_b && a_and_b == a_and_B && a_and_b == A_and_B) {
+        score(ai, bi) = max_score; // Avoid rounding errors on orthogonal splits
+      } else {
+        double ic_sum = 0.0;
+        TreeDist::add_ic_element(ic_sum, a_and_b, na, nb, n_tips);
+        TreeDist::add_ic_element(ic_sum, a_and_B, na, nB, n_tips);
+        TreeDist::add_ic_element(ic_sum, A_and_b, nA, nb, n_tips);
+        TreeDist::add_ic_element(ic_sum, A_and_B, nA, nB, n_tips);
+        score(ai, bi) = max_score - static_cast<cost>(ic_sum * max_over_tips);
+      }
+    }
+
+    if (b.n_splits < most_splits) {
+      score.padRowAfterCol(ai, b.n_splits, max_score);
+    }
+  }
+
+  // Early exit when everything matched exactly
+  if (exact_n == b.n_splits || exact_n == a.n_splits) {
+    return exact_score * n_tips_rcp;
+  }
+
+  const int16 lap_n = most_splits - exact_n;
+  std::vector<lap_col> rowsol(lap_n);
+  std::vector<lap_row> colsol(lap_n);
+
+  if (exact_n) {
+    // Build a reduced cost matrix omitting exact-matched rows/cols
+    cost_matrix small(lap_n);
+    int16 a_pos = 0;
+    for (int16 ai = 0; ai < a.n_splits; ++ai) {
+      if (a_match[ai]) continue;
+      int16 b_pos = 0;
+      for (int16 bi = 0; bi < b.n_splits; ++bi) {
+        if (b_match[bi]) continue;
+        small(a_pos, b_pos) = score(ai, bi);
+        ++b_pos;
+      }
+      small.padRowAfterCol(a_pos, lap_n - a_extra, max_score);
+      ++a_pos;
+    }
+    small.padAfterRow(lap_n - b_extra, max_score);
+
+    const double lap_score =
+      static_cast<double>((max_score * lap_n) -
+                          lap(lap_n, small, rowsol, colsol, false)) * over_max;
+    return lap_score + exact_score * n_tips_rcp;
+
+  } else {
+    // No exact matches — pad and solve the full matrix
+    for (int16 ai = a.n_splits; ai < most_splits; ++ai) {
+      for (int16 bi = 0; bi < most_splits; ++bi) {
+        score(ai, bi) = max_score;
+      }
+    }
+    return static_cast<double>(
+      (max_score * lap_n) - lap(lap_n, score, rowsol, colsol, false)
+    ) / max_score;
+  }
+}
+
+
+//' Pairwise mutual clustering information — batch computation
+//'
+//' Internal function. Computes all pairwise MCI scores for a set of trees,
+//' using OpenMP threads when available (falling back to single-threaded
+//' execution otherwise). No interrupt checking is performed inside the
+//' parallel region; only builds without OpenMP poll for user interrupts.
+//'
+//' @param splits_list A list of split matrices (class `Splits` or `RawMatrix`),
+//'   one per tree, all covering the same tip set.  Typically the object
+//'   returned by `as.Splits(trees, tipLabels = labs, asSplits = FALSE)`.
+//' @param n_tip Integer; number of tips shared by all trees.
+//' @return Numeric vector of length `n*(n-1)/2` containing pairwise MCI
+//'   scores in `combn(n, 2)` column-major order (i.e. the data payload of
+//'   an R `dist` object).
+//' @keywords internal
+// [[Rcpp::export]]
+NumericVector cpp_mutual_clustering_all_pairs(
+    const List& splits_list,
+    const int   n_tip
+) {
+  const int N = splits_list.size();
+  if (N < 2) return NumericVector(0);
+
+  const int n_pairs = N * (N - 1) / 2;
+
+  // Construct SplitList objects on the main thread: R SEXP access is not
+  // thread-safe, and the SplitList constructor reads from R RawMatrix objects.
+  // SplitList is not move-constructible, so we heap-allocate via unique_ptr.
+  std::vector<std::unique_ptr<SplitList>> splits;
+  splits.reserve(N);
+  for (int k = 0; k < N; ++k) {
+    splits.push_back(
+      std::make_unique<SplitList>(Rcpp::as<RawMatrix>(splits_list[k]))
+    );
+  }
+
+  NumericVector result(n_pairs);
+  double* const res = result.begin();
+
+  // Iterate over columns of the combn(N,2) lower triangle.
+  // Pair (col, row) with col < row maps to dist-vector index:
+  //   p = col*(N-1) - col*(col-1)/2 + row - col - 1
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic)
+#endif
+  for (int col = 0; col < N - 1; ++col) {
+#ifndef _OPENMP
+    Rcpp::checkUserInterrupt();
+#endif
+    for (int row = col + 1; row < N; ++row) {
+      const int p = col * (N - 1) - col * (col - 1) / 2 + row - col - 1;
+      res[p] = mutual_clustering_score(*splits[col], *splits[row], n_tip);
+    }
+  }
+
+  return result;
+}
diff --git a/src/tree_distances.cpp b/src/tree_distances.cpp
index 7666b175a..9d33f717d 100644
--- a/src/tree_distances.cpp
+++ b/src/tree_distances.cpp
@@ -33,25 +33,6 @@ namespace TreeDist {
     }
   }
 
-  inline void add_ic_element(double& ic_sum, int16 nkK, int16 nk, int16 nK,
-                             int16 n_tips) noexcept {
-    /* 
-     * See equation 16 in Meila 2007. I denote k' as K.
-     * nkK is converted to pkK in the calling function, when the sum of all
-     * elements is divided by n.
-     */
-    if (nkK && nk && nK) {
-      if (nkK == nk && nkK == nK && nkK << 1 == n_tips) {
-        ic_sum += nkK;
-      } else {
-        const int32 numerator = nkK * n_tips;
-        const int32 denominator = nk * nK;
-        if (numerator != denominator) {
-          ic_sum += nkK * (lg2[numerator] - lg2[denominator]);
-        }
-      }
-    }
-  }
 
 }
 
diff --git a/src/tree_distances.h b/src/tree_distances.h
index b5a2f97bc..16907bd31 100644
--- a/src/tree_distances.h
+++ b/src/tree_distances.h
@@ -379,13 +379,29 @@ using cost_matrix = CostMatrix;
 extern cost lap(lap_row dim,
                 cost_matrix &input_cost,
                 std::vector<lap_col> &rowsol,
-                std::vector<lap_row> &colsol);
-
-extern inline void add_ic_element(double& ic_sum, int16 nkK, int16 nk, int16 nK,
-                                  int16 n_tips);
+                std::vector<lap_row> &colsol,
+                bool allow_interrupt = true);
 
 namespace TreeDist {
 
+  // See equation 16 in Meila 2007 (k' denoted K).
+  // nkK is converted to pkK in the calling function when divided by n.
+  inline void add_ic_element(double& ic_sum, const int16 nkK, const int16 nk,
+                             const int16 nK, const int16 n_tips) noexcept {
+    if (nkK && nk && nK) {
+      if (nkK == nk && nkK == nK && nkK << 1 == n_tips) {
+        ic_sum += nkK;
+      } else {
+        const int32 numerator = nkK * n_tips;
+        const int32 denominator = nk * nK;
+        if (numerator != denominator) {
+          ic_sum += nkK * (lg2[numerator] - lg2[denominator]);
+        }
+      }
+    }
+  }
+
+
   // Returns lg2_unrooted[x] - lg2_trees_matching_split(y, x - y)
   [[nodiscard]] inline double mmsi_pair_score(const int16 x, const int16 y) noexcept {
     assert(SL_MAX_TIPS + 2 <= std::numeric_limits<int16>::max()); // verify int16 ok

From 7fbc97686123cdf73337cfaa4a30aff62c645b92 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Tue, 10 Mar 2026 13:40:37 +0000
Subject: [PATCH 03/19] Multithreaded calculation

---
 AGENTS.md                              | 31 ++++++++++++++--
 NEWS.md                                | 14 ++++++++
 R/parallel.R                           | 49 ++++++++++++++++++++-----
 R/tree_distance_utilities.R            |  3 +-
 man/StartParallel.Rd                   | 50 +++++++++++++++++++++-----
 man/TreeDist-package.Rd                |  1 +
 man/cpp_mutual_clustering_all_pairs.Rd | 27 ++++++++++++++
 src/RcppExports.cpp                    |  9 ++---
 src/pairwise_distances.cpp             |  5 +--
 9 files changed, 162 insertions(+), 27 deletions(-)
 create mode 100644 man/cpp_mutual_clustering_all_pairs.Rd

diff --git a/AGENTS.md b/AGENTS.md
index 280a133c7..a871f9580 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -203,13 +203,40 @@ Added `#pragma omp parallel for schedule(dynamic)` over the pairwise loop for
 - `add_ic_element` moved from `tree_distances.cpp` (inside `namespace TreeDist`,
   inaccessible to other TUs) into `tree_distances.h` as a proper `inline`,
   fixing a latent ODR issue and making it visible to `pairwise_distances.cpp`.
+- `SplitList` is not move-constructible; `std::vector<SplitList>` therefore
+  cannot be used (reserve/emplace_back require movability).  Fix: use
+  `std::vector<std::unique_ptr<SplitList>>` instead.
 - R fast path: `.SplitDistanceAllPairs()` in `tree_distance_utilities.R` detects
   `Func == MutualClusteringInfoSplits` with a same-tip-set, no-cluster call and
   routes it to `cpp_mutual_clustering_all_pairs()`.
 
 **Benchmark script**: `benchmark/bench-MCI-openmp.R`
-**To measure speedup**: run `bench-MCI-openmp.R` in a **fresh R session** after
-`devtools::load_all()` (Windows locks the DLL in the session that compiles it).
+**To measure speedup**: install with `install.packages(".", repos=NULL, type="source")`
+into a fresh library (avoids `devtools::load_all()` debug flags and Windows DLL lock).
+Use `TreeDist:::cpp_mutual_clustering_all_pairs` when calling from an installed package.
+
+#### Measured speedups (release build, -O2, 16-core Windows machine)
+
+| Scenario | Serial R loop | R parallel (16 workers) | OpenMP batch |
+|---|---|---|---|
+| 100 trees × 50 tips (4 950 pairs) | 117 ms | 78 ms | **17 ms** |
+| 40 trees × 200 tips (780 pairs) | 292 ms | 3 350 ms | **35 ms** |
+
+OpenMP vs serial: **7–8×**.  OpenMP vs R-parallel cluster: **5–95×** (R-parallel
+incurs ~2–3 s serialisation overhead regardless of problem size).
+
+#### R parallel cluster crossover (50-tip trees, 16 workers)
+
+The R parallel cluster is only competitive with the serial loop when the problem
+is large enough to amortise its ~2–3 s IPC/serialisation overhead.  For 50-tip
+trees that crossover is around **500 trees (~125 000 pairs)**, where serial takes
+~2.9 s and parallel takes ~2.8 s.  The OpenMP batch path remains faster than
+both at every size measured.
+
+Practical implication: **`StartParallel()` provides no benefit for MCI/CID** on
+machines where OpenMP is available, and actively harms performance for typical
+analysis sizes (≤ 200 trees).  The fast path therefore never uses an R cluster:
+it is taken only when `cluster` is `NULL`.
 
 ---
 
diff --git a/NEWS.md b/NEWS.md
index a08f516de..c6ae93d6d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,17 @@
+# TreeDist 2.12.0.9001 (2026-03-10)
+
+- Pairwise `ClusteringInfoDistance()` / `MutualClusteringInfo()` now uses
+  an OpenMP multi-threaded batch path when the package is compiled with
+  OpenMP support, giving ~7–8× speed-up over the serial path on a 16-core
+  machine.
+
+- The number of OpenMP threads is controlled by `options(mc.cores = N)`;
+  the default is `1` (single-threaded).  Set `mc.cores` to
+  `parallel::detectCores()` or a fixed integer to enable multi-threading.
+  `StartParallel()` / `StopParallel()` are no longer needed for
+  `ClusteringInfoDistance()` when OpenMP is available.
+
+
 # TreeDist 2.12.0.9000 (2026-02-19)
 
 - Tweak vignettes.
diff --git a/R/parallel.R b/R/parallel.R
index 1e86e1ee1..d8d57458d 100644
--- a/R/parallel.R
+++ b/R/parallel.R
@@ -3,28 +3,59 @@
 #' 
 #' Accelerate distance calculation by employing multiple \acronym{CPU} workers.
 #' 
-#' "TreeDist" parallelizes the calculation of tree to tree distances via
-#' the [`parCapply()`] function, using a user-defined cluster specified in
+#' ## OpenMP (recommended for `ClusteringInfoDistance` / `MutualClusteringInfo`)
+#' 
+#' When the package is built with \acronym{OpenMP} support (the default on
+#' Linux and Windows; optional on macOS), pairwise
+#' [`ClusteringInfoDistance()`] / [`MutualClusteringInfo()`] calculations use
+#' an efficient multi-threaded code path automatically — no cluster setup is
+#' required.
+#' 
+#' The number of \acronym{OpenMP} threads is controlled by the standard
+#' `"mc.cores"` option:
+#' 
+#' ```r
+#' options(mc.cores = parallel::detectCores())  # use all available cores
+#' options(mc.cores = 4L)                        # or a fixed number
+#' ```
+#' 
+#' The default is `1` (single-threaded). The \acronym{OpenMP} path is
+#' substantially faster than the R-cluster path for typical analysis sizes and
+#' is preferred when available.
+#' 
+#' ## R parallel cluster (other metrics)
+#' 
+#' For metrics that do not yet have an \acronym{OpenMP} batch implementation
+#' (e.g. [`RobinsonFoulds()`], [`MatchingSplitDistance()`]), "TreeDist"
+#' parallelizes via [`parCapply()`] using a cluster stored in
 #' `options("TreeDist-cluster")`.
 #' 
-#' `StartParallel()` calls `parallel::makeCluster()` and tells "TreeDist" to
-#' use the created cluster.
+#' `StartParallel()` calls `parallel::makeCluster()` and registers the cluster.
 #' 
-#' `SetParallel()` tells "TreeDist" to use a pre-existing or user-specified 
-#' cluster.
+#' `SetParallel()` registers a pre-existing cluster.
 #' 
-#' `StopParallel()` stops the current TreeDist cluster.
+#' `StopParallel()` stops the current TreeDist cluster and releases resources.
+#'
+#' Note that R-cluster parallelism carries a serialisation overhead of ~2–3 s,
+#' so it is only beneficial for large problems (roughly > 500 trees at 50 tips).
+#' For [`ClusteringInfoDistance()`] the \acronym{OpenMP} path is faster at
+#' every problem size and is used automatically when no cluster is registered.
 #' 
 #' @param \dots Parameters to pass to [`makeCluster()`].
 #' @param cl An existing cluster.
 #' 
 #' @examples
-#' if (interactive()) { # Only run in terminal
+#' # OpenMP parallelism: just set mc.cores before calling distance functions.
+#' options(mc.cores = 2L)
+#' # ClusteringInfoDistance(trees)  # now uses 2 OpenMP threads
+#' options(mc.cores = NULL)  # restore default (single-threaded)
+#' 
+#' if (interactive()) { # R cluster: only worthwhile for non-OpenMP metrics
 #'   library("TreeTools", quietly = TRUE)
 #'   nCores <- ceiling(parallel::detectCores() / 2)
 #'   StartParallel(nCores) # Takes a few seconds to set up processes
 #'   GetParallel()
-#'   ClusteringInfoDistance(as.phylo(0:6, 100))
+#'   RobinsonFoulds(as.phylo(0:6, 100))
 #'   StopParallel() # Returns system resources
 #' }
 #' @template MRS
diff --git a/R/tree_distance_utilities.R b/R/tree_distance_utilities.R
index e7de4f2fa..0509ed0eb 100644
--- a/R/tree_distance_utilities.R
+++ b/R/tree_distance_utilities.R
@@ -136,7 +136,8 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL,
       identical(Func, MutualClusteringInfoSplits)) {
     splits <- as.Splits(splits1, tipLabels = tipLabels, asSplits = FALSE)
     return(structure(
-      cpp_mutual_clustering_all_pairs(splits, as.integer(nTip)),
+      cpp_mutual_clustering_all_pairs(splits, as.integer(nTip),
+                                       as.integer(getOption("mc.cores", 1L))),
       class  = "dist",
       Size   = length(splits1),
       Labels = names(splits1),
diff --git a/man/StartParallel.Rd b/man/StartParallel.Rd
index 71d5efc88..7fef07ffc 100644
--- a/man/StartParallel.Rd
+++ b/man/StartParallel.Rd
@@ -35,25 +35,57 @@ StopParallel(quietly = FALSE)
 Accelerate distance calculation by employing multiple \acronym{CPU} workers.
 }
 \details{
-"TreeDist" parallelizes the calculation of tree to tree distances via
-the \code{\link[=parCapply]{parCapply()}} function, using a user-defined cluster specified in
+\subsection{OpenMP (recommended for \code{ClusteringInfoDistance} / \code{MutualClusteringInfo})}{
+
+When the package is built with \acronym{OpenMP} support (the default on
+Linux and Windows; optional on macOS), pairwise
+\code{\link[=ClusteringInfoDistance]{ClusteringInfoDistance()}} / \code{\link[=MutualClusteringInfo]{MutualClusteringInfo()}} calculations use
+an efficient multi-threaded code path automatically — no cluster setup is
+required.
+
+The number of \acronym{OpenMP} threads is controlled by the standard
+\code{"mc.cores"} option:
+
+\if{html}{\out{<div class="sourceCode r">}}\preformatted{options(mc.cores = parallel::detectCores())  # use all available cores
+options(mc.cores = 4L)                        # or a fixed number
+}\if{html}{\out{</div>}}
+
+The default is \code{1} (single-threaded). The \acronym{OpenMP} path is
+substantially faster than the R-cluster path for typical analysis sizes and
+is preferred when available.
+}
+
+\subsection{R parallel cluster (other metrics)}{
+
+For metrics that do not yet have an \acronym{OpenMP} batch implementation
+(e.g. \code{\link[=RobinsonFoulds]{RobinsonFoulds()}}, \code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}), "TreeDist"
+parallelizes via \code{\link[=parCapply]{parCapply()}} using a cluster stored in
 \code{options("TreeDist-cluster")}.
 
-\code{StartParallel()} calls \code{parallel::makeCluster()} and tells "TreeDist" to
-use the created cluster.
+\code{StartParallel()} calls \code{parallel::makeCluster()} and registers the cluster.
+
+\code{SetParallel()} registers a pre-existing cluster.
 
-\code{SetParallel()} tells "TreeDist" to use a pre-existing or user-specified
-cluster.
+\code{StopParallel()} stops the current TreeDist cluster and releases resources.
 
-\code{StopParallel()} stops the current TreeDist cluster.
+Note that R-cluster parallelism carries a serialisation overhead of ~2–3 s,
+so it is only beneficial for large problems (roughly > 500 trees at 50 tips).
+For \code{\link[=ClusteringInfoDistance]{ClusteringInfoDistance()}} the \acronym{OpenMP} path is faster at
+every problem size and is used automatically when no cluster is registered.
+}
 }
 \examples{
-if (interactive()) { # Only run in terminal
+# OpenMP parallelism: just set mc.cores before calling distance functions.
+options(mc.cores = 2L)
+# ClusteringInfoDistance(trees)  # now uses 2 OpenMP threads
+options(mc.cores = NULL)  # restore default (single-threaded)
+
+if (interactive()) { # R cluster: only worthwhile for non-OpenMP metrics
   library("TreeTools", quietly = TRUE)
   nCores <- ceiling(parallel::detectCores() / 2)
   StartParallel(nCores) # Takes a few seconds to set up processes
   GetParallel()
-  ClusteringInfoDistance(as.phylo(0:6, 100))
+  RobinsonFoulds(as.phylo(0:6, 100))
   StopParallel() # Returns system resources
 }
 }
diff --git a/man/TreeDist-package.Rd b/man/TreeDist-package.Rd
index c322229b2..b142288f7 100644
--- a/man/TreeDist-package.Rd
+++ b/man/TreeDist-package.Rd
@@ -139,6 +139,7 @@ Other contributors:
   \item Roy Jonker \email{roy_jonker@magiclogic.com} (LAP algorithm) [programmer, copyright holder]
   \item Yong Yang \email{yongyanglink@gmail.com} (LAP algorithm) [contributor, copyright holder]
   \item Yi Cao (LAP algorithm) [contributor, copyright holder]
+  \item Neil Kaye (Mercator image)
 }
 
 }
diff --git a/man/cpp_mutual_clustering_all_pairs.Rd b/man/cpp_mutual_clustering_all_pairs.Rd
new file mode 100644
index 000000000..3be2abff9
--- /dev/null
+++ b/man/cpp_mutual_clustering_all_pairs.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{cpp_mutual_clustering_all_pairs}
+\alias{cpp_mutual_clustering_all_pairs}
+\title{Pairwise mutual clustering information — batch computation}
+\usage{
+cpp_mutual_clustering_all_pairs(splits_list, n_tip, n_threads = 1L)
+}
+\arguments{
+\item{splits_list}{A list of split matrices (class \code{Splits} or \code{RawMatrix}),
+one per tree, all covering the same tip set.  Typically the object
+returned by \code{as.Splits(trees, tipLabels = labs, asSplits = FALSE)}.}
+
+\item{n_tip}{Integer; number of tips shared by all trees.}
+}
+\value{
+Numeric vector of length \code{n*(n-1)/2} containing pairwise MCI
+scores in \code{combn(n, 2)} column-major order (i.e. the data payload of
+an R \code{dist} object).
+}
+\description{
+Internal function. Computes all pairwise MCI scores for a set of trees,
+using OpenMP threads when available (falling back to single-threaded
+execution otherwise). No interrupt checking is performed inside the
+parallel region; the outer R call remains interruptible between batches.
+}
+\keyword{internal}
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 3aa1a162c..c8702cdbf 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -212,14 +212,15 @@ BEGIN_RCPP
 END_RCPP
 }
 // cpp_mutual_clustering_all_pairs
-NumericVector cpp_mutual_clustering_all_pairs(const List& splits_list, const int n_tip);
-RcppExport SEXP _TreeDist_cpp_mutual_clustering_all_pairs(SEXP splits_listSEXP, SEXP n_tipSEXP) {
+NumericVector cpp_mutual_clustering_all_pairs(const List& splits_list, const int n_tip, const int n_threads);
+RcppExport SEXP _TreeDist_cpp_mutual_clustering_all_pairs(SEXP splits_listSEXP, SEXP n_tipSEXP, SEXP n_threadsSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
     Rcpp::traits::input_parameter< const List& >::type splits_list(splits_listSEXP);
     Rcpp::traits::input_parameter< const int >::type n_tip(n_tipSEXP);
-    rcpp_result_gen = Rcpp::wrap(cpp_mutual_clustering_all_pairs(splits_list, n_tip));
+    Rcpp::traits::input_parameter< const int >::type n_threads(n_threadsSEXP);
+    rcpp_result_gen = Rcpp::wrap(cpp_mutual_clustering_all_pairs(splits_list, n_tip, n_threads));
     return rcpp_result_gen;
 END_RCPP
 }
@@ -444,7 +445,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_TreeDist_lapjv", (DL_FUNC) &_TreeDist_lapjv, 2},
     {"_TreeDist_cpp_mast", (DL_FUNC) &_TreeDist_cpp_mast, 3},
     {"_TreeDist_cpp_nni_distance", (DL_FUNC) &_TreeDist_cpp_nni_distance, 3},
-    {"_TreeDist_cpp_mutual_clustering_all_pairs", (DL_FUNC) &_TreeDist_cpp_mutual_clustering_all_pairs, 2},
+    {"_TreeDist_cpp_mutual_clustering_all_pairs", (DL_FUNC) &_TreeDist_cpp_mutual_clustering_all_pairs, 3},
     {"_TreeDist_path_vector", (DL_FUNC) &_TreeDist_path_vector, 1},
     {"_TreeDist_vec_diff_euclidean", (DL_FUNC) &_TreeDist_vec_diff_euclidean, 2},
     {"_TreeDist_pair_diff_euclidean", (DL_FUNC) &_TreeDist_pair_diff_euclidean, 1},
diff --git a/src/pairwise_distances.cpp b/src/pairwise_distances.cpp
index b0cd93672..f591ec812 100644
--- a/src/pairwise_distances.cpp
+++ b/src/pairwise_distances.cpp
@@ -158,7 +158,8 @@ static double mutual_clustering_score(
 // [[Rcpp::export]]
 NumericVector cpp_mutual_clustering_all_pairs(
     const List& splits_list,
-    const int   n_tip
+    const int   n_tip,
+    const int   n_threads = 1
 ) {
   const int N = splits_list.size();
   if (N < 2) return NumericVector(0);
@@ -183,7 +184,7 @@ NumericVector cpp_mutual_clustering_all_pairs(
   // Pair (col, row) with col < row maps to dist-vector index:
   //   p = col*(N-1) - col*(col-1)/2 + row - col - 1
 #ifdef _OPENMP
-#pragma omp parallel for schedule(dynamic)
+#pragma omp parallel for schedule(dynamic) num_threads(n_threads)
 #endif
   for (int col = 0; col < N - 1; ++col) {
 #ifndef _OPENMP

From c3169f9470a25c54c8a353767e2d2b3cf835fed5 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Tue, 10 Mar 2026 13:54:00 +0000
Subject: [PATCH 04/19] Code coverage

---
 src/tree_distances.h                     | 13 ++---
 tests/testthat/test-pairwise_distances.R | 63 ++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 8 deletions(-)
 create mode 100644 tests/testthat/test-pairwise_distances.R

diff --git a/src/tree_distances.h b/src/tree_distances.h
index 16907bd31..06b8661a9 100644
--- a/src/tree_distances.h
+++ b/src/tree_distances.h
@@ -389,14 +389,11 @@ namespace TreeDist {
   inline void add_ic_element(double& ic_sum, const int16 nkK, const int16 nk,
                              const int16 nK, const int16 n_tips) noexcept {
     if (nkK && nk && nK) {
-      if (nkK == nk && nkK == nK && nkK << 1 == n_tips) {
-        ic_sum += nkK;
-      } else {
-        const int32 numerator = nkK * n_tips;
-        const int32 denominator = nk * nK;
-        if (numerator != denominator) {
-          ic_sum += nkK * (lg2[numerator] - lg2[denominator]);
-        }
+      assert(!(nkK == nk && nkK == nK && nkK << 1 == n_tips));
+      const int32 numerator = nkK * n_tips;
+      const int32 denominator = nk * nK;
+      if (numerator != denominator) {
+        ic_sum += nkK * (lg2[numerator] - lg2[denominator]);
       }
     }
   }
diff --git a/tests/testthat/test-pairwise_distances.R b/tests/testthat/test-pairwise_distances.R
new file mode 100644
index 000000000..499dc21ab
--- /dev/null
+++ b/tests/testthat/test-pairwise_distances.R
@@ -0,0 +1,63 @@
+## Tests for mutual_clustering_score() branches in src/pairwise_distances.cpp.
+##
+## These branches are only reachable via cpp_mutual_clustering_all_pairs(),
+## invoked automatically by MutualClusteringInfo() when all trees share the
+## same tip set and no R-level cluster is active (the fast path in
+## .SplitDistanceAllPairs()).  The fast path returns a dist; MutualClusteringInfo
+## then converts it to a full matrix and fills the diagonal with ClusteringEntropy
+## values.  The off-diagonal entries equal the raw MCI scores from the batch C++
+## function.
+
+test_that("cpp_mutual_clustering_all_pairs: orthogonal splits score 0 (line 82)", {
+  # Two 8-tip trees, each with exactly one internal split.
+  # The splits cross orthogonally: every quadrant contains exactly 2 tips,
+  # so a_and_b == A_and_b == a_and_B == A_and_B == 2.
+  # This triggers the rounding-error guard (score = max_score) at line 82, and
+  # the LAP assigns that sole pair, yielding MCI = 0.
+  tr1 <- ape::read.tree(text = "((t1,t2,t3,t4),(t5,t6,t7,t8));")
+  tr2 <- ape::read.tree(text = "((t1,t2,t5,t6),(t3,t4,t7,t8));")
+  trees <- structure(list(tr1, tr2), class = "multiPhylo")
+
+  r <- MutualClusteringInfo(trees)
+
+  # r is a 2×2 matrix; the off-diagonal [2,1] is the pairwise MCI
+  expect_equal(r[2, 1], 0, tolerance = 1e-10)
+  # Agrees with the single-pair path (cpp_mutual_clustering, not the batch fn)
+  expect_equal(r[2, 1], MutualClusteringInfo(tr1, tr2), tolerance = 1e-10)
+})
+
+test_that("cpp_mutual_clustering_all_pairs: unequal split counts (lines 94, 131-138)", {
+  # Three 6-tip trees with different numbers of non-trivial splits:
+  #
+  #   tr_1a — 1 split:  {t1,t2,t3} | {t4,t5,t6}
+  #   tr_3  — 3 splits: {t1,t4}, {t2,t5}, {t3,t6}
+  #   tr_1b — 1 split:  {t1,t2}   | {t3,t4,t5,t6}
+  #
+  # The batch loop (col < row) processes three pairs using 0-indexed trees:
+  #
+  #   (col=0 → a=tr_1a[1 split], row=1 → b=tr_3[3 splits]):
+  #     a_has_more = FALSE; most_splits = 3.
+  #     No split pairs are exact matches, so exact_n = 0 → else branch:
+  #       loop fills phantom rows ai=1,2 with max_score        (lines 131–134)
+  #       LAP solves the full 3×3 matrix                       (lines 136–138)
+  #
+  #   (col=1 → a=tr_3[3 splits], row=2 → b=tr_1b[1 split]):
+  #     a_has_more = TRUE; most_splits = 3, b.n_splits = 1.
+  #     For every ai=0..2: padRowAfterCol(ai, 1, max_score)    (line 94)
+
+  tr_1a <- ape::read.tree(text = "((t1,t2,t3),(t4,t5,t6));")
+  tr_3  <- ape::read.tree(text = "(((t1,t4),(t2,t5)),(t3,t6));")
+  tr_1b <- ape::read.tree(text = "((t1,t2),(t3,t4,t5,t6));")
+  trees <- structure(list(tr_1a, tr_3, tr_1b), class = "multiPhylo")
+
+  r <- MutualClusteringInfo(trees)
+
+  # r is a 3×3 matrix; off-diagonal [i, j] with i > j is pair (j, i)
+  expect_equal(dim(r), c(3L, 3L))
+
+  # Cross-validate all three off-diagonal pairs against the single-pair path
+  # (which uses cpp_mutual_clustering in tree_distances.cpp, not the batch fn)
+  expect_equal(r[2, 1], MutualClusteringInfo(tr_1a, tr_3),  tolerance = 1e-9)
+  expect_equal(r[3, 1], MutualClusteringInfo(tr_1a, tr_1b), tolerance = 1e-9)
+  expect_equal(r[3, 2], MutualClusteringInfo(tr_3,  tr_1b), tolerance = 1e-9)
+})

From 01a7fa2155648a865af341500dfea61db1071ec4 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Tue, 10 Mar 2026 14:49:55 +0000
Subject: [PATCH 05/19] Multithreading

---
 AGENTS.md                   |  25 ++-
 R/RcppExports.R             |  44 +++-
 R/parallel.R                |  61 ++---
 R/tree_distance_info.R      |  16 ++
 R/tree_distance_utilities.R |  68 ++++--
 man/StartParallel.Rd        |  62 ++---
 man/TreeDistance.Rd         |  15 ++
 src/RcppExports.cpp         |  72 ++++++
 src/pairwise_distances.cpp  | 436 ++++++++++++++++++++++++++++++++++++
 9 files changed, 726 insertions(+), 73 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index a871f9580..7cb7f775c 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -238,6 +238,26 @@ machines where OpenMP is available, and actively harms performance for typical
 analysis sizes (≤ 200 trees).  The fast path therefore bypasses the R cluster
 entirely when `cluster` is `NULL`.
 
+#### OpenMP rollout to the remaining split-based distance metrics (this dev cycle)
+
+Extended OpenMP batch computation to all LAP-based and RF-info metrics.
+Added score-only static functions and `cpp_*_all_pairs` exports in
+`pairwise_distances.cpp` for:
+
+| C++ batch function | R `Func` intercepted | LAP? |
+|---|---|---|
+| `cpp_rf_info_all_pairs` | `InfoRobinsonFouldsSplits` | No |
+| `cpp_msd_all_pairs` | `MatchingSplitDistanceSplits` | Yes |
+| `cpp_msi_all_pairs` | `MatchingSplitInfoSplits` | Yes |
+| `cpp_shared_phylo_all_pairs` | `SharedPhylogeneticInfoSplits` | Yes |
+| `cpp_jaccard_all_pairs` | `NyeSplitSimilarity`, `JaccardSplitSimilarity` | Yes |
+
+The R fast-path in `.SplitDistanceAllPairs()` was refactored from a single
+`if` block into a unified `if (!is.na(nTip) && is.null(cluster))` guard with
+per-function branches, returning early only when a batch function matches.
+For `JaccardSplitSimilarity`, `k` and `allowConflict` are extracted from `...`
+with sensible defaults (1.0, TRUE) if absent.
+
 ---
 
 ## Known Optimization Opportunities / TODOs
@@ -253,9 +273,6 @@ entirely when `cluster` is `NULL`.
   bandwidth further.
 - SPR distance (`spr.cpp`, `spr_lookup.cpp`): the algorithm is relatively recent
   (v2.8.0); profiling under VTune may reveal further hot spots.
-- OpenMP for other metrics: `pairwise_distances.cpp` currently covers only MCI/CID.
-  Adding equivalent batch functions for `SharedPhylogeneticInfo`, `MatchingSplitInfo`,
-  `RobinsonFoulds`, and `MatchingSplitDistance` would extend the speedup; each needs
-  a score-only variant of its computation.
+- OpenMP for other metrics: **DONE** — see "Completed Optimizations" above.
 - Large-tree path (`int32` migration, v2.12.0 dev): ensure new code paths are as
   optimized as the original `int16` paths.
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 2ca5d5977..612219f87 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -78,8 +78,48 @@ cpp_nni_distance <- function(edge1, edge2, nTip) {
     .Call(`_TreeDist_cpp_nni_distance`, edge1, edge2, nTip)
 }
 
-cpp_mutual_clustering_all_pairs <- function(splits_list, n_tip) {
-    .Call(`_TreeDist_cpp_mutual_clustering_all_pairs`, splits_list, n_tip)
+#' Pairwise mutual clustering information — batch computation
+#'
+#' Internal function. Computes all pairwise MCI scores for a set of trees,
+#' using OpenMP threads when available (falling back to single-threaded
+#' execution otherwise). No interrupt checking is performed inside the
+#' parallel region; the outer R call remains interruptible between batches.
+#'
+#' @param splits_list A list of split matrices (class `Splits` or `RawMatrix`),
+#'   one per tree, all covering the same tip set.  Typically the object
+#'   returned by `as.Splits(trees, tipLabels = labs, asSplits = FALSE)`.
+#' @param n_tip Integer; number of tips shared by all trees.
+#' @return Numeric vector of length `n*(n-1)/2` containing pairwise MCI
+#'   scores in `combn(n, 2)` column-major order (i.e. the data payload of
+#'   an R `dist` object).
+#' @keywords internal
+cpp_mutual_clustering_all_pairs <- function(splits_list, n_tip, n_threads = 1L) {
+    .Call(`_TreeDist_cpp_mutual_clustering_all_pairs`, splits_list, n_tip, n_threads)
+}
+
+#' @keywords internal
+cpp_rf_info_all_pairs <- function(splits_list, n_tip, n_threads = 1L) {
+    .Call(`_TreeDist_cpp_rf_info_all_pairs`, splits_list, n_tip, n_threads)
+}
+
+#' @keywords internal
+cpp_msd_all_pairs <- function(splits_list, n_tip, n_threads = 1L) {
+    .Call(`_TreeDist_cpp_msd_all_pairs`, splits_list, n_tip, n_threads)
+}
+
+#' @keywords internal
+cpp_msi_all_pairs <- function(splits_list, n_tip, n_threads = 1L) {
+    .Call(`_TreeDist_cpp_msi_all_pairs`, splits_list, n_tip, n_threads)
+}
+
+#' @keywords internal
+cpp_shared_phylo_all_pairs <- function(splits_list, n_tip, n_threads = 1L) {
+    .Call(`_TreeDist_cpp_shared_phylo_all_pairs`, splits_list, n_tip, n_threads)
+}
+
+#' @keywords internal
+cpp_jaccard_all_pairs <- function(splits_list, n_tip, k = 1.0, allow_conflict = TRUE, n_threads = 1L) {
+    .Call(`_TreeDist_cpp_jaccard_all_pairs`, splits_list, n_tip, k, allow_conflict, n_threads)
 }
 
 path_vector <- function(edge) {
diff --git a/R/parallel.R b/R/parallel.R
index d8d57458d..9c835b064 100644
--- a/R/parallel.R
+++ b/R/parallel.R
@@ -3,13 +3,20 @@
 #' 
 #' Accelerate distance calculation by employing multiple \acronym{CPU} workers.
 #' 
-#' ## OpenMP (recommended for `ClusteringInfoDistance` / `MutualClusteringInfo`)
+#' ## OpenMP (recommended for all split-based metrics)
 #' 
 #' When the package is built with \acronym{OpenMP} support (the default on
-#' Linux and Windows; optional on macOS), pairwise
-#' [`ClusteringInfoDistance()`] / [`MutualClusteringInfo()`] calculations use
-#' an efficient multi-threaded code path automatically — no cluster setup is
-#' required.
+#' Linux and Windows; optional on macOS), all pairwise split-based distance
+#' calculations use an efficient multi-threaded batch path automatically —
+#' no cluster setup is required.  The affected functions are:
+#' 
+#' - [`ClusteringInfoDistance()`] / [`MutualClusteringInfo()`]
+#' - [`SharedPhylogeneticInfo()`] / [`DifferentPhylogeneticInfo()`]
+#' - [`MatchingSplitInfo()`] / [`MatchingSplitInfoDistance()`]
+#' - [`MatchingSplitDistance()`]
+#' - [`InfoRobinsonFoulds()`]
+#' - [`NyeSimilarity()`]
+#' - [`JaccardRobinsonFoulds()`]
 #' 
 #' The number of \acronym{OpenMP} threads is controlled by the standard
 #' `"mc.cores"` option:
@@ -19,43 +26,45 @@
 #' options(mc.cores = 4L)                        # or a fixed number
 #' ```
 #' 
-#' The default is `1` (single-threaded). The \acronym{OpenMP} path is
-#' substantially faster than the R-cluster path for typical analysis sizes and
-#' is preferred when available.
-#' 
-#' ## R parallel cluster (other metrics)
+#' The default is `1` (single-threaded).
 #' 
-#' For metrics that do not yet have an \acronym{OpenMP} batch implementation
-#' (e.g. [`RobinsonFoulds()`], [`MatchingSplitDistance()`]), "TreeDist"
-#' parallelizes via [`parCapply()`] using a cluster stored in
-#' `options("TreeDist-cluster")`.
+#' ## R parallel cluster
 #' 
-#' `StartParallel()` calls `parallel::makeCluster()` and registers the cluster.
+#' `StartParallel()` creates an R socket cluster (via [`makeCluster()`]) and
+#' registers it for use by TreeDist.  `SetParallel()` registers a pre-existing
+#' cluster.  `StopParallel()` stops the cluster and releases resources.
 #' 
-#' `SetParallel()` registers a pre-existing cluster.
+#' **When to use `StartParallel()`:** for metrics that do not have an
+#' \acronym{OpenMP} batch path, namely tree-object-based distances such as
+#' [`NNIDist()`] and [`MASTSize()`] / [`MASTInfo()`], or any function called
+#' via [`CompareAll()`].  R-cluster parallelism carries a serialisation overhead
+#' of ~2–3 s, so it is only beneficial for large problems.
 #' 
-#' `StopParallel()` stops the current TreeDist cluster and releases resources.
-#'
-#' Note that R-cluster parallelism carries a serialisation overhead of ~2–3 s,
-#' so it is only beneficial for large problems (roughly > 500 trees at 50 tips).
-#' For [`ClusteringInfoDistance()`] the \acronym{OpenMP} path is faster at
-#' every problem size and is used automatically when no cluster is registered.
+#' **When _not_ to use `StartParallel()`:** for the split-based metrics listed
+#' above.  Registering a cluster disables the \acronym{OpenMP} batch path for
+#' those functions, replacing a thread-local C++ loop with inter-process
+#' communication — which is slower at every problem size measured.  Call
+#' `StopParallel()` before computing split-based distances if a cluster is
+#' active.
 #' 
 #' @param \dots Parameters to pass to [`makeCluster()`].
 #' @param cl An existing cluster.
 #' 
 #' @examples
-#' # OpenMP parallelism: just set mc.cores before calling distance functions.
+#' # OpenMP parallelism: set mc.cores before calling any split-based metric.
 #' options(mc.cores = 2L)
-#' # ClusteringInfoDistance(trees)  # now uses 2 OpenMP threads
+#' # MutualClusteringInfo(trees)  # uses 2 OpenMP threads automatically
 #' options(mc.cores = NULL)  # restore default (single-threaded)
 #' 
-#' if (interactive()) { # R cluster: only worthwhile for non-OpenMP metrics
+#' if (interactive()) {
+#'   # R cluster: beneficial for NNIDist, MASTSize/MASTInfo, CompareAll(), etc.
+#'   # Do NOT activate while computing split-based distances (MCI, SPI, MSI, …)
+#'   # as it bypasses the faster OpenMP path.
 #'   library("TreeTools", quietly = TRUE)
 #'   nCores <- ceiling(parallel::detectCores() / 2)
 #'   StartParallel(nCores) # Takes a few seconds to set up processes
 #'   GetParallel()
-#'   RobinsonFoulds(as.phylo(0:6, 100))
+#'   CompareAll(as.phylo(0:6, 100), NNIDist)
 #'   StopParallel() # Returns system resources
 #' }
 #' @template MRS
diff --git a/R/tree_distance_info.R b/R/tree_distance_info.R
index 5052ac8c2..dcc8ea9ee 100644
--- a/R/tree_distance_info.R
+++ b/R/tree_distance_info.R
@@ -121,6 +121,22 @@
 #' which the maintainer plans to attempt in the future; please [comment on GitHub](
 #' https://github.com/ms609/TreeTools/issues/141) if you would find this useful.
 #' 
+#' # Parallelism
+#' 
+#' When `tree2 = NULL` and all trees share the same tip labels, pairwise
+#' distance calculation uses a multi-threaded \acronym{OpenMP} batch path
+#' automatically.  Control the number of threads with the `"mc.cores"` option:
+#' 
+#' ```r
+#' options(mc.cores = parallel::detectCores())  # use all cores
+#' options(mc.cores = 4L)                        # or a fixed number
+#' ```
+#' 
+#' Do **not** call [`StartParallel()`] for these functions: a registered
+#' R cluster disables the \acronym{OpenMP} path and replaces it with slower
+#' inter-process communication.  See [`StartParallel()`] for full guidance
+#' on when an R cluster is appropriate.
+#' 
 #' @template tree12ListParams
 #' 
 #' @param normalize If a numeric value is provided, this will be used as a 
diff --git a/R/tree_distance_utilities.R b/R/tree_distance_utilities.R
index 0509ed0eb..5e8ed907f 100644
--- a/R/tree_distance_utilities.R
+++ b/R/tree_distance_utilities.R
@@ -129,21 +129,59 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL,
                                    nTip = length(tipLabels), ...) {
   cluster <- getOption("TreeDist-cluster")
 
-  # Fast path: use the OpenMP batch function for mutual clustering when all
-  # trees share the same tip set and no R-level cluster has been configured.
-  # Matches the behaviour of the generic path but avoids per-pair R overhead.
-  if (!is.na(nTip) && is.null(cluster) &&
-      identical(Func, MutualClusteringInfoSplits)) {
-    splits <- as.Splits(splits1, tipLabels = tipLabels, asSplits = FALSE)
-    return(structure(
-      cpp_mutual_clustering_all_pairs(splits, as.integer(nTip),
-                                       as.integer(getOption("mc.cores", 1L))),
-      class  = "dist",
-      Size   = length(splits1),
-      Labels = names(splits1),
-      Diag   = FALSE,
-      Upper  = FALSE
-    ))
+  # Fast paths: use OpenMP batch functions when all trees share the same tip
+  # set and no R-level cluster has been configured.  Each branch mirrors the
+  # generic path exactly but avoids per-pair R overhead.
+  if (!is.na(nTip) && is.null(cluster)) {
+    .n_threads <- as.integer(getOption("mc.cores", 1L))
+    .batch_result <- if (identical(Func, MutualClusteringInfoSplits)) {
+      splits <- as.Splits(splits1, tipLabels = tipLabels, asSplits = FALSE)
+      cpp_mutual_clustering_all_pairs(splits, as.integer(nTip), .n_threads)
+
+    } else if (identical(Func, InfoRobinsonFouldsSplits)) {
+      splits <- as.Splits(splits1, tipLabels = tipLabels, asSplits = FALSE)
+      cpp_rf_info_all_pairs(splits, as.integer(nTip), .n_threads)
+
+    } else if (identical(Func, MatchingSplitDistanceSplits)) {
+      splits <- as.Splits(splits1, tipLabels = tipLabels, asSplits = FALSE)
+      cpp_msd_all_pairs(splits, as.integer(nTip), .n_threads)
+
+    } else if (identical(Func, MatchingSplitInfoSplits)) {
+      splits <- as.Splits(splits1, tipLabels = tipLabels, asSplits = FALSE)
+      cpp_msi_all_pairs(splits, as.integer(nTip), .n_threads)
+
+    } else if (identical(Func, SharedPhylogeneticInfoSplits)) {
+      splits <- as.Splits(splits1, tipLabels = tipLabels, asSplits = FALSE)
+      cpp_shared_phylo_all_pairs(splits, as.integer(nTip), .n_threads)
+
+    } else if (identical(Func, NyeSplitSimilarity)) {
+      splits <- as.Splits(splits1, tipLabels = tipLabels, asSplits = FALSE)
+      cpp_jaccard_all_pairs(splits, as.integer(nTip),
+                            k = 1.0, allow_conflict = TRUE, .n_threads)
+
+    } else if (identical(Func, JaccardSplitSimilarity)) {
+      dots <- list(...)
+      splits <- as.Splits(splits1, tipLabels = tipLabels, asSplits = FALSE)
+      cpp_jaccard_all_pairs(
+        splits, as.integer(nTip),
+        k            = as.double(if ("k" %in% names(dots)) dots[["k"]] else 1L),
+        allow_conflict = as.logical(
+          if ("allowConflict" %in% names(dots)) dots[["allowConflict"]] else TRUE),
+        .n_threads
+      )
+
+    } else {
+      NULL
+    }
+
+    if (!is.null(.batch_result)) {
+      return(structure(.batch_result,
+                       class  = "dist",
+                       Size   = length(splits1),
+                       Labels = names(splits1),
+                       Diag   = FALSE,
+                       Upper  = FALSE))
+    }
   }
 
   if (is.na(nTip)) {
diff --git a/man/StartParallel.Rd b/man/StartParallel.Rd
index 7fef07ffc..1a918897e 100644
--- a/man/StartParallel.Rd
+++ b/man/StartParallel.Rd
@@ -35,13 +35,21 @@ StopParallel(quietly = FALSE)
 Accelerate distance calculation by employing multiple \acronym{CPU} workers.
 }
 \details{
-\subsection{OpenMP (recommended for \code{ClusteringInfoDistance} / \code{MutualClusteringInfo})}{
+\subsection{OpenMP (recommended for all split-based metrics)}{
 
 When the package is built with \acronym{OpenMP} support (the default on
-Linux and Windows; optional on macOS), pairwise
-\code{\link[=ClusteringInfoDistance]{ClusteringInfoDistance()}} / \code{\link[=MutualClusteringInfo]{MutualClusteringInfo()}} calculations use
-an efficient multi-threaded code path automatically — no cluster setup is
-required.
+Linux and Windows; optional on macOS), all pairwise split-based distance
+calculations use an efficient multi-threaded batch path automatically —
+no cluster setup is required.  The affected functions are:
+\itemize{
+\item \code{\link[=ClusteringInfoDistance]{ClusteringInfoDistance()}} / \code{\link[=MutualClusteringInfo]{MutualClusteringInfo()}}
+\item \code{\link[=SharedPhylogeneticInfo]{SharedPhylogeneticInfo()}} / \code{\link[=DifferentPhylogeneticInfo]{DifferentPhylogeneticInfo()}}
+\item \code{\link[=MatchingSplitInfo]{MatchingSplitInfo()}} / \code{\link[=MatchingSplitInfoDistance]{MatchingSplitInfoDistance()}}
+\item \code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}
+\item \code{\link[=InfoRobinsonFoulds]{InfoRobinsonFoulds()}}
+\item \code{\link[=NyeSimilarity]{NyeSimilarity()}}
+\item \code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}
+}
 
 The number of \acronym{OpenMP} threads is controlled by the standard
 \code{"mc.cores"} option:
@@ -50,42 +58,44 @@ The number of \acronym{OpenMP} threads is controlled by the standard
 options(mc.cores = 4L)                        # or a fixed number
 }\if{html}{\out{</div>}}
 
-The default is \code{1} (single-threaded). The \acronym{OpenMP} path is
-substantially faster than the R-cluster path for typical analysis sizes and
-is preferred when available.
+The default is \code{1} (single-threaded).
 }
 
-\subsection{R parallel cluster (other metrics)}{
-
-For metrics that do not yet have an \acronym{OpenMP} batch implementation
-(e.g. \code{\link[=RobinsonFoulds]{RobinsonFoulds()}}, \code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}), "TreeDist"
-parallelizes via \code{\link[=parCapply]{parCapply()}} using a cluster stored in
-\code{options("TreeDist-cluster")}.
-
-\code{StartParallel()} calls \code{parallel::makeCluster()} and registers the cluster.
+\subsection{R parallel cluster}{
 
-\code{SetParallel()} registers a pre-existing cluster.
+\code{StartParallel()} creates an R socket cluster (via \code{\link[parallel:makeCluster]{makeCluster()}}) and
+registers it for use by TreeDist.  \code{SetParallel()} registers a pre-existing
+cluster.  \code{StopParallel()} stops the cluster and releases resources.
 
-\code{StopParallel()} stops the current TreeDist cluster and releases resources.
+\strong{When to use \code{StartParallel()}:} for metrics that do not have an
+\acronym{OpenMP} batch path, for example tree-object-based distances such as
+\code{\link[=NNIDist]{NNIDist()}} and \code{\link[=MASTSize]{MASTSize()}} / \code{\link[=MASTInfo]{MASTInfo()}}, or any function called
+via \code{\link[=CompareAll]{CompareAll()}}.  R-cluster parallelism carries a serialisation overhead
+of ~2–3 s, so it is only beneficial for large problems.
 
-Note that R-cluster parallelism carries a serialisation overhead of ~2–3 s,
-so it is only beneficial for large problems (roughly > 500 trees at 50 tips).
-For \code{\link[=ClusteringInfoDistance]{ClusteringInfoDistance()}} the \acronym{OpenMP} path is faster at
-every problem size and is used automatically when no cluster is registered.
+\strong{When \emph{not} to use \code{StartParallel()}:} for the split-based metrics listed
+above.  Registering a cluster disables the \acronym{OpenMP} batch path for
+those functions, replacing a thread-local C++ loop with inter-process
+communication — which is slower at every problem size measured.  Call
+\code{StopParallel()} before computing split-based distances if a cluster is
+active.
 }
 }
 \examples{
-# OpenMP parallelism: just set mc.cores before calling distance functions.
+# OpenMP parallelism: set mc.cores before calling any split-based metric.
 options(mc.cores = 2L)
-# ClusteringInfoDistance(trees)  # now uses 2 OpenMP threads
+# MutualClusteringInfo(trees)  # uses 2 OpenMP threads automatically
 options(mc.cores = NULL)  # restore default (single-threaded)
 
-if (interactive()) { # R cluster: only worthwhile for non-OpenMP metrics
+if (interactive()) {
+  # R cluster: beneficial for NNIDist, MASTSize/MASTInfo, CompareAll(), etc.
+  # Do NOT activate while computing split-based distances (MCI, SPI, MSI, …)
+  # as it bypasses the faster OpenMP path.
   library("TreeTools", quietly = TRUE)
   nCores <- ceiling(parallel::detectCores() / 2)
   StartParallel(nCores) # Takes a few seconds to set up processes
   GetParallel()
-  RobinsonFoulds(as.phylo(0:6, 100))
+  CompareAll(as.phylo(0:6, 100), NNIDist)
   StopParallel() # Returns system resources
 }
 }
diff --git a/man/TreeDistance.Rd b/man/TreeDistance.Rd
index feede9e80..352d169c4 100644
--- a/man/TreeDistance.Rd
+++ b/man/TreeDistance.Rd
@@ -258,6 +258,21 @@ Trees with over 8192 leaves require further modification of the source code,
 which the maintainer plans to attempt in the future; please \href{https://github.com/ms609/TreeTools/issues/141}{comment on GitHub} if you would find this useful.
 }
 
+\section{Parallelism}{
+When \code{tree2 = NULL} and all trees share the same tip labels, pairwise
+distance calculation uses a multi-threaded \acronym{OpenMP} batch path
+automatically.  Control the number of threads with the \code{"mc.cores"} option:
+
+\if{html}{\out{<div class="sourceCode r">}}\preformatted{options(mc.cores = parallel::detectCores())  # use all cores
+options(mc.cores = 4L)                        # or a fixed number
+}\if{html}{\out{</div>}}
+
+Do \strong{not} call \code{\link[=StartParallel]{StartParallel()}} for these functions: a registered
+R cluster disables the \acronym{OpenMP} path and replaces it with slower
+inter-process communication.  See \code{\link[=StartParallel]{StartParallel()}} for full guidance
+on when an R cluster is appropriate.
+}
+
 \examples{
 tree1 <- ape::read.tree(text="((((a, b), c), d), (e, (f, (g, h))));")
 tree2 <- ape::read.tree(text="(((a, b), (c, d)), ((e, f), (g, h)));")
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index c8702cdbf..280865e7b 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -224,6 +224,73 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// cpp_rf_info_all_pairs
+NumericVector cpp_rf_info_all_pairs(const List& splits_list, const int n_tip, const int n_threads);
+RcppExport SEXP _TreeDist_cpp_rf_info_all_pairs(SEXP splits_listSEXP, SEXP n_tipSEXP, SEXP n_threadsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const List& >::type splits_list(splits_listSEXP);
+    Rcpp::traits::input_parameter< const int >::type n_tip(n_tipSEXP);
+    Rcpp::traits::input_parameter< const int >::type n_threads(n_threadsSEXP);
+    rcpp_result_gen = Rcpp::wrap(cpp_rf_info_all_pairs(splits_list, n_tip, n_threads));
+    return rcpp_result_gen;
+END_RCPP
+}
+// cpp_msd_all_pairs
+NumericVector cpp_msd_all_pairs(const List& splits_list, const int n_tip, const int n_threads);
+RcppExport SEXP _TreeDist_cpp_msd_all_pairs(SEXP splits_listSEXP, SEXP n_tipSEXP, SEXP n_threadsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const List& >::type splits_list(splits_listSEXP);
+    Rcpp::traits::input_parameter< const int >::type n_tip(n_tipSEXP);
+    Rcpp::traits::input_parameter< const int >::type n_threads(n_threadsSEXP);
+    rcpp_result_gen = Rcpp::wrap(cpp_msd_all_pairs(splits_list, n_tip, n_threads));
+    return rcpp_result_gen;
+END_RCPP
+}
+// cpp_msi_all_pairs
+NumericVector cpp_msi_all_pairs(const List& splits_list, const int n_tip, const int n_threads);
+RcppExport SEXP _TreeDist_cpp_msi_all_pairs(SEXP splits_listSEXP, SEXP n_tipSEXP, SEXP n_threadsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const List& >::type splits_list(splits_listSEXP);
+    Rcpp::traits::input_parameter< const int >::type n_tip(n_tipSEXP);
+    Rcpp::traits::input_parameter< const int >::type n_threads(n_threadsSEXP);
+    rcpp_result_gen = Rcpp::wrap(cpp_msi_all_pairs(splits_list, n_tip, n_threads));
+    return rcpp_result_gen;
+END_RCPP
+}
+// cpp_shared_phylo_all_pairs
+NumericVector cpp_shared_phylo_all_pairs(const List& splits_list, const int n_tip, const int n_threads);
+RcppExport SEXP _TreeDist_cpp_shared_phylo_all_pairs(SEXP splits_listSEXP, SEXP n_tipSEXP, SEXP n_threadsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const List& >::type splits_list(splits_listSEXP);
+    Rcpp::traits::input_parameter< const int >::type n_tip(n_tipSEXP);
+    Rcpp::traits::input_parameter< const int >::type n_threads(n_threadsSEXP);
+    rcpp_result_gen = Rcpp::wrap(cpp_shared_phylo_all_pairs(splits_list, n_tip, n_threads));
+    return rcpp_result_gen;
+END_RCPP
+}
+// cpp_jaccard_all_pairs
+NumericVector cpp_jaccard_all_pairs(const List& splits_list, const int n_tip, const double k, const bool allow_conflict, const int n_threads);
+RcppExport SEXP _TreeDist_cpp_jaccard_all_pairs(SEXP splits_listSEXP, SEXP n_tipSEXP, SEXP kSEXP, SEXP allow_conflictSEXP, SEXP n_threadsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const List& >::type splits_list(splits_listSEXP);
+    Rcpp::traits::input_parameter< const int >::type n_tip(n_tipSEXP);
+    Rcpp::traits::input_parameter< const double >::type k(kSEXP);
+    Rcpp::traits::input_parameter< const bool >::type allow_conflict(allow_conflictSEXP);
+    Rcpp::traits::input_parameter< const int >::type n_threads(n_threadsSEXP);
+    rcpp_result_gen = Rcpp::wrap(cpp_jaccard_all_pairs(splits_list, n_tip, k, allow_conflict, n_threads));
+    return rcpp_result_gen;
+END_RCPP
+}
 // path_vector
 IntegerVector path_vector(IntegerMatrix edge);
 RcppExport SEXP _TreeDist_path_vector(SEXP edgeSEXP) {
@@ -446,6 +513,11 @@ static const R_CallMethodDef CallEntries[] = {
     {"_TreeDist_cpp_mast", (DL_FUNC) &_TreeDist_cpp_mast, 3},
     {"_TreeDist_cpp_nni_distance", (DL_FUNC) &_TreeDist_cpp_nni_distance, 3},
     {"_TreeDist_cpp_mutual_clustering_all_pairs", (DL_FUNC) &_TreeDist_cpp_mutual_clustering_all_pairs, 3},
+    {"_TreeDist_cpp_rf_info_all_pairs", (DL_FUNC) &_TreeDist_cpp_rf_info_all_pairs, 3},
+    {"_TreeDist_cpp_msd_all_pairs", (DL_FUNC) &_TreeDist_cpp_msd_all_pairs, 3},
+    {"_TreeDist_cpp_msi_all_pairs", (DL_FUNC) &_TreeDist_cpp_msi_all_pairs, 3},
+    {"_TreeDist_cpp_shared_phylo_all_pairs", (DL_FUNC) &_TreeDist_cpp_shared_phylo_all_pairs, 3},
+    {"_TreeDist_cpp_jaccard_all_pairs", (DL_FUNC) &_TreeDist_cpp_jaccard_all_pairs, 5},
     {"_TreeDist_path_vector", (DL_FUNC) &_TreeDist_path_vector, 1},
     {"_TreeDist_vec_diff_euclidean", (DL_FUNC) &_TreeDist_vec_diff_euclidean, 2},
     {"_TreeDist_pair_diff_euclidean", (DL_FUNC) &_TreeDist_pair_diff_euclidean, 1},
diff --git a/src/pairwise_distances.cpp b/src/pairwise_distances.cpp
index f591ec812..a35f4958e 100644
--- a/src/pairwise_distances.cpp
+++ b/src/pairwise_distances.cpp
@@ -11,6 +11,9 @@
 #endif
 
 #include <TreeTools/SplitList.h>
+#include <algorithm>
+#include <cmath>
+#include <limits>
 #include <memory>
 #include <vector>
 #include <Rcpp/Lightest>
@@ -198,3 +201,436 @@ NumericVector cpp_mutual_clustering_all_pairs(
 
   return result;
 }
+
+
+// =============================================================================
+// InfoRobinsonFoulds (rf_info) — no LAP, thread-safe
+// =============================================================================
+
+static double rf_info_score(  // Sums a per-split log2 term over every split of `a` that also occurs in `b`.
+    const SplitList& a, const SplitList& b, const int32 n_tips
+) {
+  const int16 last_bin   = a.n_bins - 1;
+  const int16 unset_tips = (n_tips % SL_BIN_SIZE) ?  // bit positions of the last bin beyond n_tips
+    SL_BIN_SIZE - n_tips % SL_BIN_SIZE : 0;
+  const splitbit unset_mask      = ALL_ONES >> unset_tips;  // ALL_ONES with its top `unset_tips` bits shifted out
+  const double   lg2_unrooted_n  = lg2_unrooted[n_tips];  // hoisted: identical for every matched split
+  double score = 0;
+
+  splitbit b_complement[SL_MAX_SPLITS][SL_MAX_BINS];  // NOTE(review): fixed-size stack array per call; confirm it fits OpenMP worker stacks
+  for (int16 i = 0; i < b.n_splits; ++i) {  // pre-compute the complement of every split in `b`
+    for (int16 bin = 0; bin < last_bin; ++bin) {
+      b_complement[i][bin] = ~b.state[i][bin];
+    }
+    b_complement[i][last_bin] = b.state[i][last_bin] ^ unset_mask;  // last bin: flip only the bits set in unset_mask
+  }
+
+  for (int16 ai = 0; ai < a.n_splits; ++ai) {
+    for (int16 bi = 0; bi < b.n_splits; ++bi) {
+      bool all_match = true, all_complement = true;
+      for (int16 bin = 0; bin < a.n_bins; ++bin) {  // identical bit pattern?
+        if (a.state[ai][bin] != b.state[bi][bin]) { all_match = false; break; }
+      }
+      if (!all_match) {  // otherwise test against the complemented pattern
+        for (int16 bin = 0; bin < a.n_bins; ++bin) {
+          if (a.state[ai][bin] != b_complement[bi][bin]) {
+            all_complement = false; break;
+          }
+        }
+      }
+      if (all_match || all_complement) {
+        int16 leaves_in_split = 0;  // number of bits set in split `ai`
+        for (int16 bin = 0; bin < a.n_bins; ++bin) {
+          leaves_in_split += count_bits(a.state[ai][bin]);
+        }
+        score += lg2_unrooted_n
+               - lg2_rooted[leaves_in_split]
+               - lg2_rooted[n_tips - leaves_in_split];
+        break;  // split `ai` is counted at most once
+      }
+    }
+  }
+  return score;
+}
+
+//' @keywords internal
+// [[Rcpp::export]]
+NumericVector cpp_rf_info_all_pairs(
+    const List& splits_list,
+    const int   n_tip,
+    const int   n_threads = 1
+) {  // Returns rf_info_score() for each unordered pair of entries in splits_list.
+  const int N = splits_list.size();
+  if (N < 2) return NumericVector(0);  // fewer than two trees: no pairs
+  const R_xlen_t n_pairs = static_cast<R_xlen_t>(N) * (N - 1) / 2;  // widened: int N * (N - 1) overflows for N >= 46342
+
+  std::vector<std::unique_ptr<SplitList>> splits;
+  splits.reserve(N);
+  for (int k = 0; k < N; ++k) {  // convert each raw matrix once, before the pairwise loop
+    splits.push_back(
+      std::make_unique<SplitList>(Rcpp::as<RawMatrix>(splits_list[k]))
+    );
+  }
+
+  NumericVector result(n_pairs);
+  double* const res = result.begin();  // each (col, row) pair writes its own slot
+
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic) num_threads(n_threads)
+#endif
+  for (int col = 0; col < N - 1; ++col) {
+#ifndef _OPENMP
+    Rcpp::checkUserInterrupt();  // compiled in only when OpenMP is unavailable
+#endif
+    for (int row = col + 1; row < N; ++row) {
+      const R_xlen_t p = R_xlen_t(col) * (N - 1) - R_xlen_t(col) * (col - 1) / 2 + row - col - 1;  // slot in the packed lower triangle
+      res[p] = rf_info_score(*splits[col], *splits[row], n_tip);
+    }
+  }
+  return result;
+}
+
+
+// =============================================================================
+// MatchingSplitDistance — LAP-based
+// =============================================================================
+
+static double msd_score(  // Solves an assignment problem whose pairwise cost is the folded count of differing bits.
+    const SplitList& a, const SplitList& b, const int32 n_tips
+) {
+  const int16 most_splits = std::max(a.n_splits, b.n_splits);  // dimension of the square cost matrix
+  if (most_splits == 0) return 0.0;
+  const int16 split_diff = most_splits - std::min(a.n_splits, b.n_splits);  // surplus splits in the larger list
+  const int16 half_tips  = n_tips / 2;
+  const cost  max_score  = BIG / most_splits;  // padding cost; most_splits of them total at most BIG
+
+  cost_matrix score(most_splits);
+
+  for (int16 ai = 0; ai < a.n_splits; ++ai) {
+    for (int16 bi = 0; bi < b.n_splits; ++bi) {
+      splitbit total = 0;
+      for (int16 bin = 0; bin < a.n_bins; ++bin) {
+        total += count_bits(a.state[ai][bin] ^ b.state[bi][bin]);  // XOR marks positions where the two splits differ
+      }
+      score(ai, bi) = total;
+    }
+    for (int16 bi = 0; bi < b.n_splits; ++bi) {
+      if (score(ai, bi) > half_tips) score(ai, bi) = n_tips - score(ai, bi);  // fold: keep the smaller of d and n_tips - d
+    }
+    score.padRowAfterCol(ai, b.n_splits, max_score);  // presumably fills the rest of the row with max_score; confirm in cost_matrix
+  }
+  score.padAfterRow(a.n_splits, max_score);  // presumably fills the remaining rows with max_score; confirm in cost_matrix
+
+  std::vector<lap_col> rowsol(most_splits);
+  std::vector<lap_row> colsol(most_splits);
+
+  return static_cast<double>(
+    lap(most_splits, score, rowsol, colsol, false) - max_score * split_diff  // `false` is lap()'s allow_interrupt argument
+  );
+}
+
+//' @keywords internal
+// [[Rcpp::export]]
+NumericVector cpp_msd_all_pairs(
+    const List& splits_list,
+    const int   n_tip,
+    const int   n_threads = 1
+) {  // Returns msd_score() for each unordered pair of entries in splits_list.
+  const int N = splits_list.size();
+  if (N < 2) return NumericVector(0);  // fewer than two trees: no pairs
+  const R_xlen_t n_pairs = static_cast<R_xlen_t>(N) * (N - 1) / 2;  // widened: int N * (N - 1) overflows for N >= 46342
+
+  std::vector<std::unique_ptr<SplitList>> splits;
+  splits.reserve(N);
+  for (int k = 0; k < N; ++k) {  // convert each raw matrix once, before the pairwise loop
+    splits.push_back(
+      std::make_unique<SplitList>(Rcpp::as<RawMatrix>(splits_list[k]))
+    );
+  }
+
+  NumericVector result(n_pairs);
+  double* const res = result.begin();  // each (col, row) pair writes its own slot
+
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic) num_threads(n_threads)
+#endif
+  for (int col = 0; col < N - 1; ++col) {
+#ifndef _OPENMP
+    Rcpp::checkUserInterrupt();  // compiled in only when OpenMP is unavailable
+#endif
+    for (int row = col + 1; row < N; ++row) {
+      const R_xlen_t p = R_xlen_t(col) * (N - 1) - R_xlen_t(col) * (col - 1) / 2 + row - col - 1;  // slot in the packed lower triangle
+      res[p] = msd_score(*splits[col], *splits[row], n_tip);
+    }
+  }
+  return result;
+}
+
+
+// =============================================================================
+// MatchingSplitInfo — LAP-based
+// =============================================================================
+
+static double msi_score(  // Solves an assignment problem on mmsi_score() values scaled into the integer `cost` range.
+    const SplitList& a, const SplitList& b, const int32 n_tips
+) {
+  const int16 most_splits = std::max(a.n_splits, b.n_splits);  // dimension of the square cost matrix
+  if (most_splits == 0) return 0.0;
+
+  constexpr cost max_score = BIG;
+  const double max_possible = lg2_unrooted[n_tips]  // scaling reference taken at a ceil/floor(n_tips / 2) partition
+    - lg2_rooted[int16((n_tips + 1) / 2)]
+    - lg2_rooted[int16(n_tips / 2)];
+  const double score_over_possible = static_cast<double>(max_score) / max_possible;  // pair score -> cost units
+  const double possible_over_score = max_possible / static_cast<double>(max_score);  // cost units -> pair score
+
+  cost_matrix score(most_splits);
+  splitbit different[SL_MAX_BINS];  // per-bin XOR scratch, overwritten for each (ai, bi)
+
+  for (int16 ai = 0; ai < a.n_splits; ++ai) {
+    for (int16 bi = 0; bi < b.n_splits; ++bi) {
+      int16 n_different = 0, n_a_only = 0, n_a_and_b = 0;
+      for (int16 bin = 0; bin < a.n_bins; ++bin) {
+        different[bin]  = a.state[ai][bin] ^ b.state[bi][bin];
+        n_different    += count_bits(different[bin]);
+        n_a_only       += count_bits(a.state[ai][bin] &  different[bin]);  // set in a, and a differs from b
+        n_a_and_b      += count_bits(a.state[ai][bin] & ~different[bin]);  // set in a, and a agrees with b
+      }
+      const int16 n_same = n_tips - n_different;
+      score(ai, bi) = cost(max_score - score_over_possible *  // larger mmsi_score => smaller cost
+        TreeDist::mmsi_score(n_same, n_a_and_b, n_different, n_a_only));
+    }
+    score.padRowAfterCol(ai, b.n_splits, max_score);  // presumably fills the rest of the row with max_score; confirm in cost_matrix
+  }
+  score.padAfterRow(a.n_splits, max_score);  // presumably fills the remaining rows with max_score; confirm in cost_matrix
+
+  std::vector<lap_col> rowsol(most_splits);
+  std::vector<lap_row> colsol(most_splits);
+
+  return static_cast<double>(  // undo the `max_score - ...` inversion, then rescale to score units
+    (max_score * most_splits) - lap(most_splits, score, rowsol, colsol, false)
+  ) * possible_over_score;
+}
+
+//' @keywords internal
+// [[Rcpp::export]]
+NumericVector cpp_msi_all_pairs(
+    const List& splits_list,
+    const int   n_tip,
+    const int   n_threads = 1
+) {  // Returns msi_score() for each unordered pair of entries in splits_list.
+  const int N = splits_list.size();
+  if (N < 2) return NumericVector(0);  // fewer than two trees: no pairs
+  const R_xlen_t n_pairs = static_cast<R_xlen_t>(N) * (N - 1) / 2;  // widened: int N * (N - 1) overflows for N >= 46342
+
+  std::vector<std::unique_ptr<SplitList>> splits;
+  splits.reserve(N);
+  for (int k = 0; k < N; ++k) {  // convert each raw matrix once, before the pairwise loop
+    splits.push_back(
+      std::make_unique<SplitList>(Rcpp::as<RawMatrix>(splits_list[k]))
+    );
+  }
+
+  NumericVector result(n_pairs);
+  double* const res = result.begin();  // each (col, row) pair writes its own slot
+
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic) num_threads(n_threads)
+#endif
+  for (int col = 0; col < N - 1; ++col) {
+#ifndef _OPENMP
+    Rcpp::checkUserInterrupt();  // compiled in only when OpenMP is unavailable
+#endif
+    for (int row = col + 1; row < N; ++row) {
+      const R_xlen_t p = R_xlen_t(col) * (N - 1) - R_xlen_t(col) * (col - 1) / 2 + row - col - 1;  // slot in the packed lower triangle
+      res[p] = msi_score(*splits[col], *splits[row], n_tip);
+    }
+  }
+  return result;
+}
+
+
+// =============================================================================
+// SharedPhylogeneticInfo — LAP-based
+// =============================================================================
+
+static double shared_phylo_score(  // Solves an assignment problem on spi_overlap() values scaled into the integer `cost` range.
+    const SplitList& a, const SplitList& b, const int32 n_tips
+) {
+  const int16 most_splits = std::max(a.n_splits, b.n_splits);  // dimension of the square cost matrix
+  if (most_splits == 0) return 0.0;
+
+  const int16 overlap_a = int16(n_tips + 1) / 2;  // ceil(n_tips / 2)
+  constexpr cost max_score = BIG;
+  const double best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips);  // reference value at a ceil/floor(n_tips / 2) partition
+  const double max_possible = lg2_unrooted[n_tips] - best_overlap;
+  const double score_over_possible = static_cast<double>(max_score) / max_possible;  // overlap units -> cost units
+  const double possible_over_score = max_possible / static_cast<double>(max_score);  // cost units -> overlap units
+
+  cost_matrix score(most_splits);
+
+  for (int16 ai = a.n_splits; ai--; ) {  // counts down from n_splits - 1 to 0
+    for (int16 bi = b.n_splits; bi--; ) {
+      const double spi = TreeDist::spi_overlap(
+        a.state[ai], b.state[bi], n_tips,
+        a.in_split[ai], b.in_split[bi], a.n_bins);
+      score(ai, bi) = (spi == 0.0) ? max_score  // a zero return is given the maximum cost
+                                   : cost((spi - best_overlap) * score_over_possible);
+    }
+    score.padRowAfterCol(ai, b.n_splits, max_score);  // presumably fills the rest of the row with max_score; confirm in cost_matrix
+  }
+  score.padAfterRow(a.n_splits, max_score);  // presumably fills the remaining rows with max_score; confirm in cost_matrix
+
+  std::vector<lap_col> rowsol(most_splits);
+  std::vector<lap_row> colsol(most_splits);
+
+  return static_cast<double>(  // max total cost minus the optimum, rescaled to overlap units
+    (max_score * most_splits) - lap(most_splits, score, rowsol, colsol, false)
+  ) * possible_over_score;
+}
+
+//' @keywords internal
+// [[Rcpp::export]]
+NumericVector cpp_shared_phylo_all_pairs(
+    const List& splits_list,
+    const int   n_tip,
+    const int   n_threads = 1
+) {  // Returns shared_phylo_score() for each unordered pair of entries in splits_list.
+  const int N = splits_list.size();
+  if (N < 2) return NumericVector(0);  // fewer than two trees: no pairs
+  const R_xlen_t n_pairs = static_cast<R_xlen_t>(N) * (N - 1) / 2;  // widened: int N * (N - 1) overflows for N >= 46342
+
+  std::vector<std::unique_ptr<SplitList>> splits;
+  splits.reserve(N);
+  for (int k = 0; k < N; ++k) {  // convert each raw matrix once, before the pairwise loop
+    splits.push_back(
+      std::make_unique<SplitList>(Rcpp::as<RawMatrix>(splits_list[k]))
+    );
+  }
+
+  NumericVector result(n_pairs);
+  double* const res = result.begin();  // each (col, row) pair writes its own slot
+
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic) num_threads(n_threads)
+#endif
+  for (int col = 0; col < N - 1; ++col) {
+#ifndef _OPENMP
+    Rcpp::checkUserInterrupt();  // compiled in only when OpenMP is unavailable
+#endif
+    for (int row = col + 1; row < N; ++row) {
+      const R_xlen_t p = R_xlen_t(col) * (N - 1) - R_xlen_t(col) * (col - 1) / 2 + row - col - 1;  // slot in the packed lower triangle
+      res[p] = shared_phylo_score(*splits[col], *splits[row], n_tip);
+    }
+  }
+  return result;
+}
+
+
+// =============================================================================
+// Jaccard / Nye similarity — LAP-based; k and allow_conflict are per-call
+// =============================================================================
+
+static double jaccard_score(  // Solves an assignment problem on per-pair intersection/union ratios raised to `exponent`.
+    const SplitList& a, const SplitList& b, const int32 n_tips,
+    const double exponent, const bool allow_conflict
+) {
+  const int16 most_splits = std::max(a.n_splits, b.n_splits);  // dimension of the square cost matrix
+  if (most_splits == 0) return 0.0;
+
+  constexpr cost   max_score  = BIG;
+  constexpr double max_scoreL = static_cast<double>(max_score);  // same bound, as a double
+
+  cost_matrix score(most_splits);
+
+  for (int16 ai = 0; ai < a.n_splits; ++ai) {
+    const int16 na = a.in_split[ai];  // lower case: in the split; upper case: the remaining tips
+    const int16 nA = n_tips - na;
+
+    for (int16 bi = 0; bi < b.n_splits; ++bi) {
+      int16 a_and_b = 0;
+      for (int16 bin = 0; bin < a.n_bins; ++bin) {
+        a_and_b += count_bits(a.state[ai][bin] & b.state[bi][bin]);  // bits set in both splits
+      }
+      const int16 nb    = b.in_split[bi];
+      const int16 nB    = n_tips - nb;
+      const int16 a_and_B = na - a_and_b;  // the other three intersections follow by subtraction
+      const int16 A_and_b = nb - a_and_b;
+      const int16 A_and_B = nB - a_and_B;
+
+      if (!allow_conflict && !(  // conflict disallowed and none of the four equalities holds
+            a_and_b == na || a_and_B == na ||
+            A_and_b == nA || A_and_B == nA)) {
+        score(ai, bi) = max_score;
+      } else {
+        const int16 A_or_b = n_tips - a_and_B;  // each union is n_tips minus the opposite intersection
+        const int16 a_or_B = n_tips - A_and_b;
+        const int16 a_or_b = n_tips - A_and_B;
+        const int16 A_or_B = n_tips - a_and_b;
+        const double ars_ab = static_cast<double>(a_and_b) / static_cast<double>(a_or_b);  // intersection / union
+        const double ars_Ab = static_cast<double>(A_and_b) / static_cast<double>(A_or_b);
+        const double ars_aB = static_cast<double>(a_and_B) / static_cast<double>(a_or_B);
+        const double ars_AB = static_cast<double>(A_and_B) / static_cast<double>(A_or_B);
+        const double min_both   = (ars_ab < ars_AB) ? ars_ab : ars_AB;  // a with b, A with B
+        const double min_either = (ars_aB < ars_Ab) ? ars_aB : ars_Ab;  // a with B, A with b
+        const double best = (min_both > min_either) ? min_both : min_either;
+
+        if (exponent == 1.0) {  // avoids the std::pow call
+          score(ai, bi) = cost(max_scoreL - max_scoreL * best);
+        } else if (std::isinf(exponent)) {  // infinite exponent: zero cost only when best == 1.0
+          score(ai, bi) = cost((best == 1.0) ? 0.0 : max_scoreL);
+        } else {
+          score(ai, bi) = cost(max_scoreL - max_scoreL * std::pow(best, exponent));
+        }
+      }
+    }
+    score.padRowAfterCol(ai, b.n_splits, max_score);  // presumably fills the rest of the row with max_score; confirm in cost_matrix
+  }
+  score.padAfterRow(a.n_splits, max_score);  // presumably fills the remaining rows with max_score; confirm in cost_matrix
+
+  std::vector<lap_col> rowsol(most_splits);
+  std::vector<lap_row> colsol(most_splits);
+
+  return static_cast<double>(  // max total cost minus the optimum, divided back out of cost units
+    (max_score * most_splits) - lap(most_splits, score, rowsol, colsol, false)
+  ) / max_scoreL;
+}
+
+//' @keywords internal
+// [[Rcpp::export]]
+NumericVector cpp_jaccard_all_pairs(
+    const List&   splits_list,
+    const int     n_tip,
+    const double  k             = 1.0,
+    const bool    allow_conflict = true,
+    const int     n_threads     = 1
+) {  // Returns jaccard_score() for each unordered pair of entries in splits_list.
+  const int N = splits_list.size();
+  if (N < 2) return NumericVector(0);  // fewer than two trees: no pairs
+  const R_xlen_t n_pairs = static_cast<R_xlen_t>(N) * (N - 1) / 2;  // widened: int N * (N - 1) overflows for N >= 46342
+
+  std::vector<std::unique_ptr<SplitList>> splits;
+  splits.reserve(N);
+  for (int i = 0; i < N; ++i) {  // convert each raw matrix once, before the pairwise loop
+    splits.push_back(
+      std::make_unique<SplitList>(Rcpp::as<RawMatrix>(splits_list[i]))
+    );
+  }
+
+  NumericVector result(n_pairs);
+  double* const res = result.begin();  // each (col, row) pair writes its own slot
+
+#ifdef _OPENMP
+#pragma omp parallel for schedule(dynamic) num_threads(n_threads)
+#endif
+  for (int col = 0; col < N - 1; ++col) {
+#ifndef _OPENMP
+    Rcpp::checkUserInterrupt();  // compiled in only when OpenMP is unavailable
+#endif
+    for (int row = col + 1; row < N; ++row) {
+      const R_xlen_t p = R_xlen_t(col) * (N - 1) - R_xlen_t(col) * (col - 1) / 2 + row - col - 1;  // slot in the packed lower triangle
+      res[p] = jaccard_score(*splits[col], *splits[row], n_tip, k, allow_conflict);  // `k` is passed as the exponent
+    }
+  }
+  return result;
+}

From f6cd4db91aa894e15005b721be13e7a80275cb31 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Tue, 10 Mar 2026 14:58:54 +0000
Subject: [PATCH 06/19] OpenMP

---
 inst/WORDLIST | 1 +
 1 file changed, 1 insertion(+)

diff --git a/inst/WORDLIST b/inst/WORDLIST
index ac56518cf..edb7570a4 100644
--- a/inst/WORDLIST
+++ b/inst/WORDLIST
@@ -43,6 +43,7 @@ NNIDist
 NyeTreeSimilarity
 OEIS
 ORCID
+OpenMP
 Perotti
 PhysRevE
 PlotTools

From 283fb19fb744e9ff44cdd7b0e07e030985720017 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Tue, 10 Mar 2026 14:59:56 +0000
Subject: [PATCH 07/19] "Neil", "Kaye", role = c("cph"),

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 319cd6e61..59fbaebda 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -11,7 +11,7 @@ Authors@R: c(person("Martin R.", "Smith",
              person("Yong", "Yang", email = "yongyanglink@gmail.com",
                     role = c("ctb", "cph"), comment = "LAP algorithm"),
              person("Yi", "Cao", role = c("ctb", "cph"), comment = "LAP algorithm"),
-             person("Neil", "Kaye", comment = "Mercator image")
+             person("Neil", "Kaye", role = c("cph"), comment = "Mercator image")
             )
 License: GPL (>= 3)
 Description: Implements measures of tree similarity, including 

From bae52e4632b36a6b3063a521422000fefc81ee0f Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Tue, 10 Mar 2026 15:42:15 +0000
Subject: [PATCH 08/19] macOS

---
 inst/WORDLIST | 1 +
 1 file changed, 1 insertion(+)

diff --git a/inst/WORDLIST b/inst/WORDLIST
index edb7570a4..cd993881d 100644
--- a/inst/WORDLIST
+++ b/inst/WORDLIST
@@ -98,6 +98,7 @@ ingroup
 interdecile
 jonker
 magiclogic
+macOS
 mergesort
 molbev
 msw

From fb6d61723ed431d56682adc39ce20963e8c3ea9c Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Tue, 10 Mar 2026 15:48:06 +0000
Subject: [PATCH 09/19] cache lt comparison

---
 src/lap.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/lap.cpp b/src/lap.cpp
index 73f6c5989..9614cb992 100644
--- a/src/lap.cpp
+++ b/src/lap.cpp
@@ -166,7 +166,8 @@ cost lap(const lap_row dim,
       lap_col j1 = min_idx;
       
       lap_row i0 = colsol[j1];
-      if (nontrivially_less_than(umin, usubmin)) {
+      const bool strictly_less = nontrivially_less_than(umin, usubmin);
+      if (strictly_less) {
         //  Change the reduction of the minimum column to increase the minimum
         //  reduced cost in the row to the subminimum.
         v[j1] -= (usubmin - umin);
@@ -183,7 +184,7 @@ cost lap(const lap_row dim,
       colsol[j1] = i;
       
       if (i0 > -1) { // Minimum column j1 assigned earlier.
-        if (nontrivially_less_than(umin, usubmin)) {
+        if (strictly_less) {
           // Put in current k, and go back to that k.
           // Continue augmenting path i - j1 with i0.
           freeunassigned[--k] = i0;

From da4798dcfdf97d8ac14766de3cf9785bcf7dda32 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Tue, 10 Mar 2026 15:52:14 +0000
Subject: [PATCH 10/19] Plan & prevent plot interruption

---
 AGENTS.md              | 53 ++++++++++++++++++++++++++++++++++++++++++
 tests/testthat/setup.R |  7 ++++++
 2 files changed, 60 insertions(+)
 create mode 100644 tests/testthat/setup.R

diff --git a/AGENTS.md b/AGENTS.md
index 7cb7f775c..3ae99878b 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -260,6 +260,59 @@ with sensible defaults (1.0, TRUE) if absent.
 
 ---
 
+## LAP Profiling Notes
+
+### Scaling behaviour (debug build, uniform random matrices)
+
+| n | median (μs) | implied α |
+|---|---|---|
+| 10 | 14.8 | — |
+| 25 | 28.1 | 0.70 |
+| 47 | 76.6 | 1.59 |
+| 97 | 377 | 2.20 |
+| 197 | 1798 | 2.21 |
+| 400 | 14953 | 2.99 |
+
+For the actual tree-distance workload, scaling is still sub-quadratic at n ≈ 47
+(50-tip trees; α ≈ 1.6) but super-quadratic by n ≈ 197 (200-tip trees; α ≈ 2.2).
+The Dijkstra augmentation phase is the dominant cost from n ≈ 100 upward.
+
+### Phase analysis
+
+- **Column reduction** (n² total, O(n) per column via transposed sequential scan):
+  already well-optimised; the one-time transpose pays for itself immediately.
+- **Reduction transfer** (n² total): was blocked from SIMD auto-vectorisation
+  by an `if (j == j1) continue;` branch inside the minimum-search loop.
+  **Fixed** by splitting around `j1` into two clean vectorisable loops.
+- **Augmenting row reduction** (2 × free_rows × n): calls `findRowSubmin`,
+  already 4× manually unrolled.  `nontrivially_less_than()` was called twice
+  per free row with identical arguments; **fixed** by caching as `strictly_less`.
+- **Augment solution / Dijkstra** (free_rows × n²): the dominant phase for
+  n ≥ 100.  The inner update loop iterates over `col_list[up..dim-1]` via
+  indirect `j = col_list[k]` indexing, then accesses `row_i[j]`, `v_ptr[j]`,
+  and `d[j]` as scattered gathers — preventing SIMD vectorisation.
+
+### Dijkstra restructuring opportunity (unimplemented — needs release-build VTune)
+
+Replace the `col_list` permutation with a `bool scanned[dim]` mask and iterate
+directly over `0..dim-1`.  Pros: sequential access to `row_i`, `v_ptr`, `d`
+→ auto-vectorisable; no indirect gather.  Cons: visits all `dim` columns each
+iteration (vs progressively fewer with `col_list`), so ~2× more comparisons in
+the best case.  Net benefit is uncertain and must be measured on a release build.
+
+**To profile:** build with `PKG_CXXFLAGS="-O2 -g -fno-omit-frame-pointer"` and
+run `benchmark/vtune-driver.R` through VTune hotspot collection.
+
+### Test suite fix
+
+Added `tests/testthat/setup.R` that opens a null PDF device for the duration of
+all test runs.  This suppresses bare `plot()` / `TreeDistPlot()` calls in tests
+from appearing in the interactive graphics device, and prevents vdiffr snapshot
+rendering from leaking to screen.  vdiffr opens its own `svglite` device on top
+of the null device, so snapshot tests are unaffected.
+
+---
+
 ## Known Optimization Opportunities / TODOs
 
 - `information.h` line 19: comment suggests considering increasing `MAX_FACTORIAL_LOOKUP`
diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
new file mode 100644
index 000000000..d3da6c074
--- /dev/null
+++ b/tests/testthat/setup.R
@@ -0,0 +1,7 @@
+# Send all graphics to a null PDF device for the whole test session, so bare
+# plot() calls and vdiffr rendering never reach the interactive device (e.g.
+# the RStudio viewer).  vdiffr opens its own svglite device on top of this
+# one, so snapshot tests are unaffected.
+grDevices::pdf(NULL)
+null_dev <- grDevices::dev.cur()  # close this device, not whichever is current
+withr::defer(grDevices::dev.off(null_dev), teardown_env())

From 73c71bd1f88649ef73270a0be7ba955e6f6aa188 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Wed, 11 Mar 2026 07:56:08 +0000
Subject: [PATCH 11/19] LapScratch

Modest speedup
---
 src/lap.cpp                |  18 ++++---
 src/pairwise_distances.cpp | 104 +++++++++++++++++++++++++++----------
 src/tree_distances.h       |  61 ++++++++++++++++++++--
 3 files changed, 147 insertions(+), 36 deletions(-)

diff --git a/src/lap.cpp b/src/lap.cpp
index 9614cb992..2e09138f6 100644
--- a/src/lap.cpp
+++ b/src/lap.cpp
@@ -70,7 +70,8 @@ cost lap(const lap_row dim,
          cost_matrix &input_cost,
          std::vector<lap_col> &rowsol,
          std::vector<lap_row> &colsol,
-         const bool allow_interrupt)
+         const bool allow_interrupt,
+         LapScratch &scratch)
   
   // input:
   // dim        - problem size
@@ -82,9 +83,12 @@ cost lap(const lap_row dim,
   
 {
   lap_row num_free = 0;
-  alignas(64) std::vector<cost> v(((dim + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE);
+  scratch.ensure(dim);
+  auto& v       = scratch.v;
+  auto& matches = scratch.matches;
+  // matches must start at zero for the column-reduction counter
+  std::fill(matches.begin(), matches.begin() + dim, 0);
   const cost* __restrict__ v_ptr = v.data();
-  std::vector<lap_col> matches(dim); // Counts how many times a row could be assigned.
   
   // COLUMN REDUCTION
   for (lap_col j = dim; j--; ) { // Reverse order gives better results.
@@ -108,7 +112,7 @@ cost lap(const lap_row dim,
   }
   
   // REDUCTION TRANSFER
-  std::vector<lap_row> freeunassigned(dim);        // List of unassigned rows.
+  auto& freeunassigned = scratch.freeunassigned;   // List of unassigned rows.
   
   for (lap_row i = 0; i < dim; ++i) {
     if (matches[i] == 0) {
@@ -146,7 +150,7 @@ cost lap(const lap_row dim,
   }
   
   //   AUGMENTING ROW REDUCTION
-  std::vector<lap_col> col_list(dim);    // List of columns to be scanned in various ways.
+  auto& col_list = scratch.col_list;    // List of columns to be scanned in various ways.
   int loopcnt = 0;                       // do-loop to be done twice.
   
   do {
@@ -199,8 +203,8 @@ cost lap(const lap_row dim,
   } while (loopcnt < 2); // Repeat once.
   
   // AUGMENT SOLUTION for each free row.
-  std::vector<cost> d(dim);              // 'Cost-distance' in augmenting path calculation.
-  std::vector<lap_row> predecessor(dim); // Row-predecessor of column in augmenting/alternating path.
+  auto& d           = scratch.d;           // 'Cost-distance' in augmenting path calculation.
+  auto& predecessor = scratch.predecessor; // Row-predecessor of column in augmenting/alternating path.
   
   for (lap_row f = 0; f < num_free; ++f) {
     bool unassignedfound = false;
diff --git a/src/pairwise_distances.cpp b/src/pairwise_distances.cpp
index a35f4958e..f4b559444 100644
--- a/src/pairwise_distances.cpp
+++ b/src/pairwise_distances.cpp
@@ -32,7 +32,8 @@ using TreeTools::count_bits;
 // Passes allow_interrupt = false to lap() so it is safe to call from an
 // OpenMP parallel region.
 static double mutual_clustering_score(
-    const SplitList& a, const SplitList& b, const int32 n_tips
+    const SplitList& a, const SplitList& b, const int32 n_tips,
+    LapScratch& scratch
 ) {
   if (a.n_splits == 0 || b.n_splits == 0 || n_tips == 0) return 0.0;
 
@@ -104,8 +105,9 @@ static double mutual_clustering_score(
   }
 
   const int16 lap_n = most_splits - exact_n;
-  std::vector<lap_col> rowsol(lap_n);
-  std::vector<lap_row> colsol(lap_n);
+  scratch.ensure(most_splits);
+  auto& rowsol = scratch.rowsol;
+  auto& colsol = scratch.colsol;
 
   if (exact_n) {
     // Build a reduced cost matrix omitting exact-matched rows/cols
@@ -126,7 +128,7 @@ static double mutual_clustering_score(
 
     const double lap_score =
       static_cast<double>((max_score * lap_n) -
-                          lap(lap_n, small, rowsol, colsol, false)) * over_max;
+                          lap(lap_n, small, rowsol, colsol, false, scratch)) * over_max;
     return lap_score + exact_score * n_tips_rcp;
 
   } else {
@@ -137,7 +139,7 @@ static double mutual_clustering_score(
       }
     }
     return static_cast<double>(
-      (max_score * lap_n) - lap(lap_n, score, rowsol, colsol, false)
+      (max_score * lap_n) - lap(lap_n, score, rowsol, colsol, false, scratch)
     ) / max_score;
   }
 }
@@ -183,6 +185,11 @@ NumericVector cpp_mutual_clustering_all_pairs(
   NumericVector result(n_pairs);
   double* const res = result.begin();
 
+  // One LapScratch per thread — grown lazily on first use, never freed between
+  // pairs.  Indexed by omp_get_thread_num() (always 0 in the serial path).
+  const int n_scratch = std::max(1, n_threads);
+  std::vector<LapScratch> scratches(n_scratch);
+
   // Iterate over columns of the combn(N,2) lower triangle.
   // Pair (col, row) with col < row maps to dist-vector index:
   //   p = col*(N-1) - col*(col-1)/2 + row - col - 1
@@ -192,10 +199,15 @@ NumericVector cpp_mutual_clustering_all_pairs(
   for (int col = 0; col < N - 1; ++col) {
 #ifndef _OPENMP
     Rcpp::checkUserInterrupt();
+#endif
+#ifdef _OPENMP
+    LapScratch& scratch = scratches[omp_get_thread_num()];
+#else
+    LapScratch& scratch = scratches[0];
 #endif
     for (int row = col + 1; row < N; ++row) {
       const int p = col * (N - 1) - col * (col - 1) / 2 + row - col - 1;
-      res[p] = mutual_clustering_score(*splits[col], *splits[row], n_tip);
+      res[p] = mutual_clustering_score(*splits[col], *splits[row], n_tip, scratch);
     }
   }
 
@@ -296,7 +308,8 @@ NumericVector cpp_rf_info_all_pairs(
 // =============================================================================
 
 static double msd_score(
-    const SplitList& a, const SplitList& b, const int32 n_tips
+    const SplitList& a, const SplitList& b, const int32 n_tips,
+    LapScratch& scratch
 ) {
   const int16 most_splits = std::max(a.n_splits, b.n_splits);
   if (most_splits == 0) return 0.0;
@@ -321,11 +334,12 @@ static double msd_score(
   }
   score.padAfterRow(a.n_splits, max_score);
 
-  std::vector<lap_col> rowsol(most_splits);
-  std::vector<lap_row> colsol(most_splits);
+  scratch.ensure(most_splits);
+  auto& rowsol = scratch.rowsol;
+  auto& colsol = scratch.colsol;
 
   return static_cast<double>(
-    lap(most_splits, score, rowsol, colsol, false) - max_score * split_diff
+    lap(most_splits, score, rowsol, colsol, false, scratch) - max_score * split_diff
   );
 }
 
@@ -351,16 +365,24 @@ NumericVector cpp_msd_all_pairs(
   NumericVector result(n_pairs);
   double* const res = result.begin();
 
+  const int n_scratch = std::max(1, n_threads);
+  std::vector<LapScratch> scratches(n_scratch);
+
 #ifdef _OPENMP
 #pragma omp parallel for schedule(dynamic) num_threads(n_threads)
 #endif
   for (int col = 0; col < N - 1; ++col) {
 #ifndef _OPENMP
     Rcpp::checkUserInterrupt();
+#endif
+#ifdef _OPENMP
+    LapScratch& scratch = scratches[omp_get_thread_num()];
+#else
+    LapScratch& scratch = scratches[0];
 #endif
     for (int row = col + 1; row < N; ++row) {
       const int p = col * (N - 1) - col * (col - 1) / 2 + row - col - 1;
-      res[p] = msd_score(*splits[col], *splits[row], n_tip);
+      res[p] = msd_score(*splits[col], *splits[row], n_tip, scratch);
     }
   }
   return result;
@@ -372,7 +394,8 @@ NumericVector cpp_msd_all_pairs(
 // =============================================================================
 
 static double msi_score(
-    const SplitList& a, const SplitList& b, const int32 n_tips
+    const SplitList& a, const SplitList& b, const int32 n_tips,
+    LapScratch& scratch
 ) {
   const int16 most_splits = std::max(a.n_splits, b.n_splits);
   if (most_splits == 0) return 0.0;
@@ -404,11 +427,12 @@ static double msi_score(
   }
   score.padAfterRow(a.n_splits, max_score);
 
-  std::vector<lap_col> rowsol(most_splits);
-  std::vector<lap_row> colsol(most_splits);
+  scratch.ensure(most_splits);
+  auto& rowsol = scratch.rowsol;
+  auto& colsol = scratch.colsol;
 
   return static_cast<double>(
-    (max_score * most_splits) - lap(most_splits, score, rowsol, colsol, false)
+    (max_score * most_splits) - lap(most_splits, score, rowsol, colsol, false, scratch)
   ) * possible_over_score;
 }
 
@@ -434,16 +458,24 @@ NumericVector cpp_msi_all_pairs(
   NumericVector result(n_pairs);
   double* const res = result.begin();
 
+  const int n_scratch = std::max(1, n_threads);
+  std::vector<LapScratch> scratches(n_scratch);
+
 #ifdef _OPENMP
 #pragma omp parallel for schedule(dynamic) num_threads(n_threads)
 #endif
   for (int col = 0; col < N - 1; ++col) {
 #ifndef _OPENMP
     Rcpp::checkUserInterrupt();
+#endif
+#ifdef _OPENMP
+    LapScratch& scratch = scratches[omp_get_thread_num()];
+#else
+    LapScratch& scratch = scratches[0];
 #endif
     for (int row = col + 1; row < N; ++row) {
       const int p = col * (N - 1) - col * (col - 1) / 2 + row - col - 1;
-      res[p] = msi_score(*splits[col], *splits[row], n_tip);
+      res[p] = msi_score(*splits[col], *splits[row], n_tip, scratch);
     }
   }
   return result;
@@ -455,7 +487,8 @@ NumericVector cpp_msi_all_pairs(
 // =============================================================================
 
 static double shared_phylo_score(
-    const SplitList& a, const SplitList& b, const int32 n_tips
+    const SplitList& a, const SplitList& b, const int32 n_tips,
+    LapScratch& scratch
 ) {
   const int16 most_splits = std::max(a.n_splits, b.n_splits);
   if (most_splits == 0) return 0.0;
@@ -481,11 +514,12 @@ static double shared_phylo_score(
   }
   score.padAfterRow(a.n_splits, max_score);
 
-  std::vector<lap_col> rowsol(most_splits);
-  std::vector<lap_row> colsol(most_splits);
+  scratch.ensure(most_splits);
+  auto& rowsol = scratch.rowsol;
+  auto& colsol = scratch.colsol;
 
   return static_cast<double>(
-    (max_score * most_splits) - lap(most_splits, score, rowsol, colsol, false)
+    (max_score * most_splits) - lap(most_splits, score, rowsol, colsol, false, scratch)
   ) * possible_over_score;
 }
 
@@ -511,16 +545,24 @@ NumericVector cpp_shared_phylo_all_pairs(
   NumericVector result(n_pairs);
   double* const res = result.begin();
 
+  const int n_scratch = std::max(1, n_threads);
+  std::vector<LapScratch> scratches(n_scratch);
+
 #ifdef _OPENMP
 #pragma omp parallel for schedule(dynamic) num_threads(n_threads)
 #endif
   for (int col = 0; col < N - 1; ++col) {
 #ifndef _OPENMP
     Rcpp::checkUserInterrupt();
+#endif
+#ifdef _OPENMP
+    LapScratch& scratch = scratches[omp_get_thread_num()];
+#else
+    LapScratch& scratch = scratches[0];
 #endif
     for (int row = col + 1; row < N; ++row) {
       const int p = col * (N - 1) - col * (col - 1) / 2 + row - col - 1;
-      res[p] = shared_phylo_score(*splits[col], *splits[row], n_tip);
+      res[p] = shared_phylo_score(*splits[col], *splits[row], n_tip, scratch);
     }
   }
   return result;
@@ -533,7 +575,8 @@ NumericVector cpp_shared_phylo_all_pairs(
 
 static double jaccard_score(
     const SplitList& a, const SplitList& b, const int32 n_tips,
-    const double exponent, const bool allow_conflict
+    const double exponent, const bool allow_conflict,
+    LapScratch& scratch
 ) {
   const int16 most_splits = std::max(a.n_splits, b.n_splits);
   if (most_splits == 0) return 0.0;
@@ -588,11 +631,12 @@ static double jaccard_score(
   }
   score.padAfterRow(a.n_splits, max_score);
 
-  std::vector<lap_col> rowsol(most_splits);
-  std::vector<lap_row> colsol(most_splits);
+  scratch.ensure(most_splits);
+  auto& rowsol = scratch.rowsol;
+  auto& colsol = scratch.colsol;
 
   return static_cast<double>(
-    (max_score * most_splits) - lap(most_splits, score, rowsol, colsol, false)
+    (max_score * most_splits) - lap(most_splits, score, rowsol, colsol, false, scratch)
   ) / max_scoreL;
 }
 
@@ -620,16 +664,24 @@ NumericVector cpp_jaccard_all_pairs(
   NumericVector result(n_pairs);
   double* const res = result.begin();
 
+  const int n_scratch = std::max(1, n_threads);
+  std::vector<LapScratch> scratches(n_scratch);
+
 #ifdef _OPENMP
 #pragma omp parallel for schedule(dynamic) num_threads(n_threads)
 #endif
   for (int col = 0; col < N - 1; ++col) {
 #ifndef _OPENMP
     Rcpp::checkUserInterrupt();
+#endif
+#ifdef _OPENMP
+    LapScratch& scratch = scratches[omp_get_thread_num()];
+#else
+    LapScratch& scratch = scratches[0];
 #endif
     for (int row = col + 1; row < N; ++row) {
       const int p = col * (N - 1) - col * (col - 1) / 2 + row - col - 1;
-      res[p] = jaccard_score(*splits[col], *splits[row], n_tip, k, allow_conflict);
+      res[p] = jaccard_score(*splits[col], *splits[row], n_tip, k, allow_conflict, scratch);
     }
   }
   return result;
diff --git a/src/tree_distances.h b/src/tree_distances.h
index 06b8661a9..35d58c397 100644
--- a/src/tree_distances.h
+++ b/src/tree_distances.h
@@ -83,6 +83,13 @@ class CostMatrix {
     makeUntranspose();
   }
   
+  // Reset for reuse at the same dimension — clears the transpose cache so the
+  // next findColMin() re-builds it from fresh data.  Call between pair
+  // computations when reusing a thread-local CostMatrix instance.
+  void reset() noexcept { transposed_ = false; }
+
+  [[nodiscard]] size_t dim() const noexcept { return dim_; }
+
   void makeTranspose() noexcept {
     if (transposed_) return;
     
@@ -188,10 +195,12 @@ class CostMatrix {
     std::fill(data_.begin() + start_index, data_.begin() + end_index, value);
   }
   
-  std::pair<cost, lap_row> findColMin(lap_col j) {
+  std::pair<cost, lap_row> findColMin(lap_col j,
+                                      lap_row search_dim = -1) {
     makeTranspose();
     const cost* col_data = col(j);
-    const auto min_ptr = std::min_element(col_data, col_data + dim_);
+    const lap_row n = (search_dim < 0) ? static_cast<lap_row>(dim_) : search_dim;
+    const auto min_ptr = std::min_element(col_data, col_data + n);
     return {*min_ptr,
             static_cast<lap_row>(std::distance(col_data, min_ptr))};
   }
@@ -374,13 +383,59 @@ class CostMatrix {
 
 using cost_matrix = CostMatrix;
 
+// ---------------------------------------------------------------------------
+// LapScratch — reusable heap storage for one thread's LAPJV workspace.
+// Pass to the scratch-taking overload of lap() to eliminate the ~6 per-call
+// std::vector allocations.  In a serial context construct once before the
+// loop; in an OpenMP context allocate one per thread and index by
+// omp_get_thread_num().  All vectors are grown lazily; never shrunk.
+// ---------------------------------------------------------------------------
+struct LapScratch {
+  std::vector<cost>    v;               // column dual variables (padded)
+  std::vector<lap_col> matches;         // assignment-count per row
+  std::vector<lap_row> freeunassigned;  // list of unassigned rows
+  std::vector<lap_col> col_list;        // column scan list
+  std::vector<cost>    d;               // Dijkstra distances
+  std::vector<lap_row> predecessor;     // augmenting-path predecessors
+  // Solution buffers, for score functions that discard the assignment.
+  std::vector<lap_col> rowsol;
+  std::vector<lap_row> colsol;
+
+  // Grow each buffer to at least `dim` entries (`v`: rounded up to a multiple
+  // of BLOCK_SIZE).  Not noexcept: resize() can throw std::bad_alloc.
+  void ensure(int dim) {
+    const int padded = static_cast<int>(
+        ((static_cast<size_t>(dim) + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE);
+    if (static_cast<int>(v.size())              < padded) v.resize(padded);
+    if (static_cast<int>(matches.size())        < dim)    matches.resize(dim);
+    if (static_cast<int>(freeunassigned.size()) < dim)    freeunassigned.resize(dim);
+    if (static_cast<int>(col_list.size())       < dim)    col_list.resize(dim);
+    if (static_cast<int>(d.size())              < dim)    d.resize(dim);
+    if (static_cast<int>(predecessor.size())    < dim)    predecessor.resize(dim);
+    if (static_cast<int>(rowsol.size())         < dim)    rowsol.resize(dim);
+    if (static_cast<int>(colsol.size())         < dim)    colsol.resize(dim);
+  }
+};
+
 /*************** FUNCTIONS  *******************/
 
+// Primary overload: caller supplies pre-allocated scratch (zero alloc in hot loop).
 extern cost lap(lap_row dim,
                 cost_matrix &input_cost,
                 std::vector<lap_col> &rowsol,
                 std::vector<lap_row> &colsol,
-                bool allow_interrupt = true);
+                bool allow_interrupt,
+                LapScratch &scratch);
+
+// Convenience overload: creates a temporary scratch (for one-off calls).
+inline cost lap(lap_row dim,
+                cost_matrix &input_cost,
+                std::vector<lap_col> &rowsol,
+                std::vector<lap_row> &colsol,
+                bool allow_interrupt = true) {
+  LapScratch scratch;
+  return lap(dim, input_cost, rowsol, colsol, allow_interrupt, scratch);
+}
 
 namespace TreeDist {
 

From 46eb4131821e01dd9b9c2475c7ad902e8eb94580 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Wed, 11 Mar 2026 16:30:12 +0000
Subject: [PATCH 12/19] Document failed optimization attempt

---
 AGENTS.md | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 50 insertions(+), 3 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 3ae99878b..afc22bbac 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -109,6 +109,35 @@ When run non-interactively (e.g. CI) results are serialised to `.bench.Rds` file
 `_compare_results.R` compares PR results against `main` and fails the build on a
 regression > 4 %.
 
+### ⚠ Benchmarking protocol — always use a release install
+
+**Do not benchmark with `devtools::load_all()`.**  It appends
+`-UNDEBUG -Wall -pedantic -g -O0` to the compiler flags, which overrides the
+`-O2` in `~/.R/Makevars` and produces an unrepresentative (typically 3–5×
+slower) build.  All timing figures in this file were produced from a release
+install unless noted otherwise.
+
+**Correct workflow** (run from a *fresh* R session — do NOT load TreeDist first,
+to avoid the Windows DLL lock):
+
+```r
+source("benchmark/bench-release.R")
+
+# 1. Benchmark HEAD (before your patch)
+bench_release(label = "baseline")
+
+# 2. Apply your changes, then benchmark again
+bench_release(label = "my-optimisation")
+
+# 3. Compare
+bench_compare("baseline", "my-optimisation")
+```
+
+`bench_release()` installs the current working-tree source to a private temp
+library via `install.packages(..., type = "source")` (so Makevars flags apply
+correctly) and runs the suite in a `Rscript` subprocess.  Results are saved to
+`benchmark/results/<label>.Rds` and persist across sessions.
+
 ### Existing benchmark scripts
 
 | Script | What it tests |
@@ -238,6 +267,21 @@ machines where OpenMP is available, and actively harms performance for typical
 analysis sizes (≤ 200 trees).  The fast path therefore bypasses the R cluster
 entirely when `cluster` is `NULL`.
 
+#### Per-call heap allocation elimination via `LapScratch` (attempted, reverted)
+
+Attempted to eliminate the ~8 per-pair `std::vector` / `CostMatrix` heap
+allocations in `pairwise_distances.cpp` by defining a `LapScratch` struct that
+holds all scratch storage and is reused across pairs (one instance per OpenMP
+thread, indexed by `omp_get_thread_num()`).
+
+**Release-build (`-O2`) benchmarks showed no measurable difference (−0.7% to
++0.5% across all scenarios)** — the allocator is fast enough at `-O2` that
+allocation overhead is buried in the Dijkstra phase, which dominates throughout.
+
+The implementation was reverted because it added substantial complexity for
+zero measured benefit.
+
+
 #### OpenMP rollout to all remaining distance metrics (this dev cycle)
 
 Extended OpenMP batch computation to all LAP-based and RF-info metrics.
@@ -281,9 +325,12 @@ The Dijkstra augmentation phase is the dominant cost from n ≈ 100 upward.
 
 - **Column reduction** (n² total, O(n) per column via transposed sequential scan):
   already well-optimised; the one-time transpose pays for itself immediately.
-- **Reduction transfer** (n² total): was blocked from SIMD auto-vectorisation
-  by an `if (j == j1) continue;` branch inside the minimum-search loop.
-  **Fixed** by splitting around `j1` into two clean vectorisable loops.
+- **Reduction transfer** (n² total): the `if (j == j1) continue;` branch inside
+  the minimum-search loop was investigated as a vectorisation blocker. Assembly
+  analysis (GCC 14 / MinGW, `-O2 -march=native`) showed that **neither** the
+  branched nor the split form vectorises — `int_fast64_t` has no SIMD vector
+  type on this toolchain, and the outer `matches[i] != 1` check creates a
+  multi-loop nest the vectoriser refuses regardless. No change was made.
 - **Augmenting row reduction** (2 × free_rows × n): calls `findRowSubmin`,
   already 4× manually unrolled.  `nontrivially_less_than()` was called twice
   per free row with identical arguments; **fixed** by caching as `strictly_less`.

From 29c56b498ba8db224c77dd29fe5c17367d953d30 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Thu, 12 Mar 2026 07:51:25 +0000
Subject: [PATCH 13/19] Shrink lg2 to fit in L1 cache

---
 src/pairwise_distances.cpp      | 9 +++++----
 src/tree_distance_functions.cpp | 2 +-
 src/tree_distances.cpp          | 9 +++++----
 src/tree_distances.h            | 7 ++++---
 4 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/src/pairwise_distances.cpp b/src/pairwise_distances.cpp
index f4b559444..d165fbbd4 100644
--- a/src/pairwise_distances.cpp
+++ b/src/pairwise_distances.cpp
@@ -46,6 +46,7 @@ static double mutual_clustering_score(
   constexpr cost max_score  = BIG;
   constexpr double over_max = 1.0 / static_cast<double>(BIG);
   const double max_over_tips = static_cast<double>(BIG) * n_tips_rcp;
+  const double lg2_n = lg2[n_tips];
 
   cost_matrix score(most_splits);
   double exact_score  = 0.0;
@@ -86,10 +87,10 @@ static double mutual_clustering_score(
         score(ai, bi) = max_score; // Avoid rounding errors on orthogonal splits
       } else {
         double ic_sum = 0.0;
-        TreeDist::add_ic_element(ic_sum, a_and_b, na, nb, n_tips);
-        TreeDist::add_ic_element(ic_sum, a_and_B, na, nB, n_tips);
-        TreeDist::add_ic_element(ic_sum, A_and_b, nA, nb, n_tips);
-        TreeDist::add_ic_element(ic_sum, A_and_B, nA, nB, n_tips);
+        TreeDist::add_ic_element(ic_sum, a_and_b, na, nb, n_tips, lg2_n);
+        TreeDist::add_ic_element(ic_sum, a_and_B, na, nB, n_tips, lg2_n);
+        TreeDist::add_ic_element(ic_sum, A_and_b, nA, nb, n_tips, lg2_n);
+        TreeDist::add_ic_element(ic_sum, A_and_B, nA, nB, n_tips, lg2_n);
         score(ai, bi) = max_score - static_cast<cost>(ic_sum * max_over_tips);
       }
     }
diff --git a/src/tree_distance_functions.cpp b/src/tree_distance_functions.cpp
index 3765c5b8c..aac34a482 100644
--- a/src/tree_distance_functions.cpp
+++ b/src/tree_distance_functions.cpp
@@ -7,7 +7,7 @@
 
 using namespace Rcpp;
 
-constexpr int32 LG2_SIZE = (int32(SL_MAX_TIPS) - 1) * (SL_MAX_TIPS - 1) + 1;
+constexpr int32 LG2_SIZE = SL_MAX_TIPS + 1;
 
 double lg2[LG2_SIZE];
 double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2];
diff --git a/src/tree_distances.cpp b/src/tree_distances.cpp
index 9d33f717d..bd47843f9 100644
--- a/src/tree_distances.cpp
+++ b/src/tree_distances.cpp
@@ -398,6 +398,7 @@ List mutual_clustering(const RawMatrix &x, const RawMatrix &y,
   constexpr cost max_score = BIG;
   constexpr double over_max_score = 1.0 / static_cast<double>(max_score);
   const double max_over_tips = static_cast<double>(max_score) * n_tips_reciprocal;
+  const double lg2_n = lg2[n_tips];
   
   cost_matrix score(most_splits);
   
@@ -443,10 +444,10 @@ List mutual_clustering(const RawMatrix &x, const RawMatrix &y,
         score(ai, bi) = max_score; // Avoid rounding errors
       } else {
         double ic_sum = 0.0;
-        TreeDist::add_ic_element(ic_sum, a_and_b, na, nb, n_tips);
-        TreeDist::add_ic_element(ic_sum, a_and_B, na, nB, n_tips);
-        TreeDist::add_ic_element(ic_sum, A_and_b, nA, nb, n_tips);
-        TreeDist::add_ic_element(ic_sum, A_and_B, nA, nB, n_tips);
+        TreeDist::add_ic_element(ic_sum, a_and_b, na, nb, n_tips, lg2_n);
+        TreeDist::add_ic_element(ic_sum, a_and_B, na, nB, n_tips, lg2_n);
+        TreeDist::add_ic_element(ic_sum, A_and_b, nA, nb, n_tips, lg2_n);
+        TreeDist::add_ic_element(ic_sum, A_and_B, nA, nB, n_tips, lg2_n);
         
         // Division by n_tips converts n(A&B) to P(A&B) for each ic_element
         score(ai, bi) = max_score - static_cast<cost>(ic_sum * max_over_tips);
diff --git a/src/tree_distances.h b/src/tree_distances.h
index 35d58c397..4b1994d0a 100644
--- a/src/tree_distances.h
+++ b/src/tree_distances.h
@@ -19,7 +19,7 @@ using lap_col = lap_dim;
 /***** Constants requiring initialization *****/
 
 constexpr splitbit ALL_ONES = (std::numeric_limits<splitbit>::max)();
-extern double lg2[int32(SL_MAX_TIPS - 1) * (SL_MAX_TIPS - 1) + 1];
+extern double lg2[SL_MAX_TIPS + 1];
 extern double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2];
 extern double lg2_unrooted[SL_MAX_TIPS + 2];
 extern double *lg2_rooted;
@@ -442,13 +442,14 @@ namespace TreeDist {
   // See equation 16 in Meila 2007 (k' denoted K).
   // nkK is converted to pkK in the calling function when divided by n.
   inline void add_ic_element(double& ic_sum, const int16 nkK, const int16 nk,
-                             const int16 nK, const int16 n_tips) noexcept {
+                             const int16 nK, const int16 n_tips,
+                             const double lg2_n) noexcept {
     if (nkK && nk && nK) {
       assert(!(nkK == nk && nkK == nK && nkK << 1 == n_tips));
       const int32 numerator = nkK * n_tips;
       const int32 denominator = nk * nK;
       if (numerator != denominator) {
-        ic_sum += nkK * (lg2[numerator] - lg2[denominator]);
+        ic_sum += nkK * (lg2[nkK] + lg2_n - lg2[nk] - lg2[nK]);
       }
     }
   }

From aa3a8ab85f327e93a737379f74ea964d47d752e8 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Thu, 12 Mar 2026 08:05:13 +0000
Subject: [PATCH 14/19] improve benchmarking commentary (from TreeTools)

---
 .github/workflows/benchmark.yml |  34 +++++-
 AGENTS.md                       |  48 +++++++-
 benchmark/compare-ab.R          | 189 ++++++++++++++++++++++++++++++++
 benchmark/compare-release.R     | 128 +++++++++++++++++++++
 4 files changed, 394 insertions(+), 5 deletions(-)
 create mode 100644 benchmark/compare-ab.R
 create mode 100644 benchmark/compare-release.R

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index e737f07ae..bb576aa8e 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -117,9 +117,39 @@ jobs:
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           script: |
-            github.rest.issues.createComment({
+            const benchmarkIdentifier = '<!-- benchmark-comment -->';
+            const outdatedPrefix = '> **⚠️ This benchmark result is outdated. See the latest comment below.**\n\n';
+            
+            // Get all comments on the PR
+            const comments = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number
+            });
+            
+            // Find previous benchmark comments by this bot
+            const previousBenchmarkComments = comments.data.filter(comment => 
+              comment.user.type === 'Bot' && 
+              comment.body.includes(benchmarkIdentifier) &&
+              !comment.body.startsWith('> **⚠️ This benchmark result is outdated.')
+            );
+            
+            // Mark previous comments as outdated
+            for (const comment of previousBenchmarkComments) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: comment.id,
+                body: outdatedPrefix + comment.body
+              });
+            }
+            
+            // Create new comment with identifier
+            const newCommentBody = benchmarkIdentifier + '\n' + process.env.BENCHMARK_MESSAGE;
+            
+            await github.rest.issues.createComment({
               owner: context.repo.owner,
               repo: context.repo.repo,
               issue_number: context.issue.number,
-              body: process.env.BENCHMARK_MESSAGE
+              body: newCommentBody
             });
diff --git a/AGENTS.md b/AGENTS.md
index afc22bbac..90a4cf14a 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -117,11 +117,53 @@ regression > 4 %.
 slower) build.  All timing figures in this file were produced from a release
 install unless noted otherwise.
 
-**Correct workflow** (run from a *fresh* R session — do NOT load TreeDist first,
-to avoid the Windows DLL lock):
+**Also beware of stale `.o` files.**  `devtools::load_all()` writes
+debug-compiled object files to `src/`.  A subsequent `install.packages()` sees
+up-to-date timestamps and skips recompilation, silently producing a "release"
+install that contains `-O0` objects.  `bench_ab()` and `bench_release()` now
+clean `src/*.o` and `src/*.dll` before installing to prevent this.
+
+#### Preferred workflow: A/B comparison (`compare-ab.R`)
+
+`compare-ab.R` installs a renamed copy of the package as **TreeDistRef**,
+then installs the current working-tree source as **TreeDist**, and loads
+both into a single `Rscript` subprocess for `bench::mark()` comparison.
+Because both packages share the same process, CPU state / thermals /
+background load affect each measurement equally — eliminating the
+environmental noise that plagued the old `bench_release()` approach.
+
+```r
+source("benchmark/compare-ab.R")
+
+# 1. On the code you want as baseline:
+install_ref()          # installs as TreeDistRef to benchmark/ref_lib/
+
+# 2. Make your changes, then:
+bench_ab()             # installs dev TreeDist, compares both in subprocess
+```
+
+`install_ref()` copies the source, renames the package (DESCRIPTION,
+NAMESPACE, RcppExports symbol prefixes, `R_init_` entry point), cleans
+stale `.o` files, and installs to `benchmark/ref_lib/`.  `bench_ab()`
+does the same for the dev version (to a temp lib), then runs the
+comparison.  Results are saved to `benchmark/results/ab.Rds`.
+
+Note: `compare-ab.R` and `compare-release.R` are local comparison tools
+and intentionally do **not** start with `bench-` — the `bench-` prefix
+is reserved for benchmark test scripts run by CI (`_run_benchmarks.R`).
+
+MSD (MatchingSplitDistance) serves as a built-in **canary**: it does not
+use `add_ic_element`, so any difference on MSD indicates a build-level
+or environmental problem rather than a real code change.
+
+#### Legacy workflow: `bench_release()` / `bench_compare()`
+
+Still available via `benchmark/compare-release.R`.  Useful for recording
+named snapshots, but comparisons across runs are vulnerable to
+environmental drift (~15–20% between sessions on this machine).
 
 ```r
-source("benchmark/bench-release.R")
+source("benchmark/compare-release.R")
 
 # 1. Benchmark HEAD (before your patch)
 bench_release(label = "baseline")
diff --git a/benchmark/compare-ab.R b/benchmark/compare-ab.R
new file mode 100644
index 000000000..60e000925
--- /dev/null
+++ b/benchmark/compare-ab.R
@@ -0,0 +1,189 @@
+# benchmark/compare-ab.R
+#
+# A/B benchmarking: install a "reference" copy of the package as TreeDistRef,
+# then compare it against the current working-tree source in a single process.
+#
+# Both builds use release flags (PKG_CXXFLAGS from Makevars.win, plus
+# ~/.R/Makevars — NO devtools -O0 override).  Both are loaded into the same
+# Rscript subprocess, so CPU state / thermals / background load affect each
+# measurement equally.
+#
+# USAGE:
+#
+#   source("benchmark/compare-ab.R")
+#
+#   # 1. On the code you want as baseline:
+#   install_ref()
+#
+#   # 2. Make your changes, then:
+#   bench_ab()        # installs dev, loads both, compares in subprocess
+#
+# ---------------------------------------------------------------------------
+# Repository root: set the TREEDIST_REPO environment variable to override.
+.REPO     <- normalizePath(Sys.getenv("TREEDIST_REPO", "C:/Users/pjjg18/GitHub/TreeDist"))
+.REF_LIB  <- file.path(.REPO, "benchmark", "ref_lib")  # library for TreeDistRef
+.RESDIR   <- file.path(.REPO, "benchmark", "results")  # saved benchmark results
+
+# ---------------------------------------------------------------------------
+# install_ref()
+#
+# Copies the current source to a temp directory, renames the package to
+# TreeDistRef (so both can coexist in the same R session), and installs
+# to benchmark/ref_lib/.  Invisibly returns the reference library path.
+# ---------------------------------------------------------------------------
+install_ref <- function() {
+  pkg <- file.path(tempdir(), "TreeDistRef_src")
+  if (dir.exists(pkg)) unlink(pkg, recursive = TRUE)  # start from a clean copy
+
+  # Copy only the directories R CMD INSTALL needs (avoids copying .git/,
+  # benchmark/ref_lib/, and other large non-source artifacts).
+  dir.create(pkg, recursive = TRUE)
+  for (d in c("R", "src", "inst", "man", "vignettes", "data", "data-raw")) {
+    src_d <- file.path(.REPO, d)
+    if (dir.exists(src_d)) file.copy(src_d, pkg, recursive = TRUE)
+  }
+  for (f in c("DESCRIPTION", "NAMESPACE", "LICENSE", "LICENSE.md")) {
+    src_f <- file.path(.REPO, f)
+    if (file.exists(src_f)) file.copy(src_f, pkg)
+  }
+
+  # --- Rename package: TreeDist -> TreeDistRef ---
+
+  # Helper: apply gsub(pattern, replacement) to every line of `path`, in place.
+  rewrite <- function(path, pattern, replacement) {
+    lines <- readLines(path, warn = FALSE)
+    lines <- gsub(pattern, replacement, lines)
+    writeLines(lines, path)
+  }
+  rewrite(file.path(pkg, "DESCRIPTION"),  # DESCRIPTION: the Package field only
+          "^Package: TreeDist$", "Package: TreeDistRef")
+
+  # NAMESPACE  (useDynLib, importFrom, etc.); \\b keeps longer names intact
+  rewrite(file.path(pkg, "NAMESPACE"),
+          "\\bTreeDist\\b", "TreeDistRef")
+
+  # R/RcppExports.R  — registered routine symbols: _TreeDist_fn -> _TreeDistRef_fn
+  rewrite(file.path(pkg, "R", "RcppExports.R"),
+          "_TreeDist_", "_TreeDistRef_")
+
+  # src/RcppExports.cpp — routine symbols + DLL init function
+  rewrite(file.path(pkg, "src", "RcppExports.cpp"),
+          "_TreeDist_", "_TreeDistRef_")
+  rewrite(file.path(pkg, "src", "RcppExports.cpp"),
+          "R_init_TreeDist", "R_init_TreeDistRef")
+
+  # Clean any stale object files copied along with src/
+  stale <- Sys.glob(file.path(pkg, "src", c("*.o", "*.dll", "*.so")))
+  if (length(stale)) file.remove(stale)
+
+  # Install the renamed copy into the dedicated reference library
+  dir.create(.REF_LIB, recursive = TRUE, showWarnings = FALSE)
+  message("── Installing reference (TreeDistRef) to ", .REF_LIB, " …")
+  install.packages(pkg, lib = .REF_LIB, repos = NULL, type = "source",
+                   INSTALL_opts = "--no-multiarch", quiet = FALSE)
+  message("── Reference installed.")
+  invisible(.REF_LIB)
+}
+
+# ---------------------------------------------------------------------------
+# bench_ab()
+#
+# Installs the current working-tree source (as TreeDist) to a temp library,
+# then runs an Rscript subprocess that loads both TreeDistRef and TreeDist,
+# compares them with bench::mark(), and invisibly returns the saved results.
+# ---------------------------------------------------------------------------
+bench_ab <- function() {
+  # Verify reference exists
+  if (!file.exists(file.path(.REF_LIB, "TreeDistRef", "NAMESPACE"))) {
+    stop("No reference install found. Run install_ref() first.")
+  }
+
+  # Clean stale .o/.dll from dev source
+  stale <- Sys.glob(file.path(.REPO, "src", c("*.o", "*.dll", "*.so")))
+  if (length(stale)) file.remove(stale)
+
+  # Install dev version to a temp lib
+  dev_lib <- file.path(tempdir(), "TreeDist_bench_dev")
+  dir.create(dev_lib, recursive = TRUE, showWarnings = FALSE)
+  message("── Installing dev (TreeDist) to ", dev_lib, " …")
+  install.packages(.REPO, lib = dev_lib, repos = NULL, type = "source",
+                   INSTALL_opts = "--no-multiarch", quiet = FALSE)
+
+  dir.create(.RESDIR, recursive = TRUE, showWarnings = FALSE)
+  out_rds <- file.path(.RESDIR, "ab.Rds")
+
+  # Build subprocess script (one element of c() per emitted script line)
+  script <- tempfile(fileext = ".R")
+  writeLines(c(
+    sprintf('.libPaths(c("%s", "%s", .libPaths()))',
+            gsub("\\\\", "/", dev_lib),
+            gsub("\\\\", "/", .REF_LIB)),
+    'suppressPackageStartupMessages({',
+    '  library(TreeTools)',
+    '  library(TreeDistRef)',
+    '  library(TreeDist)',
+    '})',
+    '',
+    'cat("TreeDistRef loaded from:", find.package("TreeDistRef"), "\\n")',
+    'cat("TreeDist    loaded from:", find.package("TreeDist"), "\\n\\n")',
+    '',
+    'set.seed(9137)',
+    'trees50  <- as.phylo(0:99,  tipLabels = paste0("t", seq_len(50)))',
+    'trees200 <- as.phylo(0:39,  tipLabels = paste0("t", seq_len(200)))',
+    '',
+    '# --- Correctness check ---',
+    'ref50  <- TreeDistRef::ClusteringInfoDistance(trees50)',
+    'dev50  <- TreeDist::ClusteringInfoDistance(trees50)',
+    'ref200 <- TreeDistRef::ClusteringInfoDistance(trees200)',
+    'dev200 <- TreeDist::ClusteringInfoDistance(trees200)',
+    'cat("Max |ref - dev|  50-tip:", max(abs(as.numeric(ref50)  - as.numeric(dev50))),  "\\n")',
+    'cat("Max |ref - dev| 200-tip:", max(abs(as.numeric(ref200) - as.numeric(dev200))), "\\n\\n")',
+    '',
+    '# --- Benchmarks ---',
+    'b_cid_50 <- bench::mark(',
+    '  ref = TreeDistRef::ClusteringInfoDistance(trees50),',
+    '  dev = TreeDist::ClusteringInfoDistance(trees50),',
+    '  min_iterations = 5, check = FALSE',
+    ')',
+    'cat("CID, 100 trees x 50 tips (4 950 pairs)\\n")',
+    'print(b_cid_50[, c("expression", "min", "median", "mem_alloc", "n_itr")])',
+    'cat("\\n")',
+    '',
+    'b_cid_200 <- bench::mark(',
+    '  ref = TreeDistRef::ClusteringInfoDistance(trees200),',
+    '  dev = TreeDist::ClusteringInfoDistance(trees200),',
+    '  min_iterations = 5, check = FALSE',
+    ')',
+    'cat("CID, 40 trees x 200 tips (780 pairs)\\n")',
+    'print(b_cid_200[, c("expression", "min", "median", "mem_alloc", "n_itr")])',
+    'cat("\\n")',
+    '',
+    'b_msd_50 <- bench::mark(',
+    '  ref = TreeDistRef::MatchingSplitDistance(trees50),',
+    '  dev = TreeDist::MatchingSplitDistance(trees50),',
+    '  min_iterations = 5, check = FALSE',
+    ')',
+    'cat("MSD, 100 trees x 50 tips (4 950 pairs)\\n")',
+    'print(b_msd_50[, c("expression", "min", "median", "mem_alloc", "n_itr")])',
+    'cat("\\n")',
+    '',
+    'b_msd_200 <- bench::mark(',
+    '  ref = TreeDistRef::MatchingSplitDistance(trees200),',
+    '  dev = TreeDist::MatchingSplitDistance(trees200),',
+    '  min_iterations = 5, check = FALSE',
+    ')',
+    'cat("MSD, 40 trees x 200 tips (780 pairs)\\n")',
+    'print(b_msd_200[, c("expression", "min", "median", "mem_alloc", "n_itr")])',
+    'cat("\\n")',
+    '',
+    sprintf('saveRDS(list(cid_50 = b_cid_50, cid_200 = b_cid_200, msd_50 = b_msd_50, msd_200 = b_msd_200), "%s")',
+            gsub("\\\\", "/", out_rds)),
+    sprintf('cat("Results saved to", normalizePath("%s"), "\\n")', gsub("\\\\", "/", out_rds))
+  ), script)
+  # Use this session's own Rscript, and quote the path in case it has spaces.
+  message("── Running A/B comparison in subprocess …")
+  ret <- system2(file.path(R.home("bin"), "Rscript"), shQuote(script))
+  if (ret != 0L) stop("Benchmark subprocess failed (exit code ", ret, ")")
+
+  invisible(readRDS(out_rds))
+}
diff --git a/benchmark/compare-release.R b/benchmark/compare-release.R
new file mode 100644
index 000000000..860b57c7e
--- /dev/null
+++ b/benchmark/compare-release.R
@@ -0,0 +1,128 @@
+# benchmark/compare-release.R
+#
+# Release-quality benchmarking for pairwise distance functions.
+#
+# devtools::load_all() appends -UNDEBUG -g -O0 to the compiler flags, which
+# overrides the -O2 in ~/.R/Makevars and produces an unrepresentative build.
+# This script avoids that by installing the package to a private library via
+# install.packages(..., type = "source") and running each benchmark in a
+# fresh Rscript subprocess (sidestepping Windows DLL lock).
+#
+# USAGE (from a fresh R session — do NOT load TreeDist first):
+#
+#   source("benchmark/compare-release.R")
+#
+#   # Benchmark the current working tree (label defaults to git description)
+#   bench_release()
+#
+#   # Benchmark a specific label (e.g. before applying a patch)
+#   bench_release(label = "baseline")
+#
+#   # Compare two previously saved results
+#   bench_compare("baseline", "scratch-reuse")
+#
+# Results are saved to benchmark/results/<label>.Rds so they persist across
+# sessions and can be compared at any time.
+# ---------------------------------------------------------------------------
+# Repository root: set the TREEDIST_REPO environment variable to override.
+.REPO   <- normalizePath(Sys.getenv("TREEDIST_REPO", "C:/Users/pjjg18/GitHub/TreeDist"))
+.RESDIR <- file.path(.REPO, "benchmark", "results")  # saved <label>.Rds files
+
+# ---------------------------------------------------------------------------
+# bench_release(label)
+# Installs the current working-tree source to a private temp library,
+# runs the pairwise benchmarks in a subprocess, saves and returns results.
+# ---------------------------------------------------------------------------
+bench_release <- function(label = NULL) {
+  if (is.null(label)) {
+    sha   <- tryCatch(
+      trimws(system2("git", c("-C", shQuote(.REPO), "describe", "--always",
+                              "--dirty"), stdout = TRUE, stderr = FALSE)),
+      error = function(e) "unknown", warning = function(w) "unknown"
+    )
+    label <- if (length(sha) == 1L && nzchar(sha)) sha else "unknown"
+  }
+
+  lib_dir <- file.path(tempdir(), paste0("TreeDist_bench_", label))
+  dir.create(lib_dir, recursive = TRUE, showWarnings = FALSE)
+  dir.create(.RESDIR,  recursive = TRUE, showWarnings = FALSE)
+  out_rds <- file.path(.RESDIR, paste0(label, ".Rds"))
+
+  # Remove stale .o / .dll files that devtools::load_all() may have left
+  # behind with debug flags (-O0).  Without this, make sees up-to-date
+  # timestamps and skips recompilation, producing a "release" install that
+  # actually contains -O0 objects.
+  stale <- Sys.glob(file.path(.REPO, "src", c("*.o", "*.dll", "*.so")))
+  if (length(stale)) file.remove(stale)
+
+  message("── Installing '", label, "' to ", lib_dir, " …")
+  install.packages(.REPO, lib = lib_dir, repos = NULL, type = "source",
+                   INSTALL_opts = "--no-multiarch", quiet = FALSE)
+
+  # Run benchmarks in a subprocess so we get a clean DLL load and no
+  # devtools debug flags pollute the timing.
+  script <- tempfile(fileext = ".R")
+  writeLines(c(
+    sprintf('.libPaths(c("%s", .libPaths()))', gsub("\\\\", "/", lib_dir)),
+    'suppressPackageStartupMessages({',
+    '  library(TreeTools)',
+    '  library(TreeDist)',
+    '})',
+    'set.seed(9137)',
+    'trees50  <- as.phylo(0:99,  tipLabels = paste0("t", seq_len(50)))',
+    'trees200 <- as.phylo(0:39,  tipLabels = paste0("t", seq_len(200)))',
+    'b_cid_50  <- bench::mark(ClusteringInfoDist(trees50),  min_iterations = 5)',
+    'b_cid_200 <- bench::mark(ClusteringInfoDist(trees200), min_iterations = 5)',
+    'b_msd_50  <- bench::mark(MatchingSplitDistance(trees50),  min_iterations = 5)',
+    'b_msd_200 <- bench::mark(MatchingSplitDistance(trees200), min_iterations = 5)',
+    sprintf('saveRDS(list(cid_50=b_cid_50, cid_200=b_cid_200, msd_50=b_msd_50, msd_200=b_msd_200), "%s")',
+            gsub("\\\\", "/", out_rds))
+  ), script)
+  # Use this session's own Rscript, and quote the path in case it has spaces.
+  message("── Running benchmarks in subprocess …")
+  ret <- system2(file.path(R.home("bin"), "Rscript"), shQuote(script))
+  if (ret != 0L) stop("Benchmark subprocess failed (exit code ", ret, ")")
+
+  message("── Results saved to ", out_rds)
+  invisible(readRDS(out_rds))
+}
+
+# ---------------------------------------------------------------------------
+# bench_compare(label_a, label_b)
+# Prints a side-by-side table for two saved result sets; returns both invisibly.
+# ---------------------------------------------------------------------------
+bench_compare <- function(label_a, label_b) {
+  # Fetch one saved result set; abort when the label has no .Rds on disk.
+  read_saved <- function(label) {
+    rds <- file.path(.RESDIR, paste0(label, ".Rds"))
+    if (file.exists(rds)) return(readRDS(rds))
+    stop("No results found for label '", label, "': ", rds)
+  }
+  res_a <- read_saved(label_a)
+  res_b <- read_saved(label_b)
+
+  # Scenario key -> human-readable row label, in print order.
+  scenarios <- c(
+    cid_50  = "ClusteringInfoDist  — 100 trees × 50 tips  (4 950 pairs)",
+    cid_200 = "ClusteringInfoDist  — 40 trees  × 200 tips  (780 pairs)",
+    msd_50  = "MatchingSplitDist   — 100 trees × 50 tips  (4 950 pairs)",
+    msd_200 = "MatchingSplitDist   — 40 trees  × 200 tips  (780 pairs)"
+  )
+
+  header <- sprintf("\n%-55s  %9s  %9s  %8s\n",
+                    "Scenario", label_a, label_b, "speedup")
+  cat(header)
+  cat(strrep("─", 85), "\n")
+
+  for (key in names(scenarios)) {
+    # bench::mark medians are bench_time values held in seconds; show ms.
+    ms_a <- as.numeric(res_a[[key]]$median) * 1e3
+    ms_b <- as.numeric(res_b[[key]]$median) * 1e3
+    gain <- (ms_a - ms_b) / ms_a * 100
+    row  <- sprintf("%-55s  %7.1f ms  %7.1f ms  %+6.1f%%\n",
+                    scenarios[[key]], ms_a, ms_b, gain)
+    cat(row)
+  }
+  cat("\nNote: positive speedup = ", label_b, " is faster\n", sep = "")
+  invisible(list(a = res_a, b = res_b))
+}

From 4864d16db7c6f2d7f20fd064feffb42363e2f35b Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Thu, 12 Mar 2026 08:07:09 +0000
Subject: [PATCH 15/19] teardown instead of withr::

---
 tests/testthat/setup.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
index d3da6c074..a252d660d 100644
--- a/tests/testthat/setup.R
+++ b/tests/testthat/setup.R
@@ -4,4 +4,4 @@
 # vdiffr opens its own svglite device on top of this one, so snapshot
 # tests are unaffected.
 pdf(NULL)
-withr::defer(grDevices::dev.off(), teardown_env())
+teardown(dev.off())

From 387a408a7c38a60893fd56e3e2dec1ad58e54589 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Thu, 12 Mar 2026 08:28:37 +0000
Subject: [PATCH 16/19] Catch large-tree error

---
 src/tree_distance_functions.cpp          |  2 +-
 tests/testthat/test-pairwise_distances.R | 25 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/tree_distance_functions.cpp b/src/tree_distance_functions.cpp
index aac34a482..4a3b77e1f 100644
--- a/src/tree_distance_functions.cpp
+++ b/src/tree_distance_functions.cpp
@@ -26,7 +26,7 @@ __attribute__((constructor))
     assert(lg2_rooted[0] == 0);
     assert(lg2_rooted[1] == 0);
     for (int32 i = 2; i != SL_MAX_TIPS + SL_MAX_TIPS - 2; ++i) {
-      lg2_double_factorial[i] = lg2_double_factorial[i - 2] + lg2[i];
+      lg2_double_factorial[i] = lg2_double_factorial[i - 2] + log2(i);
     }
     for (int32 i = 3; i != SL_MAX_TIPS + 2; ++i) {
       lg2_unrooted[i] = lg2_double_factorial[i + i - 5];
diff --git a/tests/testthat/test-pairwise_distances.R b/tests/testthat/test-pairwise_distances.R
index 499dc21ab..0c18d0dca 100644
--- a/tests/testthat/test-pairwise_distances.R
+++ b/tests/testthat/test-pairwise_distances.R
@@ -61,3 +61,28 @@ test_that("cpp_mutual_clustering_all_pairs: unequal split counts (lines 94, 131-
   expect_equal(r[3, 1], MutualClusteringInfo(tr_1a, tr_1b), tolerance = 1e-9)
   expect_equal(r[3, 2], MutualClusteringInfo(tr_3,  tr_1b), tolerance = 1e-9)
 })
+
+test_that("Large trees: batch and single-pair paths agree (lg2 table bounds)", {
+  # Exercise the lg2 / lg2_double_factorial / lg2_unrooted tables at indices
+
+  # well beyond small-tree tests.  Catches out-of-bounds table access and
+  # ensures the log2 decomposition in add_ic_element remains correct for
+  # trees with many splits.
+  skip_on_cran()
+  set.seed(4728)
+  trees <- ape::as.phylo(0:4, tipLabels = paste0("t", seq_len(200)))
+
+  batch <- MutualClusteringInfo(trees)
+  n <- length(trees)
+  pairs <- which(upper.tri(batch), arr.ind = TRUE)
+  for (k in seq_len(nrow(pairs))) {
+    i <- pairs[k, 1]
+    j <- pairs[k, 2]
+    expect_equal(
+      batch[i, j],
+      MutualClusteringInfo(trees[[i]], trees[[j]]),
+      tolerance = 1e-8,
+      label = paste0("pair (", i, ",", j, ")")
+    )
+  }
+})

From 530050e73d3b6a339476f527343493da6e3f8e41 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Thu, 12 Mar 2026 08:54:09 +0000
Subject: [PATCH 17/19] Benchmark on check ok

---
 .github/workflows/R-CMD-check.yml | 127 ++++++++++++++++++++++++
 .github/workflows/benchmark.yml   | 155 ------------------------------
 2 files changed, 127 insertions(+), 155 deletions(-)
 delete mode 100644 .github/workflows/benchmark.yml

diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml
index 4c5033a2d..67d431d60 100644
--- a/.github/workflows/R-CMD-check.yml
+++ b/.github/workflows/R-CMD-check.yml
@@ -227,3 +227,130 @@ jobs:
         with:
           deps: ${{ matrix.config.deps }}
           extra-packages: ms609/TreeDistData
+
+  benchmark:
+    runs-on: ubuntu-latest
+    needs: check-release
+    if: github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch'
+    name: benchmark
+
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      RSPM: "https://packagemanager.posit.co/cran/__linux__/noble/latest"
+      _R_CHECK_BUILD_VIGNETTES_: false
+      _R_CHECK_CRAN_INCOMING_: false
+      _R_CHECK_FORCE_SUGGESTS_: false
+      R_REMOTES_STANDALONE: true
+      R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
+      R_REALLY_FORCE_SYMBOLS: true
+      PKG_CXXFLAGS: "-O3"
+
+    steps:
+      - name: Checkout PR branch
+        uses: actions/checkout@v5
+        with:
+          path: pr
+
+      - name: Checkout target branch
+        uses: actions/checkout@v5
+        with:
+          ref: ${{ github.event.pull_request.base.ref || 'main' }}
+          path: main
+
+      - name: Set up R
+        uses: r-lib/actions/setup-r@v2
+        with:
+          extra-repositories: https://ms609.github.io/packages/
+
+      - name: Install R dependencies
+        uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          dependencies: '"hard"'
+          extra-packages: |
+            github::ms609/TreeTools
+          needs: benchmark
+
+      - name: Benchmark PR
+        uses: ms609/actions/benchmark-step@main
+        with:
+          path: pr
+          output: pr
+
+      - name: Benchmark target
+        uses: ms609/actions/benchmark-step@main
+        with:
+          path: main
+          output: main
+
+      - name: Benchmark PR again
+        uses: ms609/actions/benchmark-step@main
+        with:
+          path: pr
+          output: pr2
+
+      - run: dir pr-benchmark-results
+        working-directory: pr
+      - run: dir main-benchmark-results
+        working-directory: pr
+
+      - name: Compare benchmarks
+        id: compare_results
+        working-directory: pr
+        run: |
+          Rscript benchmark/_compare_results.R
+        shell: bash
+
+      - name: Upload PR benchmark results
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: pr-benchmark-results
+          path: pr/pr-benchmark-results/*.bench.Rds
+
+      - name: Upload main benchmark results
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: main-benchmark-results
+          path: pr/main-benchmark-results/*.bench.Rds
+
+      - name: Comment on PR
+        if: always() && github.event_name == 'pull_request' && steps.compare_results.outputs.report != ''
+        uses: actions/github-script@v7
+        env:
+          BENCHMARK_MESSAGE: ${{ steps.compare_results.outputs.report }}
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const benchmarkIdentifier = '<!-- benchmark-comment -->';
+            const outdatedPrefix = '> **⚠️ This benchmark result is outdated. See the latest comment below.**\n\n';
+
+            const comments = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number
+            });
+
+            const previousBenchmarkComments = comments.data.filter(comment =>
+              comment.user.type === 'Bot' &&
+              comment.body.includes(benchmarkIdentifier) &&
+              !comment.body.startsWith('> **⚠️ This benchmark result is outdated.')
+            );
+
+            for (const comment of previousBenchmarkComments) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: comment.id,
+                body: outdatedPrefix + comment.body
+              });
+            }
+
+            const newCommentBody = benchmarkIdentifier + '\n' + process.env.BENCHMARK_MESSAGE;
+
+            await github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              body: newCommentBody
+            });
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
deleted file mode 100644
index bb576aa8e..000000000
--- a/.github/workflows/benchmark.yml
+++ /dev/null
@@ -1,155 +0,0 @@
-name: Benchmark
-
-on:
-  workflow_dispatch:
-  workflow_run:
-    workflows: ["R-CMD-check"]
-    types:
-      - completed
-    paths:
-      - "src/**"
-      - "R/**"
-      - "**.R"
-      - "**.cpp"
-      - "**.c"
-      - "**.h"
-      - "**.hpp"
-      - "configure*"
-      - "Makevars*"
-      - "**benchmark.yml"
-      - "!memcheck**"
-      - "!docs**"
-      - "!inst**"
-      - "!man**"
-      - "!man-roxygen**"
-      - "!memcheck**"
-      - "!tests**"
-      - "!vignettes**"
-
-jobs:
-  benchmark:
-    runs-on: ubuntu-latest
-    
-    env:
-      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
-      RSPM: "https://packagemanager.rstudio.com/cran/__linux__/noble/latest"
-      _R_CHECK_BUILD_VIGNETTES_: false
-      _R_CHECK_CRAN_INCOMING_: false
-      _R_CHECK_FORCE_SUGGESTS_: false # CRAN settings
-      R_REMOTES_STANDALONE: true
-      R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
-      R_REALLY_FORCE_SYMBOLS: true # Until R4.3
-      
-    steps:
-      - name: Checkout PR branch
-        uses: actions/checkout@v5
-        with:
-          path: pr
-          
-      - name: Checkout target branch
-        uses: actions/checkout@v5
-        with:
-          ref: ${{ github.event.pull_request.base.ref || 'main' }}
-          path: main
-
-      - name: Set up R
-        uses: r-lib/actions/setup-r@v2
-        with:
-          extra-repositories: https://ms609.github.io/packages/
-        
-      - name: Install R dependencies
-        uses: r-lib/actions/setup-r-dependencies@v2
-        with:
-          dependencies: '"hard"'
-          extra-packages: |
-            github::ms609/TreeTools
-          needs: benchmark
-          
-      - name: Benchmark PR
-        uses: ms609/actions/benchmark-step@main
-        with:
-          path: pr
-          output: pr
-        
-      - name: Benchmark target
-        uses: ms609/actions/benchmark-step@main
-        with: 
-          path: main
-          output: main
-          
-      - name: Benchmark PR again
-        uses: ms609/actions/benchmark-step@main
-        with: 
-          path: pr
-          output: pr2
-        
-      - run: dir pr-benchmark-results
-        working-directory: pr
-      - run: dir main-benchmark-results
-        working-directory: pr
-
-      - name: Compare benchmarks
-        id: compare_results
-        working-directory: pr
-        run: |
-          Rscript benchmark/_compare_results.R
-        shell: bash
-        
-      - name: Upload PR benchmark results
-        if: failure()
-        uses: actions/upload-artifact@v4
-        with:
-          name: pr-benchmark-results
-          path: pr/pr-benchmark-results/*.bench.Rds
-            
-      - name: Upload main benchmark results
-        if: failure()
-        uses: actions/upload-artifact@v4
-        with:
-          name: main-benchmark-results
-          path: pr/main-benchmark-results/*.bench.Rds
-        
-      - name: Comment on PR
-        if: always() && github.event_name == 'pull_request' && steps.compare_results.outputs.report != ''
-        uses: actions/github-script@v7
-        env:
-          BENCHMARK_MESSAGE: ${{ steps.compare_results.outputs.report }}
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          script: |
-            const benchmarkIdentifier = '<!-- benchmark-comment -->';
-            const outdatedPrefix = '> **⚠️ This benchmark result is outdated. See the latest comment below.**\n\n';
-            
-            // Get all comments on the PR
-            const comments = await github.rest.issues.listComments({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              issue_number: context.issue.number
-            });
-            
-            // Find previous benchmark comments by this bot
-            const previousBenchmarkComments = comments.data.filter(comment => 
-              comment.user.type === 'Bot' && 
-              comment.body.includes(benchmarkIdentifier) &&
-              !comment.body.startsWith('> **⚠️ This benchmark result is outdated.')
-            );
-            
-            // Mark previous comments as outdated
-            for (const comment of previousBenchmarkComments) {
-              await github.rest.issues.updateComment({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                comment_id: comment.id,
-                body: outdatedPrefix + comment.body
-              });
-            }
-            
-            // Create new comment with identifier
-            const newCommentBody = benchmarkIdentifier + '\n' + process.env.BENCHMARK_MESSAGE;
-            
-            await github.rest.issues.createComment({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              issue_number: context.issue.number,
-              body: newCommentBody
-            });

From cdc8cd905955313ebf1f4e5102604ea68bc55318 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Thu, 12 Mar 2026 09:06:33 +0000
Subject: [PATCH 18/19] rm teardown

---
 DESCRIPTION            |  1 +
 codemeta.json          | 26 +++++++++++++++++++-------
 tests/testthat/setup.R |  2 +-
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 59fbaebda..a300e7bbb 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -79,6 +79,7 @@ Suggests:
   TreeSearch (>= 1.4.0),
   Umatrix,
   vdiffr (>= 1.0.0),
+  withr,
 LinkingTo:
   Rcpp,
   TreeTools (>= 2.1.0),
diff --git a/codemeta.json b/codemeta.json
index e0bd9bb74..8311bde00 100644
--- a/codemeta.json
+++ b/codemeta.json
@@ -14,7 +14,7 @@
     "name": "R",
     "url": "https://r-project.org"
   },
-  "runtimePlatform": "R version 4.5.2 (2025-10-31)",
+  "runtimePlatform": "R version 4.5.2 (2025-10-31 ucrt)",
   "provider": {
     "@id": "https://cran.r-project.org",
     "@type": "Organization",
@@ -28,11 +28,6 @@
       "familyName": "Smith",
       "email": "martin.smith@durham.ac.uk",
       "@id": "https://orcid.org/0000-0001-5660-1727"
-    },
-    {
-      "@type": "Person",
-      "givenName": "Neil",
-      "familyName": "Kaye"
     }
   ],
   "contributor": [
@@ -72,6 +67,11 @@
       "@type": "Person",
       "givenName": "Yi",
       "familyName": "Cao"
+    },
+    {
+      "@type": "Person",
+      "givenName": "Neil",
+      "familyName": "Kaye"
     }
   ],
   "maintainer": [
@@ -388,6 +388,18 @@
         "url": "https://cran.r-project.org"
       },
       "sameAs": "https://CRAN.R-project.org/package=vdiffr"
+    },
+    {
+      "@type": "SoftwareApplication",
+      "identifier": "withr",
+      "name": "withr",
+      "provider": {
+        "@id": "https://cran.r-project.org",
+        "@type": "Organization",
+        "name": "Comprehensive R Archive Network (CRAN)",
+        "url": "https://cran.r-project.org"
+      },
+      "sameAs": "https://CRAN.R-project.org/package=withr"
     }
   ],
   "softwareRequirements": {
@@ -493,7 +505,7 @@
     "SystemRequirements": "C++17, pandoc-citeproc"
   },
   "keywords": ["phylogenetics", "tree-distance"],
-  "fileSize": "1793.872KB",
+  "fileSize": "12501.765KB",
   "citation": [
     {
       "@type": "ScholarlyArticle",
diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
index a252d660d..aaf28df46 100644
--- a/tests/testthat/setup.R
+++ b/tests/testthat/setup.R
@@ -4,4 +4,4 @@
 # vdiffr opens its own svglite device on top of this one, so snapshot
 # tests are unaffected.
 pdf(NULL)
-teardown(dev.off())
+withr::defer(dev.off(), teardown_env())

From 28dc20d13d9d759d494cba94a7ebb78926d4ae02 Mon Sep 17 00:00:00 2001
From: RevBayes analysis <1695515+ms609@users.noreply.github.com>
Date: Thu, 12 Mar 2026 09:12:28 +0000
Subject: [PATCH 19/19] Code coverage

---
 AGENTS.md                                | 61 +++++++++++++++++++++++-
 tests/testthat/test-pairwise_distances.R | 52 +++++++++++++++++---
 2 files changed, 106 insertions(+), 7 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 90a4cf14a..ea631ff5c 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -323,6 +323,64 @@ allocation overhead is buried in the Dijkstra phase, which dominates throughout.
 The implementation was reverted because it added substantial complexity for
 zero measured benefit.
 
+#### Fused IC calculation in `mutual_clustering_score` (attempted, reverted)
+
+Attempted to reduce integer arithmetic in the `add_ic_element` inner loop by
+replacing the 4 independent `add_ic_element()` calls (8 integer multiplies
+total: `nkK * n_tips` + `nk * nK` per call) with a fused inline block
+requiring only 2 multiplies.  The remaining 6 products were derived via
+addition/subtraction from precomputed `na_n = na * n_tips`, `nA_n`,
+`nb_n_arr[]`, `n_sq`, and `den_ab = na * nb`.  Correctness was verified
+(max |batch − serial| ≈ 1e-14).
+
+**Release-build (`-O2`) benchmarks showed no reliable improvement (−13% to
++10% across scenarios, within run-to-run noise).**  The bottleneck in the IC
+loop is **memory-bound** (random lookups into the `lg2[]` table), not
+compute-bound.  Modern x86 integer multiply has 1-cycle throughput, so
+saving 6 multiplies per split pair (~6 cycles) is invisible next to lg2
+table lookup latency (~4 cycles per L1 hit, much more on L2/L3).
+
+Key lesson: `add_ic_element` and the cost-matrix filling loop are dominated
+by `lg2[]` table access, not by integer arithmetic.  Future optimisations
+should target the table's working set size or access pattern rather than
+the surrounding arithmetic.
+
+#### lg2 table shrink: 32 MB → 16 KB via log2 decomposition
+
+The `lg2[]` lookup table stored `log2(i)` for `i = 0..(SL_MAX_TIPS−1)²`,
+requiring 4.2 M doubles = **32 MB**.  `add_ic_element` accessed it with
+product indices `lg2[nkK * n_tips]` and `lg2[nk * nK]`, so the working set
+for 200-tip trees was ~320 KB (indices up to 40 000) — too large for L1.
+
+**Fix:** decompose `log2(a × b) = log2(a) + log2(b)`:
+
+```cpp
+// Before: lg2[nkK * n_tips] - lg2[nk * nK]       (indices up to n²)
+// After:  lg2[nkK] + lg2_n  - lg2[nk] - lg2[nK]  (indices ≤ n_tips)
+```
+
+The table now has only `SL_MAX_TIPS + 1` entries (2 049 doubles = **16 KB**),
+trivially fitting in L1 for any tree size.  `lg2_n = lg2[n_tips]` is
+precomputed once per distance call and passed to `add_ic_element` as a
+new parameter.
+
+**Files changed:**
+- `tree_distances.h`: table declaration shrunk; `add_ic_element` gains
+  `lg2_n` parameter and uses decomposed lookups.
+- `tree_distance_functions.cpp`: `LG2_SIZE` reduced to `SL_MAX_TIPS + 1`.
+- `pairwise_distances.cpp`, `tree_distances.cpp`: callers updated to
+  precompute and pass `lg2_n`.
+
+**A/B benchmark (release build, same-process comparison via `compare-ab.R`):**
+
+| Scenario | ref | dev | Change |
+|---|---|---|---|
+| CID 100 × 50-tip | 75.4 ms | 71.2 ms | **−5.6%** |
+| CID 40 × 200-tip | 278 ms | 265 ms | **−4.7%** |
+| MSD 100 × 50-tip (canary) | 85.0 ms | 85.6 ms | +0.7% |
+| MSD 40 × 200-tip (canary) | 402 ms | 407 ms | +1.2% |
+
+Numerical accuracy unchanged (max |ref − dev| ≈ 5.7 × 10⁻¹⁴).
 
 #### OpenMP rollout to all remaining distance metrics (this dev cycle)
 
@@ -415,6 +473,7 @@ of the null device, so snapshot tests are unaffected.
   bandwidth further.
 - SPR distance (`spr.cpp`, `spr_lookup.cpp`): the algorithm is relatively recent
   (v2.8.0); profiling under VTune may reveal further hot spots.
-- OpenMP for other metrics: **DONE** — see "Completed Optimizations" below.
+- OpenMP for other metrics: **DONE** — see "Completed Optimizations" above.
+- lg2 table working set: **DONE** — shrunk 32 MB → 16 KB; see "lg2 table shrink" above.
 - Large-tree path (`int32` migration, v2.12.0 dev): ensure new code paths are as
   optimized as the original `int16` paths.
diff --git a/tests/testthat/test-pairwise_distances.R b/tests/testthat/test-pairwise_distances.R
index 0c18d0dca..b03784a94 100644
--- a/tests/testthat/test-pairwise_distances.R
+++ b/tests/testthat/test-pairwise_distances.R
@@ -8,11 +8,11 @@
 ## values.  The off-diagonal entries equal the raw MCI scores from the batch C++
 ## function.
 
-test_that("cpp_mutual_clustering_all_pairs: orthogonal splits score 0 (line 82)", {
+test_that("cpp_mutual_clustering_all_pairs: orthogonal splits score 0", {
   # Two 8-tip trees, each with exactly one internal split.
   # The splits cross orthogonally: every quadrant contains exactly 2 tips,
   # so a_and_b == A_and_b == a_and_B == A_and_B == 2.
-  # This triggers the rounding-error guard (score = max_score) at line 82, and
+  # This triggers the rounding-error guard (score = max_score), and
   # the LAP assigns that sole pair, yielding MCI = 0.
   tr1 <- ape::read.tree(text = "((t1,t2,t3,t4),(t5,t6,t7,t8));")
   tr2 <- ape::read.tree(text = "((t1,t2,t5,t6),(t3,t4,t7,t8));")
@@ -26,7 +26,7 @@ test_that("cpp_mutual_clustering_all_pairs: orthogonal splits score 0 (line 82)"
   expect_equal(r[2, 1], MutualClusteringInfo(tr1, tr2), tolerance = 1e-10)
 })
 
-test_that("cpp_mutual_clustering_all_pairs: unequal split counts (lines 94, 131-138)", {
+test_that("cpp_mutual_clustering_all_pairs: unequal split counts", {
   # Three 6-tip trees with different numbers of non-trivial splits:
   #
   #   tr_1a — 1 split:  {t1,t2,t3} | {t4,t5,t6}
@@ -38,12 +38,12 @@ test_that("cpp_mutual_clustering_all_pairs: unequal split counts (lines 94, 131-
   #   (col=0 → a=tr_1a[1 split], row=1 → b=tr_3[3 splits]):
   #     a_has_more = FALSE; most_splits = 3.
   #     No split pairs are exact matches, so exact_n = 0 → else branch:
-  #       loop fills phantom rows ai=1,2 with max_score        (lines 131–134)
-  #       LAP solves the full 3×3 matrix                       (lines 136–138)
+  #       loop fills phantom rows ai=1,2 with max_score
+  #       LAP solves the full 3×3 matrix
   #
   #   (col=1 → a=tr_3[3 splits], row=2 → b=tr_1b[1 split]):
   #     a_has_more = TRUE; most_splits = 3, b.n_splits = 1.
-  #     For every ai=0..2: padRowAfterCol(ai, 1, max_score)    (line 94)
+  #     For every ai=0..2: padRowAfterCol(ai, 1, max_score)
 
   tr_1a <- ape::read.tree(text = "((t1,t2,t3),(t4,t5,t6));")
   tr_3  <- ape::read.tree(text = "(((t1,t4),(t2,t5)),(t3,t6));")
@@ -62,6 +62,46 @@ test_that("cpp_mutual_clustering_all_pairs: unequal split counts (lines 94, 131-
   expect_equal(r[3, 2], MutualClusteringInfo(tr_3,  tr_1b), tolerance = 1e-9)
 })
 
+test_that("cpp_rf_info_all_pairs: multi-bin complement (> 64 tips)", {
+  # Exercises the b_complement[i][bin] = ~b.state[i][bin] assignment, which
+  # only runs when n_bins >= 2 (trees with > 64 tips on 64-bit platforms).
+  skip_on_cran()
+  trees <- ape::as.phylo(0:2, tipLabels = paste0("t", seq_len(100)))
+  r <- InfoRobinsonFoulds(trees)
+  expect_true(all(r >= 0))
+  expect_equal(as.matrix(r)[2, 1],
+               InfoRobinsonFoulds(trees[[1]], trees[[2]]),
+               tolerance = 1e-10)
+})
+
+test_that("cpp_jaccard_all_pairs: allow_conflict = FALSE", {
+  # Random trees have conflicting splits, so with allowConflict = FALSE
+  # those pairs score max_score.
+  trees <- ape::as.phylo(0:2, tipLabels = paste0("t", seq_len(20)))
+  r <- JaccardRobinsonFoulds(trees, allowConflict = FALSE)
+  expect_true(all(r >= 0))
+  expect_equal(as.matrix(r)[2, 1],
+               JaccardRobinsonFoulds(trees[[1]], trees[[2]],
+                                     allowConflict = FALSE),
+               tolerance = 1e-10)
+})
+
+test_that("cpp_jaccard_all_pairs: k = Inf and k != 1", {
+  trees <- ape::as.phylo(0:2, tipLabels = paste0("t", seq_len(12)))
+  # k = Inf triggers the isinf branch
+  r_inf <- JaccardRobinsonFoulds(trees, k = Inf)
+  expect_true(all(r_inf >= 0))
+  expect_equal(as.matrix(r_inf)[2, 1],
+               JaccardRobinsonFoulds(trees[[1]], trees[[2]], k = Inf),
+               tolerance = 1e-10)
+  # k = 2 triggers the pow() branch
+  r_k2 <- JaccardRobinsonFoulds(trees, k = 2)
+  expect_true(all(r_k2 >= 0))
+  expect_equal(as.matrix(r_k2)[2, 1],
+               JaccardRobinsonFoulds(trees[[1]], trees[[2]], k = 2),
+               tolerance = 1e-10)
+})
+
 test_that("Large trees: batch and single-pair paths agree (lg2 table bounds)", {
   # Exercise the lg2 / lg2_double_factorial / lg2_unrooted tables at indices