From bcac5d15f6e9b7b2cbb61d9725ea29007d1c6426 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 17:20:49 -1000 Subject: [PATCH 01/61] treedb: expose vlog generation maintenance bytes in stats --- TreeDB/caching/db.go | 216 +++++++++++------- TreeDB/caching/expvar_stats.go | 1 + TreeDB/caching/expvar_stats_test.go | 4 + .../caching/vlog_generation_scheduler_test.go | 47 ++++ 4 files changed, 184 insertions(+), 84 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index b38354bec..6a1255668 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5142,90 +5142,94 @@ type DB struct { valueLogMaxSegmentBytes int64 journalCompression bool - disableJournal bool - relaxedSync bool - notifyError func(error) - debugFlushPointers bool - debugFlushTiming bool - debugPtrEligible atomic.Int64 - debugPtrUsed atomic.Int64 - debugPtrNoPtr atomic.Int64 - debugPtrDenied atomic.Int64 - debugPtrDisabled atomic.Int64 - checkpointRuns atomic.Uint64 - checkpointTotalNs atomic.Uint64 - checkpointMaxNs atomic.Uint64 - checkpointNoopSkips atomic.Uint64 - checkpointFlushMuWaitNs atomic.Uint64 - checkpointFlushMuWaitMaxNs atomic.Uint64 - checkpointAutoVacuumRuns atomic.Uint64 - checkpointAutoVacuumLastCheckRun atomic.Uint64 - checkpointAutoVacuumLastPages atomic.Uint64 - checkpointAutoVacuumLastInternalP50 atomic.Uint64 - checkpointAutoVacuumLastInternalAvg atomic.Uint64 - lastForegroundWriteUnixNano atomic.Int64 - lastForegroundReadUnixNano atomic.Int64 - foregroundReadStampCounter atomic.Uint32 - activeForegroundIterators atomic.Int64 - retainedPruneLastStartUnixNano atomic.Int64 - retainedPruneMu sync.Mutex - retainedPruneDone chan struct{} - vlogGenerationRemapSuccesses atomic.Uint64 - vlogGenerationRemapFailures atomic.Uint64 - vlogGenerationRewriteBytesIn atomic.Uint64 - vlogGenerationRewriteBytesOut atomic.Uint64 - vlogGenerationRewriteRuns atomic.Uint64 - vlogGenerationRewritePlanRuns atomic.Uint64 - vlogGenerationRewritePlanCanceled atomic.Uint64 
- vlogGenerationRewritePlanErrors atomic.Uint64 - vlogGenerationRewritePlanEmpty atomic.Uint64 - vlogGenerationRewritePlanSelected atomic.Uint64 - vlogGenerationRewritePlanCanceledLastNS atomic.Int64 - vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 - vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool - vlogGenerationRewriteIneffectiveLastNS atomic.Int64 - vlogGenerationRewriteIneffectiveRuns atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 - vlogGenerationRewriteCanceledRuns atomic.Uint64 - vlogGenerationRewriteCanceledLastNS atomic.Int64 - vlogGenerationRewriteQueuePruneRuns atomic.Uint64 - vlogGenerationRewriteQueuePruneIDs atomic.Uint64 - vlogGenerationGCSegmentsDeleted atomic.Uint64 - vlogGenerationGCBytesDeleted atomic.Uint64 - vlogGenerationGCRuns atomic.Uint64 - vlogGenerationVacuumRuns atomic.Uint64 - vlogGenerationVacuumFailures atomic.Uint64 - vlogGenerationLastVacuumUnixNano atomic.Int64 - vlogGenerationLastRewritePlanUnixNano atomic.Int64 - vlogGenerationLastRewriteUnixNano atomic.Int64 - vlogGenerationLastGCUnixNano atomic.Int64 - vlogGenerationLastCheckpointKickUnixNano atomic.Int64 - vlogGenerationLastGCDryRunUnixNano atomic.Int64 - vlogGenerationLastGCDryRunBytesEligible atomic.Int64 - vlogGenerationLastGCDryRunSegsEligible atomic.Int64 - vlogGenerationChurnBytes atomic.Uint64 - vlogGenerationSchedulerState atomic.Uint32 - vlogGenerationMaintenanceActive atomic.Bool - vlogGenerationLastReason atomic.Uint32 - vlogGenerationCheckpointKickRuns atomic.Uint64 - vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 - vlogGenerationCheckpointKickGCRuns atomic.Uint64 - vlogGenerationCheckpointKickPending atomic.Bool - vlogGenerationDeferredMaintenancePending atomic.Bool - vlogGenerationDeferredMaintenanceRunning atomic.Bool - vlogGenerationRewriteStageWakeObservedNS atomic.Int64 - vlogGenerationRewriteQueueMu sync.Mutex - vlogGenerationCheckpointKickActive atomic.Bool - 
vlogGenerationRewriteQueue []uint32 - vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment - vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty - vlogGenerationRewriteStagePending bool - vlogGenerationRewriteStageObservedUnixNano int64 - vlogGenerationRewriteQueueLoaded bool - vlogGenerationLastChurnBps atomic.Int64 - vlogGenerationLastChurnSampleBytes atomic.Uint64 - vlogGenerationLastChurnSampleNS atomic.Int64 + disableJournal bool + relaxedSync bool + notifyError func(error) + debugFlushPointers bool + debugFlushTiming bool + debugPtrEligible atomic.Int64 + debugPtrUsed atomic.Int64 + debugPtrNoPtr atomic.Int64 + debugPtrDenied atomic.Int64 + debugPtrDisabled atomic.Int64 + checkpointRuns atomic.Uint64 + checkpointTotalNs atomic.Uint64 + checkpointMaxNs atomic.Uint64 + checkpointNoopSkips atomic.Uint64 + checkpointFlushMuWaitNs atomic.Uint64 + checkpointFlushMuWaitMaxNs atomic.Uint64 + checkpointAutoVacuumRuns atomic.Uint64 + checkpointAutoVacuumLastCheckRun atomic.Uint64 + checkpointAutoVacuumLastPages atomic.Uint64 + checkpointAutoVacuumLastInternalP50 atomic.Uint64 + checkpointAutoVacuumLastInternalAvg atomic.Uint64 + lastForegroundWriteUnixNano atomic.Int64 + lastForegroundReadUnixNano atomic.Int64 + foregroundReadStampCounter atomic.Uint32 + activeForegroundIterators atomic.Int64 + retainedPruneLastStartUnixNano atomic.Int64 + retainedPruneMu sync.Mutex + retainedPruneDone chan struct{} + vlogGenerationRemapSuccesses atomic.Uint64 + vlogGenerationRemapFailures atomic.Uint64 + vlogGenerationRewriteBytesIn atomic.Uint64 + vlogGenerationRewriteBytesOut atomic.Uint64 + vlogGenerationRewriteReclaimedBytes atomic.Uint64 + vlogGenerationRewriteRuns atomic.Uint64 + vlogGenerationRewritePlanRuns atomic.Uint64 + vlogGenerationRewritePlanCanceled atomic.Uint64 + vlogGenerationRewritePlanErrors atomic.Uint64 + vlogGenerationRewritePlanEmpty atomic.Uint64 + vlogGenerationRewritePlanSelected atomic.Uint64 + 
vlogGenerationRewritePlanSelectedBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 + vlogGenerationRewritePlanCanceledLastNS atomic.Int64 + vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 + vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool + vlogGenerationRewriteIneffectiveLastNS atomic.Int64 + vlogGenerationRewriteIneffectiveRuns atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 + vlogGenerationRewriteCanceledRuns atomic.Uint64 + vlogGenerationRewriteCanceledLastNS atomic.Int64 + vlogGenerationRewriteQueuePruneRuns atomic.Uint64 + vlogGenerationRewriteQueuePruneIDs atomic.Uint64 + vlogGenerationGCSegmentsDeleted atomic.Uint64 + vlogGenerationGCBytesDeleted atomic.Uint64 + vlogGenerationGCRuns atomic.Uint64 + vlogGenerationVacuumRuns atomic.Uint64 + vlogGenerationVacuumFailures atomic.Uint64 + vlogGenerationLastVacuumUnixNano atomic.Int64 + vlogGenerationLastRewritePlanUnixNano atomic.Int64 + vlogGenerationLastRewriteUnixNano atomic.Int64 + vlogGenerationLastGCUnixNano atomic.Int64 + vlogGenerationLastCheckpointKickUnixNano atomic.Int64 + vlogGenerationLastGCDryRunUnixNano atomic.Int64 + vlogGenerationLastGCDryRunBytesEligible atomic.Int64 + vlogGenerationLastGCDryRunSegsEligible atomic.Int64 + vlogGenerationChurnBytes atomic.Uint64 + vlogGenerationSchedulerState atomic.Uint32 + vlogGenerationMaintenanceActive atomic.Bool + vlogGenerationLastReason atomic.Uint32 + vlogGenerationCheckpointKickRuns atomic.Uint64 + vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 + vlogGenerationCheckpointKickGCRuns atomic.Uint64 + vlogGenerationCheckpointKickPending atomic.Bool + vlogGenerationDeferredMaintenancePending atomic.Bool + vlogGenerationDeferredMaintenanceRunning atomic.Bool + vlogGenerationRewriteStageWakeObservedNS atomic.Int64 + vlogGenerationRewriteQueueMu sync.Mutex + 
vlogGenerationCheckpointKickActive atomic.Bool + vlogGenerationRewriteQueue []uint32 + vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment + vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty + vlogGenerationRewriteStagePending bool + vlogGenerationRewriteStageObservedUnixNano int64 + vlogGenerationRewriteQueueLoaded bool + vlogGenerationLastChurnBps atomic.Int64 + vlogGenerationLastChurnSampleBytes atomic.Uint64 + vlogGenerationLastChurnSampleNS atomic.Int64 // Rewrite budget token bucket (bytes) for online maintenance. This lets us // interpret ValueLogRewriteBudgetBytesPerSec as a true per-second bandwidth // budget while still running maintenance at coarse intervals. @@ -12431,6 +12435,43 @@ func (db *DB) observeVlogGenerationRewritePlanOutcome(plan backenddb.ValueLogRew } if len(plan.SourceFileIDs) > 0 || len(plan.SelectedSegments) > 0 || plan.SegmentsSelected > 0 { db.vlogGenerationRewritePlanSelected.Add(1) + selectedTotal := plan.SelectedBytesTotal + selectedLive := plan.SelectedBytesLive + selectedStale := plan.SelectedBytesStale + if len(plan.SelectedSegments) > 0 && (selectedTotal <= 0 || selectedLive <= 0 || selectedStale <= 0) { + fallbackTotal := int64(0) + fallbackLive := int64(0) + fallbackStale := int64(0) + for _, seg := range plan.SelectedSegments { + if seg.BytesTotal > 0 { + fallbackTotal += seg.BytesTotal + } + if seg.BytesLive > 0 { + fallbackLive += seg.BytesLive + } + if seg.BytesStale > 0 { + fallbackStale += seg.BytesStale + } + } + if selectedTotal <= 0 { + selectedTotal = fallbackTotal + } + if selectedLive <= 0 { + selectedLive = fallbackLive + } + if selectedStale <= 0 { + selectedStale = fallbackStale + } + } + if selectedTotal > 0 { + db.vlogGenerationRewritePlanSelectedBytes.Add(uint64(selectedTotal)) + } + if selectedLive > 0 { + db.vlogGenerationRewritePlanSelectedLiveBytes.Add(uint64(selectedLive)) + } + if selectedStale > 0 { + 
db.vlogGenerationRewritePlanSelectedStaleBytes.Add(uint64(selectedStale)) + } return } db.vlogGenerationRewritePlanEmpty.Add(1) @@ -13747,6 +13788,9 @@ planned: } db.debugVlogMaintf("gc_after_rewrite_done reason=%s dur_ms=%.3f", vlogGenerationReasonString(reason), float64(time.Since(gcStart).Microseconds())/1000) } + if effectiveBytesBefore > effectiveBytesAfter { + db.vlogGenerationRewriteReclaimedBytes.Add(uint64(effectiveBytesBefore - effectiveBytesAfter)) + } locallyEffectiveProcessedDebt := len(processedRewriteIDs) > 0 && processedLedgerOK && processedLedgerStaleBytes > 0 && stats.RecordsCopied > 0 if effectiveBytesBefore > 0 && effectiveBytesAfter >= effectiveBytesBefore && !locallyEffectiveProcessedDebt { db.vlogGenerationRewriteIneffectiveRuns.Add(1) @@ -19486,6 +19530,9 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.plan_errors"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanErrors.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_empty"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanEmpty.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelected.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_live"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedLiveBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedStaleBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledLastNS.Load()) stats["treedb.cache.vlog_generation.rewrite.queue_prune_runs"] = fmt.Sprintf("%d", 
db.vlogGenerationRewriteQueuePruneRuns.Load()) @@ -19495,6 +19542,7 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.ineffective_bytes_out"] = fmt.Sprintf("%d", db.vlogGenerationRewriteIneffectiveBytesOut.Load()) stats["treedb.cache.vlog_generation.rewrite.ineffective_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteIneffectiveLastNS.Load()) stats["treedb.cache.vlog_generation.rewrite.ineffective_backoff_seconds"] = fmt.Sprintf("%.0f", vlogGenerationRewriteIneffectiveBackoff.Seconds()) + stats["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteReclaimedBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastRewritePlanUnixNano.Load()) stats["treedb.cache.vlog_generation.rewrite.last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastRewriteUnixNano.Load()) stats["treedb.cache.vlog_generation.gc.deleted_segments"] = fmt.Sprintf("%d", db.vlogGenerationGCSegmentsDeleted.Load()) diff --git a/TreeDB/caching/expvar_stats.go b/TreeDB/caching/expvar_stats.go index 8c3205ca0..5a7240f46 100644 --- a/TreeDB/caching/expvar_stats.go +++ b/TreeDB/caching/expvar_stats.go @@ -139,6 +139,7 @@ func selectTreeDBExpvarStats(stats map[string]string) map[string]any { strings.HasPrefix(k, "treedb.cache.vlog_payload_split.") || strings.HasPrefix(k, "treedb.cache.vlog_auto.") || strings.HasPrefix(k, "treedb.cache.vlog_dict.") || + strings.HasPrefix(k, "treedb.cache.vlog_generation.") || strings.HasPrefix(k, "treedb.cache.vlog_payload_kind.") || strings.HasPrefix(k, "treedb.cache.vlog_outer_leaf_codec.") || strings.HasPrefix(k, "treedb.cache.batch_arena.") { diff --git a/TreeDB/caching/expvar_stats_test.go b/TreeDB/caching/expvar_stats_test.go index f4de57519..ff1982510 100644 --- a/TreeDB/caching/expvar_stats_test.go +++ b/TreeDB/caching/expvar_stats_test.go @@ -28,6 +28,7 @@ func 
TestSelectTreeDBExpvarStatsFiltersAndCoerces(t *testing.T) { "treedb.cache.vlog_dict.current_k": "32", "treedb.cache.vlog_payload_kind.raw_bytes.single_value": "2048", "treedb.cache.vlog_outer_leaf_codec.raw_bytes.lz4": "512", + "treedb.cache.vlog_generation.rewrite.reclaimed_bytes": "1234", "treedb.process.memory.heap_inuse_bytes": "4096", "treedb.process.memory.pool_pressure_level": "critical", "treedb.cache.batch_arena.pool_bytes_estimate": "65536", @@ -80,6 +81,9 @@ func TestSelectTreeDBExpvarStatsFiltersAndCoerces(t *testing.T) { if v, ok := got["treedb.cache.vlog_outer_leaf_codec.raw_bytes.lz4"].(int64); !ok || v != 512 { t.Fatalf("vlog_outer_leaf_codec.raw_bytes.lz4=%T(%v) want int64(512)", got["treedb.cache.vlog_outer_leaf_codec.raw_bytes.lz4"], got["treedb.cache.vlog_outer_leaf_codec.raw_bytes.lz4"]) } + if v, ok := got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"].(int64); !ok || v != 1234 { + t.Fatalf("vlog_generation.rewrite.reclaimed_bytes=%T(%v) want int64(1234)", got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"], got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"]) + } if v, ok := got["treedb.process.memory.heap_inuse_bytes"].(int64); !ok || v != 4096 { t.Fatalf("heap_inuse_bytes=%T(%v) want int64(4096)", got["treedb.process.memory.heap_inuse_bytes"], got["treedb.process.memory.heap_inuse_bytes"]) } diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 0beb47e61..154ed646d 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -273,6 +273,53 @@ func TestShouldRunVlogGenerationRewrite_NoTrigger(t *testing.T) { } } +func TestObserveVlogGenerationRewritePlanOutcome_SelectedTracksBytes(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewritePlanOutcome(backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11}, + SegmentsSelected: 1, + SelectedBytesTotal: 1024, + SelectedBytesLive: 640, + 
SelectedBytesStale: 384, + }, nil) + if got, want := db.vlogGenerationRewritePlanRuns.Load(), uint64(1); got != want { + t.Fatalf("plan runs=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelected.Load(), uint64(1); got != want { + t.Fatalf("plan selected=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedBytes.Load(), uint64(1024); got != want { + t.Fatalf("plan selected bytes total=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedLiveBytes.Load(), uint64(640); got != want { + t.Fatalf("plan selected bytes live=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedStaleBytes.Load(), uint64(384); got != want { + t.Fatalf("plan selected bytes stale=%d want=%d", got, want) + } +} + +func TestObserveVlogGenerationRewritePlanOutcome_SelectedTracksSegmentFallbackBytes(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewritePlanOutcome(backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11, 22}, + SegmentsSelected: 2, + SelectedSegments: []backenddb.ValueLogRewritePlanSegment{ + {FileID: 11, BytesTotal: 100, BytesLive: 25, BytesStale: 75}, + {FileID: 22, BytesTotal: 120, BytesLive: 40, BytesStale: 80}, + }, + }, nil) + if got, want := db.vlogGenerationRewritePlanSelectedBytes.Load(), uint64(220); got != want { + t.Fatalf("fallback selected bytes total=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedLiveBytes.Load(), uint64(65); got != want { + t.Fatalf("fallback selected bytes live=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanSelectedStaleBytes.Load(), uint64(155); got != want { + t.Fatalf("fallback selected bytes stale=%d want=%d", got, want) + } +} + func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesQualityFloor(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 200000} if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), 
vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { From 9ce5339a4e3df1de4fbf51631cc64fa12c81941b Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 17:38:33 -1000 Subject: [PATCH 02/61] treedb: add maintenance and vacuum skip counters --- TreeDB/caching/db.go | 63 +++++++++++++++++++ .../caching/vlog_generation_scheduler_test.go | 56 +++++++++++++++++ worklog/2026-03-27.md | 53 ++++++++++++++++ 3 files changed, 172 insertions(+) create mode 100644 worklog/2026-03-27.md diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 6a1255668..1ee972687 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5200,6 +5200,9 @@ type DB struct { vlogGenerationGCRuns atomic.Uint64 vlogGenerationVacuumRuns atomic.Uint64 vlogGenerationVacuumFailures atomic.Uint64 + vlogGenerationVacuumSkippedDisabled atomic.Uint64 + vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 + vlogGenerationVacuumSkippedCooldown atomic.Uint64 vlogGenerationLastVacuumUnixNano atomic.Int64 vlogGenerationLastRewritePlanUnixNano atomic.Int64 vlogGenerationLastRewriteUnixNano atomic.Int64 @@ -5211,6 +5214,20 @@ type DB struct { vlogGenerationChurnBytes atomic.Uint64 vlogGenerationSchedulerState atomic.Uint32 vlogGenerationMaintenanceActive atomic.Bool + vlogGenerationMaintenanceAttempts atomic.Uint64 + vlogGenerationMaintenanceAcquired atomic.Uint64 + vlogGenerationMaintenanceCollisions atomic.Uint64 + vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 + vlogGenerationMaintenanceSkipPhase atomic.Uint64 + vlogGenerationMaintenanceSkipStageGate atomic.Uint64 + vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 + vlogGenerationMaintenanceSkipPriority atomic.Uint64 + vlogGenerationMaintenanceSkipQuiet atomic.Uint64 + vlogGenerationMaintenanceSkipPreCheckpoint atomic.Uint64 + vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 + vlogGenerationMaintenancePassNoop atomic.Uint64 + vlogGenerationMaintenancePassWithRewrite atomic.Uint64 + 
vlogGenerationMaintenancePassWithGC atomic.Uint64 vlogGenerationLastReason atomic.Uint32 vlogGenerationCheckpointKickRuns atomic.Uint64 vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 @@ -13016,14 +13033,17 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog if db == nil || db.closing.Load() || db.valueLogGenerationPolicy != uint8(backenddb.ValueLogGenerationHotWarmCold) { return } + db.vlogGenerationMaintenanceAttempts.Add(1) // In WAL-on mode, the periodic "runGC" tick must not enter the maintenance // engine at all. Checkpoint-coupled work belongs to the explicit // checkpoint-kick/deferred paths; letting the periodic GC tick even acquire // maintenanceActive can strand that slot behind hot restore-time locks. if runGC && !db.disableJournal && !opts.bypassQuiet { + db.vlogGenerationMaintenanceSkipWALOnPeriodic.Add(1) return } if db.suppressBackgroundVlogGenerationForMaintenancePhase() { + db.vlogGenerationMaintenanceSkipPhase.Add(1) if opts.debugSource != "" { db.debugVlogMaintf( "maintenance_skip reason=maintenance_phase source=%s phase=%s checkpoint_pending=%t deferred_pending=%t", @@ -13039,6 +13059,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // checkpoint-kick path can race otherwise, which causes overlapping rewrite // runs to compete on the same resume queue. if !db.vlogGenerationMaintenanceActive.CompareAndSwap(false, true) { + db.vlogGenerationMaintenanceCollisions.Add(1) // Checkpoint-kick retries are high-priority and quiet-window-bypassed by // design. If they collide with an active pass, queue exactly one retry to // run right after the active pass exits. 
@@ -13057,6 +13078,9 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog return } acquired = true + db.vlogGenerationMaintenanceAcquired.Add(1) + rewriteRunsBefore := db.vlogGenerationRewriteRuns.Load() + gcRunsBefore := db.vlogGenerationGCRuns.Load() activeSource := vlogGenerationMaintenanceDebugSource(opts) activeStart := time.Now() db.debugVlogMaintf( @@ -13082,6 +13106,17 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // the original retry goroutine to still be alive. db.scheduleDueVlogGenerationDeferredMaintenance() db.schedulePendingVlogGenerationCheckpointKick() + rewriteRan := db.vlogGenerationRewriteRuns.Load() > rewriteRunsBefore + gcRan := db.vlogGenerationGCRuns.Load() > gcRunsBefore + if rewriteRan { + db.vlogGenerationMaintenancePassWithRewrite.Add(1) + } + if gcRan { + db.vlogGenerationMaintenancePassWithGC.Add(1) + } + if !rewriteRan && !gcRan { + db.vlogGenerationMaintenancePassNoop.Add(1) + } }() now := time.Now() quiet := db.foregroundActivityQuietFor(now, vlogGenerationMaintenanceQuietWindow, vlogForegroundReadQuietWindow) @@ -13124,16 +13159,19 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // passes spend long maintenance windows before the confirmation delay // has elapsed. The only valid next step is to wait for confirmation. if !vlogGenerationIsStageConfirmSource(opts) { + db.vlogGenerationMaintenanceSkipStageGate.Add(1) return } } else if !vlogGenerationIsStageConfirmSource(opts) { // When confirmation becomes due, reserve the maintenance slot for the // explicit stage-confirm wake instead of letting generic retries or // periodic passes reacquire it first. 
+ db.vlogGenerationMaintenanceSkipStageGate.Add(1) return } } if !stagePending && ageBlockedDue && !vlogGenerationIsAgeBlockedSource(opts) { + db.vlogGenerationMaintenanceSkipAgeBlocked.Add(1) return } // Checkpoint-collision retries and timer-driven confirmation wakes should run @@ -13141,11 +13179,13 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // collisions where periodic maintenance keeps reacquiring the scheduler while // the higher-priority retry is still trying to run. if !opts.bypassQuiet && (db.vlogGenerationCheckpointKickPending.Load() || db.vlogGenerationDeferredMaintenancePending.Load()) { + db.vlogGenerationMaintenanceSkipPriority.Add(1) return } // Explicit GC runs bypass the foreground quiet-window gate so callers can // force a safety/cleanup pass even while foreground activity is ongoing. if !runGC && !opts.bypassQuiet && !quiet { + db.vlogGenerationMaintenanceSkipQuiet.Add(1) return } // In WAL-off mode, do not start rewrite/GC planning before the first @@ -13156,6 +13196,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // before the first checkpoint; starving that path causes the main value-log // lane to grow unchecked during restore. if db.disableJournal && db.checkpointRuns.Load() == 0 && !runGC && len(rewriteQueue) == 0 && !opts.skipCheckpoint { + db.vlogGenerationMaintenanceSkipPreCheckpoint.Add(1) return } // Retained-prune and generation maintenance use the same foreground quiet-window gate. 
@@ -13176,10 +13217,12 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog return } } else { + db.vlogGenerationMaintenanceSkipCheckpointing.Add(1) return } } if db.checkpointing.Load() { + db.vlogGenerationMaintenanceSkipCheckpointing.Add(1) return } now = time.Now() @@ -14144,6 +14187,7 @@ func (db *DB) maybeRunVlogGenerationIndexVacuum(rewriteBytesIn int64) { return } if envBool(envDisableVlogGenerationVacuum) { + db.vlogGenerationVacuumSkippedDisabled.Add(1) return } vacuumer, ok := db.backend.(backendIndexVacuumer) @@ -14186,12 +14230,14 @@ func (db *DB) shouldRunVlogGenerationIndexVacuum(rewriteBytesIn int64, now time. return false } if rewriteBytesIn < vlogGenerationVacuumTriggerRewriteBytes { + db.vlogGenerationVacuumSkippedRewriteBytes.Add(1) return false } last := db.vlogGenerationLastVacuumUnixNano.Load() if last > 0 { lastAt := time.Unix(0, last) if now.Sub(lastAt) < vlogGenerationVacuumMinInterval { + db.vlogGenerationVacuumSkippedCooldown.Add(1) return false } } @@ -19491,6 +19537,20 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.checkpoint_kick.runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickRuns.Load()) stats["treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickRewriteRuns.Load()) stats["treedb.cache.vlog_generation.checkpoint_kick.gc_runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickGCRuns.Load()) + stats["treedb.cache.vlog_generation.maintenance.attempts"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceAttempts.Load()) + stats["treedb.cache.vlog_generation.maintenance.acquired"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceAcquired.Load()) + stats["treedb.cache.vlog_generation.maintenance.collisions"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceCollisions.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.wal_on_periodic"] = fmt.Sprintf("%d", 
db.vlogGenerationMaintenanceSkipWALOnPeriodic.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.maintenance_phase"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipPhase.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipStageGate.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.age_blocked_gate"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipAgeBlocked.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.priority_pending"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipPriority.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.quiet_window"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipQuiet.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.before_first_checkpoint"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipPreCheckpoint.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.checkpoint_inflight"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipCheckpointing.Load()) + stats["treedb.cache.vlog_generation.maintenance.passes.noop"] = fmt.Sprintf("%d", db.vlogGenerationMaintenancePassNoop.Load()) + stats["treedb.cache.vlog_generation.maintenance.passes.with_rewrite"] = fmt.Sprintf("%d", db.vlogGenerationMaintenancePassWithRewrite.Load()) + stats["treedb.cache.vlog_generation.maintenance.passes.with_gc"] = fmt.Sprintf("%d", db.vlogGenerationMaintenancePassWithGC.Load()) stats["treedb.cache.vlog_generation.churn_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationChurnBytes.Load()) stats["treedb.cache.vlog_generation.churn_bytes_per_sec"] = fmt.Sprintf("%d", db.vlogGenerationLastChurnBps.Load()) stats["treedb.cache.vlog_generation.rewrite.queue_len"] = fmt.Sprintf("%d", rewriteQueueLen) @@ -19554,6 +19614,9 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.gc.dry_run.last_eligible_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCDryRunSegsEligible.Load()) 
stats["treedb.cache.vlog_generation.vacuum.runs"] = fmt.Sprintf("%d", db.vlogGenerationVacuumRuns.Load()) stats["treedb.cache.vlog_generation.vacuum.failures"] = fmt.Sprintf("%d", db.vlogGenerationVacuumFailures.Load()) + stats["treedb.cache.vlog_generation.vacuum.skipped_disabled"] = fmt.Sprintf("%d", db.vlogGenerationVacuumSkippedDisabled.Load()) + stats["treedb.cache.vlog_generation.vacuum.skipped_rewrite_bytes"] = fmt.Sprintf("%d", db.vlogGenerationVacuumSkippedRewriteBytes.Load()) + stats["treedb.cache.vlog_generation.vacuum.skipped_cooldown"] = fmt.Sprintf("%d", db.vlogGenerationVacuumSkippedCooldown.Load()) stats["treedb.cache.vlog_generation.vacuum.last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastVacuumUnixNano.Load()) stats["treedb.cache.vlog_generation.remap.successes"] = fmt.Sprintf("%d", db.vlogGenerationRemapSuccesses.Load()) stats["treedb.cache.vlog_generation.remap.failures"] = fmt.Sprintf("%d", db.vlogGenerationRemapFailures.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 154ed646d..8211404bb 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -320,6 +320,62 @@ func TestObserveVlogGenerationRewritePlanOutcome_SelectedTracksSegmentFallbackBy } } +func TestMaybeRunVlogGenerationMaintenanceWithOptions_TracksWalOnPeriodicSkip(t *testing.T) { + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{}) + if got, want := db.vlogGenerationMaintenanceAttempts.Load(), uint64(1); got != want { + t.Fatalf("maintenance attempts=%d want=%d", got, want) + } + if got, want := db.vlogGenerationMaintenanceSkipWALOnPeriodic.Load(), uint64(1); got != want { + t.Fatalf("maintenance wal-on periodic skips=%d want=%d", got, want) + } + if got := db.vlogGenerationMaintenanceAcquired.Load(); got != 0 { + 
t.Fatalf("maintenance acquired=%d want=0", got) + } +} + +func TestMaybeRunVlogGenerationMaintenanceWithOptions_TracksCollision(t *testing.T) { + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + db.vlogGenerationMaintenanceActive.Store(true) + db.maybeRunVlogGenerationMaintenanceWithOptions(false, vlogGenerationMaintenanceOptions{}) + if got, want := db.vlogGenerationMaintenanceAttempts.Load(), uint64(1); got != want { + t.Fatalf("maintenance attempts=%d want=%d", got, want) + } + if got, want := db.vlogGenerationMaintenanceCollisions.Load(), uint64(1); got != want { + t.Fatalf("maintenance collisions=%d want=%d", got, want) + } + if got := db.vlogGenerationMaintenanceAcquired.Load(); got != 0 { + t.Fatalf("maintenance acquired=%d want=0", got) + } +} + +func TestShouldRunVlogGenerationIndexVacuum_TracksSkipReasons(t *testing.T) { + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + now := time.Now() + if db.shouldRunVlogGenerationIndexVacuum(vlogGenerationVacuumTriggerRewriteBytes-1, now) { + t.Fatalf("expected vacuum to skip below rewrite trigger") + } + if got, want := db.vlogGenerationVacuumSkippedRewriteBytes.Load(), uint64(1); got != want { + t.Fatalf("vacuum skipped_rewrite_bytes=%d want=%d", got, want) + } + db.vlogGenerationLastVacuumUnixNano.Store(now.UnixNano()) + if db.shouldRunVlogGenerationIndexVacuum(vlogGenerationVacuumTriggerRewriteBytes, now) { + t.Fatalf("expected vacuum to skip during cooldown") + } + if got, want := db.vlogGenerationVacuumSkippedCooldown.Load(), uint64(1); got != want { + t.Fatalf("vacuum skipped_cooldown=%d want=%d", got, want) + } +} + +func TestMaybeRunVlogGenerationIndexVacuum_TracksDisabledSkip(t *testing.T) { + t.Setenv(envDisableVlogGenerationVacuum, "1") + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + db.maybeRunVlogGenerationIndexVacuum(vlogGenerationVacuumTriggerRewriteBytes) + if got, want := 
db.vlogGenerationVacuumSkippedDisabled.Load(), uint64(1); got != want { + t.Fatalf("vacuum skipped_disabled=%d want=%d", got, want) + } +} + func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesQualityFloor(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 200000} if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md new file mode 100644 index 000000000..5d0fa2ff5 --- /dev/null +++ b/worklog/2026-03-27.md @@ -0,0 +1,53 @@ +# Work Log - 2026-03-27 + +- Added live value-log maintenance observability for `run_celestia` investigation: + - `TreeDB/caching/db.go` + - maintenance gate counters: + - `treedb.cache.vlog_generation.maintenance.attempts` + - `treedb.cache.vlog_generation.maintenance.acquired` + - `treedb.cache.vlog_generation.maintenance.collisions` + - `treedb.cache.vlog_generation.maintenance.skip.*` + - `treedb.cache.vlog_generation.maintenance.passes.{noop,with_rewrite,with_gc}` + - vacuum skip counters: + - `treedb.cache.vlog_generation.vacuum.skipped_disabled` + - `treedb.cache.vlog_generation.vacuum.skipped_rewrite_bytes` + - `treedb.cache.vlog_generation.vacuum.skipped_cooldown` + - Added/updated tests in `TreeDB/caching/vlog_generation_scheduler_test.go` for: + - WAL-on periodic skip accounting + - maintenance collision accounting + - vacuum skip-reason accounting + +- Validation: + - `go test ./TreeDB/caching -count=1` + +- `run_celestia` validation run (fast profile, local gomap override): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast STOP_AT_LOCAL_HEIGHT=500 FREEZE_REMOTE_HEIGHT_AT_START=1 NO_PROGRESS_WARN_SECONDS=120 NO_PROGRESS_FAIL_SECONDS=1800 HEAP_CAPTURE_RSS_DELTA_KB=1 CAPTURE_HEAP_ON_MAX_RSS=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327173138` + - duration / stop condition: + - 
`duration_seconds=277` + - local-height stop target hit at `local=10413004` + - disk: + - `end_app_bytes=5011158649` + - `disk-breakdown.log` shows dominant `maindb/wal/value-l0-*` files (~256MiB each) + +- Latest diagnostics snapshot used for maintenance counters: + - file: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327173138/sync/diagnostics/pprof-heap-max-rss-7070580k-20260327173541.treedb_vars.json` + - key counters: + - `maintenance.attempts=637` + - `maintenance.acquired=56` + - `maintenance.collisions=581` + - `maintenance.passes.noop=53` + - `maintenance.passes.with_gc=2` + - `maintenance.passes.with_rewrite=0` + - `maintenance.skip.quiet_window=26` + - `gc.runs=2` + - `rewrite.runs=0` + - `vacuum.runs=0` + - `scheduler_last_reason=periodic_gc` + +- Interpretation (for next slice): + - During this early state-sync window, rewrite did not trigger and therefore vacuum never became eligible on the post-rewrite path. + - The dominant scheduler behavior was active-pass contention (`collisions=581`) and noop acquired passes; this is now directly measurable. 
From bf11ec9d19a745e83d8dafd007b5f443f518da8a Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 17:49:26 -1000 Subject: [PATCH 03/61] treedb: coalesce maintenance retries under load --- TreeDB/caching/db.go | 33 +++++++++++++++++++ .../caching/vlog_generation_scheduler_test.go | 30 +++++++++++++++++ worklog/2026-03-27.md | 32 ++++++++++++++++++ 3 files changed, 95 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 1ee972687..438019ec9 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -12188,6 +12188,9 @@ func (db *DB) maybeRunPeriodicVlogGenerationMaintenance(runGC bool) bool { if db == nil { return false } + if db.vlogGenerationMaintenanceActive.Load() { + return false + } if db.suppressBackgroundVlogGenerationForMaintenancePhase() { db.debugVlogMaintf("periodic_skip reason=maintenance_phase phase=%s run_gc=%t", maintenancePhaseString(uint32(db.MaintenancePhase())), runGC) return false @@ -12968,6 +12971,36 @@ func (db *DB) runVlogGenerationMaintenanceRetries(opts vlogGenerationMaintenance deadline := time.Now().Add(retryWindow) sleepDelay := 10 * time.Millisecond for !db.closing.Load() { + // Once retry intent is already queued, avoid repeatedly colliding with + // the active maintenance pass; wait for release or deadline instead. 
+ if db.vlogGenerationMaintenanceActive.Load() { + if stopWhenAcquired && db.vlogGenerationDeferredMaintenancePending.Load() { + if time.Now().After(deadline) { + return + } + time.Sleep(sleepDelay) + if sleepDelay < 100*time.Millisecond { + sleepDelay *= 2 + if sleepDelay > 100*time.Millisecond { + sleepDelay = 100 * time.Millisecond + } + } + continue + } + if !stopWhenAcquired && db.vlogGenerationCheckpointKickPending.Load() { + if time.Now().After(deadline) { + return + } + time.Sleep(sleepDelay) + if sleepDelay < 100*time.Millisecond { + sleepDelay *= 2 + if sleepDelay > 100*time.Millisecond { + sleepDelay = 100 * time.Millisecond + } + } + continue + } + } attempt++ if opts.debugSource != "" { db.debugVlogMaintf( diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 8211404bb..76eb3f288 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -376,6 +376,36 @@ func TestMaybeRunVlogGenerationIndexVacuum_TracksDisabledSkip(t *testing.T) { } } +func TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries(t *testing.T) { + db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} + + db.vlogGenerationMaintenanceActive.Store(true) + db.vlogGenerationCheckpointKickPending.Store(true) + db.runVlogGenerationMaintenanceRetries(vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: false, + rewriteDebtDrain: true, + debugSource: "checkpoint_pending", + }, 30*time.Millisecond, false) + if got := db.vlogGenerationMaintenanceCollisions.Load(); got != 0 { + t.Fatalf("checkpoint pending retry collisions=%d want=0", got) + } + + db.vlogGenerationMaintenanceActive.Store(true) + db.vlogGenerationDeferredMaintenancePending.Store(true) + db.runVlogGenerationMaintenanceRetries(vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + 
skipCheckpoint: false, + rewriteDebtDrain: true, + debugSource: "rewrite_stage_confirm", + }, 30*time.Millisecond, true) + if got := db.vlogGenerationMaintenanceCollisions.Load(); got != 0 { + t.Fatalf("deferred pending retry collisions=%d want=0", got) + } +} + func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesQualityFloor(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 200000} if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index 5d0fa2ff5..586eab726 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -51,3 +51,35 @@ - Interpretation (for next slice): - During this early state-sync window, rewrite did not trigger and therefore vacuum never became eligible on the post-rewrite path. - The dominant scheduler behavior was active-pass contention (`collisions=581`) and noop acquired passes; this is now directly measurable. + +- Collision-coalescing follow-up (same day): + - code changes: + - `TreeDB/caching/db.go` + - `runVlogGenerationMaintenanceRetries`: when retry intent is already pending and `maintenanceActive` is true, wait/backoff instead of re-entering `maybeRun...` and creating repeated collisions. + - `maybeRunPeriodicVlogGenerationMaintenance`: skip periodic entry while `maintenanceActive` is true. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - added `TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries`. 
+ - validation: + - `go test ./TreeDB/caching -count=1` + +- `run_celestia` comparison rerun after collision-coalescing: + - command (same as prior comparison run): + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast STOP_AT_LOCAL_HEIGHT=500 FREEZE_REMOTE_HEIGHT_AT_START=1 NO_PROGRESS_WARN_SECONDS=120 NO_PROGRESS_FAIL_SECONDS=1800 HEAP_CAPTURE_RSS_DELTA_KB=1 CAPTURE_HEAP_ON_MAX_RSS=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327174335` + - sampled snapshot: + - `pprof-heap-max-rss-6605092k-20260327174744.treedb_vars.json` + +- Baseline vs new counter delta (same metric keys/sampling style): + - baseline snapshot: `pprof-heap-max-rss-7070580k-20260327173541.treedb_vars.json` + - `maintenance.attempts`: `637 -> 124` + - `maintenance.acquired`: `56 -> 119` + - `maintenance.collisions`: `581 -> 5` + - `maintenance.passes.noop`: `53 -> 116` + - `maintenance.passes.with_rewrite`: `0 -> 1` + - `maintenance.passes.with_gc`: `2 -> 1` + - `rewrite.runs`: `0 -> 1` + - `vacuum.runs`: `0 -> 1` + +- Interpretation: + - Coalescing retry loops materially reduced collision churn and allowed at least one rewrite+vacuum pass to complete in the same early-state-sync lab window. 
From c83e2d6164ef50699c194048f5e6bae0e39a220c Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 17:59:27 -1000 Subject: [PATCH 04/61] treedb: skip hot periodic maintenance preflight --- TreeDB/caching/db.go | 13 +++++++ .../caching/vlog_generation_scheduler_test.go | 34 +++++++++++++++++++ worklog/2026-03-27.md | 34 +++++++++++++++++++ 3 files changed, 81 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 438019ec9..5671e2383 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -12195,6 +12195,19 @@ func (db *DB) maybeRunPeriodicVlogGenerationMaintenance(runGC bool) bool { db.debugVlogMaintf("periodic_skip reason=maintenance_phase phase=%s run_gc=%t", maintenancePhaseString(uint32(db.MaintenancePhase())), runGC) return false } + // Coarse preflight: while foreground activity is hot, avoid entering the + // maintenance engine unless a deferred/checkpoint wake is pending. This + // prevents high-frequency periodic no-op acquisitions. + if !runGC { + now := time.Now() + quiet := db.foregroundActivityQuietFor(now, vlogGenerationMaintenanceQuietWindow, vlogForegroundReadQuietWindow) + if !quiet && + !db.vlogGenerationCheckpointKickPending.Load() && + !db.vlogGenerationDeferredMaintenancePending.Load() && + !db.vlogGenerationDeferredMaintenanceDue(now) { + return false + } + } db.maybeRunVlogGenerationMaintenance(runGC) return true } diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 76eb3f288..1de7c37bb 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -4112,6 +4112,40 @@ func TestVlogGenerationMaintenance_PeriodicSkipsWhenMaintenancePhaseNonSteady(t } } +func TestVlogGenerationMaintenance_PeriodicPreflightSkipsHotNoPending(t *testing.T) { + prepareDirectSchedulerTest(t) + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + 
t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + rewriteResponse: backenddb.ValueLogRewriteStats{BytesBefore: 64, BytesAfter: 32, RecordsCopied: 1}, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + skipRetainedPrune(db) + + hot := time.Now().UnixNano() + db.lastForegroundWriteUnixNano.Store(hot) + db.lastForegroundReadUnixNano.Store(hot) + db.vlogGenerationCheckpointKickPending.Store(false) + db.vlogGenerationDeferredMaintenancePending.Store(false) + + if ran := db.maybeRunPeriodicVlogGenerationMaintenance(false); ran { + t.Fatal("periodic maintenance unexpectedly entered during hot foreground with no pending wake") + } + if got := db.vlogGenerationMaintenanceAttempts.Load(); got != 0 { + t.Fatalf("maintenance attempts=%d want 0 on preflight skip", got) + } + if _, calls := recorder.recordedRewrite(); calls != 0 { + t.Fatalf("rewrite calls=%d want 0 on preflight skip", calls) + } +} + func TestCheckpoint_KickSkipsWhenMaintenancePhaseNonSteady(t *testing.T) { disableVlogGenerationLoop(t) diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index 586eab726..b41f022dd 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -83,3 +83,37 @@ - Interpretation: - Coalescing retry loops materially reduced collision churn and allowed at least one rewrite+vacuum pass to complete in the same early-state-sync lab window. + +- Periodic preflight follow-up: + - code changes: + - `TreeDB/caching/db.go` + - `maybeRunPeriodicVlogGenerationMaintenance`: added hot-foreground preflight (when not `runGC`) to skip entering maintenance unless deferred/checkpoint wake is pending. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - added `TestVlogGenerationMaintenance_PeriodicPreflightSkipsHotNoPending`. 
+ - validation: + - `go test ./TreeDB/caching -count=1` + +- Third `run_celestia` comparison run (same command profile): + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327175340` + - sampled snapshot: + - `pprof-heap-max-rss-6560568k-20260327175747.treedb_vars.json` + - key counters: + - `maintenance.attempts=46` + - `maintenance.acquired=41` + - `maintenance.collisions=5` + - `maintenance.passes.noop=38` + - `maintenance.passes.with_rewrite=1` + - `maintenance.passes.with_gc=1` + - `maintenance.skip.quiet_window=0` + - `rewrite.runs=1` + - `vacuum.runs=1` + +- Multi-run trend (same lab recipe): + - baseline snapshot: `attempts=637`, `collisions=581`, `rewrite=0`, `vacuum=0` + - collision-coalesced snapshot: `attempts=124`, `collisions=5`, `rewrite=1`, `vacuum=1` + - preflight snapshot: `attempts=46`, `collisions=5`, `rewrite=1`, `vacuum=1` + +- Interpretation: + - Retry coalescing delivered the major contention reduction. + - Periodic preflight further reduced maintenance churn/noop entries while preserving rewrite+vacuum progress in this early-state-sync window. 
From 31fbb0a53e9570c514dd09510370cb8f78ca0b27 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 18:17:42 -1000 Subject: [PATCH 05/61] treedb: add live vacuum/rewrite economics instrumentation --- TreeDB/caching/db.go | 202 +++++++++++++++++- .../caching/vlog_generation_scheduler_test.go | 121 +++++++++++ worklog/2026-03-27.md | 28 +++ 3 files changed, 342 insertions(+), 9 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 5671e2383..20c877be7 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5228,6 +5228,8 @@ type DB struct { vlogGenerationMaintenancePassNoop atomic.Uint64 vlogGenerationMaintenancePassWithRewrite atomic.Uint64 vlogGenerationMaintenancePassWithGC atomic.Uint64 + vlogGenerationMaintenancePassTotalNanos atomic.Uint64 + vlogGenerationMaintenancePassMaxNanos atomic.Uint64 vlogGenerationLastReason atomic.Uint32 vlogGenerationCheckpointKickRuns atomic.Uint64 vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 @@ -5252,6 +5254,15 @@ type DB struct { // budget while still running maintenance at coarse intervals. 
vlogGenerationRewriteBudgetLastUnixNano atomic.Int64 vlogGenerationRewriteBudgetTokensBytes atomic.Int64 + vlogGenerationRewriteBudgetConsumed atomic.Uint64 + vlogGenerationRewritePlanTotalNanos atomic.Uint64 + vlogGenerationRewritePlanMaxNanos atomic.Uint64 + vlogGenerationRewriteExecTotalNanos atomic.Uint64 + vlogGenerationRewriteExecMaxNanos atomic.Uint64 + vlogGenerationGCExecTotalNanos atomic.Uint64 + vlogGenerationGCExecMaxNanos atomic.Uint64 + vlogGenerationVacuumExecTotalNanos atomic.Uint64 + vlogGenerationVacuumExecMaxNanos atomic.Uint64 bgErrMu sync.Mutex bgErr error @@ -12404,6 +12415,9 @@ func (db *DB) vlogGenerationConsumeRewriteBudgetBytes(n int64) { next = 0 } if db.vlogGenerationRewriteBudgetTokensBytes.CompareAndSwap(cur, next) { + if consumed := cur - next; consumed > 0 { + db.vlogGenerationRewriteBudgetConsumed.Add(uint64(consumed)) + } return } } @@ -12438,6 +12452,50 @@ func sumVlogRewritePlanLiveBytes(segments []backenddb.ValueLogRewritePlanSegment return sum, ok } +func observeDurationNanos(total, max *atomic.Uint64, d time.Duration) { + if total == nil || max == nil || d <= 0 { + return + } + n := uint64(d) + total.Add(n) + updateAtomicMaxUint64(max, n) +} + +func (db *DB) observeVlogGenerationMaintenancePassDuration(d time.Duration) { + if db == nil { + return + } + observeDurationNanos(&db.vlogGenerationMaintenancePassTotalNanos, &db.vlogGenerationMaintenancePassMaxNanos, d) +} + +func (db *DB) observeVlogGenerationRewritePlanDuration(d time.Duration) { + if db == nil { + return + } + observeDurationNanos(&db.vlogGenerationRewritePlanTotalNanos, &db.vlogGenerationRewritePlanMaxNanos, d) +} + +func (db *DB) observeVlogGenerationRewriteExecDuration(d time.Duration) { + if db == nil { + return + } + observeDurationNanos(&db.vlogGenerationRewriteExecTotalNanos, &db.vlogGenerationRewriteExecMaxNanos, d) +} + +func (db *DB) observeVlogGenerationGCExecDuration(d time.Duration) { + if db == nil { + return + } + 
observeDurationNanos(&db.vlogGenerationGCExecTotalNanos, &db.vlogGenerationGCExecMaxNanos, d) +} + +func (db *DB) observeVlogGenerationVacuumExecDuration(d time.Duration) { + if db == nil { + return + } + observeDurationNanos(&db.vlogGenerationVacuumExecTotalNanos, &db.vlogGenerationVacuumExecMaxNanos, d) +} + func vlogGenerationRewriteLedgerIDs(segments []backenddb.ValueLogRewritePlanSegment) []uint32 { if len(segments) == 0 { return nil @@ -12453,9 +12511,14 @@ func vlogGenerationRewriteLedgerIDs(segments []backenddb.ValueLogRewritePlanSegm } func (db *DB) observeVlogGenerationRewritePlanOutcome(plan backenddb.ValueLogRewritePlan, err error) { + db.observeVlogGenerationRewritePlanOutcomeWithDuration(plan, err, 0) +} + +func (db *DB) observeVlogGenerationRewritePlanOutcomeWithDuration(plan backenddb.ValueLogRewritePlan, err error, dur time.Duration) { if db == nil { return } + db.observeVlogGenerationRewritePlanDuration(dur) db.vlogGenerationRewritePlanRuns.Add(1) if err != nil { if isVlogGenerationPlannerCanceled(err) { @@ -13139,13 +13202,15 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog db.vlogGenerationDeferredMaintenancePending.Load(), ) defer func() { + passDur := time.Since(activeStart) db.debugVlogMaintf( "maintenance_active_release source=%s dur_ms=%d checkpoint_pending=%t deferred_pending=%t", activeSource, - time.Since(activeStart).Milliseconds(), + passDur.Milliseconds(), db.vlogGenerationCheckpointKickPending.Load(), db.vlogGenerationDeferredMaintenancePending.Load(), ) + db.observeVlogGenerationMaintenancePassDuration(passDur) db.vlogGenerationMaintenanceActive.Store(false) // If a deferred confirmation/age wake became due while this pass held the // scheduler active, requeue it immediately on exit instead of relying on @@ -13385,6 +13450,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog planStart := time.Now() plan, err := planner.ValueLogRewritePlan(ctx, planOpts) cancel() + 
planDur := time.Since(planStart) db.debugVlogMaintf( "rewrite_plan stale_ratio_trigger min_ratio=%.6f max_source_bytes=%d selected=%d/%d selected_bytes_total=%d selected_bytes_live=%d selected_bytes_stale=%d total_bytes=%d live_bytes=%d stale_bytes=%d dur_ms=%.3f err=%v", minStaleRatio, @@ -13397,10 +13463,10 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog plan.BytesTotal, plan.BytesLive, plan.BytesStale, - float64(time.Since(planStart).Microseconds())/1000, + float64(planDur.Microseconds())/1000, err, ) - db.observeVlogGenerationRewritePlanOutcome(plan, err) + db.observeVlogGenerationRewritePlanOutcomeWithDuration(plan, err, planDur) updatePlanTimestamp := false if err != nil { db.clearVlogGenerationRewriteAgeBlockedUntil() @@ -13537,6 +13603,7 @@ planned: MinSegmentAge: vlogGenerationRewriteMinSegmentAge, }) cancel() + planDur := time.Since(planStart) db.debugVlogMaintf( "rewrite_plan pre_rewrite max_source_bytes=%d min_ratio=%.6f min_stale_bytes=%d selected=%d/%d selected_bytes_total=%d selected_bytes_live=%d selected_bytes_stale=%d total_bytes=%d live_bytes=%d stale_bytes=%d dur_ms=%.3f err=%v", maxSourceBytes, @@ -13550,10 +13617,10 @@ planned: plan.BytesTotal, plan.BytesLive, plan.BytesStale, - float64(time.Since(planStart).Microseconds())/1000, + float64(planDur.Microseconds())/1000, err, ) - db.observeVlogGenerationRewritePlanOutcome(plan, err) + db.observeVlogGenerationRewritePlanOutcomeWithDuration(plan, err, planDur) if err != nil { db.clearVlogGenerationRewriteAgeBlockedUntil() if isVlogGenerationPlannerCanceled(err) { @@ -13826,8 +13893,10 @@ planned: rewriteStart := time.Now() stats, err := rewriter.ValueLogRewriteOnline(ctx, rewriteOpts) cancel() + rewriteDur := time.Since(rewriteStart) + db.observeVlogGenerationRewriteExecDuration(rewriteDur) if err != nil { - db.debugVlogMaintf("rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), err, 
float64(time.Since(rewriteStart).Microseconds())/1000) + db.debugVlogMaintf("rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), err, float64(rewriteDur.Microseconds())/1000) if errors.Is(err, context.Canceled) { db.observeVlogGenerationRewriteCanceled() if len(processedRewriteIDs) > 0 { @@ -13847,7 +13916,7 @@ planned: stats.BytesBefore, stats.BytesAfter, stats.RecordsCopied, - float64(time.Since(rewriteStart).Microseconds())/1000, + float64(rewriteDur.Microseconds())/1000, ) effectiveBytesBefore := int64(stats.BytesBefore) effectiveBytesAfter := int64(stats.BytesAfter) @@ -13864,8 +13933,10 @@ planned: ProtectedPaths: db.valueLogProtectedPaths(), }) gcCancel() + gcDur := time.Since(gcStart) + db.observeVlogGenerationGCExecDuration(gcDur) if gcErr != nil { - db.debugVlogMaintf("gc_after_rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), gcErr, float64(time.Since(gcStart).Microseconds())/1000) + db.debugVlogMaintf("gc_after_rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), gcErr, float64(gcDur.Microseconds())/1000) return fmt.Errorf("generational gc after rewrite: %w", gcErr) } if gcStats.BytesDeleted > 0 { @@ -13875,7 +13946,7 @@ planned: effectiveBytesAfter = 0 } } - db.debugVlogMaintf("gc_after_rewrite_done reason=%s dur_ms=%.3f", vlogGenerationReasonString(reason), float64(time.Since(gcStart).Microseconds())/1000) + db.debugVlogMaintf("gc_after_rewrite_done reason=%s dur_ms=%.3f", vlogGenerationReasonString(reason), float64(gcDur.Microseconds())/1000) } if effectiveBytesBefore > effectiveBytesAfter { db.vlogGenerationRewriteReclaimedBytes.Add(uint64(effectiveBytesBefore - effectiveBytesAfter)) @@ -14052,8 +14123,10 @@ planned: db.vlogGenerationLastGCUnixNano.Store(now.UnixNano()) ctx, cancel := db.foregroundMaintenanceContext(30 * time.Second) gcOpts := backenddb.ValueLogGCOptions{ProtectedPaths: db.valueLogProtectedPaths()} + gcStart := time.Now() gcStats, err := 
gcer.ValueLogGC(ctx, gcOpts) cancel() + db.observeVlogGenerationGCExecDuration(time.Since(gcStart)) if err != nil { return fmt.Errorf("generational gc: %w", err) } @@ -14253,11 +14326,13 @@ func (db *DB) maybeRunVlogGenerationIndexVacuum(rewriteBytesIn int64) { return err } var err error + vacuumStart := time.Now() if db.maintenanceActive.Load() { err = runVacuum() } else { err = db.runWithBackendMaintenance(runVacuum) } + db.observeVlogGenerationVacuumExecDuration(time.Since(vacuumStart)) if err != nil { db.vlogGenerationVacuumFailures.Add(1) db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) @@ -19568,7 +19643,63 @@ func (db *DB) Stats() map[string]string { db.vlogGenerationRewriteQueueMu.Lock() rewriteQueueLen := len(db.vlogGenerationRewriteQueue) rewriteQueueLoaded := db.vlogGenerationRewriteQueueLoaded + rewriteLedgerSegments := len(db.vlogGenerationRewriteLedger) + rewritePenaltiesActive := len(db.vlogGenerationRewritePenalties) + rewriteStagePending := db.vlogGenerationRewriteStagePending + rewriteStageObservedNS := db.vlogGenerationRewriteStageObservedUnixNano + rewriteLedgerBytesTotal := int64(0) + rewriteLedgerBytesLive := int64(0) + rewriteLedgerBytesStale := int64(0) + for i := range db.vlogGenerationRewriteLedger { + seg := db.vlogGenerationRewriteLedger[i] + if seg.BytesTotal > 0 { + rewriteLedgerBytesTotal += seg.BytesTotal + } + if seg.BytesLive > 0 { + rewriteLedgerBytesLive += seg.BytesLive + } + if seg.BytesStale > 0 { + rewriteLedgerBytesStale += seg.BytesStale + } + } db.vlogGenerationRewriteQueueMu.Unlock() + rewriteAgeBlockedUntilNS := db.vlogGenerationRewriteAgeBlockedUntilNS.Load() + rewriteAgeBlockedRemainingMS := int64(0) + if rewriteAgeBlockedUntilNS > 0 { + if d := time.Until(time.Unix(0, rewriteAgeBlockedUntilNS)); d > 0 { + rewriteAgeBlockedRemainingMS = d.Milliseconds() + } + } + rewriteBudgetTokens := db.vlogGenerationRewriteBudgetTokensBytes.Load() + if rewriteBudgetTokens < 0 { + rewriteBudgetTokens = 0 + } + 
rewriteBudgetCap := db.vlogGenerationRewriteBudgetCapBytes() + if rewriteBudgetCap < 0 { + rewriteBudgetCap = 0 + } + rewriteBudgetUtilPct := 0.0 + if rewriteBudgetCap > 0 { + rewriteBudgetUtilPct = (float64(rewriteBudgetTokens) / float64(rewriteBudgetCap)) * 100.0 + if rewriteBudgetUtilPct > 100.0 { + rewriteBudgetUtilPct = 100.0 + } + } + maintenancePassTotalNS := db.vlogGenerationMaintenancePassTotalNanos.Load() + maintenancePassMaxNS := db.vlogGenerationMaintenancePassMaxNanos.Load() + maintenancePasses := db.vlogGenerationMaintenanceAcquired.Load() + rewritePlanTotalNS := db.vlogGenerationRewritePlanTotalNanos.Load() + rewritePlanMaxNS := db.vlogGenerationRewritePlanMaxNanos.Load() + rewritePlanRuns := db.vlogGenerationRewritePlanRuns.Load() + rewriteExecTotalNS := db.vlogGenerationRewriteExecTotalNanos.Load() + rewriteExecMaxNS := db.vlogGenerationRewriteExecMaxNanos.Load() + rewriteRuns := db.vlogGenerationRewriteRuns.Load() + gcExecTotalNS := db.vlogGenerationGCExecTotalNanos.Load() + gcExecMaxNS := db.vlogGenerationGCExecMaxNanos.Load() + gcRuns := db.vlogGenerationGCRuns.Load() + vacuumExecTotalNS := db.vlogGenerationVacuumExecTotalNanos.Load() + vacuumExecMaxNS := db.vlogGenerationVacuumExecMaxNanos.Load() + vacuumRuns := db.vlogGenerationVacuumRuns.Load() stats["treedb.cache.vlog_retained_segments"] = fmt.Sprintf("%d", vlogSegments) stats["treedb.cache.vlog_retained_bytes_estimate"] = fmt.Sprintf("%d", vlogBytes) stats["treedb.process.memory.vlog_retained_bytes_estimate"] = fmt.Sprintf("%d", vlogBytes) @@ -19597,15 +19728,40 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.maintenance.passes.noop"] = fmt.Sprintf("%d", db.vlogGenerationMaintenancePassNoop.Load()) stats["treedb.cache.vlog_generation.maintenance.passes.with_rewrite"] = fmt.Sprintf("%d", db.vlogGenerationMaintenancePassWithRewrite.Load()) stats["treedb.cache.vlog_generation.maintenance.passes.with_gc"] = fmt.Sprintf("%d", 
db.vlogGenerationMaintenancePassWithGC.Load()) + stats["treedb.cache.vlog_generation.maintenance.pass.total_ms"] = fmt.Sprintf("%.3f", float64(maintenancePassTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.maintenance.pass.max_ms"] = fmt.Sprintf("%.3f", float64(maintenancePassMaxNS)/float64(time.Millisecond)) + if maintenancePasses > 0 { + stats["treedb.cache.vlog_generation.maintenance.pass.avg_ms"] = fmt.Sprintf("%.3f", (float64(maintenancePassTotalNS)/float64(maintenancePasses))/float64(time.Millisecond)) + } else { + stats["treedb.cache.vlog_generation.maintenance.pass.avg_ms"] = "0.000" + } stats["treedb.cache.vlog_generation.churn_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationChurnBytes.Load()) stats["treedb.cache.vlog_generation.churn_bytes_per_sec"] = fmt.Sprintf("%d", db.vlogGenerationLastChurnBps.Load()) stats["treedb.cache.vlog_generation.rewrite.queue_len"] = fmt.Sprintf("%d", rewriteQueueLen) stats["treedb.cache.vlog_generation.rewrite.queue_loaded"] = fmt.Sprintf("%t", rewriteQueueLoaded) + stats["treedb.cache.vlog_generation.rewrite.ledger_segments"] = fmt.Sprintf("%d", rewriteLedgerSegments) + stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_total"] = fmt.Sprintf("%d", rewriteLedgerBytesTotal) + stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_live"] = fmt.Sprintf("%d", rewriteLedgerBytesLive) + stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_stale"] = fmt.Sprintf("%d", rewriteLedgerBytesStale) + if rewriteLedgerBytesTotal > 0 { + stats["treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm"] = fmt.Sprintf("%d", (rewriteLedgerBytesStale*1_000_000)/rewriteLedgerBytesTotal) + } else { + stats["treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm"] = "0" + } + stats["treedb.cache.vlog_generation.rewrite.stage_pending"] = fmt.Sprintf("%t", rewriteStagePending) + stats["treedb.cache.vlog_generation.rewrite.stage_observed_unix_nano"] = fmt.Sprintf("%d", rewriteStageObservedNS) + 
stats["treedb.cache.vlog_generation.rewrite.penalties_active"] = fmt.Sprintf("%d", rewritePenaltiesActive) + stats["treedb.cache.vlog_generation.rewrite.age_blocked_until_unix_nano"] = fmt.Sprintf("%d", rewriteAgeBlockedUntilNS) + stats["treedb.cache.vlog_generation.rewrite.age_blocked_remaining_ms"] = fmt.Sprintf("%d", rewriteAgeBlockedRemainingMS) stats["treedb.cache.vlog_generation.hot.segment_target_bytes"] = fmt.Sprintf("%d", db.valueLogGenerationHotTarget) stats["treedb.cache.vlog_generation.warm.segment_target_bytes"] = fmt.Sprintf("%d", db.valueLogGenerationWarmTarget) stats["treedb.cache.vlog_generation.cold.segment_target_bytes"] = fmt.Sprintf("%d", db.valueLogGenerationColdTarget) stats["treedb.cache.vlog_generation.rewrite_budget.bytes_per_sec"] = fmt.Sprintf("%d", db.valueLogRewriteBudgetBytes) stats["treedb.cache.vlog_generation.rewrite_budget.records_per_sec"] = fmt.Sprintf("%d", db.valueLogRewriteBudgetRecords) + stats["treedb.cache.vlog_generation.rewrite_budget.tokens_bytes"] = fmt.Sprintf("%d", rewriteBudgetTokens) + stats["treedb.cache.vlog_generation.rewrite_budget.tokens_cap_bytes"] = fmt.Sprintf("%d", rewriteBudgetCap) + stats["treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct"] = fmt.Sprintf("%.3f", rewriteBudgetUtilPct) + stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteBudgetConsumed.Load()) stats["treedb.cache.vlog_generation.rewrite_trigger.stale_ratio_ppm"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerRatioPPM) stats["treedb.cache.vlog_generation.rewrite_trigger.total_bytes"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerBytes) stats["treedb.cache.vlog_generation.rewrite_trigger.churn_per_sec"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerChurn) @@ -19649,11 +19805,32 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.ineffective_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteIneffectiveLastNS.Load()) 
stats["treedb.cache.vlog_generation.rewrite.ineffective_backoff_seconds"] = fmt.Sprintf("%.0f", vlogGenerationRewriteIneffectiveBackoff.Seconds()) stats["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteReclaimedBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan.total_ms"] = fmt.Sprintf("%.3f", float64(rewritePlanTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.rewrite.plan.max_ms"] = fmt.Sprintf("%.3f", float64(rewritePlanMaxNS)/float64(time.Millisecond)) + if rewritePlanRuns > 0 { + stats["treedb.cache.vlog_generation.rewrite.plan.avg_ms"] = fmt.Sprintf("%.3f", (float64(rewritePlanTotalNS)/float64(rewritePlanRuns))/float64(time.Millisecond)) + } else { + stats["treedb.cache.vlog_generation.rewrite.plan.avg_ms"] = "0.000" + } + stats["treedb.cache.vlog_generation.rewrite.exec.total_ms"] = fmt.Sprintf("%.3f", float64(rewriteExecTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.rewrite.exec.max_ms"] = fmt.Sprintf("%.3f", float64(rewriteExecMaxNS)/float64(time.Millisecond)) + if rewriteRuns > 0 { + stats["treedb.cache.vlog_generation.rewrite.exec.avg_ms"] = fmt.Sprintf("%.3f", (float64(rewriteExecTotalNS)/float64(rewriteRuns))/float64(time.Millisecond)) + } else { + stats["treedb.cache.vlog_generation.rewrite.exec.avg_ms"] = "0.000" + } stats["treedb.cache.vlog_generation.rewrite.plan_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastRewritePlanUnixNano.Load()) stats["treedb.cache.vlog_generation.rewrite.last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastRewriteUnixNano.Load()) stats["treedb.cache.vlog_generation.gc.deleted_segments"] = fmt.Sprintf("%d", db.vlogGenerationGCSegmentsDeleted.Load()) stats["treedb.cache.vlog_generation.gc.deleted_bytes"] = fmt.Sprintf("%d", db.vlogGenerationGCBytesDeleted.Load()) stats["treedb.cache.vlog_generation.gc.runs"] = fmt.Sprintf("%d", db.vlogGenerationGCRuns.Load()) + 
stats["treedb.cache.vlog_generation.gc.exec.total_ms"] = fmt.Sprintf("%.3f", float64(gcExecTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.gc.exec.max_ms"] = fmt.Sprintf("%.3f", float64(gcExecMaxNS)/float64(time.Millisecond)) + if gcRuns > 0 { + stats["treedb.cache.vlog_generation.gc.exec.avg_ms"] = fmt.Sprintf("%.3f", (float64(gcExecTotalNS)/float64(gcRuns))/float64(time.Millisecond)) + } else { + stats["treedb.cache.vlog_generation.gc.exec.avg_ms"] = "0.000" + } stats["treedb.cache.vlog_generation.gc.last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastGCUnixNano.Load()) stats["treedb.cache.vlog_generation.gc.dry_run.last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationLastGCDryRunUnixNano.Load()) stats["treedb.cache.vlog_generation.gc.dry_run.last_eligible_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCDryRunBytesEligible.Load()) @@ -19663,6 +19840,13 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.vacuum.skipped_disabled"] = fmt.Sprintf("%d", db.vlogGenerationVacuumSkippedDisabled.Load()) stats["treedb.cache.vlog_generation.vacuum.skipped_rewrite_bytes"] = fmt.Sprintf("%d", db.vlogGenerationVacuumSkippedRewriteBytes.Load()) stats["treedb.cache.vlog_generation.vacuum.skipped_cooldown"] = fmt.Sprintf("%d", db.vlogGenerationVacuumSkippedCooldown.Load()) + stats["treedb.cache.vlog_generation.vacuum.exec.total_ms"] = fmt.Sprintf("%.3f", float64(vacuumExecTotalNS)/float64(time.Millisecond)) + stats["treedb.cache.vlog_generation.vacuum.exec.max_ms"] = fmt.Sprintf("%.3f", float64(vacuumExecMaxNS)/float64(time.Millisecond)) + if vacuumRuns > 0 { + stats["treedb.cache.vlog_generation.vacuum.exec.avg_ms"] = fmt.Sprintf("%.3f", (float64(vacuumExecTotalNS)/float64(vacuumRuns))/float64(time.Millisecond)) + } else { + stats["treedb.cache.vlog_generation.vacuum.exec.avg_ms"] = "0.000" + } stats["treedb.cache.vlog_generation.vacuum.last_unix_nano"] = fmt.Sprintf("%d", 
db.vlogGenerationLastVacuumUnixNano.Load()) stats["treedb.cache.vlog_generation.remap.successes"] = fmt.Sprintf("%d", db.vlogGenerationRemapSuccesses.Load()) stats["treedb.cache.vlog_generation.remap.failures"] = fmt.Sprintf("%d", db.vlogGenerationRemapFailures.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 1de7c37bb..c98a0b425 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -5598,3 +5598,124 @@ func TestVlogGenerationGC_SkipsDuringRecentForegroundWrites(t *testing.T) { t.Fatalf("gc calls=%d/%d want 0/0 while foreground writes are hot", dryRunCalls, realCalls) } } + +func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { + prepareDirectSchedulerTest(t) + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{DB: backend} + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + + db.vlogGenerationMaintenanceAcquired.Store(2) + db.vlogGenerationMaintenancePassTotalNanos.Store(uint64((40 * time.Millisecond).Nanoseconds())) + db.vlogGenerationMaintenancePassMaxNanos.Store(uint64((30 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewritePlanRuns.Store(4) + db.vlogGenerationRewritePlanTotalNanos.Store(uint64((80 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewritePlanMaxNanos.Store(uint64((50 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewriteRuns.Store(3) + db.vlogGenerationRewriteExecTotalNanos.Store(uint64((150 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewriteExecMaxNanos.Store(uint64((70 * time.Millisecond).Nanoseconds())) + db.vlogGenerationGCRuns.Store(2) + db.vlogGenerationGCExecTotalNanos.Store(uint64((60 * time.Millisecond).Nanoseconds())) + db.vlogGenerationGCExecMaxNanos.Store(uint64((35 * 
time.Millisecond).Nanoseconds())) + db.vlogGenerationVacuumRuns.Store(2) + db.vlogGenerationVacuumExecTotalNanos.Store(uint64((44 * time.Millisecond).Nanoseconds())) + db.vlogGenerationVacuumExecMaxNanos.Store(uint64((25 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewriteBudgetTokensBytes.Store(512) + db.vlogGenerationRewriteBudgetConsumed.Store(1536) + db.vlogGenerationRewriteAgeBlockedUntilNS.Store(time.Now().Add(5 * time.Second).UnixNano()) + + db.vlogGenerationRewriteQueueMu.Lock() + db.vlogGenerationRewriteQueueLoaded = true + db.vlogGenerationRewriteQueue = []uint32{11, 12} + db.vlogGenerationRewriteLedger = []backenddb.ValueLogRewritePlanSegment{ + {FileID: 11, BytesTotal: 1000, BytesLive: 700, BytesStale: 300}, + {FileID: 12, BytesTotal: 500, BytesLive: 500, BytesStale: 0}, + } + db.vlogGenerationRewritePenalties = map[uint32]valueLogGenerationRewritePenalty{ + 11: {Attempts: 1, CooldownUntilUnixNano: time.Now().Add(time.Minute).UnixNano()}, + } + db.vlogGenerationRewriteStagePending = true + db.vlogGenerationRewriteStageObservedUnixNano = 1234 + db.vlogGenerationRewriteQueueMu.Unlock() + + stats := db.Stats() + if got := stats["treedb.cache.vlog_generation.maintenance.pass.total_ms"]; got != "40.000" { + t.Fatalf("maintenance pass total ms=%q want 40.000", got) + } + if got := stats["treedb.cache.vlog_generation.maintenance.pass.max_ms"]; got != "30.000" { + t.Fatalf("maintenance pass max ms=%q want 30.000", got) + } + if got := stats["treedb.cache.vlog_generation.maintenance.pass.avg_ms"]; got != "20.000" { + t.Fatalf("maintenance pass avg ms=%q want 20.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan.total_ms"]; got != "80.000" { + t.Fatalf("rewrite plan total ms=%q want 80.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan.avg_ms"]; got != "20.000" { + t.Fatalf("rewrite plan avg ms=%q want 20.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.total_ms"]; got != 
"150.000" { + t.Fatalf("rewrite exec total ms=%q want 150.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.avg_ms"]; got != "50.000" { + t.Fatalf("rewrite exec avg ms=%q want 50.000", got) + } + if got := stats["treedb.cache.vlog_generation.gc.exec.total_ms"]; got != "60.000" { + t.Fatalf("gc exec total ms=%q want 60.000", got) + } + if got := stats["treedb.cache.vlog_generation.gc.exec.avg_ms"]; got != "30.000" { + t.Fatalf("gc exec avg ms=%q want 30.000", got) + } + if got := stats["treedb.cache.vlog_generation.vacuum.exec.total_ms"]; got != "44.000" { + t.Fatalf("vacuum exec total ms=%q want 44.000", got) + } + if got := stats["treedb.cache.vlog_generation.vacuum.exec.avg_ms"]; got != "22.000" { + t.Fatalf("vacuum exec avg ms=%q want 22.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_segments"]; got != "2" { + t.Fatalf("rewrite ledger segments=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_total"]; got != "1500" { + t.Fatalf("rewrite ledger bytes total=%q want 1500", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_live"]; got != "1200" { + t.Fatalf("rewrite ledger bytes live=%q want 1200", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_bytes_stale"]; got != "300" { + t.Fatalf("rewrite ledger bytes stale=%q want 300", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm"]; got != "200000" { + t.Fatalf("rewrite ledger stale ratio ppm=%q want 200000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.stage_pending"]; got != "true" { + t.Fatalf("rewrite stage pending=%q want true", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.stage_observed_unix_nano"]; got != "1234" { + t.Fatalf("rewrite stage observed=%q want 1234", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.penalties_active"]; got != "1" { + t.Fatalf("rewrite penalties 
active=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.age_blocked_remaining_ms"]; got == "0" { + t.Fatalf("rewrite age blocked remaining ms=%q want >0", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.tokens_bytes"]; got != "512" { + t.Fatalf("rewrite budget tokens bytes=%q want 512", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"]; got != "1536" { + t.Fatalf("rewrite budget consumed=%q want 1536", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.tokens_cap_bytes"]; got == "0" { + t.Fatalf("rewrite budget cap bytes=%q want non-zero", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct"]; got == "" { + t.Fatalf("rewrite budget utilization pct missing") + } +} diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index b41f022dd..9eebb55ec 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -117,3 +117,31 @@ - Interpretation: - Retry coalescing delivered the major contention reduction. - Periodic preflight further reduced maintenance churn/noop entries while preserving rewrite+vacuum progress in this early-state-sync window. 
+ +- Instrumentation-first follow-up for incremental rewrite economics: + - code changes: + - `TreeDB/caching/db.go` + - added maintenance/rewrite/gc/vacuum duration counters and stats: + - `treedb.cache.vlog_generation.maintenance.pass.{total,max,avg}_ms` + - `treedb.cache.vlog_generation.rewrite.plan.{total,max,avg}_ms` + - `treedb.cache.vlog_generation.rewrite.exec.{total,max,avg}_ms` + - `treedb.cache.vlog_generation.gc.exec.{total,max,avg}_ms` + - `treedb.cache.vlog_generation.vacuum.exec.{total,max,avg}_ms` + - added rewrite backlog/debt visibility stats: + - `treedb.cache.vlog_generation.rewrite.ledger_segments` + - `treedb.cache.vlog_generation.rewrite.ledger_bytes_{total,live,stale}` + - `treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm` + - `treedb.cache.vlog_generation.rewrite.stage_{pending,observed_unix_nano}` + - `treedb.cache.vlog_generation.rewrite.penalties_active` + - `treedb.cache.vlog_generation.rewrite.age_blocked_{until_unix_nano,remaining_ms}` + - added rewrite budget execution stats: + - `treedb.cache.vlog_generation.rewrite_budget.tokens_{bytes,cap_bytes}` + - `treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct` + - `treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total` + - tracked rewrite-budget token consumption inside `vlogGenerationConsumeRewriteBudgetBytes`. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - added `TestVlogGenerationStats_ReportRewriteBacklogAndDurations`. 
+ +- Validation: + - `go test ./TreeDB/caching -run TestVlogGenerationStats_ReportRewriteBacklogAndDurations -count=1` + - `go test ./TreeDB/caching -count=1` From 7cc50d6de34c87cc60d6991e308d96bb03ad62df Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 18:47:07 -1000 Subject: [PATCH 06/61] treedb: stop maintenance retry collision amplification --- TreeDB/caching/db.go | 36 +++++---------- .../caching/vlog_generation_scheduler_test.go | 13 ++++++ worklog/2026-03-27.md | 44 +++++++++++++++++++ 3 files changed, 68 insertions(+), 25 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 20c877be7..256613925 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -13047,35 +13047,21 @@ func (db *DB) runVlogGenerationMaintenanceRetries(opts vlogGenerationMaintenance deadline := time.Now().Add(retryWindow) sleepDelay := 10 * time.Millisecond for !db.closing.Load() { - // Once retry intent is already queued, avoid repeatedly colliding with - // the active maintenance pass; wait for release or deadline instead. + // Retry loops should never hammer an already-active maintenance pass. + // Wait for release/deadline instead of repeatedly colliding and inflating + // maintenance.attempts/collisions under hot checkpoint-kick activity. 
if db.vlogGenerationMaintenanceActive.Load() { - if stopWhenAcquired && db.vlogGenerationDeferredMaintenancePending.Load() { - if time.Now().After(deadline) { - return - } - time.Sleep(sleepDelay) - if sleepDelay < 100*time.Millisecond { - sleepDelay *= 2 - if sleepDelay > 100*time.Millisecond { - sleepDelay = 100 * time.Millisecond - } - } - continue + if time.Now().After(deadline) { + return } - if !stopWhenAcquired && db.vlogGenerationCheckpointKickPending.Load() { - if time.Now().After(deadline) { - return - } - time.Sleep(sleepDelay) - if sleepDelay < 100*time.Millisecond { - sleepDelay *= 2 - if sleepDelay > 100*time.Millisecond { - sleepDelay = 100 * time.Millisecond - } + time.Sleep(sleepDelay) + if sleepDelay < 100*time.Millisecond { + sleepDelay *= 2 + if sleepDelay > 100*time.Millisecond { + sleepDelay = 100 * time.Millisecond } - continue } + continue } attempt++ if opts.debugSource != "" { diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index c98a0b425..7bb0453e3 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -392,6 +392,19 @@ func TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries(t t.Fatalf("checkpoint pending retry collisions=%d want=0", got) } + db.vlogGenerationMaintenanceActive.Store(true) + db.vlogGenerationCheckpointKickPending.Store(false) + db.runVlogGenerationMaintenanceRetries(vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: false, + rewriteDebtDrain: true, + debugSource: "checkpoint_pending", + }, 30*time.Millisecond, false) + if got := db.vlogGenerationMaintenanceCollisions.Load(); got != 0 { + t.Fatalf("checkpoint retry collisions while active=%d want=0", got) + } + db.vlogGenerationMaintenanceActive.Store(true) db.vlogGenerationDeferredMaintenancePending.Store(true) 
db.runVlogGenerationMaintenanceRetries(vlogGenerationMaintenanceOptions{ diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index 9eebb55ec..b97f3e2e0 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -145,3 +145,47 @@ - Validation: - `go test ./TreeDB/caching -run TestVlogGenerationStats_ReportRewriteBacklogAndDurations -count=1` - `go test ./TreeDB/caching -count=1` + +- `run_celestia` instrumentation readout (application.db instance in expvar snapshots): + - run (`STOP_AT_LOCAL_HEIGHT=500`): + - home: `/home/mikers/.celestia-app-mainnet-treedb-20260327181906` + - key counters at peak snapshot: + - `maintenance.attempts=45`, `acquired=41`, `collisions=4` + - `rewrite.plan_runs=2`, `rewrite.plan_selected=2`, `rewrite.runs=1` + - `gc.runs=1`, `vacuum.runs=1` + - `rewrite_budget.consumed_bytes_total=33073153` + - offline compaction sanity check on that run: + - pre: `du -sb application.db = 4679915182` + - `treemap vlog-rewrite ... -rw` output: `segments_before=20 segments_after=15 bytes_before=4607146646 bytes_after=1983182186 records=957832` + - post: `du -sb application.db = 2021086813` + +- Longer-window stress run exposed retry-collision amplification: + - run (`STOP_AT_LOCAL_HEIGHT=2000`): + - home: `/home/mikers/.celestia-app-mainnet-treedb-20260327182903` + - timeline showed `acquired` flat while attempts/collisions spiked: + - snapshot progression reached `maintenance.attempts=333`, `collisions=304`, `acquired=29` + - `rewrite.plan_runs=3` but `rewrite.plan_selected=0`, `rewrite.runs=0` + - `checkpoint_kick.pending=true` persisted during collision growth + +- Fix for retry-collision amplification: + - code changes: + - `TreeDB/caching/db.go` + - `runVlogGenerationMaintenanceRetries`: when `maintenanceActive` is true, always back off/wait until release/deadline instead of conditionally attempting based on pending flags. 
+ - This avoids high-frequency CAS collisions from checkpoint-kick retry goroutines while a long maintenance pass is active. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries` with checkpoint-pending=false + active pass case to prevent regression. + - validation: + - `go test ./TreeDB/caching -run TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries -count=1` + - `go test ./TreeDB/caching -count=1` + +- Confirmation run after fix: + - run (`STOP_AT_LOCAL_HEIGHT=2000`): + - home: `/home/mikers/.celestia-app-mainnet-treedb-20260327184030` + - comparable snapshot (`20260327184427`) vs pre-fix bad snapshot (`20260327183236`): + - `maintenance.attempts: 333 -> 38` + - `maintenance.acquired: 29 -> 38` + - `maintenance.collisions: 304 -> 0` + - `rewrite.plan_selected: 0 -> 2` + - `rewrite.runs: 0 -> 1` + - `vacuum.runs: 0 -> 1` + - `rewrite_budget.consumed_bytes_total: 0 -> 33073906` From 00355572957bb6532eec535392a4102c4df1287f Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 19:30:09 -1000 Subject: [PATCH 07/61] treedb: add stage-gate and rewrite segment counters --- TreeDB/caching/db.go | 25 ++++++++++ .../caching/vlog_generation_scheduler_test.go | 16 +++++++ worklog/2026-03-27.md | 47 +++++++++++++++++++ 3 files changed, 88 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 256613925..90e2119f6 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5181,6 +5181,7 @@ type DB struct { vlogGenerationRewritePlanErrors atomic.Uint64 vlogGenerationRewritePlanEmpty atomic.Uint64 vlogGenerationRewritePlanSelected atomic.Uint64 + vlogGenerationRewritePlanSelectedSegments atomic.Uint64 vlogGenerationRewritePlanSelectedBytes atomic.Uint64 vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 @@ -5220,6 +5221,8 @@ type DB struct { 
vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 vlogGenerationMaintenanceSkipPhase atomic.Uint64 vlogGenerationMaintenanceSkipStageGate atomic.Uint64 + vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 + vlogGenerationMaintenanceSkipStageDue atomic.Uint64 vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 vlogGenerationMaintenanceSkipPriority atomic.Uint64 vlogGenerationMaintenanceSkipQuiet atomic.Uint64 @@ -5259,6 +5262,7 @@ type DB struct { vlogGenerationRewritePlanMaxNanos atomic.Uint64 vlogGenerationRewriteExecTotalNanos atomic.Uint64 vlogGenerationRewriteExecMaxNanos atomic.Uint64 + vlogGenerationRewriteExecSourceSegments atomic.Uint64 vlogGenerationGCExecTotalNanos atomic.Uint64 vlogGenerationGCExecMaxNanos atomic.Uint64 vlogGenerationVacuumExecTotalNanos atomic.Uint64 @@ -12531,6 +12535,18 @@ func (db *DB) observeVlogGenerationRewritePlanOutcomeWithDuration(plan backenddb } if len(plan.SourceFileIDs) > 0 || len(plan.SelectedSegments) > 0 || plan.SegmentsSelected > 0 { db.vlogGenerationRewritePlanSelected.Add(1) + selectedSegments := plan.SegmentsSelected + if selectedSegments <= 0 { + switch { + case len(plan.SelectedSegments) > 0: + selectedSegments = len(plan.SelectedSegments) + case len(plan.SourceFileIDs) > 0: + selectedSegments = len(plan.SourceFileIDs) + } + } + if selectedSegments > 0 { + db.vlogGenerationRewritePlanSelectedSegments.Add(uint64(selectedSegments)) + } selectedTotal := plan.SelectedBytesTotal selectedLive := plan.SelectedBytesLive selectedStale := plan.SelectedBytesStale @@ -13257,6 +13273,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // has elapsed. The only valid next step is to wait for confirmation. 
if !vlogGenerationIsStageConfirmSource(opts) { db.vlogGenerationMaintenanceSkipStageGate.Add(1) + db.vlogGenerationMaintenanceSkipStageNotDue.Add(1) return } } else if !vlogGenerationIsStageConfirmSource(opts) { @@ -13264,6 +13281,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // explicit stage-confirm wake instead of letting generic retries or // periodic passes reacquire it first. db.vlogGenerationMaintenanceSkipStageGate.Add(1) + db.vlogGenerationMaintenanceSkipStageDue.Add(1) return } } @@ -14008,6 +14026,9 @@ planned: } db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) db.vlogGenerationRewriteRuns.Add(1) + if sourceSegments := len(rewriteOpts.SourceFileIDs); sourceSegments > 0 { + db.vlogGenerationRewriteExecSourceSegments.Add(uint64(sourceSegments)) + } rewriteBytesIn := int64(0) if processedLedgerOK { rewriteBytesIn = processedLedgerLiveBytes @@ -19706,6 +19727,8 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.maintenance.skip.wal_on_periodic"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipWALOnPeriodic.Load()) stats["treedb.cache.vlog_generation.maintenance.skip.maintenance_phase"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipPhase.Load()) stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipStageGate.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipStageNotDue.Load()) + stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipStageDue.Load()) stats["treedb.cache.vlog_generation.maintenance.skip.age_blocked_gate"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipAgeBlocked.Load()) stats["treedb.cache.vlog_generation.maintenance.skip.priority_pending"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipPriority.Load()) 
stats["treedb.cache.vlog_generation.maintenance.skip.quiet_window"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceSkipQuiet.Load()) @@ -19778,9 +19801,11 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.plan_errors"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanErrors.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_empty"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanEmpty.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelected.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_live"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedLiveBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedStaleBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledLastNS.Load()) stats["treedb.cache.vlog_generation.rewrite.queue_prune_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteQueuePruneRuns.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 7bb0453e3..34d81d1a6 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -5643,6 +5643,10 @@ func 
TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationRewriteBudgetTokensBytes.Store(512) db.vlogGenerationRewriteBudgetConsumed.Store(1536) db.vlogGenerationRewriteAgeBlockedUntilNS.Store(time.Now().Add(5 * time.Second).UnixNano()) + db.vlogGenerationMaintenanceSkipStageNotDue.Store(5) + db.vlogGenerationMaintenanceSkipStageDue.Store(2) + db.vlogGenerationRewritePlanSelectedSegments.Store(6) + db.vlogGenerationRewriteExecSourceSegments.Store(3) db.vlogGenerationRewriteQueueMu.Lock() db.vlogGenerationRewriteQueueLoaded = true @@ -5731,4 +5735,16 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct"]; got == "" { t.Fatalf("rewrite budget utilization pct missing") } + if got := stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due"]; got != "5" { + t.Fatalf("maintenance skip stage gate not due=%q want 5", got) + } + if got := stats["treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved"]; got != "2" { + t.Fatalf("maintenance skip stage gate due reserved=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"]; got != "6" { + t.Fatalf("rewrite plan selected segments total=%q want 6", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"]; got != "3" { + t.Fatalf("rewrite exec source segments total=%q want 3", got) + } } diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index b97f3e2e0..29c5b7b66 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -189,3 +189,50 @@ - `rewrite.runs: 0 -> 1` - `vacuum.runs: 0 -> 1` - `rewrite_budget.consumed_bytes_total: 0 -> 33073906` + +- Stage-gate/selection observability follow-up (live rewrite throughput diagnosis): + - code changes: + - `TreeDB/caching/db.go` + - added stage-gate split counters: + - 
`treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due` + - `treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved` + - added rewrite selection/execution segment counters: + - `treedb.cache.vlog_generation.rewrite.plan_selected_segments_total` + - `treedb.cache.vlog_generation.rewrite.exec.source_segments_total` + - incremented counters in: + - stage-gate early-return branches (`not_due` vs `due_reserved`) + - rewrite-plan outcome accounting (selected segments) + - rewrite execution completion (source segments executed) + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestVlogGenerationStats_ReportRewriteBacklogAndDurations` assertions for new keys. + +- Validation: + - `go test ./TreeDB/caching -count=1` + +- `run_celestia` run with new counters (baseline fast profile, no profile trigger override): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 STOP_AT_LOCAL_HEIGHT=2000 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327192126` + - final snapshot: + - `pprof-heap-max-rss-final-7983364k-20260327192636.treedb_vars.json` + - key counters: + - `maintenance.attempts=38`, `acquired=38`, `collisions=0` + - `rewrite.plan_runs=2`, `rewrite.plan_selected=2` + - `rewrite.plan_selected_segments_total=3` + - `rewrite.runs=1`, `rewrite.exec.source_segments_total=1` + - `rewrite.bytes_in=33073442`, `rewrite.reclaimed_bytes=0` + - `maintenance.skip.stage_gate=7` + - `maintenance.skip.stage_gate_not_due=7` + - `maintenance.skip.stage_gate_due_reserved=0` + - interpretation: + - planner selected more segment debt than was executed in-run (`3 selected vs 1 executed`). + - stage gating was entirely waiting-for-confirmation (`not_due`), not due-slot reservation. + +- Offline rewrite delta for same run home: + - pre: `du -sb application.db = 4707839386` + - `treemap vlog-rewrite ... 
-rw` output: + - `segments_before=20 segments_after=16 bytes_before=4637168004 bytes_after=2039183405 records=964467` + - post: `du -sb application.db = 2077350273` + - interpretation: + - live run still leaves substantial reclaimable headroom; new counters indicate confirmation-gated debt progression as one concrete limiter. From 0dd0b18bbb5a6daa72363cb58731e0ba81fe7898 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 19:59:59 -1000 Subject: [PATCH 08/61] treedb: speed staged rewrite debt progression --- TreeDB/caching/db.go | 18 +++-- .../caching/vlog_generation_scheduler_test.go | 70 +++++++++++++++++++ worklog/2026-03-27.md | 56 +++++++++++++++ 3 files changed, 139 insertions(+), 5 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 90e2119f6..027e7ca12 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -12306,7 +12306,7 @@ func (db *DB) vlogGenerationRewriteMaxSegmentsForRun(queueLen int, budgetTokens } // Checkpoint-kick retries should keep each debt-drain run small to reduce // write amplification when foreground ingest is still active. - if opts.bypassQuiet && !opts.skipCheckpoint { + if opts.bypassQuiet && !opts.skipCheckpoint && !vlogGenerationIsStageConfirmSource(opts) && !vlogGenerationIsAgeBlockedSource(opts) { return 1 } maxSegments = queueLen @@ -13513,7 +13513,9 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog } confirmed := stableVlogGenerationRewriteLedgerSegments(stagedLedger, plan.SelectedSegments) if len(confirmed) > 0 { - plan = filterVlogGenerationRewritePlanToSegments(plan, confirmed) + // Treat confirmation overlap as a stability signal, then run + // the current sparse plan (not just the overlap subset) so live + // maintenance can make forward progress within short sync windows. shouldRewrite = true reason = vlogGenerationReasonRewriteResume } else { @@ -13790,9 +13792,15 @@ planned: } } rewriteQueue = append([]uint32(nil), rewritePlan.SourceFileIDs...) 
- // Do not debt-drain freshly planned work in the same pass; only apply - // multi-segment debt-drain to explicit resume queues. - rewriteMaxSegments = vlogGenerationRewriteResumeMaxSegments + // Do not debt-drain freshly planned work in the same pass. The only + // exception is a confirmed staged rewrite-resume pass, which should + // be allowed to consume debt in bounded multi-segment chunks. + allowPlanDebtDrain := reason == vlogGenerationReasonRewriteResume && opts.rewriteDebtDrain + if allowPlanDebtDrain { + rewriteMaxSegments = db.vlogGenerationRewriteMaxSegmentsForRun(len(rewriteQueue), budgetTokens, opts) + } else { + rewriteMaxSegments = vlogGenerationRewriteResumeMaxSegments + } // If the token bucket is enabled and empty, persist the plan/ledger but // skip running the rewrite until we have budget to spend. if db.vlogGenerationRewriteBudgetEnabled() && budgetTokens <= 0 { diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 34d81d1a6..99f715ef1 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -2778,6 +2778,76 @@ func TestVlogGenerationRewritePlan_StageConfirmationExecutesConfirmedSubset(t *t } } +func TestVlogGenerationRewritePlan_StageConfirmationDebtDrainProcessesMultipleSegments(t *testing.T) { + prepareDirectSchedulerTest(t) + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + planResponse: backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11, 22, 33}, + SelectedSegments: []backenddb.ValueLogRewritePlanSegment{ + {FileID: 11, BytesTotal: 64 << 20, BytesLive: 8 << 20, BytesStale: 56 << 20, StaleRatio: 0.875}, + {FileID: 22, BytesTotal: 64 << 20, BytesLive: 16 << 20, BytesStale: 48 << 20, StaleRatio: 0.75}, + {FileID: 33, BytesTotal: 64 << 20, BytesLive: 24 << 
20, BytesStale: 40 << 20, StaleRatio: 0.625}, + }, + SegmentsTotal: 3, + SegmentsSelected: 3, + BytesTotal: 192 << 20, + BytesLive: 48 << 20, + BytesStale: 144 << 20, + SelectedBytesTotal: 192 << 20, + SelectedBytesLive: 48 << 20, + SelectedBytesStale: 144 << 20, + }, + rewriteResponse: backenddb.ValueLogRewriteStats{ + BytesBefore: 192 << 20, + BytesAfter: 48 << 20, + RecordsCopied: 3, + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + db.valueLogRewriteTriggerBytes = 0 + db.valueLogRewriteTriggerRatioPPM = 1 + db.valueLogGenerationHotTarget = 0 + db.vlogGenerationRewriteBudgetTokensBytes.Store(defaultVlogGenerationWarmTargetBytes * 4) + forceVlogMaintenanceIdle(db) + + if err := db.setVlogGenerationRewriteLedgerWithStage([]backenddb.ValueLogRewritePlanSegment{ + {FileID: 11, BytesTotal: 64 << 20, BytesLive: 8 << 20, BytesStale: 56 << 20, StaleRatio: 0.875}, + {FileID: 22, BytesTotal: 64 << 20, BytesLive: 16 << 20, BytesStale: 48 << 20, StaleRatio: 0.75}, + {FileID: 33, BytesTotal: 64 << 20, BytesLive: 24 << 20, BytesStale: 40 << 20, StaleRatio: 0.625}, + }, true, time.Now().Add(-vlogGenerationRewriteMinInterval-time.Second).UnixNano()); err != nil { + t.Fatalf("seed staged rewrite ledger: %v", err) + } + forceRewriteStageConfirmDue(t, db) + + db.maybeRunVlogGenerationMaintenanceWithOptions(false, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: false, + rewriteDebtDrain: true, + debugSource: "rewrite_stage_confirm", + }) + + rewriteOpts, rewriteCalls := recorder.recordedRewrite() + if rewriteCalls != 1 { + t.Fatalf("rewrite calls after staged confirmation=%d want=1", rewriteCalls) + } + if got := len(rewriteOpts.SourceFileIDs); got <= 1 { + t.Fatalf("rewrite SourceFileIDs after staged confirmation=%v want multiple ids", rewriteOpts.SourceFileIDs) + } + if got := len(rewriteOpts.SourceFileIDs); got > vlogGenerationRewriteDebtDrainMaxSegments { + t.Fatalf("rewrite 
SourceFileIDs len=%d want <= %d", got, vlogGenerationRewriteDebtDrainMaxSegments) + } +} + func TestVlogGenerationRewritePlan_StageConfirmationReplansEvenWhenOtherTriggersFire(t *testing.T) { prepareDirectSchedulerTest(t) diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index 29c5b7b66..6e490ca7a 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -236,3 +236,59 @@ - post: `du -sb application.db = 2077350273` - interpretation: - live run still leaves substantial reclaimable headroom; new counters indicate confirmation-gated debt progression as one concrete limiter. + +- Stage-confirm rewrite progression experiment (post-observability): + - hypothesis: + - live rewrite debt was bottlenecked by stage-confirm overlap collapse + single-segment execution, visible as `plan_selected_segments_total > rewrite.exec.source_segments_total`. + - code changes: + - `TreeDB/caching/db.go` + - `vlogGenerationRewriteMaxSegmentsForRun`: + - keep checkpoint-kick (`bypassQuiet && !skipCheckpoint`) capped to single-segment, + - but allow stage-confirm / age-blocked deferred sources to use bounded debt-drain sizing. + - rewrite execution path when `haveRewritePlan`: + - allow debt-drain sizing for confirmed `rewrite_resume` plans instead of forcing single-segment. + - stale-ratio staged confirmation handling: + - once overlap confirms stability, execute current sparse plan rather than filtering to overlap-only subset. + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - added `TestVlogGenerationRewritePlan_StageConfirmationDebtDrainProcessesMultipleSegments`. 
+ +- Validation: + - `go test ./TreeDB/caching -run 'TestVlogGenerationRewritePlan_StageConfirmationDebtDrainProcessesMultipleSegments|TestVlogGenerationRewriteQueue_CheckpointKickDebtDrainCapsSingleSegment|TestVlogGenerationRewriteQueue_DebtDrainProcessesMultipleSegments|TestVlogGenerationRewritePlan_StageConfirmationExecutesConfirmedSubset' -count=1` + - `go test ./TreeDB/caching -count=1` + - `go test ./TreeDB -count=1` + +- `run_celestia` comparison (same profile/height target): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 STOP_AT_LOCAL_HEIGHT=5000 ~/run_celestia.sh` + - pre-change reference home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327194254` + - final snapshot: `pprof-heap-max-rss-final-6937148k-20260327194740.treedb_vars.json` + - key counters: + - `plan_selected_segments_total=3` + - `rewrite.exec.source_segments_total=1` + - `rewrite.runs=1` + - `rewrite.bytes_in=33081912` + - `rewrite.reclaimed_bytes=0` + - post-change run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327195053` + - final snapshot: `pprof-heap-max-rss-7548312k-20260327195551.treedb_vars.json` + - key counters: + - `plan_selected_segments_total=3` + - `rewrite.exec.source_segments_total=2` + - `rewrite.runs=1` + - `rewrite.bytes_in=38276046` + - `rewrite.reclaimed_bytes=0` + +- offline rewrite deltas (headroom remains large): + - pre-change home `/home/mikers/.celestia-app-mainnet-treedb-20260327194254`: + - pre: `4645103594` + - post: `2016029251` + - tool output: `segments_before=20 segments_after=15 bytes_before=4583082964 bytes_after=1978124746 records=956586` + - post-change home `/home/mikers/.celestia-app-mainnet-treedb-20260327195053`: + - pre: `4653743667` + - post: `2022437394` + - tool output: `segments_before=20 segments_after=15 bytes_before=4598014513 bytes_after=1984532899 records=958463` + +- interpretation: + - stage-confirm policy change increased 
in-run rewritten source segments (`1 -> 2`) in a comparable 5000-height window. + - immediate live reclaim remains `0`, and offline compaction still cuts ~2.6 GiB, so major headroom remains. From 17a907ec3d3e627990d9b38de18384eac83e5d2c Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 20:24:57 -1000 Subject: [PATCH 09/61] treedb: add rewrite no-reclaim diagnostics --- TreeDB/caching/db.go | 22 +++++++++ .../caching/vlog_generation_scheduler_test.go | 16 +++++++ worklog/2026-03-27.md | 46 +++++++++++++++++++ 3 files changed, 84 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 027e7ca12..1baf6be7d 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5175,6 +5175,10 @@ type DB struct { vlogGenerationRewriteBytesIn atomic.Uint64 vlogGenerationRewriteBytesOut atomic.Uint64 vlogGenerationRewriteReclaimedBytes atomic.Uint64 + vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 + vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 + vlogGenerationRewriteNoReclaimRuns atomic.Uint64 + vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 vlogGenerationRewriteRuns atomic.Uint64 vlogGenerationRewritePlanRuns atomic.Uint64 vlogGenerationRewritePlanCanceled atomic.Uint64 @@ -13964,6 +13968,14 @@ planned: db.vlogGenerationRewriteReclaimedBytes.Add(uint64(effectiveBytesBefore - effectiveBytesAfter)) } locallyEffectiveProcessedDebt := len(processedRewriteIDs) > 0 && processedLedgerOK && processedLedgerStaleBytes > 0 && stats.RecordsCopied > 0 + if processedLedgerOK { + if processedLedgerLiveBytes > 0 { + db.vlogGenerationRewriteProcessedLiveBytes.Add(uint64(processedLedgerLiveBytes)) + } + if processedLedgerStaleBytes > 0 { + db.vlogGenerationRewriteProcessedStaleBytes.Add(uint64(processedLedgerStaleBytes)) + } + } if effectiveBytesBefore > 0 && effectiveBytesAfter >= effectiveBytesBefore && !locallyEffectiveProcessedDebt { db.vlogGenerationRewriteIneffectiveRuns.Add(1) 
db.vlogGenerationRewriteIneffectiveBytesIn.Add(uint64(effectiveBytesBefore)) @@ -13994,6 +14006,12 @@ planned: } } if locallyEffectiveProcessedDebt { + if effectiveBytesAfter >= effectiveBytesBefore { + db.vlogGenerationRewriteNoReclaimRuns.Add(1) + if processedLedgerStaleBytes > 0 { + db.vlogGenerationRewriteNoReclaimStaleBytes.Add(uint64(processedLedgerStaleBytes)) + } + } db.debugVlogMaintf( "rewrite_effective_local reason=%s processed_ids=%d planned_total=%d planned_live=%d planned_stale=%d global_bytes_before=%d global_bytes_after=%d gc_bytes_deleted=%d records=%d", vlogGenerationReasonString(reason), @@ -19802,6 +19820,10 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.segments.cold"] = fmt.Sprintf("%d", retained.SegmentsCold) stats["treedb.cache.vlog_generation.rewrite.bytes_in"] = fmt.Sprintf("%d", db.vlogGenerationRewriteBytesIn.Load()) stats["treedb.cache.vlog_generation.rewrite.bytes_out"] = fmt.Sprintf("%d", db.vlogGenerationRewriteBytesOut.Load()) + stats["treedb.cache.vlog_generation.rewrite.processed_live_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteProcessedLiveBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.processed_stale_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteProcessedStaleBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.no_reclaim_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteNoReclaimRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteNoReclaimStaleBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_canceled"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanCanceled.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go 
b/TreeDB/caching/vlog_generation_scheduler_test.go index 99f715ef1..230f09573 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -5717,6 +5717,10 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationMaintenanceSkipStageDue.Store(2) db.vlogGenerationRewritePlanSelectedSegments.Store(6) db.vlogGenerationRewriteExecSourceSegments.Store(3) + db.vlogGenerationRewriteProcessedLiveBytes.Store(900) + db.vlogGenerationRewriteProcessedStaleBytes.Store(450) + db.vlogGenerationRewriteNoReclaimRuns.Store(3) + db.vlogGenerationRewriteNoReclaimStaleBytes.Store(320) db.vlogGenerationRewriteQueueMu.Lock() db.vlogGenerationRewriteQueueLoaded = true @@ -5817,4 +5821,16 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"]; got != "3" { t.Fatalf("rewrite exec source segments total=%q want 3", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.processed_live_bytes"]; got != "900" { + t.Fatalf("rewrite processed live bytes=%q want 900", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.processed_stale_bytes"]; got != "450" { + t.Fatalf("rewrite processed stale bytes=%q want 450", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.no_reclaim_runs"]; got != "3" { + t.Fatalf("rewrite no reclaim runs=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes"]; got != "320" { + t.Fatalf("rewrite no reclaim stale bytes=%q want 320", got) + } } diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index 6e490ca7a..9068195d2 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -292,3 +292,49 @@ - interpretation: - stage-confirm policy change increased in-run rewritten source segments (`1 -> 2`) in a comparable 5000-height window. 
- immediate live reclaim remains `0`, and offline compaction still cuts ~2.6 GiB, so major headroom remains. + +- No-reclaim diagnostics instrumentation for live rewrite: + - code changes: + - `TreeDB/caching/db.go` + - added rewrite economics counters: + - `treedb.cache.vlog_generation.rewrite.processed_live_bytes` + - `treedb.cache.vlog_generation.rewrite.processed_stale_bytes` + - `treedb.cache.vlog_generation.rewrite.no_reclaim_runs` + - `treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes` + - counters update in rewrite execution path: + - accumulate processed live/stale bytes from processed ledger chunk + - mark `no_reclaim_runs` when rewrite copied stale debt but global bytes did not fall in-pass + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestVlogGenerationStats_ReportRewriteBacklogAndDurations` assertions for new keys. + +- Validation: + - `go test ./TreeDB/caching -run 'TestVlogGenerationStats_ReportRewriteBacklogAndDurations|TestVlogGenerationRewritePlan_StageConfirmationDebtDrainProcessesMultipleSegments|TestVlogGenerationRewriteQueue_CheckpointKickDebtDrainCapsSingleSegment' -count=1` + - `go test ./TreeDB/caching -count=1` + - `go test ./TreeDB -count=1` + +- `run_celestia` readout with new counters: + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 STOP_AT_LOCAL_HEIGHT=5000 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327201801` + - final snapshot: + - `pprof-heap-max-rss-final-7767828k-20260327202312.treedb_vars.json` + - key counters: + - `plan_selected_segments_total=3` + - `rewrite.exec.source_segments_total=2` + - `rewrite.runs=1` + - `rewrite.bytes_in=38292854` + - `rewrite.processed_live_bytes=38292854` + - `rewrite.processed_stale_bytes=498581006` + - `rewrite.no_reclaim_runs=1` + - `rewrite.no_reclaim_stale_bytes=498581006` + - `rewrite.reclaimed_bytes=0` + - `gc.deleted_bytes=0` + 
+- interpretation: + - live rewrite now clearly reports that substantial stale payload was processed in-pass (~498 MiB) with zero immediate reclaim, confirming reclaim is blocked/deferred downstream of selection+copy. + +- offline rewrite sanity check for same run: + - pre: `4747763395` + - post: `2064528109` + - tool output: `segments_before=20 segments_after=16 bytes_before=4674175679 bytes_after=2026328485 records=963752` From b8d918682545e3c5b0102b6385f6a1392ae71854 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 20:50:28 -1000 Subject: [PATCH 10/61] treedb: export gc blocker classification stats --- TreeDB/caching/db.go | 44 +++++++++++++++++ .../caching/vlog_generation_scheduler_test.go | 48 +++++++++++++++++++ TreeDB/db/vlog_gc.go | 8 ++++ worklog/2026-03-27.md | 27 +++++++++++ 4 files changed, 127 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 1baf6be7d..f26ae16eb 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5216,6 +5216,18 @@ type DB struct { vlogGenerationLastGCDryRunUnixNano atomic.Int64 vlogGenerationLastGCDryRunBytesEligible atomic.Int64 vlogGenerationLastGCDryRunSegsEligible atomic.Int64 + vlogGenerationLastGCBytesReferenced atomic.Int64 + vlogGenerationLastGCSegmentsReferenced atomic.Int64 + vlogGenerationLastGCBytesActive atomic.Int64 + vlogGenerationLastGCSegmentsActive atomic.Int64 + vlogGenerationLastGCBytesProtected atomic.Int64 + vlogGenerationLastGCSegmentsProtected atomic.Int64 + vlogGenerationLastGCBytesEligible atomic.Int64 + vlogGenerationLastGCSegmentsEligible atomic.Int64 + vlogGenerationLastGCBytesDeleted atomic.Int64 + vlogGenerationLastGCSegmentsDeleted atomic.Int64 + vlogGenerationLastGCBytesPending atomic.Int64 + vlogGenerationLastGCSegmentsPending atomic.Int64 vlogGenerationChurnBytes atomic.Uint64 vlogGenerationSchedulerState atomic.Uint32 vlogGenerationMaintenanceActive atomic.Bool @@ -12497,6 +12509,24 @@ func (db *DB) observeVlogGenerationGCExecDuration(d 
time.Duration) { observeDurationNanos(&db.vlogGenerationGCExecTotalNanos, &db.vlogGenerationGCExecMaxNanos, d) } +func (db *DB) observeVlogGenerationGCStats(stats backenddb.ValueLogGCStats) { + if db == nil { + return + } + db.vlogGenerationLastGCBytesReferenced.Store(stats.BytesReferenced) + db.vlogGenerationLastGCSegmentsReferenced.Store(int64(stats.SegmentsReferenced)) + db.vlogGenerationLastGCBytesActive.Store(stats.BytesActive) + db.vlogGenerationLastGCSegmentsActive.Store(int64(stats.SegmentsActive)) + db.vlogGenerationLastGCBytesProtected.Store(stats.BytesProtected) + db.vlogGenerationLastGCSegmentsProtected.Store(int64(stats.SegmentsProtected)) + db.vlogGenerationLastGCBytesEligible.Store(stats.BytesEligible) + db.vlogGenerationLastGCSegmentsEligible.Store(int64(stats.SegmentsEligible)) + db.vlogGenerationLastGCBytesDeleted.Store(stats.BytesDeleted) + db.vlogGenerationLastGCSegmentsDeleted.Store(int64(stats.SegmentsDeleted)) + db.vlogGenerationLastGCBytesPending.Store(stats.BytesPending) + db.vlogGenerationLastGCSegmentsPending.Store(int64(stats.SegmentsPending)) +} + func (db *DB) observeVlogGenerationVacuumExecDuration(d time.Duration) { if db == nil { return @@ -13955,6 +13985,7 @@ planned: db.debugVlogMaintf("gc_after_rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), gcErr, float64(gcDur.Microseconds())/1000) return fmt.Errorf("generational gc after rewrite: %w", gcErr) } + db.observeVlogGenerationGCStats(gcStats) if gcStats.BytesDeleted > 0 { gcBytesDeleted = int64(gcStats.BytesDeleted) effectiveBytesAfter -= gcBytesDeleted @@ -14163,6 +14194,7 @@ planned: if err != nil { return fmt.Errorf("generational gc: %w", err) } + db.observeVlogGenerationGCStats(gcStats) db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) db.vlogGenerationGCRuns.Add(1) if gcStats.SegmentsDeleted > 0 { @@ -19864,6 +19896,18 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.last_unix_nano"] = 
fmt.Sprintf("%d", db.vlogGenerationLastRewriteUnixNano.Load()) stats["treedb.cache.vlog_generation.gc.deleted_segments"] = fmt.Sprintf("%d", db.vlogGenerationGCSegmentsDeleted.Load()) stats["treedb.cache.vlog_generation.gc.deleted_bytes"] = fmt.Sprintf("%d", db.vlogGenerationGCBytesDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_referenced_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsReferenced.Load()) + stats["treedb.cache.vlog_generation.gc.last_referenced_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesReferenced.Load()) + stats["treedb.cache.vlog_generation.gc.last_active_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsActive.Load()) + stats["treedb.cache.vlog_generation.gc.last_active_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesActive.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtected.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtected.Load()) + stats["treedb.cache.vlog_generation.gc.last_eligible_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsEligible.Load()) + stats["treedb.cache.vlog_generation.gc.last_eligible_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesEligible.Load()) + stats["treedb.cache.vlog_generation.gc.last_deleted_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_deleted_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_pending_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsPending.Load()) + stats["treedb.cache.vlog_generation.gc.last_pending_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesPending.Load()) stats["treedb.cache.vlog_generation.gc.runs"] = fmt.Sprintf("%d", db.vlogGenerationGCRuns.Load()) 
stats["treedb.cache.vlog_generation.gc.exec.total_ms"] = fmt.Sprintf("%.3f", float64(gcExecTotalNS)/float64(time.Millisecond)) stats["treedb.cache.vlog_generation.gc.exec.max_ms"] = fmt.Sprintf("%.3f", float64(gcExecMaxNS)/float64(time.Millisecond)) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 230f09573..7c75f7fd2 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -5713,6 +5713,18 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationRewriteBudgetTokensBytes.Store(512) db.vlogGenerationRewriteBudgetConsumed.Store(1536) db.vlogGenerationRewriteAgeBlockedUntilNS.Store(time.Now().Add(5 * time.Second).UnixNano()) + db.vlogGenerationLastGCSegmentsReferenced.Store(7) + db.vlogGenerationLastGCBytesReferenced.Store(700) + db.vlogGenerationLastGCSegmentsActive.Store(4) + db.vlogGenerationLastGCBytesActive.Store(400) + db.vlogGenerationLastGCSegmentsProtected.Store(3) + db.vlogGenerationLastGCBytesProtected.Store(300) + db.vlogGenerationLastGCSegmentsEligible.Store(6) + db.vlogGenerationLastGCBytesEligible.Store(600) + db.vlogGenerationLastGCSegmentsDeleted.Store(2) + db.vlogGenerationLastGCBytesDeleted.Store(200) + db.vlogGenerationLastGCSegmentsPending.Store(4) + db.vlogGenerationLastGCBytesPending.Store(400) db.vlogGenerationMaintenanceSkipStageNotDue.Store(5) db.vlogGenerationMaintenanceSkipStageDue.Store(2) db.vlogGenerationRewritePlanSelectedSegments.Store(6) @@ -5764,6 +5776,42 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.gc.exec.avg_ms"]; got != "30.000" { t.Fatalf("gc exec avg ms=%q want 30.000", got) } + if got := stats["treedb.cache.vlog_generation.gc.last_referenced_segments"]; got != "7" { + t.Fatalf("gc last referenced segments=%q want 7", got) + } + if got := 
stats["treedb.cache.vlog_generation.gc.last_referenced_bytes"]; got != "700" { + t.Fatalf("gc last referenced bytes=%q want 700", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_active_segments"]; got != "4" { + t.Fatalf("gc last active segments=%q want 4", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_active_bytes"]; got != "400" { + t.Fatalf("gc last active bytes=%q want 400", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_segments"]; got != "3" { + t.Fatalf("gc last protected segments=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_bytes"]; got != "300" { + t.Fatalf("gc last protected bytes=%q want 300", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_eligible_segments"]; got != "6" { + t.Fatalf("gc last eligible segments=%q want 6", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_eligible_bytes"]; got != "600" { + t.Fatalf("gc last eligible bytes=%q want 600", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_deleted_segments"]; got != "2" { + t.Fatalf("gc last deleted segments=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_deleted_bytes"]; got != "200" { + t.Fatalf("gc last deleted bytes=%q want 200", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_pending_segments"]; got != "4" { + t.Fatalf("gc last pending segments=%q want 4", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_pending_bytes"]; got != "400" { + t.Fatalf("gc last pending bytes=%q want 400", got) + } if got := stats["treedb.cache.vlog_generation.vacuum.exec.total_ms"]; got != "44.000" { t.Fatalf("vacuum exec total ms=%q want 44.000", got) } diff --git a/TreeDB/db/vlog_gc.go b/TreeDB/db/vlog_gc.go index d67d0aaf1..19ac1d43a 100644 --- a/TreeDB/db/vlog_gc.go +++ b/TreeDB/db/vlog_gc.go @@ -29,12 +29,14 @@ type ValueLogGCStats struct { SegmentsProtected int SegmentsEligible int 
SegmentsDeleted int + SegmentsPending int BytesTotal int64 BytesReferenced int64 BytesActive int64 BytesProtected int64 BytesEligible int64 BytesDeleted int64 + BytesPending int64 } // ValueLogGC deletes fully-unreferenced value-log segments. @@ -164,6 +166,12 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG } } } + if stats.SegmentsEligible > stats.SegmentsDeleted { + stats.SegmentsPending = stats.SegmentsEligible - stats.SegmentsDeleted + } + if stats.BytesEligible > stats.BytesDeleted { + stats.BytesPending = stats.BytesEligible - stats.BytesDeleted + } currentSet := vm.CurrentSetNoRefresh() if currentSet != nil { diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index 9068195d2..9b357c3b1 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -338,3 +338,30 @@ - pre: `4747763395` - post: `2064528109` - tool output: `segments_before=20 segments_after=16 bytes_before=4674175679 bytes_after=2026328485 records=963752` + +- GC blocker classification instrumentation (follow-up to no-reclaim counters): + - goal: + - make no-reclaim episodes diagnosable in one snapshot by showing whether bytes are blocked by active/pinned/protected classes vs actually eligible but pending delete. + - code changes: + - `TreeDB/db/vlog_gc.go` + - extended `ValueLogGCStats` with: + - `SegmentsPending` + - `BytesPending` + - populated pending values after delete attempts as `eligible - deleted` when positive. + - `TreeDB/caching/db.go` + - added cached per-run GC classification fields to `DB` atomics. 
+ - added `observeVlogGenerationGCStats(...)` and wired it into both: + - post-rewrite GC pass + - periodic GC pass + - exported new stats keys: + - `treedb.cache.vlog_generation.gc.last_referenced_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_active_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_protected_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_eligible_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_deleted_segments/bytes` + - `treedb.cache.vlog_generation.gc.last_pending_segments/bytes` + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestVlogGenerationStats_ReportRewriteBacklogAndDurations` with assertions for all new keys. + +- Validation: + - `go test ./TreeDB/db ./TreeDB/caching -count=1` From 29ec3716034258431acbd6b991888587e6215f4f Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 21:01:09 -1000 Subject: [PATCH 11/61] worklog: record gc blocker readout from run_celestia --- worklog/2026-03-27.md | 53 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index 9b357c3b1..b9725f86f 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -365,3 +365,56 @@ - Validation: - `go test ./TreeDB/db ./TreeDB/caching -count=1` + +- Corrected `run_celestia` validation after adding `gc.last_*` stats: + - initial rerun with `STOP_AT_LOCAL_HEIGHT=5000` was invalid for maintenance analysis because the script treats it as an absolute local-height target; after state sync jump to ~10.4M, it exited immediately with no rewrite activity. 
+ - corrected run command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327205149` + - note on diagnostics source: + - per-instance application counters were read from `*.debug_vars.json` at: + - `.treedb.instances[".../data/application.db/maindb/wal#..."]` + - `*.treedb_application_vars.json` was `{}` in this run, so instance readout is the reliable source. + +- Final application-instance counters (`pprof-heap-max-rss-final-11027988k-20260327205709.debug_vars.json`): + - rewrite: + - `rewrite.plan_selected_segments_total=3` + - `rewrite.exec.source_segments_total=2` + - `rewrite.runs=1` + - `rewrite.bytes_in=38292854` + - `rewrite.processed_live_bytes=38292854` + - `rewrite.processed_stale_bytes=498582443` + - `rewrite.no_reclaim_runs=1` + - `rewrite.no_reclaim_stale_bytes=498582443` + - `rewrite.reclaimed_bytes=0` + - gc/classification: + - `gc.runs=1` + - `gc.deleted_bytes=0` + - `gc.last_referenced_segments=8` + - `gc.last_referenced_bytes=1294769679` + - `gc.last_active_segments=0` + - `gc.last_active_bytes=0` + - `gc.last_protected_segments=2` + - `gc.last_protected_bytes=536875297` + - `gc.last_eligible_segments=0` + - `gc.last_eligible_bytes=0` + - `gc.last_deleted_segments=0` + - `gc.last_deleted_bytes=0` + - `gc.last_pending_segments=0` + - `gc.last_pending_bytes=0` + - maintenance: + - `maintenance.attempts=35` + - `maintenance.acquired=35` + - `maintenance.collisions=0` + +- Interpretation: + - this run confirms stale bytes are being copied by live rewrite, but immediate reclaim is blocked because the final GC view reports `eligible=0` (not delete failure/pending). + - blocker class in this sample is dominated by `referenced + protected` bytes, not active segment pinning and not eligible-but-pending deletion. 
+ +- Offline reclaim headroom on the same run home: + - command: + - `/home/mikers/dev/snissn/celestia-app-p4/build/treemap-local vlog-rewrite /home/mikers/.celestia-app-mainnet-treedb-20260327205149/data/application.db -rw` + - pre: `5035136550` + - tool output: `segments_before=21 segments_after=16 bytes_before=4888181282 bytes_after=2076143228 records=985926` + - post: `2115096516` From aebe8037c3c7984f97a1d94f1aea73d02f630d02 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 21:20:11 -1000 Subject: [PATCH 12/61] treedb: split gc protected blockers by class --- TreeDB/caching/db.go | 351 ++++++++++-------- .../caching/vlog_generation_scheduler_test.go | 32 ++ TreeDB/db/vlog_gc.go | 103 ++++- TreeDB/db/vlog_gc_test.go | 71 ++++ worklog/2026-03-27.md | 72 ++++ 5 files changed, 449 insertions(+), 180 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index f26ae16eb..e5ed17eba 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -3864,38 +3864,44 @@ func (db *DB) ValueLogRetainedPaths() []string { return db.valueLogRetainedPaths() } -func (db *DB) valueLogProtectedPaths() []string { - retained := db.valueLogRetainedPaths() - inUse := db.valueLogInUsePaths() - if len(retained) == 0 { - return inUse - } - if len(inUse) == 0 { - return retained - } - seen := make(map[string]struct{}, len(retained)+len(inUse)) - paths := make([]string, 0, len(retained)+len(inUse)) - for _, path := range retained { - if path == "" { - continue - } - if _, ok := seen[path]; ok { - continue +func mergeUniqueNonEmptyStrings(pathSets ...[]string) []string { + seen := make(map[string]struct{}) + var out []string + for _, paths := range pathSets { + for _, path := range paths { + if path == "" { + continue + } + if _, ok := seen[path]; ok { + continue + } + seen[path] = struct{}{} + out = append(out, path) } - seen[path] = struct{}{} - paths = append(paths, path) } - for _, path := range inUse { - if path == "" { - continue - } - if _, ok := 
seen[path]; ok { - continue - } - seen[path] = struct{}{} - paths = append(paths, path) + return out +} + +func (db *DB) valueLogGCProtectedPathSets() (retained []string, inUse []string, merged []string) { + retained = db.valueLogRetainedPaths() + inUse = db.valueLogInUsePaths() + merged = mergeUniqueNonEmptyStrings(retained, inUse) + return retained, inUse, merged +} + +func (db *DB) valueLogProtectedPaths() []string { + _, _, merged := db.valueLogGCProtectedPathSets() + return merged +} + +func (db *DB) valueLogGCOptions(dryRun bool) backenddb.ValueLogGCOptions { + retained, inUse, merged := db.valueLogGCProtectedPathSets() + return backenddb.ValueLogGCOptions{ + DryRun: dryRun, + ProtectedPaths: merged, + ProtectedInUsePaths: inUse, + ProtectedRetainedPaths: retained, } - return paths } // valueLogInUsePaths returns a best-effort snapshot of value-log segment paths @@ -5142,132 +5148,140 @@ type DB struct { valueLogMaxSegmentBytes int64 journalCompression bool - disableJournal bool - relaxedSync bool - notifyError func(error) - debugFlushPointers bool - debugFlushTiming bool - debugPtrEligible atomic.Int64 - debugPtrUsed atomic.Int64 - debugPtrNoPtr atomic.Int64 - debugPtrDenied atomic.Int64 - debugPtrDisabled atomic.Int64 - checkpointRuns atomic.Uint64 - checkpointTotalNs atomic.Uint64 - checkpointMaxNs atomic.Uint64 - checkpointNoopSkips atomic.Uint64 - checkpointFlushMuWaitNs atomic.Uint64 - checkpointFlushMuWaitMaxNs atomic.Uint64 - checkpointAutoVacuumRuns atomic.Uint64 - checkpointAutoVacuumLastCheckRun atomic.Uint64 - checkpointAutoVacuumLastPages atomic.Uint64 - checkpointAutoVacuumLastInternalP50 atomic.Uint64 - checkpointAutoVacuumLastInternalAvg atomic.Uint64 - lastForegroundWriteUnixNano atomic.Int64 - lastForegroundReadUnixNano atomic.Int64 - foregroundReadStampCounter atomic.Uint32 - activeForegroundIterators atomic.Int64 - retainedPruneLastStartUnixNano atomic.Int64 - retainedPruneMu sync.Mutex - retainedPruneDone chan struct{} - 
vlogGenerationRemapSuccesses atomic.Uint64 - vlogGenerationRemapFailures atomic.Uint64 - vlogGenerationRewriteBytesIn atomic.Uint64 - vlogGenerationRewriteBytesOut atomic.Uint64 - vlogGenerationRewriteReclaimedBytes atomic.Uint64 - vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 - vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 - vlogGenerationRewriteNoReclaimRuns atomic.Uint64 - vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 - vlogGenerationRewriteRuns atomic.Uint64 - vlogGenerationRewritePlanRuns atomic.Uint64 - vlogGenerationRewritePlanCanceled atomic.Uint64 - vlogGenerationRewritePlanErrors atomic.Uint64 - vlogGenerationRewritePlanEmpty atomic.Uint64 - vlogGenerationRewritePlanSelected atomic.Uint64 - vlogGenerationRewritePlanSelectedSegments atomic.Uint64 - vlogGenerationRewritePlanSelectedBytes atomic.Uint64 - vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 - vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 - vlogGenerationRewritePlanCanceledLastNS atomic.Int64 - vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 - vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool - vlogGenerationRewriteIneffectiveLastNS atomic.Int64 - vlogGenerationRewriteIneffectiveRuns atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 - vlogGenerationRewriteCanceledRuns atomic.Uint64 - vlogGenerationRewriteCanceledLastNS atomic.Int64 - vlogGenerationRewriteQueuePruneRuns atomic.Uint64 - vlogGenerationRewriteQueuePruneIDs atomic.Uint64 - vlogGenerationGCSegmentsDeleted atomic.Uint64 - vlogGenerationGCBytesDeleted atomic.Uint64 - vlogGenerationGCRuns atomic.Uint64 - vlogGenerationVacuumRuns atomic.Uint64 - vlogGenerationVacuumFailures atomic.Uint64 - vlogGenerationVacuumSkippedDisabled atomic.Uint64 - vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 - vlogGenerationVacuumSkippedCooldown atomic.Uint64 - vlogGenerationLastVacuumUnixNano atomic.Int64 - 
vlogGenerationLastRewritePlanUnixNano atomic.Int64 - vlogGenerationLastRewriteUnixNano atomic.Int64 - vlogGenerationLastGCUnixNano atomic.Int64 - vlogGenerationLastCheckpointKickUnixNano atomic.Int64 - vlogGenerationLastGCDryRunUnixNano atomic.Int64 - vlogGenerationLastGCDryRunBytesEligible atomic.Int64 - vlogGenerationLastGCDryRunSegsEligible atomic.Int64 - vlogGenerationLastGCBytesReferenced atomic.Int64 - vlogGenerationLastGCSegmentsReferenced atomic.Int64 - vlogGenerationLastGCBytesActive atomic.Int64 - vlogGenerationLastGCSegmentsActive atomic.Int64 - vlogGenerationLastGCBytesProtected atomic.Int64 - vlogGenerationLastGCSegmentsProtected atomic.Int64 - vlogGenerationLastGCBytesEligible atomic.Int64 - vlogGenerationLastGCSegmentsEligible atomic.Int64 - vlogGenerationLastGCBytesDeleted atomic.Int64 - vlogGenerationLastGCSegmentsDeleted atomic.Int64 - vlogGenerationLastGCBytesPending atomic.Int64 - vlogGenerationLastGCSegmentsPending atomic.Int64 - vlogGenerationChurnBytes atomic.Uint64 - vlogGenerationSchedulerState atomic.Uint32 - vlogGenerationMaintenanceActive atomic.Bool - vlogGenerationMaintenanceAttempts atomic.Uint64 - vlogGenerationMaintenanceAcquired atomic.Uint64 - vlogGenerationMaintenanceCollisions atomic.Uint64 - vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 - vlogGenerationMaintenanceSkipPhase atomic.Uint64 - vlogGenerationMaintenanceSkipStageGate atomic.Uint64 - vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 - vlogGenerationMaintenanceSkipStageDue atomic.Uint64 - vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 - vlogGenerationMaintenanceSkipPriority atomic.Uint64 - vlogGenerationMaintenanceSkipQuiet atomic.Uint64 - vlogGenerationMaintenanceSkipPreCheckpoint atomic.Uint64 - vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 - vlogGenerationMaintenancePassNoop atomic.Uint64 - vlogGenerationMaintenancePassWithRewrite atomic.Uint64 - vlogGenerationMaintenancePassWithGC atomic.Uint64 - 
vlogGenerationMaintenancePassTotalNanos atomic.Uint64 - vlogGenerationMaintenancePassMaxNanos atomic.Uint64 - vlogGenerationLastReason atomic.Uint32 - vlogGenerationCheckpointKickRuns atomic.Uint64 - vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 - vlogGenerationCheckpointKickGCRuns atomic.Uint64 - vlogGenerationCheckpointKickPending atomic.Bool - vlogGenerationDeferredMaintenancePending atomic.Bool - vlogGenerationDeferredMaintenanceRunning atomic.Bool - vlogGenerationRewriteStageWakeObservedNS atomic.Int64 - vlogGenerationRewriteQueueMu sync.Mutex - vlogGenerationCheckpointKickActive atomic.Bool - vlogGenerationRewriteQueue []uint32 - vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment - vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty - vlogGenerationRewriteStagePending bool - vlogGenerationRewriteStageObservedUnixNano int64 - vlogGenerationRewriteQueueLoaded bool - vlogGenerationLastChurnBps atomic.Int64 - vlogGenerationLastChurnSampleBytes atomic.Uint64 - vlogGenerationLastChurnSampleNS atomic.Int64 + disableJournal bool + relaxedSync bool + notifyError func(error) + debugFlushPointers bool + debugFlushTiming bool + debugPtrEligible atomic.Int64 + debugPtrUsed atomic.Int64 + debugPtrNoPtr atomic.Int64 + debugPtrDenied atomic.Int64 + debugPtrDisabled atomic.Int64 + checkpointRuns atomic.Uint64 + checkpointTotalNs atomic.Uint64 + checkpointMaxNs atomic.Uint64 + checkpointNoopSkips atomic.Uint64 + checkpointFlushMuWaitNs atomic.Uint64 + checkpointFlushMuWaitMaxNs atomic.Uint64 + checkpointAutoVacuumRuns atomic.Uint64 + checkpointAutoVacuumLastCheckRun atomic.Uint64 + checkpointAutoVacuumLastPages atomic.Uint64 + checkpointAutoVacuumLastInternalP50 atomic.Uint64 + checkpointAutoVacuumLastInternalAvg atomic.Uint64 + lastForegroundWriteUnixNano atomic.Int64 + lastForegroundReadUnixNano atomic.Int64 + foregroundReadStampCounter atomic.Uint32 + activeForegroundIterators atomic.Int64 + retainedPruneLastStartUnixNano 
atomic.Int64 + retainedPruneMu sync.Mutex + retainedPruneDone chan struct{} + vlogGenerationRemapSuccesses atomic.Uint64 + vlogGenerationRemapFailures atomic.Uint64 + vlogGenerationRewriteBytesIn atomic.Uint64 + vlogGenerationRewriteBytesOut atomic.Uint64 + vlogGenerationRewriteReclaimedBytes atomic.Uint64 + vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 + vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 + vlogGenerationRewriteNoReclaimRuns atomic.Uint64 + vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 + vlogGenerationRewriteRuns atomic.Uint64 + vlogGenerationRewritePlanRuns atomic.Uint64 + vlogGenerationRewritePlanCanceled atomic.Uint64 + vlogGenerationRewritePlanErrors atomic.Uint64 + vlogGenerationRewritePlanEmpty atomic.Uint64 + vlogGenerationRewritePlanSelected atomic.Uint64 + vlogGenerationRewritePlanSelectedSegments atomic.Uint64 + vlogGenerationRewritePlanSelectedBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 + vlogGenerationRewritePlanCanceledLastNS atomic.Int64 + vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 + vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool + vlogGenerationRewriteIneffectiveLastNS atomic.Int64 + vlogGenerationRewriteIneffectiveRuns atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 + vlogGenerationRewriteCanceledRuns atomic.Uint64 + vlogGenerationRewriteCanceledLastNS atomic.Int64 + vlogGenerationRewriteQueuePruneRuns atomic.Uint64 + vlogGenerationRewriteQueuePruneIDs atomic.Uint64 + vlogGenerationGCSegmentsDeleted atomic.Uint64 + vlogGenerationGCBytesDeleted atomic.Uint64 + vlogGenerationGCRuns atomic.Uint64 + vlogGenerationVacuumRuns atomic.Uint64 + vlogGenerationVacuumFailures atomic.Uint64 + vlogGenerationVacuumSkippedDisabled atomic.Uint64 + vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 + vlogGenerationVacuumSkippedCooldown 
atomic.Uint64 + vlogGenerationLastVacuumUnixNano atomic.Int64 + vlogGenerationLastRewritePlanUnixNano atomic.Int64 + vlogGenerationLastRewriteUnixNano atomic.Int64 + vlogGenerationLastGCUnixNano atomic.Int64 + vlogGenerationLastCheckpointKickUnixNano atomic.Int64 + vlogGenerationLastGCDryRunUnixNano atomic.Int64 + vlogGenerationLastGCDryRunBytesEligible atomic.Int64 + vlogGenerationLastGCDryRunSegsEligible atomic.Int64 + vlogGenerationLastGCBytesReferenced atomic.Int64 + vlogGenerationLastGCSegmentsReferenced atomic.Int64 + vlogGenerationLastGCBytesActive atomic.Int64 + vlogGenerationLastGCSegmentsActive atomic.Int64 + vlogGenerationLastGCBytesProtected atomic.Int64 + vlogGenerationLastGCSegmentsProtected atomic.Int64 + vlogGenerationLastGCBytesProtectedInUse atomic.Int64 + vlogGenerationLastGCSegmentsProtectedInUse atomic.Int64 + vlogGenerationLastGCBytesProtectedRetained atomic.Int64 + vlogGenerationLastGCSegmentsProtectedRetained atomic.Int64 + vlogGenerationLastGCBytesProtectedOverlap atomic.Int64 + vlogGenerationLastGCSegmentsProtectedOverlap atomic.Int64 + vlogGenerationLastGCBytesProtectedOther atomic.Int64 + vlogGenerationLastGCSegmentsProtectedOther atomic.Int64 + vlogGenerationLastGCBytesEligible atomic.Int64 + vlogGenerationLastGCSegmentsEligible atomic.Int64 + vlogGenerationLastGCBytesDeleted atomic.Int64 + vlogGenerationLastGCSegmentsDeleted atomic.Int64 + vlogGenerationLastGCBytesPending atomic.Int64 + vlogGenerationLastGCSegmentsPending atomic.Int64 + vlogGenerationChurnBytes atomic.Uint64 + vlogGenerationSchedulerState atomic.Uint32 + vlogGenerationMaintenanceActive atomic.Bool + vlogGenerationMaintenanceAttempts atomic.Uint64 + vlogGenerationMaintenanceAcquired atomic.Uint64 + vlogGenerationMaintenanceCollisions atomic.Uint64 + vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 + vlogGenerationMaintenanceSkipPhase atomic.Uint64 + vlogGenerationMaintenanceSkipStageGate atomic.Uint64 + vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 + 
vlogGenerationMaintenanceSkipStageDue atomic.Uint64 + vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 + vlogGenerationMaintenanceSkipPriority atomic.Uint64 + vlogGenerationMaintenanceSkipQuiet atomic.Uint64 + vlogGenerationMaintenanceSkipPreCheckpoint atomic.Uint64 + vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 + vlogGenerationMaintenancePassNoop atomic.Uint64 + vlogGenerationMaintenancePassWithRewrite atomic.Uint64 + vlogGenerationMaintenancePassWithGC atomic.Uint64 + vlogGenerationMaintenancePassTotalNanos atomic.Uint64 + vlogGenerationMaintenancePassMaxNanos atomic.Uint64 + vlogGenerationLastReason atomic.Uint32 + vlogGenerationCheckpointKickRuns atomic.Uint64 + vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 + vlogGenerationCheckpointKickGCRuns atomic.Uint64 + vlogGenerationCheckpointKickPending atomic.Bool + vlogGenerationDeferredMaintenancePending atomic.Bool + vlogGenerationDeferredMaintenanceRunning atomic.Bool + vlogGenerationRewriteStageWakeObservedNS atomic.Int64 + vlogGenerationRewriteQueueMu sync.Mutex + vlogGenerationCheckpointKickActive atomic.Bool + vlogGenerationRewriteQueue []uint32 + vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment + vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty + vlogGenerationRewriteStagePending bool + vlogGenerationRewriteStageObservedUnixNano int64 + vlogGenerationRewriteQueueLoaded bool + vlogGenerationLastChurnBps atomic.Int64 + vlogGenerationLastChurnSampleBytes atomic.Uint64 + vlogGenerationLastChurnSampleNS atomic.Int64 // Rewrite budget token bucket (bytes) for online maintenance. This lets us // interpret ValueLogRewriteBudgetBytesPerSec as a true per-second bandwidth // budget while still running maintenance at coarse intervals. 
@@ -12519,6 +12533,14 @@ func (db *DB) observeVlogGenerationGCStats(stats backenddb.ValueLogGCStats) { db.vlogGenerationLastGCSegmentsActive.Store(int64(stats.SegmentsActive)) db.vlogGenerationLastGCBytesProtected.Store(stats.BytesProtected) db.vlogGenerationLastGCSegmentsProtected.Store(int64(stats.SegmentsProtected)) + db.vlogGenerationLastGCBytesProtectedInUse.Store(stats.BytesProtectedInUse) + db.vlogGenerationLastGCSegmentsProtectedInUse.Store(int64(stats.SegmentsProtectedInUse)) + db.vlogGenerationLastGCBytesProtectedRetained.Store(stats.BytesProtectedRetained) + db.vlogGenerationLastGCSegmentsProtectedRetained.Store(int64(stats.SegmentsProtectedRetained)) + db.vlogGenerationLastGCBytesProtectedOverlap.Store(stats.BytesProtectedOverlap) + db.vlogGenerationLastGCSegmentsProtectedOverlap.Store(int64(stats.SegmentsProtectedOverlap)) + db.vlogGenerationLastGCBytesProtectedOther.Store(stats.BytesProtectedOther) + db.vlogGenerationLastGCSegmentsProtectedOther.Store(int64(stats.SegmentsProtectedOther)) db.vlogGenerationLastGCBytesEligible.Store(stats.BytesEligible) db.vlogGenerationLastGCSegmentsEligible.Store(int64(stats.SegmentsEligible)) db.vlogGenerationLastGCBytesDeleted.Store(stats.BytesDeleted) @@ -13975,9 +13997,7 @@ planned: if gcer, ok := db.backend.(backendValueLogGCer); ok { gcCtx, gcCancel := context.WithTimeout(context.Background(), 30*time.Second) gcStart := time.Now() - gcStats, gcErr := gcer.ValueLogGC(gcCtx, backenddb.ValueLogGCOptions{ - ProtectedPaths: db.valueLogProtectedPaths(), - }) + gcStats, gcErr := gcer.ValueLogGC(gcCtx, db.valueLogGCOptions(false)) gcCancel() gcDur := time.Since(gcStart) db.observeVlogGenerationGCExecDuration(gcDur) @@ -14186,7 +14206,7 @@ planned: now := time.Now() db.vlogGenerationLastGCUnixNano.Store(now.UnixNano()) ctx, cancel := db.foregroundMaintenanceContext(30 * time.Second) - gcOpts := backenddb.ValueLogGCOptions{ProtectedPaths: db.valueLogProtectedPaths()} + gcOpts := db.valueLogGCOptions(false) gcStart := 
time.Now() gcStats, err := gcer.ValueLogGC(ctx, gcOpts) cancel() @@ -14452,10 +14472,7 @@ func (db *DB) estimateVlogGenerationGCEligible(gcer backendValueLogGCer) (backen } ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() - stats, err := gcer.ValueLogGC(ctx, backenddb.ValueLogGCOptions{ - DryRun: true, - ProtectedPaths: db.valueLogProtectedPaths(), - }) + stats, err := gcer.ValueLogGC(ctx, db.valueLogGCOptions(true)) if err == nil { db.vlogGenerationLastGCDryRunUnixNano.Store(time.Now().UnixNano()) db.vlogGenerationLastGCDryRunBytesEligible.Store(stats.BytesEligible) @@ -19902,6 +19919,14 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.gc.last_active_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesActive.Load()) stats["treedb.cache.vlog_generation.gc.last_protected_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtected.Load()) stats["treedb.cache.vlog_generation.gc.last_protected_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtected.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_in_use_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtectedInUse.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_in_use_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtectedInUse.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_retained_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtectedRetained.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_retained_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtectedRetained.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_overlap_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtectedOverlap.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_overlap_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtectedOverlap.Load()) + 
stats["treedb.cache.vlog_generation.gc.last_protected_other_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsProtectedOther.Load()) + stats["treedb.cache.vlog_generation.gc.last_protected_other_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesProtectedOther.Load()) stats["treedb.cache.vlog_generation.gc.last_eligible_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsEligible.Load()) stats["treedb.cache.vlog_generation.gc.last_eligible_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesEligible.Load()) stats["treedb.cache.vlog_generation.gc.last_deleted_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsDeleted.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 7c75f7fd2..4bb6a8912 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -5719,6 +5719,14 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationLastGCBytesActive.Store(400) db.vlogGenerationLastGCSegmentsProtected.Store(3) db.vlogGenerationLastGCBytesProtected.Store(300) + db.vlogGenerationLastGCSegmentsProtectedInUse.Store(1) + db.vlogGenerationLastGCBytesProtectedInUse.Store(100) + db.vlogGenerationLastGCSegmentsProtectedRetained.Store(1) + db.vlogGenerationLastGCBytesProtectedRetained.Store(120) + db.vlogGenerationLastGCSegmentsProtectedOverlap.Store(1) + db.vlogGenerationLastGCBytesProtectedOverlap.Store(80) + db.vlogGenerationLastGCSegmentsProtectedOther.Store(0) + db.vlogGenerationLastGCBytesProtectedOther.Store(0) db.vlogGenerationLastGCSegmentsEligible.Store(6) db.vlogGenerationLastGCBytesEligible.Store(600) db.vlogGenerationLastGCSegmentsDeleted.Store(2) @@ -5794,6 +5802,30 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.gc.last_protected_bytes"]; got != "300" { t.Fatalf("gc last protected 
bytes=%q want 300", got) } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_in_use_segments"]; got != "1" { + t.Fatalf("gc last protected in use segments=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_in_use_bytes"]; got != "100" { + t.Fatalf("gc last protected in use bytes=%q want 100", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_retained_segments"]; got != "1" { + t.Fatalf("gc last protected retained segments=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_retained_bytes"]; got != "120" { + t.Fatalf("gc last protected retained bytes=%q want 120", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_overlap_segments"]; got != "1" { + t.Fatalf("gc last protected overlap segments=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_overlap_bytes"]; got != "80" { + t.Fatalf("gc last protected overlap bytes=%q want 80", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_other_segments"]; got != "0" { + t.Fatalf("gc last protected other segments=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_protected_other_bytes"]; got != "0" { + t.Fatalf("gc last protected other bytes=%q want 0", got) + } if got := stats["treedb.cache.vlog_generation.gc.last_eligible_segments"]; got != "6" { t.Fatalf("gc last eligible segments=%q want 6", got) } diff --git a/TreeDB/db/vlog_gc.go b/TreeDB/db/vlog_gc.go index 19ac1d43a..cbdc96e49 100644 --- a/TreeDB/db/vlog_gc.go +++ b/TreeDB/db/vlog_gc.go @@ -17,26 +17,42 @@ const valueLogKeepRecentSegmentsPerLane = 2 // ValueLogGCOptions controls value-log garbage collection. type ValueLogGCOptions struct { - DryRun bool + DryRun bool + // ProtectedPaths preserves legacy callers that provide a single merged set + // of protected paths. 
Prefer the specific ProtectedInUsePaths and + // ProtectedRetainedPaths fields for blocker classification. ProtectedPaths []string + // ProtectedInUsePaths are paths that may still be referenced by mutable + // in-memory state during online maintenance. + ProtectedInUsePaths []string + // ProtectedRetainedPaths are paths pinned by pointer lifecycle retention. + ProtectedRetainedPaths []string } // ValueLogGCStats summarizes value-log GC work. type ValueLogGCStats struct { - SegmentsTotal int - SegmentsReferenced int - SegmentsActive int - SegmentsProtected int - SegmentsEligible int - SegmentsDeleted int - SegmentsPending int - BytesTotal int64 - BytesReferenced int64 - BytesActive int64 - BytesProtected int64 - BytesEligible int64 - BytesDeleted int64 - BytesPending int64 + SegmentsTotal int + SegmentsReferenced int + SegmentsActive int + SegmentsProtected int + SegmentsProtectedInUse int + SegmentsProtectedRetained int + SegmentsProtectedOverlap int + SegmentsProtectedOther int + SegmentsEligible int + SegmentsDeleted int + SegmentsPending int + BytesTotal int64 + BytesReferenced int64 + BytesActive int64 + BytesProtected int64 + BytesProtectedInUse int64 + BytesProtectedRetained int64 + BytesProtectedOverlap int64 + BytesProtectedOther int64 + BytesEligible int64 + BytesDeleted int64 + BytesPending int64 } // ValueLogGC deletes fully-unreferenced value-log segments. 
@@ -83,8 +99,9 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG set = vm.CurrentSetNoRefresh() } keptIDs := currentValueLogIDs(set) - if len(opts.ProtectedPaths) > 0 { - if recent := recentValueLogIDsForProtectedPaths(set, valueLogKeepRecentSegmentsPerLane, opts.ProtectedPaths); len(recent) > 0 { + protectedAll := mergeUniqueNonEmptyPaths(opts.ProtectedPaths, opts.ProtectedInUsePaths, opts.ProtectedRetainedPaths) + if len(protectedAll) > 0 { + if recent := recentValueLogIDsForProtectedPaths(set, valueLogKeepRecentSegmentsPerLane, protectedAll); len(recent) > 0 { keptIDs = recent } } @@ -95,6 +112,20 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG } protectedPaths[path] = struct{}{} } + protectedInUsePaths := make(map[string]struct{}, len(opts.ProtectedInUsePaths)) + for _, path := range opts.ProtectedInUsePaths { + if path == "" { + continue + } + protectedInUsePaths[path] = struct{}{} + } + protectedRetainedPaths := make(map[string]struct{}, len(opts.ProtectedRetainedPaths)) + for _, path := range opts.ProtectedRetainedPaths { + if path == "" { + continue + } + protectedRetainedPaths[path] = struct{}{} + } type candidate struct { path string size int64 @@ -119,9 +150,29 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG stats.BytesActive += size continue } + _, inUseProtected := protectedInUsePaths[f.Path] + _, retainedProtected := protectedRetainedPaths[f.Path] + if inUseProtected || retainedProtected { + stats.SegmentsProtected++ + stats.BytesProtected += size + switch { + case inUseProtected && retainedProtected: + stats.SegmentsProtectedOverlap++ + stats.BytesProtectedOverlap += size + case inUseProtected: + stats.SegmentsProtectedInUse++ + stats.BytesProtectedInUse += size + default: + stats.SegmentsProtectedRetained++ + stats.BytesProtectedRetained += size + } + continue + } if _, ok := protectedPaths[f.Path]; ok { stats.SegmentsProtected++ 
stats.BytesProtected += size + stats.SegmentsProtectedOther++ + stats.BytesProtectedOther += size continue } @@ -208,6 +259,24 @@ func currentValueLogIDs(set *valuelog.Set) map[uint32]struct{} { return active } +func mergeUniqueNonEmptyPaths(pathSets ...[]string) []string { + seen := make(map[string]struct{}) + var out []string + for _, paths := range pathSets { + for _, path := range paths { + if path == "" { + continue + } + if _, ok := seen[path]; ok { + continue + } + seen[path] = struct{}{} + out = append(out, path) + } + } + return out +} + func recentValueLogIDs(set *valuelog.Set, keepPerLane int) map[uint32]struct{} { if keepPerLane <= 1 { return currentValueLogIDs(set) diff --git a/TreeDB/db/vlog_gc_test.go b/TreeDB/db/vlog_gc_test.go index fbdee4dc0..771f2b116 100644 --- a/TreeDB/db/vlog_gc_test.go +++ b/TreeDB/db/vlog_gc_test.go @@ -176,6 +176,77 @@ func TestValueLogGC_ProtectedPathsDoNotKeepHistoricalRewriteLanes(t *testing.T) } } +func TestValueLogGC_ProtectedPathBreakdownStats(t *testing.T) { + dir := t.TempDir() + + db, err := Open(Options{Dir: dir}) + if err != nil { + t.Fatalf("open: %v", err) + } + defer func() { _ = db.Close() }() + + for seq := 1; seq <= 5; seq++ { + seq := seq + appendPointersInNewSegment(t, dir, 0, uint32(seq), uint64(seq)*1_000, 1, func(int) []byte { + return bytes.Repeat([]byte(fmt.Sprintf("lane0-seq%d|", seq)), 32) + }) + } + + if err := db.RefreshValueLogSet(); err != nil { + t.Fatalf("RefreshValueLogSet: %v", err) + } + + inUseOnlyPath := filepath.Join(dir, "wal", "value-l0-000001.log") + retainedOnlyPath := filepath.Join(dir, "wal", "value-l0-000002.log") + overlapPath := filepath.Join(dir, "wal", "value-l0-000003.log") + + stats, err := db.ValueLogGC(context.Background(), ValueLogGCOptions{ + DryRun: true, + ProtectedInUsePaths: []string{inUseOnlyPath, overlapPath}, + ProtectedRetainedPaths: []string{retainedOnlyPath, overlapPath}, + }) + if err != nil { + t.Fatalf("ValueLogGC: %v", err) + } + + if stats.SegmentsTotal 
!= 5 { + t.Fatalf("segments total=%d want 5", stats.SegmentsTotal) + } + if stats.SegmentsActive != 2 { + t.Fatalf("segments active=%d want 2", stats.SegmentsActive) + } + if stats.SegmentsProtected != 3 { + t.Fatalf("segments protected=%d want 3", stats.SegmentsProtected) + } + if stats.SegmentsProtectedInUse != 1 { + t.Fatalf("segments protected in-use=%d want 1", stats.SegmentsProtectedInUse) + } + if stats.SegmentsProtectedRetained != 1 { + t.Fatalf("segments protected retained=%d want 1", stats.SegmentsProtectedRetained) + } + if stats.SegmentsProtectedOverlap != 1 { + t.Fatalf("segments protected overlap=%d want 1", stats.SegmentsProtectedOverlap) + } + if stats.SegmentsProtectedOther != 0 { + t.Fatalf("segments protected other=%d want 0", stats.SegmentsProtectedOther) + } + if stats.SegmentsEligible != 0 { + t.Fatalf("segments eligible=%d want 0", stats.SegmentsEligible) + } + if stats.SegmentsDeleted != 0 { + t.Fatalf("segments deleted=%d want 0", stats.SegmentsDeleted) + } + if stats.BytesProtected <= 0 { + t.Fatalf("bytes protected=%d want >0", stats.BytesProtected) + } + if stats.BytesProtectedInUse <= 0 || stats.BytesProtectedRetained <= 0 || stats.BytesProtectedOverlap <= 0 { + t.Fatalf("expected non-zero protected byte buckets, got %+v", stats) + } + if stats.BytesProtectedOther != 0 { + t.Fatalf("bytes protected other=%d want 0", stats.BytesProtectedOther) + } +} + func TestValueLogGC_KeepsReferencedPointerSegments_WithOuterLeavesInValueLog(t *testing.T) { dir := t.TempDir() diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index b9725f86f..b44c770fb 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -418,3 +418,75 @@ - pre: `5035136550` - tool output: `segments_before=21 segments_after=16 bytes_before=4888181282 bytes_after=2076143228 records=985926` - post: `2115096516` + +- GC protected-class split instrumentation (retained vs in-use vs overlap): + - motivation: + - prior `gc.last_protected_{segments,bytes}` proved 
protection was the blocker class but did not identify whether protection came from in-memory in-use paths vs retained-path lifecycle pins. + - code changes: + - `TreeDB/db/vlog_gc.go` + - `ValueLogGCOptions` extended with: + - `ProtectedInUsePaths []string` + - `ProtectedRetainedPaths []string` + - `ValueLogGCStats` extended with protected split buckets: + - `SegmentsProtectedInUse`, `BytesProtectedInUse` + - `SegmentsProtectedRetained`, `BytesProtectedRetained` + - `SegmentsProtectedOverlap`, `BytesProtectedOverlap` + - `SegmentsProtectedOther`, `BytesProtectedOther` + - GC classification now tags protected candidates by class while preserving `SegmentsProtected/BytesProtected` totals. + - protected-lane recent-window keep logic now uses the union of legacy + split protected path lists. + - `TreeDB/caching/db.go` + - added helper `valueLogGCProtectedPathSets()` and `valueLogGCOptions(dryRun bool)` to pass split path sets into backend GC. + - `observeVlogGenerationGCStats` now records split protected classes. + - exported new stats keys: + - `treedb.cache.vlog_generation.gc.last_protected_in_use_{segments,bytes}` + - `treedb.cache.vlog_generation.gc.last_protected_retained_{segments,bytes}` + - `treedb.cache.vlog_generation.gc.last_protected_overlap_{segments,bytes}` + - `treedb.cache.vlog_generation.gc.last_protected_other_{segments,bytes}` + - tests: + - `TreeDB/db/vlog_gc_test.go` + - added `TestValueLogGC_ProtectedPathBreakdownStats` + - `TreeDB/caching/vlog_generation_scheduler_test.go` + - extended `TestVlogGenerationStats_ReportRewriteBacklogAndDurations` assertions for new keys. 
+ +- Validation: + - `go test ./TreeDB/db -run 'TestValueLogGC_ProtectedPathsDoNotKeepHistoricalRewriteLanes|TestValueLogGC_ProtectedPathBreakdownStats' -count=1` + - `go test ./TreeDB/caching -run 'TestVlogGenerationStats_ReportRewriteBacklogAndDurations' -count=1` + - `go test ./TreeDB/db ./TreeDB/caching -count=1` + +- Live confirmation run for protected split counters: + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260327211238` + - final debug snapshot: + - `pprof-heap-max-rss-final-11733668k-20260327211812.debug_vars.json` + - application-instance counters: + - `rewrite.plan_selected_segments_total=3` + - `rewrite.exec.source_segments_total=2` + - `rewrite.runs=1` + - `rewrite.processed_stale_bytes=498580183` + - `rewrite.reclaimed_bytes=0` + - `gc.runs=1` + - `gc.deleted_bytes=0` + - `gc.last_protected_segments=2` + - `gc.last_protected_bytes=536873037` + - `gc.last_protected_in_use_segments=0` + - `gc.last_protected_in_use_bytes=0` + - `gc.last_protected_retained_segments=2` + - `gc.last_protected_retained_bytes=536873037` + - `gc.last_protected_overlap_segments=0` + - `gc.last_protected_overlap_bytes=0` + - `gc.last_protected_other_segments=0` + - `gc.last_protected_other_bytes=0` + - `gc.last_eligible_segments=0` + - `gc.last_eligible_bytes=0` + +- Interpretation update: + - for this run window, no-reclaim is attributable to retained-path protection (not in-use protection and not eligible/pending delete). 
+ +- Offline headroom on same run home: + - command: + - `/home/mikers/dev/snissn/celestia-app-p4/build/treemap-local vlog-rewrite /home/mikers/.celestia-app-mainnet-treedb-20260327211238/data/application.db -rw` + - pre: `5266839216` + - tool output: `segments_before=22 segments_after=16 bytes_before=4993530542 bytes_after=2108841030 records=995454` + - post: `2148318606` From bcfc7ad3e8ea442c01026a6b4b7152384209a497 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 22:43:44 -1000 Subject: [PATCH 13/61] treedb: instrument retained prune scheduling and force preemption --- TreeDB/caching/db.go | 454 +++++++++++++++++++--------- TreeDB/caching/db_test.go | 190 ++++++++++++ TreeDB/caching/expvar_stats.go | 1 + TreeDB/caching/expvar_stats_test.go | 4 + worklog/2026-03-27.md | 56 ++++ 5 files changed, 562 insertions(+), 143 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index e5ed17eba..c9dfa7c01 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4190,13 +4190,37 @@ type valueLogSetRefresher interface { RefreshValueLogSet() error } -func (db *DB) pruneRetainedValueLogs() { +type retainedValueLogPruneStats struct { + RemovedSegments int + RemovedBytes int64 + AbortedForegroundWrites bool +} + +func (db *DB) valueLogClosedSegmentSize(path string) int64 { + if db == nil || path == "" { + return 0 + } + laneID, _, _, ok := parseLogSeq(filepath.Base(path)) + if !ok || laneID < 0 || laneID >= len(db.lanes) { + return 0 + } + l := &db.lanes[laneID] + l.vlogMu.Lock() + defer l.vlogMu.Unlock() + if l.vlogClosedSizes == nil { + return 0 + } + return l.vlogClosedSizes[path] +} + +func (db *DB) pruneRetainedValueLogs() retainedValueLogPruneStats { + var out retainedValueLogPruneStats if !db.valueLogEnabled() { - return + return out } paths := db.valueLogRetainedPaths() if len(paths) == 0 { - return + return out } inUse := make(map[string]struct{}) @@ -4209,27 +4233,34 @@ func (db *DB) pruneRetainedValueLogs() { if _, ok := 
inUse[path]; ok { continue } + size := db.valueLogClosedSegmentSize(path) if db.cleanupMissingRetainedValueLog(path) { + if size > 0 { + out.RemovedSegments++ + out.RemovedBytes += size + } continue } candidatePaths = append(candidatePaths, path) } if len(candidatePaths) == 0 { - return + return out } live, err := db.collectValueLogLiveIDsUntil(db.lastForegroundWriteUnixNano.Load()) if err != nil { if errors.Is(err, errForegroundWritesResumed) { - return + out.AbortedForegroundWrites = true + return out } db.reportError(fmt.Errorf("cachingdb: failed to scan value-log pointers: %w", err)) - return + return out } removed := false marked := false for _, path := range candidatePaths { + size := db.valueLogClosedSegmentSize(path) laneID, seq, valueLog, ok := parseLogSeq(filepath.Base(path)) if !ok || !valueLog { continue @@ -4252,9 +4283,17 @@ func (db *DB) pruneRetainedValueLogs() { if err := marker.MarkValueLogZombie(id); err != nil { if errors.Is(err, valuelog.ErrFileNotFound) && db.cleanupOrphanedRetainedValueLog(path) { removed = true + if size > 0 { + out.RemovedSegments++ + out.RemovedBytes += size + } continue } if db.cleanupMissingRetainedValueLog(path) { + if size > 0 { + out.RemovedSegments++ + out.RemovedBytes += size + } continue } db.reportError(fmt.Errorf("cachingdb: failed to mark value-log %d zombie: %w", id, err)) @@ -4268,6 +4307,10 @@ func (db *DB) pruneRetainedValueLogs() { db.untrackValueLogSegmentLocked(path) db.mu.Unlock() removed = true + if size > 0 { + out.RemovedSegments++ + out.RemovedBytes += size + } } db.forgetValueLogRetain(path) } @@ -4282,6 +4325,7 @@ func (db *DB) pruneRetainedValueLogs() { if removed { db.syncDirBestEffort(db.dir) } + return out } func (db *DB) retainedPrunePressureBytes() int64 { @@ -4325,6 +4369,10 @@ func (db *DB) retainedPrunePressureBytes() int64 { } func (db *DB) shouldScheduleRetainedValueLogPrune() bool { + return db.shouldScheduleRetainedValueLogPruneWithForce(false) +} + +func (db *DB) 
shouldScheduleRetainedValueLogPruneWithForce(force bool) bool { if db == nil || !db.valueLogEnabled() { return false } @@ -4332,22 +4380,67 @@ func (db *DB) shouldScheduleRetainedValueLogPrune() bool { if closed <= 0 { return false } + if force { + return true + } return closed >= db.retainedPrunePressureBytes() } +func (db *DB) waitForRetainedValueLogPruneQuietOrForce(quietWindow time.Duration) bool { + if db == nil { + return false + } + if quietWindow <= 0 { + return db.retainedPruneForceRequested.Swap(false) + } + ticker := time.NewTicker(foregroundMaintenancePollInterval()) + defer ticker.Stop() + for { + if db.closing.Load() { + return db.retainedPruneForceRequested.Swap(false) + } + if db.retainedPruneForceRequested.Swap(false) { + return true + } + if db.foregroundActivityQuietFor(time.Now(), quietWindow, vlogForegroundReadQuietWindow) { + return false + } + select { + case <-db.closeCh: + return db.retainedPruneForceRequested.Swap(false) + case <-ticker.C: + } + } +} + func (db *DB) scheduleRetainedValueLogPrune() { + db.scheduleRetainedValueLogPruneWithForce(false) +} + +func (db *DB) scheduleRetainedValueLogPruneForce() { + db.scheduleRetainedValueLogPruneWithForce(true) +} + +func (db *DB) scheduleRetainedValueLogPruneWithForce(force bool) { if db == nil || !db.valueLogEnabled() { return } + db.retainedValueLogPruneScheduleRequests.Add(1) + if force { + db.retainedPruneForceRequested.Store(true) + db.retainedValueLogPruneScheduleForcedRequests.Add(1) + } if db.testSkipRetainedPrune { return } db.retainedPruneMu.Lock() if db.closing.Load() { + db.retainedValueLogPruneScheduleSkipClosing.Add(1) db.retainedPruneMu.Unlock() return } if db.retainedPruneDone != nil { + db.retainedValueLogPruneScheduleSkipInFlight.Add(1) db.retainedPruneMu.Unlock() return } @@ -4361,8 +4454,17 @@ func (db *DB) scheduleRetainedValueLogPrune() { db.retainedPruneDone = nil db.retainedPruneMu.Unlock() }() - db.waitForForegroundMaintenanceQuietWindow(retainedPruneQuietWindow) - if 
!db.shouldScheduleRetainedValueLogPrune() { + effectiveForce := force || db.retainedPruneForceRequested.Swap(false) + if !effectiveForce { + effectiveForce = db.waitForRetainedValueLogPruneQuietOrForce(retainedPruneQuietWindow) + } + if !db.shouldScheduleRetainedValueLogPruneWithForce(effectiveForce) { + closed := db.valueLogRetainedClosedBytes.Load() + if closed <= 0 { + db.retainedValueLogPruneScheduleSkipNoClosedBytes.Add(1) + } else if !effectiveForce && closed < db.retainedPrunePressureBytes() { + db.retainedValueLogPruneScheduleSkipBelowPressure.Add(1) + } return } // Retained prune is opportunistic reclaim; do not compete with checkpoint @@ -4379,10 +4481,25 @@ func (db *DB) scheduleRetainedValueLogPrune() { now := time.Now() last := db.retainedPruneLastStartUnixNano.Load() if last > 0 && now.Sub(time.Unix(0, last)) < retainedPruneMinInterval { + db.retainedValueLogPruneScheduleSkipMinInterval.Add(1) return } db.retainedPruneLastStartUnixNano.Store(now.UnixNano()) - db.pruneRetainedValueLogs() + db.retainedValueLogPruneRuns.Add(1) + if effectiveForce { + db.retainedValueLogPruneForcedRuns.Add(1) + } + db.retainedValueLogPruneLastUnixNano.Store(now.UnixNano()) + pruneStats := db.pruneRetainedValueLogs() + if pruneStats.AbortedForegroundWrites { + db.retainedValueLogPruneForegroundAbortRuns.Add(1) + } + if pruneStats.RemovedSegments > 0 { + db.retainedValueLogPruneRemovedSegments.Add(uint64(pruneStats.RemovedSegments)) + } + if pruneStats.RemovedBytes > 0 { + db.retainedValueLogPruneRemovedBytes.Add(uint64(pruneStats.RemovedBytes)) + } }() } @@ -5148,140 +5265,154 @@ type DB struct { valueLogMaxSegmentBytes int64 journalCompression bool - disableJournal bool - relaxedSync bool - notifyError func(error) - debugFlushPointers bool - debugFlushTiming bool - debugPtrEligible atomic.Int64 - debugPtrUsed atomic.Int64 - debugPtrNoPtr atomic.Int64 - debugPtrDenied atomic.Int64 - debugPtrDisabled atomic.Int64 - checkpointRuns atomic.Uint64 - checkpointTotalNs 
atomic.Uint64 - checkpointMaxNs atomic.Uint64 - checkpointNoopSkips atomic.Uint64 - checkpointFlushMuWaitNs atomic.Uint64 - checkpointFlushMuWaitMaxNs atomic.Uint64 - checkpointAutoVacuumRuns atomic.Uint64 - checkpointAutoVacuumLastCheckRun atomic.Uint64 - checkpointAutoVacuumLastPages atomic.Uint64 - checkpointAutoVacuumLastInternalP50 atomic.Uint64 - checkpointAutoVacuumLastInternalAvg atomic.Uint64 - lastForegroundWriteUnixNano atomic.Int64 - lastForegroundReadUnixNano atomic.Int64 - foregroundReadStampCounter atomic.Uint32 - activeForegroundIterators atomic.Int64 - retainedPruneLastStartUnixNano atomic.Int64 - retainedPruneMu sync.Mutex - retainedPruneDone chan struct{} - vlogGenerationRemapSuccesses atomic.Uint64 - vlogGenerationRemapFailures atomic.Uint64 - vlogGenerationRewriteBytesIn atomic.Uint64 - vlogGenerationRewriteBytesOut atomic.Uint64 - vlogGenerationRewriteReclaimedBytes atomic.Uint64 - vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 - vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 - vlogGenerationRewriteNoReclaimRuns atomic.Uint64 - vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 - vlogGenerationRewriteRuns atomic.Uint64 - vlogGenerationRewritePlanRuns atomic.Uint64 - vlogGenerationRewritePlanCanceled atomic.Uint64 - vlogGenerationRewritePlanErrors atomic.Uint64 - vlogGenerationRewritePlanEmpty atomic.Uint64 - vlogGenerationRewritePlanSelected atomic.Uint64 - vlogGenerationRewritePlanSelectedSegments atomic.Uint64 - vlogGenerationRewritePlanSelectedBytes atomic.Uint64 - vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 - vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 - vlogGenerationRewritePlanCanceledLastNS atomic.Int64 - vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 - vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool - vlogGenerationRewriteIneffectiveLastNS atomic.Int64 - vlogGenerationRewriteIneffectiveRuns atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 - 
vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 - vlogGenerationRewriteCanceledRuns atomic.Uint64 - vlogGenerationRewriteCanceledLastNS atomic.Int64 - vlogGenerationRewriteQueuePruneRuns atomic.Uint64 - vlogGenerationRewriteQueuePruneIDs atomic.Uint64 - vlogGenerationGCSegmentsDeleted atomic.Uint64 - vlogGenerationGCBytesDeleted atomic.Uint64 - vlogGenerationGCRuns atomic.Uint64 - vlogGenerationVacuumRuns atomic.Uint64 - vlogGenerationVacuumFailures atomic.Uint64 - vlogGenerationVacuumSkippedDisabled atomic.Uint64 - vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 - vlogGenerationVacuumSkippedCooldown atomic.Uint64 - vlogGenerationLastVacuumUnixNano atomic.Int64 - vlogGenerationLastRewritePlanUnixNano atomic.Int64 - vlogGenerationLastRewriteUnixNano atomic.Int64 - vlogGenerationLastGCUnixNano atomic.Int64 - vlogGenerationLastCheckpointKickUnixNano atomic.Int64 - vlogGenerationLastGCDryRunUnixNano atomic.Int64 - vlogGenerationLastGCDryRunBytesEligible atomic.Int64 - vlogGenerationLastGCDryRunSegsEligible atomic.Int64 - vlogGenerationLastGCBytesReferenced atomic.Int64 - vlogGenerationLastGCSegmentsReferenced atomic.Int64 - vlogGenerationLastGCBytesActive atomic.Int64 - vlogGenerationLastGCSegmentsActive atomic.Int64 - vlogGenerationLastGCBytesProtected atomic.Int64 - vlogGenerationLastGCSegmentsProtected atomic.Int64 - vlogGenerationLastGCBytesProtectedInUse atomic.Int64 - vlogGenerationLastGCSegmentsProtectedInUse atomic.Int64 - vlogGenerationLastGCBytesProtectedRetained atomic.Int64 - vlogGenerationLastGCSegmentsProtectedRetained atomic.Int64 - vlogGenerationLastGCBytesProtectedOverlap atomic.Int64 - vlogGenerationLastGCSegmentsProtectedOverlap atomic.Int64 - vlogGenerationLastGCBytesProtectedOther atomic.Int64 - vlogGenerationLastGCSegmentsProtectedOther atomic.Int64 - vlogGenerationLastGCBytesEligible atomic.Int64 - vlogGenerationLastGCSegmentsEligible atomic.Int64 - vlogGenerationLastGCBytesDeleted atomic.Int64 - vlogGenerationLastGCSegmentsDeleted 
atomic.Int64 - vlogGenerationLastGCBytesPending atomic.Int64 - vlogGenerationLastGCSegmentsPending atomic.Int64 - vlogGenerationChurnBytes atomic.Uint64 - vlogGenerationSchedulerState atomic.Uint32 - vlogGenerationMaintenanceActive atomic.Bool - vlogGenerationMaintenanceAttempts atomic.Uint64 - vlogGenerationMaintenanceAcquired atomic.Uint64 - vlogGenerationMaintenanceCollisions atomic.Uint64 - vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 - vlogGenerationMaintenanceSkipPhase atomic.Uint64 - vlogGenerationMaintenanceSkipStageGate atomic.Uint64 - vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 - vlogGenerationMaintenanceSkipStageDue atomic.Uint64 - vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 - vlogGenerationMaintenanceSkipPriority atomic.Uint64 - vlogGenerationMaintenanceSkipQuiet atomic.Uint64 - vlogGenerationMaintenanceSkipPreCheckpoint atomic.Uint64 - vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 - vlogGenerationMaintenancePassNoop atomic.Uint64 - vlogGenerationMaintenancePassWithRewrite atomic.Uint64 - vlogGenerationMaintenancePassWithGC atomic.Uint64 - vlogGenerationMaintenancePassTotalNanos atomic.Uint64 - vlogGenerationMaintenancePassMaxNanos atomic.Uint64 - vlogGenerationLastReason atomic.Uint32 - vlogGenerationCheckpointKickRuns atomic.Uint64 - vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 - vlogGenerationCheckpointKickGCRuns atomic.Uint64 - vlogGenerationCheckpointKickPending atomic.Bool - vlogGenerationDeferredMaintenancePending atomic.Bool - vlogGenerationDeferredMaintenanceRunning atomic.Bool - vlogGenerationRewriteStageWakeObservedNS atomic.Int64 - vlogGenerationRewriteQueueMu sync.Mutex - vlogGenerationCheckpointKickActive atomic.Bool - vlogGenerationRewriteQueue []uint32 - vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment - vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty - vlogGenerationRewriteStagePending bool - vlogGenerationRewriteStageObservedUnixNano int64 
- vlogGenerationRewriteQueueLoaded bool - vlogGenerationLastChurnBps atomic.Int64 - vlogGenerationLastChurnSampleBytes atomic.Uint64 - vlogGenerationLastChurnSampleNS atomic.Int64 + disableJournal bool + relaxedSync bool + notifyError func(error) + debugFlushPointers bool + debugFlushTiming bool + debugPtrEligible atomic.Int64 + debugPtrUsed atomic.Int64 + debugPtrNoPtr atomic.Int64 + debugPtrDenied atomic.Int64 + debugPtrDisabled atomic.Int64 + checkpointRuns atomic.Uint64 + checkpointTotalNs atomic.Uint64 + checkpointMaxNs atomic.Uint64 + checkpointNoopSkips atomic.Uint64 + checkpointFlushMuWaitNs atomic.Uint64 + checkpointFlushMuWaitMaxNs atomic.Uint64 + checkpointAutoVacuumRuns atomic.Uint64 + checkpointAutoVacuumLastCheckRun atomic.Uint64 + checkpointAutoVacuumLastPages atomic.Uint64 + checkpointAutoVacuumLastInternalP50 atomic.Uint64 + checkpointAutoVacuumLastInternalAvg atomic.Uint64 + lastForegroundWriteUnixNano atomic.Int64 + lastForegroundReadUnixNano atomic.Int64 + foregroundReadStampCounter atomic.Uint32 + activeForegroundIterators atomic.Int64 + retainedPruneLastStartUnixNano atomic.Int64 + retainedValueLogPruneLastUnixNano atomic.Int64 + retainedValueLogPruneRuns atomic.Uint64 + retainedValueLogPruneForcedRuns atomic.Uint64 + retainedValueLogPruneForegroundAbortRuns atomic.Uint64 + retainedValueLogPruneRemovedSegments atomic.Uint64 + retainedValueLogPruneRemovedBytes atomic.Uint64 + retainedValueLogPruneScheduleRequests atomic.Uint64 + retainedValueLogPruneScheduleForcedRequests atomic.Uint64 + retainedValueLogPruneScheduleSkipClosing atomic.Uint64 + retainedValueLogPruneScheduleSkipInFlight atomic.Uint64 + retainedValueLogPruneScheduleSkipNoClosedBytes atomic.Uint64 + retainedValueLogPruneScheduleSkipBelowPressure atomic.Uint64 + retainedValueLogPruneScheduleSkipMinInterval atomic.Uint64 + retainedPruneForceRequested atomic.Bool + retainedPruneMu sync.Mutex + retainedPruneDone chan struct{} + vlogGenerationRemapSuccesses atomic.Uint64 + 
vlogGenerationRemapFailures atomic.Uint64 + vlogGenerationRewriteBytesIn atomic.Uint64 + vlogGenerationRewriteBytesOut atomic.Uint64 + vlogGenerationRewriteReclaimedBytes atomic.Uint64 + vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 + vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 + vlogGenerationRewriteNoReclaimRuns atomic.Uint64 + vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 + vlogGenerationRewriteRuns atomic.Uint64 + vlogGenerationRewritePlanRuns atomic.Uint64 + vlogGenerationRewritePlanCanceled atomic.Uint64 + vlogGenerationRewritePlanErrors atomic.Uint64 + vlogGenerationRewritePlanEmpty atomic.Uint64 + vlogGenerationRewritePlanSelected atomic.Uint64 + vlogGenerationRewritePlanSelectedSegments atomic.Uint64 + vlogGenerationRewritePlanSelectedBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 + vlogGenerationRewritePlanCanceledLastNS atomic.Int64 + vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 + vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool + vlogGenerationRewriteIneffectiveLastNS atomic.Int64 + vlogGenerationRewriteIneffectiveRuns atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 + vlogGenerationRewriteCanceledRuns atomic.Uint64 + vlogGenerationRewriteCanceledLastNS atomic.Int64 + vlogGenerationRewriteQueuePruneRuns atomic.Uint64 + vlogGenerationRewriteQueuePruneIDs atomic.Uint64 + vlogGenerationGCSegmentsDeleted atomic.Uint64 + vlogGenerationGCBytesDeleted atomic.Uint64 + vlogGenerationGCRuns atomic.Uint64 + vlogGenerationVacuumRuns atomic.Uint64 + vlogGenerationVacuumFailures atomic.Uint64 + vlogGenerationVacuumSkippedDisabled atomic.Uint64 + vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 + vlogGenerationVacuumSkippedCooldown atomic.Uint64 + vlogGenerationLastVacuumUnixNano atomic.Int64 + vlogGenerationLastRewritePlanUnixNano atomic.Int64 + 
vlogGenerationLastRewriteUnixNano atomic.Int64 + vlogGenerationLastGCUnixNano atomic.Int64 + vlogGenerationLastCheckpointKickUnixNano atomic.Int64 + vlogGenerationLastGCDryRunUnixNano atomic.Int64 + vlogGenerationLastGCDryRunBytesEligible atomic.Int64 + vlogGenerationLastGCDryRunSegsEligible atomic.Int64 + vlogGenerationLastGCBytesReferenced atomic.Int64 + vlogGenerationLastGCSegmentsReferenced atomic.Int64 + vlogGenerationLastGCBytesActive atomic.Int64 + vlogGenerationLastGCSegmentsActive atomic.Int64 + vlogGenerationLastGCBytesProtected atomic.Int64 + vlogGenerationLastGCSegmentsProtected atomic.Int64 + vlogGenerationLastGCBytesProtectedInUse atomic.Int64 + vlogGenerationLastGCSegmentsProtectedInUse atomic.Int64 + vlogGenerationLastGCBytesProtectedRetained atomic.Int64 + vlogGenerationLastGCSegmentsProtectedRetained atomic.Int64 + vlogGenerationLastGCBytesProtectedOverlap atomic.Int64 + vlogGenerationLastGCSegmentsProtectedOverlap atomic.Int64 + vlogGenerationLastGCBytesProtectedOther atomic.Int64 + vlogGenerationLastGCSegmentsProtectedOther atomic.Int64 + vlogGenerationLastGCBytesEligible atomic.Int64 + vlogGenerationLastGCSegmentsEligible atomic.Int64 + vlogGenerationLastGCBytesDeleted atomic.Int64 + vlogGenerationLastGCSegmentsDeleted atomic.Int64 + vlogGenerationLastGCBytesPending atomic.Int64 + vlogGenerationLastGCSegmentsPending atomic.Int64 + vlogGenerationChurnBytes atomic.Uint64 + vlogGenerationSchedulerState atomic.Uint32 + vlogGenerationMaintenanceActive atomic.Bool + vlogGenerationMaintenanceAttempts atomic.Uint64 + vlogGenerationMaintenanceAcquired atomic.Uint64 + vlogGenerationMaintenanceCollisions atomic.Uint64 + vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 + vlogGenerationMaintenanceSkipPhase atomic.Uint64 + vlogGenerationMaintenanceSkipStageGate atomic.Uint64 + vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 + vlogGenerationMaintenanceSkipStageDue atomic.Uint64 + vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 + 
vlogGenerationMaintenanceSkipPriority atomic.Uint64 + vlogGenerationMaintenanceSkipQuiet atomic.Uint64 + vlogGenerationMaintenanceSkipPreCheckpoint atomic.Uint64 + vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 + vlogGenerationMaintenancePassNoop atomic.Uint64 + vlogGenerationMaintenancePassWithRewrite atomic.Uint64 + vlogGenerationMaintenancePassWithGC atomic.Uint64 + vlogGenerationMaintenancePassTotalNanos atomic.Uint64 + vlogGenerationMaintenancePassMaxNanos atomic.Uint64 + vlogGenerationLastReason atomic.Uint32 + vlogGenerationCheckpointKickRuns atomic.Uint64 + vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 + vlogGenerationCheckpointKickGCRuns atomic.Uint64 + vlogGenerationCheckpointKickPending atomic.Bool + vlogGenerationDeferredMaintenancePending atomic.Bool + vlogGenerationDeferredMaintenanceRunning atomic.Bool + vlogGenerationRewriteStageWakeObservedNS atomic.Int64 + vlogGenerationRewriteQueueMu sync.Mutex + vlogGenerationCheckpointKickActive atomic.Bool + vlogGenerationRewriteQueue []uint32 + vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment + vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty + vlogGenerationRewriteStagePending bool + vlogGenerationRewriteStageObservedUnixNano int64 + vlogGenerationRewriteQueueLoaded bool + vlogGenerationLastChurnBps atomic.Int64 + vlogGenerationLastChurnSampleBytes atomic.Uint64 + vlogGenerationLastChurnSampleNS atomic.Int64 // Rewrite budget token bucket (bytes) for online maintenance. This lets us // interpret ValueLogRewriteBudgetBytesPerSec as a true per-second bandwidth // budget while still running maintenance at coarse intervals. 
@@ -7381,6 +7512,9 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { } l.vlogClosedSizes[seg.path] = seg.size l.vlogClosedBytes.Add(seg.size) + if _, retained := db.valueLogRetain[seg.path]; retained { + db.valueLogRetainedClosedBytes.Add(seg.size) + } } else { if seg.path == l.walPath { continue @@ -14006,6 +14140,19 @@ planned: return fmt.Errorf("generational gc after rewrite: %w", gcErr) } db.observeVlogGenerationGCStats(gcStats) + db.vlogGenerationGCRuns.Add(1) + if gcStats.SegmentsDeleted > 0 { + db.vlogGenerationGCSegmentsDeleted.Add(uint64(gcStats.SegmentsDeleted)) + } + if gcStats.BytesDeleted > 0 { + db.vlogGenerationGCBytesDeleted.Add(uint64(gcStats.BytesDeleted)) + } + if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { + // Retained-path protection can starve live reclaim even when rewrite + // processed stale payload in-pass. Kick an eager retained prune so + // lifecycle pins can drain without waiting for byte-pressure gates. + db.scheduleRetainedValueLogPruneForce() + } if gcStats.BytesDeleted > 0 { gcBytesDeleted = int64(gcStats.BytesDeleted) effectiveBytesAfter -= gcBytesDeleted @@ -14215,6 +14362,11 @@ planned: return fmt.Errorf("generational gc: %w", err) } db.observeVlogGenerationGCStats(gcStats) + if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { + // When GC classifies all reclaim blockers as retained-path protection, + // trigger an eager retained prune pass to release stale lifecycle pins. 
+ db.scheduleRetainedValueLogPruneForce() + } db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) db.vlogGenerationGCRuns.Add(1) if gcStats.SegmentsDeleted > 0 { @@ -19785,6 +19937,22 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_retained_segments"] = fmt.Sprintf("%d", vlogSegments) stats["treedb.cache.vlog_retained_bytes_estimate"] = fmt.Sprintf("%d", vlogBytes) stats["treedb.process.memory.vlog_retained_bytes_estimate"] = fmt.Sprintf("%d", vlogBytes) + stats["treedb.cache.vlog_retained_prune.closed_bytes"] = fmt.Sprintf("%d", db.valueLogRetainedClosedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_unix_nano"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastUnixNano.Load()) + stats["treedb.cache.vlog_retained_prune.runs"] = fmt.Sprintf("%d", db.retainedValueLogPruneRuns.Load()) + stats["treedb.cache.vlog_retained_prune.forced_runs"] = fmt.Sprintf("%d", db.retainedValueLogPruneForcedRuns.Load()) + stats["treedb.cache.vlog_retained_prune.foreground_abort_runs"] = fmt.Sprintf("%d", db.retainedValueLogPruneForegroundAbortRuns.Load()) + stats["treedb.cache.vlog_retained_prune.removed_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneRemovedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.removed_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneRemovedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.pressure_bytes"] = fmt.Sprintf("%d", db.retainedPrunePressureBytes()) + stats["treedb.cache.vlog_retained_prune.schedule_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleRequests.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleForcedRequests.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.closing"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipClosing.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.inflight"] = fmt.Sprintf("%d", 
db.retainedValueLogPruneScheduleSkipInFlight.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.no_closed_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipNoClosedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.below_pressure"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipBelowPressure.Load()) + stats["treedb.cache.vlog_retained_prune.schedule_skip.min_interval"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipMinInterval.Load()) + stats["treedb.cache.vlog_retained_prune.force_pending"] = fmt.Sprintf("%t", db.retainedPruneForceRequested.Load()) stats["treedb.cache.vlog_generation.policy"] = fmt.Sprintf("%d", db.valueLogGenerationPolicy) stats["treedb.cache.vlog_generation.enabled"] = fmt.Sprintf("%t", db.valueLogGenerationPolicy == uint8(backenddb.ValueLogGenerationHotWarmCold)) stats["treedb.cache.vlog_generation.maintenance_phase"] = maintenancePhaseString(db.maintenancePhase.Load()) diff --git a/TreeDB/caching/db_test.go b/TreeDB/caching/db_test.go index cb6e8adb3..8882fb6db 100644 --- a/TreeDB/caching/db_test.go +++ b/TreeDB/caching/db_test.go @@ -1953,6 +1953,57 @@ func TestCachingDB_PrunesRetainedValueLog(t *testing.T) { } } +func TestOpen_InitializesRetainedClosedBytesFromExistingSegments(t *testing.T) { + dir := t.TempDir() + + opts := Options{ + DisableWAL: true, + RelaxedSync: true, + AllowUnsafe: true, + FlushThreshold: 1 << 20, + ValueLogPointerThreshold: 1, + } + + backend1, err := db.Open(db.Options{Dir: dir, ChunkSize: 64 * 1024}) + if err != nil { + t.Fatalf("backend1 open: %v", err) + } + cache1, err := Open(dir, backend1, opts) + if err != nil { + _ = backend1.Close() + t.Fatalf("cache1 open: %v", err) + } + + if err := cache1.Set([]byte("k"), bytes.Repeat([]byte("x"), page.DefaultInlineThreshold+256)); err != nil { + t.Fatalf("Set: %v", err) + } + cache1.flushAll(false) + if err := cache1.rotateValueLogLocked(&cache1.lanes[0]); err != nil { + t.Fatalf("rotateValueLogLocked: %v", 
err) + } + if got := cache1.valueLogRetainedClosedBytes.Load(); got <= 0 { + t.Fatalf("pre-close retained closed bytes=%d want >0", got) + } + if err := cache1.Close(); err != nil { + t.Fatalf("cache1 close: %v", err) + } + + backend2, err := db.Open(db.Options{Dir: dir, ChunkSize: 64 * 1024}) + if err != nil { + t.Fatalf("backend2 open: %v", err) + } + cache2, err := Open(dir, backend2, opts) + if err != nil { + _ = backend2.Close() + t.Fatalf("cache2 open: %v", err) + } + defer cache2.Close() + + if got := cache2.valueLogRetainedClosedBytes.Load(); got <= 0 { + t.Fatalf("reopen retained closed bytes=%d want >0", got) + } +} + func TestPruneRetainedValueLogs_SkipsLiveScanWhenAllRetainedPathsInUse(t *testing.T) { dir := t.TempDir() backend := NewMockBackend() @@ -2361,6 +2412,145 @@ func TestCheckpoint_SkipsRetainedValueLogPruneBelowPressureThreshold(t *testing. if cache.retainedPruneActive() { cache.waitForRetainedValueLogPrune() } + stats := cache.Stats() + if got := stats["treedb.cache.vlog_retained_prune.schedule_requests"]; got != "1" { + t.Fatalf("schedule_requests=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"]; got != "0" { + t.Fatalf("schedule_forced_requests=%q want 0", got) + } + if got := stats["treedb.cache.vlog_retained_prune.schedule_skip.below_pressure"]; got != "1" { + t.Fatalf("schedule_skip.below_pressure=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.closed_bytes"]; got != "128" { + t.Fatalf("closed_bytes=%q want 128", got) + } +} + +func TestRetainedValueLogPruneForce_BypassesPressureThreshold(t *testing.T) { + dir := t.TempDir() + backend := NewMockBackend() + backend.iteratorStartedCh = make(chan struct{}) + backend.iteratorBlockCh = make(chan struct{}) + + cache, err := Open(dir, backend, Options{ + DisableWAL: true, + RelaxedSync: true, + AllowUnsafe: true, + FlushThreshold: 1 << 20, + MaxValueLogRetainedBytes: 1 << 20, + ValueLogPointerThreshold: 1, + }) + if err 
!= nil { + t.Fatalf("cache open: %v", err) + } + defer cache.Close() + + fileID, err := valuelog.EncodeFileID(0, 245) + if err != nil { + t.Fatalf("EncodeFileID: %v", err) + } + retainedPath := filepath.Join(dir, "wal", "value-l0-000245.log") + if err := os.MkdirAll(filepath.Dir(retainedPath), 0o755); err != nil { + t.Fatalf("MkdirAll: %v", err) + } + w, err := valuelog.NewWriter(retainedPath, fileID) + if err != nil { + t.Fatalf("NewWriter: %v", err) + } + if _, err := w.Append(0, nil, 1, bytes.Repeat([]byte("t"), 128)); err != nil { + _ = w.Close() + t.Fatalf("Append: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close writer: %v", err) + } + cache.markValueLogRetain(retainedPath) + seedRetainedPrunePressure(cache, retainedPath, 128) + cache.lastForegroundWriteUnixNano.Store(time.Now().Add(-2 * retainedPruneQuietWindow).UnixNano()) + + cache.scheduleRetainedValueLogPruneForce() + + select { + case <-backend.iteratorStartedCh: + case <-time.After(2 * time.Second): + t.Fatalf("forced retained prune did not start below pressure threshold") + } + close(backend.iteratorBlockCh) + cache.waitForRetainedValueLogPrune() + stats := cache.Stats() + if got := stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"]; got != "1" { + t.Fatalf("schedule_forced_requests=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.forced_runs"]; got != "1" { + t.Fatalf("forced_runs=%q want 1", got) + } +} + +func TestRetainedValueLogPruneForce_PreemptsQuietWait(t *testing.T) { + dir := t.TempDir() + backend := NewMockBackend() + backend.iteratorStartedCh = make(chan struct{}) + backend.iteratorBlockCh = make(chan struct{}) + + cache, err := Open(dir, backend, Options{ + DisableWAL: true, + RelaxedSync: true, + AllowUnsafe: true, + FlushThreshold: 1 << 20, + ValueLogPointerThreshold: 1, + }) + if err != nil { + t.Fatalf("cache open: %v", err) + } + defer cache.Close() + + fileID, err := valuelog.EncodeFileID(0, 246) + if err != nil { + 
t.Fatalf("EncodeFileID: %v", err) + } + retainedPath := filepath.Join(dir, "wal", "value-l0-000246.log") + if err := os.MkdirAll(filepath.Dir(retainedPath), 0o755); err != nil { + t.Fatalf("MkdirAll: %v", err) + } + w, err := valuelog.NewWriter(retainedPath, fileID) + if err != nil { + t.Fatalf("NewWriter: %v", err) + } + if _, err := w.Append(0, nil, 1, bytes.Repeat([]byte("u"), 128)); err != nil { + _ = w.Close() + t.Fatalf("Append: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close writer: %v", err) + } + cache.markValueLogRetain(retainedPath) + seedRetainedPrunePressure(cache, retainedPath, 2<<30) + cache.lastForegroundWriteUnixNano.Store(time.Now().UnixNano()) + + cache.scheduleRetainedValueLogPrune() + select { + case <-backend.iteratorStartedCh: + t.Fatalf("retained prune started before quiet window elapsed") + case <-time.After(retainedPruneNegativeAssertWait): + } + + cache.scheduleRetainedValueLogPruneForce() + + select { + case <-backend.iteratorStartedCh: + case <-time.After(2 * time.Second): + t.Fatalf("forced retained prune did not preempt quiet-window wait") + } + close(backend.iteratorBlockCh) + cache.waitForRetainedValueLogPrune() + stats := cache.Stats() + if got := stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"]; got != "1" { + t.Fatalf("schedule_forced_requests=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.forced_runs"]; got != "1" { + t.Fatalf("forced_runs=%q want 1", got) + } } func TestCheckpoint_DoesNotWaitForPriorRetainedValueLogPrune(t *testing.T) { diff --git a/TreeDB/caching/expvar_stats.go b/TreeDB/caching/expvar_stats.go index 5a7240f46..3e9f2b7ed 100644 --- a/TreeDB/caching/expvar_stats.go +++ b/TreeDB/caching/expvar_stats.go @@ -140,6 +140,7 @@ func selectTreeDBExpvarStats(stats map[string]string) map[string]any { strings.HasPrefix(k, "treedb.cache.vlog_auto.") || strings.HasPrefix(k, "treedb.cache.vlog_dict.") || strings.HasPrefix(k, "treedb.cache.vlog_generation.") || + 
strings.HasPrefix(k, "treedb.cache.vlog_retained_prune.") || strings.HasPrefix(k, "treedb.cache.vlog_payload_kind.") || strings.HasPrefix(k, "treedb.cache.vlog_outer_leaf_codec.") || strings.HasPrefix(k, "treedb.cache.batch_arena.") { diff --git a/TreeDB/caching/expvar_stats_test.go b/TreeDB/caching/expvar_stats_test.go index ff1982510..cc88d93ab 100644 --- a/TreeDB/caching/expvar_stats_test.go +++ b/TreeDB/caching/expvar_stats_test.go @@ -29,6 +29,7 @@ func TestSelectTreeDBExpvarStatsFiltersAndCoerces(t *testing.T) { "treedb.cache.vlog_payload_kind.raw_bytes.single_value": "2048", "treedb.cache.vlog_outer_leaf_codec.raw_bytes.lz4": "512", "treedb.cache.vlog_generation.rewrite.reclaimed_bytes": "1234", + "treedb.cache.vlog_retained_prune.runs": "3", "treedb.process.memory.heap_inuse_bytes": "4096", "treedb.process.memory.pool_pressure_level": "critical", "treedb.cache.batch_arena.pool_bytes_estimate": "65536", @@ -84,6 +85,9 @@ func TestSelectTreeDBExpvarStatsFiltersAndCoerces(t *testing.T) { if v, ok := got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"].(int64); !ok || v != 1234 { t.Fatalf("vlog_generation.rewrite.reclaimed_bytes=%T(%v) want int64(1234)", got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"], got["treedb.cache.vlog_generation.rewrite.reclaimed_bytes"]) } + if v, ok := got["treedb.cache.vlog_retained_prune.runs"].(int64); !ok || v != 3 { + t.Fatalf("vlog_retained_prune.runs=%T(%v) want int64(3)", got["treedb.cache.vlog_retained_prune.runs"], got["treedb.cache.vlog_retained_prune.runs"]) + } if v, ok := got["treedb.process.memory.heap_inuse_bytes"].(int64); !ok || v != 4096 { t.Fatalf("heap_inuse_bytes=%T(%v) want int64(4096)", got["treedb.process.memory.heap_inuse_bytes"], got["treedb.process.memory.heap_inuse_bytes"]) } diff --git a/worklog/2026-03-27.md b/worklog/2026-03-27.md index b44c770fb..c4178103a 100644 --- a/worklog/2026-03-27.md +++ b/worklog/2026-03-27.md @@ -490,3 +490,59 @@ - pre: `5266839216` - tool output: 
`segments_before=22 segments_after=16 bytes_before=4993530542 bytes_after=2108841030 records=995454` - post: `2148318606` + +## Retained-Prune Scheduling Instrumentation + Force Preemption (late update) + +- Code updates: + - `TreeDB/caching/db.go` + - retained-prune now exports explicit counters: + - `treedb.cache.vlog_retained_prune.closed_bytes` + - `treedb.cache.vlog_retained_prune.pressure_bytes` + - `treedb.cache.vlog_retained_prune.schedule_requests` + - `treedb.cache.vlog_retained_prune.schedule_forced_requests` + - `treedb.cache.vlog_retained_prune.schedule_skip.{closing,inflight,no_closed_bytes,below_pressure,min_interval}` + - `treedb.cache.vlog_retained_prune.force_pending` + - plus run/outcome counters (`runs`, `forced_runs`, `foreground_abort_runs`, `removed_*`). + - expvar now exports `treedb.cache.vlog_retained_prune.*` via allowlist. + - forced retained-prune requests can preempt a currently inflight quiet-window wait (instead of being starved by `schedule_skip.inflight`). + - retained-prune force trigger from GC-after-rewrite/periodic-GC is gated on `valueLogRetainedClosedBytes > 0`. + - `vlog_generation.gc.{runs,deleted_*}` accounting is updated in post-rewrite GC path as well. + - open-path retained closed-byte initialization includes existing retained segments found at startup. + - `TreeDB/caching/db_test.go` + - added: + - `TestOpen_InitializesRetainedClosedBytesFromExistingSegments` + - `TestRetainedValueLogPruneForce_BypassesPressureThreshold` + - `TestRetainedValueLogPruneForce_PreemptsQuietWait` + - extended `TestCheckpoint_SkipsRetainedValueLogPruneBelowPressureThreshold` with schedule/closed-byte assertions. + - `TreeDB/caching/expvar_stats_test.go` + - selector test now verifies retained-prune family inclusion. 
+ +- Validation: + - focused: + - `go test ./TreeDB/caching -run 'TestSelectTreeDBExpvarStatsFiltersAndCoerces|TestOpen_InitializesRetainedClosedBytesFromExistingSegments|TestRetainedValueLogPruneForce_BypassesPressureThreshold|TestRetainedValueLogPruneForce_PreemptsQuietWait|TestCheckpoint_SkipsRetainedValueLogPruneBelowPressureThreshold|TestCheckpoint_DoesNotWaitForPriorRetainedValueLogPrune|TestCheckpoint_SchedulesRetainedValueLogPruneAsynchronously|TestCheckpoint_DefersRetainedValueLogPruneUntilForegroundQuiet|TestRetainedValueLogPrune_AbortsWhenForegroundWritesResume|TestCheckpoint_RateLimitsRetainedValueLogPrune|TestBackendMaintenance_DoesNotBlockOnRetainedValueLogPruneQuietWindow|TestVlogGenerationStats_ReportRewriteBacklogAndDurations' -count=1` + - full: + - `go test ./TreeDB/caching ./TreeDB/db -count=1` + +- Live run readouts: + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260327222148` + - snapshot: `pprof-heap-max-rss-final-11238156k-20260327222731.debug_vars.json` + - application instance: + - `rewrite.runs=1`, `rewrite.processed_stale_bytes=498581053`, `rewrite.reclaimed_bytes=0` + - `gc.runs=2`, `gc.last_protected_retained_bytes=536873907`, `gc.last_eligible_bytes=0` + - `retained_prune.closed_bytes=5100295854` + - `retained_prune.pressure_bytes=17179869164` + - `retained_prune.schedule_requests=1551` + - `retained_prune.schedule_forced_requests=1` + - `retained_prune.schedule_skip.inflight=1549` + - `retained_prune.runs=0` + - interpretation: + - before force-preemption fix, one inflight quiet-window worker starved later forced request. 
+ - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260327223633` + - snapshot: `pprof-heap-max-rss-8938296k-20260327224120.debug_vars.json` + - application instance: + - `rewrite.runs=0` (forced path not exercised on this short window) + - `retained_prune.closed_bytes=4563428411` + - `retained_prune.schedule_requests=185` + - `retained_prune.schedule_forced_requests=0` + - `retained_prune.schedule_skip.inflight=183` + - `retained_prune.runs=0` From da497ab14e9048b40d7011452b5459ac30da85be Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 23:07:10 -1000 Subject: [PATCH 14/61] treedb: retry forced retained-prune scan without write gate --- TreeDB/caching/db.go | 23 ++++++++++- TreeDB/caching/db_test.go | 81 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 3 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index c9dfa7c01..ae87b8566 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4194,6 +4194,8 @@ type retainedValueLogPruneStats struct { RemovedSegments int RemovedBytes int64 AbortedForegroundWrites bool + RetriedWithoutWriteGate bool + RetrySucceeded bool } func (db *DB) valueLogClosedSegmentSize(path string) int64 { @@ -4213,7 +4215,7 @@ func (db *DB) valueLogClosedSegmentSize(path string) int64 { return l.vlogClosedSizes[path] } -func (db *DB) pruneRetainedValueLogs() retainedValueLogPruneStats { +func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { var out retainedValueLogPruneStats if !db.valueLogEnabled() { return out @@ -4248,6 +4250,13 @@ func (db *DB) pruneRetainedValueLogs() retainedValueLogPruneStats { } live, err := db.collectValueLogLiveIDsUntil(db.lastForegroundWriteUnixNano.Load()) + if err != nil && force && errors.Is(err, errForegroundWritesResumed) { + out.RetriedWithoutWriteGate = true + live, err = db.collectValueLogLiveIDsUntil(0) + if err == nil { + out.RetrySucceeded = true + } + } if err != nil { if errors.Is(err, 
errForegroundWritesResumed) { out.AbortedForegroundWrites = true @@ -4490,7 +4499,13 @@ func (db *DB) scheduleRetainedValueLogPruneWithForce(force bool) { db.retainedValueLogPruneForcedRuns.Add(1) } db.retainedValueLogPruneLastUnixNano.Store(now.UnixNano()) - pruneStats := db.pruneRetainedValueLogs() + pruneStats := db.pruneRetainedValueLogs(effectiveForce) + if pruneStats.RetriedWithoutWriteGate { + db.retainedValueLogPruneWriteGateRetries.Add(1) + if pruneStats.RetrySucceeded { + db.retainedValueLogPruneWriteGateRetrySuccesses.Add(1) + } + } if pruneStats.AbortedForegroundWrites { db.retainedValueLogPruneForegroundAbortRuns.Add(1) } @@ -5304,6 +5319,8 @@ type DB struct { retainedValueLogPruneScheduleSkipNoClosedBytes atomic.Uint64 retainedValueLogPruneScheduleSkipBelowPressure atomic.Uint64 retainedValueLogPruneScheduleSkipMinInterval atomic.Uint64 + retainedValueLogPruneWriteGateRetries atomic.Uint64 + retainedValueLogPruneWriteGateRetrySuccesses atomic.Uint64 retainedPruneForceRequested atomic.Bool retainedPruneMu sync.Mutex retainedPruneDone chan struct{} @@ -19952,6 +19969,8 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_retained_prune.schedule_skip.no_closed_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipNoClosedBytes.Load()) stats["treedb.cache.vlog_retained_prune.schedule_skip.below_pressure"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipBelowPressure.Load()) stats["treedb.cache.vlog_retained_prune.schedule_skip.min_interval"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleSkipMinInterval.Load()) + stats["treedb.cache.vlog_retained_prune.write_gate_retries"] = fmt.Sprintf("%d", db.retainedValueLogPruneWriteGateRetries.Load()) + stats["treedb.cache.vlog_retained_prune.write_gate_retry_successes"] = fmt.Sprintf("%d", db.retainedValueLogPruneWriteGateRetrySuccesses.Load()) stats["treedb.cache.vlog_retained_prune.force_pending"] = fmt.Sprintf("%t", db.retainedPruneForceRequested.Load()) 
stats["treedb.cache.vlog_generation.policy"] = fmt.Sprintf("%d", db.valueLogGenerationPolicy) stats["treedb.cache.vlog_generation.enabled"] = fmt.Sprintf("%t", db.valueLogGenerationPolicy == uint8(backenddb.ValueLogGenerationHotWarmCold)) diff --git a/TreeDB/caching/db_test.go b/TreeDB/caching/db_test.go index 8882fb6db..3c8d029f5 100644 --- a/TreeDB/caching/db_test.go +++ b/TreeDB/caching/db_test.go @@ -2027,7 +2027,7 @@ func TestPruneRetainedValueLogs_SkipsLiveScanWhenAllRetainedPathsInUse(t *testin } cache.markValueLogRetain(retained) - cache.pruneRetainedValueLogs() + cache.pruneRetainedValueLogs(false) backend.mu.RLock() iteratorCalls := backend.iteratorCalls @@ -2279,6 +2279,85 @@ func TestRetainedValueLogPrune_AbortsWhenForegroundWritesResume(t *testing.T) { } } +func TestRetainedValueLogPruneForce_RetriesAfterForegroundWritesResume(t *testing.T) { + dir := t.TempDir() + backend := NewMockBackend() + backend.iteratorStartedCh = make(chan struct{}) + backend.iteratorBlockCh = make(chan struct{}) + + cache, err := Open(dir, backend, Options{ + DisableWAL: true, + RelaxedSync: true, + AllowUnsafe: true, + FlushThreshold: 1 << 20, + ValueLogPointerThreshold: 1, + }) + if err != nil { + t.Fatalf("cache open: %v", err) + } + defer cache.Close() + + fileID, err := valuelog.EncodeFileID(0, 212) + if err != nil { + t.Fatalf("EncodeFileID: %v", err) + } + retainedPath := filepath.Join(dir, "wal", "value-l0-000212.log") + if err := os.MkdirAll(filepath.Dir(retainedPath), 0o755); err != nil { + t.Fatalf("MkdirAll: %v", err) + } + w, err := valuelog.NewWriter(retainedPath, fileID) + if err != nil { + t.Fatalf("NewWriter: %v", err) + } + if _, err := w.Append(0, nil, 1, bytes.Repeat([]byte("r"), 128)); err != nil { + _ = w.Close() + t.Fatalf("Append: %v", err) + } + if err := w.Close(); err != nil { + t.Fatalf("Close writer: %v", err) + } + cache.markValueLogRetain(retainedPath) + seedRetainedPrunePressure(cache, retainedPath, 2<<30) + 
cache.lastForegroundWriteUnixNano.Store(time.Now().Add(-2 * retainedPruneQuietWindow).UnixNano()) + + cache.scheduleRetainedValueLogPruneForce() + + select { + case <-backend.iteratorStartedCh: + case <-time.After(2 * time.Second): + t.Fatalf("forced prune did not start") + } + + lastWrite := cache.lastForegroundWriteUnixNano.Load() + deadline := time.Now().Add(2 * time.Second) + for !cache.foregroundWritesResumedSince(lastWrite) { + if time.Now().After(deadline) { + t.Fatalf("foreground write timestamp did not advance") + } + cache.noteWrite() + time.Sleep(time.Millisecond) + } + close(backend.iteratorBlockCh) + cache.waitForRetainedValueLogPrune() + + if cache.valueLogRetained(retainedPath) { + t.Fatalf("retained path still marked after forced retry prune") + } + stats := cache.Stats() + if got := stats["treedb.cache.vlog_retained_prune.forced_runs"]; got != "1" { + t.Fatalf("forced_runs=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.foreground_abort_runs"]; got != "0" { + t.Fatalf("foreground_abort_runs=%q want 0", got) + } + if got := stats["treedb.cache.vlog_retained_prune.write_gate_retries"]; got != "1" { + t.Fatalf("write_gate_retries=%q want 1", got) + } + if got := stats["treedb.cache.vlog_retained_prune.write_gate_retry_successes"]; got != "1" { + t.Fatalf("write_gate_retry_successes=%q want 1", got) + } +} + func TestCheckpoint_RateLimitsRetainedValueLogPrune(t *testing.T) { dir := t.TempDir() backend := NewMockBackend() From c2007567f786975de9a0029fd7f5992bcaa7a59e Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 23:27:22 -1000 Subject: [PATCH 15/61] treedb: force retained prune when vlog hard cap is exceeded --- TreeDB/caching/db.go | 3 +++ TreeDB/caching/db_test.go | 36 +++++++++++++++++++++++++++++++ TreeDB/env_vlog_overrides_test.go | 13 +++++++++++ TreeDB/public.go | 8 +++++++ 4 files changed, 60 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index ae87b8566..43293f422 100644 --- 
a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4175,6 +4175,9 @@ func (db *DB) allowValueLogPointers() bool { if bytes >= limit { if db.valueLogHardCapWarned.CompareAndSwap(false, true) { db.reportError(fmt.Errorf("cachingdb: retained value-log bytes %d exceed hard cap %d; disabling new value-log pointers", bytes, limit)) + // Hard-cap entry means retained bytes are now constraining placement. + // Request an eager retained prune so lifecycle pins can drain promptly. + db.scheduleRetainedValueLogPruneForce() } return false } diff --git a/TreeDB/caching/db_test.go b/TreeDB/caching/db_test.go index 3c8d029f5..3f8c3151f 100644 --- a/TreeDB/caching/db_test.go +++ b/TreeDB/caching/db_test.go @@ -2358,6 +2358,42 @@ func TestRetainedValueLogPruneForce_RetriesAfterForegroundWritesResume(t *testin } } +func TestAllowValueLogPointers_HardCapRequestsForcedRetainedPrune(t *testing.T) { + cache := &DB{} + cache.testSkipRetainedPrune = true + cache.maxValueLogRetainedBytesHard = 1024 + cache.valueLogRetainedClosedBytes.Store(2048) + + if cache.allowValueLogPointers() { + t.Fatalf("allowValueLogPointers=true, want false when hard cap exceeded") + } + if got := cache.retainedValueLogPruneScheduleForcedRequests.Load(); got != 1 { + t.Fatalf("schedule_forced_requests=%d want 1 after first hard-cap crossing", got) + } + + // Re-check while still over cap should not repeatedly re-schedule until + // retained bytes drop back below the hard cap. 
+ if cache.allowValueLogPointers() { + t.Fatalf("allowValueLogPointers=true on repeated over-cap check, want false") + } + if got := cache.retainedValueLogPruneScheduleForcedRequests.Load(); got != 1 { + t.Fatalf("schedule_forced_requests=%d want 1 after repeated over-cap check", got) + } + + cache.valueLogRetainedClosedBytes.Store(0) + if !cache.allowValueLogPointers() { + t.Fatalf("allowValueLogPointers=false, want true after dropping below hard cap") + } + + cache.valueLogRetainedClosedBytes.Store(4096) + if cache.allowValueLogPointers() { + t.Fatalf("allowValueLogPointers=true after second hard-cap crossing, want false") + } + if got := cache.retainedValueLogPruneScheduleForcedRequests.Load(); got != 2 { + t.Fatalf("schedule_forced_requests=%d want 2 after second hard-cap crossing", got) + } +} + func TestCheckpoint_RateLimitsRetainedValueLogPrune(t *testing.T) { dir := t.TempDir() backend := NewMockBackend() diff --git a/TreeDB/env_vlog_overrides_test.go b/TreeDB/env_vlog_overrides_test.go index 019b46ef0..27e9deeee 100644 --- a/TreeDB/env_vlog_overrides_test.go +++ b/TreeDB/env_vlog_overrides_test.go @@ -153,3 +153,16 @@ func TestApplyEnvMaintenanceOverrides_VlogDictClassModeDefaultAlias(t *testing.T t.Fatalf("expected dict class mode single for default alias, got %v", got) } } + +func TestApplyEnvMaintenanceOverrides_VlogRetainedCaps(t *testing.T) { + opts := Options{} + t.Setenv(envVlogMaxRetainedBytes, "123456") + t.Setenv(envVlogMaxRetainedBytesHard, "654321") + applyEnvMaintenanceOverrides(&opts) + if got := opts.ValueLog.MaxRetainedBytes; got != 123456 { + t.Fatalf("expected max retained bytes=123456, got %d", got) + } + if got := opts.ValueLog.MaxRetainedBytesHard; got != 654321 { + t.Fatalf("expected max retained bytes hard=654321, got %d", got) + } +} diff --git a/TreeDB/public.go b/TreeDB/public.go index 619c9051f..66a853e7a 100644 --- a/TreeDB/public.go +++ b/TreeDB/public.go @@ -710,6 +710,8 @@ const ( envVlogDictEntropy = 
"TREEDB_VLOG_DICT_ENTROPY" // bool envVlogDictAdaptiveRatio = "TREEDB_VLOG_DICT_ADAPTIVE_RATIO" // float64 envVlogDictMinPayloadSavings = "TREEDB_VLOG_DICT_MIN_PAYLOAD_SAVINGS_RATIO" // float64 + envVlogMaxRetainedBytes = "TREEDB_VLOG_MAX_RETAINED_BYTES" // int64 + envVlogMaxRetainedBytesHard = "TREEDB_VLOG_MAX_RETAINED_BYTES_HARD" // int64 ) func applyEnvMaintenanceOverrides(opts *Options) { @@ -828,6 +830,12 @@ func applyEnvMaintenanceOverrides(opts *Options) { if v, ok := envFloat64(envVlogDictMinPayloadSavings); ok { opts.ValueLog.DictMinPayloadSavingsRatio = v } + if v, ok := envInt(envVlogMaxRetainedBytes); ok { + opts.ValueLog.MaxRetainedBytes = int64(v) + } + if v, ok := envInt(envVlogMaxRetainedBytesHard); ok { + opts.ValueLog.MaxRetainedBytesHard = int64(v) + } } func computeDurabilityMode(opts Options) string { From 3b30044b1597faac4a064dd02f55def67ede6c48 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 23:41:33 -1000 Subject: [PATCH 16/61] treedb: add retained-prune reason counters --- TreeDB/caching/db.go | 103 ++++++++++++++++++++++++++++++++++++-- TreeDB/caching/db_test.go | 8 ++- 2 files changed, 105 insertions(+), 6 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 43293f422..6ecb5a7f0 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4196,6 +4196,16 @@ type valueLogSetRefresher interface { type retainedValueLogPruneStats struct { RemovedSegments int RemovedBytes int64 + InUseSkippedSegments int + InUseSkippedBytes int64 + CandidateSegments int + CandidateBytes int64 + LiveSkippedSegments int + LiveSkippedBytes int64 + ParseSkippedSegments int + ParseSkippedBytes int64 + ZombieMarkedSegments int + ZombieMarkedBytes int64 AbortedForegroundWrites bool RetriedWithoutWriteGate bool RetrySucceeded bool @@ -4233,12 +4243,20 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { inUse[path] = struct{}{} } - candidatePaths := make([]string, 0, len(paths)) + type pruneCandidate 
struct { + path string + size int64 + } + candidatePaths := make([]pruneCandidate, 0, len(paths)) for _, path := range paths { + size := db.valueLogClosedSegmentSize(path) if _, ok := inUse[path]; ok { + out.InUseSkippedSegments++ + if size > 0 { + out.InUseSkippedBytes += size + } continue } - size := db.valueLogClosedSegmentSize(path) if db.cleanupMissingRetainedValueLog(path) { if size > 0 { out.RemovedSegments++ @@ -4246,7 +4264,11 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { } continue } - candidatePaths = append(candidatePaths, path) + out.CandidateSegments++ + if size > 0 { + out.CandidateBytes += size + } + candidatePaths = append(candidatePaths, pruneCandidate{path: path, size: size}) } if len(candidatePaths) == 0 { return out @@ -4271,20 +4293,37 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { removed := false marked := false - for _, path := range candidatePaths { - size := db.valueLogClosedSegmentSize(path) + for _, candidate := range candidatePaths { + path := candidate.path + size := candidate.size laneID, seq, valueLog, ok := parseLogSeq(filepath.Base(path)) if !ok || !valueLog { + out.ParseSkippedSegments++ + if size > 0 { + out.ParseSkippedBytes += size + } continue } if laneID < 0 { + out.ParseSkippedSegments++ + if size > 0 { + out.ParseSkippedBytes += size + } continue } id, err := valuelog.EncodeFileID(uint32(laneID), uint32(seq)) if err != nil { + out.ParseSkippedSegments++ + if size > 0 { + out.ParseSkippedBytes += size + } continue } if _, ok := live[id]; ok { + out.LiveSkippedSegments++ + if size > 0 { + out.LiveSkippedBytes += size + } continue } @@ -4311,6 +4350,10 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { db.reportError(fmt.Errorf("cachingdb: failed to mark value-log %d zombie: %w", id, err)) continue } + out.ZombieMarkedSegments++ + if size > 0 { + out.ZombieMarkedBytes += size + } marked = true } else { 
db.dropValueLogSegment(path) @@ -4518,6 +4561,36 @@ func (db *DB) scheduleRetainedValueLogPruneWithForce(force bool) { if pruneStats.RemovedBytes > 0 { db.retainedValueLogPruneRemovedBytes.Add(uint64(pruneStats.RemovedBytes)) } + if pruneStats.InUseSkippedSegments > 0 { + db.retainedValueLogPruneInUseSkippedSegments.Add(uint64(pruneStats.InUseSkippedSegments)) + } + if pruneStats.InUseSkippedBytes > 0 { + db.retainedValueLogPruneInUseSkippedBytes.Add(uint64(pruneStats.InUseSkippedBytes)) + } + if pruneStats.CandidateSegments > 0 { + db.retainedValueLogPruneCandidateSegments.Add(uint64(pruneStats.CandidateSegments)) + } + if pruneStats.CandidateBytes > 0 { + db.retainedValueLogPruneCandidateBytes.Add(uint64(pruneStats.CandidateBytes)) + } + if pruneStats.LiveSkippedSegments > 0 { + db.retainedValueLogPruneLiveSkippedSegments.Add(uint64(pruneStats.LiveSkippedSegments)) + } + if pruneStats.LiveSkippedBytes > 0 { + db.retainedValueLogPruneLiveSkippedBytes.Add(uint64(pruneStats.LiveSkippedBytes)) + } + if pruneStats.ParseSkippedSegments > 0 { + db.retainedValueLogPruneParseSkippedSegments.Add(uint64(pruneStats.ParseSkippedSegments)) + } + if pruneStats.ParseSkippedBytes > 0 { + db.retainedValueLogPruneParseSkippedBytes.Add(uint64(pruneStats.ParseSkippedBytes)) + } + if pruneStats.ZombieMarkedSegments > 0 { + db.retainedValueLogPruneZombieMarkedSegments.Add(uint64(pruneStats.ZombieMarkedSegments)) + } + if pruneStats.ZombieMarkedBytes > 0 { + db.retainedValueLogPruneZombieMarkedBytes.Add(uint64(pruneStats.ZombieMarkedBytes)) + } }() } @@ -5315,6 +5388,16 @@ type DB struct { retainedValueLogPruneForegroundAbortRuns atomic.Uint64 retainedValueLogPruneRemovedSegments atomic.Uint64 retainedValueLogPruneRemovedBytes atomic.Uint64 + retainedValueLogPruneInUseSkippedSegments atomic.Uint64 + retainedValueLogPruneInUseSkippedBytes atomic.Uint64 + retainedValueLogPruneCandidateSegments atomic.Uint64 + retainedValueLogPruneCandidateBytes atomic.Uint64 + 
retainedValueLogPruneLiveSkippedSegments atomic.Uint64 + retainedValueLogPruneLiveSkippedBytes atomic.Uint64 + retainedValueLogPruneParseSkippedSegments atomic.Uint64 + retainedValueLogPruneParseSkippedBytes atomic.Uint64 + retainedValueLogPruneZombieMarkedSegments atomic.Uint64 + retainedValueLogPruneZombieMarkedBytes atomic.Uint64 retainedValueLogPruneScheduleRequests atomic.Uint64 retainedValueLogPruneScheduleForcedRequests atomic.Uint64 retainedValueLogPruneScheduleSkipClosing atomic.Uint64 @@ -19964,6 +20047,16 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_retained_prune.foreground_abort_runs"] = fmt.Sprintf("%d", db.retainedValueLogPruneForegroundAbortRuns.Load()) stats["treedb.cache.vlog_retained_prune.removed_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneRemovedSegments.Load()) stats["treedb.cache.vlog_retained_prune.removed_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneRemovedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.in_use_skipped_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneInUseSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.in_use_skipped_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneInUseSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.candidate_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneCandidateSegments.Load()) + stats["treedb.cache.vlog_retained_prune.candidate_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneCandidateBytes.Load()) + stats["treedb.cache.vlog_retained_prune.live_skipped_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneLiveSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.live_skipped_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneLiveSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.parse_skipped_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneParseSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.parse_skipped_bytes"] = fmt.Sprintf("%d", 
db.retainedValueLogPruneParseSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.zombie_marked_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneZombieMarkedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.zombie_marked_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneZombieMarkedBytes.Load()) stats["treedb.cache.vlog_retained_prune.pressure_bytes"] = fmt.Sprintf("%d", db.retainedPrunePressureBytes()) stats["treedb.cache.vlog_retained_prune.schedule_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleRequests.Load()) stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleForcedRequests.Load()) diff --git a/TreeDB/caching/db_test.go b/TreeDB/caching/db_test.go index 3f8c3151f..9a2a4a024 100644 --- a/TreeDB/caching/db_test.go +++ b/TreeDB/caching/db_test.go @@ -2027,7 +2027,7 @@ func TestPruneRetainedValueLogs_SkipsLiveScanWhenAllRetainedPathsInUse(t *testin } cache.markValueLogRetain(retained) - cache.pruneRetainedValueLogs(false) + pruneStats := cache.pruneRetainedValueLogs(false) backend.mu.RLock() iteratorCalls := backend.iteratorCalls @@ -2038,6 +2038,12 @@ func TestPruneRetainedValueLogs_SkipsLiveScanWhenAllRetainedPathsInUse(t *testin if !cache.valueLogRetained(retained) { t.Fatalf("expected in-use retained path to remain retained") } + if pruneStats.InUseSkippedSegments != 1 { + t.Fatalf("InUseSkippedSegments=%d want 1", pruneStats.InUseSkippedSegments) + } + if pruneStats.CandidateSegments != 0 { + t.Fatalf("CandidateSegments=%d want 0", pruneStats.CandidateSegments) + } } func seedRetainedPrunePressure(cache *DB, retainedPath string, size int64) { From 8c101d6580e1a61c4eac9ca742cb66ca2bdeb0b7 Mon Sep 17 00:00:00 2001 From: Mikers Date: Fri, 27 Mar 2026 23:53:16 -1000 Subject: [PATCH 17/61] treedb: instrument rewrite-plan empty reasons --- TreeDB/caching/db.go | 30 +++++++++++ .../caching/vlog_generation_scheduler_test.go | 50 +++++++++++++++++++ 2 files 
changed, 80 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 6ecb5a7f0..77eb3f2c3 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5424,11 +5424,16 @@ type DB struct { vlogGenerationRewritePlanCanceled atomic.Uint64 vlogGenerationRewritePlanErrors atomic.Uint64 vlogGenerationRewritePlanEmpty atomic.Uint64 + vlogGenerationRewritePlanEmptyAgeBlocked atomic.Uint64 + vlogGenerationRewritePlanEmptyNoSelection atomic.Uint64 vlogGenerationRewritePlanSelected atomic.Uint64 vlogGenerationRewritePlanSelectedSegments atomic.Uint64 vlogGenerationRewritePlanSelectedBytes atomic.Uint64 vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterRuns atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterSegments atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterToEmpty atomic.Uint64 vlogGenerationRewritePlanCanceledLastNS atomic.Int64 vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool @@ -12880,6 +12885,22 @@ func (db *DB) observeVlogGenerationRewritePlanOutcomeWithDuration(plan backenddb return } db.vlogGenerationRewritePlanEmpty.Add(1) + if plan.AgeBlockedSegments > 0 && plan.AgeBlockedMinRemainingAge > 0 { + db.vlogGenerationRewritePlanEmptyAgeBlocked.Add(1) + } else { + db.vlogGenerationRewritePlanEmptyNoSelection.Add(1) + } +} + +func (db *DB) observeVlogGenerationRewritePlanPenaltyFilter(before, after int) { + if db == nil || before <= 0 || after >= before { + return + } + db.vlogGenerationRewritePlanPenaltyFilterRuns.Add(1) + db.vlogGenerationRewritePlanPenaltyFilterSegments.Add(uint64(before - after)) + if after == 0 { + db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Add(1) + } } func isVlogGenerationPlannerCanceled(err error) bool { @@ -13785,6 +13806,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog } } else if len(plan.SourceFileIDs) > 
0 { db.clearVlogGenerationRewriteAgeBlockedUntil() + beforePenaltyFilter := len(plan.SourceFileIDs) plan, err = db.filterVlogGenerationRewritePlanPenalties(plan, now) if err != nil { db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) @@ -13793,6 +13815,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog } return } + db.observeVlogGenerationRewritePlanPenaltyFilter(beforePenaltyFilter, len(plan.SourceFileIDs)) updatePlanTimestamp = true if len(plan.SourceFileIDs) > 0 { if stagePending { @@ -13940,6 +13963,7 @@ planned: } if len(plan.SourceFileIDs) > 0 { db.clearVlogGenerationRewriteAgeBlockedUntil() + beforePenaltyFilter := len(plan.SourceFileIDs) plan, err = db.filterVlogGenerationRewritePlanPenalties(plan, now) if err != nil { db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) @@ -13949,6 +13973,7 @@ planned: } return } + db.observeVlogGenerationRewritePlanPenaltyFilter(beforePenaltyFilter, len(plan.SourceFileIDs)) } if len(plan.SourceFileIDs) == 0 { if shouldDeferVlogGenerationRewritePlanForAge(plan, vlogGenerationRewriteMinSegmentAge) { @@ -20162,11 +20187,16 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.plan_canceled_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanCanceledLastNS.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_errors"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanErrors.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_empty"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanEmpty.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanEmptyAgeBlocked.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_empty.no_selection"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanEmptyNoSelection.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelected.Load()) 
stats["treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_live"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedLiveBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanSelectedStaleBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.segments"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterSegments.Load()) + stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledLastNS.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 4bb6a8912..0d203c381 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -320,6 +320,41 @@ func TestObserveVlogGenerationRewritePlanOutcome_SelectedTracksSegmentFallbackBy } } +func TestObserveVlogGenerationRewritePlanOutcome_EmptyReasonBuckets(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewritePlanOutcome(backenddb.ValueLogRewritePlan{ + 
AgeBlockedSegments: 2, + AgeBlockedMinRemainingAge: 3 * time.Second, + }, nil) + db.observeVlogGenerationRewritePlanOutcome(backenddb.ValueLogRewritePlan{}, nil) + + if got, want := db.vlogGenerationRewritePlanEmpty.Load(), uint64(2); got != want { + t.Fatalf("plan empty=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanEmptyAgeBlocked.Load(), uint64(1); got != want { + t.Fatalf("plan empty age-blocked=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanEmptyNoSelection.Load(), uint64(1); got != want { + t.Fatalf("plan empty no-selection=%d want=%d", got, want) + } +} + +func TestObserveVlogGenerationRewritePlanPenaltyFilterCounters(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewritePlanPenaltyFilter(5, 2) + db.observeVlogGenerationRewritePlanPenaltyFilter(2, 0) + + if got, want := db.vlogGenerationRewritePlanPenaltyFilterRuns.Load(), uint64(2); got != want { + t.Fatalf("penalty filter runs=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanPenaltyFilterSegments.Load(), uint64(5); got != want { + t.Fatalf("penalty filter segments=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Load(), uint64(1); got != want { + t.Fatalf("penalty filter to-empty=%d want=%d", got, want) + } +} + func TestMaybeRunVlogGenerationMaintenanceWithOptions_TracksWalOnPeriodicSkip(t *testing.T) { db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{}) @@ -4559,6 +4594,12 @@ func TestVlogGenerationRewritePlan_TracksEmptyPlanOutcome(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite.plan_empty"]; got != "1" { t.Fatalf("plan empty=%q want 1", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked"]; got != "0" { + t.Fatalf("plan empty age-blocked=%q want 0", got) + } + if got := 
stats["treedb.cache.vlog_generation.rewrite.plan_empty.no_selection"]; got != "1" { + t.Fatalf("plan empty no-selection=%q want 1", got) + } if got := stats["treedb.cache.vlog_generation.rewrite.plan_selected"]; got != "0" { t.Fatalf("plan selected=%q want 0", got) } @@ -4568,6 +4609,15 @@ func TestVlogGenerationRewritePlan_TracksEmptyPlanOutcome(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite.plan_errors"]; got != "0" { t.Fatalf("plan errors=%q want 0", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.runs"]; got != "0" { + t.Fatalf("plan penalty-filter runs=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.segments"]; got != "0" { + t.Fatalf("plan penalty-filter segments=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"]; got != "0" { + t.Fatalf("plan penalty-filter to-empty=%q want 0", got) + } } func TestVlogGenerationRewritePlan_RunsOutsideMaintenanceBarrier(t *testing.T) { From 046622297ba4ddc758083065fcaa82598dcfba51 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 00:08:36 -1000 Subject: [PATCH 18/61] treedb: add env overrides for rewrite budget and triggers --- TreeDB/env_vlog_overrides_test.go | 27 +++++++++++++++ TreeDB/public.go | 55 ++++++++++++++++++++++--------- 2 files changed, 66 insertions(+), 16 deletions(-) diff --git a/TreeDB/env_vlog_overrides_test.go b/TreeDB/env_vlog_overrides_test.go index 27e9deeee..eeebdc1a5 100644 --- a/TreeDB/env_vlog_overrides_test.go +++ b/TreeDB/env_vlog_overrides_test.go @@ -166,3 +166,30 @@ func TestApplyEnvMaintenanceOverrides_VlogRetainedCaps(t *testing.T) { t.Fatalf("expected max retained bytes hard=654321, got %d", got) } } + +func TestApplyEnvMaintenanceOverrides_VlogRewriteControls(t *testing.T) { + opts := Options{} + t.Setenv(envVlogRewriteBudgetBytesPerSec, "123456789") + t.Setenv(envVlogRewriteBudgetRecordsPerSec, "4321") + 
t.Setenv(envVlogRewriteTriggerTotalBytes, "987654321") + t.Setenv(envVlogRewriteTriggerStaleRatioPPM, "345678") + t.Setenv(envVlogRewriteTriggerChurnPerSec, "13579") + applyEnvMaintenanceOverrides(&opts) + + gen := opts.ValueLog.Generational + if got := gen.RewriteBudgetBytesPerSec; got != 123456789 { + t.Fatalf("expected rewrite budget bytes/sec=123456789, got %d", got) + } + if got := gen.RewriteBudgetRecordsPerSec; got != 4321 { + t.Fatalf("expected rewrite budget records/sec=4321, got %d", got) + } + if got := gen.RewriteTriggerTotalBytes; got != 987654321 { + t.Fatalf("expected rewrite trigger total bytes=987654321, got %d", got) + } + if got := gen.RewriteTriggerStaleRatioPPM; got != 345678 { + t.Fatalf("expected rewrite trigger stale ratio ppm=345678, got %d", got) + } + if got := gen.RewriteTriggerChurnPerSec; got != 13579 { + t.Fatalf("expected rewrite trigger churn/sec=13579, got %d", got) + } +} diff --git a/TreeDB/public.go b/TreeDB/public.go index 66a853e7a..2af25ada1 100644 --- a/TreeDB/public.go +++ b/TreeDB/public.go @@ -696,22 +696,27 @@ const ( // - Dict training enabled (TrainBytes > 0), and // - Side stores enabled (dictdb), and // - Split value log enabled (value pointers used). 
- envVlogDictEnable = "TREEDB_VLOG_DICT_ENABLE" // bool - envVlogDictTrainBytes = "TREEDB_VLOG_DICT_TRAIN_BYTES" // int - envVlogDictBytes = "TREEDB_VLOG_DICT_BYTES" // int - envVlogDictMinRecords = "TREEDB_VLOG_DICT_MIN_RECORDS" // int - envVlogDictMaxRecordBytes = "TREEDB_VLOG_DICT_MAX_RECORD_BYTES" // int - envVlogDictSampleStride = "TREEDB_VLOG_DICT_SAMPLE_STRIDE" // int - envVlogDictDedupWindow = "TREEDB_VLOG_DICT_DEDUP_WINDOW" // int - envVlogDictTrainLevel = "TREEDB_VLOG_DICT_TRAIN_LEVEL" // int - envVlogDictMaxK = "TREEDB_VLOG_DICT_MAX_K" // int - envVlogDictClassMode = "TREEDB_VLOG_DICT_CLASS_MODE" // single|split_outer_leaf - envVlogDictZstdLevel = "TREEDB_VLOG_DICT_ZSTD_LEVEL" // fastest|default|better|best|int - envVlogDictEntropy = "TREEDB_VLOG_DICT_ENTROPY" // bool - envVlogDictAdaptiveRatio = "TREEDB_VLOG_DICT_ADAPTIVE_RATIO" // float64 - envVlogDictMinPayloadSavings = "TREEDB_VLOG_DICT_MIN_PAYLOAD_SAVINGS_RATIO" // float64 - envVlogMaxRetainedBytes = "TREEDB_VLOG_MAX_RETAINED_BYTES" // int64 - envVlogMaxRetainedBytesHard = "TREEDB_VLOG_MAX_RETAINED_BYTES_HARD" // int64 + envVlogDictEnable = "TREEDB_VLOG_DICT_ENABLE" // bool + envVlogDictTrainBytes = "TREEDB_VLOG_DICT_TRAIN_BYTES" // int + envVlogDictBytes = "TREEDB_VLOG_DICT_BYTES" // int + envVlogDictMinRecords = "TREEDB_VLOG_DICT_MIN_RECORDS" // int + envVlogDictMaxRecordBytes = "TREEDB_VLOG_DICT_MAX_RECORD_BYTES" // int + envVlogDictSampleStride = "TREEDB_VLOG_DICT_SAMPLE_STRIDE" // int + envVlogDictDedupWindow = "TREEDB_VLOG_DICT_DEDUP_WINDOW" // int + envVlogDictTrainLevel = "TREEDB_VLOG_DICT_TRAIN_LEVEL" // int + envVlogDictMaxK = "TREEDB_VLOG_DICT_MAX_K" // int + envVlogDictClassMode = "TREEDB_VLOG_DICT_CLASS_MODE" // single|split_outer_leaf + envVlogDictZstdLevel = "TREEDB_VLOG_DICT_ZSTD_LEVEL" // fastest|default|better|best|int + envVlogDictEntropy = "TREEDB_VLOG_DICT_ENTROPY" // bool + envVlogDictAdaptiveRatio = "TREEDB_VLOG_DICT_ADAPTIVE_RATIO" // float64 + envVlogDictMinPayloadSavings = 
"TREEDB_VLOG_DICT_MIN_PAYLOAD_SAVINGS_RATIO" // float64 + envVlogMaxRetainedBytes = "TREEDB_VLOG_MAX_RETAINED_BYTES" // int64 + envVlogMaxRetainedBytesHard = "TREEDB_VLOG_MAX_RETAINED_BYTES_HARD" // int64 + envVlogRewriteBudgetBytesPerSec = "TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC" // int64 + envVlogRewriteBudgetRecordsPerSec = "TREEDB_VLOG_REWRITE_BUDGET_RECORDS_PER_SEC" // int + envVlogRewriteTriggerTotalBytes = "TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES" // int64 + envVlogRewriteTriggerStaleRatioPPM = "TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM" // uint32 + envVlogRewriteTriggerChurnPerSec = "TREEDB_VLOG_REWRITE_TRIGGER_CHURN_PER_SEC" // int64 ) func applyEnvMaintenanceOverrides(opts *Options) { @@ -836,6 +841,24 @@ func applyEnvMaintenanceOverrides(opts *Options) { if v, ok := envInt(envVlogMaxRetainedBytesHard); ok { opts.ValueLog.MaxRetainedBytesHard = int64(v) } + if v, ok := envInt(envVlogRewriteBudgetBytesPerSec); ok { + opts.ValueLog.Generational.RewriteBudgetBytesPerSec = int64(v) + } + if v, ok := envInt(envVlogRewriteBudgetRecordsPerSec); ok { + opts.ValueLog.Generational.RewriteBudgetRecordsPerSec = v + } + if v, ok := envInt(envVlogRewriteTriggerTotalBytes); ok { + opts.ValueLog.Generational.RewriteTriggerTotalBytes = int64(v) + } + if v, ok := envInt(envVlogRewriteTriggerStaleRatioPPM); ok { + if v < 0 { + v = 0 + } + opts.ValueLog.Generational.RewriteTriggerStaleRatioPPM = uint32(v) + } + if v, ok := envInt(envVlogRewriteTriggerChurnPerSec); ok { + opts.ValueLog.Generational.RewriteTriggerChurnPerSec = int64(v) + } } func computeDurabilityMode(opts Options) string { From 6ee065be5b35c949211ac290616e2d13b2d691bb Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 00:26:11 -1000 Subject: [PATCH 19/61] treedb: split rewrite cancel metrics by fresh vs queued debt --- TreeDB/caching/db.go | 37 +++++++++++++++- .../caching/vlog_generation_scheduler_test.go | 44 +++++++++++++++++++ 2 files changed, 79 insertions(+), 2 deletions(-) diff --git 
a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 77eb3f2c3..ca27de39b 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5442,7 +5442,13 @@ type DB struct { vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 vlogGenerationRewriteCanceledRuns atomic.Uint64 + vlogGenerationRewriteCanceledFreshPlanRuns atomic.Uint64 + vlogGenerationRewriteCanceledQueuedDebtRuns atomic.Uint64 vlogGenerationRewriteCanceledLastNS atomic.Int64 + vlogGenerationRewriteDeadlineRuns atomic.Uint64 + vlogGenerationRewriteDeadlineFreshPlanRuns atomic.Uint64 + vlogGenerationRewriteDeadlineQueuedDebtRuns atomic.Uint64 + vlogGenerationRewriteDeadlineLastNS atomic.Int64 vlogGenerationRewriteQueuePruneRuns atomic.Uint64 vlogGenerationRewriteQueuePruneIDs atomic.Uint64 vlogGenerationGCSegmentsDeleted atomic.Uint64 @@ -12918,14 +12924,32 @@ func (db *DB) vlogGenerationRewritePlanBackoffActive(now time.Time) bool { return now.Sub(time.Unix(0, lastCanceled)) < vlogGenerationRewritePlanCancelBackoff } -func (db *DB) observeVlogGenerationRewriteCanceled() { +func (db *DB) observeVlogGenerationRewriteCanceled(queuedDebt bool) { if db == nil { return } db.vlogGenerationRewriteCanceledRuns.Add(1) + if queuedDebt { + db.vlogGenerationRewriteCanceledQueuedDebtRuns.Add(1) + } else { + db.vlogGenerationRewriteCanceledFreshPlanRuns.Add(1) + } db.vlogGenerationRewriteCanceledLastNS.Store(time.Now().UnixNano()) } +func (db *DB) observeVlogGenerationRewriteDeadline(queuedDebt bool) { + if db == nil { + return + } + db.vlogGenerationRewriteDeadlineRuns.Add(1) + if queuedDebt { + db.vlogGenerationRewriteDeadlineQueuedDebtRuns.Add(1) + } else { + db.vlogGenerationRewriteDeadlineFreshPlanRuns.Add(1) + } + db.vlogGenerationRewriteDeadlineLastNS.Store(time.Now().UnixNano()) +} + func (db *DB) observeVlogGenerationRewriteQueuePrune(dropped int) { if db == nil || dropped <= 0 { return @@ -14227,14 +14251,17 @@ planned: 
db.observeVlogGenerationRewriteExecDuration(rewriteDur) if err != nil { db.debugVlogMaintf("rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), err, float64(rewriteDur.Microseconds())/1000) + queuedDebt := hadRewriteQueue && len(processedRewriteIDs) > 0 if errors.Is(err, context.Canceled) { - db.observeVlogGenerationRewriteCanceled() + db.observeVlogGenerationRewriteCanceled(queuedDebt) if len(processedRewriteIDs) > 0 { // A canceled rewrite that already selected a queued chunk should // immediately queue a checkpoint-kick retry. The retry executes // as resumable debt with bounded non-cancel semantics. db.vlogGenerationCheckpointKickPending.Store(true) } + } else if errors.Is(err, context.DeadlineExceeded) { + db.observeVlogGenerationRewriteDeadline(queuedDebt) } return fmt.Errorf("generational rewrite: %w", err) } @@ -20199,7 +20226,13 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledFreshPlanRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.canceled_runs.queued_debt"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledQueuedDebtRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledLastNS.Load()) + stats["treedb.cache.vlog_generation.rewrite.deadline_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteDeadlineRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.deadline_runs.fresh_plan"] = 
fmt.Sprintf("%d", db.vlogGenerationRewriteDeadlineFreshPlanRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.deadline_runs.queued_debt"] = fmt.Sprintf("%d", db.vlogGenerationRewriteDeadlineQueuedDebtRuns.Load()) + stats["treedb.cache.vlog_generation.rewrite.deadline_last_unix_nano"] = fmt.Sprintf("%d", db.vlogGenerationRewriteDeadlineLastNS.Load()) stats["treedb.cache.vlog_generation.rewrite.queue_prune_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteQueuePruneRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.queue_prune_ids"] = fmt.Sprintf("%d", db.vlogGenerationRewriteQueuePruneIDs.Load()) stats["treedb.cache.vlog_generation.rewrite.ineffective_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteIneffectiveRuns.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 0d203c381..530f5df96 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -355,6 +355,38 @@ func TestObserveVlogGenerationRewritePlanPenaltyFilterCounters(t *testing.T) { } } +func TestObserveVlogGenerationRewriteCanceledCountersByQueueState(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewriteCanceled(false) + db.observeVlogGenerationRewriteCanceled(true) + + if got, want := db.vlogGenerationRewriteCanceledRuns.Load(), uint64(2); got != want { + t.Fatalf("rewrite canceled total=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewriteCanceledFreshPlanRuns.Load(), uint64(1); got != want { + t.Fatalf("rewrite canceled fresh=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewriteCanceledQueuedDebtRuns.Load(), uint64(1); got != want { + t.Fatalf("rewrite canceled queued=%d want=%d", got, want) + } +} + +func TestObserveVlogGenerationRewriteDeadlineCountersByQueueState(t *testing.T) { + db := &DB{} + db.observeVlogGenerationRewriteDeadline(false) + db.observeVlogGenerationRewriteDeadline(true) + + if got, want := 
db.vlogGenerationRewriteDeadlineRuns.Load(), uint64(2); got != want { + t.Fatalf("rewrite deadline total=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewriteDeadlineFreshPlanRuns.Load(), uint64(1); got != want { + t.Fatalf("rewrite deadline fresh=%d want=%d", got, want) + } + if got, want := db.vlogGenerationRewriteDeadlineQueuedDebtRuns.Load(), uint64(1); got != want { + t.Fatalf("rewrite deadline queued=%d want=%d", got, want) + } +} + func TestMaybeRunVlogGenerationMaintenanceWithOptions_TracksWalOnPeriodicSkip(t *testing.T) { db := &DB{valueLogGenerationPolicy: uint8(backenddb.ValueLogGenerationHotWarmCold)} db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{}) @@ -978,6 +1010,12 @@ func TestVlogGenerationRewrite_QueuedExecIgnoresForegroundCancelUntilBoundedComp if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs"]; got != "0" { t.Fatalf("rewrite canceled runs=%q want 0 for bounded queued rewrite", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"]; got != "0" { + t.Fatalf("rewrite canceled fresh runs=%q want 0 for bounded queued rewrite", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.queued_debt"]; got != "0" { + t.Fatalf("rewrite canceled queued runs=%q want 0 for bounded queued rewrite", got) + } } func TestVlogGenerationRewrite_CanceledFreshPlanQueuesPendingResume(t *testing.T) { @@ -1086,6 +1124,12 @@ func TestVlogGenerationRewrite_CanceledFreshPlanQueuesPendingResume(t *testing.T if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs"]; got != "1" { t.Fatalf("rewrite canceled runs=%q want 1", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"]; got != "1" { + t.Fatalf("rewrite canceled fresh runs=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.queued_debt"]; got != "0" { + t.Fatalf("rewrite canceled queued runs=%q want 0", 
got) + } } func TestVlogGenerationMaintenance_QueuesPendingCheckpointKickOnActiveCollision(t *testing.T) { From fa183f0caa433a87d71148bdf4605e712f1fdbbc Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 00:50:13 -1000 Subject: [PATCH 20/61] treedb: bound fresh-plan rewrite exec to avoid foreground preemption --- TreeDB/caching/db.go | 2 +- .../caching/vlog_generation_scheduler_test.go | 48 ++++++++----------- 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index ca27de39b..882c1f00c 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -14228,7 +14228,7 @@ planned: } var ctx context.Context var cancel context.CancelFunc - if hadRewriteQueue && len(processedRewriteIDs) > 0 { + if len(processedRewriteIDs) > 0 { ctx, cancel = context.WithTimeout(context.Background(), vlogGenerationRewriteBoundedExecTimeout) } else { ctx, cancel = db.foregroundMaintenanceContext(2 * time.Minute) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 530f5df96..ce83ba165 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -1018,7 +1018,7 @@ func TestVlogGenerationRewrite_QueuedExecIgnoresForegroundCancelUntilBoundedComp } } -func TestVlogGenerationRewrite_CanceledFreshPlanQueuesPendingResume(t *testing.T) { +func TestVlogGenerationRewrite_FreshPlanExecIgnoresForegroundCancelUntilBoundedComplete(t *testing.T) { prepareDirectSchedulerTest(t) dir := t.TempDir() @@ -1090,42 +1090,34 @@ func TestVlogGenerationRewrite_CanceledFreshPlanQueuesPendingResume(t *testing.T select { case <-done: - case <-time.After(2 * wait): - t.Fatalf("initial rewrite did not cancel under foreground activity") + t.Fatalf("rewrite completed early under foreground activity; expected bounded fresh-plan rewrite to continue until release (ctx_ttl=%s)", blocking.recordedRewriteTTL()) + case <-time.After(250 * 
time.Millisecond): } - deadline := time.Now().Add(2 * wait) - for blocking.recordedRewriteCalls() < 2 { - if time.Now().After(deadline) { - t.Fatalf("pending checkpoint-kick resume did not run (calls=%d)", blocking.recordedRewriteCalls()) - } - time.Sleep(10 * time.Millisecond) + releaseRewrite() + select { + case <-done: + case <-time.After(2 * wait): + t.Fatalf("rewrite did not finish after release") } if ttl := blocking.recordedRewriteTTL(); ttl < 20*time.Second { - t.Fatalf("resume rewrite context ttl=%s want around %s", ttl, vlogGenerationRewriteBoundedExecTimeout) + t.Fatalf("fresh-plan rewrite context ttl=%s want around %s", ttl, vlogGenerationRewriteBoundedExecTimeout) } - releaseRewrite() - deadline = time.Now().Add(2 * wait) - for { - queue, qerr := db.currentVlogGenerationRewriteQueue() - if qerr != nil { - t.Fatalf("load rewrite queue: %v", qerr) - } - if len(queue) == 0 { - break - } - if time.Now().After(deadline) { - t.Fatalf("rewrite queue not drained after resume release: queue=%v calls=%d", queue, blocking.recordedRewriteCalls()) - } - time.Sleep(10 * time.Millisecond) + queue, qerr := db.currentVlogGenerationRewriteQueue() + if qerr != nil { + t.Fatalf("load rewrite queue: %v", qerr) } + if len(queue) != 0 { + t.Fatalf("rewrite queue not drained after release: queue=%v calls=%d", queue, blocking.recordedRewriteCalls()) + } + stats := db.Stats() - if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs"]; got != "1" { - t.Fatalf("rewrite canceled runs=%q want 1", got) + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs"]; got != "0" { + t.Fatalf("rewrite canceled runs=%q want 0", got) } - if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"]; got != "1" { - t.Fatalf("rewrite canceled fresh runs=%q want 1", got) + if got := stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"]; got != "0" { + t.Fatalf("rewrite canceled fresh runs=%q want 0", got) } if got := 
stats["treedb.cache.vlog_generation.rewrite.canceled_runs.queued_debt"]; got != "0" { t.Fatalf("rewrite canceled queued runs=%q want 0", got) From d7dca39a6b4d857089a6e268ce8d915e05ed98c3 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 01:23:11 -1000 Subject: [PATCH 21/61] caching: add rewrite source-segment outcome observability --- TreeDB/caching/db.go | 64 +++++++++++++++---- .../caching/vlog_generation_scheduler_test.go | 24 +++++++ TreeDB/db/vlog_rewrite.go | 20 ++++++ TreeDB/db/vlog_rewrite_test.go | 9 +++ TreeDB/vlog_rewrite.go | 13 ++-- 5 files changed, 111 insertions(+), 19 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 882c1f00c..1fb10acd8 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5530,20 +5530,26 @@ type DB struct { // Rewrite budget token bucket (bytes) for online maintenance. This lets us // interpret ValueLogRewriteBudgetBytesPerSec as a true per-second bandwidth // budget while still running maintenance at coarse intervals. 
- vlogGenerationRewriteBudgetLastUnixNano atomic.Int64 - vlogGenerationRewriteBudgetTokensBytes atomic.Int64 - vlogGenerationRewriteBudgetConsumed atomic.Uint64 - vlogGenerationRewritePlanTotalNanos atomic.Uint64 - vlogGenerationRewritePlanMaxNanos atomic.Uint64 - vlogGenerationRewriteExecTotalNanos atomic.Uint64 - vlogGenerationRewriteExecMaxNanos atomic.Uint64 - vlogGenerationRewriteExecSourceSegments atomic.Uint64 - vlogGenerationGCExecTotalNanos atomic.Uint64 - vlogGenerationGCExecMaxNanos atomic.Uint64 - vlogGenerationVacuumExecTotalNanos atomic.Uint64 - vlogGenerationVacuumExecMaxNanos atomic.Uint64 - bgErrMu sync.Mutex - bgErr error + vlogGenerationRewriteBudgetLastUnixNano atomic.Int64 + vlogGenerationRewriteBudgetTokensBytes atomic.Int64 + vlogGenerationRewriteBudgetConsumed atomic.Uint64 + vlogGenerationRewritePlanTotalNanos atomic.Uint64 + vlogGenerationRewritePlanMaxNanos atomic.Uint64 + vlogGenerationRewriteExecTotalNanos atomic.Uint64 + vlogGenerationRewriteExecMaxNanos atomic.Uint64 + vlogGenerationRewriteExecSourceSegments atomic.Uint64 + vlogGenerationRewriteSourceSegmentsRequestedTotal atomic.Uint64 + vlogGenerationRewriteSourceSegmentsStillReferencedTotal atomic.Uint64 + vlogGenerationRewriteSourceSegmentsUnreferencedTotal atomic.Uint64 + vlogGenerationRewriteSourceSegmentsRequestedLast atomic.Uint64 + vlogGenerationRewriteSourceSegmentsStillReferencedLast atomic.Uint64 + vlogGenerationRewriteSourceSegmentsUnreferencedLast atomic.Uint64 + vlogGenerationGCExecTotalNanos atomic.Uint64 + vlogGenerationGCExecMaxNanos atomic.Uint64 + vlogGenerationVacuumExecTotalNanos atomic.Uint64 + vlogGenerationVacuumExecMaxNanos atomic.Uint64 + bgErrMu sync.Mutex + bgErr error // Backpressure state queueBacklogBytes atomic.Int64 @@ -14408,6 +14414,30 @@ planned: if sourceSegments := len(rewriteOpts.SourceFileIDs); sourceSegments > 0 { db.vlogGenerationRewriteExecSourceSegments.Add(uint64(sourceSegments)) } + sourceSegmentsRequested := uint64(0) + if 
stats.SourceSegmentsRequested > 0 { + sourceSegmentsRequested = uint64(stats.SourceSegmentsRequested) + } + sourceSegmentsStillReferenced := uint64(0) + if stats.SourceSegmentsStillReferenced > 0 { + sourceSegmentsStillReferenced = uint64(stats.SourceSegmentsStillReferenced) + } + sourceSegmentsUnreferenced := uint64(0) + if stats.SourceSegmentsUnreferenced > 0 { + sourceSegmentsUnreferenced = uint64(stats.SourceSegmentsUnreferenced) + } + db.vlogGenerationRewriteSourceSegmentsRequestedLast.Store(sourceSegmentsRequested) + db.vlogGenerationRewriteSourceSegmentsStillReferencedLast.Store(sourceSegmentsStillReferenced) + db.vlogGenerationRewriteSourceSegmentsUnreferencedLast.Store(sourceSegmentsUnreferenced) + if sourceSegmentsRequested > 0 { + db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Add(sourceSegmentsRequested) + } + if sourceSegmentsStillReferenced > 0 { + db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Add(sourceSegmentsStillReferenced) + } + if sourceSegmentsUnreferenced > 0 { + db.vlogGenerationRewriteSourceSegmentsUnreferencedTotal.Add(sourceSegmentsUnreferenced) + } rewriteBytesIn := int64(0) if processedLedgerOK { rewriteBytesIn = processedLedgerLiveBytes @@ -20225,6 +20255,12 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.segments"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"] = 
fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsUnreferencedTotal.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_last"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsRequestedLast.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_last"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsStillReferencedLast.Load()) + stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_last"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsUnreferencedLast.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs.fresh_plan"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledFreshPlanRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.canceled_runs.queued_debt"] = fmt.Sprintf("%d", db.vlogGenerationRewriteCanceledQueuedDebtRuns.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index ce83ba165..264fb8b92 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -5823,6 +5823,12 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationMaintenanceSkipStageDue.Store(2) db.vlogGenerationRewritePlanSelectedSegments.Store(6) db.vlogGenerationRewriteExecSourceSegments.Store(3) + db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Store(5) + db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Store(2) + db.vlogGenerationRewriteSourceSegmentsUnreferencedTotal.Store(3) + db.vlogGenerationRewriteSourceSegmentsRequestedLast.Store(2) + 
db.vlogGenerationRewriteSourceSegmentsStillReferencedLast.Store(1) + db.vlogGenerationRewriteSourceSegmentsUnreferencedLast.Store(1) db.vlogGenerationRewriteProcessedLiveBytes.Store(900) db.vlogGenerationRewriteProcessedStaleBytes.Store(450) db.vlogGenerationRewriteNoReclaimRuns.Store(3) @@ -5987,6 +5993,24 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"]; got != "3" { t.Fatalf("rewrite exec source segments total=%q want 3", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"]; got != "5" { + t.Fatalf("rewrite exec source segments requested total=%q want 5", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"]; got != "2" { + t.Fatalf("rewrite exec source segments still referenced total=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_total"]; got != "3" { + t.Fatalf("rewrite exec source segments unreferenced total=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_last"]; got != "2" { + t.Fatalf("rewrite exec source segments requested last=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_last"]; got != "1" { + t.Fatalf("rewrite exec source segments still referenced last=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_last"]; got != "1" { + t.Fatalf("rewrite exec source segments unreferenced last=%q want 1", got) + } if got := stats["treedb.cache.vlog_generation.rewrite.processed_live_bytes"]; got != "900" { t.Fatalf("rewrite processed live bytes=%q want 900", got) } diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go index efb73b610..ab5b5eb87 100644 --- a/TreeDB/db/vlog_rewrite.go +++ 
b/TreeDB/db/vlog_rewrite.go @@ -49,6 +49,15 @@ type ValueLogRewriteStats struct { BytesBefore int64 BytesAfter int64 RecordsCopied int + // SourceSegmentsRequested is the number of source segments selected for this + // rewrite run after applying selection filters. + SourceSegmentsRequested int + // SourceSegmentsStillReferenced is the subset of selected source segments + // that remained referenced after rewrite pointer swaps and cleanup. + SourceSegmentsStillReferenced int + // SourceSegmentsUnreferenced is the subset of selected source segments that + // became unreferenced after rewrite pointer swaps and cleanup. + SourceSegmentsUnreferenced int } // ValueLogRewritePlan summarizes which segments a sparse online rewrite would @@ -1180,6 +1189,7 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl } sourceIDs, _ = selectRewriteSourceSegmentsWithStats(opts, set.Files, active, liveByID) restrictSource = true + stats.SourceSegmentsRequested = len(sourceIDs) } _ = db.valueLogManager.Release(set) if restrictSource && len(sourceIDs) == 0 { @@ -1351,6 +1361,16 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl if err != nil { return stats, err } + if len(sourceIDs) > 0 { + stillReferenced := 0 + for id := range sourceIDs { + if _, ok := referencedAfter[id]; ok { + stillReferenced++ + } + } + stats.SourceSegmentsStillReferenced = stillReferenced + stats.SourceSegmentsUnreferenced = len(sourceIDs) - stillReferenced + } var protectedPaths map[string]struct{} allowActiveSkip := len(opts.ProtectedPaths) > 0 if allowActiveSkip { diff --git a/TreeDB/db/vlog_rewrite_test.go b/TreeDB/db/vlog_rewrite_test.go index 8528853ea..91127e148 100644 --- a/TreeDB/db/vlog_rewrite_test.go +++ b/TreeDB/db/vlog_rewrite_test.go @@ -2867,6 +2867,15 @@ func TestValueLogRewriteOnline_SourceFileIDsWithStaleFilterMatchesPlanSelection( if stats.RecordsCopied != 1 { t.Fatalf("expected one rewritten record from selected explicit source, 
got %d", stats.RecordsCopied) } + if stats.SourceSegmentsRequested != 1 { + t.Fatalf("source segments requested=%d want 1", stats.SourceSegmentsRequested) + } + if stats.SourceSegmentsStillReferenced != 0 { + t.Fatalf("source segments still referenced=%d want 0", stats.SourceSegmentsStillReferenced) + } + if stats.SourceSegmentsUnreferenced != 1 { + t.Fatalf("source segments unreferenced=%d want 1", stats.SourceSegmentsUnreferenced) + } ptrK1, flagsK1 := readProjectedPointerByKey(t, db, []byte("k1")) ptrK2, flagsK2 := readProjectedPointerByKey(t, db, []byte("k2")) diff --git a/TreeDB/vlog_rewrite.go b/TreeDB/vlog_rewrite.go index fd7879b87..5e60b37da 100644 --- a/TreeDB/vlog_rewrite.go +++ b/TreeDB/vlog_rewrite.go @@ -8,11 +8,14 @@ import ( // ValueLogRewriteStats summarizes value-log rewrite compaction results. type ValueLogRewriteStats struct { - SegmentsBefore int - SegmentsAfter int - BytesBefore int64 - BytesAfter int64 - RecordsCopied int + SegmentsBefore int + SegmentsAfter int + BytesBefore int64 + BytesAfter int64 + RecordsCopied int + SourceSegmentsRequested int + SourceSegmentsStillReferenced int + SourceSegmentsUnreferenced int } // ValueLogRewriteOnlineOptions controls online rewrite batching behavior. 
From e6f54aa17d5e6886bb5f26a40aa8d938d4ff17c8 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 01:50:11 -1000 Subject: [PATCH 22/61] caching: probe rewrite source segments through gc protection buckets --- TreeDB/caching/db.go | 414 ++++++++++-------- .../caching/vlog_generation_scheduler_test.go | 88 ++++ TreeDB/db/vlog_gc.go | 134 ++++-- TreeDB/db/vlog_gc_test.go | 69 +++ 4 files changed, 508 insertions(+), 197 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 1fb10acd8..89a9929c7 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5356,177 +5356,199 @@ type DB struct { valueLogMaxSegmentBytes int64 journalCompression bool - disableJournal bool - relaxedSync bool - notifyError func(error) - debugFlushPointers bool - debugFlushTiming bool - debugPtrEligible atomic.Int64 - debugPtrUsed atomic.Int64 - debugPtrNoPtr atomic.Int64 - debugPtrDenied atomic.Int64 - debugPtrDisabled atomic.Int64 - checkpointRuns atomic.Uint64 - checkpointTotalNs atomic.Uint64 - checkpointMaxNs atomic.Uint64 - checkpointNoopSkips atomic.Uint64 - checkpointFlushMuWaitNs atomic.Uint64 - checkpointFlushMuWaitMaxNs atomic.Uint64 - checkpointAutoVacuumRuns atomic.Uint64 - checkpointAutoVacuumLastCheckRun atomic.Uint64 - checkpointAutoVacuumLastPages atomic.Uint64 - checkpointAutoVacuumLastInternalP50 atomic.Uint64 - checkpointAutoVacuumLastInternalAvg atomic.Uint64 - lastForegroundWriteUnixNano atomic.Int64 - lastForegroundReadUnixNano atomic.Int64 - foregroundReadStampCounter atomic.Uint32 - activeForegroundIterators atomic.Int64 - retainedPruneLastStartUnixNano atomic.Int64 - retainedValueLogPruneLastUnixNano atomic.Int64 - retainedValueLogPruneRuns atomic.Uint64 - retainedValueLogPruneForcedRuns atomic.Uint64 - retainedValueLogPruneForegroundAbortRuns atomic.Uint64 - retainedValueLogPruneRemovedSegments atomic.Uint64 - retainedValueLogPruneRemovedBytes atomic.Uint64 - retainedValueLogPruneInUseSkippedSegments atomic.Uint64 - 
retainedValueLogPruneInUseSkippedBytes atomic.Uint64 - retainedValueLogPruneCandidateSegments atomic.Uint64 - retainedValueLogPruneCandidateBytes atomic.Uint64 - retainedValueLogPruneLiveSkippedSegments atomic.Uint64 - retainedValueLogPruneLiveSkippedBytes atomic.Uint64 - retainedValueLogPruneParseSkippedSegments atomic.Uint64 - retainedValueLogPruneParseSkippedBytes atomic.Uint64 - retainedValueLogPruneZombieMarkedSegments atomic.Uint64 - retainedValueLogPruneZombieMarkedBytes atomic.Uint64 - retainedValueLogPruneScheduleRequests atomic.Uint64 - retainedValueLogPruneScheduleForcedRequests atomic.Uint64 - retainedValueLogPruneScheduleSkipClosing atomic.Uint64 - retainedValueLogPruneScheduleSkipInFlight atomic.Uint64 - retainedValueLogPruneScheduleSkipNoClosedBytes atomic.Uint64 - retainedValueLogPruneScheduleSkipBelowPressure atomic.Uint64 - retainedValueLogPruneScheduleSkipMinInterval atomic.Uint64 - retainedValueLogPruneWriteGateRetries atomic.Uint64 - retainedValueLogPruneWriteGateRetrySuccesses atomic.Uint64 - retainedPruneForceRequested atomic.Bool - retainedPruneMu sync.Mutex - retainedPruneDone chan struct{} - vlogGenerationRemapSuccesses atomic.Uint64 - vlogGenerationRemapFailures atomic.Uint64 - vlogGenerationRewriteBytesIn atomic.Uint64 - vlogGenerationRewriteBytesOut atomic.Uint64 - vlogGenerationRewriteReclaimedBytes atomic.Uint64 - vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 - vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 - vlogGenerationRewriteNoReclaimRuns atomic.Uint64 - vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 - vlogGenerationRewriteRuns atomic.Uint64 - vlogGenerationRewritePlanRuns atomic.Uint64 - vlogGenerationRewritePlanCanceled atomic.Uint64 - vlogGenerationRewritePlanErrors atomic.Uint64 - vlogGenerationRewritePlanEmpty atomic.Uint64 - vlogGenerationRewritePlanEmptyAgeBlocked atomic.Uint64 - vlogGenerationRewritePlanEmptyNoSelection atomic.Uint64 - vlogGenerationRewritePlanSelected atomic.Uint64 - 
vlogGenerationRewritePlanSelectedSegments atomic.Uint64 - vlogGenerationRewritePlanSelectedBytes atomic.Uint64 - vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 - vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 - vlogGenerationRewritePlanPenaltyFilterRuns atomic.Uint64 - vlogGenerationRewritePlanPenaltyFilterSegments atomic.Uint64 - vlogGenerationRewritePlanPenaltyFilterToEmpty atomic.Uint64 - vlogGenerationRewritePlanCanceledLastNS atomic.Int64 - vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 - vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool - vlogGenerationRewriteIneffectiveLastNS atomic.Int64 - vlogGenerationRewriteIneffectiveRuns atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 - vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 - vlogGenerationRewriteCanceledRuns atomic.Uint64 - vlogGenerationRewriteCanceledFreshPlanRuns atomic.Uint64 - vlogGenerationRewriteCanceledQueuedDebtRuns atomic.Uint64 - vlogGenerationRewriteCanceledLastNS atomic.Int64 - vlogGenerationRewriteDeadlineRuns atomic.Uint64 - vlogGenerationRewriteDeadlineFreshPlanRuns atomic.Uint64 - vlogGenerationRewriteDeadlineQueuedDebtRuns atomic.Uint64 - vlogGenerationRewriteDeadlineLastNS atomic.Int64 - vlogGenerationRewriteQueuePruneRuns atomic.Uint64 - vlogGenerationRewriteQueuePruneIDs atomic.Uint64 - vlogGenerationGCSegmentsDeleted atomic.Uint64 - vlogGenerationGCBytesDeleted atomic.Uint64 - vlogGenerationGCRuns atomic.Uint64 - vlogGenerationVacuumRuns atomic.Uint64 - vlogGenerationVacuumFailures atomic.Uint64 - vlogGenerationVacuumSkippedDisabled atomic.Uint64 - vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 - vlogGenerationVacuumSkippedCooldown atomic.Uint64 - vlogGenerationLastVacuumUnixNano atomic.Int64 - vlogGenerationLastRewritePlanUnixNano atomic.Int64 - vlogGenerationLastRewriteUnixNano atomic.Int64 - vlogGenerationLastGCUnixNano atomic.Int64 - vlogGenerationLastCheckpointKickUnixNano atomic.Int64 - 
vlogGenerationLastGCDryRunUnixNano atomic.Int64 - vlogGenerationLastGCDryRunBytesEligible atomic.Int64 - vlogGenerationLastGCDryRunSegsEligible atomic.Int64 - vlogGenerationLastGCBytesReferenced atomic.Int64 - vlogGenerationLastGCSegmentsReferenced atomic.Int64 - vlogGenerationLastGCBytesActive atomic.Int64 - vlogGenerationLastGCSegmentsActive atomic.Int64 - vlogGenerationLastGCBytesProtected atomic.Int64 - vlogGenerationLastGCSegmentsProtected atomic.Int64 - vlogGenerationLastGCBytesProtectedInUse atomic.Int64 - vlogGenerationLastGCSegmentsProtectedInUse atomic.Int64 - vlogGenerationLastGCBytesProtectedRetained atomic.Int64 - vlogGenerationLastGCSegmentsProtectedRetained atomic.Int64 - vlogGenerationLastGCBytesProtectedOverlap atomic.Int64 - vlogGenerationLastGCSegmentsProtectedOverlap atomic.Int64 - vlogGenerationLastGCBytesProtectedOther atomic.Int64 - vlogGenerationLastGCSegmentsProtectedOther atomic.Int64 - vlogGenerationLastGCBytesEligible atomic.Int64 - vlogGenerationLastGCSegmentsEligible atomic.Int64 - vlogGenerationLastGCBytesDeleted atomic.Int64 - vlogGenerationLastGCSegmentsDeleted atomic.Int64 - vlogGenerationLastGCBytesPending atomic.Int64 - vlogGenerationLastGCSegmentsPending atomic.Int64 - vlogGenerationChurnBytes atomic.Uint64 - vlogGenerationSchedulerState atomic.Uint32 - vlogGenerationMaintenanceActive atomic.Bool - vlogGenerationMaintenanceAttempts atomic.Uint64 - vlogGenerationMaintenanceAcquired atomic.Uint64 - vlogGenerationMaintenanceCollisions atomic.Uint64 - vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 - vlogGenerationMaintenanceSkipPhase atomic.Uint64 - vlogGenerationMaintenanceSkipStageGate atomic.Uint64 - vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 - vlogGenerationMaintenanceSkipStageDue atomic.Uint64 - vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 - vlogGenerationMaintenanceSkipPriority atomic.Uint64 - vlogGenerationMaintenanceSkipQuiet atomic.Uint64 - vlogGenerationMaintenanceSkipPreCheckpoint 
atomic.Uint64 - vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 - vlogGenerationMaintenancePassNoop atomic.Uint64 - vlogGenerationMaintenancePassWithRewrite atomic.Uint64 - vlogGenerationMaintenancePassWithGC atomic.Uint64 - vlogGenerationMaintenancePassTotalNanos atomic.Uint64 - vlogGenerationMaintenancePassMaxNanos atomic.Uint64 - vlogGenerationLastReason atomic.Uint32 - vlogGenerationCheckpointKickRuns atomic.Uint64 - vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 - vlogGenerationCheckpointKickGCRuns atomic.Uint64 - vlogGenerationCheckpointKickPending atomic.Bool - vlogGenerationDeferredMaintenancePending atomic.Bool - vlogGenerationDeferredMaintenanceRunning atomic.Bool - vlogGenerationRewriteStageWakeObservedNS atomic.Int64 - vlogGenerationRewriteQueueMu sync.Mutex - vlogGenerationCheckpointKickActive atomic.Bool - vlogGenerationRewriteQueue []uint32 - vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment - vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty - vlogGenerationRewriteStagePending bool - vlogGenerationRewriteStageObservedUnixNano int64 - vlogGenerationRewriteQueueLoaded bool - vlogGenerationLastChurnBps atomic.Int64 - vlogGenerationLastChurnSampleBytes atomic.Uint64 - vlogGenerationLastChurnSampleNS atomic.Int64 + disableJournal bool + relaxedSync bool + notifyError func(error) + debugFlushPointers bool + debugFlushTiming bool + debugPtrEligible atomic.Int64 + debugPtrUsed atomic.Int64 + debugPtrNoPtr atomic.Int64 + debugPtrDenied atomic.Int64 + debugPtrDisabled atomic.Int64 + checkpointRuns atomic.Uint64 + checkpointTotalNs atomic.Uint64 + checkpointMaxNs atomic.Uint64 + checkpointNoopSkips atomic.Uint64 + checkpointFlushMuWaitNs atomic.Uint64 + checkpointFlushMuWaitMaxNs atomic.Uint64 + checkpointAutoVacuumRuns atomic.Uint64 + checkpointAutoVacuumLastCheckRun atomic.Uint64 + checkpointAutoVacuumLastPages atomic.Uint64 + checkpointAutoVacuumLastInternalP50 atomic.Uint64 + 
checkpointAutoVacuumLastInternalAvg atomic.Uint64 + lastForegroundWriteUnixNano atomic.Int64 + lastForegroundReadUnixNano atomic.Int64 + foregroundReadStampCounter atomic.Uint32 + activeForegroundIterators atomic.Int64 + retainedPruneLastStartUnixNano atomic.Int64 + retainedValueLogPruneLastUnixNano atomic.Int64 + retainedValueLogPruneRuns atomic.Uint64 + retainedValueLogPruneForcedRuns atomic.Uint64 + retainedValueLogPruneForegroundAbortRuns atomic.Uint64 + retainedValueLogPruneRemovedSegments atomic.Uint64 + retainedValueLogPruneRemovedBytes atomic.Uint64 + retainedValueLogPruneInUseSkippedSegments atomic.Uint64 + retainedValueLogPruneInUseSkippedBytes atomic.Uint64 + retainedValueLogPruneCandidateSegments atomic.Uint64 + retainedValueLogPruneCandidateBytes atomic.Uint64 + retainedValueLogPruneLiveSkippedSegments atomic.Uint64 + retainedValueLogPruneLiveSkippedBytes atomic.Uint64 + retainedValueLogPruneParseSkippedSegments atomic.Uint64 + retainedValueLogPruneParseSkippedBytes atomic.Uint64 + retainedValueLogPruneZombieMarkedSegments atomic.Uint64 + retainedValueLogPruneZombieMarkedBytes atomic.Uint64 + retainedValueLogPruneScheduleRequests atomic.Uint64 + retainedValueLogPruneScheduleForcedRequests atomic.Uint64 + retainedValueLogPruneScheduleSkipClosing atomic.Uint64 + retainedValueLogPruneScheduleSkipInFlight atomic.Uint64 + retainedValueLogPruneScheduleSkipNoClosedBytes atomic.Uint64 + retainedValueLogPruneScheduleSkipBelowPressure atomic.Uint64 + retainedValueLogPruneScheduleSkipMinInterval atomic.Uint64 + retainedValueLogPruneWriteGateRetries atomic.Uint64 + retainedValueLogPruneWriteGateRetrySuccesses atomic.Uint64 + retainedPruneForceRequested atomic.Bool + retainedPruneMu sync.Mutex + retainedPruneDone chan struct{} + vlogGenerationRemapSuccesses atomic.Uint64 + vlogGenerationRemapFailures atomic.Uint64 + vlogGenerationRewriteBytesIn atomic.Uint64 + vlogGenerationRewriteBytesOut atomic.Uint64 + vlogGenerationRewriteReclaimedBytes atomic.Uint64 + 
vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 + vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 + vlogGenerationRewriteNoReclaimRuns atomic.Uint64 + vlogGenerationRewriteNoReclaimStaleBytes atomic.Uint64 + vlogGenerationRewriteRuns atomic.Uint64 + vlogGenerationRewritePlanRuns atomic.Uint64 + vlogGenerationRewritePlanCanceled atomic.Uint64 + vlogGenerationRewritePlanErrors atomic.Uint64 + vlogGenerationRewritePlanEmpty atomic.Uint64 + vlogGenerationRewritePlanEmptyAgeBlocked atomic.Uint64 + vlogGenerationRewritePlanEmptyNoSelection atomic.Uint64 + vlogGenerationRewritePlanSelected atomic.Uint64 + vlogGenerationRewritePlanSelectedSegments atomic.Uint64 + vlogGenerationRewritePlanSelectedBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedLiveBytes atomic.Uint64 + vlogGenerationRewritePlanSelectedStaleBytes atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterRuns atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterSegments atomic.Uint64 + vlogGenerationRewritePlanPenaltyFilterToEmpty atomic.Uint64 + vlogGenerationRewritePlanCanceledLastNS atomic.Int64 + vlogGenerationRewriteAgeBlockedUntilNS atomic.Int64 + vlogGenerationRewriteAgeBlockedWakeRunning atomic.Bool + vlogGenerationRewriteIneffectiveLastNS atomic.Int64 + vlogGenerationRewriteIneffectiveRuns atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesIn atomic.Uint64 + vlogGenerationRewriteIneffectiveBytesOut atomic.Uint64 + vlogGenerationRewriteCanceledRuns atomic.Uint64 + vlogGenerationRewriteCanceledFreshPlanRuns atomic.Uint64 + vlogGenerationRewriteCanceledQueuedDebtRuns atomic.Uint64 + vlogGenerationRewriteCanceledLastNS atomic.Int64 + vlogGenerationRewriteDeadlineRuns atomic.Uint64 + vlogGenerationRewriteDeadlineFreshPlanRuns atomic.Uint64 + vlogGenerationRewriteDeadlineQueuedDebtRuns atomic.Uint64 + vlogGenerationRewriteDeadlineLastNS atomic.Int64 + vlogGenerationRewriteQueuePruneRuns atomic.Uint64 + vlogGenerationRewriteQueuePruneIDs atomic.Uint64 + vlogGenerationGCSegmentsDeleted 
atomic.Uint64 + vlogGenerationGCBytesDeleted atomic.Uint64 + vlogGenerationGCRuns atomic.Uint64 + vlogGenerationVacuumRuns atomic.Uint64 + vlogGenerationVacuumFailures atomic.Uint64 + vlogGenerationVacuumSkippedDisabled atomic.Uint64 + vlogGenerationVacuumSkippedRewriteBytes atomic.Uint64 + vlogGenerationVacuumSkippedCooldown atomic.Uint64 + vlogGenerationLastVacuumUnixNano atomic.Int64 + vlogGenerationLastRewritePlanUnixNano atomic.Int64 + vlogGenerationLastRewriteUnixNano atomic.Int64 + vlogGenerationLastGCUnixNano atomic.Int64 + vlogGenerationLastCheckpointKickUnixNano atomic.Int64 + vlogGenerationLastGCDryRunUnixNano atomic.Int64 + vlogGenerationLastGCDryRunBytesEligible atomic.Int64 + vlogGenerationLastGCDryRunSegsEligible atomic.Int64 + vlogGenerationLastGCBytesReferenced atomic.Int64 + vlogGenerationLastGCSegmentsReferenced atomic.Int64 + vlogGenerationLastGCBytesActive atomic.Int64 + vlogGenerationLastGCSegmentsActive atomic.Int64 + vlogGenerationLastGCBytesProtected atomic.Int64 + vlogGenerationLastGCSegmentsProtected atomic.Int64 + vlogGenerationLastGCBytesProtectedInUse atomic.Int64 + vlogGenerationLastGCSegmentsProtectedInUse atomic.Int64 + vlogGenerationLastGCBytesProtectedRetained atomic.Int64 + vlogGenerationLastGCSegmentsProtectedRetained atomic.Int64 + vlogGenerationLastGCBytesProtectedOverlap atomic.Int64 + vlogGenerationLastGCSegmentsProtectedOverlap atomic.Int64 + vlogGenerationLastGCBytesProtectedOther atomic.Int64 + vlogGenerationLastGCSegmentsProtectedOther atomic.Int64 + vlogGenerationLastGCBytesEligible atomic.Int64 + vlogGenerationLastGCSegmentsEligible atomic.Int64 + vlogGenerationLastGCBytesDeleted atomic.Int64 + vlogGenerationLastGCSegmentsDeleted atomic.Int64 + vlogGenerationLastGCBytesPending atomic.Int64 + vlogGenerationLastGCSegmentsPending atomic.Int64 + vlogGenerationLastGCObservedSourceSegments atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsReferenced atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsActive 
atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsProtected atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsProtectedInUse atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsProtectedRetained atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsProtectedOverlap atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsProtectedOther atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsEligible atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsDeleted atomic.Int64 + vlogGenerationLastGCObservedSourceSegmentsPending atomic.Int64 + vlogGenerationLastGCObservedSourceBytes atomic.Int64 + vlogGenerationLastGCObservedSourceBytesReferenced atomic.Int64 + vlogGenerationLastGCObservedSourceBytesActive atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtected atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtectedInUse atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtectedRetained atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtectedOverlap atomic.Int64 + vlogGenerationLastGCObservedSourceBytesProtectedOther atomic.Int64 + vlogGenerationLastGCObservedSourceBytesEligible atomic.Int64 + vlogGenerationLastGCObservedSourceBytesDeleted atomic.Int64 + vlogGenerationLastGCObservedSourceBytesPending atomic.Int64 + vlogGenerationChurnBytes atomic.Uint64 + vlogGenerationSchedulerState atomic.Uint32 + vlogGenerationMaintenanceActive atomic.Bool + vlogGenerationMaintenanceAttempts atomic.Uint64 + vlogGenerationMaintenanceAcquired atomic.Uint64 + vlogGenerationMaintenanceCollisions atomic.Uint64 + vlogGenerationMaintenanceSkipWALOnPeriodic atomic.Uint64 + vlogGenerationMaintenanceSkipPhase atomic.Uint64 + vlogGenerationMaintenanceSkipStageGate atomic.Uint64 + vlogGenerationMaintenanceSkipStageNotDue atomic.Uint64 + vlogGenerationMaintenanceSkipStageDue atomic.Uint64 + vlogGenerationMaintenanceSkipAgeBlocked atomic.Uint64 + vlogGenerationMaintenanceSkipPriority atomic.Uint64 + vlogGenerationMaintenanceSkipQuiet 
atomic.Uint64 + vlogGenerationMaintenanceSkipPreCheckpoint atomic.Uint64 + vlogGenerationMaintenanceSkipCheckpointing atomic.Uint64 + vlogGenerationMaintenancePassNoop atomic.Uint64 + vlogGenerationMaintenancePassWithRewrite atomic.Uint64 + vlogGenerationMaintenancePassWithGC atomic.Uint64 + vlogGenerationMaintenancePassTotalNanos atomic.Uint64 + vlogGenerationMaintenancePassMaxNanos atomic.Uint64 + vlogGenerationLastReason atomic.Uint32 + vlogGenerationCheckpointKickRuns atomic.Uint64 + vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 + vlogGenerationCheckpointKickGCRuns atomic.Uint64 + vlogGenerationCheckpointKickPending atomic.Bool + vlogGenerationDeferredMaintenancePending atomic.Bool + vlogGenerationDeferredMaintenanceRunning atomic.Bool + vlogGenerationRewriteStageWakeObservedNS atomic.Int64 + vlogGenerationRewriteQueueMu sync.Mutex + vlogGenerationCheckpointKickActive atomic.Bool + vlogGenerationRewriteQueue []uint32 + vlogGenerationRewriteLedger []backenddb.ValueLogRewritePlanSegment + vlogGenerationRewritePenalties map[uint32]valueLogGenerationRewritePenalty + vlogGenerationRewriteStagePending bool + vlogGenerationRewriteStageObservedUnixNano int64 + vlogGenerationRewriteQueueLoaded bool + vlogGenerationLastChurnBps atomic.Int64 + vlogGenerationLastChurnSampleBytes atomic.Uint64 + vlogGenerationLastChurnSampleNS atomic.Int64 // Rewrite budget token bucket (bytes) for online maintenance. This lets us // interpret ValueLogRewriteBudgetBytesPerSec as a true per-second bandwidth // budget while still running maintenance at coarse intervals. 
@@ -12801,6 +12823,28 @@ func (db *DB) observeVlogGenerationGCStats(stats backenddb.ValueLogGCStats) { db.vlogGenerationLastGCSegmentsDeleted.Store(int64(stats.SegmentsDeleted)) db.vlogGenerationLastGCBytesPending.Store(stats.BytesPending) db.vlogGenerationLastGCSegmentsPending.Store(int64(stats.SegmentsPending)) + db.vlogGenerationLastGCObservedSourceSegments.Store(int64(stats.ObservedSourceSegments)) + db.vlogGenerationLastGCObservedSourceSegmentsReferenced.Store(int64(stats.ObservedSourceSegmentsReferenced)) + db.vlogGenerationLastGCObservedSourceSegmentsActive.Store(int64(stats.ObservedSourceSegmentsActive)) + db.vlogGenerationLastGCObservedSourceSegmentsProtected.Store(int64(stats.ObservedSourceSegmentsProtected)) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedInUse.Store(int64(stats.ObservedSourceSegmentsProtectedInUse)) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedRetained.Store(int64(stats.ObservedSourceSegmentsProtectedRetained)) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedOverlap.Store(int64(stats.ObservedSourceSegmentsProtectedOverlap)) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedOther.Store(int64(stats.ObservedSourceSegmentsProtectedOther)) + db.vlogGenerationLastGCObservedSourceSegmentsEligible.Store(int64(stats.ObservedSourceSegmentsEligible)) + db.vlogGenerationLastGCObservedSourceSegmentsDeleted.Store(int64(stats.ObservedSourceSegmentsDeleted)) + db.vlogGenerationLastGCObservedSourceSegmentsPending.Store(int64(stats.ObservedSourceSegmentsPending)) + db.vlogGenerationLastGCObservedSourceBytes.Store(stats.ObservedSourceBytes) + db.vlogGenerationLastGCObservedSourceBytesReferenced.Store(stats.ObservedSourceBytesReferenced) + db.vlogGenerationLastGCObservedSourceBytesActive.Store(stats.ObservedSourceBytesActive) + db.vlogGenerationLastGCObservedSourceBytesProtected.Store(stats.ObservedSourceBytesProtected) + db.vlogGenerationLastGCObservedSourceBytesProtectedInUse.Store(stats.ObservedSourceBytesProtectedInUse) 
+ db.vlogGenerationLastGCObservedSourceBytesProtectedRetained.Store(stats.ObservedSourceBytesProtectedRetained) + db.vlogGenerationLastGCObservedSourceBytesProtectedOverlap.Store(stats.ObservedSourceBytesProtectedOverlap) + db.vlogGenerationLastGCObservedSourceBytesProtectedOther.Store(stats.ObservedSourceBytesProtectedOther) + db.vlogGenerationLastGCObservedSourceBytesEligible.Store(stats.ObservedSourceBytesEligible) + db.vlogGenerationLastGCObservedSourceBytesDeleted.Store(stats.ObservedSourceBytesDeleted) + db.vlogGenerationLastGCObservedSourceBytesPending.Store(stats.ObservedSourceBytesPending) } func (db *DB) observeVlogGenerationVacuumExecDuration(d time.Duration) { @@ -14292,7 +14336,11 @@ planned: if gcer, ok := db.backend.(backendValueLogGCer); ok { gcCtx, gcCancel := context.WithTimeout(context.Background(), 30*time.Second) gcStart := time.Now() - gcStats, gcErr := gcer.ValueLogGC(gcCtx, db.valueLogGCOptions(false)) + gcOpts := db.valueLogGCOptions(false) + if len(processedRewriteIDs) > 0 { + gcOpts.ObservedSourceFileIDs = append([]uint32(nil), processedRewriteIDs...) 
+ } + gcStats, gcErr := gcer.ValueLogGC(gcCtx, gcOpts) gcCancel() gcDur := time.Since(gcStart) db.observeVlogGenerationGCExecDuration(gcDur) @@ -20315,6 +20363,28 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.gc.last_deleted_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesDeleted.Load()) stats["treedb.cache.vlog_generation.gc.last_pending_segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCSegmentsPending.Load()) stats["treedb.cache.vlog_generation.gc.last_pending_bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCBytesPending.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegments.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_referenced"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsReferenced.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_active"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsActive.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsProtected.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_in_use"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsProtectedInUse.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_retained"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsProtectedRetained.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_overlap"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsProtectedOverlap.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_other"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsProtectedOther.Load()) + 
stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_eligible"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsEligible.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_deleted"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_pending"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceSegmentsPending.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytes.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_referenced"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesReferenced.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_active"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesActive.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesProtected.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_in_use"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesProtectedInUse.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_retained"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesProtectedRetained.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_overlap"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesProtectedOverlap.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_other"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesProtectedOther.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_eligible"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesEligible.Load()) + 
stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_deleted"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesDeleted.Load()) + stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_pending"] = fmt.Sprintf("%d", db.vlogGenerationLastGCObservedSourceBytesPending.Load()) stats["treedb.cache.vlog_generation.gc.runs"] = fmt.Sprintf("%d", db.vlogGenerationGCRuns.Load()) stats["treedb.cache.vlog_generation.gc.exec.total_ms"] = fmt.Sprintf("%.3f", float64(gcExecTotalNS)/float64(time.Millisecond)) stats["treedb.cache.vlog_generation.gc.exec.max_ms"] = fmt.Sprintf("%.3f", float64(gcExecMaxNS)/float64(time.Millisecond)) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 264fb8b92..9f822397c 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -5819,6 +5819,28 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationLastGCBytesDeleted.Store(200) db.vlogGenerationLastGCSegmentsPending.Store(4) db.vlogGenerationLastGCBytesPending.Store(400) + db.vlogGenerationLastGCObservedSourceSegments.Store(2) + db.vlogGenerationLastGCObservedSourceSegmentsReferenced.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsActive.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsProtected.Store(2) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedInUse.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedRetained.Store(2) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedOverlap.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsProtectedOther.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsEligible.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsDeleted.Store(0) + db.vlogGenerationLastGCObservedSourceSegmentsPending.Store(0) + db.vlogGenerationLastGCObservedSourceBytes.Store(250) + 
db.vlogGenerationLastGCObservedSourceBytesReferenced.Store(0) + db.vlogGenerationLastGCObservedSourceBytesActive.Store(0) + db.vlogGenerationLastGCObservedSourceBytesProtected.Store(250) + db.vlogGenerationLastGCObservedSourceBytesProtectedInUse.Store(0) + db.vlogGenerationLastGCObservedSourceBytesProtectedRetained.Store(250) + db.vlogGenerationLastGCObservedSourceBytesProtectedOverlap.Store(0) + db.vlogGenerationLastGCObservedSourceBytesProtectedOther.Store(0) + db.vlogGenerationLastGCObservedSourceBytesEligible.Store(0) + db.vlogGenerationLastGCObservedSourceBytesDeleted.Store(0) + db.vlogGenerationLastGCObservedSourceBytesPending.Store(0) db.vlogGenerationMaintenanceSkipStageNotDue.Store(5) db.vlogGenerationMaintenanceSkipStageDue.Store(2) db.vlogGenerationRewritePlanSelectedSegments.Store(6) @@ -5936,6 +5958,72 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.gc.last_pending_bytes"]; got != "400" { t.Fatalf("gc last pending bytes=%q want 400", got) } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments"]; got != "2" { + t.Fatalf("gc last observed source segments=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_referenced"]; got != "0" { + t.Fatalf("gc last observed source segments referenced=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_active"]; got != "0" { + t.Fatalf("gc last observed source segments active=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected"]; got != "2" { + t.Fatalf("gc last observed source segments protected=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_in_use"]; got != "0" { + t.Fatalf("gc last observed source segments protected in-use=%q want 0", got) + } + if got := 
stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_retained"]; got != "2" { + t.Fatalf("gc last observed source segments protected retained=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_overlap"]; got != "0" { + t.Fatalf("gc last observed source segments protected overlap=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_other"]; got != "0" { + t.Fatalf("gc last observed source segments protected other=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_eligible"]; got != "0" { + t.Fatalf("gc last observed source segments eligible=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_deleted"]; got != "0" { + t.Fatalf("gc last observed source segments deleted=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.segments_pending"]; got != "0" { + t.Fatalf("gc last observed source segments pending=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes"]; got != "250" { + t.Fatalf("gc last observed source bytes=%q want 250", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_referenced"]; got != "0" { + t.Fatalf("gc last observed source bytes referenced=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_active"]; got != "0" { + t.Fatalf("gc last observed source bytes active=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected"]; got != "250" { + t.Fatalf("gc last observed source bytes protected=%q want 250", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_in_use"]; got != "0" { + t.Fatalf("gc last observed source bytes protected in-use=%q want 0", 
got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_retained"]; got != "250" { + t.Fatalf("gc last observed source bytes protected retained=%q want 250", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_overlap"]; got != "0" { + t.Fatalf("gc last observed source bytes protected overlap=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_other"]; got != "0" { + t.Fatalf("gc last observed source bytes protected other=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_eligible"]; got != "0" { + t.Fatalf("gc last observed source bytes eligible=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_deleted"]; got != "0" { + t.Fatalf("gc last observed source bytes deleted=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.gc.last_observed_source.bytes_pending"]; got != "0" { + t.Fatalf("gc last observed source bytes pending=%q want 0", got) + } if got := stats["treedb.cache.vlog_generation.vacuum.exec.total_ms"]; got != "44.000" { t.Fatalf("vacuum exec total ms=%q want 44.000", got) } diff --git a/TreeDB/db/vlog_gc.go b/TreeDB/db/vlog_gc.go index cbdc96e49..258c03b5a 100644 --- a/TreeDB/db/vlog_gc.go +++ b/TreeDB/db/vlog_gc.go @@ -27,32 +27,58 @@ type ValueLogGCOptions struct { ProtectedInUsePaths []string // ProtectedRetainedPaths are paths pinned by pointer lifecycle retention. ProtectedRetainedPaths []string + // ObservedSourceFileIDs enables per-classification probe counters for a + // caller-provided subset of segment IDs (for example, rewrite-selected + // source segments). IDs not present in the current set are ignored. + ObservedSourceFileIDs []uint32 } // ValueLogGCStats summarizes value-log GC work. 
type ValueLogGCStats struct { - SegmentsTotal int - SegmentsReferenced int - SegmentsActive int - SegmentsProtected int - SegmentsProtectedInUse int - SegmentsProtectedRetained int - SegmentsProtectedOverlap int - SegmentsProtectedOther int - SegmentsEligible int - SegmentsDeleted int - SegmentsPending int - BytesTotal int64 - BytesReferenced int64 - BytesActive int64 - BytesProtected int64 - BytesProtectedInUse int64 - BytesProtectedRetained int64 - BytesProtectedOverlap int64 - BytesProtectedOther int64 - BytesEligible int64 - BytesDeleted int64 - BytesPending int64 + SegmentsTotal int + SegmentsReferenced int + SegmentsActive int + SegmentsProtected int + SegmentsProtectedInUse int + SegmentsProtectedRetained int + SegmentsProtectedOverlap int + SegmentsProtectedOther int + SegmentsEligible int + SegmentsDeleted int + SegmentsPending int + BytesTotal int64 + BytesReferenced int64 + BytesActive int64 + BytesProtected int64 + BytesProtectedInUse int64 + BytesProtectedRetained int64 + BytesProtectedOverlap int64 + BytesProtectedOther int64 + BytesEligible int64 + BytesDeleted int64 + BytesPending int64 + ObservedSourceSegments int + ObservedSourceSegmentsReferenced int + ObservedSourceSegmentsActive int + ObservedSourceSegmentsProtected int + ObservedSourceSegmentsProtectedInUse int + ObservedSourceSegmentsProtectedRetained int + ObservedSourceSegmentsProtectedOverlap int + ObservedSourceSegmentsProtectedOther int + ObservedSourceSegmentsEligible int + ObservedSourceSegmentsDeleted int + ObservedSourceSegmentsPending int + ObservedSourceBytes int64 + ObservedSourceBytesReferenced int64 + ObservedSourceBytesActive int64 + ObservedSourceBytesProtected int64 + ObservedSourceBytesProtectedInUse int64 + ObservedSourceBytesProtectedRetained int64 + ObservedSourceBytesProtectedOverlap int64 + ObservedSourceBytesProtectedOther int64 + ObservedSourceBytesEligible int64 + ObservedSourceBytesDeleted int64 + ObservedSourceBytesPending int64 } // ValueLogGC deletes 
fully-unreferenced value-log segments. @@ -127,27 +153,49 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG protectedRetainedPaths[path] = struct{}{} } type candidate struct { - path string - size int64 + path string + size int64 + observed bool } candidates := make(map[uint32]candidate) + observedSourceIDs := make(map[uint32]struct{}, len(opts.ObservedSourceFileIDs)) + for _, id := range opts.ObservedSourceFileIDs { + if id == 0 { + continue + } + observedSourceIDs[id] = struct{}{} + } for id, f := range set.Files { if err := ctx.Err(); err != nil { return stats, err } size := fileSize(f) + observed := false + if _, ok := observedSourceIDs[id]; ok { + observed = true + stats.ObservedSourceSegments++ + stats.ObservedSourceBytes += size + } stats.SegmentsTotal++ stats.BytesTotal += size if _, ok := referenced[id]; ok { stats.SegmentsReferenced++ stats.BytesReferenced += size + if observed { + stats.ObservedSourceSegmentsReferenced++ + stats.ObservedSourceBytesReferenced += size + } continue } if _, ok := keptIDs[id]; ok { stats.SegmentsActive++ stats.BytesActive += size + if observed { + stats.ObservedSourceSegmentsActive++ + stats.ObservedSourceBytesActive += size + } continue } _, inUseProtected := protectedInUsePaths[f.Path] @@ -155,16 +203,32 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG if inUseProtected || retainedProtected { stats.SegmentsProtected++ stats.BytesProtected += size + if observed { + stats.ObservedSourceSegmentsProtected++ + stats.ObservedSourceBytesProtected += size + } switch { case inUseProtected && retainedProtected: stats.SegmentsProtectedOverlap++ stats.BytesProtectedOverlap += size + if observed { + stats.ObservedSourceSegmentsProtectedOverlap++ + stats.ObservedSourceBytesProtectedOverlap += size + } case inUseProtected: stats.SegmentsProtectedInUse++ stats.BytesProtectedInUse += size + if observed { + stats.ObservedSourceSegmentsProtectedInUse++ + 
stats.ObservedSourceBytesProtectedInUse += size + } default: stats.SegmentsProtectedRetained++ stats.BytesProtectedRetained += size + if observed { + stats.ObservedSourceSegmentsProtectedRetained++ + stats.ObservedSourceBytesProtectedRetained += size + } } continue } @@ -173,11 +237,21 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG stats.BytesProtected += size stats.SegmentsProtectedOther++ stats.BytesProtectedOther += size + if observed { + stats.ObservedSourceSegmentsProtected++ + stats.ObservedSourceBytesProtected += size + stats.ObservedSourceSegmentsProtectedOther++ + stats.ObservedSourceBytesProtectedOther += size + } continue } stats.SegmentsEligible++ stats.BytesEligible += size + if observed { + stats.ObservedSourceSegmentsEligible++ + stats.ObservedSourceBytesEligible += size + } if opts.DryRun { continue @@ -185,7 +259,7 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG if err := vm.MarkZombie(id); err != nil { return stats, err } - candidates[id] = candidate{path: f.Path, size: size} + candidates[id] = candidate{path: f.Path, size: size, observed: observed} } if opts.DryRun { @@ -212,6 +286,10 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG if os.IsNotExist(err) { stats.SegmentsDeleted++ stats.BytesDeleted += info.size + if info.observed { + stats.ObservedSourceSegmentsDeleted++ + stats.ObservedSourceBytesDeleted += info.size + } } else { return stats, err } @@ -223,6 +301,12 @@ func (db *DB) ValueLogGC(ctx context.Context, opts ValueLogGCOptions) (ValueLogG if stats.BytesEligible > stats.BytesDeleted { stats.BytesPending = stats.BytesEligible - stats.BytesDeleted } + if stats.ObservedSourceSegmentsEligible > stats.ObservedSourceSegmentsDeleted { + stats.ObservedSourceSegmentsPending = stats.ObservedSourceSegmentsEligible - stats.ObservedSourceSegmentsDeleted + } + if stats.ObservedSourceBytesEligible > stats.ObservedSourceBytesDeleted { + 
stats.ObservedSourceBytesPending = stats.ObservedSourceBytesEligible - stats.ObservedSourceBytesDeleted + } currentSet := vm.CurrentSetNoRefresh() if currentSet != nil { diff --git a/TreeDB/db/vlog_gc_test.go b/TreeDB/db/vlog_gc_test.go index 771f2b116..c6da6710c 100644 --- a/TreeDB/db/vlog_gc_test.go +++ b/TreeDB/db/vlog_gc_test.go @@ -199,11 +199,24 @@ func TestValueLogGC_ProtectedPathBreakdownStats(t *testing.T) { inUseOnlyPath := filepath.Join(dir, "wal", "value-l0-000001.log") retainedOnlyPath := filepath.Join(dir, "wal", "value-l0-000002.log") overlapPath := filepath.Join(dir, "wal", "value-l0-000003.log") + observedInUseID, err := valuelog.EncodeFileID(0, 1) + if err != nil { + t.Fatalf("observed in-use fileid: %v", err) + } + observedRetainedID, err := valuelog.EncodeFileID(0, 2) + if err != nil { + t.Fatalf("observed retained fileid: %v", err) + } + observedOverlapID, err := valuelog.EncodeFileID(0, 3) + if err != nil { + t.Fatalf("observed overlap fileid: %v", err) + } stats, err := db.ValueLogGC(context.Background(), ValueLogGCOptions{ DryRun: true, ProtectedInUsePaths: []string{inUseOnlyPath, overlapPath}, ProtectedRetainedPaths: []string{retainedOnlyPath, overlapPath}, + ObservedSourceFileIDs: []uint32{observedInUseID, observedRetainedID, observedOverlapID}, }) if err != nil { t.Fatalf("ValueLogGC: %v", err) @@ -245,6 +258,62 @@ func TestValueLogGC_ProtectedPathBreakdownStats(t *testing.T) { if stats.BytesProtectedOther != 0 { t.Fatalf("bytes protected other=%d want 0", stats.BytesProtectedOther) } + if stats.ObservedSourceSegments != 3 { + t.Fatalf("observed source segments=%d want 3", stats.ObservedSourceSegments) + } + if stats.ObservedSourceSegmentsReferenced != 0 { + t.Fatalf("observed source segments referenced=%d want 0", stats.ObservedSourceSegmentsReferenced) + } + if stats.ObservedSourceSegmentsActive != 0 { + t.Fatalf("observed source segments active=%d want 0", stats.ObservedSourceSegmentsActive) + } + if 
stats.ObservedSourceSegmentsProtected != 3 { + t.Fatalf("observed source segments protected=%d want 3", stats.ObservedSourceSegmentsProtected) + } + if stats.ObservedSourceSegmentsProtectedInUse != 1 { + t.Fatalf("observed source segments protected in-use=%d want 1", stats.ObservedSourceSegmentsProtectedInUse) + } + if stats.ObservedSourceSegmentsProtectedRetained != 1 { + t.Fatalf("observed source segments protected retained=%d want 1", stats.ObservedSourceSegmentsProtectedRetained) + } + if stats.ObservedSourceSegmentsProtectedOverlap != 1 { + t.Fatalf("observed source segments protected overlap=%d want 1", stats.ObservedSourceSegmentsProtectedOverlap) + } + if stats.ObservedSourceSegmentsProtectedOther != 0 { + t.Fatalf("observed source segments protected other=%d want 0", stats.ObservedSourceSegmentsProtectedOther) + } + if stats.ObservedSourceSegmentsEligible != 0 { + t.Fatalf("observed source segments eligible=%d want 0", stats.ObservedSourceSegmentsEligible) + } + if stats.ObservedSourceSegmentsDeleted != 0 { + t.Fatalf("observed source segments deleted=%d want 0", stats.ObservedSourceSegmentsDeleted) + } + if stats.ObservedSourceSegmentsPending != 0 { + t.Fatalf("observed source segments pending=%d want 0", stats.ObservedSourceSegmentsPending) + } + if stats.ObservedSourceBytes <= 0 { + t.Fatalf("observed source bytes=%d want >0", stats.ObservedSourceBytes) + } + if stats.ObservedSourceBytesProtected <= 0 { + t.Fatalf("observed source bytes protected=%d want >0", stats.ObservedSourceBytesProtected) + } + if stats.ObservedSourceBytesProtectedInUse <= 0 || + stats.ObservedSourceBytesProtectedRetained <= 0 || + stats.ObservedSourceBytesProtectedOverlap <= 0 { + t.Fatalf("expected non-zero observed source protected byte buckets, got %+v", stats) + } + if stats.ObservedSourceBytesProtectedOther != 0 { + t.Fatalf("observed source bytes protected other=%d want 0", stats.ObservedSourceBytesProtectedOther) + } + if stats.ObservedSourceBytesEligible != 0 { + 
t.Fatalf("observed source bytes eligible=%d want 0", stats.ObservedSourceBytesEligible) + } + if stats.ObservedSourceBytesDeleted != 0 { + t.Fatalf("observed source bytes deleted=%d want 0", stats.ObservedSourceBytesDeleted) + } + if stats.ObservedSourceBytesPending != 0 { + t.Fatalf("observed source bytes pending=%d want 0", stats.ObservedSourceBytesPending) + } } func TestValueLogGC_KeepsReferencedPointerSegments_WithOuterLeavesInValueLog(t *testing.T) { From 0813e223edda24ee0215f4b278a09007bf1c0de1 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 03:18:48 -1000 Subject: [PATCH 23/61] caching: trace rewrite-observed retained prune outcomes --- TreeDB/caching/db.go | 451 ++++++++++++++---- .../caching/vlog_generation_scheduler_test.go | 120 ++++- 2 files changed, 468 insertions(+), 103 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 89a9929c7..a6fcc3705 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4194,21 +4194,100 @@ type valueLogSetRefresher interface { } type retainedValueLogPruneStats struct { - RemovedSegments int - RemovedBytes int64 - InUseSkippedSegments int - InUseSkippedBytes int64 - CandidateSegments int - CandidateBytes int64 - LiveSkippedSegments int - LiveSkippedBytes int64 - ParseSkippedSegments int - ParseSkippedBytes int64 - ZombieMarkedSegments int - ZombieMarkedBytes int64 - AbortedForegroundWrites bool - RetriedWithoutWriteGate bool - RetrySucceeded bool + RemovedSegments int + RemovedBytes int64 + InUseSkippedSegments int + InUseSkippedBytes int64 + CandidateSegments int + CandidateBytes int64 + LiveSkippedSegments int + LiveSkippedBytes int64 + ParseSkippedSegments int + ParseSkippedBytes int64 + ZombieMarkedSegments int + ZombieMarkedBytes int64 + ObservedSourceSegments int + ObservedSourceBytes int64 + ObservedSourceCandidateSegments int + ObservedSourceCandidateBytes int64 + ObservedSourceRemovedSegments int + ObservedSourceRemovedBytes int64 + ObservedSourceInUseSkippedSegments 
int + ObservedSourceInUseSkippedBytes int64 + ObservedSourceLiveSkippedSegments int + ObservedSourceLiveSkippedBytes int64 + ObservedSourceParseSkippedSegments int + ObservedSourceParseSkippedBytes int64 + ObservedSourceZombieMarkedSegments int + ObservedSourceZombieMarkedBytes int64 + AbortedForegroundWrites bool + RetriedWithoutWriteGate bool + RetrySucceeded bool +} + +func (db *DB) observeRetainedValueLogPruneStats(pruneStats retainedValueLogPruneStats) { + if db == nil { + return + } + db.retainedValueLogPruneLastObservedSourceSegments.Store(int64(pruneStats.ObservedSourceSegments)) + db.retainedValueLogPruneLastObservedSourceBytes.Store(pruneStats.ObservedSourceBytes) + db.retainedValueLogPruneLastObservedSourceCandidateSegments.Store(int64(pruneStats.ObservedSourceCandidateSegments)) + db.retainedValueLogPruneLastObservedSourceCandidateBytes.Store(pruneStats.ObservedSourceCandidateBytes) + db.retainedValueLogPruneLastObservedSourceRemovedSegments.Store(int64(pruneStats.ObservedSourceRemovedSegments)) + db.retainedValueLogPruneLastObservedSourceRemovedBytes.Store(pruneStats.ObservedSourceRemovedBytes) + db.retainedValueLogPruneLastObservedSourceInUseSkippedSegments.Store(int64(pruneStats.ObservedSourceInUseSkippedSegments)) + db.retainedValueLogPruneLastObservedSourceInUseSkippedBytes.Store(pruneStats.ObservedSourceInUseSkippedBytes) + db.retainedValueLogPruneLastObservedSourceLiveSkippedSegments.Store(int64(pruneStats.ObservedSourceLiveSkippedSegments)) + db.retainedValueLogPruneLastObservedSourceLiveSkippedBytes.Store(pruneStats.ObservedSourceLiveSkippedBytes) + db.retainedValueLogPruneLastObservedSourceParseSkippedSegments.Store(int64(pruneStats.ObservedSourceParseSkippedSegments)) + db.retainedValueLogPruneLastObservedSourceParseSkippedBytes.Store(pruneStats.ObservedSourceParseSkippedBytes) + db.retainedValueLogPruneLastObservedSourceZombieMarkedSegments.Store(int64(pruneStats.ObservedSourceZombieMarkedSegments)) + 
db.retainedValueLogPruneLastObservedSourceZombieMarkedBytes.Store(pruneStats.ObservedSourceZombieMarkedBytes) + if pruneStats.RetriedWithoutWriteGate { + db.retainedValueLogPruneWriteGateRetries.Add(1) + if pruneStats.RetrySucceeded { + db.retainedValueLogPruneWriteGateRetrySuccesses.Add(1) + } + } + if pruneStats.AbortedForegroundWrites { + db.retainedValueLogPruneForegroundAbortRuns.Add(1) + } + if pruneStats.RemovedSegments > 0 { + db.retainedValueLogPruneRemovedSegments.Add(uint64(pruneStats.RemovedSegments)) + } + if pruneStats.RemovedBytes > 0 { + db.retainedValueLogPruneRemovedBytes.Add(uint64(pruneStats.RemovedBytes)) + } + if pruneStats.InUseSkippedSegments > 0 { + db.retainedValueLogPruneInUseSkippedSegments.Add(uint64(pruneStats.InUseSkippedSegments)) + } + if pruneStats.InUseSkippedBytes > 0 { + db.retainedValueLogPruneInUseSkippedBytes.Add(uint64(pruneStats.InUseSkippedBytes)) + } + if pruneStats.CandidateSegments > 0 { + db.retainedValueLogPruneCandidateSegments.Add(uint64(pruneStats.CandidateSegments)) + } + if pruneStats.CandidateBytes > 0 { + db.retainedValueLogPruneCandidateBytes.Add(uint64(pruneStats.CandidateBytes)) + } + if pruneStats.LiveSkippedSegments > 0 { + db.retainedValueLogPruneLiveSkippedSegments.Add(uint64(pruneStats.LiveSkippedSegments)) + } + if pruneStats.LiveSkippedBytes > 0 { + db.retainedValueLogPruneLiveSkippedBytes.Add(uint64(pruneStats.LiveSkippedBytes)) + } + if pruneStats.ParseSkippedSegments > 0 { + db.retainedValueLogPruneParseSkippedSegments.Add(uint64(pruneStats.ParseSkippedSegments)) + } + if pruneStats.ParseSkippedBytes > 0 { + db.retainedValueLogPruneParseSkippedBytes.Add(uint64(pruneStats.ParseSkippedBytes)) + } + if pruneStats.ZombieMarkedSegments > 0 { + db.retainedValueLogPruneZombieMarkedSegments.Add(uint64(pruneStats.ZombieMarkedSegments)) + } + if pruneStats.ZombieMarkedBytes > 0 { + db.retainedValueLogPruneZombieMarkedBytes.Add(uint64(pruneStats.ZombieMarkedBytes)) + } } func (db *DB) 
valueLogClosedSegmentSize(path string) int64 { @@ -4229,6 +4308,10 @@ func (db *DB) valueLogClosedSegmentSize(path string) int64 { } func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { + return db.pruneRetainedValueLogsWithObserved(force, nil) +} + +func (db *DB) pruneRetainedValueLogsWithObserved(force bool, observedSourceIDs map[uint32]struct{}) retainedValueLogPruneStats { var out retainedValueLogPruneStats if !db.valueLogEnabled() { return out @@ -4244,17 +4327,40 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { } type pruneCandidate struct { - path string - size int64 + path string + size int64 + id uint32 + hasID bool + observed bool } candidatePaths := make([]pruneCandidate, 0, len(paths)) for _, path := range paths { size := db.valueLogClosedSegmentSize(path) + candidate := pruneCandidate{path: path, size: size} + if laneID, seq, valueLog, ok := parseLogSeq(filepath.Base(path)); ok && valueLog && laneID >= 0 { + if id, err := valuelog.EncodeFileID(uint32(laneID), uint32(seq)); err == nil { + candidate.id = id + candidate.hasID = true + if _, ok := observedSourceIDs[id]; ok { + candidate.observed = true + out.ObservedSourceSegments++ + if size > 0 { + out.ObservedSourceBytes += size + } + } + } + } if _, ok := inUse[path]; ok { out.InUseSkippedSegments++ if size > 0 { out.InUseSkippedBytes += size } + if candidate.observed { + out.ObservedSourceInUseSkippedSegments++ + if size > 0 { + out.ObservedSourceInUseSkippedBytes += size + } + } continue } if db.cleanupMissingRetainedValueLog(path) { @@ -4262,13 +4368,25 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { out.RemovedSegments++ out.RemovedBytes += size } + if candidate.observed { + out.ObservedSourceRemovedSegments++ + if size > 0 { + out.ObservedSourceRemovedBytes += size + } + } continue } out.CandidateSegments++ if size > 0 { out.CandidateBytes += size } - candidatePaths = append(candidatePaths, 
pruneCandidate{path: path, size: size}) + if candidate.observed { + out.ObservedSourceCandidateSegments++ + if size > 0 { + out.ObservedSourceCandidateBytes += size + } + } + candidatePaths = append(candidatePaths, candidate) } if len(candidatePaths) == 0 { return out @@ -4296,26 +4414,17 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { for _, candidate := range candidatePaths { path := candidate.path size := candidate.size - laneID, seq, valueLog, ok := parseLogSeq(filepath.Base(path)) - if !ok || !valueLog { + id := candidate.id + if !candidate.hasID { out.ParseSkippedSegments++ if size > 0 { out.ParseSkippedBytes += size } - continue - } - if laneID < 0 { - out.ParseSkippedSegments++ - if size > 0 { - out.ParseSkippedBytes += size - } - continue - } - id, err := valuelog.EncodeFileID(uint32(laneID), uint32(seq)) - if err != nil { - out.ParseSkippedSegments++ - if size > 0 { - out.ParseSkippedBytes += size + if candidate.observed { + out.ObservedSourceParseSkippedSegments++ + if size > 0 { + out.ObservedSourceParseSkippedBytes += size + } } continue } @@ -4324,6 +4433,12 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { if size > 0 { out.LiveSkippedBytes += size } + if candidate.observed { + out.ObservedSourceLiveSkippedSegments++ + if size > 0 { + out.ObservedSourceLiveSkippedBytes += size + } + } continue } @@ -4338,6 +4453,12 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { out.RemovedSegments++ out.RemovedBytes += size } + if candidate.observed { + out.ObservedSourceRemovedSegments++ + if size > 0 { + out.ObservedSourceRemovedBytes += size + } + } continue } if db.cleanupMissingRetainedValueLog(path) { @@ -4345,6 +4466,12 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { out.RemovedSegments++ out.RemovedBytes += size } + if candidate.observed { + out.ObservedSourceRemovedSegments++ + if size > 0 { + out.ObservedSourceRemovedBytes += 
size + } + } continue } db.reportError(fmt.Errorf("cachingdb: failed to mark value-log %d zombie: %w", id, err)) @@ -4354,6 +4481,12 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { if size > 0 { out.ZombieMarkedBytes += size } + if candidate.observed { + out.ObservedSourceZombieMarkedSegments++ + if size > 0 { + out.ObservedSourceZombieMarkedBytes += size + } + } marked = true } else { db.dropValueLogSegment(path) @@ -4366,6 +4499,12 @@ func (db *DB) pruneRetainedValueLogs(force bool) retainedValueLogPruneStats { out.RemovedSegments++ out.RemovedBytes += size } + if candidate.observed { + out.ObservedSourceRemovedSegments++ + if size > 0 { + out.ObservedSourceRemovedBytes += size + } + } } db.forgetValueLogRetain(path) } @@ -4468,6 +4607,38 @@ func (db *DB) waitForRetainedValueLogPruneQuietOrForce(quietWindow time.Duration } } +func (db *DB) queueRetainedPruneObservedSourceIDs(ids []uint32) { + if db == nil || len(ids) == 0 { + return + } + db.retainedPruneObservedMu.Lock() + if db.retainedPruneObservedSourceIDs == nil { + db.retainedPruneObservedSourceIDs = make(map[uint32]struct{}, len(ids)) + } + for _, id := range ids { + if id == 0 { + continue + } + db.retainedPruneObservedSourceIDs[id] = struct{}{} + } + db.retainedPruneObservedMu.Unlock() +} + +func (db *DB) takeRetainedPruneObservedSourceIDs() map[uint32]struct{} { + if db == nil { + return nil + } + db.retainedPruneObservedMu.Lock() + if len(db.retainedPruneObservedSourceIDs) == 0 { + db.retainedPruneObservedMu.Unlock() + return nil + } + out := db.retainedPruneObservedSourceIDs + db.retainedPruneObservedSourceIDs = nil + db.retainedPruneObservedMu.Unlock() + return out +} + func (db *DB) scheduleRetainedValueLogPrune() { db.scheduleRetainedValueLogPruneWithForce(false) } @@ -4545,51 +4716,13 @@ func (db *DB) scheduleRetainedValueLogPruneWithForce(force bool) { db.retainedValueLogPruneForcedRuns.Add(1) } db.retainedValueLogPruneLastUnixNano.Store(now.UnixNano()) - 
pruneStats := db.pruneRetainedValueLogs(effectiveForce) - if pruneStats.RetriedWithoutWriteGate { - db.retainedValueLogPruneWriteGateRetries.Add(1) - if pruneStats.RetrySucceeded { - db.retainedValueLogPruneWriteGateRetrySuccesses.Add(1) - } - } - if pruneStats.AbortedForegroundWrites { - db.retainedValueLogPruneForegroundAbortRuns.Add(1) - } - if pruneStats.RemovedSegments > 0 { - db.retainedValueLogPruneRemovedSegments.Add(uint64(pruneStats.RemovedSegments)) - } - if pruneStats.RemovedBytes > 0 { - db.retainedValueLogPruneRemovedBytes.Add(uint64(pruneStats.RemovedBytes)) - } - if pruneStats.InUseSkippedSegments > 0 { - db.retainedValueLogPruneInUseSkippedSegments.Add(uint64(pruneStats.InUseSkippedSegments)) - } - if pruneStats.InUseSkippedBytes > 0 { - db.retainedValueLogPruneInUseSkippedBytes.Add(uint64(pruneStats.InUseSkippedBytes)) - } - if pruneStats.CandidateSegments > 0 { - db.retainedValueLogPruneCandidateSegments.Add(uint64(pruneStats.CandidateSegments)) - } - if pruneStats.CandidateBytes > 0 { - db.retainedValueLogPruneCandidateBytes.Add(uint64(pruneStats.CandidateBytes)) - } - if pruneStats.LiveSkippedSegments > 0 { - db.retainedValueLogPruneLiveSkippedSegments.Add(uint64(pruneStats.LiveSkippedSegments)) - } - if pruneStats.LiveSkippedBytes > 0 { - db.retainedValueLogPruneLiveSkippedBytes.Add(uint64(pruneStats.LiveSkippedBytes)) - } - if pruneStats.ParseSkippedSegments > 0 { - db.retainedValueLogPruneParseSkippedSegments.Add(uint64(pruneStats.ParseSkippedSegments)) - } - if pruneStats.ParseSkippedBytes > 0 { - db.retainedValueLogPruneParseSkippedBytes.Add(uint64(pruneStats.ParseSkippedBytes)) - } - if pruneStats.ZombieMarkedSegments > 0 { - db.retainedValueLogPruneZombieMarkedSegments.Add(uint64(pruneStats.ZombieMarkedSegments)) - } - if pruneStats.ZombieMarkedBytes > 0 { - db.retainedValueLogPruneZombieMarkedBytes.Add(uint64(pruneStats.ZombieMarkedBytes)) + observedSourceIDs := db.takeRetainedPruneObservedSourceIDs() + pruneStats := 
db.pruneRetainedValueLogsWithObserved(effectiveForce, observedSourceIDs) + db.observeRetainedValueLogPruneStats(pruneStats) + if len(observedSourceIDs) > 0 && (pruneStats.ObservedSourceZombieMarkedSegments > 0 || pruneStats.ObservedSourceRemovedSegments > 0) { + // When a retained prune processes rewrite-observed source segments, + // queue a near-term maintenance pass so GC can re-check reclaim state. + db.vlogGenerationCheckpointKickPending.Store(true) } }() } @@ -5398,6 +5531,20 @@ type DB struct { retainedValueLogPruneParseSkippedBytes atomic.Uint64 retainedValueLogPruneZombieMarkedSegments atomic.Uint64 retainedValueLogPruneZombieMarkedBytes atomic.Uint64 + retainedValueLogPruneLastObservedSourceSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceCandidateSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceCandidateBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceRemovedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceRemovedBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceInUseSkippedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceInUseSkippedBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceLiveSkippedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceLiveSkippedBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceParseSkippedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceParseSkippedBytes atomic.Int64 + retainedValueLogPruneLastObservedSourceZombieMarkedSegments atomic.Int64 + retainedValueLogPruneLastObservedSourceZombieMarkedBytes atomic.Int64 retainedValueLogPruneScheduleRequests atomic.Uint64 retainedValueLogPruneScheduleForcedRequests atomic.Uint64 retainedValueLogPruneScheduleSkipClosing atomic.Uint64 @@ -5408,6 +5555,8 @@ type DB struct { retainedValueLogPruneWriteGateRetries atomic.Uint64 retainedValueLogPruneWriteGateRetrySuccesses atomic.Uint64 retainedPruneForceRequested 
atomic.Bool + retainedPruneObservedMu sync.Mutex + retainedPruneObservedSourceIDs map[uint32]struct{} retainedPruneMu sync.Mutex retainedPruneDone chan struct{} vlogGenerationRemapSuccesses atomic.Uint64 @@ -14334,27 +14483,121 @@ planned: } } if gcer, ok := db.backend.(backendValueLogGCer); ok { - gcCtx, gcCancel := context.WithTimeout(context.Background(), 30*time.Second) - gcStart := time.Now() gcOpts := db.valueLogGCOptions(false) if len(processedRewriteIDs) > 0 { gcOpts.ObservedSourceFileIDs = append([]uint32(nil), processedRewriteIDs...) } - gcStats, gcErr := gcer.ValueLogGC(gcCtx, gcOpts) - gcCancel() - gcDur := time.Since(gcStart) - db.observeVlogGenerationGCExecDuration(gcDur) + runGC := func(phase string) (backenddb.ValueLogGCStats, error) { + gcCtx, gcCancel := context.WithTimeout(context.Background(), 30*time.Second) + gcStart := time.Now() + gcStats, gcErr := gcer.ValueLogGC(gcCtx, gcOpts) + gcCancel() + gcDur := time.Since(gcStart) + db.observeVlogGenerationGCExecDuration(gcDur) + if gcErr != nil { + db.debugVlogMaintf( + "gc_after_rewrite_err reason=%s phase=%s err=%v dur_ms=%.3f", + vlogGenerationReasonString(reason), + phase, + gcErr, + float64(gcDur.Microseconds())/1000, + ) + return backenddb.ValueLogGCStats{}, gcErr + } + db.observeVlogGenerationGCStats(gcStats) + db.vlogGenerationGCRuns.Add(1) + if gcStats.SegmentsDeleted > 0 { + db.vlogGenerationGCSegmentsDeleted.Add(uint64(gcStats.SegmentsDeleted)) + } + if gcStats.BytesDeleted > 0 { + db.vlogGenerationGCBytesDeleted.Add(uint64(gcStats.BytesDeleted)) + gcBytesDeleted += int64(gcStats.BytesDeleted) + effectiveBytesAfter -= int64(gcStats.BytesDeleted) + if effectiveBytesAfter < 0 { + effectiveBytesAfter = 0 + } + } + db.debugVlogMaintf( + "gc_after_rewrite_done reason=%s phase=%s dur_ms=%.3f", + vlogGenerationReasonString(reason), + phase, + float64(gcDur.Microseconds())/1000, + ) + return gcStats, nil + } + + gcStats, gcErr := runGC("initial") if gcErr != nil { - 
db.debugVlogMaintf("gc_after_rewrite_err reason=%s err=%v dur_ms=%.3f", vlogGenerationReasonString(reason), gcErr, float64(gcDur.Microseconds())/1000) return fmt.Errorf("generational gc after rewrite: %w", gcErr) } - db.observeVlogGenerationGCStats(gcStats) - db.vlogGenerationGCRuns.Add(1) - if gcStats.SegmentsDeleted > 0 { - db.vlogGenerationGCSegmentsDeleted.Add(uint64(gcStats.SegmentsDeleted)) - } - if gcStats.BytesDeleted > 0 { - db.vlogGenerationGCBytesDeleted.Add(uint64(gcStats.BytesDeleted)) + + rewriteBlockedByRetained := len(processedRewriteIDs) > 0 && + gcStats.ObservedSourceSegments > 0 && + gcStats.ObservedSourceSegmentsReferenced == 0 && + gcStats.ObservedSourceSegmentsEligible == 0 && + gcStats.ObservedSourceSegmentsProtectedRetained > 0 + if rewriteBlockedByRetained { + if db.retainedPruneActive() { + db.queueRetainedPruneObservedSourceIDs(processedRewriteIDs) + // A prune is already in flight. Ensure a follow-up attempt stays queued. + db.scheduleRetainedValueLogPruneForce() + // Request a follow-up maintenance pass so GC can re-evaluate + // rewrite-observed source segments after the in-flight prune completes. 
+ db.vlogGenerationCheckpointKickPending.Store(true) + } else { + observedSourceIDSet := make(map[uint32]struct{}, len(processedRewriteIDs)) + for _, id := range processedRewriteIDs { + if id == 0 { + continue + } + observedSourceIDSet[id] = struct{}{} + } + nowPrune := time.Now() + db.retainedPruneLastStartUnixNano.Store(nowPrune.UnixNano()) + db.retainedValueLogPruneRuns.Add(1) + db.retainedValueLogPruneForcedRuns.Add(1) + db.retainedValueLogPruneLastUnixNano.Store(nowPrune.UnixNano()) + pruneStats := db.pruneRetainedValueLogsWithObserved(true, observedSourceIDSet) + db.observeRetainedValueLogPruneStats(pruneStats) + db.debugVlogMaintf( + "rewrite_retained_prune reason=%s observed_source_retained_segments=%d observed_source_retained_bytes=%d observed_source_seen_segments=%d observed_source_seen_bytes=%d observed_source_candidate_segments=%d observed_source_candidate_bytes=%d observed_source_removed_segments=%d observed_source_removed_bytes=%d observed_source_zombie_marked_segments=%d observed_source_zombie_marked_bytes=%d observed_source_live_skipped_segments=%d observed_source_live_skipped_bytes=%d observed_source_in_use_skipped_segments=%d observed_source_in_use_skipped_bytes=%d observed_source_parse_skipped_segments=%d observed_source_parse_skipped_bytes=%d removed_segments=%d removed_bytes=%d zombie_marked_segments=%d zombie_marked_bytes=%d live_skipped_segments=%d live_skipped_bytes=%d aborted=%t", + vlogGenerationReasonString(reason), + gcStats.ObservedSourceSegmentsProtectedRetained, + gcStats.ObservedSourceBytesProtectedRetained, + pruneStats.ObservedSourceSegments, + pruneStats.ObservedSourceBytes, + pruneStats.ObservedSourceCandidateSegments, + pruneStats.ObservedSourceCandidateBytes, + pruneStats.ObservedSourceRemovedSegments, + pruneStats.ObservedSourceRemovedBytes, + pruneStats.ObservedSourceZombieMarkedSegments, + pruneStats.ObservedSourceZombieMarkedBytes, + pruneStats.ObservedSourceLiveSkippedSegments, + pruneStats.ObservedSourceLiveSkippedBytes, 
+ pruneStats.ObservedSourceInUseSkippedSegments, + pruneStats.ObservedSourceInUseSkippedBytes, + pruneStats.ObservedSourceParseSkippedSegments, + pruneStats.ObservedSourceParseSkippedBytes, + pruneStats.RemovedSegments, + pruneStats.RemovedBytes, + pruneStats.ZombieMarkedSegments, + pruneStats.ZombieMarkedBytes, + pruneStats.LiveSkippedSegments, + pruneStats.LiveSkippedBytes, + pruneStats.AbortedForegroundWrites, + ) + // Refresh protected path sets after inline retained prune so + // the follow-up GC pass evaluates updated retention state. + gcOpts = db.valueLogGCOptions(false) + if len(processedRewriteIDs) > 0 { + gcOpts.ObservedSourceFileIDs = append([]uint32(nil), processedRewriteIDs...) + } + gcStatsAfterPrune, gcErr := runGC("post_retained_prune") + if gcErr != nil { + return fmt.Errorf("generational gc after retained prune: %w", gcErr) + } + gcStats = gcStatsAfterPrune + } } if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { // Retained-path protection can starve live reclaim even when rewrite @@ -14362,14 +14605,6 @@ planned: // lifecycle pins can drain without waiting for byte-pressure gates. 
db.scheduleRetainedValueLogPruneForce() } - if gcStats.BytesDeleted > 0 { - gcBytesDeleted = int64(gcStats.BytesDeleted) - effectiveBytesAfter -= gcBytesDeleted - if effectiveBytesAfter < 0 { - effectiveBytesAfter = 0 - } - } - db.debugVlogMaintf("gc_after_rewrite_done reason=%s dur_ms=%.3f", vlogGenerationReasonString(reason), float64(gcDur.Microseconds())/1000) } if effectiveBytesBefore > effectiveBytesAfter { db.vlogGenerationRewriteReclaimedBytes.Add(uint64(effectiveBytesBefore - effectiveBytesAfter)) @@ -20187,6 +20422,20 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_retained_prune.parse_skipped_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneParseSkippedBytes.Load()) stats["treedb.cache.vlog_retained_prune.zombie_marked_segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneZombieMarkedSegments.Load()) stats["treedb.cache.vlog_retained_prune.zombie_marked_bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneZombieMarkedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_candidate"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceCandidateSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_candidate"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceCandidateBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_removed"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceRemovedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_removed"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceRemovedBytes.Load()) + 
stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_in_use_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceInUseSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_in_use_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceInUseSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_live_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceLiveSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_live_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceLiveSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_parse_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceParseSkippedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_parse_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceParseSkippedBytes.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_zombie_marked"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceZombieMarkedSegments.Load()) + stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_zombie_marked"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceZombieMarkedBytes.Load()) stats["treedb.cache.vlog_retained_prune.pressure_bytes"] = fmt.Sprintf("%d", db.retainedPrunePressureBytes()) stats["treedb.cache.vlog_retained_prune.schedule_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleRequests.Load()) stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleForcedRequests.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 9f822397c..98d41c068 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ 
b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -736,7 +736,9 @@ type rewriteBudgetRecordingBackend struct { rewriteResponse backenddb.ValueLogRewriteStats rewriteErr error gcCalls int + gcOpts []backenddb.ValueLogGCOptions gcResponse backenddb.ValueLogGCStats + gcResponses []backenddb.ValueLogGCStats gcErr error } @@ -766,7 +768,18 @@ func (b *rewriteBudgetRecordingBackend) ValueLogRewriteOnline(ctx context.Contex func (b *rewriteBudgetRecordingBackend) ValueLogGC(ctx context.Context, opts backenddb.ValueLogGCOptions) (backenddb.ValueLogGCStats, error) { b.mu.Lock() b.gcCalls++ + b.gcOpts = append(b.gcOpts, cloneGCOptsForTest(opts)) stats := b.gcResponse + if len(b.gcResponses) > 0 { + idx := b.gcCalls - 1 + if idx < 0 { + idx = 0 + } + if idx >= len(b.gcResponses) { + idx = len(b.gcResponses) - 1 + } + stats = b.gcResponses[idx] + } err := b.gcErr b.mu.Unlock() return stats, err @@ -785,6 +798,15 @@ func cloneRewriteOptsForTest(opts backenddb.ValueLogRewriteOnlineOptions) backen return cloned } +func cloneGCOptsForTest(opts backenddb.ValueLogGCOptions) backenddb.ValueLogGCOptions { + cloned := opts + cloned.ProtectedPaths = append([]string(nil), opts.ProtectedPaths...) + cloned.ProtectedInUsePaths = append([]string(nil), opts.ProtectedInUsePaths...) + cloned.ProtectedRetainedPaths = append([]string(nil), opts.ProtectedRetainedPaths...) + cloned.ObservedSourceFileIDs = append([]uint32(nil), opts.ObservedSourceFileIDs...) 
+ return cloned +} + func (b *rewriteBudgetRecordingBackend) recordedPlan() (backenddb.ValueLogRewriteOnlineOptions, int) { b.mu.Lock() defer b.mu.Unlock() @@ -794,7 +816,31 @@ func (b *rewriteBudgetRecordingBackend) recordedPlan() (backenddb.ValueLogRewrit func (b *rewriteBudgetRecordingBackend) recordedGC() (backenddb.ValueLogGCStats, int) { b.mu.Lock() defer b.mu.Unlock() - return b.gcResponse, b.gcCalls + stats := b.gcResponse + if len(b.gcResponses) > 0 && b.gcCalls > 0 { + idx := b.gcCalls - 1 + if idx >= len(b.gcResponses) { + idx = len(b.gcResponses) - 1 + } + stats = b.gcResponses[idx] + } + return stats, b.gcCalls +} + +func (b *rewriteBudgetRecordingBackend) recordedGCObservedSourceCalls() int { + b.mu.Lock() + defer b.mu.Unlock() + count := 0 + for _, opts := range b.gcOpts { + if opts.DryRun { + continue + } + if len(opts.ObservedSourceFileIDs) == 0 { + continue + } + count++ + } + return count } func openRewriteQueueTestDB(t *testing.T, dir string, recorder *rewriteBudgetRecordingBackend) (*DB, func()) { @@ -899,7 +945,7 @@ func TestVlogGenerationMaintenance_SerializesConcurrentRuns(t *testing.T) { // While the first pass is still inside rewrite, a concurrent pass should be // skipped by the maintenance-active gate instead of issuing a second rewrite. 
- db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + db.maybeRunVlogGenerationMaintenanceWithOptions(false, vlogGenerationMaintenanceOptions{ bypassQuiet: true, skipRetainedPruneWait: true, skipCheckpoint: true, @@ -1018,6 +1064,76 @@ func TestVlogGenerationRewrite_QueuedExecIgnoresForegroundCancelUntilBoundedComp } } +func TestVlogGenerationRewrite_ObservedSourceRetainedBlock_RunsSecondGC(t *testing.T) { + prepareDirectSchedulerTest(t) + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + rewriteResponse: backenddb.ValueLogRewriteStats{ + BytesBefore: 128, + BytesAfter: 128, + RecordsCopied: 1, + SourceSegmentsRequested: 1, + SourceSegmentsStillReferenced: 0, + SourceSegmentsUnreferenced: 1, + }, + gcResponses: []backenddb.ValueLogGCStats{ + { + BytesProtectedRetained: 64, + BytesEligible: 0, + ObservedSourceSegments: 1, + ObservedSourceSegmentsReferenced: 0, + ObservedSourceSegmentsEligible: 0, + ObservedSourceSegmentsProtectedRetained: 1, + ObservedSourceBytesProtectedRetained: 64, + }, + { + BytesProtectedRetained: 0, + BytesEligible: 64, + BytesDeleted: 64, + ObservedSourceSegments: 1, + ObservedSourceSegmentsEligible: 1, + ObservedSourceSegmentsDeleted: 1, + ObservedSourceBytesEligible: 64, + ObservedSourceBytesDeleted: 64, + }, + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + skipRetainedPrune(db) + + if err := db.setVlogGenerationRewriteQueue([]uint32{11}); err != nil { + t.Fatalf("seed rewrite queue: %v", err) + } + db.vlogGenerationRewriteBudgetTokensBytes.Store(1024) + forceVlogMaintenanceIdle(db) + forceRetainedPruneIdle(db) + + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: true, + rewriteDebtDrain: true, + }) + + if 
got := recorder.recordedGCObservedSourceCalls(); got != 2 { + t.Fatalf("observed-source gc calls=%d want 2 when observed source is retained-blocked", got) + } + if got := db.vlogGenerationLastGCObservedSourceSegmentsEligible.Load(); got != 1 { + t.Fatalf("last observed source eligible segments=%d want 1 after second gc", got) + } + if got := db.vlogGenerationLastGCObservedSourceBytesDeleted.Load(); got != 64 { + t.Fatalf("last observed source deleted bytes=%d want 64 after second gc", got) + } +} + func TestVlogGenerationRewrite_FreshPlanExecIgnoresForegroundCancelUntilBoundedComplete(t *testing.T) { prepareDirectSchedulerTest(t) From d0898f78f35d88afb5879fdb4178ca1ef37cf2a3 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 03:57:17 -1000 Subject: [PATCH 24/61] caching: replay observed-source gc after retained prune --- TreeDB/caching/db.go | 103 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 100 insertions(+), 3 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index a6fcc3705..d1907120b 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4639,6 +4639,61 @@ func (db *DB) takeRetainedPruneObservedSourceIDs() map[uint32]struct{} { return out } +func (db *DB) queueVlogGenerationObservedSourceGCList(ids []uint32) { + if db == nil || len(ids) == 0 { + return + } + db.vlogGenerationObservedGCMu.Lock() + if db.vlogGenerationObservedGCSourceIDs == nil { + db.vlogGenerationObservedGCSourceIDs = make(map[uint32]struct{}, len(ids)) + } + for _, id := range ids { + if id == 0 { + continue + } + db.vlogGenerationObservedGCSourceIDs[id] = struct{}{} + } + db.vlogGenerationObservedGCMu.Unlock() +} + +func (db *DB) queueVlogGenerationObservedSourceGCIDs(ids map[uint32]struct{}) { + if db == nil || len(ids) == 0 { + return + } + db.vlogGenerationObservedGCMu.Lock() + if db.vlogGenerationObservedGCSourceIDs == nil { + db.vlogGenerationObservedGCSourceIDs = make(map[uint32]struct{}, len(ids)) + } + for id := range ids { + if 
id == 0 { + continue + } + db.vlogGenerationObservedGCSourceIDs[id] = struct{}{} + } + db.vlogGenerationObservedGCMu.Unlock() +} + +func (db *DB) takeVlogGenerationObservedSourceGCList() []uint32 { + if db == nil { + return nil + } + db.vlogGenerationObservedGCMu.Lock() + if len(db.vlogGenerationObservedGCSourceIDs) == 0 { + db.vlogGenerationObservedGCMu.Unlock() + return nil + } + out := make([]uint32, 0, len(db.vlogGenerationObservedGCSourceIDs)) + for id := range db.vlogGenerationObservedGCSourceIDs { + if id == 0 { + continue + } + out = append(out, id) + } + db.vlogGenerationObservedGCSourceIDs = nil + db.vlogGenerationObservedGCMu.Unlock() + return out +} + func (db *DB) scheduleRetainedValueLogPrune() { db.scheduleRetainedValueLogPruneWithForce(false) } @@ -4722,6 +4777,7 @@ func (db *DB) scheduleRetainedValueLogPruneWithForce(force bool) { if len(observedSourceIDs) > 0 && (pruneStats.ObservedSourceZombieMarkedSegments > 0 || pruneStats.ObservedSourceRemovedSegments > 0) { // When a retained prune processes rewrite-observed source segments, // queue a near-term maintenance pass so GC can re-check reclaim state. + db.queueVlogGenerationObservedSourceGCIDs(observedSourceIDs) db.vlogGenerationCheckpointKickPending.Store(true) } }() @@ -5557,6 +5613,8 @@ type DB struct { retainedPruneForceRequested atomic.Bool retainedPruneObservedMu sync.Mutex retainedPruneObservedSourceIDs map[uint32]struct{} + vlogGenerationObservedGCMu sync.Mutex + vlogGenerationObservedGCSourceIDs map[uint32]struct{} retainedPruneMu sync.Mutex retainedPruneDone chan struct{} vlogGenerationRemapSuccesses atomic.Uint64 @@ -14599,10 +14657,23 @@ planned: gcStats = gcStatsAfterPrune } } + if len(processedRewriteIDs) > 0 && + gcStats.ObservedSourceSegments > 0 && + gcStats.ObservedSourceSegmentsProtectedRetained > 0 && + gcStats.ObservedSourceSegmentsEligible == 0 { + // Rewrite-selected source segments remained retained-protected + // after in-pass prune/GC. 
Queue an observed-source replay GC for + // the next maintenance pass. + db.queueVlogGenerationObservedSourceGCList(processedRewriteIDs) + db.vlogGenerationCheckpointKickPending.Store(true) + } if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { // Retained-path protection can starve live reclaim even when rewrite // processed stale payload in-pass. Kick an eager retained prune so // lifecycle pins can drain without waiting for byte-pressure gates. + if len(processedRewriteIDs) > 0 { + db.queueRetainedPruneObservedSourceIDs(processedRewriteIDs) + } db.scheduleRetainedValueLogPruneForce() } } @@ -14780,26 +14851,34 @@ planned: return } + observedSourceGCIDs := db.takeVlogGenerationObservedSourceGCList() + forceObservedSourceGC := len(observedSourceGCIDs) > 0 if envBool(envDisableVlogGenerationGC) { + if forceObservedSourceGC { + db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) + } return } // GC is a best-effort background maintenance task. It requires a checkpoint // barrier to be safe, and that barrier can be very expensive during sustained // ingest/restore when the flush queue is non-empty. Avoid introducing long // stalls by only running the GC path when the cached write queue is drained. 
- if queueLen != 0 { + if queueLen != 0 && !forceObservedSourceGC { return } gcer, ok := db.backend.(backendValueLogGCer) if !ok { + if forceObservedSourceGC { + db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) + } return } - needEligibilityEstimate := !runGC && !db.shouldRunVlogGenerationGC(retained, reclaimable, churnBps) + needEligibilityEstimate := !runGC && !forceObservedSourceGC && !db.shouldRunVlogGenerationGC(retained, reclaimable, churnBps) now = time.Now() lastGC := db.vlogGenerationLastGCUnixNano.Load() if lastGC > 0 { lastAt := time.Unix(0, lastGC) - if now.Sub(lastAt) < vlogGenerationGCMinInterval { + if !forceObservedSourceGC && now.Sub(lastAt) < vlogGenerationGCMinInterval { return } } @@ -14822,6 +14901,9 @@ planned: db.vlogGenerationLastGCUnixNano.Store(now.UnixNano()) ctx, cancel := db.foregroundMaintenanceContext(30 * time.Second) gcOpts := db.valueLogGCOptions(false) + if forceObservedSourceGC { + gcOpts.ObservedSourceFileIDs = append([]uint32(nil), observedSourceGCIDs...) + } gcStart := time.Now() gcStats, err := gcer.ValueLogGC(ctx, gcOpts) cancel() @@ -14833,8 +14915,20 @@ planned: if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { // When GC classifies all reclaim blockers as retained-path protection, // trigger an eager retained prune pass to release stale lifecycle pins. 
+ if forceObservedSourceGC { + db.queueRetainedPruneObservedSourceIDs(observedSourceGCIDs) + } db.scheduleRetainedValueLogPruneForce() } + if forceObservedSourceGC && + gcStats.ObservedSourceSegments > 0 && + gcStats.ObservedSourceSegmentsProtectedRetained > 0 && + gcStats.ObservedSourceSegmentsEligible == 0 { + db.queueRetainedPruneObservedSourceIDs(observedSourceGCIDs) + db.scheduleRetainedValueLogPruneForce() + db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) + db.vlogGenerationCheckpointKickPending.Store(true) + } db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) db.vlogGenerationGCRuns.Add(1) if gcStats.SegmentsDeleted > 0 { @@ -14846,6 +14940,9 @@ planned: return nil }) if err != nil { + if forceObservedSourceGC { + db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) + } if errors.Is(err, context.Canceled) { db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) return From 60c1639deee7e67b6fe14a6008b505d09bb1d01b Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 04:13:48 -1000 Subject: [PATCH 25/61] caching: instrument observed-source replay gc queue --- TreeDB/caching/db.go | 43 ++++++++++ .../caching/vlog_generation_scheduler_test.go | 80 +++++++++++++++++++ 2 files changed, 123 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index d1907120b..bb407ff5f 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4647,13 +4647,22 @@ func (db *DB) queueVlogGenerationObservedSourceGCList(ids []uint32) { if db.vlogGenerationObservedGCSourceIDs == nil { db.vlogGenerationObservedGCSourceIDs = make(map[uint32]struct{}, len(ids)) } + added := 0 for _, id := range ids { if id == 0 { continue } + if _, exists := db.vlogGenerationObservedGCSourceIDs[id]; exists { + continue + } db.vlogGenerationObservedGCSourceIDs[id] = struct{}{} + added++ } db.vlogGenerationObservedGCMu.Unlock() + if added > 0 { + db.vlogGenerationObservedGCQueuedBatches.Add(1) + 
db.vlogGenerationObservedGCQueuedIDs.Add(uint64(added)) + } } func (db *DB) queueVlogGenerationObservedSourceGCIDs(ids map[uint32]struct{}) { @@ -4664,13 +4673,22 @@ func (db *DB) queueVlogGenerationObservedSourceGCIDs(ids map[uint32]struct{}) { if db.vlogGenerationObservedGCSourceIDs == nil { db.vlogGenerationObservedGCSourceIDs = make(map[uint32]struct{}, len(ids)) } + added := 0 for id := range ids { if id == 0 { continue } + if _, exists := db.vlogGenerationObservedGCSourceIDs[id]; exists { + continue + } db.vlogGenerationObservedGCSourceIDs[id] = struct{}{} + added++ } db.vlogGenerationObservedGCMu.Unlock() + if added > 0 { + db.vlogGenerationObservedGCQueuedBatches.Add(1) + db.vlogGenerationObservedGCQueuedIDs.Add(uint64(added)) + } } func (db *DB) takeVlogGenerationObservedSourceGCList() []uint32 { @@ -4691,6 +4709,10 @@ func (db *DB) takeVlogGenerationObservedSourceGCList() []uint32 { } db.vlogGenerationObservedGCSourceIDs = nil db.vlogGenerationObservedGCMu.Unlock() + if len(out) > 0 { + db.vlogGenerationObservedGCTakenBatches.Add(1) + db.vlogGenerationObservedGCTakenIDs.Add(uint64(len(out))) + } return out } @@ -5615,6 +5637,12 @@ type DB struct { retainedPruneObservedSourceIDs map[uint32]struct{} vlogGenerationObservedGCMu sync.Mutex vlogGenerationObservedGCSourceIDs map[uint32]struct{} + vlogGenerationObservedGCQueuedBatches atomic.Uint64 + vlogGenerationObservedGCQueuedIDs atomic.Uint64 + vlogGenerationObservedGCTakenBatches atomic.Uint64 + vlogGenerationObservedGCTakenIDs atomic.Uint64 + vlogGenerationObservedGCRuns atomic.Uint64 + vlogGenerationObservedGCRetryQueued atomic.Uint64 retainedPruneMu sync.Mutex retainedPruneDone chan struct{} vlogGenerationRemapSuccesses atomic.Uint64 @@ -14855,6 +14883,7 @@ planned: forceObservedSourceGC := len(observedSourceGCIDs) > 0 if envBool(envDisableVlogGenerationGC) { if forceObservedSourceGC { + db.vlogGenerationObservedGCRetryQueued.Add(1) db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) } return 
@@ -14869,6 +14898,7 @@ planned: gcer, ok := db.backend.(backendValueLogGCer) if !ok { if forceObservedSourceGC { + db.vlogGenerationObservedGCRetryQueued.Add(1) db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) } return @@ -14903,6 +14933,7 @@ planned: gcOpts := db.valueLogGCOptions(false) if forceObservedSourceGC { gcOpts.ObservedSourceFileIDs = append([]uint32(nil), observedSourceGCIDs...) + db.vlogGenerationObservedGCRuns.Add(1) } gcStart := time.Now() gcStats, err := gcer.ValueLogGC(ctx, gcOpts) @@ -14926,6 +14957,7 @@ planned: gcStats.ObservedSourceSegmentsEligible == 0 { db.queueRetainedPruneObservedSourceIDs(observedSourceGCIDs) db.scheduleRetainedValueLogPruneForce() + db.vlogGenerationObservedGCRetryQueued.Add(1) db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) db.vlogGenerationCheckpointKickPending.Store(true) } @@ -14941,6 +14973,7 @@ planned: }) if err != nil { if forceObservedSourceGC { + db.vlogGenerationObservedGCRetryQueued.Add(1) db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) } if errors.Is(err, context.Canceled) { @@ -20462,6 +20495,9 @@ func (db *DB) Stats() map[string]string { } } db.vlogGenerationRewriteQueueMu.Unlock() + db.vlogGenerationObservedGCMu.Lock() + observedGCPending := len(db.vlogGenerationObservedGCSourceIDs) + db.vlogGenerationObservedGCMu.Unlock() rewriteAgeBlockedUntilNS := db.vlogGenerationRewriteAgeBlockedUntilNS.Load() rewriteAgeBlockedRemainingMS := int64(0) if rewriteAgeBlockedUntilNS > 0 { @@ -20648,6 +20684,13 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.runs"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.segments"] = fmt.Sprintf("%d", db.vlogGenerationRewritePlanPenaltyFilterSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"] = fmt.Sprintf("%d", 
db.vlogGenerationRewritePlanPenaltyFilterToEmpty.Load()) + stats["treedb.cache.vlog_generation.observed_gc.pending_ids"] = fmt.Sprintf("%d", observedGCPending) + stats["treedb.cache.vlog_generation.observed_gc.queued_batches"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCQueuedBatches.Load()) + stats["treedb.cache.vlog_generation.observed_gc.queued_ids"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCQueuedIDs.Load()) + stats["treedb.cache.vlog_generation.observed_gc.taken_batches"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCTakenBatches.Load()) + stats["treedb.cache.vlog_generation.observed_gc.taken_ids"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCTakenIDs.Load()) + stats["treedb.cache.vlog_generation.observed_gc.runs"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRuns.Load()) + stats["treedb.cache.vlog_generation.observed_gc.retry_queued"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRetryQueued.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 98d41c068..c05292311 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -6,6 +6,7 @@ import ( "fmt" "os" "path/filepath" + "sort" "sync" "testing" "time" @@ -1134,6 +1135,52 @@ func TestVlogGenerationRewrite_ObservedSourceRetainedBlock_RunsSecondGC(t *testi } } +func TestVlogGenerationObservedSourceGCQueue_CountersAndDedupe(t *testing.T) { + db := &DB{} + + 
db.queueVlogGenerationObservedSourceGCList([]uint32{7, 9, 7, 0}) + db.queueVlogGenerationObservedSourceGCIDs(map[uint32]struct{}{ + 0: {}, + 9: {}, + 12: {}, + }) + + if got := db.vlogGenerationObservedGCQueuedBatches.Load(); got != 2 { + t.Fatalf("queued batches=%d want 2", got) + } + if got := db.vlogGenerationObservedGCQueuedIDs.Load(); got != 3 { + t.Fatalf("queued ids=%d want 3", got) + } + + ids := db.takeVlogGenerationObservedSourceGCList() + sort.Slice(ids, func(i, j int) bool { return ids[i] < ids[j] }) + want := []uint32{7, 9, 12} + if len(ids) != len(want) { + t.Fatalf("taken ids len=%d want %d (%v)", len(ids), len(want), ids) + } + for i := range ids { + if ids[i] != want[i] { + t.Fatalf("taken ids[%d]=%d want %d (all=%v)", i, ids[i], want[i], ids) + } + } + + if got := db.vlogGenerationObservedGCTakenBatches.Load(); got != 1 { + t.Fatalf("taken batches=%d want 1", got) + } + if got := db.vlogGenerationObservedGCTakenIDs.Load(); got != uint64(len(want)) { + t.Fatalf("taken ids=%d want %d", got, len(want)) + } + + // Empty take should not mutate taken counters. 
+ _ = db.takeVlogGenerationObservedSourceGCList() + if got := db.vlogGenerationObservedGCTakenBatches.Load(); got != 1 { + t.Fatalf("taken batches after empty take=%d want 1", got) + } + if got := db.vlogGenerationObservedGCTakenIDs.Load(); got != uint64(len(want)) { + t.Fatalf("taken ids after empty take=%d want %d", got, len(want)) + } +} + func TestVlogGenerationRewrite_FreshPlanExecIgnoresForegroundCancelUntilBoundedComplete(t *testing.T) { prepareDirectSchedulerTest(t) @@ -5971,6 +6018,12 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationRewriteProcessedStaleBytes.Store(450) db.vlogGenerationRewriteNoReclaimRuns.Store(3) db.vlogGenerationRewriteNoReclaimStaleBytes.Store(320) + db.vlogGenerationObservedGCQueuedBatches.Store(5) + db.vlogGenerationObservedGCQueuedIDs.Store(12) + db.vlogGenerationObservedGCTakenBatches.Store(4) + db.vlogGenerationObservedGCTakenIDs.Store(9) + db.vlogGenerationObservedGCRuns.Store(3) + db.vlogGenerationObservedGCRetryQueued.Store(2) db.vlogGenerationRewriteQueueMu.Lock() db.vlogGenerationRewriteQueueLoaded = true @@ -5985,6 +6038,12 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationRewriteStagePending = true db.vlogGenerationRewriteStageObservedUnixNano = 1234 db.vlogGenerationRewriteQueueMu.Unlock() + db.vlogGenerationObservedGCMu.Lock() + db.vlogGenerationObservedGCSourceIDs = map[uint32]struct{}{ + 101: {}, + 102: {}, + } + db.vlogGenerationObservedGCMu.Unlock() stats := db.Stats() if got := stats["treedb.cache.vlog_generation.maintenance.pass.total_ms"]; got != "40.000" { @@ -6227,4 +6286,25 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes"]; got != "320" { t.Fatalf("rewrite no reclaim stale bytes=%q want 320", got) } + if got := stats["treedb.cache.vlog_generation.observed_gc.pending_ids"]; got != "2" { + t.Fatalf("observed 
gc pending ids=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.queued_batches"]; got != "5" { + t.Fatalf("observed gc queued batches=%q want 5", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.queued_ids"]; got != "12" { + t.Fatalf("observed gc queued ids=%q want 12", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.taken_batches"]; got != "4" { + t.Fatalf("observed gc taken batches=%q want 4", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.taken_ids"]; got != "9" { + t.Fatalf("observed gc taken ids=%q want 9", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.runs"]; got != "3" { + t.Fatalf("observed gc runs=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.retry_queued"]; got != "2" { + t.Fatalf("observed gc retry queued=%q want 2", got) + } } From c07a887e3655033b528f0e195537fec0ddb54ff0 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 04:45:16 -1000 Subject: [PATCH 26/61] caching: keep bypass-quiet gc alive under foreground resume --- TreeDB/caching/db.go | 79 ++++++++++++++++++- .../caching/vlog_generation_scheduler_test.go | 65 +++++++++++++++ 2 files changed, 142 insertions(+), 2 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index bb407ff5f..287f08e17 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -14516,7 +14516,7 @@ planned: if len(processedRewriteIDs) > 0 { ctx, cancel = context.WithTimeout(context.Background(), vlogGenerationRewriteBoundedExecTimeout) } else { - ctx, cancel = db.foregroundMaintenanceContext(2 * time.Minute) + ctx, cancel = db.vlogGenerationMaintenanceContext(2*time.Minute, opts) } db.debugVlogMaintf( "rewrite_exec reason=%s source_ids=%d max_segments=%d budget_tokens=%d max_source_bytes=%d min_stale_ratio=%.6f queue_len=%d ledger_live_bytes=%d", @@ -14882,6 +14882,12 @@ planned: observedSourceGCIDs := 
db.takeVlogGenerationObservedSourceGCList() forceObservedSourceGC := len(observedSourceGCIDs) > 0 if envBool(envDisableVlogGenerationGC) { + db.debugVlogMaintf( + "gc_skip reason=disabled_env run_gc=%t force_observed=%t observed_ids=%d", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + ) if forceObservedSourceGC { db.vlogGenerationObservedGCRetryQueued.Add(1) db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) @@ -14893,10 +14899,22 @@ planned: // ingest/restore when the flush queue is non-empty. Avoid introducing long // stalls by only running the GC path when the cached write queue is drained. if queueLen != 0 && !forceObservedSourceGC { + db.debugVlogMaintf( + "gc_skip reason=queue_not_drained run_gc=%t queue_len=%d force_observed=%t", + runGC, + queueLen, + forceObservedSourceGC, + ) return } gcer, ok := db.backend.(backendValueLogGCer) if !ok { + db.debugVlogMaintf( + "gc_skip reason=backend_no_gcer run_gc=%t force_observed=%t observed_ids=%d", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + ) if forceObservedSourceGC { db.vlogGenerationObservedGCRetryQueued.Add(1) db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) @@ -14909,6 +14927,14 @@ planned: if lastGC > 0 { lastAt := time.Unix(0, lastGC) if !forceObservedSourceGC && now.Sub(lastAt) < vlogGenerationGCMinInterval { + db.debugVlogMaintf( + "gc_skip reason=min_interval run_gc=%t force_observed=%t observed_ids=%d since_ms=%.3f min_ms=%.3f", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + float64(now.Sub(lastAt).Microseconds())/1000, + float64(vlogGenerationGCMinInterval.Microseconds())/1000, + ) return } } @@ -14924,24 +14950,59 @@ planned: return fmt.Errorf("generational gc dry-run: %w", err) } if gcStats.BytesEligible < vlogGenerationGCMinBytes && gcStats.SegmentsEligible == 0 { + db.debugVlogMaintf( + "gc_skip reason=below_eligibility_floor run_gc=%t force_observed=%t eligible_bytes=%d eligible_segments=%d min_bytes=%d", + runGC, + 
forceObservedSourceGC, + gcStats.BytesEligible, + gcStats.SegmentsEligible, + vlogGenerationGCMinBytes, + ) return nil } } now := time.Now() db.vlogGenerationLastGCUnixNano.Store(now.UnixNano()) - ctx, cancel := db.foregroundMaintenanceContext(30 * time.Second) + ctx, cancel := db.vlogGenerationMaintenanceContext(30*time.Second, opts) gcOpts := db.valueLogGCOptions(false) if forceObservedSourceGC { gcOpts.ObservedSourceFileIDs = append([]uint32(nil), observedSourceGCIDs...) db.vlogGenerationObservedGCRuns.Add(1) } + db.debugVlogMaintf( + "gc_run start run_gc=%t force_observed=%t observed_ids=%d need_estimate=%t", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + needEligibilityEstimate, + ) gcStart := time.Now() gcStats, err := gcer.ValueLogGC(ctx, gcOpts) cancel() db.observeVlogGenerationGCExecDuration(time.Since(gcStart)) if err != nil { + db.debugVlogMaintf( + "gc_run err run_gc=%t force_observed=%t observed_ids=%d err=%v", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + err, + ) return fmt.Errorf("generational gc: %w", err) } + db.debugVlogMaintf( + "gc_run done run_gc=%t force_observed=%t observed_ids=%d deleted_segments=%d deleted_bytes=%d protected_retained_bytes=%d observed_segments=%d observed_eligible=%d observed_deleted=%d observed_protected_retained=%d", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + gcStats.SegmentsDeleted, + gcStats.BytesDeleted, + gcStats.BytesProtectedRetained, + gcStats.ObservedSourceSegments, + gcStats.ObservedSourceSegmentsEligible, + gcStats.ObservedSourceSegmentsDeleted, + gcStats.ObservedSourceSegmentsProtectedRetained, + ) db.observeVlogGenerationGCStats(gcStats) if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { // When GC classifies all reclaim blockers as retained-path protection, @@ -14955,6 +15016,13 @@ planned: gcStats.ObservedSourceSegments > 0 && gcStats.ObservedSourceSegmentsProtectedRetained > 0 && 
gcStats.ObservedSourceSegmentsEligible == 0 { + db.debugVlogMaintf( + "gc_observed_retry reason=retained_protected observed_ids=%d observed_segments=%d observed_protected_retained=%d observed_eligible=%d", + len(observedSourceGCIDs), + gcStats.ObservedSourceSegments, + gcStats.ObservedSourceSegmentsProtectedRetained, + gcStats.ObservedSourceSegmentsEligible, + ) db.queueRetainedPruneObservedSourceIDs(observedSourceGCIDs) db.scheduleRetainedValueLogPruneForce() db.vlogGenerationObservedGCRetryQueued.Add(1) @@ -14972,6 +15040,13 @@ planned: return nil }) if err != nil { + db.debugVlogMaintf( + "gc_maintenance_err run_gc=%t force_observed=%t observed_ids=%d err=%v", + runGC, + forceObservedSourceGC, + len(observedSourceGCIDs), + err, + ) if forceObservedSourceGC { db.vlogGenerationObservedGCRetryQueued.Add(1) db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index c05292311..9c5cd67ff 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -741,6 +741,7 @@ type rewriteBudgetRecordingBackend struct { gcResponse backenddb.ValueLogGCStats gcResponses []backenddb.ValueLogGCStats gcErr error + gcFn func(context.Context, backenddb.ValueLogGCOptions) (backenddb.ValueLogGCStats, error) } func (b *rewriteBudgetRecordingBackend) ValueLogRewritePlan(ctx context.Context, opts backenddb.ValueLogRewriteOnlineOptions) (backenddb.ValueLogRewritePlan, error) { @@ -770,6 +771,7 @@ func (b *rewriteBudgetRecordingBackend) ValueLogGC(ctx context.Context, opts bac b.mu.Lock() b.gcCalls++ b.gcOpts = append(b.gcOpts, cloneGCOptsForTest(opts)) + customFn := b.gcFn stats := b.gcResponse if len(b.gcResponses) > 0 { idx := b.gcCalls - 1 @@ -783,6 +785,9 @@ func (b *rewriteBudgetRecordingBackend) ValueLogGC(ctx context.Context, opts bac } err := b.gcErr b.mu.Unlock() + if customFn != nil { + return 
customFn(ctx, opts) + } return stats, err } @@ -1181,6 +1186,66 @@ func TestVlogGenerationObservedSourceGCQueue_CountersAndDedupe(t *testing.T) { } } +func TestVlogGenerationMaintenance_ObservedSourceGCBypassQuietIgnoresForegroundResume(t *testing.T) { + prepareDirectSchedulerTest(t) + t.Setenv(envDisableVlogGenerationRewrite, "1") + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + gcFn: func(ctx context.Context, _ backenddb.ValueLogGCOptions) (backenddb.ValueLogGCStats, error) { + select { + case <-time.After(200 * time.Millisecond): + if err := ctx.Err(); err != nil { + return backenddb.ValueLogGCStats{}, err + } + return backenddb.ValueLogGCStats{}, nil + case <-ctx.Done(): + return backenddb.ValueLogGCStats{}, ctx.Err() + } + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + skipRetainedPrune(db) + + db.queueVlogGenerationObservedSourceGCList([]uint32{11}) + db.vlogGenerationLastGCUnixNano.Store(time.Now().Add(-time.Minute).UnixNano()) + forceVlogMaintenanceIdle(db) + + go func() { + time.Sleep(30 * time.Millisecond) + hot := time.Now().UnixNano() + db.lastForegroundWriteUnixNano.Store(hot) + db.lastForegroundReadUnixNano.Store(hot) + }() + + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: true, + rewriteDebtDrain: true, + }) + + if got := recorder.recordedGCObservedSourceCalls(); got != 1 { + t.Fatalf("observed-source gc calls=%d want 1", got) + } + if got := db.vlogGenerationGCRuns.Load(); got != 1 { + t.Fatalf("gc runs=%d want 1", got) + } + if got := db.vlogGenerationObservedGCRetryQueued.Load(); got != 0 { + t.Fatalf("observed-source gc retry queued=%d want 0", got) + } + if pending := len(db.takeVlogGenerationObservedSourceGCList()); pending != 0 { + 
t.Fatalf("observed-source gc pending ids=%d want 0", pending) + } +} + func TestVlogGenerationRewrite_FreshPlanExecIgnoresForegroundCancelUntilBoundedComplete(t *testing.T) { prepareDirectSchedulerTest(t) From 76938986a104ba5b89bb163f7ed25a818130a73a Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 05:19:28 -1000 Subject: [PATCH 27/61] tools: add live vlog maintenance capacity analyzer --- docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 21 + scripts/analyze_vlog_maintenance_capacity.py | 513 +++++++++++++++++++ worklog/2026-03-28.md | 33 ++ 3 files changed, 567 insertions(+) create mode 100755 scripts/analyze_vlog_maintenance_capacity.py create mode 100644 worklog/2026-03-28.md diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 052bd806d..2d0a98274 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -32,6 +32,27 @@ Primary keys: - `treedb.cache.vlog_generation.vacuum.runs` - `treedb.cache.vlog_generation.vacuum.failures` +## Live Run Capacity Report +For `run_celestia`-style runs, analyze the latest diagnostics snapshot with: + +```bash +./scripts/analyze_vlog_maintenance_capacity.py +``` + +Optional explicit input: + +```bash +./scripts/analyze_vlog_maintenance_capacity.py ~/.celestia-app-mainnet-treedb- +./scripts/analyze_vlog_maintenance_capacity.py ~/.celestia-app-mainnet-treedb-/sync/diagnostics/.debug_vars.json +``` + +The report highlights: +- maintenance lane pressure (attempt/acquire/collision + skip mix) +- rewrite plan-to-exec realization +- stale-bytes processed vs immediate reclaim +- observed-source replay drain +- GC eligibility/protection signals + ## Bench Commands ### Churn sanity (TreeDB) ```bash diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py new file mode 100755 index 000000000..07aa4c926 --- /dev/null +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -0,0 +1,513 @@ 
+#!/usr/bin/env python3 +"""Summarize live TreeDB vlog maintenance capacity from run_celestia diagnostics. + +Input can be: +- a run home dir (e.g. ~/.celestia-app-mainnet-treedb-YYYY...) +- a diagnostics dir +- a debug vars JSON file + +By default, the script scans the newest ~/.celestia-app-mainnet-treedb-* home. +""" + +from __future__ import annotations + +import argparse +import glob +import json +import math +import os +import sys +from pathlib import Path +from typing import Any + + +def human_bytes(value: float) -> str: + if value is None or math.isnan(value): + return "n/a" + n = float(value) + if n < 0: + return f"-{human_bytes(-n)}" + units = ["B", "KiB", "MiB", "GiB", "TiB"] + idx = 0 + while n >= 1024.0 and idx < len(units) - 1: + n /= 1024.0 + idx += 1 + if idx == 0: + return f"{int(n)} {units[idx]}" + return f"{n:.2f} {units[idx]}" + + +def pct(num: float, den: float) -> float: + if den <= 0: + return 0.0 + return 100.0 * num / den + + +def safe_int(value: Any, default: int = 0) -> int: + if isinstance(value, bool): + return int(value) + if isinstance(value, int): + return value + if isinstance(value, float): + return int(value) + if isinstance(value, str): + s = value.strip().lower() + if not s: + return default + if s == "true": + return 1 + if s == "false": + return 0 + try: + return int(s) + except ValueError: + try: + return int(float(s)) + except ValueError: + return default + return default + + +def safe_float(value: Any, default: float = 0.0) -> float: + if isinstance(value, bool): + return float(int(value)) + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + s = value.strip().lower() + if not s: + return default + if s == "true": + return 1.0 + if s == "false": + return 0.0 + try: + return float(s) + except ValueError: + return default + return default + + +def pick_latest(paths: list[Path]) -> Path | None: + if not paths: + return None + return max(paths, key=lambda p: p.stat().st_mtime) + + +def 
find_latest_home() -> Path | None: + homes: list[Path] = [] + for raw in glob.glob(os.path.expanduser("~/.celestia-app-mainnet-treedb-*")): + p = Path(raw) + if p.is_dir(): + homes.append(p) + return pick_latest(homes) + + +def find_diagnostics_file(root: Path) -> Path | None: + roots: list[Path] = [] + if (root / "sync" / "diagnostics").is_dir(): + roots.append(root / "sync" / "diagnostics") + if (root / "diagnostics").is_dir(): + roots.append(root / "diagnostics") + if root.is_dir() and root.name == "diagnostics": + roots.append(root) + + patterns = ["*.debug_vars.json", "*.treedb_vars.json", "*.treedb_application_vars.json"] + + # Prefer richer payload shapes in order. Ignore obviously empty snapshots. + for pat in patterns: + candidates: list[Path] = [] + for diag in roots: + candidates.extend(diag.glob(pat)) + # If caller passed a file-like path prefix directory with JSON files only. + if root.is_dir() and not roots: + candidates.extend(root.glob(pat)) + candidates.sort(key=lambda p: p.stat().st_mtime, reverse=True) + for cand in candidates: + # "{}\n" snapshots are not useful for maintenance analysis. + if cand.stat().st_size <= 4: + continue + return cand + + # Fallback: if all snapshots are tiny/empty, still return the newest one. + fallback: list[Path] = [] + for pat in patterns: + for diag in roots: + fallback.extend(diag.glob(pat)) + if root.is_dir() and not roots: + fallback.extend(root.glob(pat)) + return pick_latest(fallback) + + +def find_home_from_path(path: Path) -> str: + for parent in [path] + list(path.parents): + name = parent.name + if name.startswith(".celestia-app-mainnet-"): + return str(parent) + return "" + + +def choose_instance(instances: dict[str, Any], pattern: str) -> tuple[str, dict[str, Any]]: + if not instances: + return "", {} + + if pattern: + matches = [(k, v) for k, v in instances.items() if pattern in k and isinstance(v, dict)] + if matches: + # Prefer the richest stats object among matches. 
+ matches.sort(key=lambda kv: len(kv[1]), reverse=True) + return matches[0][0], matches[0][1] + + scored: list[tuple[int, int, str, dict[str, Any]]] = [] + for k, v in instances.items(): + if not isinstance(v, dict): + continue + vg_count = sum(1 for key in v.keys() if str(key).startswith("treedb.cache.vlog_generation.")) + scored.append((vg_count, len(v), k, v)) + if scored: + scored.sort(reverse=True) + _, _, k, v = scored[0] + return k, v + + first_key = sorted(instances.keys())[0] + val = instances[first_key] + if isinstance(val, dict): + return first_key, val + return first_key, {} + + +def extract_stats(payload: Any, instance_pattern: str) -> tuple[dict[str, Any], str]: + if not isinstance(payload, dict): + return {}, "" + + # Most complete shape from debug vars snapshots: + # { "treedb": { "instances": { "...": { stats... } } } } + treedb = payload.get("treedb") + if isinstance(treedb, dict): + instances = treedb.get("instances") + if isinstance(instances, dict): + instance_name, stats = choose_instance(instances, instance_pattern) + return stats, instance_name + + # Flat stats map shape. + if any(str(k).startswith("treedb.cache.") for k in payload.keys()): + return payload, "" + + # Other possible shape: top-level instances. 
+ instances = payload.get("instances") + if isinstance(instances, dict): + instance_name, stats = choose_instance(instances, instance_pattern) + return stats, instance_name + + return {}, "" + + +def metric_int(stats: dict[str, Any], key: str) -> int: + return safe_int(stats.get(key, 0), 0) + + +def metric_float(stats: dict[str, Any], key: str) -> float: + return safe_float(stats.get(key, 0.0), 0.0) + + +def build_summary(stats: dict[str, Any]) -> dict[str, Any]: + m = { + "maintenance_attempts": metric_int(stats, "treedb.cache.vlog_generation.maintenance.attempts"), + "maintenance_acquired": metric_int(stats, "treedb.cache.vlog_generation.maintenance.acquired"), + "maintenance_collisions": metric_int(stats, "treedb.cache.vlog_generation.maintenance.collisions"), + "maintenance_noop": metric_int(stats, "treedb.cache.vlog_generation.maintenance.passes.noop"), + "maintenance_with_rewrite": metric_int(stats, "treedb.cache.vlog_generation.maintenance.passes.with_rewrite"), + "maintenance_with_gc": metric_int(stats, "treedb.cache.vlog_generation.maintenance.passes.with_gc"), + "rewrite_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.runs"), + "rewrite_plan_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_runs"), + "rewrite_plan_selected": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected"), + "rewrite_plan_empty": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_empty"), + "rewrite_plan_selected_segments_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"), + "rewrite_exec_source_segments_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_total"), + "rewrite_plan_selected_bytes_stale": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale"), + "rewrite_processed_stale_bytes": metric_int(stats, "treedb.cache.vlog_generation.rewrite.processed_stale_bytes"), + "rewrite_processed_live_bytes": 
metric_int(stats, "treedb.cache.vlog_generation.rewrite.processed_live_bytes"), + "rewrite_bytes_in": metric_int(stats, "treedb.cache.vlog_generation.rewrite.bytes_in"), + "rewrite_bytes_out": metric_int(stats, "treedb.cache.vlog_generation.rewrite.bytes_out"), + "rewrite_reclaimed_bytes": metric_int(stats, "treedb.cache.vlog_generation.rewrite.reclaimed_bytes"), + "rewrite_no_reclaim_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.no_reclaim_runs"), + "rewrite_exec_total_ms": metric_float(stats, "treedb.cache.vlog_generation.rewrite.exec.total_ms"), + "rewrite_exec_avg_ms": metric_float(stats, "treedb.cache.vlog_generation.rewrite.exec.avg_ms"), + "rewrite_ledger_bytes_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.ledger_bytes_total"), + "rewrite_ledger_bytes_stale": metric_int(stats, "treedb.cache.vlog_generation.rewrite.ledger_bytes_stale"), + "rewrite_ledger_segments": metric_int(stats, "treedb.cache.vlog_generation.rewrite.ledger_segments"), + "rewrite_age_blocked_remaining_ms": metric_int(stats, "treedb.cache.vlog_generation.rewrite.age_blocked_remaining_ms"), + "rewrite_penalties_active": metric_int(stats, "treedb.cache.vlog_generation.rewrite.penalties_active"), + "rewrite_budget_consumed_bytes_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"), + "rewrite_budget_tokens_utilization_pct": metric_float(stats, "treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct"), + "gc_runs": metric_int(stats, "treedb.cache.vlog_generation.gc.runs"), + "gc_deleted_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.deleted_bytes"), + "gc_deleted_segments": metric_int(stats, "treedb.cache.vlog_generation.gc.deleted_segments"), + "gc_exec_total_ms": metric_float(stats, "treedb.cache.vlog_generation.gc.exec.total_ms"), + "gc_exec_avg_ms": metric_float(stats, "treedb.cache.vlog_generation.gc.exec.avg_ms"), + "gc_last_eligible_bytes": metric_int(stats, 
"treedb.cache.vlog_generation.gc.last_eligible_bytes"), + "gc_last_pending_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.last_pending_bytes"), + "gc_last_protected_retained_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.last_protected_retained_bytes"), + "observed_gc_pending_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.pending_ids"), + "observed_gc_queued_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.queued_ids"), + "observed_gc_taken_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.taken_ids"), + "observed_gc_runs": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.runs"), + "observed_gc_retry_queued": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.retry_queued"), + "checkpoint_kick_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.runs"), + "checkpoint_kick_gc_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.gc_runs"), + "checkpoint_kick_rewrite_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"), + } + + skip_keys = [ + "treedb.cache.vlog_generation.maintenance.skip.wal_on_periodic", + "treedb.cache.vlog_generation.maintenance.skip.maintenance_phase", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved", + "treedb.cache.vlog_generation.maintenance.skip.age_blocked_gate", + "treedb.cache.vlog_generation.maintenance.skip.priority_pending", + "treedb.cache.vlog_generation.maintenance.skip.quiet_window", + "treedb.cache.vlog_generation.maintenance.skip.before_first_checkpoint", + "treedb.cache.vlog_generation.maintenance.skip.checkpoint_inflight", + ] + skip_map = {k.split(".")[-1]: metric_int(stats, k) for k in skip_keys} + m["maintenance_skip"] = skip_map + m["maintenance_skip_total"] = sum(skip_map.values()) + + passes_total = 
m["maintenance_noop"] + m["maintenance_with_rewrite"] + m["maintenance_with_gc"] + m["maintenance_passes_total"] = passes_total + m["maintenance_acquire_rate_pct"] = pct(m["maintenance_acquired"], m["maintenance_attempts"]) + m["maintenance_collision_rate_pct"] = pct(m["maintenance_collisions"], m["maintenance_attempts"]) + m["maintenance_rewrite_pass_share_pct"] = pct(m["maintenance_with_rewrite"], passes_total) + m["maintenance_gc_pass_share_pct"] = pct(m["maintenance_with_gc"], passes_total) + + m["rewrite_plan_select_rate_pct"] = pct(m["rewrite_plan_selected"], m["rewrite_plan_runs"]) + m["rewrite_segment_realization_pct"] = pct( + m["rewrite_exec_source_segments_total"], + m["rewrite_plan_selected_segments_total"], + ) + m["rewrite_stale_selection_coverage_pct"] = pct( + m["rewrite_processed_stale_bytes"], + m["rewrite_plan_selected_bytes_stale"], + ) + m["rewrite_immediate_reclaim_pct"] = pct( + m["rewrite_reclaimed_bytes"], + m["rewrite_processed_stale_bytes"], + ) + m["rewrite_stale_not_reclaimed_bytes"] = max( + 0, + m["rewrite_processed_stale_bytes"] - m["rewrite_reclaimed_bytes"], + ) + rewrite_secs = m["rewrite_exec_total_ms"] / 1000.0 + m["rewrite_exec_throughput_bytes_per_sec"] = ( + (m["rewrite_bytes_in"] / rewrite_secs) if rewrite_secs > 0 else 0.0 + ) + + gc_secs = m["gc_exec_total_ms"] / 1000.0 + m["gc_delete_throughput_bytes_per_sec"] = ( + (m["gc_deleted_bytes"] / gc_secs) if gc_secs > 0 else 0.0 + ) + + m["observed_gc_drain_pct"] = pct(m["observed_gc_taken_ids"], m["observed_gc_queued_ids"]) + + return m + + +def print_report(summary: dict[str, Any], source_file: Path, run_home: str, instance_name: str) -> None: + print(f"Source file: {source_file}") + if run_home: + print(f"Run home: {run_home}") + if instance_name: + print(f"Instance: {instance_name}") + print("") + + print("Maintenance lane") + print( + " attempts/acquired/collisions: " + f"{summary['maintenance_attempts']} / {summary['maintenance_acquired']} / 
{summary['maintenance_collisions']} " + f"(acquire={summary['maintenance_acquire_rate_pct']:.1f}%, collision={summary['maintenance_collision_rate_pct']:.1f}%)" + ) + print( + " passes: " + f"total={summary['maintenance_passes_total']} " + f"noop={summary['maintenance_noop']} " + f"rewrite={summary['maintenance_with_rewrite']} " + f"gc={summary['maintenance_with_gc']} " + f"(rewrite_share={summary['maintenance_rewrite_pass_share_pct']:.1f}%, gc_share={summary['maintenance_gc_pass_share_pct']:.1f}%)" + ) + skips = summary["maintenance_skip"] + print( + " skip pressure: " + f"total={summary['maintenance_skip_total']} " + f"stage_gate={skips['stage_gate']} " + f"stage_not_due={skips['stage_gate_not_due']} " + f"age_blocked={skips['age_blocked_gate']} " + f"quiet={skips['quiet_window']} " + f"checkpoint={skips['checkpoint_inflight']}" + ) + print("") + + print("Rewrite economics") + print( + " plan runs/selected/empty: " + f"{summary['rewrite_plan_runs']} / {summary['rewrite_plan_selected']} / {summary['rewrite_plan_empty']} " + f"(select_rate={summary['rewrite_plan_select_rate_pct']:.1f}%)" + ) + print( + " selected->executed segments: " + f"{summary['rewrite_plan_selected_segments_total']} -> {summary['rewrite_exec_source_segments_total']} " + f"(realization={summary['rewrite_segment_realization_pct']:.1f}%)" + ) + print( + " selected stale vs processed stale: " + f"{human_bytes(summary['rewrite_plan_selected_bytes_stale'])} -> {human_bytes(summary['rewrite_processed_stale_bytes'])} " + f"(coverage={summary['rewrite_stale_selection_coverage_pct']:.1f}%)" + ) + print( + " bytes in/out/reclaimed: " + f"{human_bytes(summary['rewrite_bytes_in'])} / {human_bytes(summary['rewrite_bytes_out'])} / {human_bytes(summary['rewrite_reclaimed_bytes'])}" + ) + print( + " stale processed w/o immediate reclaim: " + f"{human_bytes(summary['rewrite_stale_not_reclaimed_bytes'])} " + f"(immediate_reclaim={summary['rewrite_immediate_reclaim_pct']:.2f}%, 
no_reclaim_runs={summary['rewrite_no_reclaim_runs']})" + ) + print( + " exec: " + f"runs={summary['rewrite_runs']} total_ms={summary['rewrite_exec_total_ms']:.3f} avg_ms={summary['rewrite_exec_avg_ms']:.3f} " + f"throughput={human_bytes(summary['rewrite_exec_throughput_bytes_per_sec'])}/s" + ) + print( + " debt/budget: " + f"ledger={human_bytes(summary['rewrite_ledger_bytes_total'])} (stale={human_bytes(summary['rewrite_ledger_bytes_stale'])}, segs={summary['rewrite_ledger_segments']}) " + f"age_blocked_ms={summary['rewrite_age_blocked_remaining_ms']} penalties={summary['rewrite_penalties_active']} " + f"budget_consumed={human_bytes(summary['rewrite_budget_consumed_bytes_total'])} " + f"budget_util={summary['rewrite_budget_tokens_utilization_pct']:.1f}%" + ) + print("") + + print("GC economics") + print( + " runs/deleted: " + f"{summary['gc_runs']} / {summary['gc_deleted_segments']} segments, {human_bytes(summary['gc_deleted_bytes'])}" + ) + print( + " exec: " + f"total_ms={summary['gc_exec_total_ms']:.3f} avg_ms={summary['gc_exec_avg_ms']:.3f} " + f"delete_throughput={human_bytes(summary['gc_delete_throughput_bytes_per_sec'])}/s" + ) + print( + " last eligibility/protection: " + f"eligible={human_bytes(summary['gc_last_eligible_bytes'])} " + f"pending={human_bytes(summary['gc_last_pending_bytes'])} " + f"protected_retained={human_bytes(summary['gc_last_protected_retained_bytes'])}" + ) + print( + " checkpoint-kick: " + f"runs={summary['checkpoint_kick_runs']} rewrite_runs={summary['checkpoint_kick_rewrite_runs']} gc_runs={summary['checkpoint_kick_gc_runs']}" + ) + print("") + + print("Observed-source replay") + print( + " queued/taken/pending ids: " + f"{summary['observed_gc_queued_ids']} / {summary['observed_gc_taken_ids']} / {summary['observed_gc_pending_ids']} " + f"(drain={summary['observed_gc_drain_pct']:.1f}%, retries={summary['observed_gc_retry_queued']}, runs={summary['observed_gc_runs']})" + ) + + print("") + notes: list[str] = [] + if 
summary["rewrite_processed_stale_bytes"] > 0 and summary["rewrite_reclaimed_bytes"] == 0: + notes.append("rewrite copied stale bytes but immediate reclaim is zero; inspect GC eligibility/protection and post-run rewrite window") + if summary["observed_gc_pending_ids"] > 0: + notes.append("observed-source GC backlog still pending; may need longer run window or higher checkpoint-kick pressure") + if summary["maintenance_collision_rate_pct"] > 20.0: + notes.append("maintenance collision rate is high; lane contention may be throttling rewrite/GC progress") + if summary["rewrite_segment_realization_pct"] < 60.0 and summary["rewrite_plan_selected_segments_total"] > 0: + notes.append("rewrite segment realization is low; staged debt is being selected faster than executed") + if not notes: + notes.append("no obvious maintenance-lane bottleneck signature in this snapshot") + + print("Signals") + for note in notes: + print(f" - {note}") + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser(description="Analyze TreeDB live vlog maintenance capacity from run_celestia diagnostics") + p.add_argument( + "input", + nargs="?", + help="run home dir, diagnostics dir, or debug vars JSON file (default: latest ~/.celestia-app-mainnet-treedb-*)", + ) + p.add_argument( + "--instance-pattern", + default="application.db", + help="prefer instance names containing this substring when debug_vars has multiple DB instances", + ) + p.add_argument("--json", action="store_true", help="emit JSON summary instead of text report") + return p.parse_args() + + +def resolve_source(input_arg: str | None) -> Path: + if input_arg: + p = Path(os.path.expanduser(input_arg)).resolve() + if not p.exists(): + raise FileNotFoundError(f"input does not exist: {p}") + if p.is_file(): + return p + src = find_diagnostics_file(p) + if src is None: + raise FileNotFoundError(f"no diagnostics JSON found under: {p}") + return src + + home = find_latest_home() + if home is None: + raise 
FileNotFoundError("no ~/.celestia-app-mainnet-treedb-* directories found") + src = find_diagnostics_file(home) + if src is None: + raise FileNotFoundError(f"no diagnostics JSON found under: {home}") + return src + + +def main() -> int: + args = parse_args() + try: + source = resolve_source(args.input) + except FileNotFoundError as exc: + print(f"error: {exc}", file=sys.stderr) + return 2 + + try: + payload = json.loads(source.read_text(encoding="utf-8")) + except Exception as exc: + print(f"error: failed to parse JSON from {source}: {exc}", file=sys.stderr) + return 2 + + stats, instance_name = extract_stats(payload, args.instance_pattern) + if not stats: + print( + "error: could not extract treedb stats map from JSON (expected debug_vars shape or flat stats map)", + file=sys.stderr, + ) + return 2 + + summary = build_summary(stats) + run_home = find_home_from_path(source) + + if args.json: + out = { + "source_file": str(source), + "run_home": run_home, + "instance": instance_name, + "summary": summary, + } + print(json.dumps(out, indent=2, sort_keys=True)) + else: + print_report(summary, source, run_home, instance_name) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md new file mode 100644 index 000000000..ecef3eff7 --- /dev/null +++ b/worklog/2026-03-28.md @@ -0,0 +1,33 @@ +# 2026-03-28 + +- Added a repeatable live-maintenance capacity analyzer: + - `scripts/analyze_vlog_maintenance_capacity.py` + - Input modes: + - latest run home auto-discovery (default) + - explicit run home dir + - explicit diagnostics JSON snapshot + - Prefers `*.debug_vars.json` snapshots and the `application.db` instance in multi-instance payloads. 
+ - Emits derived signals for: + - maintenance lane pressure (attempt/acquire/collision + skip mix) + - rewrite plan select rate and selected->executed realization + - selected stale bytes vs processed stale bytes + - immediate reclaim ratio + - observed-source replay queue drain + - GC eligibility/protection summary + +- Updated runbook docs with command + usage: + - `docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md` + - Added `Live Run Capacity Report` section. + +- Validation run (existing Celestia home): + - command: + - `./scripts/analyze_vlog_maintenance_capacity.py /home/mikers/.celestia-app-mainnet-treedb-20260328050437` + - key outputs: + - `maintenance attempts/acquired/collisions = 74 / 74 / 0` + - `rewrite plan runs/selected/empty = 9 / 4 / 5` + - `selected->executed segments = 14 -> 7 (50.0%)` + - `selected stale -> processed stale = 2.91 GiB -> 1.46 GiB (50.0%)` + - `rewrite reclaimed bytes = 0 B` with `processed stale = 1.46 GiB` + - `observed-source queued/taken/pending ids = 29 / 29 / 0` + - interpretation: + - forced observed-source replay now drains cleanly, but the dominant remaining bottleneck is still zero immediate reclaim despite substantial stale rewrite processing. From 431d323f84a87851611739b70671164b17513352 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 05:30:48 -1000 Subject: [PATCH 28/61] worklog: record high-budget live rewrite run --- worklog/2026-03-28.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index ecef3eff7..86925f34a 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -31,3 +31,29 @@ - `observed-source queued/taken/pending ids = 29 / 29 / 0` - interpretation: - forced observed-source replay now drains cleanly, but the dominant remaining bottleneck is still zero immediate reclaim despite substantial stale rewrite processing. 
+ +- Live run: higher rewrite budget pass (same fast profile) + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328052003` + - analyzer readout: + - `./scripts/analyze_vlog_maintenance_capacity.py /home/mikers/.celestia-app-mainnet-treedb-20260328052003` + - delta vs prior run (`/home/mikers/.celestia-app-mainnet-treedb-20260328050437`): + - `rewrite.exec.source_segments_total`: `7 -> 8` + - `rewrite.segment_realization_pct`: `50.0% -> 61.5%` + - `rewrite.processed_stale_bytes`: `1.46 GiB -> 1.60 GiB` + - `rewrite.bytes_in`: `300.41 MiB -> 408.30 MiB` + - `rewrite exec throughput`: `9.72 MiB/s -> 13.22 MiB/s` + - `rewrite.reclaimed_bytes`: stayed `0 B` + - interpretation: + - Higher budget improves rewrite execution throughput and plan-to-exec realization, but does not solve the core in-run reclaim issue (GC eligibility still zero at final snapshot). 
+ +- Post-run offline rewrite on the higher-budget home: + - command: + - `/home/mikers/dev/snissn/celestia-app-p4/build/treemap-local vlog-rewrite /home/mikers/.celestia-app-mainnet-treedb-20260328052003/data/application.db -rw` + - output: + - `vlog-rewrite: segments_before=16 segments_after=17 bytes_before=3496705485 bytes_after=2168049697 records=1011649` + - post-rewrite size/gzip: + - `du -sb`: `2208117397` + - `tar|gzip|wc -c`: `1781585169` From f587805ed0a24728b4049f740200d809ce22e5e5 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 05:55:10 -1000 Subject: [PATCH 29/61] caching: expose cumulative observed-source gc totals --- TreeDB/caching/db.go | 21 +++++++++ .../caching/vlog_generation_scheduler_test.go | 28 ++++++++++++ scripts/analyze_vlog_maintenance_capacity.py | 44 +++++++++++++++++++ worklog/2026-03-28.md | 35 +++++++++++++++ 4 files changed, 128 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 287f08e17..b654355df 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5643,6 +5643,13 @@ type DB struct { vlogGenerationObservedGCTakenIDs atomic.Uint64 vlogGenerationObservedGCRuns atomic.Uint64 vlogGenerationObservedGCRetryQueued atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsEligibleTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsDeletedTotal atomic.Uint64 + vlogGenerationObservedGCSourceBytesTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesEligibleTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesDeletedTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesProtectedRetainedTotal atomic.Int64 retainedPruneMu sync.Mutex retainedPruneDone chan struct{} vlogGenerationRemapSuccesses atomic.Uint64 @@ -13080,6 +13087,13 @@ func (db *DB) observeVlogGenerationGCStats(stats backenddb.ValueLogGCStats) { db.vlogGenerationLastGCObservedSourceBytesEligible.Store(stats.ObservedSourceBytesEligible) 
db.vlogGenerationLastGCObservedSourceBytesDeleted.Store(stats.ObservedSourceBytesDeleted) db.vlogGenerationLastGCObservedSourceBytesPending.Store(stats.ObservedSourceBytesPending) + db.vlogGenerationObservedGCSourceSegmentsTotal.Add(uint64(stats.ObservedSourceSegments)) + db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Add(uint64(stats.ObservedSourceSegmentsEligible)) + db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Add(uint64(stats.ObservedSourceSegmentsDeleted)) + db.vlogGenerationObservedGCSourceBytesTotal.Add(stats.ObservedSourceBytes) + db.vlogGenerationObservedGCSourceBytesEligibleTotal.Add(stats.ObservedSourceBytesEligible) + db.vlogGenerationObservedGCSourceBytesDeletedTotal.Add(stats.ObservedSourceBytesDeleted) + db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Add(stats.ObservedSourceBytesProtectedRetained) } func (db *DB) observeVlogGenerationVacuumExecDuration(d time.Duration) { @@ -20766,6 +20780,13 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.observed_gc.taken_ids"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCTakenIDs.Load()) stats["treedb.cache.vlog_generation.observed_gc.runs"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRuns.Load()) stats["treedb.cache.vlog_generation.observed_gc.retry_queued"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRetryQueued.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesTotal.Load()) + 
stats["treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesEligibleTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesDeletedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 9c5cd67ff..b2305abc4 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -6089,6 +6089,13 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationObservedGCTakenIDs.Store(9) db.vlogGenerationObservedGCRuns.Store(3) db.vlogGenerationObservedGCRetryQueued.Store(2) + db.vlogGenerationObservedGCSourceSegmentsTotal.Store(11) + db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Store(5) + db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Store(3) + db.vlogGenerationObservedGCSourceBytesTotal.Store(1100) + db.vlogGenerationObservedGCSourceBytesEligibleTotal.Store(500) + db.vlogGenerationObservedGCSourceBytesDeletedTotal.Store(300) + db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Store(250) db.vlogGenerationRewriteQueueMu.Lock() db.vlogGenerationRewriteQueueLoaded = 
true @@ -6372,4 +6379,25 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.observed_gc.retry_queued"]; got != "2" { t.Fatalf("observed gc retry queued=%q want 2", got) } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_total"]; got != "11" { + t.Fatalf("observed gc source segments total=%q want 11", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"]; got != "5" { + t.Fatalf("observed gc source segments eligible total=%q want 5", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"]; got != "3" { + t.Fatalf("observed gc source segments deleted total=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_total"]; got != "1100" { + t.Fatalf("observed gc source bytes total=%q want 1100", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total"]; got != "500" { + t.Fatalf("observed gc source bytes eligible total=%q want 500", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"]; got != "300" { + t.Fatalf("observed gc source bytes deleted total=%q want 300", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"]; got != "250" { + t.Fatalf("observed gc source bytes protected retained total=%q want 250", got) + } } diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py index 07aa4c926..509e2f508 100755 --- a/scripts/analyze_vlog_maintenance_capacity.py +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -253,6 +253,13 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "observed_gc_taken_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.taken_ids"), "observed_gc_runs": metric_int(stats, 
"treedb.cache.vlog_generation.observed_gc.runs"), "observed_gc_retry_queued": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.retry_queued"), + "observed_gc_source_segments_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_total"), + "observed_gc_source_segments_eligible_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"), + "observed_gc_source_segments_deleted_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"), + "observed_gc_source_bytes_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_total"), + "observed_gc_source_bytes_eligible_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total"), + "observed_gc_source_bytes_deleted_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"), + "observed_gc_source_bytes_protected_retained_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"), "checkpoint_kick_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.runs"), "checkpoint_kick_gc_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.gc_runs"), "checkpoint_kick_rewrite_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"), @@ -309,6 +316,26 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: ) m["observed_gc_drain_pct"] = pct(m["observed_gc_taken_ids"], m["observed_gc_queued_ids"]) + m["observed_gc_source_segments_eligible_pct"] = pct( + m["observed_gc_source_segments_eligible_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_segments_deleted_pct"] = pct( + m["observed_gc_source_segments_deleted_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_bytes_eligible_pct"] = pct( + m["observed_gc_source_bytes_eligible_total"], + 
m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_deleted_pct"] = pct( + m["observed_gc_source_bytes_deleted_total"], + m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_deleted_of_eligible_pct"] = pct( + m["observed_gc_source_bytes_deleted_total"], + m["observed_gc_source_bytes_eligible_total"], + ) return m @@ -414,6 +441,23 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"{summary['observed_gc_queued_ids']} / {summary['observed_gc_taken_ids']} / {summary['observed_gc_pending_ids']} " f"(drain={summary['observed_gc_drain_pct']:.1f}%, retries={summary['observed_gc_retry_queued']}, runs={summary['observed_gc_runs']})" ) + print( + " observed-source totals: " + f"segments total={summary['observed_gc_source_segments_total']} " + f"eligible={summary['observed_gc_source_segments_eligible_total']} " + f"deleted={summary['observed_gc_source_segments_deleted_total']} " + f"(eligible_pct={summary['observed_gc_source_segments_eligible_pct']:.1f}%, deleted_pct={summary['observed_gc_source_segments_deleted_pct']:.1f}%)" + ) + print( + " observed-source bytes: " + f"total={human_bytes(summary['observed_gc_source_bytes_total'])} " + f"eligible={human_bytes(summary['observed_gc_source_bytes_eligible_total'])} " + f"deleted={human_bytes(summary['observed_gc_source_bytes_deleted_total'])} " + f"protected_retained={human_bytes(summary['observed_gc_source_bytes_protected_retained_total'])} " + f"(eligible_pct={summary['observed_gc_source_bytes_eligible_pct']:.1f}%, " + f"deleted_pct={summary['observed_gc_source_bytes_deleted_pct']:.1f}%, " + f"deleted_of_eligible={summary['observed_gc_source_bytes_deleted_of_eligible_pct']:.1f}%)" + ) print("") notes: list[str] = [] diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index 86925f34a..4ae00e805 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -57,3 +57,38 @@ - post-rewrite size/gzip: - `du -sb`: `2208117397` - `tar|gzip|wc -c`: 
`1781585169` + +- Added observed-source GC cumulative totals to TreeDB stats + analyzer: + - new stats keys: + - `treedb.cache.vlog_generation.observed_gc.source_segments_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total` + - analyzer now reports observed-source cumulative eligible/deleted percentages. + +- Validation run with forced rewrite trigger to exercise observed-source path: + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=1073741824 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328054206` + - analyzer highlights: + - rewrite: `plan_selected_segments_total=2`, `exec.source_segments_total=2`, `processed_stale_bytes=475.06 MiB` + - `rewrite.reclaimed_bytes=0` + - observed replay queue: `queued/taken/pending ids = 14 / 14 / 0` + - observed-source totals: + - segments: `total=12 eligible=0 deleted=0` + - bytes: `total=3.00 GiB eligible=0 B deleted=0 B protected_retained=3.00 GiB` + - interpretation: + - This confirms the bottleneck signature in-run is observed-source bytes remaining retained-protected (never becoming GC-eligible in the measured window), not queue drain failure. 
+ +- Post-run offline checks on same home: + - `vlog-gc -rw`: + - `segments total=22 referenced=22 eligible=0 deleted=0 bytes_total=4737495161 bytes_eligible=0 bytes_deleted=0` + - `vlog-rewrite -rw`: + - `segments_before=22 segments_after=17 bytes_before=4737495161 bytes_after=2199392731 records=1021293` + - post-rewrite size/gzip: + - `du -sb`: `2239722809` + - `tar|gzip|wc -c`: `1805021465` From 4a959bb4adbff225018fb4380bb0fa69eacc2176 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 05:56:51 -1000 Subject: [PATCH 30/61] tools: include retained-prune outcomes in capacity report --- scripts/analyze_vlog_maintenance_capacity.py | 34 ++++++++++++++++++++ worklog/2026-03-28.md | 4 +++ 2 files changed, 38 insertions(+) diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py index 509e2f508..bc2df3e06 100755 --- a/scripts/analyze_vlog_maintenance_capacity.py +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -248,6 +248,19 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "gc_last_eligible_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.last_eligible_bytes"), "gc_last_pending_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.last_pending_bytes"), "gc_last_protected_retained_bytes": metric_int(stats, "treedb.cache.vlog_generation.gc.last_protected_retained_bytes"), + "retained_prune_closed_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.closed_bytes"), + "retained_prune_runs": metric_int(stats, "treedb.cache.vlog_retained_prune.runs"), + "retained_prune_forced_runs": metric_int(stats, "treedb.cache.vlog_retained_prune.forced_runs"), + "retained_prune_candidate_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.candidate_segments"), + "retained_prune_candidate_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.candidate_bytes"), + "retained_prune_removed_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.removed_segments"), + 
"retained_prune_removed_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.removed_bytes"), + "retained_prune_in_use_skipped_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.in_use_skipped_segments"), + "retained_prune_in_use_skipped_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.in_use_skipped_bytes"), + "retained_prune_live_skipped_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.live_skipped_segments"), + "retained_prune_live_skipped_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.live_skipped_bytes"), + "retained_prune_zombie_marked_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.zombie_marked_segments"), + "retained_prune_zombie_marked_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.zombie_marked_bytes"), "observed_gc_pending_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.pending_ids"), "observed_gc_queued_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.queued_ids"), "observed_gc_taken_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.taken_ids"), @@ -336,6 +349,14 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: m["observed_gc_source_bytes_deleted_total"], m["observed_gc_source_bytes_eligible_total"], ) + m["retained_prune_removed_candidate_segments_pct"] = pct( + m["retained_prune_removed_segments"], + m["retained_prune_candidate_segments"], + ) + m["retained_prune_removed_candidate_bytes_pct"] = pct( + m["retained_prune_removed_bytes"], + m["retained_prune_candidate_bytes"], + ) return m @@ -433,6 +454,19 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst " checkpoint-kick: " f"runs={summary['checkpoint_kick_runs']} rewrite_runs={summary['checkpoint_kick_rewrite_runs']} gc_runs={summary['checkpoint_kick_gc_runs']}" ) + print( + " retained-prune: " + f"runs={summary['retained_prune_runs']} forced={summary['retained_prune_forced_runs']} 
closed={human_bytes(summary['retained_prune_closed_bytes'])} " + f"candidates={summary['retained_prune_candidate_segments']} ({human_bytes(summary['retained_prune_candidate_bytes'])}) " + f"removed={summary['retained_prune_removed_segments']} ({human_bytes(summary['retained_prune_removed_bytes'])}) " + f"(seg_removed_pct={summary['retained_prune_removed_candidate_segments_pct']:.1f}%, bytes_removed_pct={summary['retained_prune_removed_candidate_bytes_pct']:.1f}%)" + ) + print( + " retained-prune skips: " + f"in_use={summary['retained_prune_in_use_skipped_segments']} ({human_bytes(summary['retained_prune_in_use_skipped_bytes'])}) " + f"live={summary['retained_prune_live_skipped_segments']} ({human_bytes(summary['retained_prune_live_skipped_bytes'])}) " + f"zombie_marked={summary['retained_prune_zombie_marked_segments']} ({human_bytes(summary['retained_prune_zombie_marked_bytes'])})" + ) print("") print("Observed-source replay") diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index 4ae00e805..5713c7665 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -81,6 +81,10 @@ - observed-source totals: - segments: `total=12 eligible=0 deleted=0` - bytes: `total=3.00 GiB eligible=0 B deleted=0 B protected_retained=3.00 GiB` + - retained-prune summary: + - `runs=2 forced=2 closed=4.75 GiB` + - `candidates=7 (1.75 GiB) removed=0` + - skips: `in_use=6`, `live=5 (1.25 GiB)`, `zombie_marked=2 (512 MiB)` - interpretation: - This confirms the bottleneck signature in-run is observed-source bytes remaining retained-protected (never becoming GC-eligible in the measured window), not queue drain failure. 
From e7dd2a3448482969ec233d0d967ed198b57ab5ff Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 06:19:05 -1000 Subject: [PATCH 31/61] caching: accelerate observed-source retained prune pacing --- TreeDB/caching/db.go | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index b654355df..7302e0b3e 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4639,6 +4639,16 @@ func (db *DB) takeRetainedPruneObservedSourceIDs() map[uint32]struct{} { return out } +func (db *DB) retainedPruneObservedSourcePending() bool { + if db == nil { + return false + } + db.retainedPruneObservedMu.Lock() + pending := len(db.retainedPruneObservedSourceIDs) > 0 + db.retainedPruneObservedMu.Unlock() + return pending +} + func (db *DB) queueVlogGenerationObservedSourceGCList(ids []uint32) { if db == nil || len(ids) == 0 { return @@ -4782,8 +4792,12 @@ func (db *DB) scheduleRetainedValueLogPruneWithForce(force bool) { } db.checkpointMu.Unlock() now := time.Now() + minInterval := retainedPruneMinInterval + if effectiveForce && db.retainedPruneObservedSourcePending() { + minInterval = retainedPruneObservedMinInterval + } last := db.retainedPruneLastStartUnixNano.Load() - if last > 0 && now.Sub(time.Unix(0, last)) < retainedPruneMinInterval { + if last > 0 && now.Sub(time.Unix(0, last)) < minInterval { db.retainedValueLogPruneScheduleSkipMinInterval.Add(1) return } @@ -5934,6 +5948,10 @@ const ( // Retained-path prune is opportunistic reclaim. Do not restart a full live-ID // scan on every periodic checkpoint during a hot workload. retainedPruneMinInterval = 30 * time.Second + // Rewrite-observed source IDs can quickly re-trigger forced retained-prune + // requests while replay GC is trying to converge. Allow a faster cadence for + // that targeted path without dropping the generic min-interval guard. 
+ retainedPruneObservedMinInterval = 3 * time.Second // Coordinate index vacuum with major rewrite windows; do not run on every GC. vlogGenerationVacuumTriggerRewriteBytes = int64(64 << 20) vlogGenerationVacuumMinInterval = 5 * time.Minute From e7ef33865a97d02e957d1148f5033fd67a2a9a3c Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 07:04:27 -1000 Subject: [PATCH 32/61] caching: add observed-prune and zombie lifecycle diagnostics --- TreeDB/caching/db.go | 80 +++++++++++++++++ TreeDB/internal/valuelog/manager.go | 44 ++++++++++ TreeDB/internal/valuelog/manager_test.go | 32 +++++++ docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 3 + scripts/analyze_vlog_maintenance_capacity.py | 91 ++++++++++++++++++++ worklog/2026-03-28.md | 66 ++++++++++++++ 6 files changed, 316 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 7302e0b3e..14241a7c0 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4243,6 +4243,48 @@ func (db *DB) observeRetainedValueLogPruneStats(pruneStats retainedValueLogPrune db.retainedValueLogPruneLastObservedSourceParseSkippedBytes.Store(pruneStats.ObservedSourceParseSkippedBytes) db.retainedValueLogPruneLastObservedSourceZombieMarkedSegments.Store(int64(pruneStats.ObservedSourceZombieMarkedSegments)) db.retainedValueLogPruneLastObservedSourceZombieMarkedBytes.Store(pruneStats.ObservedSourceZombieMarkedBytes) + if pruneStats.ObservedSourceSegments > 0 { + db.retainedValueLogPruneObservedSourceSegmentsTotal.Add(uint64(pruneStats.ObservedSourceSegments)) + } + if pruneStats.ObservedSourceBytes > 0 { + db.retainedValueLogPruneObservedSourceBytesTotal.Add(pruneStats.ObservedSourceBytes) + } + if pruneStats.ObservedSourceCandidateSegments > 0 { + db.retainedValueLogPruneObservedSourceCandidateSegmentsTotal.Add(uint64(pruneStats.ObservedSourceCandidateSegments)) + } + if pruneStats.ObservedSourceCandidateBytes > 0 { + 
db.retainedValueLogPruneObservedSourceCandidateBytesTotal.Add(pruneStats.ObservedSourceCandidateBytes) + } + if pruneStats.ObservedSourceRemovedSegments > 0 { + db.retainedValueLogPruneObservedSourceRemovedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceRemovedSegments)) + } + if pruneStats.ObservedSourceRemovedBytes > 0 { + db.retainedValueLogPruneObservedSourceRemovedBytesTotal.Add(pruneStats.ObservedSourceRemovedBytes) + } + if pruneStats.ObservedSourceInUseSkippedSegments > 0 { + db.retainedValueLogPruneObservedSourceInUseSkippedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceInUseSkippedSegments)) + } + if pruneStats.ObservedSourceInUseSkippedBytes > 0 { + db.retainedValueLogPruneObservedSourceInUseSkippedBytesTotal.Add(pruneStats.ObservedSourceInUseSkippedBytes) + } + if pruneStats.ObservedSourceLiveSkippedSegments > 0 { + db.retainedValueLogPruneObservedSourceLiveSkippedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceLiveSkippedSegments)) + } + if pruneStats.ObservedSourceLiveSkippedBytes > 0 { + db.retainedValueLogPruneObservedSourceLiveSkippedBytesTotal.Add(pruneStats.ObservedSourceLiveSkippedBytes) + } + if pruneStats.ObservedSourceParseSkippedSegments > 0 { + db.retainedValueLogPruneObservedSourceParseSkippedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceParseSkippedSegments)) + } + if pruneStats.ObservedSourceParseSkippedBytes > 0 { + db.retainedValueLogPruneObservedSourceParseSkippedBytesTotal.Add(pruneStats.ObservedSourceParseSkippedBytes) + } + if pruneStats.ObservedSourceZombieMarkedSegments > 0 { + db.retainedValueLogPruneObservedSourceZombieMarkedSegmentsTotal.Add(uint64(pruneStats.ObservedSourceZombieMarkedSegments)) + } + if pruneStats.ObservedSourceZombieMarkedBytes > 0 { + db.retainedValueLogPruneObservedSourceZombieMarkedBytesTotal.Add(pruneStats.ObservedSourceZombieMarkedBytes) + } if pruneStats.RetriedWithoutWriteGate { db.retainedValueLogPruneWriteGateRetries.Add(1) if pruneStats.RetrySucceeded { @@ -5637,6 +5679,20 @@ type DB 
struct { retainedValueLogPruneLastObservedSourceParseSkippedBytes atomic.Int64 retainedValueLogPruneLastObservedSourceZombieMarkedSegments atomic.Int64 retainedValueLogPruneLastObservedSourceZombieMarkedBytes atomic.Int64 + retainedValueLogPruneObservedSourceSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceCandidateSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceCandidateBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceRemovedSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceRemovedBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceInUseSkippedSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceInUseSkippedBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceLiveSkippedSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceLiveSkippedBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceParseSkippedSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceParseSkippedBytesTotal atomic.Int64 + retainedValueLogPruneObservedSourceZombieMarkedSegmentsTotal atomic.Uint64 + retainedValueLogPruneObservedSourceZombieMarkedBytesTotal atomic.Int64 retainedValueLogPruneScheduleRequests atomic.Uint64 retainedValueLogPruneScheduleForcedRequests atomic.Uint64 retainedValueLogPruneScheduleSkipClosing atomic.Uint64 @@ -20676,6 +20732,20 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_parse_skipped"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceParseSkippedBytes.Load()) stats["treedb.cache.vlog_retained_prune.last_observed_source.segments_zombie_marked"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceZombieMarkedSegments.Load()) stats["treedb.cache.vlog_retained_prune.last_observed_source.bytes_zombie_marked"] = fmt.Sprintf("%d", db.retainedValueLogPruneLastObservedSourceZombieMarkedBytes.Load()) + 
stats["treedb.cache.vlog_retained_prune.observed_source.segments_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_candidate_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceCandidateSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_candidate_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceCandidateBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_removed_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceRemovedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_removed_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceRemovedBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_in_use_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceInUseSkippedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_in_use_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceInUseSkippedBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_live_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceLiveSkippedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_live_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceLiveSkippedBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_parse_skipped_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceParseSkippedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_parse_skipped_total"] = fmt.Sprintf("%d", 
db.retainedValueLogPruneObservedSourceParseSkippedBytesTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.segments_zombie_marked_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceZombieMarkedSegmentsTotal.Load()) + stats["treedb.cache.vlog_retained_prune.observed_source.bytes_zombie_marked_total"] = fmt.Sprintf("%d", db.retainedValueLogPruneObservedSourceZombieMarkedBytesTotal.Load()) stats["treedb.cache.vlog_retained_prune.pressure_bytes"] = fmt.Sprintf("%d", db.retainedPrunePressureBytes()) stats["treedb.cache.vlog_retained_prune.schedule_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleRequests.Load()) stats["treedb.cache.vlog_retained_prune.schedule_forced_requests"] = fmt.Sprintf("%d", db.retainedValueLogPruneScheduleForcedRequests.Load()) @@ -21060,6 +21130,16 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_mmap.dead_mappings.cap_base"] = fmt.Sprintf("%d", valuelog.MaxDeadMappings) stats["treedb.cache.vlog_mmap.max_mapped_sealed_segments"] = fmt.Sprintf("%d", valuelog.MaxMappedSealedSegments) stats["treedb.cache.vlog_mmap.max_mapped_sealed_bytes"] = fmt.Sprintf("%d", valuelog.MaxMappedSealedBytes) + zombieSegments, zombieBytes, zombiePinnedSegments, zombiePinnedBytes, zombieUnpinnedSegments, zombieUnpinnedBytes := db.valueLogReader.ZombieStats() + stats["treedb.cache.vlog_zombie.segments"] = fmt.Sprintf("%d", zombieSegments) + stats["treedb.cache.vlog_zombie.bytes"] = fmt.Sprintf("%d", zombieBytes) + stats["treedb.cache.vlog_zombie.pinned_segments"] = fmt.Sprintf("%d", zombiePinnedSegments) + stats["treedb.cache.vlog_zombie.pinned_bytes"] = fmt.Sprintf("%d", zombiePinnedBytes) + stats["treedb.cache.vlog_zombie.unpinned_segments"] = fmt.Sprintf("%d", zombieUnpinnedSegments) + stats["treedb.cache.vlog_zombie.unpinned_bytes"] = fmt.Sprintf("%d", zombieUnpinnedBytes) + stats["treedb.process.memory.vlog_zombie_bytes_estimate"] = fmt.Sprintf("%d", zombieBytes) + 
stats["treedb.process.memory.vlog_zombie_pinned_bytes_estimate"] = fmt.Sprintf("%d", zombiePinnedBytes) + stats["treedb.process.memory.vlog_zombie_unpinned_bytes_estimate"] = fmt.Sprintf("%d", zombieUnpinnedBytes) stats["treedb.cache.vlog_mmap.active_segments"] = fmt.Sprintf("%d", cacheVlogMmap.activeSegments) stats["treedb.cache.vlog_mmap.active_bytes"] = fmt.Sprintf("%d", cacheVlogMmap.activeBytes) stats["treedb.cache.vlog_mmap.current_segments"] = fmt.Sprintf("%d", cacheVlogMmap.currentSegments) diff --git a/TreeDB/internal/valuelog/manager.go b/TreeDB/internal/valuelog/manager.go index 21c3d5663..230bcef3a 100644 --- a/TreeDB/internal/valuelog/manager.go +++ b/TreeDB/internal/valuelog/manager.go @@ -1290,6 +1290,50 @@ func (m *Manager) RemapStats() (remaps uint64, deadMappings uint64) { return remaps, deadMappings } +func valueLogFileSizeBestEffort(f *File) uint64 { + if f == nil { + return 0 + } + if known := f.fileSize.Load(); known > 0 { + return uint64(known) + } + if data, _ := f.mmapData.Load().([]byte); len(data) > 0 { + return uint64(len(data)) + } + if f.Path != "" { + if info, err := os.Stat(f.Path); err == nil && info.Size() > 0 { + return uint64(info.Size()) + } + } + return 0 +} + +// ZombieStats reports tracked zombie segments and their approximate byte totals. +// A zombie remains on disk until all snapshots release it (RefCount reaches 0). 
+func (m *Manager) ZombieStats() (segments uint64, bytes uint64, pinnedSegments uint64, pinnedBytes uint64, unpinnedSegments uint64, unpinnedBytes uint64) { + if m == nil { + return 0, 0, 0, 0, 0, 0 + } + m.mu.RLock() + for _, f := range m.files { + if f == nil || !f.IsZombie.Load() { + continue + } + segments++ + size := valueLogFileSizeBestEffort(f) + bytes += size + if f.RefCount.Load() > 0 { + pinnedSegments++ + pinnedBytes += size + continue + } + unpinnedSegments++ + unpinnedBytes += size + } + m.mu.RUnlock() + return segments, bytes, pinnedSegments, pinnedBytes, unpinnedSegments, unpinnedBytes +} + // MmapResidencyStats reports aggregate mmap residency split by segment type: // current writable segments, sealed segments, and dead mappings/bytes. func (m *Manager) MmapResidencyStats() (currentSegments uint64, currentBytes uint64, sealedSegments uint64, sealedBytes uint64, deadMappings uint64, deadBytes uint64) { diff --git a/TreeDB/internal/valuelog/manager_test.go b/TreeDB/internal/valuelog/manager_test.go index d6cd2e780..e2b3fd43c 100644 --- a/TreeDB/internal/valuelog/manager_test.go +++ b/TreeDB/internal/valuelog/manager_test.go @@ -92,6 +92,38 @@ func TestManagerMmapResidencyStatsAggregatesCounters(t *testing.T) { } } +func TestManagerZombieStatsAggregatesPinnedAndUnpinned(t *testing.T) { + mgr := &Manager{ + files: map[uint32]*File{ + 1: {}, + 2: {}, + 3: {}, + }, + } + // Zombie + pinned. + mgr.files[1].IsZombie.Store(true) + mgr.files[1].RefCount.Store(2) + mgr.files[1].fileSize.Store(100) + // Zombie + unpinned. + mgr.files[2].IsZombie.Store(true) + mgr.files[2].RefCount.Store(0) + mgr.files[2].fileSize.Store(200) + // Non-zombie should be ignored. 
+ mgr.files[3].RefCount.Store(9) + mgr.files[3].fileSize.Store(300) + + segments, bytes, pinnedSegments, pinnedBytes, unpinnedSegments, unpinnedBytes := mgr.ZombieStats() + if segments != 2 || bytes != 300 { + t.Fatalf("ZombieStats total mismatch: segments=%d bytes=%d want segments=2 bytes=300", segments, bytes) + } + if pinnedSegments != 1 || pinnedBytes != 100 { + t.Fatalf("ZombieStats pinned mismatch: segments=%d bytes=%d want segments=1 bytes=100", pinnedSegments, pinnedBytes) + } + if unpinnedSegments != 1 || unpinnedBytes != 200 { + t.Fatalf("ZombieStats unpinned mismatch: segments=%d bytes=%d want segments=1 bytes=200", unpinnedSegments, unpinnedBytes) + } +} + func TestManagerPromoteCurrentWritable_SwitchesPriorLaneSegmentToSealed(t *testing.T) { mgr := &Manager{ files: make(map[uint32]*File), diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 2d0a98274..3a783b9ce 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -49,8 +49,11 @@ Optional explicit input: The report highlights: - maintenance lane pressure (attempt/acquire/collision + skip mix) - rewrite plan-to-exec realization +- rewrite source outcomes (requested vs still-referenced vs unreferenced) - stale-bytes processed vs immediate reclaim - observed-source replay drain +- observed-source retained-prune outcomes (candidate/live-skipped/zombie-marked/removed) +- zombie inventory (pinned vs unpinned bytes) - GC eligibility/protection signals ## Bench Commands diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py index bc2df3e06..78c6731da 100755 --- a/scripts/analyze_vlog_maintenance_capacity.py +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -224,6 +224,12 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "rewrite_plan_empty": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_empty"), 
"rewrite_plan_selected_segments_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"), "rewrite_exec_source_segments_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_total"), + "rewrite_exec_source_segments_requested_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"), + "rewrite_exec_source_segments_still_referenced_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"), + "rewrite_exec_source_segments_unreferenced_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_total"), + "rewrite_exec_source_segments_requested_last": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_last"), + "rewrite_exec_source_segments_still_referenced_last": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_last"), + "rewrite_exec_source_segments_unreferenced_last": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_unreferenced_last"), "rewrite_plan_selected_bytes_stale": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale"), "rewrite_processed_stale_bytes": metric_int(stats, "treedb.cache.vlog_generation.rewrite.processed_stale_bytes"), "rewrite_processed_live_bytes": metric_int(stats, "treedb.cache.vlog_generation.rewrite.processed_live_bytes"), @@ -261,6 +267,26 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "retained_prune_live_skipped_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.live_skipped_bytes"), "retained_prune_zombie_marked_segments": metric_int(stats, "treedb.cache.vlog_retained_prune.zombie_marked_segments"), "retained_prune_zombie_marked_bytes": metric_int(stats, "treedb.cache.vlog_retained_prune.zombie_marked_bytes"), + "vlog_zombie_segments": metric_int(stats, 
"treedb.cache.vlog_zombie.segments"), + "vlog_zombie_bytes": metric_int(stats, "treedb.cache.vlog_zombie.bytes"), + "vlog_zombie_pinned_segments": metric_int(stats, "treedb.cache.vlog_zombie.pinned_segments"), + "vlog_zombie_pinned_bytes": metric_int(stats, "treedb.cache.vlog_zombie.pinned_bytes"), + "vlog_zombie_unpinned_segments": metric_int(stats, "treedb.cache.vlog_zombie.unpinned_segments"), + "vlog_zombie_unpinned_bytes": metric_int(stats, "treedb.cache.vlog_zombie.unpinned_bytes"), + "retained_prune_observed_source_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_total"), + "retained_prune_observed_source_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_total"), + "retained_prune_observed_source_candidate_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_candidate_total"), + "retained_prune_observed_source_candidate_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_candidate_total"), + "retained_prune_observed_source_removed_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_removed_total"), + "retained_prune_observed_source_removed_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_removed_total"), + "retained_prune_observed_source_in_use_skipped_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_in_use_skipped_total"), + "retained_prune_observed_source_in_use_skipped_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_in_use_skipped_total"), + "retained_prune_observed_source_live_skipped_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_live_skipped_total"), + "retained_prune_observed_source_live_skipped_bytes_total": metric_int(stats, 
"treedb.cache.vlog_retained_prune.observed_source.bytes_live_skipped_total"), + "retained_prune_observed_source_parse_skipped_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_parse_skipped_total"), + "retained_prune_observed_source_parse_skipped_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_parse_skipped_total"), + "retained_prune_observed_source_zombie_marked_segments_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.segments_zombie_marked_total"), + "retained_prune_observed_source_zombie_marked_bytes_total": metric_int(stats, "treedb.cache.vlog_retained_prune.observed_source.bytes_zombie_marked_total"), "observed_gc_pending_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.pending_ids"), "observed_gc_queued_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.queued_ids"), "observed_gc_taken_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.taken_ids"), @@ -306,6 +332,14 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: m["rewrite_exec_source_segments_total"], m["rewrite_plan_selected_segments_total"], ) + m["rewrite_source_unreferenced_pct"] = pct( + m["rewrite_exec_source_segments_unreferenced_total"], + m["rewrite_exec_source_segments_requested_total"], + ) + m["rewrite_source_still_referenced_pct"] = pct( + m["rewrite_exec_source_segments_still_referenced_total"], + m["rewrite_exec_source_segments_requested_total"], + ) m["rewrite_stale_selection_coverage_pct"] = pct( m["rewrite_processed_stale_bytes"], m["rewrite_plan_selected_bytes_stale"], @@ -357,6 +391,26 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: m["retained_prune_removed_bytes"], m["retained_prune_candidate_bytes"], ) + m["retained_prune_observed_removed_candidate_segments_pct"] = pct( + m["retained_prune_observed_source_removed_segments_total"], + m["retained_prune_observed_source_candidate_segments_total"], + ) + 
m["retained_prune_observed_removed_candidate_bytes_pct"] = pct( + m["retained_prune_observed_source_removed_bytes_total"], + m["retained_prune_observed_source_candidate_bytes_total"], + ) + m["retained_prune_observed_live_skipped_candidate_segments_pct"] = pct( + m["retained_prune_observed_source_live_skipped_segments_total"], + m["retained_prune_observed_source_candidate_segments_total"], + ) + m["retained_prune_observed_live_skipped_candidate_bytes_pct"] = pct( + m["retained_prune_observed_source_live_skipped_bytes_total"], + m["retained_prune_observed_source_candidate_bytes_total"], + ) + m["vlog_zombie_pinned_bytes_pct"] = pct( + m["vlog_zombie_pinned_bytes"], + m["vlog_zombie_bytes"], + ) return m @@ -406,6 +460,16 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"{summary['rewrite_plan_selected_segments_total']} -> {summary['rewrite_exec_source_segments_total']} " f"(realization={summary['rewrite_segment_realization_pct']:.1f}%)" ) + print( + " source outcomes (exec): " + f"requested_total={summary['rewrite_exec_source_segments_requested_total']} " + f"unreferenced_total={summary['rewrite_exec_source_segments_unreferenced_total']} " + f"still_referenced_total={summary['rewrite_exec_source_segments_still_referenced_total']} " + f"(unref_pct={summary['rewrite_source_unreferenced_pct']:.1f}%, still_ref_pct={summary['rewrite_source_still_referenced_pct']:.1f}%) " + f"last=requested:{summary['rewrite_exec_source_segments_requested_last']} " + f"unref:{summary['rewrite_exec_source_segments_unreferenced_last']} " + f"still_ref:{summary['rewrite_exec_source_segments_still_referenced_last']}" + ) print( " selected stale vs processed stale: " f"{human_bytes(summary['rewrite_plan_selected_bytes_stale'])} -> {human_bytes(summary['rewrite_processed_stale_bytes'])} " @@ -467,6 +531,13 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"live={summary['retained_prune_live_skipped_segments']} 
({human_bytes(summary['retained_prune_live_skipped_bytes'])}) " f"zombie_marked={summary['retained_prune_zombie_marked_segments']} ({human_bytes(summary['retained_prune_zombie_marked_bytes'])})" ) + print( + " zombie inventory: " + f"total={summary['vlog_zombie_segments']} ({human_bytes(summary['vlog_zombie_bytes'])}) " + f"pinned={summary['vlog_zombie_pinned_segments']} ({human_bytes(summary['vlog_zombie_pinned_bytes'])}) " + f"unpinned={summary['vlog_zombie_unpinned_segments']} ({human_bytes(summary['vlog_zombie_unpinned_bytes'])}) " + f"(pinned_bytes_pct={summary['vlog_zombie_pinned_bytes_pct']:.1f}%)" + ) print("") print("Observed-source replay") @@ -492,6 +563,19 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"deleted_pct={summary['observed_gc_source_bytes_deleted_pct']:.1f}%, " f"deleted_of_eligible={summary['observed_gc_source_bytes_deleted_of_eligible_pct']:.1f}%)" ) + print( + " observed-source retained-prune totals: " + f"seen={summary['retained_prune_observed_source_segments_total']} ({human_bytes(summary['retained_prune_observed_source_bytes_total'])}) " + f"candidate={summary['retained_prune_observed_source_candidate_segments_total']} ({human_bytes(summary['retained_prune_observed_source_candidate_bytes_total'])}) " + f"removed={summary['retained_prune_observed_source_removed_segments_total']} ({human_bytes(summary['retained_prune_observed_source_removed_bytes_total'])}) " + f"zombie_marked={summary['retained_prune_observed_source_zombie_marked_segments_total']} ({human_bytes(summary['retained_prune_observed_source_zombie_marked_bytes_total'])}) " + f"live_skipped={summary['retained_prune_observed_source_live_skipped_segments_total']} ({human_bytes(summary['retained_prune_observed_source_live_skipped_bytes_total'])}) " + f"in_use_skipped={summary['retained_prune_observed_source_in_use_skipped_segments_total']} ({human_bytes(summary['retained_prune_observed_source_in_use_skipped_bytes_total'])}) " + 
f"(removed_of_candidate={summary['retained_prune_observed_removed_candidate_segments_pct']:.1f}% seg / " + f"{summary['retained_prune_observed_removed_candidate_bytes_pct']:.1f}% bytes, " + f"live_skip_of_candidate={summary['retained_prune_observed_live_skipped_candidate_segments_pct']:.1f}% seg / " + f"{summary['retained_prune_observed_live_skipped_candidate_bytes_pct']:.1f}% bytes)" + ) print("") notes: list[str] = [] @@ -503,6 +587,13 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst notes.append("maintenance collision rate is high; lane contention may be throttling rewrite/GC progress") if summary["rewrite_segment_realization_pct"] < 60.0 and summary["rewrite_plan_selected_segments_total"] > 0: notes.append("rewrite segment realization is low; staged debt is being selected faster than executed") + if ( + summary["rewrite_exec_source_segments_unreferenced_total"] > 0 + and summary["retained_prune_observed_source_zombie_marked_segments_total"] > 0 + and summary["observed_gc_source_segments_deleted_total"] == 0 + and summary["vlog_zombie_segments"] == 0 + ): + notes.append("rewrite-selected sources became unreferenced and were zombie-marked, but GC delete counters stayed zero; reclaim likely happened via zombie lifecycle outside GC byte accounting") if not notes: notes.append("no obvious maintenance-lane bottleneck signature in this snapshot") diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index 5713c7665..00a52cf2b 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -96,3 +96,69 @@ - post-rewrite size/gzip: - `du -sb`: `2239722809` - `tar|gzip|wc -c`: `1805021465` + +- Added retained-prune observed-source cumulative counters (not just last-run snapshot): + - `treedb.cache.vlog_retained_prune.observed_source.segments_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_candidate_total` + - 
`treedb.cache.vlog_retained_prune.observed_source.bytes_candidate_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_removed_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_removed_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_in_use_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_in_use_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_live_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_live_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_parse_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_parse_skipped_total` + - `treedb.cache.vlog_retained_prune.observed_source.segments_zombie_marked_total` + - `treedb.cache.vlog_retained_prune.observed_source.bytes_zombie_marked_total` + +- Added value-log zombie inventory stats from manager: + - `treedb.cache.vlog_zombie.segments` + - `treedb.cache.vlog_zombie.bytes` + - `treedb.cache.vlog_zombie.pinned_segments` + - `treedb.cache.vlog_zombie.pinned_bytes` + - `treedb.cache.vlog_zombie.unpinned_segments` + - `treedb.cache.vlog_zombie.unpinned_bytes` + - plus process-memory estimates for zombie bytes. + +- Analyzer/report updates: + - Include rewrite source outcomes (`requested/still_referenced/unreferenced`). + - Include observed-source retained-prune cumulative outcomes. + - Include zombie inventory (pinned vs unpinned bytes). + - Add signal note when rewrite-selected segments become unreferenced and zombie-marked while GC delete counters remain zero. 
+ +- Validation run (`fast`, forced trigger, new counters) + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=134217728 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328063722` + - analyzer highlights: + - rewrite source outcomes: `requested_total=4 unreferenced_total=4 still_referenced_total=0` + - observed-source retained-prune totals: `seen=4 (1.00 GiB), candidate=4 (1.00 GiB), zombie_marked=4 (1.00 GiB), live_skipped=0` + - observed-source GC cumulative: `total=3.50 GiB, eligible=0, deleted=0, protected_retained=3.50 GiB` + - retained-prune global: `zombie_marked=4 (1.00 GiB)` + - interpretation: + - rewrite-selected source segments are becoming unreferenced and are then being zombie-marked in retained-prune; replay queue is draining. + - zero observed-source GC deleted bytes is not explained by queue starvation or live-skips on observed sources. + +- Second validation run with zombie inventory keys active: + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328065017` + - analyzer highlights: + - rewrite source outcomes: `requested_total=4 unreferenced_total=4 still_referenced_total=0` + - observed-source retained-prune totals: `seen=4, candidate=4, zombie_marked=4, removed=0, live_skipped=0` + - zombie inventory at final snapshot: `total=0, pinned=0, unpinned=0` + - observed-source GC cumulative still `eligible=0 deleted=0 protected_retained=2.75 GiB` + - interpretation: + - observed-source segments are zombie-marked and eventually not present as tracked zombies by run end, yet GC delete counters remain zero; this indicates reclaim is occurring outside current GC deleted-byte accounting and that the larger disk gap is primarily about how much stale data live rewrite selected during the run. 
+ +- Headroom check on same run (`20260328065017`) via offline rewrite: + - pre: `du -sb maindb/wal = 3805802931` + - command: + - `/home/mikers/dev/snissn/celestia-app-p4/build/treemap-local vlog-rewrite /home/mikers/.celestia-app-mainnet-treedb-20260328065017/data/application.db -rw` + - output: + - `vlog-rewrite: segments_before=20 segments_after=16 bytes_before=3805798835 bytes_after=2068426925 records=983187` + - post: `du -sb maindb/wal = 2068431021` + - implication: + - ~1.74 GiB additional compaction headroom remains versus end-of-live-run size under this workload. From a9e6fc3a9ae07801e33043a3a3d5fb599de0b6fa Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 07:42:36 -1000 Subject: [PATCH 33/61] caching: honor configured stale-ratio threshold in generic rewrite --- TreeDB/caching/db.go | 7 ++-- .../caching/vlog_generation_scheduler_test.go | 17 +++++--- worklog/2026-03-28.md | 42 +++++++++++++++++++ 3 files changed, 57 insertions(+), 9 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 14241a7c0..88dd47510 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -15440,11 +15440,10 @@ func (db *DB) vlogGenerationRewriteMinStaleRatioForGenericPass(totalBytes int64) if totalBytes < vlogGenerationRewriteEfficacyMinTotalBytes { return 0 } - ratio := vlogGenerationRewriteGenericMinSegmentStaleRatio - if configured := db.vlogGenerationRewriteMinStaleRatioForStaleRatioTrigger(totalBytes); configured > ratio { - ratio = configured + if configured := db.vlogGenerationRewriteMinStaleRatioForStaleRatioTrigger(totalBytes); configured > 0 { + return configured } - return ratio + return vlogGenerationRewriteGenericMinSegmentStaleRatio } func (db *DB) vlogGenerationRewriteMinStaleRatioForQueuedDebt(totalBytes int64, reason uint32) float64 { diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index b2305abc4..c3e0325e6 100644 --- 
a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -487,16 +487,16 @@ func TestRunVlogGenerationMaintenanceRetries_CoalescesPendingCollisionRetries(t } } -func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesQualityFloor(t *testing.T) { +func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesConfiguredTriggerRatio(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 200000} - if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { + if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), 0.50; got != want { t.Fatalf("generic min stale ratio=%f want=%f", got, want) } } -func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesHigherConfiguredRatio(t *testing.T) { +func TestVlogGenerationRewriteMinStaleRatioForGenericPass_UsesHigherConfiguredTriggerRatio(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 800000} - if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { + if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), 0.80; got != want { t.Fatalf("generic min stale ratio=%f want=%f", got, want) } } @@ -515,9 +515,16 @@ func TestVlogGenerationRewriteMinStaleRatioForGenericPass_DisabledBelowEfficacyF } } +func TestVlogGenerationRewriteMinStaleRatioForGenericPass_DefaultWithoutConfiguredTrigger(t *testing.T) { + db := &DB{valueLogRewriteTriggerRatioPPM: 0} + if got, want := db.vlogGenerationRewriteMinStaleRatioForGenericPass(8<<30), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { + t.Fatalf("generic min stale ratio=%f want=%f", got, want) + } +} + func TestVlogGenerationRewriteMinStaleRatioForQueuedDebt_UsesGenericFloorForTotalBytes(t *testing.T) { db := &DB{valueLogRewriteTriggerRatioPPM: 200000} - if got, want := 
db.vlogGenerationRewriteMinStaleRatioForQueuedDebt(8<<30, vlogGenerationReasonTotalBytes), vlogGenerationRewriteGenericMinSegmentStaleRatio; got != want { + if got, want := db.vlogGenerationRewriteMinStaleRatioForQueuedDebt(8<<30, vlogGenerationReasonTotalBytes), 0.50; got != want { t.Fatalf("queued total-bytes min stale ratio=%f want=%f", got, want) } } diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index 00a52cf2b..69db38543 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -162,3 +162,45 @@ - post: `du -sb maindb/wal = 2068431021` - implication: - ~1.74 GiB additional compaction headroom remains versus end-of-live-run size under this workload. + +- Stale-ratio trigger sweep (live run_celestia) to isolate rewrite-selection threshold impact: + - low stale ratio path (forces ~0.50 segment threshold): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=0 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=100000 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328070649` + - analyzer highlights: + - `selected->executed segments = 13 -> 7` + - `processed_stale_bytes = 1.51 GiB` + - end WAL: `3093987987` + - offline rewrite on same home: `3093983891 -> 2128313686` (`du -sb` post `2128317782`) + - high stale ratio control (~0.85 threshold): + - command: + - `... TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=0 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=850000 ...` + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328071500` + - analyzer highlights: + - `selected->executed segments = 4 -> 4` + - `processed_stale_bytes = 983.89 MiB` + - end WAL: `3944887635` + - interpretation: + - lower stale-threshold selection materially improves in-run compaction and closes offline headroom. 
+ +- Code change: allow explicitly configured stale-ratio trigger to drive generic/total-bytes rewrite segment selection threshold. + - file: `TreeDB/caching/db.go` + - changed `vlogGenerationRewriteMinStaleRatioForGenericPass` so when `TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM` is set, generic planning uses that configured threshold (with existing stale-ratio floor behavior), instead of always enforcing the stricter generic constant. + - default behavior remains unchanged when stale-ratio trigger is unset. + - tests updated in `TreeDB/caching/vlog_generation_scheduler_test.go`: + - generic pass uses configured trigger ratio when set + - queued debt under total-bytes reflects configured ratio + - default generic ratio remains unchanged when trigger ratio is unset + +- Validation run after code change with both triggers enabled: + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=536870912 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=134217728 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=100000 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328072830` + - analyzer highlights: + - `selected->executed segments = 8 -> 8` + - `processed_stale_bytes = 1.58 GiB` + - end WAL: `3320308275` (improved vs prior total-bytes-trigger baselines around `3.7-3.9 GiB`) + - offline rewrite on same home: `3320304179 -> 2132071399` (`du -sb` post `2132075495`) + - interpretation: + - the threshold change improves total-bytes-triggered live rewrite coverage while preserving trigger semantics. 
From 8e9a018a23cdcb4e311b6b9786c2ec90f7eb506f Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 08:52:34 -1000 Subject: [PATCH 34/61] caching: add pre-checkpoint rewrite override for WAL-off runs --- TreeDB/caching/db.go | 41 ++++- .../caching/vlog_generation_scheduler_test.go | 172 ++++++++++++++++++ docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 7 + scripts/analyze_vlog_maintenance_capacity.py | 2 + worklog/2026-03-28.md | 45 +++++ 5 files changed, 260 insertions(+), 7 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 88dd47510..575e4694e 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -1860,6 +1860,10 @@ const ( envDisableVlogGenerationVacuum = "TREEDB_DISABLE_VLOG_GENERATION_VACUUM" envDisableVlogGenerationLoop = "TREEDB_DISABLE_VLOG_GENERATION_LOOP" envDisableVlogGenerationCheckpointKick = "TREEDB_DISABLE_VLOG_GENERATION_CHECKPOINT_KICK" + // Experimental WAL-off override: allow rewrite planning/execution before the + // first explicit checkpoint. Disabled by default because it can add restore + // contention during early state-sync. + envEnableVlogGenerationPreCheckpointRewrite = "TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE" // Diagnostic toggle for WAL-off checkpoint-time sparse-index vacuum. envDisableCheckpointAutoVacuum = "TREEDB_DISABLE_CHECKPOINT_AUTO_VACUUM" minMemtablePrealloc = 64 * 1024 @@ -6039,6 +6043,11 @@ const ( // During checkpoint-kick debt drain, allow a bounded multi-segment rewrite // selection so debt can converge faster than one-segment-per-pass. vlogGenerationRewriteDebtDrainMaxSegments = 8 + // Freshly planned rewrites normally execute one segment to limit immediate + // write amplification. In explicit debt-drain mode, allow a small burst once + // the queue is materially large so convergence does not stall. 
+ vlogGenerationRewriteFreshPlanDebtDrainMinSegments = 4 + vlogGenerationRewriteFreshPlanDebtDrainMaxSegments = 4 ) func (db *DB) flushBackendEntriesCap(totalOps int, sync bool) int { @@ -12964,6 +12973,23 @@ func (db *DB) vlogGenerationRewriteMaxSegmentsForRun(queueLen int, budgetTokens return maxSegments } +func (db *DB) vlogGenerationRewriteMaxSegmentsForFreshPlan(queueLen int, budgetTokens int64, opts vlogGenerationMaintenanceOptions) int { + if db == nil || queueLen <= 1 || !opts.rewriteDebtDrain { + return vlogGenerationRewriteResumeMaxSegments + } + if queueLen < vlogGenerationRewriteFreshPlanDebtDrainMinSegments { + return vlogGenerationRewriteResumeMaxSegments + } + maxSegments := db.vlogGenerationRewriteMaxSegmentsForRun(queueLen, budgetTokens, opts) + if maxSegments > vlogGenerationRewriteFreshPlanDebtDrainMaxSegments { + maxSegments = vlogGenerationRewriteFreshPlanDebtDrainMaxSegments + } + if maxSegments < 1 { + maxSegments = 1 + } + return maxSegments +} + const maxPositiveInt64 = int64(^uint64(0) >> 1) func addClampInt64(cur, add, limit int64) int64 { @@ -14021,7 +14047,8 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog // caused real restore stalls. Keep WAL-on profiles eligible for maintenance // before the first checkpoint; starving that path causes the main value-log // lane to grow unchecked during restore. - if db.disableJournal && db.checkpointRuns.Load() == 0 && !runGC && len(rewriteQueue) == 0 && !opts.skipCheckpoint { + allowPreCheckpointRewrite := envBool(envEnableVlogGenerationPreCheckpointRewrite) + if db.disableJournal && db.checkpointRuns.Load() == 0 && !runGC && len(rewriteQueue) == 0 && !opts.skipCheckpoint && !allowPreCheckpointRewrite { db.vlogGenerationMaintenanceSkipPreCheckpoint.Add(1) return } @@ -14510,12 +14537,12 @@ planned: // Do not debt-drain freshly planned work in the same pass. 
The only // exception is a confirmed staged rewrite-resume pass, which should // be allowed to consume debt in bounded multi-segment chunks. - allowPlanDebtDrain := reason == vlogGenerationReasonRewriteResume && opts.rewriteDebtDrain - if allowPlanDebtDrain { - rewriteMaxSegments = db.vlogGenerationRewriteMaxSegmentsForRun(len(rewriteQueue), budgetTokens, opts) - } else { - rewriteMaxSegments = vlogGenerationRewriteResumeMaxSegments - } + allowPlanDebtDrain := reason == vlogGenerationReasonRewriteResume && opts.rewriteDebtDrain + if allowPlanDebtDrain { + rewriteMaxSegments = db.vlogGenerationRewriteMaxSegmentsForRun(len(rewriteQueue), budgetTokens, opts) + } else { + rewriteMaxSegments = db.vlogGenerationRewriteMaxSegmentsForFreshPlan(len(rewriteQueue), budgetTokens, opts) + } // If the token bucket is enabled and empty, persist the plan/ledger but // skip running the rewrite until we have budget to spend. if db.vlogGenerationRewriteBudgetEnabled() && budgetTokens <= 0 { diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index c3e0325e6..692e6b389 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -730,6 +730,51 @@ func TestVlogGenerationRewriteMaxSegmentsForRun_ClampsDebtDrainQueue(t *testing. 
} } +func TestVlogGenerationRewriteMaxSegmentsForFreshPlan_BelowQueueThreshold(t *testing.T) { + db := &DB{ + valueLogRewriteBudgetBytes: 1024, + valueLogGenerationWarmTarget: 256, + } + got := db.vlogGenerationRewriteMaxSegmentsForFreshPlan( + vlogGenerationRewriteFreshPlanDebtDrainMinSegments-1, + 1<<20, + vlogGenerationMaintenanceOptions{rewriteDebtDrain: true, debugSource: "rewrite_age_blocked"}, + ) + if got != vlogGenerationRewriteResumeMaxSegments { + t.Fatalf("fresh-plan queue 2132071399` (`du -sb` post `2132075495`) - interpretation: - the threshold change improves total-bytes-triggered live rewrite coverage while preserving trigger semantics. + +- Follow-up experiments (fresh-plan burst + WAL-off pre-checkpoint gate) + - Added fresh-plan debt-drain burst policy for planned rewrite queues: + - `vlogGenerationRewriteFreshPlanDebtDrainMinSegments=4` + - `vlogGenerationRewriteFreshPlanDebtDrainMaxSegments=4` + - path: `TreeDB/caching/db.go` (`vlogGenerationRewriteMaxSegmentsForFreshPlan`) + - tests: `TestVlogGenerationRewriteMaxSegmentsForFreshPlan_*` + +- Capacity analyzer output improvement: + - `scripts/analyze_vlog_maintenance_capacity.py` now prints `pre_checkpoint` and `priority` in the maintenance skip-pressure line. + +- Root-cause check for no-rewrite outlier: + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328082023` + - observed: + - analyzer: `rewrite runs=0`, end WAL `5522118526` + - skip counters (`debug_vars`): `maintenance.skip.before_first_checkpoint=11` + - interpretation: + - WAL-off pre-checkpoint gate can suppress all rewrite activity on some short runs. + +- Added experimental override for WAL-off pre-checkpoint rewrite: + - env: `TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1` + - default remains disabled. + - gate change in `TreeDB/caching/db.go` allows bypassing `maintenance.skip.before_first_checkpoint` when env is set. + - docs updated: `docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md`. 
+ - tests added: + - `TestVlogGenerationMaintenance_WALOffPreCheckpointSkipsRewriteByDefault` + - `TestVlogGenerationMaintenance_WALOffPreCheckpointCanRunWithEnvOverride` + +- Validation runs + - Baseline-like run (no override), home `/home/mikers/.celestia-app-mainnet-treedb-20260328075104`: + - end WAL: `3438411416` + - analyzer: `rewrite runs=8`, `selected->executed=8->8`, `processed_stale=1.59 GiB` + - offline rewrite: `3438407320 -> 2171030759` (post `du -sb`: `2171034855`) + - No-rewrite outlier (no override), home `/home/mikers/.celestia-app-mainnet-treedb-20260328082023`: + - end WAL: `5522118526` + - analyzer: `rewrite runs=0`, `pre_checkpoint skip dominated` + - offline rewrite: `5522114430 -> 2205781521` (post `du -sb`: `2205785617`) + - Override run (`TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1`), home `/home/mikers/.celestia-app-mainnet-treedb-20260328083336`: + - end WAL: `3477220043` + - analyzer: `pre_checkpoint=0`, `rewrite runs=8`, `selected->executed=9->8`, `processed_stale=1.59 GiB` + - offline rewrite: `3477215947 -> 2238622807` (post `du -sb`: `2238626903`) + +- Takeaway: + - pre-checkpoint gating is a first-order driver of run-to-run variance in live rewrite coverage under WAL-off fast runs. + - enabling the pre-checkpoint override avoids the catastrophic `rewrite runs=0` failure mode and restores expected live rewrite activity. 
From 8806b4e822f857da184b18653b95ca230baba47a Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 10:55:34 -1000 Subject: [PATCH 35/61] analyzer: surface rewrite plan-empty reasons --- scripts/analyze_vlog_maintenance_capacity.py | 16 +++++++ worklog/2026-03-28.md | 48 ++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py index 1033aa1ce..f98c0245c 100755 --- a/scripts/analyze_vlog_maintenance_capacity.py +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -222,7 +222,12 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "rewrite_plan_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_runs"), "rewrite_plan_selected": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected"), "rewrite_plan_empty": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_empty"), + "rewrite_plan_empty_no_selection": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_empty.no_selection"), + "rewrite_plan_empty_age_blocked": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked"), "rewrite_plan_selected_segments_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_selected_segments_total"), + "rewrite_plan_penalty_filter_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_penalty_filter.runs"), + "rewrite_plan_penalty_filter_segments": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_penalty_filter.segments"), + "rewrite_plan_penalty_filter_to_empty_runs": metric_int(stats, "treedb.cache.vlog_generation.rewrite.plan_penalty_filter.to_empty_runs"), "rewrite_exec_source_segments_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_total"), "rewrite_exec_source_segments_requested_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"), 
"rewrite_exec_source_segments_still_referenced_total": metric_int(stats, "treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"), @@ -457,6 +462,17 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"{summary['rewrite_plan_runs']} / {summary['rewrite_plan_selected']} / {summary['rewrite_plan_empty']} " f"(select_rate={summary['rewrite_plan_select_rate_pct']:.1f}%)" ) + print( + " plan-empty breakdown: " + f"no_selection={summary['rewrite_plan_empty_no_selection']} " + f"age_blocked={summary['rewrite_plan_empty_age_blocked']}" + ) + print( + " plan penalty-filter: " + f"runs={summary['rewrite_plan_penalty_filter_runs']} " + f"segments={summary['rewrite_plan_penalty_filter_segments']} " + f"to_empty_runs={summary['rewrite_plan_penalty_filter_to_empty_runs']}" + ) print( " selected->executed segments: " f"{summary['rewrite_plan_selected_segments_total']} -> {summary['rewrite_exec_source_segments_total']} " diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index 8ee838c56..4b7fb02ba 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -249,3 +249,51 @@ - Takeaway: - pre-checkpoint gating is a first-order driver of run-to-run variance in live rewrite coverage under WAL-off fast runs. - enabling the pre-checkpoint override avoids the catastrophic `rewrite runs=0` failure mode and restores expected live rewrite activity. + +- Additional live sweep (focus: robust lower end-of-run WAL under `fast` + pre-checkpoint rewrite): + - fixed env baseline: + - `TREEDB_OPEN_PROFILE=fast` + - `TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1` + - `TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=1073741824` + - `FREEZE_REMOTE_HEIGHT_AT_START=1` + - no total-bytes backstop (outlier repro): + - command: + - `... 
TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=0 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=100000 ...` + - run home: `/home/mikers/.celestia-app-mainnet-treedb-20260328094006` + - analyzer highlights: + - `rewrite runs=1` + - `selected->executed=3->2` + - `processed_stale=475.49 MiB` + - `skip stage_gate/stage_not_due=7/7` + - end WAL: `4274361669` + - offline rewrite: `4274357573 -> 2093567828` (post `du -sb`: `2093571924`, `gzip -1`: `1749325383`) + - add total-bytes backstop @ `128 MiB`, stale ratio `100k`: + - run homes: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328094856` + - `/home/mikers/.celestia-app-mainnet-treedb-20260328100118` (replicate) + - analyzer highlights: + - run1: `rewrite runs=7`, `selected->executed=7->7`, `processed_stale=1.44 GiB`, `end WAL=3362578071` + - run2: `rewrite runs=6`, `selected->executed=6->6`, `processed_stale=1.27 GiB`, `end WAL=3574791009` + - offline rewrite: + - run1: `3362573975 -> 2116702484` (post `du -sb`: `2116706580`, `gzip -1`: `1767440551`) + - run2: `3574786913 -> 2132053768` (post `du -sb`: `2132057864`, `gzip -1`: `1778930169`) + - total-bytes backstop @ `64 MiB`, stale ratio sweep: + - stale `100k` (`/home/mikers/.celestia-app-mainnet-treedb-20260328101255`): + - `rewrite runs=8`, `selected->executed=8->8`, `processed_stale=1.60 GiB`, `end WAL=3391412031` + - rewrite `3391407935 -> 2156175550` (post `du -sb`: `2156179646`, `gzip -1`: `1793519331`) + - stale `50k` (`/home/mikers/.celestia-app-mainnet-treedb-20260328102614`): + - `rewrite runs=7`, `selected->executed=7->7`, `processed_stale=1.40 GiB`, `end WAL=3569727005` + - rewrite `3569722909 -> 2175068990` (post `du -sb`: `2175073086`, `gzip -1`: `1806440477`) + - stale `10k` (`/home/mikers/.celestia-app-mainnet-treedb-20260328103947`): + - `rewrite runs=9`, `selected->executed=9->9`, `processed_stale=1.77 GiB`, `end WAL=3588198674` + - rewrite `3588194578 -> 2188079023` (post `du -sb`: `2188083119`, `gzip -1`: `1817157727`) + - interpretation: + - adding 
a nonzero `trigger_total_bytes` backstop prevents the catastrophic low-coverage outlier seen with stale-ratio-only triggering. + - in this window, pushing stale-ratio lower (`100k -> 50k -> 10k`) increases rewrite volume but does **not** improve end-of-run or post-rewrite bytes; it trends worse, consistent with extra rewrite churn without live reclaim. + - best observed point in this sweep: `trigger_total_bytes=128MiB`, `stale_ratio_ppm=100000` (lowest end WAL and best post-rewrite/gzip among these runs). + +- Capacity analyzer output improvement (follow-up): + - `scripts/analyze_vlog_maintenance_capacity.py` now prints: + - `plan-empty breakdown: no_selection / age_blocked` + - `plan penalty-filter: runs / segments / to_empty_runs` + - this helps distinguish threshold-limited empty plans (`no_selection`) from penalty/cooldown suppression. From db46ff7081895781da60a3557caeeee96d54eff9 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 11:01:28 -1000 Subject: [PATCH 36/61] vlog: add observed-source protection mix counters --- TreeDB/caching/db.go | 21 +++++++ .../caching/vlog_generation_scheduler_test.go | 28 +++++++++ scripts/analyze_vlog_maintenance_capacity.py | 58 +++++++++++++++++++ worklog/2026-03-28.md | 14 +++++ 4 files changed, 121 insertions(+) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 575e4694e..ce07ab36d 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5720,10 +5720,17 @@ type DB struct { vlogGenerationObservedGCSourceSegmentsTotal atomic.Uint64 vlogGenerationObservedGCSourceSegmentsEligibleTotal atomic.Uint64 vlogGenerationObservedGCSourceSegmentsDeletedTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsProtectedInUseTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsProtectedRetainedTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsProtectedOverlapTotal atomic.Uint64 + vlogGenerationObservedGCSourceSegmentsProtectedOtherTotal atomic.Uint64 
vlogGenerationObservedGCSourceBytesTotal atomic.Int64 vlogGenerationObservedGCSourceBytesEligibleTotal atomic.Int64 vlogGenerationObservedGCSourceBytesDeletedTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesProtectedInUseTotal atomic.Int64 vlogGenerationObservedGCSourceBytesProtectedRetainedTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesProtectedOverlapTotal atomic.Int64 + vlogGenerationObservedGCSourceBytesProtectedOtherTotal atomic.Int64 retainedPruneMu sync.Mutex retainedPruneDone chan struct{} vlogGenerationRemapSuccesses atomic.Uint64 @@ -13190,10 +13197,17 @@ func (db *DB) observeVlogGenerationGCStats(stats backenddb.ValueLogGCStats) { db.vlogGenerationObservedGCSourceSegmentsTotal.Add(uint64(stats.ObservedSourceSegments)) db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Add(uint64(stats.ObservedSourceSegmentsEligible)) db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Add(uint64(stats.ObservedSourceSegmentsDeleted)) + db.vlogGenerationObservedGCSourceSegmentsProtectedInUseTotal.Add(uint64(stats.ObservedSourceSegmentsProtectedInUse)) + db.vlogGenerationObservedGCSourceSegmentsProtectedRetainedTotal.Add(uint64(stats.ObservedSourceSegmentsProtectedRetained)) + db.vlogGenerationObservedGCSourceSegmentsProtectedOverlapTotal.Add(uint64(stats.ObservedSourceSegmentsProtectedOverlap)) + db.vlogGenerationObservedGCSourceSegmentsProtectedOtherTotal.Add(uint64(stats.ObservedSourceSegmentsProtectedOther)) db.vlogGenerationObservedGCSourceBytesTotal.Add(stats.ObservedSourceBytes) db.vlogGenerationObservedGCSourceBytesEligibleTotal.Add(stats.ObservedSourceBytesEligible) db.vlogGenerationObservedGCSourceBytesDeletedTotal.Add(stats.ObservedSourceBytesDeleted) + db.vlogGenerationObservedGCSourceBytesProtectedInUseTotal.Add(stats.ObservedSourceBytesProtectedInUse) db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Add(stats.ObservedSourceBytesProtectedRetained) + 
db.vlogGenerationObservedGCSourceBytesProtectedOverlapTotal.Add(stats.ObservedSourceBytesProtectedOverlap) + db.vlogGenerationObservedGCSourceBytesProtectedOtherTotal.Add(stats.ObservedSourceBytesProtectedOther) } func (db *DB) observeVlogGenerationVacuumExecDuration(d time.Duration) { @@ -20897,10 +20911,17 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.observed_gc.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_in_use_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsProtectedInUseTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_retained_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsProtectedRetainedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_overlap_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsProtectedOverlapTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_other_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsProtectedOtherTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesEligibleTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"] = fmt.Sprintf("%d", 
db.vlogGenerationObservedGCSourceBytesDeletedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_in_use_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesProtectedInUseTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_overlap_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesProtectedOverlapTotal.Load()) + stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_other_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceBytesProtectedOtherTotal.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteExecSourceSegments.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_requested_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsRequestedTotal.Load()) stats["treedb.cache.vlog_generation.rewrite.exec.source_segments_still_referenced_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteSourceSegmentsStillReferencedTotal.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 692e6b389..eb7636bc3 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -6271,10 +6271,17 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationObservedGCSourceSegmentsTotal.Store(11) db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Store(5) db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Store(3) + db.vlogGenerationObservedGCSourceSegmentsProtectedInUseTotal.Store(1) + db.vlogGenerationObservedGCSourceSegmentsProtectedRetainedTotal.Store(2) + 
db.vlogGenerationObservedGCSourceSegmentsProtectedOverlapTotal.Store(3) + db.vlogGenerationObservedGCSourceSegmentsProtectedOtherTotal.Store(4) db.vlogGenerationObservedGCSourceBytesTotal.Store(1100) db.vlogGenerationObservedGCSourceBytesEligibleTotal.Store(500) db.vlogGenerationObservedGCSourceBytesDeletedTotal.Store(300) + db.vlogGenerationObservedGCSourceBytesProtectedInUseTotal.Store(50) db.vlogGenerationObservedGCSourceBytesProtectedRetainedTotal.Store(250) + db.vlogGenerationObservedGCSourceBytesProtectedOverlapTotal.Store(75) + db.vlogGenerationObservedGCSourceBytesProtectedOtherTotal.Store(25) db.vlogGenerationRewriteQueueMu.Lock() db.vlogGenerationRewriteQueueLoaded = true @@ -6567,6 +6574,18 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"]; got != "3" { t.Fatalf("observed gc source segments deleted total=%q want 3", got) } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_in_use_total"]; got != "1" { + t.Fatalf("observed gc source segments protected in-use total=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_retained_total"]; got != "2" { + t.Fatalf("observed gc source segments protected retained total=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_overlap_total"]; got != "3" { + t.Fatalf("observed gc source segments protected overlap total=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_protected_other_total"]; got != "4" { + t.Fatalf("observed gc source segments protected other total=%q want 4", got) + } if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_total"]; got != "1100" { t.Fatalf("observed gc source bytes total=%q want 1100", got) } @@ -6576,7 +6595,16 @@ func 
TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"]; got != "300" { t.Fatalf("observed gc source bytes deleted total=%q want 300", got) } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_in_use_total"]; got != "50" { + t.Fatalf("observed gc source bytes protected in-use total=%q want 50", got) + } if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"]; got != "250" { t.Fatalf("observed gc source bytes protected retained total=%q want 250", got) } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_overlap_total"]; got != "75" { + t.Fatalf("observed gc source bytes protected overlap total=%q want 75", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.source_bytes_protected_other_total"]; got != "25" { + t.Fatalf("observed gc source bytes protected other total=%q want 25", got) + } } diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py index f98c0245c..a719f5cff 100755 --- a/scripts/analyze_vlog_maintenance_capacity.py +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -300,10 +300,17 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "observed_gc_source_segments_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_total"), "observed_gc_source_segments_eligible_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"), "observed_gc_source_segments_deleted_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"), + "observed_gc_source_segments_protected_in_use_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_protected_in_use_total"), + "observed_gc_source_segments_protected_retained_total": metric_int(stats, 
"treedb.cache.vlog_generation.observed_gc.source_segments_protected_retained_total"), + "observed_gc_source_segments_protected_overlap_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_protected_overlap_total"), + "observed_gc_source_segments_protected_other_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_protected_other_total"), "observed_gc_source_bytes_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_total"), "observed_gc_source_bytes_eligible_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_eligible_total"), "observed_gc_source_bytes_deleted_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_deleted_total"), + "observed_gc_source_bytes_protected_in_use_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_protected_in_use_total"), "observed_gc_source_bytes_protected_retained_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total"), + "observed_gc_source_bytes_protected_overlap_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_protected_overlap_total"), + "observed_gc_source_bytes_protected_other_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_bytes_protected_other_total"), "checkpoint_kick_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.runs"), "checkpoint_kick_gc_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.gc_runs"), "checkpoint_kick_rewrite_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"), @@ -388,6 +395,38 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: m["observed_gc_source_bytes_deleted_total"], m["observed_gc_source_bytes_eligible_total"], ) + m["observed_gc_source_segments_protected_in_use_pct"] = pct( + m["observed_gc_source_segments_protected_in_use_total"], 
+ m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_segments_protected_retained_pct"] = pct( + m["observed_gc_source_segments_protected_retained_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_segments_protected_overlap_pct"] = pct( + m["observed_gc_source_segments_protected_overlap_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_segments_protected_other_pct"] = pct( + m["observed_gc_source_segments_protected_other_total"], + m["observed_gc_source_segments_total"], + ) + m["observed_gc_source_bytes_protected_in_use_pct"] = pct( + m["observed_gc_source_bytes_protected_in_use_total"], + m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_protected_retained_pct"] = pct( + m["observed_gc_source_bytes_protected_retained_total"], + m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_protected_overlap_pct"] = pct( + m["observed_gc_source_bytes_protected_overlap_total"], + m["observed_gc_source_bytes_total"], + ) + m["observed_gc_source_bytes_protected_other_pct"] = pct( + m["observed_gc_source_bytes_protected_other_total"], + m["observed_gc_source_bytes_total"], + ) m["retained_prune_removed_candidate_segments_pct"] = pct( m["retained_prune_removed_segments"], m["retained_prune_candidate_segments"], @@ -581,6 +620,25 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"deleted_pct={summary['observed_gc_source_bytes_deleted_pct']:.1f}%, " f"deleted_of_eligible={summary['observed_gc_source_bytes_deleted_of_eligible_pct']:.1f}%)" ) + print( + " observed-source protection mix: " + f"segments in_use={summary['observed_gc_source_segments_protected_in_use_total']} " + f"retained={summary['observed_gc_source_segments_protected_retained_total']} " + f"overlap={summary['observed_gc_source_segments_protected_overlap_total']} " + f"other={summary['observed_gc_source_segments_protected_other_total']} " + 
f"(in_use={summary['observed_gc_source_segments_protected_in_use_pct']:.1f}%, " + f"retained={summary['observed_gc_source_segments_protected_retained_pct']:.1f}%, " + f"overlap={summary['observed_gc_source_segments_protected_overlap_pct']:.1f}%, " + f"other={summary['observed_gc_source_segments_protected_other_pct']:.1f}%) " + f"bytes in_use={human_bytes(summary['observed_gc_source_bytes_protected_in_use_total'])} " + f"retained={human_bytes(summary['observed_gc_source_bytes_protected_retained_total'])} " + f"overlap={human_bytes(summary['observed_gc_source_bytes_protected_overlap_total'])} " + f"other={human_bytes(summary['observed_gc_source_bytes_protected_other_total'])} " + f"(in_use={summary['observed_gc_source_bytes_protected_in_use_pct']:.1f}%, " + f"retained={summary['observed_gc_source_bytes_protected_retained_pct']:.1f}%, " + f"overlap={summary['observed_gc_source_bytes_protected_overlap_pct']:.1f}%, " + f"other={summary['observed_gc_source_bytes_protected_other_pct']:.1f}%)" + ) print( " observed-source retained-prune totals: " f"seen={summary['retained_prune_observed_source_segments_total']} ({human_bytes(summary['retained_prune_observed_source_bytes_total'])}) " diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index 4b7fb02ba..fa3ec5f10 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -297,3 +297,17 @@ - `plan-empty breakdown: no_selection / age_blocked` - `plan penalty-filter: runs / segments / to_empty_runs` - this helps distinguish threshold-limited empty plans (`no_selection`) from penalty/cooldown suppression. 
+ +- Observability extension for observed-source GC protection breakdown: + - Added cumulative stats counters in `TreeDB/caching/db.go`: + - `treedb.cache.vlog_generation.observed_gc.source_segments_protected_in_use_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_protected_retained_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_protected_overlap_total` + - `treedb.cache.vlog_generation.observed_gc.source_segments_protected_other_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_in_use_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_retained_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_overlap_total` + - `treedb.cache.vlog_generation.observed_gc.source_bytes_protected_other_total` + - Extended `scripts/analyze_vlog_maintenance_capacity.py` to report observed-source protection mix (segments + bytes + percentages). + - Updated stats test coverage: + - `TreeDB/caching/vlog_generation_scheduler_test.go` (`TestVlogGenerationStats_ReportRewriteBacklogAndDurations`). From 6cc124c4c000cab383e33d77f1921f41be06beec Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 11:16:13 -1000 Subject: [PATCH 37/61] worklog: capture protection-mix validation run --- worklog/2026-03-28.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/worklog/2026-03-28.md b/worklog/2026-03-28.md index fa3ec5f10..5db2ed86a 100644 --- a/worklog/2026-03-28.md +++ b/worklog/2026-03-28.md @@ -311,3 +311,20 @@ - Extended `scripts/analyze_vlog_maintenance_capacity.py` to report observed-source protection mix (segments + bytes + percentages). - Updated stats test coverage: - `TreeDB/caching/vlog_generation_scheduler_test.go` (`TestVlogGenerationStats_ReportRewriteBacklogAndDurations`). 
+ +- Validation run using new protection-mix counters (best current config): + - command: + - `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active TREEDB_OPEN_PROFILE=fast TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1 TREEDB_VLOG_REWRITE_BUDGET_BYTES_PER_SEC=1073741824 TREEDB_VLOG_REWRITE_TRIGGER_TOTAL_BYTES=134217728 TREEDB_VLOG_REWRITE_TRIGGER_STALE_RATIO_PPM=100000 FREEZE_REMOTE_HEIGHT_AT_START=1 ~/run_celestia.sh` + - run home: + - `/home/mikers/.celestia-app-mainnet-treedb-20260328110211` + - analyzer highlights: + - `rewrite runs=8`, `selected->executed=9->8`, `processed_stale=1.60 GiB` + - `plan-empty breakdown: no_selection=6 age_blocked=5` + - observed-source protection mix: + - segments: `in_use=0 retained=23 overlap=0 other=0` + - bytes: `in_use=0 B retained=5.75 GiB overlap=0 B other=0 B` + - size: + - end WAL: `3639153423` + - offline rewrite: `3639149327 -> 2230505477` (post `du -sb`: `2230509573`, `gzip -1`: `1848452954`) + - interpretation: + - in this run, observed-source protection is entirely `retained` (not `in_use` or overlap), confirming retained-lifecycle protection as the dominant in-run reclaim blocker. 
From 71be1df8d373487fcb3adef072a48adce283da6a Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 12:09:18 -1000 Subject: [PATCH 38/61] vlog: add observed-source retry budget and celestia a/b harness --- TreeDB/caching/db.go | 167 +++++- .../caching/vlog_generation_scheduler_test.go | 159 ++++++ docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 41 ++ scripts/analyze_vlog_maintenance_capacity.py | 24 + scripts/run_celestia_ab.sh | 510 ++++++++++++++++++ 5 files changed, 892 insertions(+), 9 deletions(-) create mode 100755 scripts/run_celestia_ab.sh diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index ce07ab36d..10ff71d72 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -4699,10 +4699,14 @@ func (db *DB) queueVlogGenerationObservedSourceGCList(ids []uint32) { if db == nil || len(ids) == 0 { return } + nowUnixNano := time.Now().UnixNano() db.vlogGenerationObservedGCMu.Lock() if db.vlogGenerationObservedGCSourceIDs == nil { db.vlogGenerationObservedGCSourceIDs = make(map[uint32]struct{}, len(ids)) } + if db.vlogGenerationObservedGCFirstQueuedUnixNano == nil { + db.vlogGenerationObservedGCFirstQueuedUnixNano = make(map[uint32]int64, len(ids)) + } added := 0 for _, id := range ids { if id == 0 { @@ -4712,6 +4716,9 @@ func (db *DB) queueVlogGenerationObservedSourceGCList(ids []uint32) { continue } db.vlogGenerationObservedGCSourceIDs[id] = struct{}{} + if _, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[id]; !exists { + db.vlogGenerationObservedGCFirstQueuedUnixNano[id] = nowUnixNano + } added++ } db.vlogGenerationObservedGCMu.Unlock() @@ -4725,10 +4732,14 @@ func (db *DB) queueVlogGenerationObservedSourceGCIDs(ids map[uint32]struct{}) { if db == nil || len(ids) == 0 { return } + nowUnixNano := time.Now().UnixNano() db.vlogGenerationObservedGCMu.Lock() if db.vlogGenerationObservedGCSourceIDs == nil { db.vlogGenerationObservedGCSourceIDs = make(map[uint32]struct{}, len(ids)) } + if db.vlogGenerationObservedGCFirstQueuedUnixNano == nil 
{ + db.vlogGenerationObservedGCFirstQueuedUnixNano = make(map[uint32]int64, len(ids)) + } added := 0 for id := range ids { if id == 0 { @@ -4738,6 +4749,9 @@ func (db *DB) queueVlogGenerationObservedSourceGCIDs(ids map[uint32]struct{}) { continue } db.vlogGenerationObservedGCSourceIDs[id] = struct{}{} + if _, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[id]; !exists { + db.vlogGenerationObservedGCFirstQueuedUnixNano[id] = nowUnixNano + } added++ } db.vlogGenerationObservedGCMu.Unlock() @@ -4772,6 +4786,92 @@ func (db *DB) takeVlogGenerationObservedSourceGCList() []uint32 { return out } +func (db *DB) finalizeVlogGenerationObservedSourceGCIDs(ids []uint32, dropped bool) { + if db == nil || len(ids) == 0 { + return + } + nowUnixNano := time.Now().UnixNano() + totalLatencyMS := uint64(0) + maxLatencyMS := uint64(0) + finalized := 0 + seen := make(map[uint32]struct{}, len(ids)) + db.vlogGenerationObservedGCMu.Lock() + for _, id := range ids { + if id == 0 { + continue + } + if _, exists := seen[id]; exists { + continue + } + seen[id] = struct{}{} + finalized++ + delete(db.vlogGenerationObservedGCRetryAttempts, id) + if startUnixNano, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[id]; exists { + delete(db.vlogGenerationObservedGCFirstQueuedUnixNano, id) + if startUnixNano > 0 && nowUnixNano > startUnixNano { + latencyMS := uint64((nowUnixNano - startUnixNano) / int64(time.Millisecond)) + totalLatencyMS += latencyMS + if latencyMS > maxLatencyMS { + maxLatencyMS = latencyMS + } + } + } + } + db.vlogGenerationObservedGCMu.Unlock() + if finalized == 0 { + return + } + if dropped { + db.vlogGenerationObservedGCLatencyDroppedIDs.Add(uint64(finalized)) + } else { + db.vlogGenerationObservedGCLatencyCompletedIDs.Add(uint64(finalized)) + } + if totalLatencyMS > 0 { + db.vlogGenerationObservedGCLatencyTotalMS.Add(totalLatencyMS) + updateAtomicMaxUint64(&db.vlogGenerationObservedGCLatencyMaxMS, maxLatencyMS) + } +} + +func (db *DB) 
retryVlogGenerationObservedSourceGCList(ids []uint32) (queuedIDs, droppedIDs int) { + if db == nil || len(ids) == 0 { + return 0, 0 + } + retry := make([]uint32, 0, len(ids)) + dropped := make([]uint32, 0, len(ids)) + seen := make(map[uint32]struct{}, len(ids)) + db.vlogGenerationObservedGCMu.Lock() + if db.vlogGenerationObservedGCRetryAttempts == nil { + db.vlogGenerationObservedGCRetryAttempts = make(map[uint32]uint8, len(ids)) + } + for _, id := range ids { + if id == 0 { + continue + } + if _, exists := seen[id]; exists { + continue + } + seen[id] = struct{}{} + attempts := db.vlogGenerationObservedGCRetryAttempts[id] + if attempts >= vlogGenerationObservedGCRetryMaxAttempts { + delete(db.vlogGenerationObservedGCRetryAttempts, id) + dropped = append(dropped, id) + continue + } + db.vlogGenerationObservedGCRetryAttempts[id] = attempts + 1 + retry = append(retry, id) + } + db.vlogGenerationObservedGCMu.Unlock() + if len(retry) > 0 { + db.vlogGenerationObservedGCRetryQueued.Add(1) + db.queueVlogGenerationObservedSourceGCList(retry) + } + if len(dropped) > 0 { + db.vlogGenerationObservedGCRetryDropped.Add(uint64(len(dropped))) + db.finalizeVlogGenerationObservedSourceGCIDs(dropped, true) + } + return len(retry), len(dropped) +} + func (db *DB) scheduleRetainedValueLogPrune() { db.scheduleRetainedValueLogPruneWithForce(false) } @@ -5717,6 +5817,13 @@ type DB struct { vlogGenerationObservedGCTakenIDs atomic.Uint64 vlogGenerationObservedGCRuns atomic.Uint64 vlogGenerationObservedGCRetryQueued atomic.Uint64 + vlogGenerationObservedGCRetryDropped atomic.Uint64 + vlogGenerationObservedGCRetryAttempts map[uint32]uint8 + vlogGenerationObservedGCFirstQueuedUnixNano map[uint32]int64 + vlogGenerationObservedGCLatencyCompletedIDs atomic.Uint64 + vlogGenerationObservedGCLatencyDroppedIDs atomic.Uint64 + vlogGenerationObservedGCLatencyTotalMS atomic.Uint64 + vlogGenerationObservedGCLatencyMaxMS atomic.Uint64 vlogGenerationObservedGCSourceSegmentsTotal atomic.Uint64 
vlogGenerationObservedGCSourceSegmentsEligibleTotal atomic.Uint64 vlogGenerationObservedGCSourceSegmentsDeletedTotal atomic.Uint64 @@ -6019,6 +6126,9 @@ const ( // requests while replay GC is trying to converge. Allow a faster cadence for // that targeted path without dropping the generic min-interval guard. retainedPruneObservedMinInterval = 3 * time.Second + // Bound observed-source replay retries so a permanently retained-protected ID + // cannot stay queued forever when replay GC cannot make progress. + vlogGenerationObservedGCRetryMaxAttempts = uint8(3) // Coordinate index vacuum with major rewrite windows; do not run on every GC. vlogGenerationVacuumTriggerRewriteBytes = int64(64 << 20) vlogGenerationVacuumMinInterval = 5 * time.Minute @@ -15018,8 +15128,13 @@ planned: len(observedSourceGCIDs), ) if forceObservedSourceGC { - db.vlogGenerationObservedGCRetryQueued.Add(1) - db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) + queuedIDs, droppedIDs := db.retryVlogGenerationObservedSourceGCList(observedSourceGCIDs) + db.debugVlogMaintf( + "gc_observed_retry reason=disabled_env observed_ids=%d queued_ids=%d dropped_ids=%d", + len(observedSourceGCIDs), + queuedIDs, + droppedIDs, + ) } return } @@ -15045,8 +15160,13 @@ planned: len(observedSourceGCIDs), ) if forceObservedSourceGC { - db.vlogGenerationObservedGCRetryQueued.Add(1) - db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) + queuedIDs, droppedIDs := db.retryVlogGenerationObservedSourceGCList(observedSourceGCIDs) + db.debugVlogMaintf( + "gc_observed_retry reason=backend_no_gcer observed_ids=%d queued_ids=%d dropped_ids=%d", + len(observedSourceGCIDs), + queuedIDs, + droppedIDs, + ) } return } @@ -15154,9 +15274,19 @@ planned: ) db.queueRetainedPruneObservedSourceIDs(observedSourceGCIDs) db.scheduleRetainedValueLogPruneForce() - db.vlogGenerationObservedGCRetryQueued.Add(1) - db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) - 
db.vlogGenerationCheckpointKickPending.Store(true) + queuedIDs, droppedIDs := db.retryVlogGenerationObservedSourceGCList(observedSourceGCIDs) + if queuedIDs > 0 { + db.vlogGenerationCheckpointKickPending.Store(true) + } + db.debugVlogMaintf( + "gc_observed_retry_result reason=retained_protected observed_ids=%d queued_ids=%d dropped_ids=%d max_attempts=%d", + len(observedSourceGCIDs), + queuedIDs, + droppedIDs, + vlogGenerationObservedGCRetryMaxAttempts, + ) + } else if forceObservedSourceGC { + db.finalizeVlogGenerationObservedSourceGCIDs(observedSourceGCIDs, false) } db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) db.vlogGenerationGCRuns.Add(1) @@ -15177,8 +15307,13 @@ planned: err, ) if forceObservedSourceGC { - db.vlogGenerationObservedGCRetryQueued.Add(1) - db.queueVlogGenerationObservedSourceGCList(observedSourceGCIDs) + queuedIDs, droppedIDs := db.retryVlogGenerationObservedSourceGCList(observedSourceGCIDs) + db.debugVlogMaintf( + "gc_observed_retry reason=gc_error observed_ids=%d queued_ids=%d dropped_ids=%d", + len(observedSourceGCIDs), + queuedIDs, + droppedIDs, + ) } if errors.Is(err, context.Canceled) { db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerIdle) @@ -20701,6 +20836,13 @@ func (db *DB) Stats() map[string]string { db.vlogGenerationObservedGCMu.Lock() observedGCPending := len(db.vlogGenerationObservedGCSourceIDs) db.vlogGenerationObservedGCMu.Unlock() + observedGCLatencyCompleted := db.vlogGenerationObservedGCLatencyCompletedIDs.Load() + observedGCLatencyDropped := db.vlogGenerationObservedGCLatencyDroppedIDs.Load() + observedGCLatencyTotalMS := db.vlogGenerationObservedGCLatencyTotalMS.Load() + observedGCLatencyAvgMS := 0.0 + if totalObservedGCLatencyIDs := observedGCLatencyCompleted + observedGCLatencyDropped; totalObservedGCLatencyIDs > 0 { + observedGCLatencyAvgMS = float64(observedGCLatencyTotalMS) / float64(totalObservedGCLatencyIDs) + } rewriteAgeBlockedUntilNS := 
db.vlogGenerationRewriteAgeBlockedUntilNS.Load() rewriteAgeBlockedRemainingMS := int64(0) if rewriteAgeBlockedUntilNS > 0 { @@ -20908,6 +21050,13 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.observed_gc.taken_ids"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCTakenIDs.Load()) stats["treedb.cache.vlog_generation.observed_gc.runs"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRuns.Load()) stats["treedb.cache.vlog_generation.observed_gc.retry_queued"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRetryQueued.Load()) + stats["treedb.cache.vlog_generation.observed_gc.retry_dropped"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCRetryDropped.Load()) + stats["treedb.cache.vlog_generation.observed_gc.retry_max_attempts"] = fmt.Sprintf("%d", vlogGenerationObservedGCRetryMaxAttempts) + stats["treedb.cache.vlog_generation.observed_gc.latency.completed_ids"] = fmt.Sprintf("%d", observedGCLatencyCompleted) + stats["treedb.cache.vlog_generation.observed_gc.latency.dropped_ids"] = fmt.Sprintf("%d", observedGCLatencyDropped) + stats["treedb.cache.vlog_generation.observed_gc.latency.total_ms"] = fmt.Sprintf("%d", observedGCLatencyTotalMS) + stats["treedb.cache.vlog_generation.observed_gc.latency.max_ms"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCLatencyMaxMS.Load()) + stats["treedb.cache.vlog_generation.observed_gc.latency.avg_ms"] = fmt.Sprintf("%.3f", observedGCLatencyAvgMS) stats["treedb.cache.vlog_generation.observed_gc.source_segments_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Load()) stats["treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"] = fmt.Sprintf("%d", db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go 
b/TreeDB/caching/vlog_generation_scheduler_test.go index eb7636bc3..1ee40dd58 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -1298,6 +1298,139 @@ func TestVlogGenerationMaintenance_ObservedSourceGCBypassQuietIgnoresForegroundR } } +func TestVlogGenerationMaintenance_ObservedSourceGCCompletionClearsRetryState(t *testing.T) { + prepareDirectSchedulerTest(t) + t.Setenv(envDisableVlogGenerationRewrite, "1") + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + gcResponse: backenddb.ValueLogGCStats{ + ObservedSourceSegments: 1, + ObservedSourceSegmentsEligible: 1, + ObservedSourceSegmentsDeleted: 1, + ObservedSourceBytes: 256, + ObservedSourceBytesEligible: 256, + ObservedSourceBytesDeleted: 256, + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + skipRetainedPrune(db) + + db.queueVlogGenerationObservedSourceGCList([]uint32{41}) + db.vlogGenerationLastGCUnixNano.Store(time.Now().Add(-time.Minute).UnixNano()) + forceVlogMaintenanceIdle(db) + + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: true, + rewriteDebtDrain: true, + }) + + if got := recorder.recordedGCObservedSourceCalls(); got != 1 { + t.Fatalf("observed-source gc calls=%d want 1", got) + } + if got := db.vlogGenerationObservedGCRetryQueued.Load(); got != 0 { + t.Fatalf("observed-source gc retry queued=%d want 0", got) + } + if got := db.vlogGenerationObservedGCRetryDropped.Load(); got != 0 { + t.Fatalf("observed-source gc retry dropped=%d want 0", got) + } + if got := db.vlogGenerationObservedGCLatencyCompletedIDs.Load(); got != 1 { + t.Fatalf("observed-source gc latency completed ids=%d want 1", got) + } + if got := 
db.vlogGenerationObservedGCLatencyDroppedIDs.Load(); got != 0 { + t.Fatalf("observed-source gc latency dropped ids=%d want 0", got) + } + if pending := len(db.takeVlogGenerationObservedSourceGCList()); pending != 0 { + t.Fatalf("observed-source gc pending ids=%d want 0", pending) + } + db.vlogGenerationObservedGCMu.Lock() + if _, exists := db.vlogGenerationObservedGCRetryAttempts[41]; exists { + db.vlogGenerationObservedGCMu.Unlock() + t.Fatalf("retry attempt state still present for observed id 41") + } + if _, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[41]; exists { + db.vlogGenerationObservedGCMu.Unlock() + t.Fatalf("first queued timestamp still present for observed id 41") + } + db.vlogGenerationObservedGCMu.Unlock() +} + +func TestVlogGenerationMaintenance_ObservedSourceGCRetryBudgetDropsAfterMaxAttempts(t *testing.T) { + prepareDirectSchedulerTest(t) + t.Setenv(envDisableVlogGenerationRewrite, "1") + + dir := t.TempDir() + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + gcResponse: backenddb.ValueLogGCStats{ + ObservedSourceSegments: 1, + ObservedSourceSegmentsEligible: 0, + ObservedSourceSegmentsProtectedRetained: 1, + ObservedSourceBytes: 128, + ObservedSourceBytesProtectedRetained: 128, + }, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + defer cleanup() + skipRetainedPrune(db) + + db.queueVlogGenerationObservedSourceGCList([]uint32{73}) + passes := int(vlogGenerationObservedGCRetryMaxAttempts) + 1 + for i := 0; i < passes; i++ { + db.vlogGenerationLastGCUnixNano.Store(time.Now().Add(-time.Minute).UnixNano()) + forceVlogMaintenanceIdle(db) + db.maybeRunVlogGenerationMaintenanceWithOptions(true, vlogGenerationMaintenanceOptions{ + bypassQuiet: true, + skipRetainedPruneWait: true, + skipCheckpoint: true, + rewriteDebtDrain: true, + }) + } + + if got := recorder.recordedGCObservedSourceCalls(); got 
!= passes { + t.Fatalf("observed-source gc calls=%d want %d", got, passes) + } + if got := db.vlogGenerationObservedGCRetryQueued.Load(); got != uint64(vlogGenerationObservedGCRetryMaxAttempts) { + t.Fatalf("observed-source gc retry queued=%d want %d", got, vlogGenerationObservedGCRetryMaxAttempts) + } + if got := db.vlogGenerationObservedGCRetryDropped.Load(); got != 1 { + t.Fatalf("observed-source gc retry dropped=%d want 1", got) + } + if got := db.vlogGenerationObservedGCLatencyCompletedIDs.Load(); got != 0 { + t.Fatalf("observed-source gc latency completed ids=%d want 0", got) + } + if got := db.vlogGenerationObservedGCLatencyDroppedIDs.Load(); got != 1 { + t.Fatalf("observed-source gc latency dropped ids=%d want 1", got) + } + if pending := len(db.takeVlogGenerationObservedSourceGCList()); pending != 0 { + t.Fatalf("observed-source gc pending ids=%d want 0", pending) + } + db.vlogGenerationObservedGCMu.Lock() + if _, exists := db.vlogGenerationObservedGCRetryAttempts[73]; exists { + db.vlogGenerationObservedGCMu.Unlock() + t.Fatalf("retry attempt state still present for observed id 73 after drop") + } + if _, exists := db.vlogGenerationObservedGCFirstQueuedUnixNano[73]; exists { + db.vlogGenerationObservedGCMu.Unlock() + t.Fatalf("first queued timestamp still present for observed id 73 after drop") + } + db.vlogGenerationObservedGCMu.Unlock() +} + func TestVlogGenerationRewrite_FreshPlanExecIgnoresForegroundCancelUntilBoundedComplete(t *testing.T) { prepareDirectSchedulerTest(t) @@ -6268,6 +6401,11 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationObservedGCTakenIDs.Store(9) db.vlogGenerationObservedGCRuns.Store(3) db.vlogGenerationObservedGCRetryQueued.Store(2) + db.vlogGenerationObservedGCRetryDropped.Store(1) + db.vlogGenerationObservedGCLatencyCompletedIDs.Store(6) + db.vlogGenerationObservedGCLatencyDroppedIDs.Store(2) + db.vlogGenerationObservedGCLatencyTotalMS.Store(640) + 
db.vlogGenerationObservedGCLatencyMaxMS.Store(210) db.vlogGenerationObservedGCSourceSegmentsTotal.Store(11) db.vlogGenerationObservedGCSourceSegmentsEligibleTotal.Store(5) db.vlogGenerationObservedGCSourceSegmentsDeletedTotal.Store(3) @@ -6565,6 +6703,27 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.observed_gc.retry_queued"]; got != "2" { t.Fatalf("observed gc retry queued=%q want 2", got) } + if got := stats["treedb.cache.vlog_generation.observed_gc.retry_dropped"]; got != "1" { + t.Fatalf("observed gc retry dropped=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.retry_max_attempts"]; got != "3" { + t.Fatalf("observed gc retry max attempts=%q want 3", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.completed_ids"]; got != "6" { + t.Fatalf("observed gc latency completed ids=%q want 6", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.dropped_ids"]; got != "2" { + t.Fatalf("observed gc latency dropped ids=%q want 2", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.total_ms"]; got != "640" { + t.Fatalf("observed gc latency total ms=%q want 640", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.max_ms"]; got != "210" { + t.Fatalf("observed gc latency max ms=%q want 210", got) + } + if got := stats["treedb.cache.vlog_generation.observed_gc.latency.avg_ms"]; got != "80.000" { + t.Fatalf("observed gc latency avg ms=%q want 80.000", got) + } if got := stats["treedb.cache.vlog_generation.observed_gc.source_segments_total"]; got != "11" { t.Fatalf("observed gc source segments total=%q want 11", got) } diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 113bebed3..6f5d83aa0 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -56,6 +56,47 @@ The 
report highlights: - zombie inventory (pinned vs unpinned bytes) - GC eligibility/protection signals +## Interleaved A/B Harness +For sync+rewrite tradeoff validation, use the interleaved harness: + +```bash +cat >/tmp/cel_control.env <<'EOF' +LOCAL_GOMAP_DIR=/path/to/control/gomap +TREEDB_OPEN_PROFILE=fast +EOF + +cat >/tmp/cel_candidate.env <<'EOF' +LOCAL_GOMAP_DIR=/path/to/candidate/gomap +TREEDB_OPEN_PROFILE=fast +EOF + +CONTROL_ENV_FILE=/tmp/cel_control.env \ +CANDIDATE_ENV_FILE=/tmp/cel_candidate.env \ +MAX_PAIRS=10 \ +MIN_PAIRS=4 \ +CLEAR_WIN_PAIRS=3 \ +CLEAR_LOSS_PAIRS=3 \ +./scripts/run_celestia_ab.sh +``` + +Default pair metric focus: +- `T_sync`: sync duration (seconds) +- `S_sync_app`: app dir bytes at sync end +- `S_sync_wal`: `application.db/maindb/wal` bytes at sync end +- `T_rw`: offline `vlog-rewrite` wall time +- `S_post_wal`: WAL bytes after offline rewrite +- `T_total = T_sync + T_rw` +- `max_rss_kb` (memory guardrail) + +Outputs: +- `artifacts/celestia_ab/<ts>/runs.csv` +- `artifacts/celestia_ab/<ts>/pairs.csv` +- `artifacts/celestia_ab/<ts>/summary.md` +- per-run JSON under `artifacts/celestia_ab/<ts>/runs/*/run.json` + +The harness alternates run order per pair (`control->candidate`, then +`candidate->control`) and can stop early on clear win/loss signals. + ## Experimental Knob - `TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1` - WAL-off only. 
diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py index a719f5cff..292a8a9e8 100755 --- a/scripts/analyze_vlog_maintenance_capacity.py +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -297,6 +297,12 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "observed_gc_taken_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.taken_ids"), "observed_gc_runs": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.runs"), "observed_gc_retry_queued": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.retry_queued"), + "observed_gc_retry_dropped": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.retry_dropped"), + "observed_gc_retry_max_attempts": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.retry_max_attempts"), + "observed_gc_latency_completed_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.latency.completed_ids"), + "observed_gc_latency_dropped_ids": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.latency.dropped_ids"), + "observed_gc_latency_total_ms": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.latency.total_ms"), + "observed_gc_latency_max_ms": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.latency.max_ms"), "observed_gc_source_segments_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_total"), "observed_gc_source_segments_eligible_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_eligible_total"), "observed_gc_source_segments_deleted_total": metric_int(stats, "treedb.cache.vlog_generation.observed_gc.source_segments_deleted_total"), @@ -375,6 +381,12 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: ) m["observed_gc_drain_pct"] = pct(m["observed_gc_taken_ids"], m["observed_gc_queued_ids"]) + m["observed_gc_latency_finalized_ids"] = m["observed_gc_latency_completed_ids"] + m["observed_gc_latency_dropped_ids"] 
+ m["observed_gc_latency_avg_ms"] = ( + (float(m["observed_gc_latency_total_ms"]) / float(m["observed_gc_latency_finalized_ids"])) + if m["observed_gc_latency_finalized_ids"] > 0 + else 0.0 + ) m["observed_gc_source_segments_eligible_pct"] = pct( m["observed_gc_source_segments_eligible_total"], m["observed_gc_source_segments_total"], @@ -603,6 +615,16 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"{summary['observed_gc_queued_ids']} / {summary['observed_gc_taken_ids']} / {summary['observed_gc_pending_ids']} " f"(drain={summary['observed_gc_drain_pct']:.1f}%, retries={summary['observed_gc_retry_queued']}, runs={summary['observed_gc_runs']})" ) + print( + " retry budget/latency: " + f"max_attempts={summary['observed_gc_retry_max_attempts']} " + f"retry_dropped={summary['observed_gc_retry_dropped']} " + f"finalized_ids={summary['observed_gc_latency_finalized_ids']} " + f"(completed={summary['observed_gc_latency_completed_ids']}, dropped={summary['observed_gc_latency_dropped_ids']}) " + f"latency total_ms={summary['observed_gc_latency_total_ms']} " + f"avg_ms={summary['observed_gc_latency_avg_ms']:.3f} " + f"max_ms={summary['observed_gc_latency_max_ms']}" + ) print( " observed-source totals: " f"segments total={summary['observed_gc_source_segments_total']} " @@ -659,6 +681,8 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst notes.append("rewrite copied stale bytes but immediate reclaim is zero; inspect GC eligibility/protection and post-run rewrite window") if summary["observed_gc_pending_ids"] > 0: notes.append("observed-source GC backlog still pending; may need longer run window or higher checkpoint-kick pressure") + if summary["observed_gc_retry_dropped"] > 0: + notes.append("observed-source GC retries hit max-attempt budget for some IDs; inspect retained-prune throughput and checkpoint-kick cadence") if summary["maintenance_collision_rate_pct"] > 20.0: notes.append("maintenance collision rate 
is high; lane contention may be throttling rewrite/GC progress") if summary["rewrite_segment_realization_pct"] < 60.0 and summary["rewrite_plan_selected_segments_total"] > 0: diff --git a/scripts/run_celestia_ab.sh b/scripts/run_celestia_ab.sh new file mode 100755 index 000000000..71ef9e3db --- /dev/null +++ b/scripts/run_celestia_ab.sh @@ -0,0 +1,510 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd) +ANALYZER="${ANALYZER:-$ROOT/scripts/analyze_vlog_maintenance_capacity.py}" +RUN_HOME_GLOB="${RUN_HOME_GLOB:-$HOME/.celestia-app-mainnet-treedb-*}" +RUN_CMD="${RUN_CMD:-$HOME/run_celestia.sh}" +CONTROL_ENV_FILE="${CONTROL_ENV_FILE:-}" +CANDIDATE_ENV_FILE="${CANDIDATE_ENV_FILE:-}" +TREEMAP_BIN="${TREEMAP_BIN:-/home/mikers/dev/snissn/celestia-app-p4/build/treemap-local}" +REWRITE_ENABLED="${REWRITE_ENABLED:-1}" +MAX_PAIRS="${MAX_PAIRS:-10}" +MIN_PAIRS="${MIN_PAIRS:-4}" +CLEAR_WIN_PAIRS="${CLEAR_WIN_PAIRS:-3}" +CLEAR_LOSS_PAIRS="${CLEAR_LOSS_PAIRS:-3}" +SIZE_TOLERANCE_BYTES="${SIZE_TOLERANCE_BYTES:-67108864}" +TIME_TOLERANCE_SECONDS="${TIME_TOLERANCE_SECONDS:-120}" +STOP_ON_CLEAR="${STOP_ON_CLEAR:-1}" +SLEEP_BETWEEN_RUNS_SECONDS="${SLEEP_BETWEEN_RUNS_SECONDS:-5}" +TS="$(date +%Y%m%d%H%M%S)" +OUT="${OUT_DIR:-$ROOT/artifacts/celestia_ab/$TS}" + +if ! command -v python3 >/dev/null 2>&1; then + echo "python3 is required" >&2 + exit 1 +fi +if [[ ! -x "$ANALYZER" ]]; then + echo "analyzer not found/executable: $ANALYZER" >&2 + exit 1 +fi +if [[ "$MAX_PAIRS" -lt 1 ]]; then + echo "MAX_PAIRS must be >= 1" >&2 + exit 1 +fi + +mkdir -p "$OUT/runs" + +cat >"$OUT/meta.txt" </dev/null || true +} + +du_bytes() { + local target="$1" + if [[ ! 
-e "$target" ]]; then + echo 0 + return 0 + fi + if du -sb "$target" >/dev/null 2>&1; then + du -sb "$target" 2>/dev/null | awk '{print $1}' + return 0 + fi + du -sk "$target" 2>/dev/null | awk '{print $1 * 1024}' +} + +detect_new_run_home() { + local before_file="$1" + local -A seen=() + while IFS= read -r path; do + [[ -n "$path" ]] && seen["$path"]=1 + done <"$before_file" + + while IFS= read -r path; do + if [[ -z "$path" ]]; then + continue + fi + if [[ -z "${seen[$path]+x}" ]]; then + echo "$path" + return 0 + fi + done < <(list_run_homes) + + list_run_homes | head -n 1 +} + +run_variant() { + local pair_index="$1" + local variant="$2" + local env_file="$3" + + local run_id + run_id=$(printf "%02d_%s" "$pair_index" "$variant") + local run_dir="$OUT/runs/$run_id" + mkdir -p "$run_dir" + + local before_file="$run_dir/before_homes.txt" + list_run_homes >"$before_file" + + local run_start + run_start=$(date +%s) + ( + set -euo pipefail + if [[ -n "$env_file" ]]; then + # shellcheck source=/dev/null + source "$env_file" + fi + bash -lc "$RUN_CMD" + ) >"$run_dir/launcher.log" 2>&1 + local run_end + run_end=$(date +%s) + + local run_home + run_home="$(detect_new_run_home "$before_file")" + if [[ -z "$run_home" || ! -d "$run_home" ]]; then + echo "failed to detect run home for $run_id" >&2 + exit 1 + fi + + local app_db="$run_home/data/application.db" + local pre_app_bytes pre_wal_bytes + pre_app_bytes="$(du_bytes "$app_db")" + pre_wal_bytes="$(du_bytes "$app_db/maindb/wal")" + + local analyze_json="$run_dir/maintenance.json" + if ! "$ANALYZER" --json "$run_home" >"$analyze_json" 2>"$run_dir/analyze.stderr.log"; then + rm -f "$analyze_json" + fi + + local rewrite_attempted=0 + local rewrite_seconds=0 + local rewrite_rc=0 + if [[ "$REWRITE_ENABLED" == "1" && -x "$TREEMAP_BIN" && -d "$app_db" ]]; then + rewrite_attempted=1 + local rewrite_start + rewrite_start=$(date +%s) + set +e + "$TREEMAP_BIN" vlog-rewrite "$app_db" -rw >"$run_dir/rewrite.log" 2>&1 + rewrite_rc=$? 
+ set -e + local rewrite_end + rewrite_end=$(date +%s) + rewrite_seconds=$((rewrite_end - rewrite_start)) + else + rewrite_rc=0 + fi + + local post_app_bytes post_wal_bytes + post_app_bytes="$(du_bytes "$app_db")" + post_wal_bytes="$(du_bytes "$app_db/maindb/wal")" + + local run_json="$run_dir/run.json" + python3 - "$run_home" "$run_json" "$variant" "$pair_index" "$run_start" "$run_end" "$rewrite_attempted" "$rewrite_seconds" "$rewrite_rc" "$pre_app_bytes" "$pre_wal_bytes" "$post_app_bytes" "$post_wal_bytes" "$analyze_json" <<'PY' +import json +import sys +from pathlib import Path + +run_home = Path(sys.argv[1]) +out_path = Path(sys.argv[2]) +variant = sys.argv[3] +pair_index = int(sys.argv[4]) +run_start = int(sys.argv[5]) +run_end = int(sys.argv[6]) +rewrite_attempted = int(sys.argv[7]) +rewrite_seconds = int(sys.argv[8]) +rewrite_rc = int(sys.argv[9]) +pre_app_bytes = int(sys.argv[10]) +pre_wal_bytes = int(sys.argv[11]) +post_app_bytes = int(sys.argv[12]) +post_wal_bytes = int(sys.argv[13]) +analyze_json_path = Path(sys.argv[14]) + +def parse_sync_time(path: Path) -> dict[str, str]: + out: dict[str, str] = {} + if not path.is_file(): + return out + for line in path.read_text(encoding="utf-8", errors="replace").splitlines(): + line = line.strip() + if not line or line == "---" or "=" not in line: + continue + k, v = line.split("=", 1) + out[k.strip()] = v.strip() + return out + +def safe_int(raw: str | None, default: int = 0) -> int: + if raw is None: + return default + s = str(raw).strip() + if not s: + return default + try: + return int(s) + except Exception: + try: + return int(float(s)) + except Exception: + return default + +sync = parse_sync_time(run_home / "sync" / "sync-time.log") +maintenance = {} +if analyze_json_path.is_file(): + try: + payload = json.loads(analyze_json_path.read_text(encoding="utf-8")) + if isinstance(payload, dict): + summary = payload.get("summary") + if isinstance(summary, dict): + maintenance = summary + except Exception: + 
maintenance = {} + +t_sync = safe_int(sync.get("duration_seconds"), max(0, run_end - run_start)) +t_rw = rewrite_seconds if rewrite_attempted == 1 else 0 +if rewrite_attempted == 1 and rewrite_rc != 0: + t_total = None +else: + t_total = t_sync + t_rw + +result = { + "pair_index": pair_index, + "variant": variant, + "run_home": str(run_home), + "sync": { + "duration_seconds": t_sync, + "max_rss_kb": safe_int(sync.get("max_rss_kb"), 0), + "max_hwm_kb": safe_int(sync.get("max_hwm_kb"), 0), + "end_app_bytes": safe_int(sync.get("end_app_bytes"), pre_app_bytes), + "end_data_bytes": safe_int(sync.get("end_data_bytes"), 0), + "end_home_bytes": safe_int(sync.get("end_home_bytes"), 0), + }, + "rewrite": { + "attempted": rewrite_attempted == 1, + "seconds": t_rw, + "exit_code": rewrite_rc, + }, + "sizes": { + "sync_app_bytes": pre_app_bytes, + "sync_wal_bytes": pre_wal_bytes, + "post_app_bytes": post_app_bytes, + "post_wal_bytes": post_wal_bytes, + }, + "metrics": { + "t_sync_seconds": t_sync, + "t_rewrite_seconds": t_rw, + "t_total_seconds": t_total, + "s_sync_app_bytes": pre_app_bytes, + "s_sync_wal_bytes": pre_wal_bytes, + "s_post_app_bytes": post_app_bytes, + "s_post_wal_bytes": post_wal_bytes, + "max_rss_kb": safe_int(sync.get("max_rss_kb"), 0), + }, + "maintenance_summary": maintenance, +} +out_path.write_text(json.dumps(result, indent=2, sort_keys=True), encoding="utf-8") +print(out_path) +PY + + echo "run_id=$run_id run_home=$run_home json=$run_json" +} + +aggregate_and_decide() { + local decision_json="$OUT/decision.json" + python3 - "$OUT" "$SIZE_TOLERANCE_BYTES" "$TIME_TOLERANCE_SECONDS" "$MIN_PAIRS" "$CLEAR_WIN_PAIRS" "$CLEAR_LOSS_PAIRS" "$MAX_PAIRS" "$STOP_ON_CLEAR" "$decision_json" <<'PY' +import csv +import json +import sys +from pathlib import Path + +out = Path(sys.argv[1]) +size_tol = int(sys.argv[2]) +time_tol = int(sys.argv[3]) +min_pairs = int(sys.argv[4]) +clear_win_pairs = int(sys.argv[5]) +clear_loss_pairs = int(sys.argv[6]) +max_pairs = 
int(sys.argv[7]) +stop_on_clear = sys.argv[8] == "1" +decision_path = Path(sys.argv[9]) + +run_files = sorted(out.glob("runs/*/run.json")) +runs = [] +for p in run_files: + try: + runs.append(json.loads(p.read_text(encoding="utf-8"))) + except Exception: + continue + +runs.sort(key=lambda r: (int(r.get("pair_index", 0)), str(r.get("variant", "")))) + +runs_csv = out / "runs.csv" +with runs_csv.open("w", newline="", encoding="utf-8") as fh: + w = csv.writer(fh) + w.writerow([ + "pair_index", + "variant", + "run_home", + "t_sync_seconds", + "t_rewrite_seconds", + "t_total_seconds", + "s_sync_app_bytes", + "s_sync_wal_bytes", + "s_post_app_bytes", + "s_post_wal_bytes", + "max_rss_kb", + "rewrite_exit_code", + "rewrite_runs", + "gc_runs", + "observed_gc_retry_queued", + "observed_gc_retry_dropped", + ]) + for r in runs: + m = r.get("metrics", {}) or {} + s = r.get("sizes", {}) or {} + rw = r.get("rewrite", {}) or {} + summary = r.get("maintenance_summary", {}) or {} + w.writerow([ + int(r.get("pair_index", 0)), + str(r.get("variant", "")), + str(r.get("run_home", "")), + m.get("t_sync_seconds"), + m.get("t_rewrite_seconds"), + m.get("t_total_seconds"), + s.get("sync_app_bytes"), + s.get("sync_wal_bytes"), + s.get("post_app_bytes"), + s.get("post_wal_bytes"), + m.get("max_rss_kb"), + rw.get("exit_code"), + summary.get("rewrite_runs", 0), + summary.get("gc_runs", 0), + summary.get("observed_gc_retry_queued", 0), + summary.get("observed_gc_retry_dropped", 0), + ]) + +by_pair: dict[int, dict[str, dict]] = {} +for r in runs: + pair = int(r.get("pair_index", 0)) + by_pair.setdefault(pair, {})[str(r.get("variant", ""))] = r + +pair_rows = [] +wins = 0 +losses = 0 +for pair in sorted(by_pair): + row = by_pair[pair] + ctrl = row.get("control") + cand = row.get("candidate") + if not ctrl or not cand: + continue + cm = cand.get("metrics", {}) or {} + bm = ctrl.get("metrics", {}) or {} + cand_total = cm.get("t_total_seconds") + base_total = bm.get("t_total_seconds") + 
cand_post_wal = cm.get("s_post_wal_bytes") + base_post_wal = bm.get("s_post_wal_bytes") + cand_sync = cm.get("t_sync_seconds") + base_sync = bm.get("t_sync_seconds") + cand_sync_app = cm.get("s_sync_app_bytes") + base_sync_app = bm.get("s_sync_app_bytes") + + def delta(a, b): + if a is None or b is None: + return None + return a - b + + d_total = delta(cand_total, base_total) + d_sync = delta(cand_sync, base_sync) + d_post_wal = delta(cand_post_wal, base_post_wal) + d_sync_app = delta(cand_sync_app, base_sync_app) + + outcome = "neutral" + if d_post_wal is not None and d_total is not None: + win = (d_post_wal <= -size_tol) and (d_total <= time_tol) + loss = (d_post_wal >= size_tol) and (d_total >= -time_tol) + if win and not loss: + outcome = "win" + wins += 1 + elif loss and not win: + outcome = "loss" + losses += 1 + + pair_rows.append({ + "pair_index": pair, + "delta_t_sync_seconds": d_sync, + "delta_t_total_seconds": d_total, + "delta_s_sync_app_bytes": d_sync_app, + "delta_s_post_wal_bytes": d_post_wal, + "outcome": outcome, + }) + +pairs_csv = out / "pairs.csv" +with pairs_csv.open("w", newline="", encoding="utf-8") as fh: + w = csv.writer(fh) + w.writerow([ + "pair_index", + "delta_t_sync_seconds", + "delta_t_total_seconds", + "delta_s_sync_app_bytes", + "delta_s_post_wal_bytes", + "outcome", + ]) + for r in pair_rows: + w.writerow([ + r["pair_index"], + r["delta_t_sync_seconds"], + r["delta_t_total_seconds"], + r["delta_s_sync_app_bytes"], + r["delta_s_post_wal_bytes"], + r["outcome"], + ]) + +completed_pairs = len(pair_rows) +reason = "continue" +stop = False +if completed_pairs >= max_pairs: + stop = True + reason = "max_pairs" +elif stop_on_clear and completed_pairs >= min_pairs: + if wins >= clear_win_pairs and wins > losses: + stop = True + reason = "clear_improvement" + elif losses >= clear_loss_pairs and losses > wins: + stop = True + reason = "clear_regression" + +summary_md = out / "summary.md" +lines = [] +lines.append("# run_celestia A/B 
summary") +lines.append("") +lines.append(f"- completed pairs: `{completed_pairs}`") +lines.append(f"- wins/losses/neutral: `{wins}` / `{losses}` / `{max(0, completed_pairs - wins - losses)}`") +lines.append(f"- size tolerance bytes: `{size_tol}`") +lines.append(f"- time tolerance seconds: `{time_tol}`") +lines.append(f"- decision: `{reason}`") +lines.append("") +lines.append("## Artifacts") +lines.append("") +lines.append(f"- runs csv: `{runs_csv}`") +lines.append(f"- pairs csv: `{pairs_csv}`") +lines.append(f"- per-run json: `{out / 'runs'}`") +if pair_rows: + last = pair_rows[-1] + lines.append("") + lines.append("## Last Pair") + lines.append("") + lines.append(f"- pair: `{last['pair_index']}` outcome=`{last['outcome']}`") + lines.append(f"- delta_t_sync_seconds: `{last['delta_t_sync_seconds']}`") + lines.append(f"- delta_t_total_seconds: `{last['delta_t_total_seconds']}`") + lines.append(f"- delta_s_sync_app_bytes: `{last['delta_s_sync_app_bytes']}`") + lines.append(f"- delta_s_post_wal_bytes: `{last['delta_s_post_wal_bytes']}`") +summary_md.write_text("\n".join(lines) + "\n", encoding="utf-8") + +payload = { + "completed_pairs": completed_pairs, + "wins": wins, + "losses": losses, + "neutral": max(0, completed_pairs - wins - losses), + "stop": stop, + "reason": reason, +} +decision_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") +print(json.dumps(payload, sort_keys=True)) +PY +} + +run_pair() { + local pair_index="$1" + if (( pair_index % 2 == 1 )); then + run_variant "$pair_index" "control" "$CONTROL_ENV_FILE" + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" + run_variant "$pair_index" "candidate" "$CANDIDATE_ENV_FILE" + else + run_variant "$pair_index" "candidate" "$CANDIDATE_ENV_FILE" + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" + run_variant "$pair_index" "control" "$CONTROL_ENV_FILE" + fi +} + +echo "output=$OUT" +echo "run_cmd=$RUN_CMD" + +decision_reason="continue" +for ((pair = 1; pair <= MAX_PAIRS; pair++)); do + echo "pair=$pair 
start" + run_pair "$pair" + aggregate_and_decide + decision_reason="$(python3 - "$OUT/decision.json" <<'PY' +import json +import sys +payload = json.loads(open(sys.argv[1], 'r', encoding='utf-8').read()) +print(payload.get('reason', 'continue')) +print('1' if payload.get('stop') else '0') +PY +)" + stop_flag="$(echo "$decision_reason" | tail -n 1)" + decision_reason="$(echo "$decision_reason" | head -n 1)" + echo "pair=$pair decision=$decision_reason" + if [[ "$stop_flag" == "1" ]]; then + break + fi + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" +done + +echo "completed decision=$decision_reason" +echo "summary=$OUT/summary.md" +echo "runs_csv=$OUT/runs.csv" +echo "pairs_csv=$OUT/pairs.csv" From 491559139ed94ed5ed9fae0a78643303fc2e1878 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 12:23:22 -1000 Subject: [PATCH 39/61] bench: export env file vars in run_celestia a/b harness --- scripts/run_celestia_ab.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/run_celestia_ab.sh b/scripts/run_celestia_ab.sh index 71ef9e3db..d238e63af 100755 --- a/scripts/run_celestia_ab.sh +++ b/scripts/run_celestia_ab.sh @@ -109,7 +109,9 @@ run_variant() { set -euo pipefail if [[ -n "$env_file" ]]; then # shellcheck source=/dev/null + set -a source "$env_file" + set +a fi bash -lc "$RUN_CMD" ) >"$run_dir/launcher.log" 2>&1 From 27e7fe34311c8be9a2246f2407ac01027499955a Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 12:26:58 -1000 Subject: [PATCH 40/61] bench: avoid login-shell startup in celestia a/b harness --- scripts/run_celestia_ab.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/run_celestia_ab.sh b/scripts/run_celestia_ab.sh index d238e63af..73b691921 100755 --- a/scripts/run_celestia_ab.sh +++ b/scripts/run_celestia_ab.sh @@ -113,7 +113,9 @@ run_variant() { source "$env_file" set +a fi - bash -lc "$RUN_CMD" + # Non-login shell avoids user profile side effects (e.g. 
tty-dependent exports) + # that can fail under nohup/background runs. + bash -c "$RUN_CMD" ) >"$run_dir/launcher.log" 2>&1 local run_end run_end=$(date +%s) From 1c216249fd20b9f578e7682b40fa33077f454e3c Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 14:51:36 -1000 Subject: [PATCH 41/61] caching: reduce retry gc churn in live maintenance --- TreeDB/caching/db.go | 65 ++++++++++++---- .../caching/vlog_generation_scheduler_test.go | 76 +++++++++++++------ 2 files changed, 105 insertions(+), 36 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 10ff71d72..139119a46 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5893,6 +5893,7 @@ type DB struct { vlogGenerationLastRewritePlanUnixNano atomic.Int64 vlogGenerationLastRewriteUnixNano atomic.Int64 vlogGenerationLastGCUnixNano atomic.Int64 + vlogGenerationLastGCNoopUnixNano atomic.Int64 vlogGenerationLastCheckpointKickUnixNano atomic.Int64 vlogGenerationLastGCDryRunUnixNano atomic.Int64 vlogGenerationLastGCDryRunBytesEligible atomic.Int64 @@ -6096,6 +6097,7 @@ const ( vlogGenerationGCMinBytes = int64(1 << 20) vlogGenerationRewriteMinInterval = 30 * time.Second vlogGenerationGCMinInterval = 45 * time.Second + vlogGenerationGCNoopMinInterval = 3 * time.Minute vlogGenerationCheckpointKickMinInterval = 5 * time.Second vlogGenerationCheckpointKickRetryWindow = 5 * time.Second vlogGenerationDeferredRetryWindow = 30 * time.Second @@ -12959,17 +12961,16 @@ func (db *DB) maybeRunPeriodicVlogGenerationMaintenance(runGC bool) bool { return false } // Coarse preflight: while foreground activity is hot, avoid entering the - // maintenance engine unless a deferred/checkpoint wake is pending. This - // prevents high-frequency periodic no-op acquisitions. 
- if !runGC { - now := time.Now() - quiet := db.foregroundActivityQuietFor(now, vlogGenerationMaintenanceQuietWindow, vlogForegroundReadQuietWindow) - if !quiet && - !db.vlogGenerationCheckpointKickPending.Load() && - !db.vlogGenerationDeferredMaintenancePending.Load() && - !db.vlogGenerationDeferredMaintenanceDue(now) { - return false - } + // maintenance engine unless a deferred/checkpoint wake is pending. Apply this + // to both rewrite and periodic GC ticks; otherwise runGC ticks can still + // issue expensive full scans every interval during restore-heavy sync phases. + now := time.Now() + quiet := db.foregroundActivityQuietFor(now, vlogGenerationMaintenanceQuietWindow, vlogForegroundReadQuietWindow) + if !quiet && + !db.vlogGenerationCheckpointKickPending.Load() && + !db.vlogGenerationDeferredMaintenancePending.Load() && + !db.vlogGenerationDeferredMaintenanceDue(now) { + return false } db.maybeRunVlogGenerationMaintenance(runGC) return true @@ -13900,7 +13901,7 @@ func (db *DB) scheduleDueVlogGenerationDeferredMaintenance() { } func (db *DB) runVlogGenerationCheckpointKickRetries(opts vlogGenerationMaintenanceOptions) { - db.runVlogGenerationMaintenanceRetries(opts, vlogGenerationCheckpointKickRetryWindow, false) + db.runVlogGenerationMaintenanceRetries(opts, vlogGenerationCheckpointKickRetryWindow, true) } func (db *DB) runVlogGenerationMaintenanceRetries(opts vlogGenerationMaintenanceOptions, retryWindow time.Duration, stopWhenAcquired bool) { @@ -13951,7 +13952,11 @@ func (db *DB) runVlogGenerationMaintenanceRetries(opts vlogGenerationMaintenance db.vlogGenerationMaintenanceActive.Load(), ) } - ran := db.maybeRunVlogGenerationMaintenanceWithOptions(true, opts) + // Retry-driven maintenance (checkpoint kick / deferred stage confirmation) + // prioritizes rewrite debt progress. Keep periodic/full-scan GC on the + // normal scheduler path to avoid introducing long full-scan stalls on hot + // checkpoint-triggered retries. 
+ ran := db.maybeRunVlogGenerationMaintenanceWithOptions(false, opts) if stopWhenAcquired && ran { if opts.debugSource != "" { db.debugVlogMaintf( @@ -15120,6 +15125,12 @@ planned: observedSourceGCIDs := db.takeVlogGenerationObservedSourceGCList() forceObservedSourceGC := len(observedSourceGCIDs) > 0 + if !runGC && opts.bypassQuiet && !forceObservedSourceGC { + // Checkpoint-kick/deferred retry passes are rewrite-priority. Do not run + // opportunistic GC here unless we are replaying observed-source IDs from + // a prior rewrite/GC cycle. + return + } if envBool(envDisableVlogGenerationGC) { db.debugVlogMaintf( "gc_skip reason=disabled_env run_gc=%t force_observed=%t observed_ids=%d", @@ -15170,7 +15181,9 @@ planned: } return } - needEligibilityEstimate := !runGC && !forceObservedSourceGC && !db.shouldRunVlogGenerationGC(retained, reclaimable, churnBps) + // Retry-driven checkpoint/deferred passes are rewrite-priority paths. Avoid + // issuing GC dry-run scans there; let periodic/manual GC decide eligibility. 
+ needEligibilityEstimate := !runGC && !opts.bypassQuiet && !forceObservedSourceGC && !db.shouldRunVlogGenerationGC(retained, reclaimable, churnBps) now = time.Now() lastGC := db.vlogGenerationLastGCUnixNano.Load() if lastGC > 0 { @@ -15187,6 +15200,21 @@ planned: return } } + if !forceObservedSourceGC { + lastNoop := db.vlogGenerationLastGCNoopUnixNano.Load() + if lastNoop > 0 { + lastNoopAt := time.Unix(0, lastNoop) + if now.Sub(lastNoopAt) < vlogGenerationGCNoopMinInterval { + db.debugVlogMaintf( + "gc_skip reason=noop_cooldown run_gc=%t since_ms=%.3f min_ms=%.3f", + runGC, + float64(now.Sub(lastNoopAt).Microseconds())/1000, + float64(vlogGenerationGCNoopMinInterval.Microseconds())/1000, + ) + return + } + } + } db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerRunning) db.vlogGenerationLastReason.Store(vlogGenerationReasonPeriodicGC) err = db.runWithBackendMaintenanceOptions(backendMaintenanceOptions{ @@ -15253,6 +15281,15 @@ planned: gcStats.ObservedSourceSegmentsProtectedRetained, ) db.observeVlogGenerationGCStats(gcStats) + if !forceObservedSourceGC && + gcStats.BytesDeleted == 0 && + gcStats.SegmentsDeleted == 0 && + gcStats.BytesEligible == 0 && + gcStats.SegmentsEligible == 0 { + db.vlogGenerationLastGCNoopUnixNano.Store(now.UnixNano()) + } else { + db.vlogGenerationLastGCNoopUnixNano.Store(0) + } if gcStats.BytesProtectedRetained > 0 && gcStats.BytesEligible == 0 && db.valueLogRetainedClosedBytes.Load() > 0 { // When GC classifies all reclaim blockers as retained-path protection, // trigger an eager retained prune pass to release stale lifecycle pins. 
diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 1ee40dd58..f25b8b4dd 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -4309,8 +4309,8 @@ func TestCheckpoint_KicksVlogGenerationRewriteDespiteRecentForegroundActivity(t if _, calls := recorder.recordedPlan(); calls != 1 { t.Fatalf("plan calls=%d want=1", calls) } - if got := db.checkpointRuns.Load(); got < 2 { - t.Fatalf("checkpoint runs=%d want >=2", got) + if got := db.checkpointRuns.Load(); got < 1 { + t.Fatalf("checkpoint runs=%d want >=1", got) } stats := db.Stats() if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "1" { @@ -4489,7 +4489,7 @@ func TestCheckpoint_KickSelfDrainsMaintenanceCollision(t *testing.T) { <-release } -func TestCheckpoint_KicksVlogGenerationGCDespiteRecentForegroundActivity(t *testing.T) { +func TestCheckpoint_KickDoesNotForceGCDuringRecentForegroundActivity(t *testing.T) { disableVlogGenerationLoop(t) t.Setenv(envDisableVlogGenerationRewrite, "1") @@ -4536,31 +4536,19 @@ func TestCheckpoint_KicksVlogGenerationGCDespiteRecentForegroundActivity(t *test t.Fatalf("checkpoint: %v", err) } - deadline := time.Now().Add(2 * schedulerTestWait(t)) - for { - _, realCalls, _ := recorder.recordedCalls() - if realCalls == 1 { - break - } - if time.Now().After(deadline) { - dryCalls, realCalls, _ := recorder.recordedCalls() - t.Fatalf("checkpoint kick did not run gc in time: dryCalls=%d realCalls=%d", dryCalls, realCalls) - } - time.Sleep(10 * time.Millisecond) + time.Sleep(150 * time.Millisecond) + if dryCalls, realCalls, _ := recorder.recordedCalls(); dryCalls != 0 || realCalls != 0 { + t.Fatalf("gc calls dry=%d real=%d want dry=0 real=0", dryCalls, realCalls) } - - if dryCalls, realCalls, _ := recorder.recordedCalls(); dryCalls != 0 || realCalls != 1 { - t.Fatalf("gc calls dry=%d real=%d want dry=0 real=1", dryCalls, realCalls) - } - 
if got := db.checkpointRuns.Load(); got < 2 { - t.Fatalf("checkpoint runs=%d want >=2", got) + if got := db.checkpointRuns.Load(); got != 1 { + t.Fatalf("checkpoint runs=%d want 1", got) } stats := db.Stats() if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "1" { t.Fatalf("checkpoint kick runs=%q want 1", got) } - if got := stats["treedb.cache.vlog_generation.checkpoint_kick.gc_runs"]; got != "1" { - t.Fatalf("checkpoint kick gc runs=%q want 1", got) + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.gc_runs"]; got != "0" { + t.Fatalf("checkpoint kick gc runs=%q want 0", got) } if got := stats["treedb.cache.vlog_generation.checkpoint_kick.active"]; got != "false" { t.Fatalf("checkpoint kick active=%q want false", got) @@ -4597,6 +4585,50 @@ func TestVlogGenerationMaintenance_PeriodicGCSkipsWhileRewriteAgeBlocked(t *test } } +func TestVlogGenerationMaintenance_PeriodicGCNoopCooldown(t *testing.T) { + prepareDirectSchedulerTest(t) + t.Setenv(envDisableVlogGenerationRewrite, "1") + + dir := t.TempDir() + + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + gcResponse: backenddb.ValueLogGCStats{}, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + skipRetainedPrune(db) + forceVlogMaintenanceIdle(db) + + quietSince := time.Now().Add(-2 * vlogGenerationMaintenanceQuietWindow).UnixNano() + db.lastForegroundWriteUnixNano.Store(quietSince) + db.lastForegroundReadUnixNano.Store(quietSince) + db.activeForegroundIterators.Store(0) + + db.maybeRunVlogGenerationMaintenance(true) + + if _, calls := recorder.recordedGC(); calls != 1 { + t.Fatalf("first periodic GC calls=%d want=1", calls) + } + if got := db.vlogGenerationLastGCNoopUnixNano.Load(); got <= 0 { + t.Fatalf("last GC noop unix nano=%d want >0 after zero-eligibility pass", got) + } + + // Bypass the normal 
min-interval gate; noop cooldown should still suppress. + db.vlogGenerationLastGCUnixNano.Store(time.Now().Add(-2 * vlogGenerationGCMinInterval).UnixNano()) + forceVlogMaintenanceIdle(db) + db.maybeRunVlogGenerationMaintenance(true) + + if _, calls := recorder.recordedGC(); calls != 1 { + t.Fatalf("periodic GC should skip under noop cooldown; calls=%d want=1", calls) + } +} + func TestVlogGenerationMaintenance_PeriodicGCSkipsInWALOnMode(t *testing.T) { prepareDirectSchedulerTest(t) From 70eb677594bb72f5cabf2f8ccea8f83e075db4c7 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 15:54:42 -1000 Subject: [PATCH 42/61] bench: add celestia fast-gate loop and low-signal stop rules --- docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md | 118 +++ scripts/celestia_fast_gate.sh | 780 +++++++++++++++++++ scripts/run_celestia_ab.sh | 46 +- 3 files changed, 936 insertions(+), 8 deletions(-) create mode 100644 docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md create mode 100755 scripts/celestia_fast_gate.sh diff --git a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md new file mode 100644 index 000000000..25ba010c0 --- /dev/null +++ b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md @@ -0,0 +1,118 @@ +# Celestia Compression Iteration Loop + +This loop exists to avoid slow, low-signal experimentation. + +Primary objective: +- Reduce on-disk `application.db` bytes. + +Secondary objectives: +- Keep combined wall time (`sync + rewrite`) bounded. +- Avoid memory regressions (`max_rss`). +- Keep gzip as a sanity check, not the primary objective. + +## Stage 0: Hypothesis Contract (Required) + +Before running anything expensive, define: +- hypothesis: what changed and why it should help +- expected effect size: minimum size delta worth promoting +- time budget: max acceptable wall-time regression +- rollback condition: what result means we stop and redesign + +If expected effect size is below threshold, do not run full `run_celestia` yet. 
+ +## Stage 1: Fast Gate (Default Iteration Loop) + +Use `scripts/celestia_fast_gate.sh` for fast interleaved control/candidate A/B. + +What it measures per run: +- pre-rewrite size: `sync_app`, `sync_wal`, optional `sync_gzip` +- post-rewrite size: `post_app`, `post_wal`, optional `post_gzip` +- timing: benchmark duration + rewrite duration + total +- throughput: batch-write ops/sec from unified-bench output + +Defaults chosen for celestia-like pressure: +- `-profile fast` +- `-val-pattern celestia_height_prefix_fill` +- dict compression enabled +- dict defaults passed explicitly: + - `-treedb-vlog-dict-train-bytes=1048576` + - `-treedb-vlog-dict-dict-bytes=32768` + +Fast-gate anti-loop safeguards: +- interleaved order alternates each pair (bias reduction) +- early clear stop (improvement/regression) +- futility stop when remaining pairs cannot reach a clear decision +- low-signal stop on neutral-streak threshold +- per-run process review artifact (`process_review.md`) + +Example: + +```bash +MAX_PAIRS=6 \ +MIN_PAIRS=3 \ +CLEAR_WIN_PAIRS=2 \ +CLEAR_LOSS_PAIRS=2 \ +LOW_SIGNAL_MIN_PAIRS=3 \ +LOW_SIGNAL_NEUTRAL_STREAK=3 \ +SIZE_FIELD=s_post_app_bytes \ +SIZE_TOLERANCE_BYTES=$((64<<20)) \ +TIME_TOLERANCE_SECONDS=30 \ +./scripts/celestia_fast_gate.sh +``` + +Outputs: +- `summary.md` +- `process_review.md` +- `runs.csv` +- `pairs.csv` +- per-run `run.json` + +## Stage 2: Pprof/Implementation Efficiency Pass + +Run this stage before full `run_celestia` if fast gate shows: +- promising size gains with time regression, or +- ambiguous neutral outcomes near threshold. + +Goal: +- remove avoidable implementation overhead (copying/alloc/lock contention) +- preserve size gains while pulling time back inside budget + +## Stage 3: Full `run_celestia` A/B Confirmation + +Only promote candidates that pass Stage 1 and Stage 2. + +Use `scripts/run_celestia_ab.sh` with interleaved pairs and stop rules. 
+ +Now includes anti-loop safeguards: +- clear stop (improvement/regression) +- futility stop (`futile_remaining_pairs`) +- low-signal neutral-streak stop (`low_signal_neutral_streak`) + +Example: + +```bash +MAX_PAIRS=4 \ +MIN_PAIRS=3 \ +CLEAR_WIN_PAIRS=2 \ +CLEAR_LOSS_PAIRS=2 \ +LOW_SIGNAL_MIN_PAIRS=3 \ +LOW_SIGNAL_NEUTRAL_STREAK=3 \ +REWRITE_ENABLED=1 \ +./scripts/run_celestia_ab.sh +``` + +## Process Review Cadence + +Review and revise the loop after every decision event: +- `clear_improvement` +- `clear_regression` +- `futile_remaining_pairs` +- `low_signal_neutral_streak` + +Required review questions: +- Was the fast gate predictive of full-run direction? +- Were thresholds too strict or too loose for current goals? +- Did we spend time validating changes below meaningful effect size? +- Is the next candidate large enough to justify promotion? + +If two consecutive campaigns end in low-signal/futility, tighten promotion gates and bundle larger candidate deltas before next full run. diff --git a/scripts/celestia_fast_gate.sh b/scripts/celestia_fast_gate.sh new file mode 100755 index 000000000..f79292e0f --- /dev/null +++ b/scripts/celestia_fast_gate.sh @@ -0,0 +1,780 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd) +cd "$ROOT" + +BASELINE_HASH="${BASELINE_HASH:-origin/main}" +CANDIDATE_UNIFIED_BIN="${CANDIDATE_UNIFIED_BIN:-}" +CANDIDATE_TREEMAP_BIN="${CANDIDATE_TREEMAP_BIN:-}" +BASELINE_UNIFIED_BIN="${BASELINE_UNIFIED_BIN:-}" +BASELINE_TREEMAP_BIN="${BASELINE_TREEMAP_BIN:-}" +SCRIPT_GOWORK="${SCRIPT_GOWORK:-off}" + +MAX_PAIRS="${MAX_PAIRS:-6}" +MIN_PAIRS="${MIN_PAIRS:-3}" +CLEAR_WIN_PAIRS="${CLEAR_WIN_PAIRS:-2}" +CLEAR_LOSS_PAIRS="${CLEAR_LOSS_PAIRS:-2}" +STOP_ON_CLEAR="${STOP_ON_CLEAR:-1}" +LOW_SIGNAL_MIN_PAIRS="${LOW_SIGNAL_MIN_PAIRS:-3}" +LOW_SIGNAL_NEUTRAL_STREAK="${LOW_SIGNAL_NEUTRAL_STREAK:-3}" +SLEEP_BETWEEN_RUNS_SECONDS="${SLEEP_BETWEEN_RUNS_SECONDS:-2}" + +SIZE_FIELD="${SIZE_FIELD:-s_post_app_bytes}" +SIZE_TOLERANCE_BYTES="${SIZE_TOLERANCE_BYTES:-67108864}" +TIME_TOLERANCE_SECONDS="${TIME_TOLERANCE_SECONDS:-30}" + +PROFILE="${PROFILE:-fast}" +DBS="${DBS:-treedb}" +TESTS="${TESTS:-batch_write}" +KEYS="${KEYS:-500000}" +VALSIZE="${VALSIZE:-128}" +BATCHSIZE="${BATCHSIZE:-8000}" +VAL_PATTERN="${VAL_PATTERN:-celestia_height_prefix_fill}" +SEED="${SEED:-1}" + +FORCE_VALUE_POINTERS="${FORCE_VALUE_POINTERS:-true}" +OUTER_LEAVES_IN_VLOG="${OUTER_LEAVES_IN_VLOG:-true}" +VLOG_COMPRESSION="${VLOG_COMPRESSION:-dict}" +VLOG_COMPRESSION_AUTOTUNE="${VLOG_COMPRESSION_AUTOTUNE:-aggressive}" +VLOG_COMPRESSION_VARIANT="${VLOG_COMPRESSION_VARIANT:-dict}" +DICT_TRAIN_BYTES="${DICT_TRAIN_BYTES:-1048576}" +DICT_BYTES="${DICT_BYTES:-32768}" + +REWRITE_ENABLED="${REWRITE_ENABLED:-1}" +REWRITE_ARGS="${REWRITE_ARGS:--rw}" +MEASURE_GZIP="${MEASURE_GZIP:-1}" +KEEP_DB_DIRS="${KEEP_DB_DIRS:-1}" + +COMMON_EXTRA_FLAGS="${COMMON_EXTRA_FLAGS:-}" +CONTROL_EXTRA_FLAGS="${CONTROL_EXTRA_FLAGS:-}" +CANDIDATE_EXTRA_FLAGS="${CANDIDATE_EXTRA_FLAGS:-}" + +TS="$(date +%Y%m%d%H%M%S)" +OUT="${OUT_DIR:-$ROOT/artifacts/celestia_fast_gate/$TS}" + +WORKTREE_PATH="" + +require_cmd() { + if ! 
command -v "$1" >/dev/null 2>&1; then + echo "missing required command: $1" >&2 + exit 2 + fi +} + +du_bytes() { + local target="$1" + if [[ ! -e "$target" ]]; then + echo 0 + return 0 + fi + if du -sb "$target" >/dev/null 2>&1; then + du -sb "$target" 2>/dev/null | awk '{print $1}' + return 0 + fi + du -sk "$target" 2>/dev/null | awk '{print $1 * 1024}' +} + +gzip_dir_bytes() { + local target="$1" + if [[ "$MEASURE_GZIP" != "1" ]]; then + echo 0 + return 0 + fi + if [[ ! -d "$target" ]]; then + echo 0 + return 0 + fi + tar -C "$target" -cf - . 2>/dev/null | gzip -1 -c | wc -c | tr -d '[:space:]' +} + +cleanup() { + if [[ -n "$WORKTREE_PATH" && -d "$WORKTREE_PATH" ]]; then + git worktree remove --force "$WORKTREE_PATH" >/dev/null 2>&1 || true + fi +} +trap cleanup EXIT + +parse_bench_log() { + local log_path="$1" + python3 - "$log_path" <<'PY' +import re +import sys +from pathlib import Path + +path = Path(sys.argv[1]) +text = path.read_text(encoding="utf-8", errors="replace") +lines = text.splitlines() + +throughput = None +for line in lines: + m = re.search(r"Batch Write\s*/\s*TreeDB[^=]*=\s*([0-9][0-9,]*(?:\.[0-9]+)?)", line) + if m: + throughput = float(m.group(1).replace(",", "")) + break +if throughput is None: + for line in lines: + m = re.match(r"\s*Batch Write\s+([0-9][0-9,]*(?:\.[0-9]+)?)\s*$", line) + if m: + throughput = float(m.group(1).replace(",", "")) + break + +keep_dir = "" +in_keep_block = False +for line in lines: + stripped = line.strip() + if stripped == "Kept Data Directories": + in_keep_block = True + continue + if in_keep_block: + if not stripped: + continue + if stripped.startswith("TreeDB (") and ":" in stripped: + maybe = stripped.split(":", 1)[1].strip() + if maybe.startswith("/"): + keep_dir = maybe + break + +if not keep_dir: + m = re.search(r"TreeDB \([^\)]*\):\s+(/tmp/bench[^\s]+)", text) + if m: + keep_dir = m.group(1) + +if not keep_dir: + raise SystemExit("unable to locate kept data directory in unified-bench output") + +if 
throughput is None: + throughput = 0.0 + +print(f"{keep_dir}\t{throughput}") +PY +} + +setup_bins() { + mkdir -p "$OUT/bin" "$OUT/worktrees" "$OUT/runs" + + if [[ -z "$CANDIDATE_UNIFIED_BIN" ]]; then + CANDIDATE_UNIFIED_BIN="$OUT/bin/unified-bench-candidate" + GOWORK="$SCRIPT_GOWORK" go build -o "$CANDIDATE_UNIFIED_BIN" ./cmd/unified_bench + fi + if [[ ! -x "$CANDIDATE_UNIFIED_BIN" ]]; then + echo "candidate unified-bench binary not executable: $CANDIDATE_UNIFIED_BIN" >&2 + exit 2 + fi + + if [[ -z "$CANDIDATE_TREEMAP_BIN" ]]; then + CANDIDATE_TREEMAP_BIN="$OUT/bin/treemap-candidate" + GOWORK="$SCRIPT_GOWORK" go build -o "$CANDIDATE_TREEMAP_BIN" ./TreeDB/cmd/treemap + fi + if [[ ! -x "$CANDIDATE_TREEMAP_BIN" ]]; then + echo "candidate treemap binary not executable: $CANDIDATE_TREEMAP_BIN" >&2 + exit 2 + fi + + if [[ -n "$BASELINE_UNIFIED_BIN" && -n "$BASELINE_TREEMAP_BIN" ]]; then + if [[ ! -x "$BASELINE_UNIFIED_BIN" ]]; then + echo "baseline unified-bench binary not executable: $BASELINE_UNIFIED_BIN" >&2 + exit 2 + fi + if [[ ! -x "$BASELINE_TREEMAP_BIN" ]]; then + echo "baseline treemap binary not executable: $BASELINE_TREEMAP_BIN" >&2 + exit 2 + fi + return 0 + fi + + if ! git cat-file -e "${BASELINE_HASH}^{commit}" >/dev/null 2>&1; then + git fetch --no-tags --depth=1 origin "$BASELINE_HASH" >/dev/null 2>&1 || git fetch --no-tags origin "$BASELINE_HASH" >/dev/null 2>&1 + fi + + WORKTREE_PATH="$OUT/worktrees/baseline" + git worktree add --detach "$WORKTREE_PATH" "$BASELINE_HASH" >/dev/null + + if [[ -z "$BASELINE_UNIFIED_BIN" ]]; then + BASELINE_UNIFIED_BIN="$OUT/bin/unified-bench-baseline" + ( + cd "$WORKTREE_PATH" + GOWORK="$SCRIPT_GOWORK" go build -o "$BASELINE_UNIFIED_BIN" ./cmd/unified_bench + ) + fi + if [[ -z "$BASELINE_TREEMAP_BIN" ]]; then + BASELINE_TREEMAP_BIN="$OUT/bin/treemap-baseline" + ( + cd "$WORKTREE_PATH" + GOWORK="$SCRIPT_GOWORK" go build -o "$BASELINE_TREEMAP_BIN" ./TreeDB/cmd/treemap + ) + fi + + if [[ ! 
-x "$BASELINE_UNIFIED_BIN" ]]; then + echo "baseline unified-bench binary not executable: $BASELINE_UNIFIED_BIN" >&2 + exit 2 + fi + if [[ ! -x "$BASELINE_TREEMAP_BIN" ]]; then + echo "baseline treemap binary not executable: $BASELINE_TREEMAP_BIN" >&2 + exit 2 + fi +} + +run_variant() { + local pair_index="$1" + local variant="$2" + + local bench_bin treemap_bin extra_flags + if [[ "$variant" == "candidate" ]]; then + bench_bin="$CANDIDATE_UNIFIED_BIN" + treemap_bin="$CANDIDATE_TREEMAP_BIN" + extra_flags="$CANDIDATE_EXTRA_FLAGS" + else + bench_bin="$BASELINE_UNIFIED_BIN" + treemap_bin="$BASELINE_TREEMAP_BIN" + extra_flags="$CONTROL_EXTRA_FLAGS" + fi + + local run_id + run_id=$(printf "%02d_%s" "$pair_index" "$variant") + local run_dir="$OUT/runs/$run_id" + mkdir -p "$run_dir" + + local cmd=( + "$bench_bin" + -profile "$PROFILE" + -dbs "$DBS" + -keys "$KEYS" + -valsize "$VALSIZE" + -batchsize "$BATCHSIZE" + -test "$TESTS" + -val-pattern "$VAL_PATTERN" + -seed "$SEED" + -progress=false + -keep + -treedb-force-value-pointers="$FORCE_VALUE_POINTERS" + -treedb-index-outer-leaves-in-vlog="$OUTER_LEAVES_IN_VLOG" + -treedb-vlog-compression "$VLOG_COMPRESSION" + -treedb-vlog-compression-autotune "$VLOG_COMPRESSION_AUTOTUNE" + -treedb-vlog-compression-variant "$VLOG_COMPRESSION_VARIANT" + -treedb-vlog-dict-train-bytes "$DICT_TRAIN_BYTES" + -treedb-vlog-dict-dict-bytes "$DICT_BYTES" + ) + + if [[ -n "$COMMON_EXTRA_FLAGS" ]]; then + # shellcheck disable=SC2206 + local common_extra=( $COMMON_EXTRA_FLAGS ) + cmd+=("${common_extra[@]}") + fi + if [[ -n "$extra_flags" ]]; then + # shellcheck disable=SC2206 + local variant_extra=( $extra_flags ) + cmd+=("${variant_extra[@]}") + fi + + printf '%q ' "${cmd[@]}" >"$run_dir/cmd.txt" + echo >>"$run_dir/cmd.txt" + + local bench_log="$run_dir/unified.log" + local run_start run_end + run_start=$(date +%s) + "${cmd[@]}" >"$bench_log" 2>&1 + run_end=$(date +%s) + + local parse_out keep_dir batch_write_ops + parse_out="$(parse_bench_log 
"$bench_log")" + keep_dir="${parse_out%%$'\t'*}" + batch_write_ops="${parse_out#*$'\t'}" + + if [[ -z "$keep_dir" || ! -d "$keep_dir" ]]; then + echo "missing kept dir for $run_id (parsed=$keep_dir)" >&2 + exit 1 + fi + + local sync_app_bytes sync_wal_bytes sync_gzip_bytes + sync_app_bytes="$(du_bytes "$keep_dir")" + sync_wal_bytes="$(du_bytes "$keep_dir/maindb/wal")" + sync_gzip_bytes="$(gzip_dir_bytes "$keep_dir")" + + local rewrite_attempted=0 + local rewrite_seconds=0 + local rewrite_rc=0 + local rewrite_log="$run_dir/rewrite.log" + if [[ "$REWRITE_ENABLED" == "1" ]]; then + rewrite_attempted=1 + local rw_start rw_end + rw_start=$(date +%s) + # shellcheck disable=SC2206 + local rw_args=( $REWRITE_ARGS ) + set +e + "$treemap_bin" vlog-rewrite "$keep_dir" "${rw_args[@]}" >"$rewrite_log" 2>&1 + rewrite_rc=$? + set -e + rw_end=$(date +%s) + rewrite_seconds=$((rw_end - rw_start)) + fi + + local post_app_bytes post_wal_bytes post_gzip_bytes + post_app_bytes="$(du_bytes "$keep_dir")" + post_wal_bytes="$(du_bytes "$keep_dir/maindb/wal")" + post_gzip_bytes="$(gzip_dir_bytes "$keep_dir")" + + local run_json="$run_dir/run.json" + python3 - "$run_json" "$pair_index" "$variant" "$run_start" "$run_end" "$rewrite_attempted" "$rewrite_seconds" "$rewrite_rc" "$batch_write_ops" "$keep_dir" "$sync_app_bytes" "$sync_wal_bytes" "$sync_gzip_bytes" "$post_app_bytes" "$post_wal_bytes" "$post_gzip_bytes" <<'PY' +import json +import sys +from pathlib import Path + +out_path = Path(sys.argv[1]) +pair_index = int(sys.argv[2]) +variant = sys.argv[3] +run_start = int(sys.argv[4]) +run_end = int(sys.argv[5]) +rewrite_attempted = int(sys.argv[6]) +rewrite_seconds = int(sys.argv[7]) +rewrite_rc = int(sys.argv[8]) +batch_write_ops = float(sys.argv[9]) +keep_dir = sys.argv[10] +s_sync_app = int(sys.argv[11]) +s_sync_wal = int(sys.argv[12]) +s_sync_gzip = int(sys.argv[13]) +s_post_app = int(sys.argv[14]) +s_post_wal = int(sys.argv[15]) +s_post_gzip = int(sys.argv[16]) + +t_sync = max(0, run_end - 
run_start) +t_rewrite = rewrite_seconds if rewrite_attempted == 1 else 0 +if rewrite_attempted == 1 and rewrite_rc != 0: + t_total = None +else: + t_total = t_sync + t_rewrite + +payload = { + "pair_index": pair_index, + "variant": variant, + "keep_dir": keep_dir, + "bench": { + "duration_seconds": t_sync, + "batch_write_ops_per_sec": batch_write_ops, + }, + "rewrite": { + "attempted": rewrite_attempted == 1, + "seconds": t_rewrite, + "exit_code": rewrite_rc, + }, + "sizes": { + "sync_app_bytes": s_sync_app, + "sync_wal_bytes": s_sync_wal, + "sync_gzip_bytes": s_sync_gzip, + "post_app_bytes": s_post_app, + "post_wal_bytes": s_post_wal, + "post_gzip_bytes": s_post_gzip, + }, + "metrics": { + "t_sync_seconds": t_sync, + "t_rewrite_seconds": t_rewrite, + "t_total_seconds": t_total, + "batch_write_ops_per_sec": batch_write_ops, + "s_sync_app_bytes": s_sync_app, + "s_sync_wal_bytes": s_sync_wal, + "s_sync_gzip_bytes": s_sync_gzip, + "s_post_app_bytes": s_post_app, + "s_post_wal_bytes": s_post_wal, + "s_post_gzip_bytes": s_post_gzip, + }, +} +out_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") +print(out_path) +PY + + if [[ "$KEEP_DB_DIRS" != "1" ]]; then + rm -rf "$keep_dir" + fi + + echo "run_id=$run_id keep_dir=$keep_dir json=$run_json" +} + +aggregate_and_decide() { + local decision_json="$OUT/decision.json" + python3 - "$OUT" "$SIZE_FIELD" "$SIZE_TOLERANCE_BYTES" "$TIME_TOLERANCE_SECONDS" "$MIN_PAIRS" "$CLEAR_WIN_PAIRS" "$CLEAR_LOSS_PAIRS" "$MAX_PAIRS" "$STOP_ON_CLEAR" "$LOW_SIGNAL_MIN_PAIRS" "$LOW_SIGNAL_NEUTRAL_STREAK" "$decision_json" <<'PY' +import csv +import json +import statistics +import sys +from pathlib import Path + +out = Path(sys.argv[1]) +size_field = sys.argv[2] +size_tol = int(sys.argv[3]) +time_tol = int(sys.argv[4]) +min_pairs = int(sys.argv[5]) +clear_win_pairs = int(sys.argv[6]) +clear_loss_pairs = int(sys.argv[7]) +max_pairs = int(sys.argv[8]) +stop_on_clear = sys.argv[9] == "1" +low_signal_min_pairs = 
int(sys.argv[10]) +low_signal_neutral_streak = int(sys.argv[11]) +decision_path = Path(sys.argv[12]) + +run_files = sorted(out.glob("runs/*/run.json")) +runs = [] +for p in run_files: + try: + runs.append(json.loads(p.read_text(encoding="utf-8"))) + except Exception: + continue +runs.sort(key=lambda r: (int(r.get("pair_index", 0)), str(r.get("variant", "")))) + +runs_csv = out / "runs.csv" +with runs_csv.open("w", newline="", encoding="utf-8") as fh: + w = csv.writer(fh) + w.writerow([ + "pair_index", + "variant", + "keep_dir", + "t_sync_seconds", + "t_rewrite_seconds", + "t_total_seconds", + "batch_write_ops_per_sec", + "s_sync_app_bytes", + "s_sync_wal_bytes", + "s_sync_gzip_bytes", + "s_post_app_bytes", + "s_post_wal_bytes", + "s_post_gzip_bytes", + "rewrite_exit_code", + ]) + for r in runs: + m = r.get("metrics", {}) or {} + rw = r.get("rewrite", {}) or {} + w.writerow([ + int(r.get("pair_index", 0)), + str(r.get("variant", "")), + str(r.get("keep_dir", "")), + m.get("t_sync_seconds"), + m.get("t_rewrite_seconds"), + m.get("t_total_seconds"), + m.get("batch_write_ops_per_sec"), + m.get("s_sync_app_bytes"), + m.get("s_sync_wal_bytes"), + m.get("s_sync_gzip_bytes"), + m.get("s_post_app_bytes"), + m.get("s_post_wal_bytes"), + m.get("s_post_gzip_bytes"), + rw.get("exit_code"), + ]) + +by_pair = {} +for r in runs: + pair = int(r.get("pair_index", 0)) + by_pair.setdefault(pair, {})[str(r.get("variant", ""))] = r + +def delta(a, b): + if a is None or b is None: + return None + try: + return a - b + except Exception: + return None + +pair_rows = [] +wins = 0 +losses = 0 +for pair in sorted(by_pair): + row = by_pair[pair] + ctrl = row.get("control") + cand = row.get("candidate") + if not ctrl or not cand: + continue + + cm = cand.get("metrics", {}) or {} + bm = ctrl.get("metrics", {}) or {} + + d_sync = delta(cm.get("t_sync_seconds"), bm.get("t_sync_seconds")) + d_total = delta(cm.get("t_total_seconds"), bm.get("t_total_seconds")) + d_bw = 
delta(cm.get("batch_write_ops_per_sec"), bm.get("batch_write_ops_per_sec")) + + d_sync_app = delta(cm.get("s_sync_app_bytes"), bm.get("s_sync_app_bytes")) + d_sync_wal = delta(cm.get("s_sync_wal_bytes"), bm.get("s_sync_wal_bytes")) + d_sync_gzip = delta(cm.get("s_sync_gzip_bytes"), bm.get("s_sync_gzip_bytes")) + d_post_app = delta(cm.get("s_post_app_bytes"), bm.get("s_post_app_bytes")) + d_post_wal = delta(cm.get("s_post_wal_bytes"), bm.get("s_post_wal_bytes")) + d_post_gzip = delta(cm.get("s_post_gzip_bytes"), bm.get("s_post_gzip_bytes")) + + d_size_primary = delta(cm.get(size_field), bm.get(size_field)) + + outcome = "neutral" + if d_size_primary is not None and d_total is not None: + win = (d_size_primary <= -size_tol) and (d_total <= time_tol) + loss = (d_size_primary >= size_tol) and (d_total >= -time_tol) + if win and not loss: + outcome = "win" + wins += 1 + elif loss and not win: + outcome = "loss" + losses += 1 + + pair_rows.append( + { + "pair_index": pair, + "delta_t_sync_seconds": d_sync, + "delta_t_total_seconds": d_total, + "delta_batch_write_ops_per_sec": d_bw, + "delta_s_sync_app_bytes": d_sync_app, + "delta_s_sync_wal_bytes": d_sync_wal, + "delta_s_sync_gzip_bytes": d_sync_gzip, + "delta_s_post_app_bytes": d_post_app, + "delta_s_post_wal_bytes": d_post_wal, + "delta_s_post_gzip_bytes": d_post_gzip, + "delta_size_primary_bytes": d_size_primary, + "outcome": outcome, + } + ) + +pairs_csv = out / "pairs.csv" +with pairs_csv.open("w", newline="", encoding="utf-8") as fh: + w = csv.writer(fh) + w.writerow( + [ + "pair_index", + "delta_t_sync_seconds", + "delta_t_total_seconds", + "delta_batch_write_ops_per_sec", + "delta_s_sync_app_bytes", + "delta_s_sync_wal_bytes", + "delta_s_sync_gzip_bytes", + "delta_s_post_app_bytes", + "delta_s_post_wal_bytes", + "delta_s_post_gzip_bytes", + "delta_size_primary_bytes", + "outcome", + ] + ) + for r in pair_rows: + w.writerow( + [ + r["pair_index"], + r["delta_t_sync_seconds"], + r["delta_t_total_seconds"], + 
r["delta_batch_write_ops_per_sec"], + r["delta_s_sync_app_bytes"], + r["delta_s_sync_wal_bytes"], + r["delta_s_sync_gzip_bytes"], + r["delta_s_post_app_bytes"], + r["delta_s_post_wal_bytes"], + r["delta_s_post_gzip_bytes"], + r["delta_size_primary_bytes"], + r["outcome"], + ] + ) + +completed_pairs = len(pair_rows) +neutral = max(0, completed_pairs - wins - losses) +neutral_streak = 0 +for row in reversed(pair_rows): + if row.get("outcome") == "neutral": + neutral_streak += 1 + continue + break + +reason = "continue" +stop = False +if stop_on_clear and completed_pairs >= min_pairs: + if wins >= clear_win_pairs and wins > losses: + stop = True + reason = "clear_improvement" + elif losses >= clear_loss_pairs and losses > wins: + stop = True + reason = "clear_regression" + else: + remaining = max(0, max_pairs - completed_pairs) + can_reach_clear_win = (wins + remaining) >= clear_win_pairs + can_reach_clear_loss = (losses + remaining) >= clear_loss_pairs + if not can_reach_clear_win and not can_reach_clear_loss: + stop = True + reason = "futile_remaining_pairs" + +if (not stop) and completed_pairs >= low_signal_min_pairs and neutral_streak >= low_signal_neutral_streak: + stop = True + reason = "low_signal_neutral_streak" + +if (not stop) and completed_pairs >= max_pairs: + stop = True + reason = "max_pairs" + +med_delta_size = None +med_delta_total = None +size_values = [r["delta_size_primary_bytes"] for r in pair_rows if r.get("delta_size_primary_bytes") is not None] +time_values = [r["delta_t_total_seconds"] for r in pair_rows if r.get("delta_t_total_seconds") is not None] +if size_values: + med_delta_size = statistics.median(size_values) +if time_values: + med_delta_total = statistics.median(time_values) + +summary_md = out / "summary.md" +lines = [] +lines.append("# celestia_fast_gate summary") +lines.append("") +lines.append(f"- completed pairs: `{completed_pairs}`") +lines.append(f"- wins/losses/neutral: `{wins}` / `{losses}` / `{neutral}`") +lines.append(f"- 
neutral streak (tail): `{neutral_streak}`") +lines.append(f"- size field: `{size_field}`") +lines.append(f"- size tolerance bytes: `{size_tol}`") +lines.append(f"- time tolerance seconds: `{time_tol}`") +lines.append(f"- low-signal min pairs: `{low_signal_min_pairs}`") +lines.append(f"- low-signal neutral streak: `{low_signal_neutral_streak}`") +lines.append(f"- median delta(size): `{med_delta_size}`") +lines.append(f"- median delta(time_total): `{med_delta_total}`") +lines.append(f"- decision: `{reason}`") +lines.append("") +lines.append("## Artifacts") +lines.append("") +lines.append(f"- runs csv: `{runs_csv}`") +lines.append(f"- pairs csv: `{pairs_csv}`") +lines.append(f"- per-run json: `{out / 'runs'}`") +summary_md.write_text("\n".join(lines) + "\n", encoding="utf-8") + +review_md = out / "process_review.md" +review = [] +review.append("# Fast Loop Review") +review.append("") +review.append("## Signal Check") +review.append("") +review.append(f"- completed_pairs={completed_pairs}") +review.append(f"- neutral_streak={neutral_streak}") +review.append(f"- reason={reason}") +if med_delta_size is not None: + review.append(f"- median_delta_size_bytes={int(med_delta_size)}") +if med_delta_total is not None: + review.append(f"- median_delta_time_seconds={int(med_delta_total)}") +review.append("") +review.append("## Suggested Next Action") +review.append("") +if reason in {"low_signal_neutral_streak", "futile_remaining_pairs"}: + review.append("- Stop long validation; this loop is currently low-signal for the configured tolerance.") + review.append("- Increase expected effect size (bundle larger code changes) or increase micro workload stress before re-running.") +elif reason == "clear_regression": + review.append("- Reject candidate as-is; run pprof on this fast gate to isolate removable overhead before retrying.") +elif reason == "clear_improvement": + review.append("- Promote candidate to run_celestia A/B confirmation.") +else: + review.append("- Continue collecting 
interleaved pairs until a clear outcome or low-signal stop triggers.") +review_md.write_text("\n".join(review) + "\n", encoding="utf-8") + +payload = { + "completed_pairs": completed_pairs, + "wins": wins, + "losses": losses, + "neutral": neutral, + "neutral_streak": neutral_streak, + "size_field": size_field, + "size_tolerance_bytes": size_tol, + "time_tolerance_seconds": time_tol, + "median_delta_size_bytes": med_delta_size, + "median_delta_time_seconds": med_delta_total, + "stop": stop, + "reason": reason, +} +decision_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") +print(json.dumps(payload, sort_keys=True)) +PY +} + +run_pair() { + local pair_index="$1" + if (( pair_index % 2 == 1 )); then + run_variant "$pair_index" "control" + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" + run_variant "$pair_index" "candidate" + else + run_variant "$pair_index" "candidate" + sleep "$SLEEP_BETWEEN_RUNS_SECONDS" + run_variant "$pair_index" "control" + fi +} + +require_cmd git +require_cmd go +require_cmd python3 +require_cmd tar +require_cmd gzip +require_cmd wc + +if (( MAX_PAIRS < 1 )); then + echo "MAX_PAIRS must be >= 1" >&2 + exit 2 +fi + +mkdir -p "$OUT" +setup_bins + +cat >"$OUT/meta.txt" <= max_pairs: - stop = True - reason = "max_pairs" -elif stop_on_clear and completed_pairs >= min_pairs: +if stop_on_clear and completed_pairs >= min_pairs: if wins >= clear_win_pairs and wins > losses: stop = True reason = "clear_improvement" elif losses >= clear_loss_pairs and losses > wins: stop = True reason = "clear_regression" + else: + remaining = max(0, max_pairs - completed_pairs) + can_reach_clear_win = (wins + remaining) >= clear_win_pairs + can_reach_clear_loss = (losses + remaining) >= clear_loss_pairs + if not can_reach_clear_win and not can_reach_clear_loss: + stop = True + reason = "futile_remaining_pairs" + +if (not stop) and completed_pairs >= low_signal_min_pairs and neutral_streak >= low_signal_neutral_streak: + stop = True + reason = 
"low_signal_neutral_streak" + +if (not stop) and completed_pairs >= max_pairs: + stop = True + reason = "max_pairs" summary_md = out / "summary.md" lines = [] lines.append("# run_celestia A/B summary") lines.append("") lines.append(f"- completed pairs: `{completed_pairs}`") -lines.append(f"- wins/losses/neutral: `{wins}` / `{losses}` / `{max(0, completed_pairs - wins - losses)}`") +lines.append(f"- wins/losses/neutral: `{wins}` / `{losses}` / `{neutral}`") +lines.append(f"- neutral streak (tail): `{neutral_streak}`") lines.append(f"- size tolerance bytes: `{size_tol}`") lines.append(f"- time tolerance seconds: `{time_tol}`") +lines.append(f"- low-signal min pairs: `{low_signal_min_pairs}`") +lines.append(f"- low-signal neutral streak: `{low_signal_neutral_streak}`") lines.append(f"- decision: `{reason}`") lines.append("") lines.append("## Artifacts") @@ -461,7 +490,8 @@ payload = { "completed_pairs": completed_pairs, "wins": wins, "losses": losses, - "neutral": max(0, completed_pairs - wins - losses), + "neutral": neutral, + "neutral_streak": neutral_streak, "stop": stop, "reason": reason, } From 55a41d1fd7039ebb03760ce5d02297d4830cbde6 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 17:41:39 -1000 Subject: [PATCH 43/61] treedb: add opt-in hot-debt-only checkpoint-kick gate --- TreeDB/caching/db.go | 47 ++++++-- .../caching/vlog_generation_scheduler_test.go | 104 ++++++++++++++++++ docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 5 + ...HECKPOINT_KICK_HOT_DEBT_ONLY_2026-03-28.md | 89 +++++++++++++++ scripts/analyze_vlog_maintenance_capacity.py | 8 ++ 5 files changed, 243 insertions(+), 10 deletions(-) create mode 100644 docs/benchmarks/CELESTIA_CHECKPOINT_KICK_HOT_DEBT_ONLY_2026-03-28.md diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 139119a46..1858c4810 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -1860,6 +1860,11 @@ const ( envDisableVlogGenerationVacuum = "TREEDB_DISABLE_VLOG_GENERATION_VACUUM" 
envDisableVlogGenerationLoop = "TREEDB_DISABLE_VLOG_GENERATION_LOOP" envDisableVlogGenerationCheckpointKick = "TREEDB_DISABLE_VLOG_GENERATION_CHECKPOINT_KICK" + // Experimental WAL-off checkpoint-kick guard: when enabled, avoid starting + // fresh rewrite planning during hot foreground activity. Queued rewrite debt + // (or deferred maintenance due) remains eligible so resumable progress is not + // starved. + envEnableVlogGenerationCheckpointKickHotDebtOnly = "TREEDB_ENABLE_VLOG_GENERATION_CHECKPOINT_KICK_HOT_DEBT_ONLY" // Experimental WAL-off override: allow rewrite planning/execution before the // first explicit checkpoint. Disabled by default because it can add restore // contention during early state-sync. @@ -5965,6 +5970,7 @@ type DB struct { vlogGenerationCheckpointKickRuns atomic.Uint64 vlogGenerationCheckpointKickRewriteRuns atomic.Uint64 vlogGenerationCheckpointKickGCRuns atomic.Uint64 + vlogGenerationCheckpointKickSkippedHotNoDebt atomic.Uint64 vlogGenerationCheckpointKickPending atomic.Bool vlogGenerationDeferredMaintenancePending atomic.Bool vlogGenerationDeferredMaintenanceRunning atomic.Bool @@ -15429,6 +15435,34 @@ func (db *DB) maybeKickVlogGenerationMaintenanceAfterCheckpoint() { return } now := time.Now() + rewriteDisabled := envBool(envDisableVlogGenerationRewrite) + rewriteQueueLen := 0 + if !rewriteDisabled { + rewriteQueue, qerr := db.currentVlogGenerationRewriteQueue() + if qerr != nil { + db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) + if db.notifyError != nil { + db.notifyError(fmt.Errorf("cachingdb: load generational rewrite queue for checkpoint kick: %w", qerr)) + } + return + } + rewriteQueueLen = len(rewriteQueue) + } + if envBool(envEnableVlogGenerationCheckpointKickHotDebtOnly) && !rewriteDisabled { + quiet := db.foregroundActivityQuietFor(now, vlogGenerationMaintenanceQuietWindow, vlogForegroundReadQuietWindow) + if !quiet && rewriteQueueLen == 0 && !db.vlogGenerationDeferredMaintenanceDue(now) { + 
db.vlogGenerationCheckpointKickSkippedHotNoDebt.Add(1) + db.debugVlogMaintf( + "checkpoint_kick_skip reason=foreground_hot_no_debt quiet=%t queue_len=%d checkpoint_pending=%t deferred_pending=%t deferred_due=%t", + quiet, + rewriteQueueLen, + db.vlogGenerationCheckpointKickPending.Load(), + db.vlogGenerationDeferredMaintenancePending.Load(), + db.vlogGenerationDeferredMaintenanceDue(now), + ) + return + } + } last := db.vlogGenerationLastCheckpointKickUnixNano.Load() if last > 0 && now.Sub(time.Unix(0, last)) < vlogGenerationCheckpointKickMinInterval { db.debugVlogMaintf( @@ -15440,16 +15474,8 @@ func (db *DB) maybeKickVlogGenerationMaintenanceAfterCheckpoint() { } // Avoid forcing extra checkpoint boundaries when rewrite is clearly ineligible. // Skip this fast-path when rewrite is disabled so GC-only kicks still run. - if !envBool(envDisableVlogGenerationRewrite) { - rewriteQueue, qerr := db.currentVlogGenerationRewriteQueue() - if qerr != nil { - db.vlogGenerationSchedulerState.Store(vlogGenerationSchedulerError) - if db.notifyError != nil { - db.notifyError(fmt.Errorf("cachingdb: load generational rewrite queue for checkpoint kick: %w", qerr)) - } - return - } - if len(rewriteQueue) == 0 { + if !rewriteDisabled { + if rewriteQueueLen == 0 { if trigger := db.valueLogRewriteTriggerBytes; trigger > 0 { retained, bytes := db.valueLogRetainedStats() if bytes < trigger && retained < 2 { @@ -20987,6 +21013,7 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.checkpoint_kick.runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickRuns.Load()) stats["treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickRewriteRuns.Load()) stats["treedb.cache.vlog_generation.checkpoint_kick.gc_runs"] = fmt.Sprintf("%d", db.vlogGenerationCheckpointKickGCRuns.Load()) + stats["treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt"] = fmt.Sprintf("%d", 
db.vlogGenerationCheckpointKickSkippedHotNoDebt.Load()) stats["treedb.cache.vlog_generation.maintenance.attempts"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceAttempts.Load()) stats["treedb.cache.vlog_generation.maintenance.acquired"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceAcquired.Load()) stats["treedb.cache.vlog_generation.maintenance.collisions"] = fmt.Sprintf("%d", db.vlogGenerationMaintenanceCollisions.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index f25b8b4dd..61f4818ff 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -4324,6 +4324,52 @@ func TestCheckpoint_KicksVlogGenerationRewriteDespiteRecentForegroundActivity(t } } +func TestCheckpoint_KickHotDebtOnlySkipsFreshPlanDuringRecentForegroundActivity(t *testing.T) { + disableVlogGenerationLoop(t) + t.Setenv(envEnableVlogGenerationCheckpointKickHotDebtOnly, "1") + + dir := t.TempDir() + + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + planResponse: backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11}, + SelectedBytesLive: 128, + }, + rewriteResponse: backenddb.ValueLogRewriteStats{BytesBefore: 64, BytesAfter: 32, RecordsCopied: 1}, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + skipRetainedPrune(db) + db.testSkipVlogCheckpointKick = false + + hot := time.Now().UnixNano() + db.lastForegroundWriteUnixNano.Store(hot) + db.lastForegroundReadUnixNano.Store(hot) + + db.maybeKickVlogGenerationMaintenanceAfterCheckpoint() + + time.Sleep(150 * time.Millisecond) + if _, calls := recorder.recordedPlan(); calls != 0 { + t.Fatalf("plan calls=%d want 0", calls) + } + if _, calls := recorder.recordedRewrite(); calls != 0 { + t.Fatalf("rewrite calls=%d want 0", calls) + } + stats := 
db.Stats() + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "0" { + t.Fatalf("checkpoint kick runs=%q want 0", got) + } + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt"]; got != "1" { + t.Fatalf("checkpoint kick skipped_hot_no_debt=%q want 1", got) + } +} + func TestCheckpoint_DoesNotKickVlogGenerationRewrite_WALOn(t *testing.T) { disableVlogGenerationLoop(t) @@ -4431,6 +4477,64 @@ func TestCheckpoint_KicksQueuedRewriteDebtBelowTriggerFloor(t *testing.T) { time.Sleep(10 * time.Millisecond) } + stats := db.Stats() + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "1" { + t.Fatalf("checkpoint kick runs=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"]; got != "1" { + t.Fatalf("checkpoint kick rewrite runs=%q want 1", got) + } + if got := stats["treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt"]; got != "0" { + t.Fatalf("checkpoint kick skipped_hot_no_debt=%q want 0", got) + } +} + +func TestCheckpoint_KickHotDebtOnlyStillRunsQueuedRewriteDebtDuringRecentForegroundActivity(t *testing.T) { + disableVlogGenerationLoop(t) + t.Setenv(envEnableVlogGenerationCheckpointKickHotDebtOnly, "1") + + dir := t.TempDir() + + backend, err := backenddb.Open(backenddb.Options{Dir: dir}) + if err != nil { + t.Fatalf("open backend: %v", err) + } + recorder := &rewriteBudgetRecordingBackend{ + DB: backend, + planResponse: backenddb.ValueLogRewritePlan{ + SourceFileIDs: []uint32{11}, + SelectedBytesLive: 128, + }, + rewriteResponse: backenddb.ValueLogRewriteStats{BytesBefore: 64, BytesAfter: 32, RecordsCopied: 1}, + } + + db, cleanup := openRewriteQueueTestDB(t, dir, recorder) + t.Cleanup(cleanup) + skipRetainedPrune(db) + db.testSkipVlogCheckpointKick = false + db.valueLogRewriteTriggerBytes = 1 << 30 + if err := db.setVlogGenerationRewriteQueue([]uint32{11}); err != nil { + t.Fatalf("seed rewrite queue: %v", err) + } + 
db.vlogGenerationRewriteBudgetTokensBytes.Store(1024) + hot := time.Now().UnixNano() + db.lastForegroundWriteUnixNano.Store(hot) + db.lastForegroundReadUnixNano.Store(hot) + + db.maybeKickVlogGenerationMaintenanceAfterCheckpoint() + + deadline := time.Now().Add(2 * schedulerTestWait(t)) + for { + if _, calls := recorder.recordedRewrite(); calls == 1 { + break + } + if time.Now().After(deadline) { + _, rewriteCalls := recorder.recordedRewrite() + t.Fatalf("checkpoint kick with queued debt did not run rewrite in time: rewriteCalls=%d", rewriteCalls) + } + time.Sleep(10 * time.Millisecond) + } + stats := db.Stats() if got := stats["treedb.cache.vlog_generation.checkpoint_kick.runs"]; got != "1" { t.Fatalf("checkpoint kick runs=%q want 1", got) diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 6f5d83aa0..249a2b753 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -103,6 +103,11 @@ The harness alternates run order per pair (`control->candidate`, then - Allows rewrite planning/execution before the first explicit checkpoint. - Default is disabled to avoid adding early restore contention. - Use for controlled `run_celestia` experiments when `maintenance.skip.before_first_checkpoint` dominates and live rewrite never starts. +- `TREEDB_ENABLE_VLOG_GENERATION_CHECKPOINT_KICK_HOT_DEBT_ONLY=1` + - WAL-off only. + - During checkpoint-kick maintenance, skips starting a fresh rewrite plan while foreground activity is hot and rewrite queue debt is empty. + - Still allows queued rewrite debt (and deferred-due passes) to run. + - Default is disabled. 
## Bench Commands ### Churn sanity (TreeDB) diff --git a/docs/benchmarks/CELESTIA_CHECKPOINT_KICK_HOT_DEBT_ONLY_2026-03-28.md b/docs/benchmarks/CELESTIA_CHECKPOINT_KICK_HOT_DEBT_ONLY_2026-03-28.md new file mode 100644 index 000000000..2e40ec607 --- /dev/null +++ b/docs/benchmarks/CELESTIA_CHECKPOINT_KICK_HOT_DEBT_ONLY_2026-03-28.md @@ -0,0 +1,89 @@ +# Celestia: Checkpoint-Kick Hot-Debt-Only Gate (2026-03-28) + +## Goal +Reduce `run_celestia` sync wall-time regression from live value-log maintenance while preserving on-disk size gains. + +## Change Under Test +Candidate enables: + +- `TREEDB_ENABLE_VLOG_GENERATION_CHECKPOINT_KICK_HOT_DEBT_ONLY=1` + +Behavior: + +- In WAL-off checkpoint-kick path, if foreground is hot and rewrite queue is empty, skip starting a fresh rewrite plan. +- Queued rewrite debt and deferred-due maintenance still run. +- Default behavior remains unchanged unless this env flag is set. + +## Commands +Both campaigns used fixed trust/target and a single interleaved pair (`MAX_PAIRS=1`) with offline rewrite enabled. 
+ +Common env (both variants): + +- `TREEDB_OPEN_PROFILE=fast` +- `POLL_INTERVAL_SECONDS=1` +- `FREEZE_REMOTE_HEIGHT_AT_START=1` +- `ALLOW_CLAMPED_TARGET_EARLY_EXIT=1` +- `STOP_AT_LOCAL_HEIGHT=` +- `TRUST_HEIGHT=` +- `TRUST_HASH=` + +Variant-specific env: + +- `main`: `LOCAL_GOMAP_DIR=/tmp/gomap_ab_base_20260328162444` +- `hot_debt_only`: `LOCAL_GOMAP_DIR=/home/mikers/dev/snissn/gomap-phasehook-active` + `TREEDB_ENABLE_VLOG_GENERATION_CHECKPOINT_KICK_HOT_DEBT_ONLY=1` + +Harness: + +```bash +OUT_DIR= \ +CONTROL_ENV_FILE= \ +CANDIDATE_ENV_FILE= \ +MAX_PAIRS=1 MIN_PAIRS=1 CLEAR_WIN_PAIRS=1 CLEAR_LOSS_PAIRS=1 \ +LOW_SIGNAL_MIN_PAIRS=1 LOW_SIGNAL_NEUTRAL_STREAK=1 \ +SIZE_TOLERANCE_BYTES=$((64<<20)) TIME_TOLERANCE_SECONDS=120 \ +REWRITE_ENABLED=1 \ +./scripts/run_celestia_ab.sh +``` + +## Runs +- control=main, candidate=hot_debt_only: + - `/tmp/celestia_ab_hotdebt_20260328171204` +- control=hot_debt_only, candidate=main (swapped to counter order bias): + - `/tmp/celestia_ab_hotdebt_swapped_20260328172453` + +## Normalized Results (hot_debt_only - main) +- Run A (hot_debt_only as candidate): + - `delta_t_sync_seconds = -16` + - `delta_t_total_seconds = -17` + - `delta_s_sync_app_bytes = -694,418,294` + - `delta_s_post_wal_bytes = +3,315,722` +- Run B (hot_debt_only as control, normalized): + - `delta_t_sync_seconds = +3` + - `delta_t_total_seconds = +2` + - `delta_s_sync_app_bytes = -98,696,592` + - `delta_s_post_wal_bytes = -3,665,002` + +Two-run median/average (same with n=2): + +- `delta_t_sync_seconds = -6.5s` +- `delta_t_total_seconds = -7.5s` +- `delta_s_sync_app_bytes = -396,557,443B` (~`-378.2 MiB`) +- `delta_s_post_wal_bytes = -174,640B` (~`-170.5 KiB`, effectively neutral) + +## Maintenance Counters +Across both runs, both variants showed: + +- `rewrite_runs=0` +- `checkpoint_kick_runs=0` + +Candidate (`hot_debt_only`) showed one lightweight GC pass in each run (`gc_runs=1`), with no rewrite execution. 
+ +## Takeaway +The hot-debt-only gate removed checkpoint-kick rewrite pressure during hot sync windows and improved sync+rewrite wall time in this small sample, while keeping pre-rewrite app size better than main and post-rewrite WAL roughly neutral. + +## Next Step +Run an interleaved sequence with more pairs (stop-on-significance) and include the new stat key: + +- `treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt` + +to confirm skip path activation frequency under full mainnet sync pressure. diff --git a/scripts/analyze_vlog_maintenance_capacity.py b/scripts/analyze_vlog_maintenance_capacity.py index 292a8a9e8..b5b6eabc0 100755 --- a/scripts/analyze_vlog_maintenance_capacity.py +++ b/scripts/analyze_vlog_maintenance_capacity.py @@ -320,6 +320,7 @@ def build_summary(stats: dict[str, Any]) -> dict[str, Any]: "checkpoint_kick_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.runs"), "checkpoint_kick_gc_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.gc_runs"), "checkpoint_kick_rewrite_runs": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.rewrite_runs"), + "checkpoint_kick_skipped_hot_no_debt": metric_int(stats, "treedb.cache.vlog_generation.checkpoint_kick.skipped_hot_no_debt"), } skip_keys = [ @@ -505,6 +506,13 @@ def print_report(summary: dict[str, Any], source_file: Path, run_home: str, inst f"priority={skips['priority_pending']} " f"checkpoint={skips['checkpoint_inflight']}" ) + print( + " checkpoint-kick: " + f"runs={summary['checkpoint_kick_runs']} " + f"rewrite_runs={summary['checkpoint_kick_rewrite_runs']} " + f"gc_runs={summary['checkpoint_kick_gc_runs']} " + f"skipped_hot_no_debt={summary['checkpoint_kick_skipped_hot_no_debt']}" + ) print("") print("Rewrite economics") From 53d39b6bfbebc74636f878d4b2d20674ac3115bc Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 18:31:28 -1000 Subject: [PATCH 44/61] compression: reuse encoder and scratch in k-profile eval --- 
TreeDB/internal/compression/profile.go | 52 +++++++++++-- TreeDB/internal/compression/profile_test.go | 84 +++++++++++++++++++++ 2 files changed, 131 insertions(+), 5 deletions(-) create mode 100644 TreeDB/internal/compression/profile_test.go diff --git a/TreeDB/internal/compression/profile.go b/TreeDB/internal/compression/profile.go index 0114c98e6..cf0c95745 100644 --- a/TreeDB/internal/compression/profile.go +++ b/TreeDB/internal/compression/profile.go @@ -79,6 +79,21 @@ func ChooseKForDictOptions(dict []byte, samples [][]byte, opts ChooseKOptions) ( ks = []int{1, 2, 4, 8, 16, 32} } ks = normalizeCandidateK(ks) + var sharedEnc *zstd.Encoder + if dict != nil { + if enc, err := zstd.NewWriter(nil, zstd.WithEncoderDict(dict), zstd.WithEncoderLevel(zstd.SpeedFastest)); err == nil { + sharedEnc = enc + } + } else { + if enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedFastest)); err == nil { + sharedEnc = enc + } + } + if sharedEnc != nil { + defer sharedEnc.Close() + } + var concatScratch []byte + var encodedScratch []byte scores := make([]kScore, 0, len(ks)) var baseline kScore for _, k := range ks { @@ -89,7 +104,12 @@ func ChooseKForDictOptions(dict []byte, samples [][]byte, opts ChooseKOptions) ( if used == 0 { continue } - payload, meta, raw, encodeNs := batchTotals(dict, eval[:used], k, opts.EncodeNsPerRawByte) + payload, meta, raw, encodeNs := 0, 0, 0, int64(0) + if sharedEnc != nil { + payload, meta, raw, encodeNs = batchTotalsWithEncoder(sharedEnc, eval[:used], k, opts.EncodeNsPerRawByte, &concatScratch, &encodedScratch) + } else { + payload, meta, raw, encodeNs = batchTotals(dict, eval[:used], k, opts.EncodeNsPerRawByte) + } if raw == 0 { continue } @@ -200,7 +220,6 @@ func batchTotals(dict []byte, samples [][]byte, k int, encodeNsPerRawByte float6 return 0, 0, 0, 0 } samples = samples[:n] - batches := n / k var enc *zstd.Encoder var err error if dict != nil { @@ -212,6 +231,23 @@ func batchTotals(dict []byte, samples [][]byte, k int, 
encodeNsPerRawByte float6 return 0, 0, 0, 0 } defer enc.Close() + var concatScratch []byte + var encodedScratch []byte + return batchTotalsWithEncoder(enc, samples, k, encodeNsPerRawByte, &concatScratch, &encodedScratch) +} + +func batchTotalsWithEncoder(enc *zstd.Encoder, samples [][]byte, k int, encodeNsPerRawByte float64, concatScratch *[]byte, encodedScratch *[]byte) (payload int, meta int, raw int, encodeNs int64) { + if enc == nil || k <= 0 { + return 0, 0, 0, 0 + } + n := (len(samples) / k) * k + if n == 0 { + return 0, 0, 0, 0 + } + samples = samples[:n] + batches := n / k + buf := *concatScratch + encoded := *encodedScratch started := time.Now() for b := 0; b < batches; b++ { start := b * k @@ -221,14 +257,18 @@ func batchTotals(dict []byte, samples [][]byte, k int, encodeNsPerRawByte float6 raw += len(samples[i]) total += len(samples[i]) } - buf := make([]byte, total) + if cap(buf) < total { + buf = make([]byte, total) + } else { + buf = buf[:total] + } pos := 0 for i := start; i < end; i++ { copy(buf[pos:], samples[i]) pos += len(samples[i]) } - c := enc.EncodeAll(buf, nil) - payload += len(c) + encoded = enc.EncodeAll(buf, encoded[:0]) + payload += len(encoded) // Account for the full on-disk framing overhead: // - record header (CRC/version/flags/txn/bodyLen) // - frame header + dict_id + RID table + offsets table @@ -245,6 +285,8 @@ func batchTotals(dict []byte, samples [][]byte, k int, encodeNsPerRawByte float6 } else { encodeNs = time.Since(started).Nanoseconds() } + *concatScratch = buf[:0] + *encodedScratch = encoded[:0] return payload, meta, raw, encodeNs } diff --git a/TreeDB/internal/compression/profile_test.go b/TreeDB/internal/compression/profile_test.go new file mode 100644 index 000000000..15dbd18d2 --- /dev/null +++ b/TreeDB/internal/compression/profile_test.go @@ -0,0 +1,84 @@ +package compression + +import ( + "bytes" + "encoding/binary" + "testing" + + "github.com/snissn/compress/zstd" +) + +func buildProfileSamples(n int) [][]byte { + 
samples := make([][]byte, 0, n) + base := bytes.Repeat([]byte("compressible-"), 64) + for i := 0; i < n; i++ { + buf := make([]byte, 1024) + copy(buf, base) + binary.LittleEndian.PutUint32(buf[len(buf)-4:], uint32(i)) + samples = append(samples, buf) + } + return samples +} + +func mustBuildValidDict(t *testing.T, samples [][]byte) []byte { + t.Helper() + history := make([]byte, 0, 1<<16) + for _, s := range samples { + history = append(history, s...) + } + dict, err := buildAndValidateDict(42, samples, history, zstd.SpeedFastest) + if err != nil { + t.Fatalf("build dict: %v", err) + } + if len(dict) == 0 { + t.Fatalf("expected non-empty dict") + } + return dict +} + +func TestBatchTotalsWithEncoder_MatchesBatchTotals_NoDict(t *testing.T) { + samples := buildProfileSamples(16) + encodeNsPerRawByte := 1.25 + + for _, k := range []int{1, 2, 4, 8} { + wantPayload, wantMeta, wantRaw, wantEncodeNS := batchTotals(nil, samples, k, encodeNsPerRawByte) + enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedFastest)) + if err != nil { + t.Fatalf("new writer: %v", err) + } + + var concatScratch []byte + var encodedScratch []byte + gotPayload, gotMeta, gotRaw, gotEncodeNS := batchTotalsWithEncoder(enc, samples, k, encodeNsPerRawByte, &concatScratch, &encodedScratch) + _ = enc.Close() + + if gotPayload != wantPayload || gotMeta != wantMeta || gotRaw != wantRaw || gotEncodeNS != wantEncodeNS { + t.Fatalf("k=%d mismatch got=(payload=%d meta=%d raw=%d encodeNs=%d) want=(payload=%d meta=%d raw=%d encodeNs=%d)", + k, gotPayload, gotMeta, gotRaw, gotEncodeNS, wantPayload, wantMeta, wantRaw, wantEncodeNS) + } + } +} + +func TestBatchTotalsWithEncoder_MatchesBatchTotals_WithDict(t *testing.T) { + samples := buildProfileSamples(256) + dict := mustBuildValidDict(t, samples) + encodeNsPerRawByte := 2.0 + + for _, k := range []int{1, 2, 3, 6} { + wantPayload, wantMeta, wantRaw, wantEncodeNS := batchTotals(dict, samples, k, encodeNsPerRawByte) + enc, err := zstd.NewWriter(nil, 
zstd.WithEncoderDict(dict), zstd.WithEncoderLevel(zstd.SpeedFastest)) + if err != nil { + t.Fatalf("new dict writer: %v", err) + } + + var concatScratch []byte + var encodedScratch []byte + gotPayload, gotMeta, gotRaw, gotEncodeNS := batchTotalsWithEncoder(enc, samples, k, encodeNsPerRawByte, &concatScratch, &encodedScratch) + _ = enc.Close() + + if gotPayload != wantPayload || gotMeta != wantMeta || gotRaw != wantRaw || gotEncodeNS != wantEncodeNS { + t.Fatalf("k=%d mismatch got=(payload=%d meta=%d raw=%d encodeNs=%d) want=(payload=%d meta=%d raw=%d encodeNs=%d)", + k, gotPayload, gotMeta, gotRaw, gotEncodeNS, wantPayload, wantMeta, wantRaw, wantEncodeNS) + } + } +} From 6aaca742a729d7a2f17700a8a21adcb3fe35b04b Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 18:46:42 -1000 Subject: [PATCH 45/61] compression: reduce dict autotune encoder overhead --- TreeDB/internal/compression/profile.go | 91 +++++++++++++++++---- TreeDB/internal/compression/profile_test.go | 13 ++- TreeDB/internal/compression/trainer.go | 14 +++- 3 files changed, 97 insertions(+), 21 deletions(-) diff --git a/TreeDB/internal/compression/profile.go b/TreeDB/internal/compression/profile.go index cf0c95745..2e5e705a3 100644 --- a/TreeDB/internal/compression/profile.go +++ b/TreeDB/internal/compression/profile.go @@ -44,6 +44,13 @@ type kScore struct { score float64 } +const ( + // Bound evaluation work so training cost stays predictable on long streams. + // Use even down-sampling rather than prefix truncation to preserve shape. 
+ maxChooseKEvalSamples = 4096 + maxDecodeCostSamples = 256 +) + func ChooseKForDict(dict []byte, samples [][]byte) (profile *ActiveProfile) { return ChooseKForDictOptions(dict, samples, ChooseKOptions{}) } @@ -59,8 +66,8 @@ func ChooseKForDictOptions(dict []byte, samples [][]byte, opts ChooseKOptions) ( return nil } eval := samples - if len(eval) > 10000 { - eval = eval[:10000] + if len(eval) > maxChooseKEvalSamples { + eval = evenlySampleRecords(eval, maxChooseKEvalSamples) } rawTotal := 0 for _, v := range eval { @@ -70,9 +77,9 @@ func ChooseKForDictOptions(dict []byte, samples [][]byte, opts ChooseKOptions) ( return nil } - nsPerByte := decodeCostEstimate(dict, eval) - if opts.DecodeNsPerRawByte > 0 { - nsPerByte = opts.DecodeNsPerRawByte + nsPerByte := opts.DecodeNsPerRawByte + if nsPerByte <= 0 { + nsPerByte = decodeCostEstimate(dict, eval) } ks := opts.CandidateK if len(ks) == 0 { @@ -81,11 +88,20 @@ func ChooseKForDictOptions(dict []byte, samples [][]byte, opts ChooseKOptions) ( ks = normalizeCandidateK(ks) var sharedEnc *zstd.Encoder if dict != nil { - if enc, err := zstd.NewWriter(nil, zstd.WithEncoderDict(dict), zstd.WithEncoderLevel(zstd.SpeedFastest)); err == nil { + if enc, err := zstd.NewWriter(nil, + zstd.WithEncoderDict(dict), + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ); err == nil { sharedEnc = enc } } else { - if enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedFastest)); err == nil { + if enc, err := zstd.NewWriter(nil, + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ); err == nil { sharedEnc = enc } } @@ -223,9 +239,18 @@ func batchTotals(dict []byte, samples [][]byte, k int, encodeNsPerRawByte float6 var enc *zstd.Encoder var err error if dict != nil { - enc, err = zstd.NewWriter(nil, zstd.WithEncoderDict(dict), zstd.WithEncoderLevel(zstd.SpeedFastest)) + enc, err = zstd.NewWriter(nil, + 
zstd.WithEncoderDict(dict), + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) } else { - enc, err = zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedFastest)) + enc, err = zstd.NewWriter(nil, + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) } if err != nil { return 0, 0, 0, 0 @@ -291,30 +316,39 @@ func batchTotalsWithEncoder(enc *zstd.Encoder, samples [][]byte, k int, encodeNs } func decodeCostEstimate(dict []byte, samples [][]byte) float64 { - n := len(samples) - if n > 500 { - n = 500 + eval := samples + if len(eval) > maxDecodeCostSamples { + eval = evenlySampleRecords(eval, maxDecodeCostSamples) } - enc, err := zstd.NewWriter(nil, zstd.WithEncoderDict(dict), zstd.WithEncoderLevel(zstd.SpeedFastest)) + n := len(eval) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderDict(dict), + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) if err != nil { return 1.0 } defer enc.Close() - frames := make([][]byte, n) + totalRaw := 0 + var encoded []byte for i := 0; i < n; i++ { - totalRaw += len(samples[i]) - frames[i] = enc.EncodeAll(samples[i], nil) + totalRaw += len(eval[i]) + encoded = enc.EncodeAll(eval[i], encoded[:0]) } dec, err := zstd.NewReader(nil, zstd.WithDecoderDicts(dict)) if err != nil { return 1.0 } defer dec.Close() + var out []byte start := time.Now() for i := 0; i < n; i++ { - out, _ = dec.DecodeAll(frames[i], out[:0]) + encoded = enc.EncodeAll(eval[i], encoded[:0]) + out, _ = dec.DecodeAll(encoded, out[:0]) if len(out) > 0 { _ = out[0] } @@ -325,3 +359,26 @@ func decodeCostEstimate(dict []byte, samples [][]byte) float64 { } return float64(elapsed.Nanoseconds()) / float64(totalRaw) } + +func evenlySampleRecords(samples [][]byte, limit int) [][]byte { + if limit <= 0 || len(samples) <= limit { + return samples + } + out := make([][]byte, 0, limit) + last := -1 + for i := 
0; i < limit; i++ { + idx := (i * len(samples)) / limit + if idx >= len(samples) { + idx = len(samples) - 1 + } + if idx <= last { + idx = last + 1 + if idx >= len(samples) { + idx = len(samples) - 1 + } + } + last = idx + out = append(out, samples[idx]) + } + return out +} diff --git a/TreeDB/internal/compression/profile_test.go b/TreeDB/internal/compression/profile_test.go index 15dbd18d2..1c1c26136 100644 --- a/TreeDB/internal/compression/profile_test.go +++ b/TreeDB/internal/compression/profile_test.go @@ -42,7 +42,11 @@ func TestBatchTotalsWithEncoder_MatchesBatchTotals_NoDict(t *testing.T) { for _, k := range []int{1, 2, 4, 8} { wantPayload, wantMeta, wantRaw, wantEncodeNS := batchTotals(nil, samples, k, encodeNsPerRawByte) - enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedFastest)) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) if err != nil { t.Fatalf("new writer: %v", err) } @@ -66,7 +70,12 @@ func TestBatchTotalsWithEncoder_MatchesBatchTotals_WithDict(t *testing.T) { for _, k := range []int{1, 2, 3, 6} { wantPayload, wantMeta, wantRaw, wantEncodeNS := batchTotals(dict, samples, k, encodeNsPerRawByte) - enc, err := zstd.NewWriter(nil, zstd.WithEncoderDict(dict), zstd.WithEncoderLevel(zstd.SpeedFastest)) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderDict(dict), + zstd.WithEncoderLevel(zstd.SpeedFastest), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderCRC(false), + ) if err != nil { t.Fatalf("new dict writer: %v", err) } diff --git a/TreeDB/internal/compression/trainer.go b/TreeDB/internal/compression/trainer.go index 31c004bbc..ee525935c 100644 --- a/TreeDB/internal/compression/trainer.go +++ b/TreeDB/internal/compression/trainer.go @@ -838,7 +838,12 @@ func (t *Trainer) train(samples [][]byte, dictBytes int, level zstd.EncoderLevel } } - enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(level), zstd.WithEncoderCRC(false), 
zstd.WithEncoderDict(bestProfile.Dict)) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderLevel(level), + zstd.WithEncoderCRC(false), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderDict(bestProfile.Dict), + ) if err != nil { log.Printf("treedb: dict training encode setup failed stream=%d err=%v", slabID, err) return @@ -930,7 +935,12 @@ func shapeAndValidateDict(dict []byte, dictBytes int, level zstd.EncoderLevel) ( } func validateDict(dict []byte, level zstd.EncoderLevel) error { - enc, err := zstd.NewWriter(nil, zstd.WithEncoderLevel(level), zstd.WithEncoderCRC(false), zstd.WithEncoderDict(dict)) + enc, err := zstd.NewWriter(nil, + zstd.WithEncoderLevel(level), + zstd.WithEncoderCRC(false), + zstd.WithEncoderConcurrency(1), + zstd.WithEncoderDict(dict), + ) if err != nil { return err } From 87c345ae65b125d5952c2374b5352dc570f4bcf2 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 18:53:00 -1000 Subject: [PATCH 46/61] caching: add rewrite efficiency and budget rate stats --- TreeDB/caching/db.go | 61 +++++++++++++++++-- .../caching/vlog_generation_scheduler_test.go | 32 ++++++++++ cmd/unified_bench/main.go | 9 +++ 3 files changed, 96 insertions(+), 6 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 1858c4810..ea4482b18 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -20937,6 +20937,46 @@ func (db *DB) Stats() map[string]string { rewriteExecTotalNS := db.vlogGenerationRewriteExecTotalNanos.Load() rewriteExecMaxNS := db.vlogGenerationRewriteExecMaxNanos.Load() rewriteRuns := db.vlogGenerationRewriteRuns.Load() + rewriteBytesInTotal := db.vlogGenerationRewriteBytesIn.Load() + rewriteBytesOutTotal := db.vlogGenerationRewriteBytesOut.Load() + rewriteReclaimedBytesTotal := db.vlogGenerationRewriteReclaimedBytes.Load() + rewriteProcessedLiveBytes := db.vlogGenerationRewriteProcessedLiveBytes.Load() + rewriteProcessedStaleBytes := db.vlogGenerationRewriteProcessedStaleBytes.Load() + rewriteProcessedTotal := 
rewriteProcessedLiveBytes + rewriteProcessedStaleBytes + rewriteBudgetConsumedTotal := db.vlogGenerationRewriteBudgetConsumed.Load() + rewriteChurnBps := db.vlogGenerationLastChurnBps.Load() + rewriteExecSeconds := 0.0 + if rewriteExecTotalNS > 0 { + rewriteExecSeconds = float64(rewriteExecTotalNS) / float64(time.Second) + } + rewriteBytesInPerSec := 0.0 + rewriteBytesOutPerSec := 0.0 + rewriteReclaimedBytesPerSec := 0.0 + rewriteBudgetConsumedPerSec := 0.0 + if rewriteExecSeconds > 0 { + rewriteBytesInPerSec = float64(rewriteBytesInTotal) / rewriteExecSeconds + rewriteBytesOutPerSec = float64(rewriteBytesOutTotal) / rewriteExecSeconds + rewriteReclaimedBytesPerSec = float64(rewriteReclaimedBytesTotal) / rewriteExecSeconds + rewriteBudgetConsumedPerSec = float64(rewriteBudgetConsumedTotal) / rewriteExecSeconds + } + rewriteOutputRatio := 0.0 + rewriteReclaimRatio := 0.0 + if rewriteBytesInTotal > 0 { + rewriteOutputRatio = float64(rewriteBytesOutTotal) / float64(rewriteBytesInTotal) + rewriteReclaimRatio = float64(rewriteReclaimedBytesTotal) / float64(rewriteBytesInTotal) + } + rewriteProcessedStaleRatio := 0.0 + if rewriteProcessedTotal > 0 { + rewriteProcessedStaleRatio = float64(rewriteProcessedStaleBytes) / float64(rewriteProcessedTotal) + } + rewriteBudgetConsumedSharePct := 0.0 + if db.valueLogRewriteBudgetBytes > 0 { + rewriteBudgetConsumedSharePct = (rewriteBudgetConsumedPerSec / float64(db.valueLogRewriteBudgetBytes)) * 100.0 + } + rewriteReclaimedVsChurnRatio := 0.0 + if rewriteChurnBps > 0 { + rewriteReclaimedVsChurnRatio = rewriteReclaimedBytesPerSec / float64(rewriteChurnBps) + } gcExecTotalNS := db.vlogGenerationGCExecTotalNanos.Load() gcExecMaxNS := db.vlogGenerationGCExecMaxNanos.Load() gcRuns := db.vlogGenerationGCRuns.Load() @@ -21038,7 +21078,7 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.maintenance.pass.avg_ms"] = "0.000" } stats["treedb.cache.vlog_generation.churn_bytes_total"] = fmt.Sprintf("%d", 
db.vlogGenerationChurnBytes.Load()) - stats["treedb.cache.vlog_generation.churn_bytes_per_sec"] = fmt.Sprintf("%d", db.vlogGenerationLastChurnBps.Load()) + stats["treedb.cache.vlog_generation.churn_bytes_per_sec"] = fmt.Sprintf("%d", rewriteChurnBps) stats["treedb.cache.vlog_generation.rewrite.queue_len"] = fmt.Sprintf("%d", rewriteQueueLen) stats["treedb.cache.vlog_generation.rewrite.queue_loaded"] = fmt.Sprintf("%t", rewriteQueueLoaded) stats["treedb.cache.vlog_generation.rewrite.ledger_segments"] = fmt.Sprintf("%d", rewriteLedgerSegments) @@ -21063,7 +21103,9 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite_budget.tokens_bytes"] = fmt.Sprintf("%d", rewriteBudgetTokens) stats["treedb.cache.vlog_generation.rewrite_budget.tokens_cap_bytes"] = fmt.Sprintf("%d", rewriteBudgetCap) stats["treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct"] = fmt.Sprintf("%.3f", rewriteBudgetUtilPct) - stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"] = fmt.Sprintf("%d", db.vlogGenerationRewriteBudgetConsumed.Load()) + stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"] = fmt.Sprintf("%d", rewriteBudgetConsumedTotal) + stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_per_sec"] = fmt.Sprintf("%.3f", rewriteBudgetConsumedPerSec) + stats["treedb.cache.vlog_generation.rewrite_budget.consumed_share_of_budget_pct"] = fmt.Sprintf("%.3f", rewriteBudgetConsumedSharePct) stats["treedb.cache.vlog_generation.rewrite_trigger.stale_ratio_ppm"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerRatioPPM) stats["treedb.cache.vlog_generation.rewrite_trigger.total_bytes"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerBytes) stats["treedb.cache.vlog_generation.rewrite_trigger.churn_per_sec"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerChurn) @@ -21085,10 +21127,17 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.segments.hot"] = fmt.Sprintf("%d", 
retained.SegmentsHot) stats["treedb.cache.vlog_generation.segments.warm"] = fmt.Sprintf("%d", retained.SegmentsWarm) stats["treedb.cache.vlog_generation.segments.cold"] = fmt.Sprintf("%d", retained.SegmentsCold) - stats["treedb.cache.vlog_generation.rewrite.bytes_in"] = fmt.Sprintf("%d", db.vlogGenerationRewriteBytesIn.Load()) - stats["treedb.cache.vlog_generation.rewrite.bytes_out"] = fmt.Sprintf("%d", db.vlogGenerationRewriteBytesOut.Load()) - stats["treedb.cache.vlog_generation.rewrite.processed_live_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteProcessedLiveBytes.Load()) - stats["treedb.cache.vlog_generation.rewrite.processed_stale_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteProcessedStaleBytes.Load()) + stats["treedb.cache.vlog_generation.rewrite.bytes_in"] = fmt.Sprintf("%d", rewriteBytesInTotal) + stats["treedb.cache.vlog_generation.rewrite.bytes_out"] = fmt.Sprintf("%d", rewriteBytesOutTotal) + stats["treedb.cache.vlog_generation.rewrite.processed_live_bytes"] = fmt.Sprintf("%d", rewriteProcessedLiveBytes) + stats["treedb.cache.vlog_generation.rewrite.processed_stale_bytes"] = fmt.Sprintf("%d", rewriteProcessedStaleBytes) + stats["treedb.cache.vlog_generation.rewrite.reclaim_ratio"] = fmt.Sprintf("%.6f", rewriteReclaimRatio) + stats["treedb.cache.vlog_generation.rewrite.output_ratio"] = fmt.Sprintf("%.6f", rewriteOutputRatio) + stats["treedb.cache.vlog_generation.rewrite.processed_stale_ratio"] = fmt.Sprintf("%.6f", rewriteProcessedStaleRatio) + stats["treedb.cache.vlog_generation.rewrite.exec.bytes_in_per_sec"] = fmt.Sprintf("%.3f", rewriteBytesInPerSec) + stats["treedb.cache.vlog_generation.rewrite.exec.bytes_out_per_sec"] = fmt.Sprintf("%.3f", rewriteBytesOutPerSec) + stats["treedb.cache.vlog_generation.rewrite.exec.reclaimed_bytes_per_sec"] = fmt.Sprintf("%.3f", rewriteReclaimedBytesPerSec) + stats["treedb.cache.vlog_generation.rewrite.exec.reclaimed_vs_churn_ratio"] = fmt.Sprintf("%.6f", rewriteReclaimedVsChurnRatio) 
stats["treedb.cache.vlog_generation.rewrite.no_reclaim_runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteNoReclaimRuns.Load()) stats["treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes"] = fmt.Sprintf("%d", db.vlogGenerationRewriteNoReclaimStaleBytes.Load()) stats["treedb.cache.vlog_generation.rewrite.runs"] = fmt.Sprintf("%d", db.vlogGenerationRewriteRuns.Load()) diff --git a/TreeDB/caching/vlog_generation_scheduler_test.go b/TreeDB/caching/vlog_generation_scheduler_test.go index 61f4818ff..dc469bf50 100644 --- a/TreeDB/caching/vlog_generation_scheduler_test.go +++ b/TreeDB/caching/vlog_generation_scheduler_test.go @@ -6466,6 +6466,9 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationRewriteRuns.Store(3) db.vlogGenerationRewriteExecTotalNanos.Store(uint64((150 * time.Millisecond).Nanoseconds())) db.vlogGenerationRewriteExecMaxNanos.Store(uint64((70 * time.Millisecond).Nanoseconds())) + db.vlogGenerationRewriteBytesIn.Store(1000) + db.vlogGenerationRewriteBytesOut.Store(600) + db.vlogGenerationRewriteReclaimedBytes.Store(400) db.vlogGenerationGCRuns.Store(2) db.vlogGenerationGCExecTotalNanos.Store(uint64((60 * time.Millisecond).Nanoseconds())) db.vlogGenerationGCExecMaxNanos.Store(uint64((35 * time.Millisecond).Nanoseconds())) @@ -6474,6 +6477,8 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { db.vlogGenerationVacuumExecMaxNanos.Store(uint64((25 * time.Millisecond).Nanoseconds())) db.vlogGenerationRewriteBudgetTokensBytes.Store(512) db.vlogGenerationRewriteBudgetConsumed.Store(1536) + db.valueLogRewriteBudgetBytes = 2048 + db.vlogGenerationLastChurnBps.Store(2500) db.vlogGenerationRewriteAgeBlockedUntilNS.Store(time.Now().Add(5 * time.Second).UnixNano()) db.vlogGenerationLastGCSegmentsReferenced.Store(7) db.vlogGenerationLastGCBytesReferenced.Store(700) @@ -6770,6 +6775,12 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := 
stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total"]; got != "1536" { t.Fatalf("rewrite budget consumed=%q want 1536", got) } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_per_sec"]; got != "10240.000" { + t.Fatalf("rewrite budget consumed bytes/sec=%q want 10240.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite_budget.consumed_share_of_budget_pct"]; got != "500.000" { + t.Fatalf("rewrite budget consumed share pct=%q want 500.000", got) + } if got := stats["treedb.cache.vlog_generation.rewrite_budget.tokens_cap_bytes"]; got == "0" { t.Fatalf("rewrite budget cap bytes=%q want non-zero", got) } @@ -6812,6 +6823,27 @@ func TestVlogGenerationStats_ReportRewriteBacklogAndDurations(t *testing.T) { if got := stats["treedb.cache.vlog_generation.rewrite.processed_stale_bytes"]; got != "450" { t.Fatalf("rewrite processed stale bytes=%q want 450", got) } + if got := stats["treedb.cache.vlog_generation.rewrite.reclaim_ratio"]; got != "0.400000" { + t.Fatalf("rewrite reclaim ratio=%q want 0.400000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.output_ratio"]; got != "0.600000" { + t.Fatalf("rewrite output ratio=%q want 0.600000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.processed_stale_ratio"]; got != "0.333333" { + t.Fatalf("rewrite processed stale ratio=%q want 0.333333", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.bytes_in_per_sec"]; got != "6666.667" { + t.Fatalf("rewrite exec bytes in/sec=%q want 6666.667", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.bytes_out_per_sec"]; got != "4000.000" { + t.Fatalf("rewrite exec bytes out/sec=%q want 4000.000", got) + } + if got := stats["treedb.cache.vlog_generation.rewrite.exec.reclaimed_bytes_per_sec"]; got != "2666.667" { + t.Fatalf("rewrite exec reclaimed bytes/sec=%q want 2666.667", got) + } + if got := 
stats["treedb.cache.vlog_generation.rewrite.exec.reclaimed_vs_churn_ratio"]; got != "1.066667" { + t.Fatalf("rewrite reclaimed vs churn ratio=%q want 1.066667", got) + } if got := stats["treedb.cache.vlog_generation.rewrite.no_reclaim_runs"]; got != "3" { t.Fatalf("rewrite no reclaim runs=%q want 3", got) } diff --git a/cmd/unified_bench/main.go b/cmd/unified_bench/main.go index d7737e6ed..461ae0235 100644 --- a/cmd/unified_bench/main.go +++ b/cmd/unified_bench/main.go @@ -1287,6 +1287,15 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.segments.cold", "treedb.cache.vlog_generation.rewrite.bytes_in", "treedb.cache.vlog_generation.rewrite.bytes_out", + "treedb.cache.vlog_generation.rewrite.reclaim_ratio", + "treedb.cache.vlog_generation.rewrite.output_ratio", + "treedb.cache.vlog_generation.rewrite.processed_stale_ratio", + "treedb.cache.vlog_generation.rewrite.exec.bytes_in_per_sec", + "treedb.cache.vlog_generation.rewrite.exec.bytes_out_per_sec", + "treedb.cache.vlog_generation.rewrite.exec.reclaimed_bytes_per_sec", + "treedb.cache.vlog_generation.rewrite.exec.reclaimed_vs_churn_ratio", + "treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_per_sec", + "treedb.cache.vlog_generation.rewrite_budget.consumed_share_of_budget_pct", "treedb.cache.vlog_generation.rewrite.runs", "treedb.cache.vlog_generation.gc.deleted_segments", "treedb.cache.vlog_generation.gc.deleted_bytes", From f031d074d3f0ab46c1d4cf0331c1b3021974cf0a Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 19:17:55 -1000 Subject: [PATCH 47/61] treedb: add rewrite min-age lab knob and richer rewrite stats --- TreeDB/caching/db.go | 25 +++++-- TreeDB/db/db.go | 8 +++ TreeDB/public.go | 1 + cmd/unified_bench/README.md | 1 + cmd/unified_bench/adapter_treedb.go | 7 ++ cmd/unified_bench/adapter_treedb_vlog_test.go | 17 +++++ cmd/unified_bench/main.go | 52 ++++++++++++++ .../profiles_treedb_index_test.go | 4 ++ 
docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 7 ++ ...EWRITE_MIN_SEGMENT_AGE_SWEEP_2026-03-28.md | 72 +++++++++++++++++++ scripts/celestia_fast_gate.sh | 5 ++ 11 files changed, 194 insertions(+), 5 deletions(-) create mode 100644 docs/benchmarks/VLOG_REWRITE_MIN_SEGMENT_AGE_SWEEP_2026-03-28.md diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index ea4482b18..8fbca7141 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5343,6 +5343,11 @@ type Options struct { ValueLogRewriteTriggerTotalBytes int64 // ValueLogRewriteTriggerChurnPerSec triggers rewrite by churn rate. ValueLogRewriteTriggerChurnPerSec int64 + // ValueLogRewriteMinSegmentAge gates online rewrite to source segments that + // are at least this old. + // + // 0 uses the implementation default. + ValueLogRewriteMinSegmentAge time.Duration // ForceValueLogPointers stores all values out-of-line in the value log. ForceValueLogPointers bool // DisableReadChecksum skips CRC verification on value-log reads. @@ -5589,6 +5594,7 @@ type DB struct { valueLogRewriteTriggerRatioPPM uint32 valueLogRewriteTriggerBytes int64 valueLogRewriteTriggerChurn int64 + valueLogRewriteMinSegmentAge time.Duration valueLogReader *valuelog.Manager valueLogHotLanes []int valueLogWarmLanes []int @@ -7654,6 +7660,7 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { valueLogRewriteTriggerRatioPPM := opts.ValueLogRewriteTriggerStaleRatioPPM valueLogRewriteTriggerBytes := opts.ValueLogRewriteTriggerTotalBytes valueLogRewriteTriggerChurn := opts.ValueLogRewriteTriggerChurnPerSec + valueLogRewriteMinSegmentAge := opts.ValueLogRewriteMinSegmentAge if valueLogGenerationHotTarget < 0 { return nil, fmt.Errorf("cachingdb: invalid value-log generational hot segment target bytes %d", valueLogGenerationHotTarget) } @@ -7675,6 +7682,9 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { if valueLogRewriteTriggerChurn < 0 { return nil, fmt.Errorf("cachingdb: invalid value-log generational 
rewrite trigger churn/sec %d", valueLogRewriteTriggerChurn) } + if valueLogRewriteMinSegmentAge < 0 { + return nil, fmt.Errorf("cachingdb: invalid value-log generational rewrite min segment age %s", valueLogRewriteMinSegmentAge) + } if valueLogGenerationPolicyUint8 == uint8(backenddb.ValueLogGenerationHotWarmCold) { if valueLogGenerationHotTarget == 0 { valueLogGenerationHotTarget = defaultVlogGenerationHotTargetBytes @@ -7695,6 +7705,9 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { valueLogRewriteTriggerRatioPPM = defaultVlogRewriteTriggerStalePPM } } + if valueLogRewriteMinSegmentAge == 0 { + valueLogRewriteMinSegmentAge = vlogGenerationRewriteMinSegmentAge + } valueLogRawWritevMinAvgBytes := opts.ValueLogRawWritevMinAvgBytes if valueLogRawWritevMinAvgBytes < 0 { valueLogRawWritevMinAvgBytes = 0 @@ -7992,6 +8005,7 @@ func Open(dir string, backend BackendDB, opts Options) (*DB, error) { valueLogRewriteTriggerRatioPPM: valueLogRewriteTriggerRatioPPM, valueLogRewriteTriggerBytes: valueLogRewriteTriggerBytes, valueLogRewriteTriggerChurn: valueLogRewriteTriggerChurn, + valueLogRewriteMinSegmentAge: valueLogRewriteMinSegmentAge, memtableValueLogPointers: true, indexOuterLeavesInValueLog: opts.IndexOuterLeavesInValueLog, valueLogReader: valueLogReader, @@ -14322,7 +14336,7 @@ func (db *DB) maybeRunVlogGenerationMaintenanceWithOptions(runGC bool, opts vlog MaxSourceBytes: maxSourceBytes, MinSegmentStaleRatio: minStaleRatio, MinSegmentStaleBytes: 1, - MinSegmentAge: vlogGenerationRewriteMinSegmentAge, + MinSegmentAge: db.valueLogRewriteMinSegmentAge, } planStart := time.Now() plan, err := planner.ValueLogRewritePlan(ctx, planOpts) @@ -14481,7 +14495,7 @@ planned: MaxSourceBytes: maxSourceBytes, MinSegmentStaleRatio: minStaleRatio, MinSegmentStaleBytes: vlogGenerationRewriteMinSegmentStaleBytes, - MinSegmentAge: vlogGenerationRewriteMinSegmentAge, + MinSegmentAge: db.valueLogRewriteMinSegmentAge, }) cancel() planDur := time.Since(planStart) @@ 
-14535,14 +14549,14 @@ planned: db.observeVlogGenerationRewritePlanPenaltyFilter(beforePenaltyFilter, len(plan.SourceFileIDs)) } if len(plan.SourceFileIDs) == 0 { - if shouldDeferVlogGenerationRewritePlanForAge(plan, vlogGenerationRewriteMinSegmentAge) { + if shouldDeferVlogGenerationRewritePlanForAge(plan, db.valueLogRewriteMinSegmentAge) { db.setVlogGenerationRewriteAgeBlockedUntil(now.Add(plan.AgeBlockedMinRemainingAge)) db.debugVlogMaintf( "rewrite_plan pre_rewrite age_blocked segments=%d stale_bytes=%d retry_after_ms=%d min_age_ms=%d", plan.AgeBlockedSegments, plan.AgeBlockedBytesStale, plan.AgeBlockedMinRemainingAge.Milliseconds(), - vlogGenerationRewriteMinSegmentAge.Milliseconds(), + db.valueLogRewriteMinSegmentAge.Milliseconds(), ) } else { db.clearVlogGenerationRewriteAgeBlockedUntil() @@ -14759,7 +14773,7 @@ planned: rewriteOpts.MaxSourceBytes = maxSourceBytes rewriteOpts.MinSegmentStaleRatio = db.vlogGenerationRewriteMinStaleRatioForGenericPass(totalBytes) rewriteOpts.MinSegmentStaleBytes = vlogGenerationRewriteMinSegmentStaleBytes - rewriteOpts.MinSegmentAge = vlogGenerationRewriteMinSegmentAge + rewriteOpts.MinSegmentAge = db.valueLogRewriteMinSegmentAge } var ctx context.Context var cancel context.CancelFunc @@ -21109,6 +21123,7 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.rewrite_trigger.stale_ratio_ppm"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerRatioPPM) stats["treedb.cache.vlog_generation.rewrite_trigger.total_bytes"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerBytes) stats["treedb.cache.vlog_generation.rewrite_trigger.churn_per_sec"] = fmt.Sprintf("%d", db.valueLogRewriteTriggerChurn) + stats["treedb.cache.vlog_generation.rewrite.min_segment_age_ms"] = fmt.Sprintf("%d", db.valueLogRewriteMinSegmentAge.Milliseconds()) // PR1 scaffolding: legacy allocator still owns placement; report retained // totals under hot generation until generation-aware allocator lands. 
stats["treedb.cache.vlog_generation.bytes.live.total"] = fmt.Sprintf("%d", retained.BytesTotal) diff --git a/TreeDB/db/db.go b/TreeDB/db/db.go index ae4b583d3..d7b2f7c05 100644 --- a/TreeDB/db/db.go +++ b/TreeDB/db/db.go @@ -280,6 +280,11 @@ type ValueLogGenerationConfig struct { // RewriteTriggerChurnPerSec triggers rewrite when churn rate exceeds // threshold (0 disables). RewriteTriggerChurnPerSec int64 + // RewriteMinSegmentAge gates online rewrite to source segments that are at + // least this old. + // + // 0 uses the implementation default. + RewriteMinSegmentAge time.Duration } // ValueLogDomainThreshold overrides inline-vs-pointer placement policy for keys @@ -968,6 +973,9 @@ func validateOptions(opts Options) error { if opts.ValueLog.Generational.RewriteTriggerChurnPerSec < 0 { return fmt.Errorf("treedb: invalid value-log generational rewrite trigger churn/sec %d", opts.ValueLog.Generational.RewriteTriggerChurnPerSec) } + if opts.ValueLog.Generational.RewriteMinSegmentAge < 0 { + return fmt.Errorf("treedb: invalid value-log generational rewrite min segment age %s", opts.ValueLog.Generational.RewriteMinSegmentAge) + } seenDomains := make(map[string]struct{}, len(opts.ValueLog.DomainInlineThresholds)) for i := range opts.ValueLog.DomainInlineThresholds { d := opts.ValueLog.DomainInlineThresholds[i] diff --git a/TreeDB/public.go b/TreeDB/public.go index 2af25ada1..8436b482a 100644 --- a/TreeDB/public.go +++ b/TreeDB/public.go @@ -590,6 +590,7 @@ func Open(opts Options) (*DB, error) { ValueLogRewriteTriggerStaleRatioPPM: opts.ValueLog.Generational.RewriteTriggerStaleRatioPPM, ValueLogRewriteTriggerTotalBytes: opts.ValueLog.Generational.RewriteTriggerTotalBytes, ValueLogRewriteTriggerChurnPerSec: opts.ValueLog.Generational.RewriteTriggerChurnPerSec, + ValueLogRewriteMinSegmentAge: opts.ValueLog.Generational.RewriteMinSegmentAge, ForceValueLogPointers: opts.ValueLog.ForcePointers, ValueLogDictTrain: opts.ValueLog.DictTrain, ValueLogDictMaxK: 
opts.ValueLog.DictMaxK, diff --git a/cmd/unified_bench/README.md b/cmd/unified_bench/README.md index 93c26afdd..85ef9e258 100644 --- a/cmd/unified_bench/README.md +++ b/cmd/unified_bench/README.md @@ -95,6 +95,7 @@ GOWORK=off GOMEMLIMIT=4GiB GOMAXPROCS=2 go test -json -p 1 . \ - `-treedb-allow-unsafe` TreeDB: allow unsafe durability/integrity options (required for unsafe toggles) - `-treedb-vlog-dict` TreeDB: value-log dict compression mode (`default|on|off|both`) - `-treedb-vlog-auto-policy` TreeDB: value-log auto policy (`balanced|throughput|size`) +- `-treedb-vlog-rewrite-min-segment-age-ms` TreeDB: minimum source segment age for online generational rewrite (`0`=default) - `-treedb-vlog-dict-frame-encode-level` TreeDB: dict frame zstd encoder level (`engine|fastest|default|better|best|all|`) - `-treedb-vlog-dict-frame-entropy` TreeDB: dict frame entropy mode (`engine|on|off|both`) - `-seed` PRNG seed for randomized tests (default 1; `0` = time-based) diff --git a/cmd/unified_bench/adapter_treedb.go b/cmd/unified_bench/adapter_treedb.go index 016af5206..ae19a2359 100644 --- a/cmd/unified_bench/adapter_treedb.go +++ b/cmd/unified_bench/adapter_treedb.go @@ -70,6 +70,7 @@ var ( treedbVlogRewriteTriggerStaleRatioPPM = flag.Uint("treedb-vlog-rewrite-trigger-stale-ratio-ppm", 0, "TreeDB: generational rewrite stale/live trigger in ppm (0=disabled)") treedbVlogRewriteTriggerTotalBytes = flag.Int64("treedb-vlog-rewrite-trigger-total-bytes", 0, "TreeDB: generational rewrite total retained bytes trigger (0=disabled)") treedbVlogRewriteTriggerChurnPerSec = flag.Int64("treedb-vlog-rewrite-trigger-churn-per-sec", 0, "TreeDB: generational rewrite churn trigger in bytes/sec (0=disabled)") + treedbVlogRewriteMinSegmentAgeMS = flag.Int("treedb-vlog-rewrite-min-segment-age-ms", 0, "TreeDB: generational rewrite minimum source segment age in milliseconds (0=default)") treedbVlogBlockTargetBytes = flag.Int("treedb-vlog-block-target-bytes", 0, "TreeDB: value-log block target 
compressed bytes (0=default)") treedbVlogIncompressibleHoldBytes = flag.Int("treedb-vlog-incompressible-hold-bytes", 0, "TreeDB: auto-mode incompressible hold bytes (0=default)") treedbVlogIncompressibleProbeBytes = flag.Int("treedb-vlog-incompressible-probe-bytes", 0, "TreeDB: auto-mode incompressible probe interval bytes (0=default)") @@ -359,6 +360,11 @@ func (r treeDBOptionsReport) formatText(indent string) string { lines = append(lines, fmt.Sprintf("vlog.rewrite_trigger_stale_ratio_ppm=%d", r.opts.ValueLog.Generational.RewriteTriggerStaleRatioPPM)) lines = append(lines, fmt.Sprintf("vlog.rewrite_trigger_total_bytes=%d", r.opts.ValueLog.Generational.RewriteTriggerTotalBytes)) lines = append(lines, fmt.Sprintf("vlog.rewrite_trigger_churn_per_sec=%d", r.opts.ValueLog.Generational.RewriteTriggerChurnPerSec)) + if minAge := r.opts.ValueLog.Generational.RewriteMinSegmentAge; minAge <= 0 { + lines = append(lines, fmt.Sprintf("vlog.rewrite_min_segment_age_ms=default (effective=%d)", int((30*time.Second)/time.Millisecond))) + } else { + lines = append(lines, fmt.Sprintf("vlog.rewrite_min_segment_age_ms=%d", int(minAge/time.Millisecond))) + } if target := r.opts.ValueLog.BlockTargetCompressedBytes; target <= 0 { lines = append(lines, "vlog.block_target_bytes=default (effective=4096B)") } else { @@ -663,6 +669,7 @@ func buildTreeDBOptions(dir string) (treedb.Options, treeDBOptionsReport, error) opts.ValueLog.Generational.RewriteTriggerStaleRatioPPM = clampUint32(uint64(*treedbVlogRewriteTriggerStaleRatioPPM)) opts.ValueLog.Generational.RewriteTriggerTotalBytes = *treedbVlogRewriteTriggerTotalBytes opts.ValueLog.Generational.RewriteTriggerChurnPerSec = *treedbVlogRewriteTriggerChurnPerSec + opts.ValueLog.Generational.RewriteMinSegmentAge = time.Duration(*treedbVlogRewriteMinSegmentAgeMS) * time.Millisecond if maintenanceMode == "bench" { // Disable background maintenance loops. 
"bench" mode aims for stable diff --git a/cmd/unified_bench/adapter_treedb_vlog_test.go b/cmd/unified_bench/adapter_treedb_vlog_test.go index 3a2dbd9f5..f54948848 100644 --- a/cmd/unified_bench/adapter_treedb_vlog_test.go +++ b/cmd/unified_bench/adapter_treedb_vlog_test.go @@ -131,6 +131,23 @@ func TestBuildTreeDBOptions_VlogDictClassModeFlag(t *testing.T) { } } +func TestBuildTreeDBOptions_VlogRewriteMinSegmentAgeFlag(t *testing.T) { + saved := saveTreeDBFlagState() + defer restoreTreeDBFlagState(saved) + + *treedbVlogRewriteMinSegmentAgeMS = 5000 + opts, rep, err := buildTreeDBOptions("") + if err != nil { + t.Fatalf("buildTreeDBOptions: %v", err) + } + if got := opts.ValueLog.Generational.RewriteMinSegmentAge.Milliseconds(); got != 5000 { + t.Fatalf("unexpected rewrite min segment age ms: got=%d want=5000", got) + } + if got := rep.formatText(""); !strings.Contains(got, "vlog.rewrite_min_segment_age_ms=5000") { + t.Fatalf("resolved options missing rewrite min segment age: %q", got) + } +} + func TestBuildTreeDBOptions_InvalidVlogDictClassMode(t *testing.T) { saved := saveTreeDBFlagState() defer restoreTreeDBFlagState(saved) diff --git a/cmd/unified_bench/main.go b/cmd/unified_bench/main.go index 461ae0235..e4d3aa592 100644 --- a/cmd/unified_bench/main.go +++ b/cmd/unified_bench/main.go @@ -1270,8 +1270,26 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.policy", "treedb.cache.vlog_generation.scheduler_state", "treedb.cache.vlog_generation.scheduler_last_reason", + "treedb.cache.vlog_generation.maintenance_phase", + "treedb.cache.vlog_generation.maintenance.attempts", + "treedb.cache.vlog_generation.maintenance.acquired", + "treedb.cache.vlog_generation.maintenance.collisions", + "treedb.cache.vlog_generation.maintenance.skip.wal_on_periodic", + "treedb.cache.vlog_generation.maintenance.skip.maintenance_phase", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate", + 
"treedb.cache.vlog_generation.maintenance.skip.stage_gate_not_due", + "treedb.cache.vlog_generation.maintenance.skip.stage_gate_due_reserved", + "treedb.cache.vlog_generation.maintenance.skip.age_blocked_gate", + "treedb.cache.vlog_generation.maintenance.skip.priority_pending", + "treedb.cache.vlog_generation.maintenance.skip.quiet_window", + "treedb.cache.vlog_generation.maintenance.skip.before_first_checkpoint", + "treedb.cache.vlog_generation.maintenance.skip.checkpoint_inflight", "treedb.cache.vlog_generation.churn_bytes_total", "treedb.cache.vlog_generation.churn_bytes_per_sec", + "treedb.cache.vlog_generation.rewrite_trigger.stale_ratio_ppm", + "treedb.cache.vlog_generation.rewrite_trigger.total_bytes", + "treedb.cache.vlog_generation.rewrite_trigger.churn_per_sec", + "treedb.cache.vlog_generation.rewrite.min_segment_age_ms", "treedb.cache.vlog_generation.bytes.live.total", "treedb.cache.vlog_generation.bytes.live.hot", "treedb.cache.vlog_generation.bytes.live.warm", @@ -1285,6 +1303,29 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.segments.hot", "treedb.cache.vlog_generation.segments.warm", "treedb.cache.vlog_generation.segments.cold", + "treedb.cache.vlog_generation.rewrite.queue_len", + "treedb.cache.vlog_generation.rewrite.queue_loaded", + "treedb.cache.vlog_generation.rewrite.ledger_segments", + "treedb.cache.vlog_generation.rewrite.ledger_bytes_total", + "treedb.cache.vlog_generation.rewrite.ledger_bytes_live", + "treedb.cache.vlog_generation.rewrite.ledger_bytes_stale", + "treedb.cache.vlog_generation.rewrite.ledger_stale_ratio_ppm", + "treedb.cache.vlog_generation.rewrite.stage_pending", + "treedb.cache.vlog_generation.rewrite.stage_observed_unix_nano", + "treedb.cache.vlog_generation.rewrite.penalties_active", + "treedb.cache.vlog_generation.rewrite.age_blocked_until_unix_nano", + "treedb.cache.vlog_generation.rewrite.age_blocked_remaining_ms", + 
"treedb.cache.vlog_generation.rewrite.plan_runs", + "treedb.cache.vlog_generation.rewrite.plan_canceled", + "treedb.cache.vlog_generation.rewrite.plan_errors", + "treedb.cache.vlog_generation.rewrite.plan_empty", + "treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked", + "treedb.cache.vlog_generation.rewrite.plan_empty.no_selection", + "treedb.cache.vlog_generation.rewrite.plan_selected", + "treedb.cache.vlog_generation.rewrite.plan_selected_segments_total", + "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_total", + "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_live", + "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale", "treedb.cache.vlog_generation.rewrite.bytes_in", "treedb.cache.vlog_generation.rewrite.bytes_out", "treedb.cache.vlog_generation.rewrite.reclaim_ratio", @@ -1294,8 +1335,19 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.rewrite.exec.bytes_out_per_sec", "treedb.cache.vlog_generation.rewrite.exec.reclaimed_bytes_per_sec", "treedb.cache.vlog_generation.rewrite.exec.reclaimed_vs_churn_ratio", + "treedb.cache.vlog_generation.rewrite.no_reclaim_runs", + "treedb.cache.vlog_generation.rewrite.no_reclaim_stale_bytes", + "treedb.cache.vlog_generation.rewrite.canceled_runs", + "treedb.cache.vlog_generation.rewrite.deadline_runs", + "treedb.cache.vlog_generation.rewrite.ineffective_runs", "treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_per_sec", "treedb.cache.vlog_generation.rewrite_budget.consumed_share_of_budget_pct", + "treedb.cache.vlog_generation.rewrite_budget.bytes_per_sec", + "treedb.cache.vlog_generation.rewrite_budget.records_per_sec", + "treedb.cache.vlog_generation.rewrite_budget.tokens_bytes", + "treedb.cache.vlog_generation.rewrite_budget.tokens_cap_bytes", + "treedb.cache.vlog_generation.rewrite_budget.tokens_utilization_pct", + "treedb.cache.vlog_generation.rewrite_budget.consumed_bytes_total", 
"treedb.cache.vlog_generation.rewrite.runs", "treedb.cache.vlog_generation.gc.deleted_segments", "treedb.cache.vlog_generation.gc.deleted_bytes", diff --git a/cmd/unified_bench/profiles_treedb_index_test.go b/cmd/unified_bench/profiles_treedb_index_test.go index 80720f73c..36a897562 100644 --- a/cmd/unified_bench/profiles_treedb_index_test.go +++ b/cmd/unified_bench/profiles_treedb_index_test.go @@ -165,6 +165,7 @@ type savedTreeDBFlagState struct { vlogGenColdBytes int64 vlogRewriteBudgetBPS int64 vlogRewriteBudgetRPS int + vlogRewriteMinAgeMS int disableWAL bool relaxedSync bool disableChecksum bool @@ -197,6 +198,7 @@ func saveTreeDBFlagState() savedTreeDBFlagState { vlogGenColdBytes: *treedbVlogGenerationColdSegmentBytes, vlogRewriteBudgetBPS: *treedbVlogRewriteBudgetBytesPerSec, vlogRewriteBudgetRPS: *treedbVlogRewriteBudgetRecordsPerSec, + vlogRewriteMinAgeMS: *treedbVlogRewriteMinSegmentAgeMS, disableWAL: *treedbDisableWAL, relaxedSync: *treedbRelaxedSync, disableChecksum: *treedbDisableReadChecksum, @@ -225,6 +227,7 @@ func restoreTreeDBFlagState(s savedTreeDBFlagState) { *treedbVlogGenerationColdSegmentBytes = s.vlogGenColdBytes *treedbVlogRewriteBudgetBytesPerSec = s.vlogRewriteBudgetBPS *treedbVlogRewriteBudgetRecordsPerSec = s.vlogRewriteBudgetRPS + *treedbVlogRewriteMinSegmentAgeMS = s.vlogRewriteMinAgeMS *treedbDisableWAL = s.disableWAL *treedbRelaxedSync = s.relaxedSync *treedbDisableReadChecksum = s.disableChecksum @@ -252,6 +255,7 @@ func resetTreeDBIndexFlagsForTest() { *treedbVlogGenerationColdSegmentBytes = 0 *treedbVlogRewriteBudgetBytesPerSec = 0 *treedbVlogRewriteBudgetRecordsPerSec = 0 + *treedbVlogRewriteMinSegmentAgeMS = 0 *treedbDisableWAL = false *treedbRelaxedSync = false *treedbDisableReadChecksum = false diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 249a2b753..648120a26 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -10,6 
+10,7 @@ - `-treedb-vlog-generation-policy hot_warm_cold` - `-treedb-vlog-rewrite-trigger-total-bytes` set for your dataset size - `-treedb-vlog-rewrite-budget-bytes-per-sec` and/or `-treedb-vlog-rewrite-budget-records-per-sec` +- `-treedb-vlog-rewrite-min-segment-age-ms` keep default for production; lower only for short-loop experiments ## Maintenance Model - Rewrite: threshold-triggered and budget-bounded. @@ -23,6 +24,12 @@ Primary keys: - `treedb.cache.vlog_generation.scheduler_state` - `treedb.cache.vlog_generation.scheduler_last_reason` - `treedb.cache.vlog_generation.churn_bytes_per_sec` +- `treedb.cache.vlog_generation.rewrite.min_segment_age_ms` +- `treedb.cache.vlog_generation.rewrite.plan_runs` +- `treedb.cache.vlog_generation.rewrite.plan_empty` +- `treedb.cache.vlog_generation.rewrite.plan_empty.age_blocked` +- `treedb.cache.vlog_generation.rewrite.plan_selected` +- `treedb.cache.vlog_generation.rewrite.ledger_bytes_stale` - `treedb.cache.vlog_generation.rewrite.runs` - `treedb.cache.vlog_generation.rewrite.bytes_in` - `treedb.cache.vlog_generation.rewrite.bytes_out` diff --git a/docs/benchmarks/VLOG_REWRITE_MIN_SEGMENT_AGE_SWEEP_2026-03-28.md b/docs/benchmarks/VLOG_REWRITE_MIN_SEGMENT_AGE_SWEEP_2026-03-28.md new file mode 100644 index 000000000..6f3608eb7 --- /dev/null +++ b/docs/benchmarks/VLOG_REWRITE_MIN_SEGMENT_AGE_SWEEP_2026-03-28.md @@ -0,0 +1,72 @@ +# VLOG Rewrite Min-Segment-Age Sweep (2026-03-28) + +## Goal + +Evaluate whether lowering online rewrite min-segment-age improves short-loop +signal without harming sync-time or end-of-run app-dir size. 
+ +## Workload + +- Command core: + - `./bin/unified-bench` + - `-profile fast` + - `-dbs treedb` + - `-keys 900000` + - `-valsize 256` + - `-batchsize 4000` + - `-test batch_write_steady,random_write` + - `-val-pattern celestia_height_prefix_fill` + - `-checkpoint-every-bytes 4194304` + - `-treedb-force-value-pointers=true` + - `-treedb-vlog-compression dict` + - `-treedb-vlog-compression-autotune aggressive` + - `-treedb-vlog-generation-policy hot_warm_cold` + - `-treedb-vlog-rewrite-trigger-total-bytes 1` + - `-treedb-vlog-rewrite-trigger-stale-ratio-ppm 1` + - `-treedb-vlog-rewrite-trigger-churn-per-sec 1` + - `-treedb-vlog-rewrite-budget-bytes-per-sec 134217728` + - `-treedb-cache-stats-after-tests=true` + +- Swept: + - default (effective 30000ms) + - `-treedb-vlog-rewrite-min-segment-age-ms 1000` + - `-treedb-vlog-rewrite-min-segment-age-ms 5000` + - `-treedb-vlog-rewrite-min-segment-age-ms 10000` + +## Results + +| min age | rewrite activity | dir bytes | wal bytes | note | +|---|---:|---:|---:|---| +| default (30000ms) | rewrite_runs=0, plan_empty.age_blocked=1 | 567,439,668 | 553,889,306 | baseline behavior | +| 1000ms | rewrite_runs=1, plan_selected=1, gc_runs=1 | 702,734,421 | 685,611,243 | clear regression | +| 5000ms | rewrite_runs=0, plan_empty.age_blocked=1 | 567,406,884 | 553,889,290 | effectively baseline | +| 10000ms | rewrite_runs=0, plan_empty.age_blocked=1 | 567,439,650 | 553,889,288 | effectively baseline | + +Observed for the regressing 1000ms run: + +- `rewrite.bytes_in` ~= 64MB +- `rewrite.bytes_out` ~= 528MB +- `rewrite.reclaim_ratio` = `0.000000` +- `gc.deleted_segments` = `0` + +Interpretation: rewrite executes too early and amplifies bytes without reclaim, +so this setting is not suitable for production-like loops. 
+ +## Interleaved A/B confirmation + +Using `scripts/celestia_fast_gate.sh` with same binaries and only this flag as +candidate delta (`CANDIDATE_EXTRA_FLAGS='-treedb-vlog-rewrite-min-segment-age-ms 1'`): + +- Output: `/tmp/gomap_minage_gate_ctr4Ji/gate` +- Decision: `clear_regression` +- Completed pairs: 2 +- Median delta (`candidate - control`): + - `s_sync_app_bytes`: +135,580,501.5 + - `t_sync_seconds`: +13 + +## Conclusion + +- Keep default min-segment-age for normal runs. +- Keep the flag as an explicit lab-only override for controlled scheduler + experiments. +- Do not enable low values (1ms/1000ms) in gate/default configs. diff --git a/scripts/celestia_fast_gate.sh b/scripts/celestia_fast_gate.sh index f79292e0f..59d93e551 100755 --- a/scripts/celestia_fast_gate.sh +++ b/scripts/celestia_fast_gate.sh @@ -40,6 +40,7 @@ VLOG_COMPRESSION_AUTOTUNE="${VLOG_COMPRESSION_AUTOTUNE:-aggressive}" VLOG_COMPRESSION_VARIANT="${VLOG_COMPRESSION_VARIANT:-dict}" DICT_TRAIN_BYTES="${DICT_TRAIN_BYTES:-1048576}" DICT_BYTES="${DICT_BYTES:-32768}" +VLOG_REWRITE_MIN_SEGMENT_AGE_MS="${VLOG_REWRITE_MIN_SEGMENT_AGE_MS:-}" REWRITE_ENABLED="${REWRITE_ENABLED:-1}" REWRITE_ARGS="${REWRITE_ARGS:--rw}" @@ -255,6 +256,9 @@ run_variant() { -treedb-vlog-dict-train-bytes "$DICT_TRAIN_BYTES" -treedb-vlog-dict-dict-bytes "$DICT_BYTES" ) + if [[ -n "$VLOG_REWRITE_MIN_SEGMENT_AGE_MS" ]]; then + cmd+=(-treedb-vlog-rewrite-min-segment-age-ms "$VLOG_REWRITE_MIN_SEGMENT_AGE_MS") + fi if [[ -n "$COMMON_EXTRA_FLAGS" ]]; then # shellcheck disable=SC2206 @@ -737,6 +741,7 @@ vlog_compression_autotune=$VLOG_COMPRESSION_AUTOTUNE vlog_compression_variant=$VLOG_COMPRESSION_VARIANT dict_train_bytes=$DICT_TRAIN_BYTES dict_bytes=$DICT_BYTES +vlog_rewrite_min_segment_age_ms=$VLOG_REWRITE_MIN_SEGMENT_AGE_MS rewrite_enabled=$REWRITE_ENABLED rewrite_args=$REWRITE_ARGS measure_gzip=$MEASURE_GZIP From 4a52d51fff5049c8af0238172ccd662dfbe9613e Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 19:35:02 -1000 
Subject: [PATCH 48/61] treedb: reuse decode scratch in online rewrite reads --- TreeDB/db/vlog_rewrite.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go index ab5b5eb87..be3c3d069 100644 --- a/TreeDB/db/vlog_rewrite.go +++ b/TreeDB/db/vlog_rewrite.go @@ -31,6 +31,7 @@ const defaultValueLogRewriteSegmentBytes = 128 << 20 const rewriteDictMinPayloadBytes = 32 << 10 const rewriteDictBatchMaxK = 64 +const rewriteReadScratchMaxCap = 1 << 20 // 1MiB cap to avoid retaining oversized decode buffers func rewriteAllowDictForSmallPayload(value []byte) bool { if len(value) < page.PageSize { @@ -1230,6 +1231,7 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl swaps := make([]rewriteSwap, 0, batchSize) localityPolicy := normalizeValueLogRewriteLocalityPolicy(opts.LocalityPolicy) candidates := make([]rewriteCandidate, 0, batchSize) + var rewriteReadScratch []byte var canceledErr error flushBatch := func() error { @@ -1243,7 +1245,7 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl return err } for _, candidate := range candidates { - val, err := db.valueLogManager.Read(candidate.oldPtr) + val, usedScratch, err := db.valueLogManager.ReadUnsafeTo(candidate.oldPtr, rewriteReadScratch) if err != nil { return err } @@ -1251,6 +1253,15 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl if err != nil { return err } + if usedScratch { + // Reuse decode storage across records to reduce alloc churn while + // bounding retained capacity to avoid RSS blow-ups on outliers. 
+ if cap(val) > rewriteReadScratchMaxCap { + rewriteReadScratch = nil + } else { + rewriteReadScratch = val[:0] + } + } startRID++ stats.RecordsCopied++ swaps = append(swaps, rewriteSwap{ From f764ab93cee5e357e712ab299d5b50ccf3bd43bf Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 19:41:52 -1000 Subject: [PATCH 49/61] treedb: expose rewrite value vs leaf copy stats --- TreeDB/caching/db.go | 24 ++++++++++++++++++ TreeDB/db/vlog_rewrite.go | 52 +++++++++++++++++++++++++-------------- TreeDB/vlog_rewrite.go | 4 +++ cmd/unified_bench/main.go | 4 +++ 4 files changed, 65 insertions(+), 19 deletions(-) diff --git a/TreeDB/caching/db.go b/TreeDB/caching/db.go index 8fbca7141..ec960e5bd 100644 --- a/TreeDB/caching/db.go +++ b/TreeDB/caching/db.go @@ -5856,6 +5856,10 @@ type DB struct { vlogGenerationRewriteBytesIn atomic.Uint64 vlogGenerationRewriteBytesOut atomic.Uint64 vlogGenerationRewriteReclaimedBytes atomic.Uint64 + vlogGenerationRewriteValueRecordsCopied atomic.Uint64 + vlogGenerationRewriteValueBytesCopied atomic.Uint64 + vlogGenerationRewriteLeafRefRecordsCopied atomic.Uint64 + vlogGenerationRewriteLeafRefBytesCopied atomic.Uint64 vlogGenerationRewriteProcessedLiveBytes atomic.Uint64 vlogGenerationRewriteProcessedStaleBytes atomic.Uint64 vlogGenerationRewriteNoReclaimRuns atomic.Uint64 @@ -15117,6 +15121,18 @@ planned: if stats.RecordsCopied > 0 { db.vlogGenerationRemapSuccesses.Add(uint64(stats.RecordsCopied)) } + if stats.ValueRecordsCopied > 0 { + db.vlogGenerationRewriteValueRecordsCopied.Add(uint64(stats.ValueRecordsCopied)) + } + if stats.ValueBytesCopied > 0 { + db.vlogGenerationRewriteValueBytesCopied.Add(uint64(stats.ValueBytesCopied)) + } + if stats.LeafRefRecordsCopied > 0 { + db.vlogGenerationRewriteLeafRefRecordsCopied.Add(uint64(stats.LeafRefRecordsCopied)) + } + if stats.LeafRefBytesCopied > 0 { + db.vlogGenerationRewriteLeafRefBytesCopied.Add(uint64(stats.LeafRefBytesCopied)) + } if consumed > 0 { 
db.vlogGenerationConsumeRewriteBudgetBytes(consumed) } @@ -20954,6 +20970,10 @@ func (db *DB) Stats() map[string]string { rewriteBytesInTotal := db.vlogGenerationRewriteBytesIn.Load() rewriteBytesOutTotal := db.vlogGenerationRewriteBytesOut.Load() rewriteReclaimedBytesTotal := db.vlogGenerationRewriteReclaimedBytes.Load() + rewriteValueRecordsCopiedTotal := db.vlogGenerationRewriteValueRecordsCopied.Load() + rewriteValueBytesCopiedTotal := db.vlogGenerationRewriteValueBytesCopied.Load() + rewriteLeafRefRecordsCopiedTotal := db.vlogGenerationRewriteLeafRefRecordsCopied.Load() + rewriteLeafRefBytesCopiedTotal := db.vlogGenerationRewriteLeafRefBytesCopied.Load() rewriteProcessedLiveBytes := db.vlogGenerationRewriteProcessedLiveBytes.Load() rewriteProcessedStaleBytes := db.vlogGenerationRewriteProcessedStaleBytes.Load() rewriteProcessedTotal := rewriteProcessedLiveBytes + rewriteProcessedStaleBytes @@ -21144,6 +21164,10 @@ func (db *DB) Stats() map[string]string { stats["treedb.cache.vlog_generation.segments.cold"] = fmt.Sprintf("%d", retained.SegmentsCold) stats["treedb.cache.vlog_generation.rewrite.bytes_in"] = fmt.Sprintf("%d", rewriteBytesInTotal) stats["treedb.cache.vlog_generation.rewrite.bytes_out"] = fmt.Sprintf("%d", rewriteBytesOutTotal) + stats["treedb.cache.vlog_generation.rewrite.value_records_copied"] = fmt.Sprintf("%d", rewriteValueRecordsCopiedTotal) + stats["treedb.cache.vlog_generation.rewrite.value_bytes_copied"] = fmt.Sprintf("%d", rewriteValueBytesCopiedTotal) + stats["treedb.cache.vlog_generation.rewrite.leafref_records_copied"] = fmt.Sprintf("%d", rewriteLeafRefRecordsCopiedTotal) + stats["treedb.cache.vlog_generation.rewrite.leafref_bytes_copied"] = fmt.Sprintf("%d", rewriteLeafRefBytesCopiedTotal) stats["treedb.cache.vlog_generation.rewrite.processed_live_bytes"] = fmt.Sprintf("%d", rewriteProcessedLiveBytes) stats["treedb.cache.vlog_generation.rewrite.processed_stale_bytes"] = fmt.Sprintf("%d", rewriteProcessedStaleBytes) 
stats["treedb.cache.vlog_generation.rewrite.reclaim_ratio"] = fmt.Sprintf("%.6f", rewriteReclaimRatio) diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go index be3c3d069..133d1de4b 100644 --- a/TreeDB/db/vlog_rewrite.go +++ b/TreeDB/db/vlog_rewrite.go @@ -50,6 +50,14 @@ type ValueLogRewriteStats struct { BytesBefore int64 BytesAfter int64 RecordsCopied int + // Value* counters track key/value-pointer payload copied by the main rewrite + // pointer swap path. + ValueRecordsCopied int + ValueBytesCopied int64 + // LeafRef* counters track outer-leaf page payload copied by the leaf-ref + // rewrite path (indexOuterLeavesInValueLog mode). + LeafRefRecordsCopied int + LeafRefBytesCopied int64 // SourceSegmentsRequested is the number of source segments selected for this // rewrite run after applying selection filters. SourceSegmentsRequested int @@ -1264,6 +1272,8 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl } startRID++ stats.RecordsCopied++ + stats.ValueRecordsCopied++ + stats.ValueBytesCopied += int64(len(val)) swaps = append(swaps, rewriteSwap{ key: candidate.key, oldPtr: candidate.oldPtr, @@ -1334,11 +1344,13 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl // referenced leaf pages out of the selected source segments so cleanup can // actually reclaim space. 
if restrictSource && db.indexOuterLeavesInValueLog && len(sourceIDs) > 0 { - copied, err := db.rewriteLeafRefsOnline(ctx, writer, ridAlloc, sourceIDs, opts.SyncEachBatch) + copied, copiedBytes, err := db.rewriteLeafRefsOnline(ctx, writer, ridAlloc, sourceIDs, opts.SyncEachBatch) if err != nil { return stats, err } stats.RecordsCopied += copied + stats.LeafRefRecordsCopied += copied + stats.LeafRefBytesCopied += copiedBytes } } else { // Stop publishing further swaps after cancellation; cleanup below still @@ -1484,8 +1496,9 @@ type leafRefRewriteCtx struct { leafMap map[uint64]uint64 // old leafref id -> new leafref id internalMap map[uint64]uint64 // old internal page id -> new page id - retired []uint64 - copied int + retired []uint64 + copied int + copiedBytes int64 } func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { @@ -1542,6 +1555,7 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { } c.leafMap[id] = leafID c.copied++ + c.copiedBytes += int64(len(leafPage)) return leafID, true, nil } @@ -1639,32 +1653,32 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { } } -func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, ridAlloc *rewriteRIDAllocator, sourceIDs map[uint32]struct{}, sync bool) (copied int, err error) { +func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, ridAlloc *rewriteRIDAllocator, sourceIDs map[uint32]struct{}, sync bool) (copied int, copiedBytes int64, err error) { if db == nil { - return 0, fmt.Errorf("missing db") + return 0, 0, fmt.Errorf("missing db") } if !db.indexOuterLeavesInValueLog { - return 0, nil + return 0, 0, nil } if db.readOnly { - return 0, ErrReadOnly + return 0, 0, ErrReadOnly } if db.valueLogManager == nil { - return 0, fmt.Errorf("value log manager unavailable") + return 0, 0, fmt.Errorf("value log manager unavailable") } if writer == nil || ridAlloc == nil { - return 0, fmt.Errorf("vlog-rewrite: 
missing writer/rid state") + return 0, 0, fmt.Errorf("vlog-rewrite: missing writer/rid state") } // Treat nil sourceIDs as "all sources" and an empty, non-nil map as "no // sources". The latter means there is nothing to rewrite. if sourceIDs != nil && len(sourceIDs) == 0 { - return 0, nil + return 0, 0, nil } if ctx == nil { ctx = context.Background() } if err := ctx.Err(); err != nil { - return 0, err + return 0, 0, err } db.writeMu.Lock() @@ -1673,7 +1687,7 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, snap := db.AcquireSnapshot() if snap == nil || snap.idx == nil || snap.state == nil { closeRewriteSnapshot(&err, snap) - return 0, fmt.Errorf("missing snapshot state") + return 0, 0, fmt.Errorf("missing snapshot state") } defer closeRewriteSnapshot(&err, snap) @@ -1710,33 +1724,33 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, newSysRoot, sysChanged, err := leafCtx.rewriteNode(sysRoot) if err != nil { - return 0, err + return 0, 0, err } newRoot, userChanged, err := leafCtx.rewriteNode(rootID) if err != nil { - return 0, err + return 0, 0, err } if !sysChanged && !userChanged { - return 0, nil + return 0, 0, nil } // Ensure the copied leaf-page records are visible before publishing new leaf // refs that point at them. 
if sync { if err := writer.Sync(); err != nil { - return 0, err + return 0, 0, err } } else { if err := writer.Flush(); err != nil { - return 0, err + return 0, 0, err } } if err := db.finalizeCommit(newRoot, newSysRoot, leafCtx.retired, sync, adaptive.Metrics{}, nil, db.indexOuterLeavesInValueLog, nil); err != nil { - return 0, err + return 0, 0, err } tracker = nil - return leafCtx.copied, nil + return leafCtx.copied, leafCtx.copiedBytes, nil } func nextRewriteRIDStart(segments []logSegment) (uint64, error) { diff --git a/TreeDB/vlog_rewrite.go b/TreeDB/vlog_rewrite.go index 5e60b37da..e685aec54 100644 --- a/TreeDB/vlog_rewrite.go +++ b/TreeDB/vlog_rewrite.go @@ -13,6 +13,10 @@ type ValueLogRewriteStats struct { BytesBefore int64 BytesAfter int64 RecordsCopied int + ValueRecordsCopied int + ValueBytesCopied int64 + LeafRefRecordsCopied int + LeafRefBytesCopied int64 SourceSegmentsRequested int SourceSegmentsStillReferenced int SourceSegmentsUnreferenced int diff --git a/cmd/unified_bench/main.go b/cmd/unified_bench/main.go index e4d3aa592..aebc1ff9d 100644 --- a/cmd/unified_bench/main.go +++ b/cmd/unified_bench/main.go @@ -1328,6 +1328,10 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.rewrite.plan_selected_bytes_stale", "treedb.cache.vlog_generation.rewrite.bytes_in", "treedb.cache.vlog_generation.rewrite.bytes_out", + "treedb.cache.vlog_generation.rewrite.value_records_copied", + "treedb.cache.vlog_generation.rewrite.value_bytes_copied", + "treedb.cache.vlog_generation.rewrite.leafref_records_copied", + "treedb.cache.vlog_generation.rewrite.leafref_bytes_copied", "treedb.cache.vlog_generation.rewrite.reclaim_ratio", "treedb.cache.vlog_generation.rewrite.output_ratio", "treedb.cache.vlog_generation.rewrite.processed_stale_ratio", From f688e39209880fd833cfa9ed1a1b4e37f0498803 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 20:04:18 -1000 Subject: [PATCH 50/61] unified-bench: print 
retained-prune stats --- cmd/unified_bench/main.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmd/unified_bench/main.go b/cmd/unified_bench/main.go index aebc1ff9d..76053da17 100644 --- a/cmd/unified_bench/main.go +++ b/cmd/unified_bench/main.go @@ -1356,6 +1356,14 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.gc.deleted_segments", "treedb.cache.vlog_generation.gc.deleted_bytes", "treedb.cache.vlog_generation.gc.runs", + "treedb.cache.vlog_retained_prune.runs", + "treedb.cache.vlog_retained_prune.forced_runs", + "treedb.cache.vlog_retained_prune.removed_segments", + "treedb.cache.vlog_retained_prune.removed_bytes", + "treedb.cache.vlog_retained_prune.live_skipped_segments", + "treedb.cache.vlog_retained_prune.live_skipped_bytes", + "treedb.cache.vlog_retained_prune.zombie_marked_segments", + "treedb.cache.vlog_retained_prune.zombie_marked_bytes", "treedb.cache.vlog_generation.vacuum.runs", "treedb.cache.vlog_generation.vacuum.failures", "treedb.cache.vlog_generation.remap.successes", From 186801c643916a9b23cb90c7cf798c1dd3d0e87b Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 20:19:53 -1000 Subject: [PATCH 51/61] valuelog: cache grouped compressed fallback reads --- TreeDB/internal/valuelog/manager.go | 181 ++++++++++++++++++++++ TreeDB/internal/valuelog/valuelog_test.go | 86 ++++++++++ 2 files changed, 267 insertions(+) diff --git a/TreeDB/internal/valuelog/manager.go b/TreeDB/internal/valuelog/manager.go index 230bcef3a..f6102a29d 100644 --- a/TreeDB/internal/valuelog/manager.go +++ b/TreeDB/internal/valuelog/manager.go @@ -504,6 +504,11 @@ func (f *File) ReadUnsafeTo(ptr page.ValuePtr, verifyCRC bool, dst []byte) ([]by } } f.mmapReadFallbackReadAt.Add(1) + if !verifyCRC { + if val, usedDst, err, ok := f.readGroupedCompressedFromFileTo(ptr, dst); ok { + return val, usedDst, err + } + } return ReadAtWithDictTo(f.File, ptr, verifyCRC, f.dictLookup, f.templateLookup, 
f.templateDefCache, f.templateDecodeOpts, dst) } // Avoid per-read Stat/lock churn once we have exhausted the dead-mapping @@ -519,9 +524,185 @@ func (f *File) ReadUnsafeTo(ptr page.ValuePtr, verifyCRC bool, dst []byte) ([]by f.mmapReadMissDeadMappingCap.Add(1) } f.mmapReadFallbackReadAt.Add(1) + if !verifyCRC { + if val, usedDst, err, ok := f.readGroupedCompressedFromFileTo(ptr, dst); ok { + return val, usedDst, err + } + } return ReadAtWithDictTo(f.File, ptr, verifyCRC, f.dictLookup, f.templateLookup, f.templateDefCache, f.templateDecodeOpts, dst) } +// readGroupedCompressedFromFileTo handles grouped+compressed reads on the +// non-mmap fallback path while reusing File grouped-frame cache entries. +// +// ok=false means the caller should fall back to the generic ReadAtWithDictTo +// decoder path (for non-grouped / uncompressed / checksum-verified cases). +func (f *File) readGroupedCompressedFromFileTo(ptr page.ValuePtr, dst []byte) ([]byte, bool, error, bool) { + if f == nil || f.File == nil { + return nil, false, errors.New("valuelog: nil file"), true + } + if ptr.Offset < 4 || !page.ValuePtrIsGrouped(ptr) { + return nil, false, nil, false + } + + start := int64(ptr.Offset - 4) + var header [HeaderSize]byte + if _, err := f.File.ReadAt(header[:], start); err != nil { + return nil, false, err, true + } + if header[4] != Version { + return nil, false, ErrCorrupt, true + } + if header[5]&recordFlagGrouped == 0 { + return nil, false, nil, false + } + valueLen := binary.LittleEndian.Uint32(header[16:20]) + if recordSizeExceedsMax(valueLen) { + return nil, false, ErrRecordTooLarge, true + } + expectedLen := uint32(headerWithoutCRC) + valueLen + if !page.ValuePtrRecordLengthHintMatches(ptr, expectedLen) { + return nil, false, ErrCorrupt, true + } + if int(valueLen) < FrameHeaderSize { + return nil, false, ErrCorrupt, true + } + + frameOff := start + HeaderSize + var frameHeader [FrameHeaderSize]byte + if _, err := f.File.ReadAt(frameHeader[:], frameOff); err != nil { + 
return nil, false, err, true + } + if frameHeader[0] != FrameVersion { + return nil, false, ErrCorrupt, true + } + k := int(frameHeader[2]) + if k <= 0 || k > MaxFrameK { + return nil, false, ErrCorrupt, true + } + if frameHeader[1]&FrameFlagCompressed == 0 { + return nil, false, nil, false + } + + subIndex := int(page.ValuePtrSubIndex(ptr)) + if subIndex < 0 || subIndex >= k { + return nil, false, ErrCorrupt, true + } + if cachedRaw, valStart, valEnd, rawLen, hit := f.groupedFrameCacheLookup(start, false, subIndex); hit { + if uint32(len(cachedRaw)) != rawLen || valEnd < valStart || valEnd > rawLen { + return nil, false, ErrCorrupt, true + } + val := cachedRaw[valStart:valEnd] + if f.templateLookup != nil && templ.IsEncodedPayload(val) { + decoded, err := templ.DecodePayloadAppend(nil, val, func(id uint64) (templ.TemplateDef, error) { + return resolveTemplateDef(id, f.templateLookup, f.templateDefCache) + }, f.templateDecodeOpts) + if err != nil { + return nil, false, err, true + } + return decoded, false, nil, true + } + if dst != nil && cap(dst) >= len(val) { + out := dst[:len(val)] + copy(out, val) + return out, true, nil, true + } + out := make([]byte, len(val)) + copy(out, val) + return out, false, nil, true + } + + ridBytes := k * 8 + offsetBytes := (k + 1) * 4 + prefixLen := FrameHeaderSize + ridBytes + offsetBytes + if int(valueLen) < prefixLen { + return nil, false, ErrCorrupt, true + } + + payloadScratch := getDecodeScratch(int(valueLen)) + defer putDecodeScratch(payloadScratch) + payload := payloadScratch[:int(valueLen)] + if _, err := f.File.ReadAt(payload, start+HeaderSize); err != nil { + return nil, false, err, true + } + + off := FrameHeaderSize + ridBytes + var offsets [MaxFrameK + 1]uint32 + prev := uint32(0) + for i := 0; i < k+1; i++ { + cur := binary.LittleEndian.Uint32(payload[off : off+4]) + if cur < prev { + return nil, false, ErrCorrupt, true + } + offsets[i] = cur + prev = cur + off += 4 + } + rawLen := offsets[k] + if 
limits.MaxRecordSize > 0 && int64(rawLen) > limits.MaxRecordSize { + return nil, false, ErrRecordTooLarge, true + } + valStart := offsets[subIndex] + valEnd := offsets[subIndex+1] + if valEnd < valStart || valEnd > rawLen { + return nil, false, ErrCorrupt, true + } + + frame := FrameHeader{ + Version: frameHeader[0], + Flags: frameHeader[1], + K: uint8(k), + Reserved: frameHeader[3], + DictID: binary.LittleEndian.Uint64(frameHeader[4:12]), + } + + raw := f.takeDecodeScratch(int(rawLen)) + pooledRaw := true + raw, err := decodeFramePayloadTo(frame, payload[prefixLen:], f.dictLookup, rawLen, raw) + if err != nil { + if pooledRaw { + f.releaseDecodeScratch(raw) + } + return nil, false, err, true + } + if uint32(len(raw)) != rawLen { + if pooledRaw { + f.releaseDecodeScratch(raw) + } + return nil, false, ErrCorrupt, true + } + cachedRaw := f.groupedFrameCacheStore(start, false, k, offsets, raw, true) + + val := raw[valStart:valEnd] + if f.templateLookup != nil && templ.IsEncodedPayload(val) { + decoded, err := templ.DecodePayloadAppend(nil, val, func(id uint64) (templ.TemplateDef, error) { + return resolveTemplateDef(id, f.templateLookup, f.templateDefCache) + }, f.templateDecodeOpts) + if pooledRaw && !cachedRaw { + f.releaseDecodeScratch(raw) + } + if err != nil { + return nil, false, err, true + } + return decoded, false, nil, true + } + + if dst != nil && cap(dst) >= len(val) { + out := dst[:len(val)] + copy(out, val) + if pooledRaw && !cachedRaw { + f.releaseDecodeScratch(raw) + } + return out, true, nil, true + } + out := make([]byte, len(val)) + copy(out, val) + if pooledRaw && !cachedRaw { + f.releaseDecodeScratch(raw) + } + return out, false, nil, true +} + func (f *File) ReadAppend(ptr page.ValuePtr, verifyCRC bool, dst []byte) ([]byte, error) { if f == nil || f.File == nil { return nil, errors.New("valuelog: nil file") diff --git a/TreeDB/internal/valuelog/valuelog_test.go b/TreeDB/internal/valuelog/valuelog_test.go index b974c59ff..96ade0364 100644 --- 
a/TreeDB/internal/valuelog/valuelog_test.go +++ b/TreeDB/internal/valuelog/valuelog_test.go @@ -802,6 +802,92 @@ func TestValueLogManager_GroupedFrameCache_MaxRawBytesSkipsOversize(t *testing.T } } +func TestValueLogManager_ReadUnsafeTo_CompressedGroupedFallbackUsesCache(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("mmap not supported on windows") + } + + // Force file-read fallback so this test exercises the non-mmap path. + withMappedSealedBudget(t, 0) + + dir := t.TempDir() + fileID, err := EncodeFileID(0, 1) + if err != nil { + t.Fatalf("encode file id: %v", err) + } + path := filepath.Join(dir, "value-l0-000001.log") + + writer, err := NewWriter(path, fileID) + if err != nil { + t.Fatalf("new writer: %v", err) + } + writer.SetBlockCompression(BlockCodecSnappy, true) + ptrs, want := appendCompressedFrameForCacheTests(t, writer, 0, 4) + if err := writer.Close(); err != nil { + t.Fatalf("close: %v", err) + } + + m, err := NewManager(dir) + if err != nil { + t.Fatalf("new manager: %v", err) + } + defer func() { _ = m.Close() }() + m.SetDisableReadChecksum(true) + m.SetGroupedFrameCacheEntries(4) + + f := m.files[fileID] + if f == nil { + t.Fatalf("missing opened file for id=%d", fileID) + } + + dst := make([]byte, 0, 512) + got0, used0, err := m.ReadUnsafeTo(ptrs[0], dst[:0]) + if err != nil { + t.Fatalf("read unsafe to first: %v", err) + } + if !used0 { + t.Fatalf("expected first read to use dst") + } + if !bytes.Equal(got0, want[0]) { + t.Fatalf("first value mismatch: got=%q want=%q", got0, want[0]) + } + + hits0, misses0, entries0, _ := f.groupedFrameCacheStats() + if misses0 == 0 { + t.Fatalf("expected first compressed grouped read to miss cache") + } + if entries0 == 0 { + t.Fatalf("expected first compressed grouped read to populate cache") + } + + got1, used1, err := m.ReadUnsafeTo(ptrs[1], dst[:0]) + if err != nil { + t.Fatalf("read unsafe to second: %v", err) + } + if !used1 { + t.Fatalf("expected second read to use dst") + } + if 
!bytes.Equal(got1, want[1]) { + t.Fatalf("second value mismatch: got=%q want=%q", got1, want[1]) + } + + hits1, misses1, entries1, _ := f.groupedFrameCacheStats() + if hits1 <= hits0 { + t.Fatalf("expected second read to hit grouped cache: hits before=%d after=%d", hits0, hits1) + } + if misses1 != misses0 { + t.Fatalf("unexpected cache miss increase on second read: before=%d after=%d", misses0, misses1) + } + if entries1 == 0 { + t.Fatalf("expected grouped cache entries to remain populated") + } + + _, _, missNoMapping, _, fallbacks := m.MmapReadStats() + if missNoMapping == 0 || fallbacks == 0 { + t.Fatalf("expected fallback path stats to reflect no-mmap reads: miss_no_mapping=%d fallbacks=%d", missNoMapping, fallbacks) + } +} + func TestReadAtGroupedFastPathWithoutChecksum(t *testing.T) { dir := t.TempDir() path := filepath.Join(dir, "value-000001.log") From 8afac0815a0def682c3e59141d1fd832e15f64a5 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sat, 28 Mar 2026 20:21:34 -1000 Subject: [PATCH 52/61] unified-bench: print gc observed-source stats --- cmd/unified_bench/main.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cmd/unified_bench/main.go b/cmd/unified_bench/main.go index 76053da17..51f4c76f7 100644 --- a/cmd/unified_bench/main.go +++ b/cmd/unified_bench/main.go @@ -1355,6 +1355,18 @@ func printTreeDBCacheStats(w io.Writer, inst *DBInstance, prefix string) { "treedb.cache.vlog_generation.rewrite.runs", "treedb.cache.vlog_generation.gc.deleted_segments", "treedb.cache.vlog_generation.gc.deleted_bytes", + "treedb.cache.vlog_generation.gc.last_observed_source.segments", + "treedb.cache.vlog_generation.gc.last_observed_source.segments_referenced", + "treedb.cache.vlog_generation.gc.last_observed_source.segments_eligible", + "treedb.cache.vlog_generation.gc.last_observed_source.segments_pending", + "treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_retained", + 
"treedb.cache.vlog_generation.gc.last_observed_source.segments_protected_in_use", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_referenced", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_eligible", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_pending", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_retained", + "treedb.cache.vlog_generation.gc.last_observed_source.bytes_protected_in_use", "treedb.cache.vlog_generation.gc.runs", "treedb.cache.vlog_retained_prune.runs", "treedb.cache.vlog_retained_prune.forced_runs", From 71fc32a6bbe9cb0f497762038748fbf2bfd8198b Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 00:31:05 -1000 Subject: [PATCH 53/61] bench: harden run_celestia AB loop against stuck outliers --- docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md | 11 + scripts/run_celestia_ab.sh | 302 +++++++++++++++---- 2 files changed, 254 insertions(+), 59 deletions(-) diff --git a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md index 25ba010c0..92dc8875b 100644 --- a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md +++ b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md @@ -87,6 +87,9 @@ Now includes anti-loop safeguards: - clear stop (improvement/regression) - futility stop (`futile_remaining_pairs`) - low-signal neutral-streak stop (`low_signal_neutral_streak`) +- strict new-run-home detection (no fallback to old run dirs) +- per-variant timeout/retry for stuck syncs +- invalid-pair streak stop (`invalid_pair_streak`) Example: @@ -97,10 +100,18 @@ CLEAR_WIN_PAIRS=2 \ CLEAR_LOSS_PAIRS=2 \ LOW_SIGNAL_MIN_PAIRS=3 \ LOW_SIGNAL_NEUTRAL_STREAK=3 \ +RUN_TIMEOUT_SECONDS=1800 \ +RUN_MAX_ATTEMPTS_PER_VARIANT=2 \ +RUN_RETRY_SLEEP_SECONDS=20 \ +INVALID_PAIR_STREAK_STOP=2 \ REWRITE_ENABLED=1 \ ./scripts/run_celestia_ab.sh ``` +Notes: +- Pair execution remains strictly single-run at a time and 
interleaved by pair order. +- Invalid runs (timeout, launcher failure, missing new run home, rewrite failure) are recorded but excluded from pair scoring. + ## Process Review Cadence Review and revise the loop after every decision event: diff --git a/scripts/run_celestia_ab.sh b/scripts/run_celestia_ab.sh index 63321696d..eadfd0f9a 100755 --- a/scripts/run_celestia_ab.sh +++ b/scripts/run_celestia_ab.sh @@ -19,6 +19,10 @@ STOP_ON_CLEAR="${STOP_ON_CLEAR:-1}" SLEEP_BETWEEN_RUNS_SECONDS="${SLEEP_BETWEEN_RUNS_SECONDS:-5}" LOW_SIGNAL_MIN_PAIRS="${LOW_SIGNAL_MIN_PAIRS:-3}" LOW_SIGNAL_NEUTRAL_STREAK="${LOW_SIGNAL_NEUTRAL_STREAK:-3}" +RUN_TIMEOUT_SECONDS="${RUN_TIMEOUT_SECONDS:-1800}" +RUN_MAX_ATTEMPTS_PER_VARIANT="${RUN_MAX_ATTEMPTS_PER_VARIANT:-2}" +RUN_RETRY_SLEEP_SECONDS="${RUN_RETRY_SLEEP_SECONDS:-20}" +INVALID_PAIR_STREAK_STOP="${INVALID_PAIR_STREAK_STOP:-2}" TS="$(date +%Y%m%d%H%M%S)" OUT="${OUT_DIR:-$ROOT/artifacts/celestia_ab/$TS}" @@ -34,6 +38,22 @@ if [[ "$MAX_PAIRS" -lt 1 ]]; then echo "MAX_PAIRS must be >= 1" >&2 exit 1 fi +if [[ "$RUN_TIMEOUT_SECONDS" -lt 0 ]]; then + echo "RUN_TIMEOUT_SECONDS must be >= 0" >&2 + exit 1 +fi +if [[ "$RUN_MAX_ATTEMPTS_PER_VARIANT" -lt 1 ]]; then + echo "RUN_MAX_ATTEMPTS_PER_VARIANT must be >= 1" >&2 + exit 1 +fi +if [[ "$RUN_RETRY_SLEEP_SECONDS" -lt 0 ]]; then + echo "RUN_RETRY_SLEEP_SECONDS must be >= 0" >&2 + exit 1 +fi +if [[ "$INVALID_PAIR_STREAK_STOP" -lt 1 ]]; then + echo "INVALID_PAIR_STREAK_STOP must be >= 1" >&2 + exit 1 +fi mkdir -p "$OUT/runs" @@ -55,6 +75,10 @@ stop_on_clear=$STOP_ON_CLEAR sleep_between_runs_seconds=$SLEEP_BETWEEN_RUNS_SECONDS low_signal_min_pairs=$LOW_SIGNAL_MIN_PAIRS low_signal_neutral_streak=$LOW_SIGNAL_NEUTRAL_STREAK +run_timeout_seconds=$RUN_TIMEOUT_SECONDS +run_max_attempts_per_variant=$RUN_MAX_ATTEMPTS_PER_VARIANT +run_retry_sleep_seconds=$RUN_RETRY_SLEEP_SECONDS +invalid_pair_streak_stop=$INVALID_PAIR_STREAK_STOP META list_run_homes() { @@ -90,8 +114,7 @@ detect_new_run_home() { return 0 fi 
done < <(list_run_homes) - - list_run_homes | head -n 1 + return 1 } run_variant() { @@ -104,47 +127,86 @@ run_variant() { local run_dir="$OUT/runs/$run_id" mkdir -p "$run_dir" - local before_file="$run_dir/before_homes.txt" - list_run_homes >"$before_file" - - local run_start - run_start=$(date +%s) - ( - set -euo pipefail - if [[ -n "$env_file" ]]; then - # shellcheck source=/dev/null - set -a - source "$env_file" - set +a + local run_home="" + local app_db="" + local run_start=0 + local run_end=0 + local run_rc=0 + local attempt_used=0 + local invalid_reason="" + local pre_app_bytes=0 + local pre_wal_bytes=0 + local post_app_bytes=0 + local post_wal_bytes=0 + local rewrite_attempted=0 + local rewrite_seconds=0 + local rewrite_rc=0 + local analyze_json="$run_dir/maintenance.json" + rm -f "$analyze_json" + : >"$run_dir/attempts.log" + + local attempt + for ((attempt = 1; attempt <= RUN_MAX_ATTEMPTS_PER_VARIANT; attempt++)); do + attempt_used="$attempt" + local attempt_dir="$run_dir/attempt_${attempt}" + mkdir -p "$attempt_dir" + + local before_file="$attempt_dir/before_homes.txt" + list_run_homes >"$before_file" + + run_start=$(date +%s) + set +e + ( + set -euo pipefail + if [[ -n "$env_file" ]]; then + # shellcheck source=/dev/null + set -a + source "$env_file" + set +a + fi + # Non-login shell avoids user profile side effects (e.g. tty-dependent exports) + # that can fail under nohup/background runs. + if [[ "$RUN_TIMEOUT_SECONDS" -gt 0 ]] && command -v timeout >/dev/null 2>&1; then + timeout --signal=TERM --kill-after=60 "${RUN_TIMEOUT_SECONDS}s" bash -c "$RUN_CMD" + else + bash -c "$RUN_CMD" + fi + ) >"$attempt_dir/launcher.log" 2>&1 + run_rc=$? 
+ set -e + cp "$attempt_dir/launcher.log" "$run_dir/launcher.log" + run_end=$(date +%s) + + run_home="$(detect_new_run_home "$before_file" || true)" + invalid_reason="" + if [[ "$run_rc" -eq 124 || "$run_rc" -eq 137 || "$run_rc" -eq 143 ]]; then + invalid_reason="run_timeout" + elif [[ "$run_rc" -ne 0 ]]; then + invalid_reason="run_cmd_failed" + elif [[ -z "$run_home" || ! -d "$run_home" ]]; then + invalid_reason="run_home_missing" fi - # Non-login shell avoids user profile side effects (e.g. tty-dependent exports) - # that can fail under nohup/background runs. - bash -c "$RUN_CMD" - ) >"$run_dir/launcher.log" 2>&1 - local run_end - run_end=$(date +%s) - - local run_home - run_home="$(detect_new_run_home "$before_file")" - if [[ -z "$run_home" || ! -d "$run_home" ]]; then - echo "failed to detect run home for $run_id" >&2 - exit 1 - fi - local app_db="$run_home/data/application.db" - local pre_app_bytes pre_wal_bytes - pre_app_bytes="$(du_bytes "$app_db")" - pre_wal_bytes="$(du_bytes "$app_db/maindb/wal")" + echo "attempt=$attempt run_exit_code=$run_rc invalid_reason=${invalid_reason:-none} run_home=${run_home:-}" >>"$run_dir/attempts.log" + if [[ -z "$invalid_reason" ]]; then + break + fi + if (( attempt < RUN_MAX_ATTEMPTS_PER_VARIANT )); then + sleep "$RUN_RETRY_SLEEP_SECONDS" + fi + done + + if [[ -n "$run_home" && -d "$run_home" ]]; then + app_db="$run_home/data/application.db" + pre_app_bytes="$(du_bytes "$app_db")" + pre_wal_bytes="$(du_bytes "$app_db/maindb/wal")" - local analyze_json="$run_dir/maintenance.json" - if ! "$ANALYZER" --json "$run_home" >"$analyze_json" 2>"$run_dir/analyze.stderr.log"; then - rm -f "$analyze_json" + if ! 
"$ANALYZER" --json "$run_home" >"$analyze_json" 2>"$run_dir/analyze.stderr.log"; then + rm -f "$analyze_json" + fi fi - local rewrite_attempted=0 - local rewrite_seconds=0 - local rewrite_rc=0 - if [[ "$REWRITE_ENABLED" == "1" && -x "$TREEMAP_BIN" && -d "$app_db" ]]; then + if [[ -z "$invalid_reason" && "$REWRITE_ENABLED" == "1" && -x "$TREEMAP_BIN" && -n "$app_db" && -d "$app_db" ]]; then rewrite_attempted=1 local rewrite_start rewrite_start=$(date +%s) @@ -155,21 +217,23 @@ run_variant() { local rewrite_end rewrite_end=$(date +%s) rewrite_seconds=$((rewrite_end - rewrite_start)) - else - rewrite_rc=0 + if [[ "$rewrite_rc" -ne 0 ]]; then + invalid_reason="rewrite_failed" + fi fi - local post_app_bytes post_wal_bytes - post_app_bytes="$(du_bytes "$app_db")" - post_wal_bytes="$(du_bytes "$app_db/maindb/wal")" + if [[ -n "$app_db" ]]; then + post_app_bytes="$(du_bytes "$app_db")" + post_wal_bytes="$(du_bytes "$app_db/maindb/wal")" + fi local run_json="$run_dir/run.json" - python3 - "$run_home" "$run_json" "$variant" "$pair_index" "$run_start" "$run_end" "$rewrite_attempted" "$rewrite_seconds" "$rewrite_rc" "$pre_app_bytes" "$pre_wal_bytes" "$post_app_bytes" "$post_wal_bytes" "$analyze_json" <<'PY' + python3 - "$run_home" "$run_json" "$variant" "$pair_index" "$run_start" "$run_end" "$rewrite_attempted" "$rewrite_seconds" "$rewrite_rc" "$pre_app_bytes" "$pre_wal_bytes" "$post_app_bytes" "$post_wal_bytes" "$analyze_json" "$invalid_reason" "$run_rc" "$attempt_used" "$RUN_MAX_ATTEMPTS_PER_VARIANT" "$RUN_TIMEOUT_SECONDS" <<'PY' import json import sys from pathlib import Path -run_home = Path(sys.argv[1]) +run_home_raw = sys.argv[1] out_path = Path(sys.argv[2]) variant = sys.argv[3] pair_index = int(sys.argv[4]) @@ -183,6 +247,12 @@ pre_wal_bytes = int(sys.argv[11]) post_app_bytes = int(sys.argv[12]) post_wal_bytes = int(sys.argv[13]) analyze_json_path = Path(sys.argv[14]) +invalid_reason = str(sys.argv[15]).strip() +run_exit_code = int(sys.argv[16]) +attempt = 
int(sys.argv[17]) +max_attempts = int(sys.argv[18]) +run_timeout_seconds = int(sys.argv[19]) +run_home = Path(run_home_raw) if run_home_raw else None def parse_sync_time(path: Path) -> dict[str, str]: out: dict[str, str] = {} @@ -210,7 +280,8 @@ def safe_int(raw: str | None, default: int = 0) -> int: except Exception: return default -sync = parse_sync_time(run_home / "sync" / "sync-time.log") +sync_path = run_home / "sync" / "sync-time.log" if run_home is not None else None +sync = parse_sync_time(sync_path) if sync_path is not None else {} maintenance = {} if analyze_json_path.is_file(): try: @@ -224,15 +295,25 @@ if analyze_json_path.is_file(): t_sync = safe_int(sync.get("duration_seconds"), max(0, run_end - run_start)) t_rw = rewrite_seconds if rewrite_attempted == 1 else 0 -if rewrite_attempted == 1 and rewrite_rc != 0: - t_total = None -else: - t_total = t_sync + t_rw +resolved_invalid_reason = invalid_reason +if not resolved_invalid_reason and rewrite_attempted == 1 and rewrite_rc != 0: + resolved_invalid_reason = "rewrite_failed" +valid = resolved_invalid_reason == "" +t_total = (t_sync + t_rw) if valid else None result = { "pair_index": pair_index, "variant": variant, - "run_home": str(run_home), + "run_home": run_home_raw, + "status": { + "valid": valid, + "invalid_reason": resolved_invalid_reason, + "run_exit_code": run_exit_code, + "attempt": attempt, + "max_attempts": max_attempts, + "run_timeout_seconds": run_timeout_seconds, + "sync_time_present": sync_path.is_file() if sync_path is not None else False, + }, "sync": { "duration_seconds": t_sync, "max_rss_kb": safe_int(sync.get("max_rss_kb"), 0), @@ -268,12 +349,16 @@ out_path.write_text(json.dumps(result, indent=2, sort_keys=True), encoding="utf- print(out_path) PY - echo "run_id=$run_id run_home=$run_home json=$run_json" + local run_valid="false" + if [[ -z "$invalid_reason" ]]; then + run_valid="true" + fi + echo "run_id=$run_id run_home=${run_home:-} valid=$run_valid 
invalid_reason=${invalid_reason:-none} attempts=$attempt_used/$RUN_MAX_ATTEMPTS_PER_VARIANT json=$run_json" } aggregate_and_decide() { local decision_json="$OUT/decision.json" - python3 - "$OUT" "$SIZE_TOLERANCE_BYTES" "$TIME_TOLERANCE_SECONDS" "$MIN_PAIRS" "$CLEAR_WIN_PAIRS" "$CLEAR_LOSS_PAIRS" "$MAX_PAIRS" "$STOP_ON_CLEAR" "$LOW_SIGNAL_MIN_PAIRS" "$LOW_SIGNAL_NEUTRAL_STREAK" "$decision_json" <<'PY' + python3 - "$OUT" "$SIZE_TOLERANCE_BYTES" "$TIME_TOLERANCE_SECONDS" "$MIN_PAIRS" "$CLEAR_WIN_PAIRS" "$CLEAR_LOSS_PAIRS" "$MAX_PAIRS" "$STOP_ON_CLEAR" "$LOW_SIGNAL_MIN_PAIRS" "$LOW_SIGNAL_NEUTRAL_STREAK" "$INVALID_PAIR_STREAK_STOP" "$decision_json" <<'PY' import csv import json import sys @@ -289,7 +374,8 @@ max_pairs = int(sys.argv[7]) stop_on_clear = sys.argv[8] == "1" low_signal_min_pairs = int(sys.argv[9]) low_signal_neutral_streak = int(sys.argv[10]) -decision_path = Path(sys.argv[11]) +invalid_pair_streak_stop = int(sys.argv[11]) +decision_path = Path(sys.argv[12]) run_files = sorted(out.glob("runs/*/run.json")) runs = [] @@ -301,6 +387,39 @@ for p in run_files: runs.sort(key=lambda r: (int(r.get("pair_index", 0)), str(r.get("variant", "")))) +def run_is_valid(run: dict) -> bool: + status = run.get("status") + if isinstance(status, dict) and "valid" in status: + return bool(status.get("valid")) + metrics = run.get("metrics", {}) or {} + rewrite = run.get("rewrite", {}) or {} + return metrics.get("t_total_seconds") is not None and int(rewrite.get("exit_code", 0)) == 0 + +def run_invalid_reason(run: dict) -> str: + status = run.get("status") + if isinstance(status, dict): + return str(status.get("invalid_reason", "") or "") + return "" + +def run_attempt(run: dict): + status = run.get("status") + if isinstance(status, dict): + return status.get("attempt") + return None + +def run_max_attempts(run: dict): + status = run.get("status") + if isinstance(status, dict): + return status.get("max_attempts") + return None + +def run_exit_code(run: dict): + status = 
run.get("status") + if isinstance(status, dict) and status.get("run_exit_code") is not None: + return status.get("run_exit_code") + rewrite = run.get("rewrite", {}) or {} + return rewrite.get("exit_code") + runs_csv = out / "runs.csv" with runs_csv.open("w", newline="", encoding="utf-8") as fh: w = csv.writer(fh) @@ -316,6 +435,11 @@ with runs_csv.open("w", newline="", encoding="utf-8") as fh: "s_post_app_bytes", "s_post_wal_bytes", "max_rss_kb", + "valid", + "invalid_reason", + "run_exit_code", + "run_attempt", + "run_max_attempts", "rewrite_exit_code", "rewrite_runs", "gc_runs", @@ -327,6 +451,7 @@ with runs_csv.open("w", newline="", encoding="utf-8") as fh: s = r.get("sizes", {}) or {} rw = r.get("rewrite", {}) or {} summary = r.get("maintenance_summary", {}) or {} + valid = run_is_valid(r) w.writerow([ int(r.get("pair_index", 0)), str(r.get("variant", "")), @@ -339,6 +464,11 @@ with runs_csv.open("w", newline="", encoding="utf-8") as fh: s.get("post_app_bytes"), s.get("post_wal_bytes"), m.get("max_rss_kb"), + valid, + run_invalid_reason(r), + run_exit_code(r), + run_attempt(r), + run_max_attempts(r), rw.get("exit_code"), summary.get("rewrite_runs", 0), summary.get("gc_runs", 0), @@ -354,12 +484,34 @@ for r in runs: pair_rows = [] wins = 0 losses = 0 +raw_pairs = 0 +invalid_pairs = 0 for pair in sorted(by_pair): row = by_pair[pair] ctrl = row.get("control") cand = row.get("candidate") if not ctrl or not cand: continue + raw_pairs += 1 + ctrl_valid = run_is_valid(ctrl) + cand_valid = run_is_valid(cand) + ctrl_reason = run_invalid_reason(ctrl) + cand_reason = run_invalid_reason(cand) + if not ctrl_valid or not cand_valid: + invalid_pairs += 1 + pair_rows.append({ + "pair_index": pair, + "delta_t_sync_seconds": None, + "delta_t_total_seconds": None, + "delta_s_sync_app_bytes": None, + "delta_s_post_wal_bytes": None, + "control_valid": ctrl_valid, + "candidate_valid": cand_valid, + "control_invalid_reason": ctrl_reason, + "candidate_invalid_reason": cand_reason, + 
"outcome": "invalid", + }) + continue cm = cand.get("metrics", {}) or {} bm = ctrl.get("metrics", {}) or {} cand_total = cm.get("t_total_seconds") @@ -398,6 +550,10 @@ for pair in sorted(by_pair): "delta_t_total_seconds": d_total, "delta_s_sync_app_bytes": d_sync_app, "delta_s_post_wal_bytes": d_post_wal, + "control_valid": ctrl_valid, + "candidate_valid": cand_valid, + "control_invalid_reason": ctrl_reason, + "candidate_invalid_reason": cand_reason, "outcome": outcome, }) @@ -410,6 +566,10 @@ with pairs_csv.open("w", newline="", encoding="utf-8") as fh: "delta_t_total_seconds", "delta_s_sync_app_bytes", "delta_s_post_wal_bytes", + "control_valid", + "candidate_valid", + "control_invalid_reason", + "candidate_invalid_reason", "outcome", ]) for r in pair_rows: @@ -419,17 +579,28 @@ with pairs_csv.open("w", newline="", encoding="utf-8") as fh: r["delta_t_total_seconds"], r["delta_s_sync_app_bytes"], r["delta_s_post_wal_bytes"], + r["control_valid"], + r["candidate_valid"], + r["control_invalid_reason"], + r["candidate_invalid_reason"], r["outcome"], ]) -completed_pairs = len(pair_rows) +scored_rows = [row for row in pair_rows if row.get("outcome") != "invalid"] +completed_pairs = len(scored_rows) neutral = max(0, completed_pairs - wins - losses) neutral_streak = 0 -for row in reversed(pair_rows): +for row in reversed(scored_rows): if row.get("outcome") == "neutral": neutral_streak += 1 continue break +invalid_streak = 0 +for row in reversed(pair_rows): + if row.get("outcome") == "invalid": + invalid_streak += 1 + continue + break reason = "continue" stop = False @@ -441,7 +612,7 @@ if stop_on_clear and completed_pairs >= min_pairs: stop = True reason = "clear_regression" else: - remaining = max(0, max_pairs - completed_pairs) + remaining = max(0, max_pairs - raw_pairs) can_reach_clear_win = (wins + remaining) >= clear_win_pairs can_reach_clear_loss = (losses + remaining) >= clear_loss_pairs if not can_reach_clear_win and not can_reach_clear_loss: @@ -452,7 +623,11 @@ 
if (not stop) and completed_pairs >= low_signal_min_pairs and neutral_streak >= stop = True reason = "low_signal_neutral_streak" -if (not stop) and completed_pairs >= max_pairs: +if (not stop) and invalid_streak >= invalid_pair_streak_stop: + stop = True + reason = "invalid_pair_streak" + +if (not stop) and raw_pairs >= max_pairs: stop = True reason = "max_pairs" @@ -460,13 +635,17 @@ summary_md = out / "summary.md" lines = [] lines.append("# run_celestia A/B summary") lines.append("") -lines.append(f"- completed pairs: `{completed_pairs}`") +lines.append(f"- observed pairs: `{raw_pairs}`") +lines.append(f"- scored pairs: `{completed_pairs}`") +lines.append(f"- invalid pairs skipped: `{invalid_pairs}`") lines.append(f"- wins/losses/neutral: `{wins}` / `{losses}` / `{neutral}`") lines.append(f"- neutral streak (tail): `{neutral_streak}`") +lines.append(f"- invalid streak (tail): `{invalid_streak}`") lines.append(f"- size tolerance bytes: `{size_tol}`") lines.append(f"- time tolerance seconds: `{time_tol}`") lines.append(f"- low-signal min pairs: `{low_signal_min_pairs}`") lines.append(f"- low-signal neutral streak: `{low_signal_neutral_streak}`") +lines.append(f"- invalid pair streak stop: `{invalid_pair_streak_stop}`") lines.append(f"- decision: `{reason}`") lines.append("") lines.append("## Artifacts") @@ -480,6 +659,8 @@ if pair_rows: lines.append("## Last Pair") lines.append("") lines.append(f"- pair: `{last['pair_index']}` outcome=`{last['outcome']}`") + lines.append(f"- control_valid: `{last['control_valid']}` reason=`{last['control_invalid_reason']}`") + lines.append(f"- candidate_valid: `{last['candidate_valid']}` reason=`{last['candidate_invalid_reason']}`") lines.append(f"- delta_t_sync_seconds: `{last['delta_t_sync_seconds']}`") lines.append(f"- delta_t_total_seconds: `{last['delta_t_total_seconds']}`") lines.append(f"- delta_s_sync_app_bytes: `{last['delta_s_sync_app_bytes']}`") @@ -487,11 +668,14 @@ if pair_rows: summary_md.write_text("\n".join(lines) + 
"\n", encoding="utf-8") payload = { + "observed_pairs": raw_pairs, "completed_pairs": completed_pairs, + "invalid_pairs": invalid_pairs, "wins": wins, "losses": losses, "neutral": neutral, "neutral_streak": neutral_streak, + "invalid_streak": invalid_streak, "stop": stop, "reason": reason, } From 1ba077e49a36ebd5362b1736137687e8230cbbea Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 01:26:37 -1000 Subject: [PATCH 54/61] bench: enrich celestia timeout sync probe totals --- docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md | 1 + scripts/run_celestia_ab.sh | 48 ++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md index 92dc8875b..7b0836451 100644 --- a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md +++ b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md @@ -111,6 +111,7 @@ REWRITE_ENABLED=1 \ Notes: - Pair execution remains strictly single-run at a time and interleaved by pair order. - Invalid runs (timeout, launcher failure, missing new run home, rewrite failure) are recorded but excluded from pair scoring. +- Per-run `run.json` now includes `status.sync_probe` (last snapshot chunk, last and max snapshot totals, fetch event count, state-sync-complete flag) for timeout forensics. 
## Process Review Cadence diff --git a/scripts/run_celestia_ab.sh b/scripts/run_celestia_ab.sh index eadfd0f9a..ef8c3f088 100755 --- a/scripts/run_celestia_ab.sh +++ b/scripts/run_celestia_ab.sh @@ -230,6 +230,7 @@ run_variant() { local run_json="$run_dir/run.json" python3 - "$run_home" "$run_json" "$variant" "$pair_index" "$run_start" "$run_end" "$rewrite_attempted" "$rewrite_seconds" "$rewrite_rc" "$pre_app_bytes" "$pre_wal_bytes" "$post_app_bytes" "$post_wal_bytes" "$analyze_json" "$invalid_reason" "$run_rc" "$attempt_used" "$RUN_MAX_ATTEMPTS_PER_VARIANT" "$RUN_TIMEOUT_SECONDS" <<'PY' import json +import re import sys from pathlib import Path @@ -280,8 +281,54 @@ def safe_int(raw: str | None, default: int = 0) -> int: except Exception: return default +def probe_sync_progress(node_log_path: Path | None) -> dict[str, object]: + progress = { + "node_log_present": False, + "last_snapshot_chunk": 0, + "last_snapshot_total": 0, + "last_nonzero_snapshot_total": 0, + "max_snapshot_total": 0, + "snapshot_fetch_events": 0, + "state_sync_complete": False, + } + if node_log_path is None or not node_log_path.is_file(): + return progress + + progress["node_log_present"] = True + try: + text = node_log_path.read_text(encoding="utf-8", errors="replace") + except Exception: + return progress + + last_chunk = 0 + last_total = 0 + last_nonzero_total = 0 + max_total = 0 + events = 0 + for m in re.finditer(r"Fetching snapshot chunk chunk=(\d+).*total=(\d+)", text): + events += 1 + try: + last_chunk = int(m.group(1)) + last_total = int(m.group(2)) + if last_total > 0: + last_nonzero_total = last_total + if last_total > max_total: + max_total = last_total + except Exception: + continue + + progress["last_snapshot_chunk"] = last_chunk + progress["last_snapshot_total"] = last_total + progress["last_nonzero_snapshot_total"] = last_nonzero_total + progress["max_snapshot_total"] = max_total + progress["snapshot_fetch_events"] = events + progress["state_sync_complete"] = ("State sync 
complete" in text) or ("statesync complete" in text.lower()) + return progress + sync_path = run_home / "sync" / "sync-time.log" if run_home is not None else None sync = parse_sync_time(sync_path) if sync_path is not None else {} +node_log_path = run_home / "sync" / "node.log" if run_home is not None else None +sync_probe = probe_sync_progress(node_log_path) maintenance = {} if analyze_json_path.is_file(): try: @@ -313,6 +360,7 @@ result = { "max_attempts": max_attempts, "run_timeout_seconds": run_timeout_seconds, "sync_time_present": sync_path.is_file() if sync_path is not None else False, + "sync_probe": sync_probe, }, "sync": { "duration_seconds": t_sync, From 3c29b271da64d9a9f6cb6a2840d2708275128f55 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 08:20:30 -1000 Subject: [PATCH 55/61] treedb: cut rewrite alloc churn and sharpen bench signal --- TreeDB/db/vlog_health.go | 57 +++- TreeDB/db/vlog_rewrite.go | 363 +++++++++++++++++------ TreeDB/db/vlog_rewrite_bench_test.go | 424 +++++++++++++++++++++++++++ 3 files changed, 758 insertions(+), 86 deletions(-) create mode 100644 TreeDB/db/vlog_rewrite_bench_test.go diff --git a/TreeDB/db/vlog_health.go b/TreeDB/db/vlog_health.go index fb2a1d51f..5117b1d78 100644 --- a/TreeDB/db/vlog_health.go +++ b/TreeDB/db/vlog_health.go @@ -95,6 +95,30 @@ func segmentAgeSeconds(path string, now time.Time) int64 { return int64(age / time.Second) } +func advanceSegmentAgeSeconds(h valueLogSegmentHealth, now time.Time) int64 { + age := h.AgeSeconds + if age < 0 { + age = 0 + } + if h.LastUpdatedUnixNano <= 0 { + return age + } + prevSec := h.LastUpdatedUnixNano / int64(time.Second) + nowSec := now.Unix() + if nowSec <= prevSec { + return age + } + delta := nowSec - prevSec + if delta <= 0 { + return age + } + // Clamp on overflow to preserve monotonic, bounded metadata. 
+ if age > int64(^uint64(0)>>1)-delta { + return int64(^uint64(0) >> 1) + } + return age + delta +} + func updateValueLogHealthAfterGC(dbDir string, set *valuelog.Set, referenced map[uint32]struct{}) error { path := valueLogHealthPath(dbDir) health, err := loadValueLogHealth(path) @@ -120,7 +144,10 @@ func updateValueLogHealthAfterGC(dbDir string, set *valuelog.Set, referenced map } else if size > 0 && h.LiveBytes > size { h.LiveBytes = size } - h.AgeSeconds = segmentAgeSeconds(f.Path, now) + // Avoid per-segment stat calls on the GC fast path; preserve age via + // monotonic last-update deltas and refresh from disk only in fallback + // scans below. + h.AgeSeconds = advanceSegmentAgeSeconds(h, now) h.LastUpdatedUnixNano = now.UnixNano() health[id] = h } @@ -163,7 +190,7 @@ func updateValueLogHealthAfterGC(dbDir string, set *valuelog.Set, referenced map return saveValueLogHealth(path, health) } -func updateValueLogHealthAfterRewrite(dbDir string, oldValueIDs map[uint32]struct{}) error { +func updateValueLogHealthAfterRewrite(dbDir string, oldValueIDs map[uint32]struct{}, set *valuelog.Set) error { path := valueLogHealthPath(dbDir) health, err := loadValueLogHealth(path) if err != nil { @@ -177,6 +204,32 @@ func updateValueLogHealthAfterRewrite(dbDir string, oldValueIDs map[uint32]struc } } + // Online rewrite callers can provide a current manager set and avoid an + // expensive directory rescan on the hot path. + if set != nil { + out := make(map[uint32]valueLogSegmentHealth, len(set.Files)) + for id, f := range set.Files { + if f == nil { + continue + } + h := health[id] + if _, wasOld := oldValueIDs[id]; !wasOld { + if h.RewriteCount < nextRewriteCount { + h.RewriteCount = nextRewriteCount + } + } + size := fileSize(f) + h.SegmentBytes = size + h.LiveBytes = size + // Online rewrite callers pass a manager set; avoid expensive stat calls + // per segment and keep age monotonic from prior metadata timestamps. 
+ h.AgeSeconds = advanceSegmentAgeSeconds(h, now) + h.LastUpdatedUnixNano = now.UnixNano() + out[id] = h + } + return saveValueLogHealth(path, out) + } + segments, err := listWALSegments(dbDir) if err != nil { return err diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go index 133d1de4b..b63886625 100644 --- a/TreeDB/db/vlog_rewrite.go +++ b/TreeDB/db/vlog_rewrite.go @@ -32,6 +32,10 @@ const defaultValueLogRewriteSegmentBytes = 128 << 20 const rewriteDictMinPayloadBytes = 32 << 10 const rewriteDictBatchMaxK = 64 const rewriteReadScratchMaxCap = 1 << 20 // 1MiB cap to avoid retaining oversized decode buffers +const rewriteKeyArenaMaxCap = 1 << 20 // 1MiB cap to avoid retaining oversized key arenas + +var rewriteRIDStartScanner = nextRewriteRIDStart +var rewriteWALSegmentsLister = listWALSegments func rewriteAllowDictForSmallPayload(value []byte) bool { if len(value) < page.PageSize { @@ -706,7 +710,7 @@ func (db *DB) estimateValueLogLiveBytesBySegment(ctx context.Context) (_ map[uin var seenGroupedRecords map[groupedRecordKey]struct{} userIter := snap.tree.IteratorWithOptions(nil, nil, tree.IteratorOptions{Mode: tree.IteratorModePointerProjection}) - if err := db.collectValueLogLiveBytes(ctx, userIter, liveByID, &seenGroupedRecords); err != nil { + if err := db.collectValueLogLiveBytes(ctx, userIter, liveByID, &seenGroupedRecords, snap.state.ValueLogSet); err != nil { _ = userIter.Close() return nil, err } @@ -714,7 +718,7 @@ func (db *DB) estimateValueLogLiveBytesBySegment(ctx context.Context) (_ map[uin sysIter := tree.New(snap.idx.pager, newValueReader(snap.state.ValueLogSet), snap.state.SystemRootPageID). 
IteratorWithOptions(nil, nil, tree.IteratorOptions{Mode: tree.IteratorModePointerProjection}) - if err := db.collectValueLogLiveBytes(ctx, sysIter, liveByID, &seenGroupedRecords); err != nil { + if err := db.collectValueLogLiveBytes(ctx, sysIter, liveByID, &seenGroupedRecords, snap.state.ValueLogSet); err != nil { _ = sysIter.Close() return nil, err } @@ -725,10 +729,10 @@ func (db *DB) estimateValueLogLiveBytesBySegment(ctx context.Context) (_ map[uin // live-byte estimation; otherwise rewrite planning can select "stale" segments // that are actually pinned by live leaf pages. if snap.idx != nil && snap.idx.pager != nil { - if err := db.collectLeafRefValueLogLiveBytes(ctx, snap.idx.pager, snap.state.RootPageID, liveByID, &seenGroupedRecords); err != nil { + if err := db.collectLeafRefValueLogLiveBytes(ctx, snap.idx.pager, snap.state.RootPageID, liveByID, &seenGroupedRecords, snap.state.ValueLogSet); err != nil { return nil, err } - if err := db.collectLeafRefValueLogLiveBytes(ctx, snap.idx.pager, snap.state.SystemRootPageID, liveByID, &seenGroupedRecords); err != nil { + if err := db.collectLeafRefValueLogLiveBytes(ctx, snap.idx.pager, snap.state.SystemRootPageID, liveByID, &seenGroupedRecords, snap.state.ValueLogSet); err != nil { return nil, err } } @@ -743,7 +747,7 @@ type groupedRecordKey struct { start uint64 } -func (db *DB) collectValueLogLiveBytes(ctx context.Context, it iterator.UnsafeIterator, liveByID map[uint32]int64, seenGroupedRecords *map[groupedRecordKey]struct{}) error { +func (db *DB) collectValueLogLiveBytes(ctx context.Context, it iterator.UnsafeIterator, liveByID map[uint32]int64, seenGroupedRecords *map[groupedRecordKey]struct{}, set *valuelog.Set) error { for it.Valid() { if err := ctx.Err(); err != nil { return err @@ -776,7 +780,7 @@ func (db *DB) collectValueLogLiveBytes(ctx context.Context, it iterator.UnsafeIt seen[k] = struct{}{} } - recordLen, err := db.valueLogRecordLengthForRewrite(ptr) + recordLen, err := 
db.valueLogRecordLengthForRewriteInSet(ptr, set) if err != nil { return err } @@ -786,7 +790,7 @@ func (db *DB) collectValueLogLiveBytes(ctx context.Context, it iterator.UnsafeIt return it.Error() } -func (db *DB) collectLeafRefValueLogLiveBytes(ctx context.Context, p *pager.Pager, rootID uint64, liveByID map[uint32]int64, seenGroupedRecords *map[groupedRecordKey]struct{}) error { +func (db *DB) collectLeafRefValueLogLiveBytes(ctx context.Context, p *pager.Pager, rootID uint64, liveByID map[uint32]int64, seenGroupedRecords *map[groupedRecordKey]struct{}, set *valuelog.Set) error { if ctx == nil { ctx = context.Background() } @@ -794,7 +798,7 @@ func (db *DB) collectLeafRefValueLogLiveBytes(ctx context.Context, p *pager.Page return nil } if ptr, ok := page.DecodeLeafRef(rootID); ok { - return db.collectLeafRefPtrLiveBytes(ptr, liveByID, seenGroupedRecords) + return db.collectLeafRefPtrLiveBytes(ptr, liveByID, seenGroupedRecords, set) } stack := make([]uint64, 0, 128) stack = append(stack, rootID) @@ -835,7 +839,7 @@ func (db *DB) collectLeafRefValueLogLiveBytes(ctx context.Context, p *pager.Page return err } if ptr, ok := page.DecodeLeafRef(childID); ok { - if err := db.collectLeafRefPtrLiveBytes(ptr, liveByID, seenGroupedRecords); err != nil { + if err := db.collectLeafRefPtrLiveBytes(ptr, liveByID, seenGroupedRecords, set); err != nil { return err } continue @@ -852,7 +856,7 @@ func (db *DB) collectLeafRefValueLogLiveBytes(ctx context.Context, p *pager.Page return nil } -func (db *DB) collectLeafRefPtrLiveBytes(ptr page.ValuePtr, liveByID map[uint32]int64, seenGroupedRecords *map[groupedRecordKey]struct{}) error { +func (db *DB) collectLeafRefPtrLiveBytes(ptr page.ValuePtr, liveByID map[uint32]int64, seenGroupedRecords *map[groupedRecordKey]struct{}, set *valuelog.Set) error { if liveByID == nil { return nil } @@ -881,7 +885,7 @@ func (db *DB) collectLeafRefPtrLiveBytes(ptr page.ValuePtr, liveByID map[uint32] seen[k] = struct{}{} } - recordLen, err := 
db.valueLogRecordLengthForRewrite(ptr) + recordLen, err := db.valueLogRecordLengthForRewriteInSet(ptr, set) if err != nil { return err } @@ -906,6 +910,10 @@ func readValueLogRecordLengthFromHeader(r io.ReaderAt, start int64) (uint32, err } func (db *DB) valueLogRecordLengthForRewrite(ptr page.ValuePtr) (uint32, error) { + return db.valueLogRecordLengthForRewriteInSet(ptr, nil) +} + +func (db *DB) valueLogRecordLengthForRewriteInSet(ptr page.ValuePtr, set *valuelog.Set) (uint32, error) { hint := page.ValuePtrRecordLength(ptr) if !valueLogRecordLengthNeedsHeader(ptr, hint) { return hint, nil @@ -913,24 +921,31 @@ func (db *DB) valueLogRecordLengthForRewrite(ptr page.ValuePtr) (uint32, error) if ptr.Offset < 4 { return 0, fmt.Errorf("vlog-rewrite: invalid pointer offset %d", ptr.Offset) } + if set != nil { + f := set.Files[ptr.FileID] + if f != nil && f.File != nil { + start := int64(ptr.Offset - 4) + return readValueLogRecordLengthFromHeader(f.File, start) + } + } if db == nil || db.valueLogManager == nil { return 0, fmt.Errorf("vlog-rewrite: value-log manager unavailable") } - set := db.valueLogManager.CurrentSetNoRefresh() - if set == nil || set.Files[ptr.FileID] == nil { - if set != nil { - _ = db.valueLogManager.Release(set) + currentSet := db.valueLogManager.CurrentSetNoRefresh() + if currentSet == nil || currentSet.Files[ptr.FileID] == nil { + if currentSet != nil { + _ = db.valueLogManager.Release(currentSet) } if err := db.valueLogManager.Refresh(); err != nil { return 0, err } - set = db.valueLogManager.CurrentSetNoRefresh() + currentSet = db.valueLogManager.CurrentSetNoRefresh() } - if set == nil { + if currentSet == nil { return 0, fmt.Errorf("vlog-rewrite: value-log set unavailable") } - defer func() { _ = db.valueLogManager.Release(set) }() - f := set.Files[ptr.FileID] + defer func() { _ = db.valueLogManager.Release(currentSet) }() + f := currentSet.Files[ptr.FileID] if f == nil || f.File == nil { return 0, fmt.Errorf("vlog-rewrite: missing segment for 
pointer %s", formatValueLogPtr(ptr)) } @@ -1208,16 +1223,40 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl return stats, nil } - segments, err := listWALSegments(db.dir) - if err != nil { - return stats, err + nextRID := uint64(0) + var ( + segments []logSegment + lane uint32 + startSeq uint32 + needSegScan = true + ) + if opts.ReserveRIDs != nil && db.valueLogManager != nil { + if hintLane, hintSeq, ok := db.valueLogManager.RewriteLaneHint(); ok { + probePath := filepath.Join(db.dir, "wal", fmt.Sprintf("value-l%d-%06d.log", hintLane, hintSeq+1)) + if _, statErr := os.Stat(probePath); statErr == nil { + needSegScan = true + } else if os.IsNotExist(statErr) { + lane, startSeq = hintLane, hintSeq + needSegScan = false + } else { + return stats, statErr + } + } } - nextRID, err := nextRewriteRIDStart(segments) - if err != nil { - return stats, err + if needSegScan { + segments, err = rewriteWALSegmentsLister(db.dir) + if err != nil { + return stats, err + } + lane, startSeq = chooseRewriteLane(segments) + } + if opts.ReserveRIDs == nil { + nextRID, err = rewriteRIDStartScanner(segments) + if err != nil { + return stats, err + } } ridAlloc := newRewriteRIDAllocator(nextRID, opts.ReserveRIDs) - lane, startSeq := chooseRewriteLane(segments) maxBytes := opts.MaxSegmentBytes if maxBytes <= 0 { maxBytes = defaultValueLogRewriteSegmentBytes @@ -1237,9 +1276,14 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl batchSize := normalizeValueLogRewriteBatchSize(opts.BatchSize) swaps := make([]rewriteSwap, 0, batchSize) + batchCreatedIDs := make([]uint32, 0, 4) localityPolicy := normalizeValueLogRewriteLocalityPolicy(opts.LocalityPolicy) candidates := make([]rewriteCandidate, 0, batchSize) - var rewriteReadScratch []byte + candidateKeyArena := make([]byte, 0, 16<<10) + // Seed decode scratch so ReadUnsafeTo can immediately reuse caller-owned + // storage for grouped compressed reads instead of allocating 
per-record. + const rewriteReadScratchInitCap = 1024 + rewriteReadScratch := make([]byte, 0, rewriteReadScratchInitCap) var canceledErr error flushBatch := func() error { @@ -1248,11 +1292,15 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl } orderRewriteCandidates(candidates, localityPolicy) swaps = swaps[:0] + batchCreatedIDs = batchCreatedIDs[:0] startRID, err := ridAlloc.Reserve(len(candidates)) if err != nil { return err } for _, candidate := range candidates { + if rewriteReadScratch == nil { + rewriteReadScratch = make([]byte, 0, rewriteReadScratchInitCap) + } val, usedScratch, err := db.valueLogManager.ReadUnsafeTo(candidate.oldPtr, rewriteReadScratch) if err != nil { return err @@ -1274,6 +1322,16 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl stats.RecordsCopied++ stats.ValueRecordsCopied++ stats.ValueBytesCopied += int64(len(val)) + seenID := false + for _, id := range batchCreatedIDs { + if id == newPtr.FileID { + seenID = true + break + } + } + if !seenID { + batchCreatedIDs = append(batchCreatedIDs, newPtr.FileID) + } swaps = append(swaps, rewriteSwap{ key: candidate.key, oldPtr: candidate.oldPtr, @@ -1289,10 +1347,26 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl return err } } + // Register rewrite-created segments before publishing pointer swaps so + // finalizeCommit can stay on CurrentSetNoRefresh and avoid full scans. 
+ for _, id := range batchCreatedIDs { + path := db.valueLogManager.SegmentPath(id) + if err := db.valueLogManager.RegisterSegment(path, id); err != nil { + return err + } + if err := db.valueLogManager.PromoteCurrentWritable(id); err != nil { + return err + } + } if err := db.applyRewriteSwapBatch(swaps, opts.SyncEachBatch); err != nil { return err } candidates = candidates[:0] + if cap(candidateKeyArena) > rewriteKeyArenaMaxCap { + candidateKeyArena = nil + } else { + candidateKeyArena = candidateKeyArena[:0] + } return nil } @@ -1316,7 +1390,10 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl continue } } - key := append([]byte(nil), it.UnsafeKey()...) + unsafeKey := it.UnsafeKey() + keyStart := len(candidateKeyArena) + candidateKeyArena = append(candidateKeyArena, unsafeKey...) + key := candidateKeyArena[keyStart:len(candidateKeyArena):len(candidateKeyArena)] candidates = append(candidates, rewriteCandidate{ key: key, oldPtr: oldPtr, @@ -1462,16 +1539,24 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl if err := db.publishValueLogSetNoRefresh(); err != nil { return stats, err } - if err := updateValueLogHealthAfterRewrite(db.dir, oldValueIDs); err != nil { + postSet := db.valueLogManager.CurrentSetNoRefresh() + if postSet != nil { + defer func() { _ = db.valueLogManager.Release(postSet) }() + } + if err := updateValueLogHealthAfterRewrite(db.dir, oldValueIDs, postSet); err != nil { return stats, err } - afterSegs, afterBytes, err := valueLogSegmentStats(db.dir) - if err != nil { - return stats, err + if postSet != nil { + stats.SegmentsAfter, stats.BytesAfter = valueLogSegmentStatsFromSet(postSet) + } else { + afterSegs, afterBytes, err := valueLogSegmentStats(db.dir) + if err != nil { + return stats, err + } + stats.SegmentsAfter = afterSegs + stats.BytesAfter = afterBytes } - stats.SegmentsAfter = afterSegs - stats.BytesAfter = afterBytes if canceledErr != nil { return stats, canceledErr } 
@@ -1482,9 +1567,11 @@ type leafRefRewriteCtx struct { ctx context.Context db *DB - pager *pager.Pager - leafReader tree.SlabReader - alloc interface { + pager *pager.Pager + leafReader tree.SlabReader + leafToer unsafeToReader + leafScratch []byte + alloc interface { Alloc(hint uint64) (uint64, error) } @@ -1501,6 +1588,29 @@ type leafRefRewriteCtx struct { copiedBytes int64 } +func (c *leafRefRewriteCtx) readLeafPage(ptr page.ValuePtr) ([]byte, error) { + if c == nil || c.leafReader == nil { + return nil, fmt.Errorf("vlog-rewrite: value-log snapshot reader unavailable") + } + if c.leafToer != nil { + if cap(c.leafScratch) < page.PageSize { + c.leafScratch = make([]byte, 0, page.PageSize) + } else { + c.leafScratch = c.leafScratch[:0] + } + leafPage, usedScratch, err := c.leafToer.ReadUnsafeTo(ptr, c.leafScratch[:0]) + if err != nil { + return nil, err + } + if usedScratch { + // Keep the caller-provided decode buffer hot across leafref rewrites. + c.leafScratch = leafPage[:0] + } + return leafPage, nil + } + return c.leafReader.ReadUnsafe(ptr) +} + func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { if c == nil { return id, false, errors.New("vlog-rewrite: nil leafref rewrite ctx") @@ -1531,7 +1641,7 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { if c.writer == nil || c.ridAlloc == nil { return id, false, fmt.Errorf("vlog-rewrite: rewrite writer unavailable") } - leafPage, err := c.leafReader.ReadUnsafe(ptr) + leafPage, err := c.readLeafPage(ptr) if err != nil { return id, false, err } @@ -1587,11 +1697,9 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { if count == 0 { return id, false, nil } - childIDs := make([]uint64, int(count)) - keys := make([][]byte, int(count)) - changed := false + var childIDs []uint64 for i := uint16(0); i < count; i++ { - keyView, childID, err := n.GetInternalEntryView(i) + _, childID, err := n.GetInternalEntryView(i) if err != nil { return id, false, 
err } @@ -1599,13 +1707,21 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { if err != nil { return id, false, err } - if childChanged { - changed = true + if childChanged && childIDs == nil { + childIDs = make([]uint64, int(count)) + for j := uint16(0); j < i; j++ { + _, prevChild, err := n.GetInternalEntryView(j) + if err != nil { + return id, false, err + } + childIDs[int(j)] = prevChild + } + } + if childIDs != nil { + childIDs[int(i)] = nextChild } - childIDs[int(i)] = nextChild - keys[int(i)] = append([]byte(nil), keyView...) } - if !changed { + if childIDs == nil { return id, false, nil } if c.alloc == nil { @@ -1628,8 +1744,12 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { } else if ok { b.SetInternalFenceBounds(low, high) } - for i := range childIDs { - if err := b.AddInternalChild(keys[i], childIDs[i]); err != nil { + for i := uint16(0); i < count; i++ { + keyView, _, err := n.GetInternalEntryView(i) + if err != nil { + return id, false, err + } + if err := b.AddInternalChild(keyView, childIDs[int(i)]); err != nil { return id, false, err } } @@ -1721,6 +1841,10 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, ridAlloc: ridAlloc, sourceIDs: sourceIDs, } + if toer, ok := leafCtx.leafReader.(unsafeToReader); ok { + leafCtx.leafToer = toer + leafCtx.leafScratch = make([]byte, 0, page.PageSize) + } newSysRoot, sysChanged, err := leafCtx.rewriteNode(sysRoot) if err != nil { @@ -1745,8 +1869,23 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, return 0, 0, err } } + createdIDs, err := writer.createdFileIDs() + if err != nil { + return 0, 0, err + } + if len(createdIDs) > 0 { + // Register rewrite-created segments before commit publication so + // finalizeCommit can publish CurrentSetNoRefresh without forcing a + // filesystem rescan in leafref-heavy rewrite paths. 
+ for _, id := range createdIDs { + path := db.valueLogManager.SegmentPath(id) + if err := db.valueLogManager.RegisterSegment(path, id); err != nil { + return 0, 0, err + } + } + } - if err := db.finalizeCommit(newRoot, newSysRoot, leafCtx.retired, sync, adaptive.Metrics{}, nil, db.indexOuterLeavesInValueLog, nil); err != nil { + if err := db.finalizeCommit(newRoot, newSysRoot, leafCtx.retired, sync, adaptive.Metrics{}, createdIDs, false, nil); err != nil { return 0, 0, err } tracker = nil @@ -1754,12 +1893,13 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, } func nextRewriteRIDStart(segments []logSegment) (uint64, error) { + const ridScanReaderBufferSize = 64 << 10 maxRID := uint64(0) for _, segment := range segments { if !segment.valueLog { continue } - reader, err := valuelog.NewReader(segment.path, segment.fileID) + reader, err := valuelog.NewReaderWithBufferSize(segment.path, segment.fileID, ridScanReaderBufferSize) if err != nil { if errors.Is(err, os.ErrNotExist) { continue @@ -1837,20 +1977,8 @@ func (db *DB) applyRewriteSwapBatchOptimistic(swaps []rewriteSwap, sync bool) (b defer batch.Release(b) b.Reserve(len(swaps)) - for _, swap := range swaps { - entry, err := tr.GetEntry(swap.key) - if err != nil { - if errors.Is(err, tree.ErrKeyNotFound) { - continue - } - return false, err - } - if entry.Flags&node.FlagPointer == 0 || entry.ValuePtr != swap.oldPtr { - continue - } - if err := b.SetPointerView(swap.key, swap.newPtr); err != nil { - return false, err - } + if err := collectRewriteSwapPointerMatches(tr, b, swaps); err != nil { + return false, err } entries := b.SortedEntries() @@ -1936,20 +2064,8 @@ func (db *DB) applyRewriteSwapBatchSerialized(swaps []rewriteSwap, sync bool) er defer batch.Release(b) b.Reserve(len(swaps)) - for _, swap := range swaps { - entry, err := tr.GetEntry(swap.key) - if err != nil { - if errors.Is(err, tree.ErrKeyNotFound) { - continue - } - return err - } - if entry.Flags&node.FlagPointer == 
0 || entry.ValuePtr != swap.oldPtr { - continue - } - if err := b.SetPointerView(swap.key, swap.newPtr); err != nil { - return err - } + if err := collectRewriteSwapPointerMatches(tr, b, swaps); err != nil { + return err } entries := b.SortedEntries() @@ -1976,6 +2092,42 @@ func (db *DB) applyRewriteSwapBatchSerialized(swaps []rewriteSwap, sync bool) er return nil } +func collectRewriteSwapPointerMatches(tr *tree.Tree, b *batch.Batch, swaps []rewriteSwap) error { + if tr == nil || b == nil || len(swaps) == 0 { + return nil + } + // Sort in-place to avoid per-batch swap-slice copies on rewrite hot paths. + sort.Slice(swaps, func(i, j int) bool { + return bytes.Compare(swaps[i].key, swaps[j].key) < 0 + }) + + it := tr.IteratorWithOptions(swaps[0].key, nil, tree.IteratorOptions{Mode: tree.IteratorModePointerProjection}) + defer func() { _ = it.Close() }() + + for _, swap := range swaps { + for it.Valid() { + curr := it.UnsafeKey() + cmp := bytes.Compare(curr, swap.key) + if cmp < 0 { + it.Next() + continue + } + if cmp > 0 { + break + } + _, ptr, flags := it.UnsafeEntry() + if flags&node.FlagPointer != 0 && ptr == swap.oldPtr { + if err := b.SetPointerView(swap.key, swap.newPtr); err != nil { + return err + } + } + it.Next() + break + } + } + return it.Error() +} + // ValueLogRewriteOffline rewrites value-log pointers into new segments and // swaps index.db to reference the new log. This is an offline operation // (requires exclusive lock and a clean commitlog). 
@@ -2044,7 +2196,7 @@ func ValueLogRewriteOffline(opts Options) (ValueLogRewriteStats, error) { stats.BytesBefore = beforeBytes lane, startSeq := chooseRewriteLane(segments) - nextRID, err := nextRewriteRIDStart(segments) + nextRID, err := rewriteRIDStartScanner(segments) if err != nil { _ = d.Close() return stats, err @@ -2234,7 +2386,7 @@ func ValueLogRewriteOffline(opts Options) (ValueLogRewriteStats, error) { if err := removeOldValueLogSegments(segments); err != nil { return stats, err } - if err := updateValueLogHealthAfterRewrite(opts.Dir, oldValueIDs); err != nil { + if err := updateValueLogHealthAfterRewrite(opts.Dir, oldValueIDs, nil); err != nil { if opts.NotifyError != nil { opts.NotifyError(fmt.Errorf("value-log health update after rewrite: %w", err)) } @@ -2257,6 +2409,10 @@ type rewriteWriter struct { start uint32 maxSize int64 nextRID uint64 + // currentPath/currentFileID cache the active writer segment identity so + // CurrentValueLogSegment can avoid per-call path/fileID recomputation. + currentPath string + currentFileID uint32 // blockCompression enables per-frame block compression for dictID=0 append // paths (used by online rewrite). Offline rewrites use AppendRawRecord and do // not consult this setting. @@ -2397,6 +2553,15 @@ func (w *rewriteWriter) AppendLeafPage(leafPage []byte) (page.ValuePtr, error) { return w.appendValue(rid, leafPage) } +// CurrentValueLogSegment reports the writer's current segment identity. +// This lets commit publication register the segment without directory scans. 
+func (w *rewriteWriter) CurrentValueLogSegment() (string, uint32, bool) { + if w == nil || w.currentPath == "" || w.currentFileID == 0 { + return "", 0, false + } + return w.currentPath, w.currentFileID, true +} + func (w *rewriteWriter) ensureWriter() error { if w.w != nil { return nil @@ -2425,6 +2590,8 @@ func (w *rewriteWriter) rotate() error { writer.SetKeepPolicy(w.keepIoNsPerByte, w.keepEncodeNsRaw, w.keepSafetyMargin) w.w = writer w.seq = nextSeq + w.currentPath = path + w.currentFileID = fileID return nil } if err := w.w.RotateTo(path, fileID); err != nil { @@ -2433,6 +2600,8 @@ func (w *rewriteWriter) rotate() error { w.w.SetBlockCompression(w.blockCodec, w.blockCompression) w.w.SetKeepPolicy(w.keepIoNsPerByte, w.keepEncodeNsRaw, w.keepSafetyMargin) w.seq = nextSeq + w.currentPath = path + w.currentFileID = fileID return nil } @@ -3259,14 +3428,40 @@ func valueLogSegmentStats(dir string) (count int, bytes int64, err error) { if !seg.valueLog { continue } + if seg.size > 0 { + count++ + bytes += seg.size + continue + } + if seg.size == 0 { + // Keep zero-length segments visible in stats (rare but possible for + // newly-created/truncated files). 
+ if _, statErr := os.Stat(seg.path); statErr == nil { + count++ + } + continue + } info, statErr := os.Stat(seg.path) - if statErr != nil { + if statErr == nil { + count++ + bytes += info.Size() + } + } + return count, bytes, nil +} + +func valueLogSegmentStatsFromSet(set *valuelog.Set) (count int, bytes int64) { + if set == nil { + return 0, 0 + } + for _, f := range set.Files { + if f == nil { continue } count++ - bytes += info.Size() + bytes += fileSize(f) } - return count, bytes, nil + return count, bytes } func removeOldValueLogSegments(segments []logSegment) error { diff --git a/TreeDB/db/vlog_rewrite_bench_test.go b/TreeDB/db/vlog_rewrite_bench_test.go new file mode 100644 index 000000000..41c8a2fb3 --- /dev/null +++ b/TreeDB/db/vlog_rewrite_bench_test.go @@ -0,0 +1,424 @@ +package db + +import ( + "bytes" + "context" + "fmt" + "os" + "path/filepath" + "runtime" + "testing" + + "github.com/snissn/gomap/TreeDB/internal/valuelog" + "github.com/snissn/gomap/TreeDB/node" + "github.com/snissn/gomap/TreeDB/page" + "github.com/snissn/gomap/TreeDB/pager" +) + +func BenchmarkValueLogRewriteOnline_ValuePointers(b *testing.B) { + const ( + seg1Records = 2048 + seg2Records = 1024 + ) + + var totalCopied int64 + var totalBytes int64 + var totalRefreshScans uint64 + var totalRewriteAllocs uint64 + + for i := 0; i < b.N; i++ { + b.StopTimer() + db, sourceIDs, cleanup := setupValuePointerRewriteBench(b, seg1Records, seg2Records) + refreshBefore := db.valueLogManager.RefreshScanCount() + var memBefore runtime.MemStats + runtime.ReadMemStats(&memBefore) + b.StartTimer() + + stats, err := db.ValueLogRewriteOnline(context.Background(), ValueLogRewriteOnlineOptions{ + SourceFileIDs: sourceIDs, + BatchSize: 512, + }) + b.StopTimer() + if err != nil { + cleanup() + b.Fatalf("ValueLogRewriteOnline: %v", err) + } + totalCopied += int64(stats.ValueRecordsCopied) + totalBytes += stats.ValueBytesCopied + totalRefreshScans += db.valueLogManager.RefreshScanCount() - refreshBefore + var 
memAfter runtime.MemStats + runtime.ReadMemStats(&memAfter) + if memAfter.Mallocs > memBefore.Mallocs { + totalRewriteAllocs += memAfter.Mallocs - memBefore.Mallocs + } + cleanup() + } + + if b.N > 0 { + b.ReportMetric(float64(totalCopied)/float64(b.N), "value_records/op") + b.ReportMetric(float64(totalBytes)/float64(b.N), "value_bytes/op") + b.ReportMetric(float64(totalRefreshScans)/float64(b.N), "refresh_scans/op") + b.ReportMetric(float64(totalRewriteAllocs)/float64(b.N), "rewrite_allocs/op") + } +} + +func BenchmarkValueLogRewriteOnline_LeafRefs(b *testing.B) { + const keyCount = 1536 + + var totalCopied int64 + var totalBytes int64 + var totalRefreshScans uint64 + var totalRewriteAllocs uint64 + + for i := 0; i < b.N; i++ { + b.StopTimer() + db, sourceIDs, cleanup := setupLeafRefRewriteBench(b, keyCount) + refreshBefore := db.valueLogManager.RefreshScanCount() + var memBefore runtime.MemStats + runtime.ReadMemStats(&memBefore) + b.StartTimer() + + stats, err := db.ValueLogRewriteOnline(context.Background(), ValueLogRewriteOnlineOptions{ + SourceFileIDs: sourceIDs, + BatchSize: 256, + }) + b.StopTimer() + if err != nil { + cleanup() + b.Fatalf("ValueLogRewriteOnline: %v", err) + } + totalCopied += int64(stats.LeafRefRecordsCopied) + totalBytes += stats.LeafRefBytesCopied + totalRefreshScans += db.valueLogManager.RefreshScanCount() - refreshBefore + var memAfter runtime.MemStats + runtime.ReadMemStats(&memAfter) + if memAfter.Mallocs > memBefore.Mallocs { + totalRewriteAllocs += memAfter.Mallocs - memBefore.Mallocs + } + cleanup() + } + + if b.N > 0 { + b.ReportMetric(float64(totalCopied)/float64(b.N), "leafref_records/op") + b.ReportMetric(float64(totalBytes)/float64(b.N), "leafref_bytes/op") + b.ReportMetric(float64(totalRefreshScans)/float64(b.N), "refresh_scans/op") + b.ReportMetric(float64(totalRewriteAllocs)/float64(b.N), "rewrite_allocs/op") + } +} + +func BenchmarkValueLogRewriteOnline_LeafRefs_ReserveRIDs(b *testing.B) { + const keyCount = 1536 + + var 
totalCopied int64 + var totalBytes int64 + var totalRefreshScans uint64 + var totalRewriteAllocs uint64 + + for i := 0; i < b.N; i++ { + b.StopTimer() + db, sourceIDs, cleanup := setupLeafRefRewriteBench(b, keyCount) + refreshBefore := db.valueLogManager.RefreshScanCount() + nextRID := uint64(1 << 42) + var memBefore runtime.MemStats + runtime.ReadMemStats(&memBefore) + b.StartTimer() + + stats, err := db.ValueLogRewriteOnline(context.Background(), ValueLogRewriteOnlineOptions{ + SourceFileIDs: sourceIDs, + BatchSize: 256, + ReserveRIDs: func(count int) (uint64, error) { + if count <= 0 { + return 0, fmt.Errorf("invalid count %d", count) + } + start := nextRID + nextRID += uint64(count) + return start, nil + }, + }) + b.StopTimer() + if err != nil { + cleanup() + b.Fatalf("ValueLogRewriteOnline: %v", err) + } + totalCopied += int64(stats.LeafRefRecordsCopied) + totalBytes += stats.LeafRefBytesCopied + totalRefreshScans += db.valueLogManager.RefreshScanCount() - refreshBefore + var memAfter runtime.MemStats + runtime.ReadMemStats(&memAfter) + if memAfter.Mallocs > memBefore.Mallocs { + totalRewriteAllocs += memAfter.Mallocs - memBefore.Mallocs + } + cleanup() + } + + if b.N > 0 { + b.ReportMetric(float64(totalCopied)/float64(b.N), "leafref_records/op") + b.ReportMetric(float64(totalBytes)/float64(b.N), "leafref_bytes/op") + b.ReportMetric(float64(totalRefreshScans)/float64(b.N), "refresh_scans/op") + b.ReportMetric(float64(totalRewriteAllocs)/float64(b.N), "rewrite_allocs/op") + } +} + +func setupValuePointerRewriteBench(tb testing.TB, seg1Records, seg2Records int) (*DB, []uint32, func()) { + tb.Helper() + dir, err := os.MkdirTemp("", "treedb-vlog-rewrite-value-bench-*") + if err != nil { + tb.Fatalf("MkdirTemp: %v", err) + } + + db, err := Open(Options{ + Dir: dir, + Durability: DurabilityWALOffRelaxed, + DisableBackgroundPrune: true, + LeafPrefixCompression: true, + IndexColumnarLeaves: true, + IndexPackedValuePtr: true, + ValueLog: ValueLogOptions{ + 
ForcePointers: true, + }, + }) + if err != nil { + _ = os.RemoveAll(dir) + tb.Fatalf("Open: %v", err) + } + + ptrs1 := appendPointersInNewSegmentBench(tb, dir, 0, 1, 1_000_000, seg1Records, func(i int) []byte { + return bytes.Repeat([]byte{byte(i % 251)}, 768) + }) + ptrs2 := appendPointersInNewSegmentBench(tb, dir, 0, 2, 2_000_000, seg2Records, func(i int) []byte { + return bytes.Repeat([]byte{byte((i + 7) % 251)}, 768) + }) + + bt, ok := db.NewBatch().(*Batch) + if !ok { + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("NewBatch type assertion failed") + } + // Keep only a subset of segment-1 pointers live so rewrite has deterministic + // stale bytes in the selected source. + for i := range ptrs1 { + if i%4 != 0 { + continue + } + if err := bt.SetPointer([]byte(fmt.Sprintf("s1-live-%06d", i)), ptrs1[i]); err != nil { + _ = bt.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("SetPointer(s1): %v", err) + } + } + for i := range ptrs2 { + if err := bt.SetPointer([]byte(fmt.Sprintf("s2-live-%06d", i)), ptrs2[i]); err != nil { + _ = bt.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("SetPointer(s2): %v", err) + } + } + if err := bt.Write(); err != nil { + _ = bt.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("seed Write: %v", err) + } + _ = bt.Close() + + if err := db.RefreshValueLogSet(); err != nil { + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("RefreshValueLogSet: %v", err) + } + + sourceIDs := []uint32{ptrs1[0].FileID} + cleanup := func() { + _ = db.Close() + _ = os.RemoveAll(dir) + } + return db, sourceIDs, cleanup +} + +func setupLeafRefRewriteBench(tb testing.TB, keyCount int) (*DB, []uint32, func()) { + tb.Helper() + dir, err := os.MkdirTemp("", "treedb-vlog-rewrite-leaf-bench-*") + if err != nil { + tb.Fatalf("MkdirTemp: %v", err) + } + + db, err := Open(Options{ + Dir: dir, + Durability: DurabilityWALOffRelaxed, + DisableBackgroundPrune: true, + IndexOuterLeavesInValueLog: true, + LeafPrefixCompression: 
true, + IndexColumnarLeaves: true, + IndexPackedValuePtr: true, + }) + if err != nil { + _ = os.RemoveAll(dir) + tb.Fatalf("Open: %v", err) + } + + walDir := filepath.Join(dir, "wal") + if err := os.MkdirAll(walDir, 0o755); err != nil { + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("mkdir wal: %v", err) + } + + leafLog := newRewriteWriter(walDir, 0, 0, 16<<10) + leafLog.blockCompression = false + leafLog.blockCodec = valuelog.BlockCodecSnappy + db.SetLeafPageLog(leafLog) + + value := bytes.Repeat([]byte("leafref-bench-value-"), 3) + bt, ok := db.NewBatch().(*Batch) + if !ok { + _ = leafLog.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("NewBatch type assertion failed") + } + for i := 0; i < keyCount; i++ { + key := []byte(fmt.Sprintf("k%06d", i)) + if err := bt.Set(key, value); err != nil { + _ = bt.Close() + _ = leafLog.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("Batch.Set(%q): %v", key, err) + } + } + if err := bt.Write(); err != nil { + _ = bt.Close() + _ = leafLog.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("seed Write: %v", err) + } + _ = bt.Close() + if err := db.RefreshValueLogSet(); err != nil { + _ = leafLog.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("RefreshValueLogSet: %v", err) + } + state := db.State() + if state == nil { + _ = leafLog.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("missing DB state") + } + + counts := make(map[uint32]int, 16) + collectLeafRefFileCountsBench(tb, db.Pager(), state.RootPageID, counts) + collectLeafRefFileCountsBench(tb, db.Pager(), state.SystemRootPageID, counts) + if len(counts) == 0 { + _ = leafLog.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + tb.Fatalf("no leafref source file IDs found") + } + + targetID := uint32(0) + targetCount := -1 + for fileID, n := range counts { + if n > targetCount || (n == targetCount && fileID < targetID) { + targetID = fileID + targetCount = n + } + } + + sourceIDs := []uint32{targetID} + cleanup 
:= func() { + _ = leafLog.Close() + _ = db.Close() + _ = os.RemoveAll(dir) + } + return db, sourceIDs, cleanup +} + +func collectLeafRefFileCountsBench(tb testing.TB, p *pager.Pager, rootID uint64, counts map[uint32]int) { + tb.Helper() + if p == nil || rootID == 0 || counts == nil { + return + } + if ptr, ok := page.DecodeLeafRef(rootID); ok { + counts[ptr.FileID]++ + return + } + + stack := make([]uint64, 0, 128) + stack = append(stack, rootID) + visited := make(map[uint64]struct{}, 256) + + for len(stack) > 0 { + pageID := stack[len(stack)-1] + stack = stack[:len(stack)-1] + if _, ok := visited[pageID]; ok { + continue + } + visited[pageID] = struct{}{} + + if ptr, ok := page.DecodeLeafRef(pageID); ok { + counts[ptr.FileID]++ + continue + } + + data, err := p.Get(pageID) + if err != nil { + tb.Fatalf("pager.Get(%d): %v", pageID, err) + } + n := node.NewNodeView(data) + if !n.VerifyChecksum() { + tb.Fatalf("checksum mismatch on page %d", pageID) + } + + switch n.Type() { + case page.PageTypeLeaf: + // no children + case page.PageTypeInternal: + count := n.Count() + for i := uint16(0); i < count; i++ { + childID, err := n.GetInternalChildID(i) + if err != nil { + tb.Fatalf("GetInternalChildID(%d,%d): %v", pageID, i, err) + } + stack = append(stack, childID) + } + default: + tb.Fatalf("unexpected page type %d at page %d", n.Type(), pageID) + } + } +} + +func appendPointersInNewSegmentBench(tb testing.TB, dir string, lane, seq uint32, ridBase uint64, n int, valueAt func(i int) []byte) []page.ValuePtr { + tb.Helper() + walDir := filepath.Join(dir, "wal") + if err := os.MkdirAll(walDir, 0o755); err != nil { + tb.Fatalf("mkdir wal: %v", err) + } + fileID, err := valuelog.EncodeFileID(lane, seq) + if err != nil { + tb.Fatalf("encode file id lane=%d seq=%d: %v", lane, seq, err) + } + path := filepath.Join(walDir, fmt.Sprintf("value-l%d-%06d.log", lane, seq)) + w, err := valuelog.NewWriter(path, fileID) + if err != nil { + tb.Fatalf("new writer: %v", err) + } + ptrs := 
make([]page.ValuePtr, 0, n) + for i := 0; i < n; i++ { + ptr, err := w.Append(0, nil, ridBase+uint64(i), valueAt(i)) + if err != nil { + _ = w.Close() + tb.Fatalf("append rid=%d: %v", ridBase+uint64(i), err) + } + ptrs = append(ptrs, ptr) + } + if err := w.Close(); err != nil { + tb.Fatalf("close writer: %v", err) + } + return ptrs +} From 7973722c3cccec2a61139e921ef8e4f685253d8d Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 08:23:19 -1000 Subject: [PATCH 56/61] treedb: derive rewrite ref deltas from matched swaps --- TreeDB/db/vlog_rewrite.go | 45 +++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go index b63886625..09dd0b049 100644 --- a/TreeDB/db/vlog_rewrite.go +++ b/TreeDB/db/vlog_rewrite.go @@ -1977,7 +1977,8 @@ func (db *DB) applyRewriteSwapBatchOptimistic(swaps []rewriteSwap, sync bool) (b defer batch.Release(b) b.Reserve(len(swaps)) - if err := collectRewriteSwapPointerMatches(tr, b, swaps); err != nil { + rewriteDelta, err := collectRewriteSwapPointerMatches(tr, b, swaps) + if err != nil { return false, err } @@ -1998,13 +1999,9 @@ func (db *DB) applyRewriteSwapBatchOptimistic(swaps []rewriteSwap, sync bool) (b return false, err } entries = b.SortedEntries() - vlogRefDelta, err := db.buildValueLogRefDelta(idx.pager, rootID, baseSeq, entries) - if err != nil { - freeErr := tracker.FreeAll() - if freeErr != nil { - return false, errors.Join(err, freeErr) - } - return false, err + var vlogRefDelta *valueLogRefDelta + if db.valueLogRefTracker != nil && db.valueLogRefTracker.canTrack(baseSeq) && !db.indexOuterLeavesInValueLog { + vlogRefDelta = rewriteDelta } db.commitMu.Lock() @@ -2064,7 +2061,8 @@ func (db *DB) applyRewriteSwapBatchSerialized(swaps []rewriteSwap, sync bool) er defer batch.Release(b) b.Reserve(len(swaps)) - if err := collectRewriteSwapPointerMatches(tr, b, swaps); err != nil { + rewriteDelta, err := 
collectRewriteSwapPointerMatches(tr, b, swaps) + if err != nil { return err } @@ -2079,9 +2077,9 @@ func (db *DB) applyRewriteSwapBatchSerialized(swaps []rewriteSwap, sync bool) er return err } entries = b.SortedEntries() - vlogRefDelta, err := db.buildValueLogRefDelta(idx.pager, rootID, baseSeq, entries) - if err != nil { - return err + var vlogRefDelta *valueLogRefDelta + if db.valueLogRefTracker != nil && db.valueLogRefTracker.canTrack(baseSeq) && !db.indexOuterLeavesInValueLog { + vlogRefDelta = rewriteDelta } if err := db.finalizeCommit(newRoot, sysRoot, retired, sync, metrics, touchedValueLogSegments, db.indexOuterLeavesInValueLog, vlogRefDelta); err != nil { return err @@ -2092,9 +2090,9 @@ func (db *DB) applyRewriteSwapBatchSerialized(swaps []rewriteSwap, sync bool) er return nil } -func collectRewriteSwapPointerMatches(tr *tree.Tree, b *batch.Batch, swaps []rewriteSwap) error { +func collectRewriteSwapPointerMatches(tr *tree.Tree, b *batch.Batch, swaps []rewriteSwap) (*valueLogRefDelta, error) { if tr == nil || b == nil || len(swaps) == 0 { - return nil + return nil, nil } // Sort in-place to avoid per-batch swap-slice copies on rewrite hot paths. 
sort.Slice(swaps, func(i, j int) bool { @@ -2103,6 +2101,7 @@ func collectRewriteSwapPointerMatches(tr *tree.Tree, b *batch.Batch, swaps []rew it := tr.IteratorWithOptions(swaps[0].key, nil, tree.IteratorOptions{Mode: tree.IteratorModePointerProjection}) defer func() { _ = it.Close() }() + var delta *valueLogRefDelta for _, swap := range swaps { for it.Valid() { @@ -2118,14 +2117,28 @@ func collectRewriteSwapPointerMatches(tr *tree.Tree, b *batch.Batch, swaps []rew _, ptr, flags := it.UnsafeEntry() if flags&node.FlagPointer != 0 && ptr == swap.oldPtr { if err := b.SetPointerView(swap.key, swap.newPtr); err != nil { - return err + return nil, err + } + if page.IsValueLogFileID(swap.oldPtr.FileID) || page.IsValueLogFileID(swap.newPtr.FileID) { + if delta == nil { + delta = newValueLogRefDelta() + } + if page.IsValueLogFileID(swap.oldPtr.FileID) { + delta.add(swap.oldPtr.FileID, -1) + } + if page.IsValueLogFileID(swap.newPtr.FileID) { + delta.add(swap.newPtr.FileID, 1) + } } } it.Next() break } } - return it.Error() + if err := it.Error(); err != nil { + return nil, err + } + return delta, nil } // ValueLogRewriteOffline rewrites value-log pointers into new segments and From 0f4039c2ab7548e62f75d2f79989c106b448f0ac Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 08:54:45 -1000 Subject: [PATCH 57/61] bench: add block-normalized metrics to run_celestia AB harness --- docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 3 + docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md | 13 +++ scripts/run_celestia_ab.sh | 95 ++++++++++++++++++++ 3 files changed, 111 insertions(+) diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 648120a26..1b4625bc2 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -94,6 +94,7 @@ Default pair metric focus: - `S_post_wal`: WAL bytes after offline rewrite - `T_total = T_sync + T_rw` - `max_rss_kb` (memory guardrail) +- `blocks_synced` and 
normalized metrics (`*_per_block`) to de-noise moving-target runs Outputs: - `artifacts/celestia_ab//runs.csv` @@ -103,6 +104,8 @@ Outputs: The harness alternates run order per pair (`control->candidate`, then `candidate->control`) and can stop early on clear win/loss signals. +For stable pair scoring, prefer `FREEZE_REMOTE_HEIGHT_AT_START=1` and validate +`delta_blocks_synced` stays near zero across pairs. ## Experimental Knob - `TREEDB_ENABLE_VLOG_GENERATION_PRECHECKPOINT_REWRITE=1` diff --git a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md index 7b0836451..d092f4bbd 100644 --- a/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md +++ b/docs/benchmarks/CELESTIA_COMPRESSION_LOOP.md @@ -67,6 +67,19 @@ Outputs: - `pairs.csv` - per-run `run.json` +Signal hygiene additions in `run_celestia` A/B artifacts: +- `runs.csv` now includes `blocks_synced` plus normalized metrics: + - `s_sync_app_bytes_per_block` + - `s_post_app_bytes_per_block` + - `t_sync_seconds_per_block` + - `t_total_seconds_per_block` +- `pairs.csv` now includes: + - `delta_blocks_synced` + - `delta_s_sync_app_bytes_per_block` + - `delta_t_total_seconds_per_block` +- `summary.md` includes `pairs with block-count drift` so moving-target runs + are visible before making a promote/reject decision. 
+ ## Stage 2: Pprof/Implementation Efficiency Pass Run this stage before full `run_celestia` if fast gate shows: diff --git a/scripts/run_celestia_ab.sh b/scripts/run_celestia_ab.sh index ef8c3f088..4b271caed 100755 --- a/scripts/run_celestia_ab.sh +++ b/scripts/run_celestia_ab.sh @@ -281,6 +281,16 @@ def safe_int(raw: str | None, default: int = 0) -> int: except Exception: return default +def safe_div(num, den): + try: + if den is None or float(den) == 0: + return None + if num is None: + return None + return float(num) / float(den) + except Exception: + return None + def probe_sync_progress(node_log_path: Path | None) -> dict[str, object]: progress = { "node_log_present": False, @@ -347,6 +357,22 @@ if not resolved_invalid_reason and rewrite_attempted == 1 and rewrite_rc != 0: resolved_invalid_reason = "rewrite_failed" valid = resolved_invalid_reason == "" t_total = (t_sync + t_rw) if valid else None +trust_height = safe_int(sync.get("trust_height"), 0) +stop_at_local_height = safe_int(sync.get("stop_at_local_height"), 0) +final_local_height = safe_int(sync.get("final_local_height"), 0) +final_remote_height = safe_int(sync.get("final_remote_height"), 0) +final_remote_height_actual = safe_int(sync.get("final_remote_height_actual"), 0) +freeze_remote_height_at_start = safe_int(sync.get("freeze_remote_height_at_start"), 0) +blocks_synced = 0 +if trust_height > 0 and final_local_height >= trust_height: + blocks_synced = final_local_height - trust_height +remote_minus_stop_height = None +if stop_at_local_height > 0 and final_remote_height > 0: + remote_minus_stop_height = final_remote_height - stop_at_local_height +s_sync_app_bytes_per_block = safe_div(pre_app_bytes, blocks_synced) +s_post_app_bytes_per_block = safe_div(post_app_bytes, blocks_synced) +t_sync_seconds_per_block = safe_div(t_sync, blocks_synced) +t_total_seconds_per_block = safe_div(t_total, blocks_synced) if t_total is not None else None result = { "pair_index": pair_index, @@ -366,6 +392,14 @@ result = 
{ "duration_seconds": t_sync, "max_rss_kb": safe_int(sync.get("max_rss_kb"), 0), "max_hwm_kb": safe_int(sync.get("max_hwm_kb"), 0), + "freeze_remote_height_at_start": freeze_remote_height_at_start, + "trust_height": trust_height, + "stop_at_local_height": stop_at_local_height, + "final_local_height": final_local_height, + "final_remote_height": final_remote_height, + "final_remote_height_actual": final_remote_height_actual, + "blocks_synced": blocks_synced, + "remote_minus_stop_height": remote_minus_stop_height, "end_app_bytes": safe_int(sync.get("end_app_bytes"), pre_app_bytes), "end_data_bytes": safe_int(sync.get("end_data_bytes"), 0), "end_home_bytes": safe_int(sync.get("end_home_bytes"), 0), @@ -389,7 +423,12 @@ result = { "s_sync_wal_bytes": pre_wal_bytes, "s_post_app_bytes": post_app_bytes, "s_post_wal_bytes": post_wal_bytes, + "s_sync_app_bytes_per_block": s_sync_app_bytes_per_block, + "s_post_app_bytes_per_block": s_post_app_bytes_per_block, "max_rss_kb": safe_int(sync.get("max_rss_kb"), 0), + "blocks_synced": blocks_synced, + "t_sync_seconds_per_block": t_sync_seconds_per_block, + "t_total_seconds_per_block": t_total_seconds_per_block, }, "maintenance_summary": maintenance, } @@ -483,6 +522,18 @@ with runs_csv.open("w", newline="", encoding="utf-8") as fh: "s_post_app_bytes", "s_post_wal_bytes", "max_rss_kb", + "blocks_synced", + "trust_height", + "stop_at_local_height", + "final_local_height", + "final_remote_height", + "final_remote_height_actual", + "freeze_remote_height_at_start", + "remote_minus_stop_height", + "s_sync_app_bytes_per_block", + "s_post_app_bytes_per_block", + "t_sync_seconds_per_block", + "t_total_seconds_per_block", "valid", "invalid_reason", "run_exit_code", @@ -499,6 +550,7 @@ with runs_csv.open("w", newline="", encoding="utf-8") as fh: s = r.get("sizes", {}) or {} rw = r.get("rewrite", {}) or {} summary = r.get("maintenance_summary", {}) or {} + sync = r.get("sync", {}) or {} valid = run_is_valid(r) w.writerow([ 
int(r.get("pair_index", 0)), @@ -512,6 +564,18 @@ with runs_csv.open("w", newline="", encoding="utf-8") as fh: s.get("post_app_bytes"), s.get("post_wal_bytes"), m.get("max_rss_kb"), + m.get("blocks_synced"), + sync.get("trust_height"), + sync.get("stop_at_local_height"), + sync.get("final_local_height"), + sync.get("final_remote_height"), + sync.get("final_remote_height_actual"), + sync.get("freeze_remote_height_at_start"), + sync.get("remote_minus_stop_height"), + m.get("s_sync_app_bytes_per_block"), + m.get("s_post_app_bytes_per_block"), + m.get("t_sync_seconds_per_block"), + m.get("t_total_seconds_per_block"), valid, run_invalid_reason(r), run_exit_code(r), @@ -553,6 +617,9 @@ for pair in sorted(by_pair): "delta_t_total_seconds": None, "delta_s_sync_app_bytes": None, "delta_s_post_wal_bytes": None, + "delta_blocks_synced": None, + "delta_s_sync_app_bytes_per_block": None, + "delta_t_total_seconds_per_block": None, "control_valid": ctrl_valid, "candidate_valid": cand_valid, "control_invalid_reason": ctrl_reason, @@ -570,6 +637,12 @@ for pair in sorted(by_pair): base_sync = bm.get("t_sync_seconds") cand_sync_app = cm.get("s_sync_app_bytes") base_sync_app = bm.get("s_sync_app_bytes") + cand_blocks = cm.get("blocks_synced") + base_blocks = bm.get("blocks_synced") + cand_sync_app_per_block = cm.get("s_sync_app_bytes_per_block") + base_sync_app_per_block = bm.get("s_sync_app_bytes_per_block") + cand_total_per_block = cm.get("t_total_seconds_per_block") + base_total_per_block = bm.get("t_total_seconds_per_block") def delta(a, b): if a is None or b is None: @@ -580,6 +653,9 @@ for pair in sorted(by_pair): d_sync = delta(cand_sync, base_sync) d_post_wal = delta(cand_post_wal, base_post_wal) d_sync_app = delta(cand_sync_app, base_sync_app) + d_blocks = delta(cand_blocks, base_blocks) + d_sync_app_per_block = delta(cand_sync_app_per_block, base_sync_app_per_block) + d_total_per_block = delta(cand_total_per_block, base_total_per_block) outcome = "neutral" if d_post_wal is 
not None and d_total is not None: @@ -598,6 +674,9 @@ for pair in sorted(by_pair): "delta_t_total_seconds": d_total, "delta_s_sync_app_bytes": d_sync_app, "delta_s_post_wal_bytes": d_post_wal, + "delta_blocks_synced": d_blocks, + "delta_s_sync_app_bytes_per_block": d_sync_app_per_block, + "delta_t_total_seconds_per_block": d_total_per_block, "control_valid": ctrl_valid, "candidate_valid": cand_valid, "control_invalid_reason": ctrl_reason, @@ -614,6 +693,9 @@ with pairs_csv.open("w", newline="", encoding="utf-8") as fh: "delta_t_total_seconds", "delta_s_sync_app_bytes", "delta_s_post_wal_bytes", + "delta_blocks_synced", + "delta_s_sync_app_bytes_per_block", + "delta_t_total_seconds_per_block", "control_valid", "candidate_valid", "control_invalid_reason", @@ -627,6 +709,9 @@ with pairs_csv.open("w", newline="", encoding="utf-8") as fh: r["delta_t_total_seconds"], r["delta_s_sync_app_bytes"], r["delta_s_post_wal_bytes"], + r["delta_blocks_synced"], + r["delta_s_sync_app_bytes_per_block"], + r["delta_t_total_seconds_per_block"], r["control_valid"], r["candidate_valid"], r["control_invalid_reason"], @@ -637,6 +722,11 @@ with pairs_csv.open("w", newline="", encoding="utf-8") as fh: scored_rows = [row for row in pair_rows if row.get("outcome") != "invalid"] completed_pairs = len(scored_rows) neutral = max(0, completed_pairs - wins - losses) +nonzero_block_drift_pairs = 0 +for row in scored_rows: + d = row.get("delta_blocks_synced") + if d is not None and d != 0: + nonzero_block_drift_pairs += 1 neutral_streak = 0 for row in reversed(scored_rows): if row.get("outcome") == "neutral": @@ -687,6 +777,7 @@ lines.append(f"- observed pairs: `{raw_pairs}`") lines.append(f"- scored pairs: `{completed_pairs}`") lines.append(f"- invalid pairs skipped: `{invalid_pairs}`") lines.append(f"- wins/losses/neutral: `{wins}` / `{losses}` / `{neutral}`") +lines.append(f"- pairs with block-count drift: `{nonzero_block_drift_pairs}`") lines.append(f"- neutral streak (tail): 
`{neutral_streak}`") lines.append(f"- invalid streak (tail): `{invalid_streak}`") lines.append(f"- size tolerance bytes: `{size_tol}`") @@ -713,6 +804,9 @@ if pair_rows: lines.append(f"- delta_t_total_seconds: `{last['delta_t_total_seconds']}`") lines.append(f"- delta_s_sync_app_bytes: `{last['delta_s_sync_app_bytes']}`") lines.append(f"- delta_s_post_wal_bytes: `{last['delta_s_post_wal_bytes']}`") + lines.append(f"- delta_blocks_synced: `{last['delta_blocks_synced']}`") + lines.append(f"- delta_s_sync_app_bytes_per_block: `{last['delta_s_sync_app_bytes_per_block']}`") + lines.append(f"- delta_t_total_seconds_per_block: `{last['delta_t_total_seconds_per_block']}`") summary_md.write_text("\n".join(lines) + "\n", encoding="utf-8") payload = { @@ -722,6 +816,7 @@ payload = { "wins": wins, "losses": losses, "neutral": neutral, + "nonzero_block_drift_pairs": nonzero_block_drift_pairs, "neutral_streak": neutral_streak, "invalid_streak": invalid_streak, "stop": stop, From 45ec9cdb07de21c8e98b8239765ab3dc6e6e9470 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 10:01:17 -1000 Subject: [PATCH 58/61] docs: add zero-local fast-fail guidance for celestia AB --- docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md index 1b4625bc2..93123d95a 100644 --- a/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md +++ b/docs/TREEDB_VLOG_GENERATIONAL_RUNBOOK.md @@ -86,6 +86,13 @@ CLEAR_LOSS_PAIRS=3 \ ./scripts/run_celestia_ab.sh ``` +Recommended for probe loops (faster fail on low-signal state-sync stalls): +- Set `FREEZE_REMOTE_HEIGHT_AT_START=1` in both env files so pair targets are stable. +- Set `ZERO_LOCAL_FAIL_SECONDS=` (for example `120` to `300`) to abort runs that + stay at `local=0` too long even if restore I/O is active. +- Keep `NO_PROGRESS_FAIL_SECONDS`/`NO_PROGRESS_HARD_FAIL_SECONDS` as a secondary + backstop for non-zero-local stalls. 
+ Default pair metric focus: - `T_sync`: sync duration (seconds) - `S_sync_app`: app dir bytes at sync end From 1dfa257361a26220286df480253b128d3ffb2cd7 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 11:58:59 -1000 Subject: [PATCH 59/61] db: refresh value-log set for outer-leaf commit publication --- TreeDB/db/db.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/TreeDB/db/db.go b/TreeDB/db/db.go index d7b2f7c05..a9ed96475 100644 --- a/TreeDB/db/db.go +++ b/TreeDB/db/db.go @@ -1587,8 +1587,8 @@ func (db *DB) finalizeCommitLocked(newRootID uint64, sysRootID uint64, retired [ post.oldState = db.state.Load() var valueLogSet *valuelog.Set if db.valueLogManager != nil { - needRefresh := forceValueLogRefresh - if !needRefresh && len(touchedValueLogSegments) > 0 { + needRefresh := false + if len(touchedValueLogSegments) > 0 { for _, id := range touchedValueLogSegments { if !db.valueLogManager.HasSegment(id) { needRefresh = true @@ -1596,6 +1596,13 @@ func (db *DB) finalizeCommitLocked(newRootID uint64, sysRootID uint64, retired [ } } } + if forceValueLogRefresh { + // Outer-leaf commits can rotate multiple value-log segments within a + // single commit. Registering only the current segment can miss + // intermediate referenced segments, so force a full refresh to keep the + // published ValueLogSet complete for snapshot readers. 
+ needRefresh = true + } if needRefresh { if err := db.valueLogManager.Refresh(); err != nil { db.mu.Unlock() From 98c43ef8c94c6d14f8135b21b991c56466367265 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 12:51:38 -1000 Subject: [PATCH 60/61] vlog-rewrite: fast-path explicit sources and stabilize AB runs --- TreeDB/db/vlog_rewrite.go | 39 ++++++++++++++++++++++++++++++++++---- scripts/run_celestia_ab.sh | 18 ++++++++++++++++++ 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go index 09dd0b049..bafe6066c 100644 --- a/TreeDB/db/vlog_rewrite.go +++ b/TreeDB/db/vlog_rewrite.go @@ -431,6 +431,32 @@ func hasRewriteSourceSelection(opts ValueLogRewriteOnlineOptions) bool { return false } +func hasOnlyExplicitRewriteSources(opts ValueLogRewriteOnlineOptions) bool { + return len(opts.SourceFileIDs) > 0 && + opts.MaxSourceSegments <= 0 && + opts.MaxSourceBytes <= 0 && + opts.MinSegmentStaleRatio <= 0 && + opts.MinSegmentStaleBytes <= 0 && + opts.MinSegmentAge <= 0 +} + +func selectExplicitRewriteSourceIDs(sourceFileIDs []uint32, files map[uint32]*valuelog.File) map[uint32]struct{} { + if len(sourceFileIDs) == 0 || len(files) == 0 { + return nil + } + selected := make(map[uint32]struct{}, len(sourceFileIDs)) + for _, id := range sourceFileIDs { + if _, ok := files[id]; !ok { + continue + } + selected[id] = struct{}{} + } + if len(selected) == 0 { + return nil + } + return selected +} + func rewritePlanNeedsLiveEstimate(opts ValueLogRewriteOnlineOptions) bool { if !hasRewriteSourceSelection(opts) { return false @@ -491,8 +517,6 @@ func (db *DB) ValueLogRewritePlan(ctx context.Context, opts ValueLogRewriteOnlin plan.BytesTotal += fileSize(f) } - active := currentValueLogIDs(set) - var liveByID map[uint32]int64 var err error // Without selection knobs, the plan is just the global totals and should not @@ -508,7 +532,10 @@ func (db *DB) ValueLogRewritePlan(ctx context.Context, opts ValueLogRewriteOnlin 
sourceIDs := map[uint32]struct{}(nil) var selectionStats rewriteSourceSelectionStats - if hasRewriteSourceSelection(opts) { + if hasOnlyExplicitRewriteSources(opts) { + sourceIDs = selectExplicitRewriteSourceIDs(opts.SourceFileIDs, set.Files) + } else if hasRewriteSourceSelection(opts) { + active := currentValueLogIDs(set) sourceIDs, selectionStats = selectRewriteSourceSegmentsWithStats(opts, set.Files, active, liveByID) } plan.AgeBlockedSegments = selectionStats.ageBlockedSegments @@ -1201,7 +1228,11 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl sourceIDs map[uint32]struct{} restrictSource bool ) - if hasRewriteSourceSelection(opts) { + if hasOnlyExplicitRewriteSources(opts) { + sourceIDs = selectExplicitRewriteSourceIDs(opts.SourceFileIDs, set.Files) + restrictSource = true + stats.SourceSegmentsRequested = len(sourceIDs) + } else if hasRewriteSourceSelection(opts) { active := currentValueLogIDs(set) var liveByID map[uint32]int64 if rewritePlanNeedsLiveEstimate(opts) { diff --git a/scripts/run_celestia_ab.sh b/scripts/run_celestia_ab.sh index 4b271caed..8e1c06cb4 100755 --- a/scripts/run_celestia_ab.sh +++ b/scripts/run_celestia_ab.sh @@ -23,6 +23,11 @@ RUN_TIMEOUT_SECONDS="${RUN_TIMEOUT_SECONDS:-1800}" RUN_MAX_ATTEMPTS_PER_VARIANT="${RUN_MAX_ATTEMPTS_PER_VARIANT:-2}" RUN_RETRY_SLEEP_SECONDS="${RUN_RETRY_SLEEP_SECONDS:-20}" INVALID_PAIR_STREAK_STOP="${INVALID_PAIR_STREAK_STOP:-2}" +AB_DISABLE_HEAVY_DIAGNOSTICS="${AB_DISABLE_HEAVY_DIAGNOSTICS:-1}" +AB_CAPTURE_HEAP_ON_MAX_RSS="${AB_CAPTURE_HEAP_ON_MAX_RSS:-0}" +AB_CAPTURE_PPROF_ON_STUCK="${AB_CAPTURE_PPROF_ON_STUCK:-0}" +AB_CAPTURE_FULL_SMAPS_ON_MAX_RSS="${AB_CAPTURE_FULL_SMAPS_ON_MAX_RSS:-0}" +AB_CAPTURE_DEBUG_VARS_ON_MAX_RSS="${AB_CAPTURE_DEBUG_VARS_ON_MAX_RSS:-0}" TS="$(date +%Y%m%d%H%M%S)" OUT="${OUT_DIR:-$ROOT/artifacts/celestia_ab/$TS}" @@ -79,6 +84,11 @@ run_timeout_seconds=$RUN_TIMEOUT_SECONDS run_max_attempts_per_variant=$RUN_MAX_ATTEMPTS_PER_VARIANT 
run_retry_sleep_seconds=$RUN_RETRY_SLEEP_SECONDS invalid_pair_streak_stop=$INVALID_PAIR_STREAK_STOP +ab_disable_heavy_diagnostics=$AB_DISABLE_HEAVY_DIAGNOSTICS +ab_capture_heap_on_max_rss=$AB_CAPTURE_HEAP_ON_MAX_RSS +ab_capture_pprof_on_stuck=$AB_CAPTURE_PPROF_ON_STUCK +ab_capture_full_smaps_on_max_rss=$AB_CAPTURE_FULL_SMAPS_ON_MAX_RSS +ab_capture_debug_vars_on_max_rss=$AB_CAPTURE_DEBUG_VARS_ON_MAX_RSS META list_run_homes() { @@ -164,6 +174,14 @@ run_variant() { source "$env_file" set +a fi + if [[ "$AB_DISABLE_HEAVY_DIAGNOSTICS" == "1" ]]; then + # A/B runs prioritize stable wall-time+size measurements. Heavy + # diagnostics can dominate runtime and produce invalid comparisons. + export CAPTURE_HEAP_ON_MAX_RSS="${CAPTURE_HEAP_ON_MAX_RSS:-$AB_CAPTURE_HEAP_ON_MAX_RSS}" + export CAPTURE_PPROF_ON_STUCK="${CAPTURE_PPROF_ON_STUCK:-$AB_CAPTURE_PPROF_ON_STUCK}" + export CAPTURE_FULL_SMAPS_ON_MAX_RSS="${CAPTURE_FULL_SMAPS_ON_MAX_RSS:-$AB_CAPTURE_FULL_SMAPS_ON_MAX_RSS}" + export CAPTURE_DEBUG_VARS_ON_MAX_RSS="${CAPTURE_DEBUG_VARS_ON_MAX_RSS:-$AB_CAPTURE_DEBUG_VARS_ON_MAX_RSS}" + fi # Non-login shell avoids user profile side effects (e.g. tty-dependent exports) # that can fail under nohup/background runs. 
if [[ "$RUN_TIMEOUT_SECONDS" -gt 0 ]] && command -v timeout >/dev/null 2>&1; then From 1c5c909d955d66ce0efbd9f9c5bc29fb7e38dab9 Mon Sep 17 00:00:00 2001 From: Mikers Date: Sun, 29 Mar 2026 13:09:55 -1000 Subject: [PATCH 61/61] vlog-rewrite: trim source/zombie alloc churn in online rewrite --- TreeDB/db/vlog_rewrite.go | 144 ++++++++++++++++++++++++++------------ 1 file changed, 99 insertions(+), 45 deletions(-) diff --git a/TreeDB/db/vlog_rewrite.go b/TreeDB/db/vlog_rewrite.go index bafe6066c..24510285b 100644 --- a/TreeDB/db/vlog_rewrite.go +++ b/TreeDB/db/vlog_rewrite.go @@ -457,6 +457,17 @@ func selectExplicitRewriteSourceIDs(sourceFileIDs []uint32, files map[uint32]*va return selected } +func selectSingleExplicitRewriteSourceID(sourceFileIDs []uint32, files map[uint32]*valuelog.File) (uint32, bool) { + if len(sourceFileIDs) != 1 || len(files) == 0 { + return 0, false + } + id := sourceFileIDs[0] + if _, ok := files[id]; !ok { + return 0, false + } + return id, true +} + func rewritePlanNeedsLiveEstimate(opts ValueLogRewriteOnlineOptions) bool { if !hasRewriteSourceSelection(opts) { return false @@ -1225,13 +1236,26 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl stats.BytesBefore += fileSize(set.Files[id]) } var ( - sourceIDs map[uint32]struct{} - restrictSource bool + sourceIDs map[uint32]struct{} + singleSourceID uint32 + restrictSource bool + restrictSingleID bool + sourceSegmentCount int ) if hasOnlyExplicitRewriteSources(opts) { - sourceIDs = selectExplicitRewriteSourceIDs(opts.SourceFileIDs, set.Files) + if id, ok := selectSingleExplicitRewriteSourceID(opts.SourceFileIDs, set.Files); ok { + singleSourceID = id + restrictSingleID = true + } else { + sourceIDs = selectExplicitRewriteSourceIDs(opts.SourceFileIDs, set.Files) + } restrictSource = true - stats.SourceSegmentsRequested = len(sourceIDs) + if restrictSingleID { + sourceSegmentCount = 1 + } else { + sourceSegmentCount = len(sourceIDs) + } + 
stats.SourceSegmentsRequested = sourceSegmentCount } else if hasRewriteSourceSelection(opts) { active := currentValueLogIDs(set) var liveByID map[uint32]int64 @@ -1244,10 +1268,11 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl } sourceIDs, _ = selectRewriteSourceSegmentsWithStats(opts, set.Files, active, liveByID) restrictSource = true - stats.SourceSegmentsRequested = len(sourceIDs) + sourceSegmentCount = len(sourceIDs) + stats.SourceSegmentsRequested = sourceSegmentCount } _ = db.valueLogManager.Release(set) - if restrictSource && len(sourceIDs) == 0 { + if restrictSource && sourceSegmentCount == 0 { // No source segments selected: this rewrite pass is a no-op. stats.SegmentsAfter = stats.SegmentsBefore stats.BytesAfter = stats.BytesBefore @@ -1417,8 +1442,14 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl continue } if restrictSource { - if _, ok := sourceIDs[oldPtr.FileID]; !ok { - continue + if restrictSingleID { + if oldPtr.FileID != singleSourceID { + continue + } + } else { + if _, ok := sourceIDs[oldPtr.FileID]; !ok { + continue + } } } unsafeKey := it.UnsafeKey() @@ -1451,8 +1482,8 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl // by LeafRef pointers even if all key/value pointers are rewritten. Move // referenced leaf pages out of the selected source segments so cleanup can // actually reclaim space. 
- if restrictSource && db.indexOuterLeavesInValueLog && len(sourceIDs) > 0 { - copied, copiedBytes, err := db.rewriteLeafRefsOnline(ctx, writer, ridAlloc, sourceIDs, opts.SyncEachBatch) + if restrictSource && db.indexOuterLeavesInValueLog && sourceSegmentCount > 0 { + copied, copiedBytes, err := db.rewriteLeafRefsOnline(ctx, writer, ridAlloc, sourceIDs, singleSourceID, restrictSingleID, opts.SyncEachBatch) if err != nil { return stats, err } @@ -1492,15 +1523,25 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl if err != nil { return stats, err } - if len(sourceIDs) > 0 { - stillReferenced := 0 - for id := range sourceIDs { - if _, ok := referencedAfter[id]; ok { - stillReferenced++ + if sourceSegmentCount > 0 { + if restrictSingleID { + if _, ok := referencedAfter[singleSourceID]; ok { + stats.SourceSegmentsStillReferenced = 1 + stats.SourceSegmentsUnreferenced = 0 + } else { + stats.SourceSegmentsStillReferenced = 0 + stats.SourceSegmentsUnreferenced = 1 } + } else { + stillReferenced := 0 + for id := range sourceIDs { + if _, ok := referencedAfter[id]; ok { + stillReferenced++ + } + } + stats.SourceSegmentsStillReferenced = stillReferenced + stats.SourceSegmentsUnreferenced = len(sourceIDs) - stillReferenced } - stats.SourceSegmentsStillReferenced = stillReferenced - stats.SourceSegmentsUnreferenced = len(sourceIDs) - stillReferenced } var protectedPaths map[string]struct{} allowActiveSkip := len(opts.ProtectedPaths) > 0 @@ -1538,32 +1579,37 @@ func (db *DB) ValueLogRewriteOnline(ctx context.Context, opts ValueLogRewriteOnl } _ = db.valueLogManager.Release(currentSet) } - zombieCandidates := make(map[uint32]struct{}, len(oldValueIDs)+len(newValueIDs)) - for id := range oldValueIDs { - zombieCandidates[id] = struct{}{} - } - for _, id := range newValueIDs { - zombieCandidates[id] = struct{}{} - } - for id := range zombieCandidates { + markZombieCandidate := func(id uint32, existedBefore bool) error { if _, ok := 
referencedAfter[id]; ok { - continue + return nil } if _, ok := protectedIDs[id]; ok { - continue + return nil } // Never mark currently-active pre-existing segments zombie when callers // provide ProtectedPaths (cached-mode maintenance). Concurrent writers may // still be appending records whose pointers are not yet visible in the // backend index. - if allowActiveSkip { + if allowActiveSkip && existedBefore { if _, ok := activeIDs[id]; ok { - if _, existed := oldValueIDs[id]; existed { - continue - } + return nil } } if err := db.valueLogManager.MarkZombie(id); err != nil { + return err + } + return nil + } + for id := range oldValueIDs { + if err := markZombieCandidate(id, true); err != nil { + return stats, err + } + } + for _, id := range newValueIDs { + if _, existed := oldValueIDs[id]; existed { + continue + } + if err := markZombieCandidate(id, false); err != nil { return stats, err } } @@ -1609,7 +1655,9 @@ type leafRefRewriteCtx struct { writer *rewriteWriter ridAlloc *rewriteRIDAllocator - sourceIDs map[uint32]struct{} + sourceIDs map[uint32]struct{} + singleSourceID uint32 + hasSingleID bool leafMap map[uint64]uint64 // old leafref id -> new leafref id internalMap map[uint64]uint64 // old internal page id -> new page id @@ -1661,7 +1709,11 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { return mapped, mapped != id, nil } } - if c.sourceIDs != nil { + if c.hasSingleID { + if ptr.FileID != c.singleSourceID { + return id, false, nil + } + } else if c.sourceIDs != nil { if _, ok := c.sourceIDs[ptr.FileID]; !ok { return id, false, nil } @@ -1804,7 +1856,7 @@ func (c *leafRefRewriteCtx) rewriteNode(id uint64) (uint64, bool, error) { } } -func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, ridAlloc *rewriteRIDAllocator, sourceIDs map[uint32]struct{}, sync bool) (copied int, copiedBytes int64, err error) { +func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, ridAlloc 
*rewriteRIDAllocator, sourceIDs map[uint32]struct{}, singleSourceID uint32, hasSingleSourceID bool, sync bool) (copied int, copiedBytes int64, err error) { if db == nil { return 0, 0, fmt.Errorf("missing db") } @@ -1820,9 +1872,9 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, if writer == nil || ridAlloc == nil { return 0, 0, fmt.Errorf("vlog-rewrite: missing writer/rid state") } - // Treat nil sourceIDs as "all sources" and an empty, non-nil map as "no - // sources". The latter means there is nothing to rewrite. - if sourceIDs != nil && len(sourceIDs) == 0 { + // Treat nil sourceIDs (with no single-source constraint) as "all sources" + // and an empty, non-nil map as "no sources". + if !hasSingleSourceID && sourceIDs != nil && len(sourceIDs) == 0 { return 0, 0, nil } if ctx == nil { @@ -1863,14 +1915,16 @@ func (db *DB) rewriteLeafRefsOnline(ctx context.Context, writer *rewriteWriter, }() leafCtx := &leafRefRewriteCtx{ - ctx: ctx, - db: db, - pager: idx.pager, - leafReader: &snap.reader, - alloc: tracker, - writer: writer, - ridAlloc: ridAlloc, - sourceIDs: sourceIDs, + ctx: ctx, + db: db, + pager: idx.pager, + leafReader: &snap.reader, + alloc: tracker, + writer: writer, + ridAlloc: ridAlloc, + sourceIDs: sourceIDs, + singleSourceID: singleSourceID, + hasSingleID: hasSingleSourceID, } if toer, ok := leafCtx.leafReader.(unsafeToReader); ok { leafCtx.leafToer = toer